import os
import re
import sys
import time
import urllib.parse
import requests
import threading
from hashlib import md5
from fake_useragent import UserAgent
from multiprocessing.pool import Pool
def image_parser(html):
    if html.get('items'):
        data = html.get('items')
        for item in data:  # iterate over the result list
            image_url = item.get('ori_pic_url')  # pull the original-image URL out of the JSON
            r1 = r'[\\/:*?"<>|.]'  # characters that are illegal in file names
            image_titles = re.sub(r1, '-', item.get('title'))
            yield {
                'image_title': image_titles,
                'image_url': image_url
            }  # yield one dict per image (generator)
def save_image(item):
    img_path = 'imgs' + os.path.sep + item.get('image_title')
    # Unlike the function above, only the keys yielded by the generator
    # ('image_title', 'image_url') exist here; don't call item.get('ori_pic_url').
    print("Got the save path")
    img_path = img_path[0:50]  # truncate the path so an over-long title can't break the directory creation
    if not os.path.exists(img_path):  # remember the `not`, otherwise the directory is never created
        os.makedirs(img_path)
    try:
        start = time.time()
        size = 0
        resp = requests.get(item.get('image_url'), stream=True, allow_redirects=False)
        chunk_size = 1024
        if resp.status_code == 200:
            # accessing resp.content here already downloads the whole body so it can be hashed
            file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=md5(resp.content).hexdigest(),
                file_suffix='jpg')
            if not os.path.exists(file_path):  # remember the `not`: only write files that don't exist yet
                try:
                    content_size = int(resp.headers['content-length'])
                    # this is where the problem showed up
                    print('[file size]: %0.2f MB' % (content_size / chunk_size / 1024))
                    with open(file_path, 'wb') as f:
                        for data in resp.iter_content(chunk_size=chunk_size):
                            f.write(data)  # write the current chunk, not resp.content
                            size += len(data)
                            print('\r' + '[download progress]: %s%0.2f%%' % ('>' * int(size * 50 / content_size),
                                                                             float(size / content_size * 100)), end='')
                    print('Image saved!')
                    end = time.time()
                    print('\n' + "Download finished! Took %0.2fs" % (end - start))
                except Exception as e:
                    print(e)
                    print("url:{}".format(item.get('image_url')))
            else:
                print("Download fail", file_path)
    except requests.ConnectionError:
        print("ConnectionError")
        return None
def image_get(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36',
                   'Host': 'pic.sogou.com'}
        response = requests.get(url, headers=headers, allow_redirects=False)  # block redirects
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print("Download fail:{}".format(e.args))
        return None
def main():
    word = str(input("Enter the keyword to scrape: "))
    page = int(input("Enter the page number to scrape: "))
    words = urllib.parse.quote(word, encoding='gbk')
    url = r'https://pic.sogou.com/pics?query=' + words + r'&mode=1&start=' + \
        str(page * 48) + r'&reqType=ajax&reqFrom=result&tn=0'
    html = image_get(url)
    if not html:  # request failed or was redirected
        return
    for item in image_parser(html):
        print(item)
        save_image(item)

if __name__ == '__main__':
    main()
    time.sleep(3)
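
For comparison, here is a minimal, self-contained sketch of the chunked streaming-download pattern the script above is aiming for; the URL and file name in the usage line are placeholders, not values from the Sogou API:

import requests

def download_with_progress(url, file_path, chunk_size=1024):
    # stream the response so the body is read chunk by chunk instead of all at once
    resp = requests.get(url, stream=True, timeout=10)
    resp.raise_for_status()
    total = int(resp.headers.get('content-length', 0))  # header may be missing, so default to 0
    written = 0
    with open(file_path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=chunk_size):
            f.write(chunk)  # write only the current chunk
            written += len(chunk)
            if total:
                print('\r[progress] %5.1f%%' % (written / total * 100), end='')
    print()

# example (placeholder URL):
# download_with_progress('https://example.com/sample.jpg', 'sample.jpg')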
1  julyclyde  2019-03-28 23:25:46 +08:00 via iPad
It isn't necessarily there.

2  hfg123  (OP)
Could any other expert take a look and explain it?