分享个 Python 图片下载代码

2018-05-21 17:23:33 +08:00
 ucun
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re
import os

def getHTMLText(url):
    """Fetch a page and return its body as text.

    :param url: address of the page to download
    :return: the response body as text, or None when the request fails
             (the error is printed instead of being raised)
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"}
    try:
        # requests waits forever by default; a timeout keeps a stalled
        # server from hanging the whole script.
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    return r.text

def getURLList(html):
    """Extract the image URLs found in a piece of HTML/text.

    :param html: page source to scan; None or an empty string yields []
    :return: list of http/https URLs ending in .jpg, .gif or .png,
             de-duplicated, in order of first appearance
    """
    if not html:
        return []
    # The previous pattern began with a literal space, so URLs directly
    # after a quote (src="http://...") were never matched, and its character
    # class allowed whitespace and '|', letting one match run across
    # unrelated text. Only non-capturing groups are used so findall()
    # returns the whole URL.
    regex = r"https?:[/.\w-]*\.(?:jpg|gif|png)"
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(re.findall(regex, html)))

def download(lst,filepath='img'):
    """Download every URL in ``lst`` into the directory ``filepath``.

    Each file is named after the last path segment of its URL. A failed
    request is printed and skipped; the remaining URLs are still processed.

    :param lst: iterable of image URLs
    :param filepath: target directory, created if it does not exist
    """
    os.makedirs(filepath, exist_ok=True)

    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"}
    filecounter = len(lst)
    # Number by position so the progress display stays correct even when
    # some downloads fail.
    for filenow, url in enumerate(lst, start=1):
        name = url.split('/')[-1]
        filename = os.path.join(filepath, name)
        print("Downloading {}/{} file name:{}".format(filenow, filecounter, name))
        try:
            img = requests.get(url, headers=headers, timeout=30)
            img.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        # Open the file only after the request succeeded, so a failed
        # download does not leave an empty file behind.
        with open(filename, 'wb') as f:
            f.write(img.content)
        print("{} saved".format(filename))


if __name__ == '__main__':
    # Ask for the page to scan and where to store the images.
    url = input('please input the image url:').strip()
    # An empty answer would make os.makedirs('') fail inside download();
    # fall back to download()'s own default folder instead.
    filepath = input('please input the download path:').strip() or 'img'
    html = getHTMLText(url)
    if html is None:
        # getHTMLText already printed the error; nothing to scan.
        raise SystemExit(1)
    lst = getURLList(html)
    download(lst,filepath)

需要 requests 库

运行效果

2626 次点击
所在节点    Python
10 条回复
soho176
2018-05-21 19:06:06 +08:00
urllib.urlretrieve 这个下图片不错 你试试
ucun
2018-05-21 19:17:06 +08:00
@soho176 #1 去做饭了,等吃完饭改一下。 小应用 urllib 更方便一些,不用装依赖库。
cy97cool
2018-05-21 21:05:45 +08:00
没考虑中文文件名图片吧
需要 urldecode 一下
另外要不要处理文件名中的特殊符号 可能不能作为文件名的 url?
liyiecho
2018-05-21 21:30:37 +08:00
下载之类的,我觉得还是调用 aria2 来下载比较好,aria2 可以保证下载内容的完整性。如果用 python 模块下载的话,当遇到网络问题或者报错的时候,下载的内容可能不是完整的了。
ucun
2018-05-21 21:34:40 +08:00
@soho176 #1 urlretrieve 下载图片坑多。图片模糊、打不开等等

```python
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib.request import Request,urlopen,urlretrieve
from urllib.error import HTTPError
import re
import os

def getHTMLText(url):
    """Fetch a page with urllib and return its body as text.

    :param url: address of the page to download
    :return: the decoded body, or None when the server answers with an
             HTTP error (the status code is printed)
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"}
    # Request and urlopen are imported by name at the top of the script;
    # the bare name ``urllib`` is never bound, so they are used directly.
    req = Request(url=url, headers=headers)
    try:
        with urlopen(req) as f:
            # Use the charset from the Content-Type header when there is
            # one; otherwise keep the previous UTF-8 assumption.
            charset = f.headers.get_content_charset() or 'utf-8'
            return f.read().decode(charset)
    except HTTPError as e:
        print('Error code:',e.code)
        return None

def getURLList(html):
    """Extract the image URLs found in a piece of HTML/text.

    :param html: page source to scan; None or an empty string yields []
    :return: list of http/https URLs ending in .jpg, .gif or .png,
             de-duplicated, in order of first appearance
    """
    if not html:
        return []
    # The previous pattern began with a literal space, so URLs directly
    # after a quote (src="http://...") were never matched, and its character
    # class allowed whitespace and '|', letting one match run across
    # unrelated text. Only non-capturing groups are used so findall()
    # returns the whole URL.
    regex = r"https?:[/.\w-]*\.(?:jpg|gif|png)"
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(re.findall(regex, html)))

def download(lst,filepath='img'):
    """Download every URL in ``lst`` into the directory ``filepath``.

    Each file is named after the last path segment of its URL. A URL that
    answers with an HTTP error is reported and skipped.

    :param lst: iterable of image URLs
    :param filepath: target directory, created if it does not exist
    """
    # These two helpers are not among the names imported at the top of the
    # script, so bring them in here.
    from urllib.request import build_opener, install_opener

    os.makedirs(filepath, exist_ok=True)

    # urlretrieve() has no headers parameter, so the User-Agent is set on a
    # globally installed opener. Installing it once is enough for all URLs.
    opener = build_opener()
    opener.addheaders = [("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")]
    install_opener(opener)

    for url in lst:
        filename = os.path.join(filepath, url.split('/')[-1])
        try:
            urlretrieve(url, filename)
        except HTTPError as e:
            # Same reporting style as getHTMLText; keep going with the rest.
            print('Error code:', e.code)


if __name__ == '__main__':
    # Ask for the page to scan and where to store the images.
    url = input('please input the image url:').strip()
    # An empty answer would make os.makedirs('') fail inside download();
    # fall back to download()'s own default folder instead.
    filepath = input('please input the download path:').strip() or 'img'
    html = getHTMLText(url)
    if html is None:
        # getHTMLText already printed the error; nothing to scan.
        raise SystemExit(1)
    lst = getURLList(html)
    download(lst,filepath)

```
ucun
2018-05-21 21:40:28 +08:00
@cy97cool #3 网页地址中出现中文文件名的情况很少吧,想加 encode 处理起来慢。至于特殊字符作为文件名,网页中都能解析,本地系统应该可以吧。现在遇到问题很少。等出现了再处理?
ucun
2018-05-21 21:42:06 +08:00
@liyiecho #4 只是下载图片 python 够用了,再安装 aria2 就麻烦了。你总不会想把一级棒全站图片下载下来吧?
soho176
2018-05-21 21:46:11 +08:00
@ucun 嗯 我也发现了看着图片不完整,但是打开了图却是全的,奇怪了。。
soho176
2018-05-21 21:48:36 +08:00
@ucun cl 的下载方式很恶心,你写个下载的利器吧
cy97cool
2018-05-22 10:45:29 +08:00
@ucun 确定文件名之前还是过滤一下为好

def safefilename(filename):
    """
    convert a string to a safe filename
    :param filename: a string, may be url or name
    :return: special chars replaced with _
    """
    # One translate() pass replaces every character that is commonly
    # rejected in file names (plus '$') with an underscore.
    unsafe = str.maketrans({ch: "_" for ch in "\\/:*?\"<>|$"})
    return filename.translate(unsafe)

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://www.v2ex.com/t/456582

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX