#!/usr/bin/env python3
import re
import os
import urllib.request
import urllib.error
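
# Pretend to be a desktop browser so Douban serves the pages normally.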
headers = ('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]

def obtain_piclist(link):
    # Create the album folder and download every photo in it.
    page_info = opener.open(link).read()
    page_info = str(page_info, encoding='utf-8')
    title_match = r'title>mochizukikaoru 的相册-([\s\S]+?)</title>'
    title = re.compile(title_match).findall(page_info)
    title = title[0]
    result_dir = os.path.join('/Users/tnb/Pictures/Douban/', str(title).replace('/', '|'))
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    pic_rule = r'<img width="201" src="(https://img3.doubanio.com/view/photo/m/public/.+?\.jpg)" />'
    pic_add = re.compile(pic_rule).findall(page_info)
    x = 1
    for pic_link in pic_add:
        pic_name = result_dir + '/_' + str(x) + '.jpg'
        try:
            urllib.request.urlretrieve(pic_link, pic_name)
            urllib.request.urlcleanup()
        except urllib.error.URLError as e:
            # Report the failure instead of silently skipping index numbers.
            if hasattr(e, 'code'):
                print('HTTP error', e.code, 'for', pic_link)
            if hasattr(e, 'reason'):
                print('failed to reach', pic_link, ':', e.reason)
        x += 1

def obtain_page_pic(url):
    # Collect the album links from one listing page.
    page_url = str(opener.open(url).read())
    rule = 'div class="wr".+?class="clear"'
    page = str(re.compile(rule).findall(page_url))
    rule_2 = r'href="(https://www.douban.com/photos/album/.+?/)"'
    page_2 = re.compile(rule_2).findall(page)
    for link in page_2:
        obtain_piclist(link)
        print(link)

if __name__ == '__main__':
    # Walk the album listing pages (18 albums per page) and mirror each album.
    for i in range(0, 99):
        url = 'https://www.douban.com/people/mochizukikaoru/photos?start=' + str(i * 18)
        obtain_page_pic(url)
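
One refinement worth considering, sketched under the assumption that obtain_page_pic is changed to end with `return len(page_2)` (a hypothetical change, not in the script above): stop paging as soon as a listing page yields no albums, instead of always requesting 99 pages.

if __name__ == '__main__':
    i = 0
    while True:
        url = 'https://www.douban.com/people/mochizukikaoru/photos?start=' + str(i * 18)
        if obtain_page_pic(url) == 0:
            # An empty listing page means we have walked past the last album.
            break
        i += 1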
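Likewise, a short pause between downloads keeps the crawler from hammering Douban and getting rate-limited; a minimal sketch, assuming a half-second delay is acceptable (polite_retrieve is a hypothetical helper, not part of the original):

import time
import urllib.request

def polite_retrieve(pic_link, pic_name, delay=0.5):
    # Download one picture with the same two urllib calls used above,
    # then pause before the next request.
    urllib.request.urlretrieve(pic_link, pic_name)
    urllib.request.urlcleanup()
    time.sleep(delay)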