#encoding=utf8
import requests
import re
def get_page_url(num):
    """Return the album slugs linked from listing page ``num``.

    The first listing page is ``index.html``; every other page is
    ``index_<num>.html``. A slug is the part of an album link between
    ``/mnmm/`` and ``.html``.

    Args:
        num: Listing page number (1 selects ``index.html``).

    Returns:
        A list of slug strings in the order they appear on the page;
        empty when no album link matches.
    """
    # Compare by value: ``is`` tests object identity, which only happens to
    # work for small ints on CPython and triggers a SyntaxWarning on 3.8+.
    if num == 1:
        url = "http://www.souutu.com/mnmm/index.html"
    else:
        url = "http://www.souutu.com/mnmm/index_" + str(num) + ".html"
    # ``content`` is bytes; a str pattern cannot be matched against bytes on
    # Python 3, so decode first.
    # NOTE(review): assumes the site serves UTF-8 -- confirm.
    html = requests.get(url).content.decode("utf-8", "replace")
    return re.findall(
        r'<a href="http://www.souutu.com/mnmm/([^"]+).html" target="_blank"><img lazy',
        html,
    )
def get_pic_number(page_url):
    """Return the number of pictures an album page reports.

    The count is read from the ``共<N>张`` ("N pictures in total") counter
    on the album's first page.

    Args:
        page_url: Album slug, as returned by ``get_page_url``.

    Returns:
        The picture count as an ``int``.

    Raises:
        ValueError: If the page does not contain the picture counter.
    """
    url = "http://www.souutu.com/mnmm/" + page_url + ".html"
    # Decode before matching: the pattern is a str containing non-ASCII
    # characters, and ``content`` is bytes on Python 3.
    # NOTE(review): assumes the site serves UTF-8 -- confirm.
    html = requests.get(url).content.decode("utf-8", "replace")
    match = re.search(r"共(\d+)张", html)
    if match is None:
        # Fail with a message that names the page instead of an opaque
        # AttributeError on ``None.group``.
        raise ValueError("picture count not found on " + url)
    return int(match.group(1))
def get_pic_url(page_url, number):
    """Collect the full-size image URL from each page of an album.

    The first picture is on ``<slug>.html``; picture ``i`` (for ``i > 1``)
    is on ``<slug>_<i>.html``. On each page the ``src`` of the
    ``<img id="bigImg">`` tag is extracted.

    Args:
        page_url: Album slug, as returned by ``get_page_url``.
        number: How many pictures (pages) the album has.

    Returns:
        A list with one image URL per page, in page order; empty when
        ``number`` is less than 1.

    Raises:
        ValueError: If a page has no ``bigImg`` image tag.
    """
    base = "http://www.souutu.com/mnmm/" + page_url
    pic_urls = []
    for i in range(1, number + 1):
        # Compare by value, not identity (``is`` on ints is unreliable).
        if i == 1:
            url = base + ".html"
        else:
            url = base + "_" + str(i) + ".html"
        # ``content`` is bytes; decode so the str pattern can be applied.
        # NOTE(review): assumes the site serves UTF-8 -- confirm.
        html = requests.get(url).content.decode("utf-8", "replace")
        match = re.search(r'<img id="bigImg" src="([^"]+)"', html)
        if match is None:
            raise ValueError("bigImg tag not found on " + url)
        pic_urls.append(match.group(1))
    return pic_urls
# Walk listing pages 1 through 98; for every album found, look up its
# picture count and then gather the image URL of each picture.
for i in range(1, 99):
    for page_url in get_page_url(i):
        pic_urls = get_pic_url(page_url, get_pic_number(page_url))
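# Suggestion for the original poster: learn regular expressions -- they are
# much more convenient than bs4 and apply to a wider range of tasks.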