#!/usr/bin/python
#_*_ coding:utf8 _*_
import requests
from requests.exceptions import RequestException
import re
from multiprocessing import Pool
def get_one_page(url):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Returns:
        The decoded response text on HTTP 200, otherwise ``None``
        (non-200 status, or any network-level failure).
    """
    try:
        # A timeout prevents a stalled connection from hanging the
        # worker process forever; 10s is generous for a single page.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Best-effort fetch: callers treat None as "page unavailable".
        return None
def parse_one_page(html):
    """Yield one dict per book entry scraped out of *html*.

    The page is assumed to contain ``<li>`` items carrying cover, title,
    author, year, publisher and abstract markup (Douban book listing
    layout — confirm against the live page if the regex stops matching).

    Yields:
        dict with Chinese keys: 地址 (link), 书名 (title), 封面 (cover
        image URL), 作者 (author), 出版时间 (year), 出版社 (publisher),
        详细 (abstract). Text fields are whitespace-stripped.
    """
    pattern = re.compile('<li.*?cover.*?href="(.*?)" title="(.*?)">.*?img src="(.*?)"'
        '.*?author">(.*?)</div>.*?year">(.*?)</span>.*?publisher">(.*?)'
        '</span>.*?abstract">(.*?)</p>.*?</li>', re.S)
    for item in re.findall(pattern, html):
        # item == (href, title, img, author, year, publisher, abstract)
        yield {
            '地址': item[0],
            '书名': item[1],
            '封面': item[2],
            # .strip() removes the surrounding whitespace the page markup
            # carries; the original's trailing [0:] slice was a no-op.
            '作者': item[3].strip(),
            '出版时间': item[4].strip(),
            '出版社': item[5].strip(),
            '详细': item[6].strip()
        }
def main(offset=0):
    """Fetch the Douban books page and print every parsed book entry.

    Args:
        offset: page offset handed in by ``pool.map`` (0, 10, ..., 90).
            Accepted so the fan-out call is valid; currently the URL does
            not use it. TODO(review): wire *offset* into the request once
            the site's pagination query parameter is confirmed.
    """
    url = 'https://book.douban.com/'
    html = get_one_page(url)
    if html is None:
        # Network failure or non-200 — nothing to parse for this offset.
        return
    for item in parse_one_page(html):
        print(item)
if __name__ == '__main__':
    # Fan the ten page offsets (0, 10, ..., 90) out across a process
    # pool; each worker fetches and prints one page independently.
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()