# Standard library
import re
from concurrent.futures import ThreadPoolExecutor

# Third-party
import requests
from bs4 import BeautifulSoup
from requests_futures.sessions import FuturesSession

# Shared session backed by a 20-worker thread pool so page fetches overlap.
session = FuturesSession(executor=ThreadPoolExecutor(max_workers=20))
def fetchPages(first_page, page_size=20):
    """Discover every paginated listing URL for a Douban tag page.

    Fetches *first_page*, reads the paginator's second-to-last link to find
    the highest ``start=`` offset, and returns the full list of page URLs.

    Args:
        first_page: URL of the first listing page (assumed to carry no
            query string — ``?start=`` is appended directly).
        page_size: Step between ``start`` offsets; Douban lists 20 items
            per page, so the default preserves the original behavior.

    Returns:
        list[str]: ``first_page?start=0``, ``?start=page_size``, ... up to
        and including the last offset advertised by the paginator.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    content = requests.get(first_page, headers=headers).text
    soup = BeautifulSoup(content, "html.parser")
    # The paginator's last <a> is "next page"; the one before it carries the
    # highest start offset in its href.
    a_tags_final = soup.find("div", {"class": "paginator"}).find_all("a")[-2].get("href")
    # r"start=(\d+)" instead of the original "start=(.*)&": a greedy (.*)
    # could swallow extra query parameters (e.g. "start=980&type=T&...")
    # and make int() raise ValueError.
    page_max = int(re.findall(r"start=(\d+)", a_tags_final)[0])
    return [first_page + "?start=" + str(k)
            for k in range(0, page_max + page_size, page_size)]
def fetchBooks(pages):
    """Fetch all listing pages concurrently and collect the book URLs.

    Each page in *pages* is requested through the shared ``session``; the
    subject-item entries on every page are parsed for their detail-page
    links, which are returned as one flat list (page order preserved).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    def extract_urls(html):
        # Each <li class="subject-item"> holds one book; its detail link is
        # the <a> inside the info block's <h2>.
        page_soup = BeautifulSoup(html, "html.parser")
        return [item.find("div", {"class": "info"}).find("h2").find("a").get("href")
                for item in page_soup.find_all("li", {"class": "subject-item"})]

    # Kick off every request first so they run concurrently, then drain.
    futures = [session.get(page, headers=headers) for page in pages]
    all_urls = []
    for future in futures:
        all_urls.extend(extract_urls(future.result().text))
    return all_urls
def fetchBookInfo(books):
    """Fetch every book detail page concurrently and return its info div.

    Returns a list of BeautifulSoup Tag objects — the ``<div id="info">``
    of each detail page — in the same order as *books*. Entries may be
    None when a page lacks that div.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    def parse_info(html):
        # On Douban the "#info" div carries author/publisher/ISBN metadata.
        return BeautifulSoup(html, "html.parser").find("div", {"id": "info"})

    # Issue all requests up front so the thread pool overlaps them.
    pending = [session.get(url, headers=headers) for url in books]
    return [parse_info(future.result().text) for future in pending]
if __name__ == '__main__':
    # Fix: the original split this URL literal across two physical lines,
    # which is a SyntaxError (unterminated string) — it must be one string.
    pages = fetchPages("https://book.douban.com/tag/%E7%BC%96%E7%A8%8B")
    books = fetchBooks(pages)
    data = fetchBookInfo(books)