好几个月前写的了,写得比较挫。
并没有写成爬取一个博客的所有内容,本来是用在网站上的,如果要爬所有内容,会让用户等待太久。
# -*- coding=utf-8 -*-
from threading import Thread
import Queue
import requests
import re
import os
import sys
import time
# URL template for one page of a blog's API feed: fill in the blog name with
# `%`, then append the numeric `start` offset of the page.
api_url = 'http://%s.tumblr.com/api/read?&num=50&start='

# Module-level queue holding the page URLs that are waiting to be fetched.
UQueue = Queue.Queue()
def getpost(uid, queue):
    """Enqueue the API page URLs that cover every post of one blog.

    Fetches the first API page of the blog to read the total number of
    posts, then puts one URL per 50-post page onto ``queue``.

    Args:
        uid: Blog name, substituted as the subdomain of the API URLs.
        queue: Queue-like object; each page URL is passed to ``queue.put``.

    Raises:
        IndexError: If the response contains no post total. The exception
            type matches what a failed lookup raised before, but the
            message now names the blog.
    """
    url = 'http://%s.tumblr.com/api/read?&num=50' % uid
    page = requests.get(url).content
    found = re.search('<posts start="0" total="(.*?)">', page)
    if found is None:
        raise IndexError('no post total found in API response for %r' % uid)
    total = int(found.group(1))
    page_url = api_url % uid
    # One URL per 50 posts. Stepping the range by the page size means there
    # is no fixed cap on how many pages can be enqueued.
    for start in range(0, total, 50):
        queue.put(page_url + str(start))
# Picture URLs: the text between '<photo-url max-width="1280">' and
# '</photo-url>', i.e. the maximum-size variant of each picture.
extractpicre = re.compile(
    r'(?<=<photo-url max-width="1280">).+?(?=</photo-url>)',
    flags=re.DOTALL
)

# Video ids: whatever sits between '/tumblr_' and '" type="video/mp4"'.
extractvideore = re.compile('/tumblr_(.*?)" type="video/mp4"')

# Download URL template, filled with an id captured by extractvideore.
vhead = 'https://vt.tumblr.com/tumblr_%s.mp4'

# Module-level result lists that collect the links found while crawling.
pic_links = []
video_links = []
class Consumer(Thread):
    """Worker thread that fetches queued API pages and collects media links.

    Each page URL taken from the queue is downloaded; video and picture
    links found in it are appended to the module-level ``video_links`` and
    ``pic_links`` lists.
    """

    def __init__(self, l_queue):
        """Create a worker that reads page URLs from ``l_queue``."""
        super(Consumer, self).__init__()
        self.queue = l_queue

    def run(self):
        """Process page URLs until the queue is drained."""
        session = requests.Session()
        while True:
            # Take work without blocking: with several workers, checking
            # empty() and then calling a blocking get() can leave a thread
            # waiting forever on a queue another worker has just emptied.
            try:
                link = self.queue.get_nowait()
            except Queue.Empty:
                break
            print('start parse post: ' + link)
            try:
                content = session.get(link).content
                videos = extractvideore.findall(content)
                video_links.extend([vhead % v for v in videos])
                pic_links.extend(extractpicre.findall(content))
            except Exception:
                # Best effort: report the failed page and carry on, without
                # swallowing KeyboardInterrupt/SystemExit.
                print('url: %s parse failed\n' % link)
def main():
    """Run up to ten Consumer threads over UQueue and wait for them all."""
    # No point starting more workers than there are queued pages.
    worker_count = min(10, UQueue.qsize())
    task = [Consumer(UQueue) for _ in range(worker_count)]
    for t in task:
        t.start()
    # join() has to be called (not just referenced) to block until each
    # worker finishes; this also avoids polling is_alive() in a busy loop.
    for t in task:
        t.join()
def write():
    """Dump the collected links to ``pictures.txt`` and ``videos.txt``.

    One link per line. The ``/480`` fragment is stripped from every video
    link before it is written; picture links are written unchanged.
    """
    cleaned_videos = []
    for link in video_links:
        cleaned_videos.append(link.replace('/480', ''))

    with open('pictures.txt', 'w') as out:
        out.writelines('%s\n' % link for link in pic_links)

    with open('videos.txt', 'w') as out:
        out.writelines('%s\n' % link for link in cleaned_videos)
if __name__ == '__main__':
    # The blog name comes from the first command-line argument when one is
    # given; otherwise fall back to the built-in default blog.
    if len(sys.argv) > 1 and sys.argv[1].strip():
        name = sys.argv[1].strip()
    else:
        name = 'mzcyx2011'
    getpost(name, UQueue)
    main()
    write()
这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。
V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。
V2EX is a community of developers, designers and creative people.