我在使用scrapy抓取youtube上关于工业机器人视频的标题与链接,希望输出到json文件里面,
以下是我的代码:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy import log
from youtube.items import YoutubeItem
class YoutubeSpider(CrawlSpider):
name = "youtube"
allowed_domains = ["
youtube.com"]
start_urls = ['
http://www.youtube.com/results?search_query=industrial+robot+Assembling&page=%d' %n for n in range (1,2)]
rules = ()
def parse(self,response):
print "Start scrapping youtube videos info..."
hxs=HtmlXPathSelector(response)
bases = hxs.select('//*[@id="results"]//*[@id="search-results"]')
items=[]
for base in bases:
item = YoutubeItem()
t_title=base.select('//*[@id="search-results"]/li/div/h3/a//text()').extract()
item['title']=map(lambda s: s.strip(), t_title)
item['linkID'] = base.select('//*[@id="search-results"]/li/div/h3/a/@href').extract()
#t_desc=base.select('//*[@id="search-results"]/li/div[2]/div[2]/text()')
#t_desc="".join(base.select('//*[@id="search-results"]/li/div[2]/div[2]/text()').extract_unquoted())
#item['description']=t_desc
#item['thumbnail'] = base.select('//*[@id="search-results"]/li/div/a//img/@src').extract()
items.append(item)
return(items)
但是输出的结果是:
{'linkID': [u'/watch?v=iFKbpbe_9pw',
u'/watch?v=Fnlzl6sBOsA',
u'/watch?v=QbrqeJRy0hY',
u'/watch?v=u6-d5VkOB3I',
u'/watch?v=9--qNRr1VZI',
u'/watch?v=89prwGUZjM0',
u'/watch?v=txahbz9eswk',
u'/watch?v=52ptIgooZ64',
u'/watch?v=goNOPztC_qE',
u'/watch?v=daH5Xs11uQc',
u'/watch?v=V2V3Cu0nWvg',
u'/watch?v=TQwN-YeWXfs',
u'/watch?v=aWDAG3fz-ec',
u'/watch?v=Xmn06cpqngs',
u'/watch?v=iuaAEDrrVyg',
u'/watch?v=TG4yzjV4d8w&list=PLECC02EA2EAE0E159',
u'/watch?v=GCCW9O7IKhY',
u'/watch?v=O8HwEXDLug8',
u'/watch?v=yYCHUT79tFM',
u'/watch?v=82w_r2D1Ooo'],
'title': [u'Assembly Line Robot Arms on How Do They Do It',
u'Engine Assembly Robots - FANUC Robot Industrial Automation',
u'LR Mate 200iC USB Memory Stick Assembly Robot - FANUC Robot Industrial Automation',
u'R-2000iA Automotive Assembly Robots - FANUC Robotics Industrial Automation',
u'M-3iA Flexible Solar Collector Assembly Robot - FANUC Robotics Industrial Automation',
u'LR Mate 200iB Gas Can Assembly Robot - FANUC Robotics Industrial Automation',
u'ABB Robotics - Assembly of electrical sockets',
u'M-1iA Circuit Board Assembly Robots - FANUC Robotics Industrial Automation',
u'ABB Robotics - Assembly of digital camera',
u'M-1iA LED Lens Assembly Robots - FANUC Robotics Industrial Automation',
u'LR Mate Small Piston Engine Assembly Robots - FANUC Robotics Industrial Automation',
u'M-1iA Keyboard Assembly Robots - FANUC Robotics Industrial Automation',
u'M-1iA Ball Bearing Assembly Robot - FANUC Robotics Industrial Automation',
u'M-1iA Intelligent Gear Assembly Robot - FANUC Robotics Industrial Automation',
u'LR Mate 200iC Small Part Assembly Robots - FANUC Robotics Industrial Automation',
u'Assembly Robots - FANUC Robotics Application Videos',
u'M-1iA/LR Mate 200iC Solar Panel Assembly Robots - FANUC Robotics Industrial Automation',
u'ABB Robotics - Assembly',
u'Yaskawa Motoman SDA10 Robot Assembly Video',
u'Toyota Camry Hybrid Factory Robots']}
而我想要的是linkID与title一一对应起来,这是哪里有问题吗?
V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。
V2EX is a community of developers, designers and creative people.