@binux 是的,多谢提醒。昨天测试没问题,是很赞的工具。不过有一个疑问,是关于 every 和 age 的使用(我看相关文档不是很多)。现在我用 itag,每次修改完后强制重新 load 一次。但是有个问题:这之后就一直不自动运行了,经常输出:
pyspider task done :on_start data:,on_start
是不是我的配置有问题?
from pyspider.libs.base_handler import *
import re
import datetime
class Handler(BaseHandler):
    """pyspider handler that walks road listing pages and collects messages.

    Flow, as visible in this class:
      * ``on_start`` schedules the site root with ``index_page`` as callback.
      * ``index_page`` follows every absolute link matching the ``gonglu``
        pattern and hands it to ``detail_page`` together with the link text.
      * ``detail_page`` follows matching sub-road links (recursively, with
        itself as callback) and returns a result dict for the current page.
    """

    crawl_config = {
        "itag": "v0.0.6"
    }

    # NOTE(review): ``age`` below equals the ``@every`` period (both 3 minutes).
    # The pyspider docs warn that a crawl request is discarded while the
    # previous fetch is still within ``age``; if the root page is being
    # skipped on the periodic run, try an ``age`` shorter than the period.
    @every(minutes=3)
    @config(age=3*60)
    def on_start(self):
        """Schedule the entry page; decorated to run every 3 minutes."""
        self.crawl('http://www..com.cn/', callback=self.index_page)

    # ``@every`` was removed from this method: it is a page callback that
    # needs a fetched response, not a periodic entry point like ``on_start``.
    @config(age=3*60)
    def index_page(self, response):
        """Queue every road link found on the index page.

        Only absolute links whose URL matches the ``gonglu`` pattern are
        followed. The link text is forwarded to ``detail_page`` through
        ``save`` under the key ``main_road_name``.
        """
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if re.match(r"http://www.xxxx.com.cn/gonglu/[^_]*/", href, re.U):
                self.crawl(href, callback=self.detail_page,
                           save={'main_road_name': each.text()},
                           retries=10, auto_recrawl=True)

    def detail_page(self, response):
        """Follow sub-road links and return the data found on this page.

        Returns a dict with the keys ``main_road_name``, ``city_road_name``,
        ``messages`` and ``road_img_url``.
        """
        road_name = response.save.get('main_road_name', '')

        # When several images match, the last one wins (same as the
        # original loop); stays '' when none match.
        road_img_url = ""
        for img_item in response.doc('td.hcenter > img').items():
            road_img_url = img_item.attr.src

        # Sub-road pages are processed by this same callback; the parent
        # road name is carried along in ``save``.
        for each in response.doc('.roadlineB > li > a').items():
            href = each.attr.href
            if re.match(r"http://www.xxx.com.cn/gonglu/gaosu_[^cs]\w*/", href, re.U):
                self.crawl(href, callback=self.detail_page,
                           save={'main_road_name': road_name,
                                 'city_road_name': each.text()},
                           retries=10, auto_recrawl=True)

        # Keep only the entries for which process_message returns a truthy value.
        processed = (
            self.process_message(each.text())
            for each in response.doc(".LKlistleftE > p").items()
        )
        messages = [info for info in processed if info]

        city_road_name = response.save.get('city_road_name', '')
        return dict(main_road_name=road_name, city_road_name=city_road_name,
                    messages=messages, road_img_url=road_img_url)

    def process_message(self, message):
        """Wrap a raw message string in a dict under the key ``content``."""
        return dict(content=message)
这是我的代码,不知道是不是有问题。
谢谢