scrapy_redis setup — example spider that pulls its start URLs from a Redis queue

class MyCrawler(RedisCrawlSpider):
    """Crawl spider that reads its start URLs from a Redis list.

    Push URLs onto the ``mycrawler:start_urls`` Redis key to feed the
    spider; every page reached is parsed by :meth:`parse_page` and all
    links on it are followed.
    """
    name = 'mycrawler_redis'
    # Redis key the spider BLPOPs start URLs from (scrapy-redis convention).
    redis_key = 'mycrawler:start_urls'

    rules = (
        # Follow every link on every page and hand each response to parse_page.
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Build the spider, deriving ``allowed_domains`` dynamically.

        Accepts an optional ``domain`` keyword argument: a comma-separated
        list of domains the spider is restricted to (empty means no
        restriction).  Example:

            scrapy crawl mycrawler_redis -a domain=example.com,example.org
        """
        domain = kwargs.pop('domain', '')
        # BUG FIX: on Python 3, filter() returns a one-shot iterator.
        # Scrapy's offsite middleware iterates allowed_domains repeatedly,
        # so the iterator would be exhausted after the first use and the
        # domain restriction silently lost.  Materialize it as a list.
        # (filter(None, ...) also drops the empty string produced when
        # ``domain`` is '' or has trailing commas.)
        self.allowed_domains = list(filter(None, domain.split(',')))
        # Must run after allowed_domains is set so the base class (and the
        # offsite middleware it configures) sees the final value.
        super(MyCrawler, self).__init__(*args, **kwargs)

    def parse_page(self, response):
        """Extract a minimal record (page title and URL) from a response."""
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }
Posted 2019-06-08 14:26 — tags: C, Python, Linux, Java (253 reads, 0 comments)