Scrapy: scrapy_redis
# Install
pip3 install scrapy_redis
# Source
https://github.com/rmax/scrapy-redis.git
# Docs
https://github.com/rmax/scrapy-redis

# Configuration reference: https://github.com/rmax/scrapy-redis/wiki/Usage
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Persist the request queue and dupefilter in Redis so the spider can be paused and resumed
SCHEDULER_PERSIST = True
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Spider
from scrapy_redis.spiders import RedisSpider


class BaiduSpider(RedisSpider):
    """Spider that reads urls from the redis list 'myspider:baidu'."""
    name = 'baidu'
    redis_key = 'myspider:baidu'
    # allowed_domains = ['baidu.com']

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list, e.g. -a domain=baidu.com,example.com
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(BaiduSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        print(response.text)
        # return {
        #     'name': response.css('title::text').extract_first(),
        #     'url': response.url,
        # }

# CrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class FanqienovelSpider(RedisCrawlSpider):
    name = 'fanqienovel'
    redis_key = 'mycrawler:fanqienovel'
    # allowed_domains = ['fanqienovel.com']

    rules = (
        # Follow all links
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list, e.g. -a domain=fanqienovel.com
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(FanqienovelSpider, self).__init__(*args, **kwargs)

    def parse_page(self, response):
        print(response.text)
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }
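Both spiders start idle and wait for URLs to appear in the Redis list named by their redis_key; crawling begins once something is pushed to that list. A minimal seeding sketch with redis-py, assuming a local Redis on the host/port configured above and the 'myspider:baidu' key from the example spider:

# Push a start URL into the RedisSpider's queue.
# Assumes Redis at localhost:6379 (matching REDIS_HOST / REDIS_PORT)
# and the key 'myspider:baidu' (matching the spider's redis_key).
import redis

r = redis.Redis(host='localhost', port=6379)
r.lpush('myspider:baidu', 'https://www.baidu.com')

The same can be done from the shell with redis-cli lpush. Because the scheduler and dupefilter live in Redis, several scrapy crawl processes (on one or many machines) can consume the same key and share the crawl.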