Scrapy: scrapy_redis
# Install
pip3 install scrapy_redis
# Source
https://github.com/rmax/scrapy-redis.git
# Docs
https://github.com/rmax/scrapy-redis

# Configuration reference: https://github.com/rmax/scrapy-redis/wiki/Usage
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep scheduler state in Redis so the spider can be paused and resumed
SCHEDULER_PERSIST = True
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Spider
from scrapy_redis.spiders import RedisSpider


class BaiduSpider(RedisSpider):
    """Spider that reads urls from a redis queue (myspider:baidu)."""
    name = 'baidu'
    redis_key = 'myspider:baidu'
    # allowed_domains = ['baidu.com']

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(BaiduSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        print(response.text)
        # return {
        #     'name': response.css('title::text').extract_first(),
        #     'url': response.url,
        # }

# CrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class FanqienovelSpider(RedisCrawlSpider):
    name = 'fanqienovel'
    redis_key = 'mycrawler:fanqienovel'
    # allowed_domains = ['fanqienovel.com']

    rules = (
        # Follow all links.
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(FanqienovelSpider, self).__init__(*args, **kwargs)

    def parse_page(self, response):
        print(response.text)
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }
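A RedisSpider idles until URLs appear on its redis_key, so running it is a two-step process: start the crawl first (e.g. scrapy crawl baidu -a domain=baidu.com, where -a supplies the domain kwarg consumed in __init__ above), then push start URLs into Redis to wake it up. A minimal seeding sketch with redis-py, assuming the redis_key from the example and Redis on localhost:6379:

import redis

# Connect to the same Redis instance the spider uses
# (REDIS_HOST / REDIS_PORT from settings.py).
r = redis.StrictRedis(host='localhost', port=6379)

# By default scrapy-redis reads start URLs from a plain Redis list,
# so a single lpush is enough to wake the waiting spider.
r.lpush('myspider:baidu', 'https://www.baidu.com')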
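Because SCHEDULER_PERSIST = True keeps the request queue and dupefilter in Redis, stopping the spider (Ctrl-C) pauses the crawl and restarting it resumes where it left off. The flip side is that a finished crawl will not re-run until that state is cleared. A sketch of a manual reset, assuming scrapy-redis's default key patterns '<spider>:requests' and '<spider>:dupefilter' applied to the 'baidu' spider above:

import redis

r = redis.StrictRedis(host='localhost', port=6379)

# Delete the persisted scheduler queue and the seen-request
# fingerprints so the next run starts from a clean slate.
r.delete('baidu:requests', 'baidu:dupefilter')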
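With RedisPipeline enabled in ITEM_PIPELINES, scraped items are serialized to JSON and pushed onto a per-spider Redis list, so a separate process can consume them independently of the crawl. A minimal consumer sketch, assuming the default '<spider>:items' key pattern and the item shape returned by parse_page above:

import json
import redis

r = redis.StrictRedis(host='localhost', port=6379)

# RedisPipeline rpushes JSON-encoded items, so lpop drains
# them in the order they were scraped.
while True:
    raw = r.lpop('fanqienovel:items')
    if raw is None:
        break  # list is empty, all items consumed
    item = json.loads(raw)
    print(item['name'], item['url'])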