scrapy-redis storing to the database, the four pieces of setup code, mycrawler_redis
The scrapy-redis spider:
# -*- coding: utf-8 -*-
import scrapy
import re
import example.items                              # the project's items module
from scrapy.linkextractors import LinkExtractor   # link extraction
from scrapy.spiders import CrawlSpider, Rule      # URL-extraction rules
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy_redis.spiders import RedisMixin


class TianyaSpider(RedisCrawlSpider):
    name = 'tianya_redis'                    # changed: the spider name
    redis_key = 'tianya_redis:start_urls'    # changed: the Redis key holding the start URLs

    rules = (
        # Follow every link whose URL matches .*?shtml
        Rule(LinkExtractor(".*?shtml"), callback="parse_item", follow=True),
    )

    def set_crawler(self, crawler):              # changed: added this hook
        CrawlSpider.set_crawler(self, crawler)   # keep the default CrawlSpider wiring
        RedisMixin.setup_redis(self)             # let Redis schedule the URLs centrally

    def parse_item(self, response):
        print(response.url)
        pagedata = response.body.decode("gbk", "ignore")
        # Pre-compile the e-mail regex
        regex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        emaillist = regex.findall(pagedata)      # extract every e-mail address on the page
        for mail in emaillist:
            myitem = example.items.EmailSpiderItem()
            myitem["email"] = mail
            myitem["url"] = response.url
            yield myitem
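As a quick sanity check of the e-mail regex used in parse_item, here is a minimal standalone snippet (the sample string is made up for illustration):

import re

# Same pattern as in parse_item above.
regex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)

sample = "Contact foo.bar@example.com or admin@test-site.org.cn for details"
print(regex.findall(sample))
# ['foo.bar@example.com', 'admin@test-site.org.cn']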
The scrapy-redis items:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
import scrapy


class EmailSpiderItem(scrapy.Item):   # changed: Item -> scrapy.Item
    email = scrapy.Field()            # changed: the scraped e-mail address
    url = scrapy.Field()              # changed: the page it came from
    crawled = scrapy.Field()          # when it was crawled
    spider = scrapy.Field()           # which spider crawled it


class ExampleItem(Item):
    name = Field()
    description = Field()
    link = Field()
    crawled = Field()
    spider = Field()
    url = Field()


class ExampleLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
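A quick sketch of how the ExampleLoader processors behave, assuming the classes above are importable: MapCompose strips each input value, TakeFirst keeps only the first, and Join concatenates the description values with spaces.

from example.items import ExampleLoader

loader = ExampleLoader()
loader.add_value("name", ["  Tianya  ", "other"])
loader.add_value("description", ["line one", "line two"])
item = loader.load_item()
print(item["name"])         # 'Tianya'            (stripped, first value kept)
print(item["description"])  # 'line one line two' (joined)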
The scrapy-redis pipelines:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

from datetime import datetime


class ExamplePipeline(object):
    def __init__(self):               # changed: open the output file
        self.file = open("1.txt", "w")

    def __del__(self):                # changed: close the file on teardown
        self.file.close()

    def process_item(self, item, spider):
        item["crawled"] = datetime.utcnow()   # when it was crawled
        item["spider"] = spider.name          # which spider crawled it
        self.file.write(str(item))            # changed: write the item out
        self.file.flush()                     # changed: flush so it lands immediately
        print(item)                           # changed: print for debugging
        return item
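Note that relying on __del__ to close the file works here but runs at no guaranteed point in time. A minimal alternative sketch using Scrapy's pipeline lifecycle hooks, which fire when the spider opens and closes:

class ExamplePipeline(object):
    def open_spider(self, spider):    # called once when the spider starts
        self.file = open("1.txt", "w")

    def close_spider(self, spider):   # called once when the spider finishes
        self.file.close()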
The scrapy-redis settings:
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

REDIS_HOST = "your Aliyun public IP"   # changed: host needed for a remote connection
REDIS_PORT = 6379                      # changed: port for the remote connection
# Set the password
REDIS_PARAMS = {                       # changed: password for the remote connection
    'password': 'your database password',
}

# Introduce an artificial delay to make use of parallelism. to speed up the
# crawl.
DOWNLOAD_DELAY = 1
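scrapy-redis also accepts a single connection URL in place of the separate host/port/password settings; a sketch, with the host and password as placeholders (when set, REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT):

# Equivalent single-setting form; substitute your own host and password.
REDIS_URL = 'redis://:your-database-password@your-aliyun-public-ip:6379'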
Command to push the start URL into start_urls on the client:
lpush tianya_redis:start_urls http://bbs.tianya.cn/post-140-393973-1.shtml
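The same seeding can be done from Python with the redis-py client; a minimal sketch, assuming the host and password match the ones in settings.py:

import redis

# Placeholders: use the same host/port/password as in settings.py.
r = redis.StrictRedis(host="your-aliyun-public-ip", port=6379,
                      password="your-database-password")
r.lpush("tianya_redis:start_urls",
        "http://bbs.tianya.cn/post-140-393973-1.shtml")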
Testing shows the data is crawled successfully and saved into the items key in the Redis database.
This spider class suits open-ended (unbounded) crawling: push a single URL into start_urls and the Redis database automatically hands out URLs to crawl, then collects the newly extracted URLs, deduplicates them, and assigns them back out, repeating the cycle.
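By default, RedisPipeline serializes each item to JSON and pushes it onto the key '<spider name>:items', so the stored results can be read back with a short script like this (host and password are placeholders again):

import json
import redis

r = redis.StrictRedis(host="your-aliyun-public-ip", port=6379,
                      password="your-database-password")
for raw in r.lrange("tianya_redis:items", 0, -1):
    item = json.loads(raw)
    print(item["email"], item["url"])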