scrapy-redis: successfully storing scraped data to the database, the four code files to set up (dmoz example)
The scrapy-redis extraction spider:
# -*- coding: utf-8 -*-
import scrapy
import re
import example.items                              # changed: reference the project's items module
from scrapy.linkextractors import LinkExtractor   # link extraction
from scrapy.spiders import CrawlSpider, Rule      # URL extraction from pages, plus the crawl rules


class TianyaSpider(CrawlSpider):
    name = 'tianya'
    allowed_domains = ['tianya.cn']
    start_urls = ['http://bbs.tianya.cn/post-140-393973-1.shtml']

    rules = (
        Rule(LinkExtractor(".*?shtml"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        print(response.url)
        pagedata = response.body.decode("gbk", "ignore")
        regex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)  # pre-compile the regex
        emaillist = regex.findall(pagedata)  # grab every e-mail address on the page
        for mail in emaillist:
            myitem = example.items.EmailSpiderItem()  # changed: use the EmailSpiderItem class from example.items
            myitem["email"] = mail
            myitem["url"] = response.url
            yield myitem
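As a quick standalone check of the e-mail regex used in parse_item, the snippet below (the sample text is made up purely for illustration) shows what findall returns:

import re

# Same pre-compiled pattern as in parse_item above.
regex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)

# Hypothetical page fragment, for illustration only.
sample = "contact foo.bar@example.com or admin@test.org.cn for details"
print(regex.findall(sample))
# ['foo.bar@example.com', 'admin@test.org.cn']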
scrapy-redis database storage, items.py: the code must change the original Class(Item) definition to Class(scrapy.Item) and add import scrapy; this fixes the earlier problem of nothing being stored to the database.
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
import scrapy


class EmailSpiderItem(scrapy.Item):   # changed: subclass scrapy.Item instead of Item
    email = scrapy.Field()            # changed: the email data to store in the database
    url = scrapy.Field()              # changed: the url data to store in the database
    crawled = scrapy.Field()          # when it was crawled
    spider = scrapy.Field()           # which spider crawled it


class ExampleItem(Item):
    name = Field()
    description = Field()
    link = Field()
    crawled = Field()
    spider = Field()
    url = Field()


class ExampleLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
The scrapy-redis pipelines:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
from datetime import datetime


class ExamplePipeline(object):
    def __init__(self):
        # changed: also write items to a local file
        self.file = open("1.txt", "wb")

    def process_item(self, item, spider):
        item["crawled"] = datetime.utcnow()         # crawl time
        item["spider"] = spider.name                # which spider crawled it
        self.file.write(str(item).encode("utf-8"))  # changed: write the item to the file
        self.file.flush()                           # changed: flush so the file updates in real time
        return item

    def __del__(self):
        self.file.close()                           # changed: close the file
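Closing the file in __del__ works, but Scrapy pipelines also provide open_spider/close_spider hooks that make the file's lifetime explicit and are called reliably when the spider starts and stops. A minimal alternative sketch with the same behaviour:

from datetime import datetime


class ExamplePipeline(object):
    """Variant that ties the output file to the spider's lifecycle."""

    def open_spider(self, spider):
        # Called once when the spider starts.
        self.file = open("1.txt", "wb")

    def process_item(self, item, spider):
        item["crawled"] = datetime.utcnow()
        item["spider"] = spider.name
        self.file.write(str(item).encode("utf-8"))
        self.file.flush()
        return item

    def close_spider(self, spider):
        # Called once when the spider closes, so the file is always released.
        self.file.close()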
The scrapy-redis settings:
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

REDIS_HOST = "Alibaba Cloud public IP"   # changed: add the server IP for the remote connection
REDIS_PORT = 6379                        # changed: port for the remote connection

# Set the password
REDIS_PARAMS = {                         # changed: password for the remote connection
    'password': 'your password',
}

# Introduce an artificial delay to make use of parallelism and to speed up
# the crawl.
DOWNLOAD_DELAY = 1
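Before starting the crawl, it is worth confirming that the machine can actually reach the remote Redis instance with these exact settings. A minimal sketch using the redis-py client (assuming it is installed; substitute your real host and password):

import redis

# Use the same values as REDIS_HOST / REDIS_PORT / REDIS_PARAMS above.
r = redis.Redis(host="Alibaba Cloud public IP", port=6379, password="your password")
print(r.ping())  # True means the host, port and password are all accepted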
After completing the settings above, the crawled data is successfully collected into the database.
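With this configuration, scrapy_redis.pipelines.RedisPipeline serializes every item and pushes it onto a Redis list; by default the key is "<spider name>:items", so here it would be tianya:items. A sketch of how to peek at the stored items with redis-py (key names assume the defaults):

import json
import redis

r = redis.Redis(host="Alibaba Cloud public IP", port=6379, password="your password")

print(r.llen("tianya:items"))  # number of items stored so far

# Look at the first few serialized items.
for raw in r.lrange("tianya:items", 0, 4):
    item = json.loads(raw)
    print(item["email"], item["url"])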
In this variant the initial URL lives in the spider code itself; the Redis database then automatically hands out URLs, de-duplicates them, hands out the next batch, and collects the resulting data. Once the crawl is started, there is no need to push start URLs into Redis by hand.
(Figure: the run-time structure.)
(Figure: the three ways of running scrapy-redis.)
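The approach in this post keeps start_urls in the spider code, so nothing has to be pushed into Redis manually. scrapy-redis also provides spider base classes (RedisSpider and RedisCrawlSpider in scrapy_redis.spiders) that instead read their start URLs from a Redis list, in which case the first URL does have to be pushed in (for example with lpush in redis-cli). A minimal sketch of that style; the class name and redis_key below are made up for illustration:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class TianyaRedisSpider(RedisCrawlSpider):
    """Takes its start URLs from the Redis list named by redis_key."""
    name = 'tianya_redis'
    allowed_domains = ['tianya.cn']
    redis_key = 'tianya_redis:start_urls'  # seed with: lpush tianya_redis:start_urls <url>

    rules = (
        Rule(LinkExtractor(".*?shtml"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # The same e-mail extraction logic as in the CrawlSpider version goes here.
        pass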