scrapy-redis存储数据库,四种设置代码,myspider_redis
scrapy-redis的spider:
# -*- coding: utf-8 -*-
import scrapy
import example.items
from scrapy_redis.spiders import RedisSpider


class TencentSpider(RedisSpider):
    """Redis-driven spider for Tencent HR job listings.

    Start URLs are not hard-coded: they are popped from the Redis list
    named by ``redis_key``, so URLs can be pushed manually or by rules.
    """

    name = 'tencent_redis'
    # Redis list key the scrapy-redis scheduler reads start URLs from.
    redis_key = 'tencent:start_urls'

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        # FIX: the original popped the literal key 'http://hr.tencent.com',
        # which can never exist in kwargs, so allowed_domains was always
        # empty. The scrapy-redis example convention is to pop 'domain'
        # (passed on the command line as -a domain=hr.tencent.com).
        domain = kwargs.pop('domain', '')
        # FIX: wrap in list() — on Python 3, filter() returns a one-shot
        # iterator, which would be exhausted after the first check against
        # allowed_domains.
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(TencentSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        """Yield one TencenthrItem per job row ('even' and 'odd' table rows)."""
        for row in response.xpath("//tr[@class='even'] | //tr[@class='odd'] "):
            item = example.items.TencenthrItem()
            item["workname"] = row.xpath("./td[1]/a/text()").extract()
            item["workLink"] = row.xpath("./td[1]/a/@href").extract()
            item["worktype"] = row.xpath("./td[2]/text()").extract()
            item["worknumbers"] = row.xpath("./td[3]/text()").extract()
            item["workpos"] = row.xpath("./td[4]/text()").extract()
            item["worktime"] = row.xpath("./td[5]/text()").extract()
            yield item
scrapy-redis的items
# Item models for the scraped data.
# See documentation in: http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
import scrapy


class TencenthrItem(scrapy.Item):
    """Container for the six fields scraped from each Tencent HR job row,
    plus the two bookkeeping fields stamped on by ExamplePipeline."""
    workname = scrapy.Field()     # job title
    workLink = scrapy.Field()     # link to the job detail page
    worktype = scrapy.Field()     # job category
    worknumbers = scrapy.Field()  # number of openings
    workpos = scrapy.Field()      # job location
    worktime = scrapy.Field()     # publish date
    crawled = scrapy.Field()      # crawl timestamp (set by the pipeline)
    spider = scrapy.Field()       # spider name (set by the pipeline)


class ExampleItem(Item):
    """Stock scrapy-redis example item (kept for the bundled example spiders)."""
    name = Field()
    description = Field()
    link = Field()
    crawled = Field()
    spider = Field()
    url = Field()


class ExampleLoader(ItemLoader):
    """Loader that strips whitespace, keeps first values, and joins descriptions."""
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
scrapy-redis的pipelines:
# Item pipelines.
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
from datetime import datetime


class ExamplePipeline(object):
    """Stamp each item with the crawl time and the name of the spider
    that produced it (no local persistence — Redis storage is handled
    by scrapy_redis.pipelines.RedisPipeline)."""

    def process_item(self, item, spider):
        """Annotate *item* in place, echo it, and pass it down the chain."""
        now = datetime.utcnow()
        item["crawled"] = now
        item["spider"] = spider.name
        print(item)
        return item
scrapy-redis的settings:
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html

SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

# Route duplicate filtering and scheduling through Redis so that request
# fingerprints and the request queue are shared between all workers.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the Redis queues after the crawl ends, so a crawl can be resumed.
SCHEDULER_PERSIST = True
# Alternative queue strategies (priority queue is the default):
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,     # stamp items locally
    'scrapy_redis.pipelines.RedisPipeline': 400,  # push items into Redis
}

# Remote Redis connection (required when Redis runs on another host).
REDIS_HOST = "阿里云IP公网"  # replace with the Redis server's public IP
REDIS_PORT = 6379           # port Redis listens on
REDIS_PARAMS = {
    'password': '数据库密码',  # replace with the Redis auth password
}

# FIX: LOG_LEVEL was assigned twice with the same value; one is enough.
LOG_LEVEL = 'DEBUG'

# Introduce an artificial delay to make use of parallelism and to slow
# the crawl to a polite rate.
DOWNLOAD_DELAY = 1
本例是测试的腾讯hr职位,因现在采取了反爬,故获取不了数据,不过相应的存储数据库步骤是对的。
此类适用于已知url,然后通过redis来不断的手动或者设置规则自动压入url来爬取数据,redis数据库发送url,然后回收结果数据。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?