
scrapy-redis storing to a database: the four files to configure, based on myspider_redis

  The scrapy-redis spider:

# -*- coding: utf-8 -*-
import scrapy
import example.items
from scrapy_redis.spiders import RedisSpider


class TencentSpider(RedisSpider):
    name = 'tencent_redis'            # changed: the spider's name
    redis_key = 'tencent:start_urls'  # changed: the Redis key the start URLs are read from

    def __init__(self, *args, **kwargs):  # changed: added so the domain can be passed at launch
        # Dynamically define the allowed domains list, e.g.
        #   scrapy crawl tencent_redis -a domain=hr.tencent.com
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(TencentSpider, self).__init__(*args, **kwargs)  # changed: use this class's name in super()

    def parse(self, response):
        for eachdata in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            tencentitem = example.items.TencenthrItem()  # changed: build our TencenthrItem
            tencentitem["workname"] = eachdata.xpath("./td[1]/a/text()").extract()
            tencentitem["workLink"] = eachdata.xpath("./td[1]/a/@href").extract()
            tencentitem["worktype"] = eachdata.xpath("./td[2]/text()").extract()
            tencentitem["worknumbers"] = eachdata.xpath("./td[3]/text()").extract()
            tencentitem["workpos"] = eachdata.xpath("./td[4]/text()").extract()
            tencentitem["worktime"] = eachdata.xpath("./td[5]/text()").extract()
            yield tencentitem
        
      
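  Because the spider inherits from RedisSpider, it starts idle and waits until a URL appears under the tencent:start_urls key. Launch it with scrapy crawl tencent_redis -a domain=hr.tencent.com, then seed the queue. A minimal sketch with redis-py (host and password are placeholders for your own server):

import redis

r = redis.StrictRedis(host='your-redis-host', port=6379,
                      password='your-redis-password')
# LPUSH one start URL; the idle spider picks it up immediately
r.lpush('tencent:start_urls', 'https://hr.tencent.com/position.php')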

  The scrapy-redis items:

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
import scrapy  # changed: import scrapy for scrapy.Field


class TencenthrItem(scrapy.Item):  # changed: inherit from scrapy.Item
    # define the fields for your item here like:
    # name = scrapy.Field()
    # changed: the six fields of a Tencent job posting to store
    workname = scrapy.Field()
    workLink = scrapy.Field()
    worktype = scrapy.Field()
    worknumbers = scrapy.Field()
    workpos = scrapy.Field()
    worktime = scrapy.Field()
    # filled in by ExamplePipeline below
    crawled = scrapy.Field()
    spider = scrapy.Field()


class ExampleItem(Item):
    name = Field()
    description = Field()
    link = Field()
    crawled = Field()
    spider = Field()
    url = Field()


class ExampleLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
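  The ExampleLoader above ships with the scrapy-redis example project: MapCompose strips every input string, TakeFirst keeps the first value per field, and Join concatenates the description pieces. A hypothetical sketch of using it inside a spider callback (the XPaths are invented for illustration):

def parse_detail(self, response):
    loader = ExampleLoader(response=response)
    loader.add_xpath('name', '//h1/text()')        # each piece stripped by MapCompose
    loader.add_xpath('description', '//p/text()')  # pieces joined by Join()
    loader.add_value('url', response.url)
    yield loader.load_item()                       # TakeFirst collapses lists to one value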

  The scrapy-redis pipelines:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
from datetime import datetime

class ExamplePipeline(object):  # nothing is persisted locally, so this is left as shipped
    def process_item(self, item, spider):
        item["crawled"] = datetime.utcnow()
        item["spider"] = spider.name
        print(item)
        
        return item
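  ExamplePipeline only stamps each item with a crawl time and the spider name; the actual storage is done by scrapy_redis.pipelines.RedisPipeline, enabled in the settings below, which serializes each item to JSON and pushes it onto a Redis list (named '<spider>:items' by default). A rough consumer sketch that drains that list into MongoDB (pymongo and the connection details are assumptions; any database client would do):

import json

import pymongo
import redis

r = redis.StrictRedis(host='your-redis-host', port=6379,
                      password='your-redis-password')
jobs = pymongo.MongoClient('localhost', 27017)['tencent']['jobs']

while True:
    # blpop blocks until RedisPipeline pushes the next serialized item
    _key, data = r.blpop('tencent_redis:items')
    jobs.insert_one(json.loads(data))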

  The scrapy-redis settings:

# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

REDIS_HOST = "your Aliyun public IP"  # changed: host of the remote Redis server
REDIS_PORT = 6379                     # changed: port of the remote Redis server
# Set the Redis password
REDIS_PARAMS = {                      # changed: credentials for the remote connection
    'password': 'your Redis password',
}

# Introduce an artificial delay; parallelism across spider instances
# keeps the overall crawl fast.
DOWNLOAD_DELAY = 1
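  Before launching anything, it is worth confirming that the remote Redis instance actually accepts these credentials; a quick check with redis-py (the values mirror the placeholders above):

import redis

r = redis.StrictRedis(host='your Aliyun public IP', port=6379,
                      password='your Redis password')
print(r.ping())  # True means the spider will be able to connect too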

  This example targets the Tencent HR job listings. The site has since added anti-crawling measures, so no data comes back today, but the database-storage steps shown above are still correct.

  This pattern fits crawls where the URLs are known in advance: you push start URLs into Redis, either by hand or automatically by some rule (see the sketch below), Redis hands the URLs out to the spiders, and the scraped results are collected back into Redis.
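  For a paginated listing like this one, pushing by rule can be as simple as a loop over the page offsets; the URL pattern below is the historical Tencent HR one and is only illustrative:

import redis

r = redis.StrictRedis(host='your-redis-host', port=6379,
                      password='your-redis-password')
for start in range(0, 100, 10):  # first ten pages, ten rows per page
    r.lpush('tencent:start_urls',
            'https://hr.tencent.com/position.php?&start=%d' % start)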
