scrapy-redis storing to the database, the four pieces of setup code, mycrawler_redis
The scrapy-redis spider:
# -*- coding: utf-8 -*-
import scrapy
import re
import example.items                              # the project's items module
from scrapy.linkextractors import LinkExtractor   # link extraction
from scrapy.spiders import CrawlSpider, Rule      # URL-extraction rules
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy_redis.spiders import RedisMixin


class TianyaSpider(RedisCrawlSpider):
    name = 'tianya_redis'                    # changed: the spider name
    redis_key = 'tianya_redis:start_urls'    # changed: the Redis key holding the start URLs

    rules = (
        # Follow every link whose URL matches .*?shtml
        Rule(LinkExtractor(".*?shtml"), callback="parse_item", follow=True),
    )

    def set_crawler(self, crawler):              # changed: added this hook
        CrawlSpider.set_crawler(self, crawler)   # keep the default CrawlSpider wiring
        RedisMixin.setup_redis(self)             # let Redis schedule the URLs centrally

    def parse_item(self, response):
        print(response.url)
        pagedata = response.body.decode("gbk", "ignore")
        # Pre-compile the e-mail regex
        regex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        emaillist = regex.findall(pagedata)      # extract every e-mail address on the page
        for mail in emaillist:
            myitem = example.items.EmailSpiderItem()
            myitem["email"] = mail
            myitem["url"] = response.url
            yield myitem
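As a quick sanity check of the e-mail regex used in parse_item, here is a minimal standalone snippet (the sample string is made up for illustration):

import re

# Same pattern as in parse_item above.
regex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)

sample = "Contact foo.bar@example.com or admin@test-site.org.cn for details"
print(regex.findall(sample))
# ['foo.bar@example.com', 'admin@test-site.org.cn']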
The scrapy-redis items:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
import scrapy


class EmailSpiderItem(scrapy.Item):   # changed: Item -> scrapy.Item
    email = scrapy.Field()            # changed: the scraped e-mail address
    url = scrapy.Field()              # changed: the page it came from
    crawled = scrapy.Field()          # when it was crawled
    spider = scrapy.Field()           # which spider crawled it


class ExampleItem(Item):
    name = Field()
    description = Field()
    link = Field()
    crawled = Field()
    spider = Field()
    url = Field()


class ExampleLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
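A quick sketch of how the ExampleLoader processors behave, assuming the classes above are importable: MapCompose strips each input value, TakeFirst keeps only the first, and Join concatenates the description values with spaces.

from example.items import ExampleLoader

loader = ExampleLoader()
loader.add_value("name", ["  Tianya  ", "other"])
loader.add_value("description", ["line one", "line two"])
item = loader.load_item()
print(item["name"])         # 'Tianya'            (stripped, first value kept)
print(item["description"])  # 'line one line two' (joined)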
The scrapy-redis pipelines:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

from datetime import datetime


class ExamplePipeline(object):
    def __init__(self):               # changed: open the output file
        self.file = open("1.txt", "w")

    def __del__(self):                # changed: close the file on teardown
        self.file.close()

    def process_item(self, item, spider):
        item["crawled"] = datetime.utcnow()   # when it was crawled
        item["spider"] = spider.name          # which spider crawled it
        self.file.write(str(item))            # changed: write the item out
        self.file.flush()                     # changed: flush so it lands immediately
        print(item)                           # changed: print for debugging
        return item
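Note that relying on __del__ to close the file works here but runs at no guaranteed point in time. A minimal alternative sketch using Scrapy's pipeline lifecycle hooks, which fire when the spider opens and closes:

class ExamplePipeline(object):
    def open_spider(self, spider):    # called once when the spider starts
        self.file = open("1.txt", "w")

    def close_spider(self, spider):   # called once when the spider finishes
        self.file.close()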
The scrapy-redis settings:
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

REDIS_HOST = "your Aliyun public IP"   # changed: host needed for a remote connection
REDIS_PORT = 6379                      # changed: port for the remote connection
# Set the password
REDIS_PARAMS = {                       # changed: password for the remote connection
    'password': 'your database password',
}

# Introduce an artificial delay to make use of parallelism. to speed up the
# crawl.
DOWNLOAD_DELAY = 1
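scrapy-redis also accepts a single connection URL in place of the separate host/port/password settings; a sketch, with the host and password as placeholders (when set, REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT):

# Equivalent single-setting form; substitute your own host and password.
REDIS_URL = 'redis://:your-database-password@your-aliyun-public-ip:6379'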
Command to push the start URL into start_urls on the client:
lpush tianya_redis:start_urls http://bbs.tianya.cn/post-140-393973-1.shtml
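The same seeding can be done from Python with the redis-py client; a minimal sketch, assuming the host and password match the ones in settings.py:

import redis

# Placeholders: use the same host/port/password as in settings.py.
r = redis.StrictRedis(host="your-aliyun-public-ip", port=6379,
                      password="your-database-password")
r.lpush("tianya_redis:start_urls",
        "http://bbs.tianya.cn/post-140-393973-1.shtml")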
Testing shows the data is crawled successfully and saved into the items key in the Redis database.
This spider class suits open-ended (unbounded) crawling: push a single URL into start_urls and the Redis database automatically hands out URLs to crawl, then collects the newly extracted URLs, deduplicates them, and assigns them back out, repeating the cycle.
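By default, RedisPipeline serializes each item to JSON and pushes it onto the key '<spider name>:items', so the stored results can be read back with a short script like this (host and password are placeholders again):

import json
import redis

r = redis.StrictRedis(host="your-aliyun-public-ip", port=6379,
                      password="your-database-password")
for raw in r.lrange("tianya_redis:items", 0, -1):
    item = json.loads(raw)
    print(item["email"], item["url"])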