scrapy-redis: successfully storing scraped data to the database, the four code files to set up (dmoz example)
The scrapy-redis extraction spider:
# -*- coding: utf-8 -*-
import scrapy
import re
import example.items                              # changed: reference the project's items module
from scrapy.linkextractors import LinkExtractor   # link extraction
from scrapy.spiders import CrawlSpider, Rule      # URL extraction from pages, plus the crawl rules


class TianyaSpider(CrawlSpider):
    name = 'tianya'
    allowed_domains = ['tianya.cn']
    start_urls = ['http://bbs.tianya.cn/post-140-393973-1.shtml']

    rules = (
        Rule(LinkExtractor(".*?shtml"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        print(response.url)
        pagedata = response.body.decode("gbk", "ignore")
        regex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)  # pre-compile the regex
        emaillist = regex.findall(pagedata)  # grab every e-mail address on the page
        for mail in emaillist:
            myitem = example.items.EmailSpiderItem()  # changed: use the EmailSpiderItem class from example.items
            myitem["email"] = mail
            myitem["url"] = response.url
            yield myitem
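As a quick standalone check of the e-mail regex used in parse_item, the snippet below (the sample text is made up purely for illustration) shows what findall returns:

import re

# Same pre-compiled pattern as in parse_item above.
regex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)

# Hypothetical page fragment, for illustration only.
sample = "contact foo.bar@example.com or admin@test.org.cn for details"
print(regex.findall(sample))
# ['foo.bar@example.com', 'admin@test.org.cn']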
scrapy-redis database storage, items.py: the code must change the original Class(Item) definition to Class(scrapy.Item) and add import scrapy; this fixes the earlier problem of nothing being stored to the database.
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
import scrapy


class EmailSpiderItem(scrapy.Item):   # changed: subclass scrapy.Item instead of Item
    email = scrapy.Field()            # changed: the email data to store in the database
    url = scrapy.Field()              # changed: the url data to store in the database
    crawled = scrapy.Field()          # when it was crawled
    spider = scrapy.Field()           # which spider crawled it


class ExampleItem(Item):
    name = Field()
    description = Field()
    link = Field()
    crawled = Field()
    spider = Field()
    url = Field()


class ExampleLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
The scrapy-redis pipelines:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
from datetime import datetime


class ExamplePipeline(object):
    def __init__(self):
        # changed: also write items to a local file
        self.file = open("1.txt", "wb")

    def process_item(self, item, spider):
        item["crawled"] = datetime.utcnow()         # crawl time
        item["spider"] = spider.name                # which spider crawled it
        self.file.write(str(item).encode("utf-8"))  # changed: write the item to the file
        self.file.flush()                           # changed: flush so the file updates in real time
        return item

    def __del__(self):
        self.file.close()                           # changed: close the file
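Closing the file in __del__ works, but Scrapy pipelines also provide open_spider/close_spider hooks that make the file's lifetime explicit and are called reliably when the spider starts and stops. A minimal alternative sketch with the same behaviour:

from datetime import datetime


class ExamplePipeline(object):
    """Variant that ties the output file to the spider's lifecycle."""

    def open_spider(self, spider):
        # Called once when the spider starts.
        self.file = open("1.txt", "wb")

    def process_item(self, item, spider):
        item["crawled"] = datetime.utcnow()
        item["spider"] = spider.name
        self.file.write(str(item).encode("utf-8"))
        self.file.flush()
        return item

    def close_spider(self, spider):
        # Called once when the spider closes, so the file is always released.
        self.file.close()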
The scrapy-redis settings:
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

REDIS_HOST = "Alibaba Cloud public IP"   # changed: add the server IP for the remote connection
REDIS_PORT = 6379                        # changed: port for the remote connection

# Set the password
REDIS_PARAMS = {                         # changed: password for the remote connection
    'password': 'your password',
}

# Introduce an artificial delay to make use of parallelism and to speed up
# the crawl.
DOWNLOAD_DELAY = 1
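Before starting the crawl, it is worth confirming that the machine can actually reach the remote Redis instance with these exact settings. A minimal sketch using the redis-py client (assuming it is installed; substitute your real host and password):

import redis

# Use the same values as REDIS_HOST / REDIS_PORT / REDIS_PARAMS above.
r = redis.Redis(host="Alibaba Cloud public IP", port=6379, password="your password")
print(r.ping())  # True means the host, port and password are all accepted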
After completing the settings above, the crawled data is successfully collected into the database.
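With this configuration, scrapy_redis.pipelines.RedisPipeline serializes every item and pushes it onto a Redis list; by default the key is "<spider name>:items", so here it would be tianya:items. A sketch of how to peek at the stored items with redis-py (key names assume the defaults):

import json
import redis

r = redis.Redis(host="Alibaba Cloud public IP", port=6379, password="your password")

print(r.llen("tianya:items"))  # number of items stored so far

# Look at the first few serialized items.
for raw in r.lrange("tianya:items", 0, 4):
    item = json.loads(raw)
    print(item["email"], item["url"])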
In this variant the initial URL lives in the spider code itself; the Redis database then automatically hands out URLs, de-duplicates them, hands out the next batch, and collects the resulting data. Once the crawl is started, there is no need to push start URLs into Redis by hand.
(Figure: the run-time structure.)
(Figure: the three ways of running scrapy-redis.)
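The approach in this post keeps start_urls in the spider code, so nothing has to be pushed into Redis manually. scrapy-redis also provides spider base classes (RedisSpider and RedisCrawlSpider in scrapy_redis.spiders) that instead read their start URLs from a Redis list, in which case the first URL does have to be pushed in (for example with lpush in redis-cli). A minimal sketch of that style; the class name and redis_key below are made up for illustration:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class TianyaRedisSpider(RedisCrawlSpider):
    """Takes its start URLs from the Redis list named by redis_key."""
    name = 'tianya_redis'
    allowed_domains = ['tianya.cn']
    redis_key = 'tianya_redis:start_urls'  # seed with: lpush tianya_redis:start_urls <url>

    rules = (
        Rule(LinkExtractor(".*?shtml"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # The same e-mail extraction logic as in the CrawlSpider version goes here.
        pass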