How Scrapy distinguishes between sending follow-up requests and saving the scraped data

I. The distinction

It depends on whether the object produced by the yield statement is a Request object or an Item object, as in the sketch below.
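A minimal sketch of how the two cases look side by side (the spider name, URL, and selectors here are assumptions for illustration): the engine schedules anything yielded as a Request, and hands anything yielded as an Item (or dict) to the item pipelines.

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['https://example.com/list']  # assumed URL

    def parse(self, response):
        for href in response.css('a.item::attr(href)').getall():
            # yield a Request: Scrapy schedules another crawl and calls the callback
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)

    def parse_detail(self, response):
        # yield an item (a dict works too): Scrapy routes it to the item pipelines
        yield {'title': response.css('h1::text').get()}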

II. Item

1. Define the item class

Declare the class in the items.py file:

class MyscrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    prostatus = scrapy.Field()
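One detail worth knowing: the fields declared with scrapy.Field() are the only keys the item will accept, and assigning any undeclared key raises a KeyError. A quick sketch (the values are made up):

item = MyscrapyItem(title='demo product')  # fields can also be set at construction
item['price'] = '99.00'
item['stock'] = 10   # raises KeyError: 'stock' was never declared in MyscrapyItem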

2. Import the class in the spider and fill it in the callback

from myscrapy.items import MyscrapyItem

# callback method defined inside the spider class
def get_info(self, response):
    elements_list = response.css('.product')
    for element in elements_list:
        title = element.css('.productTitle a::attr(title)').extract_first()  # CSS selector
        price = element.css('.productPrice em::attr(title)').extract_first()
        prostatus = element.css('.productStatus em::text').extract_first()
        item = MyscrapyItem()  # instantiate an item object
        item['title'] = title  # fill in the declared fields
        item['price'] = price
        item['prostatus'] = prostatus
        yield item

III. Once an item is yielded, Scrapy automatically runs the code in pipelines.py

1. Register the pipeline in the settings file

ITEM_PIPELINES = {
   'myscrapy.pipelines.MyscrapyPipeline': 300,   # lower number = higher priority
   # 'myscrapy.pipelines.MyscrapyPipeline1': 500,
}
# same registration mechanism as middleware

2. Implement the MyscrapyPipeline methods

# Two of the pipeline's methods are used constantly:
# def open_spider(self, spider): runs once when the spider starts, usually used to open the database connection
# def close_spider(self, spider): runs once when the spider finishes, usually used to close the connection

# A simple example using MongoDB
from pymongo import MongoClient

class MyscrapyPipeline(object):

    def __init__(self,HOST,PORT,USER,PWD,DB,TABLE):
        self.HOST = HOST
        self.PORT = PORT
        self.USER = USER
        self.PWD = PWD
        self.DB = DB
        self.TABLE = TABLE
    # runs before __init__: Scrapy calls this classmethod to construct the pipeline instance
    @classmethod
    def from_crawler(cls,crawler):
        HOST = crawler.settings.get('HOST')  # crawler.settings exposes every name defined in settings.py
        PORT = crawler.settings.get('PORT')
        USER = crawler.settings.get('USER')
        PWD = crawler.settings.get('PWD')
        DB = crawler.settings.get('DB')
        TABLE = crawler.settings.get('TABLE')
        return cls(HOST,PORT,USER,PWD,DB,TABLE)


    def open_spider(self,spider):
        self.client = MongoClient(host=self.HOST,port=self.PORT,username=self.USER,password=self.PWD)
        print('Database connection established')

    def close_spider(self,spider):
        self.client.close()
        print('Database connection closed')


    def process_item(self, item, spider):
        self.client[self.DB][self.TABLE].insert_one(dict(item))
        return item
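The from_crawler classmethod above pulls HOST, PORT, USER, PWD, DB, and TABLE out of the project settings, so those names must be defined in settings.py. A minimal sketch of the entries it assumes (all values here are placeholders, not from the original post):

# settings.py: custom keys read by MyscrapyPipeline.from_crawler
HOST = '127.0.0.1'
PORT = 27017
USER = 'admin'        # placeholder credentials
PWD = 'secret'
DB = 'myscrapy'       # target database name
TABLE = 'products'    # target collection name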