Scrapy Persistence (items + pipelines)
I. Using items to hold the scraped data
items.py
import scrapy


class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
quote.py
# -*- coding: utf-8 -*-
import scrapy
from toscrapy.items import QuoteItem


class QuoteSpider(scrapy.Spider):
    name = 'quote'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    """
    Key points
    1. text() gets the text content of a tag
    2. @attribute gets the value of an attribute
    3. extract() returns all matches, extract_first() returns the first match
    4. response.urljoin() joins a relative URL onto the current page's URL
    5. scrapy.Request(url=_next, callback=self.parse) registers the callback
    """

    def parse(self, response):
        quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
        for quote in quotes:
            item = QuoteItem()
            # extract_first() returns the first match
            text = quote.xpath('.//span[@class="text"]/text()').extract_first()
            item['text'] = text
            author = quote.xpath('.//span/small[@class="author"]/text()').extract_first()
            item['author'] = author
            # extract() returns all matches
            tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
            item['tags'] = tags
            yield item

        next_url = response.xpath('//div[@class="col-md-8"]/nav/ul[@class="pager"]/li[@class="next"]/a/@href').extract_first()
        # join the relative next-page URL with the base URL
        _next = response.urljoin(next_url)
        # callback: the method that will parse the next page
        yield scrapy.Request(url=_next, callback=self.parse)
Alternatively, build and yield the QuoteItem() directly instead of assigning the fields one by one, as in the sketch below.
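A minimal sketch of that variant: the loop body in parse() yields the item in a single expression. This is purely stylistic and produces the same result as the field-by-field version above.

yield QuoteItem(
    text=quote.xpath('.//span[@class="text"]/text()').extract_first(),
    author=quote.xpath('.//span/small[@class="author"]/text()').extract_first(),
    tags=quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract(),
)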
Commands for exporting the items to a file
scrapy crawl quote -o quotes.json
scrapy crawl quote -o quotes.jsonlines  or  scrapy crawl quote -o quotes.jl   # writes each item as one JSON object per line
Supported file types: quotes.xml, quotes.jl, quotes.csv, etc.
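Instead of passing -o on the command line each time, newer Scrapy versions (2.1 and later) also let you configure feed exports in settings.py. A minimal sketch, assuming the quotes.json output path used above:

# settings.py
FEEDS = {
    'quotes.json': {'format': 'json', 'encoding': 'utf8'},
}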
II. Pipelines
1. Core idea:
Every time the spider yields an item object -> the process_item method of the pipeline is called once (the pipeline must be enabled in the settings file) -> the data is stored in a database or written to a file.
2. settings.py
ITEM_PIPELINES = {
    # the value is the priority of the pipeline class (0-1000); lower values run first
    'toscrapy.pipelines.ToscrapyPipeline': 300,
}
Enabling the pipeline in the settings file is a prerequisite for it to run at all.
3. pipelines.py
a. Default
class ToscrapyPipeline(object):

    def process_item(self, item, spider):
        """
        :param item: the item object
        :param spider: the spider object
        :return:
        """
        # print('=' * 20, item)
        return item
b. Other methods
The method called when the spider starts:
def open_spider(self, spider):
    """
    Called when the spider is opened
    :param spider:
    :return:
    """
    pass
The method called when the spider finishes:
def close_spider(self, spider):
    """
    Called when the spider is closed
    :param spider:
    :return:
    """
    pass
The from_crawler method
Role: create the pipeline class instance during initialization.
Purpose: keep the data storage path in the settings file rather than hard-coding it in the pipeline.
@classmethod
def from_crawler(cls, crawler):
    """
    Called to create the pipeline object at initialization time
    :param crawler:
    :return:
    """
    # crawler.settings gives access to all of the project's settings
    path = crawler.settings.get('FILE_PATH')
    # instantiate the pipeline with that path
    return cls(path)
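Note that FILE_PATH is not a built-in Scrapy setting; it is a custom key you add to settings.py yourself. The value here is just a placeholder:

# settings.py
FILE_PATH = 'quotes.txt'  # custom key read by from_crawler above; the file name is only an example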
c. How the pipeline class methods are called
Scrapy first checks whether the pipeline class defines a from_crawler method:
if it does: obj = PipelineClass.from_crawler(crawler)
if it does not: obj = PipelineClass()
When the spider starts, open_spider is called.
Each time the spider yields an item object, process_item is called.
When the spider finishes, close_spider is called. A simplified sketch of the instantiation step is shown below.
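The following is a simplified, illustrative sketch of that instantiation check; the real logic lives inside Scrapy's pipeline/middleware manager, so treat this only as a mental model:

def build_pipeline(pipeline_cls, crawler):
    # simplified illustration of how Scrapy creates a pipeline object
    if hasattr(pipeline_cls, 'from_crawler'):
        # the class provides from_crawler -> let it build the instance (and read settings)
        return pipeline_cls.from_crawler(crawler)
    # otherwise fall back to the plain constructor
    return pipeline_cls()

Scrapy then calls open_spider, process_item and close_spider on the returned object at the points listed above.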
d. Serialization (writing items to a file)
pipelines.py
class ToscrapyPipeline(object):

    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called to create the pipeline object at initialization time
        :param crawler:
        :return:
        """
        # crawler.settings gives access to all of the project's settings
        path = crawler.settings.get('FILE_PATH')
        # instantiate the pipeline with that path
        return cls(path)

    def open_spider(self, spider):
        """
        Called when the spider starts
        :param spider:
        :return:
        """
        self.f = open(file=self.path, mode='a', encoding="utf-8")

    def process_item(self, item, spider):
        """
        Called each time the spider yields an item object
        :param item: the item object
        :param spider: the spider object
        :return:
        """
        self.f.write(item['text'] + '\n')
        return item

    def close_spider(self, spider):
        """
        Called when the spider finishes
        :param spider:
        :return:
        """
        self.f.close()
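If the whole item should be persisted rather than just the text field, process_item can write one JSON object per line instead. A minimal sketch of that variation; the class name ToscrapyJsonPipeline is hypothetical and only process_item changes, the rest is inherited from the class above:

import json

class ToscrapyJsonPipeline(ToscrapyPipeline):

    def process_item(self, item, spider):
        # serialize the full item as one JSON line; ensure_ascii=False keeps non-ASCII text readable
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item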
4. Multiple pipeline classes
pipelines.py can contain several classes, for example one that saves the data to a database and one that writes it to a file.
a. The execution order is determined by the priority values in settings.py. For each event, the corresponding method of every registered pipeline class is called in priority order, so the calls of the different classes interleave per item rather than one class finishing before the next starts.
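With both classes from pipelines.py registered, for example as below (the 400 priority for DBPipeline is an illustrative value, not taken from the original configuration):

# settings.py
ITEM_PIPELINES = {
    'toscrapy.pipelines.ToscrapyPipeline': 300,  # the "file" pipeline, lower value runs first
    'toscrapy.pipelines.DBPipeline': 400,        # the "db" pipeline, runs second
}

a crawl prints the method calls in this order: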
file from_crawl
db from_crawl
file open_spider
db open_spider
file process_item
db process_item
file process_item
db process_item
file process_item
db process_item
db close_spider
file close_spider
b. What return item in process_item is for
It passes the item on to the process_item method of the next pipeline class.
1) The item is not returned
The item received by the next class is None.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class ToscrapyPipeline(object):

    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called to create the pipeline object at initialization time
        """
        # crawler.settings gives access to all of the project's settings
        path = crawler.settings.get('FILE_PATH')
        print('file from_crawl')
        return cls(path)

    def open_spider(self, spider):
        """
        Called when the spider starts
        """
        print('file open_spider')
        self.f = open(file=self.path, mode='a', encoding="utf-8")

    def process_item(self, item, spider):
        """
        Called each time the spider yields an item object
        """
        print('file process_item')
        # self.f.write(item['text'] + '\n')
        # return item    <- deliberately not returning the item

    def close_spider(self, spider):
        """
        Called when the spider finishes
        """
        print('file close_spider')
        self.f.close()


class DBPipeline(object):

    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called to create the pipeline object at initialization time
        """
        # crawler.settings gives access to all of the project's settings
        path = crawler.settings.get('DB_PATH')
        print('db from_crawl')
        return cls(path)

    def open_spider(self, spider):
        """
        Called when the spider starts
        """
        print('db open_spider')
        self.f = open(file=self.path, mode='a', encoding="utf-8")

    def process_item(self, item, spider):
        """
        Called each time the spider yields an item object
        """
        print('db process_item value is {}'.format(item))
        # self.f.write(item['text'] + '\n')
        return item

    def close_spider(self, spider):
        """
        Called when the spider finishes
        """
        print('db close_spider')
        self.f.close()
2) DropItem
Raising DropItem prevents the process_item methods of the subsequent pipeline classes from running for that item.
Import:
from scrapy.exceptions import DropItem
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem


class ToscrapyPipeline(object):

    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called to create the pipeline object at initialization time
        """
        path = crawler.settings.get('FILE_PATH')
        print('file from_crawl')
        return cls(path)

    def open_spider(self, spider):
        """
        Called when the spider starts
        """
        print('file open_spider')
        self.f = open(file=self.path, mode='a', encoding="utf-8")

    def process_item(self, item, spider):
        """
        Called each time the spider yields an item object
        """
        print('file process_item')
        # self.f.write(item['text'] + '\n')
        # drop the item: later pipelines will not see it
        raise DropItem()

    def close_spider(self, spider):
        """
        Called when the spider finishes
        """
        print('file close_spider')
        self.f.close()


class DBPipeline(object):

    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called to create the pipeline object at initialization time
        """
        path = crawler.settings.get('DB_PATH')
        print('db from_crawl')
        return cls(path)

    def open_spider(self, spider):
        """
        Called when the spider starts
        """
        print('db open_spider')
        self.f = open(file=self.path, mode='a', encoding="utf-8")

    def process_item(self, item, spider):
        """
        Called each time the spider yields an item object
        """
        print('db process_item value is {}'.format(item))
        # self.f.write(item['text'] + '\n')
        return item

    def close_spider(self, spider):
        """
        Called when the spider finishes
        """
        print('db close_spider')
        self.f.close()
5. What the spider parameter is for
Background: the classes and methods in pipelines.py are shared by every spider in the project.
Use case: if a method should only take effect for one particular spider, use the spider parameter.
Note: spider is the spider instance; its name attribute (spider.name) is the name value defined in the spider class.
def open_spider(self, spider):
    """
    Called when the spider starts
    :param spider: the spider instance; spider.name is the spider's name
    :return:
    """
    # only open the file for the 'quote' spider
    if spider.name == 'quote':
        print('file open_spider')
        self.f = open(file=self.path, mode='a', encoding="utf-8")
For persisting to Redis, see:
https://www.cnblogs.com/wanglan/p/10826678.html
For using MongoDB, see:
https://blog.csdn.net/qq_41020281/article/details/79459604
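As a rough illustration of the MongoDB route, here is a minimal pipeline sketch using pymongo. The MONGO_URI / MONGO_DB setting names and the 'quotes' collection are assumptions for this example, not taken from the linked post:

import pymongo

class MongoPipeline(object):
    """Sketch only: store each item in a MongoDB collection."""

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection info from settings.py, same pattern as FILE_PATH above
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'toscrapy'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # dict(item) converts the scrapy Item into a plain dict for insertion
        self.db['quotes'].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()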