scrapy之持久化存储
下面演示如何基于管道将数据持久化存储到 MySQL、Redis 和本地文件。
代码实现流程
1. 将解析到的页面数据存储到items对象
2. 使用yield关键字将items提交给管道文件进行处理
3. 在管道文件中编写代码完成数据存储的操作
4. 在配置文件中开启管道操作
代码实现
items:存储解析到的页面数据
pipelines:处理持久化存储的相关操作
下面以抓取糗百的段子为例:
爬虫相关操作
# -*- coding: utf-8 -*-
import scrapy

from qiubai.items import QiubaiItem


class QiubaiSpiderSpider(scrapy.Spider):
    """Spider that collects the author and text of each post on the text page."""

    name = 'qiubai_spider'
    # allowed_domains = ['www.qiushibaike.com/text']  # would keep the crawl inside this domain (disabled)
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        """Yield one ``QiubaiItem`` per post container found in *response*.

        :param response: the downloaded page for one of ``start_urls``
        :return: generator of ``QiubaiItem`` objects handed to the item pipelines
        """
        # XPath is the recommended way to pick out content; the framework
        # exposes it directly on the response.
        posts = response.xpath('//div[@id="content-left"]/div')

        for post in posts:
            # xpath() wraps the matched nodes in Selector objects;
            # extract_first() pulls the first string value out of them.
            author_nodes = post.xpath('./div/a[2]/h2/text()')
            content_nodes = post.xpath('.//div[@class="content"]/span/text()')

            # Build the item and submit it to the pipelines in one step.
            yield QiubaiItem(
                author=author_nodes.extract_first(),
                content=content_nodes.extract_first(),
            )
items:存储解析到的页面数据:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QiubaiItem(scrapy.Item):
    """Container for one scraped post: who wrote it and what it says."""

    # Each piece of data the spider stores is declared as a scrapy.Field.
    author = scrapy.Field()
    content = scrapy.Field()
pipelines:处理持久化存储的相关操作:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json

import pymysql
import redis


class QiubaiPipeline(object):
    """Store every scraped item in the Redis list ``data``."""

    conn = None

    def open_spider(self, spider):
        """Create the Redis client when the spider starts."""
        print('redis连接开始')
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Push the item's author and content onto the Redis list ``data``.

        :param item: the item object received from the spider
        :param spider: the spider that produced the item (unused)
        :return: the same item, so later pipelines can process it too
        """
        record = {
            'author': item['author'],
            'content': item['content'],
        }
        # Redis values must be bytes, str or numbers; redis-py 3.x and later
        # raise DataError for a dict, so the record is stored as JSON text.
        # ensure_ascii=False keeps the Chinese text readable in Redis.
        self.conn.lpush('data', json.dumps(record, ensure_ascii=False))
        return item

    def close_spider(self, spider):
        """Report that the Redis stage has finished."""
        print('redis链接结束')


class QiubaiByMysql(object):
    """Store every scraped item as a row of the MySQL table ``qiubai``."""

    conn = None
    cursor = None

    def open_spider(self, spider):
        """Open the MySQL connection and one cursor reused for all items."""
        print('mysql链接开始')
        # NOTE: host and credentials are hard-coded for this demo.
        # utf8mb4 lets the connection carry the scraped Chinese text.
        self.conn = pymysql.Connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='112233',
            db='qiubai',
            charset='utf8mb4',
        )
        # A single cursor for the whole crawl; previously a new one was
        # created per item and only the last one was ever closed.
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert the item's author and content into the ``qiubai`` table.

        A database error is printed and the transaction rolled back; the item
        is still returned so the remaining pipelines keep running.

        :param item: the item object received from the spider
        :param spider: the spider that produced the item (unused)
        :return: the same item
        """
        # Placeholders let the driver escape the scraped text: quotes inside
        # a post can neither break the statement nor inject SQL.
        sql = 'insert into qiubai values(%s, %s)'
        try:
            self.cursor.execute(sql, (item['author'], item['content']))
            # Commit the transaction.
            self.conn.commit()
        except pymysql.MySQLError as e:
            print(e)
            self.conn.rollback()
        else:
            # Only report success once the row has really been committed.
            print('数据已写入mysql')
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection, if they were opened."""
        print('mysql链接结束')
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()


class QiubaiByFiles(object):
    """Append every scraped item to the local text file ``../qiubai.txt``."""

    fp = None

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        print('打开文件')
        self.fp = open('../qiubai.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write ``author:content`` followed by blank lines to the file.

        :param item: the item object received from the spider
        :param spider: the spider that produced the item (unused)
        :return: the same item, so later pipelines can process it too
        """
        # The spider stores None when its XPath matches nothing; fall back to
        # an empty string so the concatenation below cannot raise TypeError.
        author = item['author'] or ''
        content = item['content'] or ''
        self.fp.write(author + ':' + content + '\n\n\n')
        print('数据已经写入到文件')
        return item

    def close_spider(self, spider):
        """Close the output file, if it was opened."""
        print('关闭文件')
        if self.fp is not None:
            self.fp.close()
配置文件的编写
# Scrapy settings for the qiubai project.

BOT_NAME = 'qiubai'

# Where Scrapy looks for spiders, and where newly generated ones are placed.
SPIDER_MODULES = ['qiubai.spiders']
NEWSPIDER_MODULE = 'qiubai.spiders'

# Browser-like User-Agent string sent with the crawler's requests.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'

# robots.txt rules are not honoured by this crawl.
ROBOTSTXT_OBEY = False

# Enable the item pipelines. The integer is each pipeline's order value
# (Scrapy runs lower values first): Redis, then local file, then MySQL.
ITEM_PIPELINES = {
    'qiubai.pipelines.QiubaiPipeline': 300,
    'qiubai.pipelines.QiubaiByFiles': 400,
    'qiubai.pipelines.QiubaiByMysql': 500,
}