Scrapy Persistent Storage - Escaping Scraped Data

Scrapy Persistent Storage

The escaping problem when writing scraped data

Using the parameterized form below, pymysql escapes the values for us automatically:

'insert into wen values(%s,%s)',(item['title'],item['content'])
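A minimal sketch of the contrast (assuming the same pymysql connection parameters and two-column wen table used later in this post):

import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='zx125', db='zx', charset='utf8')
cursor = conn.cursor()

title = "it's a title"      # the single quote would break a hand-built SQL string
content = 'some content'

# unsafe: plain string formatting leaves quotes unescaped and allows SQL injection
# cursor.execute("insert into wen values('%s','%s')" % (title, content))

# safe: pymysql escapes each parameter before substituting it into the statement
cursor.execute('insert into wen values(%s,%s)', (title, content))
conn.commit()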

Terminal-based commands (feed export):

This can only store the return value of the parse method to a local text file; supported formats: json, jsonlines, jl, csv, xml, marshal, pickle.

Save command

scrapy crawl name -o xxx.csv

Advantage: concise, efficient and convenient

Disadvantage: quite limited (data can only be saved to a local file, not to a database)

# -*- coding: utf-8 -*-
import scrapy


class DuanziSpider(scrapy.Spider):
    name = 'duanzi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://duanziwang.com/']

    def parse(self, response):
        div_list=response.xpath('//main/article')
        data=[]
        for i in div_list:
            title=i.xpath('.//h1/a/text()').extract_first()
            # xpath returns a list of Selector objects; call extract() to get the data out, or extract_first() when the list has only one element
            content=i.xpath('./div[@class="post-content"]/p/text()').extract_first()
            da={
                'title':title,
                'content':content
            }
            data.append(da)
        return data
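For example, assuming the spider above keeps the name 'duanzi', the output file name is up to you and the format is inferred from the extension:

scrapy crawl duanzi -o duanzi.csv
scrapy crawl duanzi -o duanzi.json

If Chinese text shows up as \uXXXX escapes in the JSON output, setting FEED_EXPORT_ENCODING = 'utf-8' in settings.py should give readable characters.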

Pipeline-based persistent storage

Coding workflow

1. Data parsing

# -*- coding: utf-8 -*-
import scrapy
from zx_spider.items import ZxSpiderItem


class Duanzi2Spider(scrapy.Spider):
    name = 'duanzi2'
    start_urls = ['https://ishuo.cn']

    def parse(self, response):
        data_list=response.xpath('//div[@id="list"]/ul/li')

        for i in data_list:
            title=i.xpath('./div[2]/a/text()').extract_first()
            content=i.xpath('./div[1]/text()').extract_first()
            print(title)
            print(content)
            # create an item object and fill in its fields
            item=ZxSpiderItem()
            item['title']=title
            item['content']=content

            # submit the item to the pipeline
            yield item

2. Wrap the parsed data into an item object (define the corresponding fields in the item class)

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ZxSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    # pass
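As a quick aside (hypothetical snippet, not one of the project files): a scrapy.Item behaves like a dict, but only fields declared with scrapy.Field() can be assigned:

from zx_spider.items import ZxSpiderItem

item = ZxSpiderItem()
item['title'] = 'a title'          # ok: 'title' is a declared field
item['content'] = 'some content'   # ok: 'content' is a declared field
# item['author'] = 'zx'            # would raise KeyError: 'author' is not a declared field
print(dict(item))                  # {'title': 'a title', 'content': 'some content'}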

3. Submit the item object to the pipeline; in the pipeline class's process_item method, persist the data carried by the received item

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class ZxSpiderPipeline(object):
    fw=None
    # called only once, when the spider is opened
    def open_spider(self,spider):
        print("开始写入爬虫数据")
        self.fw=open('./zx/duanzi2.csv',"w",encoding='utf8')

    # receives every item object submitted by the spider
    def process_item(self, item, spider):
        title=item['title']
        content=item['content']

        self.fw.write(title+"\n"+content+'\n')
        return item

    def close_spider(self,spider):
        print("爬虫数据写入完成")
        self.fw.close()

4. Enable the pipeline in the settings file

ITEM_PIPELINES = {
   'zx_spider.pipelines.ZxSpiderPipeline': 300,
    # 300 is the priority; the smaller the number, the higher the priority
}

Storing the scraped data to multiple destinations (file, MySQL)

The return in ZxSpiderPipeline is not pointless: it passes the item on to the pipeline with the next priority for further processing (provided that pipeline is also registered in the settings), as the sketch below illustrates.
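As a hypothetical illustration (not part of this project), a filter pipeline placed between the two could drop bad items: whatever process_item returns is what the next pipeline receives, and raising DropItem stops the item from going any further down the chain.

from scrapy.exceptions import DropItem

class FilterPipeline(object):
    def process_item(self, item, spider):
        if not item.get('content'):
            # the item never reaches the lower-priority pipelines (e.g. the MySQL one below)
            raise DropItem('content is empty')
        # the returned item is handed to the next pipeline in ITEM_PIPELINES order
        return item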

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

class ZxSpiderPipeline(object):
    fw=None
    # called only once, when the spider is opened
    def open_spider(self,spider):
        print("开始写入爬虫数据")
        self.fw=open('./zx/duanzi2.csv',"w",encoding='utf8')

    # receives every item object submitted by the spider
    def process_item(self, item, spider):
        title=item['title']
        content=item['content']

        self.fw.write(title+"\n"+content+'\n')
        return item

    def close_spider(self,spider):
        print("爬虫数据写入完成")
        self.fw.close()
class MysqlSpiderPipeline(object):
    conn=None
    cursor=None
    def open_spider(self,spider):
        print("爬虫数据库写入完成")
        self.conn=pymysql.Connect(host='127.0.0.1',port=3306,user="root",password='zx125',db="zx",charset='utf8')

    def process_item(self, item, spider):
        self.cursor=self.conn.cursor()
        try:
            self.cursor.execute('insert into wen values(%s,%s)',(item['title'],item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self,spider):
        print("爬虫数据库写入完成")
        self.cursor.close()
        self.conn.close()
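The MySQL pipeline above assumes the zx database already contains a wen table with two text columns; a one-off setup sketch (the column names here are just assumptions, the insert only relies on the column order):

import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='zx125', db='zx', charset='utf8')
cursor = conn.cursor()
# two columns, matching 'insert into wen values(%s,%s)' in MysqlSpiderPipeline
cursor.execute('create table if not exists wen (title varchar(255), content text)')
conn.commit()
cursor.close()
conn.close()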

Settings

ITEM_PIPELINES = {
   'zx_spider.pipelines.ZxSpiderPipeline': 300,
   'zx_spider.pipelines.MysqlSpiderPipeline': 301,
    # 300 is the priority; the smaller the number, the higher the priority
}