Scrapy: Asynchronous MySQL Pipeline Configuration

pipelines.py configuration

import pymysql
from twisted.enterprise import adbapi
from yangguang.items import GuSuItem


class YangguangPipeline:
    def __init__(self, dbpool):
        self.dbpool = dbpool
    '''
    Add the following to settings.py:
    MYSQL_HOST = 'localhost'
    MYSQL_PORT = 3306
    MYSQL_USER = 'root'
    MYSQL_PASS = '123456'
    MYSQL_DB = 'open_source_intelligence'
    '''
    @classmethod
    def from_settings(cls, settings):
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            port=settings['MYSQL_PORT'],
            db=settings['MYSQL_DB'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASS'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor  # rows come back as dicts
        )
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        return cls(dbpool)

    def process_item(self, item, spider):
        # run do_insert in a thread from twisted's pool, passing it the item
        query = self.dbpool.runInteraction(self.do_insert, item)
        # use addErrback (not addCallback) so failures from the insert reach handle_error
        query.addErrback(self.handle_error)
        return item

    def do_insert(self, cursor, item):
        # news table
        if isinstance(item, GuSuItem):
            # no explicit commit is needed: runInteraction commits automatically
            # use a parameterized query rather than string formatting to avoid SQL injection
            select_sql_news = """select title from op_news where title=%s"""
            cursor.execute(select_sql_news, (item['title'],))
            data = cursor.fetchall()
            if not data:
                insert_sql = """insert into op_news(title, content, publish_date, url) values (%s, %s, %s, %s)"""
                try:
                    cursor.execute(insert_sql, (
                        item['title'], item['content'], item['publish_date'], item['url']))
                    print("news row inserted successfully")
                except Exception as e:
                    print(e)
            else:
                print(item['title'], ': already exists, skipping')

    def handle_error(self, failure):
        # errback: failure is a twisted Failure wrapping the original exception
        print(failure)
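
One thing the pipeline above never does is shut the pool down. A minimal sketch of a close_spider method for YangguangPipeline (Scrapy calls it automatically when the spider finishes); adbapi's ConnectionPool provides close() for exactly this:

    def close_spider(self, spider):
        # shut down twisted's connection pool so no idle MySQL
        # connections are left open after the crawl
        self.dbpool.close()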

settings.py configuration

ITEM_PIPELINES = {
   'yangguang.pipelines.YangguangPipeline': 300,  # enable the pipeline
}

# MySQL connection settings
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306  # an int, since it is passed straight to pymysql
MYSQL_USER = 'root'
MYSQL_PASS = '123456'
MYSQL_DB = 'open_source_intelligence'
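
For reference, a sketch that creates the op_news table the pipeline writes to. Only the column names (title, content, publish_date, url) come from the insert statement above; the id column, types, and lengths are assumptions to adjust to your data:

import pymysql

# assumed schema; column names match the pipeline's insert statement
DDL = """
CREATE TABLE IF NOT EXISTS op_news (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    content TEXT,
    publish_date VARCHAR(32),
    url VARCHAR(512)
) DEFAULT CHARSET=utf8
"""

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='123456', db='open_source_intelligence',
                       charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute(DDL)
    conn.commit()
finally:
    conn.close()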

gusu.py

import json
import scrapy
from yangguang.items import GuSuItem



class GusuSpider(scrapy.Spider):
    name = 'gusu'
    # allowed_domains = ['gusu.gov.cn']  # keep this commented out: the requests go to
    # www.suzhou.gov.cn and would be dropped by the offsite filter; alternatively pass
    # dont_filter=True on each request
    # start_urls = ['http://www.suzhou.gov.cn/consultfront/getGzjdlistFY/']

    def start_requests(self):
        url = 'http://www.suzhou.gov.cn/consultfront/getGzjdlistFY/'
        data = {
            'type': '12',
            'pagesize': '10',
            'keywords': '',
            'currpage': '2',
            'deptcode': '014152419',
            'check': 'do'
        }
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'
        }
        request = scrapy.FormRequest(url, formdata=data, headers=headers, callback=self.parse)
        yield request

    def parse(self, response):
        txt = response.text
        data = json.loads(txt)
        infolist = data['infolist']
        for info in infolist:
            item = GuSuItem()
            # build the detail-page url: take the last path segment,
            # strip its query string, and swap it for 'detail'
            page_id = info['consult_link'].split('/')[-1]
            text = page_id.split('?')[0]
            item['url'] = info['consult_link'].replace(text, 'detail')
            print(item)
            yield scrapy.Request(url=item['url'], callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        item = response.meta['item']  # the partially filled item passed along from parse
        # note: browsers insert <tbody> when rendering; if the raw HTML lacks it,
        # drop '/tbody' from the xpath
        table = response.xpath('//table[@class="tablecon"]/tbody')
        item['title'] = table.xpath('./tr[1]/td[2]/text()').extract_first()
        item['publish_date'] = table.xpath('./tr[2]/td[2]/text()').extract_first()
        item['content'] = table.xpath('./tr[3]/td[2]/text()').extract_first()
        print(item)
        yield item
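
start_requests above hardcodes 'currpage': '2', so only a single page is fetched. A sketch of how the same POST could walk several pages; MAX_PAGES is a hypothetical bound (in practice the total would come from the first response):

    def start_requests(self):
        MAX_PAGES = 5  # hypothetical upper bound, not taken from the original code
        url = 'http://www.suzhou.gov.cn/consultfront/getGzjdlistFY/'
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'
        }
        for page in range(1, MAX_PAGES + 1):
            data = {
                'type': '12',
                'pagesize': '10',
                'keywords': '',
                'currpage': str(page),  # the only field that changes per page
                'deptcode': '014152419',
                'check': 'do'
            }
            yield scrapy.FormRequest(url, formdata=data, headers=headers,
                                     callback=self.parse, dont_filter=True)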

items.py

import scrapy


class YangguangItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
    title = scrapy.Field()  # scrapy.Field() declares a dict-like field
    content = scrapy.Field()
    publish_date = scrapy.Field()
    picture = scrapy.Field()

class GuSuItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()  # scrapy.Field() declares a dict-like field
    content = scrapy.Field()
    publish_date = scrapy.Field()
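
As the comments note, a scrapy.Item behaves like a dict restricted to its declared Fields; a quick illustration with GuSuItem:

item = GuSuItem()
item['title'] = 'example'   # set declared fields with dict syntax
print(item['title'])        # read them back the same way
print(dict(item))           # an Item converts cleanly to a plain dict
# item['author'] = 'x' would raise KeyError: 'author' is not a declared Field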


posted @ 2021-04-26 17:18  Eliphaz