Scrapy CrawlSpider: crawling WeChat mini-program articles and saving the data to the database asynchronously

spider:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wxapp.items import WxAppItem


class WxSpider(CrawlSpider):
    name = 'wx'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        Rule(LinkExtractor(allow=r'.*mod=list&catid=2&page=\d+'), follow=True),
        Rule(LinkExtractor(allow=r'.*article-.+\.html'), callback='parse_detail', follow=False),
    )

    def parse_detail(self, response):
        # URL of the article detail page
        article_url = response.request.url
        title = response.xpath('//h1[@class="ph"]/text()').get()
        content = response.xpath('//td[@id="article_content"]//text()').getall()
        content = ''.join(c.strip() for c in content).strip()
        pub_time = response.xpath('//p[@class="authors"]/span/text()').get()
        author = response.xpath('//p[@class="authors"]/a/text()').get()
        # field names match WxAppItem and the columns used by the pipeline
        item = WxAppItem(title=title, content=content, article_url=article_url,
                         pub_time=pub_time, author=author)
        yield item
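With the spider defined above, the crawl is started from the project root (assuming a standard Scrapy project layout, e.g. created with scrapy startproject wxapp):

scrapy crawl wx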

items:

import scrapy


class WxAppItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    pub_time = scrapy.Field()
    content = scrapy.Field()
    summary = scrapy.Field()
    article_url = scrapy.Field()
    read_count = scrapy.Field()
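For these items to reach the pipeline below, it has to be enabled in settings.py. A minimal sketch, assuming the pipeline class sits in wxapp/pipelines.py (the default Scrapy layout):

# settings.py (sketch)
ITEM_PIPELINES = {
    'wxapp.pipelines.WxAppPipeline': 300,  # the number is the pipeline's priority
}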

pipeline:

import pymysql
from pymysql import cursors
from twisted.enterprise import adbapi


class WxAppPipeline(object):
    def __init__(self):
        db_params = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'wxapp',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor  # return rows as dicts
        }
        # create a Twisted adbapi connection pool backed by pymysql
        self.db_pool = adbapi.ConnectionPool('pymysql', **db_params)
        self._sql = None

    def process_item(self, item, spider):
        defer = self.db_pool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        # runs in a worker thread of the connection pool;
        # summary and read_count are not set by the spider, so fall back to NULL
        cursor.execute(self.sql, (item['title'], item['content'], item.get('summary'),
                                  item.get('read_count'), item['pub_time'], item['article_url']))

    def handle_error(self, error, item, spider):
        print('=' * 10 + 'error' + '=' * 10)
        print(error)

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                INSERT INTO article(id, title, content, summary, read_count, pub_time, article_url)
                VALUES (null, %s, %s, %s, %s, %s, %s)
                """
        return self._sql
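insert_item is dispatched by runInteraction to one of the connection pool's worker threads, so the MySQL writes do not block the Twisted reactor that drives the crawl. The INSERT above assumes an article table roughly like the following; the column types are an assumption, not part of the original post:

-- assumed schema matching the INSERT statement; adjust types and sizes as needed
CREATE TABLE article (
    id INT PRIMARY KEY AUTO_INCREMENT,
    title VARCHAR(255),
    content TEXT,
    summary TEXT NULL,
    read_count INT NULL,
    pub_time VARCHAR(50),
    article_url VARCHAR(255)
) DEFAULT CHARSET=utf8;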

 
