scrapy crawl 爬取微信小程序文章

 

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wxapp.items import WxappItem


class WxSpider(CrawlSpider):
    """Crawl wxapp-union.com list pages and scrape article detail pages."""

    name = 'wx'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # Paginated list pages: follow them to discover more links, no callback.
        Rule(LinkExtractor(allow=r'.*mod=list&catid=2&page=\d+'), follow=True),
        # Article detail pages: parse them, do not follow further links from them.
        Rule(LinkExtractor(allow=r'.*article-.+\.html'), callback='parse_detail', follow=False),
    )

    def parse_detail(self, response):
        """Extract title, body text, publish time and author from one article page.

        Yields a single WxappItem per detail page.
        """
        fragments = response.xpath('//td[@id="article_content"]//text()').getall()
        # Strip each text fragment, then concatenate into one body string.
        body = ''.join(fragment.strip() for fragment in fragments).strip()
        yield WxappItem(
            title=response.xpath('//h1[@class="ph"]/text()').get(),
            content=body,
            detail_href=response.request.url,
            pub_time=response.xpath('//p[@class="authors"]/span/text()').get(),
            author=response.xpath('//p[@class="authors"]/a/text()').get(),
        )

 

from scrapy.exporters import JsonLinesItemExporter, JsonItemExporter


class WxappPipeline(object):
    """Item pipeline that serializes every scraped item into data.json
    as a single JSON array via JsonItemExporter."""

    def __init__(self):
        """
        Runs when the pipeline is instantiated (spider startup).
        """
        # Binary mode: scrapy exporters write already-encoded bytes.
        self.fp = open("data.json", 'wb')
        self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        # BUG FIX: JsonItemExporter requires start_exporting() before the first
        # export_item(); without it (paired with finish_exporting() below) the
        # output file lacks the enclosing JSON array brackets and is invalid JSON.
        self.exporter.start_exporting()

    def open_spider(self, spider):
        """
        Called when the spider opens; nothing to do here since the
        file is opened in __init__.
        :param spider: the running spider instance
        :return: None
        """
        pass

    def process_item(self, item, spider):
        """
        Append one item to the JSON array and hand it to the next pipeline.
        :param item: scraped WxappItem
        :param spider: the running spider instance
        :return: the item, unchanged
        """
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """
        Called when the spider closes: finalize the JSON output and close the file.
        :param spider: the running spider instance
        :return: None
        """
        # BUG FIX: finish_exporting() writes the closing bracket of the JSON
        # array; closing the file without it leaves truncated, unparsable JSON.
        self.exporter.finish_exporting()
        self.fp.close()

 

import scrapy


class WxappItem(scrapy.Item):
    """Container for one scraped wxapp-union article."""

    title = scrapy.Field()        # article headline (from the <h1 class="ph"> node)
    content = scrapy.Field()      # concatenated article body text
    pub_time = scrapy.Field()     # publication time string as shown on the page
    author = scrapy.Field()       # author display name
    detail_href = scrapy.Field()  # URL of the article detail page

 

posted @ 2019-01-30 16:29  20180616  阅读(268)  评论(0编辑  收藏  举报