scrapy从入门到放弃 学习项目2

CrawlSpider微信小程序社区教程贴爬取

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wxapp.items import WxappItem

class WxspiderSpider(CrawlSpider):
    """Crawl tutorial articles from the wxapp-union.com community portal.

    Pagination links are followed without a callback; individual article
    pages are handed to :meth:`parse_item` for field extraction.
    """
    name = 'wxspider'
    allowed_domains = ['www.wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # List pages: keep following so every page of the catalogue is visited.
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        # Article pages: extract fields, do not follow further links from them.
        Rule(LinkExtractor(allow=r'.+article-.+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract title, author, date and full body text from one article page."""
        headline = response.xpath('//h1[@class="ph"]/text()').get()
        author_block = response.xpath('//p[@class="authors"]')
        body_fragments = response.xpath('//td[@id="article_content"]//text()').getall()
        yield WxappItem(
            title=headline,
            author=author_block.xpath('./a/text()').get(),
            date=author_block.xpath('./span/text()').get(),
            article=''.join(body_fragments).strip(),
        )
wxspider.py
from scrapy.exporters import JsonLinesItemExporter

class WxappPipeline(object):
    """Persist every scraped item to ``wxapp.json`` as one JSON object per line."""

    def __init__(self):
        # Binary mode is required: JsonLinesItemExporter writes encoded bytes.
        self._file = open('wxapp.json', 'wb')
        self._exporter = JsonLinesItemExporter(
            self._file, ensure_ascii=False, encoding='utf-8'
        )
        # Keep original attribute names as aliases for backward compatibility.
        self.fp = self._file
        self.exporter = self._exporter

    def process_item(self, item, spider):
        """Serialize the item to the output file and pass it along unchanged."""
        self._exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Close the output file once the spider has finished."""
        self._file.close()
pipelines.py
import scrapy


class WxappItem(scrapy.Item):
    """Container for one scraped wxapp-union tutorial article."""
    title = scrapy.Field()    # article headline text
    author = scrapy.Field()   # author display name
    date = scrapy.Field()     # publication date string as shown on the page
    article = scrapy.Field()  # concatenated body text of the article
items.py

 

posted @ 2019-07-09 21:06  爱学习的红领巾  阅读(156)  评论(0编辑  收藏  举报