scrapy从入门到放弃 学习项目1

scrapy 爬取糗事百科段子

保存为json类型文件

import scrapy
from qiushibaike.items import QiushibaikeItem


class QiushibaikespiderSpider(scrapy.Spider):
    """Crawl text jokes from qiushibaike.com and yield one QiushibaikeItem per post.

    Starts at /text/page/1/ and follows the "next page" link found in the
    pagination bar until no further link exists.
    """
    name = 'qiushibaikespider'
    # Must be the site's registrable domain: OffsiteMiddleware compares request
    # hosts against this list, and the original value 'qsbk' caused every
    # follow-up page request to be dropped as off-site.
    allowed_domains = ['qiushibaike.com']
    # Start from page 1 only; parse() walks the remaining pages via the
    # pagination link (page 0 does not exist, and pre-generating 40 URLs was
    # redundant with the next-page follow below).
    start_urls = ['https://www.qiushibaike.com/text/page/1/']
    # Kept for backward compatibility with any external reference.
    base_url = 'https://www.qiushibaike.com'

    def parse(self, response):
        """Extract author/content pairs from one listing page, then follow pagination.

        :param response: the listing-page response to scrape
        :yields: QiushibaikeItem objects, then a Request for the next page (if any)
        """
        for post in response.xpath('//div[@id="content-left"]/div'):
            author = post.xpath('.//h2/text()').get()
            if author is None:
                # Ad/placeholder blocks have no <h2>; the original .get().strip()
                # raised AttributeError here.
                continue
            # The content div may contain nested tags (<br>, <span>); join all
            # text fragments into one string.
            fragments = post.xpath('.//div[@class="content"]//text()').getall()
            content = ''.join(fragments).strip()
            yield QiushibaikeItem(author=author.strip(), content=content)

        # The last <li> in the pagination bar holds the "next page" link; it is
        # absent on the final page.
        next_url = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').get()
        if next_url:
            # urljoin handles relative hrefs robustly (vs. manual concatenation).
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
qiushibaikespider.py
import scrapy


class QiushibaikeItem(scrapy.Item):
    """Container for a single scraped joke post."""
    author = scrapy.Field()   # poster's display name
    content = scrapy.Field()  # full text of the joke
items.py
import json

class QiushibaikePipeline(object):
    """Persist scraped items to ``duanzi.json`` as JSON Lines (one object per line)."""

    def __init__(self):
        # The file handle is acquired lazily in open_spider() — Scrapy's
        # documented lifecycle hook — so merely constructing the pipeline
        # (e.g. during settings validation) does not touch the filesystem.
        self.fp = None

    def open_spider(self, spider):
        """Open the output file when the crawl actually starts.

        Append mode preserves the original behavior of accumulating results
        across runs; utf-8 keeps Chinese text stored unescaped.
        """
        self.fp = open("duanzi.json", 'a', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize one item as a JSON line and pass it on unchanged.

        :param item: the scraped item (any mapping with author/content)
        :returns: the same item, for downstream pipelines
        """
        # ensure_ascii=False writes Chinese characters readably instead of \uXXXX escapes.
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(item_json + '\n')
        return item

    def close_spider(self, spider):
        """Flush and close the output file when the crawl ends."""
        self.fp.close()
        print('爬虫结束了')
pipelines.py

 

posted @ 2019-07-09 19:52  爱学习的红领巾  阅读(214)  评论(0)  编辑  收藏  举报