scrapy从入门到放弃 学习项目1
scrapy 爬取糗事百科段子
保存为json类型文件
import scrapy

from qiushibaike.items import QiushibaikeItem


class QiushibaikespiderSpider(scrapy.Spider):
    """Crawl the text section of qiushibaike.com and yield one item per joke."""

    name = 'qiushibaikespider'
    # BUG FIX: was ['qsbk'], which never matches www.qiushibaike.com, so the
    # offsite middleware silently dropped every follow-up page request and
    # the pagination logic in parse() was dead code.
    allowed_domains = ['qiushibaike.com']
    # BUG FIX: was range(40), which starts at page 0; the site's page URLs
    # are 1-based, so generate pages 1..40 instead.
    start_urls = ['https://www.qiushibaike.com/text/page/' + str(i) + '/'
                  for i in range(1, 41)]
    base_url = 'https://www.qiushibaike.com'

    def parse(self, response):
        """Extract (author, content) for every joke on the page, then follow
        the last pagination link ("next page") if one exists.

        :param response: scrapy Response for one listing page
        :yields: QiushibaikeItem per joke, plus a Request for the next page
        """
        for post in response.xpath('//div[@id="content-left"]/div'):
            # .get('') supplies a default so anonymous posts (no <h2>) no
            # longer raise AttributeError on .strip().
            author = post.xpath('.//h2/text()').get('').strip()
            fragments = post.xpath('.//div[@class="content"]//text()').getall()
            content = ''.join(fragments).strip()
            yield QiushibaikeItem(author=author, content=content)

        next_url = response.xpath(
            '//ul[@class="pagination"]/li[last()]/a/@href').get()
        if next_url:
            # urljoin handles both relative and absolute hrefs, unlike the
            # previous manual base_url + next_url concatenation.
            yield scrapy.Request(response.urljoin(next_url),
                                 callback=self.parse)
import scrapy


class QiushibaikeItem(scrapy.Item):
    """Container for a single scraped joke.

    Fields:
        author:  the joke author's display name
        content: the joke's full text
    """

    author = scrapy.Field()
    content = scrapy.Field()
import json


class QiushibaikePipeline(object):
    """Persist each scraped item to duanzi.json, one JSON object per line
    (JSON Lines format)."""

    def __init__(self):
        # BUG FIX: the file used to be opened here while open_spider() was an
        # empty stub. Opening in __init__ creates a filesystem side effect on
        # mere instantiation; Scrapy's pipeline lifecycle expects resources to
        # be acquired in open_spider and released in close_spider.
        self.fp = None

    def open_spider(self, spider):
        """Open the output file when the spider starts."""
        # Append mode preserved from the original: repeated runs accumulate
        # lines. NOTE(review): use 'w' if each run should start fresh.
        self.fp = open("duanzi.json", 'a', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize one item as a JSON line and return it unchanged.

        ensure_ascii=False keeps the Chinese text human-readable on disk.
        """
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(item_json + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.fp.close()
        print('爬虫结束了')