Build a Distributed Crawler in 21 Days - Crawling Qiushibaike with the Spider Class (Part 7)
7.1. Qiushibaike
Installation
pip install pypiwin32
pip install Twisted-18.7.0-cp36-cp36m-win_amd64.whl
(the wheel must match your Python version and platform; cp36/win_amd64 means CPython 3.6 on 64-bit Windows)
pip install scrapy
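To confirm everything installed correctly, a minimal sanity check (it just imports the two packages and prints their versions):

import scrapy
import twisted

# If both imports succeed and the versions print, the installation is usable
print(scrapy.__version__)
print(twisted.__version__)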
Creating and running the project
scrapy startproject qsbk                         # create the project
scrapy genspider qsbk_spider "qiushibaike.com"   # create the spider
scrapy crawl qsbk_spider                         # run the spider
Code
qsbk_spider.py
# -*- coding: utf-8 -*-
import scrapy
from qsbk.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/8hr/page/1/']
    base_domain = "https://www.qiushibaike.com"

    def parse(self, response):
        duanzidivs = response.xpath("//div[@id='content-left']/div")
        for duanzidiv in duanzidivs:
            author = duanzidiv.xpath(".//h2/text()").get().strip()
            content = duanzidiv.xpath(".//div[@class='content']//text()").getall()
            content = "".join(content).strip()
            item = QsbkItem(author=author, content=content)
            yield item
        # follow the link to the next page, if there is one
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if not next_url:
            return
        else:
            yield scrapy.Request(self.base_domain + next_url, callback=self.parse)
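parse() extracts the author and joke text from each div under #content-left, then keeps following the last pagination link until it disappears on the final page. To try those XPath expressions outside a running crawl, they can be exercised offline with parsel, the selector library Scrapy uses under the hood (a sketch; the HTML snippet here is made up for illustration):

from parsel import Selector

html = """
<div id="content-left">
  <div>
    <h2> some_author </h2>
    <div class="content"><span>line one </span><span>line two</span></div>
  </div>
</div>
"""

sel = Selector(text=html)
for div in sel.xpath("//div[@id='content-left']/div"):
    author = div.xpath(".//h2/text()").get().strip()
    content = "".join(div.xpath(".//div[@class='content']//text()").getall()).strip()
    print(author, content)   # some_author line one line two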
items.py
import scrapy


class QsbkItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()
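QsbkItem only declares the two fields the spider fills in. Items support dict-style access, which is what the pipeline relies on when serializing them (a quick illustration; the field values are made up):

from qsbk.items import QsbkItem

item = QsbkItem(author='some_author', content='some_content')
print(item['author'])   # some_author
print(dict(item))       # {'author': 'some_author', 'content': 'some_content'}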
pipelines.py
# -*- coding: utf-8 -*-
import json

# 1. Manually convert the dict to JSON format
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'w', encoding='utf-8')
#
#     def open_spider(self, spider):
#         print('spider started')
#
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item), ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#
#     def close_spider(self, spider):
#         self.fp.close()
#         print('spider finished')

# 2. Use JsonItemExporter, suitable when the amount of data is small
# from scrapy.exporters import JsonItemExporter
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'wb')
#         self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
#         self.exporter.start_exporting()
#
#     def open_spider(self, spider):
#         print('spider started')
#
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#
#     def close_spider(self, spider):
#         self.exporter.finish_exporting()
#         self.fp.close()
#         print('spider finished')

# 3. JsonLinesItemExporter, suitable when the amount of data is large
from scrapy.exporters import JsonLinesItemExporter


class QsbkPipeline(object):
    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('spider started')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('spider finished')
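The practical difference between options 2 and 3: JsonItemExporter holds every item in memory and writes a single JSON array when finish_exporting() is called, while JsonLinesItemExporter writes each item to its own line as it arrives, which is why it suits large crawls. A standalone sketch contrasting the two outputs (run outside Scrapy; the sample items are made up):

from io import BytesIO
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter

items = [{'author': 'a', 'content': 'x'}, {'author': 'b', 'content': 'y'}]

# JsonItemExporter: everything buffered, one JSON array at the end
buf = BytesIO()
exporter = JsonItemExporter(buf, ensure_ascii=False)
exporter.start_exporting()
for i in items:
    exporter.export_item(i)
exporter.finish_exporting()
print(buf.getvalue().decode())   # [{"author": "a", ...},{"author": "b", ...}]

# JsonLinesItemExporter: one JSON object per line, written immediately
buf = BytesIO()
exporter = JsonLinesItemExporter(buf, ensure_ascii=False)
for i in items:
    exporter.export_item(i)
print(buf.getvalue().decode())   # {"author": "a", ...} then {"author": "b", ...}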
settings.py
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}
ITEM_PIPELINES = {
'qsbk.pipelines.QsbkPipeline': 300,
}
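The 300 is the pipeline's order: ITEM_PIPELINES values range from 0 to 1000, and items flow through the registered pipelines in ascending order. With a single pipeline the number is arbitrary; it only matters once several are registered (the second entry below is hypothetical):

ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,          # lower number, runs first
    # 'qsbk.pipelines.SomeOtherPipeline': 400,   # hypothetical, would run second
}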
start.py
from scrapy import cmdline

cmdline.execute("scrapy crawl qsbk_spider".split())
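cmdline.execute() behaves exactly like typing the command in a terminal, so start.py can be launched straight from an IDE. An equivalent in-process alternative, sketched below, uses CrawlerProcess and picks up settings.py through get_project_settings():

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('qsbk_spider')   # spider name, as defined in qsbk_spider.py
process.start()                # blocks until the crawl finishes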
posted on 2018-08-05 00:15 by zhang_derek