Scrapy_糗事百科
1. Go to the desktop directory
cd C:\Users\Mr_wa\Desktop
2. Create a new project
scrapy startproject qsbk
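For reference, startproject generates the standard Scrapy project layout (these are the files Scrapy creates; nothing has been edited yet):

qsbk/
    scrapy.cfg          # deploy configuration
    qsbk/
        __init__.py
        items.py        # item definitions (edited in step 9)
        middlewares.py
        pipelines.py    # item pipelines (edited in step 9)
        settings.py     # project settings (edited in step 4)
        spiders/
            __init__.py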
3. Create a new spider (this writes qsbk/spiders/qsbk_spider.py)
cd qsbk
scrapy genspider qsbk_spider qiushibaike.com
4. Modify settings.py
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
5. Write qsbk_spider.py (set start_urls)
import scrapy


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        pass
6. Create a launcher file start.py in the project root
from scrapy import cmdline

cmdline.execute(['scrapy', 'crawl', 'qsbk_spider'])
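Running start.py (for example from the IDE) is equivalent to running the crawl command from the project root:

scrapy crawl qsbk_spider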
7. Test the spider (check the type of response)
import scrapy


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        print("===================")
        print(type(response))
        print("===================")
Output when run via start.py:
===================
<class 'scrapy.http.response.html.HtmlResponse'>
===================
8. Parse the data with XPath
Extract the author and the text of each joke.
import scrapy


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        # content_list : SelectorList
        content_list = response.xpath("//div[@class='col1 old-style-col1']/div")
        # content : Selector
        for content in content_list:
            author = content.xpath(".//h2/text()").get().strip()
            text = content.xpath(".//div[@class='content']//text()").getall()
            text = "".join(text).strip()
            duanzi = {'author': author, 'content': text}
            print(duanzi)
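Before hard-coding XPath expressions into parse(), they can be checked interactively in the Scrapy shell; a quick session using the same selectors as above (output depends on the live page, which may have changed since these notes were written):

scrapy shell "https://www.qiushibaike.com/text/page/1/"
>>> divs = response.xpath("//div[@class='col1 old-style-col1']/div")
>>> divs[0].xpath(".//h2/text()").get()
>>> divs[0].xpath(".//div[@class='content']//text()").getall()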
9. Store the data
Define the fields in items.py instead of using the plain dict above (duanzi = {'author': author, 'content': text}).
import scrapy


class QsbkItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()  # must match QsbkItem(author=..., content=...) in the spider
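A QsbkItem behaves like a dict, which is why dict(item) works in the pipeline below; a tiny illustration (the values are made up, run with the project on the path):

from qsbk.items import QsbkItem

item = QsbkItem(author='someone', content='a short joke')
print(item['author'])   # someone
print(dict(item))       # {'author': 'someone', 'content': 'a short joke'}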
qsbk_spider.py
import scrapy
from ..items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        # content_list : SelectorList
        content_list = response.xpath("//div[@class='col1 old-style-col1']/div")
        # content : Selector
        for content in content_list:
            author = content.xpath(".//h2/text()").get().strip()
            text = content.xpath(".//div[@class='content']//text()").getall()
            text = "".join(text).strip()
            item = QsbkItem(author=author, content=text)
            yield item
Uncomment ITEM_PIPELINES in settings.py
ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,
}
Save the data in pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json


class QsbkPipeline:
    def process_item(self, item, spider):
        # append one JSON object per line so individual records stay separable
        with open('qsbk.json', 'a', encoding='utf-8') as f:
            f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item
The items are appended to qsbk.json, one JSON object per line.
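Reopening the file for every item works but is wasteful. A common alternative (a sketch using Scrapy's standard open_spider/close_spider pipeline hooks, not part of the original notes) opens the file once per crawl:

import json


class QsbkPipeline:
    def open_spider(self, spider):
        # called once when the crawl starts
        self.file = open('qsbk.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        # called once when the crawl finishes
        self.file.close()

For simple exports, the built-in feed exporter can replace the pipeline entirely: scrapy crawl qsbk_spider -o qsbk.json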
Crawl multiple pages and save in CSV format
qsbk_spider.py
import scrapy
from ..items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        # content_list : SelectorList
        content_list = response.xpath("//div[@class='col1 old-style-col1']/div")
        # content : Selector
        for content in content_list:
            author = content.xpath(".//h2/text()").get().strip()
            text = content.xpath(".//div[@class='content']//text()").getall()
            text = "".join(text).strip()
            item = QsbkItem(author=author, content=text)
            yield item

        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if next_url:
            next_url = 'https://www.qiushibaike.com' + next_url
            yield scrapy.Request(next_url, callback=self.parse)
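Concatenating the domain by hand works here; a slightly more robust variant (a sketch, using Scrapy's response.urljoin to resolve the link against the current page URL) would end parse() like this:

        # inside parse(), replacing the last three lines above
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if next_url:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)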
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import csv


class QsbkPipeline:
    def process_item(self, item, spider):
        with open('qsbk.csv', 'a', encoding='utf-8', newline="") as f:
            writer = csv.writer(f)
            writer.writerow([item["author"], item["content"]])
        return item
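As with the JSON version, the CSV file can be opened once per crawl, which also gives a natural place to write a header row; a sketch using the same open_spider/close_spider hooks (the header labels are just illustrative):

import csv


class QsbkPipeline:
    def open_spider(self, spider):
        self.file = open('qsbk.csv', 'w', encoding='utf-8', newline='')
        self.writer = csv.writer(self.file)
        self.writer.writerow(['author', 'content'])  # header row

    def process_item(self, item, spider):
        self.writer.writerow([item['author'], item['content']])
        return item

    def close_spider(self, spider):
        self.file.close()

Alternatively, scrapy crawl qsbk_spider -o qsbk.csv uses the built-in CSV feed exporter with no custom pipeline at all.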