Scrapy spider case study: the 问政平台 Q&A platform
The Q&A platform (问政平台)
http://wz.sun0769.com/index.php/question/questionType?type=4
We will scrape each complaint post's number, URL, title, and body text.
items.py
import scrapy


class DongguanItem(scrapy.Item):
    # Title of the post
    title = scrapy.Field()
    # Number of the post
    number = scrapy.Field()
    # Text content of the post
    content = scrapy.Field()
    # URL of the post
    url = scrapy.Field()
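A scrapy.Item behaves like a dict, which is what the pipeline further down relies on when it calls dict(item). A quick sanity check in an interactive session (the field values here are invented for illustration, and dict ordering may vary):

>>> from scrapyDemo.items import DongguanItem
>>> item = DongguanItem(title='example title', number='12345')
>>> item['url'] = 'http://wz.sun0769.com/...'  # fields assign like dict keys
>>> dict(item)
{'title': 'example title', 'number': '12345', 'url': 'http://wz.sun0769.com/...'}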
spiders/sunwz.py
# -*- coding: utf-8 -*-
import scrapy

from scrapyDemo.items import DongguanItem


# No crawl rules are used here, so the plain Spider base class is the right one
# (overriding parse on a CrawlSpider is unsafe, since CrawlSpider uses parse internally)
class SunSpider(scrapy.Spider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page='
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        # Extract the list of post links on the current page
        links = response.xpath("//div[@class='greyframe']/table//td/a[@class='news14']/@href").extract()
        # Send a request for each post; parse_item handles each response
        for link in links:
            yield scrapy.Request(link, callback=self.parse_item)
        # Page through the listing until the last page, handling each new page with parse again
        if self.offset <= 3876:
            self.offset += 30
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    # Extract the fields of a single post
    def parse_item(self, response):
        item = DongguanItem()
        # Title
        item['title'] = response.xpath('//span[contains(@class, "niae2_top")]/text()').extract()[0]
        # Number
        item['number'] = response.xpath('//div[contains(@class, "wzy1")]//td//span/text()').extract()[1]
        # Body text: first try the XPath that matches posts containing images
        content = response.xpath('//td[@class="txt16_3"]/text()').extract()
        # If that yields nothing, fall back to the XPath for posts without images
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        # content is a list; join it into one string and strip surrounding whitespace
        item['content'] = "".join(content).strip()
        # Link
        item['url'] = response.url
        yield item
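Before launching the full crawl, the XPath expressions are worth checking interactively in the Scrapy shell; the selectors above are a snapshot of the site's markup at the time of writing and will silently return empty lists if the layout has changed:

$ scrapy shell "http://wz.sun0769.com/index.php/question/questionType?type=4&page=0"
>>> # the first few post links on the listing page
>>> response.xpath("//div[@class='greyframe']/table//td/a[@class='news14']/@href").extract()[:3]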
pipelines.py
# -*- coding: utf-8 -*-
# codecs opens files with an explicit text encoding
import codecs
import json


class DongguanPipeline(object):
    def __init__(self):
        # Open a write-only file with utf-8 encoding
        self.filename = codecs.open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(content)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider automatically when the spider finishes
        self.filename.close()
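On Python 3, codecs.open is unnecessary, and the file handling can move into Scrapy's open_spider hook instead of __init__. A minimal sketch of the same pipeline under those assumptions:

import json


class DongguanPipeline(object):
    def open_spider(self, spider):
        # Called once when the spider starts
        self.file = open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes
        self.file.close()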
settings.py
ITEM_PIPELINES = {
    # The module path must match the project package (scrapyDemo here,
    # the same package the spider imports DongguanItem from)
    'scrapyDemo.pipelines.DongguanPipeline': 300,
}

# Log file name and log level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
Create a main.py file in the project root for debugging:
from scrapy import cmdline

# 'sun' must match the spider's name attribute
cmdline.execute('scrapy crawl sun'.split())
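An alternative to cmdline.execute is Scrapy's CrawlerProcess API, which runs the crawl in-process; a minimal sketch, assuming main.py sits in the project root so get_project_settings can find settings.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapyDemo.spiders.sunwz import SunSpider

process = CrawlerProcess(get_project_settings())
process.crawl(SunSpider)
process.start()  # blocks until the crawl finishes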
Run the program:
python2 main.py
Result: the scraped posts are written to sunwz.json, one JSON object per line.