爬虫框架Scrapy之案例一
阳光热线问政平台
http://wz.sun0769.com/index.php/question/questionType?type=4
爬取投诉帖子的编号、帖子的 URL、帖子的标题,以及帖子里的内容。
items.py
import scrapy
class SunwzItem(scrapy.Item):
    """Container for the data scraped from one complaint post."""

    # Post number, taken from the tail of the page title by the spider.
    number = scrapy.Field()
    # Address of the post's detail page.
    url = scrapy.Field()
    # Title text of the post.
    title = scrapy.Field()
    # Body text of the post.
    content = scrapy.Field()
spiders/sunwz.py
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from Sunwz.items import SunwzItem
class SunwzSpider(CrawlSpider):
    """Crawl the wz.sun0769.com complaint listing and emit one item per post.

    Listing pages are followed through the first two rules; every post
    detail page matched by the third rule is handed to ``parse_content``.
    """

    name = 'sunwz'
    # Not read anywhere in this class; kept so existing references still work.
    num = 0
    # Scrapy's offsite filtering reads ``allowed_domains`` (the original
    # ``allow_domain`` was never consulted) and expects bare domain names,
    # not URLs.
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4']

    # A tuple keeps the rules in a fixed order: when several rules match the
    # same link Scrapy uses the first one, and a set literal has no
    # guaranteed order.
    rules = (
        # Pagination links of the listing.
        Rule(LinkExtractor(allow='page')),
        # The listing entry page itself.
        Rule(LinkExtractor(allow=r'/index\.php/question/questionType\?type=4$')),
        # Post detail pages: scrape them and keep following their links.
        Rule(LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml$'),
             follow=True, callback='parse_content'),
    )

    # XPath expressions used by ``parse_content``. 'content_first' is tried
    # before 'content'.
    xpathDict = {
        'title': '//div[contains(@class, "pagecenter p3")]/div/div/div[contains(@class,"cleft")]/strong/text()',
        'content': '//div[contains(@class, "c1 text14_2")]/text()',
        'content_first': '//div[contains(@class, "contentext")]/text()'
    }

    def _first_match(self, response, key):
        """Return the first string selected by ``xpathDict[key]``, or None."""
        matches = response.xpath(self.xpathDict[key]).extract()
        return matches[0] if matches else None

    def parse_content(self, response):
        """Build a ``SunwzItem`` from a post detail page.

        Yields a single item holding the page url, title, post number and
        content. Pages where the title or the content cannot be located are
        skipped with a warning instead of raising ``IndexError``.
        """
        content = self._first_match(response, 'content_first')
        if content is None:
            # Fall back to the alternative content container.
            content = self._first_match(response, 'content')
        title = self._first_match(response, 'title')

        if title is None or content is None:
            self.logger.warning(
                'Skipping %s: title or content not found', response.url)
            return

        # The number is whatever follows the last ':' in the last
        # space-separated token of the title.
        number = title.split(' ')[-1].split(':')[-1]

        item = SunwzItem()
        item['url'] = response.url
        item['number'] = number
        item['title'] = title
        item['content'] = content
        yield item
pipelines.py
import json
import codecs
class JsonWriterPipeline(object):
    """Item pipeline that writes every item to ``sunwz.json``, one JSON object per line."""

    def __init__(self):
        # Opened in 'w' mode, so each run starts with an empty file.
        # codecs.open encodes the written text as UTF-8.
        self.file = codecs.open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize ``item`` as one JSON line and return it unchanged.

        ``ensure_ascii=False`` keeps non-ASCII text readable in the output
        instead of emitting ``\\uXXXX`` escapes.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        ``close_spider`` is the hook Scrapy invokes on item pipelines when
        the spider finishes; the original ``spider_closed`` name is not
        called automatically, which left the file unclosed.
        """
        self.file.close()

    def spider_closed(self, spider):
        """Backward-compatible alias for ``close_spider``."""
        self.close_spider(spider)
settings.py
# Enable the JSON-lines writer pipeline. The integer is the pipeline's order
# value as defined by Scrapy's ITEM_PIPELINES setting.
ITEM_PIPELINES = {'Sunwz.pipelines.JsonWriterPipeline': 300}
在项目根目录下新建main.py文件,用于调试
from scrapy import cmdline
# Same as running `scrapy crawl sunwz` from a shell, but started from Python
# so the crawl can be run under a debugger.
cmdline.execute(['scrapy', 'crawl', 'sunwz'])
执行程序
py2 main.py
Sic Parvis Magna