# Scrapy learning project 2 ("Scrapy from getting started to giving up")
# CrawlSpider crawl of tutorial posts from the WeChat mini-program community site
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wxapp.items import WxappItem


class WxspiderSpider(CrawlSpider):
    """Crawl tutorial articles from the WeChat mini-program community site."""

    name = 'wxspider'
    allowed_domains = ['www.wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # List (pagination) pages: keep following, nothing to parse here.
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        # Article detail pages: parse each one, do not follow further links.
        Rule(LinkExtractor(allow=r'.+article-.+\.html'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract title, author, date and body text from one article page."""
        byline = response.xpath('//p[@class="authors"]')
        body_parts = response.xpath('//td[@id="article_content"]//text()').getall()
        yield WxappItem(
            title=response.xpath('//h1[@class="ph"]/text()').get(),
            author=byline.xpath('./a/text()').get(),
            date=byline.xpath('./span/text()').get(),
            article=''.join(body_parts).strip(),
        )
from scrapy.exporters import JsonLinesItemExporter


class WxappPipeline(object):
    """Write scraped items to ``wxapp.json``, one JSON object per line."""

    def __init__(self):
        # Defer opening the output file until the spider actually starts
        # (see open_spider). Opening it here would create the file and hold
        # an open handle at instantiation time, even if the pipeline is
        # never used or crawling fails before any item arrives.
        self.fp = None
        self.exporter = None

    def open_spider(self, spider):
        """Open the output file and set up the exporter (called by Scrapy)."""
        self.fp = open('wxapp.json', 'wb')
        # ensure_ascii=False keeps non-ASCII (Chinese) text readable in the
        # output instead of \uXXXX escapes.
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize one item as a JSON line and pass it on unchanged."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        if self.fp is not None:
            self.fp.close()
import scrapy


class WxappItem(scrapy.Item):
    """Container for one scraped tutorial article."""

    title = scrapy.Field()    # article headline
    author = scrapy.Field()   # author name
    date = scrapy.Field()     # publication date string as shown on the page
    article = scrapy.Field()  # full article body text