scrapy 爬取自己的博客
定义项目
# -*- coding: utf-8 -*-
# items.py
import scrapy


class LianxiCnblogsItem(scrapy.Item):
    """Container for one scraped blog post from cnblogs."""

    url = scrapy.Field()        # URL of the post page
    title = scrapy.Field()      # post title text
    article = scrapy.Field()    # post body HTML
    post_date = scrapy.Field()  # publication date text
定义爬虫
# -*- coding: utf-8 -*-
# spider/cnblogs_spider.py
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from lianxi_cnblogs.items import LianxiCnblogsItem


class MininovaSpider(CrawlSpider):
    """Crawl the author's cnblogs blog and yield one item per post."""

    name = 'cnblogs'
    allowed_domains = ['cnblogs.com']
    # Listing pages 1..11.  FIX: the original was missing the '%' string
    # formatting operator ("'...page=%s' i+1 ..."), a SyntaxError.
    start_urls = ['http://www.cnblogs.com/hhh5460/default.html?page=%s' % (i + 1)
                  for i in range(11)]
    # Follow links to individual posts (/p/<digits>.html); raw string so
    # \d and \. reach the regex engine unescaped.
    rules = [Rule(LinkExtractor(allow=[r'/p/\d+\.html']), 'parse_cnblogs')]

    def parse_cnblogs(self, response):
        """Extract one post page into a LianxiCnblogsItem.

        :param response: the downloaded post page
        :returns: populated LianxiCnblogsItem
        """
        res = LianxiCnblogsItem()
        res['url'] = response.url
        res['title'] = response.xpath("//h1/a/text()").extract()
        # FIX: XPath string literals must be quoted — @id=topics compares
        # @id against a child element named 'topics' and matches nothing.
        res['article'] = response.xpath("//div[@id='topics']").extract()
        res['post_date'] = response.xpath("//span[@id='post-date']/text()").extract()
        return res
运行爬虫
$ scrapy crawl cnblogs -o results.json