python+scrapy爬取知乎日报全站文章
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class ZhihudailyItem(scrapy.Item):
    """Container for one Zhihu Daily article scraped by the spider."""
    date = scrapy.Field()     # publication date of the article
    title = scrapy.Field()    # article headline
    url = scrapy.Field()      # absolute URL of the article page
    content = scrapy.Field()  # body text of the article
#!/usr/bin/python #coding:utf-8 import scrapy class ZhihudailySpider(scrapy.spider.Spider): name='zhihudaily' allowd_domains=['zhihu.com'] start_urls=[ "http://zhihudaily.ahorn.me/page/1"] def parse(self,response): for sel in response.xpath("//div[@class='post']"): for sub in sel.xpath("./div/div"): url=sub.xpath("./a/@href").extract()[0] yield scrapy.Request(url,callback=self.parse_url) for page in range(2,500): request=scrapy.Request("http://zhihudaily.ahorn.me/page/"+str(page),callback=self.parse) yield request def parse_url(self,response): title=response.xpath("//h1[@class='headline-title']/text()").extract()[0] print "标题:",title print "*************************************************************************" for p in response.xpath("//div[@class='content']/p/text()").extract(): print p