Crawlers: CrawlSpider
A CrawlSpider-based crawler for 阳光投诉网 (the sun0769 complaints site)
Site-wide data crawling with CrawlSpider
- CrawlSpider is another flavor of spider class: it is a subclass of Spider.
- Create a CrawlSpider-based spider file (a sketch of the generated template follows the command below):
    - scrapy genspider -t crawl spiderName www.xxx.com
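For reference, the file generated by the command above looks roughly like this (Scrapy's built-in crawl template; the exact boilerplate varies slightly across versions):

```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SpidernameSpider(CrawlSpider):
    name = 'spiderName'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    # one Rule per link pattern to follow and/or parse
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item
```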
sun.py
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem, SunProDetail


# First version: list pages only
# class SunSpider(CrawlSpider):
#     name = 'sun'
#     # allowed_domains = ['www.xxx.com']
#     start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
#
#     # Link extractor: extracts links matching the given rule (allow: a regex)
#     link = LinkExtractor(allow=r'type=4&page=\d+')
#
#     rules = (
#         # Rule: parses the page source behind each extracted link
#         # with the given callback
#         Rule(link, callback='parse_item', follow=True),
#         # follow=True: keep applying the link extractor to the pages
#         # behind the links it has already extracted
#     )
#
#     def parse_item(self, response):
#         print(response)


# Depth crawling: follow the list pages into each complaint's detail page
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Link extractor: extracts links matching the given rule (allow: a regex)
    link = LinkExtractor(allow=r'type=4&page=\d+')
    # A second link extractor for the detail-page links
    link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')

    rules = (
        # Rule: parses the page source behind each extracted link
        # with the given callback.
        # follow=True would keep applying the link extractor to the pages
        # behind the links it extracts; False stops at the first batch.
        Rule(link, callback='parse_item', follow=False),
        Rule(link_detail, callback='parse_detail'),
    )

    # Parse the complaint title and number from each list page
    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            num = tr.xpath('./td[1]/text()').extract_first()
            item = SunproItem()
            item['title'] = title
            item['num'] = num
            yield item

    # Parse the complaint body and number from each detail page
    def parse_detail(self, response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td/div[2]/text()').extract_first()
        num = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        num = num.split(':')[-1]
        item = SunProDetail()
        item['content'] = content
        item['num'] = num
        yield item
```
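For items to actually reach the pipeline below, the pipeline must be enabled in settings.py. A minimal excerpt, assuming the project is named sunPro as in the imports above; tutorials for sites like this one typically also disable robots.txt checking, shown here as an assumption:

```python
# settings.py (excerpt)
ROBOTSTXT_OBEY = False  # assumption: typical for tutorial crawls of this site

ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,  # lower number = higher priority
}
```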
items.py
```python
import scrapy


class SunproItem(scrapy.Item):
    # fields scraped from the list pages
    title = scrapy.Field()
    num = scrapy.Field()


class SunProDetail(scrapy.Item):
    # fields scraped from the detail pages
    content = scrapy.Field()
    num = scrapy.Field()
```
pipelines.py
```python
class SunproPipeline(object):
    def process_item(self, item, spider):
        # Both item types pass through the same pipeline,
        # so distinguish them by class name
        if item.__class__.__name__ == 'SunProDetail':
            content = item['content']
            num = item['num']
        else:
            title = item['title']
            num = item['num']
        # persistence (file or database write) would go here
        return item
```
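The list pages and detail pages arrive as separate item streams, so the shared num field is the key that ties a title to its content. A minimal sketch of one way to join them in a pipeline; SunproMergePipeline and its in-memory dicts are illustrative assumptions, not part of the original project:

```python
# Sketch only: buffers everything in memory, which is fine for a small
# crawl but not a large one. All names below are hypothetical.
class SunproMergePipeline(object):
    def open_spider(self, spider):
        self.titles = {}    # num -> title, from list pages
        self.contents = {}  # num -> content, from detail pages

    def process_item(self, item, spider):
        # normalize the key, since the two pages may format num differently
        num = item['num'].strip() if item['num'] else item['num']
        if item.__class__.__name__ == 'SunProDetail':
            self.contents[num] = item['content']
        else:
            self.titles[num] = item['title']
        return item

    def close_spider(self, spider):
        # pair each title with its content via the shared num key
        for num, title in self.titles.items():
            print(num, title, self.contents.get(num))
```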