Comparing the Spider and CrawlSpider classes in Scrapy
When creating a spider:
scrapy genspider exampleSpider example.com            creates a spider class based on the scrapy.Spider template
scrapy genspider -t crawl exampleSpider example.com   creates a spider class based on the scrapy.spiders.CrawlSpider template
When using them:
With Spider, the responses for the URLs in start_urls are parsed by the parse method you write. To crawl deeper, you have to extract the follow-up URLs from each response yourself with XPath or CSS selectors and send a scrapy.Request for every one of them; the resulting response is then parsed by the callback function specified in that Request.
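For example, a minimal sketch of this manual pattern (the example.com URLs and the XPath for the detail links are made up for illustration):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/list']

    def parse(self, response):
        # Extract the follow-up URLs from the listing page ourselves...
        for href in response.xpath('//a[@class="detail"]/@href').extract():
            # ...and send a Request for each one, naming the callback explicitly.
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)

    def parse_detail(self, response):
        # Parse the detail page in the callback named by the Request.
        yield {'title': response.xpath('//h1/text()').extract_first()}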
With CrawlSpider you can use Rule:
In a Rule you define regular expressions (via LinkExtractor) for the URLs you want to crawl deeper. The responses for the URLs in start_urls are first matched against each Rule's LinkExtractor; matching URLs are requested automatically, and the resulting responses are parsed by the specified callback. If the Rule sets follow=True, every returned response is matched against the LinkExtractor again, and the loop continues until all links have been fetched (Scrapy's request queue deduplicates, so a duplicate URL is only requested once).
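A minimal sketch of that mechanism (the example.com URLs and regular expressions are placeholders); the dongguan.py spider below applies the same pattern to a real site:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ExampleCrawlSpider(CrawlSpider):
    name = 'exampleCrawlSpider'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/list']

    rules = (
        # Pagination links: no callback, just keep following them.
        Rule(LinkExtractor(allow=r'/list\?page=\d+'), follow=True),
        # Detail pages: parse them with parse_item (a CrawlSpider must not override parse).
        Rule(LinkExtractor(allow=r'/detail/\d+\.html'), callback='parse_item'),
    )

    def parse_item(self, response):
        yield {'title': response.xpath('//h1/text()').extract_first()}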
./mySpider/mySpider/spiders/dongguan.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem


class DongguanmesSpider(CrawlSpider):
    name = 'dongguanMes'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType']

    rules = (
        # Pagination links: post-process them with deal_links and keep following.
        Rule(LinkExtractor(allow=r'/questionType\?page=\d+'), process_links='deal_links', follow=True),
        # Detail pages: hand them to parse_item.
        Rule(LinkExtractor(allow=r'/question/\d+/\d+\.shtml'), callback='parse_item'),
    )

    # Passed to the Rule via process_links; called with each batch of matched links.
    # Must return the (possibly modified) list of links.
    def deal_links(self, links):
        for link in links:
            print(link)
        return links

    def parse_item(self, response):
        item = DongguanItem()
        item['id'] = response.xpath("//div[@class='wzy1']/table/tr/td[2]/span[2]/text()").extract_first().split(":")[-1]
        item['title'] = response.xpath("//div[@class='wzy1']/table/tr/td[2]/span[1]/text()").extract_first().split(":")[-1]
        item['content'] = response.xpath("//td[@class='txt16_3']/text()").extract_first().replace("\xa0", " ")
        item['state'] = response.xpath("//div[@class='wzy3_1']/span/text()").extract_first()
        item['time'] = response.xpath("//div[@class='wzy3_2']/span/text()").extract_first().split(":")[-1].replace("\xa0", " ")
        yield item


class Dongguan1Spider(scrapy.Spider):
    name = 'dongguanMes1'
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType/'
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType/']
    offset = 0

    def parse(self, response):
        # Extract the detail-page links from the listing page by hand.
        links = response.xpath('//div[@class="greyframe"]/table[2]//tr//a[@class="news14"]/@href').extract()
        for link in links:
            yield scrapy.Request(link, callback=self.parse_item)
        # Build the next listing page ourselves and feed it back into parse.
        if self.offset <= 120:
            self.offset += 30
            yield scrapy.Request(self.url + "?page=" + str(self.offset), callback=self.parse)

    def parse_item(self, response):
        item = DongguanItem()
        item['id'] = response.xpath("//div[@class='wzy1']/table/tr/td[2]/span[2]/text()").extract_first().split(":")[-1]
        item['title'] = response.xpath("//div[@class='wzy1']/table/tr/td[2]/span[1]/text()").extract_first().split(":")[-1]
        # Longer posts put the full text in div.contentext; fall back to td.txt16_3 otherwise.
        if response.xpath('//div[@class="contentext"]'):
            item['content'] = response.xpath('//div[@class="contentext"]/text()').extract_first().strip()
        else:
            item['content'] = response.xpath("//td[@class='txt16_3']/text()").extract_first().strip()
        item['state'] = response.xpath("//div[@class='wzy3_1']/span/text()").extract_first()
        item['time'] = response.xpath("//div[@class='wzy3_2']/span/text()").extract_first().split(":")[-1].replace("\xa0", " ")
        yield item
./mySpider/mySpider/items.py
import scrapy


class DongguanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # post number
    id = scrapy.Field()
    # subject/title
    title = scrapy.Field()
    # body text
    content = scrapy.Field()
    # processing state
    state = scrapy.Field()
    # post time
    time = scrapy.Field()
./mySpider/mySpider/pipelines.py
import json
import codecs


class DongguanPipeline(object):
    def __init__(self):
        # Open the output file in binary mode and encode each line ourselves.
        self.filename = open("message.json", "wb")

    def process_item(self, item, spider):
        jsontext = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(jsontext.encode("utf8"))
        return item

    def close_spider(self, spider):
        self.filename.close()

    '''
    # Alternative: let codecs handle the encoding and write text directly.
    def __init__(self):
        self.filename = codecs.open("message1.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        jsontext = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(jsontext)
        return item

    def close_spider(self, spider):
        self.filename.close()
    '''
./mySpider/mySpider/settings.py
ITEM_PIPELINES = {
    'dongguan.pipelines.DongguanPipeline': 300,
}
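With the pipeline registered (300 is just its order among pipelines; lower numbers run first), running scrapy crawl dongguanMes for the CrawlSpider version, or scrapy crawl dongguanMes1 for the plain Spider version, writes each scraped item as one JSON line to message.json.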