scrapy中Spider类与CrawlSpider类的使用比较

创建爬虫时:

  scrapy genspider exampleSpider example.com  创建以scrapy.Spider为模板的爬虫类

  scrapy genspider -t crawl exampleSpider example.com  创建以scrapy.spiders.CrawlSpider为模板的爬虫类

使用时:

  Spider中start_urls返回的Response会使用编写的parse方法去解析,如果继续爬取深度网址需要使用xpath或css selector去解析response中想要继续爬取的网址,再用scrapy.Request发送请求,得到的Response使用Request中指定的callback函数去解析

  CrawlSpider中可使用Rule:

    Rule中编写需要深度爬取的网址的正则匹配规则,start_urls中返回的Response会先与Rule中的LinkExtractor规则匹配,满足条件的URL会自动发送Request,得到的Response会调用指定的callback函数去解析,如果指定属性follow=True,返回的每个Response都会与Rule中的LinkExtractor规则匹配,不断循环,直到获取完所有链接(scrapy中Request队列带有去重功能,所以重复网址也只会发送一次Request)

./mySpider/mySpider/spiders/dongguan.py

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 from scrapy.linkextractors import LinkExtractor
 4 from scrapy.spiders import CrawlSpider, Rule
 5 from dongguan.items import DongguanItem
 6 
 7 
 8 class DongguanmesSpider(CrawlSpider):
 9     name = 'dongguanMes'
10     allowed_domains = ['wz.sun0769.com']
11     start_urls = ['http://wz.sun0769.com/index.php/question/questionType']
12 
13     rules = (
14         Rule(LinkExtractor(allow=r'/questionType\?page=\d+'),process_links="deal_links",follow=True),
15         Rule(LinkExtractor(allow=r'/question/\d+/\d+.shtml'), callback='parse_item'),
16     )
17     #传入Rule中用于处理每个匹配到的链接
18     #返回值为修改后的links
19     def deal_links(self,links):
20         for link in links:
21             print(link)
22         return links
23 
24     def parse_item(self, response):
25         item = DongguanItem()
26         item['id'] = response.xpath("//div[@class='wzy1']/table/tr/td[2]/span[2]/text()").extract_first().split(":")[-1]
27         item['title'] = response.xpath("//div[@class='wzy1']/table/tr/td[2]/span[1]/text()").extract_first().split("")[-1]
28         item['content'] = response.xpath("//td[@class='txt16_3']/text()").extract_first().replace("\xa0"," ")
29         item['state'] = response.xpath("//div[@class='wzy3_1']/span/text()").extract_first()
30         item['time'] = response.xpath("//div[@class='wzy3_2']/span/text()").extract_first().split("")[-1].replace("\xa0"," ")
31         yield item
class Dongguan1Spider(scrapy.Spider):
    """Crawl the same question board with a plain ``scrapy.Spider``.

    ``parse`` handles the list pages: it requests every detail link found
    and then the next list page. ``parse_item`` handles the detail pages.
    """

    name = "dongguanMes1"
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType/'
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType/']
    # Value sent as the "page" query parameter; advanced by 30 per list page.
    offset = 0

    # The separator literal used for the title/time fields had been lost,
    # leaving split(""), which raises "ValueError: empty separator".
    # NOTE(review): assumes the page labels end with a full-width colon
    # (U+FF1A) -- confirm against the live markup.
    _LABEL_SEPARATOR = u"\uff1a"

    def parse(self, response):
        """Yield requests for each detail link and for the next list page."""
        links = response.xpath(
            '//div[@class="greyframe"]/table[2]//tr//a[@class="news14"]/@href'
        ).extract()
        for link in links:
            # urljoin leaves absolute URLs untouched and resolves relative
            # hrefs, which scrapy.Request would otherwise reject.
            yield scrapy.Request(response.urljoin(link), callback=self.parse_item)
        if self.offset <= 120:
            self.offset += 30
            yield scrapy.Request(
                self.url + "?page=" + str(self.offset),
                callback=self.parse,
            )

    def parse_item(self, response):
        """Yield one DongguanItem built from a question detail page.

        Text fields whose node is missing are stored as an empty string
        instead of raising AttributeError on ``None``; ``state`` keeps the
        raw ``extract_first()`` result (``None`` when absent).
        """
        def first_text(query):
            # Missing nodes yield "" so the chained str methods stay safe.
            return response.xpath(query).extract_first(default="")

        separator = self._LABEL_SEPARATOR
        item = DongguanItem()
        item['id'] = first_text(
            "//div[@class='wzy1']/table/tr/td[2]/span[2]/text()"
        ).split(":")[-1]
        item['title'] = first_text(
            "//div[@class='wzy1']/table/tr/td[2]/span[1]/text()"
        ).split(separator)[-1]
        if response.xpath('//div[@class="contentext"]'):
            # The predicate bracket previously enclosed "/text()", which is
            # not a valid XPath expression.
            item['content'] = first_text(
                '//div[@class="contentext"]/text()'
            ).strip()
        else:
            item['content'] = first_text(
                "//td[@class='txt16_3']/text()"
            ).strip()
        item['state'] = response.xpath(
            "//div[@class='wzy3_1']/span/text()"
        ).extract_first()
        item['time'] = first_text(
            "//div[@class='wzy3_2']/span/text()"
        ).split(separator)[-1].replace("\xa0", " ")
        yield item

./mySpider/mySpider/items.py

 1 import scrapy
 2 
 3 
 4 class DongguanItem(scrapy.Item):
 5     # define the fields for your item here like:
 6     # name = scrapy.Field()
 7     #编号
 8     id = scrapy.Field()
 9     #主题
10     title = scrapy.Field()
11     #详情
12     content = scrapy.Field()
13     #状态
14     state = scrapy.Field()
15     #时间
16     time = scrapy.Field()

./mySpider/mySpider/pipelines.py

 1 import json,codecs
 2 
 3 class DongguanPipeline(object):
 4     def __init__(self):
 5         self.filename = open("message.json","wb")
 6     def process_item(self, item, spider):
 7         jsontext = json.dumps(dict(item),ensure_ascii=False) +"\n"
 8         self.filename.write(jsontext.encode("utf8"))
 9         return item
10     def close_spider(self,spider):
11         self.filename.close()
12     '''
13     def __init__(self):
14         self.filename = codecs.open("message1.json","w",encoding="utf-8")
15     def process_item(self,item,spider):
16         jsontext = json.dumps(dict(item), ensure_ascii=False) + "\n"
17         self.filename.write(jsontext)
18         return item
19 
20     def close_spider(self, spider):
21         self.filename.close()
22     '''

./mySpider/mySpider/settings.py

# Register DongguanPipeline as an item pipeline with order value 300.
ITEM_PIPELINES = {'dongguan.pipelines.DongguanPipeline': 300}

 

posted on 2019-08-01 15:18  南华  阅读(532)  评论(0)  编辑  收藏  举报