增量式爬虫
- 当我们浏览网页时会发现,某些网站会定时在原有网页数据的基础上更新一批数据。例如,某电影网站会实时更新一批最近热门的电影,小说网站会根据作者创作的进度实时更新最新的章节数据,等等。
- 增量式爬虫就是通过爬虫程序监测某网站数据更新的情况,以便可以爬取到该网站更新出的新数据
- 如何进行增量式的爬取工作:
- 1)在发送请求之前判断这个URL是不是之前爬取过
- 2)在解析内容后判断这部分内容是不是之前爬取过
- 3)写入存储介质时判断内容是不是已经在介质中存在
- 增量式爬取核心:去重
- 爬取电影数据
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis

from increment1_pro.items import Increment1ProItem


class MovieSpider(CrawlSpider):
    """Incremental movie crawler that de-duplicates on detail-page URL.

    Each detail URL found on a listing page is added to the Redis set
    ``movies_url``; a detail request is scheduled only when the URL was not
    already a member of that set.
    """

    name = 'movie'
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/7.html']

    rules = (
        Rule(
            LinkExtractor(allow=r'/index.php/vod/show/id/7/page/\d+\.html'),
            callback='parse_item',
            follow=True,
        ),
    )

    # Redis client shared by all callbacks of this spider. It is created on
    # first use instead of once per listing page, so a crawl over many pages
    # does not keep constructing new clients.
    _redis_conn = None

    def _get_conn(self):
        """Return the spider's Redis client, creating it on first call."""
        if self._redis_conn is None:
            self._redis_conn = Redis(host="127.0.0.1", port=6379)
        return self._redis_conn

    def parse_item(self, response):
        """Schedule detail requests for movie URLs not seen before.

        Args:
            response: A listing page matched by ``rules``.

        Yields:
            scrapy.Request: One request per detail URL that was newly added
            to the ``movies_url`` set.
        """
        conn = self._get_conn()
        detail_url_list = response.xpath(
            '//li[@class="col-md-6 col-sm-4 col-xs-3"]/div/a/@href'
        ).extract()
        for url in detail_url_list:
            url = 'https://www.4567tv.tv' + url
            # The original code treats a return value of 1 from SADD as
            # "URL was newly added", i.e. not crawled before.
            ex = conn.sadd("movies_url", url)
            if ex == 1:
                yield scrapy.Request(url=url, callback=self.parse_detail)
            else:
                print("网站数据暂无更新")

    def parse_detail(self, response):
        """Extract the movie name and the first matched actor link text.

        Args:
            response: A movie detail page.

        Yields:
            Increment1ProItem: Item with ``name`` and ``actor`` set; either
            may be ``None`` when the XPath matches nothing.
        """
        item = Increment1ProItem()
        item['name'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/h1/text()'
        ).extract_first()
        item['actor'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()'
        ).extract_first()
        yield item
import json

from redis import Redis


class Increment1ProPipeline(object):
    """Item pipeline that appends each movie item to the ``movie_data`` list."""

    # Redis client; assigned in open_spider.
    conn = None

    def open_spider(self, spider):
        """Create the Redis client when the spider starts."""
        self.conn = Redis(host="127.0.0.1", port=6379)

    def process_item(self, item, spider):
        """Push the item's fields to Redis as a JSON string.

        The original code built ``dic`` but pushed the Scrapy item object
        itself. redis-py 3.0+ accepts only bytes, str, int and float values,
        so the record is serialised to JSON before being pushed.

        Args:
            item: Item carrying ``name`` and ``actor``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged item, so later pipelines can keep processing it.
        """
        dic = {
            "name": item["name"],
            "actor": item["actor"],
        }
        print("正在爬取新数据入库")
        # ensure_ascii=False keeps non-ASCII text readable in Redis.
        self.conn.lpush("movie_data", json.dumps(dic, ensure_ascii=False))
        return item
import scrapy


class Increment1ProItem(scrapy.Item):
    """Container for one scraped movie record."""

    # Movie title.
    name = scrapy.Field()
    # Actor text associated with the movie.
    actor = scrapy.Field()
- 爬取糗事百科段子:对"作者 + 内容"自制数据指纹(哈希值)进行去重
# -*- coding: utf-8 -*-
import scrapy
import hashlib
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis

from increment2_pro.items import Increment2ProItem


class QiubaiSpider(CrawlSpider):
    """Incremental crawler that de-duplicates posts by a content fingerprint.

    For every post, the SHA3-256 hex digest of ``author + content`` is added
    to the Redis set ``qiubai_hash``; the item is yielded only when that
    digest was not already in the set.
    """

    name = 'qiubai'
    start_urls = ['https://www.qiushibaike.com/text/']

    rules = (
        Rule(
            LinkExtractor(allow=r'/text/page/\d+/'),
            callback='parse_item',
            follow=True,
        ),
    )

    # Redis client shared by all callbacks of this spider; created on first
    # use rather than once per response.
    _redis_conn = None

    def _get_conn(self):
        """Return the spider's Redis client, creating it on first call."""
        if self._redis_conn is None:
            self._redis_conn = Redis(host="127.0.0.1", port=6379)
        return self._redis_conn

    def parse_item(self, response):
        """Yield posts on the page whose fingerprint has not been stored yet.

        Args:
            response: A listing page matched by ``rules``.

        Yields:
            Increment2ProItem: Item with ``author`` and ``content`` for each
            post whose fingerprint was newly added to ``qiubai_hash``.
        """
        # Both branches of the union need the leading ``//``. Without it the
        # second branch is evaluated relative to the document root and never
        # matches, so "typs_old" posts were silently skipped.
        div_list = response.xpath(
            '//div[@class="article block untagged mb15 typs_hot"] | '
            '//div[@class="article block untagged mb15 typs_old"]'
        )
        conn = self._get_conn()
        for div in div_list:
            item = Increment2ProItem()
            content = div.xpath('.//div[@class="content"]/span/text()').extract()
            item["content"] = "".join(content)
            # Fall back to an empty author so a post without a matching
            # author node does not raise TypeError in the concatenation below.
            item["author"] = div.xpath(
                './div/a[2]/h2/text() | ./div[1]/span[2]/h2/text()'
            ).extract_first(default="")

            # Home-made data fingerprint: hash of author + content.
            source = item["author"] + item["content"]
            hashValue = hashlib.sha3_256(source.encode()).hexdigest()

            # The original code treats a return value of 1 from SADD as
            # "fingerprint was newly added", i.e. an unseen post.
            ex = conn.sadd("qiubai_hash", hashValue)
            if ex == 1:
                yield item
            else:
                print("暂无数据可以爬取")
import json

from redis import Redis


class Increment2ProPipeline(object):
    """Item pipeline that appends each post to the ``qiubai_data`` list."""

    # Redis client; assigned in open_spider.
    conn = None

    def open_spider(self, spider):
        """Create the Redis client when the spider starts."""
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Push the item's fields to Redis as a JSON string.

        The original code passed the dict straight to ``lpush``. redis-py
        3.0+ accepts only bytes, str, int and float values, so the record is
        serialised to JSON before being pushed.

        Args:
            item: Item carrying ``author`` and ``content``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged item, so later pipelines can keep processing it.
        """
        dic = {
            "author": item["author"],
            "content": item["content"],
        }
        print("正在爬取数据...")
        # ensure_ascii=False keeps non-ASCII text readable in Redis.
        self.conn.lpush("qiubai_data", json.dumps(dic, ensure_ascii=False))
        return item
import scrapy


class Increment2ProItem(scrapy.Item):
    """Container for one scraped post."""

    # Text body of the post.
    content = scrapy.Field()
    # Author name shown on the post.
    author = scrapy.Field()