增量式爬虫

 - 当我们浏览网页时会发现,某些网站会定时在原有网页数据的基础上更新一批数据。例如,某电影网站会实时更新一批最近热门的电影,小说网站会根据作者的创作进度实时更新最新的章节数据,等等。

 - 增量式爬虫就是通过爬虫程序监测某网站数据更新的情况,以便可以爬取到该网站更新出的新数据

 - 如何进行增量式的爬取工作:

   - 1)在发送请求之前判断这个URL是不是之前爬取过

   - 2)在解析内容后判断这部分内容是不是之前爬取过

   - 3)写入存储介质时判断内容是不是已经在介质中存在

 - 增量式爬取核心:去重

- 爬取电影数据

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from increment1_pro.items import Increment1ProItem
class MovieSpider(CrawlSpider):
    """Incrementally crawl movie detail pages, skipping URLs already seen.

    Listing pages are followed through ``rules``. Every detail-page URL found
    on a listing page is added to the Redis set ``movies_url``; only URLs that
    were not already members of that set are requested and parsed.
    """

    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/7.html']

    rules = (
        Rule(LinkExtractor(allow=r'/index.php/vod/show/id/7/page/\d+\.html'), callback='parse_item', follow=True),
    )

    # Redis client shared by all callbacks; created on first use so that a new
    # client (and connection pool) is not built for every listing page.
    _conn = None

    def _redis(self):
        """Return the spider's Redis client, creating it on first call."""
        if self._conn is None:
            self._conn = Redis(host="127.0.0.1", port=6379)
        return self._conn

    def parse_item(self, response):
        """Yield a request for every detail URL on a listing page that is new.

        :param response: response of a listing page matched by ``rules``.
        :return: generator of ``scrapy.Request`` objects for unseen URLs.
        """
        conn = self._redis()
        detail_url_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]/div/a/@href').extract()
        for url in detail_url_list:
            url = 'https://www.4567tv.tv' + url
            # SADD reports how many members were newly added: 1 means the URL
            # was not in the set before, 0 means it was already recorded.
            # NOTE(review): the URL is recorded before it is fetched, so a
            # failed request is still treated as "seen" on later runs.
            ex = conn.sadd("movies_url", url)
            if ex == 1:
                yield scrapy.Request(url=url, callback=self.parse_detail)
            else:
                print("网站数据暂无更新")

    def parse_detail(self, response):
        """Extract the movie name and first listed actor from a detail page.

        :param response: response of a movie detail page.
        :return: generator yielding one ``Increment1ProItem``; a field is
            ``None`` when its XPath matches nothing.
        """
        item = Increment1ProItem()
        item['name'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        item['actor'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()').extract_first()

        yield item
movie
from redis import Redis
class Increment1ProPipeline(object):
    """Store every scraped movie item in the Redis list ``movie_data``."""

    # Redis client; assigned in open_spider.
    conn = None

    def open_spider(self, spider):
        """Create the Redis client when the spider starts."""
        self.conn = Redis(host="127.0.0.1",port=6379)

    def process_item(self, item, spider):
        """Push the item's fields to Redis as a JSON string.

        :param item: scraped item exposing ``name`` and ``actor`` keys.
        :param spider: the spider that produced the item (unused).
        :return: the unchanged ``item`` so later pipelines can process it.
        """
        # Local import keeps this snippet self-contained.
        import json

        dic = {
            "name":item["name"],
            "actor":item["actor"]
        }
        print("正在爬取新数据入库")
        # Redis values must be bytes/str/numbers; an Item or dict object is
        # rejected by the client, so serialise the record explicitly.
        # ensure_ascii=False keeps the Chinese text readable in Redis.
        self.conn.lpush("movie_data", json.dumps(dic, ensure_ascii=False))
        return item
pipelines.py
import scrapy

class Increment1ProItem(scrapy.Item):
    """Container for one movie record produced by the ``movie`` spider."""

    # Text extracted for the actor link on the detail page.
    actor = scrapy.Field()
    # Movie title taken from the detail page heading.
    name = scrapy.Field()
items

  - 爬取糗事百科:使用自制数据指纹进行去重

# -*- coding: utf-8 -*-
import scrapy
import hashlib
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from increment2_pro.items import Increment2ProItem


class QiubaiSpider(CrawlSpider):
    """Incrementally crawl text posts, de-duplicating by a content fingerprint.

    Each post's author and content are hashed; the digest is added to the
    Redis set ``qiubai_hash`` and the post is yielded only when the digest
    was not already in that set.
    """

    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
    )

    # Redis client shared by all callbacks; created on first use so that a new
    # client (and connection pool) is not built for every page.
    _conn = None

    def _redis(self):
        """Return the spider's Redis client, creating it on first call."""
        if self._conn is None:
            self._conn = Redis(host="127.0.0.1", port=6379)
        return self._conn

    def parse_item(self, response):
        """Yield every post on the page whose fingerprint is new.

        :param response: response of a listing page matched by ``rules``.
        :return: generator of ``Increment2ProItem`` objects for unseen posts.
        """
        # Both branches of the union need the leading ``//``; without it the
        # second branch is evaluated against the document root and never
        # matches the "typs_old" posts.
        div_list = response.xpath(
            '//div[@class="article block untagged mb15 typs_hot"] | //div[@class="article block untagged mb15 typs_old"]')
        conn = self._redis()
        for div in div_list:
            item = Increment2ProItem()
            content = div.xpath('.//div[@class="content"]/span/text()').extract()
            item["content"] = "".join(content)
            item["author"] = div.xpath('./div/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
            # extract_first() gives None when nothing matches; fall back to an
            # empty string so building the fingerprint cannot raise TypeError.
            source = (item["author"] or "") + item["content"]

            # Home-made data fingerprint: digest of author + content.
            fingerprint = hashlib.sha3_256(source.encode()).hexdigest()

            # SADD reports how many members were newly added: 1 means this
            # fingerprint has not been stored before.
            ex = conn.sadd("qiubai_hash", fingerprint)
            if ex == 1:
                yield item
            else:
                print("暂无数据可以爬取")
qiubai
from redis import Redis
class Increment2ProPipeline(object):
    """Store every scraped post in the Redis list ``qiubai_data``."""

    # Redis client; assigned in open_spider.
    conn = None

    def open_spider(self, spider):
        """Create the Redis client when the spider starts."""
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Push the item's fields to Redis as a JSON string.

        :param item: scraped item exposing ``author`` and ``content`` keys.
        :param spider: the spider that produced the item (unused).
        :return: the unchanged ``item`` so later pipelines can process it.
        """
        # Local import keeps this snippet self-contained.
        import json

        dic = {
            "author":item["author"],
            "content":item["content"]
        }
        print("正在爬取数据...")
        # Redis values must be bytes/str/numbers; a dict is rejected by the
        # client, so serialise the record explicitly. ensure_ascii=False keeps
        # the Chinese text readable in Redis.
        self.conn.lpush("qiubai_data", json.dumps(dic, ensure_ascii=False))
        return item
pipelines.py
import scrapy

class Increment2ProItem(scrapy.Item):
    """Container for one text post produced by the ``qiubai`` spider."""

    # Author name text; may be None when the spider finds no match.
    author = scrapy.Field()
    # Post body, joined from the text nodes of the content block.
    content = scrapy.Field()
items

 

posted @ 2019-03-05 20:27  阵浊秀  阅读(184)  评论(0)  编辑  收藏  举报