Scrapy Framework: Incremental Crawling

Incremental crawling:

  • Used to detect whether the data on a target site has been updated, so that on each run only new records are crawled (see the sketch below).
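
The core mechanism in both examples below is the same: keep a Redis set of records already seen, and rely on the fact that SADD returns 1 when the member is new and 0 when it already exists. A minimal sketch of that check, not taken from the original spiders (the key name seen_urls and the sample URL are placeholders):

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

def is_new(record: str) -> bool:
    # sadd returns 1 if the member was added (not seen before), 0 if it was already in the set
    return conn.sadd('seen_urls', record) == 1

print(is_new('https://example.com/page/1'))  # True on the first run
print(is_new('https://example.com/page/1'))  # False afterwards

The first spider below applies this check to detail-page URLs; the second applies it to a hash of the scraped text.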

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from redis import Redis


class DianyingSpider(CrawlSpider):
    """
    Incremental spider for www.4567tv.tv: follows the paginated movie listing
    and uses a Redis set of detail-page URLs to decide which records are new.
    """
    name = 'dianying'
    # allowed_domains = ['https://www.4567tv.tv/index.php/vod/show/id/1/page/388.html']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/8/page/1.html']
    link = LinkExtractor(allow=r'/index.php/vod/show/id/8/page/\d+\.html')
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )
    # Redis connection; the set 'dianying' stores detail URLs that have already been seen
    conn = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        li_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]')

        for li in li_list:
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()

            # sadd returns 1 when the URL is new (not yet in the set), 0 when it was seen before
            if_num = self.conn.sadd('dianying', detail_url)
            if if_num:
                print('New data found, crawling the detail page...')
                yield scrapy.Request(url=detail_url, callback=self.detail_callback)
            else:
                print('No new data to crawl...')

    def detail_callback(self, response):
        # Parse the detail page: title and lead actors
        title = response.xpath('//h1/text()').extract_first()
        zhuyan = response.xpath('//div[@class="stui-content__detail"]/p[2]//text()').extract()
        print(title, zhuyan)
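
During development it can be handy to reset the dedup set so the next run treats every record as new again. This snippet is not part of the original post, just a small helper against the same Redis key the spider above writes to:

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
# Drop the dedup set; the next crawl will treat every detail URL as new
conn.delete('dianying')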

For text content, the page URL may stay the same while the content changes, so instead of deduplicating on URLs we deduplicate on a fingerprint (hash) of the scraped data itself:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from qiubaiPro.items import QiubaiproItem
import hashlib


class QiubaiSpider(CrawlSpider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.******.com/text/']
    # Redis connection; the set 'if_data' stores SHA-256 fingerprints of records already crawled
    conn = Redis(host='127.0.0.1', port=6379)
    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')

        for div in div_list:
            item = QiubaiproItem()
            item['author'] = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            item['content'] = div.xpath('.//div[@class="content"]/span//text()').extract()
            item['content'] = ''.join(item['content'])

            # Build a data fingerprint from author + content and try to add it to the Redis set
            data = item['author'] + item['content']
            data_hash = hashlib.sha256(data.encode()).hexdigest()
            ex = self.conn.sadd('if_data', data_hash)
            if ex == 1:
                print('New data found, yielding item...')
                yield item
            else:
                print('No new data to crawl...')
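
The spider imports QiubaiproItem from qiubaiPro.items; that file is not shown in the post, but given the fields used it presumably looks roughly like this (a sketch of the assumed item definition, not the original file):

import scrapy


class QiubaiproItem(scrapy.Item):
    # Fields referenced by QiubaiSpider.parse_item
    author = scrapy.Field()
    content = scrapy.Field()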
