Python Frameworks: Scrapy

Project creation

GET and POST methods (paginated data crawling)

Download middleware and Selenium

Regex, XPath, BeautifulSoup

Persistent storage

CrawlSpider

Distributed crawlers

Incremental crawlers

Captcha handling

I. Project Creation

# Project creation steps:
scrapy startproject mySpider   # mySpider is the project name

scrapy genspider <spider_name> www.XXX.com

# In the settings.py configuration file:
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
# Parse the crawled data in the spider's parse method
# Run the spider:
scrapy crawl <spider_name>          # crawl
scrapy crawl <spider_name> --nolog  # suppress log output
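As an alternative to typing these commands each time, here is a minimal sketch of a launcher script that runs the spider through Scrapy's cmdline helper; the file name start.py and its placement in the project root are assumptions, not part of the original post.

# start.py -- hypothetical helper in the project root, so the spider can be started from an IDE
from scrapy import cmdline

# Equivalent to running `scrapy crawl first --nolog` in a terminal
cmdline.execute('scrapy crawl first --nolog'.split())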

 

# -*- coding: utf-8 -*-
import scrapy

class FirstSpider(scrapy.Spider):
    # Spider name: the unique identifier of this spider file
    name = 'first'
    # Allowed domains
    # allowed_domains = ['www.xxx.com']
    # List of start urls: every element is requested automatically
    start_urls = ['https://www.qiushibaike.com/text/']

    # Parse the response data
    def parse(self, response):
        pass
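For reference, a minimal sketch of what the parse body might look like for this start URL; the xpath expressions are borrowed from the Qiushibaike example in section VIII below and are an illustration, not the original author's code.

    # Sketch only: xpaths reused from the incremental-crawler example later in this post
    def parse(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
            print(author, content)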
        

 

II. GET and POST Methods

1. Paginated data crawling (sending GET requests)

def title_parse(self, response):
    detailnews_div_list = response.xpath('//div[@class="ndi_main"]/div')
    for detailnews_div in detailnews_div_list[0:2]:
        title = detailnews_div.xpath('./div/div[1]/h3/a//text()').extract_first()
        detailnews_url = detailnews_div.xpath('./div/div[1]/h3/a//@href').extract_first()
        item = WangyinewsallItem()
        item['title'] = title
        yield scrapy.Request(url=detailnews_url, callback=self.detail_parse, meta={'item': item})

def detail_parse(self, response):
    # Retrieve the item passed along from the previous callback
    item = response.meta['item']
    detail_list = response.xpath('//*[@id="endText"]/p//text()').extract()
    desc = '\n'.join(detail_list)
    print(desc)
    item['desc'] = desc
    yield item
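The subsection heading mentions paginated crawling, while the code above follows detail links; here is a minimal hedged sketch of requesting successive list pages with GET. The URL template, page counter, and spider name are assumptions for illustration, not from the original project.

import scrapy

class PageDemoSpider(scrapy.Spider):
    # Hypothetical spider: crawls list pages 1..max_page by formatting a page-number url
    name = 'page_demo'
    url_template = 'https://www.example.com/list/page/%d/'   # assumed url pattern
    start_urls = [url_template % 1]
    page_num = 2
    max_page = 5

    def parse(self, response):
        # ... parse the current list page here ...
        if self.page_num <= self.max_page:
            next_url = self.url_template % self.page_num
            self.page_num += 1
            # Send a GET request for the next page and reuse this callback
            yield scrapy.Request(url=next_url, callback=self.parse)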

2. Sending POST requests with Scrapy

class PostdemoSpider(scrapy.Spider):
    name = 'postDemo'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        data = {
            'kw':'dog'
        }
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url,callback=self.parse,formdata=data)
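The sug endpoint returns JSON, so the parse callback typically decodes the response body. A minimal hedged sketch of such a callback for the PostdemoSpider above (the original post does not show it):

    # Sketch: parse callback for PostdemoSpider (requires `import json` at the top of the file);
    # decodes the JSON body returned by the POST request
    def parse(self, response):
        result = json.loads(response.text)
        print(result)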

III. Download Middleware and Selenium

Enable the downloader middleware in settings.py:

DOWNLOADER_MIDDLEWARES = {
   'wangyinewsall.middlewares.WangyinewsallDownloaderMiddleware': 543,
}

Using a user-agent pool and a proxy pool:

from scrapy import signals
import random
class MiddleproDownloaderMiddleware(object):
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    ]
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Intercepts every non-exception request
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        request.meta['proxy'] = 'https://218.60.8.83:3129'
        return None
    
    # Intercepts every response
    def process_response(self, request, response, spider):

        return response
    
    # Intercepts requests that raised an exception
    def process_exception(self, request, exception, spider):
        if request.url.split(':')[0] == 'https':
            request.meta['proxy'] = 'https://'+random.choice(self.PROXY_https)
        else:
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

 

Using Selenium in Scrapy via the process_response method:

from scrapy import signals
from scrapy.http import HtmlResponse
def process_response(self, request, response, spider):
    bro=spider.bro
    models_url=spider.models_url
    if request.url in models_url:
        bro.get(request.url)
        page_text =bro.page_source
        return HtmlResponse(url=request.url,body=page_text,encoding='utf-8',request=request)

    return response

# Creating the Selenium browser object (in the spider file)
import scrapy
from selenium import webdriver
from selenium.webdriver import ChromeOptions

class WangyinewSpider(scrapy.Spider):
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Create a browser object
    bro=webdriver.Chrome(executable_path='D:\Jpyter_notebook_work\爬虫day06\wangyinewsall\wangyinewsall\spiders\chromedriver.exe',options=option)
    name = "wangyinew"
    # allowed_domains = ["www.asd.com"]
    start_urls = ['https://news.163.com/']
    models_url=[]

    def parse(self, response):
        model_li_list=response.xpath('//div[@class="ns_area list"]/ul/li')
        num_list=[3,4,6,7]
        for num in num_list[0:1]:
            model_url=model_li_list[num].xpath('./a/@href').extract_first()
            self.models_url.append(model_url)
            yield scrapy.Request(url=model_url,callback=self.title_parse)
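The spider above opens a Chrome instance at class level but never shuts it down; a minimal sketch of the usual cleanup hook, added here as an assumption (the original post does not show it):

    # Sketch: Scrapy calls closed() once when the spider finishes; quit the shared browser
    # so the chromedriver process does not linger
    def closed(self, reason):
        self.bro.quit()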

Detailed Selenium usage:

See also: web crawling with image lazy loading, Selenium, and PhantomJS

IV. Regex, XPath, and BeautifulSoup

Using regular expressions in crawlers:

import re

def detail_parse(self, response):
    news_text = response.text
    content_list = re.findall('"digest":"(.*?)"', news_text)

# Standalone snippets (they assume requests, headers, ex, div_list and page_text are defined elsewhere):
detail_page_text = requests.get(url=detail_url,headers=headers).text
video_url = re.findall(ex,detail_page_text,re.S)[0]
for k,div in enumerate(div_list):
    title = re.findall('<a class="title_wl".*?>(.*?)</a>',page_text)[k]

Using XPath in crawlers:

1. Install: pip install lxml
2. Import: from lxml import etree

3. Convert the HTML or XML document into an etree object, then call its methods to locate the target nodes:

   3.1 Local file:   tree = etree.parse(file_name)
                     tree.xpath("xpath expression")

   3.2 Network data: tree = etree.HTML(page_content_string)
                     tree.xpath("xpath expression")

4. Note: xpath matches are always returned as a list.
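A minimal hedged sketch of the lxml workflow described above; the inline HTML string is made up purely for illustration:

from lxml import etree

# A tiny made-up HTML snippet to demonstrate the etree.HTML workflow
html = '<div class="song"><a href="/a1">first</a><a href="/a2">second</a></div>'
tree = etree.HTML(html)
# xpath always returns a list, even when there is a single match
hrefs = tree.xpath('//div[@class="song"]/a/@href')
print(hrefs)   # ['/a1', '/a2']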

 

def title_parse(self, response):
    detailnews_div_list = response.xpath('//div[@class="ndi_main"]/div')
    for detailnews_div in detailnews_div_list[0:2]:
        title = detailnews_div.xpath('./div/div[1]/h3/a//text()').extract_first()
        detailnews_url = detailnews_div.xpath('./div/div[1]/h3/a//@href').extract_first()
        item = WangyinewsallItem()
        item['title'] = title
        yield scrapy.Request(url=detailnews_url, callback=self.detail_parse, meta={'item': item})
# More standalone xpath snippets:
new_detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
VIEWSTATE_value = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]

from lxml import etree
page_text = requests.get(url=url,headers=headers,proxies={'http':'116.228.233.90:8082'}).text
tree = etree.HTML(page_text)
code_img_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]

Using BeautifulSoup in crawlers:

# Environment setup
pip install bs4
pip install lxml

# Import the BeautifulSoup package
from bs4 import BeautifulSoup
# Instantiate a BeautifulSoup object
# From a local file:
    soup = BeautifulSoup(fp, 'lxml')
# From network data:
    soup = BeautifulSoup(page_text, 'lxml')   # page_text is the response text
Basics:
    (1) Find by tag name
        - soup.a   only finds the first matching tag
    (2) Get attributes
        - soup.a.attrs          all attributes and values of the a tag, returned as a dict
        - soup.a.attrs['href']  the href attribute
        - soup.a['href']        shorthand for the same thing
    (3) Get content
        - soup.a.string
        - soup.a.text
        - soup.a.get_text()
        [Note] If the tag contains nested tags, string returns None, while the other two still return the text content.
    (4) find: the first matching tag
        - soup.find('a')   finds the first match
        - soup.find('a', title="xxx")
        - soup.find('a', class_="xxx")
        - soup.find('a', id="xxx")
    (5) find_all: all matching tags
        - soup.find_all('a')
        - soup.find_all(['a','b'])     all a and b tags
        - soup.find_all('a', limit=2)  only the first two
    (6) Select content with CSS selectors
        select: soup.select('#feng')
        - Common selectors: tag selector (a), class selector (.), id selector (#), hierarchy selectors
            - Hierarchy selectors:
                div .dudu #lala .meme .xixi   any number of levels down
                div > p > a > .lala           only one level down
        [Note] select always returns a list; use an index to pull out a specific element.
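A minimal hedged sketch exercising the calls listed above on an inline HTML snippet; the HTML string is made up purely for illustration:

from bs4 import BeautifulSoup

html = '<div class="song"><a id="feng" href="/x">hello <b>world</b></a><a href="/y">bye</a></div>'
soup = BeautifulSoup(html, 'lxml')

print(soup.a['href'])               # '/x'   first matching tag's attribute
print(soup.a.string)                # None   the <a> tag contains a nested <b> tag
print(soup.a.text)                  # 'hello world'
print(soup.find('a', id='feng'))    # first <a> with id="feng"
print(soup.find_all('a', limit=2))  # at most two <a> tags
print(soup.select('.song > a'))     # CSS selector; always returns a list
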
# Scrape a novel from the doupocangqiong site
import requests
import re
import json
import os
import time
from bs4 import BeautifulSoup
url="https://doupocangqiong1.com/486/37zw/list_1.html"
headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
page_text=requests.get(url=url,headers=headers).text
soup=BeautifulSoup(page_text,'lxml')
span_list=soup.find_all('span',class_='n')
with open('神道丹尊.txt','wt',encoding='utf-8')as f:    
    for span in span_list[0:1]:
        title=' '.join(span.a.string.split(' ')[1:])
        data_list=span.a['href'].split('/')
        cid=re.search('\d+',data_list[2]).group()
        data={
            "siteid":'69',
            "bid":data_list[1],
            "cid":cid
        }
        noval_url="https://doupocangqiong1.com/novelsearch/chapter/transcode.html"
        json_content=requests.post(url=noval_url,headers=headers,data=data).content
        init_content=json.loads(json_content).get('info','')
        content=init_content.replace('<br>\n<br>','')
        f.write(title+'\n\n')
        f.write(content)
# More standalone BeautifulSoup snippets:
soup.p.text
soup.p.string
soup.p.get_text()
soup.find_all('div')
soup.find('div',class_="song").get_text()
soup.select('.song')
soup.select('.tang > ul > li')
Tag location:
    soup.tag_name: locates a tag; if there are multiple matches, the first one is returned.
    soup.find(tagname, attrName="value"): tag location based on attributes, e.g. 'div', class_='song'; only the class keyword needs the trailing underscore (because class is a Python keyword). Returns a single tag.
    soup.find_all(tagname, attrName="value"): returns a list.
Data extraction:
    Text
        soup.tagname.text        all text under the tag
        soup.tagname.get_text()  all text under the tag
        soup.tagname.string      only the tag's direct text content
    Attributes
        soup.tagname['attrName']
select: locate tags with CSS selectors
    Tag, class and id selectors: soup.select('.song') returns a list
    Hierarchy selectors:
        Single level (direct child):
            soup.select('.song > ul > li')
        Multiple levels (any descendant):
            soup.select('.song li')

V. Persistent Storage

In the settings file (300 is the priority; the lower the number, the higher the priority):

ITEM_PIPELINES = {
    'xioahuaPro.pipelines.XioahuaproPipeline': 300,
    # 'xioahuaPro.pipelines.MysqlPipeline': 301,
    # 'xioahuaPro.pipelines.RedisPipeline': 302,
}
# In items.py:
class WangyinewsallItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    desc = scrapy.Field()
# Multiple Item classes
class DetailItem(scrapy.Item):
    # define the fields for your item here like:
    job_desc = scrapy.Field()

class FirstItem(scrapy.Item):
    # define the fields for your item here like:
    job_title = scrapy.Field()
# In the spider file:
from kuixunexam.items import KuixunexamItem
def detail_parse(self, response):
    news_text = response.text

    content_list = re.findall('"digest":"(.*?)"', news_text)
    for content in content_list[0:2]:
        item = KuixunexamItem()

        title, cont = re.findall('【(.*?)】(.*)', content)[0]
        print(title, cont)
        item['new_title'] = title
        item['new_content'] = cont
        yield item

# Example 2: the item's fields are filled in from different pages
from moviePro.items import MovieproItem

class MovieSpider(scrapy.Spider):
    name = 'movie'
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/9.html']
    # Receive the data passed along with the request
    def detail_parse(self,response):
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item

    def parse(self, response):
        li_list = response.xpath('//div[@class="stui-pannel_bd"]/ul/li')
        for li in li_list:
            name = li.xpath('.//h4[@class="title text-overflow"]/a/text()').extract_first()
            detail_url = 'https://www.4567tv.tv'+li.xpath('.//h4[@class="title text-overflow"]/a/@href').extract_first()
            item = MovieproItem()
            item['name'] = name
            # meta is a dict; every key/value pair in it is passed to the specified callback
            yield scrapy.Request(url=detail_url,callback=self.detail_parse,meta={'item':item})
# In pipelines.py: three storage backends (file, MySQL, Redis)
import pymysql
from redis import Redis
class XioahuaproPipeline(object):
    fp = None
    def open_spider(self,spider):

        self.fp = open('./xiaohua.txt','w',encoding='utf-8')
 
    def process_item(self, item, spider):
        name = item['name']
        img_url = item['img_url']
        self.fp.write(name+':'+img_url+'\n')
        # The return value passes the item on to the next pipeline class to be executed
        return item

    def close_spider(self,spider):
        self.fp.close()

class MysqlPipeline(object):
    conn = None
    cursor = None
    def open_spider(self, spider):
        # Fix for columns that cannot store Chinese: alter table tableName convert to charset utf8;
        self.conn = pymysql.Connect(host='127.0.0.1',port=3306,user='root',password='123',db='test')
        print(self.conn)
    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute('insert into xiahua values ("%s","%s")'%(item['name'],item['img_url']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
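
# Hedged sketch, not part of the original pipelines.py: one-time setup of the table that
# MysqlPipeline inserts into; the column names and sizes are assumptions inferred from
# the insert statement in process_item above (pymysql is already imported at the top of this file).
def create_xiahua_table():
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='test')
    cursor = conn.cursor()
    cursor.execute('create table if not exists xiahua (name varchar(100), img_url varchar(300)) charset=utf8')
    conn.commit()
    cursor.close()
    conn.close()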

class RedisPipeline(object):
    conn = None
    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1',port=6379)
        print(self.conn)
    def process_item(self, item, spider):
        dic = {
            'name':item['name'],
            'img_url':item['img_url']
        }
        print(dic)
        self.conn.lpush('xiaohua',dic)
        return item
    def close_spider(self, spider):
        pass
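Depending on the redis-py version, lpush may reject a raw dict value; a hedged sketch of serializing the dict first inside RedisPipeline.process_item (the json approach is an assumption, not part of the original code):

import json

# For redis-py versions that only accept str/bytes/number values, push a JSON string instead of the dict
self.conn.lpush('xiaohua', json.dumps(dic, ensure_ascii=False))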

VI. CrawlSpider

CrawlSpider is a subclass (derived class) of Spider, with more functionality than Spider.
CrawlSpider is used for full-site data crawling.

Mechanisms that CrawlSpider provides:
    Link extractor: extracts links
    Rule parser: parses the page data, i.e. the page source behind the extracted links

Creating a CrawlSpider spider file:
    scrapy genspider -t crawl <spider_name> <domain>

For page numbers that are not displayed, setting follow=True picks up all page links (including the hidden ones).
What follow does: it keeps applying the link extractor to the page source of every page reached through the links the extractor already found.

 

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviePro.items import MovieproItem
class MovieSpider(CrawlSpider):
    conn = Redis(host='127.0.0.1',port=6379)
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']

    rules = (
        Rule(LinkExtractor(allow=r'/frim/index1-\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Parse the detail-page urls of the movies listed on the current page
        li_list = response.xpath('//div[@class="stui-pannel_bd"]/ul/li')
        for li in li_list:
            # Parse the detail-page url
            detail_url = 'https://www.4567tv.tv'+li.xpath('./div/a/@href').extract_first()
            ex = self.conn.sadd('movie_detail_urls',detail_url)
            if ex == 1:
                print('New data to crawl......')
                yield scrapy.Request(url=detail_url,callback=self.parse_detail)
            else:
                print('No new data to crawl!')
    def parse_detail(self,response):
        name = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        m_type = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[1]/a[1]/text()').extract_first()
        item = MovieproItem()
        item['name'] = name
        item['m_type'] = m_type

        yield item

 

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bossPro.items import DetailItem,FirstItem
# Crawl the job title (from the list page) and the job description (from the detail page)
class BossSpider(CrawlSpider):
    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.zhipin.com/c101010100/?query=python%E5%BC%80%E5%8F%91&page=1&ka=page-prev']
    # Extract all the page-number links
    link = LinkExtractor(allow=r'page=\d+')
    link_detail = LinkExtractor(allow=r'/job_detail/.*?html')
    rules = (
        Rule(link, callback='parse_item', follow=True),
        Rule(link_detail, callback='parse_detail'),
    )
    # Parse the job titles from the pages behind the page-number links
    def parse_item(self, response):
        li_list = response.xpath('//div[@class="job-list"]/ul/li')
        for li in li_list:
            item = FirstItem()
            job_title = li.xpath('.//div[@class="job-title"]/text()').extract_first()
            item['job_title'] = job_title
            # print(job_title)

            yield item
    def parse_detail(self,response):
        job_desc = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()').extract()
        item = DetailItem()
        job_desc = ''.join(job_desc)
        item['job_desc'] = job_desc

        yield item
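With two item classes (FirstItem and DetailItem) flowing through the same pipelines, the usual way to tell them apart is by class name in process_item. A minimal hedged sketch (this pipeline class is an assumption, not shown in the original post):

class BossproPipeline(object):
    def process_item(self, item, spider):
        # Route items by their class name, since both item types share this pipeline
        if item.__class__.__name__ == 'DetailItem':
            job_desc = item['job_desc']
            # ... persist the job description ...
        else:
            job_title = item['job_title']
            # ... persist the job title ...
        return item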

VII. Distributed Crawlers

pip install scrapy-redis

- Concept: a cluster of machines runs the same set of programs and crawls the data jointly.
- Native Scrapy cannot run distributed:
    - its scheduler cannot be shared
    - its pipelines cannot be shared
The core of a distributed setup: put every url into a single scheduler that the whole cluster shares.
Solution: use the scrapy-redis module, which makes the scheduler and pipeline shareable.
- Distributed crawling requires scrapy-redis:
    - it provides a shareable pipeline and scheduler for native Scrapy
    - pip install scrapy_redis
- Setup workflow:
    - Create the project
    - Create the spider file
    - Modify the spider file:
        - Import: from scrapy_redis.spiders import RedisCrawlSpider
        - Change the spider class's parent class to RedisCrawlSpider
        - Delete allowed_domains and start_urls; add a new attribute redis_key (the name of the scheduler queue)
        - Parse the data, pack it into items, and submit them to the pipeline
    - Edit the configuration file:
        - Specify the pipeline:
            ITEM_PIPELINES = {
                'scrapy_redis.pipelines.RedisPipeline': 400
            }
        - Specify the scheduler:
            # Configure a dedupe container class that stores request fingerprints in a Redis set, making request deduplication persistent
            DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
            # Use scrapy-redis's own scheduler
            SCHEDULER = "scrapy_redis.scheduler.Scheduler"
            # Whether the scheduler state persists: if True, the Redis request queue and fingerprint set are NOT cleared when the crawl ends; if False, they are cleared
            SCHEDULER_PERSIST = True
        - Specify the Redis instance:
            REDIS_HOST = 'ip address of the redis server'
            REDIS_PORT = 6379
            REDIS_ENCODING = 'utf-8'
            REDIS_PARAMS = {'password':'123456'}
        - Start the redis server (with the redis config file: redis-server ./redis.windows.conf) and the client:
            - Adjust the redis config file appropriately:
                - #bind 127.0.0.1
                - protected-mode no
            - Start it
        - Run the program: scrapy runspider xxx.py
        - Push a starting url into the scheduler queue (from the redis client): lpush xxx www.xxx.com
            - xxx is the value of the redis_key attribute
# In the spider file
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from fbsPro.items import FbsproItem
class TestSpider(RedisCrawlSpider):
    name = 'test'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://www.xxx.com/']
    # Name of the scheduler queue
    redis_key = 'dongguan'
    rules = (
        Rule(LinkExtractor(allow=r'type=4&page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        a_list = response.xpath('//a[@class="news14"]')
        for a in a_list:
            item = FbsproItem()
            item['title']= a.xpath('./text()').extract_first()

            yield item

Configuration in settings.py:

# Additional settings

ITEM_PIPELINES = {
   # 'fbsPro.pipelines.FbsproPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400
}
# Configure a dedupe container class that stores request fingerprints in a Redis set, making request deduplication persistent
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use scrapy-redis's own scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler state persists: if True, the Redis request queue and fingerprint set are NOT cleared when the crawl ends; if False, they are cleared
SCHEDULER_PERSIST = True

REDIS_HOST = '192.168.11.154'
REDIS_PORT = 6379

Start Redis:

# Adjust the redis configuration file:
#bind 127.0.0.1       <- comment out the bind line so other machines can connect
protected-mode no

redis-server <path to the config file>

redis-cli

Start the spider file on each machine:

scrapy runspider test.py

Push the starting url into Redis to kick off the distributed crawl:

lpush dongguan <starting url>     # in the redis client

VIII. Incremental Crawlers

  • Concept: a crawler that monitors a site for data updates so that only the newly published data is crawled.
  • How to crawl incrementally:
    • Before sending a request, check whether the URL has been crawled before
    • After parsing, check whether this piece of content has been crawled before
    • When writing to the storage medium, check whether the content already exists there
      • Analysis:

              It is easy to see that the core of incremental crawling is deduplication; where the dedup step is applied is a trade-off. The first two approaches should be chosen based on the actual situation (possibly both): the first suits sites that keep producing new pages, such as new chapters of a novel or each day's latest news, while the second suits sites whose existing pages get updated. The third is essentially a last line of defense and maximizes the dedup coverage.

  • Dedup methods
    • Store the urls produced during the crawl in a Redis set. On the next crawl, check each url that is about to be requested against the stored set; if it is already there, skip the request, otherwise send it.
    • Build a unique fingerprint for each crawled page's content and store it in a Redis set. On the next crawl, before persisting a record, check whether its fingerprint already exists in the set, and decide on that basis whether to store it.

1. URL-based incremental crawler

# In the spider file
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviePro.items import MovieproItem

class MovieSpider(CrawlSpider):
    conn = Redis(host='127.0.0.1',port=6379)
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']
    rules = (
        Rule(LinkExtractor(allow=r'/frim/index1-\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Parse the detail-page urls of the movies listed on the current page
        li_list = response.xpath('//div[@class="stui-pannel_bd"]/ul/li')
        for li in li_list:
            # Parse the detail-page url
            detail_url = 'https://www.4567tv.tv'+li.xpath('./div/a/@href').extract_first()
            # ex == 1: the url has not been requested before; ex == 0: the url has already been requested
            ex = self.conn.sadd('movie_detail_urls',detail_url)
            if ex == 1:
                print('New data to crawl......')
                yield scrapy.Request(url=detail_url,callback=self.parse_detail)
            else:
                print('No new data to crawl!')

    def parse_detail(self,response):
        name = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        m_type = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[1]/a[1]/text()').extract_first()
        item = MovieproItem()
        item['name'] = name
        item['m_type'] = m_type

        yield item
# In the pipeline file
class MovieproPipeline(object):
    def process_item(self, item, spider):
        conn = spider.conn
        dic = {
            'name':item['name'],
            'm_type':item['m_type']
        }
        conn.lpush('movie_data',dic)
        return item

2. Content-based incremental crawling

- Goal: crawl the jokes and their authors from Qiushibaike.

# In the spider file
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from incrementByDataPro.items import IncrementbydataproItem
from redis import Redis
import hashlib
class QiubaiSpider(CrawlSpider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'/text/$'), callback='parse_item', follow=True),
    )
    # Create the redis connection object
    conn = Redis(host='127.0.0.1',port=6379)
    def parse_item(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')

        for div in div_list:
            item = IncrementbydataproItem()
            item['author'] = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
            item['content'] = div.xpath('.//div[@class="content"]/span/text()').extract_first()

            # Generate a unique fingerprint from the parsed values for redis storage
            source = item['author']+item['content']
            source_id = hashlib.sha256(source.encode()).hexdigest()
            # Store the content fingerprint in the redis set data_id
            ex = self.conn.sadd('data_id',source_id)

            if ex == 1:
                print('This record has not been crawled yet; crawling......')
                yield item
            else:
                print('This record has already been crawled; no need to crawl it again!!!')
# In the pipeline file
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from redis import Redis
class IncrementbydataproPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        # print(dic)
        self.conn.lpush('qiubaiData', dic)
        return item

 
