Scrapy Advanced Topics

Image download pipeline

Example: pic.netbian.com (彼岸图网)

import scrapy
from imgPileLinePro.items import ImgpilelineproItem


class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://pic.netbian.com/4kmeinv/']
    url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
    page = 2

    def parse(self, response):
        li_list = response.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src').extract_first()
            item = ImgpilelineproItem()
            item['img_src'] = img_src

            yield item

        # Pagination demo: page starts at 2, so only one extra page (page 2) is requested here.
        if self.page <= 2:
            new_url = self.url % self.page
            self.page += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
img.py
import scrapy


class ImgpilelineproItem(scrapy.Item):
    # define the fields for your item here like:
    img_src = scrapy.Field()
    # pass
items.py
BOT_NAME = 'imgPileLinePro'

SPIDER_MODULES = ['imgPileLinePro.spiders']
NEWSPIDER_MODULE = 'imgPileLinePro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'imgPileLinePro (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

IMAGES_STORE = './imgsLib'


ITEM_PIPELINES = {
    'imgPileLinePro.pipelines.ImgPileLine': 300,
}
settings.py 
from scrapy.pipelines.images import ImagesPipeline
import scrapy


class ImgpilelineproPipeline(object):
    def process_item(self, item, spider):
        return item


class ImgPileLine(ImagesPipeline):
    # Receive the item and send a request for the img_src stored in it
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['img_src'])

    # Specify where the data is stored (folder set in settings via IMAGES_STORE + file name returned by this method)
    def file_path(self, request, response=None, info=None):
        img_name = request.url.split('/')[-1]
        return img_name

    # Pass the item on to the next pipeline class to be executed
    def item_completed(self, results, item, info):
        return item
pipelines.py
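
A usage note: Scrapy's ImagesPipeline depends on the Pillow imaging library, so it is assumed to be installed before the spider is started by name:

pip install Pillow
scrapy crawl img

The downloaded files then appear under the IMAGES_STORE folder configured in settings.py (./imgsLib), with the names returned by file_path() above.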

Middleware

Downloader middleware

  Purpose: intercept, in one place, every request and response issued by the whole project

  Why intercept requests

    - UA spoofing:

      - process_request: request.headers['User-Agent'] = xxx

    - Setting a proxy IP

      - process_exception: request.meta['proxy'] = 'http://ip:port'

  Why intercept responses

    - To tamper with the response data / replace the response object

  - Note: middleware must be enabled manually in the settings file (a settings sketch follows the middleware code below)

Class methods you do not use can be deleted.

from scrapy import signals
import random

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]


class ImgpilelineproDownloaderMiddleware(object):
    # Intercept normal requests
    def process_request(self, request, spider):
        # UA spoofing
        request.headers['User-Agent'] = random.choice(user_agent_list)

        return None

    # Intercept responses
    def process_response(self, request, response, spider):
        return response

    # Intercept request objects that raised an exception
    def process_exception(self, request, exception, spider):
        # Fix the failed request, then have it re-sent
        # Proxy handling
        request.meta['proxy'] = 'https://ip:port'
        return request  # re-send the request
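
As noted above, a downloader middleware only takes effect after it is enabled in the settings file. Assuming the default layout Scrapy generated for this project (543 is the generated default priority), the entry would be:

DOWNLOADER_MIDDLEWARES = {
    'imgPileLinePro.middlewares.ImgpilelineproDownloaderMiddleware': 543,
}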

Using Selenium in Scrapy

To use Selenium in a Scrapy project:

  - Instantiate a browser object in the spider's constructor (__init__)

  - Override a closed(self, spider) method in the spider to quit the browser object

  - In the downloader middleware's process_response, grab the browser object from the spider and perform the browser-automation steps (a condensed sketch follows; the NetEase case below is the full version)
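
Condensed into a sketch (the class names here are placeholders, and in practice only the responses that actually need rendering are replaced, as the NetEase middleware below does):

from selenium import webdriver
from scrapy.http import HtmlResponse
import scrapy


class SomeSpider(scrapy.Spider):
    name = 'some'
    start_urls = ['https://example.com/']

    def __init__(self):
        # one shared browser instance for the whole crawl
        self.bro = webdriver.Chrome(executable_path=r'./chromedriver')

    def parse(self, response):
        pass

    def closed(self, spider):
        # called once when the spider shuts down
        self.bro.quit()


class SomeDownloaderMiddleware(object):
    def process_response(self, request, response, spider):
        spider.bro.get(request.url)          # let the browser render the page
        page_text = spider.bro.page_source   # HTML including dynamically loaded content
        return HtmlResponse(url=request.url, body=page_text,
                            encoding='utf-8', request=request)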

 

Case study: crawling NetEase (网易) news

  Requirement: scrape the news titles and bodies from five NetEase news sections (domestic, international, military, aviation, drones);

       use Baidu AI natural language processing to extract a keyword tag and a category, then persist everything into a four-column MySQL table (title, content, keys, type)

  Analysis:

    1. The news titles under each section are loaded dynamically.

    2. The data on a news detail page is not loaded dynamically.

# -*- coding: utf-8 -*-
import scrapy
from ..items import WangyiproItem
from selenium import webdriver


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    model_urls = []  # holds the URLs of the five target sections

    def __init__(self):
        self.bro = webdriver.Chrome(executable_path=r'D:\chromedriver.exe')

    def parse(self, response):
        # Parse the page links of the target sections (this part is not dynamically loaded)
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        index_list = [3, 4]  # li indexes of the sections to crawl (only two are listed in this snippet)
        for index in index_list:
            li = li_list[index]
            # URL of each section
            model_url = li.xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
            # send a request for each section URL
            yield scrapy.Request(url=model_url, callback=self.parse_title)

    # Parse the news titles of each section (dynamically loaded)
    def parse_title(self, response):
        # without the middleware swap this response would not contain the dynamically loaded data (see middlewares.py)
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_list:
            new_title = div.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
            item = WangyiproItem()
            item['title'] = new_title
            detail_url = div.xpath('.//div[@class="news_title"]/h3/a/@href').extract_first()
            print(detail_url)
            # request the news detail page
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    # Parse the news body (not dynamically loaded)
    def parse_detail(self, response):
        item = response.meta['item']
        content = response.xpath('//*[@id="endText"]//text()').extract()
        content = ''.join(content)
        item['content'] = content

        yield item

    # Called once when the whole spider finishes (must be defined at class level, not nested inside parse_detail)
    def closed(self, spider):
        print('Spider finished!!!')
        self.bro.quit()
wangyi.py (spider file)
import scrapy


class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
items.py
from aip import AipNlp
import pymysql


class WangyiproPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', db='spider', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):

        self.cursor = self.conn.cursor()
        
        # Call Baidu AI
        """ your APPID / API key / secret key """
        APP_ID = '16821895'
        API_KEY = 'kLRYwUHKHwgGeowOdfeU9MmZ'
        SECRET_KEY = 'mtLySt8KRkF0hrFLKnlEQ8L8WNK4CTwu'

        client = AipNlp(APP_ID, API_KEY, SECRET_KEY)

        title = item['title']
        content = item['content']
        content = content.replace("\n", "")
        content = content.replace(u'\xa0', u'')

        # """ 调用文章标签 """
        keys = None
        result = client.keyword(title, content).get('items')
        for key in result:
            if key.get('score') > 0.60:
                keys = key.get("tag")

        # """ 调用文章分类 """
        type = None
        types = client.topic(title, content).get('item').get('lv2_tag_list')
        for t in types:
            if t.get('score') > 0.50:
                type = t.get('tag')
        print('标题:', title, '标签:', keys, '分类', type)
        sql = 'insert into wangyi values ("%s","%s","%s","%s")' % (title, content, keys, type)
        
        # 进行事物处理
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
pipelines.py
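
The requirement above calls for a four-column table (title, content, keys, type), but its schema is never shown. A one-off creation sketch, reusing the connection parameters from open_spider (column sizes are assumptions; keys and type are backquoted to avoid clashing with MySQL keywords):

import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', db='spider', charset='utf8')
with conn.cursor() as cursor:
    # one row per article: title, full text, extracted keyword, predicted category
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS wangyi (
            title   VARCHAR(255),
            content TEXT,
            `keys`  VARCHAR(100),
            `type`  VARCHAR(100)
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()
conn.close()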
from scrapy.http import HtmlResponse
from time import sleep


class WangyiproDownloaderMiddleware(object):

    def process_request(self, request, spider):
        return None

    # Intercept every response
    def process_response(self, request, response, spider):
        # Replace the response objects of the target sections with ones that contain the rendered data
        model_urls = spider.model_urls  # section URLs collected by the spider
        bro = spider.bro
        if request.url in model_urls:
            # this response belongs to a section page and needs to be replaced
            bro.get(request.url)
            sleep(2)
            bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(1)
            bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(1)
            page_text = bro.page_source  # page source including the dynamically loaded data
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf8', request=request)
            return new_response
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
middlewares.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3861.0 Safari/537.36 Edg/77.0.230.2'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'


DOWNLOADER_MIDDLEWARES = {
   'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}


ITEM_PIPELINES = {
   'wangyiPro.pipelines.WangyiproPipeline': 300,
}
settings.py

Full-site crawling with CrawlSpider

  CrawlSpider is a subclass of Spider: besides inheriting Spider's features, it adds more powerful capabilities of its own, the most notable being the LinkExtractor (link extractor). Spider is the base class of all crawlers and is designed only to crawl the pages in the start_urls list; when the URLs extracted from crawled pages must be crawled in turn, CrawlSpider is the better fit.

Create a scrapy project:

scrapy startproject projectName

Create a CrawlSpider-based spider file:

scrapy genspider -t crawl spiderName www.xxx.com

Run the project:

scrapy crawl spiderName

Example: Dongguan Sunshine Hotline (阳光网, wz.sun0769.com)

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# A simple full-site crawl
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
    # Link extractor -- extracts links according to the given rule (allow: a regular expression)
    link = LinkExtractor(allow=r'type=4&page=\d+')
    rules = (
        # Rule parser -- parses the page data via the given callback
        # -- applies the callback to the page source of every link the link extractor found
        Rule(link, callback='parse_item', follow=True),  # follow=True covers every page, not just the current one
        # follow=True: keep applying the link extractor to the pages reached through the extracted links
    )

    def parse_item(self, response):
        print(response)  # responses for all the page URLs

Deep crawling (following detail pages):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import SunProDetail, SunproItem

# Deep crawl: also follow detail pages
class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
    # Link extractor -- extracts links according to the given rule (allow: a regular expression)
    link = LinkExtractor(allow=r'type=4&page=\d+')
    # A second link extractor for the detail pages
    link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')
    rules = (
        # Rule parser -- parses the page data via the given callback
        # -- applies the callback to the page source of every link the link extractor found
        Rule(link, callback='parse_item', follow=True),  # follow=True covers every page, not just the current one
        Rule(link_detail, callback='parse_detail')  # no follow=True needed here; it defaults to False, which is what we want
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            num = tr.xpath('./td[1]/text()').extract_first()
            item = SunproItem()
            item['title'] = title
            item['num'] = num

            yield item

    def parse_detail(self, response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td/div[2]/text()').extract_first()
        num = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        num = num.split(':')[-1]
        item = SunProDetail()
        item['content'] = content
        item['num'] = num

        yield item
sun.py
import scrapy


class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    num = scrapy.Field()


class SunProDetail(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()
items.py
class SunproPipeline(object):
    def process_item(self, item, spider):
        if item.__class__.__name__ == "SunProDetail":
            content = item['content']
            num = item['num']
            # store into the database here; three columns in total (num, title, content)
            # insert num and content first
        else:
            title = item['title']
            num = item['num']
            # then update the title here with: where num = num
        return item
pipelines.py

This example does not implement middleware or actual pipeline-based persistence.
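
For completeness, a sketch of the two-step MySQL write that the pipeline comments describe might look like the following (the table name sun and its three columns num, title, content are assumptions):

import pymysql


class SunproMysqlPipeline(object):
    """Sketch only: persists SunProDetail first (num, content), then fills in
    the title from SunproItem rows by matching on num."""

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    db='spider', charset='utf8')

    def process_item(self, item, spider):
        with self.conn.cursor() as cursor:
            if item.__class__.__name__ == 'SunProDetail':
                cursor.execute('insert into sun (num, content) values (%s, %s)',
                               (item['num'], item['content']))
            else:
                cursor.execute('update sun set title = %s where num = %s',
                               (item['title'], item['num']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()

Because the list items and the detail items arrive in no guaranteed order, a more robust version would key the table on num and use INSERT ... ON DUPLICATE KEY UPDATE for both item types.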

Distributed crawling

Distributed:

  Concept: build a cluster of machines and have the cluster crawl the same set of data cooperatively.

  Purpose: improve crawling efficiency.

  How is the distribution implemented?

    With scrapy + redis

      i.e. scrapy combined with the scrapy-redis component

  Plain scrapy cannot be distributed on its own:

    its scheduler cannot be shared across the cluster

    its pipelines cannot be shared across the cluster

  What does scrapy-redis do?

    It gives plain scrapy a scheduler and a pipeline that can be shared.

  Why is the component called scrapy-redis?

    Because the data crawled by the cluster must be stored in redis.

Coding workflow:

- pip install scrapy-redis

- Create the spider file (CrawlSpider/Spider)

- Modify the spider file:
    - Import the class provided by the scrapy-redis module
        - from scrapy_redis.spiders import RedisCrawlSpider
    - Change the spider's parent class to RedisCrawlSpider
    - Delete allowed_domains and start_urls
    - Add a new attribute: redis_key = 'xxx'  # name of the shared scheduler queue

- Configure the settings file:
    - Specify the pipeline:
        ITEM_PIPELINES = {
            'scrapy_redis.pipelines.RedisPipeline': 400
        }
    - Specify the scheduler:
        DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
        SCHEDULER = "scrapy_redis.scheduler.Scheduler"
        SCHEDULER_PERSIST = True
    - Specify the database:
        REDIS_HOST = 'IP address of the redis server'
        REDIS_PORT = 6379

- Edit the redis configuration file redis.windows.conf:
    - line 56: comment out the bind directive (#bind 127.0.0.1)
    - turn off protected mode: protected-mode no

- Start the redis service:
    - redis-server ./redis.windows.conf
    - redis-cli

- Run the program:
    cd into the directory containing the spider file, then: scrapy runspider xxx.py

- Push a start URL into the scheduler queue:
    - the queue name is the value of redis_key
        - in redis-cli: lpush <queue name>  www.xxx.com

Example:

Spider file:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapy_redis.spiders import RedisCrawlSpider
from fbsPro.items import FbsproItem


class FbsSpider(RedisCrawlSpider):
    name = 'fbs'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://www.xxx.com/']
    redis_key = 'sunQueue'  # name of the shared scheduler queue
    rules = (
        Rule(LinkExtractor(allow=r'type=4&page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            item = FbsproItem()
            item['title'] = title

            yield item

items

import scrapy


class FbsproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    # pass

Pipeline:

class FbsproPipeline(object):
    def process_item(self, item, spider):
        return item

settings

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'

ROBOTSTXT_OBEY = False

CONCURRENT_REQUESTS = 2

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}

# Add a dedupe container class: it uses a Redis set to store request fingerprints, making request deduplication persistent
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler that ships with the scrapy-redis component
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler persists: i.e. whether the request queue and the dedupe fingerprint set in Redis are kept when the crawl ends. True keeps the data; False clears it.
SCHEDULER_PERSIST = True

REDIS_HOST = 'xxx.xxx.xx.xx'
REDIS_PORT = 6379
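
Once each node is running the spider (scrapy runspider fbs.py from the spiders directory), the queue can also be seeded and the results inspected from Python instead of redis-cli. A sketch, assuming the Redis instance configured above and reusing the 阳光网 start URL from the earlier CrawlSpider example ('fbs:items' is scrapy-redis's default items key, built as '<spider name>:items'):

from redis import Redis

# connect to the same instance as REDIS_HOST / REDIS_PORT in settings
conn = Redis(host='127.0.0.1', port=6379)

# seed the shared scheduler queue; the key must match redis_key = 'sunQueue'
conn.lpush('sunQueue',
           'http://wz.sun0769.com/index.php/question/questionType?type=4&page=')

# items stored by scrapy_redis.pipelines.RedisPipeline end up in '<spider name>:items'
print(conn.llen('fbs:items'))
print(conn.lrange('fbs:items', 0, 2))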

 

Incremental crawling

- Concept: monitor a website for data updates and crawl only the newly added data.

- How to implement it

  - Deduplication!

Example:

  - A movie website: the data to crawl is not all on one page

  - A record must be kept of every movie's detail-page URL

  - When the program runs, the detail-page URLs about to be crawled are checked against that record

  - The detail-page URL records can be stored in a Python set or, better, a Redis set

  - All the crawled movie data can be stored in Redis

  Analysis:
    - when the data behind a fixed URL itself changes, monitor the data instead of the URL

    - data fingerprint: a unique identifier built from a record of data (a small sketch follows)
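
The movie example below deduplicates on detail-page URLs; when the data lives behind a fixed URL, a data fingerprint can be used instead. A minimal sketch (the Redis key name data_fingerprints and the record layout are made up for illustration):

import hashlib

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)


def is_new_record(record: dict) -> bool:
    """Register a fingerprint for one record in a Redis set.

    Returns True only the first time this exact record is seen."""
    # serialize the fields in a stable order, then hash them
    raw = '|'.join(str(record[k]) for k in sorted(record))
    fingerprint = hashlib.md5(raw.encode('utf-8')).hexdigest()
    # sadd returns 1 for a new member, 0 for a duplicate
    return conn.sadd('data_fingerprints', fingerprint) == 1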

Sample code:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from zls_movie_Pro.items import ZlsMovieProItem


class MovieSpider(CrawlSpider):
    conn = Redis(host='127.0.0.1', port=6379)
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/index.php/vod/show/id/5.html']

    rules = (
        Rule(LinkExtractor(allow=r'/index\.php/vod/show/id/5/page/\d+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # movie name and detail-page url
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            name = li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/text()').extract_first()

            item = ZlsMovieProItem()
            item['name'] = name

            detail_url = 'http://www.4567kan.com' + li.xpath(
                './/div[@class="stui-vodlist__detail"]/h4/a/@href').extract_first()
            # ex == 1: the URL was added to the set   ex == 0: the URL was already there (duplicate)
            ex = self.conn.sadd('movie_detail_urls', detail_url)

            if ex == 1:
                print('New data available to crawl......')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
            else:
                print('No data updates yet!')

    def parse_detail(self, response):
        movie_desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item = response.meta['item']
        item['desc'] = movie_desc

        yield item
movies.py
import scrapy


class ZlsMovieProItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    desc = scrapy.Field()
    # pass
items.py
class ZlsMovieProPipeline(object):
    def process_item(self, item, spider):
        conn = spider.conn
        # serialize the item first; recent versions of redis-py refuse to store a dict/Item object directly
        conn.lpush('movie_data', str(dict(item)))
        return item
pipelines.py

In settings, keep the usual fixed configuration and simply enable the pipeline (a sketch follows).
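
A minimal sketch of that settings entry, assuming the default project layout implied by the zls_movie_Pro import above:

ITEM_PIPELINES = {
    'zls_movie_Pro.pipelines.ZlsMovieProPipeline': 300,
}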
