6. UA & proxy pools; using selenium in Scrapy; full-site data extraction with CrawlSpider; crawling NetEase news


Reference:

  https://www.cnblogs.com/bobo-zhang/p/9670764.html

 

 

UA: the User-Agent string, which identifies the client to the server.

Proxy pool: a pool of proxy IP addresses to rotate through.

 

 

Both are implemented in the downloader middleware, which has to be enabled.

 

 

 

 

 


 

 

 

Using the UA & proxy pools:

  

 

The middleware classes below need to be enabled in the settings file (see the settings sketch after the middleware code).

 

 

 

 


# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# This is the middlewares file


# UA pool
# Note: recent Scrapy versions removed the old scrapy.contrib path;
# the built-in class now lives under scrapy.downloadermiddlewares.
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware  # the class our UA pool inherits from

import random
user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

# Note: UAPool is our own class; it must inherit from the imported UserAgentMiddleware.
class UAPool(UserAgentMiddleware):
    def process_request(self, request, spider):
        # random.choice returns a single UA string (random.choices would return a list)
        ua = random.choice(user_agent_list)
        # Key step: overwrite the User-Agent header of the intercepted request
        request.headers['User-Agent'] = ua
        print(request.headers['User-Agent'])

        
        
# Sample free proxies, split by scheme; these expire quickly, so substitute working ones
proxy_http = ['125.27.10.150:56292','114.34.168.157:46160']
proxy_https = ['1.20.101.81:35454','113.78.254.156:9000']


# This class was generated by the Scrapy project template; the proxy logic is added to it.
class UapoolproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    # request is the intercepted request object
    # spider is the spider object
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        
        # Choose a proxy that matches the scheme (http vs https) of the outgoing request
        if request.url.split(':')[0] == 'https':
            request.meta['proxy'] = 'https://'+random.choice(proxy_https)
        else:
            request.meta['proxy'] = 'http://'+random.choice(proxy_http)
        print(request.meta['proxy'])
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response
middlewares.py
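
The middlewares only take effect after they are registered in settings.py. A minimal sketch, assuming the Scrapy project is named uapoolPro (inferred from the generated class name above); the priority numbers are just reasonable defaults:

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'uapoolPro.middlewares.UAPool': 542,                         # rotate User-Agent
    'uapoolPro.middlewares.UapoolproDownloaderMiddleware': 543,  # rotate proxies
}
ROBOTSTXT_OBEY = False  # commonly switched off in these exercises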

 

 

 


Using selenium in Scrapy:


 

 

 

 

1. The spider file. The selenium browser is created in the spider, and the downloader middleware (step 2) replaces intercepted responses with the selenium-rendered page source, so the code lives in both the spider file and the downloader middleware.

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver


# This is the spider file
class SelenuimtestSpider(scrapy.Spider):
    name = 'selenuimTest'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['http://war.163.com/']
    
    # Create the selenium browser here; the downloader middleware reuses it via spider.bro
    def __init__(self):
        super().__init__()
        self.bro = webdriver.Chrome(r'C:\Users\Administrator\Desktop\爬虫+数据\day03\驱动程序\chromedriver.exe')
    
    def parse(self, response):
        print('start!!!')
        print(response.xpath('/html/body/div[1]/div[3]/div[4]/div[1]/div/div/ul/li/div/div[4]/div/div[1]/h3/a/text()').extract_first())
    
    # Runs when the spider closes: quit the browser that selenium opened
    def closed(self,spider):
        self.bro.quit()

 

2. The downloader middleware file

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

from scrapy.http import HtmlResponse  # note this import: used to build the replacement response


class SeleniumproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_response(self, request, response, spider):
        # Use selenium to fetch the dynamically loaded data
        if request.url in ['http://war.163.com/']:
            spider.bro.get('http://war.163.com/')
            page_text = spider.bro.page_source
            return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)  # build a new response from the rendered page; note each parameter

        return response
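
As before, this middleware only runs once it is enabled in settings.py. A minimal sketch, assuming the project is named seleniumPro (inferred from the class name above):

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'seleniumPro.middlewares.SeleniumproDownloaderMiddleware': 543,
}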

 

 


CrawlSpider: full-site data extraction

 

1. The spider file

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CrawltestSpider(CrawlSpider):
    name = 'crawlTest'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['https://dig.chouti.com/r/scoff/hot/1']
    # Link extractor object: pulls the links that match the allow pattern out of each page
    link = LinkExtractor(allow=r'/r/scoff/hot/\d+')
    rules = (
        # Rule: pages behind the links found by the link extractor are parsed with the given callback
        Rule(link, callback='parse_item', follow=True),  # follow=True: the link extractor is applied again
        # to the pages reached through the extracted links, so pagination is followed recursively
    )

    def parse_item(self, response):
       print(response)

 


2. When one regular expression cannot cover all of the URLs, write multiple link extractors, each paired with its own rule.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class QiubaiSpider(CrawlSpider):
    name = 'qiubai'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/pic/']
    # Two link extractors are used because the first page's URL and the later page URLs
    # cannot be matched by the same regular expression.
    link = LinkExtractor(allow=r'/pic/page/\d+\?')
    link1 = LinkExtractor(allow=r'/pic/$')
    rules = (
        Rule(link, callback='parse_item', follow=True),
        Rule(link1, callback='parse_item', follow=True),  # each link extractor gets its own rule; here both reuse the same callback
    )

    def parse_item(self, response):
       print(response)

 

 


 

Crawling NetEase news
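
A minimal sketch of how the pieces above can be combined for the NetEase news crawl, assuming the selenium-backed downloader middleware from the previous section is enabled for the channel page. The spider name wangyi, the XPath expressions, and the chromedriver path are illustrative placeholders, not taken from the original post:

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    start_urls = ['http://war.163.com/']  # same dynamically loaded channel as above

    def __init__(self):
        super().__init__()
        # One shared browser; the downloader middleware reuses it via spider.bro
        self.bro = webdriver.Chrome(r'/path/to/chromedriver')  # placeholder path

    def parse(self, response):
        # The middleware has replaced this response with the selenium-rendered page,
        # so the dynamically loaded headline nodes are present.
        for a in response.xpath('//div[contains(@class, "news_title")]//h3/a'):  # illustrative XPath
            title = a.xpath('./text()').extract_first()
            detail_url = a.xpath('./@href').extract_first()
            if title and detail_url:
                # Detail pages are static, so a plain request is enough
                yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'title': title})

    def parse_detail(self, response):
        # Illustrative item: headline plus the concatenated article text
        yield {
            'title': response.meta['title'],
            'content': ''.join(response.xpath('//div[@id="endText"]//text()').extract()).strip(),
        }

    def closed(self, spider):
        self.bro.quit()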


 
