使用Scrapy框架爬取Boss招聘信息

免责声明:本文仅供学习参考使用,不能用于恶意攻击网站。考虑到安全性以及法律问题,本人仅提供部分代码以及破解思路。

 

思路:

  首先,Boss采用的反爬虫机制是IP封禁,并且所有内容都是动态加载的。既然是动态加载的,就需要借助selenium和浏览器驱动,或者splash,这里我所使用的是selenium。对于IP封禁,就使用代理IP池来解决。但是之前我们搭建的代理IP池只是爬取所有代理IP并进行筛选,并没有按代理IP的类型进行筛选,如果在爬取过程中使用了普通代理或者透明代理,同样也会被封掉,所以想完完整整地爬取下来所有信息就只能使用付费代理了。既然是作为学习参考使用,我就直接使用之前搭建的代理IP池了。

 

部分代码如下:

import random
from scrapy.http import HtmlResponse
from time import sleep
import requests
import logging

class PRoxyMiddleware():
    """Downloader middleware that attaches a proxy-pool proxy to retried requests.

    A proxy is requested from the pool endpoint only when the request's
    ``meta`` carries a truthy ``retry_times`` value (presumably set by
    Scrapy's retry middleware -- confirm against the project settings), so
    first attempts go out without a proxy.
    """

    # Fallback pool endpoint used when no proxy URL is configured.
    DEFAULT_PROXY_POOL_URL = 'http://localhost:5555/random'
    # Seconds to wait for the proxy pool; without a timeout a stalled pool
    # would block the crawl indefinitely.
    REQUEST_TIMEOUT = 5

    def __init__(self, proxy_url):
        """Store the proxy pool endpoint.

        Args:
            proxy_url: URL that returns one random proxy (``host:port``) per
                GET. When empty or ``None``, ``DEFAULT_PROXY_POOL_URL`` is used.
        """
        self.logger = logging.getLogger(__name__)
        # Honour the configured URL instead of silently ignoring it.
        self.proxies_pool_url = proxy_url or self.DEFAULT_PROXY_POOL_URL

    def get_random_proxy(self):
        """Fetch one proxy from the pool.

        Returns:
            The response body with surrounding whitespace removed, or
            ``None`` when the pool is unreachable, times out, answers with a
            non-200 status, or returns an empty body.
        """
        try:
            response = requests.get(
                url=self.proxies_pool_url,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.exceptions.RequestException as exc:
            # Best effort: a failing pool must not abort the request itself.
            self.logger.warning('Proxy pool request failed: %s', exc)
            return None
        if response.status_code != 200:
            return None
        # Strip stray whitespace/newlines so the proxy URI stays well-formed.
        return response.text.strip() or None

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` for requests that are being retried.

        Args:
            request: The outgoing request; its ``meta`` dict is read and
                possibly updated.
            spider: The running spider (unused).

        Returns:
            Always ``None``.
        """
        if not request.meta.get('retry_times'):
            return None
        proxy = self.get_random_proxy()
        if not proxy:
            return None
        # NOTE(review): the proxy is always prefixed with https:// -- confirm
        # that the proxies served by the pool are meant to be addressed this way.
        uri = f'https://{proxy}'
        self.logger.debug('使用代理' + proxy)
        request.meta['proxy'] = uri
        return None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``PROXY_URL`` setting."""
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )

class BossproDownloaderMiddleware:
    """Downloader middleware that spoofs the User-Agent and renders pages in a browser.

    Outgoing requests get a random User-Agent from ``user_agent_list``;
    every response is replaced by the page source obtained from the spider's
    browser object so that dynamically loaded content is included.
    """

    # Pool of User-Agent strings; one is picked at random per request.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

    # Seconds to wait after loading a page so dynamic content can finish
    # rendering before the page source is read.
    render_wait_seconds = 2

    def process_request(self, request, spider):
        """Assign a random User-Agent header to the outgoing request.

        Args:
            request: The outgoing request; its ``headers`` mapping is updated.
            spider: The running spider (unused).

        Returns:
            Always ``None``.
        """
        # User-Agent spoofing.
        request.headers['User-Agent'] = random.choice(self.user_agent_list)

        return None

    def process_response(self, request, response, spider):
        """Replace the response with the browser-rendered page.

        Args:
            request: The request that produced ``response``.
            response: The downloaded response; only its ``url`` is used.
            spider: The running spider; must expose a ``bro`` attribute with
                ``get(url)`` and ``page_source`` (presumably a selenium
                WebDriver -- confirm against the spider).

        Returns:
            An ``HtmlResponse`` for ``request.url`` whose body is the
            rendered page source.
        """
        bro = spider.bro
        bro.get(response.url)
        # Blocking wait: gives the page time to load its dynamic content.
        sleep(self.render_wait_seconds)
        page_text = bro.page_source  # includes the dynamically loaded data
        new_response = HtmlResponse(
            url=request.url,
            body=page_text,
            encoding='utf-8',
            request=request,
        )
        return new_response

    def process_exception(self, request, exception, spider):
        """Take no action on download exceptions; always returns ``None``."""
        return None
posted @ 2021-04-20 20:32  Ccdjun  阅读(217)  评论(0)  编辑  收藏  举报