Scraping Lagou (拉勾网)

I. Scraping with the requests module

# Page analysis:
# When you open the browser devtools on the listing page, a front-end anti-scraping trick
# immediately pauses the debugger and raises an exception.
# Workarounds: 1. deactivate the breakpoints in the devtools panel, or 2. debug with a packet-capture tool instead.
# https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false
import requests
# The actual URL the job data is requested from
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

payload = {
    'first': 'true',
    'pn': '1',  # page number
    'kd': 'python',
}

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Accept': 'application/json, text/javascript, */*; q=0.01'
}

# Original search-page URL: send a GET request here first to pick up the cookies
urls = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='

# Create a session
session = requests.Session()
# Visit the search page so the session picks up its cookies
session.get(urls, headers=header, timeout=3)
# The cookies obtained from that request
cookie = session.cookies

# With those cookies, send a POST request to the Ajax endpoint and read the response text
response = session.post(url, data=payload, headers=header, cookies=cookie, timeout=5).text
print(response)
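
The endpoint returns JSON. A minimal sketch of pulling the job list out of the response text, assuming the payload keeps the content -> positionResult -> result structure commonly seen for this endpoint (the field names are assumptions, not guaranteed by the site):

import json

data = json.loads(response)
jobs = data.get('content', {}).get('positionResult', {}).get('result', [])
for job in jobs:
    # positionName / city / salary are field names typically present in each job entry
    print(job.get('positionName'), job.get('city'), job.get('salary'))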


II. Scraping with the selenium module

# After Lagou added anti-scraping measures, the POST data and the response are encrypted with JS,
# so we no longer know what parameters the POST request sends and cannot read the response directly.
# Lagou also limits how many pages an unlogged-in visitor can view; after a certain number of pages
# the site redirects to the login page.
# Scraping with the selenium module is more convenient under these conditions.

1. Request analysis

1. Open the Lagou home page (URL: https://www.lagou.com/).
Type "python" into the search box and the page jumps to the following URL:
https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=

This is our starting URL: send a GET request here first to pick up the cookies.
https://www.lagou.com/jobs/list_{position}?labelWords=&fromSearch=true&suginput=
Here the parameter position is whatever was typed into the search box (it needs to be URL-encoded; see the short sketch after this analysis).

2. In the browser devtools, inspect the Ajax requests: the URL that actually returns the data is https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false,
but the request data and the response are JS-encrypted and cannot be obtained directly.
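
As a quick illustration of the URL encoding mentioned in step 1, here is a minimal sketch that builds the starting URL from a search term (the variable names are illustrative only):

from urllib.parse import quote

position = 'python'                 # the raw search term; Chinese terms work as well
encoded = quote(position, safe='')  # percent-encode it before putting it into the URL path
start_url = f'https://www.lagou.com/jobs/list_{encoded}?labelWords=&fromSearch=true&suginput='
print(start_url)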

2. Workflow analysis

1. Use selenium to visit the home page first; its cookies are handled automatically.
2. Log in manually, copy the cookies, and add them to the browser (driver) instance.
3. Simulate the search by visiting https://www.lagou.com/jobs/list_{position}?labelWords=&fromSearch=true&suginput= directly.
4. Parse the page, extract the fields we need, and save them.
# position and the cookie are entered manually at runtime
import requests
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from urllib.parse import quote
import time
import redis
import json
import uuid
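# redis_pool is a local helper module that exposes a shared Redis connection pool (a sketch is given at the end of this post)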
from redis_pool import POOL


class LagouSpider(object):
    # Class-level driver options: 1. hide the traces that mark the browser as selenium-driven  2. add a proxy
    option = ChromeOptions()
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])

    def __init__(self):
        self.proxy = self._get_proxy()
        self.position = self._get_position()
        self.url = self._get_url()
        LagouSpider.option.add_argument(self.proxy)
        self.driver = webdriver.Chrome(options=LagouSpider.option)  
        # For Chrome 79 and later, hide navigator.webdriver via CDP to avoid detection
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                  get: () => undefined
                })
              """
        })
        self.driver.implicitly_wait(10)

    @staticmethod
    def _get_position():
        position = input('Enter the position to search for: ').strip()
        # URL-encode the search term; note that quote()'s second positional argument is
        # `safe`, not the encoding, so pass the encoding by keyword
        return quote(position, encoding='utf-8')

    @staticmethod
    def _get_proxy():
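        # Assumes a local proxy-pool service on 127.0.0.1:8000 whose /get/ endpoint returns
        # a string Chrome accepts as a command-line argument (e.g. '--proxy-server=http://ip:port');
        # adapt this if your pool returns a bare ip:port instead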
        return requests.get('http://127.0.0.1:8000/get/').text

    def _get_url(self):
        lg_url = 'https://www.lagou.com/jobs/list_{position}/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput='
        return lg_url.format(position=self.position)

    def _add_cookies(self):
        # Paste the cookie string copied from a logged-in browser session
        cookie = input('Enter your cookie string: ').strip()
        # Cookie format: k1=v1; k2=v2; k3=v3 ...
        for item in cookie.split(';'):
            # split on the first '=' only, since cookie values may themselves contain '='
            k, v = item.strip().split('=', 1)
            self.driver.add_cookie({'name': k, 'value': v})

    def pares_page(self):
        li_list = self.driver.find_element(By.CSS_SELECTOR, '#s_position_list > ul').find_elements(By.TAG_NAME, 'li')
        for li in li_list:
            try:
                position_name = li.find_element(By.CSS_SELECTOR, '.position_link h3').text
                job_addr = li.find_element(By.CSS_SELECTOR, '.add em').text.split('·')[0]
                salary = li.find_element(By.CSS_SELECTOR, '.p_bot .li_b_l').text.split()[0]
                experience = li.find_element(By.CSS_SELECTOR, '.p_bot .li_b_l').text.split()[1]
                education = li.find_element(By.CSS_SELECTOR, '.p_bot .li_b_l').text.split()[3]
                trade = li.find_element(By.CSS_SELECTOR, '.company .industry').text.split()[0]

                data_dic = {'position': position_name,
                            'location': job_addr,
                            'salary': salary,
                            'experience': experience,
                            'education': education,
                            'industry': trade}
            except Exception:
                # Skip any listing card that does not match the expected layout
                continue
            self.save(data_dic)
            time.sleep(0.5)
            
        # Move to the next page and recurse; stop when there are no more pages
        if self.turn_page():
            self.pares_page()

    def turn_page(self):
        next_button = self.driver.find_element(By.CSS_SELECTOR, '.pager_container .pager_next')
        # On the last page Lagou keeps the button but marks it disabled (assumed class name
        # 'pager_next_disabled'); clicking it would just re-parse the same page forever
        if 'pager_next_disabled' in (next_button.get_attribute('class') or ''):
            return False
        time.sleep(1)
        next_button.click()
        return True

    def run_crawl(self):
        try:
            self.driver.get('https://www.lagou.com/')   # get the home-page cookies first; the listing page rejects direct access without them
            self._add_cookies()                         # add the cookies copied from a logged-in browser to the driver
            self.driver.get(self.url)                   # open the listing page
            time.sleep(3)
            self.pares_page()                           # parse the data
            print('Crawl finished')
        except Exception as e:
            print(e)
        finally:
            self.driver.quit()  # quit() also shuts down the chromedriver process, unlike close()

    def save(self, data):
        pid = str(uuid.uuid4())
        json_payload = json.dumps(data)
        conn = redis.Redis(connection_pool=POOL)
        conn.hset('lg_position', pid, json_payload)

if __name__ == '__main__':
    spider = LagouSpider()
    spider.run_crawl()
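
The script imports POOL from a local redis_pool module that is not shown above. A minimal sketch of what that module might contain, assuming a Redis server on the default localhost:6379 (host, port and db are assumptions; adjust them to your setup):

# redis_pool.py
import redis

# Shared connection pool so every save() call reuses connections instead of opening new ones
POOL = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0, max_connections=10)

The saved records can later be read back with conn.hgetall('lg_position'): each field is a UUID and each value is the JSON payload written by save().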

