# 拉勾网爬虫 (Lagou job-listing crawler)

# -*- coding: utf-8 -*-
# TODO https://www.lagou.com/wn/jobs?kd=Java&city=%E5%85%A8%E5%9B%BD
# @Date    : 2022/4/25 9:53
# @Author  : layman
import requests
import json
from lxml import etree


def getNextUrl(kd, pn):
    """Collect the job-detail page URLs found on one Lagou search-result page.

    The search page embeds its data as JSON inside the
    ``<script id="__NEXT_DATA__">`` element; every key of the ``hrInfoMap``
    object found there is turned into a ``/wn/jobs/<key>.html`` URL.

    Args:
        kd: Search keyword, sent as the ``kd`` query parameter.
        pn: Value sent as the ``pn`` query parameter (presumably the
            1-based result page number -- confirm against the site).

    Returns:
        A list of job-detail URLs (strings), in ``hrInfoMap`` key order.

    Raises:
        ValueError: If the response contains no ``__NEXT_DATA__`` script
            (for example when the site serves a login or anti-bot page).
        KeyError: If the embedded JSON lacks the expected nesting.
        requests.RequestException: On network failure or timeout.
    """
    from urllib.parse import quote

    params = {
        'pn': pn,
        'cl': 'false',
        'fromSearch': 'true',
        'kd': kd
    }
    url = 'https://www.lagou.com/wn/jobs'
    headers = {
        'origin': 'https://www.lagou.com',
        # Percent-encode the keyword: header values must be latin-1
        # encodable, so a raw non-ASCII keyword would fail when the request
        # is sent. ASCII keywords such as 'Java' are left unchanged.
        'referer': f'https://www.lagou.com/wn/jobs?kd={quote(str(kd))}&city=%E5%85%A8%E5%9B%BD',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        # NOTE(review): hard-coded logged-in session cookie. It is most
        # likely expired and should be loaded from configuration instead of
        # living in source control. The value is written as two adjacent
        # literals because the original single literal was broken across
        # two physical lines, which is a syntax error.
        'cookie': (
            'RECOMMEND_TIP=true; user_trace_token=20220220111830-3bc268fdsfsfsfd2-e379bb0a6e1e; LGUID=20220220111830-4acc255d-b370-468d-8d4d-517f0755b875; _ga=GA1.2.1717447248.1645327108; smidV2=20220313164640140aa5fbc1e260b461e911b961866f1c009ed560315758d80; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAAECABFAACEA4414CB00A33EF5FE9D415B7DD089665E; WEBTJ-ID=20220425093829-1805e5ec1d52f8-048da4ab60938c-9771a3f-1327104-1805e5ec1d64d6; PRE_UTM=; PRE_HOST=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGSID=20220425093831-e2ffe5b0-9198-47ad-bc84-a23cbcf5ff18; PRE_SITE=https%3A%2F%2Fwww.lagou.com; _gid=GA1.2.952409338.1650850711; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1649565890,1649923373,1650850711; privacyPolicyPopup=false; sensorsdata2015session=%7B%7D; LG_LOGIN_USER_ID=576eb66efed94bf89ae4f0f382542744027d9d3bda167101a9c1ab4c84eeda03; LG_HAS_LOGIN=1; _putrc=7353C66353E1FA2E123F89F2B170EADC; login=true; unick=%E5%BC%A0%E9%A1%BA; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=7; __SAFETY_CLOSE_TIME__19393310=1; gate_login_token=9166364f3eefd44bafa66a710af72feb9811c092a785c503d44e4379fca7a353; TG-TRACK-CODE=index_navigation; __lg_stoken__=8382b3ec8ec3d4b5622e9c1f6a8747ee723a44d4e7596be643eba21610c0f3b7a03abe1c2f9b1c0e856ead2d2d283c9f09ece56180188eb1cccf60ebe129a7d32e12d6d5da54; X_HTTP_TOKEN=d65cba845dbca000770158056188260baa07dfe93c; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1650851076; LGRID=20220425094441-b47f8533-991b-489e-a13a-493c1eb35141; '
            'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219393310%22%2C%22first_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2299.0.4844.82%22%7D%2C%22%24device_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%7D'
        ),
    }
    # A timeout keeps the crawler from hanging forever on a stalled socket.
    response = requests.get(url=url, headers=headers, params=params, timeout=10)
    html = etree.HTML(response.text)
    json_str = html.xpath('//script[@id="__NEXT_DATA__"]/text()')
    if not json_str:
        raise ValueError(
            'No __NEXT_DATA__ script found in the search page '
            f'(HTTP {response.status_code}); the cookie may have expired '
            'or the request was blocked.'
        )
    json_data = json.loads(json_str[0])
    content = json_data["props"]["pageProps"]["initData"]["content"]["hrInfoMap"]
    # Each hrInfoMap key is used as the job id in the detail-page URL.
    return ['https://www.lagou.com/wn/jobs/' + key + '.html' for key in content]


# url_list = getNextUrl(kd='Java', pn=4)


def getDetail(url):
    """Fetch one Lagou job-detail page and print its description text.

    Args:
        url: Absolute URL of the job-detail page.

    Returns:
        The list of text nodes found under the element with
        ``id="job_detail"`` (empty when that element is absent). The same
        list is printed to stdout, as the original implementation did.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    headers = {
        'origin': 'https://www.lagou.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        # NOTE(review): hard-coded logged-in session cookie. It is most
        # likely expired and should be loaded from configuration instead of
        # living in source control. The value is written as two adjacent
        # literals because the original single literal was broken across
        # two physical lines, which is a syntax error.
        'cookie': (
            'RECOMMEND_TIP=true; user_trace_token=20220220111830-3bc26860-d88rerere379bb0a6e1e; LGUID=20220220111830-4acc255d-b370-468d-8d4d-517f0755b875; _ga=GA1.2.1717447248.1645327108; smidV2=20220313164640140aa5fbc1e260b461e911b961866f1c009ed560315758d80; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAAECABFAACEA4414CB00A33EF5FE9D415B7DD089665E; WEBTJ-ID=20220425093829-1805e5ec1d52f8-048da4ab60938c-9771a3f-1327104-1805e5ec1d64d6; PRE_UTM=; PRE_HOST=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGSID=20220425093831-e2ffe5b0-9198-47ad-bc84-a23cbcf5ff18; PRE_SITE=https%3A%2F%2Fwww.lagou.com; _gid=GA1.2.952409338.1650850711; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1649565890,1649923373,1650850711; privacyPolicyPopup=false; sensorsdata2015session=%7B%7D; LG_LOGIN_USER_ID=576eb66efed94bf89ae4f0f382542744027d9d3bda167101a9c1ab4c84eeda03; LG_HAS_LOGIN=1; _putrc=7353C66353E1FA2E123F89F2B170EADC; login=true; unick=%E5%BC%A0%E9%A1%BA; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=7; __SAFETY_CLOSE_TIME__19393310=1; gate_login_token=9166364f3eefd44bafa66a710af72feb9811c092a785c503d44e4379fca7a353; TG-TRACK-CODE=index_navigation; __lg_stoken__=8382b3ec8ec3d4b5622e9c1f6a8747ee723a44d4e7596be643eba21610c0f3b7a03abe1c2f9b1c0e856ead2d2d283c9f09ece56180188eb1cccf60ebe129a7d32e12d6d5da54; X_HTTP_TOKEN=d65cba845dbca000770158056188260baa07dfe93c; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1650851076; LGRID=20220425094441-b47f8533-991b-489e-a13a-493c1eb35141; '
            'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219393310%22%2C%22first_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2299.0.4844.82%22%7D%2C%22%24device_id%22%3A%2217f15234974228-0deb05b76a48e9-576153e-1327104-17f152349759f3%22%7D'
        ),
    }
    # A timeout keeps the crawler from hanging forever on a stalled socket.
    resp = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(resp.text)
    description = html.xpath('//*[@id="job_detail"]//text()')
    print(description)
    # Also hand the text back so callers can store or post-process it.
    return description


# Sample run: fetch a single job posting and print its description text.
getDetail(url='https://www.lagou.com/wn/jobs/7999778.html')

# posted @ 2022-04-25 11:59  biglayman  阅读(537)  评论(0)  编辑  收藏  举报