爬取腾讯职位

# coding=gbk
import requests

from lxml import etree

import time

# Number of positions shown on each listing page (used to compute the
# ``start`` offset of a page).
PAGE_SIZE = 10

# Prefix prepended to the relative links found on listing pages.
BASE_DOMAIN = "https://hr.tencent.com/"

# Request headers sent with every HTTP call.
# NOTE(review): the Cookie value looks like it was copied from a browser
# session and has probably expired -- confirm before relying on it.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/108.0.0.0 Safari/537.36"
    ),
    "Referer": "https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start=10",
    "Cookie": (
        "_ga=GA1.2.1222789966.1535530525; "
        "pgv_pvi=8193187840; "
        "pgv_si=s2985358336; "
        "PHPSESSID=22e3m8aknd19s1gqkh0i9eisk0; "
        "Hm_lvt_0bd5902d44e80b78cb1cd01ca0e85f4a=1536726429,1536908218,1537154694,1537166987; "
        "Hm_lpvt_0bd5902d44e80b78cb1cd01ca0e85f4a=1537167106"
    ),
}


def get_jo_detail_urls(page_url):
    """
    Collect the detail-page URL of every position listed on one result page.

    :param page_url: URL of the listing page to fetch.
    :return: list of absolute detail-page URLs; empty when the response
        body cannot be parsed or contains no position rows.
    :raises requests.RequestException: on network failure or timeout.
    """
    # Imported here so the function is self-contained with respect to the
    # module's import block.
    from urllib.parse import urljoin

    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(page_url, headers=HEADERS, timeout=10)

    html_element = etree.HTML(response.text)
    if html_element is None:
        # The parser can hand back None when the body holds no usable
        # markup; treat that as "no positions on this page".
        return []

    # Position rows alternate between the "even" and "odd" CSS classes;
    # every link inside such a row points at a detail page.
    hrefs = html_element.xpath('//tr[@class="even" or @class="odd"]//a/@href')

    # urljoin resolves relative hrefs against the site root and leaves
    # already-absolute links untouched. A list (rather than a lazy map
    # object) lets callers iterate the result more than once.
    return [urljoin(BASE_DOMAIN, href) for href in hrefs]


def _text_following(texts, label):
    """Return the entry right after ``label`` in ``texts``, or None if absent."""
    try:
        return texts[texts.index(label) + 1]
    except (ValueError, IndexError):
        # Label missing, or label is the last text node on the row.
        return None


def get_detail_msg(detail_url):
    """
    Fetch one position's detail page and extract its fields.

    :param detail_url: URL of the position detail page.
    :return: dict with the keys ``title``, ``location`` and ``category``
        (text, or None when the page lacks the field) and ``duty`` and
        ``ask`` (lists of text lines, empty when the section is missing).
    :raises requests.RequestException: on network failure or timeout.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(detail_url, headers=HEADERS, timeout=10)
    html_element = etree.HTML(response.text)

    # Start from "nothing found" so a page with an unexpected layout yields
    # a partially filled record instead of raising IndexError/ValueError.
    position = {
        'title': None,
        'location': None,
        'category': None,
        'duty': [],
        'ask': [],
    }
    if html_element is None:
        return position

    # Position title
    titles = html_element.xpath('//tr[@class="h"]/td/text()')
    if titles:
        position['title'] = titles[0]

    # Work location / job category: each value is the text node that
    # directly follows its label on the summary row.
    top_infos = html_element.xpath('//tr[@class="c bottomline"]//text()')
    position['location'] = _text_following(top_infos, '工作地点:')
    position['category'] = _text_following(top_infos, '职位类别:')

    # The first bullet list holds the responsibilities, the second one the
    # requirements.
    content_infos = html_element.xpath('//ul[@class="squareli"]')
    if len(content_infos) > 0:
        position['duty'] = content_infos[0].xpath("./li/text()")
    if len(content_infos) > 1:
        position['ask'] = content_infos[1].xpath('./li/text()')

    return position


def spider(page_count=10):
    """
    Crawl the "python" keyword listing pages and collect every position found.

    Progress and the final result are printed to stdout.

    :param page_count: number of listing pages to crawl, starting with the
        first page. Defaults to 10, the previously hard-coded value.
    :return: list of the position records produced by ``get_detail_msg``,
        in crawl order.
    """
    positions = []

    for page_num in range(page_count):
        print('开始爬取第{}页数据'.format(page_num + 1))

        # The ``start`` query parameter is the offset of the page's first row.
        url = 'https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a'.format(page_num * PAGE_SIZE)

        # Resolve every detail link on the page, then fetch them one by one.
        for detail_url in get_jo_detail_urls(url):
            positions.append(get_detail_msg(detail_url))

        # Pause between listing pages to keep the request rate low.
        time.sleep(1)

    print('爬取完成!')
    print(positions)

    # Hand the data back so callers can use it beyond the printed dump.
    return positions


# Start the crawl only when this file is executed directly as a script.
if __name__ == '__main__':
    spider()

posted @ 2022-12-08 21:42  __username  阅读(17)  评论(0)  编辑  收藏  举报

本文作者:DIVMonster

本文链接:https://www.cnblogs.com/guangzan/p/12886111.html

版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 2.5 中国大陆许可协议进行许可。