抓取腾讯招聘 Python 岗位

# -*- coding: utf-8 -*-
"""
@author: Dell Created on Mon Dec 23 17:55:06 2019
"""
import re
import time
import requests
from lxml import etree

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


# HTTP request headers that imitate a desktop Chrome browser visiting the
# Tencent careers site.
# Alternative User-Agent (Chrome 79), kept for reference:
#   Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/73.0.3683.86 Safari/537.36"
    ),
    "Referer": "https://careers.tencent.com/",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}

def _first_text(node, path):
    """Return the first result of ``node.xpath(path)``, or ``""`` if there is none."""
    found = node.xpath(path)
    return found[0] if found else ""


def parse(url):
    """Load a Tencent careers search page in Chrome and extract its job postings.

    Args:
        url: Address of the search-result page to open.

    Returns:
        A list of dicts, one per ``<div class="recruit-list">`` element, each
        with the keys ``'title'``, ``'address'`` and ``'require'``. A field
        whose node is missing from a posting is returned as an empty string.
        The list is empty when no posting appears on the page.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import TimeoutException

    listing_xpath = "//div[@class='recruit-list']"

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Presumably the listing is rendered client-side after the initial
        # load, so give it a moment to appear before taking the snapshot.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, listing_xpath))
            )
        except TimeoutException:
            # Nothing showed up (e.g. a page past the last result); parse
            # whatever did load, which yields an empty list.
            pass
        text = driver.page_source
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the window and leave the driver process behind on every call.
        driver.quit()

    # Parse the rendered page.
    html = etree.HTML(text)

    pos_infos = []
    for div in html.xpath(listing_xpath):
        pos_infos.append({
            'title': _first_text(div, "./a/h4/text()"),  # job title
            'address': _first_text(div, "./a/p/span[2]/text()"),  # work location
            'require': _first_text(div, "./a/p[@class='recruit-text']/text()"),  # job requirements
        })
    return pos_infos

def save(list):
    """Append every item of *list* to ``tencent.txt``, one item per line.

    Each item is converted with ``str()`` and followed by a newline. The file
    is opened in append mode with UTF-8 encoding, so existing content is kept.
    """
    # NOTE: the parameter name shadows the builtin ``list``; it is kept as-is
    # so existing callers are unaffected.
    with open("tencent.txt", "a+", encoding="utf-8") as out:
        out.writelines(str(item) + "\n" for item in list)
            
    

if __name__ == "__main__":
    # Walk search-result pages 1 through 69 for the keyword "python": scrape
    # each page, append its postings to the output file, then echo them.
    baseurl = "https://careers.tencent.com/search.html?index={}&keyword=python"
    for page_no in range(1, 70):
        postings = parse(baseurl.format(page_no))
        save(postings)

        for posting in postings:
            print(posting)
        # Progress marker followed by a separator rule.
        print("第%s页解析完成" % page_no, "-" * 50)

posted on 2019-12-23 20:11  行之间  阅读(400)  评论(0)  编辑  收藏  举报