# -*- coding: utf-8 -*-
"""
Scrape Python job postings from the Tencent careers search pages and append
them to tencent.txt.

Created on Mon Dec 23 17:55:06 2019

@author: Dell
"""
import re
import time
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# HTTP request headers that present the client as a desktop Chrome browser
# arriving from the Tencent careers site and asking for a JSON response.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    "Referer": "https://careers.tencent.com/",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}
def _first_text(node, path):
    """Return the first result of XPath *path* under *node*, or "" if none."""
    matches = node.xpath(path)
    return matches[0] if matches else ""


def parse(url, timeout=10):
    """Load one search-results page in Chrome and extract its job postings.

    A real browser is used rather than a plain HTTP GET, presumably because
    the listing is rendered client-side by JavaScript -- TODO confirm.

    Args:
        url: Address of the search-results page to load.
        timeout: Maximum number of seconds to wait for at least one
            ``recruit-list`` element to appear before reading the page.

    Returns:
        A list of dicts with the keys ``'title'``, ``'address'`` and
        ``'require'``, one per ``<div class="recruit-list">`` found on the
        page. A field that is missing from a posting is returned as ``""``
        instead of aborting the whole page. The list is empty when no
        posting is found.
    """
    # Imported here so this function stays self-contained with respect to
    # the file-level import block.
    from selenium.common.exceptions import TimeoutException

    list_xpath = "//div[@class='recruit-list']"

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        try:
            # Give the page a chance to render the postings before the
            # source is captured.
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, list_xpath))
            )
        except TimeoutException:
            # No posting appeared in time (e.g. a page past the last
            # result). Parse whatever was rendered; the result is then
            # simply an empty list.
            pass
        text = driver.page_source
    finally:
        # quit() ends the whole browser session, not just the window, and
        # the finally clause guarantees that even when loading fails.
        driver.quit()

    # Parse the captured page.
    html = etree.HTML(text)
    return [
        {
            # Job title.
            'title': _first_text(div, "./a/h4/text()"),
            # Work location.
            'address': _first_text(div, "./a/p/span[2]/text()"),
            # Job requirements.
            'require': _first_text(div, "./a/p[@class='recruit-text']/text()"),
        }
        for div in html.xpath(list_xpath)
    ]
def save(list):
    """Append every item of *list* to ``tencent.txt``, one item per line.

    Each item is converted with ``str()`` and terminated with a newline.
    The file is opened in the current working directory in append mode with
    UTF-8 encoding, so repeated calls accumulate output.

    Args:
        list: Iterable of items to write. The name shadows the builtin but
            is kept because it is part of the function's call interface.
    """
    with open("tencent.txt", "a+", encoding="utf-8") as out:
        out.writelines(str(entry) + "\n" for entry in list)
if __name__ == "__main__":
    # Search-results URL template; the page number is substituted for `{}`.
    baseurl = "https://careers.tencent.com/search.html?index={}&keyword=python"
    separator = "-" * 50

    # Walk result pages 1 through 69: scrape, persist, then echo each page.
    for page_no in range(1, 70):
        positions = parse(baseurl.format(page_no))
        save(positions)
        for position in positions:
            print(position)
        # Progress line: "page N parsed" followed by a dashed rule.
        print("第%s页解析完成" % page_no, separator)