Scraping Tencent's global job postings with XPath + Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from lxml import etree  # parse the rendered HTML
import json
import re


class Tencent(object):
    def __init__(self):
        self.start_url = "https://careers.tencent.com/search.html"
        # Run Chrome headlessly (in memory, no visible window)
        self.options = webdriver.ChromeOptions()
        self.options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=self.options)
        # For a visible browser window instead:
        # self.driver = webdriver.Chrome()
        # self.driver.set_window_size(1280, 960)

    def get_content_list(self, html_str):
        """Extract the job postings from one rendered page."""
        html = etree.HTML(html_str)
        content_list = []
        # Some pages carry 11 child divs; this variant skips the first one:
        # div_list = html.xpath("//div[@class='correlation-degree']/div/div[position()>1]")
        div_list = html.xpath("//div[@class='correlation-degree']/div/div")
        for div in div_list:
            item = {}
            item["job_title"] = div.xpath(".//h4[@class='recruit-title']/text()")[0]
            item["location"] = div.xpath(".//p[@class='recruit-tips']/span[2]/text()")[0]
            item["posted_date"] = div.xpath(".//p[@class='recruit-tips']/span[4]/text()")[0]
            job_type = div.xpath(".//p[@class='recruit-tips']/span[3]/text()")
            item["job_type"] = job_type[0] if job_type else None
            requirements = div.xpath(".//a[@class='recruit-list-link']/p[2]/text()")
            item["requirements"] = re.sub(r"\s", "", requirements[0]) if requirements else None
            content_list.append(item)
        return content_list

    def save_content(self, content_list):
        """Append the extracted items to a local file."""
        with open("tencent_jobs.txt", "a", encoding="utf-8") as f:
            for item in content_list:
                f.write(json.dumps(item, ensure_ascii=False, indent=2) + "\n")
        print("Saved.")

    def run(self):
        """Drive the crawl."""
        self.driver.get(self.start_url)  # open the start page
        # page_source is the DOM string after JS/CSS/AJAX have finished rendering
        html_str = self.driver.page_source
        content_list = self.get_content_list(html_str)  # parse with lxml, extract via XPath
        self.save_content(content_list)
        # Move through the remaining pages
        while True:
            # Key step: locate the "next page" button on the current page
            next_page = self.driver.find_element(By.XPATH, "//li[@class='next']")
            # Click it through JavaScript; the usual pattern for this control
            self.driver.execute_script("arguments[0].click();", next_page)
            time.sleep(5)
            print(self.driver.current_url)  # log the URL of the current page
            html_str = self.driver.page_source
            content_list = self.get_content_list(html_str)
            self.save_content(content_list)
            # A full page holds 10 postings; fewer means this is the last page
            if len(content_list) < 10:
                self.driver.save_screenshot("last_page.png")
                break

    def __del__(self):
        self.driver.quit()


if __name__ == '__main__':
    tencent = Tencent()
    tencent.run()
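
The fixed time.sleep(5) in run() assumes the next page always renders within five seconds, and the loop would still raise NoSuchElementException if the "next" button ever disappeared. Below is a minimal sketch of a more defensive page turn using Selenium's standard WebDriverWait helper; the function name click_next_and_wait and the ten-second timeout are our own illustration, not part of the original script.

from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait


def click_next_and_wait(driver, timeout=10):
    """Click the 'next' button and wait until new listings render.
    Returns False when there is no next page or nothing changes."""
    try:
        next_page = driver.find_element(By.XPATH, "//li[@class='next']")
    except NoSuchElementException:
        return False  # no pagination control on this page
    # Remember the first job title so we can detect that the page changed
    old_first = driver.find_element(By.CLASS_NAME, "recruit-title").text
    driver.execute_script("arguments[0].click();", next_page)
    try:
        WebDriverWait(
            driver, timeout,
            ignored_exceptions=(NoSuchElementException, StaleElementReferenceException),
        ).until(
            lambda d: d.find_element(By.CLASS_NAME, "recruit-title").text != old_first
        )
    except TimeoutException:
        return False  # page did not change: assume this was the last page
    return True

With a helper like this, the pagination loop in run() could be written as "while click_next_and_wait(self.driver): ..." with the per-page extraction and saving unchanged, dropping both the fixed sleep and the fewer-than-10-postings heuristic.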