Scraping job-board data with Python

# -*- coding: utf-8 -*-
# Job spider
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
import time
from pymongo import MongoClient


class WorkSpider:
    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017/')
        self.zfdb = self.client.zfdb
        #self.zfdb.authenticate("mongodbUser", "yourpassward")

    # Cities to scrape
    def getCity(self):
        return [
            "全国",
            "北京",
            "郑州",
            #"上海",
            #"深圳",
            #"广州",
        ]

    # Languages / keywords to scrape
    def getLanguage(self):
        return [
            "Java",
            "Python",
            # "C",
            # "机器学习",
            # "图像识别",
            # "自然语言处理",
            # "区块链",
            # "精准推荐",
            # "Node.js",
            # "Go",
            # "Hadoop",
            # "Php",
            # ".NET",
            # "Android",
            # "iOS",
            # "web前端",
        ]

    # By inspection, Lagou's URL varies with language and city as follows
    def getUrl(self, language, city):
        url = "https://www.lagou.com/jobs/list_" + language + "?px=default&city=" + city
        return url

    # URLs for every language in the list, for one city
    def getCityUrl(self, city):
        urlList = []
        for language in self.getLanguage():
            urlList.append(self.getUrl(language, city))
        return urlList

    # URLs for one language across all cities
    def getLanguageUrl(self, language):
        urlList = []
        for city in self.getCity():
            urlList.append(self.getUrl(language, city))
        return urlList

    def getOnePageData(self):
        pass

    # Document structure stored in MongoDB
    def getRentMsg(self, name, company, welfare, salaryMin, salaryMid, salaryMax,
                   experience, education, companyType, companyLevel, companySize):
        return {
            "name": name,                  # job title (e.g. "python工程师")
            "company": company,            # company name (e.g. "xxx有限公司")
            "welfare": welfare,            # perks (e.g. "餐补、下午茶、带薪年假")
            "salaryMin": salaryMin,        # salary lower bound (e.g. 9k)
            "salaryMid": salaryMid,        # salary midpoint, (9k + 15k) / 2
            "salaryMax": salaryMax,        # salary upper bound (e.g. 15k)
            "experience": experience,      # required experience (e.g. "经验3-5年")
            "education": education,        # required education (e.g. "本科")
            "companyType": companyType,    # industry (e.g. "移动互联网/信息安全")
            "companyLevel": companyLevel,  # funding stage (e.g. "上市公司")
            "companySize": companySize,    # headcount (e.g. "150-500人")
        }

    # Fetch the page source and walk through the result pages
    # language       => programming language / keyword
    # city           => city
    # collectionType => True: collections named by language; False: named by city
    def main(self, language, city, collectionType):
        print("current language => " + language + ", current city => " + city)
        url = self.getUrl(language, city)
        print("current url => " + url)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--start-maximized')   # maximized window; otherwise element lookups fail
        chrome_options.add_argument('--disable-infobars')  # hide the "controlled by automated software" bar
        chrome_options.add_argument('--incognito')         # incognito mode
        #chrome_options.add_argument('--headless')         # run without a visible window
        # Selenium 3-style driver construction
        browser = webdriver.Chrome(executable_path="chromedriver", options=chrome_options)
        #browser = webdriver.Chrome("chromedriver")
        browser.get(url)
        browser.implicitly_wait(10)
        for i in range(30):
            # snapshot the current page before navigating away
            selector = etree.HTML(browser.page_source)
            soup = BeautifulSoup(browser.page_source, "html.parser")
            # parse and store the current page first, so the last page is not lost
            # (the original broke out of the loop before saving it)
            self.getItemData(selector, language, city, collectionType)
            print('page {} done'.format(i + 1))
            span = soup.find("div", attrs={"class": "pager_container"}).find("span", attrs={"action": "next"})
            print("span => " + str(span))
            # e.g. <span action="next" class="pager_next pager_next_disabled" hidefocus="hidefocus">下一页<strong class="pager_lgthen pager_lgthen_dis"></strong></span>
            classArr = span['class']
            print("classArr => " + str(classArr))  # e.g. ['pager_next', 'pager_next_disabled']
            if "pager_next_disabled" in classArr:  # membership test is safer than indexing classArr[1]
                print("reached the last page, stopping")
                break
            print("more pages remain, continuing")
            #browser.find_element_by_xpath('//*[@id="order"]/li/div[4]/div[2]').click()  # click "next page"
            browser.find_element_by_xpath('//span[@class="pager_is_current"]/following-sibling::span').click()  # click "next page"
            time.sleep(5)
        browser.close()

    # Parse one result item and store it in the database
    def getItemData(self, selector, language, city, collectionType):
        items = selector.xpath('//*[@id="s_position_list"]/ul/li')
        for item in items:
            try:
                name = item.xpath('div[1]/div[1]/div[1]/a/h3/text()')[0]
                company = item.xpath('div[1]/div[2]/div[1]/a/text()')[0]
                welfare = item.xpath('div[2]/div[2]/text()')[0]
                # salary looks like "9k-15k": split on "-" and drop the trailing "k"
                salaryArray = item.xpath('div[1]/div[1]/div[2]/div/span/text()')[0].strip().split("-")
                salaryMin = salaryArray[0][:len(salaryArray[0]) - 1]
                salaryMax = salaryArray[1][:len(salaryArray[1]) - 1]
                salaryMid = (int(salaryMin) + int(salaryMax)) / 2
                educationArray = item.xpath('div[1]/div[1]/div[2]/div//text()')[3].strip().split("/")
                education = educationArray[0].strip()
                experience = educationArray[1].strip()
                companyMsgArray = item.xpath('div[1]/div[2]/div[2]/text()')[0].strip().split("/")
                companyType = companyMsgArray[0].strip()
                companyLevel = companyMsgArray[1].strip()
                companySize = companyMsgArray[2].strip()
                data = self.getRentMsg(
                    name, company, welfare,
                    int(salaryMin), salaryMid, int(salaryMax),
                    experience, education,
                    companyType, companyLevel, companySize,
                )
                if collectionType:
                    self.zfdb["z_" + language].insert_one(data)
                else:
                    self.zfdb["z_" + city].insert_one(data)
                print(data)
            except:
                print("======= exception =======")
                continue


spider = WorkSpider()  # job spider
for language in spider.getLanguage():
    for city in spider.getCity():
        spider.main(language, city, True)
        time.sleep(5)
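For reference, each parsed listing is stored as the dictionary built by getRentMsg. A hypothetical record, with all values invented purely for illustration:

# hypothetical example of one stored record (values invented for illustration)
sample_record = {
    "name": "Python工程师",
    "company": "某某科技有限公司",
    "welfare": "餐补、下午茶、带薪年假",
    "salaryMin": 9,           # lower bound, in k
    "salaryMid": 12.0,        # (9 + 15) / 2
    "salaryMax": 15,          # upper bound, in k
    "experience": "经验3-5年",
    "education": "本科",
    "companyType": "移动互联网",
    "companyLevel": "上市公司",
    "companySize": "150-500人",
}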
That is the complete scraper code, adapted from source code found on GitHub. The main steps are:
1. Assemble the URL
2. Scrape the data with Selenium
3. Store the results in MongoDB
4. Dismiss the ad overlay (an explicit-wait variant follows the snippet):
browser.get(url)
browser.implicitly_wait(10)
try:
    # click to close the ad overlay
    browser.find_element_by_xpath('//div[@class="body-container showData"]/div/div[2]').click()
except:
    pass
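The code above leans on the implicit wait, and the pagination loop additionally sleeps a fixed five seconds after every click, which is slow and brittle. A sketch of an explicit-wait alternative, assuming the same browser object and the same pager XPath used in the scraper (the 10-second timeout is an arbitrary choice):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait (up to 10 s) until the "next page" control is clickable, then click it
next_xpath = '//span[@class="pager_is_current"]/following-sibling::span'
WebDriverWait(browser, 10).until(
    EC.element_to_be_clickable((By.XPATH, next_xpath))
).click()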
----------------------------------------------------------------------
Analyzing the data:

# -*- coding: utf-8 -*-
# Data analysis and visualization
from os import path
from wordcloud import WordCloud, ImageColorGenerator
import jieba.analyse
import matplotlib.pyplot as plt
import imageio  # fixed: replaces the image reader removed from scipy.misc
import os
import time
from pymongo import MongoClient


class Analycis:
    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017/')
        self.zfdb = self.client.zfdb
        #self.zfdb.authenticate("mongodbUser", "yourpassward")

    def getCity(self):
        return [
            "全国",
            "北京",
            "郑州",
            #"上海",
            #"深圳",
            #"广州",
        ]

    def getLanguage(self):
        return [
            "Java",
            "Python",
            # "C",
            # "机器学习",
            # "图像识别",
            # "自然语言",
            # "区块链",
            # "Go",
            # "Php",
            # ".NET",
            # "Android",
            # "iOS",
            # "web前端",
            # "精准推荐",
            # "Node.js",
            # "Hadoop",
        ]

    # Metrics computed below:
    #   sample size per language
    #   average salary per language
    #   education requirements per language
    #   experience requirements per language
    #   welfare word cloud
    #   company funding-stage ranking (A轮, B轮, ...)
    #   company type ranking

    # Sample count per language
    def getLanguageNum(self):
        analycisList = []
        for index, language in enumerate(self.getLanguage()):
            collection = self.zfdb["z_" + language]
            totalNum = collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}])
            totalNum2 = list(totalNum)[0]["total_num"]
            analycisList.append(totalNum2)
        return (self.getLanguage(), analycisList)

    # Average salary per language
    def getLanguageAvgSalary(self):
        analycisList = []
        for index, language in enumerate(self.getLanguage()):
            collection = self.zfdb["z_" + language]
            totalSalary = collection.aggregate([{'$group': {'_id': '', 'total_salary': {'$sum': '$salaryMid'}}}])
            totalNum = collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}])
            totalNum2 = list(totalNum)[0]["total_num"]
            totalSalary2 = list(totalSalary)[0]["total_salary"]
            analycisList.append(round(totalSalary2 / totalNum2, 2))
        return (self.getLanguage(), analycisList)

    # Education requirements for one language (for a pyecharts chart)
    def getEducation(self, language):
        results = self.zfdb["z_" + language].aggregate([{'$group': {'_id': '$education', 'weight': {'$sum': 1}}}])
        educationList = []
        weightList = []
        for result in results:
            educationList.append(result["_id"])
            weightList.append(result["weight"])
        return (educationList, weightList)

    # Experience requirements for one language (for a pyecharts treemap)
    def getExperience(self, language):
        results = self.zfdb["z_" + language].aggregate([{'$group': {'_id': '$experience', 'weight': {'$sum': 1}}}])
        totalAvgPriceDirList = []
        for result in results:
            totalAvgPriceDirList.append(
                {"value": result["weight"], "name": result["_id"] + " " + str(result["weight"])})
        return totalAvgPriceDirList

    # Collect welfare text for the word cloud
    def getWelfare(self):
        content = ''
        queryArgs = {}
        projectionFields = {'_id': False, 'welfare': True}  # projection given as a dict
        for language in self.getLanguage():
            collection = self.zfdb["z_" + language]
            searchRes = collection.find(queryArgs, projection=projectionFields).limit(1000)
            for result in searchRes:
                print(result["welfare"])
                content += result["welfare"]
        return content

    # Company funding-stage ranking (for a bar chart)
    def getAllCompanyLevel(self):
        levelList = []
        weightList = []
        newWeightList = []
        attrList = ["A轮", "B轮", "C轮", "D轮及以上", "不需要融资", "上市公司"]
        for language in self.getLanguage():
            collection = self.zfdb["z_" + language]
            results = collection.aggregate([{'$group': {'_id': '$companyLevel', 'weight': {'$sum': 1}}}])
            for result in results:
                levelList.append(result["_id"])
                weightList.append(result["weight"])
        for index, attr in enumerate(attrList):
            newWeight = 0
            for index2, level in enumerate(levelList):
                if attr == level:
                    newWeight += weightList[index2]
            newWeightList.append(newWeight)
        return (attrList, newWeightList)

    # ========================================================
    # Render a pie chart (pyecharts v0.5.x API)
    def showPie(self, title, attr, value):
        from pyecharts import Pie
        pie = Pie(title)
        # pie.add("aa", attr, value, is_label_show=True, title_pos='center')
        pie.add("", attr, value,
                radius=[40, 75],
                label_text_color=None,
                is_label_show=True,
                legend_orient="vertical",
                legend_pos="left")
        pie.render()

    # Render a treemap
    def showTreeMap(self, title, data):
        from pyecharts import TreeMap
        treemap = TreeMap(title, width=1200, height=600)
        treemap.add("深圳", data, is_label_show=True, label_pos='inside', label_text_size=19)
        treemap.render()

    # Render a bar chart
    def showLine(self, title, attr, value):
        from pyecharts import Bar
        bar = Bar(title)
        bar.add("深圳", attr, value,
                is_convert=False,
                is_label_show=True,
                label_text_size=18,
                is_random=True,
                xaxis_interval=0,
                # xaxis_label_textsize=9,
                legend_text_size=18,
                label_text_color=["#000"])
        bar.render()

    # Render a word cloud with the wordcloud package
    def showWorkCloud(self, content, image_filename, font_filename, out_filename):
        d = path.dirname(__file__)  # fixed: was path.dirname(__name__)
        # TF-IDF keyword extraction; topK returns the highest-weighted terms (default 20),
        # withWeight controls whether the weights are returned as well
        tags = jieba.analyse.extract_tags(content, topK=100, withWeight=False)
        text = " ".join(tags)
        # background image that defines the cloud shape
        img = imageio.imread(path.join(d, image_filename))
        # a Chinese font must be given, otherwise CJK text renders as boxes
        wc = WordCloud(font_path=font_filename,
                       background_color='black',
                       mask=img,           # cloud shape
                       max_words=500,      # maximum number of words
                       max_font_size=130,  # largest font size (defaults to image height)
                       # width/height are ignored when mask is set
                       # width=600,
                       # height=400,
                       margin=2,
                       prefer_horizontal=0.9)  # 90% of words laid out horizontally
        wc.generate(text)
        img_color = ImageColorGenerator(img)
        plt.imshow(wc.recolor(color_func=img_color))
        plt.axis("off")
        plt.show()
        wc.to_file(path.join(d, out_filename))

    # Render a pyecharts word cloud
    def showPyechartsWordCloud(self, attr, value):
        from pyecharts import WordCloud
        wordcloud = WordCloud(width=1300, height=620)
        wordcloud.add("", attr, value, word_size_range=[20, 100])
        wordcloud.render()


analycis = Analycis()

# sample counts
(attr, value) = analycis.getLanguageNum()
analycis.showLine("样本数量", attr, value)
os.rename("render.html", "sampleNum.html")

# average salary per language (the original comment wrongly repeated "样本数量")
(attr, value) = analycis.getLanguageAvgSalary()
analycis.showLine("各语言平均工资", attr, value)
os.rename("render.html", "languageAvgSalary.html")

# education requirements per language
for language in analycis.getLanguage():
    (attr, value) = analycis.getEducation(language)
    print(attr, value)
    analycis.showPie(" " + language + " 学历要求", attr, value)  # fixed title: was mislabeled "工作年限"
    os.rename("render.html", "./languageEducation/" + language + "Education.html")

# experience requirements per language
for language in analycis.getLanguage():
    data = analycis.getExperience(language)
    print(data)
    analycis.showTreeMap(" " + language + " 工作年限要求", data)  # fixed title: was mislabeled as 学历
    os.rename("render.html", "./languageExperience/" + language + "Experience.html")

# welfare word cloud
analycis.showWorkCloud(analycis.getWelfare(), "docker.jpeg", "kh.ttf", out_filename="loutput.jpeg")

# company funding-stage bar chart
(attr, value) = analycis.getAllCompanyLevel()
print(attr, value)
analycis.showLine("公司级别", attr, value)
os.rename("render.html", "companyLevel.html")
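Note that getLanguageAvgSalary runs two aggregations per collection (a $sum over salaryMid plus a document count) and divides the results in Python. MongoDB's $avg accumulator does the same in a single pass; a minimal sketch against the same z_<language> collections, with the function name avg_salary chosen here for illustration:

from pymongo import MongoClient

def avg_salary(db, language):
    # one $group with $avg replaces the separate $sum and count passes
    result = list(db["z_" + language].aggregate(
        [{'$group': {'_id': None, 'avg_salary': {'$avg': '$salaryMid'}}}]
    ))
    return round(result[0]["avg_salary"], 2) if result else 0

# usage, against the same zfdb database used above:
# db = MongoClient('mongodb://localhost:27017/').zfdb
# print(avg_salary(db, "Python"))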
Tags: Python