Scraping Python job requirements from 51job with selenium and urllib, saved as a txt file

Use selenium and urllib to scrape the requirements of Python job postings on 51job and write them into a single txt file:
import selenium                    # testing framework
import selenium.webdriver          # drives a real browser
import re
import urllib
import urllib.request

def geturllistsh(searchname):
    url = "https://search.51job.com/list/020000,000000,0000,00,9,99," + searchname + ",2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    driver = selenium.webdriver.Chrome(
        executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver")  # launch Chrome
    driver.get(url)                              # open the search page
    pagesource = driver.page_source              # grab the page source
    restr = r'title">(.*?)</span'                # regex for the result count
    regex = re.compile(restr, re.IGNORECASE)
    mylist = regex.findall(pagesource)
    driver.close()                               # close the browser

    # The count scraped above is not used; the number of pages is hard-coded below.
    # getnumberbyname("python")
    # num = eval(getnumberbyname("python"))      # 1731
    # if num % 50 == 0:
    #     pages = num // 50
    # else:
    #     pages = num // 50 + 1
    mylist = []
    for i in range(1, 130):                      # list pages 1..129
        newurl = "https://search.51job.com/list/020000,000000,0000,00,9,99," + searchname + ",2,{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=".format(i)
        mylist.append(newurl)
    for line in mylist:
        print(line)
    return mylist

def downloadgeturllist(url):
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib.request.Request(url, headers=headers)  # build the request
    # headers can also be added or changed later via Request.add_header()
    request.add_header("Connection", "keep-alive")
    try:
        response = urllib.request.urlopen(request)
        data = response.read().decode("gbk")     # fetch the page and decode it
        print(response.code)                     # response status code
        restr = r'<div class="dw_table" id="resultList">([\s\S]*?)<!--列表表格 END-->'  # () keeps only the group inside
        regex = re.compile(restr, re.IGNORECASE)
        mylist = regex.findall(data)
        # print(mylist[0])                       # the whole result table
        restr = r'el title">([\s\S]*?)<!--列表表格 END-->'
        regex = re.compile(restr, re.IGNORECASE)
        mylist = regex.findall(data)
        restr = r'<span class="t5">发布时间</span>([\s\S]*?)<!--列表表格 END-->'
        regex = re.compile(restr, re.IGNORECASE)
        mylist = regex.findall(data)
        # print(mylist[0])
        for line in mylist:
            restr = r'<a target="_blank" title=".*?" href="(.*?)" onmousedown=".*?">[.\s\S]*?</a>'
            regex = re.compile(restr, re.IGNORECASE)
            geturllist = regex.findall(line)     # detail-page URLs
            for getlist in geturllist:
                print(getlist)
            return geturllist
        return []                                # nothing matched on this page
    except Exception:
        return []                                # empty list keeps the caller's loop safe

def getworkinfo(url):
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib.request.Request(url, headers=headers)  # build the request
    request.add_header("Connection", "keep-alive")
    try:
        response = urllib.request.urlopen(request)
        data = response.read().decode("gbk", "ignore")  # skip bytes that are not valid GBK
        restr = r'<div class="bmsg job_msg inbox">([\s\S]*?).*?\s<div class="mt10">'  # job-description block
        regex = re.compile(restr, re.IGNORECASE)
        mylist = regex.findall(data)
        if len(mylist) > 0:
            datas = mylist[0].strip().replace("</p>", "").replace("<p>", "")  # drop paragraph tags
            return datas
        else:
            return ""
    except Exception:
        return ""

savefilepath = "workinfo.txt"
savefile = open(savefilepath, "wb")
urllist = geturllistsh("python")         # the 129 list-page URLs
for url in urllist:
    templist = downloadgeturllist(url)   # detail-page URLs on this list page
    for tempurl in templist:
        workstr = getworkinfo(tempurl)   # requirements text of one posting
        print(workstr)
        savefile.write((workstr + "\r\n").encode("utf-8"))
savefile.close()
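The three successive findall calls in downloadgeturllist() narrow the list page down step by step before the inner loop pulls out the detail-page links. Below is a minimal sketch of that final extraction step, run against a made-up HTML fragment; the markup only mimics a 51job result row and is not the live page source:

import re

# Illustrative fragment only -- the tag layout is assumed, not copied from 51job
sample = ('<a target="_blank" title="Python开发工程师" '
          'href="https://jobs.51job.com/shanghai/123456.html" '
          'onmousedown="track(this)">Python开发工程师</a>')

# Same pattern as in downloadgeturllist(): only the href value is captured
restr = r'<a target="_blank" title=".*?" href="(.*?)" onmousedown=".*?">[.\s\S]*?</a>'
regex = re.compile(restr, re.IGNORECASE)
print(regex.findall(sample))   # ['https://jobs.51job.com/shanghai/123456.html']

Because findall returns only the parenthesized group, the resulting list holds just the href values. Regex-based anchor scraping is brittle, though: if the page ever reorders the tag's attributes, the pattern silently stops matching, which is why an HTML parser such as BeautifulSoup is usually the safer choice.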
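Both download helpers decode the response as GBK because 51job served its pages in that encoding; getworkinfo() additionally passes "ignore" so a single bad byte does not abort an entire posting. A small illustration of the difference:

# b"\xff" is not a valid GBK lead byte, so strict decoding fails on it
raw = "任职要求".encode("gbk") + b"\xff"
# raw.decode("gbk")                 # would raise UnicodeDecodeError
print(raw.decode("gbk", "ignore"))  # '任职要求' -- the bad byte is silently dropped

The cleaned text is then re-encoded as UTF-8 before being written to workinfo.txt, so the output file ends up in a single consistent encoding regardless of what the source pages used.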