python3爬虫 -----爬取职位招聘信息-------from腾讯社会招聘
# -*- coding: utf-8 -*-
# author:zxy
# Date:2018-9-23
"""Crawl Python job postings from the Tencent social-recruitment site.

The script walks the paginated search results, follows each posting link
to its detail page, extracts the fields of interest and appends them to a
UTF-8 text file.
"""

from lxml import etree
import requests

BASE_DOMAIN = "http://hr.tencent.com/"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/67.0.3396.99 Safari/537.36'
}
# First listing page. Kept for backward compatibility; the crawler builds
# per-page URLs itself in spider().
BASE_URL = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start=0"

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the crawl forever.
REQUEST_TIMEOUT = 10

# Fields whose value is a list of text fragments rather than a single string.
_LIST_FIELDS = ('work_duty', 'work_require')


def _first_text(node, xpath):
    """Return the first result of *xpath* evaluated on *node*, or '' if none."""
    matches = node.xpath(xpath)
    return matches[0] if matches else ''


def _format_position(position):
    """Render one position dict as the text record written to the output file.

    List-valued fields are written as ``"<key> :<list repr>"``, string fields
    as ``"<key>:<value>"``; each record ends with three blank lines.
    """
    lines = []
    for key, value in position.items():
        if key in _LIST_FIELDS:
            lines.append('{} :{}\n'.format(key, value))
        else:
            lines.append(key + ":" + value + '\n')
    lines.append('\n' * 3)
    return ''.join(lines)


def parse_detail_page(url):
    """Fetch one posting's detail page and extract its fields.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        A dict with the keys ``work_name``, ``work_place``, ``work_category``,
        ``work_lack_number`` (strings) and ``work_duty``, ``work_require``
        (lists of text fragments). A field that cannot be found on the page
        is returned as ``''`` (or ``[]`` for the list fields) instead of
        raising ``IndexError``, so one irregular page does not abort the
        whole crawl.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    html = etree.HTML(response.text)

    # The two <ul class="squareli"> lists hold, in page order, the duties and
    # the requirements.
    sections = html.xpath("//ul[@class='squareli']")
    work_duty = sections[0].xpath(".//text()") if len(sections) > 0 else []
    work_require = sections[1].xpath(".//text()") if len(sections) > 1 else []

    # Key order is preserved on purpose: it is the order in which the fields
    # are written to the output file.
    return {
        'work_name': _first_text(html, "//tr[@class='h']/td/text()"),
        'work_place': _first_text(html, "//tr[@class='c bottomline']/td[1]/text()"),
        'work_category': _first_text(html, "//tr[@class='c bottomline']/td[2]/text()"),
        'work_lack_number': _first_text(html, "//tr[@class='c bottomline']/td[3]/text()"),
        'work_duty': work_duty,
        'work_require': work_require,
    }


def get_detail_urls(url):
    """Return the absolute detail-page URLs linked from one listing page.

    Args:
        url: URL of the listing (search result) page to scan.

    Returns:
        A list of absolute URLs, built by prefixing each link's ``href``
        with ``BASE_DOMAIN``.
    """
    # Fetch the page that was asked for. The previous version ignored ``url``
    # and always requested BASE_URL, so every iteration re-read page one.
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    html = etree.HTML(response.text)
    # NOTE(review): only rows with class 'even' are matched; if the listing
    # table alternates 'even'/'odd' rows, half of the postings are skipped.
    # Confirm against the live markup before widening the selector.
    hrefs = html.xpath("//tr[@class='even']//a/@href")
    return [BASE_DOMAIN + href for href in hrefs]


def spider(pages=4, output_path='tecentRecruit.txt'):
    """Crawl the listing pages and append every posting to a text file.

    Args:
        pages: Number of listing pages to crawl, ten postings per page.
            Defaults to 4, the value the script was hard-coded with (the
            original comment hints that 43 pages existed at the time).
        output_path: File the records are appended to, encoded as UTF-8.
    """
    base_url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a"
    # Open the output once for the whole crawl instead of once per posting.
    with open(output_path, 'a', encoding='utf-8') as f:
        for page in range(pages):
            # The site paginates through the ``start`` offset in steps of 10.
            url = base_url.format(page * 10)
            for detail_url in get_detail_urls(url):
                position = parse_detail_page(detail_url)
                f.write(_format_position(position))


if __name__ == '__main__':
    spider()
效果如图所示:
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 智能桌面机器人:用.NET IoT库控制舵机并多方法播放表情
· Linux glibc自带哈希表的用例及性能测试
· 深入理解 Mybatis 分库分表执行原理
· 如何打造一个高并发系统?
· .NET Core GC压缩(compact_phase)底层原理浅谈
· 新年开篇:在本地部署DeepSeek大模型实现联网增强的AI应用
· DeepSeek火爆全网,官网宕机?本地部署一个随便玩「LLM探索」
· Janus Pro:DeepSeek 开源革新,多模态 AI 的未来
· 上周热点回顾(1.20-1.26)
· 【译】.NET 升级助手现在支持升级到集中式包管理