Python3 爬虫 —— 从腾讯社会招聘爬取职位招聘信息

复制代码
 1 # -*- coding: utf-8 -*-
 2 # author:zxy
 3 #Date:2018-9-23
 4 
 5 from lxml import etree
 6 import requests
 7 
 8 BASE_DOMAIN="http://hr.tencent.com/"
 9 HEADERS = {
10     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
11                   'AppleWebKit/537.36 (KHTML, like Gecko)'
12                   ' Chrome/67.0.3396.99 Safari/537.36'
13 }
14 BASE_URL="https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start=0"
15 
def parse_detail_page(url):
    """Fetch one position-detail page and extract the job fields.

    Args:
        url: Absolute URL of a single position detail page.

    Returns:
        dict with keys ``work_name``, ``work_place``, ``work_category``,
        ``work_lack_number`` (stripped strings) and ``work_duty``,
        ``work_require`` (lists of stripped text lines).

    Raises:
        requests.RequestException: on network failure or timeout.
        IndexError: if the page layout does not match the expected XPath.
    """
    # timeout so a stalled connection cannot hang the whole crawl
    response = requests.get(url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)

    # XPath text() nodes keep surrounding whitespace — strip it before storing.
    work_name = html.xpath("//tr[@class='h']/td/text()")[0].strip()
    work_place = html.xpath("//tr[@class='c bottomline']/td[1]/text()")[0].strip()
    work_category = html.xpath("//tr[@class='c bottomline']/td[2]/text()")[0].strip()
    work_lack_number = html.xpath("//tr[@class='c bottomline']/td[3]/text()")[0].strip()

    # First <ul class="squareli"> holds the duties, second the requirements.
    more_infos = html.xpath("//ul[@class='squareli']")
    work_duty = [t.strip() for t in more_infos[0].xpath(".//text()") if t.strip()]
    work_require = [t.strip() for t in more_infos[1].xpath(".//text()") if t.strip()]

    return {
        'work_name': work_name,
        'work_place': work_place,
        'work_category': work_category,
        'work_lack_number': work_lack_number,
        'work_duty': work_duty,
        'work_require': work_require,
    }
37 
def get_detail_urls(url):
    """Fetch one listing page and return absolute URLs of every position on it.

    Bug fix: the original ignored its ``url`` argument and always requested
    BASE_URL, so every paginated call re-fetched page 1.

    Args:
        url: URL of a listing page (with its own ``start=`` offset).

    Returns:
        list[str]: absolute detail-page URLs.
    """
    response = requests.get(url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)
    # Listing rows alternate between class 'even' and 'odd'; the original
    # selected only 'even' and silently dropped half of the positions.
    links = html.xpath("//tr[@class='even' or @class='odd']//a/@href")
    return [BASE_DOMAIN + link for link in links]
45 
def spider():
    """Crawl the first 4 listing pages and append every position found
    to ``tecentRecruit.txt`` (one blank-line-separated record per position).

    Returns:
        None. Side effect: appends records to the output file.
    """
    base_url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a"
    positions = []
    # Open the output file once, instead of reopening it for every position.
    with open('tecentRecruit.txt', 'a', encoding='utf-8') as f:
        for page in range(0, 4):  # 43 pages exist in total; 10 results per page
            url = base_url.format(page * 10)
            for detail_url in get_detail_urls(url):
                position = parse_detail_page(detail_url)
                positions.append(position)
                for key, value in position.items():
                    # work_duty / work_require are lists of text lines — join
                    # them instead of dumping the Python list repr. Do not
                    # shadow the builtin `str` as the original did.
                    if isinstance(value, list):
                        value = " ".join(value)
                    f.write("{}:{}\n".format(key, value))
                f.write('\n' * 3)  # blank lines separate records


if __name__ == '__main__':
    spider()
复制代码

 

效果如图所示:

posted @   浅忆~  阅读(749)  评论(0编辑  收藏  举报
编辑推荐:
· 智能桌面机器人:用.NET IoT库控制舵机并多方法播放表情
· Linux glibc自带哈希表的用例及性能测试
· 深入理解 Mybatis 分库分表执行原理
· 如何打造一个高并发系统?
· .NET Core GC压缩(compact_phase)底层原理浅谈
阅读排行:
· 新年开篇:在本地部署DeepSeek大模型实现联网增强的AI应用
· DeepSeek火爆全网,官网宕机?本地部署一个随便玩「LLM探索」
· Janus Pro:DeepSeek 开源革新,多模态 AI 的未来
· 上周热点回顾(1.20-1.26)
· 【译】.NET 升级助手现在支持升级到集中式包管理
点击右上角即可分享
微信分享提示