Python 爬虫爬取腾讯招聘信息(静态爬虫)
环境:
Windows 7, Python 3.4
代码:(亲测可正常执行)
from math import ceil
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}

# Base URL that the links found in the listing table are resolved against.
BASE_URL = "https://hr.tencent.com/"
# Divisor used to turn the total job count into a page count, and the step
# of the `start` query parameter when walking the listing pages.
PAGE_SIZE = 10
# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 30


def _fetchSoup(url):
    """Download *url* with the shared headers and return the parsed document."""
    ret = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    ret.encoding = "utf-8"  # force UTF-8 decoding to avoid garbled text
    return BeautifulSoup(ret.text, 'html.parser')


# Get the number of listing pages
def getJobPage(url):
    """Return the number of listing pages for the search at *url*.

    The total job count is read from the first element matching
    ``<span class="lightblue total">`` and divided by PAGE_SIZE, rounding up.

    Raises:
        IndexError: if the page contains no such span.
        ValueError: if the span text is not an integer.
    """
    soup = _fetchSoup(url)
    # Total number of jobs, e.g. <span class="lightblue total">512</span>
    totalJob = soup.select('span[class="lightblue total"]')[0].text
    return ceil(int(totalJob) / PAGE_SIZE)


def getJobOrder(url):
    """Return ``(jobRequests, jobOrder)`` scraped from the detail page at *url*.

    The two values are the text of the first and second
    ``<ul class="squareli">`` elements: job responsibilities and job
    requirements, respectively.

    Raises:
        IndexError: if the page has fewer than two such lists.
    """
    # Run the selector once and index the result instead of querying twice.
    sections = soup_sections = _fetchSoup(url).select('ul[class="squareli"]')
    # Job responsibilities
    jobRequests = sections[0].text
    # Job requirements
    jobOrder = soup_sections[1].text
    return jobRequests, jobOrder


# Get the job information
def getJobInfo(url):
    """Scrape one listing page and append one line per job to tencent_job.txt.

    For every ``<tr>`` with class ``even`` or ``odd`` the job name, detail
    URL, location, head count and publish date are read from the row, the
    detail page is fetched once for responsibilities and requirements, and
    the space-joined record is written followed by a newline.
    """
    soup = _fetchSoup(url)
    jobList = soup.find_all('tr', class_=['even', 'odd'])
    # The context manager closes (and flushes) the file even if a row fails;
    # gb18030 with errors='ignore' drops characters that cannot be encoded.
    with open("tencent_job.txt", "a", encoding='gb18030', errors='ignore') as myfile:
        for job in jobList:
            link = job.select('td:nth-of-type(1) > a')[0]
            # Detail page URL; urljoin copes with relative and absolute hrefs
            jobUrl = urljoin(BASE_URL, link['href'])
            # Job title
            jobName = link.text
            # Number of openings
            jobPeople = job.select('td:nth-of-type(3)')[0].text
            # Location
            jobAddre = job.select('td:nth-of-type(4)')[0].text
            # Publish date
            jobTime = job.select('td:nth-of-type(5)')[0].text
            # Responsibilities and requirements: fetch the detail page once
            jobRequests, jobOrder = getJobOrder(jobUrl)

            tt = " ".join([jobName, jobUrl, jobAddre, jobPeople, jobTime,
                           jobRequests, jobOrder])
            myfile.write(tt + "\n")


if __name__ == '__main__':
    mainurl = 'https://hr.tencent.com/position.php?keywords=python'
    jobPage = getJobPage(mainurl)
    print(jobPage)
    for page in range(jobPage):
        pageUrl = 'https://hr.tencent.com/position.php?keywords=python&start=' + str(page * PAGE_SIZE) + '#a'
        print("第" + str(page + 1) + "页")
        getJobInfo(pageUrl)