爬取招聘职位一
爬取百度招聘上关于IT职业的信息并进行分析。目前只针对常见职业,且未解决页面的异步加载问题,因此只能抓取第一页,每次查询最多显示100条信息。后续的"爬取二"升级版若要覆盖全部职业和城市,则需要将数据写入数据库。
# -*- coding:UTF-8 -*-
# @author: 若鸟
# Scrape IT job listings from Baidu Zhaopin (zhaopin.baidu.com) for a fixed
# set of cities and common job titles, then dump the rows to an Excel file.
# Limitation: the site loads further pages via async JS, so only the first
# response (at most 100 rows per query, via rn=100) is fetched. A future
# version covering all jobs/cities would need to write into a database.
import requests
import re
import xlwt
from bs4 import BeautifulSoup
from Get_jobList import GetJobList


def getHtml(url):
    """Fetch *url* and return its decoded HTML text, or None on any failure.

    The explicit ``return None`` (and the timeout) are the fix: the original
    fell off the end of the except branch, and callers then passed None to
    re.findall and crashed.
    """
    try:
        # timeout so a single dead host cannot hang the whole crawl
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        # let requests guess the real encoding from the body (GBK/UTF-8 mix)
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(url, "爬取失败,失败信息:", e)
        return None


def getJoblist():
    """Build the list of search URLs, one per (city, job-title) combination."""
    citys = ['肥城', '泰安', '北京', '上海', '深圳', '杭州', '济南']
    jobs = GetJobList()[:50]  # partial job-title list, scraped from lagou.com
    url_job = []
    for city in citys:
        for job in jobs:
            # The site paginates via JS; rn=100 requests the max 100 rows at once.
            url_job.append("http://zhaopin.baidu.com/quanzhi?query={}&city={}&rn=100".format(job, city))
    return url_job


def getJobData(urls):
    """Scrape every URL and return rows of [job, addr, company, salary, source].

    Fixes over the original:
    - skip URLs whose fetch failed (getHtml returned None) instead of
      crashing inside re.findall;
    - zip the five field lists together, so a page where one regex matches
      fewer times than another can no longer raise IndexError.
    """
    count = 0
    job_data = []  # sample markup: <p class="area line-clamp1">济南 | 济南宅客信息科技有限公司</p>
    for url in urls:
        html = getHtml(url)
        if not html:
            continue  # fetch failed -- the original crashed here on None
        job = re.findall(r'<p class="title-h3 line-clamp1">(.+)</p>', html)
        addr = re.findall(r'<p class="area line-clamp1">(.+) \| .+</p>', html)
        company = re.findall(r'<p class="area line-clamp1">.+ \| (.+)</p>', html)
        salary = re.findall(r'<p class="salary">(.+)</p>', html)
        source = re.findall(r'<p>.+ \| 来自(.+)</p>', html)
        # zip stops at the shortest list, so ragged regex matches are safe
        for row in zip(job, addr, company, salary, source):
            job_data.append(list(row))
        count += 1
        print(count)  # progress: number of pages processed so far
    return job_data


def writeXls(data_list):
    """Write the scraped rows into JobData.xls, one spreadsheet row per record."""
    f = xlwt.Workbook()
    sheet1 = f.add_sheet('sheet1', cell_overwrite_ok=True)
    for i, row in enumerate(data_list):
        for x, value in enumerate(row):
            sheet1.write(i, x, value)
    f.save("JobData.xls")


if __name__ == '__main__':
    job_data = getJobData(getJoblist())
    writeXls(job_data)