# 爬虫 — Boss Zhipin job-listing scraper
"""Boss Zhipin (zhipin.com) job-listing scraper.

Searches the site for a keyword and, for each posting found, collects:
company name, industry/size, post date, job title + salary, location /
experience / education, job tags, and the full job-description text.
Results are exported to an Excel workbook via xlwt.

Configuration lives in main():
  * start_url - the search URL; the keyword (here: Python) and city code
    are encoded in the query string
  * dethp     - crawl depth; keep it small (~5) or the site blocks you
The output filename in save_tatol_info() should be changed per crawl
batch so earlier exports are not overwritten.
"""

import requests
from bs4 import BeautifulSoup
import bs4
import xlwt


def get_html(url):
    """Fetch *url* and return its decoded HTML text, or None on failure."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        # The site does not always declare its charset; trust the sniffer.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:  # was a bare except: be specific
        print('爬取失败')
        print(url)
        return None


def get_dange_info(company_info, url):
    """Scrape one job-detail page at *url*, appending fields to *company_info*.

    Appended order: company name, industry/size, then each child of the
    primary-info block (post date, title+salary, location/experience/
    education, tags), and finally the job-description text.
    No-op when the page cannot be fetched.
    """
    html = get_html(url)
    if html is None:  # fetch failed; don't crash BeautifulSoup on None
        return
    soup = BeautifulSoup(html, 'html.parser')

    # Company name plus the industry / size line.
    company = soup.find('div', 'info-company')
    company_info.append(company.find('h3').get_text())
    company_info.append(company.find('p').get_text())

    # Post date, title, salary, location, experience, education, tags.
    zhiwei_info = soup.find('div', 'info-primary')
    for child in zhiwei_info.children:
        if not isinstance(child, bs4.element.Tag):
            continue  # skip bare NavigableStrings ('\n' between tags)
        text = child.get_text()
        if '\n' in text:
            # Collapse embedded newlines into single spaces
            # (trailing space kept to match the original output format).
            parts = [p for p in text.split('\n') if p]
            company_info.append(' '.join(parts) + ' ')
        else:
            company_info.append(text.strip())

    # Full job description / requirements text.
    job_sec = soup.find('div', 'job-sec')
    company_info.append(job_sec.find('div', 'text').get_text().strip())


def get_urllist(url_list, url):
    """Append the absolute detail-page URL of every listing on page *url*."""
    html = get_html(url)
    if html is None:  # page fetch failed; nothing to collect
        return
    soup = BeautifulSoup(html, 'html.parser')
    for job in soup.find_all('div', 'job-primary'):
        # Hrefs on the listing page are site-relative.
        url_list.append('http://www.zhipin.com' + job.find('a').attrs['href'])


def get_quanbuinfo(tatol_info, url_list):
    """Scrape every detail page in *url_list*; append one row per job."""
    for detail_url in url_list:
        row = []
        get_dange_info(row, detail_url)
        tatol_info.append(row)


def print_html(tatol_info):
    """Pretty-print all scraped rows and also dump them to boss.txt (UTF-8)."""
    tplt = '{0:^10}\t{1:^6}\t{2:^6}\t{3:^6}\t{4:^6}\t{5:^6}\n\n{6:^50}\n\n{7:^50}'
    print(tplt.format('公司名称', '行业规模', '发布时间', '职位名称及薪资',
                      '地区经验学历', '职位标签', '职位描述', '任职要求'))
    for row in tatol_info:
        print(tplt.format(row[0], row[1], row[2], row[3],
                          row[4], row[5], row[6], row[7]))
        print('\n\n\n')
    with open('boss.txt', 'w', encoding='utf-8') as f:
        for row in tatol_info:
            f.write(str(row[0:6]) + '\n\n' + str(row[6]) + '\n\n'
                    + str(row[7]) + '\n\n\n')


def save_tatol_info(tatol_info):
    """Write all scraped rows to an Excel workbook, one row per job."""
    excel = xlwt.Workbook()
    sheet = excel.add_sheet('sheet')
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = 'Times New Roman'
    style.font = font

    header = ['公司名称', '行业规模', '发布时间', '职位名称及薪资',
              '地区经验学历', '职位标签', '职位描述']
    for col, title in enumerate(header):
        sheet.write(0, col, title, style)

    for row_idx, row in enumerate(tatol_info, start=1):
        for col, value in enumerate(row):
            sheet.write(row_idx, col, value, style)

    # Change this filename per crawl batch to avoid overwriting exports.
    excel.save('boss_大数据_151_200.xlsx')


def main():
    """Crawl `dethp - 1` result pages, then export everything to Excel."""
    url_list = []
    tatol_info = []
    # Search URL; the keyword (here: Python) is part of the query string.
    start_url = 'http://www.zhipin.com/c101270100/?query=Python&page='
    # Crawl depth: keep it small (~5) or the site blocks the crawler.
    dethp = 6
    for page in range(1, dethp):
        # Build the URL outside the try so the handler can always print it.
        url = start_url + str(page)
        try:
            get_urllist(url_list, url)
        except Exception:  # keep crawling remaining pages on any page error
            print('ngngng')
            print('url=', url)
            continue
    get_quanbuinfo(tatol_info, url_list)
    # print_html(tatol_info)
    save_tatol_info(tatol_info)


if __name__ == '__main__':
    main()