Crawler 10: Scraping Job Listings from Lagou
This is old code I found while tidying up my files; I'm not sure whether it still works.
It is mostly an exercise in handling the JSON response.
The part worth a closer look is the one that saves the results and writes them to Excel, which is probably fairly reusable — there is a generic sketch of that pattern after the full script.
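Before the full script, here is roughly what that JSON handling boils down to: the body returned by positionAjax.json is parsed with json.loads, and the job entries sit under content -> positionResult -> result, each carrying a positionId that becomes a detail-page URL. The sample string below is made up purely to show that shape, so treat this as a sketch rather than a real response.

import json

# A made-up response body with the same nesting the real endpoint used:
# content -> positionResult -> result -> [ {positionId: ...}, ... ]
sample = '{"content": {"positionResult": {"result": [{"positionId": 123456}]}}}'

data = json.loads(sample)
positions = data['content']['positionResult']['result']
urls = ['https://www.lagou.com/jobs/' + str(p['positionId']) + '.html' for p in positions]
print(urls)   # ['https://www.lagou.com/jobs/123456.html']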
import requests
import json
import time
from bs4 import BeautifulSoup
import xlwt

BaseUrl = 'https://www.lagou.com/jobs/positionAjax.json?'
All_detail = []

# Request one page of the position list (POST to the Ajax endpoint)
def read_page(tag):
    page_header = {
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_php%E5%90%8E%E7%AB%AF?px=default&city=%E4%B8%8A%E6%B5%B7',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'cookie': ''  # fill in a valid cookie if the site requires a logged-in session
    }
    page_data = {
        'first': 1,
        'pn': tag,    # page number
        'kd': 'PHP'   # search keyword
    }
    page = requests.post(url=BaseUrl, data=page_data, headers=page_header)
    return page.text

# Parse the JSON and return the list of detail-page URLs for one page
def read_json(page):
    item_list = []
    page_json = json.loads(page)
    page_json = page_json['content']['positionResult']['result']
    for item in page_json:
        item_list.append('https://www.lagou.com/jobs/' + str(item['positionId']) + '.html')
    return item_list

# Fetch a detail page and extract the fields we care about
def get_detail(url):
    detail = {}
    page_header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_php%E5%90%8E%E7%AB%AF?px=default&city=%E4%B8%8A%E6%B5%B7',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'cookie': '',
        'Upgrade-Insecure-Requests': '1'
    }
    page_html = requests.get(url, headers=page_header).content
    soup = BeautifulSoup(page_html, 'html.parser', from_encoding='utf-8')
    detail['公司'] = soup.find("div", class_="company").text
    detail['职位'] = soup.find("span", class_="name").text
    # salary / requirements
    sal = []
    for tag in soup.select("dd p span"):
        sal.append(tag.text.replace('/', ''))
    detail["待遇/要求"] = ''.join(sal)
    # job perks
    detail['职位诱惑'] = soup.find("dd", class_="job-advantage").select('p')[0].text
    # job description
    req = []
    for tag in soup.find('dd', class_='job_bt').select('div p'):
        req.append(tag.text + '\n')
    detail["职位描述"] = ''.join(req)
    return detail

# Write everything collected in All_detail to an Excel file
def saveall():
    book = xlwt.Workbook()
    sheet = book.add_sheet('ronytest', cell_overwrite_ok=True)
    heads = ['公司', '职位', '待遇/要求', '职位诱惑', '职位描述']
    for col, head in enumerate(heads):   # header row
        sheet.write(0, col, head)
    row = 1
    for xx in All_detail:                # one row per job
        sheet.write(row, 0, xx['公司'])
        sheet.write(row, 1, xx['职位'])
        sheet.write(row, 2, xx['待遇/要求'])
        sheet.write(row, 3, xx['职位诱惑'])
        sheet.write(row, 4, xx['职位描述'])
        row += 1
    book.save('拉勾网' + '.xls')

if __name__ == '__main__':
    AllUrl = []
    # Crawl several list pages and collect every detail URL into AllUrl
    for tag in range(1, 3):
        AllUrl.extend(read_json(read_page(tag)))
        time.sleep(1)
    print('URL爬取完成')
    # Fetch each detail page and collect the results into All_detail
    for i in AllUrl:
        All_detail.append(get_detail(i))
    print('Detail爬取完成')
    # Save the data
    saveall()
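Since the Excel-writing part is the piece that carries over to other crawlers, here is that pattern pulled out into a standalone helper. This is only a sketch built on the same xlwt calls the script uses; the function name, sheet name, and sample rows are made up for illustration.

import xlwt

def save_dicts_to_xls(rows, heads, filename):
    # One workbook, one sheet: the header row comes from heads,
    # then each dict in rows becomes one data row in the same column order.
    book = xlwt.Workbook()
    sheet = book.add_sheet('sheet1', cell_overwrite_ok=True)
    for col, head in enumerate(heads):
        sheet.write(0, col, head)
    for row, item in enumerate(rows, start=1):
        for col, head in enumerate(heads):
            sheet.write(row, col, item.get(head, ''))
    book.save(filename)

# Hypothetical usage with two of the fields from the crawler above
save_dicts_to_xls([{'公司': 'demo', '职位': 'PHP'}], ['公司', '职位'], 'demo.xls')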
I walk slowly, but I never go backward.