Crawler 10: Scraping Lagou job listings

Old code from quite a while ago; I found it while sorting through some files and I'm not sure it still works.

The main point is the JSON handling.

The part worth a closer look is the data-saving step, which writes the results to Excel; that bit should be fairly reusable.

import requests
import json
import time
from bs4 import BeautifulSoup
import xlwt
BaseUrl='https://www.lagou.com/jobs/positionAjax.json?'
All_detail=[]
# Fetch one page of results from the position-list API
def read_page(tag):
    page_header = {
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_php%E5%90%8E%E7%AB%AF?px=default&city=%E4%B8%8A%E6%B5%B7',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'cookie': ''
    }

    page_data = {
        'first': 1,
        'pn': tag,   # page number
        'kd': 'PHP'  # search keyword
    }
    page = requests.post(url=BaseUrl,data=page_data,headers=page_header)
    return page.text
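
# The 'cookie' header above is left empty in the original. Lagou's Ajax endpoint
# has been known to reject requests that carry no session cookies; a commonly used
# workaround (a sketch only, not verified against the current site, and the list
# URL below is illustrative) is to let a requests.Session pick up cookies from the
# list page before posting:
def read_page_with_session(tag):
    session = requests.Session()
    list_url = 'https://www.lagou.com/jobs/list_php?px=default'
    headers = {
        'Referer': list_url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    }
    session.get(list_url, headers=headers)  # collect the anti-crawler cookies
    page_data = {'first': 1, 'pn': tag, 'kd': 'PHP'}
    return session.post(BaseUrl, data=page_data, headers=headers).text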

# Parse the JSON and return the list of detail-page URLs for one page
def read_json(page):
    item_list = []
    page_json = json.loads(page)
    results = page_json['content']['positionResult']['result']
    # time.sleep(3)
    # Build a detail-page URL from each position id (the original hard-coded
    # range(15) raises IndexError when a page returns fewer results)
    for item in results:
        item_list.append('https://www.lagou.com/jobs/' + str(item['positionId']) + '.html')
    return item_list
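
# The post says the main point is the JSON handling, so here is a more defensive
# variant (a sketch, assuming the 2018-era response shape): when requests are
# throttled, the response may not contain content.positionResult.result at all,
# so check before indexing instead of failing with a KeyError.
def read_json_safe(page):
    page_json = json.loads(page)
    results = page_json.get('content', {}).get('positionResult', {}).get('result', [])
    return ['https://www.lagou.com/jobs/%s.html' % item['positionId'] for item in results]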

# Scrape a single job-detail page
def get_detail(url):
    detail = {}
    page_header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_php%E5%90%8E%E7%AB%AF?px=default&city=%E4%B8%8A%E6%B5%B7',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'cookie': '',
        'Upgrade-Insecure-Requests': '1'
    }
    page_html = requests.get(url, headers=page_header).content
    soup = BeautifulSoup(page_html, 'html.parser', from_encoding='utf-8')
    detail['公司'] = soup.find("div", class_="company").text
    detail['职位'] = soup.find("span", class_="name").text
    # Salary / requirements
    sal = []
    for tag in soup.select("dd p span"):
        sal.append(tag.text.replace('/', ''))
    detail["待遇/要求"] = ''.join(sal)
    # Perks ("job advantage")
    detail['职位诱惑'] = soup.find("dd", class_="job-advantage").select('p')[0].text

    # Job description
    req = []
    for tag in soup.find('dd', class_='job_bt').select('div p'):
        req.append(tag.text + '\n')
    detail["职位描述"] = ''.join(req)
    return detail
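
# get_detail assumes every selector still matches. If Lagou has changed its page
# layout (the post itself doubts the code still works), soup.find() returns None
# and the .text access raises AttributeError. A small helper (just a sketch) lets
# the lookups fail soft instead, e.g. detail['公司'] = safe_text(soup.find("div", class_="company")).
def safe_text(node):
    # Return the stripped text of a BeautifulSoup node, or '' when the node is missing
    return node.get_text(strip=True) if node is not None else ''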

def saveall():
    book = xlwt.Workbook()
    sheet = book.add_sheet('ronytest', cell_overwrite_ok=True)
    # Header row
    heads = ['公司', '职位', '待遇/要求', '职位诱惑', '职位描述']
    for col, head in enumerate(heads):
        sheet.write(0, col, head)

    # One row per scraped job
    for row, xx in enumerate(All_detail, start=1):
        for col, head in enumerate(heads):
            sheet.write(row, col, xx[head])
    book.save('拉勾网' + '.xls')
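
# The post calls the saving step fairly reusable; a generic version (a sketch) that
# writes any list of dicts to an .xls file with xlwt could look like this:
def save_dicts_to_xls(rows, heads, filename, sheet_name='sheet1'):
    book = xlwt.Workbook()
    sheet = book.add_sheet(sheet_name, cell_overwrite_ok=True)
    for col, head in enumerate(heads):
        sheet.write(0, col, head)
    for row, item in enumerate(rows, start=1):
        for col, head in enumerate(heads):
            sheet.write(row, col, item.get(head, ''))
    book.save(filename)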

if __name__ == '__main__':
    AllUrl = []
    # Crawl a few list pages and collect every detail-page URL into AllUrl
    for tag in range(1, 3):
        AllUrl.extend(read_json(read_page(tag)))
        time.sleep(1)
    print('URL crawl finished')
    # Parse every detail page and collect the results into All_detail
    for i in AllUrl:
        All_detail.append(get_detail(i))
    print('Detail crawl finished')
    # Save the data
    saveall()
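
As an aside, xlwt only writes the legacy .xls format and is no longer actively maintained; if an .xlsx file is preferred, a roughly equivalent sketch using openpyxl (assuming that package is installed) would be:

from openpyxl import Workbook

def save_dicts_to_xlsx(rows, heads, filename):
    # Same idea as saveall(), but producing an .xlsx workbook
    wb = Workbook()
    ws = wb.active
    ws.append(heads)  # header row
    for item in rows:
        ws.append([item.get(head, '') for head in heads])
    wb.save(filename)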

 
