Fork me on GitHub

爬虫投递简历小示例

一、流程分析

第一步:获取登录页,获取X_Anti_Forge_Token,X_Anti_Forge_Code
    1、请求url:https://passport.lagou.com/login/login.html
    2、请求方式:get
    3、请求头:
           - cookie:用session处理了
           - User-Agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36
第二步:登录
    1、请求url:https://passport.lagou.com/login/login.json
    2、请求方式:post
    3、请求头:
        cookie
        User-agent
        Referer:https://passport.lagou.com/login/login.html
        X-Anit-Forge-Code:53165984
        X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
        X-Requested-With:XMLHttpRequest
    4、请求体
        isValidate:true
        username:18611453110
        password:70621c64832c4d4d66a47be6150b4a8e
        request_form_verifyCode:''
        submit:''
第三步:授权
        1、请求url:https://passport.lagou.com/grantServiceTicket/grant.html
        2、请求方法:GET
        3、请求头:
           User-agent
           Referer:https://passport.lagou.com/login/login.html
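第四步:验证
    1、请求url:https://www.lagou.com/resume/myresume.html
    2、请求方式:GET
    3、请求头:
        User-Agent
    4、判断依据:响应内容中包含登录用的用户名,说明登录成功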
第五步:筛选职位信息(注:职位数据实际是由ajax请求返回的,直接GET下面的url取不到数据;代码实现中改为向职位ajax接口发POST请求,并把下面的url作为Referer)
    请求url:https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
    请求方法:GET
    请求头:
        User-Agent
    请求参数:
        gj:3年及以下
        px:default
        yx:25k-50k
        city:北京
第六步:访问详情页,拿到X_Anti_Forge_Token,X_Anti_Forge_Code
    请求url:详情页地址
    请求方式:GET
    请求头:User-Agent
第七步:投递简历
    请求url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    请求方式:POST
    请求头:
        Referer:详情页地址
        User-agent
        X-Anit-Forge-Code:53165984
        X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
        X-Requested-With:XMLHttpRequest
    请求体:
        positionId:职位ID
        type:1
        force:true

二、代码实现

  1 import requests
  2 import re
  3 from urllib.parse import urlencode
  4 session = requests.session()
  5 r1 = session.get(
  6     "https://passport.lagou.com/login/login.html",
  7     headers = {
  8         "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
  9     }
 10 )
 11 X_Anit_Forge_Code  = re.findall("X_Anit_Forge_Code ='(.*?)'",r1.text,re.S)
 12 X_Anit_Forge_Token = re.findall("X_Anit_Forge_Token ='(.*?)'",r1.text,re.S)
 13 r2 = session.post(
 14     "https://passport.lagou.com/login/login.json",
 15     headers = {
 16         "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
 17         "Referer":"https://passport.lagou.com/login/login.html",
 18         "X-Anit-Forge-Code":X_Anit_Forge_Code,
 19         "X-Anit-Forge-Token":X_Anit_Forge_Token,
 20         "X-Requested-With":"XMLHttpRequest"
 21     },
 22     data={
 23         "isValidate": True,
 24         'username': '18611453110',
 25         'password': '70621c64832c4d4d66a47be6150b4a8e',
 26         'request_form_verifyCode': '',
 27         'submit': ''
 28     }
 29 )
 30 r3 = session.get(
 31     "https://passport.lagou.com/grantServiceTicket/grant.html",
 32     headers = {
 33         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
 34         'Referer': 'https://passport.lagou.com/login/login.html',
 35     }
 36 )
 37 r4 = session.get(
 38     'https://www.lagou.com/resume/myresume.html',
 39     headers = {
 40         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
 41     }
 42 )
 43 
 44 print('18611453110' in r4.text)
 45 
 46 # ============================
 47 # res = urlencode({"k":"java高级开发"},encoding="utf-8").split("=")[-1]
 48 # url = "https://www.lagou.com/jobs/list_"+res
 49 # r5 =session.get(url,
 50 #             headers={
 51 #                     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
 52 #                 },
 53 #              params={
 54 #                      'gj': '3年及以下',
 55 #                      'px': 'default',
 56 #                      'yx': '25k-50k',
 57 #                      'city': '北京'
 58 #                 }
 59 #          ) #按照套路进行,结果取不到值,因为人家发的是ajax请求获取的数据,所以选择了r6的方式
 60 res = urlencode({"k":"java高级开发"},encoding="utf-8").split("=")[-1]
 61 url = "https://www.lagou.com/jobs/list_"+res
 62 r6 = session.post(
 63     'https://www.lagou.com/jobs/postionAjax.json',
 64     headers = {
 65         'Referer': url,
 66         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
 67     },
 68     data = {
 69         "first":True,
 70         "pn":1,
 71         "kd":"java高级开发"
 72     },
 73     params = {
 74         "gj":"3年及以下",
 75         "gx":"default",
 76         "yx":"15k-25k",
 77         "city":"北京",
 78         "needAddtionResult":False,
 79         "isSchoolJob":0
 80     }
 81 )
 82 from pprint import pprint
 83 # print(r6.json())
 84 comapines_list=r6.json()['content']['positionResult']['result']
 85 for comapiny in comapines_list:
 86     positionId=comapiny['positionId']
 87     company_link='https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
 88     companyShortName = comapiny['companyShortName']
 89     positionName = comapiny['positionName']
 90     salary = comapiny['salary']
 91     print('''
 92     详情连接:%s
 93     公司名:%s
 94     职位名:%s
 95     薪资:%s
 96     ''' %(company_link,companyShortName,positionName,salary))
 97     r7=session.get(company_link,
 98                 headers={
 99                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
100                 }
101                 )
102     X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
103     X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]
104     # print(X_Anti_Forge_Token,X_Anti_Forge_Code)
105 
106 
107     session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
108                  headers={
109                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
110                      'Referer': company_link,
111                      'X-Anit-Forge-Code': X_Anti_Forge_Code,
112                      'X-Anit-Forge-Token': X_Anti_Forge_Token,
113                      'X-Requested-With': 'XMLHttpRequest'
114                  },
115                  data={
116     'positionId':positionId,
117     'type':1,
118     'force':True
119                  }
120                  )
121     print('%s 投递成功' %(companyShortName))
View Code

 

posted on 2018-03-03 00:20  vmaze  阅读(283)  评论(0)  编辑  收藏  举报

导航