爬虫投递简历小示例
一、流程分析
第一步:获取登录页,获取X_Anti_Forge_Token,X_Anti_Forge_Code 1、请求url:https://passport.lagou.com/login/login.html 2、请求方式:get 3、请求头: - cookie:用session处理了 - User-Agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name 第二步:登录 1、请求url:https://passport.lagou.com/login/login.json 2、请求方式:post 3、请求头: cookie User-agent Referer:https://passport.lagou.com/login/login.html X-Anit-Forge-Code:53165984 X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78 X-Requested-With:XMLHttpRequest 4、请求体 isValidate:true username:18611453110 password:70621c64832c4d4d66a47be6150b4a8e request_form_verifyCode:'' submit:'' 第三步:授权 1、请求url:https://passport.lagou.com/grantServiceTicket/grant.html 2、请求方法:GET 3、请求头: User-agent Referer:https://passport.lagou.com/login/login.html 第四步:验证 1、请求url:https://www.lagou.com/resume/myresume.html 2、请求方法:GET 3、请求头: User-agent(响应内容中包含登录用户名即说明登录成功) 第五步:筛选职位信息 请求url:https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91 请求方法:GET 请求头: User-Agent 请求参数: gj:3年及以下 px:default yx:25k-50k city:北京(注:该页面的职位数据是通过ajax的POST请求获取的,见代码实现) 第六步:访问详情页,拿到X_Anti_Forge_Token,X_Anti_Forge_Code 请求url:详情页地址 请求方式:GET 请求头:User-Agent 第七步:投递简历 请求url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json 请求方式:POST 请求头: Referer:详情页地址 User-agent X-Anit-Forge-Code:53165984 X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78 X-Requested-With:XMLHttpRequest 请求体: positionId:职位ID type:1 force:true
二、代码实现
"""Log in to lagou.com, search for positions and submit a resume to each hit.

All requests go through one ``requests`` session so that cookies set by an
earlier step are sent automatically on later ones.  The steps are:

1. GET the login page and scrape the anti-forgery code/token from it.
2. POST the credentials to the login endpoint.
3. GET the grant page to authorise the session.
4. GET the resume page and print whether the account name appears in it.
5. POST to the position-search ajax endpoint.
6. For every position returned: GET its detail page, scrape fresh
   anti-forgery values and POST the resume-delivery request.
"""
import re
from pprint import pprint  # kept from the original script (handy for debugging)
from urllib.parse import urlencode

import requests

# User-Agent strings are reproduced exactly as the original script sent them.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name"
)
DETAIL_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
)

LOGIN_PAGE_URL = "https://passport.lagou.com/login/login.html"

# Account used for the login request.
# NOTE(review): the password looks like a pre-hashed value -- confirm.
USERNAME = '18611453110'
PASSWORD = '70621c64832c4d4d66a47be6150b4a8e'

KEYWORD = "java高级开发"

# The pages embed the values as JavaScript assignments such as
# ``X_Anti_Forge_Code = '...'``; whitespace around ``=`` is tolerated.
_FORGE_CODE_RE = re.compile(r"X_Anti_Forge_Code\s*=\s*'(.*?)'", re.S)
_FORGE_TOKEN_RE = re.compile(r"X_Anti_Forge_Token\s*=\s*'(.*?)'", re.S)


def extract_forge_values(html):
    """Return ``(code, token)`` scraped from the page source *html*.

    Raises:
        ValueError: if either value cannot be found in *html*.
    """
    code_match = _FORGE_CODE_RE.search(html)
    token_match = _FORGE_TOKEN_RE.search(html)
    if code_match is None or token_match is None:
        raise ValueError("anti-forgery code/token not found in page source")
    return code_match.group(1), token_match.group(1)


session = requests.session()

# Step 1: load the login page and scrape the anti-forgery values.
r1 = session.get(LOGIN_PAGE_URL, headers={"User-Agent": USER_AGENT})
# Header values must be plain strings, so take the matched text rather than
# the list that ``re.findall`` returns.
X_Anti_Forge_Code, X_Anti_Forge_Token = extract_forge_values(r1.text)

# Step 2: log in.  The header names really are spelled "Anit" here.
r2 = session.post(
    "https://passport.lagou.com/login/login.json",
    headers={
        "User-Agent": USER_AGENT,
        "Referer": LOGIN_PAGE_URL,
        "X-Anit-Forge-Code": X_Anti_Forge_Code,
        "X-Anit-Forge-Token": X_Anti_Forge_Token,
        "X-Requested-With": "XMLHttpRequest",
    },
    data={
        "isValidate": True,
        'username': USERNAME,
        'password': PASSWORD,
        'request_form_verifyCode': '',
        'submit': '',
    },
)

# Step 3: authorise the session.
r3 = session.get(
    "https://passport.lagou.com/grantServiceTicket/grant.html",
    headers={
        "User-Agent": USER_AGENT,
        'Referer': LOGIN_PAGE_URL,
    },
)

# Step 4: verify the login -- prints True when the account name shows up on
# the resume page.
r4 = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={"User-Agent": USER_AGENT},
)
print(USERNAME in r4.text)

# Step 5: search for positions.  Requesting the list page itself with GET
# does not return the job data, because the page loads it through an ajax
# request; so the ajax endpoint is called directly, with the list page as
# Referer.
res = urlencode({"k": KEYWORD}, encoding="utf-8").split("=")[-1]
url = "https://www.lagou.com/jobs/list_" + res
r6 = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    headers={
        'Referer': url,
        "User-Agent": USER_AGENT,
    },
    data={
        "first": True,
        "pn": 1,
        "kd": KEYWORD,
    },
    params={
        "gj": "3年及以下",
        "px": "default",
        "yx": "15k-25k",
        "city": "北京",
        # NOTE(review): parameter name kept as in the original script;
        # confirm the spelling against the site's real request.
        "needAddtionResult": False,
        "isSchoolJob": 0,
    },
)

# Step 6: visit every position's detail page and deliver the resume.
positions = r6.json()['content']['positionResult']['result']
for position in positions:
    positionId = position['positionId']
    company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
    companyShortName = position['companyShortName']
    positionName = position['positionName']
    salary = position['salary']
    print('''
    详情连接:%s
    公司名:%s
    职位名:%s
    薪资:%s
    ''' % (company_link, companyShortName, positionName, salary))

    # Each detail page carries its own anti-forgery values.
    r7 = session.get(company_link, headers={'User-Agent': DETAIL_USER_AGENT})
    X_Anti_Forge_Code, X_Anti_Forge_Token = extract_forge_values(r7.text)

    r8 = session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': DETAIL_USER_AGENT,
            'Referer': company_link,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        },
    )
    # Only report success when the request itself succeeded at HTTP level.
    # NOTE(review): the response body may carry a finer-grained result code.
    if r8.ok:
        print('%s 投递成功' % (companyShortName))
    else:
        print('%s 投递失败 (HTTP %s)' % (companyShortName, r8.status_code))