爬取拉勾网示例
爬取需求分析
# Step-by-step analysis of the requests involved. The snippets below are
# excerpts: they use `session` and `re` without creating/importing them, and
# from step 7 on they use `company_link`, `positionId` and `companyShortName`,
# none of which are defined in this section.

# Step 1: visit the login page and grab X_Anti_Forge_Token, X_Anti_Forge_Code
# 1. Request URL: https://passport.lagou.com/login/login.html
# 2. Request method: GET
# 3. Request headers:
#    User-agent
r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                 },
                 )

# Both values are read out of the page text; [0] takes the first match and
# raises IndexError when the page does not contain the assignment.
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# Step 2: log in
# 1. Request URL: https://passport.lagou.com/login/login.json
# 2. Request method: POST
# 3. Request headers:
#    cookie
#    User-agent
#    Referer:https://passport.lagou.com/login/login.html
#    X-Anit-Forge-Code:53165984
#    X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
#    X-Requested-With:XMLHttpRequest
# 4. Request body:
#    isValidate:true
#    username:18611453110
#    password:70621c64832c4d4d66a47be6150b4a8e
#    request_form_verifyCode:''
#    submit:''
# NOTE: the header names are spelled "Anit" (not "Anti") both in the captured
# request above and in the code below.
r2 = session.post('https://passport.lagou.com/login/login.json',
                  headers={
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                      'Referer': 'https://passport.lagou.com/login/login.html',
                      'X-Anit-Forge-Code': X_Anti_Forge_Code,
                      'X-Anit-Forge-Token': X_Anti_Forge_Token,
                      'X-Requested-With': 'XMLHttpRequest'
                  },
                  data={
                      "isValidate": True,
                      'username': '18611453110',
                      'password': '70621c64832c4d4d66a47be6150b4a8e',
                      'request_form_verifyCode': '',
                      'submit': ''
                  }
                  )

# Step 3: authorize
# 1. Request URL: https://passport.lagou.com/grantServiceTicket/grant.html
# 2. Request method: GET
# 3. Request headers:
#    User-agent
#    Referer:https://passport.lagou.com/login/login.html
r3 = session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                     'Referer': 'https://passport.lagou.com/login/login.html',
                 }
                 )

# Step 4: verify (fetch the logged-in user's resume page)
r4 = session.get('https://www.lagou.com/resume/myresume.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                 }
                 )

# Step 5: filter the job listings
# Request URL: https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
# Request method: GET
# Request headers:
#    User-Agent
# Request parameters:
#    gj:3年及以下      ("3 years or less")
#    px:default
#    yx:25k-50k
#    city:北京         ("Beijing")

# Step 6: the detailed job-filter criteria
# Request parameters
# params={
#     'gj': '3年及以下',
#     'px': 'default',
#     'yx': '25k-50k',
#     'city': '北京',
#     'needAddtionalResult':False,
#     'isSchoolJob':0
# }

# Step 7: visit the job detail page and grab X_Anti_Forge_Token, X_Anti_Forge_Code
# Request URL: the detail page address
# Request method: GET
# Request headers: User-Agent
r7=session.get(company_link,
               headers={
                   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
               }
               )
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]

# Step 8: submit the resume
# Request URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
# Request method: POST
# Request headers:
#    Referer: the detail page address
#    User-agent
#    X-Anit-Forge-Code:53165984
#    X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
#    X-Requested-With:XMLHttpRequest
# Request body:
#    positionId: the job ID
#    type:1
#    force:true
session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
             headers={
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                 'Referer': company_link,
                 'X-Anit-Forge-Code': X_Anti_Forge_Code,
                 'X-Anit-Forge-Token': X_Anti_Forge_Token,
                 'X-Requested-With': 'XMLHttpRequest'
             },
             data={
                 'positionId':positionId,
                 'type':1,
                 'force':True
             }
             )
# NOTE(review): the message is printed without inspecting the POST response.
print('%s 投递成功' %(companyShortName))
# Log in to lagou.com, search for job postings and send a resume to each hit.
#
# Every request goes through one shared `requests` session, so cookies set by
# earlier responses are sent with later requests:
#   1. GET the login page and read the anti-forgery token/code from its text.
#   2. POST the credentials together with that token/code.
#   3. GET the grantServiceTicket page.
#   4. GET the resume page and print whether the username appears in it.
#   5. POST the search to the position Ajax endpoint.
#   6. For every returned position: print a summary, GET its detail page for
#      a fresh token/code, then POST the resume-delivery request.
import os
import re
from pprint import pprint  # handy for inspecting r6.json() while debugging
from urllib.parse import urlencode

import requests

# One well-formed User-Agent for every request (the login steps previously
# sent a string with a stray "Name" glued to its end).
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
)
LOGIN_PAGE_URL = 'https://passport.lagou.com/login/login.html'
KEYWORD = 'java高级开发'

# Credentials can be overridden from the environment; the defaults keep the
# script's previous behavior. The password value is sent exactly as given.
USERNAME = os.environ.get('LAGOU_USERNAME', '18611453110')
PASSWORD = os.environ.get('LAGOU_PASSWORD', '70621c64832c4d4d66a47be6150b4a8e')

# The page text contains assignments of the form  X_Anti_Forge_Token = '...'.
# (Only the HTTP *header* names further down are spelled "Anit".)
_TOKEN_RE = re.compile(r"X_Anti_Forge_Token = '(.*?)'", re.S)
_CODE_RE = re.compile(r"X_Anti_Forge_Code = '(.*?)'", re.S)


def extract_anti_forge(html):
    """Return ``(token, code)`` parsed from the text of a lagou page.

    Args:
        html: page text that is expected to contain the
            ``X_Anti_Forge_Token = '...'`` and ``X_Anti_Forge_Code = '...'``
            assignments.

    Returns:
        A ``(token, code)`` tuple of plain strings, ready to be used as
        header values.

    Raises:
        RuntimeError: if either assignment is missing from ``html``.
    """
    token_match = _TOKEN_RE.search(html)
    code_match = _CODE_RE.search(html)
    if token_match is None or code_match is None:
        raise RuntimeError(
            'X_Anti_Forge_Token / X_Anti_Forge_Code not found in page text'
        )
    return token_match.group(1), code_match.group(1)


session = requests.session()
# Session-level header: merged into every request made through `session`.
session.headers['User-Agent'] = USER_AGENT

# Step 1: load the login page to obtain the anti-forgery token and code.
r1 = session.get(LOGIN_PAGE_URL)
X_Anti_Forge_Token, X_Anti_Forge_Code = extract_anti_forge(r1.text)

# Step 2: log in. Header values must be strings, hence the helper returns the
# matched text rather than the list that re.findall would give.
r2 = session.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'Referer': LOGIN_PAGE_URL,
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
    },
    data={
        'isValidate': True,
        'username': USERNAME,
        'password': PASSWORD,
        'request_form_verifyCode': '',
        'submit': '',
    },
)

# Step 3: authorize.
r3 = session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={'Referer': LOGIN_PAGE_URL},
)

# Step 4: verify the login by checking that the resume page mentions the user.
r4 = session.get('https://www.lagou.com/resume/myresume.html')
print(USERNAME in r4.text)

# Step 5: search. A plain GET of the listing page
# (https://www.lagou.com/jobs/list_<keyword> with the filters as query
# parameters) yields no job data because the page fetches its results through
# an Ajax request, so that Ajax endpoint is queried directly and the listing
# URL is only sent as the Referer.
res = urlencode({'k': KEYWORD}, encoding='utf-8').split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res

r6 = session.post(
    # NOTE(review): the path was previously spelled "postionAjax.json".
    'https://www.lagou.com/jobs/positionAjax.json',
    headers={'Referer': url},
    data={
        'first': True,
        'pn': 1,
        'kd': KEYWORD,
    },
    params={
        'gj': '3年及以下',
        # 'px' and 'needAddtionalResult' are the parameter names recorded in
        # the request analysis for this search ('gx' / 'needAddtionResult'
        # were typos).
        'px': 'default',
        'yx': '15k-25k',
        'city': '北京',
        'needAddtionalResult': False,
        'isSchoolJob': 0,
    },
)

positions = r6.json()['content']['positionResult']['result']
for position in positions:
    positionId = position['positionId']
    company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
    companyShortName = position['companyShortName']
    positionName = position['positionName']
    salary = position['salary']
    print('''
    详情连接:%s
    公司名:%s
    职位名:%s
    薪资:%s
    ''' % (company_link, companyShortName, positionName, salary))

    # Step 6a: each detail page carries its own token/code pair.
    r7 = session.get(company_link)
    X_Anti_Forge_Token, X_Anti_Forge_Code = extract_anti_forge(r7.text)

    # Step 6b: deliver the resume for this position.
    r8 = session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'Referer': company_link,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        },
    )
    # Only claim success when the server did not answer with an HTTP error.
    if r8.ok:
        print('%s 投递成功' % (companyShortName))
    else:
        print('%s 投递失败 (HTTP %s)' % (companyShortName, r8.status_code))
有一种能力,是持续不断的努力