python3拉勾网爬虫之(您操作太频繁,请稍后访问)
你是否经历过这个:
那就对了~
因为需要先访问列表页拿到相关的 cookie,再带着这些 cookie 用 POST 方式请求数据接口~
所以,下面用一段简单的代码来爬取拉勾网~~~
import json
import time

import requests

# Keys printed for every job posting, in output order.
_POSITION_FIELDS = (
    "companyFullName",
    "positionName",
    "salary",
    "companySize",
    "skillLables",
    "createTime",
    "district",
    "stationname",
)


def _fetch_page_text(url_start, url_parse, headers, page):
    """Return the raw response text of the job-list endpoint for one page.

    A fresh session first GETs ``url_start`` so that the server can set its
    cookies, then POSTs the search form to ``url_parse`` with those cookies.
    The session is closed before returning so no connections are leaked.

    Network errors (including the 3-second timeouts) propagate to the caller.
    """
    data = {
        'first': 'true',
        'pn': str(page),
        'kd': '运维'
    }
    with requests.Session() as session:
        # Visit the listing page first; the session stores whatever cookies
        # the server hands out and sends them automatically on the POST.
        session.get(url_start, headers=headers, timeout=3)
        response = session.post(url_parse, data=data, headers=headers, timeout=3)
        response.encoding = response.apparent_encoding
        return response.text


def main():
    """Print selected fields of the job postings on result pages 1 to 4.

    Each page is fetched with its own session and followed by a 5-second
    pause. A page whose response is not the expected JSON structure is
    reported and skipped instead of aborting the whole run.
    """
    url_start = "https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput="
    url_parse = "https://www.lagou.com/jobs/positionAjax.json?city=成都&needAddtionalResult=false"
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_%E8%BF%90%E7%BB%B4?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    for page in range(1, 5):
        text = _fetch_page_text(url_start, url_parse, headers, page)
        time.sleep(5)  # pause between pages to keep the request rate low

        try:
            payload = json.loads(text)
            positions = payload["content"]["positionResult"]["result"]
        except (json.JSONDecodeError, KeyError, TypeError):
            # NOTE(review): presumably this is the "too frequent" reply the
            # site sends when it throttles a client - confirm its exact shape.
            print("page {}: unexpected response, skipped: {}".format(page, text[:200]))
            continue

        for position in positions:
            for field in _POSITION_FIELDS:
                print(position[field])


if __name__ == '__main__':
    main()