from urllib import request
import urllib
import ssl
import json

url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/75.0.3770.100 Safari/537.36",
    'Referer': "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput=",
    'Origin': "https://www.lagou.com",
    'Accept': "application/json, text/javascript, */*; q=0.01",
    'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
    'Accept-Language': "zh-CN,zh;q=0.9",
    'Connection': "keep-alive",
    'Content-Length': "25",
    'Cookie': "JSESSIONID=ABAAABAAAIAACBI7B0E6DD979133FD3E0688BD2A172D462; user_trace_token=20190625152253-372d4fd2-d2d9-4a1e-b1db-adbaf15de59b; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1561447375; _ga=GA1.2.502816238.1561447375; LGSID=20190625152254-0c9bc1d7-971a-11e9-a4bc-5254005c3644; LGUID=20190625152254-0c9bc483-971a-11e9-a4bc-5254005c3644; _gid=GA1.2.1461701224.1561447375; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; X_HTTP_TOKEN=d0da23584e25293624994416516081f1b40cdf8579; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1561449942; LGRID=20190625160542-0718c5c5-9720-11e9-a4bc-5254005c3644; SEARCH_ID=af21aa4087114adf8c011b4f809dc9bd",
}
data = {
    'first': 'true',
    'pn': 1,
    'kd': 'Python'
}
new_data = urllib.parse.urlencode(data)  # form-encode the POST body
req = request.Request(url, headers=headers, data=new_data.encode('utf-8'), method='POST')
context = ssl._create_unverified_context()  # skip certificate verification
res = request.urlopen(req, context=context, timeout=60)
res_json = json.loads(res.read())
print(res_json)
print(res_json['content']['positionResult']['result'])
with open('/Users/mac/PycharmProjects/TEST/TEST/爬虫day/file/lago.txt', 'w') as f:
    # json.loads returns a dict, so serialize it back to a string before writing
    f.write(json.dumps(res_json, ensure_ascii=False))

# Fix for the "requests too frequent" response: impersonate a browser by filling out the request headers
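One wrinkle with this endpoint: when Lagou decides the traffic looks automated, it can answer with a refusal message in the JSON body instead of position data, so json.loads succeeds and the failure only surfaces when indexing 'content'. A minimal guard, sketched under the assumption that the refusal carries success=False plus a human-readable 'msg' string (both field names are my assumption, not confirmed by this post):

def is_blocked(payload):
    # Heuristic check for Lagou's anti-crawler reply. The field names here
    # (success=False plus a 'msg' string) are assumptions, not confirmed.
    return payload.get('success') is False and 'msg' in payload

# Example with a made-up refusal payload:
print(is_blocked({'success': False, 'msg': 'too many requests'}))  # True
print(is_blocked({'success': True, 'content': {}}))                # False

Calling such a check on res_json before indexing into it turns a confusing KeyError into an explicit rate-limit signal.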
How to avoid the "requests too frequent" block
import requests
import time
import json

def main():
    url_start = "https://www.lagou.com/jobs/list_python?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput="
    url_parse = "https://www.lagou.com/jobs/positionAjax.json?city=天津&needAddtionalResult=false"
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': "https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput=",
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    for x in range(1, 5):
        data = {
            'first': 'true',
            'pn': str(x),
            'kd': 'Python'
        }
        s = requests.Session()  # create a fresh Session object for each page
        s.get(url_start, headers=headers, timeout=3)  # GET the listing page first so the session picks up fresh cookies
        cookie = s.cookies  # the cookies obtained by this request
        response = s.post(url_parse, data=data, headers=headers, cookies=cookie, timeout=3)  # fetch this page of results
        time.sleep(5)
        response.encoding = response.apparent_encoding
        text = json.loads(response.text)
        info = text["content"]["positionResult"]["result"]
        print(info)

if __name__ == '__main__':
    main()
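The fixed time.sleep(5) above already spaces requests out, but a randomized delay plus a retry-with-backoff wrapper makes the traffic pattern even less regular. A sketch layered on the same requests.Session approach; polite_sleep and fetch_with_retry are illustrative helpers of mine, not from the original post:

import random
import time
import requests

def polite_sleep(base=5.0, jitter=3.0):
    # Sleep for base seconds plus a random jitter so requests do not arrive
    # at a perfectly regular interval, which is easy to fingerprint.
    time.sleep(base + random.uniform(0, jitter))

def fetch_with_retry(session, url, data, headers, retries=3):
    # POST with exponential backoff on network errors or non-2xx replies.
    for attempt in range(retries):
        try:
            resp = session.post(url, data=data, headers=headers, timeout=3)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            time.sleep(2 ** attempt)  # back off 1s, 2s, 4s, ...
    raise RuntimeError('giving up on %s after %d attempts' % (url, retries))

In main() above, polite_sleep(5, 3) could stand in for the bare time.sleep(5), and fetch_with_retry(s, url_parse, data, headers) for the direct s.post call.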