百度翻译爬虫-Web版(自动生成sign)
# Object-oriented implementation
# Baidu Translate -- web version (token and sign are obtained automatically)
import json
import re

import js2py
import requests


class WebFanyi:
    """Crawler for the web version of Baidu Translate.

    Args:
        query_str: Text to translate.
        js_path: Path of the JavaScript file that defines the signing
            function ``e``. Defaults to ``webtrans.js``, resolved against
            the current working directory.
    """

    def __init__(self, query_str, js_path="webtrans.js"):
        self.session = requests.session()
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
        }
        self.session.headers = headers
        self.baidu_url = "https://www.baidu.com/"
        self.root_url = "https://fanyi.baidu.com/"
        self.lang_url = "https://fanyi.baidu.com/langdetect"
        self.trans_url = "https://fanyi.baidu.com/v2transapi"
        self.query_str = query_str
        self.js_path = js_path

    @staticmethod
    def _extract(pattern, html_str, what):
        """Return group 1 of the first match of *pattern* in *html_str*.

        Raises:
            IndexError: If the pattern does not occur; *what* names the
                missing value in the message.
        """
        found = re.search(pattern, html_str)
        if found is None:
            raise IndexError(
                "could not find {} in the page returned by the server".format(what)
            )
        return found.group(1)

    def get_token_gtk(self):
        """Fetch the token and gtk embedded in the translate home page.

        Returns:
            tuple: ``(token, gtk)``; gtk is later used to compute the sign.

        Raises:
            IndexError: If either value cannot be found in the page.
        """
        # The page is deliberately requested twice, as the original code did.
        # NOTE(review): presumably the first response only sets the cookies
        # that make the token of the second response valid -- confirm before
        # removing the first request.
        self.session.get(self.root_url)
        resp = self.session.get(self.root_url)
        html_str = resp.content.decode()
        token = self._extract(r"token: '(.*?)'", html_str, "token")
        gtk = self._extract(r"window\.gtk = '(.*?)'", html_str, "gtk")
        return token, gtk

    def generate_sign(self, gtk):
        """Compute the sign of ``self.query_str`` by running the JS signer.

        Args:
            gtk: Seed string taken from the translate home page.

        Returns:
            The value returned by the JavaScript function ``e``.
        """
        # 1. Prepare the JS execution environment
        context = js2py.EvalJs()
        with open(self.js_path, encoding='utf8') as f:
            js_data = f.read()
        # The script reads the seed through a browser-only ``window`` lookup;
        # splice the gtk in as a string literal instead.  A plain (non-regex)
        # replacement inserts the value verbatim.
        js_data = js_data.replace("window[l]", '"' + gtk + '"')
        context.execute(js_data)
        return context.e(self.query_str)

    def lang_detect(self):
        """Detect the source language and choose the target, e.g. zh --> en.

        Returns:
            tuple: ``(lan, to)`` where ``to`` is "en" when the detected
            language is "zh" and "zh" otherwise.
        """
        lang_resp = self.session.post(self.lang_url, data={"query": self.query_str})
        # Example reply: {"error":0,"msg":"success","lan":"zh"}
        lang_json_str = lang_resp.content.decode()
        lan = json.loads(lang_json_str)['lan']
        to = "en" if lan == "zh" else "zh"
        return lan, to

    def parse_url(self, post_data):
        """Post the translation request, print the result and return it.

        Args:
            post_data: Form fields for the translate API.

        Returns:
            The translated text of the first result entry.

        Raises:
            KeyError: If the reply carries no ``trans_result``; the message
                includes the reply so the server-side error is visible.
        """
        trans_resp = self.session.post(self.trans_url, data=post_data)
        trans_json_str = trans_resp.content.decode()
        trans_json = json.loads(trans_json_str)
        if "trans_result" not in trans_json:
            raise KeyError(
                "translation failed, server replied: {}".format(trans_json_str)
            )
        result = trans_json["trans_result"]["data"][0]["dst"]
        print("{}: {}".format(self.query_str,result))
        return result

    def run(self):
        """Run the whole flow and return the translated text."""
        # 1. Pick up the cookies of the Baidu home page (without them the
        #    API always answers with error 998)
        self.session.get(self.baidu_url)
        # 2. Get the token and gtk of Baidu Translate (gtk is needed for the sign)
        token, gtk = self.get_token_gtk()
        # 3. Generate the sign
        sign = self.generate_sign(gtk)
        # 4. Determine the translation direction, e.g. zh --> en
        lan, to = self.lang_detect()
        # 5. Send the request, read the response and output the result
        post_data = {
            "from": lan,
            "to": to,
            "query": self.query_str,
            "transtype": "realtime",
            "simple_means_flag": 3,
            "sign": sign,
            "token": token
        }
        return self.parse_url(post_data)


if __name__ == '__main__':
    webfanyi = WebFanyi('lover')
    webfanyi.run()
上述代码中用于生成sign的 webtrans.js 文件具体代码如下(可以自己抓包,在js中打断点获取):
// webtrans.js

/**
 * Return a shallow copy of the array-like r.
 *
 * e() calls this helper in its surrogate-pair branch, but the snippet did not
 * define it, so any query containing an astral character (e.g. an emoji)
 * failed with a ReferenceError. Written in ES5 so it also runs in js2py.
 */
function a(r) {
    return Array.prototype.slice.call(r)
}

/**
 * Mix the 32-bit value r according to the operation string o.
 *
 * o is read in groups of three characters:
 *   - 3rd char: shift amount; digits are used as-is, letters from "a" on
 *     map to 10, 11, ... (char code - 87)
 *   - 2nd char: "+" means unsigned right shift of r, anything else left shift
 *   - 1st char: "+" means add the shifted value (masked to 32 bits),
 *     anything else XOR it into r
 */
function n(r, o) {
    for (var t = 0; t < o.length - 2; t += 3) {
        var a = o.charAt(t + 2);
        a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a),
        a = "+" === o.charAt(t + 1) ? r >>> a : r << a,
        r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a
    }
    return r
}

/**
 * Compute the sign string for the query text r.
 *
 * Steps: shorten long input to 30 units (first 10, middle 10, last 10),
 * encode it as UTF-8 bytes, fold the bytes into a hash seeded with the first
 * half of the gtk, then combine with the second half.
 * Returns "<hash>.<hash xor first gtk half>".
 */
function e(r) {
    var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === o) {
        // No surrogate pairs: lengths can be counted in UTF-16 units.
        var t = r.length;
        t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        // Surrogate pairs present: rebuild the text as a list in which every
        // pair is a single element, so the cut below never splits a pair.
        for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
            "" !== e[C] && f.push.apply(f, a(e[C].split(""))),
            C !== h - 1 && f.push(o[C]);
        var g = f.length;
        g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
    }
    // The lookup key was originally built from char codes 103, 116, 107
    // ("gtk"); that assignment is disabled here, and the Python caller
    // replaces the whole window lookup below with the gtk string literal
    // before executing this file.
    var u = void 0,
        i = null;
    u = null !== i ? i : (i = window[l] || "") || "";
    // gtk has the form "<m>.<s>"; S collects the UTF-8 bytes of r.
    for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        // 1 byte below 128, 2 bytes below 2048, 4 bytes for a valid
        // surrogate pair (combined into one code point), otherwise 3 bytes.
        128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
        S[c++] = A >> 18 | 240,
        S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
        S[c++] = A >> 6 & 63 | 128),
        S[c++] = 63 & A | 128)
    }
    // F decodes to "+-a^+6" and D to "+-3^+b+-f" (operation strings for n).
    for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
        p += S[b],
        p = n(p, F);
    // Final mix, XOR with the second gtk half, force the value non-negative
    // and keep the last six decimal digits.
    return p = n(p, D),
    p ^= s,
    0 > p && (p = (2147483647 & p) + 2147483648),
    p %= 1e6,
    p.toString() + "." + (p ^ m)
}
实际上,除了使用 js2py 在 Python 中执行 JS 代码之外,还可以使用另一个库 PyExecJS(导入名为 execjs),使用前需要先通过 pip install PyExecJS 安装该模块。具体实现代码如下:
import execjs

# NOTE: gtk and query_str are not defined in this snippet; the surrounding
# code must supply them (the gtk seed of the translate page and the text to
# translate).
with open("webtrans.js", encoding="utf8") as f:
    js_data = f.read()
# Splice the gtk into the script as a string literal.  A plain string
# replacement needs no regex escaping and inserts the value verbatim.
js_data = js_data.replace("window[l]", '"' + gtk + '"')
# Call function e from webtrans.js with query_str as its argument
sign = execjs.compile(js_data).call("e", query_str)
print(sign)
<人追求理想之时,便是坠入孤独之际.> By 史泰龙