百度翻译js逆向
百度翻译
数据分析
通过抓取数据,百度翻译是请求这个接口 https://fanyi.baidu.com/basetrans
,请求参数如下所示:
通过观察参数,sign
为加密算法生成的数据。
token
经过重复获取,发现token
是网页传回的。
请求的cookie
也是请求网页,在请求结果后设置的
设置cookie
请求网页,获取请求结果的headers
,获取cookie
。
python实现
url = "https://fanyi.baidu.com/"
headers = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Mobile Safari/537.36',
}
res = requests.get(url, headers=headers)
headers["Cookie"] = res.headers["Set-Cookie"]
获取token
通过请求多次接口发现,需要在设置cookie
后才会返回token
,不然返回的为空值。
对上述设置好cookie
的请求再请求一遍,通过正则获取token
(后面的js逆向需要获取gtk
也在网页上获取,需提取获取到 gtk
的值)
python实现
res = requests.get(url, headers=self.headers)
token = re.findall('token: \'(.*?)\'', res.text, re.S)[0]
gtk = re.findall('gtk: \'(.*?)\'', res.text, re.S)[0]
设置sign
通过全局搜索,在每个出现sign
位置打上断点,找到生成sign
的函数
sign
是由A
函数生成,并把要翻译的词作为参数传给A
函数,点击A
函数跳转到具体位置。
可以看到a
函数经过复杂的数据处理,因此不予探究期具体操作,直接使用execjs
库,把数据交个js进行处理。
function n(r, o) {
for (var t = 0; t < o.length - 2; t += 3) {
var e = o.charAt(t + 2);
e = e >= "a" ? e.charCodeAt(0) - 87 : Number(e),
e = "+" === o.charAt(t + 1) ? r >>> e : r << e,
r = "+" === o.charAt(t) ? r + e & 4294967295 : r ^ e
}
return r
}
function e(r) {
if (Array.isArray(r)) {
for (var o = 0, t = Array(r.length); o < r.length; o++)
t[o] = r[o];
return t
}
return Array.from(r)
}
function a(r,gtk) {
var i = null;
var t = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
if (null === t) {
var a = r.length;
a > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(a / 2) - 5, 10) + r.substr(-10, 10))
} else {
for (var C = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), h = 0, f = C.length, u = []; f > h; h++)
"" !== C[h] && u.push.apply(u, e(C[h].split(""))),
h !== f - 1 && u.push(t[h]);
var g = u.length;
g > 30 && (r = u.slice(0, 10).join("") + u.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + u.slice(-10).join(""))
}
var l = void 0
, d = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
l = null !== i ? i : (i = gtk || "") || "";
for (var m = l.split("."), S = Number(m[0]) || 0, s = Number(m[1]) || 0, c = [], v = 0, F = 0; F < r.length; F++) {
var p = r.charCodeAt(F);
128 > p ? c[v++] = p : (2048 > p ? c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)),
c[v++] = p >> 18 | 240,
c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224,
c[v++] = p >> 6 & 63 | 128),
c[v++] = 63 & p | 128)
}
for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++)
w += c[D],
w = n(w, A);
return w = n(w, b),
w ^= s,
0 > w && (w = (2147483647 & w) + 2147483648),
w %= 1e6,
w.toString() + "." + (w ^ S)
}
经过js处理后的我们就能得到sign
加密的数据,python实现
with open("百度.js", 'r', encoding='utf-8') as po:
signs = execjs.compile(po.read()).call('a', query, self.gtk)
代码封装
根据上述,我们通过对js逆向对百度翻译进行处理得到数据,对上述步骤进行封装整理成类。
import os
import re
import json
import requests
import subprocess
from functools import partial
startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
subprocess.Popen = partial(subprocess.Popen,startupinfo=startupinfo, encoding='utf-8')
import execjs
class BaiDuTranslate():
def __init__(self):
path = os.path.dirname(os.path.abspath(__file__))
self.js_path = os.path.join(path, "百度.js")
self.headers = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Mobile Safari/537.36',
}
self.init_token()
def init_token(self):
"""
初始化:获取token,gtk以及cookie
:return:
"""
url = "https://fanyi.baidu.com/"
res = requests.get(url, headers=self.headers)
self.token = re.findall('token: \'(.*?)\'', res.text, re.S)[0]
langList = re.findall('langList: ({.*?}),', res.text, re.S)[0]
langMap = re.findall('langMap: ({.*?}),', res.text, re.S)[0]
self.gtk = re.findall('gtk: \'(.*?)\'', res.text, re.S)[0]
self.langDict = json.loads(langList.replace("'",'"'))
self.langMap = json.loads(langMap.replace("'",'"'))
self.headers["Cookie"] = res.headers["Set-Cookie"]
if not self.token:
res = requests.get(url, headers=self.headers)
self.token = re.findall('token: \'(.*?)\'', res.text, re.S)[0]
def translate(self,query,from_=None,to=None):
"""翻译"""
# 自动检测语言
if not from_:
from_ = self.langDetect(query)
if not to:
if from_ == "zh":
to = 'en'
else:
to = 'zh'
url = "https://fanyi.baidu.com/basetrans"
with open(self.js_path, 'r', encoding='utf-8') as po:
signs = execjs.compile(po.read()).call('a', query, self.gtk)
data = {
"query": query,
"from": from_,
"to": to,
"token": self.token,
"sign": signs
}
response = requests.post(url=url, headers=self.headers, data=data)
text = response.json()
result_list = [i['dst'].strip() for i in text['trans']]
res_dict = {
"result": "\n".join(result_list),
"CEDict": []
}
if text['dict']:
for mean in text['dict']['symbols'][0]['parts'][0]["means"]:
if isinstance(mean,dict):
if not mean.get("means"):
continue
res_dict["CEDict"].append({"text":mean['text'],"tran":mean['means'][0]})
return res_dict
def langDetect(self,query):
"""语言检测"""
url = "https://fanyi.baidu.com/langdetect"
data= {"query":query}
response = requests.post(url=url, headers=self.headers, data=data)
text = response.json()
return text["lan"]
if __name__ == '__main__':
obj = BaiDuTranslate()
res = obj.translate("你好")
该类下还增加了获取语言检测方法,通过translate
方法的from_
和to
参数控制翻译的语言。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?
· 如何调用 DeepSeek 的自然语言处理 API 接口并集成到在线客服系统