百度翻译js逆向

百度翻译

数据分析

通过抓取数据，百度翻译是请求这个接口 https://fanyi.baidu.com/basetrans，请求参数如下所示：

1694054848880

通过观察参数，sign为加密算法生成的数据。

token经过重复获取，发现token是网页传回的。

1694055008633

请求的cookie也是请求网页，在请求结果后设置的

1694055153058

设置cookie

请求网页，获取请求结果的headers，获取cookie。

python实现

url = "https://fanyi.baidu.com/"
headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Mobile Safari/537.36',
}
res = requests.get(url, headers=headers)
headers["Cookie"] = res.headers["Set-Cookie"]

获取token

通过请求多次接口发现，需要在设置cookie后才会返回token，不然返回的为空值。

对上述设置好cookie的请求再请求一遍，通过正则获取token（后面的js逆向需要获取gtk也在网页上获取，需提取获取到 gtk 的值）

python实现

res = requests.get(url, headers=self.headers)
token = re.findall('token: \'(.*?)\'', res.text, re.S)[0]
gtk = re.findall('gtk: \'(.*?)\'', res.text, re.S)[0]

设置sign

通过全局搜索，在每个出现sign位置打上断点，找到生成sign的函数

1694056286149

sign是由A函数生成，并把要翻译的词作为参数传给A函数，点击A函数跳转到具体位置。

1694056510564

可以看到a函数经过复杂的数据处理，因此不予探究期具体操作，直接使用execjs库，把数据交个js进行处理。

function n(r, o) {
    for (var t = 0; t < o.length - 2; t += 3) {
        var e = o.charAt(t + 2);
        e = e >= "a" ? e.charCodeAt(0) - 87 : Number(e),
        e = "+" === o.charAt(t + 1) ? r >>> e : r << e,
        r = "+" === o.charAt(t) ? r + e & 4294967295 : r ^ e
    }
    return r
}
function e(r) {
    if (Array.isArray(r)) {
        for (var o = 0, t = Array(r.length); o < r.length; o++)
            t[o] = r[o];
        return t
    }
    return Array.from(r)
}


function a(r,gtk) {
    var i = null;
    var t = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === t) {
        var a = r.length;
        a > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(a / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        for (var C = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), h = 0, f = C.length, u = []; f > h; h++)
            "" !== C[h] && u.push.apply(u, e(C[h].split(""))),
            h !== f - 1 && u.push(t[h]);
        var g = u.length;
        g > 30 && (r = u.slice(0, 10).join("") + u.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + u.slice(-10).join(""))
    }
    var l = void 0
        , d = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    l = null !== i ? i : (i = gtk || "") || "";
    for (var m = l.split("."), S = Number(m[0]) || 0, s = Number(m[1]) || 0, c = [], v = 0, F = 0; F < r.length; F++) {
        var p = r.charCodeAt(F);
        128 > p ? c[v++] = p : (2048 > p ? c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)),
            c[v++] = p >> 18 | 240,
            c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224,
            c[v++] = p >> 6 & 63 | 128),
            c[v++] = 63 & p | 128)
    }
    for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++)
        w += c[D],
            w = n(w, A);
    return w = n(w, b),
        w ^= s,
    0 > w && (w = (2147483647 & w) + 2147483648),
        w %= 1e6,
    w.toString() + "." + (w ^ S)
}

经过js处理后的我们就能得到sign加密的数据，python实现

with open("百度.js", 'r', encoding='utf-8') as po:
    signs = execjs.compile(po.read()).call('a', query, self.gtk)

代码封装

根据上述，我们通过对js逆向对百度翻译进行处理得到数据，对上述步骤进行封装整理成类。

import os
import re
import json
import requests

import subprocess
from functools import partial
startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
subprocess.Popen = partial(subprocess.Popen,startupinfo=startupinfo, encoding='utf-8')
import execjs



class BaiDuTranslate():
    def __init__(self):
        path = os.path.dirname(os.path.abspath(__file__))
        self.js_path = os.path.join(path, "百度.js")
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Mobile Safari/537.36',
        }
        self.init_token()

    def init_token(self):
        """
        初始化：获取token,gtk以及cookie
        :return:
        """
        url = "https://fanyi.baidu.com/"
        res = requests.get(url, headers=self.headers)
        self.token = re.findall('token: \'(.*?)\'', res.text, re.S)[0]
        langList = re.findall('langList: ({.*?}),', res.text, re.S)[0]
        langMap = re.findall('langMap: ({.*?}),', res.text, re.S)[0]
        self.gtk = re.findall('gtk: \'(.*?)\'', res.text, re.S)[0]
        self.langDict = json.loads(langList.replace("'",'"'))
        self.langMap = json.loads(langMap.replace("'",'"'))
        self.headers["Cookie"] = res.headers["Set-Cookie"]
        if not self.token:
            res = requests.get(url, headers=self.headers)
            self.token = re.findall('token: \'(.*?)\'', res.text, re.S)[0]


    def translate(self,query,from_=None,to=None):
        """翻译"""
        # 自动检测语言
        if not from_:
            from_ = self.langDetect(query)
            if not to:
                if from_ == "zh":
                    to = 'en'
                else:
                    to = 'zh'
        url = "https://fanyi.baidu.com/basetrans"
        with open(self.js_path, 'r', encoding='utf-8') as po:
            signs = execjs.compile(po.read()).call('a', query, self.gtk)
        data = {
            "query": query,
            "from": from_,
            "to": to,
            "token": self.token,
            "sign": signs
        }
        response = requests.post(url=url, headers=self.headers, data=data)
        text = response.json()

        result_list = [i['dst'].strip() for i in text['trans']]
        res_dict = {
            "result": "\n".join(result_list),
            "CEDict": []
        }
        if text['dict']:
            for mean in text['dict']['symbols'][0]['parts'][0]["means"]:
                if isinstance(mean,dict):
                    if not mean.get("means"):
                        continue
                    res_dict["CEDict"].append({"text":mean['text'],"tran":mean['means'][0]})
        return res_dict



    def langDetect(self,query):
        """语言检测"""
        url = "https://fanyi.baidu.com/langdetect"
        data= {"query":query}
        response = requests.post(url=url, headers=self.headers, data=data)
        text = response.json()
        return text["lan"]


if __name__ == '__main__':
    obj = BaiDuTranslate()
    res = obj.translate("你好")