请求分析
首先,我们打开开发者工具,然后打开Google翻译(translate.google.cn),
我们在过滤器中选择JS,发现第一个js文件就很可疑,先把它的地址记下来。
接着,我们随便输入一些内容,便可以很轻松地抓取到这个请求:
观察GET参数,不用说都能看出来下面几个参数是重要的参数:
我们可以猜测:sl为源语言,tl为目标语言,q为要翻译的内容,tk为某种验证参数。
那么tk参数如何获得呢?
tk参数获取
打开我们最开始发现的那个js文件:
translate.google.cn/translate/releases/twsfe_w_20200622_RC00/r/js/translate_m_zh-CN.js
将其格式化,再经过漫长的搜寻,找到了下面的一串代码:
// Obfuscated token code as found in the translate page's script bundle.
// pu(a): returns a function that always returns `a` (constant wrapper).
var pu = function(a) {
return function() {
return a
}
},
// qu(a, b): applies to `a` the operations encoded in string `b`, three
// characters per operation:
//   char 2 -> shift amount ("a" and above map to charCode - 87, i.e. 'a' = 10;
//             otherwise the character is read as a digit)
//   char 1 -> "+" selects unsigned right shift (>>>), anything else left shift
//   char 0 -> "+" adds the shifted value and masks with 4294967295,
//             anything else XORs the shifted value into `a`
qu = function(a, b) {
for (var c = 0; c < b.length - 2; c += 3) {
var d = b.charAt(c + 2);
d = "a" <= d ? d.charCodeAt(0) - 87 : Number(d);
d = "+" == b.charAt(c + 1) ? a >>> d: a << d;
a = "+" == b.charAt(c) ? a + d & 4294967295 : a ^ d
}
return a
},
// ru: cache for the key string read from `window` on the first call of su().
ru = null,
// su(a): builds the "&tk=..." query fragment for the text `a`.
su = function(a) {
if (null !== ru) var b = ru;
else {
// String.fromCharCode(84) is "T" and (75) is "K"; ["T", "K"].join("K")
// yields "TKK", so this reads window["TKK"] (empty string when absent).
b = pu(String.fromCharCode(84));
var c = pu(String.fromCharCode(75));
b = [b(), b()];
b[1] = c();
b = (ru = window[b.join(c())] || "") || ""
}
// String.fromCharCode(116) is "t" and (107) is "k"; c becomes "&tk=".
var d = pu(String.fromCharCode(116));
c = pu(String.fromCharCode(107));
d = [d(), d()];
d[1] = c();
c = "&" + d.join("") + "=";
// The key has the form "<number>.<number>"; b is the first number (0 if unparsable).
d = b.split(".");
b = Number(d[0]) || 0;
// Convert `a` into UTF-8 byte values in e; a high surrogate followed by a
// low surrogate is combined into one code point and emitted as four bytes.
for (var e = [], f = 0, g = 0; g < a.length; g++) {
var h = a.charCodeAt(g);
128 > h ? e[f++] = h: (2048 > h ? e[f++] = h >> 6 | 192 : (55296 == (h & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (h = 65536 + ((h & 1023) << 10) + (a.charCodeAt(++g) & 1023), e[f++] = h >> 18 | 240, e[f++] = h >> 12 & 63 | 128) : e[f++] = h >> 12 | 224, e[f++] = h >> 6 & 63 | 128), e[f++] = h & 63 | 128)
}
// Hash: start from b, and for every byte add it and mix with qu().
// The comma operator keeps both statements inside the loop body.
a = b;
for (f = 0; f < e.length; f++) a += e[f],
a = qu(a, "+-a^+6");
a = qu(a, "+-3^+b+-f");
// XOR with the second number of the key (0 if missing or unparsable).
a ^= Number(d[1]) || 0;
// Reinterpret a negative 32-bit result as its unsigned value.
0 > a && (a = (a & 2147483647) + 2147483648);
a %= 1E6;
// Result: "&tk=" + a + "." + (a XOR b).
return c + (a.toString() + "." + (a ^ b))
};
很明显,这段代码是一段算法,而且是经过混淆的(变量名很奇怪)。
经过验证,这就是tk参数的加密算法。把不需要的部分去除,稍微整理一下:
/**
 * Applies the operations encoded in `ops` to the number `a`.
 *
 * `ops` is read three characters at a time:
 *   - 1st char: "+" -> add the shifted value and mask with 4294967295,
 *               otherwise XOR the shifted value into `a`;
 *   - 2nd char: "+" -> unsigned right shift (>>>), otherwise left shift (<<);
 *   - 3rd char: shift amount; "a" and above mean charCode - 87 ('a' = 10),
 *               anything else is parsed as a digit.
 *
 * @param {number} a - value to transform
 * @param {string} b - operation string, e.g. "+-a^+6"
 * @returns {number} the transformed value
 */
var qu = function(a, b) {
  for (var c = 0; c < b.length - 2; c += 3) {
    var d = b.charAt(c + 2);
    d = "a" <= d ? d.charCodeAt(0) - 87 : Number(d);
    d = "+" === b.charAt(c + 1) ? a >>> d : a << d;
    a = "+" === b.charAt(c) ? (a + d) & 4294967295 : a ^ d;
  }
  return a;
};

/**
 * Computes the `tk` request parameter for a piece of text.
 *
 * All working variables are declared locally; the earlier version assigned
 * `d` and `b` without declaring them, which created globals in sloppy mode
 * and threw a ReferenceError in strict mode / ES modules.
 *
 * @param {string} a - text to be translated
 * @param {string} tkk - key of the form "<number>.<number>"
 * @returns {string} token of the form "<n>.<m>"
 */
var su = function(a, tkk) {
  var parts = tkk.split(".");
  var seed = Number(parts[0]) || 0;

  // Encode the text as UTF-8 byte values. A high surrogate followed by a
  // low surrogate is merged into one code point and emitted as four bytes.
  var bytes = [];
  for (var i = 0; i < a.length; i++) {
    var code = a.charCodeAt(i);
    if (code < 128) {
      bytes.push(code);
    } else if (code < 2048) {
      bytes.push(code >> 6 | 192, code & 63 | 128);
    } else if (55296 === (code & 64512) && i + 1 < a.length &&
        56320 === (a.charCodeAt(i + 1) & 64512)) {
      code = 65536 + ((code & 1023) << 10) + (a.charCodeAt(++i) & 1023);
      bytes.push(code >> 18 | 240, code >> 12 & 63 | 128,
          code >> 6 & 63 | 128, code & 63 | 128);
    } else {
      bytes.push(code >> 12 | 224, code >> 6 & 63 | 128, code & 63 | 128);
    }
  }

  // Mix every byte into the hash, then apply the final mixing round.
  var hash = seed;
  for (var j = 0; j < bytes.length; j++) {
    hash += bytes[j];
    hash = qu(hash, "+-a^+6");
  }
  hash = qu(hash, "+-3^+b+-f");
  hash ^= Number(parts[1]) || 0;

  // Reinterpret a negative 32-bit result as its unsigned value.
  if (hash < 0) {
    hash = (hash & 2147483647) + 2147483648;
  }
  hash %= 1E6;
  return hash.toString() + "." + (hash ^ seed);
};
这里的tkk可以在网页源代码里找到,如下图:
execjs库的调用
接下来,我们用python的execjs库进行测试,如果没有安装请使用pip install PyExecJS
进行安装。
import execjs
# Compile the cleaned-up token algorithm. `d` and `b` inside `su` are declared
# with `var`; without the declaration they would leak into the JS global scope
# (and fail outright in a strict-mode runtime).
a = execjs.compile("""
var qu = function(a, b) {
    for (var c = 0; c < b.length - 2; c += 3) {
        var d = b.charAt(c + 2);
        d = "a" <= d ? d.charCodeAt(0) - 87 : Number(d);
        d = "+" == b.charAt(c + 1) ? a >>> d: a << d;
        a = "+" == b.charAt(c) ? a + d & 4294967295 : a ^ d
    }
    return a
},
su = function(a, tkk) {
    var d = tkk.split(".");
    var b = Number(d[0]) || 0;
    for (var e = [], f = 0, g = 0; g < a.length; g++) {
        var h = a.charCodeAt(g);
        128 > h ? e[f++] = h: (2048 > h ? e[f++] = h >> 6 | 192 : (55296 == (h & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (h = 65536 + ((h & 1023) << 10) + (a.charCodeAt(++g) & 1023), e[f++] = h >> 18 | 240, e[f++] = h >> 12 & 63 | 128) : e[f++] = h >> 12 | 224, e[f++] = h >> 6 & 63 | 128), e[f++] = h & 63 | 128)
    }
    a = b;
    for (f = 0; f < e.length; f++) a += e[f],
    a = qu(a, "+-a^+6");
    a = qu(a, "+-3^+b+-f");
    a ^= Number(d[1]) || 0;
    0 > a && (a = (a & 2147483647) + 2147483648);
    a %= 1E6;
    return (a.toString() + "." + (a ^ b))
};
""")
print(a.call("su", 'test', '442788.2585626513'))  # compute the tk value
我们将运行得到的tk代入那个GET请求中,发现仍然可以正常得到返回的json,说明tk获取成功。
于是我们就可以通过修改GET请求的参数实现获取翻译结果了。
python版算法
execjs的性能并不是很好,执行时间会很长。因此我们采用另一种方法:将js转换成python代码。
转换后大致代码如下:
def getGoogleToken(a, TKK):
    """Compute the ``tk`` request parameter for the text ``a``.

    Python port of the JavaScript token routine. The text is converted to
    UTF-8 byte values, each byte is mixed into a 32-bit hash seeded with the
    first number of ``TKK``, and the result is XOR-ed with the second number.

    Args:
        a: text to be translated.
        TKK: key of the form ``"<number>.<number>"``. Missing or non-numeric
            parts count as 0, mirroring ``Number(x) || 0`` in the JavaScript.

    Returns:
        The token as a string of the form ``"<n>.<m>"``.
    """
    def RL(a, b):
        # `b` encodes one operation per three characters:
        # [add-or-xor][shift direction][shift amount].
        for d in range(0, len(b) - 2, 3):
            c = b[d + 2]
            c = ord(c[0]) - 87 if 'a' <= c else int(c)
            c = a >> c if '+' == b[d + 1] else a << c
            a = a + c & 4294967295 if '+' == b[d] else a ^ c
        return a

    def to_int(s):
        # int('') raises, whereas the JavaScript original falls back to 0.
        try:
            return int(s)
        except ValueError:
            return 0

    g = []
    f = 0
    while f < len(a):
        c = ord(a[f])
        # A surrogate pair stored as two separate items is merged first.
        if (55296 == (c & 64512)) and (f + 1 < len(a)) and (56320 == (ord(a[f + 1]) & 64512)):
            f += 1
            c = 65536 + ((c & 1023) << 10) + (ord(a[f]) & 1023)
        if c < 128:
            g.append(c)
        elif c < 2048:
            g.append((c >> 6) | 192)
            g.append((c & 63) | 128)
        elif c < 65536:
            g.append((c >> 12) | 224)
            g.append((c >> 6) & 63 | 128)
            g.append((c & 63) | 128)
        else:
            # Python strings hold whole code points, so characters above
            # U+FFFF arrive here directly and need the 4-byte form.
            g.append((c >> 18) | 240)
            g.append((c >> 12) & 63 | 128)
            g.append((c >> 6) & 63 | 128)
            g.append((c & 63) | 128)
        f += 1

    e = TKK.split('.')
    h = to_int(e[0])
    t = h
    for item in g:
        t += item
        t = RL(t, '+-a^+6')
    t = RL(t, '+-3^+b+-f')
    # The last RL step masks t to 32 bits; masking the key as well keeps t
    # in [0, 2**32), which is the unsigned value the JavaScript code derives.
    t ^= (to_int(e[1]) if len(e) > 1 else 0) & 4294967295
    result = t % 1000000
    return str(result) + '.' + str(result ^ h)
最终代码
import requests,re,json,time
class GoogleTranslator():
    """Small client for the Google Translate web endpoint ``/translate_a/single``.

    On construction the translate home page is fetched to obtain the ``tkk``
    key; ``translate()`` derives the per-request ``tk`` token from that key
    and posts the text. Network failures are printed and result in an empty
    string rather than an exception.
    """
    _host = 'translate.google.cn'
    _headers = {
        'Host': _host,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        # 'br' is not advertised: requests can only decode Brotli when an
        # optional Brotli package is installed.
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
        'Referer': 'https://' + _host,
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0'
    }
    _url = 'https://' + _host + '/translate_a/single'
    # Apart from the key parameters, the values were copied from a captured
    # browser request.
    _params = {
        'client': 'webapp',
        'sl': 'en',
        'tl': 'zh-CN',
        'hl': 'zh-CN',
        # The literal used to repeat the 'dt' key ten times; a dict keeps only
        # the last value, so 't' is what was actually sent.
        'dt': 't',
        'otf': '1',
        'ssel': '0',
        'tsel': '0',
        'kc': '1'
    }
    __cookies = None
    __googleTokenKey = ''
    __googleTokenKeyUpdataTime = 600.0
    __googleTokenKeyRetireTime = time.time() + 600.0

    def __init__(self, src = 'auto', dest = 'zh-CN', tkkUpdataTime = 600.0):
        """Create a translator.

        Args:
            src: source language code ('auto' lets the server detect it).
            dest: target language code.
            tkkUpdataTime: seconds after which the tkk key is fetched again.
        """
        # Work on a per-instance copy so that instances with different
        # languages do not overwrite each other's settings.
        self._params = dict(self._params)
        self._params['sl'] = src
        self._params['tl'] = dest
        # Stored under the private name that __updateGoogleTokenKey reads;
        # the public attribute is kept for existing readers.
        self.__googleTokenKeyUpdataTime = tkkUpdataTime
        self.googleTokenKeyUpdataTime = tkkUpdataTime
        self.__updateGoogleTokenKey()

    def __updateGoogleTokenKey(self):
        """Fetch a fresh tkk key and schedule its next refresh."""
        self.__googleTokenKey = self.__getGoogleTokenKey()
        self.__googleTokenKeyRetireTime = time.time() + self.__googleTokenKeyUpdataTime

    def __getGoogleTokenKey(self):
        """Download the home page and extract the tkk key.

        Returns:
            The key as ``"<number>.<number>"``, or '' when the request fails
            or the page does not contain a key.
        """
        result = ''
        try:
            res = requests.get('https://' + self._host, timeout = 3)
            res.raise_for_status()
            self.__cookies = res.cookies
            # The key is read from the page source; a fixed value would also do.
            match = re.search(r"tkk:'(\d+\.\d+)'", res.text)
            if match:
                result = match.group(1)
            else:
                print('ERROR: tkk not found in page source')
        except requests.exceptions.RequestException as ex:
            print('ERROR: ' + str(ex))
            time.sleep(1)
        return result

    def __getGoogleToken(self, a, TKK):
        """Compute the ``tk`` token for text ``a`` from the key ``TKK``.

        Missing or non-numeric key parts count as 0, mirroring
        ``Number(x) || 0`` in the JavaScript original.
        """
        def RL(a, b):
            # `b` encodes one operation per three characters:
            # [add-or-xor][shift direction][shift amount].
            for d in range(0, len(b) - 2, 3):
                c = b[d + 2]
                c = ord(c[0]) - 87 if 'a' <= c else int(c)
                c = a >> c if '+' == b[d + 1] else a << c
                a = a + c & 4294967295 if '+' == b[d] else a ^ c
            return a

        def to_int(s):
            try:
                return int(s)
            except ValueError:
                return 0

        g = []
        f = 0
        while f < len(a):
            c = ord(a[f])
            # A surrogate pair stored as two separate items is merged first.
            if (55296 == (c & 64512)) and (f + 1 < len(a)) and (56320 == (ord(a[f + 1]) & 64512)):
                f += 1
                c = 65536 + ((c & 1023) << 10) + (ord(a[f]) & 1023)
            if c < 128:
                g.append(c)
            elif c < 2048:
                g.append((c >> 6) | 192)
                g.append((c & 63) | 128)
            elif c < 65536:
                g.append((c >> 12) | 224)
                g.append((c >> 6) & 63 | 128)
                g.append((c & 63) | 128)
            else:
                # Python strings hold whole code points, so characters above
                # U+FFFF arrive here directly and need the 4-byte form.
                g.append((c >> 18) | 240)
                g.append((c >> 12) & 63 | 128)
                g.append((c >> 6) & 63 | 128)
                g.append((c & 63) | 128)
            f += 1

        e = TKK.split('.')
        h = to_int(e[0])
        t = h
        for item in g:
            t += item
            t = RL(t, '+-a^+6')
        t = RL(t, '+-3^+b+-f')
        # The last RL step masks t to 32 bits; masking the key as well keeps
        # t in [0, 2**32), the unsigned value the JavaScript code derives.
        t ^= (to_int(e[1]) if len(e) > 1 else 0) & 4294967295
        result = t % 1000000
        return str(result) + '.' + str(result ^ h)

    def translate(self, text):
        """Translate ``text`` and return the translated string.

        Returns '' when the request fails or the response has no translation.
        """
        # Refresh the key when it has expired or an earlier fetch failed.
        if not self.__googleTokenKey or time.time() > self.__googleTokenKeyRetireTime:
            self.__updateGoogleTokenKey()
        params = dict(self._params)
        params['tk'] = self.__getGoogleToken(text, self.__googleTokenKey)
        try:
            res = requests.post(self._url,
                                headers = self._headers,
                                cookies = self.__cookies,
                                data = {'q': text},
                                params = params,
                                timeout = 6)
            res.raise_for_status()
            jsonText = res.text
            if not jsonText:
                return ''
            jsonResult = json.loads(jsonText)
            # The first element lists the sentence entries; entries whose
            # first field is empty or null carry no translated text.
            sentences = jsonResult[0] or []
            return ''.join(item[0] for item in sentences if item and item[0])
        except Exception as ex:
            print('ERROR: ' + str(ex))
            return ''
def Translate(text, to):
    """Translate ``text`` into the language code ``to``.

    A new GoogleTranslator is created for every call; the source language
    is left at the translator's default ('auto').
    """
    return GoogleTranslator(dest=to).translate(text)
# Run the interactive demo only when executed as a script, so that importing
# this module does not prompt for input or touch the network.
if __name__ == '__main__':
    print(Translate(input(': '), 'zh-CN'))  # auto-detect the source language, translate to Chinese
以上代码只是获取了翻译结果,在返回的json中其实还包含了很多其它的信息,可以自行扩展此代码。
最后,附上语言对应的代码(sl和tl所用):
source_code_name: [{
code: 'auto',
name: '自动检测语言'
},
{
code: 'sq',
name: '阿尔巴尼亚语'
},
{
code: 'ar',
name: '阿拉伯语'
},
{
code: 'am',
name: '阿姆哈拉语'
},
{
code: 'az',
name: '阿塞拜疆语'
},
{
code: 'ga',
name: '爱尔兰语'
},
{
code: 'et',
name: '爱沙尼亚语'
},
{
code: 'or',
name: '奥里亚语(奥里亚文)'
},
{
code: 'eu',
name: '巴斯克语'
},
{
code: 'be',
name: '白俄罗斯语'
},
{
code: 'bg',
name: '保加利亚语'
},
{
code: 'is',
name: '冰岛语'
},
{
code: 'pl',
name: '波兰语'
},
{
code: 'bs',
name: '波斯尼亚语'
},
{
code: 'fa',
name: '波斯语'
},
{
code: 'af',
name: '布尔语(南非荷兰语)'
},
{
code: 'tt',
name: '鞑靼语'
},
{
code: 'da',
name: '丹麦语'
},
{
code: 'de',
name: '德语'
},
{
code: 'ru',
name: '俄语'
},
{
code: 'fr',
name: '法语'
},
{
code: 'tl',
name: '菲律宾语'
},
{
code: 'fi',
name: '芬兰语'
},
{
code: 'fy',
name: '弗里西语'
},
{
code: 'km',
name: '高棉语'
},
{
code: 'ka',
name: '格鲁吉亚语'
},
{
code: 'gu',
name: '古吉拉特语'
},
{
code: 'kk',
name: '哈萨克语'
},
{
code: 'ht',
name: '海地克里奥尔语'
},
{
code: 'ko',
name: '韩语'
},
{
code: 'ha',
name: '豪萨语'
},
{
code: 'nl',
name: '荷兰语'
},
{
code: 'ky',
name: '吉尔吉斯语'
},
{
code: 'gl',
name: '加利西亚语'
},
{
code: 'ca',
name: '加泰罗尼亚语'
},
{
code: 'cs',
name: '捷克语'
},
{
code: 'kn',
name: '卡纳达语'
},
{
code: 'co',
name: '科西嘉语'
},
{
code: 'hr',
name: '克罗地亚语'
},
{
code: 'ku',
name: '库尔德语'
},
{
code: 'la',
name: '拉丁语'
},
{
code: 'lv',
name: '拉脱维亚语'
},
{
code: 'lo',
name: '老挝语'
},
{
code: 'lt',
name: '立陶宛语'
},
{
code: 'lb',
name: '卢森堡语'
},
{
code: 'rw',
name: '卢旺达语'
},
{
code: 'ro',
name: '罗马尼亚语'
},
{
code: 'mg',
name: '马尔加什语'
},
{
code: 'mt',
name: '马耳他语'
},
{
code: 'mr',
name: '马拉地语'
},
{
code: 'ml',
name: '马拉雅拉姆语'
},
{
code: 'ms',
name: '马来语'
},
{
code: 'mk',
name: '马其顿语'
},
{
code: 'mi',
name: '毛利语'
},
{
code: 'mn',
name: '蒙古语'
},
{
code: 'bn',
name: '孟加拉语'
},
{
code: 'my',
name: '缅甸语'
},
{
code: 'hmn',
name: '苗语'
},
{
code: 'xh',
name: '南非科萨语'
},
{
code: 'zu',
name: '南非祖鲁语'
},
{
code: 'ne',
name: '尼泊尔语'
},
{
code: 'no',
name: '挪威语'
},
{
code: 'pa',
name: '旁遮普语'
},
{
code: 'pt',
name: '葡萄牙语'
},
{
code: 'ps',
name: '普什图语'
},
{
code: 'ny',
name: '齐切瓦语'
},
{
code: 'ja',
name: '日语'
},
{
code: 'sv',
name: '瑞典语'
},
{
code: 'sm',
name: '萨摩亚语'
},
{
code: 'sr',
name: '塞尔维亚语'
},
{
code: 'st',
name: '塞索托语'
},
{
code: 'si',
name: '僧伽罗语'
},
{
code: 'eo',
name: '世界语'
},
{
code: 'sk',
name: '斯洛伐克语'
},
{
code: 'sl',
name: '斯洛文尼亚语'
},
{
code: 'sw',
name: '斯瓦希里语'
},
{
code: 'gd',
name: '苏格兰盖尔语'
},
{
code: 'ceb',
name: '宿务语'
},
{
code: 'so',
name: '索马里语'
},
{
code: 'tg',
name: '塔吉克语'
},
{
code: 'te',
name: '泰卢固语'
},
{
code: 'ta',
name: '泰米尔语'
},
{
code: 'th',
name: '泰语'
},
{
code: 'tr',
name: '土耳其语'
},
{
code: 'tk',
name: '土库曼语'
},
{
code: 'cy',
name: '威尔士语'
},
{
code: 'ug',
name: '维吾尔语'
},
{
code: 'ur',
name: '乌尔都语'
},
{
code: 'uk',
name: '乌克兰语'
},
{
code: 'uz',
name: '乌兹别克语'
},
{
code: 'es',
name: '西班牙语'
},
{
code: 'iw',
name: '希伯来语'
},
{
code: 'el',
name: '希腊语'
},
{
code: 'haw',
name: '夏威夷语'
},
{
code: 'sd',
name: '信德语'
},
{
code: 'hu',
name: '匈牙利语'
},
{
code: 'sn',
name: '修纳语'
},
{
code: 'hy',
name: '亚美尼亚语'
},
{
code: 'ig',
name: '伊博语'
},
{
code: 'it',
name: '意大利语'
},
{
code: 'yi',
name: '意第绪语'
},
{
code: 'hi',
name: '印地语'
},
{
code: 'su',
name: '印尼巽他语'
},
{
code: 'id',
name: '印尼语'
},
{
code: 'jw',
name: '印尼爪哇语'
},
{
code: 'en',
name: '英语'
},
{
code: 'yo',
name: '约鲁巴语'
},
{
code: 'vi',
name: '越南语'
},
{
code: 'zh-CN',
name: '中文'
}]