爬虫:爬取海词的翻译内容
在爬取海词的时候遇到了一个问题,在异步加载的时候,需要一个t值,但是这个t值是js加载出来的,直接拼接的,我们无法从网页中得到;
当在搜索框输入单词的时候:你在干嘛
替换下图中的page的值就能达到翻页的目的:
那么当前的目的就是要能够找到这段js代码,同时获取对应输入的t的值,来重新组合url
真正的url只需要如下内容:
我提前把关键字和t都处理了,写成了字典的形式,
key:你在干嘛 ff[key]:WuqarCRs
{"你在干嘛": "WuqarCRs"}  # 提前处理成了这种字典形式(关键字 -> t 值),方便提取
url = "http://fuzz.dict.cn/dict/api.php?&action=fuzz&from=jsonp&q=" + key + "&t=" + ff[key] + "&page="
那么关键部分来了,我是如何获取T的呢。
大概思路:在本地搭建一个服务器,把每个词依次发送给这段js代码进行计算,然后把每个词对应的返回结果保存起来。
处理过程的代码:
第一步:先找到那段js代码,里面是如何把输入的文字转换成8位字符串的算法
第二步:先安装node.js,然后把这段js代码提取出来,改写成可以在node.js里运行的服务端代码;如果不改写,直接在浏览器里访问是无法触发这段js执行的。
下面是node.js的代码,先执行node.js代码
var http = require('http');
var util = require('util');

/**
 * Computes the 8-character `t` token that dict.cn expects for a search word.
 *
 * The word is UTF-8 encoded by hand, the literal suffix "PaSs" (plus
 * `global.dict_pagetoken`, when that is set) is appended, and the result is
 * hashed. The hash appears to be a standard MD5 digest (same initial state and
 * round structure) -- TODO confirm against a reference implementation.
 * Only the first two bytes of the F, E and D state words are used; they are
 * split into four 12-bit values and each one is rendered as two characters
 * by `K`.
 *
 * The arithmetic below is the site's minified code, reformatted only; the
 * debug writes to stdout and the unreachable `console.log` were removed.
 *
 * @param {string} J - The search word exactly as typed by the user.
 * @returns {string} The 8-character token.
 */
function dictCrypto(J) {
    // 32-bit unsigned addition carried out without overflowing into a float.
    function r(g, f) {
        var e, d, a, b, c;
        a = g & 2147483648;
        b = f & 2147483648;
        e = g & 1073741824;
        d = f & 1073741824;
        c = (g & 1073741823) + (f & 1073741823);
        if (e & d) {
            return c ^ 2147483648 ^ a ^ b;
        }
        return e | d ? c & 1073741824 ? c ^ 3221225472 ^ a ^ b : c ^ 1073741824 ^ a ^ b : c ^ a ^ b;
    }

    // The four round functions: mix, rotate left by b bits, add f.
    function I(g, f, e, d, a, b, c) {
        g = r(g, r(r(f & e | ~f & d, a), c));
        return r(g << b | g >>> 32 - b, f);
    }

    function s(g, f, e, d, a, b, c) {
        g = r(g, r(r(f & d | e & ~d, a), c));
        return r(g << b | g >>> 32 - b, f);
    }

    function w(g, f, e, d, a, b, c) {
        g = r(g, r(r(f ^ e ^ d, a), c));
        return r(g << b | g >>> 32 - b, f);
    }

    function v(g, f, e, d, a, b, c) {
        g = r(g, r(r(e ^ (f | ~d), a), c));
        return r(g << b | g >>> 32 - b, f);
    }

    // Renders a number as its two lowest base-64 digits, least significant
    // first, left-padded with "+" (the digit for zero).
    function K(c) {
        for (var b = "++"; c > 0;) {
            var a = c % 64;
            b += a == 0 ? "+" : a == 1 ? "-" : a > 1 && a < 12 ? String.fromCharCode(a + 46) : a > 11 && a < 38 ? String.fromCharCode(a + 54) : String.fromCharCode(a + 59);
            c = (c - a) / 64;
        }
        return b.substr(b.length - 2, 2);
    }

    // Renders a 32-bit word as 8 hex digits, lowest byte first.
    function H(d) {
        var c = "", b = "", a;
        for (a = 0; a <= 3; a++) {
            b = d >>> a * 8 & 255;
            b = "0" + b.toString(16);
            c += b.substr(b.length - 2, 2);
        }
        return c;
    }

    var x = [], G, L, q, p, F, E, D, C;

    // Normalise CRLF to LF, encode each UTF-16 code unit as UTF-8 bytes
    // (one character per byte), then append the salt.
    J = function(d) {
        d = d.replace(/\r\n/g, "\n");
        for (var c = "", b = 0; b < d.length; b++) {
            var a = d.charCodeAt(b);
            if (a < 128) {
                c += String.fromCharCode(a);
            } else {
                if (a > 127 && a < 2048) {
                    c += String.fromCharCode(a >> 6 | 192);
                } else {
                    c += String.fromCharCode(a >> 12 | 224);
                    c += String.fromCharCode(a >> 6 & 63 | 128);
                }
                c += String.fromCharCode(a & 63 | 128);
            }
        }
        c += String.fromCharCode(80, 97, 83, 115); // "PaSs"
        // Nothing in this script sets dict_pagetoken, so it is skipped unless
        // the caller defines it on `global` first.
        if (global.dict_pagetoken) {
            c += global.dict_pagetoken;
        }
        return c;
    }(J);

    // Pack the bytes into little-endian 32-bit words, add the 0x80 end marker
    // and the bit length, padded to a multiple of 16 words.
    x = function(g) {
        var f, e = g.length;
        f = e + 8;
        for (var d = ((f - f % 64) / 64 + 1) * 16, a = Array(d - 1), b = 0, c = 0; c < e;) {
            f = (c - c % 4) / 4;
            b = c % 4 * 8;
            a[f] |= g.charCodeAt(c) << b;
            c++;
        }
        a[(c - c % 4) / 4] |= 128 << c % 4 * 8;
        a[d - 2] = e << 3;
        a[d - 1] = e >>> 29;
        return a;
    }(J);

    F = 1732584193;
    E = 4023233417;
    D = 2562383102;
    C = 271733878;
    for (J = 0; J < x.length; J += 16) {
        G = F;
        L = E;
        q = D;
        p = C;
        F = I(F, E, D, C, x[J + 0], 7, 3614090360);
        C = I(C, F, E, D, x[J + 1], 12, 3905402710);
        D = I(D, C, F, E, x[J + 2], 17, 606105819);
        E = I(E, D, C, F, x[J + 3], 22, 3250441966);
        F = I(F, E, D, C, x[J + 4], 7, 4118548399);
        C = I(C, F, E, D, x[J + 5], 12, 1200080426);
        D = I(D, C, F, E, x[J + 6], 17, 2821735955);
        E = I(E, D, C, F, x[J + 7], 22, 4249261313);
        F = I(F, E, D, C, x[J + 8], 7, 1770035416);
        C = I(C, F, E, D, x[J + 9], 12, 2336552879);
        D = I(D, C, F, E, x[J + 10], 17, 4294925233);
        E = I(E, D, C, F, x[J + 11], 22, 2304563134);
        F = I(F, E, D, C, x[J + 12], 7, 1804603682);
        C = I(C, F, E, D, x[J + 13], 12, 4254626195);
        D = I(D, C, F, E, x[J + 14], 17, 2792965006);
        E = I(E, D, C, F, x[J + 15], 22, 1236535329);
        F = s(F, E, D, C, x[J + 1], 5, 4129170786);
        C = s(C, F, E, D, x[J + 6], 9, 3225465664);
        D = s(D, C, F, E, x[J + 11], 14, 643717713);
        E = s(E, D, C, F, x[J + 0], 20, 3921069994);
        F = s(F, E, D, C, x[J + 5], 5, 3593408605);
        C = s(C, F, E, D, x[J + 10], 9, 38016083);
        D = s(D, C, F, E, x[J + 15], 14, 3634488961);
        E = s(E, D, C, F, x[J + 4], 20, 3889429448);
        F = s(F, E, D, C, x[J + 9], 5, 568446438);
        C = s(C, F, E, D, x[J + 14], 9, 3275163606);
        D = s(D, C, F, E, x[J + 3], 14, 4107603335);
        E = s(E, D, C, F, x[J + 8], 20, 1163531501);
        F = s(F, E, D, C, x[J + 13], 5, 2850285829);
        C = s(C, F, E, D, x[J + 2], 9, 4243563512);
        D = s(D, C, F, E, x[J + 7], 14, 1735328473);
        E = s(E, D, C, F, x[J + 12], 20, 2368359562);
        F = w(F, E, D, C, x[J + 5], 4, 4294588738);
        C = w(C, F, E, D, x[J + 8], 11, 2272392833);
        D = w(D, C, F, E, x[J + 11], 16, 1839030562);
        E = w(E, D, C, F, x[J + 14], 23, 4259657740);
        F = w(F, E, D, C, x[J + 1], 4, 2763975236);
        C = w(C, F, E, D, x[J + 4], 11, 1272893353);
        D = w(D, C, F, E, x[J + 7], 16, 4139469664);
        E = w(E, D, C, F, x[J + 10], 23, 3200236656);
        F = w(F, E, D, C, x[J + 13], 4, 681279174);
        C = w(C, F, E, D, x[J + 0], 11, 3936430074);
        D = w(D, C, F, E, x[J + 3], 16, 3572445317);
        E = w(E, D, C, F, x[J + 6], 23, 76029189);
        F = w(F, E, D, C, x[J + 9], 4, 3654602809);
        C = w(C, F, E, D, x[J + 12], 11, 3873151461);
        D = w(D, C, F, E, x[J + 15], 16, 530742520);
        E = w(E, D, C, F, x[J + 2], 23, 3299628645);
        F = v(F, E, D, C, x[J + 0], 6, 4096336452);
        C = v(C, F, E, D, x[J + 7], 10, 1126891415);
        D = v(D, C, F, E, x[J + 14], 15, 2878612391);
        E = v(E, D, C, F, x[J + 5], 21, 4237533241);
        F = v(F, E, D, C, x[J + 12], 6, 1700485571);
        C = v(C, F, E, D, x[J + 3], 10, 2399980690);
        D = v(D, C, F, E, x[J + 10], 15, 4293915773);
        E = v(E, D, C, F, x[J + 1], 21, 2240044497);
        F = v(F, E, D, C, x[J + 8], 6, 1873313359);
        C = v(C, F, E, D, x[J + 15], 10, 4264355552);
        D = v(D, C, F, E, x[J + 6], 15, 2734768916);
        E = v(E, D, C, F, x[J + 13], 21, 1309151649);
        F = v(F, E, D, C, x[J + 4], 6, 4149444226);
        C = v(C, F, E, D, x[J + 11], 10, 3174756917);
        D = v(D, C, F, E, x[J + 2], 15, 718787259);
        E = v(E, D, C, F, x[J + 9], 21, 3951481745);
        F = r(F, G);
        E = r(E, L);
        D = r(D, q);
        C = r(C, p);
    }

    // 12 hex digits -> four 12-bit numbers -> four 2-character groups.
    return function(d) {
        var c = parseInt("0x" + d.substr(0, 3), 16),
            b = parseInt("0x" + d.substr(3, 3), 16),
            a = parseInt("0x" + d.substr(6, 3), 16);
        d = parseInt("0x" + d.substr(9, 3), 16);
        return K(c) + K(b) + K(a) + K(d);
    }(H(F).substr(0, 4) + H(E).substr(0, 4) + H(D).substr(0, 4));
}

/**
 * Builds the response text for one request body.
 *
 * The text keeps the `{ <token>: '' }` shape that clients extract the token
 * from. The token is used as an object key directly: passing it through
 * `querystring.parse` would decode "+" (a valid token character) to a space.
 *
 * @param {string} word - The decoded request body (the word to look up).
 * @returns {string} `util.inspect` output of `{ [token]: '' }`, or of `{}` for
 *     an empty body.
 */
function buildResponse(word) {
    var result = {};
    if (word !== '') {
        result[dictCrypto(word)] = '';
    }
    return util.inspect(result);
}

// The request body is the raw word (e.g. "你好"). Chunks are collected and
// decoded once on 'end' so that a multi-byte character split across two chunks
// is not corrupted and exactly one token is produced per request.
var server = http.createServer(function(req, res) {
    var chunks = [];

    req.on('data', function(chunk) {
        chunks.push(chunk);
    });

    req.on('end', function() {
        var word = Buffer.concat(chunks).toString('utf8');
        process.stdout.write(word + '\n');
        res.end(buildResponse(word));
    });
});

server.listen(8888);
console.log('Server running at http://127.0.0.1:8888/');
第三步:正常的python代码,去访问本地的服务器,直接把转换完的数据存储到本地
#! /usr/bin/env python #coding: utf-8 import re import os import requests import sys import json reload(sys) sys.setdefaultencoding('utf-8') path = "D:\\106_data\\juhai_data\\" ff = open(path + "answer_1.txt",'a') f = open("data_1.dict") #这个是你的词典,按照行来访问词典 tt = {} i = 1 j = 1 s = requests.session() s.keep_alive = False while 1: word = f.readline() if not word: ans = json.dumps(tt) ff.write(ans) break print word, if (i%100000 == 0):#一万个词存储一次,存的格式为字典 j = j + 1 ans = json.dumps(tt) ff.write(ans) ff.close() ff = open(path + "answer_" +str(j) + ".txt",'a') tt = {} word = word.strip('\n') html = requests.post("http://127.0.0.1:8888/",data =word,headers={'Connection':'close'}) print html.text xx = re.search("{ (.*?): '' }",html.text,re.S)#用到了正则去提取内容 try: xx = xx.group(1) xx = xx.strip("'") except: continue tt[word] = xx print xx i = i+1 s = requests.session() s.keep_alive = False ff.close() f.close()