爬虫终阶上
1.执行JS代码
假如在逆向分析时,发现某个js加密算法比较繁琐,用Python还原同样的算法比较费劲。此时,可以不必使用Python还原,而是利用Python去直接调用JavaScript中定义的功能。
想实现Python调用JavaScript代码,需如下步骤:
- 在电脑上安装node.js(软件)
- 安装Python的第三方模块pyexecjs
- 利用 pyexecjs 调用 nodejs 去执行JavaScript代码
方式1
/**
 * Append the fixed suffix 'i666' to the given value.
 * @param {string} arg - value to be suffixed
 * @returns {string} arg followed by 'i666'
 */
function func(arg) {
    return arg + 'i666';
}

// process.argv[0] is the node executable and process.argv[1] is the script
// path; the first user-supplied argument ("zbb" in `node demo.js "zbb"`)
// is found at index 2.
var a1 = process.argv[2];
var data = func(a1);
console.log(data);
import subprocess

# Run demo.js under node and capture what it prints. The command is passed
# as an argument list (no shell=True), so no shell quoting is involved.
res = subprocess.check_output(["node", "demo.js", "zbb"])
data_string = res.decode('utf-8')
print(data_string)
方式2
pip3.11 install pyexecjs
import execjs
# JavaScript source to evaluate; it defines a single function `func`.
js_string = """
function func(arg) {
return arg + '666';
}
"""

# Compile the source, call `func` with "zbb" and print what it returns.
sign = execjs.compile(js_string).call("func", "zbb")
print(sign)
2.案例xx
import execjs
import requests
import ddddocr
from bs4 import BeautifulSoup
from lxml import etree
# Step 1: open the home page through a session, so cookies set by the
# server are sent again on the following requests.
s = requests.session()
res = s.get(url="https://xuexi.chinabett.com/")

# Step 2: read the captcha image path (the `src` of the element with
# id="imgVerifity") out of the returned HTML.
code_src = etree.HTML(res.text).xpath('//*[@id="imgVerifity"]/@src')[0]

# Step 3: download the captcha image and recognise its text with ddddocr.
res = s.get(url=f"https://xuexi.chinabett.com{code_src}")
code = ddddocr.DdddOcr(show_ad=False).classification(res.content)
# 4. Prepare the username & password
# The JavaScript below is evaluated through execjs and provides three functions:
#   base64encode(str) - Base64-encodes str three characters at a time, padding
#                       the output with "=" / "==" when the input length is not
#                       a multiple of three
#   s1()              - returns one random character out of 0-9, A-Z, a-z
#   encryptPwd(pwd)   - inserts an s1() character after every character of pwd
#                       except the last one
js_string = """
function base64encode(str) {
var base64EncodeChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
var base64DecodeChars = new Array(
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1);
var out, i, len;
var c1, c2, c3;
len = str.length;
i = 0;
out = "";
while (i < len) {
c1 = str.charCodeAt(i++) & 0xff;
if (i == len) {
out += base64EncodeChars.charAt(c1 >> 2);
out += base64EncodeChars.charAt((c1 & 0x3) << 4);
out += "==";
break;
}
c2 = str.charCodeAt(i++);
if (i == len) {
out += base64EncodeChars.charAt(c1 >> 2);
out += base64EncodeChars.charAt(((c1 & 0x3) << 4) | ((c2 & 0xF0) >> 4));
out += base64EncodeChars.charAt((c2 & 0xF) << 2);
out += "=";
break;
}
c3 = str.charCodeAt(i++);
out += base64EncodeChars.charAt(c1 >> 2);
out += base64EncodeChars.charAt(((c1 & 0x3) << 4) | ((c2 & 0xF0) >> 4));
out += base64EncodeChars.charAt(((c2 & 0xF) << 2) | ((c3 & 0xC0) >> 6));
out += base64EncodeChars.charAt(c3 & 0x3F);
}
return out;
};
function s1() {
var data = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"];
var r = Math.floor(Math.random() * 62);
return data[r];
}
function encryptPwd(password){
//base64编码的密码每隔1位插入一个随机数 最后一位后面不插入
var newPwd = [];
var pwdlength = password.length;
for (i = 0; i < pwdlength; i++) {
newPwd.push(password[i]);
if (i < pwdlength - 1)
newPwd.push(s1());
}
var res = newPwd.join('');
return res;
}
"""
# Compile the script once; JS.call(name, *args) then invokes its functions.
JS = execjs.compile(js_string)
# Username: sent Base64-encoded.
username = JS.call("base64encode", "21321323")

# Password: Base64-encode it first, then let encryptPwd interleave random
# characters between the encoded characters.
password = JS.call("encryptPwd", JS.call("base64encode", "123"))

# Step 5: submit the login form together with the recognised captcha text.
login_form = {
    "userAccount": username,
    "password": password,
    "returnUrl": "/PersonalCenter",
    "proVing": code,
}
res = s.post(url="https://xuexi.chinabett.com/Login/Entry", data=login_form)
print(res.text)
3.浏览器环境
在使用pyexecjs执行JavaScript代码时,如果代码中读取了浏览器环境(例如 document、window),执行会失败。例如:
import execjs
# This snippet is expected to fail: the script reads `document` and `window`,
# which are browser objects and are not provided to the script here.
js_string = """
function func(arg) {
return arg + '666' + document.location.hostname + window.navigator.userAgent;
}
"""

sign = execjs.compile(js_string).call("func", "zzz")
print(sign)
此时,就需要创造浏览器环境然后再执行JavaScript代码。
npm config set registry https://registry.npmmirror.com
npm install -g jsdom 【主要】
npm install -g node-gyp
npm install -g canvas
查看全局模块的安装目录:
npm root -g
C:\nodejs\node_global\node_modules  # 把这个目录加入环境变量 NODE_PATH,这样 require 才能找到全局安装的模块
import execjs
# Build a browser-like environment before defining func:
#   - jsdom supplies `document`
#   - `window` is aliased to Node's `global`
#   - plain objects stand in for `location` and `navigator`
#   - XMLHttpRequest is an empty constructor
# NOTE: require("jsdom") must be able to resolve the jsdom npm package.
js_string = """
const jsdom = require("jsdom");
const {JSDOM} = jsdom;
const html = `<!DOCTYPE html><p>Hello world</p>`;
const dom = new JSDOM(html, {
url: "https://user.qunar.com/passport/login.jsp",
referrer: "https://www.qunar.com/",
contentType: "text/html"
});
document = dom.window.document;
window = global;
Object.assign(global, {
location: {
hash: "",
host: "user.qunar.com",
hostname: "user.qunar.com",
href: "https://user.qunar.com/passport/login.jsp",
origin: "https://user.qunar.com",
pathname: "/passport/login.jsp",
port: "",
protocol: "https:",
search: "",
},
navigator: {
appCodeName: "Mozilla",
appName: "Netscape",
appVersion: "5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
cookieEnabled: true,
deviceMemory: 8,
doNotTrack: null,
hardwareConcurrency: 4,
language: "zh-CN",
languages: ["zh-CN", "zh"],
maxTouchPoints: 0,
onLine: true,
platform: "MacIntel",
product: "Gecko",
productSub: "20030107",
userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
vendor: "Google Inc.",
vendorSub: "",
webdriver: false
}
});
location = window.location;
XMLHttpRequest = function(){};
function func(arg) {
var xhr = new XMLHttpRequest();
return arg + '666' + document.location.hostname + window.navigator.userAgent;
}
"""
# Compile the script, call func("zzz") and print the returned string.
JS = execjs.compile(js_string)
sign = JS.call("func", "zzz")
print(sign)
关于XMLHttpRequest
XMLHttpRequest = function () {
return {
open:function(){},
setRequestHeader:function(){},
send:function(){},
}
}
/**
 * Drive an XMLHttpRequest (open, set header, send) and then build the
 * result string from the argument, location.href and the user agent.
 * @param {string} arg - value placed at the start of the result
 * @returns {string} arg + "666" + location.href + window.navigator.userAgent
 */
function func(arg) {
    var xhr = new XMLHttpRequest();
    xhr.open('POST', "/test/", true);
    // Media-type parameters are written as name=value (charset=UTF-8).
    xhr.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8');
    xhr.send('n1=1;n2=2;');
    return arg + "666" + location.href + window.navigator.userAgent;
}
4.AES加密
import base64
import binascii
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad
# Plaintext: a compact JSON document describing one slider drag.
data_string = '{"openTime":1710319912672,"startTime":1710319913852,"endTime":1710319914153,"userAgent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36","uid":"0000ee00306c5d354d3029a9","track":["14010;499.00;553.00;0.00","14031;517.00;558.00;18.00","14056;568.00;564.00;69.00","14079;656.00;566.00;157.00","14102;752.00;566.00;253.00","14126;841.00;566.00;342.00","14150;918.00;566.00;419.00"],"acc":[],"ori":[],"deviceMotion":[{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true},{"isTrusted":true}]}'

# The 16-byte key is given in hex; decoded it is the ASCII text
# "227V2xYeHTARSh1R".
key = binascii.a2b_hex("32323756327859654854415253683152")

# AES in ECB mode over the UTF-8 plaintext padded to 16-byte blocks, with
# the ciphertext Base64-encoded into a str.
cipher = AES.new(key=key, mode=AES.MODE_ECB)
ciphertext = cipher.encrypt(pad(data_string.encode('utf-8'), 16))
res = base64.b64encode(ciphertext).decode('utf-8')
print(res)
5.X哪了案例
逆向轨迹snapshot
import json
import random
import time
import base64
import binascii
import requests
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad
# Either generate the track yourself or copy one captured from a browser.
# Note that the browser window size differs from person to person.
# def get_slider_list():
# slider_list = []
# client_x = 300
# client_y = 500
# start_time = int(int(time.time() * 1000) % 1e5)
# width = random.randint(419, 431)
# for slice_distance in range(3, width, 26):
# if width - slice_distance <= 26:
# slice_distance = width
# start_time += random.randint(10, 1000)
# i = start_time
# o = f"{client_x + slice_distance}.00"
# u = f"{client_y + random.randint(-5, 5)}.00"
# a = f"{slice_distance}.00"
# f = f"{i};{o};{u};{a}"
# slider_list.append(f)
# return slider_list
# Fixed slider track. Going by the generator sketched above, each entry is
# "time;x;y;distance", joined with semicolons.
slider_list = ["14010;499.00;553.00;0.00", "14031;517.00;558.00;18.00", "14056;568.00;564.00;69.00",
"14079;656.00;566.00;157.00", "14102;752.00;566.00;253.00", "14126;841.00;566.00;342.00",
"14150;918.00;566.00;419.00"]
# 1. Encryption
def aes_encrypt(data_string):
    """Encrypt ``data_string`` with AES-ECB and return it Base64-encoded.

    The text is UTF-8 encoded and padded to the 16-byte AES block size
    before it is encrypted; the ciphertext is returned as a ``str``.
    """
    # Hex form of the key; same bytes as "227V2xYeHTARSh1R".encode('utf-8').
    key = binascii.a2b_hex("32323756327859654854415253683152")
    cipher = AES.new(key=key, mode=AES.MODE_ECB)
    padded = pad(data_string.encode('utf-8'), 16)
    return base64.b64encode(cipher.encrypt(padded)).decode('utf-8')
def run():
    """Post an encrypted slider report to the captcha snapshot endpoint.

    Loads the login page to collect cookies, builds the slider report
    (timings, user agent, the QN1 cookie as ``uid``, the drag track),
    encrypts its compact JSON form with ``aes_encrypt`` and posts it.
    The request payload and the response body are printed.
    """
    login_page = requests.get("https://user.qunar.com/passport/login.jsp")
    cookie_dict = login_page.cookies.get_dict()
    uid = cookie_dict['QN1']

    # slider_list = get_slider_list()  # enable when generating the track yourself
    slider_info = {
        "openTime": int((time.time() - random.randint(500, 3000)) * 1000),
        "startTime": int((time.time() - random.uniform(2, 4)) * 1000),
        "endTime": int((time.time() - random.uniform(0, 1)) * 1000),
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "uid": uid,
        "track": slider_list,
        "acc": [],
        "ori": [],
        "deviceMotion": [{"isTrusted": True} for _ in range(random.randint(10, 100))]
    }

    # Serialise without spaces after separators, then encrypt.
    encrypted = aes_encrypt(json.dumps(slider_info, separators=(',', ':')))
    payload = {
        "appCode": "register_pc",
        "cs": "pc",
        "data": encrypted,
        "orca": 2
    }
    print(payload)

    snapshot = requests.post(
        url="https://vercode.qunar.com/inner/captcha/snapshot",
        json=payload,
        cookies=cookie_dict
    )
    print(snapshot.text)


if __name__ == '__main__':
    run()
逆向提交sendLoginCode
看到这个 window.Bella 后,就应该想到一个开发的潜规则:加载某个js文件时,在其内部将函数赋值给window,后续其他文件中就可以调用此方法。
html验证
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<!-- sdk.js is loaded first; it is expected to assign Bella onto window. -->
<script src="sdk.js"></script>
<script>
// Call window.Bella with a sample slideToken and log what it returns.
var res = window.Bella(
{slideToken: "15cf502c3128593b1a3237e5c484d6c9"},
{v: 2}
)
console.log(res);
</script>
</body>
</html>
补环境报错
还得加上上面的基础环境
// Stand-in for the browser's XMLHttpRequest: the returned object exposes
// open, send and onreadystatechange as functions that do nothing.
XMLHttpRequest = function () {
    var stub = {
        open: function () {},
        send: function () {},
        onreadystatechange: function () {}
    };
    return stub;
};
// Make the stub reachable as window.XMLHttpRequest as well.
window.XMLHttpRequest = XMLHttpRequest;
补环境失败、程序一直卡住时,直接退出JS进程:
window['Bella'] = _0x47bb39;
var _0x6bf389 = Date[_0x5a69('0x324')]();
var _0x194046 = _0x6bf389 - _0x6ffe8a;
_0x51d2f1[_0x5a69('0x57')]('quinn_qlogj', _0x194046);
}
// Invoke the SDK entry point with the slide token given on the command
// line: `node v1.js "<token>"` places it at process.argv[2].
var bella = window.Bella(
    {slideToken: process.argv[2]},
    {v: 2}
);
// Write the result to stdout so that a calling process can capture it.
console.log(bella);
// Exit explicitly rather than waiting for the process to end on its own.
process.exit();
代码整合
import json
import random
import time
import base64
import binascii
import requests
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad
# Fixed slider track sent as the "track" field of the slider report; each
# entry is four semicolon-separated numbers.
slider_list = ["14010;499.00;553.00;0.00", "14031;517.00;558.00;18.00", "14056;568.00;564.00;69.00",
"14079;656.00;566.00;157.00", "14102;752.00;566.00;253.00", "14126;841.00;566.00;342.00",
"14150;918.00;566.00;419.00"]
# 1. Encryption
def aes_encrypt(data_string):
    """Return ``data_string`` encrypted with AES-ECB, Base64-encoded.

    The input is UTF-8 encoded and padded to 16-byte blocks before
    encryption; the Base64 result is decoded to ``str``.
    """
    # Hex form of the key; same bytes as "227V2xYeHTARSh1R".encode('utf-8').
    key = binascii.a2b_hex("32323756327859654854415253683152")
    cipher = AES.new(key=key, mode=AES.MODE_ECB)
    padded = pad(data_string.encode('utf-8'), 16)
    return base64.b64encode(cipher.encrypt(padded)).decode('utf-8')
def run():
    """Obtain a slider token and request an SMS login code with it.

    Steps: load the login page for cookies, post the AES-encrypted slider
    report to the snapshot endpoint, read the token from ``data.cst`` of
    the JSON response, compute the ``bella`` value by running ``v1.js``
    under node, then post everything to ``sendLoginCode`` and print the
    response body.

    Raises:
        KeyError: if the ``QN1`` cookie or ``data``/``cst`` is missing.
        subprocess.CalledProcessError: if node exits with a non-zero status.
    """
    import subprocess

    res = requests.get("https://user.qunar.com/passport/login.jsp")
    cookie_dict = res.cookies.get_dict()
    cookie_qn1 = cookie_dict['QN1']
    # slider_list = get_slider_list()  # enable when generating the track yourself
    slider_info = {
        "openTime": int((time.time() - random.randint(500, 3000)) * 1000),
        "startTime": int((time.time() - random.uniform(2, 4)) * 1000),
        "endTime": int((time.time() - random.uniform(0, 1)) * 1000),
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "uid": cookie_qn1,
        "track": slider_list,
        "acc": [],
        "ori": [],
        "deviceMotion": [{"isTrusted": True} for _ in range(random.randint(10, 100))]
    }
    data_string = json.dumps(slider_info, separators=(',', ':'))
    data = aes_encrypt(data_string)
    r = {
        "appCode": "register_pc",
        "cs": "pc",
        "data": data,
        "orca": 2
    }
    print(r)
    res = requests.post(
        url="https://vercode.qunar.com/inner/captcha/snapshot",
        json=r,
        cookies=cookie_dict
    )
    res_dict = res.json()
    slide_token = res_dict['data']["cst"]
    cookie_dict.update(res.cookies.get_dict())

    # slide_token comes from a server response, so hand it to node as its
    # own argv element instead of interpolating it into a shell command.
    res = subprocess.check_output(["node", "v1.js", slide_token])
    bella_string = res.decode('utf-8').strip()

    res = requests.post(
        url="https://user.qunar.com/weblogin/sendLoginCode",
        data={
            "usersource": "",
            "source": "",
            "ret": "",
            "ref": "",
            "business": "",
            "pid": "",
            "originChannel": "",
            "activityCode": "",
            "origin": "",
            "mobile": "自己的手机号",
            "prenum": "86",
            "loginSource": "1",
            "slideToken": slide_token,
            "smsType": "0",
            "appcode": "register_pc",
            "bella": bella_string,
            "captchaType": ""
        },
        cookies=cookie_dict
    )
    print(res.text)


if __name__ == '__main__':
    run()
短信登录
def run():
    """Log in with an SMS code: slider token, send code, verify code.

    Prompts for a mobile number, obtains a slider token from the snapshot
    endpoint, computes the ``bella`` value by running ``v1.js`` under node,
    asks ``sendLoginCode`` to send an SMS, prompts for the received code and
    posts it to ``verifyMobileVcode``. The last response body and the
    accumulated cookies are printed.

    Raises:
        KeyError: if the ``QN1`` cookie or ``data``/``cst`` is missing.
        subprocess.CalledProcessError: if node exits with a non-zero status.
    """
    import subprocess

    mobile = input("请输入手机号:")
    res = requests.get("https://user.qunar.com/passport/login.jsp")
    cookie_dict = res.cookies.get_dict()
    cookie_qn1 = cookie_dict['QN1']
    # NOTE(review): get_slider_list is not defined in this snippet; it has
    # to be available at module level before run() is called.
    slider_list = get_slider_list()
    slider_info = {
        "openTime": int((time.time() - random.randint(500, 3000)) * 1000),
        "startTime": int((time.time() - random.uniform(2, 4)) * 1000),
        "endTime": int((time.time() - random.uniform(0, 1)) * 1000),
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "uid": cookie_qn1,
        "track": slider_list,
        "acc": [],
        "ori": [],
        "deviceMotion": [{"isTrusted": True} for _ in range(random.randint(10, 100))]
    }
    data_string = json.dumps(slider_info, separators=(',', ':'))
    data = aes_encrypt(data_string)
    res = requests.post(
        url="https://vercode.qunar.com/inner/captcha/snapshot",
        json={
            "appCode": "register_pc",
            "cs": "pc",
            "data": data,
            "orca": 2
        },
        cookies=cookie_dict
    )
    res_dict = res.json()
    slide_token = res_dict['data']["cst"]
    cookie_dict.update(res.cookies.get_dict())

    # slide_token comes from a server response, so hand it to node as its
    # own argv element instead of interpolating it into a shell command.
    res = subprocess.check_output(["node", "v1.js", slide_token])
    bella_string = res.decode('utf-8').strip()

    res = requests.post(
        url="https://user.qunar.com/weblogin/sendLoginCode",
        data={
            "usersource": "",
            "source": "",
            "ret": "",
            "ref": "",
            "business": "",
            "pid": "",
            "originChannel": "",
            "activityCode": "",
            "origin": "",
            "mobile": mobile,
            "prenum": "86",
            "loginSource": "1",
            "slideToken": slide_token,
            "smsType": "0",
            "appcode": "register_pc",
            "bella": bella_string,
            "captchaType": ""
        },
        cookies=cookie_dict
    )
    print(res.text)
    cookie_dict.update(res.cookies.get_dict())

    sms_code = input("请输入短信验证码:")
    res = requests.post(
        url="https://user.qunar.com/weblogin/verifyMobileVcode",
        json={
            "piccoloT": "login_register_pc",
            "mobile": mobile,
            "prenum": "86",
            "vcode": sms_code,
            "type": "3",
            "slideToken": slide_token,
            "appcode": "register_pc",
            "loginSource": 1,
            "captchaType": "",
            "source": "",
            "usersource": "",
            "ret": "",
            "ref": "",
            "business": "",
            "pid": "",
            "originChannel": "",
            "activityCode": ""
        },
        # Send the cookies gathered so far, as the earlier requests do.
        cookies=cookie_dict
    )
    cookie_dict.update(res.cookies.get_dict())
    print(res.text)
    print(cookie_dict)


if __name__ == '__main__':
    run()
6.TLS指纹绕过
TLS指纹只存在于https请求中。
curl_cffi
- curl是一个可以发送网络请求的工具。
- curl-impersonate是一个基于curl基础上进行开发的一个工具,可以完美的模拟主流的浏览器。
- curl_cffi,是套壳curl-impersonate,让此工具可以更方便的应用在Python中。
pip install curl-cffi
from curl_cffi import requests
# Alternative target URLs:
#   https://ascii2d.net/
#   https://cn.investing.com/equities/amazon-com-inc-historical-data
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
}

# `impersonate` selects the browser profile that curl_cffi imitates.
res = requests.get(
    url="https://match.yuanrenxue.cn/api/match/19?page=1",
    headers=request_headers,
    impersonate="chrome101",
)
print(res.text)
热爱技术,享受生活,感谢推荐!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?