Python requests and the JA3 fingerprint problem
I ran into a site that returned empty responses no matter how I scraped it (the data was definitely there), even though the packets I captured looked exactly like the browser's. After asking around, the guess was fingerprint-based detection, and after reading other people's blog posts and experimenting, that turned out to be exactly the problem!
This was the first time I learned the handshake carries such a fingerprint, and it genuinely shocked me, so I'm reposting the material here. Strictly speaking it lives in TLS, not TCP: JA3 hashes fields of the TLS ClientHello (TLS version, cipher suites, extension list, elliptic curves, and point formats) into an MD5 value, and because Python's ssl module advertises its ciphers in a fixed order, every stock requests/httpx/aiohttp client presents the same well-known hash. The workaround in all the examples below is the same trick: shuffle the cipher list so each connection produces a fresh hash.
References and sources:
Python 爬虫进阶必备 | JA3 指纹在爬虫中的应用与定向突破
python爬虫 requests、httpx、aiohttp、scrapy突破ja3指纹识别
Examples:
requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
import requests
import random

ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class DESAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        """A TransportAdapter that re-enables 3DES support in Requests."""
        # Shuffle the cipher order so this adapter presents its own JA3 hash.
        CIPHERS = ORIGIN_CIPHERS.split(':')
        random.shuffle(CIPHERS)
        CIPHERS = ':'.join(CIPHERS)
        self.CIPHERS = CIPHERS + ':!aNULL:!eNULL:!MD5'
        super().__init__(*args, **kwargs)

    def init_poolmanager(self, *args, **kwargs):
        context = create_urllib3_context(ciphers=self.CIPHERS)
        kwargs['ssl_context'] = context
        return super(DESAdapter, self).init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        context = create_urllib3_context(ciphers=self.CIPHERS)
        kwargs['ssl_context'] = context
        return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)


headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67'}
s = requests.Session()
s.headers.update(headers)
s.mount('https://ja3er.com', DESAdapter())  # apply the adapter to this origin only
resp = s.get('https://ja3er.com/json').json()
print(resp)
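To confirm the shuffle actually changes the fingerprint, a rough check (assuming the ja3er.com echo service is still reachable and still returns a ja3_hash field, as it did when the source posts were written):

# Two independently shuffled adapters should normally report two different hashes.
hashes = set()
for _ in range(2):
    sess = requests.Session()
    sess.mount('https://ja3er.com', DESAdapter())
    hashes.add(sess.get('https://ja3er.com/json').json().get('ja3_hash'))
print(hashes)  # two distinct values => the JA3 fingerprint is changing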
aiohttp
import random
import ssl
import asyncio
import aiohttp

# ssl._create_default_https_context = ssl._create_unverified_context

ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    def __init__(self):
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        # Re-shuffle on every call so each context yields a new JA3 hash.
        random.shuffle(self.ciphers)
        ciphers = ":".join(self.ciphers)
        ciphers = ciphers + ":!aNULL:!eNULL:!MD5"
        context = ssl.create_default_context()
        context.set_ciphers(ciphers)
        return context


sslgen = SSLFactory()


async def main():
    async with aiohttp.ClientSession() as session:
        # A freshly shuffled SSLContext is passed per request via the ssl parameter.
        async with session.get("https://ja3er.com/json", headers={}, ssl=sslgen()) as resp:
            data = await resp.json()
            print(data)


asyncio.run(main())  # asyncio.get_event_loop().run_until_complete() is deprecated
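If one fingerprint per session is enough, the shuffled context can instead go on the connector. A minimal sketch reusing the SSLFactory above:

async def main_with_connector():
    # Variant: one shuffled context shared by every request in the session.
    connector = aiohttp.TCPConnector(ssl=sslgen())
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get("https://ja3er.com/json") as resp:
            print(await resp.json())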
httpx:
Async mode:
import httpx
import asyncio
import random
import ssl

ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    def __init__(self):
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        # Re-shuffle on every call so each context yields a new JA3 hash.
        random.shuffle(self.ciphers)
        ciphers = ":".join(self.ciphers)
        ciphers = ciphers + ":!aNULL:!eNULL:!MD5"
        context = ssl.create_default_context()
        context.set_ciphers(ciphers)
        return context


sslgen = SSLFactory()


async def main():
    # httpx accepts an ssl.SSLContext as the verify argument.
    async with httpx.AsyncClient(verify=sslgen()) as client:
        resp = await client.get('https://ja3er.com/json')
        result = resp.json()
        print(result)


asyncio.run(main())
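Note that httpx fixes its TLS configuration when the client is constructed, so one AsyncClient keeps one fingerprint for its whole lifetime; to rotate fingerprints, create a new client (and thus call sslgen() again) rather than reusing the old one.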
Sync mode:
import httpx
import random
import ssl

ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    def __init__(self):
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        # Re-shuffle on every call so each context yields a new JA3 hash.
        random.shuffle(self.ciphers)
        ciphers = ":".join(self.ciphers)
        ciphers = ciphers + ":!aNULL:!eNULL:!MD5"
        context = ssl.create_default_context()
        context.set_ciphers(ciphers)
        return context


sslgen = SSLFactory()

with httpx.Client(headers={}, http2=True, verify=sslgen()) as client:
    response = client.get('https://ja3er.com/json')
    print(response.text)
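The http2=True flag needs httpx's optional HTTP/2 dependency (python3 -m pip install 'httpx[http2]'); without it, constructing the client raises an ImportError.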
scrapy
import random

from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory


class MyHTTPDownloadHandler(HTTPDownloadHandler):
    ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                      'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')

    def shuffle_ciphers(self):
        # Build a freshly shuffled cipher string.
        ciphers = self.ORIGIN_CIPHERS.split(':')
        random.shuffle(ciphers)
        return ':'.join(ciphers) + ':!aNULL:!eNULL:!MD5'

    def download_request(self, request, spider):
        # Swap in a new TLS context factory before each request goes out.
        tls_ciphers = self.shuffle_ciphers()
        self._contextFactory = ScrapyClientContextFactory(tls_ciphers=tls_ciphers)
        return super().download_request(request, spider)
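Because download_request replaces self._contextFactory before delegating to the parent handler, each outgoing HTTPS request negotiates TLS with a newly shuffled cipher order and therefore presents a different JA3 hash.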
Spider configuration:
custom_settings = {
    "CONCURRENT_REQUESTS": 5,
    "DOWNLOAD_DELAY": 1,
    "DOWNLOAD_TIMEOUT": 10,
    "RETRY_TIMES": 3,
    "DOWNLOAD_HANDLERS": {
        'http': 'scrapy_project.middlewares.MyHTTPDownloadHandler',
        'https': 'scrapy_project.middlewares.MyHTTPDownloadHandler',
    },
}
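The dotted paths above assume the handler class lives in scrapy_project/middlewares.py; adjust them to wherever you actually define MyHTTPDownloadHandler.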
curl_cffi
First, install the third-party library curl_cffi:

python3 -m pip install curl_cffi

Then change the first line of the requests example above from import requests to from curl_cffi import requests. Finally, add one parameter to the requests.get call: impersonate="chrome110".
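Putting the three changes together, a minimal sketch (the echo URL is reused from the examples above; any JA3 echo service would do):

from curl_cffi import requests

# impersonate replays Chrome 110's real TLS ClientHello, so the JA3
# fingerprint matches an actual browser instead of Python's ssl module.
resp = requests.get("https://ja3er.com/json", impersonate="chrome110")
print(resp.json())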