JA3 fingerprint problems with Python HTTP requests
I ran into a site where scraping returned empty data no matter what I tried (the data really is there in a browser), and packet captures confirmed the responses genuinely came back empty. After asking around, the guess was fingerprint-based verification; after reading blog posts from others online and experimenting, it turned out that was exactly the problem!
It was the first time I'd heard of this feature of the TLS handshake (the fingerprint comes from the TLS ClientHello, not from TCP itself), and it was quite a shock, so I'm collecting the material here.
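For context, a JA3 fingerprint is just the MD5 of five comma-separated fields taken from the TLS ClientHello: TLS version, cipher suites, extensions, elliptic curves, and point formats. A minimal sketch, using made-up field values rather than a real capture:

```python
# Sketch of how a JA3 string becomes a fingerprint. Field order:
# TLSVersion,Ciphers,Extensions,EllipticCurves,EllipticCurvePointFormats
# The values below are illustrative, not from a real ClientHello.
import hashlib

ja3_string = "771,4865-4866-4867,0-23-65281,29-23-24,0"
ja3_hash = hashlib.md5(ja3_string.encode()).hexdigest()
print(ja3_hash)  # servers compare this hash against known client hashes
```

Because the cipher list is one of those five fields, merely reordering the ciphers Python offers changes the hash, and that is exactly what every snippet below does.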
References and sources:
Python 爬虫进阶必备 | JA3 指纹在爬虫中的应用与定向突破
python爬虫 requests、httpx、aiohttp、scrapy突破ja3指纹识别
Examples:
requests
```python
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
import requests
import random

ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class DESAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        """A TransportAdapter that randomises the TLS cipher order,
        so each adapter instance presents a different JA3 fingerprint."""
        CIPHERS = ORIGIN_CIPHERS.split(':')
        random.shuffle(CIPHERS)
        CIPHERS = ':'.join(CIPHERS)
        self.CIPHERS = CIPHERS + ':!aNULL:!eNULL:!MD5'
        super().__init__(*args, **kwargs)

    def init_poolmanager(self, *args, **kwargs):
        context = create_urllib3_context(ciphers=self.CIPHERS)
        kwargs['ssl_context'] = context
        return super(DESAdapter, self).init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        context = create_urllib3_context(ciphers=self.CIPHERS)
        kwargs['ssl_context'] = context
        return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)


headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67'}
s = requests.Session()
s.headers.update(headers)
# Mount the adapter on the target prefix so it handles every request to that site
s.mount('https://ja3er.com', DESAdapter())
resp = s.get('https://ja3er.com/json').json()
print(resp)
```
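A quick sanity check, as a sketch: assuming ja3er.com is still online and its /json response includes a ja3_hash field, two sessions with fresh adapters should usually report different hashes, since each shuffle puts the cipher list in a new order.

```python
# Sketch: two fresh DESAdapters should usually yield different JA3 hashes.
for _ in range(2):
    s = requests.Session()
    s.headers.update(headers)
    s.mount('https://ja3er.com', DESAdapter())
    print(s.get('https://ja3er.com/json').json().get('ja3_hash'))
```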
aiohttp
```python
import random
import ssl
import asyncio
import aiohttp

ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    def __init__(self):
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        # Reshuffle the cipher order on every call: each context gives a new JA3
        random.shuffle(self.ciphers)
        ciphers = ":".join(self.ciphers) + ":!aNULL:!eNULL:!MD5"
        context = ssl.create_default_context()
        context.set_ciphers(ciphers)
        return context


sslgen = SSLFactory()


async def main():
    async with aiohttp.ClientSession() as session:
        # Pass a freshly shuffled SSLContext via the per-request `ssl` argument
        async with session.get("https://ja3er.com/json", headers={}, ssl=sslgen()) as resp:
            data = await resp.json()
            print(data)


asyncio.run(main())
```
httpx
Async mode:
```python
import httpx
import asyncio
import random
import ssl

ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    def __init__(self):
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        random.shuffle(self.ciphers)
        ciphers = ":".join(self.ciphers) + ":!aNULL:!eNULL:!MD5"
        context = ssl.create_default_context()
        context.set_ciphers(ciphers)
        return context


sslgen = SSLFactory()


async def main():
    # httpx accepts a ready-made SSLContext through the `verify` parameter
    async with httpx.AsyncClient(verify=sslgen()) as client:
        resp = await client.get('https://ja3er.com/json')
        result = resp.json()
        print(result)


asyncio.run(main())
```
Sync mode:
```python
import httpx
import random
import ssl

ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    def __init__(self):
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        random.shuffle(self.ciphers)
        ciphers = ":".join(self.ciphers) + ":!aNULL:!eNULL:!MD5"
        context = ssl.create_default_context()
        context.set_ciphers(ciphers)
        return context


sslgen = SSLFactory()

# http2=True needs the optional extra: pip install httpx[http2]
with httpx.Client(headers={}, http2=True, verify=sslgen()) as client:
    response = client.get('https://ja3er.com/json')
    print(response.text)
```
scrapy
```python
import random

from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory


class MyHTTPDownloadHandler(HTTPDownloadHandler):

    def shuffle_ciphers(self):
        self.ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                               'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')
        CIPHERS = self.ORIGIN_CIPHERS.split(':')
        random.shuffle(CIPHERS)
        CIPHERS = ':'.join(CIPHERS) + ':!aNULL:!eNULL:!MD5'
        return CIPHERS

    def download_request(self, request, spider):
        # Rebuild the context factory with a reshuffled cipher list per request
        tls_ciphers = self.shuffle_ciphers()
        self._contextFactory = ScrapyClientContextFactory(tls_ciphers=tls_ciphers)
        return super().download_request(request, spider)
```
Spider settings:
```python
custom_settings = {
    "CONCURRENT_REQUESTS": 5,
    "DOWNLOAD_DELAY": 1,
    "DOWNLOAD_TIMEOUT": 10,
    "RETRY_TIMES": 3,
    "DOWNLOAD_HANDLERS": {
        'http': 'scrapy_project.middlewares.MyHTTPDownloadHandler',
        'https': 'scrapy_project.middlewares.MyHTTPDownloadHandler',
    },
}
```
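Note that `scrapy_project.middlewares` above is just this example project's module path; the dotted paths under `DOWNLOAD_HANDLERS` must point at wherever `MyHTTPDownloadHandler` is actually defined in your project.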
curl_cffi
First, install the third-party library curl_cffi:

```
python3 -m pip install curl_cffi
```

Then change the first line of our code from `import requests` to `from curl_cffi import requests`. Finally, add one parameter to `requests.get`: `impersonate="chrome110"`.
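Putting those steps together, a minimal sketch (again pointed at ja3er.com, the echo service used above, which may no longer be online):

```python
# Sketch: curl_cffi replays a real browser's TLS ClientHello, so the server
# sees Chrome 110's JA3 fingerprint instead of Python's default one.
from curl_cffi import requests

resp = requests.get('https://ja3er.com/json', impersonate="chrome110")
print(resp.json())
```

Unlike the cipher-shuffling tricks above, this does not just randomise the fingerprint; it reproduces a specific real browser's fingerprint, which also helps on sites that only accept known browser JA3 hashes.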