python请求有关ja3指纹问题

遇见一个网站采集,无论怎样都返回空数据(实际上是有数据的),但是抓包下来又确实是那样的,请教了一些人推测是指纹验证,拜读了网上其他大佬的博客文章后实验了一下,发现确实是这个问题!
第一次知道 TLS 握手还有指纹(JA3)这个东西,让我大受震撼,特此搬运一下。

参考链接及来源:
Python 爬虫进阶必备 | JA3 指纹在爬虫中的应用与定向突破
python爬虫 requests、httpx、aiohttp、scrapy突破ja3指纹识别

实例:

requests

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
import requests
import random
# Base OpenSSL cipher string; its colon-separated entries are shuffled per adapter.
ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class DESAdapter(HTTPAdapter):
    """Transport adapter whose TLS contexts use a randomly ordered cipher list.

    The order is drawn once per adapter instance and reused for both direct
    and proxied connections created through this adapter.
    """

    def __init__(self, *args, **kwargs):
        """Shuffle the cipher entries, then defer to ``HTTPAdapter.__init__``."""
        shuffled = ORIGIN_CIPHERS.split(':')
        random.shuffle(shuffled)
        # Must be assigned before the base initializer runs:
        # HTTPAdapter.__init__ calls init_poolmanager(), which reads it.
        self.CIPHERS = ':'.join(shuffled) + ':!aNULL:!eNULL:!MD5'
        super().__init__(*args, **kwargs)

    def _ssl_context(self):
        """Build a urllib3 SSL context restricted to this adapter's ciphers."""
        return create_urllib3_context(ciphers=self.CIPHERS)

    def init_poolmanager(self, *args, **kwargs):
        """Create the pool manager with the custom SSL context injected."""
        kwargs['ssl_context'] = self._ssl_context()
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        """Create the proxy manager with the custom SSL context injected."""
        kwargs['ssl_context'] = self._ssl_context()
        return super().proxy_manager_for(*args, **kwargs)

# Browser-like User-Agent applied to every request sent through the session.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67'}

s = requests.Session()
# Only URLs under https://ja3er.com go through the cipher-shuffling adapter.
s.mount('https://ja3er.com', DESAdapter())
s.headers.update(headers)

# Decode the JSON body of the response and show it.
resp = s.get('https://ja3er.com/json').json()
print(resp)

aiohttp

import random
import ssl
import asyncio
import aiohttp

# ssl._create_default_https_context = ssl._create_unverified_context


# Base OpenSSL cipher string; its colon-separated entries are reshuffled per context.
ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    """Callable that returns a default SSL context with a shuffled cipher order."""

    def __init__(self):
        """Split the base cipher string into a list that calls reshuffle in place."""
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        """Reshuffle ``self.ciphers`` and return a new context configured with them."""
        random.shuffle(self.ciphers)
        # The trailing exclusions are appended after the shuffled entries.
        cipher_string = ":".join(self.ciphers) + ":!aNULL:!eNULL:!MD5"
        ssl_context = ssl.create_default_context()
        ssl_context.set_ciphers(cipher_string)
        return ssl_context


sslgen = SSLFactory()


async def main():
    """Fetch https://ja3er.com/json with aiohttp and print the decoded JSON body."""
    async with aiohttp.ClientSession() as session:
        # sslgen() is called here, so this request gets a freshly shuffled context.
        async with session.get("https://ja3er.com/json", headers={}, ssl=sslgen()) as resp:
            data = await resp.json()
            print(data)


# asyncio.run() creates and closes the event loop itself; calling
# asyncio.get_event_loop() with no running loop is deprecated.
asyncio.run(main())

httpx:

异步模式:
import httpx
import asyncio
import random
import ssl

# Base OpenSSL cipher string; its colon-separated entries are reshuffled per context.
ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    """Callable that returns a default SSL context with a shuffled cipher order."""

    def __init__(self):
        """Split the base cipher string into a list that calls reshuffle in place."""
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        """Reshuffle ``self.ciphers`` and return a new context configured with them."""
        random.shuffle(self.ciphers)
        # The trailing exclusions are appended after the shuffled entries.
        cipher_string = ":".join(self.ciphers) + ":!aNULL:!eNULL:!MD5"
        ssl_context = ssl.create_default_context()
        ssl_context.set_ciphers(cipher_string)
        return ssl_context


sslgen = SSLFactory()


async def main():
    """Fetch https://ja3er.com/json with httpx's async client and print the JSON."""
    # The shuffled context is passed as ``verify`` when the client is created.
    async with httpx.AsyncClient(verify=sslgen()) as client:
        reply = await client.get('https://ja3er.com/json')
        print(reply.json())


asyncio.run(main())
同步模式:
import httpx
import asyncio
import random
import ssl

# Base OpenSSL cipher string; its colon-separated entries are reshuffled per context.
ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                  'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')


class SSLFactory:
    """Callable that returns a default SSL context with a shuffled cipher order."""

    def __init__(self):
        """Split the base cipher string into a list that calls reshuffle in place."""
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        """Reshuffle ``self.ciphers`` and return a new context configured with them."""
        random.shuffle(self.ciphers)
        # The trailing exclusions are appended after the shuffled entries.
        cipher_string = ":".join(self.ciphers) + ":!aNULL:!eNULL:!MD5"
        ssl_context = ssl.create_default_context()
        ssl_context.set_ciphers(cipher_string)
        return ssl_context


sslgen = SSLFactory()
# One shuffled SSL context is generated up front and used for the whole client.
# NOTE(review): http2=True presumably needs httpx's optional HTTP/2 extra installed — confirm.
with httpx.Client(verify=sslgen(), http2=True, headers={}) as client:
    response = client.get('https://ja3er.com/json')
    print(response.text)

scrapy

class MyHTTPDownloadHandler(HTTPDownloadHandler):
    """Download handler that swaps in a newly shuffled TLS cipher list per request."""

    def shuffle_ciphers(self):
        """Return the cipher string in random order plus the fixed exclusions.

        The unshuffled base string is also stored on ``self.ORIGIN_CIPHERS``.
        """
        self.ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
                               'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')
        entries = self.ORIGIN_CIPHERS.split(':')
        random.shuffle(entries)
        return ':'.join(entries) + ':!aNULL:!eNULL:!MD5'

    def download_request(self, request, spider):
        """Replace the context factory, then delegate to the parent handler."""
        # A new factory is built on every call so each request gets its own order.
        self._contextFactory = ScrapyClientContextFactory(tls_ciphers=self.shuffle_ciphers())
        return super().download_request(request, spider)

爬虫配置文件

# Scrapy setting overrides; both URL schemes are routed to MyHTTPDownloadHandler.
custom_settings = {
    "CONCURRENT_REQUESTS": 5,
    "DOWNLOAD_DELAY": 1,
    "DOWNLOAD_TIMEOUT": 10,
    "RETRY_TIMES": 3,
    "DOWNLOAD_HANDLERS": {
        'http': 'scrapy_project.middlewares.MyHTTPDownloadHandler',
        'https': 'scrapy_project.middlewares.MyHTTPDownloadHandler',
    },
}

 curl_cffi

  首先,安装第三方库 curl_cffi:

python3 -m pip install curl_cffi

     然后,修改前面 requests 示例代码中的导入语句:把 import requests 改成 from curl_cffi import requests。最后,在 requests.get 调用中加一个参数 impersonate="chrome110",例如:requests.get('https://ja3er.com/json', impersonate="chrome110")。



posted @ 2023-06-21 14:11  阿布_alone  阅读(690)  评论(0)  编辑  收藏  举报
TOP