[Python] Selenium监控网络请求

  Selenium监控网络请求有两种方式:第一种是使用代理,第二种是使用CDP(Chrome DevTools Protocol)。下面分别介绍如何使用代理和CDP监控网络请求。

  一、使用Selenium-Wire设置代理拦截处理请求。

  Selenium-Wire是基于Selenium开发的抓包工具,基本使用方式如下:

from selenium import webdriver
from seleniumwire import webdriver as webdriverwire
from seleniumwire.request import Request, Response


def request_interceptor(request: Request):
    """Print the HTTP method and URL of an intercepted request."""
    http_method, target_url = request.method, request.url
    print("request_interceptor", http_method, target_url)


def response_interceptor(request: Request, response: Response):
    """Print the method, URL and response status code of an intercepted exchange."""
    summary = (request.method, request.url, response.status_code)
    print("response_interceptor", *summary)

# Chrome options: tell the browser to ignore certificate errors.
options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-errors")

# URL regular expressions; assigned to driver.scopes further down.
scopes = {"https?://.+/confirm"}

# HTTP methods handed to selenium-wire's "ignore_http_methods" option.
ignore_http_methods = ["OPTIONS", "HEAD", "CONNECT", "TRACE", "PATCH"]

seleniumwire_options = dict(
    # Hosts to exclude.
    exclude_hosts=["www.exclude.com"],
    # Request methods to ignore.
    ignore_http_methods=ignore_http_methods,
    # Do not verify certificates.
    verify_ssl=False,
    enable_logging=True,
    # Keep captured requests in memory ...
    request_storage="memory",
    # ... and store no more than 100 of them.
    # ("request_storage_base_dir" could be added here to choose a storage
    # directory instead.)
    request_storage_max_size=100,
)

driver = webdriverwire.Chrome(
    seleniumwire_options=seleniumwire_options,
    options=options,
)

# Hook up the interceptors and the URL scopes before loading the page.
driver.request_interceptor = request_interceptor
driver.response_interceptor = response_interceptor
driver.scopes = [*scopes]

driver.get('https://www.baidu.com')

  解释:

  scopes:通过正则表达式过滤网络请求。

  exclude_hosts:过滤不使用代理的域名。

  request_interceptor:拦截处理请求。

  response_interceptor:拦截处理请求响应结果。

  注意:

  (1)需要把seleniumwire的https证书放到程序目录的seleniumwire文件夹;若网页提示https不安全,需要把该证书导入系统的“受信任的根证书颁发机构”。

  (2)部分网站无法使用seleniumwire打开,会提示502,需要设置代理,下面示范使用Fiddler代理。

  使用Fiddler代理抓包:

  1. 安装并运行Fiddler Classic,设置开启HTTPS抓包,保证软件开启就行,不需要实时抓包,Fiddler默认使用8888端口代理。

  2. 设置seleniumwire_options使用Fiddler代理:

# Configure the upstream proxy (local port 8888).
# NOTE(review): seleniumwire_options is passed to webdriverwire.Chrome(...) in
# the earlier snippet, so presumably this must run before the driver is
# created - confirm.
seleniumwire_options.update(
    proxy={
        "https": "https://127.0.0.1:8888",
        "http": "http://127.0.0.1:8888",
        # Addresses that do not go through the proxy.
        "no_proxy": "localhost,127.0.0.1",
    }
)

 

  二、使用CDP协议监控网络请求。

"""CDP监控网络请求"""

import asyncio
import json
import re
import time
from typing import Dict, List
from selenium.webdriver.chrome.webdriver import WebDriver

from utils.log import ILog


class CdpNetworkMonitor:
    """Collect matching network requests/responses from the performance log.

    The monitor enables the CDP ``Network`` domain, reads the driver's
    ``performance`` log and stores every ``Network.requestWillBeSent`` request
    whose URL matches one of ``url_patterns`` in :attr:`requests`, keyed by
    the CDP request id. When the corresponding ``Network.responseReceived``
    event is read, the response dict (with the result of
    ``Network.getResponseBody`` under ``"body"``) is attached to the stored
    request under the ``"response"`` key.
    """

    # The driver/log annotations are quoted so that defining the class does
    # not require evaluating those names.
    def __init__(self, driver: "WebDriver", url_patterns: List[str], log: "ILog"):
        """Create a monitor bound to ``driver``.

        :param driver: driver used for ``execute_cdp_cmd`` and ``get_log``.
        :param url_patterns: regular expressions; a request is recorded when
            ``re.search`` finds any of them in the request URL.
        :param log: logger object; only ``log.write(*args)`` is called.
        """
        self.driver = driver
        # Regular expressions used to select the requests to record.
        self.url_patterns = url_patterns
        self._log = log
        # Timestamp of the newest performance-log entry processed so far.
        self.last_log_timestamp = 0
        # Recorded requests, keyed by CDP request id.
        self.requests: Dict[str, dict] = {}
        # time.time() of the last flush() that actually read the log.
        self.last_flush_time: float = 0
        self._stop_event = asyncio.Event()
        # Strong reference to the polling task: the event loop only keeps weak
        # references to tasks, so an unreferenced task may be collected.
        self._flush_task = None

    def start(self):
        """Enable the CDP Network domain and start the periodic flush task.

        Must be called while an asyncio event loop is running. If no loop is
        running (or enabling the domain fails) the error is logged and not
        raised. Calling ``start()`` while a polling task is still alive does
        not create a second task.
        """
        try:
            self.driver.execute_cdp_cmd("Network.enable", {})
            self._stop_event.clear()
            if self._flush_task is not None and not self._flush_task.done():
                # Already polling; clearing the stop event keeps it running.
                return
            # Look up the loop *before* creating the coroutine so that a
            # missing loop does not leave a never-awaited coroutine behind.
            loop = asyncio.get_running_loop()
            self._flush_task = loop.create_task(self.__flush_periodically())
        except Exception as ex:
            self._log.write("启用网络监控", type(ex), ex)

    async def __flush_periodically(self):
        """Call :meth:`flush` once per second until :meth:`stop` is called."""
        self._log.write("定时读取性能日志")
        while not self._stop_event.is_set():
            # flush() is a synchronous call made from inside the coroutine.
            self.flush()
            await asyncio.sleep(1)

    def stop(self):
        """Signal the polling task to finish and disable the Network domain.

        Errors are logged, not raised.
        """
        try:
            self._stop_event.set()
            self.driver.execute_cdp_cmd("Network.disable", {})
        except Exception as ex:
            self._log.write("禁用网络监控", type(ex), ex)

    def flush(self, flush_interval=0):
        """Read the performance log and update :attr:`requests`.

        :param flush_interval: when non-zero, skip the read if fewer than this
            many seconds have passed since the previous read.

        Failures while reading the log, or while handling a single entry, are
        logged and do not propagate; a bad entry does not stop the remaining
        entries from being processed.
        """
        self._log.write("CDP读取性能日志")
        now = time.time()
        if flush_interval and now - self.last_flush_time < flush_interval:
            return
        self.last_flush_time = now
        try:
            cdp_logs: List[dict] = self.driver.get_log("performance")
            for cdp_log in cdp_logs:
                try:
                    self._process_log_entry(cdp_log)
                except Exception as ex:
                    self._log.write("CDP解析单个性能日志", type(ex), ex)
        except Exception as ex:
            self._log.write("CDP读取性能日志", type(ex), ex)

    def _process_log_entry(self, cdp_log: dict):
        """Decode one performance-log entry and dispatch it by CDP method."""
        log_timestamp = cdp_log.get("timestamp")
        if log_timestamp < self.last_log_timestamp:
            # Older than the newest entry already handled: treat as read.
            return
        self.last_log_timestamp = log_timestamp
        message = json.loads(cdp_log["message"]).get("message")
        if not message:
            self._log.write("CDP性能日志无法获取message")
            return
        method = message.get("method")
        if not method:
            self._log.write("CDP性能日志无法获取method")
            return
        if method == "Network.requestWillBeSent":
            self._record_request(message["params"])
        elif method == "Network.responseReceived":
            self._record_response(message["params"])

    def _record_request(self, params: dict):
        """Store the request if its URL matches one of the patterns."""
        request: dict = params["request"]
        url = request.get("url")
        if not any(re.search(pattern, url) for pattern in self.url_patterns):
            return
        request_id = params["requestId"]
        self._log.write("network", "Network.requestWillBeSent", request_id, request)
        self.requests[request_id] = request

    def _record_response(self, params: dict):
        """Attach the response (and its body) to a previously stored request."""
        request_id = params["requestId"]
        if request_id not in self.requests:
            # Response for a request that was not recorded.
            return
        request = self.requests[request_id]
        response: dict = params["response"]
        try:
            response["body"] = self.driver.execute_cdp_cmd(
                "Network.getResponseBody", {"requestId": request_id}
            )
            self._log.write(
                "network", "Network.responseReceived", request_id, response
            )
            request["response"] = response
        except Exception as ex:
            # The response is only attached when its body could be fetched.
            self._log.write("CDP获取响应内容", type(ex), ex)

    def clear(self):
        """Drop all recorded requests and send the CDP ``Log.clear`` command.

        Errors are logged, not raised.
        """
        try:
            self.requests.clear()
            self.driver.execute_cdp_cmd("Log.clear", {})
        except Exception as ex:
            self._log.write("清理请求缓存", type(ex), ex)

   解释:

  通过读取性能日志获取网络相关的内容,解析出请求和响应数据。

  CDP也可以拦截并修改请求,但需要轮询性能日志,并通过execute_cdp_cmd处理被拦截的请求。若不及时处理,被拦截的请求会失败;如果driver.get()打开页面期间有未处理的请求导致网页打不开,程序会一直卡在driver.get()。

"""CDP监控网络请求"""

import asyncio
import json
import re
import time
from typing import Dict, List
from selenium.webdriver.chrome.webdriver import WebDriver

from utils.log import ILog


class CdpNetworkMonitor:
    """Collect matching network requests/responses from the performance log.

    The monitor enables the CDP ``Network`` domain, reads the driver's
    ``performance`` log and stores every ``Network.requestWillBeSent`` request
    whose URL matches one of ``url_patterns`` in :attr:`requests`, keyed by
    the CDP request id. When the corresponding ``Network.responseReceived``
    event is read, the response dict (with the result of
    ``Network.getResponseBody`` under ``"body"``) is attached to the stored
    request under the ``"response"`` key.
    """

    # The driver/log annotations are quoted so that defining the class does
    # not require evaluating those names.
    def __init__(self, driver: "WebDriver", url_patterns: List[str], log: "ILog"):
        """Create a monitor bound to ``driver``.

        :param driver: driver used for ``execute_cdp_cmd`` and ``get_log``.
        :param url_patterns: regular expressions; a request is recorded when
            ``re.search`` finds any of them in the request URL.
        :param log: logger object; only ``log.write(*args)`` is called.
        """
        self.driver = driver
        # Regular expressions used to select the requests to record.
        self.url_patterns = url_patterns
        self._log = log
        # Timestamp of the newest performance-log entry processed so far.
        self.last_log_timestamp = 0
        # Recorded requests, keyed by CDP request id.
        self.requests: Dict[str, dict] = {}
        # time.time() of the last flush() that actually read the log.
        self.last_flush_time: float = 0
        self._stop_event = asyncio.Event()
        # Strong reference to the polling task: the event loop only keeps weak
        # references to tasks, so an unreferenced task may be collected.
        self._flush_task = None

    def start(self):
        """Enable the CDP Network domain and start the periodic flush task.

        Must be called while an asyncio event loop is running. If no loop is
        running (or enabling the domain fails) the error is logged and not
        raised. Calling ``start()`` while a polling task is still alive does
        not create a second task.
        """
        try:
            self.driver.execute_cdp_cmd("Network.enable", {})
            self._stop_event.clear()
            if self._flush_task is not None and not self._flush_task.done():
                # Already polling; clearing the stop event keeps it running.
                return
            # Look up the loop *before* creating the coroutine so that a
            # missing loop does not leave a never-awaited coroutine behind.
            loop = asyncio.get_running_loop()
            self._flush_task = loop.create_task(self.__flush_periodically())
        except Exception as ex:
            self._log.write("启用网络监控", type(ex), ex)

    async def __flush_periodically(self):
        """Call :meth:`flush` once per second until :meth:`stop` is called."""
        self._log.write("定时读取性能日志")
        while not self._stop_event.is_set():
            # flush() is a synchronous call made from inside the coroutine.
            self.flush()
            await asyncio.sleep(1)

    def stop(self):
        """Signal the polling task to finish and disable the Network domain.

        Errors are logged, not raised.
        """
        try:
            self._stop_event.set()
            self.driver.execute_cdp_cmd("Network.disable", {})
        except Exception as ex:
            self._log.write("禁用网络监控", type(ex), ex)

    def flush(self, flush_interval=0):
        """Read the performance log and update :attr:`requests`.

        :param flush_interval: when non-zero, skip the read if fewer than this
            many seconds have passed since the previous read.

        Failures while reading the log, or while handling a single entry, are
        logged and do not propagate; a bad entry does not stop the remaining
        entries from being processed.
        """
        self._log.write("CDP读取性能日志")
        now = time.time()
        if flush_interval and now - self.last_flush_time < flush_interval:
            return
        self.last_flush_time = now
        try:
            cdp_logs: List[dict] = self.driver.get_log("performance")
            for cdp_log in cdp_logs:
                try:
                    self._process_log_entry(cdp_log)
                except Exception as ex:
                    self._log.write("CDP解析单个性能日志", type(ex), ex)
        except Exception as ex:
            self._log.write("CDP读取性能日志", type(ex), ex)

    def _process_log_entry(self, cdp_log: dict):
        """Decode one performance-log entry and dispatch it by CDP method."""
        log_timestamp = cdp_log.get("timestamp")
        if log_timestamp < self.last_log_timestamp:
            # Older than the newest entry already handled: treat as read.
            return
        self.last_log_timestamp = log_timestamp
        message = json.loads(cdp_log["message"]).get("message")
        if not message:
            self._log.write("CDP性能日志无法获取message")
            return
        method = message.get("method")
        if not method:
            self._log.write("CDP性能日志无法获取method")
            return
        if method == "Network.requestWillBeSent":
            self._record_request(message["params"])
        elif method == "Network.responseReceived":
            self._record_response(message["params"])

    def _record_request(self, params: dict):
        """Store the request if its URL matches one of the patterns."""
        request: dict = params["request"]
        url = request.get("url")
        if not any(re.search(pattern, url) for pattern in self.url_patterns):
            return
        request_id = params["requestId"]
        self._log.write("network", "Network.requestWillBeSent", request_id, request)
        self.requests[request_id] = request

    def _record_response(self, params: dict):
        """Attach the response (and its body) to a previously stored request."""
        request_id = params["requestId"]
        if request_id not in self.requests:
            # Response for a request that was not recorded.
            return
        request = self.requests[request_id]
        response: dict = params["response"]
        try:
            response["body"] = self.driver.execute_cdp_cmd(
                "Network.getResponseBody", {"requestId": request_id}
            )
            self._log.write(
                "network", "Network.responseReceived", request_id, response
            )
            request["response"] = response
        except Exception as ex:
            # The response is only attached when its body could be fetched.
            self._log.write("CDP获取响应内容", type(ex), ex)

    def clear(self):
        """Drop all recorded requests and send the CDP ``Log.clear`` command.

        Errors are logged, not raised.
        """
        try:
            self.requests.clear()
            self.driver.execute_cdp_cmd("Log.clear", {})
        except Exception as ex:
            self._log.write("清理请求缓存", type(ex), ex)
posted @ 2024-10-22 17:54  孤独成派  阅读(523)  评论(0编辑  收藏  举报