[Python] Selenium监控网络请求
Selenium监控网络请求有两种方式:第一种是使用代理,第二种是使用CDP(Chrome DevTools Protocol)。下面分别介绍如何使用代理和CDP监控网络请求。
一、使用Selenium-Wire设置代理拦截处理请求。
Selenium-Wire是基于Selenium开发的抓包工具,基本使用方式如下:
from selenium import webdriver
from seleniumwire import webdriver as webdriverwire
from seleniumwire.request import Request, Response


def request_interceptor(request: Request):
    """Handle a request before it is sent; here it only prints method and URL."""
    print("request_interceptor", request.method, request.url)


def response_interceptor(request: Request, response: Response):
    """Handle a response together with its request; here it only prints a summary."""
    print("response_interceptor", request.method, request.url, response.status_code)


options = webdriver.ChromeOptions()
options.add_argument("--ignore-certificate-errors")

# Regular expressions selecting which request URLs are captured.
scopes = set()
scopes.add("https?://.+/confirm")

ignore_http_methods = [
    "OPTIONS",
    "HEAD",
    "CONNECT",
    "TRACE",
    "PATCH",
]

seleniumwire_options = {
    # Hosts that are excluded from the Selenium Wire proxy.
    "exclude_hosts": ["www.exclude.com"],
    # HTTP methods that are ignored.
    "ignore_http_methods": ignore_http_methods,
    "verify_ssl": False,  # do not verify certificates
    "enable_logging": True,
    "request_storage": "memory",  # keep captured requests in memory
    # "request_storage_base_dir": request_storage_base_dir,  # directory used for request storage
    "request_storage_max_size": 100,  # Store no more than 100 requests in memory
}

driver = webdriverwire.Chrome(
    options=options,
    seleniumwire_options=seleniumwire_options,
)
driver.request_interceptor = request_interceptor
driver.response_interceptor = response_interceptor
driver.scopes = list(scopes)
driver.get('https://www.baidu.com')
解释:
scopes:通过正则表达式过滤网络请求。
exclude_hosts:指定不经过Selenium-Wire代理的域名,这些域名的请求不会被捕获。
request_interceptor:拦截处理请求。
response_interceptor:拦截处理请求响应结果。
注意:
(1)要把https证书放到程序目录下的seleniumwire文件夹;若网页提示https不安全,需要把seleniumwire的https证书导入到系统"受信任的根证书颁发机构"中。
(2)部分网站无法使用seleniumwire打开,会提示502,需要设置代理,下面示范使用Fiddler代理。
使用Fiddler代理抓包:
1. 安装并运行Fiddler Classic,设置开启HTTPS抓包,保证软件开启就行,不需要实时抓包,Fiddler默认使用8888端口代理。
2. 设置seleniumwire_options使用Fiddler代理:
# Send Selenium Wire's traffic through the local Fiddler proxy (port 8888).
seleniumwire_options["proxy"] = {
    "https": "https://127.0.0.1:8888",
    "http": "http://127.0.0.1:8888",
    "no_proxy": "localhost,127.0.0.1",  # addresses that do not use the proxy
}
二、使用CDP协议监控网络请求。
"""CDP监控网络请求"""
import asyncio
import json
import re
import time
from typing import Dict, List
from selenium.webdriver.chrome.webdriver import WebDriver
from utils.log import ILog
class CdpNetworkMonitor:
    """Record matching network requests and responses from the driver's performance log.

    Entries are read with ``driver.get_log("performance")``; requests whose URL
    matches one of ``url_patterns`` are stored in ``requests`` keyed by the CDP
    ``requestId``, and the matching response is attached under ``"response"``.

    NOTE(review): assumes the driver was created with the performance log
    enabled -- confirm against the code that builds the driver.
    """

    def __init__(self, driver: "WebDriver", url_patterns: List[str], log: "ILog"):
        """
        :param driver: WebDriver used to read the performance log and send CDP commands
        :param url_patterns: regular expressions; a request is recorded when
            ``re.search`` of at least one pattern matches its URL
        :param log: object whose ``write(*args)`` method receives diagnostics
        """
        self.driver = driver
        # Regular expressions a request URL must match to be recorded.
        self.url_patterns = url_patterns
        self._log = log
        # Timestamp of the newest performance-log entry processed so far.
        self.last_log_timestamp = 0
        # Recorded requests, keyed by CDP requestId.
        self.requests: Dict[str, dict] = {}
        # time.time() of the last flush that actually read the log.
        self.last_flush_time: float = 0
        self._stop_event = asyncio.Event()
        # Strong reference to the polling task so it cannot be garbage
        # collected mid-execution and so start() can detect a live loop.
        self._flush_task = None

    def start(self):
        """Enable the CDP Network domain and schedule the periodic flush task.

        Must be called while an asyncio event loop is running. Without one the
        error is logged and no polling task is created; ``flush()`` can still
        be called manually. Calling ``start()`` while a polling task is alive
        does not create a second one.
        """
        try:
            self.driver.execute_cdp_cmd("Network.enable", {})
            self._stop_event.clear()
            if self._flush_task is not None and not self._flush_task.done():
                # The existing loop keeps running now that the stop flag is cleared.
                return
            # Look the loop up first so no coroutine object is created (and
            # left un-awaited) when there is no running loop.
            loop = asyncio.get_running_loop()
            self._flush_task = loop.create_task(self.__flush_periodically())
        except Exception as ex:
            self._log.write("启用网络监控", type(ex), ex)

    async def __flush_periodically(self):
        """Call ``flush()`` once per second until ``stop()`` sets the stop flag."""
        self._log.write("定时读取性能日志")
        while not self._stop_event.is_set():
            # NOTE: flush() is a synchronous call, so the event loop does not
            # run other tasks until it returns.
            self.flush()
            await asyncio.sleep(1)

    def stop(self):
        """Signal the polling loop to finish and disable the CDP Network domain.

        Errors are logged, not raised.
        """
        try:
            self._stop_event.set()
            self.driver.execute_cdp_cmd("Network.disable", {})
        except Exception as ex:
            self._log.write("禁用网络监控", type(ex), ex)

    def flush(self, flush_interval=0):
        """Read the performance log and update ``requests``.

        :param flush_interval: minimum number of seconds between two reads;
            when non-zero and less time has passed since the previous read,
            the call returns without touching the driver
        """
        self._log.write("CDP读取性能日志")
        timestamp = time.time()
        if flush_interval and timestamp - self.last_flush_time < flush_interval:
            return
        self.last_flush_time = timestamp
        try:
            cdp_logs: List[dict] = self.driver.get_log("performance")
            for cdp_log in cdp_logs:
                # One malformed entry must not abort the rest of the batch.
                try:
                    self._process_log_entry(cdp_log)
                except Exception as ex:
                    self._log.write("CDP解析单个性能日志", type(ex), ex)
        except Exception as ex:
            self._log.write("CDP读取性能日志", type(ex), ex)

    def _process_log_entry(self, cdp_log):
        """Decode one performance-log entry and dispatch it by CDP method."""
        log_timestamp = cdp_log.get("timestamp")
        if log_timestamp is not None:
            if log_timestamp < self.last_log_timestamp:
                # Older than something already processed: treat as already read.
                return
            self.last_log_timestamp = log_timestamp
        message = json.loads(cdp_log["message"]).get("message")
        if not message:
            self._log.write("CDP性能日志无法获取message")
            return
        method = message.get("method")
        if not method:
            self._log.write("CDP性能日志无法获取method")
            return
        if method == "Network.requestWillBeSent":
            self._record_request(message["params"])
        elif method == "Network.responseReceived":
            self._record_response(message["params"])

    def _record_request(self, params):
        """Store the request when its URL matches one of ``url_patterns``."""
        request = params["request"]
        url = request.get("url")
        if not url or not any(
            re.search(url_pattern, url) for url_pattern in self.url_patterns
        ):
            return
        request_id = params["requestId"]
        self._log.write("network", "Network.requestWillBeSent", request_id, request)
        self.requests[request_id] = request

    def _record_response(self, params):
        """Attach the response (and its body, when retrievable) to a recorded request."""
        request_id = params["requestId"]
        request = self.requests.get(request_id)
        if request is None:
            # Response for a request that was not recorded.
            return
        response = params["response"]
        try:
            response["body"] = self.driver.execute_cdp_cmd(
                "Network.getResponseBody", {"requestId": request_id}
            )
        except Exception as ex:
            # Fetching the body can fail; keep the response metadata anyway
            # instead of dropping the whole response.
            self._log.write("CDP获取响应内容", type(ex), ex)
        self._log.write("network", "Network.responseReceived", request_id, response)
        request["response"] = response

    def clear(self):
        """Drop all recorded requests and send the CDP ``Log.clear`` command.

        Errors are logged, not raised.
        """
        try:
            self.requests.clear()
            self.driver.execute_cdp_cmd("Log.clear", {})
        except Exception as ex:
            self._log.write("清理请求缓存", type(ex), ex)
解释:
通过读取性能日志获取网络相关的内容,解析出请求和响应数据。
CDP也可以拦截并修改请求,但需要轮询性能日志,并通过execute_cdp_cmd及时处理被拦截的请求;若处理不及时,被拦截的请求会失败。假如在driver.get()打开页面期间存在未处理的请求导致网页打不开,程序会一直卡在driver.get()。
"""CDP监控网络请求"""
import asyncio
import json
import re
import time
from typing import Dict, List
from selenium.webdriver.chrome.webdriver import WebDriver
from utils.log import ILog
class CdpNetworkMonitor:
    """Record matching network requests and responses from the driver's performance log.

    Entries are read with ``driver.get_log("performance")``; requests whose URL
    matches one of ``url_patterns`` are stored in ``requests`` keyed by the CDP
    ``requestId``, and the matching response is attached under ``"response"``.

    NOTE(review): assumes the driver was created with the performance log
    enabled -- confirm against the code that builds the driver.
    """

    def __init__(self, driver: "WebDriver", url_patterns: List[str], log: "ILog"):
        """
        :param driver: WebDriver used to read the performance log and send CDP commands
        :param url_patterns: regular expressions; a request is recorded when
            ``re.search`` of at least one pattern matches its URL
        :param log: object whose ``write(*args)`` method receives diagnostics
        """
        self.driver = driver
        # Regular expressions a request URL must match to be recorded.
        self.url_patterns = url_patterns
        self._log = log
        # Timestamp of the newest performance-log entry processed so far.
        self.last_log_timestamp = 0
        # Recorded requests, keyed by CDP requestId.
        self.requests: Dict[str, dict] = {}
        # time.time() of the last flush that actually read the log.
        self.last_flush_time: float = 0
        self._stop_event = asyncio.Event()
        # Strong reference to the polling task so it cannot be garbage
        # collected mid-execution and so start() can detect a live loop.
        self._flush_task = None

    def start(self):
        """Enable the CDP Network domain and schedule the periodic flush task.

        Must be called while an asyncio event loop is running. Without one the
        error is logged and no polling task is created; ``flush()`` can still
        be called manually. Calling ``start()`` while a polling task is alive
        does not create a second one.
        """
        try:
            self.driver.execute_cdp_cmd("Network.enable", {})
            self._stop_event.clear()
            if self._flush_task is not None and not self._flush_task.done():
                # The existing loop keeps running now that the stop flag is cleared.
                return
            # Look the loop up first so no coroutine object is created (and
            # left un-awaited) when there is no running loop.
            loop = asyncio.get_running_loop()
            self._flush_task = loop.create_task(self.__flush_periodically())
        except Exception as ex:
            self._log.write("启用网络监控", type(ex), ex)

    async def __flush_periodically(self):
        """Call ``flush()`` once per second until ``stop()`` sets the stop flag."""
        self._log.write("定时读取性能日志")
        while not self._stop_event.is_set():
            # NOTE: flush() is a synchronous call, so the event loop does not
            # run other tasks until it returns.
            self.flush()
            await asyncio.sleep(1)

    def stop(self):
        """Signal the polling loop to finish and disable the CDP Network domain.

        Errors are logged, not raised.
        """
        try:
            self._stop_event.set()
            self.driver.execute_cdp_cmd("Network.disable", {})
        except Exception as ex:
            self._log.write("禁用网络监控", type(ex), ex)

    def flush(self, flush_interval=0):
        """Read the performance log and update ``requests``.

        :param flush_interval: minimum number of seconds between two reads;
            when non-zero and less time has passed since the previous read,
            the call returns without touching the driver
        """
        self._log.write("CDP读取性能日志")
        timestamp = time.time()
        if flush_interval and timestamp - self.last_flush_time < flush_interval:
            return
        self.last_flush_time = timestamp
        try:
            cdp_logs: List[dict] = self.driver.get_log("performance")
            for cdp_log in cdp_logs:
                # One malformed entry must not abort the rest of the batch.
                try:
                    self._process_log_entry(cdp_log)
                except Exception as ex:
                    self._log.write("CDP解析单个性能日志", type(ex), ex)
        except Exception as ex:
            self._log.write("CDP读取性能日志", type(ex), ex)

    def _process_log_entry(self, cdp_log):
        """Decode one performance-log entry and dispatch it by CDP method."""
        log_timestamp = cdp_log.get("timestamp")
        if log_timestamp is not None:
            if log_timestamp < self.last_log_timestamp:
                # Older than something already processed: treat as already read.
                return
            self.last_log_timestamp = log_timestamp
        message = json.loads(cdp_log["message"]).get("message")
        if not message:
            self._log.write("CDP性能日志无法获取message")
            return
        method = message.get("method")
        if not method:
            self._log.write("CDP性能日志无法获取method")
            return
        if method == "Network.requestWillBeSent":
            self._record_request(message["params"])
        elif method == "Network.responseReceived":
            self._record_response(message["params"])

    def _record_request(self, params):
        """Store the request when its URL matches one of ``url_patterns``."""
        request = params["request"]
        url = request.get("url")
        if not url or not any(
            re.search(url_pattern, url) for url_pattern in self.url_patterns
        ):
            return
        request_id = params["requestId"]
        self._log.write("network", "Network.requestWillBeSent", request_id, request)
        self.requests[request_id] = request

    def _record_response(self, params):
        """Attach the response (and its body, when retrievable) to a recorded request."""
        request_id = params["requestId"]
        request = self.requests.get(request_id)
        if request is None:
            # Response for a request that was not recorded.
            return
        response = params["response"]
        try:
            response["body"] = self.driver.execute_cdp_cmd(
                "Network.getResponseBody", {"requestId": request_id}
            )
        except Exception as ex:
            # Fetching the body can fail; keep the response metadata anyway
            # instead of dropping the whole response.
            self._log.write("CDP获取响应内容", type(ex), ex)
        self._log.write("network", "Network.responseReceived", request_id, response)
        request["response"] = response

    def clear(self):
        """Drop all recorded requests and send the CDP ``Log.clear`` command.

        Errors are logged, not raised.
        """
        try:
            self.requests.clear()
            self.driver.execute_cdp_cmd("Log.clear", {})
        except Exception as ex:
            self._log.write("清理请求缓存", type(ex), ex)