Selenium中如何抓取网络请求响应及WebSocket信息
我们在使用Selenium测试Web或Electronjs/Cef框架应用时,有时候操作一个元素需要判断是否发送了请求以及请求的参数是否正确
我们可以通过开启Chrome的性能日志,然后配合driver.get_log("performance")来查看请求,再对Network相关的日志进行过滤,
实现如下:
获取Chrome性能日志
import json
from pprint import pprint
from selenium import webdriver
# CDP event-name prefixes worth keeping: HTTP requests, HTTP responses and
# WebSocket traffic.
NETWORK_EVENT_PREFIXES = (
    'Network.response',
    'Network.request',
    'Network.webSocket',
)


def is_network_event(method):
    """Return True when a CDP method name is a request/response/WebSocket event.

    Args:
        method: The ``method`` field of a decoded performance-log message,
            e.g. ``'Network.responseReceived'``.
    """
    return method.startswith(NETWORK_EVENT_PREFIXES)


def main():
    """Open httpbin in headless Chrome and pretty-print its network events."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run without a visible window
    # Ask chromedriver to record the performance log, which carries the
    # Network.* DevTools events.
    options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    driver = webdriver.Chrome(options=options)
    try:
        driver.get('https://httpbin.org/get')
        logs = driver.get_log("performance")
    finally:
        # Always shut the browser down, even if navigation fails.
        driver.quit()

    for item in logs:
        # Each entry wraps the DevTools event as a JSON string under "message".
        log = json.loads(item["message"])["message"]
        if is_network_event(log["method"]):
            pprint(log)


if __name__ == "__main__":
    main()
运行结果如下:
{'method': 'Network.responseReceived',
'params': {'frameId': '2445B94E9E1DB51A1B1F4F3B0A3F03F5',
'loaderId': 'D0DE1754D5C5F1E54DC3B0DB2A09ADD6',
'requestId': 'D0DE1754D5C5F1E54DC3B0DB2A09ADD6',
'response': {'connectionId': 0,
'connectionReused': False,
'encodedDataLength': -1,
'fromDiskCache': False,
'fromPrefetchCache': False,
'fromServiceWorker': False,
'headers': {'Content-Type': 'text/plain;charset=US-ASCII'},
'mimeType': 'text/plain',
'protocol': 'data',
'remoteIPAddress': '',
'remotePort': 0,
'securityState': 'secure',
'status': 200,
'statusText': 'OK',
'url': 'data:,'},
'timestamp': 57524.763168,
'type': 'Document'}}
{'method': 'Network.requestWillBeSent',
'params': {'documentURL': 'https://httpbin.org/get',
'frameId': '2445B94E9E1DB51A1B1F4F3B0A3F03F5',
'hasUserGesture': False,
'initiator': {'type': 'other'},
'loaderId': '8BB61F3D2448E8BC91A4A5AD7E690673',
'request': {'headers': {'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; '
'Intel Mac OS X 10_15_7) '
'AppleWebKit/537.36 (KHTML, '
'like Gecko) '
'HeadlessChrome/91.0.4472.114 '
'Safari/537.36'},
'initialPriority': 'VeryHigh',
'method': 'GET',
'mixedContentType': 'none',
'referrerPolicy': 'strict-origin-when-cross-origin',
'url': 'https://httpbin.org/get'},
'requestId': '8BB61F3D2448E8BC91A4A5AD7E690673',
'timestamp': 57524.961438,
'type': 'Document',
'wallTime': 1626501610.512192}}
{'method': 'Network.requestWillBeSentExtraInfo',
'params': {'associatedCookies': [],
'headers': {':authority': 'httpbin.org',
':method': 'GET',
':path': '/get',
':scheme': 'https',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'none',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X '
'10_15_7) AppleWebKit/537.36 (KHTML, '
'like Gecko) '
'HeadlessChrome/91.0.4472.114 '
'Safari/537.36'},
'requestId': '8BB61F3D2448E8BC91A4A5AD7E690673'}}
{'method': 'Network.responseReceivedExtraInfo',
'params': {'blockedCookies': [],
'headers': {'access-control-allow-credentials': 'true',
'access-control-allow-origin': '*',
'content-length': '754',
'content-type': 'application/json',
'date': 'Sat, 17 Jul 2021 06:00:11 GMT',
'server': 'gunicorn/19.9.0'},
'requestId': '8BB61F3D2448E8BC91A4A5AD7E690673',
'resourceIPAddressSpace': 'Public'}}
{'method': 'Network.responseReceived',
'params': {'frameId': '2445B94E9E1DB51A1B1F4F3B0A3F03F5',
'loaderId': '8BB61F3D2448E8BC91A4A5AD7E690673',
'requestId': '8BB61F3D2448E8BC91A4A5AD7E690673',
'response': {'connectionId': 12,
'connectionReused': False,
'encodedDataLength': 123,
'fromDiskCache': False,
'fromPrefetchCache': False,
'fromServiceWorker': False,
'headers': {'access-control-allow-credentials': 'true',
'access-control-allow-origin': '*',
'content-length': '754',
'content-type': 'application/json',
'date': 'Sat, 17 Jul 2021 06:00:11 GMT',
'server': 'gunicorn/19.9.0'},
'mimeType': 'application/json',
'protocol': 'h2',
'remoteIPAddress': '52.201.75.114',
'remotePort': 443,
'requestHeaders': {':authority': 'httpbin.org',
':method': 'GET',
':path': '/get',
':scheme': 'https',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, '
'br',
'accept-language': 'en-US',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'none',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 '
'(Macintosh; Intel '
'Mac OS X 10_15_7) '
'AppleWebKit/537.36 '
'(KHTML, like Gecko) '
'HeadlessChrome/91.0.4472.114 '
'Safari/537.36'},
'responseTime': 1626501611316.694,
'securityDetails': {'certificateId': 0,
'certificateTransparencyCompliance': 'unknown',
'cipher': 'AES_128_GCM',
'issuer': 'Amazon',
'keyExchange': 'ECDHE_RSA',
'keyExchangeGroup': 'P-256',
'protocol': 'TLS 1.2',
'sanList': ['httpbin.org',
'*.httpbin.org'],
'signedCertificateTimestampList': [],
'subjectName': 'httpbin.org',
'validFrom': 1608508800,
'validTo': 1642636799},
'securityState': 'secure',
'status': 200,
'statusText': '',
'timing': {'connectEnd': 548.386,
'connectStart': 26.524,
'dnsEnd': 26.524,
'dnsStart': 14.11,
'proxyEnd': -1,
'proxyStart': -1,
'pushEnd': 0,
'pushStart': 0,
'receiveHeadersEnd': 803.146,
'requestTime': 57524.962922,
'sendEnd': 548.745,
'sendStart': 548.611,
'sslEnd': 548.36,
'sslStart': 277.934,
'workerFetchStart': -1,
'workerReady': -1,
'workerRespondWithSettled': -1,
'workerStart': -1},
'url': 'https://httpbin.org/get'},
'timestamp': 57525.76746,
'type': 'Document'}}
获取请求及响应信息
由于日志中没有接口的请求体(POST)数据和响应体数据,我们可以通过执行CDP命令获取,修改后代码如下:
import json
from pprint import pprint
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
def parse_response_event(log):
    """Summarise a ``Network.responseReceived`` performance-log message.

    Args:
        log: The decoded ``message`` object of one performance-log entry
            (a dict with ``method`` and ``params`` keys).

    Returns:
        A dict with the request id, URL, resource type, status code, headers
        and response time, or ``None`` when the message is not a
        ``Network.responseReceived`` event or belongs to the initial blank
        ``data:,`` page.
    """
    if log.get('method') != 'Network.responseReceived':
        return None
    params = log['params']
    response = params['response']
    url = response['url']
    if url == 'data:,':  # skip the browser's initial blank page
        return None
    return {
        'request_id': params['requestId'],
        'url': url,
        # The resource type (Document, XHR, ...) sits directly under
        # ``params``, not under ``params['response']``; use it to filter
        # by request type.
        'resource_type': params.get('type'),
        'status_code': response['status'],
        'response_headers': response['headers'],
        # Optional fields: not every response event carries them.
        'request_headers': response.get('requestHeaders'),
        'response_time': response.get('responseTime'),
    }


def fetch_cdp_field(driver, command, request_id, field):
    """Run a CDP command for one request and return a single result field.

    Args:
        driver: The Chrome WebDriver instance.
        command: CDP method name, e.g. ``'Network.getResponseBody'``.
        request_id: The ``requestId`` taken from the log event.
        field: Key to read from the command result.

    Returns:
        The requested field, or ``None`` when the command fails (for
        example the request had no post data, or the body is no longer
        available) or the field is absent.
    """
    try:
        result = driver.execute_cdp_cmd(command, {'requestId': request_id})
    except WebDriverException:
        return None
    return result.get(field)


def main():
    """Load httpbin in headless Chrome and print each response body."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run without a visible window
    # The performance log carries the Network.* DevTools events.
    options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    driver = webdriver.Chrome(options=options)
    try:
        driver.get('https://httpbin.org/get')
        for item in driver.get_log("performance"):
            log = json.loads(item["message"])["message"]
            info = parse_response_event(log)
            if info is None:
                continue
            print('请求', info['url'])
            request_data = fetch_cdp_field(
                driver, 'Network.getRequestPostData',
                info['request_id'], 'postData')
            if request_data is not None:
                print('请求数据', request_data)
            # NOTE: the CDP result also has a ``base64Encoded`` flag; binary
            # bodies come back base64-encoded and need decoding before use.
            response_body = fetch_cdp_field(
                driver, 'Network.getResponseBody',
                info['request_id'], 'body')
            print('响应', response_body)
    finally:
        # The CDP lookups need a live session, so quit only after the loop.
        driver.quit()


if __name__ == "__main__":
    main()
执行后显示如下:
请求 https://httpbin.org/get
响应 {
"args": {},
"headers": {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US",
"Cache-Control": "max-age=0",
"Host": "httpbin.org",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/91.0.4472.114 Safari/537.36",
"X-Amzn-Trace-Id": "Root=1-60f2dd9d-6533f9526707f25f7d6c38de"
},
"origin": "123.118.150.190",
"url": "https://httpbin.org/get"
}
参考:How to Capture Network Traffic When Scraping with Selenium & Python
Chrome DevTools Protocol