...

Selenium中如何抓取网络请求响应及WebSocket信息

我们在使用Selenium测试Web或Electronjs/Cef框架应用时,有时候操作一个元素需要判断是否发送了请求以及请求的参数是否正确

我们可以通过开启Chrome的性能日志,然后配合driver.get_log("performance")来查看请求,再对Network相关的日志进行过滤,
实现如下:

获取Chrome性能日志

import json
from pprint import pprint
from selenium import webdriver

# Capabilities for a headless Chrome session with the performance log enabled.
caps = {
    'browserName': 'chrome',
    'version': '',
    'platform': 'ANY',
    # Record all performance-log entries; the Network.* events read below come from this log.
    'goog:loggingPrefs': {'performance': 'ALL'},
    # Run Chrome in headless mode (no visible window).
    'goog:chromeOptions': {'extensions': [], 'args': ['--headless']},
}

# Method-name prefixes of the events we want to show: responses, requests and WebSocket traffic.
NETWORK_EVENT_PREFIXES = ('Network.response', 'Network.request', 'Network.webSocket')

# NOTE(review): Selenium 4 deprecates the desired_capabilities argument (removed in 4.10);
# on newer versions pass the same capabilities through ChromeOptions.set_capability
# -- confirm against the installed Selenium version.
driver = webdriver.Chrome(desired_capabilities=caps)
try:
    driver.get('https://httpbin.org/get')
    logs = driver.get_log("performance")
    for item in logs:
        # Each entry's "message" field is a JSON string; the event itself sits under its "message" key.
        log = json.loads(item["message"])["message"]
        # Print only the network-related events instead of every performance-log entry.
        if log["method"].startswith(NETWORK_EVENT_PREFIXES):
            pprint(log)
finally:
    # Always shut the browser down, even if navigation or log parsing fails.
    driver.quit()

运行结果如下:

{'method': 'Network.responseReceived',
 'params': {'frameId': '2445B94E9E1DB51A1B1F4F3B0A3F03F5',
            'loaderId': 'D0DE1754D5C5F1E54DC3B0DB2A09ADD6',
            'requestId': 'D0DE1754D5C5F1E54DC3B0DB2A09ADD6',
            'response': {'connectionId': 0,
                         'connectionReused': False,
                         'encodedDataLength': -1,
                         'fromDiskCache': False,
                         'fromPrefetchCache': False,
                         'fromServiceWorker': False,
                         'headers': {'Content-Type': 'text/plain;charset=US-ASCII'},
                         'mimeType': 'text/plain',
                         'protocol': 'data',
                         'remoteIPAddress': '',
                         'remotePort': 0,
                         'securityState': 'secure',
                         'status': 200,
                         'statusText': 'OK',
                         'url': 'data:,'},
            'timestamp': 57524.763168,
            'type': 'Document'}}
{'method': 'Network.requestWillBeSent',
 'params': {'documentURL': 'https://httpbin.org/get',
            'frameId': '2445B94E9E1DB51A1B1F4F3B0A3F03F5',
            'hasUserGesture': False,
            'initiator': {'type': 'other'},
            'loaderId': '8BB61F3D2448E8BC91A4A5AD7E690673',
            'request': {'headers': {'Upgrade-Insecure-Requests': '1',
                                    'User-Agent': 'Mozilla/5.0 (Macintosh; '
                                                  'Intel Mac OS X 10_15_7) '
                                                  'AppleWebKit/537.36 (KHTML, '
                                                  'like Gecko) '
                                                  'HeadlessChrome/91.0.4472.114 '
                                                  'Safari/537.36'},
                        'initialPriority': 'VeryHigh',
                        'method': 'GET',
                        'mixedContentType': 'none',
                        'referrerPolicy': 'strict-origin-when-cross-origin',
                        'url': 'https://httpbin.org/get'},
            'requestId': '8BB61F3D2448E8BC91A4A5AD7E690673',
            'timestamp': 57524.961438,
            'type': 'Document',
            'wallTime': 1626501610.512192}}
{'method': 'Network.requestWillBeSentExtraInfo',
 'params': {'associatedCookies': [],
            'headers': {':authority': 'httpbin.org',
                        ':method': 'GET',
                        ':path': '/get',
                        ':scheme': 'https',
                        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                        'accept-encoding': 'gzip, deflate, br',
                        'accept-language': 'en-US',
                        'sec-fetch-dest': 'document',
                        'sec-fetch-mode': 'navigate',
                        'sec-fetch-site': 'none',
                        'sec-fetch-user': '?1',
                        'upgrade-insecure-requests': '1',
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X '
                                      '10_15_7) AppleWebKit/537.36 (KHTML, '
                                      'like Gecko) '
                                      'HeadlessChrome/91.0.4472.114 '
                                      'Safari/537.36'},
            'requestId': '8BB61F3D2448E8BC91A4A5AD7E690673'}}
{'method': 'Network.responseReceivedExtraInfo',
 'params': {'blockedCookies': [],
            'headers': {'access-control-allow-credentials': 'true',
                        'access-control-allow-origin': '*',
                        'content-length': '754',
                        'content-type': 'application/json',
                        'date': 'Sat, 17 Jul 2021 06:00:11 GMT',
                        'server': 'gunicorn/19.9.0'},
            'requestId': '8BB61F3D2448E8BC91A4A5AD7E690673',
            'resourceIPAddressSpace': 'Public'}}
{'method': 'Network.responseReceived',
 'params': {'frameId': '2445B94E9E1DB51A1B1F4F3B0A3F03F5',
            'loaderId': '8BB61F3D2448E8BC91A4A5AD7E690673',
            'requestId': '8BB61F3D2448E8BC91A4A5AD7E690673',
            'response': {'connectionId': 12,
                         'connectionReused': False,
                         'encodedDataLength': 123,
                         'fromDiskCache': False,
                         'fromPrefetchCache': False,
                         'fromServiceWorker': False,
                         'headers': {'access-control-allow-credentials': 'true',
                                     'access-control-allow-origin': '*',
                                     'content-length': '754',
                                     'content-type': 'application/json',
                                     'date': 'Sat, 17 Jul 2021 06:00:11 GMT',
                                     'server': 'gunicorn/19.9.0'},
                         'mimeType': 'application/json',
                         'protocol': 'h2',
                         'remoteIPAddress': '52.201.75.114',
                         'remotePort': 443,
                         'requestHeaders': {':authority': 'httpbin.org',
                                            ':method': 'GET',
                                            ':path': '/get',
                                            ':scheme': 'https',
                                            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                                            'accept-encoding': 'gzip, deflate, '
                                                               'br',
                                            'accept-language': 'en-US',
                                            'sec-fetch-dest': 'document',
                                            'sec-fetch-mode': 'navigate',
                                            'sec-fetch-site': 'none',
                                            'sec-fetch-user': '?1',
                                            'upgrade-insecure-requests': '1',
                                            'user-agent': 'Mozilla/5.0 '
                                                          '(Macintosh; Intel '
                                                          'Mac OS X 10_15_7) '
                                                          'AppleWebKit/537.36 '
                                                          '(KHTML, like Gecko) '
                                                          'HeadlessChrome/91.0.4472.114 '
                                                          'Safari/537.36'},
                         'responseTime': 1626501611316.694,
                         'securityDetails': {'certificateId': 0,
                                             'certificateTransparencyCompliance': 'unknown',
                                             'cipher': 'AES_128_GCM',
                                             'issuer': 'Amazon',
                                             'keyExchange': 'ECDHE_RSA',
                                             'keyExchangeGroup': 'P-256',
                                             'protocol': 'TLS 1.2',
                                             'sanList': ['httpbin.org',
                                                         '*.httpbin.org'],
                                             'signedCertificateTimestampList': [],
                                             'subjectName': 'httpbin.org',
                                             'validFrom': 1608508800,
                                             'validTo': 1642636799},
                         'securityState': 'secure',
                         'status': 200,
                         'statusText': '',
                         'timing': {'connectEnd': 548.386,
                                    'connectStart': 26.524,
                                    'dnsEnd': 26.524,
                                    'dnsStart': 14.11,
                                    'proxyEnd': -1,
                                    'proxyStart': -1,
                                    'pushEnd': 0,
                                    'pushStart': 0,
                                    'receiveHeadersEnd': 803.146,
                                    'requestTime': 57524.962922,
                                    'sendEnd': 548.745,
                                    'sendStart': 548.611,
                                    'sslEnd': 548.36,
                                    'sslStart': 277.934,
                                    'workerFetchStart': -1,
                                    'workerReady': -1,
                                    'workerRespondWithSettled': -1,
                                    'workerStart': -1},
                         'url': 'https://httpbin.org/get'},
            'timestamp': 57525.76746,
            'type': 'Document'}}

获取请求及响应信息

由于日志中没有接口的请求体(POST)数据和响应体数据,我们可以通过执行CDP命令获取,修改后代码如下:

import json
from pprint import pprint
from selenium import webdriver
from selenium.common.exceptions import WebDriverException


# Capabilities for a headless Chrome session with the performance log enabled.
caps = {
    'browserName': 'chrome',
    'version': '',
    'platform': 'ANY',
    'goog:loggingPrefs': {'performance': 'ALL'},
    'goog:chromeOptions': {'extensions': [], 'args': ['--headless']},
}

# NOTE(review): Selenium 4 deprecates the desired_capabilities argument (removed in 4.10);
# on newer versions pass the same capabilities through ChromeOptions.set_capability
# -- confirm against the installed Selenium version.
driver = webdriver.Chrome(desired_capabilities=caps)
try:
    driver.get('https://httpbin.org/get')
    logs = driver.get_log("performance")
    for item in logs:
        # Each entry's "message" field is a JSON string; the event itself sits under its "message" key.
        log = json.loads(item["message"])["message"]
        if log["method"] != 'Network.responseReceived':
            continue

        params = log['params']
        response = params['response']
        url = response['url']
        # Skip the initial blank "data:," page. Responses can be filtered further
        # by resource type via log['params']['type'] (e.g. 'Document').
        if url == 'data:,':
            continue
        print('请求', url)
        request_id = params['requestId']

        # requestHeaders and responseTime are not present on every response
        # (the "data:," response has neither), so read them defensively.
        request_headers = response.get('requestHeaders')
        response_headers = response['headers']
        response_time = response.get('responseTime')
        status_code = response['status']

        try:
            request_data = driver.execute_cdp_cmd('Network.getRequestPostData', {'requestId': request_id})
        except WebDriverException:  # raised when the request carries no POST data
            request_data = None

        try:
            response_body = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})['body']
        except WebDriverException:
            # Presumably Chrome has no body for this request id (e.g. a redirect
            # or an already-discarded resource); report it as missing instead of aborting.
            response_body = None
        print('响应', response_body)
finally:
    # Always shut the browser down, even if a CDP command or log parsing fails.
    driver.quit()

执行后显示如下:

请求 https://httpbin.org/get
响应 {
  "args": {}, 
  "headers": {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 
    "Accept-Encoding": "gzip, deflate, br", 
    "Accept-Language": "en-US", 
    "Cache-Control": "max-age=0", 
    "Host": "httpbin.org", 
    "Sec-Fetch-Dest": "document", 
    "Sec-Fetch-Mode": "navigate", 
    "Sec-Fetch-Site": "none", 
    "Sec-Fetch-User": "?1", 
    "Upgrade-Insecure-Requests": "1", 
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/91.0.4472.114 Safari/537.36", 
    "X-Amzn-Trace-Id": "Root=1-60f2dd9d-6533f9526707f25f7d6c38de"
  }, 
  "origin": "123.118.150.190", 
  "url": "https://httpbin.org/get"
}

参考:How to Capture Network Traffic When Scraping with Selenium & Python
Chrome DevTools Protocol

posted @ 2021-07-17 14:02  韩志超  阅读(5300)  评论(0)  编辑  收藏  举报