用selenium+浏览器的performance log
具体方法是:webdriver(Python 代码用来控制浏览器的组件)可以让我们向浏览器发送 Chrome DevTools Protocol(CDP)的 Network.getResponseBody 命令来获取 response。该命令所在的 CDP Network 域文档:
https://chromedevtools.github.io/devtools-protocol/tot/Network/
需要我们通过一个叫做requestId的参数才能得到response。
首先,初始化一个浏览器控制实例的时候,要在 goog:loggingPrefs 里开启 {"performance": "ALL"},这样之后才能读取浏览器的 performance log:
def __init_driver(self):
    """Start a Chrome session with the performance log enabled.

    The performance log has to be requested when the session is created;
    it is read back later with ``get_log("performance")``. The new driver
    is stored on ``self.__driver``.
    """
    # Work on a copy: DesiredCapabilities.CHROME is a shared class-level
    # dict, so mutating it in place would leak the logging preference into
    # every other driver created in this process.
    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities["goog:loggingPrefs"] = {"performance": "ALL"}  # chromedriver 75+
    option = webdriver.ChromeOptions()
    # Keep the Chrome profile under ./var/chrome-data.
    option.add_argument(r"user-data-dir=./var/chrome-data")
    # NOTE(review): newer Selenium releases deprecate the
    # `desired_capabilities` argument in favour of
    # `option.set_capability("goog:loggingPrefs", ...)` -- confirm against
    # the Selenium version this project pins.
    self.__driver = webdriver.Chrome(desired_capabilities=capabilities, options=option)
然后
def __scrape(self, url):
    """Open ``url`` in the controlled browser and read its performance log."""
    browser = self.__driver
    browser.get(url)
    # Wait for the requests issued by the page to complete.
    time.sleep(3)
    logs = browser.get_log("performance")
logs 里面是浏览器记录的 performance 事件(每条记录的 message 字段是一个 JSON 字符串),其中包含页面中各个请求和响应对应的事件。我们接下来需要遍历里面的每一条数据,找到属于 Network.responseReceived 类型、并且 response 的 url 是我们要抓取的那些事件,从中拿到 requestId,然后就可以用 Network.getResponseBody 拿到 response 了。
def process_network_event(driver, logs, match_url):
    """Return the parsed JSON body of the first matching network response.

    Scans performance-log entries for ``Network.responseReceived`` events
    whose response URL starts with ``match_url``, fetches the body through
    the ``Network.getResponseBody`` CDP command and parses it as JSON.

    Args:
        driver: Object exposing ``execute_cdp_cmd(command, params)``.
        logs: Iterable of log entries; each entry's ``"message"`` value is a
            JSON string wrapping the event under a ``"message"`` key.
        match_url: URL prefix identifying the responses of interest.

    Returns:
        The decoded JSON value of the first matching response that has a
        non-empty body, or ``None`` when no entry qualifies.

    Raises:
        ValueError: If a matching body is not valid JSON
            (``json.JSONDecodeError`` is a subclass).
    """
    # Local import so the block does not rely on a file-level import.
    import base64

    for entry in logs:
        message = json.loads(entry["message"]).get("message", {})
        # Exact match: sibling events such as
        # Network.responseReceivedExtraInfo carry no response URL.
        if message.get("method", "") != "Network.responseReceived":
            continue

        params = message.get("params", {})
        url = params.get("response", {}).get("url", "")
        # Skip missing/null URLs and URLs we are not interested in.
        if not url or not url.startswith(match_url):
            continue

        request_id = params.get("requestId", "")
        if not request_id:
            continue

        try:
            response_body = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
        except Exception as e:
            # Best effort: a failed lookup for one request must not abort
            # the scan of the remaining entries.
            print(f"getResponseBody by {request_id} failed: {e}, with message: {message}")
            continue
        if not response_body:
            continue

        body = response_body.get("body", "")
        if not body:
            continue
        # The command reports base64Encoded=True when the body is sent as
        # base64; decode it first (json.loads accepts the resulting bytes).
        if response_body.get("base64Encoded"):
            body = base64.b64decode(body)
        return json.loads(body)

    return None