爬虫获取网页开发者模式 Network 信息
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using OpenQA.Selenium;
using OpenQA.Selenium.DevTools;
using OpenQA.Selenium.DevTools.V85.Network;
using DevToolsSessionDomains = OpenQA.Selenium.DevTools.V85.DevToolsSessionDomains;

// 2023.11.20 author by Zingu ft. NewBing
namespace PuppeteerSharp_Test
{
    /// <summary>
    /// Small Selenium demo that scrapes course links from a page and records the
    /// URLs of network responses reported through the DevTools Network domain.
    /// </summary>
    public class Demo
    {
        /// <summary>URL opened by the parameterless <see cref="RunAsync()"/> overload.</summary>
        private const string DefaultUrl = "https://open.163.com/newview/movie/free?pid=MA5T0OVML&mid=MA5T1488U";

        /// <summary>Fixed pause, in milliseconds, applied after navigating to a page.</summary>
        private const int PageLoadDelayMilliseconds = 500;

        // Keys of the per-response dictionaries stored in _responses.
        private const string RequestIdKey = "requestId";
        private const string UrlKey = "url";

        // One dictionary per received response, holding the "requestId" and "url" entries.
        private readonly List<Dictionary<string, string>> _responses = new List<Dictionary<string, string>>();

        // Guards _responses: the event handler adds entries while other methods read snapshots.
        private readonly object _gate = new object();

        /// <summary>
        /// Creates a new demo instance. Adjust the constructor as needed.
        /// </summary>
        public Demo()
        {
        }

        /// <summary>
        /// Opens <paramref name="url"/> in an Internet Explorer driver, pauses briefly, and
        /// collects the link of every element with class <c>course-link</c> found inside the
        /// element with class <c>course-list</c>.
        /// </summary>
        /// <param name="url">Address of the page to open.</param>
        /// <returns>
        /// One tuple per <c>course-link</c> element: <c>Item1</c> is the <c>href</c> attribute and
        /// <c>Item2</c> is the <c>title</c> attribute of the first <c>a</c> element inside it.
        /// </returns>
        public async Task<List<Tuple<string, string>>> GetListAsync(string url)
        {
            var links = new List<Tuple<string, string>>();

            using (IWebDriver driver = new OpenQA.Selenium.IE.InternetExplorerDriver())
            {
                driver.Navigate().GoToUrl(url);

                // Await the pause instead of blocking the calling thread with Thread.Sleep.
                await Task.Delay(PageLoadDelayMilliseconds);

                var courseList = driver.FindElement(By.ClassName("course-list"));
                var courseLinks = courseList.FindElements(By.ClassName("course-link"));

                foreach (var courseLink in courseLinks)
                {
                    var anchor = courseLink.FindElement(By.TagName("a"));
                    string href = anchor.GetAttribute("href");
                    string title = anchor.GetAttribute("title");
                    links.Add(Tuple.Create(href, title));
                }
            }

            return links;
        }

        /// <summary>
        /// Main usage flow: runs <see cref="RunAsync(string)"/> against the built-in sample URL.
        /// </summary>
        /// <returns>A task that completes when the run has finished.</returns>
        public Task RunAsync()
        {
            return RunAsync(DefaultUrl);
        }

        /// <summary>
        /// Opens <paramref name="url"/> in an Edge driver with the DevTools Network domain
        /// enabled, pauses briefly, prints every recorded response URL that contains
        /// <c>.mp4</c> or <c>.srt</c>, shuts the driver down, and then waits for a line of
        /// console input.
        /// </summary>
        /// <param name="url">Address of the page to open.</param>
        /// <returns>A task that completes when the run has finished.</returns>
        public async Task RunAsync(string url)
        {
            // Start from an empty list so a second run does not print URLs from an earlier one.
            lock (_gate)
            {
                _responses.Clear();
            }

            // Create the driver; no options are configured in this example.
            // A ChromeDriver can be used here instead of the EdgeDriver.
            var driver = new OpenQA.Selenium.Edge.EdgeDriver();
            try
            {
                // Create the DevTools session and its version-specific domains.
                var session = driver.GetDevToolsSession(85);
                var domains = session.GetVersionSpecificDomains<DevToolsSessionDomains>();

                // Enable the Network domain and subscribe to response events.
                await domains.Network.Enable(new OpenQA.Selenium.DevTools.V85.Network.EnableCommandSettings());
                domains.Network.ResponseReceived += Network_ResponseReceived;
                try
                {
                    // Open the target site.
                    driver.Url = url;

                    await Task.Delay(PageLoadDelayMilliseconds);

                    foreach (var mediaUrl in GetRequestUrl())
                    {
                        Console.WriteLine(mediaUrl);
                    }

                    // Disable the Network domain again.
                    await domains.Network.Disable();
                }
                finally
                {
                    // Always detach the handler so this instance is not kept subscribed.
                    domains.Network.ResponseReceived -= Network_ResponseReceived;
                }
            }
            finally
            {
                // Shut the driver down even when a step above throws.
                driver.Quit();
            }

            Console.ReadLine();
        }

        /// <summary>
        /// Returns the URLs of all recorded responses whose URL contains <c>.mp4</c> or <c>.srt</c>.
        /// </summary>
        /// <returns>The matching URLs, in the order the responses were recorded.</returns>
        private List<string> GetRequestUrl()
        {
            var urls = new List<string>();

            foreach (var response in SnapshotResponses())
            {
                string url = response[UrlKey];

                // Adjust the filter to your needs.
                if (IsMediaUrl(url))
                {
                    urls.Add(url);
                }
            }

            return urls;
        }

        /// <summary>
        /// Gets the request id of the last recorded response that has a non-null URL, printing
        /// every URL that contains <c>.mp4</c> or <c>.srt</c> along the way.
        /// </summary>
        /// <returns>The request id, or an empty string when no response qualifies.</returns>
        private string GetRequestId()
        {
            string requestId = "";

            foreach (var response in SnapshotResponses())
            {
                string url = response[UrlKey];
                if (url == null)
                {
                    continue;
                }

                if (IsMediaUrl(url))
                {
                    Console.WriteLine(url);
                }

                // Selection condition: adjust to your needs. Later matches overwrite earlier ones.
                requestId = response[RequestIdKey];
            }

            return requestId;
        }

        /// <summary>
        /// Handles the Network domain's response event by recording the request id and URL.
        /// </summary>
        /// <param name="sender">The event source.</param>
        /// <param name="e">The response information; only the request id and URL are kept.</param>
        private void Network_ResponseReceived(object sender, OpenQA.Selenium.DevTools.V85.Network.ResponseReceivedEventArgs e)
        {
            var entry = new Dictionary<string, string>();
            entry.Add(RequestIdKey, e.RequestId);
            entry.Add(UrlKey, e.Response.Url);

            // The list is shared with the reader methods, so add under the lock.
            lock (_gate)
            {
                _responses.Add(entry);
            }
        }

        /// <summary>
        /// Copies the recorded responses under the lock so callers can iterate without holding it.
        /// </summary>
        /// <returns>A copy of the recorded responses at the time of the call.</returns>
        private List<Dictionary<string, string>> SnapshotResponses()
        {
            lock (_gate)
            {
                return _responses.ToList();
            }
        }

        /// <summary>
        /// Tells whether <paramref name="url"/> is non-null and contains <c>.mp4</c> or <c>.srt</c>.
        /// </summary>
        /// <param name="url">The URL to test; may be null.</param>
        /// <returns><c>true</c> when the URL matches the filter; otherwise <c>false</c>.</returns>
        private static bool IsMediaUrl(string url)
        {
            return url != null && (url.Contains(".mp4") || url.Contains(".srt"));
        }
    }
}