使用 WebBrowser 获取Ajax动态加载网页信息
直接上代码(代码较粗糙,可根据需要优化):
WebBrowser 是 ActiveX 控件,只能在单线程单元(STA)线程中创建;如果直接在默认的控制台线程中创建,会抛出 ThreadStateException(提示当前线程不在单线程单元中)。我的解决方法是:在 Main 方法上添加 [STAThread] 特性,将线程模型指定为单线程单元:
[STAThread]
static void Main(string[] args)
using System;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;

namespace CrawlerTest
{
    /// <summary>
    /// Helpers for fetching pages whose content is filled in by Ajax after the
    /// initial load, by rendering them in a WinForms <c>WebBrowser</c> control.
    /// </summary>
    public class HttpHelper
    {
        /// <summary>
        /// Minimum time, in seconds, to keep pumping messages after navigation
        /// starts, so scripts on the page get a chance to run.
        /// </summary>
        private const double MinRenderSeconds = 3;

        /// <summary>
        /// Upper bound, in seconds, on the total wait. Without it a page that
        /// never reaches the Complete state would block the caller forever.
        /// </summary>
        private const double MaxWaitSeconds = 60;

        /// <summary>
        /// Navigates a <c>WebBrowser</c> control to <paramref name="url"/>, waits for
        /// the document to finish loading, and returns the rendered body markup.
        /// </summary>
        /// <param name="url">Address of the page to load.</param>
        /// <returns>
        /// The URL-decoded outer HTML of the document body, or <c>null</c> when the
        /// page did not finish loading in time, has no body, or an exception occurred
        /// (the exception is written to the console).
        /// </returns>
        public static string DownloadAjaxHtml(string url)
        {
            string htmlstr = null;
            try
            {
                // WebBrowser wraps a native control, so release it deterministically.
                using (WebBrowser wb = new WebBrowser())
                {
                    wb.AllowNavigation = true;
                    wb.ScriptErrorsSuppressed = true;
                    wb.Navigate(url);

                    DateTime started = DateTime.UtcNow;
                    double elapsed = 0;

                    // Pump the message loop for at least MinRenderSeconds, then until
                    // the document is complete or MaxWaitSeconds has passed.
                    while (elapsed <= MinRenderSeconds
                        || (wb.ReadyState != WebBrowserReadyState.Complete && elapsed <= MaxWaitSeconds))
                    {
                        Application.DoEvents();
                        // Yield briefly so the wait does not spin a CPU core at 100%.
                        System.Threading.Thread.Sleep(10);
                        elapsed = (DateTime.UtcNow - started).TotalSeconds;
                    }

                    if (wb.ReadyState == WebBrowserReadyState.Complete)
                    {
                        // Document or Body may be null (e.g. non-HTML content).
                        var body = wb.Document?.Body;
                        if (body != null)
                        {
                            // Decode, as the original code did.
                            // NOTE(review): UrlDecode also turns '+' into spaces and
                            // expands %xx sequences anywhere in the markup - confirm
                            // this is wanted for whole-page HTML.
                            htmlstr = System.Web.HttpUtility.UrlDecode(body.OuterHtml);
                        }
                    }
                    else
                    {
                        Console.WriteLine($"DownloadAjaxHtml-Timeout:{url}");
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"DownloadAjaxHtml-Error:{ex.ToString()}");
            }

            return htmlstr;
        }

        /// <summary>
        /// Downloads the news.cn hot-list page and extracts one
        /// <c>NewsHotTitle</c> per <c>li</c> found under <c>ul.htList</c>.
        /// </summary>
        /// <param name="encoding">
        /// Not used by this method; kept so existing callers continue to compile.
        /// </param>
        /// <returns>
        /// The extracted titles. Empty (never <c>null</c>) when the page could not be
        /// downloaded or contains no matching nodes.
        /// </returns>
        public static List<NewsHotTitle> GetHotTitle(Encoding encoding)
        {
            var list = new List<NewsHotTitle>();

            var url = "http://www.news.cn/2021homepro/rsznb/";
            string strHtml = HttpHelper.DownloadAjaxHtml(url);
            if (string.IsNullOrEmpty(strHtml))
            {
                Console.WriteLine($"获取数据失败");
                // Nothing to parse; stop here instead of handing null to the parser.
                return list;
            }

            // NOTE(review): HtmlDocument / HtmlNode / HtmlNodeCollection are presumably
            // the HtmlAgilityPack types; their import is not visible in this file and
            // must be disambiguated from System.Windows.Forms.HtmlDocument - confirm.
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(strHtml);

            HtmlNodeCollection hotlist = doc.DocumentNode.SelectNodes("//ul[@class='htList']//li");
            if (hotlist == null || hotlist.Count == 0)
            {
                Console.WriteLine($"获取数据失败");
                // No matching nodes; iterating a null collection would throw.
                return list;
            }

            foreach (HtmlNode item in hotlist)
            {
                NewsHotTitle model = new NewsHotTitle();
                // RemoveHtml is not defined in the visible part of this class;
                // it is expected to be provided elsewhere.
                model.Title = HttpHelper.RemoveHtml(item.InnerHtml);
                model.PublishTime = DateTime.Now;
                Console.WriteLine($"{model.ToJson()}");
                // Collect the item; previously the list was returned empty.
                list.Add(model);
            }

            return list;
        }
    }
}