使用 WebBrowser 获取Ajax动态加载网页信息

直接上代码(代码较粗糙,可根据需要优化):

在控制台程序中直接创建 WebBrowser 时会抛出线程单元相关的异常:WebBrowser 是 ActiveX 控件,只能在单线程单元(STA)线程中实例化。我的解决方法是:在 Main 方法上添加 [STAThread] 特性,将线程模型指定为单线程单元:

[STAThread]
static void Main(string[] args)

using System;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.Diagnostics;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;

namespace CrawlerTest
{
    public class HttpHelper
    {
        /// <summary>
        /// Loads <paramref name="url"/> in a WinForms <see cref="WebBrowser"/> control, pumps the
        /// message loop so the page's scripts can run, and returns the rendered body markup.
        /// </summary>
        /// <remarks>
        /// The <see cref="WebBrowser"/> control must be created on a single-threaded-apartment
        /// thread, so the calling program's entry point needs the <c>[STAThread]</c> attribute.
        /// The method blocks for at least 3 seconds (to give Ajax content time to arrive) and for
        /// at most 30 seconds.
        /// </remarks>
        /// <param name="url">Address of the page to load.</param>
        /// <returns>
        /// The URL-decoded <c>OuterHtml</c> of the document body, or <c>null</c> when the page did
        /// not finish loading in time, had no body, or an exception was raised.
        /// </returns>
        public static string DownloadAjaxHtml(string url)
        {
            // Minimum settle time for dynamically loaded content, and a hard upper bound so a page
            // that never reaches Complete cannot hang the caller forever.
            const double MinWaitSeconds = 3;
            const double MaxWaitSeconds = 30;

            try
            {
                // The control wraps a COM/ActiveX browser instance; dispose it deterministically.
                using (WebBrowser wb = new WebBrowser())
                {
                    wb.AllowNavigation = true;
                    wb.ScriptErrorsSuppressed = true;

                    wb.Navigate(url);

                    // Stopwatch is monotonic, unlike DateTime.Now which follows wall-clock changes.
                    Stopwatch elapsed = Stopwatch.StartNew();
                    while (elapsed.Elapsed.TotalSeconds <= MinWaitSeconds
                           || (wb.ReadyState != WebBrowserReadyState.Complete
                               && elapsed.Elapsed.TotalSeconds < MaxWaitSeconds))
                    {
                        // Let the browser control process its navigation/script messages.
                        Application.DoEvents();
                        // Yield briefly so the wait does not spin a CPU core at 100%.
                        System.Threading.Thread.Sleep(10);
                    }

                    if (wb.ReadyState != WebBrowserReadyState.Complete)
                    {
                        Console.WriteLine($"DownloadAjaxHtml-Error:timed out after {MaxWaitSeconds}s loading {url}");
                        return null;
                    }

                    if (wb.Document == null || wb.Document.Body == null)
                    {
                        return null;
                    }

                    // Decode percent-escapes in the markup.
                    // NOTE(review): UrlDecode also turns '+' into a space across the whole
                    // document - confirm that is acceptable for the pages being scraped.
                    return System.Web.HttpUtility.UrlDecode(wb.Document.Body.OuterHtml);
                }
            }
            catch (Exception ex)
            {
                // Best-effort helper: report the failure and signal it to the caller with null.
                Console.WriteLine($"DownloadAjaxHtml-Error:{ex.ToString()}");
            }

            return null;
        }
		
        /// <summary>
        /// Downloads the rendered page at the hard-coded news.cn URL and builds one
        /// <c>NewsHotTitle</c> for every <c>li</c> found under <c>ul.htList</c>.
        /// </summary>
        /// <param name="encoding">Currently unused; kept so existing callers continue to compile.</param>
        /// <returns>
        /// The extracted titles. The list is empty (never <c>null</c>) when the download fails or
        /// the page contains no matching list items.
        /// </returns>
        public static List<NewsHotTitle> GetHotTitle(Encoding encoding)
        {
            var list = new List<NewsHotTitle>();
            var url = "http://www.news.cn/2021homepro/rsznb/";

            string strHtml = HttpHelper.DownloadAjaxHtml(url);
            if (string.IsNullOrEmpty(strHtml))
            {
                Console.WriteLine($"获取数据失败");
                // Nothing to parse; stop here instead of passing null on to the HTML parser.
                return list;
            }

            // Fully qualified on purpose: System.Windows.Forms (imported by this file) also
            // declares a type named HtmlDocument, so the short name would be ambiguous.
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(strHtml);

            var hotlist = doc.DocumentNode.SelectNodes("//ul[@class='htList']//li");
            if (hotlist == null || hotlist.Count == 0)
            {
                Console.WriteLine($"获取数据失败");
                // No match yields null here, and iterating that would throw.
                return list;
            }

            foreach (var item in hotlist)
            {
                NewsHotTitle model = new NewsHotTitle();
                model.Title = HttpHelper.RemoveHtml(item.InnerHtml);
                model.PublishTime = DateTime.Now;

                Console.WriteLine($"{model.ToJson()}");

                // Collect the item; without this the method always returned an empty list.
                list.Add(model);
            }

            return list;
        }
    }
}

  

posted @ 2021-12-13 16:01  醉裏挑燈看劍  阅读(289)  评论(0编辑  收藏  举报