打赏

还是处理视频下载所相关的问题。

有些网站的页面内容是在页面加载后由js动态生成的,因此其原始的html不能直接使用;我们需要的是页面渲染之后的代码。

在C#中,我用WebBrowser控件来处理:将项目类型设置为控制台程序,再添加一个Form来承载WebBrowser。

记录代码以做备忘:

using System;
using System.IO;
using System.Net;
using System.Runtime.InteropServices;
using System.Text;
using System.Windows.Forms;
using Microsoft.Win32;

namespace crpj
{
    /// <summary>
    /// Host window for the WebBrowser control. It never becomes visible and is
    /// COM-visible so that an instance can be handed to the browser as its scripting object.
    /// </summary>
    [ComVisible(true)]
    public class Form : System.Windows.Forms.Form
    {
        /// <summary>
        /// Ignores the requested visibility and always keeps the window hidden.
        /// </summary>
        /// <param name="value">Requested visibility; not used.</param>
        protected override void SetVisibleCore(bool value)
        {
            const bool alwaysHidden = false;
            base.SetVisibleCore(alwaysHidden);
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns it decoded as UTF-8 text.
        /// </summary>
        /// <param name="url">Address of the resource to download.</param>
        /// <returns>The downloaded content as a string.</returns>
        /// <remarks>
        /// NOTE(review): when an instance is used as WebBrowser.ObjectForScripting this method
        /// is callable from page script with an arbitrary address - confirm that is intended.
        /// </remarks>
        public string GetHtmlCode(string url)
        {
            var client = new WebClient();
            try
            {
                client.Encoding = Encoding.UTF8;
                string content = client.DownloadString(url);
                return content;
            }
            finally
            {
                client.Dispose();
            }
        }
    }

    /// <summary>
    /// Console tool that loads a page in a hidden WebBrowser control and writes the rendered
    /// HTML (the DOM after page scripts have run) to standard output as UTF-8.
    /// </summary>
    /// <remarks>
    /// Command line: <c>url [delay] [jscode]</c>.
    /// <c>delay</c> is the number of milliseconds to wait after the document has completed
    /// before the HTML is captured (zero or negative means "capture immediately");
    /// <c>jscode</c> is an optional script body that is injected into the page first.
    /// </remarks>
    class Program
    {
        // Fires once, "delay" milliseconds after the document completed, to capture the HTML.
        private static readonly Timer tmrGet = new Timer();
        // Safety net: captures whatever is available if the page never completes.
        private static readonly Timer tmrExit = new Timer();
        private static readonly WebBrowser browser = new WebBrowser();
        // Milliseconds to wait after document completion before capturing the HTML.
        private static int delay = 0;
        // Optional JavaScript body to inject into the loaded page.
        private static string jsCode;
        // Set once the HTML has been written so that a late timer tick cannot write it twice.
        private static bool outputDone = false;

        // Feature id passed to CoInternetSetFeatureEnabled to silence the navigation sound.
        const int FEATURE_DISABLE_NAVIGATION_SOUNDS = 21;
        const int SET_FEATURE_ON_PROCESS = 0x00000002;

        // Hard limit for the whole run, in milliseconds.
        const int ExitTimeoutMs = 90000;
        // Extra wait granted to an injected script before the HTML is captured, in milliseconds.
        const int ScriptSettleMs = 500;
        // FEATURE_BROWSER_EMULATION value written for this executable (IE11 rendering).
        const int BrowserEmulationValue = 11001;

        [DllImport("urlmon.dll")]
        [PreserveSig]
        [return: MarshalAs(UnmanagedType.Error)]
        static extern int CoInternetSetFeatureEnabled(
            int FeatureEntry,
            [MarshalAs(UnmanagedType.U4)] int dwFlags,
            bool fEnable);

        /// <summary>
        /// Application entry point.
        /// </summary>
        /// <param name="args">Argument list: url [delay] [jscode].</param>
        [STAThread]
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("error: You must provide at least one URL.");
                return;
            }

            // Validate the optional arguments up front so that a malformed delay produces a
            // readable message instead of an unhandled FormatException.
            if (args.Length > 1 &&
                !int.TryParse(
                    args[1],
                    System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out delay))
            {
                Console.WriteLine("error: delay must be an integer number of milliseconds.");
                Environment.ExitCode = 1;
                return;
            }
            if (args.Length > 2)
            {
                jsCode = args[2];
            }

            CoInternetSetFeatureEnabled(
                FEATURE_DISABLE_NAVIGATION_SOUNDS,
                SET_FEATURE_ON_PROCESS,
                true);
            CheckAndSetBrowserEmulation();

            var form = new Form();
            form.Controls.Add(browser);
            browser.ObjectForScripting = form;
            browser.ScriptErrorsSuppressed = true;
            browser.DocumentCompleted += browser_DocumentCompleted;
            browser.Navigate(args[0]);

            // Pages may still run initialisation scripts after loading, so the capture can be
            // deferred; the interval is chosen when the document completes.
            tmrGet.Tick += new EventHandler(tmrGet_Tick);

            // Some pages never raise the completed event, or take very long to do so.
            // This timer ends the run after ExitTimeoutMs (90 seconds) regardless.
            tmrExit.Tick += new EventHandler(tmrExit_Tick);
            tmrExit.Interval = ExitTimeoutMs;
            tmrExit.Start();

            Application.Run(form);
        }

        /// <summary>
        /// Timeout handler: captures whatever has been rendered so far and exits.
        /// </summary>
        static void tmrExit_Tick(object sender, EventArgs e)
        {
            OutputHtml();
        }

        /// <summary>
        /// Registers this executable under FEATURE_BROWSER_EMULATION for the current user so
        /// that the WebBrowser control renders pages as IE11. An existing value is kept.
        /// </summary>
        static void CheckAndSetBrowserEmulation()
        {
            try
            {
                string keyName = @"SOFTWARE\Microsoft\Internet Explorer\MAIN\FeatureControl\FEATURE_BROWSER_EMULATION";
                // CreateSubKey opens the key when it exists and creates it otherwise.
                // OpenSubKey would return null on a profile where the key is missing.
                using (RegistryKey key = Registry.CurrentUser.CreateSubKey(keyName))
                {
                    if (key == null)
                    {
                        return;
                    }

                    string valueName = Path.GetFileName(Application.ExecutablePath);
                    if (key.GetValue(valueName) == null)
                    {
                        key.SetValue(valueName, BrowserEmulationValue, RegistryValueKind.DWord);
                    }
                }
            }
            catch (Exception)
            {
                // Deliberately best effort: without registry access the control simply keeps
                // its default rendering mode, which must not prevent the page from loading.
            }
        }

        /// <summary>
        /// Delay handler: the post-load wait has elapsed, capture the HTML.
        /// </summary>
        static void tmrGet_Tick(object sender, EventArgs e)
        {
            tmrGet.Stop();
            OutputHtml();
        }

        /// <summary>
        /// Writes the outer HTML of the document's root element to standard output (UTF-8)
        /// and ends the application. Only the first call has any effect.
        /// </summary>
        static void OutputHtml()
        {
            if (outputDone)
            {
                return;
            }
            outputDone = true;

            tmrExit.Stop();
            tmrGet.Stop();

            // UTF-8 avoids mangling Korean and other non-ASCII text.
            Console.OutputEncoding = Encoding.UTF8;

            // browser.DocumentText does not reflect the body after scripts have run, so the
            // live DOM is serialised instead. On the timeout path there may be no document.
            HtmlDocument document = browser.Document;
            HtmlElementCollection roots = null;
            if (document != null)
            {
                roots = document.GetElementsByTagName("html");
            }

            if (roots != null && roots.Count > 0)
            {
                Console.Write(roots[0].OuterHtml);
            }
            else
            {
                Console.Error.WriteLine("error: no document was loaded.");
                Environment.ExitCode = 1;
            }

            Application.Exit();
        }

        /// <summary>
        /// Wraps <paramref name="jsCode"/> in a function, appends it to the page as a script
        /// element and invokes it. Does nothing when the page has no document or no element
        /// that can hold the script.
        /// </summary>
        /// <param name="jsCode">JavaScript statements forming the body of the function.</param>
        static void ExecJS(string jsCode)
        {
            HtmlDocument document = browser.Document;
            if (document == null)
            {
                return;
            }

            // Body can be null; fall back to the head element in that case.
            HtmlElement parent = document.Body;
            if (parent == null)
            {
                HtmlElementCollection heads = document.GetElementsByTagName("head");
                if (heads.Count == 0)
                {
                    return;
                }
                parent = heads[0];
            }

            HtmlElement script = document.CreateElement("script");
            script.SetAttribute("type", "text/javascript");
            script.SetAttribute("text", "function _func() {" + jsCode + "}");
            parent.AppendChild(script);
            document.InvokeScript("_func");
        }

        /// <summary>
        /// Runs when a document finishes loading: injects the optional script, then captures
        /// the HTML either immediately or after the configured wait.
        /// </summary>
        static void browser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            // Ignore completions that are not for the browser's current URL.
            if (browser.ReadyState != WebBrowserReadyState.Complete || e.Url != browser.Url)
            {
                return;
            }

            int wait = delay > 0 ? delay : 0;

            // Script injection requested?
            if (!string.IsNullOrEmpty(jsCode))
            {
                ExecJS(jsCode);
                // Give the script time to act via the timer instead of Thread.Sleep: sleeping
                // here would block the UI thread and its message loop for the whole wait.
                wait = wait > int.MaxValue - ScriptSettleMs ? int.MaxValue : wait + ScriptSettleMs;
            }

            if (wait == 0)
            {
                OutputHtml();
                return;
            }

            tmrGet.Interval = wait;
            tmrGet.Start();
        }
    }
}
 

如此处理,可能得到所需要的html代码。

其在控制台输出图示效果:

 

并基于此思路,设计进程输出管理器:

    /// <summary>
    /// Runs a child process with redirected standard output and standard error, relays every
    /// received line through events, and raises <see cref="AllDataReceived"/> exactly once
    /// with the complete standard output.
    /// </summary>
    internal class ProcessOutputMgr : IDisposable
    {
        // Per-instance lock; the data handlers and the Exited handler run on worker threads.
        private readonly object syncObj = new object();
        private readonly Process process = new Process();
        private readonly StringBuilder allData = new StringBuilder();
        // AllDataReceived is raised once both of these are true, whichever happens last.
        private bool processExited = false;
        private bool outputClosed = false;
        private bool allDataRaised = false;

        /// <summary>
        /// Prepares (but does not start) the child process.
        /// </summary>
        /// <param name="fileName">Executable to run.</param>
        /// <param name="args">Command-line arguments passed to the executable.</param>
        public ProcessOutputMgr(string fileName, string args)
        {
            var startInfo = new ProcessStartInfo(fileName);
            startInfo.WindowStyle = ProcessWindowStyle.Hidden;
            startInfo.Arguments = args;
            startInfo.UseShellExecute = false;
            startInfo.CreateNoWindow = true;
            // crpj always writes UTF-8; decoding with the same encoding avoids mojibake.
            startInfo.StandardOutputEncoding = Encoding.UTF8;
            startInfo.RedirectStandardOutput = true;
            startInfo.RedirectStandardError = true;

            process.StartInfo = startInfo;
            // Required, otherwise the Exited event is never raised.
            process.EnableRaisingEvents = true;
            process.Exited += process_Exited;
            process.OutputDataReceived += process_OutputDataReceived;
            process.ErrorDataReceived += process_ErrorDataReceived;
        }

        /// <summary>Raised for every line the process writes to standard output.</summary>
        public event DataReceivedEventHandler OutputDataReceived;

        /// <summary>Raised for every line the process writes to standard error.</summary>
        public event DataReceivedEventHandler ErrorDataReceived;

        /// <summary>
        /// Raised once, after the process has exited and its standard output has been read to
        /// the end, with all output lines joined by line breaks (possibly an empty string).
        /// </summary>
        public event Action<string> AllDataReceived;

        /// <summary>
        /// Starts the process and begins reading both output streams asynchronously.
        /// </summary>
        /// <returns>The value returned by <see cref="Process.Start()"/>.</returns>
        public bool Start()
        {
            bool result = process.Start();
            process.BeginOutputReadLine();
            process.BeginErrorReadLine();
            return result;
        }

        /// <summary>Blocks until the process has exited.</summary>
        public void WaitForExit()
        {
            process.WaitForExit();
        }

        /// <summary>Blocks until the process has exited or the timeout elapses.</summary>
        /// <param name="milliseconds">Maximum time to wait, in milliseconds.</param>
        /// <returns>true if the process exited within the timeout; otherwise false.</returns>
        public bool WaitForExit(int milliseconds)
        {
            return process.WaitForExit(milliseconds);
        }

        /// <summary>Releases the underlying process object.</summary>
        public void Dispose()
        {
            process.Dispose();
        }

        private void process_Exited(object sender, EventArgs e)
        {
            string text;
            lock (syncObj)
            {
                processExited = true;
                text = TakeAllDataIfComplete();
            }
            RaiseAllData(text);
        }

        private void process_OutputDataReceived(object sender, DataReceivedEventArgs e)
        {
            string text;
            lock (syncObj)
            {
                // Relayed under the lock so subscribers see the lines in arrival order.
                var handler = OutputDataReceived;
                if (handler != null)
                {
                    handler(sender, e);
                }

                if (e.Data != null)
                {
                    allData.AppendLine(e.Data);
                }
                else
                {
                    // A null line marks the end of the redirected stream.
                    outputClosed = true;
                }

                text = TakeAllDataIfComplete();
            }
            RaiseAllData(text);
        }

        private void process_ErrorDataReceived(object sender, DataReceivedEventArgs e)
        {
            lock (syncObj)
            {
                var handler = ErrorDataReceived;
                if (handler != null)
                {
                    handler(sender, e);
                }
            }
        }

        // Returns the collected output the first time both completion conditions hold, and
        // null on every other call. Must be called while holding syncObj.
        private string TakeAllDataIfComplete()
        {
            if (allDataRaised || !processExited || !outputClosed)
            {
                return null;
            }

            allDataRaised = true;
            return allData.ToString();
        }

        // Raises AllDataReceived outside the lock so a subscriber cannot block the readers.
        private void RaiseAllData(string text)
        {
            if (text == null)
            {
                return;
            }

            var handler = AllDataReceived;
            if (handler != null)
            {
                handler(text);
            }
        }
    }

 

posted on 2018-11-20 20:17  楚人无衣  阅读(2589)  评论(0编辑  收藏  举报