论请求网页时的乱码问题
乱码问题很烦人,本人始终没有找到很好的解决方案,在一个抓取网页数据的程序中,最后还是使用了WebBrowser。
开始时使用HttpWebRequest
/// <summary>
/// Downloads the page at the given URL and returns its HTML as a string.
/// </summary>
/// <remarks>
/// The body is first decoded with the charset reported by the HTTP response
/// (UTF-8 when none is reported). If <see cref="DetectCharset"/> finds a charset
/// declared inside the page itself, the raw bytes are decoded again with that charset.
/// </remarks>
/// <param name="url">Absolute http/https URL of the page to fetch.</param>
/// <returns>The decoded HTML source of the page.</returns>
/// <exception cref="ArgumentException">The URL does not produce an HTTP request.</exception>
public static string GetWebPageSource(string url)
{
    // NOTE(review): SECURITY - this callback accepts every server certificate and is
    // installed on the static ServicePointManager, so it affects all requests in the
    // process. Kept to preserve existing behaviour; confirm it is really wanted.
    ServicePointManager.ServerCertificateValidationCallback =
        new System.Net.Security.RemoteCertificateValidationCallback(
            delegate(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
            {
                return true;
            });

    HttpWebRequest request = WebRequest.CreateDefault(new Uri(url)) as HttpWebRequest;
    if (request == null)
    {
        // CreateDefault returns a non-HTTP request type for schemes such as file:// or ftp://.
        throw new ArgumentException("Only http/https URLs are supported: " + url, "url");
    }

    request.Method = "GET";
    request.Proxy = new WebProxy { UseDefaultCredentials = true };

    byte[] bytes;
    string charset;

    // Keep the raw bytes so the page can be decoded twice with different encodings.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (System.IO.Stream responseStream = response.GetResponseStream())
    using (MemoryStream rawBody = new MemoryStream())
    {
        responseStream.CopyTo(rawBody);
        bytes = rawBody.ToArray();
        charset = response.CharacterSet;
    }

    if (string.IsNullOrWhiteSpace(charset))
    {
        charset = "UTF-8";
    }

    // First pass: decode with the charset from the HTTP response.
    Encoding headerEncoding = ResolveEncodingOrDefault(charset, Encoding.UTF8);
    string html = DecodeHtmlBytes(bytes, headerEncoding);

    // Second pass: a charset declared inside the page overrides the HTTP one.
    string innerCharset = Pubs.DetectCharset(html);
    if (string.IsNullOrEmpty(innerCharset))
    {
        return html;
    }

    Encoding pageEncoding = ResolveEncodingOrDefault(innerCharset, headerEncoding);
    return DecodeHtmlBytes(bytes, pageEncoding);
}

/// <summary>
/// Maps a charset name to an <see cref="Encoding"/>, returning the fallback when the
/// name is blank or not recognised instead of throwing.
/// </summary>
private static Encoding ResolveEncodingOrDefault(string charsetName, Encoding fallback)
{
    if (string.IsNullOrWhiteSpace(charsetName))
    {
        return fallback;
    }

    // Charset values are sometimes wrapped in quotes, e.g. charset="utf-8".
    string cleaned = charsetName.Trim().Trim('"', '\'');
    try
    {
        return Encoding.GetEncoding(cleaned);
    }
    catch (ArgumentException)
    {
        return fallback;
    }
}

/// <summary>
/// Decodes the bytes through a StreamReader using the given encoding.
/// </summary>
private static string DecodeHtmlBytes(byte[] bytes, Encoding encoding)
{
    using (MemoryStream ms = new MemoryStream(bytes))
    using (StreamReader reader = new StreamReader(ms, encoding))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Looks for a charset declared by a meta element inside the head of the given HTML.
/// </summary>
/// <remarks>
/// The HTML is written into a temporary WebBrowser document and the children of the
/// first head element are inspected. The meta "charset" attribute is checked first,
/// then a "charset=" entry inside the meta "content" attribute.
/// </remarks>
/// <param name="html">HTML markup to inspect.</param>
/// <returns>The declared charset name, or an empty string when none is found.</returns>
public static string DetectCharset(string html)
{
    // Dispose the browser on every path; undisposed instances make memory grow call after call.
    using (var wb = new WebBrowser())
    {
        wb.ScriptErrorsSuppressed = true;
        wb.Navigate("about:blank");
        if (wb.Document == null)
        {
            return string.Empty;
        }

        wb.Document.Write(html);
        var heads = wb.Document.GetElementsByTagName("head");
        if (heads.Count == 0)
        {
            return string.Empty;
        }

        foreach (HtmlElement child in heads[0].Children)
        {
            var meta = child.DomElement as mshtml.HTMLMetaElement;
            if (meta == null)
            {
                continue;
            }

            try
            {
                if (meta.charset != null && meta.charset.Trim() != string.Empty)
                {
                    return meta.charset.Trim();
                }

                if (meta.content != null)
                {
                    string fromContent = ExtractCharsetFromContent(meta.content);
                    if (fromContent != string.Empty)
                    {
                        return fromContent;
                    }
                }
            }
            catch
            {
                // Best effort: skip a meta element whose properties cannot be read.
                continue;
            }
        }

        return string.Empty;
    }
}

/// <summary>
/// Extracts the value after "charset=" from a meta content string such as
/// "text/html; charset=gb2312". Returns an empty string when there is no such entry.
/// </summary>
private static string ExtractCharsetFromContent(string content)
{
    const string marker = "charset=";

    // Case-insensitive so that "Charset=" and "CHARSET=" are matched too.
    int start = content.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
    if (start < 0)
    {
        return string.Empty;
    }

    string value = content.Substring(start + marker.Length);

    // Drop any parameters that follow the charset value.
    int end = value.IndexOf(';');
    if (end >= 0)
    {
        value = value.Substring(0, end);
    }

    return value.Trim().Trim('"', '\'');
}
反正已经使用了WebBrowser了,就干脆直接用WebBrowser请求得了,而且没有乱码,代码又少,使用中没发现有什么大的问题,比较稳定,但发现它归根结底是一个COM组件,导致程序运行时内存一直在升,最后找到原因后在适当的位置Dispose就没什么问题了:
/// <summary>
/// Creates a WebBrowser, navigates it to the given URL and pumps window messages
/// until the page reports Loaded/Complete or the timeout elapses.
/// </summary>
/// <param name="url">Address to navigate to.</param>
/// <param name="secondsTimeOut">Maximum number of seconds to wait for the page.</param>
/// <returns>
/// The browser instance. It may still be loading if the timeout elapsed first.
/// The caller owns it and must call Dispose when finished.
/// </returns>
public static WebBrowser GetBrowser(string url, int secondsTimeOut)
{
    var wb = new WebBrowser();
    try
    {
        wb.ScriptErrorsSuppressed = true;
        wb.Navigate(url);

        // Stopwatch is monotonic; a wall-clock timeout misfires when the system
        // clock is adjusted or daylight saving time changes.
        var timeout = TimeSpan.FromSeconds(secondsTimeOut);
        var waited = System.Diagnostics.Stopwatch.StartNew();

        while (true)
        {
            // Let the browser control process its pending window messages.
            Application.DoEvents();

            bool ready = wb.ReadyState == WebBrowserReadyState.Loaded
                || wb.ReadyState == WebBrowserReadyState.Complete;
            if (ready || waited.Elapsed > timeout)
            {
                break;
            }

            // Yield briefly so the wait does not spin a CPU core at full load.
            System.Threading.Thread.Sleep(10);
        }

        return wb;
    }
    catch
    {
        // Do not leak the browser when navigation or the wait loop throws.
        wb.Dispose();
        throw;
    }
}
桂棹兮兰桨,击空明兮溯流光。