C# 获取网页内容，并正确处理编码
控制台调用
/// <summary>
/// Console entry point: detects the charset declared by a page, then downloads
/// the page again and decodes it with that charset.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    const string url = "http://www.cnblogs.com";

    // GetEncodings returns null when the request fails or no charset is declared,
    // and Encoding.GetEncoding rejects both null and unknown names, so fall back
    // to UTF-8 instead of crashing.
    string code = GetEncodings(url);
    Encoding pp = Encoding.UTF8;
    if (!string.IsNullOrEmpty(code))
    {
        try
        {
            pp = Encoding.GetEncoding(code);
        }
        catch (ArgumentException)
        {
            pp = Encoding.UTF8;
        }
    }

    // The decoded page; empty when the download fails.
    string pl = GetHtml(url, pp);
}
下面的代码不是必需的，只是演示如何从网页中获取标题或其他内容。
/// <summary>
/// Downloads a page and decodes it using the charset the page itself declares.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The decoded HTML text.</returns>
static string GetHtml(string url)
{
    return GetHtmls(url, null);
}

/// <summary>
/// Downloads a page and decodes it with the given encoding, or with the
/// encoding declared in the page when <paramref name="encoding"/> is null.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="encoding">Encoding to decode with; null to auto-detect from the HTML.</param>
/// <returns>The decoded HTML text.</returns>
static string GetHtmls(string url, Encoding encoding)
{
    byte[] buf;
    using (WebClient client = new WebClient())
    {
        buf = client.DownloadData(url);
    }

    if (encoding != null)
    {
        return encoding.GetString(buf);
    }

    // Decode as UTF-8 first: the charset declaration itself is plain ASCII,
    // so it survives this pass even when the page uses another encoding.
    string html = Encoding.UTF8.GetString(buf);
    Encoding detected = GetEncoding(html);

    // Compare by code page rather than by reference so that any UTF-8
    // instance short-circuits the second decode.
    if (detected == null || detected.CodePage == Encoding.UTF8.CodePage)
    {
        return html;
    }

    return detected.GetString(buf);
}

/// <summary>
/// Extracts the encoding named by the first charset declaration in the HTML.
/// Matches both <c>charset=utf-8</c> inside a Content-Type value and the
/// quoted form <c>charset="utf-8"</c>.
/// </summary>
/// <param name="html">HTML text to inspect.</param>
/// <returns>
/// The matching <see cref="Encoding"/>, or null when no charset is declared
/// or the declared name is not a supported encoding.
/// </returns>
static Encoding GetEncoding(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    const string pattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";
    Match match = Regex.Match(html, pattern);
    if (!match.Success)
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(match.Groups["charset"].Value);
    }
    catch (ArgumentException)
    {
        // Declared name is not an encoding known to this runtime.
        return null;
    }
}

/// <summary>
/// Extracts the text of the first <c>title</c> element in the HTML.
/// </summary>
/// <param name="html">HTML text to inspect.</param>
/// <returns>The trimmed title text, or an empty string when there is no title.</returns>
static string GetTitle(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // (?s) lets the title span lines; the attribute part skips over quoted
    // values so that a '>' inside an attribute does not end the tag early.
    const string pattern = @"(?si)<title(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?<title>.*?)</title>";
    return Regex.Match(html, pattern).Groups["title"].Value.Trim();
}

/// <summary>
/// Downloads a page and writes its detected encoding and title to the console.
/// </summary>
/// <param name="url">Address of the page to download.</param>
static void PrintEncodingAndTitle(string url)
{
    string html = GetHtml(url);
    Console.WriteLine("[{0}] [{1}]", GetEncoding(html), GetTitle(html));
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and decodes it with the given encoding.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="encoding">Encoding used to decode the response body.</param>
/// <returns>
/// The page source, or an empty string when the request fails, the status is
/// not 200 OK, or the declared content length is 1 MiB or more.
/// </returns>
public static string GetHtml(string url, Encoding encoding)
{
    if (encoding == null)
    {
        return string.Empty;
    }

    return FetchText(url, encoding) ?? string.Empty;
}

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the charset name
/// declared in its HTML (for example "utf-8" or "gb2312").
/// </summary>
/// <param name="url">Address of the page to inspect.</param>
/// <returns>
/// The declared charset name, or null when the request fails or the page
/// declares no charset.
/// </returns>
public static string GetEncodings(string url)
{
    // ASCII is sufficient here: only the charset declaration is read, and
    // that is plain ASCII regardless of the page's real encoding.
    string html = FetchText(url, Encoding.ASCII);
    return html == null ? null : ExtractCharsetName(html);
}

// Performs the GET request shared by GetHtml and GetEncodings; returns the
// decoded body, or null on any failure or rejected response.
private static string FetchText(string url, Encoding encoding)
{
    const long maxContentLength = 1024 * 1024;

    if (string.IsNullOrEmpty(url))
    {
        return null;
    }

    try
    {
        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            // Not an http/https address.
            return null;
        }

        request.Timeout = 20000;
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= maxContentLength)
            {
                return null;
            }

            Stream body = response.GetResponseStream();
            if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
            {
                body = new GZipStream(body, CompressionMode.Decompress);
            }

            // Disposing the reader also disposes the (possibly wrapped) body stream.
            using (StreamReader reader = new StreamReader(body, encoding))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (WebException)
    {
        return null;
    }
    catch (IOException)
    {
        return null;
    }
    catch (InvalidDataException)
    {
        // Body was labelled gzip but is not valid gzip data.
        return null;
    }
    catch (UriFormatException)
    {
        return null;
    }
    catch (NotSupportedException)
    {
        // Unregistered URI scheme.
        return null;
    }
}

// Returns the name from the first "charset=..." declaration in the text,
// tolerating whitespace, quotes and backslash-escaped quotes; null if absent.
private static string ExtractCharsetName(string html)
{
    Match match = Regex.Match(html, @"(?i)\bcharset\s*=\s*[""'\\]*(?<charset>[-a-zA-Z_0-9]+)");
    return match.Success ? match.Groups["charset"].Value : null;
}