C#获取网页内容,并正确处理编码

控制台调用
static void Main(string[] args)
{
    // Console demo: detect the page's declared charset, then download the page decoded with it.
    const string url = "http://www.cnblogs.com";

    // GetEncodings returns null when the request fails or no charset is found, and
    // Encoding.GetEncoding throws on null or unknown names, so start from UTF-8 and
    // only switch when the detected name resolves to a real encoding.
    Encoding pp = Encoding.UTF8;
    string code = GetEncodings(url);
    if (!string.IsNullOrEmpty(code))
    {
        try
        {
            pp = Encoding.GetEncoding(code);
        }
        catch (ArgumentException)
        {
            // Charset name not supported on this platform: keep the UTF-8 fallback.
        }
    }

    // pl holds the decoded page source (empty string when the download fails).
    string pl = GetHtml(url, pp);
}

下面的代码不重要,只是可以获取标题或其它内容

// Downloads the page at the given URL and returns its HTML. No encoding is supplied,
        // so GetHtmls chooses one from the charset declared inside the page.
        static string GetHtml(string url)
        {
            // A null encoding is the signal for "detect it from the page".
            Encoding detectFromPage = null;
            return GetHtmls(url, detectFromPage);
        }

        // Downloads the page at the given URL and returns its HTML.
        //   url:      address of the page to download.
        //   encoding: encoding used to decode the downloaded bytes; pass null to have the
        //             encoding detected from the charset declared in the page itself.
        // Returns the decoded HTML. Download errors thrown by WebClient propagate to the caller.
        static string GetHtmls(string url, Encoding encoding)
        {
            byte[] buf;
            // WebClient is disposable; release it as soon as the bytes are in memory.
            using (WebClient client = new WebClient())
            {
                buf = client.DownloadData(url);
            }

            if (encoding != null)
            {
                return encoding.GetString(buf);
            }

            // First pass: decode as UTF-8 just to be able to read the charset declaration.
            string html = Encoding.UTF8.GetString(buf);
            Encoding detected = GetEncoding(html);

            // Compare by code page rather than by reference, so any UTF-8 instance
            // short-circuits the second decode.
            if (detected == null || detected.CodePage == Encoding.UTF8.CodePage)
            {
                return html;
            }

            // Second pass: decode the raw bytes again with the declared encoding.
            return detected.GetString(buf);
        }

        // Extracts the encoding declared in an HTML document.
        //   html: HTML text to search for a "charset=..." declaration.
        // Returns the Encoding named by the first charset declaration, or null when html is
        // null/empty, no declaration is present, or the declared name is not a supported encoding.
        static Encoding GetEncoding(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return null;
            }

            // Accepts both the unquoted form (content="text/html; charset=utf-8") and the
            // quoted HTML5 form (<meta charset="utf-8">), with optional spaces around '='.
            string pattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";
            Match match = Regex.Match(html, pattern);
            if (!match.Success)
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(match.Groups["charset"].Value);
            }
            catch (ArgumentException)
            {
                // The declared charset name is not known to this runtime.
                return null;
            }
        }

        // Extracts the text of the first <title> element from an HTML document.
        //   html: HTML text to search.
        // Returns the title text with surrounding whitespace trimmed, or an empty string
        // when html is null or contains no <title> element.
        static string GetTitle(string html)
        {
            if (html == null)
            {
                return string.Empty;
            }

            // (?si): '.' also matches newlines (titles may span lines) and matching ignores case.
            // The optional group after "<title" skips attributes, including quoted values
            // that may themselves contain '>'.
            string pattern = @"(?si)<title(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?<title>.*?)</title>";
            return Regex.Match(html, pattern).Groups["title"].Value.Trim();
        }

        // Downloads the page at the given URL and writes its detected encoding and its
        // title to the console, each wrapped in square brackets.
        static void PrintEncodingAndTitle(string url)
        {
            string pageSource = GetHtml(url);

            // Evaluate in the same order as the placeholders: encoding first, then title.
            Encoding detectedEncoding = GetEncoding(pageSource);
            string pageTitle = GetTitle(pageSource);

            Console.WriteLine("[{0}] [{1}]", detectedEncoding, pageTitle);
        }
里面的代码不重要,只是获取其它的内容
/// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its source decoded with
        /// <paramref name="encoding"/>. This is a best-effort call: any failure yields an
        /// empty string instead of an exception.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="encoding">Encoding used to decode the response body; UTF-8 is used when null.</param>
        /// <returns>
        /// The page source, or <see cref="string.Empty"/> when the request fails, the status is
        /// not 200 OK (redirects are not followed), or the declared content length is 1 MB or more.
        /// </returns>
        public static string GetHtml(string url, Encoding encoding)
        {
            const int TimeoutMilliseconds = 20000;
            const long MaxContentLength = 1024 * 1024;

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = TimeoutMilliseconds;
                request.AllowAutoRedirect = false;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    // NOTE(review): per the .NET documentation ContentLength is -1 when the
                    // server sends no Content-Length header, so such responses pass this check.
                    if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                    {
                        return string.Empty;
                    }

                    Stream body = response.GetResponseStream();
                    // Content-Encoding is a protocol token, so compare it ordinally.
                    if ("gzip".Equals(response.ContentEncoding, StringComparison.OrdinalIgnoreCase))
                    {
                        body = new GZipStream(body, CompressionMode.Decompress);
                    }

                    // Disposing the reader also disposes the (possibly wrapped) body stream,
                    // and it happens before the response itself is closed.
                    using (StreamReader reader = new StreamReader(body, encoding ?? Encoding.UTF8))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberate best-effort contract: callers expect an empty string on any
                // failure. Surface the cause in debug builds instead of hiding it completely.
                System.Diagnostics.Debug.WriteLine("GetHtml failed for '" + url + "': " + ex);
                return string.Empty;
            }
        }

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the charset name declared
        /// in its markup (for example "utf-8" or "gb2312"). This is a best-effort call: any
        /// failure yields null instead of an exception.
        /// </summary>
        /// <param name="url">Address of the page to inspect.</param>
        /// <returns>
        /// The declared charset name, or null when the request fails, the status is not 200 OK
        /// (redirects are not followed), the declared content length is 1 MB or more, or the
        /// page contains no charset declaration.
        /// </returns>
        public static string GetEncodings(string url)
        {
            const int TimeoutMilliseconds = 20000;
            const long MaxContentLength = 1024 * 1024;

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = TimeoutMilliseconds;
                request.AllowAutoRedirect = false;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                    {
                        return null;
                    }

                    Stream body = response.GetResponseStream();
                    if ("gzip".Equals(response.ContentEncoding, StringComparison.OrdinalIgnoreCase))
                    {
                        body = new GZipStream(body, CompressionMode.Decompress);
                    }

                    // The real encoding is not known yet. Charset names are plain ASCII, so
                    // an ASCII decode is enough to locate the declaration in either branch.
                    using (StreamReader reader = new StreamReader(body, Encoding.ASCII))
                    {
                        return ExtractCharsetName(reader.ReadToEnd());
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberate best-effort contract: callers expect null on any failure.
                System.Diagnostics.Debug.WriteLine("GetEncodings failed for '" + url + "': " + ex);
                return null;
            }
        }

        // Returns the name in the first "charset=..." declaration found in html, or null if none.
        private static string ExtractCharsetName(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return null;
            }

            // Tolerates spaces around '=', an optional (possibly backslash-escaped) quote,
            // and stops at the first character that cannot be part of a charset name.
            Match match = Regex.Match(
                html,
                @"\bcharset\s*=\s*\\?[""']?\s*(?<charset>[-A-Za-z0-9_.:]+)",
                RegexOptions.IgnoreCase);

            return match.Success ? match.Groups["charset"].Value : null;
        }
这里才是真正的代码:一个方法(GetEncodings)获取网页的正确编码,另一个方法(GetHtml)根据该编码解析网页源码

 

 

posted @ 2013-06-20 15:25  小锋神  阅读(835)  评论(0)  编辑  收藏  举报