解决C#下载网页源码时的编码问题

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it as text, decoded with the
/// charset the document declares about itself (for example in a meta tag).
/// </summary>
/// <param name="url">Absolute URL of the page to download.</param>
/// <returns>
/// The decoded page text, or an empty string when the request, the decompression or the
/// decoding fails.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
/// <exception cref="UriFormatException"><paramref name="url"/> is not a valid URI.</exception>
public string Get(string url)
        {
            // Built outside the try block on purpose: a malformed URL is a caller error and is
            // reported by throwing, not by returning an empty string.
            Uri target = new Uri(url);

            try
            {
                // The request may be blocked (e.g. by a firewall); any such failure ends up in
                // the catch below and produces an empty result.
                WebRequest request = WebRequest.Create(target);
                request.Headers.Add("Accept-Encoding", "gzip,deflate");

                // using blocks guarantee the response and the stream are released even when
                // reading or decoding throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = OpenDecompressedStream(response))
                {
                    byte[] content = ReadAllBytes(body);

                    // First pass: decode with the default encoding only to locate the charset
                    // declaration, which consists of ASCII characters.
                    string probe = Encoding.Default.GetString(content, 0, content.Length);
                    Encoding pageEncoding = ResolveEncoding(GetCharset(probe));

                    // Second pass: decode the same bytes with the encoding the page asked for.
                    return pageEncoding.GetString(content, 0, content.Length);
                }
            }
            catch (Exception)
            {
                // Deliberate best-effort contract: every download/decode failure is reported
                // to the caller as an empty string.
                return string.Empty;
            }
        }

        /// <summary>
        /// Returns the response body stream, wrapped in a decompressing stream when the
        /// Content-Encoding header names gzip or deflate.
        /// </summary>
        private static Stream OpenDecompressedStream(HttpWebResponse response)
        {
            Stream raw = response.GetResponseStream();
            string contentEncoding = response.Headers.Get("Content-Encoding");

            // The header is absent for uncompressed responses; Get returns null in that case.
            if (string.IsNullOrEmpty(contentEncoding))
            {
                return raw;
            }

            if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return new System.IO.Compression.GZipStream(raw, System.IO.Compression.CompressionMode.Decompress);
            }

            if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return new System.IO.Compression.DeflateStream(raw, System.IO.Compression.CompressionMode.Decompress);
            }

            return raw;
        }

        /// <summary>
        /// Reads <paramref name="source"/> until it reports end of stream and returns exactly
        /// the bytes that were read, whatever the size of the response.
        /// </summary>
        private static byte[] ReadAllBytes(Stream source)
        {
            using (MemoryStream collected = new MemoryStream())
            {
                byte[] chunk = new byte[8192];
                int read;
                // Read may return fewer bytes than requested; only 0 means end of stream.
                while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
                {
                    collected.Write(chunk, 0, read);
                }

                return collected.ToArray();
            }
        }

        /// <summary>
        /// Maps a charset name to an <see cref="Encoding"/>, falling back to
        /// <see cref="Encoding.Default"/> when the name is empty or not recognised.
        /// </summary>
        private static Encoding ResolveEncoding(string charsetName)
        {
            if (!string.IsNullOrEmpty(charsetName))
            {
                try
                {
                    return Encoding.GetEncoding(charsetName);
                }
                catch (ArgumentException)
                {
                    // Unknown or unsupported charset name: use the fallback below.
                }
            }

            return Encoding.Default;
        }
        // Matches a "charset = value" declaration, with optional spaces and quotes around the
        // value. Built once and reused: constructing a RegexOptions.Compiled regex on every
        // call repeats the compilation work each time.
        private static readonly Regex CharsetDeclaration = new Regex(
            "charset[ ]*=[' \"]*([^ '\";\\\\/\\>]*)",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>
        /// Extracts the charset name from the first "charset=..." declaration found in
        /// <paramref name="s"/>.
        /// </summary>
        /// <param name="s">Text to search, typically the source of an HTML page.</param>
        /// <returns>
        /// The declared charset name exactly as written in the text, or an empty string when
        /// <paramref name="s"/> is null or empty or contains no declaration.
        /// </returns>
        private static string GetCharset(string s)
        {
            if (string.IsNullOrEmpty(s))
            {
                return "";
            }

            Match m = CharsetDeclaration.Match(s);

            // Group 1 is the value after the '=' sign, without surrounding quotes or spaces.
            return m.Success ? m.Groups[1].Value : "";
        }

 

posted @ 2017-06-05 19:41  吾非无心  阅读(168)  评论(0)  编辑  收藏  举报