获取页面Html代码,自动识别编码。

 
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body decoded as text.
/// The character set is obtained from <c>DecodeData</c>; when no usable charset is
/// reported, <see cref="Encoding.Default"/> is used instead.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>The response body decoded with the detected (or default) encoding.</returns>
public string GetHtml(string url)
    {
        // DecodeData performs its own request against the same URL to discover the charset.
        string code = DecodeData(url);

        Encoding encoding = Encoding.Default;
        if (!string.IsNullOrEmpty(code))
        {
            try
            {
                // Encoding.GetEncoding matches names case-insensitively, so no
                // culture-sensitive ToUpper() is needed (it breaks names containing
                // 'i' under Turkish cultures). Stray quotes/whitespace are trimmed.
                encoding = Encoding.GetEncoding(code.Trim(' ', '\t', '\r', '\n', '"', '\''));
            }
            catch (ArgumentException)
            {
                // Unknown or malformed charset name: keep the default encoding
                // rather than failing the whole download.
            }
        }

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 30000; // milliseconds
        request.Headers.Set("Pragma", "no-cache");

        // using blocks release the connection, stream and reader even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, encoding))
        {
            return streamReader.ReadToEnd();
        }
    }
// Adapted, with minor changes, from http://blog.sunmast.com/natas/archive/2004/10/30/989.aspx

    private string DecodeData(string Url)
    {
        WebRequest r = WebRequest.Create(Url);
        WebResponse w = r.GetResponse();
        //    
        //   first   see   if   content   length   header   has   charset   =   calue    
        //    
        String charset = string.Empty;
        String ctype = w.Headers["content-type"];
        if (ctype != null)
        {
            int ind = ctype.IndexOf("charset=");
            if (ind != -1)
            {
                charset = ctype.Substring(ind + 8);
            }
        }

        //   save   data   to   a   memorystream    
        MemoryStream rawdata = new MemoryStream();
        byte[] buffer = new byte[1024];
        Stream rs = w.GetResponseStream();
        int read = rs.Read(buffer, 0, buffer.Length);
        while (read > 0)
        {
            rawdata.Write(buffer, 0, read);
            read = rs.Read(buffer, 0, buffer.Length);
        }

        rs.Close();

        //    
        //   if   ContentType   is   null,   or   did   not   contain   charset,   we   search   in   body    
        //    
        if (charset == null)
        {
            MemoryStream ms = rawdata;
            ms.Seek(0, SeekOrigin.Begin);

            StreamReader srr = new StreamReader(ms, Encoding.ASCII);
            String meta = srr.ReadToEnd();

            if (meta != null)
            {
                int start_ind = meta.IndexOf("charset=");
                int end_ind = -1;
                if (start_ind != -1)
                {
                    end_ind = meta.IndexOf("\"", start_ind);
                    if (end_ind != -1)
                    {
                        int start = start_ind + 8;
                        charset = meta.Substring(start, end_ind - start + 1);
                        charset = charset.TrimEnd(new Char[] { '>', '"' });
                    }
                }
            }
        }
        return charset.ToString();
    } 

posted on 2007-11-14 09:40  尹洪亮  阅读(1551)  评论(2)  编辑  收藏  举报

导航