获取页面编码的方法
1. 通过分析 HTTP 响应头(Content-Type)提取编码。
// Method 1: read the charset parameter from the Content-Type response header.
// `url` must be supplied by the surrounding code.
WebRequest webRequest = WebRequest.Create(url);
// NOTE: HttpWebResponse is IDisposable; dispose it once the response has been consumed.
HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse();

// Media-type parameter names are case-insensitive and the value may be quoted, e.g.
//   text/html; charset=utf-8
//   text/html; Charset="UTF-8"
// so match case-insensitively, skip an optional opening quote, and stop at the
// first whitespace, ';' or closing quote.
Regex reg_charset = new Regex(
    @"charset\b\s*=\s*[""']?(?<charset>[^\s;""']+)",
    RegexOptions.IgnoreCase);

WebHeaderCollection headers = webResponse.Headers;
string encodingName = string.Empty;

// The header is optional: the indexer returns null when it is absent.
string contentType = headers["Content-Type"];
if (!string.IsNullOrEmpty(contentType))
{
    // Run the regex once and reuse the match instead of IsMatch + Match.
    Match charsetMatch = reg_charset.Match(contentType);
    if (charsetMatch.Success)
    {
        encodingName = charsetMatch.Groups["charset"].Value;
    }
}
// encodingName stays empty when the header is missing or declares no charset.
引用地址 http://blog.useasp.net/default.aspx
2. 通过分析网页内容(meta 标签或 XML 声明)提取编码。
// Method 2: sniff the charset declared inside the downloaded document.
// `e.Result` must be supplied by the surrounding code (presumably the raw page
// bytes from a download-completed event - confirm against the caller).
// The bytes are decoded with the default encoding only so the declaration can be
// searched; this assumes the page uses an ASCII-compatible encoding.
string strResult = Encoding.Default.GetString(e.Result);

// First alternative: <meta ... charset=VALUE ...>, covering both
//   <meta charset="utf-8">  and
//   <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
// An optional opening quote is skipped and the value stops at whitespace, a quote,
// '/', ';' or '>', so quoted and unquoted forms are captured directly and no
// string-replace fallback is needed.
// Second alternative: <?xml ... encoding="VALUE"?>.
const string regCharset =
    "(<meta[^>]*charset\\s*=\\s*[\"']?(?<charset>[^>'\"\\s/;]+)[\\s\\S]*?>)" +
    "|(xml[^>]+encoding=(\"|')*(?<charset>[^>'\"]*)[\\s\\S]*?>)";

var r = new Regex(regCharset, RegexOptions.IgnoreCase);
var m1 = r.Match(strResult);

// Empty when the document declares no charset.
string encodingName = m1.Success ? m1.Groups["charset"].Value.Trim() : "";