上半年发了一篇《网页提取正文代码》的博客,今天想到其中还有些问题没有解决,于是花了几个小时做了修正,现将新的代码贴出来,供需要的朋友使用。补充说明:这些代码只适合新闻、资讯、博客类的网页,论坛、知道、微博等都不适用。欢迎朋友们提出更好的解决方案,谢谢。 chenping@live.cn
1 using System;
2 using System.Collections.Generic;
3 using System.Linq;
4 using System.Text;
5 using System.Windows.Forms;
6
7 namespace SmartReader
8 {
/// <summary>
/// Extracts the main text of a news/blog style web page: the page is downloaded,
/// loaded into a hidden WinForms <c>WebBrowser</c> on a dedicated STA thread, and
/// the element with the best text-to-markup ratio is taken as the article body.
/// </summary>
public class WebContent
{
    /// <summary>Elements whose visible text is not longer than this are not considered.</summary>
    private const int MinCandidateTextLength = 100;

    /// <summary>HTTP request timeout, in milliseconds.</summary>
    private const int RequestTimeoutMilliseconds = 30000;

    /// <summary>
    /// Finds a <c>charset=NAME</c> declaration. An optional single or double quote
    /// after the equals sign is skipped so that both
    /// <c>content="text/html; charset=gb2312"</c> and <c>&lt;meta charset="utf-8"&gt;</c> work.
    /// </summary>
    private static readonly System.Text.RegularExpressions.Regex CharsetPattern =
        new System.Text.RegularExpressions.Regex(
            @"charset\s*=\s*[""']?\s*(?<name>[A-Za-z0-9_\-]+)",
            System.Text.RegularExpressions.RegexOptions.Compiled |
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);

    /// <summary>Creates an extractor without a default address.</summary>
    public WebContent() { }

    /// <summary>Creates an extractor with a default address.</summary>
    /// <param name="url">Address used by <see cref="GetContent"/> when it is called with a null or empty url.</param>
    public WebContent(string url)
    {
        this.Url = url;
    }

    private string Url { get; set; }

    private string Content { get; set; }

    /// <summary>
    /// Downloads the page and returns its main text. This is the entry point callers use.
    /// </summary>
    /// <param name="url">Page address; when null or empty, the address given to the constructor is used.</param>
    /// <returns>
    /// The extracted text; an empty string when no element qualified; <c>null</c> when
    /// no address is available or the page could not be fetched or parsed.
    /// </returns>
    public string GetContent(string url)
    {
        string target = string.IsNullOrEmpty(url) ? this.Url : url;

        // Reset first so that a failed call never returns the text of an earlier page.
        this.Content = null;
        if (string.IsNullOrEmpty(target))
        {
            return null;
        }

        ThreadWebBrowser(target);
        return this.Content;
    }

    /// <summary>
    /// Runs <see cref="BeginCatch"/> on a new STA thread and blocks until it has finished.
    /// </summary>
    private void ThreadWebBrowser(string url)
    {
        System.Threading.Thread worker =
            new System.Threading.Thread(new System.Threading.ParameterizedThreadStart(BeginCatch));
        worker.SetApartmentState(System.Threading.ApartmentState.STA);
        worker.Start(url);

        // Join returns only after the worker has ended, so no further polling is required.
        worker.Join();
    }

    /// <summary>
    /// Thread body: fetches the page, writes it into a browser document and stores the
    /// best candidate text in <see cref="Content"/>.
    /// </summary>
    /// <param name="obj">The page address (a string).</param>
    private void BeginCatch(object obj)
    {
        try
        {
            string url = obj.ToString();
            using (System.Windows.Forms.WebBrowser browser = new System.Windows.Forms.WebBrowser())
            {
                browser.ScriptErrorsSuppressed = true;
                browser.Navigate("about:blank");

                // Download once, decode with the system default first, and decode the same
                // bytes again only if the page declares a different, known encoding.
                byte[] raw = DownloadBytes(url);
                string html = Decode(raw, Encoding.Default);
                Encoding declared = DetectEncoding(html);
                if (declared != null && declared.CodePage != Encoding.Default.CodePage)
                {
                    html = Decode(raw, declared);
                }

                HtmlDocument document = browser.Document;
                if (document == null)
                {
                    return;
                }

                // NOTE(review): the downloaded markup is handed to the browser control as-is.
                document.Write(html);
                this.Content = FindMainText(document);
            }
        }
        catch (Exception ex)
        {
            // Best effort: an exception escaping a thread would terminate the process,
            // so report it and leave Content as null.
            System.Diagnostics.Trace.TraceWarning("WebContent: extraction failed for {0}: {1}", obj, ex);
        }
    }

    /// <summary>
    /// Returns the text of the element with the highest score, where the score is
    /// (text length squared) divided by (markup length).
    /// </summary>
    /// <returns>The winning element's text, or an empty string when nothing qualified.</returns>
    private static string FindMainText(HtmlDocument document)
    {
        HtmlElement body = document.Body;
        if (body == null)
        {
            return string.Empty;
        }

        string best = string.Empty;
        long bestScore = long.MinValue;
        foreach (HtmlElement element in body.All)
        {
            // Read each property once; every access goes through the browser control.
            string text = element.InnerText;
            if (text == null || text.Length <= MinCandidateTextLength)
            {
                continue;
            }

            string markup = element.OuterHtml;
            if (string.IsNullOrEmpty(markup))
            {
                continue;
            }

            // 64-bit arithmetic: the squared length overflows Int32 for texts longer
            // than about 46,000 characters.
            long score = (long)text.Length * text.Length / markup.Length;

            // Strict comparison keeps the first element among equal scores.
            if (score > bestScore)
            {
                bestScore = score;
                best = text;
            }
        }

        return best;
    }

    /// <summary>
    /// Downloads the raw bytes of a page. gzip and deflate responses are decompressed
    /// by the request itself.
    /// </summary>
    private static byte[] DownloadBytes(string url)
    {
        System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
        request.Timeout = RequestTimeoutMilliseconds;
        request.Method = "GET";
        request.UserAgent = "Mozilla/4.0";
        request.AutomaticDecompression =
            System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;

        using (System.Net.WebResponse response = request.GetResponse())
        using (System.IO.Stream body = response.GetResponseStream())
        using (System.IO.MemoryStream buffer = new System.IO.MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = body.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }

            return buffer.ToArray();
        }
    }

    /// <summary>Decodes page bytes with the given encoding.</summary>
    private static string Decode(byte[] raw, Encoding encoding)
    {
        using (System.IO.MemoryStream stream = new System.IO.MemoryStream(raw, false))
        using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Looks up the encoding named by the first <c>charset=</c> declaration in the markup.
    /// </summary>
    /// <returns>The declared encoding, or <c>null</c> when none is declared or the name is unknown.</returns>
    private static Encoding DetectEncoding(string html)
    {
        System.Text.RegularExpressions.Match match = CharsetPattern.Match(html);
        if (!match.Success)
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(match.Groups["name"].Value);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name: the caller keeps the default decoding.
            return null;
        }
    }
}
146 }
3 using System.Linq;
4 using System.Text;
5 using System.Windows.Forms;
6
7 namespace SmartReader
8 {
/// <summary>
/// Extracts the main text of a news/blog style web page: the page is downloaded,
/// loaded into a hidden WinForms <c>WebBrowser</c> on a dedicated STA thread, and
/// the element with the best text-to-markup ratio is taken as the article body.
/// </summary>
public class WebContent
{
    /// <summary>Elements whose visible text is not longer than this are not considered.</summary>
    private const int MinCandidateTextLength = 100;

    /// <summary>HTTP request timeout, in milliseconds.</summary>
    private const int RequestTimeoutMilliseconds = 30000;

    /// <summary>
    /// Finds a <c>charset=NAME</c> declaration. An optional single or double quote
    /// after the equals sign is skipped so that both
    /// <c>content="text/html; charset=gb2312"</c> and <c>&lt;meta charset="utf-8"&gt;</c> work.
    /// </summary>
    private static readonly System.Text.RegularExpressions.Regex CharsetPattern =
        new System.Text.RegularExpressions.Regex(
            @"charset\s*=\s*[""']?\s*(?<name>[A-Za-z0-9_\-]+)",
            System.Text.RegularExpressions.RegexOptions.Compiled |
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);

    /// <summary>Creates an extractor without a default address.</summary>
    public WebContent() { }

    /// <summary>Creates an extractor with a default address.</summary>
    /// <param name="url">Address used by <see cref="GetContent"/> when it is called with a null or empty url.</param>
    public WebContent(string url)
    {
        this.Url = url;
    }

    private string Url { get; set; }

    private string Content { get; set; }

    /// <summary>
    /// Downloads the page and returns its main text. This is the entry point callers use.
    /// </summary>
    /// <param name="url">Page address; when null or empty, the address given to the constructor is used.</param>
    /// <returns>
    /// The extracted text; an empty string when no element qualified; <c>null</c> when
    /// no address is available or the page could not be fetched or parsed.
    /// </returns>
    public string GetContent(string url)
    {
        string target = string.IsNullOrEmpty(url) ? this.Url : url;

        // Reset first so that a failed call never returns the text of an earlier page.
        this.Content = null;
        if (string.IsNullOrEmpty(target))
        {
            return null;
        }

        ThreadWebBrowser(target);
        return this.Content;
    }

    /// <summary>
    /// Runs <see cref="BeginCatch"/> on a new STA thread and blocks until it has finished.
    /// </summary>
    private void ThreadWebBrowser(string url)
    {
        System.Threading.Thread worker =
            new System.Threading.Thread(new System.Threading.ParameterizedThreadStart(BeginCatch));
        worker.SetApartmentState(System.Threading.ApartmentState.STA);
        worker.Start(url);

        // Join returns only after the worker has ended, so no further polling is required.
        worker.Join();
    }

    /// <summary>
    /// Thread body: fetches the page, writes it into a browser document and stores the
    /// best candidate text in <see cref="Content"/>.
    /// </summary>
    /// <param name="obj">The page address (a string).</param>
    private void BeginCatch(object obj)
    {
        try
        {
            string url = obj.ToString();
            using (System.Windows.Forms.WebBrowser browser = new System.Windows.Forms.WebBrowser())
            {
                browser.ScriptErrorsSuppressed = true;
                browser.Navigate("about:blank");

                // Download once, decode with the system default first, and decode the same
                // bytes again only if the page declares a different, known encoding.
                byte[] raw = DownloadBytes(url);
                string html = Decode(raw, Encoding.Default);
                Encoding declared = DetectEncoding(html);
                if (declared != null && declared.CodePage != Encoding.Default.CodePage)
                {
                    html = Decode(raw, declared);
                }

                HtmlDocument document = browser.Document;
                if (document == null)
                {
                    return;
                }

                // NOTE(review): the downloaded markup is handed to the browser control as-is.
                document.Write(html);
                this.Content = FindMainText(document);
            }
        }
        catch (Exception ex)
        {
            // Best effort: an exception escaping a thread would terminate the process,
            // so report it and leave Content as null.
            System.Diagnostics.Trace.TraceWarning("WebContent: extraction failed for {0}: {1}", obj, ex);
        }
    }

    /// <summary>
    /// Returns the text of the element with the highest score, where the score is
    /// (text length squared) divided by (markup length).
    /// </summary>
    /// <returns>The winning element's text, or an empty string when nothing qualified.</returns>
    private static string FindMainText(HtmlDocument document)
    {
        HtmlElement body = document.Body;
        if (body == null)
        {
            return string.Empty;
        }

        string best = string.Empty;
        long bestScore = long.MinValue;
        foreach (HtmlElement element in body.All)
        {
            // Read each property once; every access goes through the browser control.
            string text = element.InnerText;
            if (text == null || text.Length <= MinCandidateTextLength)
            {
                continue;
            }

            string markup = element.OuterHtml;
            if (string.IsNullOrEmpty(markup))
            {
                continue;
            }

            // 64-bit arithmetic: the squared length overflows Int32 for texts longer
            // than about 46,000 characters.
            long score = (long)text.Length * text.Length / markup.Length;

            // Strict comparison keeps the first element among equal scores.
            if (score > bestScore)
            {
                bestScore = score;
                best = text;
            }
        }

        return best;
    }

    /// <summary>
    /// Downloads the raw bytes of a page. gzip and deflate responses are decompressed
    /// by the request itself.
    /// </summary>
    private static byte[] DownloadBytes(string url)
    {
        System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
        request.Timeout = RequestTimeoutMilliseconds;
        request.Method = "GET";
        request.UserAgent = "Mozilla/4.0";
        request.AutomaticDecompression =
            System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;

        using (System.Net.WebResponse response = request.GetResponse())
        using (System.IO.Stream body = response.GetResponseStream())
        using (System.IO.MemoryStream buffer = new System.IO.MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = body.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }

            return buffer.ToArray();
        }
    }

    /// <summary>Decodes page bytes with the given encoding.</summary>
    private static string Decode(byte[] raw, Encoding encoding)
    {
        using (System.IO.MemoryStream stream = new System.IO.MemoryStream(raw, false))
        using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Looks up the encoding named by the first <c>charset=</c> declaration in the markup.
    /// </summary>
    /// <returns>The declared encoding, or <c>null</c> when none is declared or the name is unknown.</returns>
    private static Encoding DetectEncoding(string html)
    {
        System.Text.RegularExpressions.Match match = CharsetPattern.Match(html);
        if (!match.Success)
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(match.Groups["name"].Value);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name: the caller keeps the default decoding.
            return null;
        }
    }
}
146 }
调用方式很简单,见下面的代码:
// Usage: extract the main text of an article page and show it once in a message box.
WebContent webContent = new WebContent();
string content = webContent.GetContent("http://www.cnblogs.com/kandy/archive/2011/08/30/how_to_get_web_content_from_url.html");
MessageBox.Show(content);