网页提取正文 - 陈平

网页提取正文

上半年我发了一篇关于网页正文提取代码的博客。今天想到其中还有些问题没有解决，于是花了几个小时做了修正，现将新的代码贴出来，供需要的朋友使用。补充说明：这些代码只适用于新闻、资讯、博客类的网页，论坛、知道、微博等类型的页面都不适用。欢迎朋友们提出更好的解决方案，谢谢。联系邮箱：chenping@live.cn

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;

  7 namespace SmartReader
  8 {
  9     public class WebContent
10     {
11         public WebContent() { }
12         private string Url { get; set; }
13         private string Content { get; set; }
14         public WebContent(string url)
15         {
16             this.Url = url;
17         }
18         /// <summary>
19         /// 根据网址获取网页正文
20         /// </summary>
21         /// <param name="url">网址</param>
22         /// <returns>网页正文</returns>
23         public string GetContent(string url)//使用时调用此方法
24         {
25             ThreadWebBrowser(url);
26             return this.Content;
27         }
28         private void ThreadWebBrowser(string url)
29         {
30             System.Threading.Thread thread = new System.Threading.Thread(new System.Threading.ParameterizedThreadStart(BeginCatch));
31             thread.SetApartmentState(System.Threading.ApartmentState.STA);
32             thread.Start(url);
33             thread.Join();
34             while (thread.IsAlive)
35             {
36                 System.Windows.Forms.Application.DoEvents();
37             }
38         }
39         private void BeginCatch(object obj)
40         {
41             try
42             {
43                 string url = obj.ToString();
44                 System.Windows.Forms.WebBrowser webBrowser = new System.Windows.Forms.WebBrowser();
45                 webBrowser.ScriptErrorsSuppressed = true;
46                 webBrowser.Navigate("about:blank");
47                 string htmlCode = string.Empty;
48                 htmlCode = GetHtmlCode(url, Encoding.Default);
49                 string charset = SniffwebCodeReturnList(htmlCode, "charset=", "\"");
50                 if (charset != Encoding.Default.BodyName)//如果网页的编码与默认编码不同，则重新使用网页的编码获取网页源代码
51                 {
52                     htmlCode = GetHtmlCode(url, Encoding.GetEncoding(charset));
53                 }
54                 webBrowser.Document.Write(htmlCode);
55
56                 Dictionary<string, string> dict = new Dictionary<string, string>();
57                 HtmlElementCollection allElement = webBrowser.Document.Body.All;
58                 for (int i = 0; i < allElement.Count; i++)
59                 {
60                     if (!dict.Keys.Contains(allElement[i].OuterHtml))
61                     {
62                         if (allElement[i].InnerText != null && allElement[i].InnerText.Length > 100)
63                         {
64                             dict.Add(allElement[i].OuterHtml, allElement[i].InnerText);
65                         }
66                     }
67                 }
68                 string content = dict.OrderByDescending(p => p.Value.Length * p.Value.Length / p.Key.Length)
69                     .FirstOrDefault().Value ?? string.Empty;
70
71                 this.Content = content;
72             }
73             catch
74             {
75
76             }
77         }
78         //根据网址获取网页源代码
79         private static string GetHtmlCode(string url, Encoding encode)
80         {
81             string htmlCode = string.Empty;
82             System.Net.HttpWebRequest webRequest;
83             webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
84             webRequest.Timeout = 30000;
85             webRequest.Method = "GET";
86             webRequest.UserAgent = "Mozilla/4.0";
87             webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
88             System.Net.HttpWebResponse webResponse;
89             webResponse = (System.Net.HttpWebResponse)webRequest.GetResponse();
90             if (webResponse.ContentEncoding.ToLower() == "gzip")//如果使用了GZip则先解压
91             {
92                 using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
93                 {
94                     using (System.IO.Compression.GZipStream zipStream =
95                         new System.IO.Compression.GZipStream(streamReceive, System.IO.Compression.CompressionMode.Decompress))
96                     {
97                         using (System.IO.StreamReader sr = new System.IO.StreamReader(zipStream, encode))
98                         {
99                             htmlCode = sr.ReadToEnd();
100                         }
101                     }
102                 }
103             }
104             else
105             {
106                 using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
107                 {
108                     using (System.IO.StreamReader sr = new System.IO.StreamReader(streamReceive, encode))
109                     {
110                         htmlCode = sr.ReadToEnd();
111                     }
112                 }
113             }
114
115             return htmlCode;
116         }
117
118         //从html源代码中截取一段代码
119         private static string SniffwebCodeReturnList(string code, string wordsBegin, string wordsEnd)
120         {
121             try
122             {
123                 System.Collections.ArrayList codeList = new System.Collections.ArrayList();
124                 System.Text.RegularExpressions.Regex regex =
125                     new System.Text.RegularExpressions.Regex(wordsBegin + @"(?<code>[\s\S]+?)" + wordsEnd,
126                         System.Text.RegularExpressions.RegexOptions.Compiled | System.Text.RegularExpressions.RegexOptions.IgnoreCase);
127                 for (System.Text.RegularExpressions.Match match = regex.Match(code); match.Success; match = match.NextMatch())
128                 {
129                     codeList.Add(match.Groups["code"].ToString());
130                 }
131                 if (codeList.Count > 0)
132                 {
133                     return codeList[0].ToString();
134                 }
135                 else
136                 {
137                     return string.Empty;
138                 }
139             }
140             catch
141             {
142                 return string.Empty;
143             }
144         }
145     }
146 }

调用方式就很简单了，见下面的代码

// Build an extractor, pull the article text from the page, and display it.
const string articleUrl = "http://www.cnblogs.com/kandy/archive/2011/08/30/how_to_get_web_content_from_url.html";
WebContent extractor = new WebContent();
string articleText = extractor.GetContent(articleUrl);
MessageBox.Show(articleText);

posted on 2011-08-30 16:50 陈平阅读(595) 评论(0) 编辑收藏举报

刷新页面返回顶部

kandy

公告