网页提取正文代码

去年年底自己突发奇想，想做一个智能的RSS阅读器，其中涉及到网页提取正文的功能，于是上网查找了很多办法，最后觉得计算文本密度的方法比较简单实用，也就是比较标签内的文本长度与该标签html代码总长度之间的关系。

这期间涉及到了很多问题，有些解决了，有些还没有解决，发上来请高手协助解决，并将源代码公布供需要的朋友参考。

目前的一个问题是：

在本地运行时，搜狐网的网页无法获取正文；放到租用的空间上以后，除搜狐网以外，还有很多网站也无法获取网页正文。

http://xuanzeti.130.china123.net/GetWebContent.aspx?url=http://news.xinmin.cn/rollnews/2011/03/22/9870995_2.html

附代码：

/// <summary>
/// Extracts the main body text of a web page with a text-density heuristic:
/// every element below &lt;body&gt; is scored by how much visible text it holds
/// relative to the size of its markup, and the best-scoring element's text wins.
/// </summary>
public class WebContent
{
    // Elements whose visible text is not longer than this are ignored as
    // candidates; shorter fragments are too small to be meaningful body text.
    private const int MinTextLength = 100;

    // Number of leading response bytes scanned for a charset declaration.
    private const int CharsetSniffBytes = 4096;

    // Regex fragments handed to SniffwebCodeReturnList to pull a charset name
    // out of either an HTTP Content-Type value or an HTML <meta> tag. The
    // optional quote covers <meta charset="utf-8">; the terminator also accepts
    // end-of-input so a bare header value such as "text/html; charset=gbk" works.
    private const string CharsetBegin = @"charset\s*=\s*[""']?";
    private const string CharsetEnd = @"(?:[""'\s;/>]|$)";

    /// <summary>Creates an extractor without a default URL.</summary>
    public WebContent() { }

    /// <summary>Creates an extractor whose default URL is <paramref name="url"/>.</summary>
    /// <param name="url">Page address used by the parameterless <see cref="GetContent()"/>.</param>
    public WebContent(string url)
    {
        this.Url = url;
    }

    private string Url { get; set; }

    private string Content { get; set; }

    /// <summary>
    /// The exception that made the most recent <see cref="GetContent(string)"/> call
    /// fail, or <c>null</c> when that call completed without an error.
    /// </summary>
    public System.Exception LastError { get; private set; }

    /// <summary>Extracts the body text of the URL given to the constructor.</summary>
    /// <returns>Same contract as <see cref="GetContent(string)"/>.</returns>
    public string GetContent()
    {
        return GetContent(this.Url);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and returns the text of its densest element.
    /// This is the method callers are expected to use.
    /// </summary>
    /// <param name="url">Address of the page to analyse.</param>
    /// <returns>
    /// The extracted text; an empty string when no element carries more than
    /// <c>MinTextLength</c> characters of text; <c>null</c> when the page could not be
    /// processed at all (the cause is then available through <see cref="LastError"/>).
    /// </returns>
    public string GetContent(string url)
    {
        // Reset per-call state so a failed call can never hand back the
        // result of an earlier, successful one.
        this.Content = null;
        this.LastError = null;

        ThreadWebBrowser(url);

        return this.Content;
    }

    /// <summary>
    /// Runs <see cref="BeginCatch"/> on a dedicated STA thread (the apartment state
    /// set here is what the WebBrowser control is created under) and blocks until
    /// that thread has finished.
    /// </summary>
    private void ThreadWebBrowser(string url)
    {
        Thread thread = new Thread(new ParameterizedThreadStart(BeginCatch));
        thread.SetApartmentState(ApartmentState.STA);
        thread.Start(url);

        // Join() only returns once the worker has ended, so polling IsAlive
        // afterwards (as the code used to do) could never execute its body.
        thread.Join();
    }

    /// <summary>
    /// Thread entry point: loads the page into an off-screen WebBrowser, stores the
    /// extracted text in <c>Content</c>, and records any failure in <see cref="LastError"/>.
    /// </summary>
    /// <param name="obj">The page URL, passed through <c>Thread.Start</c>.</param>
    private void BeginCatch(object obj)
    {
        try
        {
            string url = obj == null ? null : obj.ToString();
            if (string.IsNullOrEmpty(url))
            {
                throw new System.ArgumentException("A page URL is required.", "url");
            }

            using (WebBrowser webBrowser = new WebBrowser())
            {
                webBrowser.ScriptErrorsSuppressed = true;
                webBrowser.Navigate("about:blank");

                // One download, decoded with the charset the server or the page
                // declares. The previous approach fetched the page twice and threw
                // whenever the sniffed charset was empty or still carried a quote,
                // which silently produced no content for such sites.
                string html = GetHtmlCode(url, null);

                if (webBrowser.Document == null)
                {
                    throw new System.InvalidOperationException("The WebBrowser control exposed no document to write into.");
                }

                webBrowser.Document.Write(html);

                this.Content = FindDensestText(webBrowser.Document);
            }
        }
        catch (System.Exception ex)
        {
            // Best effort by design: nothing may escape this worker thread, so
            // the failure is recorded for the caller instead of being rethrown.
            this.LastError = ex;
        }
    }

    /// <summary>
    /// Returns the text of the element under &lt;body&gt; with the highest score,
    /// or an empty string when the document has no body or no element qualifies.
    /// </summary>
    private static string FindDensestText(HtmlDocument document)
    {
        if (document.Body == null)
        {
            return string.Empty;
        }

        HtmlElementCollection allElement = document.Body.All;
        string bestText = string.Empty;
        long bestScore = -1;

        for (int i = 0; i < allElement.Count; i++)
        {
            HtmlElement element = allElement[i];

            // Read each property once; every access goes through the browser's DOM.
            string text = element.InnerText;
            if (text == null || text.Length <= MinTextLength)
            {
                continue;
            }

            string markup = element.OuterHtml;
            if (string.IsNullOrEmpty(markup))
            {
                continue;
            }

            // Score = density (text / markup) multiplied by text length. Ranking by
            // length alone or by density alone, in either order, proved inaccurate;
            // weighting density by length improved accuracy noticeably, although
            // unusual pages may still defeat it. Computed in 64-bit because
            // length * length overflows int once the text exceeds ~46k characters.
            long score = (long)text.Length * text.Length / markup.Length;

            // Strict '>' keeps the first element in document order on ties, which is
            // also what makes identical-markup duplicates irrelevant.
            if (score > bestScore)
            {
                bestScore = score;
                bestText = text;
            }
        }

        return bestText;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its source as text.
    /// </summary>
    /// <param name="url">Address of the page.</param>
    /// <param name="encode">
    /// Encoding used to decode the response. Pass <c>null</c> to detect it from the
    /// Content-Type header, then from a charset declaration near the top of the
    /// document, falling back to <c>Encoding.Default</c>.
    /// </param>
    private static string GetHtmlCode(string url, Encoding encode)
    {
        string contentType;
        byte[] raw = DownloadBytes(url, out contentType);
        Encoding encoding = encode ?? DetectEncoding(contentType, raw);

        // StreamReader honours a byte-order mark if the payload starts with one.
        using (System.IO.MemoryStream memory = new System.IO.MemoryStream(raw))
        using (System.IO.StreamReader reader = new System.IO.StreamReader(memory, encoding, true))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Fetches the raw response body of <paramref name="url"/> and reports the
    /// response's Content-Type value through <paramref name="contentType"/>.
    /// </summary>
    private static byte[] DownloadBytes(string url, out string contentType)
    {
        System.Net.WebRequest request = System.Net.WebRequest.Create(url);

        // Have the framework unpack gzip/deflate bodies.
        // NOTE(review): compressed responses are a plausible reason some sites
        // yielded unreadable source before - confirm against the failing sites.
        System.Net.HttpWebRequest httpRequest = request as System.Net.HttpWebRequest;
        if (httpRequest != null)
        {
            httpRequest.AutomaticDecompression =
                System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;
        }

        using (System.Net.WebResponse response = request.GetResponse())
        using (System.IO.Stream stream = response.GetResponseStream())
        using (System.IO.MemoryStream buffer = new System.IO.MemoryStream())
        {
            contentType = response.ContentType;

            byte[] chunk = new byte[8192];
            int read;
            while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }

            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Chooses the encoding for a downloaded page: the charset named in the
    /// Content-Type value first, then one declared in the first bytes of the
    /// document, and finally <c>Encoding.Default</c>.
    /// </summary>
    private static Encoding DetectEncoding(string contentType, byte[] raw)
    {
        Encoding fromHeader = TryGetEncoding(SniffCharset(contentType));
        if (fromHeader != null)
        {
            return fromHeader;
        }

        // Charset names are plain ASCII, so an ASCII view of the head is enough
        // to find the declaration even before the real encoding is known.
        int headLength = System.Math.Min(raw.Length, CharsetSniffBytes);
        string head = Encoding.ASCII.GetString(raw, 0, headLength);

        return TryGetEncoding(SniffCharset(head)) ?? Encoding.Default;
    }

    /// <summary>
    /// Returns the first charset name declared in <paramref name="text"/>, or an
    /// empty string when there is none.
    /// </summary>
    private static string SniffCharset(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return string.Empty;
        }

        return SniffwebCodeReturnList(text, CharsetBegin, CharsetEnd);
    }

    /// <summary>
    /// Maps a charset name to an <see cref="Encoding"/>; returns <c>null</c> when the
    /// name is empty or is not recognised by <c>Encoding.GetEncoding</c>.
    /// </summary>
    private static Encoding TryGetEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(name.Trim());
        }
        catch (System.ArgumentException)
        {
            return null;
        }
        catch (System.NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Cuts a fragment out of HTML source: returns the shortest non-empty text found
    /// between the first occurrence of <paramref name="wordsBegin"/> and the following
    /// <paramref name="wordsEnd"/>, matched case-insensitively.
    /// </summary>
    /// <param name="code">Text to search.</param>
    /// <param name="wordsBegin">Opening delimiter; inserted into the pattern verbatim, so it is a regex fragment.</param>
    /// <param name="wordsEnd">Closing delimiter; inserted into the pattern verbatim, so it is a regex fragment.</param>
    /// <returns>The captured fragment, or an empty string when nothing matches or the arguments are unusable.</returns>
    private static string SniffwebCodeReturnList(string code, string wordsBegin, string wordsEnd)
    {
        try
        {
            System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex(
                wordsBegin + @"(?<code>[\s\S]+?)" + wordsEnd,
                System.Text.RegularExpressions.RegexOptions.IgnoreCase);

            // Only the first match was ever used, so there is no need to collect them all.
            System.Text.RegularExpressions.Match match = regex.Match(code);
            return match.Success ? match.Groups["code"].Value : string.Empty;
        }
        catch (System.ArgumentException)
        {
            // Covers a null input and a malformed pattern alike.
            return string.Empty;
        }
    }
}

如果你有百度空间帐号，请直接访问这里 http://hi.baidu.com/kandychen/blog/item/d8645743ace6f8049313c6e2.html

如果你有新浪帐号，请直接访问这里 http://blog.sina.com.cn/s/blog_408cb6ff0100pq6l.html

posted on 2011-03-22 13:43 陈平阅读(2899) 评论(0) 编辑收藏举报

刷新页面返回顶部

kandy

公告