Main content extraction.
Most of this part follows 蛙蛙's implementation, but with a few modifications.
The changes are mainly in two areas:
1. Dynamic threshold. 蛙蛙 uses a fixed threshold of 300, so his algorithm fails on short news articles. I make the threshold a function of the total number of Chinese characters in the current article, which avoids failures on some short articles, though it is still not perfect (see the sketch after this list).
2. Some content pages contain many links to other news articles, and those link blocks also carry a lot of Chinese characters. If such blocks are not removed first, locating the body text by character statistics can easily drift to the wrong block.
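As a rough illustration of the first point, the dynamic threshold could be computed along the lines below. This is only a minimal sketch of the idea; the helper name, the 10% factor, and the 50-character floor are my own assumptions, and the code posted further down still uses a fixed value of 200.
// Hypothetical helper: the filtering threshold grows with the amount of Chinese text on the page.
public static int DynamicThreshold(string pageHtml)
{
    Regex chinese = new Regex("[\u4e00-\u9fa5]");
    int totalChinese = chinese.Matches(pageHtml).Count;
    return Math.Max(50, totalChinese / 10);// illustrative numbers, not taken from the original code
}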
On to the code.
Determine which of two text blocks has the higher proportion of Chinese characters
/// <summary>
/// Function name: CompareDinosByChineseLength
/// Purpose: decide which of two text blocks has the higher proportion of Chinese characters
/// Parameters: the two text blocks x and y to compare
/// Returns: positive if x has a higher Chinese ratio than y, 0 if equal, negative otherwise
/// </summary>
/// <param name="x"></param>
/// <param name="y"></param>
/// <returns></returns>
public static int CompareDinosByChineseLength(string x, string y)
{
    if (x == null)
    {
        if (y == null)
        {
            return 0;
        }
        else
        {
            return -1;
        }
    }
    else
    {
        if (y == null)
        {
            return 1;
        }
        else
        {
            Regex r = new Regex("[\u4e00-\u9fa5]");// Unicode range of Chinese characters
            float xCount = (float)(r.Matches(x).Count) / (float)x.Length;
            float yCount = (float)(r.Matches(y).Count) / (float)y.Length;
            int retval = xCount.CompareTo(yCount);
            if (retval != 0)
            {
                return retval;
            }
            else// if the ratios show no difference, fall back to the absolute number of Chinese characters
            {
                return r.Matches(x).Count.CompareTo(r.Matches(y).Count);
            }
        }
    }
}
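Since the signature matches Comparison&lt;string&gt;, the comparer can be passed straight to List&lt;string&gt;.Sort. A tiny usage sketch (the block contents here are made up for illustration):
List<string> blocks = new List<string> { "<div>今天的新闻正文内容</div>", "<div>home | about | contact</div>" };
blocks.Sort(CompareDinosByChineseLength);
// after sorting, the block with the highest proportion of Chinese characters is at the end
string richest = blocks[blocks.Count - 1];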
Get the list of a given tag's elements from a page's source, with nesting support; typically used to grab container tags such as div or td
/// <summary>
/// Function name: GetTags
/// Purpose: get the list of a given tag's elements from a page's source, with nesting support; typically used for container tags such as div or td
/// Parameters: input is the page source, tag is the tag name to collect
/// Returns: the list of all elements of that tag found in the source
/// </summary>
/// <param name="input"></param>
/// <param name="tag"></param>
/// <returns></returns>
public static List<string> GetTags(string input, string tag)
{
    StringReader strReader = new StringReader(input);
    int lowerThanCharCounter = 0;// incremented when a '<' is seen, decremented on the matching '>'
    int lowerThanCharPos = 0;// start position of the current <...> tag
    Stack<int> tagPos = new Stack<int>();// start positions of opening tags that have not been closed yet
    List<string> taglist = new List<string>();
    int i = 0;// index of the current char within the whole string
    while (true)
    {
        try
        {
            int intCharacter = strReader.Read();
            if (intCharacter == -1) break;// end of input, exit the loop
            char convertedCharacter = Convert.ToChar(intCharacter);
            if (lowerThanCharCounter > 0)
            {
                if (convertedCharacter == '>')
                {
                    lowerThanCharCounter--;
                    string biaoqian = input.Substring(lowerThanCharPos, i - lowerThanCharPos + 1);// the complete tag (may be an opening or a closing tag)
                    if (biaoqian.StartsWith(string.Format("<{0}", tag)))// opening tag
                    {
                        tagPos.Push(lowerThanCharPos);// push the position of its '<'
                    }
                    if (biaoqian.StartsWith(string.Format("</{0}", tag)))// closing tag
                    {
                        if (tagPos.Count < 1)
                            continue;// unmatched closing tag, skip it
                        int tempTagPos = tagPos.Pop();// pop the matching opening tag's position
                        string strdiv = input.Substring(tempTagPos, i - tempTagPos + 1);// the whole element <tag ...>...</tag>
                        taglist.Add(strdiv);// taglist[0], taglist[1], ... go from the innermost to the outermost elements
                    }
                }
            }
            if (convertedCharacter == '<')
            {
                lowerThanCharCounter++;// a '<' has been seen
                lowerThanCharPos = i;// remember its position
            }
        }
        finally
        {
            i++;// advance the index
        }
    }
    return taglist;
}
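For example, to collect every div element on a page, assuming html already holds the page source:
List<string> divs = GetTags(html, "div");
foreach (string div in divs)
{
    // each entry is the full "<div ...>...</div>" substring; inner divs come before the outer divs that contain them
    Console.WriteLine(div.Length);
}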
Next-page URL
/// <summary>
/// Function name: GetNextPageUrl
/// Purpose: extract the next-page URL from the page source and, if it is not a complete URL, turn it into an absolute one
/// Parameters: rawtext is the current page's source, url is the current page's absolute URL
/// Returns: the absolute URL of the next page, or an empty string if there is no next page
/// </summary>
/// <param name="rawtext"></param>
/// <param name="url"></param>
/// <returns></returns>
public static string GetNextPageUrl(string rawtext, string url)
{
    string urlpart = "";// holds the extracted URL
    Lexer lexer = new Lexer(rawtext);
    Parser parser = new Parser(lexer);
    NodeFilter filter = new TagNameFilter("a");
    NodeList htmlNodes = parser.Parse(filter);
    for (int i = htmlNodes.Count - 1; i >= 0; i--)
    {
        ATag link = (ATag)htmlNodes.ElementAt(i);
        string temp = link.StringText;
        Regex fetchNextPage = new Regex(@"下一页");
        if (!fetchNextPage.IsMatch(temp))
        {
            htmlNodes.Remove(i);// keep only links whose text contains "下一页" (next page)
        }
    }
    // if the <a> list is not empty, the current page is not the last one; extract the next-page URL
    if (htmlNodes.Count > 0)
    {
        ATag link = (ATag)htmlNodes.ElementAt(0);
        urlpart = link.GetAttribute("href");
        // if the extracted URL is already absolute, return it as-is
        if (new Regex(@"^http:").IsMatch(urlpart))
            return urlpart;
        // otherwise it has to be completed
        else
        {
            Regex r1 = new Regex(@"^/");// if the partial url does not start with "/", prepend one
            if (!r1.IsMatch(urlpart))
            {
                urlpart = "/" + urlpart;
            }
            Regex r2 = new Regex(@"/");
            int[] npos = new int[20];// starting from npos[1], the elements store the positions of "/" in url
            for (int i = 0; i < npos.Length; i++)
            {
                npos[i] = 100;
            }
            for (int i = 1; i <= r2.Matches(url).Count; i++)
            {
                npos[0] = 0;
                npos[i] = url.IndexOf("/", npos[i - 1] + 1);
            }
            // build the complete absolute url from the relative one
            int tmp = r2.Matches(url).Count - r2.Matches(urlpart).Count + 1;
            int destination = npos[tmp];
            urlpart = url.Remove(destination) + urlpart;
            return urlpart;
        }
    }
    else return "";
}
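The slash-counting completion above assumes at most 20 slashes and only plain http URLs. A simpler alternative, not used in the original code, is to let System.Uri resolve the href against the current page's address; it also copes with https and "../"-style relative paths. A minimal sketch of the idea:
// Hypothetical alternative to the manual completion; not part of the original implementation.
Uri baseUri = new Uri(url);              // absolute URL of the current page
Uri nextUri = new Uri(baseUri, urlpart); // resolves both relative and absolute hrefs
string absolute = nextUri.AbsoluteUri;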
Compute link-text density
/// <summary>
/// Function name: PercentageOfATag
/// Purpose: compute the density of link text (Chinese characters inside <a> tags) within a text block
/// Parameters: text is the text block
/// Returns: number of Chinese characters in link text / total number of Chinese characters in the block
/// </summary>
/// <param name="text"></param>
/// <returns></returns>
public static float PercentageOfATag(string text)
{
    // collect all link elements in the block
    Lexer lexer = new Lexer(text);
    Parser parser = new Parser(lexer);
    NodeFilter filter = new TagNameFilter("a");
    NodeList htmlNodes = parser.Parse(filter);
    // concatenate all link text
    string aTagText = "";
    for (int i = 0; i < htmlNodes.Count; i++)
    {
        aTagText += htmlNodes.ElementAt(i).ToPlainTextString();
    }
    // ratio of link-text Chinese characters to all Chinese characters in the block
    Regex r = new Regex("[\u4e00-\u9fa5]");
    int totalChinese = r.Matches(text).Count;
    if (totalChinese == 0)
        return 0;// avoid dividing by zero on blocks that contain no Chinese characters
    float precise = (float)r.Matches(aTagText).Count / (float)totalChinese;
    return precise;
}
Fetch a page's source
/// <summary>
/// Function name: GetDataFromUrl
/// Purpose: download the source of the page at the given url
/// Parameter: string url, the page to fetch
/// Parameter: ref Encoding encode, receives the character encoding detected for the page
/// </summary>
/// <param name="url"></param>
/// <param name="encode"></param>
/// <returns></returns>
public static string GetDataFromUrl(string url, ref Encoding encode)
{
    string str = string.Empty;
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
    // set up the http headers
    request.AllowAutoRedirect = true;
    request.AllowWriteStreamBuffering = true;
    request.Referer = "";
    request.Timeout = 10 * 1000;
    request.UserAgent = "";
    HttpWebResponse response = null;
    response = (HttpWebResponse)request.GetResponse();
    // determine the encoding from the http response headers
    string characterSet = response.CharacterSet;
    if (characterSet != "")
    {
        if (characterSet == "ISO-8859-1")
        {
            characterSet = "gb2312";// many Chinese sites report ISO-8859-1 by default; treat it as gb2312
        }
        encode = Encoding.GetEncoding(characterSet);
    }
    else
    {
        encode = Encoding.Default;
    }
    // buffer the http response stream into a memory stream
    Stream receiveStream = response.GetResponseStream();
    MemoryStream mStream = new MemoryStream();
    byte[] bf = new byte[255];
    int count = receiveStream.Read(bf, 0, 255);
    while (count > 0)
    {
        mStream.Write(bf, 0, count);
        count = receiveStream.Read(bf, 0, 255);
    }
    receiveStream.Close();
    mStream.Seek(0, SeekOrigin.Begin);
    // read the string from the memory stream
    StreamReader reader = new StreamReader(mStream, encode);
    char[] buffer = new char[1024];
    count = reader.Read(buffer, 0, 1024);
    while (count > 0)
    {
        str += new String(buffer, 0, count);
        count = reader.Read(buffer, 0, 1024);
    }
    // look for a charset declaration in the downloaded text; if it differs from the encoding in the
    // http response, trust the page's own declaration and re-read the text from the memory stream
    Regex reg =
        new Regex(@"<meta[\s\S]+?charset=(.*?)""[\s\S]+?>",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);
    MatchCollection mc = reg.Matches(str);
    if (mc.Count > 0)
    {
        string tempCharSet = mc[0].Result("$1");
        if (string.Compare(tempCharSet, characterSet, true) != 0)
        {
            encode = Encoding.GetEncoding(tempCharSet);
            str = string.Empty;
            mStream.Seek(0, SeekOrigin.Begin);
            reader = new StreamReader(mStream, encode);
            buffer = new char[255];
            count = reader.Read(buffer, 0, 255);
            while (count > 0)
            {
                str += new String(buffer, 0, count);
                count = reader.Read(buffer, 0, 255);
            }
        }
    }
    reader.Close();
    mStream.Close();
    if (response != null)
        response.Close();
    return str;
}
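A typical call looks like the following; the URL is only a placeholder, and the Encoding variable is passed by reference so the caller can see which encoding was finally used:
Encoding encode = Encoding.Default;
string html = GetDataFromUrl("http://example.com/news/1.html", ref encode);// placeholder URL
Console.WriteLine("detected encoding: " + encode.WebName);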
Clean up the raw text (decode HTML entities)
/// <summary>
/// Function name: ModifyRawText
/// Purpose: replace HTML entities in the source with the characters they stand for, so the text displays normally
/// Parameter: ref string rawtext, the html source to modify in place
/// </summary>
/// <param name="rawtext"></param>
public static void ModifyRawText(ref string rawtext)
{
    string[] aryReg ={
        @"&(quot;|#34;)",
        @"&(amp;|#38;)",
        @"&(lt;|#60;)",
        @"&(gt;|#62;)",
        @"&(nbsp;|#160;)",
        @"&(iexcl;|#161;)",
        @"&(cent;|#162;)",
        @"&(pound;|#163;)",
        @"&(copy;|#169;)",
        @"&#(\d+);",
    };
    string[] aryRep = {
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\xa1",//chr(161)
        "\xa2",//chr(162)
        "\xa3",//chr(163)
        "\xa9",//chr(169)
        "",
    };
    // replace each entity with its character
    for (int i = 0; i < aryReg.Length; i++)
    {
        Regex regexfinal = new Regex(aryReg[i], RegexOptions.IgnoreCase);
        rawtext = regexfinal.Replace(rawtext, aryRep[i]);
    }
}
The main content extraction module
/// <summary>
/// Function name: GetMainContent
/// Purpose: extract the body text from a content page's source
/// Parameters: input, the content page's source
/// Returns: the purified text
/// </summary>
/// <param name="input"></param>
/// <returns></returns>
public static string GetMainContent(string input)
{
    string reg1 = @"<(p|br)[^<]*>";
    string reg2 = @"(\[([^=]*)(=[^\]]*)?\][\s\S]*?\[/\1\])|(?<lj>(?<=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");《])<a\s+[^>]*>[^<]{2,}</a>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");》]))|(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)|(\&\#\d+\;)";
    // 1. collect all div elements on the page
    List<string> list = GetTags(input, "div");
    List<string> needToRemove = new List<string>();
    foreach (string s in list)
    {
        Regex r = new Regex("[\u4e00-\u9fa5]");
        if (r.Matches(s).Count < 200)// 2. drop divs with fewer than 200 Chinese characters (fixed at 200 in this posted version)
        {
            needToRemove.Add(s);
        }
        if (PercentageOfATag(s) > 0.9)// 3. drop divs whose link-text density is too high (over 90%)
            needToRemove.Add(s);
    }
    // 4. remove the non-candidate divs from the list
    foreach (string s in needToRemove)
    {
        list.Remove(s);
    }
    // 5. sort the remaining divs by their proportion of Chinese characters; the richest block ends up last
    list.Sort(CompareDinosByChineseLength);
    if (list.Count < 1)
    {
        return "";
    }
    input = list[list.Count - 1];
    // 6. replace p and br tags with the placeholders [p] and [br]
    input = new Regex(reg1, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(input, "[$1]");
    // 7. strip the remaining HTML tags, keeping the text
    input = new Regex(reg2, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(input, "");
    // 8. turn the placeholders back into carriage returns and line feeds
    input = new Regex(@"\[p\]", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(input, "\r\n ");
    input = new Regex(@"\[br\]", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(input, "\r\n");
    string[] aryReg ={
        @"&(quot|#34)",
        @"&(amp|#38)",
        @"&(lt|#60)",
        @"&(gt|#62)",
        @"&(nbsp|#160)",
        @"&(iexcl|#161)",
        @"&(cent|#162)",
        @"&(pound|#163)",
        @"&(copy|#169)",
        @"&#(\d+)",
    };
    string[] aryRep = {
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\xa1",//chr(161)
        "\xa2",//chr(162)
        "\xa3",//chr(163)
        "\xa9",//chr(169)
        "",
    };
    // 9. decode the remaining HTML entities
    for (int i = 0; i < aryReg.Length; i++)
    {
        Regex regexfinal = new Regex(aryReg[i], RegexOptions.IgnoreCase);
        input = regexfinal.Replace(input, aryRep[i]);
    }
    // small patch for Tencent news content pages
    input = new Regex(@"(·[\s\S]+?\r\n|\[\])", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(input, "");
    return input;
}
}
}
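Putting the pieces together, crawling one article (including its continuation pages) might look like the sketch below. The starting URL is a placeholder and the loop is only my own glue code, not something shown in the original post:
// Sketch of wiring the functions together; the URL and the loop structure are assumptions.
string pageUrl = "http://example.com/news/1.html";
StringBuilder article = new StringBuilder();
while (!string.IsNullOrEmpty(pageUrl))
{
    Encoding encode = Encoding.Default;
    string rawtext = GetDataFromUrl(pageUrl, ref encode); // download the page
    article.Append(GetMainContent(rawtext));              // extract this page's body text
    pageUrl = GetNextPageUrl(rawtext, pageUrl);           // empty string means there is no next page
}
Console.WriteLine(article.ToString());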
Note: because a lot of time has passed and the code has been revised again and again, the uploaded version definitely runs, but it may differ somewhat from the explanation above. All of this is just child's play; comments and pointers are very welcome, on both the approach and the coding style. I hope my shortcomings will be pointed out, so please don't hold back your advice. Thanks in advance.
Download link