正文提取部分。

 

这部分实现大部分是参考蛙蛙的,但是作了少许修改。

主要在下面两方面:

一,动态阈值。蛙蛙设置的是三百,所以他的算法对于短新闻来说失效。我的阈值设置为当前新闻正文所有汉字数量的函数。这样可以避免一些短新闻失效的情况,但还是不够完美。

二,有的正文页含有很多链接到其他新闻的URL,这部分模块的汉字数量也很大,所以,如果不把这种链接块儿去掉,统计字符进行正文定位的时候,容易跑偏。

上代码

 

判断两段儿文本里哪个中文占的比例高
 /// <summary>
        
/// 函数名称:CompareDinosByChineseLength
        
/// 函数功能:判断两段儿文本里哪个中文占的比例高
        
/// 函数参数:待比较的两个文本块x,y
        
/// 函数返回值:1表明文本块x中中文比例高于y,0则反之
        
/// </summary>
        
/// <param name="x"></param>
        
/// <param name="y"></param>
        
/// <returns></returns>
        
        
public static int CompareDinosByChineseLength(string x, string y)
        {
            
if (x == null)
            {
                
if (y == null)
                {
                    
return 0;
                }
                
else
                {
                    
return -1;
                }
            }
            
else
            {
                
if (y == null)
                {
                    
return 1;
                }
                
else
                {
                    Regex r 
= new Regex("[\u4e00-\u9fa5]");//UTF8 中中文汉字的范围
                    float xCount = (float)(r.Matches(x).Count) / (float)x.Length;
                    
float yCount = (float)(r.Matches(y).Count) / (float)y.Length;

                    
int retval = xCount.CompareTo(yCount);

                    
if (retval != 0)
                    {
                        
return retval;
                    }
                    
else//如果从百分比上看不出差距,则从汉字数量上来看
                    {
                        
return x.CompareTo(y);
                    }
                }
            }
        }

 

 

获取一个网页源码中的标签列表,支持嵌套,一般获取div、td等容器
 /// <summary>
        
/// 函数名称:GetTags
        
/// 函数功能:获取一个网页源码中的标签列表,支持嵌套,一般或去div,td等容器
        
/// 函数参数:input网页源码,tag指定要获取列表的表签
        
/// 函数返回值:源码中某一标签的标签列表
        
/// </summary>
        
/// <param name="input"></param>
        
/// <param name="tag"></param>
        
/// <returns></returns>
        public static List<string> GetTags(string input, string tag)
        {
            StringReader strReader 
= new StringReader(input);
            
int lowerThanCharCounter = 0;//标记是否出现<,如果出现了,其值为1,否则其值为0
            int lowerThanCharPos = 0;//记录标签<...>的开始位置
            Stack<int> tagPos = new Stack<int>();//
            List<string> taglist = new List<string>();
            
int i = 0;//下标记录当前char在整个字符串中的位置
            while (true)
            {
                
try
                {
                    
int intCharacter = strReader.Read();
                    
if (intCharacter == -1break;//退出循环条件

                    
char convertedCharacter = Convert.ToChar(intCharacter);

                    
if (lowerThanCharCounter > 0)
                    {
                        
if (convertedCharacter == '>')
                        {
                            lowerThanCharCounter
--;

                            
string biaoqian = input.Substring(lowerThanCharPos, i - lowerThanCharPos + 1);//取出整个标签(标签包括开始标签和结束标签)
                            if (biaoqian.StartsWith(string.Format("<{0}", tag)))//如果是开始标签
                            {
                                tagPos.Push(lowerThanCharPos);
//将"<"对应的位置压栈
                            }
                            
if (biaoqian.StartsWith(string.Format("</{0}", tag)))//如果是结束标签
                            {
                                
if (tagPos.Count < 1)
                                    
continue;//
                                int tempTagPos = tagPos.Pop();//弹栈
                                string strdiv = input.Substring(tempTagPos, i - tempTagPos + 1);//将整个元素<...>...<...>压入保持表
                                taglist.Add(strdiv);//taglist[0],taglist[1],taglist[2]...分别是从内层到外层的元素
                            }
                        }
                    }

                    
if (convertedCharacter == '<')
                    {
                        lowerThanCharCounter
++;//标记是否找到了"<"标签
                        lowerThanCharPos = i;//记录<的位置
                    }
                }
                
finally
                {
                    i
++;//下标指针后移
                }
            }
            
return taglist;
        }

 

 

 

下一页URL
 /// <summary>
        
/// 函数名称:GetNextPageUrl
        
/// 函数功能:从网页源码中提取下一页URL,并且如果下一页URL不完整,补全下一页URL
        
/// 函数参数:rawtext,当前网页源码,url 当前网页的绝对url
        
/// 函数返回值:下一页的绝对url,若无下一页url则返回空字符串
        
/// </summary>
        
/// <param name="rawtext"></param>
        
/// <returns></returns>

        
public static string GetNextPageUrl(string rawtext,string url)
        {
            
string urlpart = "";//存放提取出的URL
            Lexer lexer = new Lexer(rawtext);
            Parser parser 
= new Parser(lexer);
            NodeFilter filter 
= new TagNameFilter("a");
            NodeList htmlNodes 
= parser.Parse(filter);
            
for (int i = htmlNodes.Count - 1; i >= 0; i--)
            {
                ATag link 
= (ATag)htmlNodes.ElementAt(i);
                
string temp = link.StringText;
                Regex fetchNextPage 
= new Regex(@"下一页");
                
if (!fetchNextPage.IsMatch(temp))
                {
                    htmlNodes.Remove(i);
                }
            }
            
//如果<a>标签链表不空,则说明当前页不是最后一页,含有下一页URL,并提取出下一页URL
            if (htmlNodes.Count > 0)
            {
                ATag link 
= (ATag)htmlNodes.ElementAt(0);
                urlpart 
= link.GetAttribute("href");

                
//判断从源码中提取出的URL是否需要补全
                if (new Regex(@"^http:").IsMatch(urlpart))
                    
return urlpart;
               
//否则需要补全
                else
                {
                    Regex r1 
= new Regex(@"^/");//如果被补全的url不是以“/”打头,则在它前面加上“/”
                    if (!r1.IsMatch(urlpart))
                    {
                        urlpart 
= "/" + urlpart;

                    }
                    Regex r2 
= new Regex(@"/");

                    
int[] npos = new int[20];//从npos[1]开始,数组元素用于储存url中"/"出现的位置
                    for (int i = 0; i < npos.Length; i++)
                    {
                        npos[i] 
= 100;
                    }
                    
for (int i = 1; i <= r2.Matches(url).Count; i++)
                    {
                        npos[
0= 0;
                        npos[i] 
= url.IndexOf("/", npos[i - 1+ 1);

                    }
                    
//从相对url中构造出完整的绝对ur
                    int tmp = r2.Matches(url).Count - r2.Matches(urlpart).Count + 1;
                    
int destination = npos[tmp];
                    urlpart
= url.Remove(destination) + urlpart;
                    
return urlpart;


                }
               
                
            }


            
else return "";



        }

 

 

 

计算链接文字浓度
 /// <summary>
        
/// 函数名称:PercentageOfATag
        
/// 函数功能:计算某文本段内链接文字(汉字)的浓度
        
/// 函数参数:text某文本段文本
        
/// 函数返回值:当前文本段内链接文字个数/文本段内总的汉字个数
        
/// </summary>
        
/// <param name="text"></param>
        
/// <returns></returns>
        public static float PercentageOfATag(string text)
        {  
//找到文本段中的所有链接元素形成链接元素表
            Lexer lexer = new Lexer(text);
            Parser parser 
= new Parser(lexer);
            NodeFilter filter 
= new TagNameFilter("a");
            NodeList htmlNodes 
= parser.Parse(filter);
            
//取出所有链接文本
            string aTagText = "";
            
for (int i = 0; i < htmlNodes.Count; i++)
            {

                aTagText 
+= htmlNodes.ElementAt(i).ToPlainTextString();
            }
            
//计算链接文字与文本块中总汉字的比例
            Regex r = new Regex("[\u4e00-\u9fa5]");
            
float precise = (float)r.Matches(aTagText).Count / (float)r.Matches(text).Count;
            
return precise;
 
        }


 

 

获取网页源码
/// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its source as text,
        /// reporting the character encoding that was used to decode it.
        /// </summary>
        /// <remarks>
        /// The encoding is first taken from the HTTP response's character set, where
        /// "ISO-8859-1" is treated as "gb2312". If the response names no usable character
        /// set, <see cref="Encoding.Default"/> is used. When the decoded page declares a
        /// different charset in a &lt;meta&gt; tag, and that charset is known, the page's
        /// declaration wins and the raw bytes are decoded again with it.
        /// </remarks>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="encode">Receives the encoding finally used to decode the page.</param>
        /// <returns>The page source.</returns>
        public static string GetDataFromUrl(string url, ref Encoding encode)
        {
            HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);

            // Request headers / options.
            request.AllowAutoRedirect = true;
            request.AllowWriteStreamBuffering = true;
            request.Referer = "";
            request.Timeout = 10 * 1000;
            request.UserAgent = "";

            // Buffer the whole body so it can be decoded twice if necessary. The using
            // blocks release the connection even when reading fails.
            byte[] body;
            string characterSet;
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                characterSet = response.CharacterSet;
                using (Stream receiveStream = response.GetResponseStream())
                using (MemoryStream buffered = new MemoryStream())
                {
                    byte[] chunk = new byte[4096];
                    int read = receiveStream.Read(chunk, 0, chunk.Length);
                    while (read > 0)
                    {
                        buffered.Write(chunk, 0, read);
                        read = receiveStream.Read(chunk, 0, chunk.Length);
                    }

                    body = buffered.ToArray();
                }
            }

            // Pick the encoding announced by the HTTP response.
            if (string.IsNullOrEmpty(characterSet))
            {
                characterSet = "";
                encode = Encoding.Default;
            }
            else
            {
                if (string.Equals(characterSet, "ISO-8859-1", StringComparison.OrdinalIgnoreCase))
                {
                    characterSet = "gb2312";
                }

                encode = FindEncodingByName(characterSet) ?? Encoding.Default;
            }

            string str = DecodeBytes(body, encode);

            // If the page itself declares a different charset, trust the page and decode again.
            Regex metaCharset = new Regex(
                @"<meta[^>]+?charset\s*=\s*[""']?\s*([\w\-]+)",
                RegexOptions.IgnoreCase);
            Match declared = metaCharset.Match(str);
            if (declared.Success)
            {
                string pageCharSet = declared.Groups[1].Value;
                if (!string.Equals(pageCharSet, characterSet, StringComparison.OrdinalIgnoreCase))
                {
                    Encoding pageEncoding = FindEncodingByName(pageCharSet);
                    if (pageEncoding != null)
                    {
                        encode = pageEncoding;
                        str = DecodeBytes(body, encode);
                    }
                }
            }

            return str;
        }

        /// <summary>
        /// Looks up an encoding by name; returns null instead of throwing when the name is
        /// unknown or not supported.
        /// </summary>
        private static Encoding FindEncodingByName(string name)
        {
            try
            {
                return Encoding.GetEncoding(name);
            }
            catch (ArgumentException)
            {
                return null;
            }
            catch (NotSupportedException)
            {
                return null;
            }
        }

        /// <summary>
        /// Decodes a byte buffer to text with the given encoding (a byte-order mark, if
        /// present, takes precedence, as with a plain StreamReader).
        /// </summary>
        private static string DecodeBytes(byte[] bytes, Encoding encoding)
        {
            using (StreamReader reader = new StreamReader(new MemoryStream(bytes), encoding))
            {
                return reader.ReadToEnd();
            }
        }



 

 

修改正文
 /// <summary>
       
/// 函数名称:ModifyRawText
       
/// 功能说明:将html源码中的字符串进行转义以便正常显示
       
/// 参数: ref string rawtext表示待修改的html源码
       
/// </summary>
       
/// <param name="rawtext"></param>
        public static void ModifyRawText(ref string rawtext)
        {
            
string[] aryReg ={
         
          
@"&(quot;|#34;)",
          
@"&(amp;|#38;)",
          
@"&(lt;|#60;)",
          
@"&(gt;|#62;)"
          
@"&(nbsp;|#160;)"
          
@"&(iexcl;|#161;)",
          
@"&(cent;|#162;)",
          
@"&(pound;|#163;)",
          
@"&(copy|;#169;)",
          
@"&#(\d+);",
         
         
         };
            
string[] aryRep = {
          
           
"\"",
           "&",
           
"<",
           
">",
           
"  ",
           
"\xa1",//chr(161),
           "\xa2",//chr(162),
           "\xa3",//chr(163),
           "\xa9",//chr(169),
           "",
          
          };
            
//转义字符置换
            for (int i = 0; i < aryReg.Length; i++)
            {
                Regex regexfinal 
= new Regex(aryReg[i], RegexOptions.IgnoreCase);
                rawtext 
= regexfinal.Replace(rawtext, aryRep[i]);
            }



        }

 

正文提取模块
/// <summary>
        /// Extracts the main article text from the HTML source of a content page, keeping
        /// only div blocks that contain at least 200 Chinese characters.
        /// </summary>
        /// <param name="input">HTML source of the content page.</param>
        /// <returns>The purified article text, or an empty string when no div qualifies.</returns>
        public static string GetMainContent(string input)
        {
            return GetMainContent(input, 200);
        }

        /// <summary>
        /// Extracts the main article text from the HTML source of a content page.
        /// </summary>
        /// <remarks>
        /// Steps: collect all div elements; discard those with fewer than
        /// <paramref name="minChineseChars"/> Chinese characters or with a link-text
        /// density above 90%; pick the remaining div that sorts last under
        /// CompareDinosByChineseLength; turn p/br tags into line breaks; strip the
        /// remaining markup; decode a few HTML entities.
        /// </remarks>
        /// <param name="input">HTML source of the content page.</param>
        /// <param name="minChineseChars">
        /// Minimum number of Chinese characters a div must contain to be considered;
        /// lower it for short news items.
        /// </param>
        /// <returns>The purified article text, or an empty string when no div qualifies.</returns>
        public static string GetMainContent(string input, int minChineseChars)
        {
            if (string.IsNullOrEmpty(input))
            {
                return "";
            }

            string paragraphTags = @"<(p|br)[^<]*>";

            string markupNoise = @"(\[([^=]*)(=[^\]]*)?\][\s\S]*?\[/\1\])|(?<lj>(?<=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");《])<a\s+[^>]*>[^<]{2,}</a>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");》]))|(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)|(\&\#\d+\;)";

            RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnoreCase;
            Regex chinese = new Regex("[\u4e00-\u9fa5]");

            // 1-4. Take every div, dropping the ones with too little Chinese text and the
            //      ones that are mostly link text (navigation / related-news blocks).
            List<string> candidates = new List<string>();
            foreach (string div in GetTags(input, "div"))
            {
                if (chinese.Matches(div).Count < minChineseChars)
                {
                    continue;
                }

                if (PercentageOfATag(div) > 0.9)
                {
                    continue;
                }

                candidates.Add(div);
            }

            if (candidates.Count < 1)
            {
                return "";
            }

            // 5. Sort by Chinese-character ratio and keep the block that sorts last.
            candidates.Sort(CompareDinosByChineseLength);
            string content = candidates[candidates.Count - 1];

            // 6. Replace p and br tags with the placeholders [p] and [br].
            content = new Regex(paragraphTags, options).Replace(content, "[$1]");

            // 7. Strip the remaining HTML markup, keeping the text.
            content = new Regex(markupNoise, options).Replace(content, "");

            // 8. Turn the placeholders into line breaks.
            content = new Regex(@"\[p\]", options).Replace(content, "\r\n  ");
            content = new Regex(@"\[br\]", options).Replace(content, "\r\n");

            // 9. Decode leftover entities. These patterns match entities without the
            //    trailing ';' - entities of the form "&name;" and "&#NNN;" were already
            //    removed by the markup-stripping pattern above.
            string[] entityPatterns =
            {
                @"&(quot|#34)",
                @"&(amp|#38)",
                @"&(lt|#60)",
                @"&(gt|#62)",
                @"&(nbsp|#160)",
                @"&(iexcl|#161)",
                @"&(cent|#162)",
                @"&(pound|#163)",
                @"&(copy|#169)",
                @"&#(\d+)",
            };

            string[] entityReplacements =
            {
                "\"",
                "&",
                "<",
                ">",
                "  ",
                "\xa1", // chr(161)
                "\xa2", // chr(162)
                "\xa3", // chr(163)
                "\xa9", // chr(169)
                "",
            };

            for (int i = 0; i < entityPatterns.Length; i++)
            {
                Regex entity = new Regex(entityPatterns[i], RegexOptions.IgnoreCase);
                content = entity.Replace(content, entityReplacements[i]);
            }

            // Small patch for Tencent news content pages.
            content = new Regex(@"(·[\s\S]+?\r\n|\[\])", options).Replace(content, "");

            return content;
        }


    }
}

 注:由于时间久远,以及代码改之又改,上传的版本一定能用,但是可能会和讲解有些出入。另外,所有的这些工作都是小孩子的把戏,欢迎大家留言指点,包括思路,以及编码规范。希望自己的一些不足能够被大家指出,不吝赐教。在此谢过

下载地址

北京邮电大学毕业设计,新闻网页解析器

 

 

posted on 2010-03-20 11:16  finallyly  阅读(2580)  评论(5)  编辑  收藏  举报