启发式提取一个网页的主体内容

对于一个网页来说，一般都会有非常丰富的边框或导航条信息，但是用户往往只关注主体内容，边框内容可以说没有太大价值。尤其是对于手持设备来说，显示大量的边框信息会让人非常讨厌。在搜索引擎中，其实也只需要索引网页的主体内容，对边框内容建立索引的意义不大。

上面说了一下提取网页主体内容的作用，下面来讨论一下方法。要准确地提取一个网页的主体内容是非常困难的：网页的结构多种多样，很难让程序完全自动化地完成提取。

在网上看资料的时候，看到一篇文章 http://w-shadow.com/blog/2008/01/25/extracting-the-main-content-from-a-webpage/ ，讲述了如何基于启发式方法提取网页内容，并附有 PHP 实现的代码。

我们知道，导航信息一般都是一些链接，因此导航区域中链接文本所占的比例会比较高。我们只要去除这些链接文本占比明显偏高的区域，就基本可以去除这些丰富的边框信息了。

下面是我用C#实现的代码：

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using Sgml;
using System.IO;

namespace Soso.Youwang
{

    /// <summary>
    /// Heuristically extracts the main content of an HTML page: noise elements
    /// (scripts, styles, images, ...) are dropped first, then every container
    /// element that is too short or whose text is dominated by link text is
    /// removed from the parsed document.
    /// </summary>
    /// <remarks>
    /// The per-document counters live in instance fields that <see cref="Extract"/>
    /// overwrites, so concurrent calls on the same instance would interfere with
    /// each other.
    /// </remarks>
    class HtmlMainContentExtractor
    {
        /// <summary>
        /// Elements treated as layout containers; only these can be removed by the
        /// length / link-density rules.
        /// </summary>
        private readonly HashSet<string> containerTags = new HashSet<string>() { "div", "table", "td", "th", "tbody", "thead", "tfoot", "col", "colgroup", "ul", "ol", "html", "center", "span", "form" };

        /// <summary>
        /// Nodes that are always removed, regardless of their content.
        /// </summary>
        private readonly HashSet<string> removeTags = new HashSet<string>() { "script", "noscript", "style", "meta", "input", "iframe", "embed", "hr", "img", "#comment", "link", "label" };

        /// <summary>
        /// Containers that are exempt from the minimum-length rule.
        /// </summary>
        private readonly HashSet<string> ignoreLenTags = new HashSet<string>() { "span" };

        /// <summary>
        /// Length of the link text in the current document.
        /// </summary>
        private int totalLinkLen = 0;

        /// <summary>
        /// Length of all text in the current document.
        /// </summary>
        private int totalLen = 0;

        /// <summary>
        /// A container is removed when its link density is at least this many
        /// times the document average.
        /// </summary>
        private readonly double rate = 1.1;

        /// <summary>
        /// Average link density (link text length / total text length) of the
        /// current document.
        /// </summary>
        private double avgLinkRate;

        /// <summary>
        /// Containers holding no more than this many characters are removed.
        /// </summary>
        private readonly int minLen = 20;

        /// <summary>
        /// Parses <paramref name="html"/>, prunes noise and link-heavy containers,
        /// and returns what is left.
        /// </summary>
        /// <param name="html">The HTML source of the page.</param>
        /// <returns>The outer XML of the pruned document.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
        public String Extract(string html)
        {
            if (html == null)
            {
                throw new ArgumentNullException("html");
            }

            // Start from a clean slate so that a reused instance does not mix the
            // statistics of a previous document into this one.
            totalLinkLen = 0;
            totalLen = 0;
            avgLinkRate = 0.0;

            XmlDocument doc = ConvertHtml2Xhtml(html);
            XmlElement root = doc.DocumentElement;
            if (root == null)
            {
                return doc.OuterXml;
            }

            // Pass 1: drop noise elements and collect the document-wide totals.
            HeuristicRemove(root);
            avgLinkRate = totalLen > 0 ? 1.0 * totalLinkLen / totalLen : 0.0;

            // Pass 2: drop containers that are too short or too link-heavy.
            // The verdict for the root itself is ignored: the root is never removed.
            int total, link;
            ContainerRemove(root, out total, out link);
            return doc.OuterXml;
        }

        /// <summary>
        /// Recursively removes child containers that fail the length / link-density
        /// rules and reports whether <paramref name="node"/> itself fails them.
        /// </summary>
        /// <param name="node">The node to examine.</param>
        /// <param name="total">Text length found under the node, excluding removed children.</param>
        /// <param name="link">Link text length found under the node, excluding removed children.</param>
        /// <returns>true when the caller should remove <paramref name="node"/>.</returns>
        private bool ContainerRemove(XmlNode node, out int total, out int link)
        {
            total = 0;
            link = 0;

            // Collect first, remove afterwards: ChildNodes must not be modified
            // while it is being enumerated.
            List<XmlNode> toRemove = new List<XmlNode>();
            foreach (XmlNode el in node.ChildNodes)
            {
                int t;
                int l;
                if (ContainerRemove(el, out t, out l))
                {
                    toRemove.Add(el);
                }
                else
                {
                    total += t;
                    link += l;
                }
            }

            foreach (XmlNode el in toRemove)
            {
                node.RemoveChild(el);
            }

            if (containerTags.Contains(node.Name))
            {
                bool tooShort = !ignoreLenTags.Contains(node.Name) && total <= minLen;

                // Require at least some link text: on a page without any links the
                // average density is 0 and "0 >= 0" would otherwise remove every
                // container that has text.
                bool linkHeavy = link > 0 && 1.0 * link / total >= rate * avgLinkRate;

                if (tooShort || linkHeavy)
                {
                    return true;
                }
            }
            else if (node.NodeType == XmlNodeType.Text)
            {
                total += node.Value.Length;
            }
            else if (node.Name == "a")
            {
                link += node.InnerText.Length;
            }
            return false;
        }

        /// <summary>
        /// Recursively removes every node listed in <c>removeTags</c> and
        /// accumulates the document-wide text and link-text lengths.
        /// </summary>
        /// <param name="node">The node to examine.</param>
        /// <returns>true when the caller should remove <paramref name="node"/>.</returns>
        private bool HeuristicRemove(XmlNode node)
        {
            if (removeTags.Contains(node.Name))
            {
                return true;
            }

            // Collect first, remove afterwards: ChildNodes must not be modified
            // while it is being enumerated.
            List<XmlNode> toRemove = new List<XmlNode>();
            foreach (XmlNode el in node.ChildNodes)
            {
                if (HeuristicRemove(el))
                {
                    toRemove.Add(el);
                }
            }
            foreach (XmlNode el in toRemove)
            {
                node.RemoveChild(el);
            }

            if (node.Name == "a")
            {
                totalLinkLen += node.InnerText.Length;
            }
            else if (node.NodeType == XmlNodeType.Text)
            {
                totalLen += node.Value.Length;
            }
            return false;
        }

        /// <summary>
        /// Parses HTML into an <see cref="XmlDocument"/> using <c>SgmlReader</c>.
        /// </summary>
        /// <param name="html">The HTML source.</param>
        /// <returns>The parsed document.</returns>
        private XmlDocument ConvertHtml2Xhtml(string html)
        {
            using (StringReader input = new StringReader(html))
            using (SgmlReader reader = new SgmlReader())
            {
                reader.DocType = "HTML";
                // Fold element/attribute names to lower case so they match the tag
                // sets above, without touching the page text or URLs.
                reader.CaseFolding = CaseFolding.ToLower;
                reader.WhitespaceHandling = WhitespaceHandling.None;
                reader.InputStream = input;

                XmlDocument doc = new XmlDocument();
                doc.Load(reader);
                return doc;
            }
        }
    }
}