C#去掉HTML标记

(1)方法一 (未测试)

/// <summary>
/// Strips HTML markup from a string and normalizes the remaining whitespace.
/// </summary>
/// <remarks>
/// Processing order matters: script and style elements are removed first
/// (including their contents, which may themselves contain markup), then the
/// remaining tags and a small set of character entities are replaced by a
/// single space, and finally blank lines and runs of spaces/tabs are collapsed.
/// Entities are replaced by a space, not decoded to the character they represent.
/// </remarks>
/// <param name="htmlStream">The HTML text to clean. Must not be null.</param>
/// <returns>The text with markup removed, trimmed of leading and trailing whitespace.</returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlStream"/> is null.</exception>
public string RemoveHTMLTags(string htmlStream)
{
    if (htmlStream == null)
    {
        throw new ArgumentNullException("htmlStream", "Your input html stream is null!");
    }

    // Script and style elements must be handled separately and first: their
    // bodies can contain '<' and '>' that the generic tag pattern would mis-split.
    // The lazy quantifier stops at the nearest closing tag.
    htmlStream = Regex.Replace(
        htmlStream,
        "(<script)[\\s\\S]*?(</script>)|(<style)[\\s\\S]*?(</style>)",
        " ",
        RegexOptions.IgnoreCase);

    // Remove ordinary tags and replace common character entities with a space.
    // NOTE(review): the entity list in the published snippet was corrupted (the
    // page decoded its own entities, leaving an unescaped quote inside the
    // literal), so this list is a reconstruction - confirm it covers what you need.
    htmlStream = Regex.Replace(
        htmlStream,
        "<[^>]+>|&nbsp;|&amp;|&quot;|&lt;|&gt;|&#160;",
        " ",
        RegexOptions.IgnoreCase);

    // Collapse any run of blank lines into a single CRLF line break.
    // '|' is not an alternation operator inside a character class, so it is
    // deliberately left out; otherwise literal pipes in the text would be eaten.
    htmlStream = Regex.Replace(htmlStream, "(\r\n[\r\n\t ]*\r\n)|(\n[\r\n\t ]*\n)", "\r\n");

    // Collapse runs of spaces and tabs into one space.
    htmlStream = Regex.Replace(htmlStream, "[\t ]+", " ");

    return htmlStream.Trim();
}

（2）网上搜索到的方法 (可以使用)

// 去除字符串中的所有 HTML 标记
    using System.Text.RegularExpressions;
    /// <summary>
    /// Removes every HTML tag from the given string, leaving the text between tags untouched.
    /// </summary>
    /// <param name="strhtml">The text that may contain HTML tags.</param>
    /// <returns>The input with each tag replaced by the empty string.</returns>
    public static string striphtml(string strhtml)
    {
        // Anything from '<' up to the next '>' counts as a tag; matches are simply deleted.
        return Regex.Replace(strhtml, @"<[^>]+>|</[^>]+>", "");
    }

posted on 2008-06-19 15:12 NullReferenceException 阅读(1257) 评论(2) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

NullReferenceException

C#去掉HTML标记

导航

公告