火焰

valeb
  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

过滤HTML代码

Posted on 2014-03-31 23:55  valeb  阅读(325)  评论(0)  编辑  收藏  举报
  /// <summary>
  /// Strips HTML markup from a string and decodes a small set of character entities.
  /// </summary>
  /// <remarks>
  /// The passes run in sequence, each on the output of the previous one:
  /// script blocks, comments, tags, newline-plus-whitespace runs, entities,
  /// and finally any remaining angle brackets and CRLF pairs.
  /// Because leftover angle brackets are removed at the end, a decoded
  /// "less than" or "greater than" entity does not survive in the result.
  /// </remarks>
  /// <param name="string_include_html">Text that may contain HTML; may be null or empty.</param>
  /// <returns>The text without markup; an empty string when the input is null or empty.</returns>
  public static string FilterHtml(string string_include_html)
        {
            if (string.IsNullOrEmpty(string_include_html))
            {
                return "";
            }

            const System.Text.RegularExpressions.RegexOptions options =
                System.Text.RegularExpressions.RegexOptions.IgnoreCase;

            // Every pattern here is replaced by the empty string, in this order.
            string[] removalPatterns =
            {
                // Script blocks including their body; (?s) lets '.' span line breaks.
                @"(?s)<script[^>]*?>.*?</script>",
                // Comments, including multi-line ones.
                @"(?s)<!--.*?-->",
                // Quote-aware tag pattern: tolerates '>' inside quoted attribute values.
                // NOTE(review): nested quantifiers may backtrack heavily on long runs of
                // word characters; consider a match timeout for untrusted input.
                @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
                // Fallback for tags the pattern above rejects (e.g. hyphenated attribute names).
                @"<[^>]+>",
                // A line break followed by further whitespace.
                @"([\r\n])[\s]+"
            };

            string string_no_html = string_include_html;
            foreach (string pattern in removalPatterns)
            {
                // Feed each pass the result of the previous one, not the raw input.
                string_no_html = System.Text.RegularExpressions.Regex.Replace(string_no_html, pattern, "", options);
            }

            // Decode entities in a single pass so that text such as "&amp;lt;" is
            // decoded once (to "&lt;") rather than twice (to "<").
            string_no_html = System.Text.RegularExpressions.Regex.Replace(
                string_no_html,
                @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
                m => DecodeHtmlEntity(m.Groups[1].Value),
                options);

            // string.Replace returns a new string; the result must be kept.
            return string_no_html.Replace("<", "").Replace(">", "").Replace("\r\n", "");
        }

        /// <summary>
        /// Maps an entity name or numeric code (the text between '&amp;' and ';') to its replacement.
        /// </summary>
        /// <param name="entityName">Entity name such as "amp", or "#" followed by decimal digits.</param>
        /// <returns>The replacement text; an empty string for numeric codes that are not listed.</returns>
        private static string DecodeHtmlEntity(string entityName)
        {
            switch (entityName.ToLowerInvariant())
            {
                case "quot":
                case "#34":
                    return "\"";
                case "amp":
                case "#38":
                    return "&";
                case "lt":
                case "#60":
                    return "<";
                case "gt":
                case "#62":
                    return ">";
                case "nbsp":
                case "#160":
                    return " ";
                case "iexcl":
                case "#161":
                    return "\u00A1";
                case "cent":
                case "#162":
                    return "\u00A2";
                case "pound":
                case "#163":
                    return "\u00A3";
                case "copy":
                case "#169":
                    return "\u00A9";
                default:
                    // Any other numeric reference is dropped.
                    return "";
            }
        }

 以上代码来自网络,但个人认为仍有不足,因此自己写了以下方法:

        /// <summary>
        /// Removes HTML tags from a string, turning line-break (br) tags into CRLF
        /// and the non-breaking-space entity into a plain space.
        /// </summary>
        /// <param name="string_include_html">Text that may contain HTML; may be null or empty.</param>
        /// <returns>The text without tags; an empty string when the input is null or empty.</returns>
        public static string GetStringNoHtml(string string_include_html)
        {
            if (string.IsNullOrEmpty(string_include_html))
            {
                return "";
            }

            // Singleline lets '.' cross line breaks, so a script block that spans
            // several lines is removed together with its body.
            const System.Text.RegularExpressions.RegexOptions options =
                System.Text.RegularExpressions.RegexOptions.IgnoreCase |
                System.Text.RegularExpressions.RegexOptions.Singleline;

            // Convert every spelling of the bare line-break tag (any letter case,
            // optional self-closing slash) before tags are stripped.
            string withLineBreaks = System.Text.RegularExpressions.Regex.Replace(
                string_include_html, @"<br\s*/?>", "\r\n", options);

            // Script blocks are tried first so their content goes too; any other tag is removed on its own.
            string string_no_html = System.Text.RegularExpressions.Regex.Replace(
                withLineBreaks, @"(<script[^>]*?>.*?</script>)|(<(.[^>]*)>)", "", options);

            return string_no_html.Replace("&nbsp;", " ");
        }