C# 过滤html标签

 /// <summary>
 /// Removes HTML markup from <paramref name="Htmlstring"/> and returns the remaining
 /// text, HTML-encoded so that it can be written back into a page.
 /// </summary>
 /// <param name="Htmlstring">Source text that may contain HTML markup.</param>
 /// <returns>
 /// The trimmed, HTML-encoded text with comments, script blocks and tags removed;
 /// an empty string when <paramref name="Htmlstring"/> is null or empty.
 /// </returns>
 public static string NoHTML(string Htmlstring)
 {
     if (string.IsNullOrEmpty(Htmlstring))
     {
         return string.Empty;
     }

     // Singleline lets '.' cross line breaks, so multi-line comments and scripts are
     // removed as a whole instead of leaking their contents into the result.
     const RegexOptions blockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;
     string text = Htmlstring;

     // Comments go first: a '>' inside a comment must not be mistaken for the end of a tag.
     text = Regex.Replace(text, @"<!--.*?-->", "", blockOptions);
     // Script blocks are dropped together with their source code.
     text = Regex.Replace(text, @"<script[^>]*?>.*?</script>", "", blockOptions);
     // Every remaining <...> span is treated as a tag.
     text = Regex.Replace(text, @"<[^>]+>", "");
     // A line break followed by further whitespace is removed entirely.
     text = Regex.Replace(text, @"([\r\n])[\s]+", "");
     // Left-overs of unbalanced comments: a stray terminator, or an opener that is
     // never closed (removed up to the end of its line).
     text = Regex.Replace(text, @"-->", "");
     text = Regex.Replace(text, @"<!--.*", "");

     // All entities are decoded in ONE pass. Sequential passes decode twice:
     // "&amp;lt;" would first become "&lt;" and then "<".
     text = Regex.Replace(
         text,
         @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
         new MatchEvaluator(DecodeNoHtmlEntity),
         RegexOptions.IgnoreCase);

     // The previous version also called Htmlstring.Replace("<", ""), Replace(">", "")
     // and Replace("\r\n", "") but discarded the results, so they never had any effect.
     // They are intentionally not revived: they would delete the characters the entity
     // step just decoded, and the encoding below already neutralises '<' and '>'.

     // WebUtility needs no ambient HTTP request, unlike HttpContext.Current.Server,
     // which is null (and throws) outside of a web request.
     return System.Net.WebUtility.HtmlEncode(text).Trim();
 }

 /// <summary>
 /// Maps one matched entity to its replacement text. Group 1 of the match holds the
 /// entity name, or '#' followed by its decimal code.
 /// </summary>
 /// <param name="entity">A match of the entity pattern used by <c>NoHTML</c>.</param>
 /// <returns>
 /// The replacement for the nine supported entities; an empty string for any other
 /// numeric entity (those are dropped).
 /// </returns>
 private static string DecodeNoHtmlEntity(Match entity)
 {
     switch (entity.Groups[1].Value.ToLowerInvariant())
     {
         case "quot":
         case "#34":
             return "\"";
         case "amp":
         case "#38":
             return "&";
         case "lt":
         case "#60":
             return "<";
         case "gt":
         case "#62":
             return ">";
         case "nbsp":
         case "#160":
             return "   ";
         case "iexcl":
         case "#161":
             return "\u00A1";
         case "cent":
         case "#162":
             return "\u00A2";
         case "pound":
         case "#163":
             return "\u00A3";
         case "copy":
         case "#169":
             return "\u00A9";
         default:
             return string.Empty;
     }
 }
写一个静态方法移除HTML标签
 #region
 /// <summary>
 /// Deletes every <c>&lt;...&gt;</c> span from the given markup.
 /// </summary>
 /// <param name="HTMLStr">Markup to process.</param>
 /// <returns>
 /// <paramref name="HTMLStr"/> with each match of <c>&lt;[^&gt;]*&gt;</c> replaced by
 /// an empty string.
 /// </returns>
 public static string ParseTags(string HTMLStr)
 {
     // '<', any run of characters other than '>', then '>'.
     const string tagPattern = "<[^>]*>";

     System.Text.RegularExpressions.Regex tagMatcher =
         new System.Text.RegularExpressions.Regex(tagPattern);
     string withoutTags = tagMatcher.Replace(HTMLStr, "");
     return withoutTags;
 }
 #endregion
 #region
 /// <summary>
 /// Returns the <c>src</c> address of the first <c>&lt;img&gt;</c> tag found in the markup.
 /// </summary>
 /// <param name="HTMLStr">Markup to search.</param>
 /// <returns>
 /// The address exactly as written in the markup (original letter case, surrounding
 /// quotes removed), or an empty string when <paramref name="HTMLStr"/> is null or
 /// empty or contains no <c>&lt;img&gt;</c> tag with a <c>src</c> attribute.
 /// </returns>
 public static string GetImgUrl(string HTMLStr)
 {
     if (string.IsNullOrEmpty(HTMLStr))
     {
         return string.Empty;
     }

     // The value may be double-quoted, single-quoted or bare; each alternative
     // captures only the text between the delimiters. "src" must directly follow
     // whitespace so that attributes such as "data-src" are not picked up.
     const string imgSrcPattern =
         @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))";

     // IgnoreCase replaces the former HTMLStr.ToLower(), which also lower-cased the
     // returned address and broke case-sensitive paths. The static Regex.Match uses the
     // framework's pattern cache instead of building a compiled Regex on every call.
     Match firstImage = Regex.Match(HTMLStr, imgSrcPattern, RegexOptions.IgnoreCase);
     return firstImage.Success ? firstImage.Groups["url"].Value : string.Empty;
 }
 #endregion
 // Sample program: a C# function that extracts the text from HTML source.
 // Input:  a string containing HTML markup.
 // Output: the text that remains after the markup has been removed.
 6 using System;
 7 using System.Text.RegularExpressions;
 /// <summary>
 /// Console sample that strips the markup from an HTML string and prints the text.
 /// </summary>
 public class StripHTMLTest
 {
   /// <summary>
   /// Runs <see cref="StripHTML"/> on a fixed sample document and writes the result
   /// to the console.
   /// </summary>
   public static void Main()
   {
     string s = StripHTML(
       "<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
     Console.WriteLine(s);
   }

   /// <summary>
   /// Extracts the text from HTML source: comments, script blocks and tags are
   /// removed and a small set of character entities is decoded.
   /// </summary>
   /// <param name="strHtml">Source text that contains HTML markup.</param>
   /// <returns>
   /// The remaining plain text (NOT HTML-encoded); an empty string when
   /// <paramref name="strHtml"/> is null or empty.
   /// </returns>
   public static string StripHTML(string strHtml)
   {
     if (string.IsNullOrEmpty(strHtml))
     {
       return string.Empty;
     }

     // Singleline lets '.' cross line breaks, so multi-line comments and scripts
     // are removed as a whole.
     const RegexOptions blockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

     // A tag is '<', an optional '/', an optional '!', a (possibly prefixed) name and
     // then attribute text up to '>'. Quoted values may contain '>'. The three
     // alternatives start with different characters, so the pattern cannot backtrack
     // exponentially, unlike the nested \w+ groups it replaces. "a < b" is not a tag
     // because a name must follow '<' directly.
     const string tagPattern =
       @"<(?:/\s*)?!?(?:\w+:)?\w+(?:""[^""]*""|'[^']*'|[^'"">])*>";

     string text = strHtml;
     // Comments go first so that tags or '>' inside them cannot confuse later steps.
     text = Regex.Replace(text, @"<!--.*?-->", "", blockOptions);
     text = Regex.Replace(text, @"<script[^>]*?>.*?</script>", "", blockOptions);
     text = Regex.Replace(text, tagPattern, "", RegexOptions.IgnoreCase);
     // A line break followed by further whitespace is removed entirely.
     text = Regex.Replace(text, @"([\r\n])[\s]+", "");
     // Left-overs of unbalanced comments: a stray terminator, or an opener that is
     // never closed (removed up to the end of its line).
     text = Regex.Replace(text, @"-->", "");
     text = Regex.Replace(text, @"<!--.*", "");

     // All entities are decoded in ONE pass. Sequential passes decode twice:
     // "&amp;lt;" would first become "&lt;" and then "<".
     text = Regex.Replace(
       text,
       @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
       new MatchEvaluator(DecodeEntity),
       RegexOptions.IgnoreCase);

     // The previous version ended with strOutput.Replace("<", ""), Replace(">", "")
     // and Replace("\r\n", "") whose results were discarded, so they never had any
     // effect. They are intentionally not revived: they would delete the '<' and '>'
     // that the entity step has just decoded from "&lt;" and "&gt;".
     return text;
   }

   /// <summary>
   /// Maps one matched entity to its replacement text. Group 1 of the match holds the
   /// entity name, or '#' followed by its decimal code.
   /// </summary>
   /// <param name="entity">A match of the entity pattern used by <see cref="StripHTML"/>.</param>
   /// <returns>
   /// The replacement for the nine supported entities; an empty string for any other
   /// numeric entity (those are dropped).
   /// </returns>
   private static string DecodeEntity(Match entity)
   {
     switch (entity.Groups[1].Value.ToLowerInvariant())
     {
       case "quot":
       case "#34":
         return "\"";
       case "amp":
       case "#38":
         return "&";
       case "lt":
       case "#60":
         return "<";
       case "gt":
       case "#62":
         return ">";
       case "nbsp":
       case "#160":
         return "   ";
       case "iexcl":
       case "#161":
         return "\u00A1";
       case "cent":
       case "#162":
         return "\u00A2";
       case "pound":
       case "#163":
         return "\u00A3";
       case "copy":
       case "#169":
         return "\u00A9";
       default:
         return string.Empty;
     }
   }
 }

 

posted @ 2013-05-22 11:01  Mr.Leo  阅读(8438)  评论(2)  编辑  收藏  举报