C# 正则 清除html标签
//获取所有img标签
<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>
// Replace every HTML tag with "@" (a position placeholder), except tags that must be kept.
// How the pattern decides what to keep: right after "<" or "</" the conditional
// (?(?=img| @)notag|[a-zA-Z0-9]+) looks ahead; when the tag text starts with "img" or with
// the literal text " @" it then demands the literal "notag", which cannot match there, so the
// whole tag is left untouched. Any other alphanumeric tag name is matched, followed by its
// attributes and the closing ">" (optionally "/>").
// NOTE(review): the original comment named a second tag to keep besides img, but that name
// appears to have been lost (the " @" alternative is probably its remnant) - confirm the
// intended tag before relying on this pattern.
string stringPattern = @"</?(?(?=img| @)notag|[a-zA-Z0-9]+)(?:\s[a-zA-Z0-9\-]+=?(?:(["",']?).*?\1?)?)*\s*/?>";
html = Regex.Replace(html, stringPattern, "@");
/// <summary>
/// Matches an img tag that carries a src attribute. The named group <c>imgUrl</c> captures
/// the attribute value (without quotes); the whole match is the complete tag.
/// Built once and shared, because a <see cref="Regex"/> instance is immutable and safe to
/// reuse across calls.
/// </summary>
private static readonly Regex s_imgTagRegex = new Regex(
    @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
    RegexOptions.IgnoreCase);

/// <summary>
/// Collects every img tag (with a src attribute) found in an HTML string.
/// </summary>
/// <param name="sHtmlText">HTML source to scan.</param>
/// <returns>
/// The complete text of each matched img tag, in the order the tags occur. To obtain only
/// the URL, read <c>Groups["imgUrl"].Value</c> of the match instead of the match value.
/// For null or empty input the result is a one-element array holding an empty string;
/// this is kept as-is so existing callers that read element 0 keep working.
/// </returns>
public static string[] GetHtmlImageUrlList(string sHtmlText)
{
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return new string[] { "" };
    }

    MatchCollection matches = s_imgTagRegex.Matches(sHtmlText);
    string[] sUrlList = new string[matches.Count];
    for (int i = 0; i < sUrlList.Length; i++)
    {
        // Whole match = the full <img ...> tag, not just the src value.
        sUrlList[i] = matches[i].Value;
    }

    return sUrlList;
}
通过多个正则来清除html标签。原文地址:https://www.cnblogs.com/fanmulu/p/13636899.html
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace RegexTestWin
{
    /// <summary>
    /// Singleton that reduces an HTML fragment to plain text by applying an ordered list of
    /// regular-expression replacements, then deleting any remaining angle brackets.
    /// </summary>
    public class ReplaceHtml
    {
        // Lazy<T> gives thread-safe, on-first-use construction without a hand-written lock.
        private static readonly Lazy<ReplaceHtml> _lazyInstance =
            new Lazy<ReplaceHtml>(() => SetInstance(new ReplaceHtml()));

        // Parallel lists: _regexs[i] is replaced by _replacement[i]. Entries are only ever
        // added in pairs through AddRegex, so the two lists always have the same length.
        private readonly IList<Regex> _regexs = new List<Regex>();
        private readonly IList<string> _replacement = new List<string>();

        private ReplaceHtml()
        {
        }

        /// <summary>The shared, fully configured instance.</summary>
        public static ReplaceHtml Instance
        {
            get { return _lazyInstance.Value; }
        }

        /// <summary>Returns the rule regex at the given zero-based position.</summary>
        /// <param name="count">Zero-based index of the rule.</param>
        public Regex GetRegex(int count)
        {
            return _regexs[count];
        }

        /// <summary>Returns the replacement text paired with the rule at the given position.</summary>
        /// <param name="count">Zero-based index of the rule.</param>
        public string GetReplacement(int count)
        {
            return _replacement[count];
        }

        /// <summary>Returns the number of configured rules.</summary>
        public int GetReplacementCount()
        {
            return _replacement.Count;
        }

        /// <summary>
        /// Removes script/style blocks, tags and comment markers from the input and decodes
        /// (or drops) the HTML entities covered by the configured rules.
        /// </summary>
        /// <param name="Htmlstring">HTML text to clean; null or empty yields an empty string.</param>
        /// <returns>
        /// The cleaned text. Every remaining '&lt;' and '&gt;' character is deleted at the end,
        /// which also removes the characters produced by decoding <c>&amp;lt;</c> / <c>&amp;gt;</c>.
        /// </returns>
        public string ReplaceHtmlTag(string Htmlstring)
        {
            if (string.IsNullOrEmpty(Htmlstring))
            {
                return string.Empty;
            }

            string text = Htmlstring.Replace("\r\n", "");

            // Rules are order-sensitive; apply them exactly in the order they were registered.
            for (int i = 0; i < _regexs.Count; i++)
            {
                text = _regexs[i].Replace(text, _replacement[i]);
            }

            text = text.Replace("<", "").Replace(">", "");

            // A later rule can delete the text between a lone '\r' and '\n', forming a new
            // CRLF pair, so the pair is removed once more after all rules have run.
            return text.Replace("\r\n", "");
        }

        private void AddRegex(Regex aRegex, string Replacement)
        {
            _regexs.Add(aRegex);
            _replacement.Add(Replacement);
        }

        /// <summary>
        /// Registers the regex / replacement rules on the given instance and returns it.
        /// </summary>
        private static ReplaceHtml SetInstance(ReplaceHtml aReplaceHtml)
        {
            // Script and style blocks usually span several lines and tag names are
            // case-insensitive: Singleline lets '.' cross bare '\n' newlines, IgnoreCase
            // catches <SCRIPT>/<STYLE>. Without them the block body leaked into the output.
            const RegexOptions blockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

            aReplaceHtml.AddRegex(new Regex(@"<script.*?</script>", blockOptions), "");
            aReplaceHtml.AddRegex(new Regex(@"<style.*?</style>", blockOptions), "");

            // Tags on one line, then tags whose attributes wrap onto following lines.
            aReplaceHtml.AddRegex(new Regex(@"<.*?>"), "");
            aReplaceHtml.AddRegex(new Regex(@"<(.[^>]*)>"), "");

            // A line break together with the whitespace that follows it.
            aReplaceHtml.AddRegex(new Regex(@"([\r\n])[\s]+"), "");

            // Leftover comment terminators, and unterminated comments up to the end of the line.
            aReplaceHtml.AddRegex(new Regex(@"-->"), "");
            aReplaceHtml.AddRegex(new Regex(@"<!--.*"), "");

            // Named / numeric entities that are decoded or dropped.
            aReplaceHtml.AddRegex(new Regex(@"&(quot|#34);"), "\"");
            aReplaceHtml.AddRegex(new Regex(@"&(lt|#60);"), "<");
            aReplaceHtml.AddRegex(new Regex(@"&(gt|#62);"), ">");
            aReplaceHtml.AddRegex(new Regex(@"&(nbsp|#160);"), "");
            aReplaceHtml.AddRegex(new Regex(@"&(iexcl|#161);"), "\u00A1");
            aReplaceHtml.AddRegex(new Regex(@"&(cent|#162);"), "\u00A2");
            aReplaceHtml.AddRegex(new Regex(@"&(pound|#163);"), "\u00A3");
            aReplaceHtml.AddRegex(new Regex(@"&(copy|#169);"), "\u00A9");

            // Any other numeric entity is dropped. "&#38;" is excluded here because it is the
            // ampersand and is decoded by the final rule below.
            aReplaceHtml.AddRegex(new Regex(@"&#(?!38;)\d+;"), "");

            // The ampersand must be decoded last: decoding it earlier turns "&amp;lt;" into
            // "&lt;", which the following rules would then decode a second time.
            aReplaceHtml.AddRegex(new Regex(@"&(amp|#38);"), "&");

            return aReplaceHtml;
        }
    }
}