C# 正则表达式:清除 HTML 标签

// 匹配所有 img 标签的正则表达式(命名分组 imgUrl 捕获 src 属性的值)

<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>

// Strip every tag except those whose name starts with "img", replacing each removed tag
// with "@" so its position stays visible in the output.
// How the exemption works: the conditional group (?(?=img|&nbsp;@)notag|[a-zA-Z0-9]+) looks
// ahead right after "<" or "</"; when "img" follows, it demands the literal "notag", which
// cannot match there, so the whole tag match fails and the tag is kept. Otherwise the
// second branch consumes the tag name.
// "&nbsp;" is a character entity with no leading "<", so this pattern does not touch it.
// NOTE(review): no RegexOptions are passed, so matching is case-sensitive and an
// upper-case <IMG> tag would be replaced like any other tag — confirm that is intended.
string stringPattern = @"</?(?(?=img|&nbsp;@)notag|[a-zA-Z0-9]+)(?:\s[a-zA-Z0-9\-]+=?(?:(["",']?).*?\1?)?)*\s*/?>";
html = Regex.Replace(html, stringPattern, "@");

 

 /// <summary>
        /// Collects every img tag found in an HTML fragment.
        /// </summary>
        /// <param name="sHtmlText">HTML markup to scan.</param>
        /// <returns>
        /// The complete text of each matched img tag (not just its URL), in the order the
        /// matches are found; a one-element array holding an empty string when
        /// <paramref name="sHtmlText"/> is null or empty.
        /// </returns>
        public static string[] GetHtmlImageUrlList(string sHtmlText)
        {
            // Null or empty input yields a single empty entry rather than an empty array.
            if (string.IsNullOrEmpty(sHtmlText))
            {
                return new string[] { "" };
            }

            // Case-insensitive img matcher. The named group "imgUrl" captures the src value,
            // but only the whole-tag text is returned below.
            Regex imgTagRegex = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
            MatchCollection found = imgTagRegex.Matches(sHtmlText);

            // Copy the full text of each match into the result array by position.
            string[] tags = new string[found.Count];
            for (int index = 0; index < tags.Length; index++)
            {
                tags[index] = found[index].Value;
            }
            return tags;
        }

通过多个正则来清除html标签。原文地址:https://www.cnblogs.com/fanmulu/p/13636899.html

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace RegexTestWin
{
    /// <summary>
    /// Singleton that turns HTML into plain text by applying an ordered list of
    /// regular-expression replacement rules and then dropping any remaining angle brackets.
    /// </summary>
    public class ReplaceHtml
    {
        // Options for the script/style rules: the block must be removed as a unit
        // regardless of tag case and even when its body spans several lines.
        private const RegexOptions BlockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        // Parallel lists: _regexs[i] is replaced by _replacement[i]. Both are only ever
        // appended to together, through AddRegex.
        private readonly IList<Regex> _regexs = new List<Regex>();
        private readonly IList<string> _replacement = new List<string>();

        // volatile so the unlocked first read in Instance always observes the published reference.
        private static volatile ReplaceHtml _replaceHtml = null;
        private static readonly object _object = new object();

        private ReplaceHtml() { }

        /// <summary>
        /// Gets the shared instance, creating and populating it on first access
        /// (double-checked locking on a private lock object).
        /// </summary>
        public static ReplaceHtml Instance
        {
            get
            {
                if (_replaceHtml == null)
                {
                    lock (_object)
                    {
                        if (_replaceHtml == null)
                        {
                            _replaceHtml = SetInstance(new ReplaceHtml());
                        }
                    }
                }
                return _replaceHtml;
            }
        }

        /// <summary>Returns the rule regex at the given zero-based position.</summary>
        /// <param name="count">Zero-based rule index.</param>
        public Regex GetRegex(int count)
        {
            return _regexs[count];
        }

        /// <summary>Returns the replacement text paired with the rule at the given position.</summary>
        /// <param name="count">Zero-based rule index.</param>
        public string GetReplacement(int count)
        {
            return _replacement[count];
        }

        /// <summary>Returns the number of registered rules.</summary>
        public int GetReplacementCount()
        {
            return _replacement.Count;
        }

        /// <summary>
        /// Removes HTML markup from <paramref name="Htmlstring"/> and decodes a small set of
        /// character entities.
        /// </summary>
        /// <param name="Htmlstring">HTML text to clean; may be null or empty.</param>
        /// <returns>
        /// The cleaned text, or an empty string when the input is null or empty. The result
        /// never contains "&lt;" or "&gt;" characters: they are removed in the last step, which
        /// also drops the brackets produced by decoding the lt/gt entities.
        /// </returns>
        public string ReplaceHtmlTag(string Htmlstring)
        {
            // Guard: previously a null argument failed with NullReferenceException.
            if (string.IsNullOrEmpty(Htmlstring))
            {
                return string.Empty;
            }

            Htmlstring = Htmlstring.Replace("\r\n", "");

            // Rules are applied strictly in registration order; later rules see the
            // output of earlier ones.
            for (int index = 0; index < GetReplacementCount(); index++)
            {
                Regex rule = GetRegex(index);
                if (rule != null)
                {
                    Htmlstring = rule.Replace(Htmlstring, GetReplacement(index));
                }
            }

            // Drop stray angle brackets left by malformed markup or produced by entity decoding.
            Htmlstring = Htmlstring.Replace("<", "");
            Htmlstring = Htmlstring.Replace(">", "");
            Htmlstring = Htmlstring.Replace("\r\n", "");
            return Htmlstring;
        }

        // Registers one rule, keeping the two parallel lists aligned.
        private void AddRegex(Regex aRegex, string Replacement)
        {
            _regexs.Add(aRegex);
            _replacement.Add(Replacement);
        }

        // Populates the given instance with the rule set, in application order, and returns it.
        // Each pattern is registered together with its replacement, so the two lists cannot
        // get out of step.
        private static ReplaceHtml SetInstance(ReplaceHtml aReplaceHtml)
        {
            // Whole script/style blocks, body included. Without Singleline a body containing
            // a bare "\n" was not matched and its text leaked into the output; without
            // IgnoreCase upper-case tags were missed as well.
            aReplaceHtml.AddRegex(new Regex(@"<script.*?</script>", BlockOptions), "");
            aReplaceHtml.AddRegex(new Regex(@"<style.*?</style>", BlockOptions), "");

            // Remaining tags.
            aReplaceHtml.AddRegex(new Regex(@"<.*?>"), "");
            aReplaceHtml.AddRegex(new Regex(@"<(.[^>]*)>"), "");

            // A line break followed by further whitespace.
            aReplaceHtml.AddRegex(new Regex(@"([\r\n])[\s]+"), "");

            // Comment leftovers: a dangling terminator, then an opener through end of line.
            aReplaceHtml.AddRegex(new Regex(@"-->"), "");
            aReplaceHtml.AddRegex(new Regex(@"<!--.*"), "");

            // Character entities, by name or by decimal code.
            aReplaceHtml.AddRegex(new Regex(@"&(quot|#34);"), "\"");
            aReplaceHtml.AddRegex(new Regex(@"&(amp|#38);"), "&");
            aReplaceHtml.AddRegex(new Regex(@"&(lt|#60);"), "<");
            aReplaceHtml.AddRegex(new Regex(@"&(gt|#62);"), ">");
            aReplaceHtml.AddRegex(new Regex(@"&(nbsp|#160);"), "");
            aReplaceHtml.AddRegex(new Regex(@"&(iexcl|#161);"), "\xa1");
            aReplaceHtml.AddRegex(new Regex(@"&(cent|#162);"), "\xa2");
            aReplaceHtml.AddRegex(new Regex(@"&(pound|#163);"), "\xa3");
            aReplaceHtml.AddRegex(new Regex(@"&(copy|#169);"), "\xa9");

            // Any other decimal numeric entity is removed.
            aReplaceHtml.AddRegex(new Regex(@"&#(\d+);"), "");

            return aReplaceHtml;
        }
    }
}

 

posted @ 2022-02-24 16:00  学竹  阅读(673)  评论(0编辑  收藏  举报