C# 去掉字符串中的 HTML 标签，保留指定的标签和属性

 /// <summary>
 /// Usage example: runs a sample HTML document through
 /// <c>HtmlRemoveTagsExcept</c> with an allow-list for the <c>a</c> and <c>img</c>
 /// tags, then writes the filtered markup to the console.
 /// </summary>
 public static void HtmlRemove()
 {
     // Allow-list: tag name -> attribute names that may stay on that tag.
     var whitelist = new Dictionary<string, string[]>
     {
         ["a"] = new[] { "href", "data-luo" },
         ["img"] = new[] { "src", "lay-her", "data-luo" },
     };

     // Sample input mixing allowed and disallowed tags and attributes.
     const string sampleHtml = "<html><head><title>Test</title></head><body><a lay-her='123' href=\"https://example.com\">Link</a><p>Not allowed</p><span class=\"developer\"><img src=\"https://www.luocore.com/assets/logo-dark.be3794d7.png\">           <span>LuoCore</span></span><img lay-her='123' data-luo='222' src=\"图片路径\" data=\"test\" /> <a data-luo='222' href=\"baidu.com\" /> <div><span>测试标签</span><img src=\"https://pic.cnblogs.com/face/646489/20140908123308.png\" class=\"avatar\" alt=\"博主头像\"></div></body></html>";

     // Filter the markup and print the outcome.
     Console.WriteLine(HtmlRemoveTagsExcept(sampleHtml, whitelist));
 }

 /// <summary>
 /// Removes HTML tags from <paramref name="html"/>, keeping only the tags named in
 /// <paramref name="allowedTags"/> and, on those tags, only the listed attributes.
 /// </summary>
 /// <remarks>
 /// <para>
 /// Only the tag markup is removed; text between removed tags is left in place.
 /// Opening and closing forms of an allowed tag are both preserved. Kept opening
 /// tags are re-emitted with the tag name spelled as in the dictionary key.
 /// </para>
 /// <para>
 /// An attribute is kept only when its name is allowed (case-insensitive) and its
 /// value is enclosed in matching single or double quotes; unquoted and value-less
 /// attributes are dropped. Attribute values themselves are not inspected.
 /// </para>
 /// <para>
 /// Runs of two or more whitespace characters anywhere in the result are collapsed
 /// to a single space.
 /// </para>
 /// <para>
 /// NOTE(review): this is regex-based, not a real HTML parser. A "&gt;" inside an
 /// attribute value ends the tag early, so do not rely on this alone as a security
 /// sanitizer for untrusted markup.
 /// </para>
 /// </remarks>
 /// <param name="html">The markup to filter. Null or empty yields an empty string.</param>
 /// <param name="allowedTags">
 /// Maps each allowed tag name to the attribute names allowed on it. A null
 /// attribute array is treated as "no attributes allowed".
 /// </param>
 /// <returns>The filtered markup.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="allowedTags"/> is null.</exception>
 public static string HtmlRemoveTagsExcept(string html, Dictionary<string, string[]> allowedTags)
 {
     if (allowedTags == null)
     {
         throw new ArgumentNullException(nameof(allowedTags));
     }

     if (string.IsNullOrEmpty(html))
     {
         return string.Empty;
     }

     // Tag names are escaped so regex metacharacters in a key cannot alter the pattern.
     string allowedNames = string.Join("|", allowedTags.Keys.Select(name => Regex.Escape(name)));

     // The optional "/" lives INSIDE the negative lookahead. If it sat outside, the engine
     // could backtrack it to empty, evaluate the lookahead at "/a>" and wrongly strip "</a>".
     // With nothing allowed, every tag is removed.
     string disallowedTagPattern = allowedTags.Count == 0
         ? "<[^>]*>"
         : $"<(?!/?(?:{allowedNames})(?:\\s|/?>))[^>]*>";

     // Static Regex methods use the framework's pattern cache, so no per-call
     // RegexOptions.Compiled cost is paid for these dynamically built patterns.
     string result = Regex.Replace(html, disallowedTagPattern, "", RegexOptions.IgnoreCase);

     // Matches ANY name="value" / name='value' pair. Attributes are consumed left to right
     // and then filtered by name, so text inside a quoted value of a disallowed attribute
     // can never be mistaken for an allowed attribute. Quotes must match on both ends.
     const string attributePattern = @"\s+([^\s=/>""']+)\s*=\s*(?:""[^""]*""|'[^']*')";

     // Rewrite each allowed opening tag so that only its allowed attributes remain.
     foreach (var entry in allowedTags)
     {
         string tagName = entry.Key;
         var allowedAttributes = new HashSet<string>(
             entry.Value ?? Enumerable.Empty<string>(),
             StringComparer.OrdinalIgnoreCase);

         // Group 1: raw attribute text; group 2: the tag terminator (">" or "/>").
         string openTagPattern = $"<{Regex.Escape(tagName)}(\\s[^>]*?)?(/?>)";

         result = Regex.Replace(result, openTagPattern, match =>
         {
             string keptAttributes = string.Concat(
                 Regex.Matches(match.Groups[1].Value, attributePattern)
                     .Cast<Match>()
                     .Where(attribute => allowedAttributes.Contains(attribute.Groups[1].Value))
                     .Select(attribute => attribute.Value));

             return $"<{tagName}{keptAttributes}{match.Groups[2].Value}";
         }, RegexOptions.IgnoreCase);
     }

     // Collapse runs of whitespace, then drop whitespace left just before a tag terminator.
     result = Regex.Replace(result, @"\s{2,}", " ");
     result = Regex.Replace(result, @"<(\w+)(\s[^>]*?)?\s*(/?>)", "<$1$2$3");

     return result;
 }

 

posted @ 2024-05-16 16:12  LuoCore  阅读(101)  评论(0编辑  收藏  举报