C#：去掉字符串中的 HTML 标签，保留指定的标签和属性
/// <summary>
/// Usage example: filters a sample HTML document so that only <c>a</c> and <c>img</c>
/// tags (with a small set of attributes each) survive, then writes the result to the console.
/// </summary>
public static void HtmlRemove()
{
    string requestBody = "<html><head><title>Test</title></head><body><a lay-her='123' href=\"https://example.com\">Link</a><p>Not allowed</p><span class=\"developer\"><img src=\"https://www.luocore.com/assets/logo-dark.be3794d7.png\"> <span>LuoCore</span></span><img lay-her='123' data-luo='222' src=\"图片路径\" data=\"test\" /> <a data-luo='222' href=\"baidu.com\" /> <div><span>测试标签</span><img src=\"https://pic.cnblogs.com/face/646489/20140908123308.png\" class=\"avatar\" alt=\"博主头像\"></div></body></html>";

    // Tag name -> attribute names that may be kept on that tag.
    Dictionary<string, string[]> allowedTags = new Dictionary<string, string[]>()
    {
        { "a", new string[] { "href", "data-luo" } },
        { "img", new string[] { "src", "lay-her", "data-luo" } }
    };

    // Filter the HTML tags.
    string filteredRequestBody = HtmlRemoveTagsExcept(requestBody, allowedTags);
    Console.WriteLine(filteredRequestBody);
}

/// <summary>
/// Removes every HTML tag from <paramref name="html"/> except the tags listed in
/// <paramref name="allowedTags"/>; on the tags that are kept, every attribute that is
/// not listed for that tag is dropped.
/// </summary>
/// <remarks>
/// <para>
/// Only the tags themselves are removed; text between removed tags stays in the output.
/// Both the opening and the closing form of an allowed tag are preserved.
/// </para>
/// <para>
/// An allowed attribute is kept only when it has a single- or double-quoted value;
/// value-less and unquoted attributes are dropped. Tag and attribute names are matched
/// case-insensitively, and kept opening tags are re-emitted using the casing of the
/// dictionary key.
/// </para>
/// <para>
/// After filtering, every run of two or more whitespace characters anywhere in the
/// result is collapsed to a single space.
/// </para>
/// <para>
/// This is a regex-based filter, not an HTML parser: a tag is assumed to end at the
/// first <c>&gt;</c>, so a <c>&gt;</c> inside an attribute value will split the tag.
/// </para>
/// </remarks>
/// <param name="html">The HTML text to filter.</param>
/// <param name="allowedTags">
/// Maps each tag name to keep to the attribute names allowed on it. A <c>null</c> or
/// empty array keeps the tag with no attributes. Empty keys are ignored.
/// </param>
/// <returns>The filtered HTML text.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="html"/> or <paramref name="allowedTags"/> is <c>null</c>.
/// </exception>
public static string HtmlRemoveTagsExcept(string html, Dictionary<string, string[]> allowedTags)
{
    if (html == null)
    {
        throw new ArgumentNullException(nameof(html));
    }

    if (allowedTags == null)
    {
        throw new ArgumentNullException(nameof(allowedTags));
    }

    // The patterns depend on the arguments and are rebuilt on every call, so
    // RegexOptions.Compiled would only add compilation cost here. CultureInvariant keeps
    // case-insensitive tag matching independent of the current culture.
    const RegexOptions tagMatchOptions = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    // Tag names are escaped so a key containing regex metacharacters is matched literally.
    List<string> escapedTagNames = allowedTags.Keys
        .Where(key => key.Length > 0)
        .Select(key => Regex.Escape(key))
        .ToList();

    // Step 1: remove every tag that is not allowed.
    // The optional '/' lives INSIDE the negative lookahead. If it were matched before the
    // lookahead, the engine could backtrack to "no slash", test the lookahead against
    // "/a>", succeed, and strip the closing tag of an allowed element.
    string disallowedTagPattern = escapedTagNames.Count == 0
        ? "<[^>]*>"
        : $"<(?!/?(?:{string.Join("|", escapedTagNames)})(?:\\s|/?>))[^>]*>";
    string result = Regex.Replace(html, disallowedTagPattern, "", tagMatchOptions);

    // Step 2: on each allowed opening tag, keep only the allowed attributes.
    foreach (KeyValuePair<string, string[]> entry in allowedTags)
    {
        string tagName = entry.Key;
        if (tagName.Length == 0)
        {
            continue;
        }

        var allowedAttributes = new HashSet<string>(
            (entry.Value ?? new string[0]).Where(name => !string.IsNullOrEmpty(name)),
            StringComparer.OrdinalIgnoreCase);

        // Group 1: raw attribute text (may be empty); group 2: ">" or "/>".
        string openingTagPattern = $"<{Regex.Escape(tagName)}(\\s[^>]*?)?(/?>)";
        result = Regex.Replace(
            result,
            openingTagPattern,
            match =>
            {
                string keptAttributes = FilterAllowedAttributes(match.Groups[1].Value, allowedAttributes);
                string tagClose = match.Groups[2].Value;
                return $"<{tagName}{keptAttributes}{tagClose}";
            },
            tagMatchOptions);
    }

    // Step 3: collapse runs of whitespace and trim whitespace before the end of an opening tag.
    result = Regex.Replace(result, @"\s{2,}", " ");
    result = Regex.Replace(result, @"<(\w+)(\s[^>]*?)?\s*(/?>)", "<$1$2$3");
    return result;
}

/// <summary>
/// Returns the attributes from <paramref name="attributeText"/> whose name is in
/// <paramref name="allowedAttributes"/> and whose value is quoted, concatenated in their
/// original order and with their original leading whitespace.
/// </summary>
/// <param name="attributeText">The text between a tag name and its closing <c>&gt;</c> or <c>/&gt;</c>.</param>
/// <param name="allowedAttributes">Attribute names that may be kept.</param>
/// <returns>The kept attributes, or an empty string when none qualify.</returns>
private static string FilterAllowedAttributes(string attributeText, HashSet<string> allowedAttributes)
{
    if (attributeText.Length == 0 || allowedAttributes.Count == 0)
    {
        return string.Empty;
    }

    // Attributes are consumed one by one, left to right, each with its complete value
    // (the opening quote must be closed by the same quote character). This way an allowed
    // name that merely appears inside another attribute's quoted value is never picked up.
    // Group 1: attribute name; group 2: value (double-quoted, single-quoted, or unquoted).
    const string attributePattern = @"\s+([^\s=/>""']+)(?:\s*=\s*(""[^""]*""|'[^']*'|[^\s""'>]+))?";

    IEnumerable<string> kept = Regex.Matches(attributeText, attributePattern)
        .Cast<Match>()
        .Where(attribute =>
        {
            Group value = attribute.Groups[2];
            bool isQuoted = value.Success && (value.Value[0] == '"' || value.Value[0] == '\'');
            return isQuoted && allowedAttributes.Contains(attribute.Groups[1].Value);
        })
        .Select(attribute => attribute.Value);

    return string.Concat(kept);
}