html代码转换成为纯文本
先解码。接着转换为纯文本,用这段代码:
public static string HtmlToText(string source)
{
string result;
//remove line breaks,tabs
result = source.Replace("\r", " ");
result = result.Replace("\n", " ");
result = result.Replace("\t", " ");
//remove the header
result = Regex.Replace(result, "(<head>).*(</head>)", string.Empty, RegexOptions.IgnoreCase);
result = Regex.Replace(result, @"<( )*script([^>])*>", "<script>", RegexOptions.IgnoreCase);
result = Regex.Replace(result, @"(<script>).*(</script>)", string.Empty, RegexOptions.IgnoreCase);
//remove all styles
result = Regex.Replace(result, @"<( )*style([^>])*>", "<style>", RegexOptions.IgnoreCase); //clearing attributes
result = Regex.Replace(result, "(<style>).*(</style>)", string.Empty, RegexOptions.IgnoreCase);
//insert tabs in spaces of <td> tags
result = Regex.Replace(result, @"<( )*td([^>])*>", " ", RegexOptions.IgnoreCase);
//insert line breaks in places of <br> and <li> tags
result = Regex.Replace(result, @"<( )*br( )*>", "\r", RegexOptions.IgnoreCase);
result = Regex.Replace(result, @"<( )*li( )*>", "\r", RegexOptions.IgnoreCase);
//insert line paragraphs in places of <tr> and <p> tags
result = Regex.Replace(result, @"<( )*tr([^>])*>", "\r\r", RegexOptions.IgnoreCase);
result = Regex.Replace(result, @"<( )*p([^>])*>", "\r\r", RegexOptions.IgnoreCase);
//remove anything thats enclosed inside < >
result = Regex.Replace(result, @"<[^>]*>", string.Empty, RegexOptions.IgnoreCase);
//replace special characters:
result = Regex.Replace(result, @"&", "&", RegexOptions.IgnoreCase);
result = Regex.Replace(result, @" ", " ", RegexOptions.IgnoreCase);
result = Regex.Replace(result, @"<", "<", RegexOptions.IgnoreCase);
result = Regex.Replace(result, @">", ">", RegexOptions.IgnoreCase);
result = Regex.Replace(result, @"&(.{2,6});", string.Empty, RegexOptions.IgnoreCase);
//remove extra line breaks and tabs
result = Regex.Replace(result, @" ( )+", " ");
result = Regex.Replace(result, "(\r)( )+(\r)", "\r\r");
result = Regex.Replace(result, @"(\r\r)+", "\r\n");
return result;
}
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 智能桌面机器人:用.NET IoT库控制舵机并多方法播放表情
· Linux glibc自带哈希表的用例及性能测试
· 深入理解 Mybatis 分库分表执行原理
· 如何打造一个高并发系统?
· .NET Core GC压缩(compact_phase)底层原理浅谈
· 手把手教你在本地部署DeepSeek R1,搭建web-ui ,建议收藏!
· 新年开篇:在本地部署DeepSeek大模型实现联网增强的AI应用
· Janus Pro:DeepSeek 开源革新,多模态 AI 的未来
· 互联网不景气了那就玩玩嵌入式吧,用纯.NET开发并制作一个智能桌面机器人(三):用.NET IoT库
· 【非技术】说说2024年我都干了些啥