/// <summary>
/// Command-line entry point. Reads the HTML file named by the first argument,
/// runs it through <see cref="CleanWordHtml"/> and <see cref="FixEntities"/>,
/// and writes the result to "&lt;input name&gt;.modified.htm".
/// </summary>
/// <param name="args">Command-line arguments; args[0] is the input file path.</param>
static void Main(string[] args)
{
    if (args.Length == 0 || String.IsNullOrEmpty(args[0]))
    {
        Console.WriteLine("No filename provided.");
        return;
    }

    string filepath = args[0];

    // A bare file name (no directory part) is resolved against the current directory.
    if (Path.GetFileName(filepath) == args[0])
    {
        filepath = Path.Combine(Environment.CurrentDirectory, filepath);
    }

    // Check the resolved path, and stop if it is missing: falling through would
    // only make File.ReadAllText throw right after the message is printed.
    if (!File.Exists(filepath))
    {
        Console.WriteLine("File doesn't exist.");
        return;
    }

    string html = File.ReadAllText(filepath);
    Console.WriteLine("input html is " + html.Length + " chars");

    html = CleanWordHtml(html);
    html = FixEntities(html);

    // The output name carries no directory part, so the file is written
    // relative to the current directory.
    string outputPath = Path.GetFileNameWithoutExtension(filepath) + ".modified.htm";
    File.WriteAllText(outputPath, html);
    Console.WriteLine("cleaned html is " + html.Length + " chars");
}

/// <summary>
/// Strips markup that Microsoft Word adds when saving a document as HTML.
/// The patterns are applied in order, case-insensitively, and every match is deleted.
/// </summary>
/// <param name="html">The HTML text to clean.</param>
/// <returns>The HTML with the matched markup removed.</returns>
static string CleanWordHtml(string html)
{
    string[] patterns =
    {
        // get rid of unnecessary tag spans (comments and title)
        @"<!--(\w|\W)+?-->",
        @"<title>(\w|\W)+?</title>",
        // get rid of classes and styles
        @"\s?class=\w+",
        @"\s+style='[^']+'",
        // get rid of unnecessary tags; "!\[" matches conditional markers such as <![if ...]>
        // NOTE(review): "st\d" is presumed to target Word smart tags (<st1:...>) - confirm
        @"<(meta|link|/?o:|/?style|/?div|/?st\d|/?head|/?html|body|/?body|/?span|!\[)[^>]*?>",
        // get rid of empty paragraph tags (elements holding only a non-breaking space)
        @"(<[^>]+>)+&nbsp;(</\w+>)+",
        // remove bizarre v: element attached to <img> tag
        @"\s+v:\w+=""[^""]+"""
    };

    foreach (string pattern in patterns)
    {
        html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
    }

    // Remove extra lines: collapse a run of line breaks into one. "$1" is the last
    // line break captured, so the file's own line-ending style is kept.
    // NOTE(review): the original pattern was lost in extraction; this rule is a
    // reconstruction from the "remove extra lines" comment - confirm.
    html = Regex.Replace(html, @"(\r?\n){2,}", "$1");

    return html;
}

/// <summary>
/// Replaces typographic punctuation with the equivalent named HTML entities.
/// </summary>
/// <param name="html">The HTML text to process.</param>
/// <returns>The text with curly quotes and em dashes written as entities.</returns>
static string FixEntities(string html)
{
    // The characters are written as \u escapes so the mapping survives any
    // re-encoding of this source file; a literal that degrades to "?" would
    // otherwise rewrite every question mark in the document.
    // NOTE(review): the em-dash key is an assumption; the original key was lost - confirm.
    return html
        .Replace("\u201C", "&ldquo;")   // left double quotation mark
        .Replace("\u201D", "&rdquo;")   // right double quotation mark
        .Replace("\u2014", "&mdash;");  // em dash
}
【推荐】还在用 ECharts 开发大屏?试试这款永久免费的开源 BI 工具!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步