使用 Open XML SDK 实现 html 富文本转换为 docx 格式示例

 

使用 Open XML SDK 将 html 富文本转换为 docx 格式文档相对复杂。下面是一个示例:手动检测 <strong> 和 <em> 标签,并应用相应的文本格式。

using System;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;

/// <summary>
/// Minimal sample that converts a tiny HTML fragment to a docx file by hand:
/// each &lt;p&gt; becomes a paragraph, &lt;strong&gt; becomes bold and &lt;em&gt; becomes italic.
/// </summary>
class Program
{
    // Inline tags recognised inside a paragraph; any other markup is written out as literal text.
    static readonly string[] InlineTags = { "<strong>", "</strong>", "<em>", "</em>" };

    /// <summary>
    /// Writes "output.docx" in the current directory from a hard-coded HTML string.
    /// </summary>
    static void Main()
    {
        string htmlContent = "<p>This is <strong>bold</strong> and <em>italic</em> text.</p>";

        // Create a new docx document; disposing the package at the end of the using block writes it out.
        using (WordprocessingDocument doc = WordprocessingDocument.Create("output.docx", WordprocessingDocumentType.Document))
        {
            MainDocumentPart mainPart = doc.AddMainDocumentPart();
            mainPart.Document = new Document();
            Body body = mainPart.Document.AppendChild(new Body());

            // Split the HTML on <p>/</p> and create one docx paragraph per piece.
            string[] paragraphs = htmlContent.Split(new[] { "<p>", "</p>" }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string paragraphContent in paragraphs)
            {
                body.Append(BuildParagraph(paragraphContent));
            }
        }

        Console.WriteLine("HTML to docx conversion complete.");
    }

    /// <summary>
    /// Builds a paragraph from the inner HTML of a &lt;p&gt; element, emitting one run per
    /// text fragment with the bold/italic state that is active at that point.
    /// </summary>
    /// <param name="paragraphContent">Paragraph inner HTML (without the surrounding &lt;p&gt; tags).</param>
    /// <returns>The populated paragraph.</returns>
    static Paragraph BuildParagraph(string paragraphContent)
    {
        Paragraph paragraph = new Paragraph();
        bool bold = false;
        bool italic = false;
        int textStart = 0;
        int index = 0;

        // Scan for tags instead of String.Split: Split discards the delimiters, which makes it
        // impossible to know which formatting applies to which fragment.
        while (index < paragraphContent.Length)
        {
            string matchedTag = FindTagAt(paragraphContent, index);
            if (matchedTag == null)
            {
                index++;
                continue;
            }

            // Flush the text collected so far using the formatting in effect BEFORE this tag.
            AppendRun(paragraph, paragraphContent.Substring(textStart, index - textStart), bold, italic);

            switch (matchedTag)
            {
                case "<strong>":
                    bold = true;
                    break;
                case "</strong>":
                    bold = false;
                    break;
                case "<em>":
                    italic = true;
                    break;
                case "</em>":
                    italic = false;
                    break;
            }

            index += matchedTag.Length;
            textStart = index;
        }

        // Trailing text after the last tag.
        AppendRun(paragraph, paragraphContent.Substring(textStart), bold, italic);
        return paragraph;
    }

    /// <summary>
    /// Returns the recognised inline tag that starts exactly at <paramref name="index"/>,
    /// or null when none does.
    /// </summary>
    static string FindTagAt(string content, int index)
    {
        foreach (string tag in InlineTags)
        {
            if (string.CompareOrdinal(content, index, tag, 0, tag.Length) == 0)
            {
                return tag;
            }
        }

        return null;
    }

    /// <summary>
    /// Appends a run containing <paramref name="text"/> to <paramref name="paragraph"/>.
    /// Empty fragments are skipped so no empty runs are produced.
    /// </summary>
    static void AppendRun(Paragraph paragraph, string text, bool bold, bool italic)
    {
        if (text.Length == 0)
        {
            return;
        }

        Run run = new Run();

        if (bold || italic)
        {
            // Run properties have to be the first child of the run, and a run may carry only one.
            RunProperties runProperties = new RunProperties();
            if (bold)
            {
                runProperties.Bold = new Bold();
            }
            if (italic)
            {
                runProperties.Italic = new Italic();
            }
            run.Append(runProperties);
        }

        // Preserve leading/trailing spaces ("This is ", " and ") that would otherwise be trimmed.
        run.Append(new Text(text) { Space = SpaceProcessingModeValues.Preserve });
        paragraph.Append(run);
    }
}

 

 

需要根据 HTML 标记的不同来创建相应的 docx 元素,例如将 <p> 标签映射到 docx 段落,将 <strong> 标签映射到粗体等。

using System;
using System.IO;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;

/// <summary>
/// Skeleton sample: creates a docx file and maps HTML markup to docx elements through
/// <see cref="ProcessHtmlContent"/>, which currently handles a single &lt;p&gt; element only.
/// </summary>
class Program
{
    /// <summary>
    /// Writes "output.docx" in the current directory from a hard-coded HTML string.
    /// </summary>
    static void Main()
    {
        string htmlContent = "<p>This is <strong>bold</strong> and <em>italic</em> text.</p>";

        // Create a new docx document
        using (WordprocessingDocument doc = WordprocessingDocument.Create("output.docx", WordprocessingDocumentType.Document))
        {
            MainDocumentPart mainPart = doc.AddMainDocumentPart();
            mainPart.Document = new Document();
            Body body = mainPart.Document.AppendChild(new Body());

            // Parse the HTML content and create the matching docx elements
            ProcessHtmlContent(htmlContent, body);

            doc.Save();
        }

        Console.WriteLine("HTML to docx conversion complete.");
    }

    /// <summary>
    /// Maps HTML content to docx elements and appends them to <paramref name="parentElement"/>.
    /// Only content that is exactly one &lt;p&gt;...&lt;/p&gt; element is handled; its inner text
    /// (including any nested tags, which are NOT interpreted) becomes a single-run paragraph.
    /// Any other input, including null or empty, appends nothing.
    /// </summary>
    /// <param name="htmlContent">The HTML fragment to convert.</param>
    /// <param name="parentElement">The element (typically the document body) that receives the result.</param>
    static void ProcessHtmlContent(string htmlContent, OpenXmlElement parentElement)
    {
        const string openTag = "<p>";
        const string closeTag = "</p>";

        // Nothing to convert; avoids a NullReferenceException on the checks below.
        if (string.IsNullOrEmpty(htmlContent))
        {
            return;
        }

        // Example: convert an HTML paragraph to a docx paragraph.
        // Markup is not linguistic text, so compare ordinally rather than with the current culture.
        if (htmlContent.StartsWith(openTag, StringComparison.Ordinal) && htmlContent.EndsWith(closeTag, StringComparison.Ordinal))
        {
            string paragraphText = htmlContent.Substring(openTag.Length, htmlContent.Length - openTag.Length - closeTag.Length);

            // Preserve leading/trailing whitespace of the text inside the run.
            Text text = new Text(paragraphText) { Space = SpaceProcessingModeValues.Preserve };
            Paragraph paragraph = new Paragraph(new Run(text));
            parentElement.Append(paragraph);
        }

        // Add handling for more HTML tags here as needed
    }
}

 

 

下例使用 Html2OpenXml 将 html 转 docx
using System;
using System.IO;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using HtmlToOpenXml;

/// <summary>
/// Sample that delegates the HTML-to-docx mapping to the HtmlToOpenXml converter
/// instead of parsing tags by hand.
/// </summary>
class Program
{
    /// <summary>
    /// Creates "output.docx", converts a hard-coded HTML fragment and appends the
    /// resulting elements to the document body.
    /// </summary>
    static void Main()
    {
        var html = "<p>This is <strong>bold</strong> and <em>italic</em> text.</p>";

        // Start a brand-new docx package with an empty body
        using (var package = WordprocessingDocument.Create("output.docx", WordprocessingDocumentType.Document))
        {
            var documentPart = package.AddMainDocumentPart();
            documentPart.Document = new Document();

            var documentBody = new Body();
            documentPart.Document.AppendChild(documentBody);

            // Let the converter turn the HTML into docx elements
            var htmlConverter = new HtmlConverter(documentPart)
            {
                ImageProcessing = ImageProcessing.AutomaticDownload
            };

            foreach (var element in htmlConverter.Parse(html))
            {
                documentBody.Append(element);
            }

            package.Save();
        }

        Console.WriteLine("HTML to docx conversion complete.");
    }
}

 

带内联式 css 样式。

using System;
using System.IO;
using System.Text;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using HtmlToOpenXml;

/// <summary>
/// Same HtmlToOpenXml conversion as a plain fragment, but the input paragraph carries
/// an inline <c>style</c> attribute (presumably honoured by the converter; verify against
/// the library documentation).
/// </summary>
class Program
{
    /// <summary>
    /// Creates "output.docx" from a hard-coded, inline-styled HTML fragment.
    /// </summary>
    static void Main()
    {
        var styledHtml = "<p style=\"color: blue; font-size: 14px;\">This is <strong>bold</strong> and <em>italic</em> text.</p>";

        // New docx package with a main part and an empty body
        using (var wordDocument = WordprocessingDocument.Create("output.docx", WordprocessingDocumentType.Document))
        {
            var documentPart = wordDocument.AddMainDocumentPart();
            documentPart.Document = new Document();
            var target = documentPart.Document.AppendChild(new Body());

            AppendConvertedHtml(documentPart, target, styledHtml);

            wordDocument.Save();
        }

        Console.WriteLine("HTML to docx conversion complete.");
    }

    /// <summary>
    /// Runs the HtmlToOpenXml converter over <paramref name="html"/> and appends every
    /// element it returns to <paramref name="target"/>.
    /// </summary>
    static void AppendConvertedHtml(MainDocumentPart documentPart, Body target, string html)
    {
        var htmlConverter = new HtmlConverter(documentPart);
        htmlConverter.ImageProcessing = ImageProcessing.AutomaticDownload;

        var converted = htmlConverter.Parse(html);
        foreach (var element in converted)
        {
            target.Append(element);
        }
    }
}

 

 

应用外部 css 样式(先将其转换为内联样式)

using System;
using System.IO;
using System.Text;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using HtmlToOpenXml;

/// <summary>
/// Sample that applies a small external stylesheet to HTML by rewriting class attributes
/// into inline styles, then converts the result to docx with HtmlToOpenXml.
/// </summary>
class Program
{
    /// <summary>
    /// Creates "output.docx" from a hard-coded HTML fragment and stylesheet.
    /// </summary>
    static void Main()
    {
        string htmlContent = "<p class=\"my-paragraph\">This is <strong>bold</strong> and <em>italic</em> text.</p>";
        string externalCss = ".my-paragraph { color: blue; font-size: 14px; }";

        // Create a new docx document
        using (WordprocessingDocument doc = WordprocessingDocument.Create("output.docx", WordprocessingDocumentType.Document))
        {
            MainDocumentPart mainPart = doc.AddMainDocumentPart();
            mainPart.Document = new Document();
            Body body = mainPart.Document.AppendChild(new Body());

            // Turn the external CSS rules into inline styles
            htmlContent = ApplyExternalCssToHtml(htmlContent, externalCss);

            // Use Html2OpenXml to convert the HTML content into docx elements
            var converter = new HtmlConverter(mainPart);
            converter.ImageProcessing = ImageProcessing.AutomaticDownload;
            var paragraphs = converter.Parse(htmlContent);

            foreach (var paragraph in paragraphs)
            {
                body.Append(paragraph);
            }

            doc.Save();
        }

        Console.WriteLine("HTML to docx conversion complete.");
    }

    /// <summary>
    /// Replaces <c>class="name"</c> attributes in <paramref name="htmlContent"/> with
    /// <c>style="..."</c> attributes built from the matching <c>.name { ... }</c> rules in
    /// <paramref name="externalCss"/>.
    /// </summary>
    /// <remarks>
    /// This is a deliberately simple textual substitution: only single class selectors are
    /// supported, the class attribute must contain exactly that one class name, and rules that
    /// are not class selectors or are malformed are ignored. Real stylesheets need a proper
    /// CSS/HTML parser.
    /// </remarks>
    /// <param name="htmlContent">The HTML to rewrite.</param>
    /// <param name="externalCss">The stylesheet text.</param>
    /// <returns>The HTML with matching class attributes replaced by inline styles.</returns>
    static string ApplyExternalCssToHtml(string htmlContent, string externalCss)
    {
        if (string.IsNullOrEmpty(htmlContent) || string.IsNullOrWhiteSpace(externalCss))
        {
            return htmlContent;
        }

        // Each rule ends with '}', so splitting there yields "selector { declarations" pieces.
        var cssRules = externalCss.Split('}');

        foreach (var rule in cssRules)
        {
            if (string.IsNullOrWhiteSpace(rule))
            {
                continue;
            }

            // Separate the selector from the declarations; skip fragments without a '{'
            // instead of failing with an IndexOutOfRangeException.
            var parts = rule.Split(new[] { '{' }, 2);
            if (parts.Length < 2)
            {
                continue;
            }

            var selector = parts[0].Trim();
            var style = parts[1].Trim();

            // Only class selectors (".name") can be mapped onto a class attribute.
            if (selector.Length < 2 || !selector.StartsWith(".", StringComparison.Ordinal))
            {
                continue;
            }

            // The HTML attribute holds the bare class name, without the selector's leading dot.
            var className = selector.Substring(1);

            // Replace the class attribute with the inline style
            htmlContent = htmlContent.Replace($"class=\"{className}\"", $"style=\"{style}\"");
        }

        return htmlContent;
    }
}

 

posted @ 2023-10-12 21:05  神游虚空  阅读(239)  评论(0编辑  收藏  举报