Java:将 rtf 或 doc 转成 html 格式

什么是 rtf 格式? rtf 是一种富文本格式(Rich Text Format),可以包含文字、图片等内容。rtf 文件可以用 word 或者 wps 直接打开,也可以用文本编辑器打开;用文本编辑器打开时显示的是其源码。rtf 源码格式解析可以参考文末参考文章中的「rtf 格式解析」。 

用 Java 代码解析 rtf 格式,可以使用 Apache Tika(它支持 rtf 格式),但是网上可参考的文档较少;而网上 doc 转成 html 的参考文档较多,因此可采用如下步骤:

  1. 将 rtf 转成 doc 格式
  2. 将 doc 格式转换成 html

步骤 1 较为简单,可以先用 word 或者 wps 打开 rtf 文件,然后通过「文件 → 另存为」保存为 doc 即可。如果只有一个文件,可以这样操作;如果有上百个文件,这样逐个操作肯定较为繁琐,可以查看这篇文章,批量将 rtf 另存为 doc 格式。

步骤 2 可以参考网上的这篇文章(见文末参考文章中的「word 转 html」),通过 Apache POI 将 doc 转成 html 格式,且样式和图片不会丢失。

步骤 2 中网上那篇文章在将 doc 转 html 时,会把其中的图片提取成单独的文件,然后在 html 中通过相对路径引用这些图片。其实还有一种办法,是将图片转成 base64 编码,直接内嵌在 html 网页中。代码如下:

首先导入依赖,这里采用的是 poi 3.17 版本,其他版本也可以。

    <!--
      Apache POI artifacts used by the converter in this article, all pinned to
      version 3.17 here.
      NOTE(review): poi-scratchpad is presumably the artifact that supplies the
      org.apache.poi.hwpf classes imported by the Java code; confirm against
      the Apache POI component map.
      NOTE(review): the Java code shown in this article does not appear to
      reference anything from poi-ooxml; confirm before removing it.
    -->
    <dependencies>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.17</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.17</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.17</version>
        </dependency>
    </dependencies>
View Code

然后写工具类

package conv;

import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Base64;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.w3c.dom.Document;

public class Word2HtmlAboutPic {
    public static void main(String argv[]) {
        try {
            convert2Html("D://2.doc", "D://2.html");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void writeFile(String content, String path) {
        FileOutputStream fos = null;
        BufferedWriter bw = null;
        try {
            File file = new File(path);
            fos = new FileOutputStream(file);
            bw = new BufferedWriter(new OutputStreamWriter(fos, "GB2312"));
            bw.write(content);
        } catch (FileNotFoundException fnfe) {
            fnfe.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            try {
                if (bw != null)
                    bw.close();
                if (fos != null)
                    fos.close();
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        }
    }

    public static void write2File(byte[] content, String path) {
        FileOutputStream fos = null;
        BufferedWriter bw = null;
        try {
            File file = new File(path);
            fos = new FileOutputStream(file);
            fos.write(content);
            fos.close();
        } catch (FileNotFoundException fnfe) {
            fnfe.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            try {
                if (bw != null)
                    bw.close();
                if (fos != null)
                    fos.close();
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        }
    }

    public static void convert2Html(String fileName, String outPutFile)
            throws TransformerException, IOException, ParserConfigurationException {

        Base64.Encoder encoder = Base64.getEncoder();

        HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(fileName));

        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
                    float heightInches) {
                String encodedText = new String(encoder.encode(content));
                String imgSrc = "data:" + pictureType.getMime() + ";" + "base64," + encodedText;
                return imgSrc;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);

        Document htmlDocument = wordToHtmlConverter.getDocument();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(out);

        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer serializer = tf.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
        out.close();
        write2File(out.toByteArray(), outPutFile);
    }
}
View Code

然后批量转换

package conv;

import java.io.File;

/**
 * Batch driver: converts every regular file found directly inside one folder
 * to HTML via {@link Word2HtmlAboutPic#convert2Html(String, String)}, writing
 * each result into a second folder under the same base name with a
 * {@code .html} extension.
 */
public class BatchWord2Html {

    /**
     * Runs the batch conversion over the hard-coded source and destination
     * folders.
     *
     * @param args ignored
     * @throws IllegalArgumentException if the source folder cannot be listed
     * @throws Exception                if converting any file fails; the batch
     *                                  stops at the first failure
     */
    public static void main(String[] args) throws Exception {
        String originalFolder = "D:/develop/temp/original";
        String destinationFolder = "D:/develop/temp/destination";
        File folder = new File(originalFolder);

        // listFiles() yields null (not an empty array) when the path is not a
        // directory or cannot be read; fail with a clear message instead of NPE.
        File[] entries = folder.listFiles();
        if (entries == null) {
            throw new IllegalArgumentException("Cannot list files in folder: " + originalFolder);
        }

        for (File fileEntry : entries) {
            if (fileEntry.isDirectory()) {
                // Skip the subfolder but keep going: listing order is not
                // guaranteed, so stopping here would drop an arbitrary set of files.
                System.out.println("Subfolders in the folder");
                continue;
            }
            String filename = fileEntry.getName();

            // Strip the extension if there is one; names without a dot (or with
            // only a leading dot) are used as-is rather than failing in substring.
            int dot = filename.lastIndexOf('.');
            String caselsh = dot > 0 ? filename.substring(0, dot) : filename;

            String outFileName = destinationFolder + "/" + caselsh + ".html";
            System.out.println(filename + " " + outFileName);
            Word2HtmlAboutPic.convert2Html(originalFolder + "/" + filename, outFileName);
        }
    }

}
View Code

 

 

参考文章:

rtf 格式解析 https://blog.csdn.net/dream_dt/article/details/79215798

word 转 html https://www.cnblogs.com/jameslif/p/3356588.html

posted @ 2020-01-02 10:24  colin220  阅读(2311)  评论(0)  编辑  收藏  举报