Java:将 rtf 或 doc 转成 html 格式
什么是 rtf 格式?rtf 是一种富文本格式(Rich Text Format),可以包含文字、图片等内容。rtf 文件可以用 word 或者 wps 直接打开;也可以用文本编辑器打开,此时显示的是其源码。rtf 源码格式解析可以参考这里。
用 Java 代码解析 rtf 格式,可以使用 Apache Tika,它支持 rtf 格式,但是网上可参考的文档较少;而 doc 转成 html 的参考文档较多,因此可采用如下步骤:
- 将 rtf 转成 doc 格式
- 将 doc 格式转换成 html
步骤 1 较为简单,可以先用 word 或者 wps 打开 rtf 文件,然后通过"文件 → 另存为"保存成 doc 即可。如果只有一个文件,可以这样操作;但如果有上百个文件,这样操作肯定较为繁琐,可以查看这篇文章,批量将 rtf 另存为 doc 格式。
步骤 2 可以参考网上的这篇文章, 通过 Apache POI 将 doc 转成 html 格式,且样式图片不会丢失。
步骤 2 中网上那篇文章在将 doc 转 html 时,把其中的图片提取成单独的文件,然后在 html 中通过相对路径引用这些图片。其实还有一种办法,是将图片转成 base64 编码,直接内嵌在 html 网页中。代码如下:
首先导入依赖,这里采用的是 poi 3.17 版本,其他版本也可以。
<dependencies> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.17</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.17</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.17</version> </dependency> </dependencies>
然后写工具类
package conv;

import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Base64;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.w3c.dom.Document;

/**
 * Converts a binary Word ({@code .doc}) file into a single HTML file using
 * Apache POI's {@link WordToHtmlConverter}. Pictures are embedded in the HTML
 * as base64 {@code data:} URIs instead of being written as separate files.
 */
public class Word2HtmlAboutPic {

    /**
     * Demo entry point: converts {@code D://2.doc} to {@code D://2.html} and
     * prints the stack trace of any failure.
     *
     * @param argv ignored
     */
    public static void main(String[] argv) {
        try {
            convert2Html("D://2.doc", "D://2.html");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes {@code content} to {@code path} encoded as GB2312, replacing any
     * existing file. I/O failures are reported with {@code printStackTrace()}
     * and are not propagated to the caller.
     *
     * @param content text to write
     * @param path    destination file path
     */
    public static void writeFile(String content, String path) {
        // try-with-resources closes the writer and then the stream, even when
        // one of the close() calls itself throws.
        try (FileOutputStream fos = new FileOutputStream(new File(path));
             BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "GB2312"))) {
            bw.write(content);
        } catch (IOException ioe) {
            // FileNotFoundException is an IOException, so it is reported here too.
            ioe.printStackTrace();
        }
    }

    /**
     * Writes raw bytes to {@code path}, replacing any existing file. I/O
     * failures are reported with {@code printStackTrace()} and are not
     * propagated to the caller.
     *
     * @param content bytes to write
     * @param path    destination file path
     */
    public static void write2File(byte[] content, String path) {
        try (FileOutputStream fos = new FileOutputStream(new File(path))) {
            fos.write(content);
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     * Converts the {@code .doc} file at {@code fileName} to HTML and writes the
     * result, serialized as UTF-8, to {@code outPutFile}.
     *
     * @param fileName   path of the source {@code .doc} file
     * @param outPutFile path of the HTML file to create or overwrite
     * @throws TransformerException         if the HTML DOM cannot be serialized
     * @throws IOException                  if the source cannot be read or the
     *                                      output cannot be written
     * @throws ParserConfigurationException if no DOM builder can be created
     */
    public static void convert2Html(String fileName, String outPutFile)
            throws TransformerException, IOException, ParserConfigurationException {
        final Base64.Encoder encoder = Base64.getEncoder();

        // The source stream is only needed while the document is being loaded;
        // close it as soon as the HWPFDocument has been constructed.
        HWPFDocument wordDocument;
        try (FileInputStream in = new FileInputStream(fileName)) {
            wordDocument = new HWPFDocument(in);
        }

        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

        // Whatever savePicture returns is used here as the picture's location;
        // returning a data URI embeds the picture instead of linking to a file.
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            @Override
            public String savePicture(byte[] content, PictureType pictureType,
                    String suggestedName, float widthInches, float heightInches) {
                // encodeToString does not depend on the platform default charset.
                return "data:" + pictureType.getMime() + ";base64,"
                        + encoder.encodeToString(content);
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);
        Document htmlDocument = wordToHtmlConverter.getDocument();

        // Serialize into memory first so that a failed transform does not leave
        // a truncated output file behind.
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

        // Write directly (rather than through write2File, which swallows
        // IOException) so that a failed write is reported to the caller.
        try (FileOutputStream fileOut = new FileOutputStream(outPutFile)) {
            out.writeTo(fileOut);
        }
    }
}
然后批量转换
package conv;

import java.io.File;

/**
 * Batch driver: converts every file directly inside a source folder to HTML
 * with {@link Word2HtmlAboutPic#convert2Html(String, String)}, writing each
 * result into a destination folder under the same base name with an
 * {@code .html} extension.
 */
public class BatchWord2Html {

    /**
     * Converts all files in {@code D:/develop/temp/original} and writes the
     * results to {@code D:/develop/temp/destination}. Subfolders are skipped.
     *
     * @param args ignored
     * @throws Exception if the source folder cannot be listed or a conversion fails
     */
    public static void main(String[] args) throws Exception {
        String originalFolder = "D:/develop/temp/original";
        String destinationFolder = "D:/develop/temp/destination";

        // listFiles() returns null when the path is not a directory or cannot
        // be read; fail with a clear message instead of a NullPointerException.
        File[] entries = new File(originalFolder).listFiles();
        if (entries == null) {
            throw new IllegalStateException("Cannot list directory: " + originalFolder);
        }

        for (File fileEntry : entries) {
            if (fileEntry.isDirectory()) {
                // Skip the subfolder but keep going: listing order is
                // unspecified, so stopping here would convert only an
                // arbitrary subset of the files.
                System.out.println("Subfolders in the folder");
                continue;
            }
            String filename = fileEntry.getName();
            // A name without a dot has no extension to strip; use it whole.
            int dot = filename.lastIndexOf('.');
            String caselsh = dot >= 0 ? filename.substring(0, dot) : filename;
            String outFileName = destinationFolder + "/" + caselsh + ".html";
            System.out.println(filename + " " + outFileName);
            Word2HtmlAboutPic.convert2Html(originalFolder + "/" + filename, outFileName);
        }
    }
}
参考文章:
rtf 格式解析 https://blog.csdn.net/dream_dt/article/details/79215798
word 转 html https://www.cnblogs.com/jameslif/p/3356588.html