POI实现word文档转html文件

POI word文件转html
package com.feiruo.officeConvert;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;

import org.apache.poi.hwpf.usermodel.Picture;

/**
 * Base class for converters that turn an office document into text content
 * and write that content to disk.
 *
 * <p>Subclasses implement {@link #convert(String)} and hand the result to
 * {@link #setFileContent(String)}; {@link #writeFile(String)} then persists
 * it using the configured encoding.
 */
public abstract class OfficeConvert {

    // Folder in which images are stored, normalized by checkSetPath.
    private String imgPath = null;
    // Folder in which the converted file is stored, normalized by checkSetPath.
    private String parentPath = null;
    // Converted content waiting to be written by writeFile.
    private String fileContent = null;
    // Charset name used when writing the content to disk.
    private String encode = "UTF-8";

    /**
     * Converts the given document.
     *
     * @param docPath
     *            location of the document to convert
     *
     * @throws FileNotFoundException
     * @throws IOException
     * @throws ParserConfigurationException
     * @throws TransformerException
     */
    public abstract void convert(String docPath) throws FileNotFoundException,
            IOException, ParserConfigurationException, TransformerException;

    /**
     * Writes the current file content to disk using the configured encoding.
     *
     * <p>The configured parent folder (when one has been set) and the folder
     * that contains {@code filepath} are created if they do not exist. I/O
     * failures are reported on standard error and do not propagate.
     *
     * @param filepath
     *            location of the file to write
     * @throws IllegalStateException
     *             if no content has been set yet
     */
    public void writeFile(String filepath) {
        if (fileContent == null) {
            // Fail before touching the disk instead of leaving an empty file behind.
            throw new IllegalStateException(
                    "No file content to write; convert the document or call setFileContent first");
        }

        // The parent path is optional: it stays null when the no-arg setup is used.
        if (parentPath != null) {
            File parentDir = new File(parentPath);
            if (!parentDir.exists()) {
                parentDir.mkdirs();
            }
        }

        File target = new File(filepath);
        // The target may live outside the parent path, so make sure its own folder exists too.
        File targetDir = target.getParentFile();
        if (targetDir != null && !targetDir.exists()) {
            targetDir.mkdirs();
        }

        // Two separate resources so the stream is closed even if the writer
        // cannot be created (for example because of an unsupported encoding).
        try (FileOutputStream fos = new FileOutputStream(target);
                BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, encode))) {
            bw.write(fileContent);
        } catch (IOException ioe) {
            // Covers FileNotFoundException, unsupported encodings and close failures.
            ioe.printStackTrace();
        }
    }

    /**
     * Normalizes a folder path: trims surrounding whitespace, removes double
     * quotes, replaces {@code >} and {@code <} with {@code &gt;} and
     * {@code &lt;}, and appends a trailing {@code /} to a non-empty result
     * that does not already end with one.
     *
     * @param path
     *            the raw path; must not be {@code null}
     * @return the normalized path
     */
    public String checkSetPath(String path) {
        // Clean first and add the separator last, so a path such as "a/" (with
        // the quotes) does not end up with a doubled separator. Literal replace
        // also handles a special character in the very first position.
        String cleaned = path.trim()
                .replace("\"", "")
                .replace(">", "&gt;")
                .replace("<", "&lt;");
        if (cleaned.lastIndexOf("/") < cleaned.length() - 1) {
            cleaned += "/";
        }
        //TODO if(path.indexOf("*")>0)path=path.replaceAll("/*", "");
        return cleaned;
    }

    /**
     * Returns the charset name used when writing the content.
     *
     * @return the charset name
     */
    public String getEncode() {
        return encode;
    }

    /**
     * Sets the charset name used when writing the content.
     *
     * @param encode
     *            the charset name
     */
    public void setEncode(String encode) {
        this.encode = encode;
    }

    /**
     * Returns the folder in which images are stored.
     *
     * @return the normalized image folder, or {@code null} if never set
     */
    public String getImgPath() {
        return imgPath;
    }

    /**
     * Sets the folder in which images are stored; the value is normalized
     * with {@link #checkSetPath(String)}.
     *
     * @param imgPath
     *            the image folder
     */
    public void setImgPath(String imgPath) {
        this.imgPath = checkSetPath(imgPath);
    }

    /**
     * Returns the folder in which the converted file is stored.
     *
     * @return the normalized parent folder, or {@code null} if never set
     */
    public String getParentPath() {
        return parentPath;
    }

    /**
     * Sets the folder in which the converted file is stored; the value is
     * normalized with {@link #checkSetPath(String)}.
     *
     * @param parentPath
     *            the parent folder
     */
    public void setParentPath(String parentPath) {
        this.parentPath = checkSetPath(parentPath);
    }

    /**
     * Returns the content that {@link #writeFile(String)} will write.
     *
     * @return the content, or {@code null} if none has been set
     */
    public String getFileContent() {
        return fileContent;
    }

    /**
     * Sets the content that {@link #writeFile(String)} will write.
     *
     * @param content
     *            the content to write
     */
    public void setFileContent(String content) {
        this.fileContent = content;
    }
}
package com.feiruo.officeConvert;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.w3c.dom.Document;

/**
 * Converts a *.doc document into an *.html file.
 *
 * @author Jdk.feiruo.
 * @since JDK 1.7 POI 3.8
 * @version 1.0
 */
public class DocToHtml extends OfficeConvert implements IOfficeConvert {
    // Text that starts every image tag whose source gets prefixed with the image folder.
    private static final String IMG_SRC_PREFIX = "<img src=\"";

    // Pictures collected by the last call to convert; null until convert has run.
    private List<Picture> pics = null;

    /**
     * @param parentPath
     *            folder in which the html file is stored
     * @param imageppth
     *            folder in which the html images are stored
     * @param encoding
     *            encoding of the generated html
     */
    public DocToHtml(String parentPath, String imageppth, String encoding) {
        // The setters already normalize the paths, so no extra checkSetPath call is needed.
        setParentPath(parentPath);
        setImgPath(imageppth);
        this.setEncode(encoding);
    }

    public DocToHtml() {

    }

    /**
     * Converts a *.doc document to html and keeps the result as the file
     * content; the pictures of the document are remembered so that
     * {@link #writeWithName(String)} can save them.
     *
     * @param docPath
     *            location of the *.doc document
     *
     * @throws FileNotFoundException
     * @throws IOException
     * @throws ParserConfigurationException
     * @throws TransformerException
     */
    @Override
    public void convert(String docPath) throws FileNotFoundException,
            IOException, ParserConfigurationException, TransformerException {
        Document htmlDocument;
        // Keep the input stream open while the document is processed and
        // release it afterwards, even when the conversion fails.
        try (FileInputStream docStream = new FileInputStream(docPath)) {
            HWPFDocument wordDocument = new HWPFDocument(docStream);
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder()
                            .newDocument());
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                public String savePicture(byte[] content, PictureType pictureType,
                        String suggestedName, float widthInches, float heightInches) {
                    // Only the name is used here; the image bytes are written by writeWithName.
                    return suggestedName;
                }
            });
            wordToHtmlConverter.processDocument(wordDocument);
            pics = wordDocument.getPicturesTable().getAllPictures();
            htmlDocument = wordToHtmlConverter.getDocument();
        }

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, this.getEncode());
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

        // Decode with the encoding the serializer was told to use, not the platform default.
        String htmlContent = out.toString(this.getEncode());

        String imgDir = getImgPath();
        if (imgDir != null) {
            // Literal replace: the folder name must not be read as a regex replacement string.
            htmlContent = htmlContent.replace(IMG_SRC_PREFIX, IMG_SRC_PREFIX + imgDir);
        }
        setFileContent(htmlContent);
    }

    /**
     * Saves the pictures collected by {@link #convert(String)} into the image
     * folder below the parent folder, then writes the html content to
     * {@code <parentPath><fileName>.html}. A path that was never set is
     * treated as empty when these locations are built. A picture that cannot
     * be written is reported on standard error and the remaining ones are
     * still saved.
     *
     * @param fileName
     *            name of the html file, without extension
     */
    @Override
    public void writeWithName(String fileName) {
        String parentDir = emptyIfNull(this.getParentPath());

        // Save the pictures of the document first.
        if (pics != null) {
            File imgDir = new File(parentDir + emptyIfNull(this.getImgPath()));
            // Create the image folder when it does not exist yet.
            if (!imgDir.exists()) {
                imgDir.mkdirs();
            }
            for (Picture pic : pics) {
                File imgFile = new File(imgDir, pic.suggestFullFileName());
                // try-with-resources closes the stream of every picture.
                try (FileOutputStream imgOut = new FileOutputStream(imgFile)) {
                    pic.writeImageContent(imgOut);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        // Save the html source file.
        this.writeFile(parentDir + fileName + ".html");
    }

    // Returns the given text, or an empty string when it is null.
    private static String emptyIfNull(String text) {
        return text == null ? "" : text;
    }
}
package com.feiruo.Test;

import java.io.FileNotFoundException;
import java.io.IOException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;

import com.yinhai.officeConvert.DocToHtml;

/**
 * Small manual driver: converts a sample *.doc document and writes the
 * resulting html file and its images.
 */
public class Test {
    public static void main(String[] args) {
        new Test();
    }

    /**
     * Runs the sample conversion. The output is written only when the
     * conversion succeeded; a failure is reported on standard error.
     */
    public Test() {
        DocToHtml dth = new DocToHtml("C://test", "f", "UTF-8");
        try {
            dth.convert("D://test//test.doc");
            // Writing stays inside the try block so that a failed conversion
            // does not lead to an attempt to write missing content.
            dth.writeWithName("feiruo");
        } catch (IOException | ParserConfigurationException | TransformerException e) {
            // FileNotFoundException is an IOException and is handled here as well.
            e.printStackTrace();
        }
    }
}
package com.feiruo.officeConvert;

/**
 * Contract for converters that can save their result under a caller-chosen
 * file name.
 */
public interface IOfficeConvert {

    /**
     * Writes the file to disk.
     *
     * @param fileName
     *            name of the file to write
     */
    void writeWithName(String fileName);
}
posted @ 2016-09-30 17:43  非若  阅读(4998)  评论(0编辑  收藏  举报