*(00)*

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::

原文地址:http://ansjsun.iteye.com/blog/791142

读取OFFICE文件纯文本

package org.css.resource.businesssoft.searchengine.quwenjiansuo;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
/**
 * Static helpers that extract plain text from Microsoft Office documents
 * (Word, Excel, PowerPoint) using Apache POI.
 *
 * <p>The public {@code String fileName} entry points are best-effort: any
 * failure while opening or parsing the document yields an empty string
 * instead of an exception.
 *
 * @author lizh
 */
public class CovertFile {

    /**
     * Extracts the plain text of a Word 97-2003 ({@code .doc}) document.
     *
     * <p>This method does not close {@code is} itself.
     *
     * @param is stream positioned at the start of the {@code .doc} file
     * @return the text reported by POI's {@link WordExtractor}
     * @throws IOException if the stream cannot be read or parsed
     */
    public static String extractTextFromDOC(InputStream is) throws IOException {
        WordExtractor ex = new WordExtractor(is);
        return ex.getText();
    }

    /**
     * Extracts the plain text of a Word 2007 ({@code .docx}) document.
     *
     * @param fileName path of the {@code .docx} file
     * @return the document text, or {@code ""} if the file cannot be opened or parsed
     */
    public static String extractTextFromDOC2007(String fileName) {
        OPCPackage opcPackage = null;
        try {
            opcPackage = POIXMLDocument.openPackage(fileName);
            POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
            return ex.getText();
        } catch (Exception ignored) {
            // Deliberate best-effort contract: callers receive "" for unreadable files.
            return "";
        } finally {
            if (opcPackage != null) {
                // revert() releases the file without writing anything back;
                // close() on a read-write package would re-save the document.
                opcPackage.revert();
            }
        }
    }

    /**
     * Extracts the plain text of an Excel 97-2003 ({@code .xls}) workbook.
     *
     * <p>Cell values are appended in sheet, row, column order with no separator
     * between them. This method does not close {@code is} itself.
     *
     * @param is stream positioned at the start of the {@code .xls} file
     * @return the concatenated cell values
     * @throws IOException if the stream cannot be read or parsed
     */
    private static String extractTextFromXLS(InputStream is) throws IOException {
        StringBuilder content = new StringBuilder();
        HSSFWorkbook workbook = new HSSFWorkbook(is);

        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            // getLastRowNum() is the index of the last row, hence the inclusive bound.
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                HSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // getLastCellNum() is already one past the last cell index
                // (and -1 for a row without cells), hence the exclusive bound.
                for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                    HSSFCell cell = row.getCell(cellIndex);
                    if (cell != null) {
                        appendCellText(content, cell);
                    }
                }
            }
        }
        return content.toString();
    }

    /**
     * Extracts the plain text of an Excel 2007 ({@code .xlsx}) workbook.
     *
     * <p>Cell values are appended in sheet, row, column order with no separator
     * between them.
     *
     * @param fileName path of the {@code .xlsx} file
     * @return the concatenated cell values
     * @throws Exception if the file cannot be opened or parsed
     */
    private static String extractTextFromXLS2007(String fileName)
            throws Exception {
        // Open the package ourselves so that it can be released afterwards.
        OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
        try {
            StringBuilder content = new StringBuilder();
            XSSFWorkbook workbook = new XSSFWorkbook(opcPackage);

            for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
                XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
                if (sheet == null) {
                    continue;
                }
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    XSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is already one past the last cell index.
                    for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                        XSSFCell cell = row.getCell(cellIndex);
                        if (cell != null) {
                            appendCellText(content, cell);
                        }
                    }
                }
            }
            return content.toString();
        } finally {
            // Release the file without writing anything back to it.
            opcPackage.revert();
        }
    }

    /**
     * Appends the textual form of one {@code .xls} cell to {@code out}.
     * Formula cells contribute their cached result; error cells contribute nothing.
     */
    private static void appendCellText(StringBuilder out, HSSFCell cell) {
        int type = cell.getCellType();
        if (type == HSSFCell.CELL_TYPE_FORMULA) {
            // getStringCellValue() throws for formulas whose cached result is
            // not a string, so dispatch on the cached result type instead.
            type = cell.getCachedFormulaResultType();
        }
        if (type == HSSFCell.CELL_TYPE_NUMERIC) {
            out.append(cell.getNumericCellValue());
        } else if (type == HSSFCell.CELL_TYPE_BOOLEAN) {
            out.append(cell.getBooleanCellValue());
        } else if (type != HSSFCell.CELL_TYPE_ERROR) {
            out.append(cell.getStringCellValue());
        }
    }

    /**
     * Appends the textual form of one {@code .xlsx} cell to {@code out}.
     * Formula cells contribute their cached result; error cells contribute nothing.
     */
    private static void appendCellText(StringBuilder out, XSSFCell cell) {
        int type = cell.getCellType();
        if (type == XSSFCell.CELL_TYPE_FORMULA) {
            // getStringCellValue() throws for formulas whose cached result is
            // not a string, so dispatch on the cached result type instead.
            type = cell.getCachedFormulaResultType();
        }
        if (type == XSSFCell.CELL_TYPE_BOOLEAN) {
            out.append(cell.getBooleanCellValue());
        } else if (type == XSSFCell.CELL_TYPE_NUMERIC) {
            out.append(cell.getNumericCellValue());
        } else if (type != XSSFCell.CELL_TYPE_ERROR) {
            out.append(cell.getStringCellValue());
        }
    }

    /**
     * Extracts the plain text of an Excel 2007 ({@code .xlsx}) workbook,
     * converting any failure into an empty result.
     *
     * @param fileName path of the {@code .xlsx} file
     * @return the concatenated cell values, or {@code ""} if extraction fails
     */
    public static String getXLS2007(String fileName) {
        try {
            return extractTextFromXLS2007(fileName);
        } catch (Exception ignored) {
            // Deliberate best-effort contract: callers receive "" for unreadable files.
            return "";
        }
    }

    /**
     * Extracts the plain text of a PowerPoint 2003 or 2007 file by letting
     * POI's {@link ExtractorFactory} choose the extractor for the file's format.
     *
     * @param fileName path of the presentation file
     * @return the extracted text, or {@code ""} if extraction fails
     */
    public static String getPPTX(String fileName) {
        InputStream in = null;
        try {
            // Feed POI a stream that we own so the file handle is always
            // released, instead of leaving an open file behind the extractor.
            in = new FileInputStream(new File(fileName));
            POITextExtractor extractor = ExtractorFactory.createExtractor(in);
            return extractor.getText();
        } catch (Exception ignored) {
            // Deliberate best-effort contract: callers receive "" for unreadable files.
            return "";
        } finally {
            closeQuietly(in);
        }
    }

    /** Closes {@code in} if it is non-null, ignoring any failure to do so. */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if closing a read-only stream fails.
        }
    }

    /**
     * Small manual demo: prints the text extracted from a presentation.
     *
     * @param args optional; {@code args[0]} overrides the default sample path
     */
    public static void main(String[] args) {
        String pptFile = (args != null && args.length > 0) ? args[0] : "D:/zz3.ppt";
        String pptx = CovertFile.getPPTX(pptFile);
        System.out.println("pptx==========" + pptx);
    }

}

 

最后突然发现,其实只用两行代码就能搞定 Office 2003 到 Office 2007 的各种文档:

POITextExtractor extractor = ExtractorFactory.createExtractor(f);
            return extractor.getText();

 

于是我泪流满面……白忙活了。顺便奉上解析 PDF 的代码吧。

 

package com.lingjoin.extractors;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.util.Date;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
import com.lingjoin.paser.LingJoinFile;

/**
 * Extractor for PDF files: reads a document's text with PDFBox and stores it
 * on the supplied {@code LingJoinFile}.
 *
 * @author Ansj
 */
public class PDFExtractor extends AbstractExtractor {

    /** Type flag handed to the constructor and returned by {@link #getTypeFlag()}. */
    private Integer typeFlag;

    /**
     * Creates an extractor carrying the given type flag.
     *
     * @param typeFlag value later returned by {@link #getTypeFlag()}
     */
    public PDFExtractor(Integer typeFlag) {
        this.typeFlag = typeFlag;
    }

    /**
     * Returns the type flag supplied at construction time.
     *
     * @return the type flag
     */
    public Integer getTypeFlag() {
        return this.typeFlag;
    }

    /**
     * Extracts the text of {@code f} and stores it on the file as a reader.
     *
     * @param f the PDF file to parse
     * @throws Exception declared by the signature; failures while reading the
     *         PDF are handled internally and yield an empty reader
     */
    public void paserFileToReader(LingJoinFile f) throws Exception {
        String text = this.getContent(f);
        f.setlContentReader(new BufferedReader(new StringReader(text)));
    }

    /**
     * Extracts the text of {@code f} and stores it on the file as a string.
     *
     * @param f the PDF file to parse
     * @throws Exception declared by the signature; failures while reading the
     *         PDF are handled internally and yield an empty string
     */
    public void paserFileToString(LingJoinFile f) throws Exception {
        String text = this.getContent(f);
        f.setlContent(text);
    }

    /**
     * Loads the PDF, copies its metadata onto {@code f} and returns its text.
     *
     * @param f the PDF file to read
     * @return the stripped text, or {@code ""} if loading or stripping fails
     */
    private String getContent(LingJoinFile f) {
        String text = "";
        PDDocument pdf = null;
        try {
            pdf = PDDocument.load(f);
            PDFTextStripper textStripper = new PDFTextStripper();
            // Copy the document information onto the file before stripping the text.
            PDDocumentInformation information = pdf.getDocumentInformation();
            this.setLingJoinFileInfo(f, information);
            text = textStripper.getText(pdf);
        } catch (IOException e) {
            // Also covers FileNotFoundException, which is an IOException.
            e.printStackTrace();
        } finally {
            if (pdf != null) {
                try {
                    pdf.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return text;
    }

    /**
     * Copies document metadata onto {@code f}. Only the author is copied, and
     * only when the PDF declares one; modification date and title are not set.
     *
     * @param f    the file object receiving the metadata
     * @param info the document information read from the PDF
     */
    private void setLingJoinFileInfo(LingJoinFile f, PDDocumentInformation info) {
        String author = info.getAuthor();
        if (author == null) {
            return;
        }
        f.setlAuthor(info.getAuthor());
    }
}

 

posted on 2013-08-05 11:15  *(00)*  阅读(765)  评论(0)  编辑  收藏  举报