*(00)*

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::
  613 随笔 :: 0 文章 :: 45 评论 :: 159万 阅读
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

原文地址:http://ansjsun.iteye.com/blog/791142

读取OFFICE文件纯文本

复制代码
package org.css.resource.businesssoft.searchengine.quwenjiansuo;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
/**
 * 
 * @author lizh
 *
 */
public class CovertFile {

    /**
     * 从word 2003文档中提取纯文本
     * @param is
     * @return
     * @throws IOException
     */
    public static String extractTextFromDOC(InputStream is) throws IOException {
        WordExtractor ex = new WordExtractor(is); // is是WORD文件的InputStream
        return ex.getText();
    }

    /**
     * 从word 2007文档中提取纯文本
     * @param fileName
     * @return
     */
    public static String extractTextFromDOC2007(String fileName) {
        try {
            OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
            POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
            return ex.getText();
        } catch (Exception e) {
            return "";
        }
    }

    /**
     * 从excel 2003文档中提取纯文本
     * @param is
     * @return
     * @throws IOException
     */
    private static String extractTextFromXLS(InputStream is) throws IOException {
        StringBuffer content = new StringBuffer();
        HSSFWorkbook workbook = new HSSFWorkbook(is); // 创建对Excel工作簿文件的引用

        for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
            if (null != workbook.getSheetAt(numSheets)) {
                HSSFSheet aSheet = workbook.getSheetAt(numSheets); // 获得一个sheet

                for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet
                        .getLastRowNum(); rowNumOfSheet++) {
                    if (null != aSheet.getRow(rowNumOfSheet)) {
                        HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一行

                        for (short cellNumOfRow = 0; cellNumOfRow <= aRow
                                .getLastCellNum(); cellNumOfRow++) {
                            if (null != aRow.getCell(cellNumOfRow)) {
                                HSSFCell aCell = aRow.getCell(cellNumOfRow); // 获得列值

                                if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                                    content.append(aCell.getNumericCellValue());
                                } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {
                                    content.append(aCell.getBooleanCellValue());
                                } else {
                                    content.append(aCell.getStringCellValue());
                                }
                            }
                        }
                    }
                }
            }
        }
        return content.toString();
    }

    /**
     * 从excel 2007文档中提取纯文本
     * @param fileName
     * @return
     * @throws Exception
     */
    private static String extractTextFromXLS2007(String fileName)
            throws Exception {
        StringBuffer content = new StringBuffer();

        // 构造 XSSFWorkbook 对象,strPath 传入文件路径
        XSSFWorkbook xwb = new XSSFWorkbook(fileName);

        // 循环工作表Sheet
        for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
            XSSFSheet xSheet = xwb.getSheetAt(numSheet);
            if (xSheet == null) {
                continue;
            }

            // 循环行Row
            for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
                XSSFRow xRow = xSheet.getRow(rowNum);
                if (xRow == null) {
                    continue;
                }

                // 循环列Cell
                for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {
                    XSSFCell xCell = xRow.getCell(cellNum);
                    if (xCell == null) {
                        continue;
                    }

                    if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
                        content.append(xCell.getBooleanCellValue());
                    } else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
                        content.append(xCell.getNumericCellValue());
                    } else {
                        content.append(xCell.getStringCellValue());
                    }
                }
            }
        }

        return content.toString();
    }
    
    /**
     * 从excel 2007文档中提取纯文本
     * @param fileName
     * @return
     */
    public static String getXLS2007(String fileName){
        String doc = "";
        try{
            doc = extractTextFromXLS2007(fileName);
            return doc;
        }catch(Exception e){
            return "";
        }
    }
    
    /**
     * 从ppt 2003、2007文档中提取纯文本
     * @param fileName
     * @return
     */
    public static String getPPTX(String fileName){
        String doc = "";
        try{
            File inputFile = new File(fileName);   
            POITextExtractor extractor = ExtractorFactory.createExtractor(inputFile);
            doc = extractor.getText();
            return doc;
        }catch(Exception e){
            return "";
        }
    }
    
    
    public static void main(String[] args) {
        try {
//            String wordFile = "D:/松山血战.docx";
//            String wordText2007 = CovertFile.extractTextFromDOC2007(wordFile);
//            System.out.println("wordText2007=======" + wordText2007);
//
//            InputStream is = new FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");
//            String excelText = CovertFile.extractTextFromXLS(is);
//            System.out.println("text2003==========" + excelText);

//            String excelFile = "D:/zh.xlsx";
//            String excelText2007 = CovertFile.extractTextFromXLS2007(excelFile);
//            System.out.println("excelText2007==========" + excelText2007);
            
            String pptFile = "D:/zz3.ppt";
            String pptx = CovertFile.getPPTX(pptFile);
            System.out.println("pptx==========" + pptx);

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}
复制代码

 

最后突然发现,其实只用两行代码就能搞定 Office 2003 到 Office 2007 的各种格式(f 为要解析的 File 对象):

POITextExtractor extractor = ExtractorFactory.createExtractor(f);
            return extractor.getText();

 

于是我泪流满面……白忙乎了。顺路奉上解析 PDF 的代码吧:

 

复制代码
package com.lingjoin.extractors;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.util.Date;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
import com.lingjoin.paser.LingJoinFile;

/**
 * PDF parser: reads the text of a PDF through PDFBox and stores it on the
 * supplied {@code LingJoinFile}, either as a string or as a reader.
 *
 * @author Ansj
 */
public class PDFExtractor extends AbstractExtractor {

    /** Type flag handed to the constructor and exposed by {@link #getTypeFlag()}. */
    private Integer typeFlag = null ;

    /**
     * Creates an extractor carrying the given type flag.
     *
     * @param typeFlag value later returned by {@link #getTypeFlag()}
     */
    public PDFExtractor(Integer typeFlag) {
        this.typeFlag = typeFlag ;
    }

    /**
     * @return the type flag passed to the constructor
     */
    public Integer getTypeFlag() {
        return typeFlag;
    }

    /**
     * Extracts the text of {@code f} and stores it via {@code setlContent}.
     *
     * @param f file to parse; receives the extracted text
     * @throws Exception declared by the signature; extraction failures inside
     *                   {@code getContent} are printed there and yield ""
     */
    public void paserFileToString(LingJoinFile f) throws Exception {
        String text = this.getContent(f);
        f.setlContent(text) ;
    }

    /**
     * Extracts the text of {@code f} and stores a reader over it via
     * {@code setlContentReader}.
     *
     * @param f file to parse; receives the reader
     * @throws Exception declared by the signature; extraction failures inside
     *                   {@code getContent} are printed there and yield an empty reader
     */
    public void paserFileToReader(LingJoinFile f) throws Exception {
        BufferedReader reader = this.getContentReader(f);
        f.setlContentReader(reader) ;
    }

    /**
     * Wraps the extracted text of {@code f} in a {@link BufferedReader}.
     */
    private BufferedReader getContentReader(LingJoinFile f) {
        String text = this.getContent(f);
        return new BufferedReader(new StringReader(text));
    }

    /**
     * Loads {@code f} as a PDF, copies its metadata onto {@code f}, and returns
     * the stripped text.
     *
     * @param f file to load
     * @return the document text, or "" when loading or stripping raised an
     *         {@link IOException} (the stack trace is printed)
     */
    private String getContent(LingJoinFile f) {
        String text = "";
        PDDocument pdf = null ;
        try {
            pdf = PDDocument.load(f);
            PDFTextStripper textStripper = new PDFTextStripper();
            // Record the file's metadata before stripping the text.
            this.setLingJoinFileInfo(f, pdf.getDocumentInformation());
            text = textStripper.getText(pdf);
        } catch (IOException e) {
            // Also covers FileNotFoundException, which is an IOException.
            e.printStackTrace();
        } finally {
            if (pdf != null) {
                try {
                    pdf.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return text;
    }

    /**
     * Copies document metadata onto {@code f}. Only the author is copied, and
     * only when it is non-null; modification date and title are not propagated
     * (that code was disabled in the original).
     *
     * @param f    target of the metadata
     * @param info metadata read from the PDF
     */
    private void setLingJoinFileInfo(LingJoinFile f, PDDocumentInformation info) {
        String author = info.getAuthor();
        if (author == null) {
            return;
        }
        // Re-read from info so the getter is invoked exactly as often as before.
        f.setlAuthor(info.getAuthor());
    }
}
复制代码

 

posted on   *(00)*  阅读(769)  评论(0)  编辑  收藏  举报
编辑推荐:
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
阅读排行:
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 园子的第一款AI主题卫衣上架——"HELLO! HOW CAN I ASSIST YOU TODAY
· 【自荐】一款简洁、开源的在线白板工具 Drawnix
点击右上角即可分享
微信分享提示