Java解析word,excel,pdf

这个是我在做Lucene全文检索的时候写的。郁闷的是,我们的环境用的是JDK 1.4,而解析Office 2007必须用到POI 3.5以上版本,POI 3.5又必须运行在JDK 1.5以上的版本,

只好寻求其他方法了。

package org.gaoyoubo.resolve;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;

/**
 * Extracts plain text from plain-text, Word, Excel and PDF files.
 *
 * <p>Every {@code resolveXxx} method is best-effort: on failure it reports the
 * problem on {@code System.err} and returns whatever text was collected so far
 * (possibly the empty string) instead of throwing.
 *
 * <p>NOTE(review): the XSSF/XWPF/HWPF/OPC and PDFBox types used below are not
 * covered by the import list visible at the top of this file - confirm the
 * matching imports exist for the library versions in use.
 */
public class Resolve {

    /** Written after every spreadsheet cell so adjacent cell values do not merge into one token. */
    private static final String CELL_SEPARATOR = " ";

    /** Written after every spreadsheet row. */
    private static final String ROW_SEPARATOR = "\n";

    /**
     * Returns the extension of an existing file.
     *
     * <p>The extension is taken from the last path segment only, so a dot in a
     * directory name is never mistaken for the start of an extension.
     *
     * @param path file path; may be {@code null} or empty
     * @return the text after the last dot of the file name, or the empty string
     *         when the path is null/empty, the file does not exist, or the
     *         file name contains no dot
     */
    public String getExt(String path) {
        if (path == null || "".equals(path)) {
            return "";
        }
        File file = new File(path);
        if (!file.exists()) {
            return "";
        }
        String fileName = file.getName();
        int dot = fileName.lastIndexOf('.');
        if (dot < 0) {
            // No dot at all: there is no extension (previously the whole path was returned).
            return "";
        }
        return fileName.substring(dot + 1);
    }

    /**
     * Extracts the text of a file, choosing the parser from the file extension.
     *
     * @param path file path
     * @return the extracted text, or the empty string when the file cannot be
     *         parsed (missing file, no extension, unsupported type, parse error)
     */
    public String execute(String path) {
        String ext = getExt(path);
        if (ext == null || "".equals(ext) || !Const.FILE_TYPE_LIST.contains(ext)) {
            System.err.println("无法解析文件:" + path + "!");
            return "";
        }

        if (ext.equals("txt")) {
            return resolveText(path);
        }
        if (ext.equals("doc")) {
            return resolveWord2003(path);
        }
        if (ext.equals("docx")) {
            return resolveWord2007(path);
        }
        if (ext.equals("pdf")) {
            return resolvePdf(path);
        }
        if (ext.equals("xls")) {
            return resolveExcel2003(path);
        }
        if (ext.equals("xlsx")) {
            return resolveExcel2007(path);
        }

        // Listed as supported in Const.FILE_TYPE_LIST but no parser is wired up here.
        System.err.println("无法解析文件:" + path + "!");
        return "";
    }

    /**
     * Extracts the text of a Word 2007 (.docx) document.
     *
     * @param path file path
     * @return the document text, or the empty string on failure
     */
    public String resolveWord2007(String path) {
        String content = "";
        OPCPackage opcPackage = null;
        try {
            opcPackage = POIXMLDocument.openPackage(path);
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            content = extractor.getText();
        } catch (Exception e) {
            reportFailure("解析文件:" + path + "失败!", e);
        } finally {
            if (opcPackage != null) {
                try {
                    // revert() releases the package WITHOUT writing anything back to the file.
                    opcPackage.revert();
                } catch (Exception ignored) {
                    // Nothing useful can be done if releasing the package fails.
                }
            }
        }
        return content;
    }

    /**
     * Extracts the text of a Word 97-2003 (.doc) document.
     *
     * @param path file path
     * @return the document text, or the empty string on failure
     */
    public String resolveWord2003(String path) {
        String content = "";
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(path));
            WordExtractor extractor = new WordExtractor(fis);
            content = extractor.getText();
        } catch (Exception e) {
            reportFailure("解析文件:" + path + "失败!", e);
        } finally {
            closeQuietly(fis);
        }
        return content;
    }

    /**
     * Extracts the cell values of an Excel 97-2003 (.xls) workbook.
     *
     * <p>Each cell value is followed by a space and each row by a newline.
     *
     * @param path file path
     * @return the text of all cells of all sheets; on failure, whatever was
     *         collected before the error
     */
    public String resolveExcel2003(String path) {
        StringBuffer content = new StringBuffer("");
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(path));
            HSSFWorkbook workbook = new HSSFWorkbook(fis);

            for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
                HSSFSheet sheet = workbook.getSheetAt(i);
                if (sheet == null) {
                    continue;
                }
                // getLastRowNum() is the 0-based index of the last row, hence "<=".
                for (int j = 0; j <= sheet.getLastRowNum(); j++) {
                    HSSFRow row = sheet.getRow(j);
                    if (row == null) {
                        // Rows that were never written are returned as null.
                        continue;
                    }
                    for (int k = 0; k < row.getLastCellNum(); k++) {
                        HSSFCell cell = row.getCell(k);
                        if (cell == null) {
                            continue;
                        }
                        if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                            content.append(cell.getNumericCellValue());
                        } else if (cell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {
                            content.append(cell.getBooleanCellValue());
                        } else {
                            content.append(cell.getStringCellValue());
                        }
                        content.append(CELL_SEPARATOR);
                    }
                    content.append(ROW_SEPARATOR);
                }
            }
        } catch (Exception e) {
            reportFailure("解析文件:" + path + "失败!", e);
        } finally {
            closeQuietly(fis);
        }
        return content.toString();
    }

    /**
     * Extracts the cell values of an Excel 2007 (.xlsx) workbook.
     *
     * <p>Each cell value is followed by a space and each row by a newline.
     *
     * @param path file path
     * @return the text of all cells of all sheets; on failure, whatever was
     *         collected before the error
     */
    public String resolveExcel2007(String path) {
        StringBuffer content = new StringBuffer("");
        XSSFWorkbook workbook = null;
        try {
            workbook = new XSSFWorkbook(path);

            for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
                XSSFSheet sheet = workbook.getSheetAt(i);
                if (sheet == null) {
                    continue;
                }
                // getLastRowNum() is the 0-based index of the last row, hence "<=".
                for (int j = 0; j <= sheet.getLastRowNum(); j++) {
                    XSSFRow row = sheet.getRow(j);
                    if (row == null) {
                        continue;
                    }
                    for (int k = 0; k < row.getLastCellNum(); k++) {
                        XSSFCell cell = row.getCell(k);
                        if (cell == null) {
                            continue;
                        }
                        if (cell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
                            content.append(cell.getBooleanCellValue());
                        } else if (cell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
                            content.append(cell.getNumericCellValue());
                        } else {
                            content.append(cell.getStringCellValue());
                        }
                        content.append(CELL_SEPARATOR);
                    }
                    content.append(ROW_SEPARATOR);
                }
            }
        } catch (Exception e) {
            reportFailure("解析文件:" + path + "失败!", e);
        } finally {
            if (workbook != null) {
                try {
                    // Release the underlying package without saving anything back to the file.
                    workbook.getPackage().revert();
                } catch (Exception ignored) {
                    // Nothing useful can be done if releasing the package fails.
                }
            }
        }
        return content.toString();
    }

    /**
     * Extracts the text of a PDF document.
     *
     * @param path file path
     * @return the document text, or the empty string on failure
     */
    public String resolvePdf(String path) {
        String content = "";
        PDDocument doc = null;
        try {
            doc = PDDocument.load(new File(path));
            PDFTextStripper stripper = new PDFTextStripper();
            // Take the text directly as a String; going through bytes in the
            // platform default charset would lose characters it cannot encode.
            content = stripper.getText(doc);
        } catch (Exception e) {
            reportFailure("解析文件:" + path + "失败!", e);
        } finally {
            if (doc != null) {
                try {
                    doc.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing the document fails.
                }
            }
        }
        return content;
    }

    /**
     * Reads a plain-text file.
     *
     * <p>The file is decoded with the platform default charset (that is what
     * {@link FileReader} uses) and returned unchanged, line terminators included.
     *
     * @param path file path
     * @return the file content; on failure, whatever was read before the error
     */
    public String resolveText(String path) {
        StringBuffer content = new StringBuffer("");
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(new File(path)));
            char[] chunk = new char[4096];
            int count;
            while ((count = br.read(chunk, 0, chunk.length)) != -1) {
                content.append(chunk, 0, count);
            }
        } catch (Exception e) {
            reportFailure("读取文件:" + path + "失败!", e);
        } finally {
            if (br != null) {
                try {
                    // Closing the BufferedReader also closes the wrapped FileReader.
                    br.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing the reader fails.
                }
            }
        }
        return content.toString();
    }

    /** Prints a failure message followed by the exception that caused it. */
    private void reportFailure(String message, Exception cause) {
        System.err.println(message);
        System.err.println("\t" + cause);
    }

    /** Closes the stream if it was opened, ignoring any error raised by the close itself. */
    private void closeQuietly(FileInputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (Exception ignored) {
            // Nothing useful can be done if closing the stream fails.
        }
    }

}

 

 

posted @ 2010-06-23 09:51  Me疯子_(~  阅读(1199)  评论(0编辑  收藏  举报