Java读取各种文件格式内容
所需的jar包我也不太记得了(主要是 Apache POI、PDFBox 和 commons-io),大家可以搜搜,直接上代码:
import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.text.NumberFormat; import org.apache.commons.io.FileUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.poi.POIXMLDocument; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.xssf.usermodel.XSSFCell; import org.apache.poi.xssf.usermodel.XSSFRow; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; /** *文件内容读取转换器 */ public class ReadFileConverter { public String getContents(String path) throws Exception { String contents = ""; int index = path.lastIndexOf("."); String file_suffix = path.substring(index+1).toLowerCase(); if(file_suffix.equalsIgnoreCase("txt")||file_suffix.equalsIgnoreCase("log")){ contents = this.readTXT(path); } else if(file_suffix.equalsIgnoreCase("xls")){ contents = this.readXLS(path); } else if(file_suffix.equalsIgnoreCase("xlsx")){ contents = this.readXLSX(path); } else if(file_suffix.equalsIgnoreCase("doc")){ contents = this.readDOC(path); } else if(file_suffix.equalsIgnoreCase("docx")){ contents = this.readDOCX(path); } else if(file_suffix.equalsIgnoreCase("pdf")){ contents = this.readPDF(path); } return contents; } public String readXLS(String file) throws Exception { StringBuilder content = new StringBuilder(); HSSFWorkbook workbook = new HSSFWorkbook(new FileInputStream(file)); try{ for(int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++){ if (null != workbook.getSheetAt(numSheets)){ HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet for(int rowNumOfSheet = 0; 
rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++){ if (null != aSheet.getRow(rowNumOfSheet)){ HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行 for(short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++){ if (null != aRow.getCell(cellNumOfRow)){ HSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值 if (this.convertCell(aCell).length() > 0){ content.append(this.convertCell(aCell)); } } content.append("\n"); } } } } } } catch(Exception e){ content.append("xls文件格式不对或损坏"); } finally{ if(workbook!=null){ workbook.close(); } } return content.toString(); } public String readXLSX(String file) throws Exception { StringBuilder content = new StringBuilder(); XSSFWorkbook workbook = new XSSFWorkbook(file); try{ for(int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++){ if (null != workbook.getSheetAt(numSheets)){ XSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet for(int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++){ if (null != aSheet.getRow(rowNumOfSheet)){ XSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行 for(short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++){ if (null != aRow.getCell(cellNumOfRow)){ XSSFCell aCell = aRow.getCell(cellNumOfRow);// 获得列值 if (this.convertCell(aCell).length() > 0){ content.append(this.convertCell(aCell)); } } content.append("\n"); } } } } } }catch(Exception e){ content.append("xlsx文件格式不对或损坏"); } finally{ if(workbook!=null){ workbook.close(); } } return content.toString(); } public String readTXT(String file) throws Exception { String contents = ""; try{ String encoding = this.get_charset(new File(file)); if (encoding.equalsIgnoreCase("GBK")) { contents = FileUtils.readFileToString(new File(file), "gbk"); } else { contents = FileUtils.readFileToString(new File(file), "utf8"); } }catch(Exception e){ contents = "txt文件格式不对或损坏"; } return contents; } public String readDOC(String file) throws Exception { String returnStr; 
WordExtractor wordExtractor = new WordExtractor(new FileInputStream(new File(file))); try{ returnStr = wordExtractor.getText(); }catch(Exception e){ returnStr="doc文件格式不对或损坏"; } finally{ if(wordExtractor != null){ wordExtractor.close(); } } return returnStr; } public String readDOCX(String file) throws Exception { String docx; XWPFWordExtractor xwp= new XWPFWordExtractor(POIXMLDocument.openPackage(file)); try{ docx= xwp.getText(); }catch(Exception e){ docx="docx文件格式不对或损坏"; } finally{ if(xwp !=null){ xwp.close(); } } return docx; } public String readPDF(String file) throws Exception { String result = null; FileInputStream is = null; PDDocument document = null; try{ is = new FileInputStream(file); document = PDDocument.load(is); PDFTextStripper stripper = new PDFTextStripper(); result = stripper.getText(document); }catch(Exception e){ result="pdf文件格式不对或损坏"; } finally{ if (is != null){ is.close(); } if (document != null){ document.close(); } } return result; } private String get_charset(File file) throws IOException { String charset = "GBK"; byte[] first3Bytes = new byte[3]; BufferedInputStream bis = null; try { boolean checked = false; bis = new BufferedInputStream(new FileInputStream(file)); bis.mark(0); int read = bis.read(first3Bytes, 0, 3); if (read == -1) return charset; if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) { charset = "UTF-16LE"; checked = true; } else if (first3Bytes[0] == (byte) 0xFE&& first3Bytes[1] == (byte) 0xFF) { charset = "UTF-16BE"; checked = true; } else if (first3Bytes[0] == (byte) 0xEF&& first3Bytes[1] == (byte) 0xBB&& first3Bytes[2] == (byte) 0xBF) { charset = "UTF-8"; checked = true; } bis.reset(); if (!checked) { // int len = 0; int loc = 0; while ((read = bis.read()) != -1) { loc=loc+1; if (read >= 0xF0) break; if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK break; if (0xC0 <= read && read <= 0xDF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF) // (0x80 // - 0xBF),也可能在GB编码内 
continue; else break; } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小 read = bis.read(); if (0x80 <= read && read <= 0xBF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) { charset = "UTF-8"; break; } else break; } else break; } } } } catch (Exception e) { e.printStackTrace(); } finally { if (bis != null) { bis.close(); } } return charset; } @SuppressWarnings("deprecation") private String convertCell(Cell cell) { NumberFormat formater = NumberFormat.getInstance(); formater.setGroupingUsed(false); String cellValue = ""; if (cell == null) { return cellValue; } switch (cell.getCellTypeEnum()) { case NUMERIC: cellValue = formater.format(cell.getNumericCellValue()); break; case STRING: cellValue = cell.getStringCellValue(); break; case BLANK: cellValue = cell.getStringCellValue(); break; case BOOLEAN: cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString(); break; case ERROR: cellValue = String.valueOf(cell.getErrorCellValue()); break; default: cellValue = ""; } return cellValue.trim(); } }
版权声明:如需转载,请注明出处!PS:如是随便转载,请忽略