获取pdf、doc/docx文本数据
1、依赖关系
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.17</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.17</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.17</version>
</dependency>
2、代码
package com.lucene.util;

import com.zxf.lucene.common.consts.FileSuffix;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;

/**
 * Static helpers for pulling plain text out of PDF, DOC and DOCX files and for
 * small file-name manipulations.
 */
public final class TextUtil {

    private static final Logger logger = LoggerFactory.getLogger(TextUtil.class);

    private TextUtil() {
    }

    /**
     * Extracts the text of a PDF, DOC or DOCX file with all whitespace removed.
     *
     * <p>The format is chosen from the file-name suffix (case-insensitive). Any
     * extraction failure is logged and reported as an empty string rather than
     * thrown.
     *
     * @param filepath path of the file to read; may be {@code null}
     * @return the file's text without any whitespace characters, or {@code ""}
     *         when {@code filepath} is {@code null}, is not a regular file, has an
     *         unsupported suffix, or cannot be parsed
     */
    public static String getTextOfFile(String filepath) {
        if (filepath == null) {
            return "";
        }
        File file = new File(filepath);
        if (!file.isFile()) {
            return "";
        }

        String suffix = getSuffix(file.getName());
        String text;
        if (FileSuffix.PDF.equalsIgnoreCase(suffix)) {
            // PDDocument.load(File) is the PDFBox 2.x entry point; the document is the
            // only resource opened here, so one try-with-resources covers both the
            // success and the failure path.
            try (PDDocument document = PDDocument.load(file)) {
                text = new PDFTextStripper().getText(document);
            } catch (Exception e) {
                logger.error("获取pdf文本信息出错", e);
                return "";
            }
        } else if (FileSuffix.DOCX.equalsIgnoreCase(suffix)) {
            try (XWPFWordExtractor extractor = new XWPFWordExtractor(OPCPackage.open(file))) {
                text = extractor.getText();
            } catch (Exception e) {
                logger.error("获取word文档(.docx)文本信息出错", e);
                return "";
            }
        } else if (FileSuffix.DOC.equalsIgnoreCase(suffix)) {
            // The stream is declared as its own resource so it is closed even when
            // the WordExtractor constructor throws.
            try (FileInputStream in = new FileInputStream(file);
                 WordExtractor extractor = new WordExtractor(in)) {
                text = extractor.getText();
            } catch (Exception e) {
                logger.error("获取word文档(.doc)文本信息出错", e);
                return "";
            }
        } else {
            return "";
        }

        if (text == null) {
            return "";
        }
        // \s already matches \r, \n and \t, so a single pass is enough.
        return text.trim().replaceAll("\\s+", "");
    }

    /**
     * Returns the part of a file name after its last {@code '.'}.
     *
     * @param string file name to inspect; may be {@code null}
     * @return the suffix without the dot, or {@code ""} when {@code string} is
     *         {@code null}, contains no dot, or ends with a dot
     */
    public static String getSuffix(String string) {
        if (string == null) {
            return "";
        }
        int dotIndex = string.lastIndexOf('.');
        // Without this check a dot-less name such as "pdf" would be returned whole
        // and then be mistaken for a real extension.
        if (dotIndex < 0) {
            return "";
        }
        return string.substring(dotIndex + 1);
    }

    /**
     * Removes the characters {@code / : * ? " < > \ |} from the given value.
     *
     * @param fieldValue text to clean; must not be {@code null}
     * @return {@code fieldValue} with every listed character removed
     */
    public static String clearIllegalCharacter(String fieldValue) {
        return fieldValue.replaceAll("[\\/:\\*\\?\"<>\\\\|]", "");
    }
}
人生没有彩排,每天都是现场直播!