Apache Tika是一个用于文件类型检测和文件内容提取的工具库,其中PDF解析器(PDFParser)可以读取PDF文件的内容。
所用jar包:
<!-- tika-core and tika-parsers are released together; keep both on the same
     version so the parsers run against the core API they were built for. -->
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-core</artifactId>
    <version>1.20</version>
</dependency>
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-parsers</artifactId>
    <version>1.20</version>
</dependency>
public static void main(String[] args) { File file =new File("D:\\101.pdf"); BodyContentHandler handler=new BodyContentHandler(); //元数据对象 Metadata metadata=new Metadata(); FileInputStream inputStream=new FileInputStream(file); ParseContext parseContext=new ParseContext(); // PDFParser pdfParser=new PDFParser(); pdfParser.parse(inputStream, handler, metadata, parseContext); System.out.println("文件属性信息:"); for(String name: metadata.names()){ System.out.println(name+":"+metadata.get(name)); } System.out.println("pdf文件内容:"); System.out.println(handler.toString()); }