java 获取pdf内容

1. 说明

将 pdf 中的文字读取出来还有一些限制:1. 文档的安全属性不能过于严格;2. 不能存在图片。

2. 直接贴相关的源码

有两种读取方式(pdfbox 和 itext),对应的 maven pom 依赖如下:

<!-- Libraries needed by the two extraction examples that follow. -->
<dependencies>
  	<!-- Apache PDFBox: provides the org.apache.pdfbox.* classes imported by PdfboxUtil. -->
  	<dependency>
	    <groupId>org.apache.pdfbox</groupId>
	    <artifactId>pdfbox</artifactId>
	    <version>1.8.8</version>
	</dependency>
	<!-- iText: provides the com.itextpdf.text.pdf.* classes imported by ItextpdfUtil. -->
	<dependency>
	    <groupId>com.itextpdf</groupId>
	    <artifactId>itextpdf</artifactId>
	    <version>5.0.6</version>
	</dependency>
  </dependencies>

2.1 pdfbox

/**
 * PdfboxUtil.java
 */
package com.hsm.pdfTest;
 
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStream;
 
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
 
/**
 * Extracts the text of a PDF with Apache PDFBox and writes it to a file.
 *
 * @author hsm
 */
public class PdfboxUtil {
	/** PDF that {@link #main(String[])} reads. */
	private static final String PDFPATH = "D:/Maven权威指南中文版.pdf";
	/** File that {@link #main(String[])} writes the extracted text to. */
	private static final String FILEPATH = "D:/Maven权威指南中文版.doc";

	/**
	 * Reads {@link #PDFPATH} and writes its text to {@link #FILEPATH}.
	 *
	 * @param args unused
	 * @throws Exception if the PDF cannot be opened, parsed or read
	 */
	public static void main(String[] args) throws Exception {
		String content = getPdfContent(PDFPATH);
		toFile(content, FILEPATH);
	}

	/**
	 * Returns the text of every page of the given PDF.
	 *
	 * @param pdfPath path of the PDF file to read
	 * @return the text produced by {@code PDFTextStripper} for pages 1 through the last page
	 * @throws Exception if the file cannot be opened, parsed or its text extracted
	 */
	private static String getPdfContent(String pdfPath) throws Exception {
		boolean sort = false; // whether to sort the text by position
		int startPage = 1;    // first page to extract

		// try-with-resources closes the stream even when parsing fails.
		try (InputStream input = new FileInputStream(new File(pdfPath))) {
			// Load the PDF document.
			PDFParser parser = new PDFParser(input);
			parser.parse();
			PDDocument document = parser.getPDDocument();
			try {
				// Extract the text of the whole page range.
				PDFTextStripper pts = new PDFTextStripper();
				pts.setSortByPosition(sort);
				int endPage = document.getNumberOfPages(); // last page to extract
				System.out.println("Total Page: " + endPage);
				pts.setStartPage(startPage);
				pts.setEndPage(endPage);
				String content = pts.getText(document);
				System.out.println("Get PDF Content ...");
				return content;
			} finally {
				// Close the document before the stream, and independently of it,
				// so a failure closing one cannot leak the other.
				document.close();
			}
		}
	}

	/**
	 * Writes {@code content} to {@code filePath}, replacing any existing file.
	 * The text is written as-is with the platform default charset, so the
	 * result is plain text whatever extension the path carries. Failures are
	 * printed to standard error instead of being propagated.
	 *
	 * @param content  text to write
	 * @param filePath path of the file to create or overwrite
	 */
	private static void toFile(String content, String filePath) {
		System.out.println("Write PDF Content to txt file ...");
		// FileWriter creates the file when it is missing; try-with-resources
		// guarantees the writer is flushed and closed even if write() throws.
		try (BufferedWriter output = new BufferedWriter(new FileWriter(new File(filePath)))) {
			output.write(content);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

2.2 itext

package com.hsm.pdfTest;
 
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
/**
 * Extracts the text of a PDF with iText, either into a string or straight
 * into a file.
 *
 * @author hsm
 */
public class ItextpdfUtil {
	/** PDF that {@link #main(String[])} reads. */
	private static final String PDFPATH = "D:/Maven权威指南中文版.pdf";
	/** File that {@link #main(String[])} writes the extracted text to. */
	private static final String FILEPATH = "D:/Maven权威指南中文版.doc";

	/**
	 * Prints the text of {@link #PDFPATH} to standard output, then writes it
	 * to {@link #FILEPATH}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		String content = getPdfContent(PDFPATH);
		System.out.println(content);

		toFile(PDFPATH, FILEPATH);
	}

	/**
	 * Returns the text of every page of the given PDF, extracted with
	 * {@code SimpleTextExtractionStrategy}.
	 *
	 * @param pdfPath path of the PDF file to read
	 * @return the concatenated page texts; on an {@code IOException} the stack
	 *         trace is printed and whatever was extracted so far is returned
	 */
	private static String getPdfContent(String pdfPath) {
		PdfReader reader = null;
		StringBuilder buff = new StringBuilder();
		try {
			reader = new PdfReader(pdfPath);
			PdfReaderContentParser parser = new PdfReaderContentParser(reader);
			int num = reader.getNumberOfPages(); // number of pages
			for (int i = 1; i <= num; i++) { // iText pages are 1-based
				TextExtractionStrategy strategy =
						parser.processContent(i, new SimpleTextExtractionStrategy());
				buff.append(strategy.getResultantText());
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Release the reader's underlying file handle.
			if (reader != null) {
				reader.close();
			}
		}
		return buff.toString();
	}

	/**
	 * Extracts the text of every page of {@code pdfPath} with
	 * {@code PdfTextExtractor} and writes it to {@code filePath}. The output
	 * file is opened only after the whole PDF has been read, so an unreadable
	 * PDF does not truncate an existing output file. Failures are printed to
	 * standard error instead of being propagated.
	 *
	 * @param pdfPath  path of the PDF file to read
	 * @param filePath path of the file to create or overwrite
	 */
	private static void toFile(String pdfPath, String filePath) {
		PdfReader reader = null;
		try {
			reader = new PdfReader(pdfPath);
			int num = reader.getNumberOfPages(); // number of pages
			System.out.println("Total Page: " + num);
			StringBuilder content = new StringBuilder(); // accumulates the extracted text
			for (int i = 1; i <= num; i++) {
				// Read the text of page i.
				content.append(PdfTextExtractor.getTextFromPage(reader, i));
			}
			try (PrintWriter writer = new PrintWriter(new FileOutputStream(filePath))) {
				writer.write(content.toString());
				writer.flush();
				// PrintWriter never throws on write failures; surface them explicitly.
				if (writer.checkError()) {
					throw new IOException("Failed to write extracted text to " + filePath);
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Release the reader's underlying file handle.
			if (reader != null) {
				reader.close();
			}
		}
	}
}
posted @ 2017-10-12 16:10  雾里看花的少年  阅读(2063)  评论(0编辑  收藏  举报