Java 识别 PDF 文本内容

import java.io.IOException;
 
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
 
/**
 * Demonstrates two ways of extracting the text of a PDF file: with iText
 * ({@link #getPdfFileText(String)}) and with Apache PDFBox
 * ({@link #getPdfFileText2(String)}).
 */
public class PDFReader {

    /** PDF that is processed when no path is supplied on the command line. */
    private static final String DEFAULT_PDF_PATH = "C:/ceshi/001.pdf";

    /**
     * Prints the text of a PDF file to standard output using PDFBox.
     *
     * @param args optional; when present, {@code args[0]} is the path of the
     *             PDF to read, otherwise {@code C:/ceshi/001.pdf} is used
     * @throws IOException declared by {@link #getPdfFileText2(String)}
     */
    public static void main(String[] args) throws IOException {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        getPdfFileText2(path);
    }

    /**
     * Extracts the text of every page of a PDF using iText and returns it as
     * one string. Pages are processed in order (1-based) and their text is
     * concatenated without any separator.
     *
     * @param fileName path of the PDF file to read
     * @return the concatenated text of all pages
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getPdfFileText(String fileName) throws IOException {
        PdfReader reader = new PdfReader(fileName);
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            // Local, single-threaded accumulator: StringBuilder is sufficient.
            StringBuilder text = new StringBuilder();
            int pageCount = reader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                // A fresh strategy per page so each result holds only that page's text.
                TextExtractionStrategy strategy =
                        parser.processContent(page, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText());
            }
            return text.toString();
        } finally {
            // Release the underlying file even when parsing fails part-way.
            reader.close();
        }
    }

    /**
     * Extracts the text of a PDF using PDFBox and prints it to standard output.
     *
     * <p>An {@link IOException} raised while loading, reading or closing the
     * document is caught here and its stack trace is printed; it is not
     * rethrown. The {@code throws} clause is kept so existing callers compile
     * unchanged.
     *
     * @param filePath path of the PDF file to read
     * @throws IOException never thrown by the current implementation
     */
    public static void getPdfFileText2(String filePath) throws IOException {
        // try-with-resources closes the document on both success and failure.
        try (PDDocument document = PDDocument.load(new File(filePath))) {
            PDFTextStripper stripper = new PDFTextStripper();
            System.out.println(stripper.getText(document));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

  

posted @ 2023-05-09 16:25  Amy清风  阅读(1797)  评论(0)  编辑  收藏  举报