Java 识别 PDF 文本内容

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import java.io.IOException;
  
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
 
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
  
public class PDFReader {
  
 /**
  * @param args
  * @throws IOException
  */
 public static void main(String[] args) throws IOException {
  //System.out.print(getPdfFileText("C:/ceshi/001.pdf"));
  getPdfFileText2("C:/ceshi/001.pdf");
 
 }
  
 public static String getPdfFileText(String fileName) throws IOException {
  PdfReader reader = new PdfReader(fileName);
  PdfReaderContentParser parser = new PdfReaderContentParser(reader);
  StringBuffer buff = new StringBuffer();
  TextExtractionStrategy strategy;
  for (int i = 1; i <= reader.getNumberOfPages(); i++) {
   strategy = parser.processContent(i,
     new SimpleTextExtractionStrategy());
   buff.append(strategy.getResultantText());
  }
  return buff.toString();
 }
 
 
 public static void getPdfFileText2(String filePath) throws IOException {
   try {
    PDDocument document = PDDocument.load(new File(filePath));
    PDFTextStripper stripper = new PDFTextStripper();
    String text = stripper.getText(document);
    System.out.println(text);
    document.close();
   } catch (IOException e) {
    e.printStackTrace();
   }
 }
 
 
}

  

posted @   Amy清风  阅读(1803)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
点击右上角即可分享
微信分享提示