java识别pdf文本内容
import java.io.IOException; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.parser.PdfReaderContentParser; import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy; import com.itextpdf.text.pdf.parser.TextExtractionStrategy; import java.io.File; import java.io.IOException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; public class PDFReader { /** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { //System.out.print(getPdfFileText("C:/ceshi/001.pdf")); getPdfFileText2("C:/ceshi/001.pdf"); } public static String getPdfFileText(String fileName) throws IOException { PdfReader reader = new PdfReader(fileName); PdfReaderContentParser parser = new PdfReaderContentParser(reader); StringBuffer buff = new StringBuffer(); TextExtractionStrategy strategy; for (int i = 1; i <= reader.getNumberOfPages(); i++) { strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); buff.append(strategy.getResultantText()); } return buff.toString(); } public static void getPdfFileText2(String filePath) throws IOException { try { PDDocument document = PDDocument.load(new File(filePath)); PDFTextStripper stripper = new PDFTextStripper(); String text = stripper.getText(document); System.out.println(text); document.close(); } catch (IOException e) { e.printStackTrace(); } } }