获取 pdf 关键字坐标
package Demo.qd; import com.itextpdf.awt.geom.Rectangle2D.Float; import com.itextpdf.text.pdf.PdfDictionary; import com.itextpdf.text.pdf.PdfName; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.parser.*; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.math.BigDecimal; import java.util.ArrayList; import java.util.List; public class PdfPositionTool { public static void main(String[] args) throws Exception { PdfPositionTool pdfPositionTool = new PdfPositionTool(); List<double[]> positions = pdfPositionTool.getPositions("/Users/yourouniu/Desktop/111.pdf", "%盖章处%"); if (positions != null && positions.size() > 0) { for (double[] position : positions) { System.out.println("pageNum: " + (int) position[0]); System.out.println("x: " + position[1]); System.out.println("y: " + position[2]); } } } /** * @return List<float [ ]> 坐标数组:float[0]:页码,float[1]:x ,float[2]:y * @Description 获取关键字坐标 * @Param filePath:pdf 路径 * @Param keyword:关键字 */ public List<double[]> getPositions(String filePath, String keyword) throws IOException { PdfPositionTool pdfPositionTool = new PdfPositionTool(); //1.给定文件 File pdfFile = new File(filePath); //2.定义一个byte数组,长度为文件的长度 byte[] pdfData = new byte[(int) pdfFile.length()]; //3.IO流读取文件内容到byte数组 FileInputStream inputStream = null; try { inputStream = new FileInputStream(pdfFile); inputStream.read(pdfData); } catch (IOException e) { throw e; } finally { if (inputStream != null) { try { inputStream.close(); } catch (IOException e) { } } } //5.调用方法,给定关键字和文件 List<double[]> positions = pdfPositionTool.findKeywordPostions(pdfData, keyword); return positions; } /** * @Description pdf 坐标转换为 ofd 坐标,比值为 25.4/72 ,该转换存在误差 * 最好的转换方式为按距离原点的百分比计算 */ private double transForPosition(double pdfPosition) { double ofdPosition = pdfPosition * 25.4 / 72; return ofdPosition; } /** * @param pdfData 通过IO流 PDF文件转化的byte数组 * @param keyword 关键字 * @return List<float [ ]> : float[0]:pageNum float[1]:x float[2]:y * @throws IOException */ public List<double[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException { List<double[]> result = new ArrayList<>(); List<PdfPageContentPositions> pdfPageContentPositions = getPdfContentPostionsList(pdfData); for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) { List<double[]> charPositions = findPositions(keyword, pdfPageContentPosition); if (charPositions == null || charPositions.size() < 1) { continue; } result.addAll(charPositions); } return result; } private List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException { PdfReader reader = new PdfReader(pdfData); List<PdfPageContentPositions> result = new ArrayList<>(); int pages = reader.getNumberOfPages(); for (int pageNum = 1; pageNum <= pages; pageNum++) { float width = reader.getPageSize(pageNum).getWidth(); float height = reader.getPageSize(pageNum).getHeight(); PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height); //解析pdf,定位位置 PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener); PdfDictionary pageDic = reader.getPageN(pageNum); PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES); try { processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic); } catch (IOException e) { reader.close(); throw e; } String content = pdfRenderListener.getContent(); List<CharPosition> charPositions = pdfRenderListener.getcharPositions(); List<double[]> positionsList = new ArrayList<>(); for (CharPosition charPosition : charPositions) { double[] positions = new double[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()}; positionsList.add(positions); } PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions(); pdfPageContentPositions.setContent(content); pdfPageContentPositions.setPostions(positionsList); result.add(pdfPageContentPositions); } reader.close(); return result; } private static List<double[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) { List<double[]> result = new ArrayList<>(); String content = pdfPageContentPositions.getContent(); List<double[]> charPositions = pdfPageContentPositions.getPositions(); for (int pos = 0; pos < content.length(); ) { int positionIndex = content.indexOf(keyword, pos); if (positionIndex == -1) { break; } double[] postions = charPositions.get(positionIndex); result.add(postions); pos = positionIndex + 1; } return result; } private class PdfPageContentPositions { private String content; private List<double[]> positions; public String getContent() { return content; } public void setContent(String content) { this.content = content; } public List<double[]> getPositions() { return positions; } public void setPostions(List<double[]> positions) { this.positions = positions; } } private class PdfRenderListener implements RenderListener { private int pageNum; private float pageWidth; private float pageHeight; private StringBuilder contentBuilder = new StringBuilder(); private List<CharPosition> charPositions = new ArrayList<>(); public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) { this.pageNum = pageNum; this.pageWidth = pageWidth; this.pageHeight = pageHeight; } public void beginTextBlock() { } /** * @Description 计算转换后的 ofd 坐标值 * 如有需要,可转为计算距离原点的百分比值。在知道 ofd 长宽的情况下,用百分比重新计算坐标更精确 */ public void renderText(TextRenderInfo renderInfo) { List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos(); for (TextRenderInfo textRenderInfo : characterRenderInfos) { String word = textRenderInfo.getText(); if (word.length() > 1) { word = word.substring(word.length() - 1, word.length()); } Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange(); float x = (float) rectangle.getX(); float y = (float) rectangle.getY(); //这两个是关键字在所在页面的XY轴的百分比 float xPercent = Math.round(x / pageWidth * 10000) / 10000f; // pdf 原点在左下,ofd 原点在左上 float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f; CharPosition charPosition = new CharPosition(pageNum, transForPosition(x), transForPosition((yPercent) * pageHeight)); charPositions.add(charPosition); contentBuilder.append(word); } } public void endTextBlock() { } public void renderImage(ImageRenderInfo renderInfo) { } public String getContent() { return contentBuilder.toString(); } public List<CharPosition> getcharPositions() { return charPositions; } } private class CharPosition { private int pageNum = 0; private double x = 0; private double y = 0; public CharPosition(int pageNum, double x, double y) { this.pageNum = pageNum; this.x = x; this.y = y; } public int getPageNum() { return pageNum; } public double getX() { return x; } public double getY() { return y; } @Override public String toString() { return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]"; } } }
当你看清人们的真相,于是你知道了,你可以忍受孤独