最近需要解析 PDF,于是写了个小工具。
功能:解析 PDF 中的文本和图片;PDF 转 HTML;HTML 转 PDF。
依赖:
<dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.19</version> </dependency>
import lombok.extern.slf4j.Slf4j; import org.apache.commons.io.IOUtils; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripperByArea; import javax.imageio.ImageIO; import java.awt.*; import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; import java.io.*; import java.util.Base64; import java.util.HashMap; import java.util.Iterator; import java.util.Map; /*************************** *<pre> * @Project Name : sea-dep-service * @Package : com.sea.bx * @File Name : PDFParserUtils * @Author : Sea * @Mail : lshan523@163.com * @Date : 2023/4/17 12:23 * @Purpose : * @History : Sea *</pre> ***************************/ @Slf4j public class PDFParserUtils { public final static String TEXT="text"; public final static String IMAGE="image"; /** * 获取文本 * @param file * @return */ public static String readPDF(File file){ return readPDF(file, false).getOrDefault(TEXT, "")+""; } /** * 获取文本 * @param inputStream * @return */ public static String readPDF(InputStream inputStream){ return readPDF(inputStream, false).getOrDefault(TEXT, "")+""; } /** * 获取文本 和图片 * @param file * @param isImgRead * @return {"text":"...", "imageMap":{'fileName':'byte[]'}} */ public static Map<String, Object> readPDF(File file, Boolean isImgRead) { Map<String, Object> result = new HashMap<>(); if(file==null||!file.exists()){return result;} try { return readPDF(new FileInputStream(file), isImgRead); } catch (FileNotFoundException e) { log.error("parse pdf error : {}",e); } return result; } /** * 读取 文本 和 图片 * @param inputStream * @param isImgRead 是否读取 图片 * @return 
{"text":"...", "imageMap":{'fileName':'byte[]'}} */ public static Map<String, Object> readPDF(InputStream inputStream, Boolean isImgRead) { Map<String, Object> result = new HashMap<>(); if (inputStream == null) { return result; } //收集图片 Map<String, byte[]> imageFileAndByteMap = new HashMap<>(); //收集文本 StringBuilder sb = new StringBuilder(""); PDDocument doc = null; try { PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(inputStream)); parser.parse(); doc = parser.getPDDocument(); PDFTextStripper textStripper = new PDFTextStripper(); for (int i = 1; i <= doc.getNumberOfPages(); i++) { textStripper.setStartPage(i); textStripper.setEndPage(i); // 一次输出多个页时,按顺序输出 textStripper.setSortByPosition(true); String s = textStripper.getText(doc); sb.append(s); //读取图片 if(isImgRead){getImage(doc, i,imageFileAndByteMap);} } doc.close(); } catch (Exception e) { e.printStackTrace(); log.info("parse PDF error {}", e.getMessage()); } finally { if (doc != null) { try { doc.close(); } catch (IOException e) { e.printStackTrace(); } } } result.put(TEXT,sb.toString()); result.put(IMAGE,imageFileAndByteMap); return result; } /** * 读取每一页中的屙图片 返回map fileName byte[] * @param doc * @param pageIndex 从 1 开始 * @throws Exception */ private static Map<String, byte[]> getImage(PDDocument doc,int pageIndex,Map<String, byte[]> imageFileAndByteMap) throws Exception{ PDPage page = doc.getPage(pageIndex - 1); PDResources resources = page.getResources(); // 获取页中的对象 Iterable<COSName> xobjects = resources.getXObjectNames(); if (xobjects != null) { Iterator<COSName> imageIter = xobjects.iterator(); while (imageIter.hasNext()) { COSName cosName = imageIter.next(); String fileName = cosName.getName(); if (resources.isImageXObject(cosName)) { // 获取每页资源的图片 PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); ImageIO.write(ixt.getImage(), ixt.getSuffix(), outputStream); 
imageFileAndByteMap.put(fileName+"."+ixt.getSuffix(),outputStream.toByteArray()); } } } return imageFileAndByteMap; } /** * 读取文本内容和图片 * * @param file 文件路徑 */ public static void readTextImage(File file) { if (file == null) { return; } PDDocument doc = null; try { doc = PDDocument.load(file); PDFTextStripper textStripper = new PDFTextStripper(); for (int i = 1; i <= doc.getNumberOfPages(); i++) { textStripper.setStartPage(i); textStripper.setEndPage(i); // String s = textStripper.getText(doc); // 读取图片 PDPage page = doc.getPage(i - 1); PDResources resources = page.getResources(); // 获取页中的对象 Iterable<COSName> xobjects = resources.getXObjectNames(); if (xobjects != null) { Iterator<COSName> imageIter = xobjects.iterator(); while (imageIter.hasNext()) { COSName cosName = imageIter.next(); boolean isImageXObject = resources.isImageXObject(cosName); if (isImageXObject) { // 获取每页资源的图片 PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName); File outputfile = new File("第 " + (i) + " 页" + cosName.getName() + "."+ ixt.getSuffix()); ImageIO.write(ixt.getImage(), ixt.getSuffix(), outputfile); } } } } doc.close(); } catch (IOException e) { e.printStackTrace(); } finally { if (doc != null) { try { doc.close(); } catch (IOException e) { e.printStackTrace(); } } } } /** * 读取指定区域 * @param file * @param x 指定的x坐标 * @param y 指定的y坐标 * @param width 矩形的宽度 * @param height 矩形的高度 * @return */ public static String readRectangle(File file, int x, int y, int width, int height) { if (file == null) { return ""; } PDDocument doc = null; try { doc = PDDocument.load(file); // y轴向下为正,x轴向右为正。 PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea(); stripperByArea.setSortByPosition(true); // 划定区域 Rectangle2D rect = new Rectangle(x, y, width, height); stripperByArea.addRegion("area", rect); PDPage page = doc.getPage(1); stripperByArea.extractRegions(page); // 获取区域的text String text = stripperByArea.getTextForRegion("area"); text = text.trim(); doc.close(); return text; } catch 
(IOException e) { e.printStackTrace(); } finally { if (doc != null) { try { doc.close(); } catch (IOException e) { e.printStackTrace(); } } } return ""; } /** * byte数组转换成16进制字符串 * @param src * @return */ private static String bytesToHexString(byte[] src) { StringBuilder stringBuilder = new StringBuilder(); if (src == null || src.length <= 0) { return null; } for (int i = 0; i < src.length; i++) { int v = src[i] & 0xFF; String hv = Integer.toHexString(v); if (hv.length() < 2) { stringBuilder.append(0); } stringBuilder.append(hv); } return stringBuilder.toString(); } private static final String PRE_HTML_CODE = "<html><head><meta charset=\"UTF-8\"></head>" + "<body style=\"background-color:gray;\"><style>" + "img {background-color:#fff; text-align:center; " + "width:100%; max-width:100%;margin-top:6px;}</style>"; private static final String SUF_HTML_CODE = "</body></html>"; private static final String MID_HTML_CODE = "<img src=\"data:image/png;base64,"; private static final String MIDD_HTML_CODE = "\">"; /** * pdf转html */ public static String pdfToHtml(InputStream inputStream) { StringBuilder sb = new StringBuilder(); sb.append(PRE_HTML_CODE); PDDocument document = null; ByteArrayOutputStream outputStream = null; try { document = PDDocument.load(inputStream); int pages = document.getNumberOfPages(); PDFRenderer render = new PDFRenderer(document); BufferedImage image; for (int i = 0; i < pages; i++) { sb.append(MID_HTML_CODE); outputStream = new ByteArrayOutputStream(); image = render.renderImage(i, 2.5f); ImageIO.write(image, "png", outputStream); sb.append(Base64.getEncoder().encodeToString(outputStream.toByteArray())); sb.append(MIDD_HTML_CODE); } } catch (IOException e) { log.error(e.getMessage()); } finally { if (null != outputStream) { try { outputStream.close(); } catch (IOException ex) { } } if (null != document) { try { document.close(); } catch (IOException ex) { } } } sb.append(SUF_HTML_CODE); return sb.toString(); } /** * Sea write 2023-04-20 * @throws 
Exception */ // @Test public void testReadPDFIMg() throws Exception { File file = new File("C:\\Users\\Sea\\Downloads\\seatest.pdf"); file = new File("C:\\Users\\Sea\\Downloads\\2.pdf"); // PDFParserUtils.readTextImage(file); Map<String, Object> stringObjectMap = PDFParserUtils.readPDF(file, true); Map<String, byte[]> imgs = (Map<String, byte[]>) stringObjectMap.get(PDFParserUtils.IMAGE); Object o = stringObjectMap.get(PDFParserUtils.TEXT); System.err.println( " text : " + o); imgs.forEach((filename,bt)->{ try { FileOutputStream fileOutputStream = new FileOutputStream(filename); IOUtils.write(bt, fileOutputStream); } catch (Exception e) { e.printStackTrace(); } }); } /** 常用文件的文件头如下: JPEG (jpg),文件头:FFD8FF PNG (png),文件头:89504E47 GIF (gif),文件头:47494638 TIFF (tif),文件头:49492A00 Windows Bitmap (bmp),文件头:424D CAD (dwg),文件头:41433130 Adobe Photoshop (psd),文件头:38425053 Rich Text Format (rtf),文件头:7B5C727466 XML (xml),文件头:3C3F786D6C HTML (html),文件头:68746D6C3E Email [thorough only] (eml),文件头:44656C69766572792D646174653A Outlook Express (dbx),文件头:CFAD12FEC5FD746F Outlook (pst),文件头:2142444E MS Word/Excel (xls.or.doc),文件头:D0CF11E0 MS Access (mdb),文件头:5374616E64617264204A WordPerfect (wpd),文件头:FF575043 Postscript. 
(eps.or.ps),文件头:252150532D41646F6265 Adobe Acrobat (pdf),文件头:255044462D312E Quicken (qdf),文件头:AC9EBD8F Windows Password (pwl),文件头:E3828596 ZIP Archive (zip),文件头:504B0304 RAR Archive (rar),文件头:52617221 Wave (wav),文件头:57415645 AVI (avi),文件头:41564920 Real Audio (ram),文件头:2E7261FD Real Media (rm),文件头:2E524D46 MPEG (mpg),文件头:000001BA MPEG (mpg),文件头:000001B3 Quicktime (mov),文件头:6D6F6F76 Windows Media (asf),文件头:3026B2758E66CF11 MIDI (mid),文件头:4D546864 */ /** * 根據io流前4個字節,判斷文件類型 * @param ioBytes * @return */ private static String getFileType(byte[] ioBytes) throws Exception { if (ioBytes == null || ioBytes.length < 4) { log.error("非正常文件"); throw new Exception("Abnormal image file."); } byte[] b = new byte[4]; for (int i = 0; i < 4; i++) { b[i] = ioBytes[i]; } String type = bytesToHexString(b).toUpperCase(); if (type.contains("25504446")) { return "PDF"; } else if (type.contains("504B0304")) { return "ZIP"; } else if (type.contains("52617221")) { return "RAR"; } return ""; } }
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?