pdf转图片,提取文字,提取图片
1、使用pdfbox
<dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.21</version> </dependency>
2、code
import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; /** * @Author: xu.dm * @Date: 2020/10/27 17:06 * @Version: 1.0 * @Description: pdf转图片,提取文字,提取图片 **/ public class PdfUtil { /** * 转换全部的pdf * * @param filename PDF文件全路径 * @param type 图片类型 */ public static void pdf2png(String filename, String type) { // 将pdf装图片 并且自定义图片得格式大小 File file = new File(filename); String parentPath = file.getParent(); String name = file.getName(); try { PDDocument doc = PDDocument.load(file); PDFRenderer renderer = new PDFRenderer(doc); int pageCount = doc.getNumberOfPages(); for (int i = 0; i < pageCount; i++) { BufferedImage image = renderer.renderImageWithDPI(i, 144); // Windows native DPI ImageIO.write(image, type, new File(parentPath + "/" + name + "_" + (i + 1) + "." + type)); } } catch (IOException e) { e.printStackTrace(); } } /** * 自由确定起始页和终止页 * * @param filename PDF文件全路径 * @param indexOfStart 开始页 开始转换的页码,从0开始 * @param indexOfEnd 结束页 停止转换的页码,-1为全部 * @param type 图片类型 */ public static void pdf2png(String filename, int indexOfStart, int indexOfEnd, String type) { // 将pdf装图片 并且自定义图片得格式大小 File file = new File(filename); String parentPath = file.getParent(); String name = file.getName(); try { PDDocument doc = PDDocument.load(file); PDFRenderer renderer = new PDFRenderer(doc); int pageCount = doc.getNumberOfPages(); for (int i = indexOfStart; i < indexOfEnd; i++) { BufferedImage image = renderer.renderImageWithDPI(i, 144); // Windows native DPI ImageIO.write(image, type, new File(parentPath + "\\" + name + "_" + (i + 1) + "." + type)); } } catch (IOException e) { e.printStackTrace(); } } /** * 提取pdf中的文字,例如:用word转存pdf,那么里面文字就可以提取,如果是图片转存pdf不能提取 * @param filename pdf全路径 */ public static String extractText(String filename) { File file = new File(filename); try { PDDocument doc = PDDocument.load(file); int pages = doc.getNumberOfPages(); // 读文本内容 PDFTextStripper stripper = new PDFTextStripper(); // 设置按顺序输出 stripper.setSortByPosition(true); stripper.setStartPage(1); stripper.setEndPage(pages); return stripper.getText(doc); } catch (IOException e) { e.printStackTrace(); } return null; } /** * 从PDF中提取图片 * @param filename pdf全路径 * @param type 图片类型,后缀 */ public static void extractImage(String filename, String type) { File file = new File(filename); String parentPath = file.getParent(); String name = file.getName(); try { PDDocument doc = PDDocument.load(file); int pages = doc.getNumberOfPages(); int j = 0; for (int i = 0; i < pages; i++) { PDPage page = doc.getPage(i); PDResources resources = page.getResources(); Iterable<COSName> xObjectNames = resources.getXObjectNames(); if (xObjectNames == null) continue; for (COSName cosName : xObjectNames) { if (resources.isImageXObject(cosName)) { PDImageXObject image = (PDImageXObject) resources.getXObject(cosName); BufferedImage bufferedImage = image.getImage(); ImageIO.write(bufferedImage, type, new File(parentPath + "\\" + name + "_" + (++j) + "." + type)); } } } } catch (IOException e) { e.printStackTrace(); } } }