随笔 - 836  文章 - 1 评论 - 40 阅读 - 102万
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

 

最近用到解析 PDF，整理了一个小工具。

功能：解析 PDF 中的文本和图片；PDF 转 HTML；HTML 转 PDF

 

依赖（下面的代码还用到了 lombok 和 commons-io，需自行引入）:

      <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.19</version>
        </dependency>

 

 

 

复制代码

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.Base64;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/***************************
 *<pre>
 * @Project Name : sea-dep-service
 * @Package      : com.sea.bx
 * @File Name    : PDFParserUtils
 * @Author       : Sea
 * @Mail         : lshan523@163.com
 * @Date         : 2023/4/17 12:23
 * @Purpose      :
 * @History      : Sea
 *</pre>
 ***************************/

@Slf4j
public class PDFParserUtils {
    public  final  static  String  TEXT="text";
    public  final  static  String  IMAGE="image";

    /**
     * 获取文本
     * @param file
     * @return
     */
    public static  String readPDF(File file){
        return readPDF(file, false).getOrDefault(TEXT, "")+"";
    }

    /**
     * 获取文本
     * @param inputStream
     * @return
     */
    public static  String readPDF(InputStream inputStream){
        return readPDF(inputStream, false).getOrDefault(TEXT, "")+"";
    }

    /**
     * 获取文本 和图片
     * @param file
     * @param isImgRead
     * @return {"text":"...",  "imageMap":{'fileName':'byte[]'}}
     */
    public static  Map<String, Object> readPDF(File file, Boolean isImgRead) {
        Map<String, Object> result = new HashMap<>();
        if(file==null||!file.exists()){return result;}
        try {
            return  readPDF(new FileInputStream(file), isImgRead);
           } catch (FileNotFoundException e) {
            log.error("parse pdf error : {}",e);
          }
           return result;
    }

    /**
     * 读取 文本 和 图片
     * @param inputStream
     * @param isImgRead  是否读取 图片
     * @return {"text":"...",  "imageMap":{'fileName':'byte[]'}}
     */
    public static  Map<String, Object> readPDF(InputStream inputStream, Boolean isImgRead) {
        Map<String, Object> result = new HashMap<>();
        if (inputStream == null) {
            return result;
        }
        //收集图片
        Map<String, byte[]> imageFileAndByteMap = new HashMap<>();
        //收集文本
        StringBuilder sb = new StringBuilder("");
        PDDocument doc = null;
        try {
            PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(inputStream));
            parser.parse();
            doc = parser.getPDDocument();
            PDFTextStripper textStripper = new PDFTextStripper();
            for (int i = 1; i <= doc.getNumberOfPages(); i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
                // 一次输出多个页时,按顺序输出
                textStripper.setSortByPosition(true);
                String s = textStripper.getText(doc);
                sb.append(s);
                //读取图片
                if(isImgRead){getImage(doc, i,imageFileAndByteMap);}
            }
            doc.close();
        } catch (Exception e) {
            e.printStackTrace();
            log.info("parse PDF error {}", e.getMessage());
        } finally {
            if (doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        result.put(TEXT,sb.toString());
        result.put(IMAGE,imageFileAndByteMap);
        return result;
    }


    /**
     * 读取每一页中的屙图片  返回map fileName  byte[]
     * @param doc
     * @param pageIndex 从 1 开始
     * @throws Exception
     */
    private static Map<String, byte[]>  getImage(PDDocument doc,int pageIndex,Map<String, byte[]> imageFileAndByteMap) throws Exception{
        PDPage page = doc.getPage(pageIndex - 1);
        PDResources resources = page.getResources();
        // 获取页中的对象
        Iterable<COSName> xobjects = resources.getXObjectNames();
        if (xobjects != null) {
            Iterator<COSName> imageIter = xobjects.iterator();
            while (imageIter.hasNext()) {
                COSName cosName = imageIter.next();
                String fileName = cosName.getName();
                if (resources.isImageXObject(cosName)) {
                    // 获取每页资源的图片
                    PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
                    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
                    ImageIO.write(ixt.getImage(), ixt.getSuffix(), outputStream);
                    imageFileAndByteMap.put(fileName+"."+ixt.getSuffix(),outputStream.toByteArray());
                }
            }
        }
        return  imageFileAndByteMap;
    }


    /**
     * 读取文本内容和图片
     *
     * @param file 文件路徑
     */
    public static void readTextImage(File file) {
        if (file == null) {
            return;
        }
        PDDocument doc = null;
        try {
            doc = PDDocument.load(file);
            PDFTextStripper textStripper = new PDFTextStripper();
            for (int i = 1; i <= doc.getNumberOfPages(); i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
//                String s = textStripper.getText(doc);
                // 读取图片
                PDPage page = doc.getPage(i - 1);
                PDResources resources = page.getResources();
                // 获取页中的对象
                Iterable<COSName> xobjects = resources.getXObjectNames();
                if (xobjects != null) {
                    Iterator<COSName> imageIter = xobjects.iterator();
                    while (imageIter.hasNext()) {
                        COSName cosName = imageIter.next();
                        boolean isImageXObject = resources.isImageXObject(cosName);
                        if (isImageXObject) {
                            // 获取每页资源的图片
                            PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
                            File outputfile = new File("" + (i) + "" + cosName.getName() + "."+ ixt.getSuffix());
                            ImageIO.write(ixt.getImage(), ixt.getSuffix(), outputfile);
                        }
                    }
                }
            }
            doc.close();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

    }



    /**
     * 读取指定区域
     * @param file
     * @param x      指定的x坐标
     * @param y      指定的y坐标
     * @param width  矩形的宽度
     * @param height 矩形的高度
     * @return
     */
    public static String readRectangle(File file, int x, int y, int width, int height) {
        if (file == null) {
            return "";
        }
        PDDocument doc = null;
        try {
            doc = PDDocument.load(file);
            // y轴向下为正,x轴向右为正。
            PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
            stripperByArea.setSortByPosition(true);
            // 划定区域
            Rectangle2D rect = new Rectangle(x, y, width, height);
            stripperByArea.addRegion("area", rect);
            PDPage page = doc.getPage(1);
            stripperByArea.extractRegions(page);
            // 获取区域的text
            String text = stripperByArea.getTextForRegion("area");
            text = text.trim();
            doc.close();
            return text;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (doc != null) {
                try {
                    doc.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }


    /**
     * byte数组转换成16进制字符串
     * @param src
     * @return
     */
    private static String bytesToHexString(byte[] src) {
        StringBuilder stringBuilder = new StringBuilder();
        if (src == null || src.length <= 0) {
            return null;
        }
        for (int i = 0; i < src.length; i++) {
            int v = src[i] & 0xFF;
            String hv = Integer.toHexString(v);
            if (hv.length() < 2) {
                stringBuilder.append(0);
            }
            stringBuilder.append(hv);
        }
        return stringBuilder.toString();
    }






    private static final String PRE_HTML_CODE = "<html><head><meta charset=\"UTF-8\"></head>" +
            "<body style=\"background-color:gray;\"><style>" +
            "img {background-color:#fff; text-align:center; " +
            "width:100%; max-width:100%;margin-top:6px;}</style>";
    private static final String SUF_HTML_CODE = "</body></html>";
    private static final String MID_HTML_CODE = "<img src=\"data:image/png;base64,";
    private static final String MIDD_HTML_CODE = "\">";
    /**
     * pdf转html
     */
    public static String pdfToHtml(InputStream inputStream) {
        StringBuilder sb = new StringBuilder();
        sb.append(PRE_HTML_CODE);
        PDDocument document = null;
        ByteArrayOutputStream outputStream = null;
        try {
            document = PDDocument.load(inputStream);
            int pages = document.getNumberOfPages();
            PDFRenderer render = new PDFRenderer(document);
            BufferedImage image;
            for (int i = 0; i < pages; i++) {
                sb.append(MID_HTML_CODE);
                outputStream = new ByteArrayOutputStream();
                image = render.renderImage(i, 2.5f);
                ImageIO.write(image, "png", outputStream);
                sb.append(Base64.getEncoder().encodeToString(outputStream.toByteArray()));
                sb.append(MIDD_HTML_CODE);
            }
        } catch (IOException e) {
            log.error(e.getMessage());
        } finally {
            if (null != outputStream) {
                try {
                    outputStream.close();
                } catch (IOException ex) {
                }
            }
            if (null != document) {
                try {
                    document.close();
                } catch (IOException ex) {
                }
            }
        }
        sb.append(SUF_HTML_CODE);
        return sb.toString();
    }



    /**
     * Sea  write 2023-04-20
     * @throws Exception
     */
//    @Test
    public void testReadPDFIMg() throws Exception
    {
        File file = new File("C:\\Users\\Sea\\Downloads\\seatest.pdf");
        file = new File("C:\\Users\\Sea\\Downloads\\2.pdf");
//        PDFParserUtils.readTextImage(file);
        Map<String, Object> stringObjectMap = PDFParserUtils.readPDF(file, true);
        Map<String, byte[]>  imgs = (Map<String, byte[]>) stringObjectMap.get(PDFParserUtils.IMAGE);
        Object o = stringObjectMap.get(PDFParserUtils.TEXT);
        System.err.println( " text : " + o);
        imgs.forEach((filename,bt)->{
            try {
                FileOutputStream fileOutputStream = new FileOutputStream(filename);
                IOUtils.write(bt, fileOutputStream);
            } catch (Exception e) {
                e.printStackTrace();
            }

        });
    }





    /** 常用文件的文件头如下:
     JPEG (jpg),文件头:FFD8FF
     PNG (png),文件头:89504E47
     GIF (gif),文件头:47494638
     TIFF (tif),文件头:49492A00
     Windows Bitmap (bmp),文件头:424D
     CAD (dwg),文件头:41433130
     Adobe Photoshop (psd),文件头:38425053
     Rich Text Format (rtf),文件头:7B5C727466
     XML (xml),文件头:3C3F786D6C
     HTML (html),文件头:68746D6C3E
     Email [thorough only] (eml),文件头:44656C69766572792D646174653A
     Outlook Express (dbx),文件头:CFAD12FEC5FD746F
     Outlook (pst),文件头:2142444E
     MS Word/Excel (xls.or.doc),文件头:D0CF11E0
     MS Access (mdb),文件头:5374616E64617264204A
     WordPerfect (wpd),文件头:FF575043
     Postscript. (eps.or.ps),文件头:252150532D41646F6265
     Adobe Acrobat (pdf),文件头:255044462D312E
     Quicken (qdf),文件头:AC9EBD8F
     Windows Password (pwl),文件头:E3828596
     ZIP Archive (zip),文件头:504B0304
     RAR Archive (rar),文件头:52617221
     Wave (wav),文件头:57415645
     AVI (avi),文件头:41564920
     Real Audio (ram),文件头:2E7261FD
     Real Media (rm),文件头:2E524D46
     MPEG (mpg),文件头:000001BA
     MPEG (mpg),文件头:000001B3
     Quicktime (mov),文件头:6D6F6F76
     Windows Media (asf),文件头:3026B2758E66CF11
     MIDI (mid),文件头:4D546864
     */
    /**
     * 根據io流前4個字節,判斷文件類型
     * @param ioBytes
     * @return
     */
    private static String getFileType(byte[] ioBytes) throws Exception {
        if (ioBytes == null || ioBytes.length < 4) {
            log.error("非正常文件");
            throw new Exception("Abnormal image file.");
        }
        byte[] b = new byte[4];
        for (int i = 0; i < 4; i++) {
            b[i] = ioBytes[i];
        }
        String type = bytesToHexString(b).toUpperCase();
        if (type.contains("25504446")) {
            return "PDF";
        } else if (type.contains("504B0304")) {
            return "ZIP";
        } else if (type.contains("52617221")) {
            return "RAR";
        }
        return "";
    }

}
复制代码

 

posted on   lshan  阅读(452)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?
点击右上角即可分享
微信分享提示