PDFUtils (解析PDF 中的文本和图片 PDF 转 HTML HTML 转 PDF)

最近用到解析 PDF，整了个小工具。

功能：解析 PDF 中的文本和图片；PDF 转 HTML；HTML 转 PDF。

依赖：

　　　　  <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.19</version>
        </dependency>


import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.Base64;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/***************************
 *<pre>
 * @Project Name : sea-dep-service
 * @Package      : com.sea.bx
 * @File Name    : PDFParserUtils
 * @Author       : Sea
 * @Mail         : lshan523@163.com
 * @Date         : 2023/4/17 12:23
 * @Purpose      :
 * @History      : Sea
 *</pre>
 ***************************/

@Slf4j
public class PDFParserUtils {
    /** Result-map key under which {@code readPDF} stores the extracted text. */
    public static final String TEXT = "text";

    /** Result-map key under which {@code readPDF} stores the extracted images. */
    public static final String IMAGE = "image";

    /**
     * Extracts only the text of a PDF file (images are not read).
     *
     * @param file the PDF file to read
     * @return the value stored under {@link #TEXT} by {@link #readPDF(File, Boolean)},
     *         rendered as a string; the empty string when that key is absent
     */
    public static String readPDF(File file) {
        Map<String, Object> parsed = readPDF(file, false);
        Object text = parsed.getOrDefault(TEXT, "");
        return String.valueOf(text);
    }

    /**
     * Extracts only the text of a PDF supplied as a stream (images are not read).
     *
     * @param inputStream the PDF content
     * @return the value stored under {@link #TEXT} by {@link #readPDF(InputStream, Boolean)},
     *         rendered as a string; the empty string when that key is absent
     */
    public static String readPDF(InputStream inputStream) {
        Map<String, Object> parsed = readPDF(inputStream, false);
        Object text = parsed.getOrDefault(TEXT, "");
        return String.valueOf(text);
    }

    /**
     * Extracts the text and, optionally, the images of a PDF file.
     *
     * <p>Delegates to {@link #readPDF(InputStream, Boolean)}. The file stream opened here is
     * always closed by this method, whether or not the delegate also closes it.
     *
     * @param file      the PDF file to read; {@code null} or a non-existent file yields an empty map
     * @param isImgRead whether embedded images should be extracted as well
     * @return a map with the keys {@link #TEXT} (a {@code String}) and {@link #IMAGE}
     *         (a {@code Map<String, byte[]>} of file name to image bytes); an empty map when
     *         the file is missing or cannot be opened
     */
    public static Map<String, Object> readPDF(File file, Boolean isImgRead) {
        if (file == null || !file.exists()) {
            return new HashMap<>();
        }
        // try-with-resources guarantees the file handle is released on every path
        try (InputStream in = new FileInputStream(file)) {
            return readPDF(in, isImgRead);
        } catch (IOException e) {
            // trailing Throwable argument makes SLF4J log the stack trace
            log.error("parse pdf error : {}", file.getPath(), e);
            return new HashMap<>();
        }
    }

    /**
     * Extracts the text and, optionally, the images of a PDF supplied as a stream.
     *
     * <p>Text is collected page by page, sorted by position. A failure while parsing is logged
     * and whatever was collected so far is returned. A failure while extracting the images of
     * one page is logged and does not stop text extraction of the remaining pages.
     *
     * @param inputStream the PDF content; {@code null} yields an empty map
     * @param isImgRead   whether embedded images should be extracted; {@code null} is treated
     *                    as {@code false}
     * @return a map with the keys {@link #TEXT} (a {@code String}) and {@link #IMAGE}
     *         (a {@code Map<String, byte[]>} of file name to image bytes); an empty map when
     *         {@code inputStream} is {@code null}
     */
    public static Map<String, Object> readPDF(InputStream inputStream, Boolean isImgRead) {
        Map<String, Object> result = new HashMap<>();
        if (inputStream == null) {
            return result;
        }
        // Boolean may be null; unboxing it directly would throw a NullPointerException
        boolean readImages = Boolean.TRUE.equals(isImgRead);
        // collected images: file name -> encoded bytes
        Map<String, byte[]> imageFileAndByteMap = new HashMap<>();
        // collected text of all pages
        StringBuilder sb = new StringBuilder();
        RandomAccessBufferedFileInputStream source = null;
        PDDocument doc = null;
        try {
            source = new RandomAccessBufferedFileInputStream(inputStream);
            PDFParser parser = new PDFParser(source);
            parser.parse();
            doc = parser.getPDDocument();
            PDFTextStripper textStripper = new PDFTextStripper();
            // emit text in reading order rather than content-stream order
            textStripper.setSortByPosition(true);
            int pageCount = doc.getNumberOfPages();
            for (int i = 1; i <= pageCount; i++) {
                textStripper.setStartPage(i);
                textStripper.setEndPage(i);
                sb.append(textStripper.getText(doc));
                if (readImages) {
                    try {
                        getImage(doc, i, imageFileAndByteMap);
                    } catch (Exception e) {
                        // a broken image must not cost us the text of the remaining pages
                        log.error("read PDF image error on page {}", i, e);
                    }
                }
            }
        } catch (Exception e) {
            log.error("parse PDF error {}", e.getMessage(), e);
        } finally {
            // Close the document when one was created (PDFBox 2.0 PDDocument.close is expected
            // to release the parser's source too); otherwise close the source directly so it
            // does not leak when parsing failed early.
            Closeable toClose = (doc != null) ? doc : source;
            if (toClose != null) {
                try {
                    toClose.close();
                } catch (IOException e) {
                    log.warn("close PDF error {}", e.getMessage(), e);
                }
            }
        }
        result.put(TEXT, sb.toString());
        result.put(IMAGE, imageFileAndByteMap);
        return result;
    }


    /**
     * Collects the image XObjects of one page into {@code imageFileAndByteMap}.
     *
     * <p>Each image is stored under {@code <resourceName>.<suffix>}. When the suffix reported by
     * PDFBox is {@code null} or no {@link ImageIO} writer accepts the image in that format, the
     * image is encoded as PNG instead and stored with a {@code .png} suffix. When a key is
     * already present (the same resource name used on an earlier page), the new entry is stored
     * under {@code page<pageIndex>_<resourceName>.<suffix>} so that earlier images are not
     * overwritten.
     *
     * @param doc                 the open document
     * @param pageIndex           page number, starting at 1
     * @param imageFileAndByteMap target map of file name to encoded image bytes; modified in place
     * @return the same {@code imageFileAndByteMap} instance
     * @throws Exception if the page or one of its images cannot be read or encoded
     */
    private static Map<String, byte[]> getImage(PDDocument doc, int pageIndex, Map<String, byte[]> imageFileAndByteMap) throws Exception {
        PDPage page = doc.getPage(pageIndex - 1);
        PDResources resources = page.getResources();
        // a page without a resource dictionary has no images
        if (resources == null) {
            return imageFileAndByteMap;
        }
        Iterable<COSName> xobjects = resources.getXObjectNames();
        if (xobjects == null) {
            return imageFileAndByteMap;
        }
        for (COSName cosName : xobjects) {
            if (!resources.isImageXObject(cosName)) {
                continue;
            }
            PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
            BufferedImage image = ixt.getImage();
            String suffix = ixt.getSuffix();
            ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
            // ImageIO.write returns false when no writer handles the format/image combination
            boolean written = suffix != null && ImageIO.write(image, suffix, outputStream);
            if (!written) {
                outputStream.reset();
                suffix = "png";
                written = ImageIO.write(image, suffix, outputStream);
            }
            if (!written) {
                log.warn("skip image {} on page {}: no suitable image writer", cosName.getName(), pageIndex);
                continue;
            }
            String fileName = cosName.getName() + "." + suffix;
            if (imageFileAndByteMap.containsKey(fileName)) {
                // resource names such as "Im0" repeat across pages; keep both images
                fileName = "page" + pageIndex + "_" + fileName;
            }
            imageFileAndByteMap.put(fileName, outputStream.toByteArray());
        }
        return imageFileAndByteMap;
    }


    /**
     * Writes every image XObject of a PDF to a file in the current working directory.
     *
     * <p>Files are named {@code "第 <page> 页<resourceName>.<suffix>"} with pages counted from 1.
     * When the suffix reported by PDFBox is {@code null} or no {@link ImageIO} writer accepts the
     * image in that format, the image is written as PNG with a {@code .png} suffix instead.
     * I/O failures are logged and end the extraction.
     *
     * @param file the PDF file; {@code null} is ignored
     */
    public static void readTextImage(File file) {
        if (file == null) {
            return;
        }
        // try-with-resources closes the document exactly once on every path
        try (PDDocument doc = PDDocument.load(file)) {
            int pageCount = doc.getNumberOfPages();
            for (int i = 1; i <= pageCount; i++) {
                PDResources resources = doc.getPage(i - 1).getResources();
                // a page without a resource dictionary has no images
                if (resources == null) {
                    continue;
                }
                Iterable<COSName> xobjects = resources.getXObjectNames();
                if (xobjects == null) {
                    continue;
                }
                for (COSName cosName : xobjects) {
                    if (!resources.isImageXObject(cosName)) {
                        continue;
                    }
                    PDImageXObject ixt = (PDImageXObject) resources.getXObject(cosName);
                    BufferedImage image = ixt.getImage();
                    String suffix = ixt.getSuffix();
                    String baseName = "第 " + (i) + " 页" + cosName.getName() + ".";
                    // ImageIO.write returns false when no writer handles the format/image combination
                    boolean written = suffix != null && ImageIO.write(image, suffix, new File(baseName + suffix));
                    if (!written && !ImageIO.write(image, "png", new File(baseName + "png"))) {
                        log.warn("skip image {} on page {}: no suitable image writer", cosName.getName(), i);
                    }
                }
            }
        } catch (IOException e) {
            log.error("read PDF images error: {}", file.getPath(), e);
        }
    }



    /**
     * Reads the text inside a rectangular region of the first page of a PDF.
     *
     * <p>Earlier revisions passed the literal {@code 1} to the zero-based
     * {@code PDDocument.getPage}, which selected the second page and threw
     * {@link IndexOutOfBoundsException} for single-page documents. Callers that need another
     * page should use {@link #readRectangle(File, int, int, int, int, int)}.
     *
     * @param file   the PDF file
     * @param x      x coordinate of the region (x grows to the right)
     * @param y      y coordinate of the region (y grows downward)
     * @param width  width of the region
     * @param height height of the region
     * @return the trimmed text of the region, or the empty string when the file is
     *         {@code null}, has no pages, or cannot be read
     */
    public static String readRectangle(File file, int x, int y, int width, int height) {
        return readRectangle(file, 1, x, y, width, height);
    }

    /**
     * Reads the text inside a rectangular region of the given page of a PDF.
     *
     * @param file       the PDF file
     * @param pageNumber page number, starting at 1
     * @param x          x coordinate of the region (x grows to the right)
     * @param y          y coordinate of the region (y grows downward)
     * @param width      width of the region
     * @param height     height of the region
     * @return the trimmed text of the region, or the empty string when the file is
     *         {@code null}, the page does not exist, or the file cannot be read
     */
    public static String readRectangle(File file, int pageNumber, int x, int y, int width, int height) {
        if (file == null) {
            return "";
        }
        // try-with-resources closes the document exactly once on every path
        try (PDDocument doc = PDDocument.load(file)) {
            if (pageNumber < 1 || pageNumber > doc.getNumberOfPages()) {
                log.warn("page {} out of range, document has {} page(s)", pageNumber, doc.getNumberOfPages());
                return "";
            }
            PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();
            stripperByArea.setSortByPosition(true);
            // define the region to extract
            Rectangle2D rect = new Rectangle(x, y, width, height);
            stripperByArea.addRegion("area", rect);
            // PDDocument.getPage is zero-based
            PDPage page = doc.getPage(pageNumber - 1);
            stripperByArea.extractRegions(page);
            String text = stripperByArea.getTextForRegion("area");
            return text == null ? "" : text.trim();
        } catch (IOException e) {
            log.error("read PDF region error: {}", file.getPath(), e);
            return "";
        }
    }


    /**
     * Converts a byte array to its lowercase hexadecimal representation, two digits per byte.
     *
     * @param src the bytes to convert
     * @return the hex string, or {@code null} when {@code src} is {@code null} or empty
     */
    private static String bytesToHexString(byte[] src) {
        if (src == null || src.length == 0) {
            return null;
        }
        final char[] digits = "0123456789abcdef".toCharArray();
        char[] hex = new char[src.length * 2];
        int pos = 0;
        for (byte b : src) {
            // high nibble first, then low nibble
            hex[pos++] = digits[(b >> 4) & 0x0F];
            hex[pos++] = digits[b & 0x0F];
        }
        return new String(hex);
    }






    /** Opening of the generated HTML page: head, body tag and the style applied to page images. */
    private static final String PRE_HTML_CODE =
            "<html><head><meta charset=\"UTF-8\"></head><body style=\"background-color:gray;\">"
            + "<style>img {background-color:#fff; text-align:center; width:100%; max-width:100%;margin-top:6px;}</style>";

    /** Closing of the generated HTML page. */
    private static final String SUF_HTML_CODE = "</body></html>";

    /** Start of an {@code img} tag with a PNG data URI; the Base64 payload follows it. */
    private static final String MID_HTML_CODE = "<img src=\"data:image/png;base64,";

    /** End of an {@code img} tag opened with {@code MID_HTML_CODE}. */
    private static final String MIDD_HTML_CODE = "\">";
    /**
     * Converts a PDF to a single HTML page in which every PDF page is an inline PNG image.
     *
     * <p>Each page is rendered at scale 2.5 and embedded as a Base64 data URI. An {@code img}
     * tag is appended only after its page has been rendered and encoded, so the returned
     * markup stays well-formed even when rendering fails part-way; in that case the failure
     * is logged and the pages converted so far are returned.
     *
     * @param inputStream the PDF content; {@code null} yields a page without images
     * @return the HTML document as a string
     */
    public static String pdfToHtml(InputStream inputStream) {
        StringBuilder sb = new StringBuilder(PRE_HTML_CODE);
        if (inputStream != null) {
            // try-with-resources closes the document on every path
            try (PDDocument document = PDDocument.load(inputStream)) {
                PDFRenderer render = new PDFRenderer(document);
                int pages = document.getNumberOfPages();
                for (int i = 0; i < pages; i++) {
                    // render and encode first; only then emit the complete tag
                    BufferedImage image = render.renderImage(i, 2.5f);
                    ByteArrayOutputStream pageBytes = new ByteArrayOutputStream();
                    ImageIO.write(image, "png", pageBytes);
                    sb.append(MID_HTML_CODE)
                      .append(Base64.getEncoder().encodeToString(pageBytes.toByteArray()))
                      .append(MIDD_HTML_CODE);
                }
            } catch (IOException e) {
                log.error("pdf to html error: {}", e.getMessage(), e);
            }
        }
        sb.append(SUF_HTML_CODE);
        return sb.toString();
    }



    /**
     * Manual smoke test (Sea, 2023-04-20): reads a local PDF, prints its text to
     * {@code System.err} and writes every extracted image into the current working directory.
     *
     * <p>Only the last path component of each image name is used as the output file name,
     * because the names originate from the PDF and must not be able to address other directories.
     *
     * @throws Exception declared for use as a test method
     */
//    @Test
    public void testReadPDFIMg() throws Exception
    {
        File file = new File("C:\\Users\\Sea\\Downloads\\2.pdf");
        Map<String, Object> stringObjectMap = PDFParserUtils.readPDF(file, true);
        Object o = stringObjectMap.get(PDFParserUtils.TEXT);
        System.err.println( " text : " + o);
        Object rawImages = stringObjectMap.get(PDFParserUtils.IMAGE);
        // readPDF returns an empty map when the file is missing, so the key may be absent
        if (!(rawImages instanceof Map)) {
            return;
        }
        @SuppressWarnings("unchecked") // readPDF stores a Map<String, byte[]> under IMAGE
        Map<String, byte[]> imgs = (Map<String, byte[]>) rawImages;
        for (Map.Entry<String, byte[]> entry : imgs.entrySet()) {
            String safeName = new File(entry.getKey()).getName();
            // try-with-resources flushes and closes each output file
            try (FileOutputStream fileOutputStream = new FileOutputStream(safeName)) {
                fileOutputStream.write(entry.getValue());
            } catch (IOException e) {
                log.error("write image {} error", safeName, e);
            }
        }
    }





    /** 常用文件的文件头如下：
     JPEG (jpg)，文件头：FFD8FF
     PNG (png)，文件头：89504E47
     GIF (gif)，文件头：47494638
     TIFF (tif)，文件头：49492A00
     Windows Bitmap (bmp)，文件头：424D
     CAD (dwg)，文件头：41433130
     Adobe Photoshop (psd)，文件头：38425053
     Rich Text Format (rtf)，文件头：7B5C727466
     XML (xml)，文件头：3C3F786D6C
     HTML (html)，文件头：68746D6C3E
     Email [thorough only] (eml)，文件头：44656C69766572792D646174653A
     Outlook Express (dbx)，文件头：CFAD12FEC5FD746F
     Outlook (pst)，文件头：2142444E
     MS Word/Excel (xls.or.doc)，文件头：D0CF11E0
     MS Access (mdb)，文件头：5374616E64617264204A
     WordPerfect (wpd)，文件头：FF575043
     Postscript. (eps.or.ps)，文件头：252150532D41646F6265
     Adobe Acrobat (pdf)，文件头：255044462D312E
     Quicken (qdf)，文件头：AC9EBD8F
     Windows Password (pwl)，文件头：E3828596
     ZIP Archive (zip)，文件头：504B0304
     RAR Archive (rar)，文件头：52617221
     Wave (wav)，文件头：57415645
     AVI (avi)，文件头：41564920
     Real Audio (ram)，文件头：2E7261FD
     Real Media (rm)，文件头：2E524D46
     MPEG (mpg)，文件头：000001BA
     MPEG (mpg)，文件头：000001B3
     Quicktime (mov)，文件头：6D6F6F76
     Windows Media (asf)，文件头：3026B2758E66CF11
     MIDI (mid)，文件头：4D546864
     */
    /**
     * Determines the file type from the first four bytes of the content.
     *
     * @param ioBytes the file content
     * @return {@code "PDF"}, {@code "ZIP"} or {@code "RAR"} when the first four bytes match the
     *         corresponding signature; the empty string otherwise
     * @throws Exception if {@code ioBytes} is {@code null} or shorter than four bytes
     */
    private static String getFileType(byte[] ioBytes) throws Exception {
        if (ioBytes == null || ioBytes.length < 4) {
            log.error("非正常文件");
            throw new Exception("Abnormal image file.");
        }
        byte[] header = new byte[4];
        System.arraycopy(ioBytes, 0, header, 0, header.length);
        // exactly eight hex digits, so an equality match is the same as a substring match
        String signature = bytesToHexString(header).toUpperCase();
        switch (signature) {
            case "25504446":
                return "PDF";
            case "504B0304":
                return "ZIP";
            case "52617221":
                return "RAR";
            default:
                return "";
        }
    }

}

posted on 2023-04-20 12:27 lshan 阅读(525) 评论(0) 收藏举报

PDFUtils (解析PDF 中的文本 和 图片 PDF 转 HTML HTML 转 PDF)

PDFUtils (解析PDF 中的文本和图片 PDF 转 HTML HTML 转 PDF)