Java 使用 POI 读取 Word、使用 PDFBox 读取 PDF

🤔从各个博客 CV 出来的,不好意思

pom

	<dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml-schemas -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- NOTE(review): both poi-ooxml-schemas (above) and ooxml-schemas (below) are declared.
             They presumably ship overlapping schema classes; confirm against the Apache POI
             documentation whether only one of the two should be kept. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>ooxml-schemas</artifactId>
            <version>1.4</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-compress</artifactId>
            <version>1.21</version>
        </dependency>
        <!-- Read PDF content -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.12</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>fontbox</artifactId>
            <version>2.0.12</version>
        </dependency>	

按段落 读取 docx

    /**
     * Reads an uploaded .docx file paragraph by paragraph.
     *
     * <p>Parses the upload as an {@code XWPFDocument} and extracts the text of each paragraph
     * returned by {@code getParagraphs()}. Checked exceptions raised while opening or parsing
     * the file propagate to the caller via {@code @SneakyThrows}.
     *
     * @param file the uploaded .docx file
     */
    @SneakyThrows
    private void readDocx(MultipartFile file) {
        // try-with-resources closes the document and then the upload stream, even when
        // parsing fails part-way; previously neither was ever closed.
        try (InputStream inputStream = file.getInputStream();
             XWPFDocument document = new XWPFDocument(inputStream)) {
            // Read the paragraphs
            List<XWPFParagraph> paragraphs = document.getParagraphs();
            // NOTE(review): nothing is added to infos in this snippet; presumably each
            // paragraph's text is meant to be mapped into a WordFileInfo here.
            List<WordFileInfo> infos = new ArrayList<>();
            for (XWPFParagraph paragraph : paragraphs) {
                String text = paragraph.getParagraphText();
            }
        }
    }

按段落 读取 doc

    /**
     * Reads an uploaded legacy .doc file paragraph by paragraph.
     *
     * <p>Parses the upload as an {@code HWPFDocument} and extracts the text of every paragraph
     * in the document's overall range. Checked exceptions raised while opening or parsing the
     * file propagate to the caller via {@code @SneakyThrows}.
     *
     * @param file the uploaded .doc file
     */
    @SneakyThrows
    private void readDoc(MultipartFile file) {
        // try-with-resources closes the document and then the upload stream, even when
        // parsing fails part-way; previously neither was ever closed.
        try (InputStream inputStream = file.getInputStream();
             HWPFDocument document = new HWPFDocument(inputStream)) {
            Range range = document.getRange();
            // NOTE(review): nothing is added to infos in this snippet; presumably each
            // paragraph's text is meant to be mapped into a WordFileInfo here.
            List<WordFileInfo> infos = new ArrayList<>();
            // The paragraph count does not change while iterating, so compute it once.
            int paragraphCount = range.numParagraphs();
            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                String text = paragraph.text();
            }
        }
    }

读取 pdf

/**
     * Reads the text content of a PDF file.
     *
     * <p>Loads the PDF from the given stream and extracts the text of all pages, sorted by
     * position on the page. This method is best-effort: any failure while loading or
     * extracting is printed to standard error and an empty string is returned instead of
     * an exception being thrown.
     *
     * @param inputStream stream supplying the PDF bytes; it is not closed by this method
     * @return the extracted text of every page, or an empty string if the PDF could not be read
     */
    private String readPDF(InputStream inputStream) {
        // try-with-resources closes the PDDocument on both the success and the failure path;
        // previously the document was never closed.
        try (PDDocument document = PDDocument.load(inputStream)) {
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text in on-page position order
            stripper.setSortByPosition(true);
            // Cover the whole document: first page through the last page
            stripper.setStartPage(1);
            stripper.setEndPage(document.getNumberOfPages());
            return stripper.getText(document);
        } catch (Exception e) {
            // Deliberately broad: callers rely on getting "" rather than an exception.
            // NOTE(review): no logger is visible in this snippet; switch to the project's
            // logger if one exists.
            e.printStackTrace();
            return "";
        }
    }
posted @ 2022-04-18 16:33  暮雨寒冬  阅读(1098)  评论(0)  编辑  收藏  举报