提取数据(共同区域)

复制代码
package org.pdffolder.pdffolder01;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.utils.StringUtils;

import java.io.File;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//提取指定共同区域的内容
public class BeginArea {
    public static void main(String[] args) throws Exception {
        try {

            String folderPath = "C:\\Users\\kfeng5\\OneDrive - DXC Production\\Desktop\\年假工作";
            File folder = new File(folderPath);
            File[] files = folder.listFiles();

            if (files != null) {
                for (File file : files) {
                    if (file.getName().endsWith(".pdf")) {
                        PDDocument document = PDDocument.load(file);
                        PDFTextStripper stripper = new PDFTextStripper(); // 初始化文本剥离器
                        String text = stripper.getText(document);
                        String[] lines = text.split("\n");

                        boolean nameFlag = false;
                        boolean infoFlag = false;
                        String name = null;
                        StringBuilder sb = new StringBuilder();
                        for (String line : lines) {
                            if (line.startsWith("(2023年版)")) {
                                continue;
                            }

                            Pattern compile0 = Pattern.compile("^\\d+(.*)");
                            Matcher matcher0 = compile0.matcher(line.trim());
                            if (matcher0.find()) {
                                if (StringUtils.isBlank(matcher0.group(1))) {
                                    continue;//跳出此line
                                }
                            }

                            if (line.startsWith("附件")) {
                                nameFlag = true;
                                continue;
                            }

                            if (nameFlag) {
                                Pattern pattern = Pattern.compile("(.*)诊疗方案");
                                Matcher matcher = pattern.matcher(line);
                                if (matcher.find()) {
                                    name = matcher.group(1);
                                    nameFlag = false;
                                    continue;
                                }
                            }

                            Pattern compile1 = Pattern.compile("四、病理改变(.*)");
                            Matcher matcher1 = compile1.matcher(line.trim());
                            if (matcher1.find()) {
                                infoFlag = true;
                                continue;//跳出此line
                            }
                            if(infoFlag){
                            if (line.startsWith("五")) {
                                System.out.println(name + "!" + sb.toString());
                                sb = new StringBuilder();
                                break;
                            }
                            sb.append(line.replaceAll("\r", ""));
                            continue;
                            }
                        }
                        document.close();
                    }
                }
            }
        }catch(Exception e){
            throw new Exception(e);
        }
    }
}
复制代码

 

posted @   Anne起飞记  阅读(4)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)
点击右上角即可分享
微信分享提示