利用java从docx文档中提取文本内容

利用java从docx文档中提取文本内容

使用Apache POI(第三方jar包)读取docx文档,官网地址为 https://poi.apache.org/
docx文档内容如图:
(图:docx文档内容示例,原图缺失)
目录结构:
(图:目录结构示例,原图缺失)
每个文件夹的名称为日期加上来源,例如:20180618医院;每个docx文档的名称为被试的姓名和来源地,中间用“-”连接,例如:小明-xx社区.docx
代码如下:
MriReportService.java

package services;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.regex.Pattern;

/**
 * Static helpers for turning a directory tree of MRI report documents into flat records.
 *
 * <p>The expected layout is {@code root/<date + origin>/<subject name>-<source>.docx}.
 * All helpers are tolerant of malformed names and report text: a field that cannot be
 * located is returned as an empty string instead of raising an exception.
 */
public class MriReportService {

    /** Matches any single digit; used to decide whether a folder name carries a date. */
    private static final Pattern DIGIT = Pattern.compile("\\d");

    /** Matches a run of two or more dashes so it can be collapsed to a single dash. */
    private static final Pattern DASH_RUN = Pattern.compile("--+");

    /** Maximum number of characters kept from the folder name as the date (yyyyMMdd). */
    private static final int DATE_LENGTH = 8;

    // Field labels as they appear in the whitespace-stripped report text.
    private static final String NAME_LABEL = "姓名:";
    private static final String GENDER_LABEL = "性别:";
    private static final String AGE_LABEL = "年龄:";
    private static final String EXAM_ITEM_LABEL = "检查项目:";
    private static final String MRI_DESCRIPTION_LABEL = "MRI检查描述:";
    private static final String MRI_IMPRESSION_LABEL = "MRI印象:";
    private static final String REPORTING_DOCTOR_LABEL = "报告医师:";

    /**
     * Derives the examination date and the subject's source from a report file's location.
     *
     * @param file the report file; its parent folder name supplies the date and its own
     *             name supplies the source
     * @return a two-element array: index 0 is the date (up to the first 8 characters of the
     *         parent folder name with dashes removed, or {@code ""} when the folder name
     *         has no digit or the file has no parent); index 1 is the text between the
     *         first dash and the following dot of the file name (or {@code ""} when the
     *         name has no dash)
     */
    public static String[] findYearAndSource(File file) {
        String[] result = new String[2];
        result[0] = extractDate(file.getParentFile());
        result[1] = extractSource(file.getName());
        return result;
    }

    /** Extracts the date prefix from the parent folder, tolerating a missing or short name. */
    private static String extractDate(File parentDir) {
        if (parentDir == null) {
            return "";
        }
        String dirName = parentDir.getName();
        if (!DIGIT.matcher(dirName).find()) {
            return "";
        }
        String compact = dirName.replace("-", "");
        // Clamp the end index: a folder name shorter than 8 characters must not throw.
        return compact.substring(0, Math.min(DATE_LENGTH, compact.length()));
    }

    /** Extracts the part of the file name between the first dash and the next dot. */
    private static String extractSource(String fileName) {
        if (fileName.indexOf('-') < 0) {
            // No separator means the name carries no source part.
            return "";
        }
        String normalized = DASH_RUN.matcher(fileName).replaceAll("-");
        int from = normalized.indexOf('-') + 1;
        // Search for the dot after the dash so a dot inside the subject name cannot
        // produce an inverted range; fall back to the end when there is no extension.
        int to = normalized.indexOf('.', from);
        if (to < 0) {
            to = normalized.length();
        }
        return normalized.substring(from, to);
    }

    /**
     * Collects the regular files located exactly one directory level below {@code rootPath}.
     *
     * <p>Files whose name contains {@code "~$"} are skipped. Entries directly under the
     * root that are not directories (or cannot be listed) are ignored.
     *
     * @param rootPath path of the root folder containing one sub-folder per batch
     * @return the matching files; empty when the root does not exist or is not a directory
     */
    public static LinkedList<File> findAllFile(String rootPath) {
        LinkedList<File> reports = new LinkedList<>();
        File[] subDirs = new File(rootPath).listFiles();
        if (subDirs == null) {
            // listFiles() yields null for a missing path or a non-directory.
            return reports;
        }
        for (File subDir : subDirs) {
            File[] entries = subDir.listFiles();
            if (entries == null) {
                continue;
            }
            for (File entry : entries) {
                if (entry.isFile() && !entry.getName().contains("~$")) {
                    reports.add(entry);
                }
            }
        }
        return reports;
    }

    /**
     * Pulls the labelled fields out of a report's text.
     *
     * <p>When both the gender and age labels are present, the name ends at the gender
     * label; otherwise the name ends at the examination-item label and gender and age are
     * left empty. Each field is extracted independently, so one missing label does not
     * affect the others.
     *
     * @param docx the report text with all whitespace already removed
     * @return a five-element list: name, gender, age, MRI description, MRI impression;
     *         a field whose labels cannot be found in order is {@code ""}
     */
    public static ArrayList<String> findSub(String docx) {
        boolean hasDemographics = docx.contains(GENDER_LABEL) && docx.contains(AGE_LABEL);

        String name;
        String gender = "";
        String age = "";
        if (hasDemographics) {
            name = between(docx, NAME_LABEL, GENDER_LABEL);
            gender = between(docx, GENDER_LABEL, AGE_LABEL);
            age = between(docx, AGE_LABEL, EXAM_ITEM_LABEL);
        } else {
            name = between(docx, NAME_LABEL, EXAM_ITEM_LABEL);
        }

        ArrayList<String> result = new ArrayList<>();
        result.add(name);
        result.add(gender);
        result.add(age);
        result.add(between(docx, MRI_DESCRIPTION_LABEL, MRI_IMPRESSION_LABEL));
        result.add(between(docx, MRI_IMPRESSION_LABEL, REPORTING_DOCTOR_LABEL));
        return result;
    }

    /**
     * Returns the text between the first {@code startLabel} and the next {@code endLabel}
     * after it, or {@code ""} when either label is missing.
     */
    private static String between(String text, String startLabel, String endLabel) {
        int labelAt = text.indexOf(startLabel);
        if (labelAt < 0) {
            // Without this check, -1 + label length would silently select unrelated text.
            return "";
        }
        int from = labelAt + startLabel.length();
        int to = text.indexOf(endLabel, from);
        if (to < 0) {
            return "";
        }
        return text.substring(from, to);
    }
}

Main.java

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.*;
import java.util.ArrayList;
import java.util.regex.*;

import static services.MriReportService.findAllFile;
import static services.MriReportService.findSub;
import static services.MriReportService.findYearAndSource;

/**
 * Command-line entry point: reads every report document found under a source folder,
 * extracts its labelled fields and writes one CSV row per document.
 *
 * <p>Usage: {@code java -jar docx2csv.jar <source folder> <output csv>}.
 */
public class Main {

    /** Matches any whitespace character; report text is stripped of it before parsing. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /** Number of columns written per row: five report fields plus date and source. */
    private static final int COLUMN_COUNT = 7;

    /**
     * Converts all reports under {@code args[0]} into the CSV file {@code args[1]}.
     *
     * @param args source folder path followed by the output file path
     * @throws Exception if a document cannot be opened or parsed, or the output cannot
     *                   be written
     */
    public static void main(String[] args) throws Exception {

        if (args.length < 2) {
            System.out.println("请输入源文件和目标文件的完整路径!");
            System.out.println("举个例子:java -jar docx2csv.jar d:\\核磁报告 d:\\result.csv");
            System.exit(-1);
        }

        String srcPath = args[0];
        String outPath = args[1];
        ArrayList<ArrayList<String>> result = new ArrayList<>();
        for (File tmpFile : findAllFile(srcPath)) {

            String[] yearAndSrc = findYearAndSource(tmpFile);

            String docx;
            // try-with-resources releases the stream and the document even when
            // opening or text extraction fails part-way through.
            try (FileInputStream fis = new FileInputStream(tmpFile);
                 XWPFDocument xdoc = new XWPFDocument(fis)) {
                docx = new XWPFWordExtractor(xdoc).getText();
            }

            // The field labels are matched against text with no whitespace at all.
            docx = WHITESPACE.matcher(docx).replaceAll("");
            ArrayList<String> tmpRe = findSub(docx);
            tmpRe.add(yearAndSrc[0]);
            tmpRe.add(yearAndSrc[1]);
            result.add(tmpRe);
        }
        write(result, outPath);
    }

    /**
     * Writes the rows to {@code outPath} as GBK-encoded CSV, one line per row.
     *
     * <p>The first seven elements of each row are written. A field containing a comma,
     * a double quote or a line break is enclosed in double quotes with embedded quotes
     * doubled, so free-text fields cannot shift the columns; other fields are written
     * unchanged.
     *
     * @param result  rows to write; each must hold at least seven elements
     * @param outPath path of the CSV file to create or overwrite
     * @throws IOException if the file cannot be opened or written
     */
    public static void write(ArrayList<ArrayList<String>> result, String outPath) throws IOException {
        try (BufferedWriter bufferedWriter = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(outPath), "GBK"))) {
            for (ArrayList<String> tmpStrs : result) {
                StringBuilder line = new StringBuilder();
                for (int i = 0; i < COLUMN_COUNT; i++) {
                    if (i > 0) {
                        line.append(',');
                    }
                    line.append(escapeCsv(tmpStrs.get(i)));
                }
                bufferedWriter.write(line.toString());
                bufferedWriter.newLine();
            }
        }
    }

    /** Quotes a CSV field when it contains a delimiter, quote or line break. */
    private static String escapeCsv(String field) {
        String value = String.valueOf(field);
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}
posted @ 2018-06-18 21:27  海拉鲁捡垃圾  阅读(1073)  评论(0编辑  收藏  举报