文件中文本提取

由于最近在写一个全文检索的项目,需要添加对非结构化数据源的数据采集,所以就暂时整理一下对常见文件的文本数据提取。

引入依赖

// File text-extraction dependencies (Apache POI and Apache PDFBox)
implementation 'org.apache.poi:poi:5.2.2'
implementation 'org.apache.poi:poi-ooxml:5.2.2'
implementation 'org.apache.poi:poi-scratchpad:5.2.2'
implementation 'org.apache.pdfbox:pdfbox:2.0.26'

创建一个文件数据提取接口

/**
 * Contract for a component that extracts text from one kind of file.
 */
public interface DataCollect {

    /**
     * Extracts the text content of the file supplied as a stream.
     *
     * @param inputStream stream providing the file's bytes
     * @param filename    name of the file; the implementations in this file use it
     *                    only in their error log message
     * @return the extracted text; the implementations in this file return an empty
     *         string when extraction fails
     */
    String getContent(InputStream inputStream,String filename);

    /**
     * Returns the type identifier of this extractor (for example {@code "pdf"}),
     * which {@code DataCollectManager} compares with a file's extension.
     *
     * @return the type identifier
     */
    String getType();
}

PDF文件数据提取实现类

/**
 * {@link DataCollect} implementation that extracts text from PDF files.
 */
@Slf4j
public class PdfDataCollect implements DataCollect {
    /**
     * Extracts the text of every page of the PDF supplied as a stream.
     *
     * @param inputStream stream providing the PDF bytes
     * @param filename    name of the file, used only in the error log message
     * @return the concatenated text of all pages, or an empty string if extraction fails
     */
    @Override
    public String getContent(InputStream inputStream, String filename) {
        return getPDFContent(inputStream, filename);
    }

    /**
     * @return the type identifier {@code "pdf"}
     */
    @Override
    public String getType() {
        return "pdf";
    }

    /**
     * Loads the document and appends the text of each page in page order.
     * Any exception is logged and turned into an empty result.
     */
    private String getPDFContent(InputStream inputStream, String filename) {
        // Method-local accumulator, so the unsynchronized StringBuilder is sufficient.
        StringBuilder content = new StringBuilder();
        try (PDDocument document = PDDocument.load(inputStream)) {
            // Number of pages in the document
            int pages = document.getNumberOfPages();
            // Text extractor
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit the text sorted by its position on the page
            stripper.setSortByPosition(true);
            // Page numbers are 1-based; restrict the stripper to one page per pass
            for (int i = 1; i <= pages; i++) {
                stripper.setStartPage(i);
                stripper.setEndPage(i);
                content.append(stripper.getText(document));
            }
        } catch (Exception e) {
            log.error("文件【" + filename + "】提取文本数据失败", e);
            return "";
        }

        return content.toString();
    }

}

doc文件文本提取实现类

@Slf4j
public class DocDataCollect implements DataCollect {
    @Override
    public String getContent(InputStream inputStream,String filename) {
        String text = "";
        try (WordExtractor exr = new WordExtractor(inputStream)) {

            text = exr.getText();
        } catch (Exception e) {
            log.error("文件【" + filename + "】提取文本数据失败", e);
            return "";
        }
        return text;
    }

    @Override
    public String getType() {
        return "doc";
    }
}

docx文件文本数据提取实现类

@Slf4j
public class DocxDataCollect implements DataCollect {
    @Override
    public String getContent(InputStream inputStream, String filename) {
        String text = "";
        try (XWPFDocument doc = new XWPFDocument(inputStream);
             XWPFWordExtractor workbook = new XWPFWordExtractor(doc);) {

            text = workbook.getText();
        } catch (Exception e) {
            log.error("文件【" + filename + "】提取文本数据失败", e);
            return "";
        }
        return text;
    }

    @Override
    public String getType() {
        return "docx";
    }
}

ppt文件文本数据提取实现类

@Slf4j
public class PptDataCollect implements DataCollect {
    @Override
    public String getContent(InputStream inputStream,String filename) {
        String text = "";
        try (HSLFSlideShow hss = new HSLFSlideShow(inputStream);
             SlideShowExtractor slideShowExtractor = new SlideShowExtractor(hss);) {

            //得到全部文本
            text = slideShowExtractor.getText();
        } catch (Exception e) {
            log.error("文件【" + filename + "】提取文本数据失败", e);
            return "";
        }
        return text;
    }

    @Override
    public String getType() {
        return "ppt";
    }

}

pptx文件文本数据提取实现类

@Slf4j
public class PptxDataCollect implements DataCollect {
    @Override
    public String getContent(InputStream inputStream, String filename) {
        String text = "";
        try (XMLSlideShow xss = new XMLSlideShow(inputStream);
             SlideShowExtractor slideShowExtractor = new SlideShowExtractor(xss);) {

            //得到全部文本
            text = slideShowExtractor.getText();
        } catch (Exception e) {
            log.error("文件【" + filename + "】提取文本数据失败", e);
            return "";
        }

        return text;
    }

    @Override
    public String getType() {
        return "pptx";
    }

}

xls文件文本数据提取实现类

@Slf4j
public class XlsDataCollect implements DataCollect {
    @Override
    public String getContent(InputStream inputStream, String filename) {
        String text = "";
        try (HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(inputStream));
             ExcelExtractor extractor = new ExcelExtractor(wb);) {

            extractor.setFormulasNotResults(false);
            extractor.setIncludeSheetNames(false);
            text = extractor.getText();
        } catch (Exception e) {
            log.error("文件【" + filename + "】提取文本数据失败", e);
            return "";
        }
        return text;
    }

    @Override
    public String getType() {
        return "xls";
    }
}

xlsx文件文本数据提取实现类

@Slf4j
public class XlsxDataCollect implements DataCollect {
    @Override
    public String getContent(InputStream inputStream, String filename) {
        String text = "";
        try (XSSFWorkbook workBook = new XSSFWorkbook(inputStream);
             XSSFExcelExtractor extractor = new XSSFExcelExtractor(workBook);) {

            extractor.setIncludeSheetNames(false);
            text = extractor.getText();
        } catch (Exception e) {
            log.error("文件【" + filename + "】提取文本数据失败", e);
            return "";
        }
        return text;
    }

    @Override
    public String getType() {
        return "xlsx";
    }
}

文件数据提取管理类

/**
 * Static registry of {@link DataCollect} implementations that dispatches a file
 * to the extractor whose type matches the file's extension.
 */
@Slf4j
public class DataCollectManager {
    private static final List<DataCollect> dataCollectList = new ArrayList<>();

    /**
     * Registers an extractor.
     *
     * @param dataCollect the extractor to add to the registry
     */
    public static void add(DataCollect dataCollect) {
        dataCollectList.add(dataCollect);
    }

    /**
     * Returns the registered extractors.
     *
     * @return the registry's backing list itself (not a copy)
     */
    public static List<DataCollect> get() {
        return dataCollectList;
    }

    /**
     * Finds the extractor registered for the given type.
     *
     * <p>The type must equal the extractor's {@link DataCollect#getType()} ignoring
     * case. A substring match is deliberately not used: {@code "docx".contains("doc")}
     * is true, so a {@code doc} file could be routed to the docx extractor, and every
     * type contains the empty string, so a file without an extension would match the
     * first registered extractor.
     *
     * @param accessType the type to look up, typically a file extension without the dot
     * @return the matching extractor, or {@code null} if {@code accessType} is
     *         {@code null} or empty, or no extractor is registered for it
     */
    public static DataCollect getDataCollect(String accessType) {
        if (accessType == null || accessType.isEmpty()) {
            return null;
        }
        for (DataCollect dataCollect : dataCollectList) {
            if (accessType.equalsIgnoreCase(dataCollect.getType())) {
                return dataCollect;
            }
        }
        return null;
    }

    /**
     * Extracts the text of a file using the extractor selected by the file's extension.
     *
     * @param filename    name of the file; its extension selects the extractor
     * @param inputStream stream providing the file's bytes
     * @return the extracted text, or an empty string when no extractor is registered
     *         for the extension
     */
    public static String getContent(String filename, InputStream inputStream) {
        final String fileType = FilenameUtils.getExtension(filename);
        final DataCollect dataCollect = getDataCollect(fileType);
        if (Objects.isNull(dataCollect)) {
            log.error("没有适配该文件【" + filename + "】类型的文本数据提取");
            return "";
        }
        return dataCollect.getContent(inputStream, filename);
    }
}

初始化文件提取实现类

配置文本数据提取实现类

在resources文件夹下创建META-INF/services文件夹,并在里面创建一个 com.tfswx.fulltextsearch.infrastructure.elasticsearch.data.DataCollect文件(文件名就是DataCollect的全类名),在文件中配置该接口所有实现类的全类名(每行一个)

加载文本数据实现类

/**
 * Registers every {@link DataCollect} implementation found by {@link ServiceLoader}
 * with {@code DataCollectManager}, unless the manager already holds extractors.
 */
@PostConstruct
private void loadDataCollect() {
    // Something is already registered: leave the registry untouched.
    if (!CollUtil.isEmpty(DataCollectManager.get())) {
        return;
    }
    ServiceLoader.load(DataCollect.class).forEach(DataCollectManager::add);
}
posted @ 2022-10-15 17:38  purple910  阅读(173)  评论(0)  编辑  收藏  举报