文件中文本提取
由于最近在写一个全文检索的项目,需要添加对非结构化数据源的数据采集,所以就暂时整理一下对常见文件的文本数据提取。
引入依赖
// 文件数据提取
implementation 'org.apache.poi:poi:5.2.2'
implementation 'org.apache.poi:poi-ooxml:5.2.2'
implementation 'org.apache.poi:poi-scratchpad:5.2.2'
implementation 'org.apache.pdfbox:pdfbox:2.0.26'
创建一个文件数据提取接口
/**
 * Contract for extracting plain text from one kind of file.
 *
 * <p>Each implementation reports the file extension it handles through
 * {@link #getType()} and performs the actual extraction in
 * {@link #getContent(InputStream, String)}.
 */
public interface DataCollect {

    /**
     * Returns the file type this extractor is responsible for.
     *
     * @return the type identifier, e.g. a file extension such as {@code "pdf"}
     */
    String getType();

    /**
     * Extracts the text content of a file.
     *
     * @param inputStream stream supplying the raw file bytes
     * @param filename    name of the file being processed
     * @return the extracted text
     */
    String getContent(InputStream inputStream, String filename);
}
PDF文件数据提取实现类
/**
 * {@link DataCollect} implementation that extracts plain text from PDF files
 * with PDFBox.
 */
@Slf4j
public class PdfDataCollect implements DataCollect {

    /**
     * Extracts the text of every page of the PDF supplied by {@code inputStream}.
     *
     * @param inputStream stream supplying the PDF bytes
     * @param filename    file name, used only in the failure log message
     * @return the text of all pages concatenated in page order, or an empty
     *         string if any exception is raised during extraction
     */
    @Override
    public String getContent(InputStream inputStream, String filename) {
        return getPDFContent(inputStream, filename);
    }

    /**
     * @return the file type handled by this extractor, {@code "pdf"}
     */
    @Override
    public String getType() {
        return "pdf";
    }

    /**
     * Loads the document and strips its text one page at a time.
     *
     * @param inputStream stream supplying the PDF bytes
     * @param filename    file name, used only in the failure log message
     * @return the accumulated page text, or an empty string on failure
     */
    private String getPDFContent(InputStream inputStream, String filename) {
        // The buffer never leaves this method, so the unsynchronized
        // StringBuilder is used instead of StringBuffer.
        StringBuilder content = new StringBuilder();
        try (PDDocument document = PDDocument.load(inputStream)) {
            // Total number of pages in the document.
            int pages = document.getNumberOfPages();
            // Stripper that reads the text content.
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text ordered by its position on the page.
            stripper.setSortByPosition(true);
            // Page numbers are 1-based; restrict the stripper to a single page per call.
            for (int i = 1; i <= pages; i++) {
                stripper.setStartPage(i);
                stripper.setEndPage(i);
                content.append(stripper.getText(document));
            }
        } catch (Exception e) {
            // Any failure discards partial output: log and return empty text.
            log.error("文件【" + filename + "】提取文本数据失败", e);
            return "";
        }
        return content.toString();
    }
}
doc文件文本提取实现类
@Slf4j
public class DocDataCollect implements DataCollect {
@Override
public String getContent(InputStream inputStream,String filename) {
String text = "";
try (WordExtractor exr = new WordExtractor(inputStream)) {
text = exr.getText();
} catch (Exception e) {
log.error("文件【" + filename + "】提取文本数据失败", e);
return "";
}
return text;
}
@Override
public String getType() {
return "doc";
}
}
docx文件文本数据提取实现类
@Slf4j
public class DocxDataCollect implements DataCollect {
@Override
public String getContent(InputStream inputStream, String filename) {
String text = "";
try (XWPFDocument doc = new XWPFDocument(inputStream);
XWPFWordExtractor workbook = new XWPFWordExtractor(doc);) {
text = workbook.getText();
} catch (Exception e) {
log.error("文件【" + filename + "】提取文本数据失败", e);
return "";
}
return text;
}
@Override
public String getType() {
return "docx";
}
}
ppt文件文本数据提取实现类
/**
 * {@link DataCollect} implementation for legacy PowerPoint ({@code .ppt})
 * files, backed by POI's {@code HSLFSlideShow} and {@code SlideShowExtractor}.
 */
@Slf4j
public class PptDataCollect implements DataCollect {

    /**
     * Extracts all text of the slide show read from {@code inputStream}.
     *
     * @param inputStream stream supplying the {@code .ppt} bytes
     * @param filename    file name, used only in the failure log message
     * @return the extracted text, or an empty string if any exception occurs
     */
    @Override
    public String getContent(InputStream inputStream, String filename) {
        String text = "";
        // SlideShowExtractor is generic; the wildcard type plus diamond avoids
        // the raw-type usage while letting the compiler infer the HSLF types.
        try (HSLFSlideShow hss = new HSLFSlideShow(inputStream);
             SlideShowExtractor<?, ?> slideShowExtractor = new SlideShowExtractor<>(hss)) {
            // Fetch the complete text of the presentation.
            text = slideShowExtractor.getText();
        } catch (Exception e) {
            log.error("文件【" + filename + "】提取文本数据失败", e);
            return "";
        }
        return text;
    }

    /**
     * @return the file type handled by this extractor, {@code "ppt"}
     */
    @Override
    public String getType() {
        return "ppt";
    }
}
pptx文件文本数据提取实现类
/**
 * {@link DataCollect} implementation for OOXML PowerPoint ({@code .pptx})
 * files, backed by POI's {@code XMLSlideShow} and {@code SlideShowExtractor}.
 */
@Slf4j
public class PptxDataCollect implements DataCollect {

    /**
     * Extracts all text of the slide show read from {@code inputStream}.
     *
     * @param inputStream stream supplying the {@code .pptx} bytes
     * @param filename    file name, used only in the failure log message
     * @return the extracted text, or an empty string if any exception occurs
     */
    @Override
    public String getContent(InputStream inputStream, String filename) {
        String text = "";
        // SlideShowExtractor is generic; the wildcard type plus diamond avoids
        // the raw-type usage while letting the compiler infer the XSLF types.
        try (XMLSlideShow xss = new XMLSlideShow(inputStream);
             SlideShowExtractor<?, ?> slideShowExtractor = new SlideShowExtractor<>(xss)) {
            // Fetch the complete text of the presentation.
            text = slideShowExtractor.getText();
        } catch (Exception e) {
            log.error("文件【" + filename + "】提取文本数据失败", e);
            return "";
        }
        return text;
    }

    /**
     * @return the file type handled by this extractor, {@code "pptx"}
     */
    @Override
    public String getType() {
        return "pptx";
    }
}
xls文件文本数据提取实现类
@Slf4j
public class XlsDataCollect implements DataCollect {
@Override
public String getContent(InputStream inputStream, String filename) {
String text = "";
try (HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(inputStream));
ExcelExtractor extractor = new ExcelExtractor(wb);) {
extractor.setFormulasNotResults(false);
extractor.setIncludeSheetNames(false);
text = extractor.getText();
} catch (Exception e) {
log.error("文件【" + filename + "】提取文本数据失败", e);
return "";
}
return text;
}
@Override
public String getType() {
return "xls";
}
}
xlsx文件文本数据提取实现类
@Slf4j
public class XlsxDataCollect implements DataCollect {
@Override
public String getContent(InputStream inputStream, String filename) {
String text = "";
try (XSSFWorkbook workBook = new XSSFWorkbook(inputStream);
XSSFExcelExtractor extractor = new XSSFExcelExtractor(workBook);) {
extractor.setIncludeSheetNames(false);
text = extractor.getText();
} catch (Exception e) {
log.error("文件【" + filename + "】提取文本数据失败", e);
return "";
}
return text;
}
@Override
public String getType() {
return "xlsx";
}
}
文件数据提取管理类
/**
 * Static registry of {@link DataCollect} implementations and entry point for
 * extracting text from a file based on its extension.
 *
 * <p>The registry is a plain {@link ArrayList}; no synchronization is performed
 * by this class.
 */
@Slf4j
public class DataCollectManager {

    /** Registered extractors, in registration order. */
    private static final List<DataCollect> dataCollectList = new ArrayList<>();

    /**
     * Registers an extractor.
     *
     * @param dataCollect the extractor to add to the registry
     */
    public static void add(DataCollect dataCollect) {
        dataCollectList.add(dataCollect);
    }

    /**
     * Returns the registry.
     *
     * @return the internal list itself (not a copy), so changes made by the
     *         caller are visible to this class
     */
    public static List<DataCollect> get() {
        return dataCollectList;
    }

    /**
     * Looks up the extractor registered for a file type.
     *
     * <p>The type must equal the extractor's {@link DataCollect#getType()}
     * ignoring case. A substring match is deliberately not used: {@code "doc"}
     * is contained in {@code "docx"} (likewise {@code xls}/{@code xlsx} and
     * {@code ppt}/{@code pptx}), and the empty string is contained in every
     * type, which would make the result depend on registration order.
     *
     * @param accessType the file type / extension to look up; may be
     *                   {@code null} or empty
     * @return the first registered extractor whose type matches, or
     *         {@code null} if {@code accessType} is {@code null}, empty, or
     *         no extractor matches
     */
    public static DataCollect getDataCollect(String accessType) {
        if (accessType == null || accessType.isEmpty()) {
            return null;
        }
        for (DataCollect dataCollect : dataCollectList) {
            // accessType is non-null here, so a null getType() simply does not match.
            if (accessType.equalsIgnoreCase(dataCollect.getType())) {
                return dataCollect;
            }
        }
        return null;
    }

    /**
     * Extracts the text of a file using the extractor registered for the
     * file name's extension.
     *
     * @param filename    file name whose extension selects the extractor
     * @param inputStream stream supplying the file bytes
     * @return the text returned by the matching extractor, or an empty string
     *         (after logging an error) when no extractor matches the extension
     */
    public static String getContent(String filename, InputStream inputStream) {
        final String fileType = FilenameUtils.getExtension(filename);
        final DataCollect dataCollect = getDataCollect(fileType);
        if (Objects.isNull(dataCollect)) {
            log.error("没有适配该文件【" + filename + "】类型的文本数据提取");
            return "";
        }
        return dataCollect.getContent(inputStream, filename);
    }
}
初始化文件提取实现类
配置文本数据提取实现类
在 resources 文件夹下创建
META-INF/services
文件夹,并在里面创建一个名为 com.tfswx.fulltextsearch.infrastructure.elasticsearch.data.DataCollect
的文件(文件名就是 DataCollect
接口的全类名),在文件中逐行配置该接口所有实现类的全类名
加载文本数据实现类
/**
 * Populates {@link DataCollectManager} with every {@link DataCollect}
 * implementation found by {@link ServiceLoader}.
 *
 * <p>Does nothing when the manager already holds at least one extractor.
 */
@PostConstruct
private void loadDataCollect() {
    // Registry already populated: skip the lookup.
    if (!CollUtil.isEmpty(DataCollectManager.get())) {
        return;
    }
    ServiceLoader.load(DataCollect.class).forEach(DataCollectManager::add);
}