Java 使用 POI 读取 Word 文件内容
1、下载poi的jar包
下载地址:https://www.apache.org/dyn/closer.lua/poi/release/bin/poi-bin-3.17-20170915.tar.gz
下载解压后用到的jar包
maven 依赖(注意:此处版本为 4.1.2,与上面手动下载的 3.17 不同,二者选其一即可):
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>4.1.2</version> </dependency> <dependency> <groupId>cn.hutool</groupId> <artifactId>hutool-all</artifactId> <version>5.5.7</version> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>4.1.2</version> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml-schemas --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml-schemas</artifactId> <version>4.1.2</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>ooxml-schemas</artifactId> <version>1.1</version> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>4.1.2</version> </dependency>
一、读取word全部内容(这个不区分doc和docx)
package com.wordcom;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Locale;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

/**
 * Reads the complete plain-text content of a Word document (.doc or .docx).
 *
 * @author hp
 */
public class DocUtil {

    /**
     * Demo entry point: reads a hard-coded sample document and prints its text.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\忻州地调中心站11楼机房更换通信电源三措一案.docx";
        String content = readWord(filePath);
        System.out.println(content);
    }

    /**
     * Extracts all text from a Word file, choosing the extractor by file extension.
     *
     * <p>The extension check is case-insensitive. Any failure while opening or
     * parsing the file is printed to standard error and an empty string is returned.
     *
     * @param path path of the file to read; {@code null} is treated as "not a Word file"
     * @return the extracted text, or an empty string when the file is not a Word
     *         document or could not be read
     */
    public static String readWord(String path) {
        String buffer = "";
        String lowerPath = path == null ? "" : path.toLowerCase(Locale.ROOT);
        try {
            if (lowerPath.endsWith(".doc")) {
                // Close both the stream and the extractor even when getText() throws.
                try (InputStream is = new FileInputStream(new File(path));
                     WordExtractor ex = new WordExtractor(is)) {
                    buffer = ex.getText();
                }
            } else if (lowerPath.endsWith(".docx")) {
                // OPCPackage.open is used directly instead of POIXMLDocument.openPackage:
                // POIXMLDocument lives in different packages in POI 3.x and 4.x, while
                // OPCPackage and XWPFWordExtractor are in the same place in both.
                OPCPackage opcPackage = OPCPackage.open(path);
                try (XWPFWordExtractor extractor = new XWPFWordExtractor(opcPackage)) {
                    buffer = extractor.getText();
                }
            } else {
                System.out.println("此文件不是word文件!");
            }
        } catch (Exception e) {
            // Best-effort utility: report the problem and fall through to return "".
            e.printStackTrace();
        }
        return buffer;
    }
}
二、获取word各级标题(doc格式)
注意:只有预先在 Word 中为标题套用了标题样式,才能按样式名读取到标题。
package com.wordcom;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.ParagraphProperties;
import org.apache.poi.hwpf.usermodel.Range;
import java.io.*;

/**
 * Prints the title paragraphs of a .doc document, selected by paragraph style name.
 *
 * @author hp
 */
public class WordTitle {

    /**
     * Demo entry point: prints the titles of a hard-coded sample document.
     *
     * @param args unused
     * @throws Exception if the document cannot be read
     */
    public static void main(String[] args) throws Exception {
        String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\正文查找.doc";
        printWord(filePath);
    }

    /**
     * Prints the text of every paragraph whose style is named exactly "标题".
     *
     * <p>Paragraphs whose style index is outside the style sheet, or whose style
     * has no description or name, are skipped.
     *
     * @param filePath path of the .doc file to read
     * @throws IOException if the file cannot be opened or parsed
     */
    public static void printWord(String filePath) throws IOException {
        // try-with-resources closes the stream and the document on every exit path.
        try (InputStream is = new FileInputStream(filePath);
             HWPFDocument doc = new HWPFDocument(is)) {

            Range range = doc.getRange();
            // The style sheet does not change while iterating, so look it up once.
            StyleSheet styleSheet = doc.getStyleSheet();
            int numStyles = styleSheet.numStyles();
            int paragraphCount = range.numParagraphs();

            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                int styleIndex = paragraph.getStyleIndex();
                if (styleIndex >= numStyles) {
                    continue;
                }

                StyleDescription style = styleSheet.getStyleDescription(styleIndex);
                if (style == null) {
                    continue;
                }

                // Constant-first equals avoids a NullPointerException on a null style name.
                if ("标题".equals(style.getName())) {
                    System.out.println(paragraph.text());
                }
            }
        }
    }
}
三、按段落读取word(doc)(docx)
可以按照自己的需求提取特定的内容
doc
package com.wordcom;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.ParagraphProperties;
import org.apache.poi.hwpf.usermodel.Range;
import java.io.*;

/**
 * Reads a .doc document paragraph by paragraph and prints the paragraphs that
 * pass a simple punctuation-based filter.
 *
 * @author hp
 */
public class WordTitledoc {

    /**
     * Demo entry point: filters and prints paragraphs of a hard-coded sample document.
     *
     * @param args unused
     * @throws Exception if the document cannot be read
     */
    public static void main(String[] args) throws Exception {
        String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\一案 .doc";
        printWord(filePath);
    }

    /**
     * Prints every paragraph of the document that passes {@link #looksLikeHeading(String)}.
     *
     * <p>Paragraphs whose style index is outside the document's style sheet are skipped.
     *
     * @param filePath path of the .doc file to read
     * @throws IOException if the file cannot be opened or parsed
     */
    public static void printWord(String filePath) throws IOException {
        // try-with-resources closes the stream and the document on every exit path.
        try (InputStream is = new FileInputStream(filePath);
             HWPFDocument doc = new HWPFDocument(is)) {

            Range range = doc.getRange();
            int numStyles = doc.getStyleSheet().numStyles();
            int paragraphCount = range.numParagraphs();

            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                if (paragraph.getStyleIndex() >= numStyles) {
                    continue;
                }

                String text = paragraph.text();
                if (looksLikeHeading(text)) {
                    System.out.println(text);
                }
            }
        }
    }

    /**
     * Returns whether the text contains "." or "、" and none of ",", ";", "。" or "20".
     *
     * <p>NOTE(review): the original condition also required
     * {@code !text.contains("")}, which is false for every string and therefore
     * suppressed all output. The literal was probably a character lost in
     * transcription; restore it here if it can be recovered.
     */
    private static boolean looksLikeHeading(String text) {
        boolean hasNumberingMark = text.contains(".") || text.contains("、");
        boolean hasExcludedMark = text.contains(",")
                || text.contains(";")
                || text.contains("。")
                || text.contains("20");
        return hasNumberingMark && !hasExcludedMark;
    }
}
docx
package com.wordcom;

import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Reads a .docx document paragraph by paragraph and prints the paragraphs that
 * pass a simple punctuation-based filter.
 *
 * @author hp
 */
public class WordTitledocx {

    /**
     * Demo entry point: filters and prints paragraphs of a hard-coded sample document.
     *
     * @param args unused
     * @throws Exception if the document cannot be read
     */
    public static void main(String[] args) throws Exception {
        String filePath = "C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\忻州地调中心站11楼机房更换通信电源三措一案.docx";
        printWord(filePath);
    }

    /**
     * Prints every paragraph of the document that passes {@link #looksLikeHeading(String)}.
     *
     * @param filePath path of the .docx file to read
     * @throws IOException if the file cannot be opened or parsed
     */
    public static void printWord(String filePath) throws IOException {
        // try-with-resources closes the stream and the document on every exit path.
        try (InputStream is = new FileInputStream(filePath);
             XWPFDocument doc = new XWPFDocument(is)) {

            List<XWPFParagraph> paragraphs = doc.getParagraphs();
            for (XWPFParagraph paragraph : paragraphs) {
                String text = paragraph.getParagraphText();
                if (looksLikeHeading(text)) {
                    System.out.println(text);
                }
            }
        }
    }

    /**
     * Returns whether the text contains "." or "、" and none of ",", ";", "。" or "20".
     *
     * <p>NOTE(review): the original condition also required
     * {@code !text.contains("")}, which is false for every string and therefore
     * suppressed all output. The literal was probably a character lost in
     * transcription; restore it here if it can be recovered.
     */
    private static boolean looksLikeHeading(String text) {
        boolean hasNumberingMark = text.contains(".") || text.contains("、");
        boolean hasExcludedMark = text.contains(",")
                || text.contains(";")
                || text.contains("。")
                || text.contains("20");
        return hasNumberingMark && !hasExcludedMark;
    }
}