读取 Word 文本以及图片
依赖:
<!-- pom.xml fragment: the first element is a Maven property pinning the Apache POI version; the two dependencies reference it via ${poi.version}. The Java code below imports org.apache.poi.xwpf classes (presumably supplied by the poi-ooxml artifact; confirm against the POI component overview). -->
<poi.version>4.0.0</poi.version>
<dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>${poi.version}</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>${poi.version}</version> </dependency>
WordUtil:
import lombok.extern.slf4j.Slf4j; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFPictureData; import org.junit.jupiter.api.Test; import java.io.*; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /*************************** *<pre> * @Project Name : sea-dep-service * @Package : com.sea.x.common.utils * @File Name : ReadDOCUtil * @Author : Sea * @Mail : lshan@523@163 * @Date : 2023/2/3 15:23 * @Purpose : * @History : *</pre> ***************************/ @Slf4j public class WordUtil { public final static String TEXT="text"; public final static String IMAGE="image"; /** * 读取文本内容 * @param inputStream * @return */ public static String readWord(InputStream inputStream) { return readWord(inputStream, false).getOrDefault(TEXT,"")+""; } public static String readWord(File file) { return readWord(file, false).getOrDefault(TEXT,"")+""; } /** * @param file * @param isImgRead * @return {"text":"...", "image":{'fileName':'byte[]'}} */ public static Map readWord(File file, boolean isImgRead) { Map<String, Object> result = new HashMap<>(); try { result =readWord(new FileInputStream(file), isImgRead); } catch (FileNotFoundException e) { e.printStackTrace(); log.error("parse word :{} error :{}",file.getName(), e); } return result; } /** * @param fis * @param isImgRead 是否获取图片数据 * @return {"text":"...", "image":{'fileName':'byte[]'}} */ public static Map<String, Object> readWord(InputStream fis, boolean isImgRead) { Map<String, Object> result = new HashMap<>(); XWPFDocument document = null; XWPFWordExtractor extractor = null; try { document = new XWPFDocument(fis); //文件名, byte[] HashMap<String, byte[]> picData =null; if(isImgRead){ picData = new HashMap<>(); List<XWPFPictureData> allPictures = 
document.getAllPackagePictures(); for(XWPFPictureData p : allPictures) { //获取简历中个人图片 byte[] data = p.getData(); picData.put(data.length+"_"+p.getFileName(),data); // IOUtils.write(p.getData(), new FileOutputStream("C:\\Users\\Sea\\Desktop\\pic\\"+p.getFileName()+"")); } } extractor = new XWPFWordExtractor(document); String text = extractor.getText(); result.put(TEXT,text); result.put(IMAGE,picData); } catch (Exception e) { e.printStackTrace(); log.error("parse word error :{}", e); }finally { try { extractor.close(); document.close(); } catch (IOException e) { } } return result; } static Pattern emailPattern = Pattern.compile("[a-zA-Z0-9]+@[a-zA-Z0-9]+\\.[a-zA-Z0-9]+"); /** * 以姓名: or 姓 名:开头, 空格结尾 */ static Pattern namePattern = Pattern.compile("(姓.*?名)(.*?)( )"); static Pattern phonePattern = Pattern.compile("1(3\\d|4[5-9]|5[0-35-9]|6[567]|7[0-8]|8\\d|9[0-35-9])\\d{8}"); /** * 提取部分信息 * @param text * @return */ private static Map<String,String> getMainInfo(String text){ Matcher nameMatcher = namePattern.matcher(text); Matcher emailMatcher = emailPattern.matcher(text); Matcher phoneMatcher = phonePattern.matcher(text); String userName = (nameMatcher.find()?nameMatcher.group(2):"").replace(":","").replace(":",""); String email = emailMatcher.find()?emailMatcher.group(0):""; String phone = phoneMatcher.find()?phoneMatcher.group(0):""; userName = StringUtils.isNotBlank(userName)?userName : text.trim().substring(0, 10).replace("\n"," ").replace("\t"," ").split(" ")[0]; String sex = text.contains("女")?"0":"1"; System.err.println("name : "+ userName); System.err.println("email : "+ email); System.err.println("phone : "+ phone); System.err.println("sex : "+ sex); String lastUserName = userName; return new HashMap<String,String>(){{ put("sex",sex); put("userName", lastUserName); put("email",email); put("phone",phone); }}; } @Test public void testName() throws Exception { String fileName = "C:/Users/Sea/Downloads/job_word_20230202/sea工程师46152253.docx"; File file = new 
File(fileName); Map map = readWord(file, true); String txt = map.get(TEXT)+""; getMainInfo(txt); System.err.println(txt); Map<String,byte[]> imgs =(Map<String,byte[]>) map.get(IMAGE); imgs.forEach((fn,bt)->{ try { FileOutputStream fileOutputStream = new FileOutputStream(fn); IOUtils.write(bt,fileOutputStream); } catch (Exception e) { e.printStackTrace(); } }); } }
分类:
java
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?