JAVA - 实现 - 利用POI读取word文档实例
package read.document; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.sql.Connection; import java.util.ArrayList; import java.util.List; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.Range; import pers.mysql.DBUtil; import pers.mysql.MysqlDao; import pers.mysql.MysqlDaoImp; public class WordReading { public static void main(String[] args) { String filePath = "*****.doc"; readOnWord(filePath); } public static void readOnWord(String filePath) { if (filePath.endsWith(".doc")) { // 输入流-基类 InputStream is = null; try { is = new FileInputStream(filePath); } catch (FileNotFoundException e) { e.printStackTrace(); System.out.println("文件打开失败。"); } // 加载doc文档 try { HWPFDocument doc = new HWPFDocument(is); Range text = doc.getRange();// 整个文档 /* * 分解word:文本 ->小节 ->段落 ->characterRun(理解为小单元) * section -小节; paragraph - 段落 */ //1分出内容节点 Range hotWord = text.getSection(2);// 0-封面,1-目录,2-文本;第3小节 //2段落处理 /* * 维护两个变量 * * 热词和解释区别 :大小-word:26,explaining:18 * */ String word = ""; String explaining = ""; int wordOK = 0; int explainOK = 0;// 判断当前word&explain是否可以填入数据库 int count = 24;// 读取几条数据到数据库 int begin = 2;// 段落读取位置 for (int i = 0; i < count;) { Range para = hotWord.getParagraph(begin); CharacterRun field = para.getCharacterRun(0); int fontSize = field.getFontSize(); if (fontSize == 26) { word = para.text(); wordOK = 1; begin++; } else { while (fontSize < 26) { explaining += para.text(); begin++; para = hotWord.getParagraph(begin); field = para.getCharacterRun(0); fontSize = field.getFontSize(); } explainOK = 1; } // 判断word&explain是否可以填入数据库 if (wordOK == 1 && explainOK == 1) { MysqlDaoImp.addData(word, explaining); i++; //填入数据库后,一切归"0" wordOK = 0; explainOK = 0; word=""; explaining=""; } } // 输出测试 // System.out.println("读取:" + "head:"); } catch (IOException e) { e.printStackTrace(); System.out.println("IO错误。"); } } else { System.out.println("文件格式 error:not .doc"); } }
...................................................