Word试卷文档模型化解析存储到数据库
最近在做一套在线考试系统,有不少人反映新增试题比较麻烦(需要逐题录入),于是乎就做了一个试卷批量导入功能。
poi实现word转html
模型化解析html
html转Map数组
Map数组(数组的操作处理不做说明)
1.导入jar包(Apache POI 及其 scratchpad 模块、commons-lang)。
2.word试卷导入模板
链接:http://pan.baidu.com/s/1gfK6g5H
3.代码实现
package com.web.onlinexam.util;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

import com.common.util.DateFormatUtil;
import com.common.util.FileUploadPathConfig;

/**
 * Converts a Word (.doc) exam paper to HTML and parses that HTML into a
 * nested list-of-maps model (big titles -> questions -> choice options).
 *
 * <p>All conversion state lives in public static fields, so this class is not
 * safe for concurrent use: convert one document at a time.
 *
 * @author <a href="mailto:thoslbt@163.com">Thos</a>
 * @version V1.0
 */
public class WordToHtml {

    /** ASCII code of the carriage return that ends a Word paragraph. */
    private static final short ENTER_ASCII = 13;

    /** ASCII code of the space character. */
    private static final short SPACE_ASCII = 32;

    /** ASCII code of the horizontal tab character. */
    private static final short TABULATION_ASCII = 9;

    /** Initial size of the per-table arrays; they grow on demand in {@link #readTable}. */
    private static final int INITIAL_TABLE_CAPACITY = 100;

    /** Matches any HTML start or end tag. */
    private static final Pattern TAG_PATTERN = Pattern.compile("</?[^>]+>");

    /** Matches the whole head element so the document title never leaks into the first line. */
    private static final Pattern HEAD_PATTERN =
            Pattern.compile("<head>.*?</head>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    /** Matches the html/body wrapper tags emitted by {@link #getWordAndStyle}. */
    private static final Pattern WRAPPER_TAG_PATTERN =
            Pattern.compile("</?(?:html|body)>", Pattern.CASE_INSENSITIVE);

    /** Everything except digits, the ASCII comma and the full-width comma (U+FF0C). */
    private static final Pattern NOT_DIGIT_OR_COMMA = Pattern.compile("[^0-9,\uFF0C]");

    /** A number enclosed in full-width (Chinese) parentheses, U+FF08 ... U+FF09. */
    private static final Pattern ITEM_SCORE_PATTERN =
            Pattern.compile("\uFF08\\s*(\\d+)\\s*\uFF09");

    /** One or more ASCII digits. */
    private static final Pattern DIGITS_PATTERN = Pattern.compile("[0-9]+");

    /** Question number followed by '-' or the ideographic comma. */
    private static final Pattern SMALL_TITLE_PATTERN = Pattern.compile("^([\\d]+[-\\、].*)");

    /**
     * Option letter(s) followed by '-', ':' or the full-width colon (U+FF1A).
     * NOTE(review): the full-width colon is assumed to have been lost from the
     * original pattern the same way the full-width comma/parentheses were.
     */
    private static final Pattern OPTION_PATTERN = Pattern.compile("^([a-zA-Z]+[-\\:\uFF1A].*)");

    /** A Chinese numeral followed by the ideographic comma, e.g. the start of a section heading. */
    private static final Pattern BIG_TITLE_PATTERN = Pattern.compile("[一二三四五六七八九十]、");

    public static String htmlText = "";
    public static String htmlTextTbl = "";
    public static int counter = 0;
    public static int beginPosi = 0;
    public static int endPosi = 0;
    public static int beginArray[];
    public static int endArray[];
    public static String htmlTextArray[];
    public static boolean tblExist = false;

    public static final String inputFile = "C:\\Users\\java\\Downloads\\111222.doc";
    public static final String htmlFile = "E:/abc.html";

    /**
     * Converts the document named by the first argument, or {@link #inputFile}
     * when no argument is given.
     *
     * @param argv optional: path of the .doc file to convert
     */
    public static void main(String argv[]) {
        String source = (argv != null && argv.length > 0) ? argv[0] : inputFile;
        try {
            getWordAndStyle(source);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the directory in which pictures extracted from the document are stored.
     *
     * @return upload base + "upload/wordImage/" + today's formatted date
     */
    public static String wordImageFilePath() {
        return FileUploadPathConfig.FILE_UPLOAD_BASE + "upload/wordImage/" + DateFormatUtil.formatDate(new Date());
    }

    /**
     * Returns the prefix used in generated {@code <img src>} attributes.
     *
     * <p>NOTE(review): this is a hard-coded local path rather than a URL and is
     * independent of {@code FileUploadPathConfig.FILE_UPLOAD_BASE}; confirm it
     * matches the deployment.
     *
     * @return the path prefix, ending with "/"
     */
    public static String wordImgeWebPath() {
        return "D:/var/e_learning/upload/wordImage/" + DateFormatUtil.formatDate(new Date()) + "/";
    }

    /**
     * Reads a Word document character by character, renders it to HTML in
     * {@link #htmlText}, writes the HTML to {@link #htmlFile} and then parses it
     * into the exam model.
     *
     * @param fileName path of the .doc file
     * @throws Exception if the file cannot be read or a picture cannot be written
     */
    public static void getWordAndStyle(String fileName) throws Exception {
        // Reset per-document state so a second conversion does not inherit the previous document's tables.
        tblExist = false;
        counter = -1;
        beginPosi = 0;
        endPosi = 0;
        htmlTextTbl = "";
        beginArray = new int[INITIAL_TABLE_CAPACITY];
        endArray = new int[INITIAL_TABLE_CAPACITY];
        htmlTextArray = new String[INITIAL_TABLE_CAPACITY];

        HWPFDocument doc;
        FileInputStream in = new FileInputStream(new File(fileName));
        try {
            doc = new HWPFDocument(in);
        } finally {
            // The stream is only needed while the document is being constructed.
            closeQuietly(in);
        }

        Range rangetbl = doc.getRange();
        TableIterator it = new TableIterator(rangetbl);

        // Total number of characters in the document.
        int length = doc.characterLength();
        PicturesTable pTable = doc.getPicturesTable();

        String title = doc.getSummaryInformation() == null ? null : doc.getSummaryInformation().getTitle();
        htmlText = "<html><head><title>" + escapeHtml(title) + "</title></head><body>";

        if (it.hasNext()) {
            readTable(it, rangetbl);
        }

        int cur = 0;
        // Buffers consecutive characters that share one style.
        String tempString = "";
        for (int i = 0; i < length - 1; i++) {
            // At the start of a table, emit its pre-rendered HTML and jump past it.
            if (tblExist && cur <= counter && i == beginArray[cur]) {
                htmlText += tempString + htmlTextArray[cur];
                tempString = "";
                i = endArray[cur] - 1;
                cur++;
                continue;
            }

            CharacterRun cr = new Range(i, i + 1, doc).getCharacterRun(0);

            if (pTable.hasPicture(cr)) {
                htmlText += tempString;
                readPicture(pTable, cr);
                tempString = "";
                continue;
            }

            String text = cr.text();
            if (text.length() == 0) {
                continue;
            }
            // The following character decides whether the current style run ends here.
            CharacterRun cr2 = new Range(i + 1, i + 2, doc).getCharacterRun(0);
            char c = text.charAt(0);

            // NOTE(review): these literals may originally have been HTML entities; kept exactly as found.
            if (c == SPACE_ASCII) {
                tempString += " ";
            } else if (c == TABULATION_ASCII) {
                tempString += " ";
            }

            boolean sameStyle = compareCharStyle(cr, cr2);
            if (sameStyle && c != ENTER_ASCII) {
                tempString += escapeHtml(text);
            } else {
                String fontStyle = "<span style='font-family:" + escapeHtml(cr.getFontName()) + ";font-size:"
                        + formatHalfPoints(cr.getFontSize()) + "pt;color:" + getHexColor(cr.getIco24()) + ";";
                if (cr.isBold()) {
                    fontStyle += "font-weight:bold;";
                }
                if (cr.isItalic()) {
                    fontStyle += "font-style:italic;";
                }
                htmlText += fontStyle + "' >" + tempString + escapeHtml(text);
                htmlText += "</span>";
                tempString = "";
            }
            // A carriage return ends the paragraph.
            if (c == ENTER_ASCII) {
                htmlText += "<br/>";
            }
        }

        htmlText += tempString + "</body></html>";
        writeFile(htmlText);
        System.out.println("------------WordToHtml转换成功----------------");
        analysisHtmlString(htmlText);
        System.out.println("------------WordToHtml模型化成功----------------");
    }

    /**
     * Renders every table of the document to an HTML string and records its
     * character offsets in {@link #beginArray}/{@link #endArray} and its markup
     * in {@link #htmlTextArray}; {@link #counter} ends as the last used index.
     *
     * @param it iterator over the document's tables
     * @param rangetbl the document range (unused; kept for interface compatibility)
     * @throws Exception if the underlying document cannot be read
     */
    public static void readTable(TableIterator it, Range rangetbl) throws Exception {
        htmlTextTbl = "";
        counter = -1;
        while (it.hasNext()) {
            tblExist = true;
            Table tb = (Table) it.next();
            beginPosi = tb.getStartOffset();
            endPosi = tb.getEndOffset();

            counter = counter + 1;
            // Grow the arrays instead of overflowing on documents with many tables.
            ensureTableCapacity(counter + 1);
            beginArray[counter] = beginPosi;
            endArray[counter] = endPosi;

            StringBuilder tbl = new StringBuilder("<table border>");
            for (int i = 0; i < tb.numRows(); i++) {
                TableRow tr = tb.getRow(i);
                tbl.append("<tr>");
                for (int j = 0; j < tr.numCells(); j++) {
                    TableCell td = tr.getCell(j);
                    tbl.append("<td width=").append(td.getWidth()).append(">")
                            .append(renderCell(td)).append("</td>");
                }
                tbl.append("</tr>");
            }
            tbl.append("</table>");

            htmlTextTbl = tbl.toString();
            htmlTextArray[counter] = htmlTextTbl;
        }
    }

    /**
     * Builds the inner HTML of one cell: a single paragraph is emitted bare,
     * several paragraphs are each wrapped in {@code <p>}, and an empty cell
     * yields a single space.
     */
    private static String renderCell(TableCell td) {
        int paragraphs = td.numParagraphs();
        StringBuilder cell = new StringBuilder();
        for (int k = 0; k < paragraphs; k++) {
            Paragraph para = td.getParagraph(k);
            String s = escapeHtml(para.text().trim());
            if (s.length() == 0) {
                continue;
            }
            if (paragraphs > 1) {
                cell.append("<p>").append(s).append("</p>");
            } else {
                cell.append(s);
            }
        }
        if (cell.length() == 0) {
            cell.append(" ");
        }
        return cell.toString();
    }

    /** Makes sure the three per-table arrays can hold at least {@code needed} entries. */
    private static void ensureTableCapacity(int needed) {
        int target = Math.max(needed * 2, INITIAL_TABLE_CAPACITY);
        if (beginArray == null) {
            beginArray = new int[target];
        } else if (beginArray.length < needed) {
            beginArray = Arrays.copyOf(beginArray, target);
        }
        if (endArray == null) {
            endArray = new int[target];
        } else if (endArray.length < needed) {
            endArray = Arrays.copyOf(endArray, target);
        }
        if (htmlTextArray == null) {
            htmlTextArray = new String[target];
        } else if (htmlTextArray.length < needed) {
            htmlTextArray = Arrays.copyOf(htmlTextArray, target);
        }
    }

    /**
     * Extracts the picture anchored at the given character run, saves it under
     * {@link #wordImageFilePath()} and appends an {@code <img>} tag to {@link #htmlText}.
     *
     * @param pTable the document's picture table
     * @param cr the character run that holds the picture
     * @throws Exception if the target directory cannot be created or the image cannot be written
     */
    public static void readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
        Picture pic = pTable.extractPicture(cr, false);
        if (pic == null) {
            return;
        }
        // File name suggested by POI.
        String afileName = pic.suggestFullFileName();

        // Resolve the dated directory once so directory and file cannot disagree across midnight.
        File dir = new File(wordImageFilePath());
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create image directory: " + dir);
        }

        OutputStream out = new FileOutputStream(new File(dir, afileName));
        try {
            pic.writeImageContent(out);
        } finally {
            out.close();
        }

        String webPath = wordImgeWebPath() + afileName;
        htmlText += "<img src='" + webPath
                + "' mce_src='" + webPath + "' />";
    }

    /**
     * Tells whether two character runs can share one {@code <span>}.
     *
     * @param cr1 first run
     * @param cr2 second run
     * @return true when bold, italic, font name, font size and both colour values are equal
     */
    public static boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2) {
        String font1 = cr1.getFontName();
        String font2 = cr2.getFontName();
        boolean sameFont = (font1 == null) ? (font2 == null) : font1.equals(font2);
        return sameFont
                && cr1.isBold() == cr2.isBold()
                && cr1.isItalic() == cr2.isItalic()
                && cr1.getFontSize() == cr2.getFontSize()
                && cr1.getColor() == cr2.getColor()
                // The span colour is rendered from getIco24(), so it must take part in the comparison.
                && cr1.getIco24() == cr2.getIco24();
    }

    /** Returns bits 0-7 of {@code c}. */
    public static int red(int c) {
        return c & 0XFF;
    }

    /** Returns bits 8-15 of {@code c}. */
    public static int green(int c) {
        return (c >> 8) & 0XFF;
    }

    /** Returns bits 16-23 of {@code c}. */
    public static int blue(int c) {
        return (c >> 16) & 0XFF;
    }

    /** Swaps the low and high colour bytes of {@code c}, giving a 0xRRGGBB value. */
    public static int rgb(int c) {
        return (red(c) << 16) | (green(c) << 8) | blue(c);
    }

    /**
     * Left-pads a hex string with zeros to six characters.
     *
     * @param rgb hex digits, not null
     * @return the padded string; inputs of six or more characters are returned unchanged
     */
    public static String rgbToSix(String rgb) {
        StringBuilder padded = new StringBuilder(6);
        for (int pad = 6 - rgb.length(); pad > 0; pad--) {
            padded.append('0');
        }
        return padded.append(rgb).toString();
    }

    /**
     * Converts a colour value as returned by {@code CharacterRun.getIco24()} to
     * a CSS hex colour.
     *
     * @param color the colour value; -1 is treated as 0 (black)
     * @return "#rrggbb"
     */
    public static String getHexColor(int color) {
        int effective = (color == -1) ? 0 : color;
        return "#" + rgbToSix(Integer.toHexString(rgb(effective)));
    }

    /**
     * Writes the given text to {@link #htmlFile} encoded as GB2312. Failures
     * are reported on stderr and otherwise ignored (best effort).
     *
     * @param s the text to write
     */
    public static void writeFile(String s) {
        FileOutputStream fos = null;
        BufferedWriter bw = null;
        try {
            fos = new FileOutputStream(new File(htmlFile));
            bw = new BufferedWriter(new OutputStreamWriter(fos, "GB2312"));
            bw.write(s);
            bw.flush();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            closeQuietly(bw);
            closeQuietly(fos);
        }
    }

    /**
     * Parses the generated HTML into the exam model and discards the result.
     * Kept for backward compatibility; use {@link #parseExamModel(String)} to
     * obtain the model.
     *
     * @param s HTML produced by {@link #getWordAndStyle(String)}
     */
    public static void analysisHtmlString(String s) {
        parseExamModel(s);
    }

    /**
     * Parses the generated HTML into a list of big-title maps in document order.
     *
     * <p>Each big-title map holds "bigTilete" (up to three characters after the
     * section numeral), "bigNum"/"bigScore" (the two numbers found by
     * {@link #numScore(String)}) and "smalMaps". Each entry of "smalMaps" holds
     * "smalTilete" (question HTML), "smalScore" and "sleMaps"; each entry of
     * "sleMaps" holds "seleteItem" (first character of the option line) and
     * "seleteQuest" (option HTML).
     *
     * @param s HTML whose lines are separated by {@code <br/>}; null yields an empty list
     * @return the big-title maps, never null
     */
    public static List<Map<String, Object>> parseExamModel(String s) {
        List<Map<String, Object>> bigTiMaps = new ArrayList<Map<String, Object>>();
        if (s == null) {
            return bigTiMaps;
        }

        // Drop the document wrapper so the title text and wrapper tags do not pollute the first and last lines.
        String body = HEAD_PATTERN.matcher(s).replaceAll("");
        body = WRAPPER_TAG_PATTERN.matcher(body).replaceAll("");

        // Keep only lines that still contain text once the tags are removed.
        List<String> ws = new ArrayList<String>();
        String[] lines = body.split("<br/>");
        for (int i = 0; i < lines.length; i++) {
            if (StringUtils.isNotBlank(stripTags(lines[i]).trim())) {
                ws.add(lines[i].trim());
            }
        }

        List<Map<String, Object>> smalMaps = new ArrayList<Map<String, Object>>();
        List<Map<String, Object>> sleMaps = new ArrayList<Map<String, Object>>();
        // Continuation lines that belong to the item found next while walking backwards.
        String pending = "";

        // Walk from the end: an item owns the unclassified lines that follow it.
        for (int j = ws.size() - 1; j >= 0; j--) {
            String html = ws.get(j);
            String delHtml = stripTags(html).trim();

            if (isSelecteTitele(delHtml)) {
                Map<String, Object> sleMap = new HashMap<String, Object>();
                sleMap.put("seleteItem", delHtml.substring(0, 1));
                sleMap.put("seleteQuest", html + pending);
                // Insert at the front so the list ends up in document order.
                sleMaps.add(0, sleMap);
                pending = "";
            } else if (isTitele(delHtml)) {
                Map<String, Object> smalMap = new HashMap<String, Object>();
                smalMap.put("smalTilete", html + pending);
                smalMap.put("smalScore", itemNum(delHtml) + "");
                smalMap.put("sleMaps", sleMaps);
                smalMaps.add(0, smalMap);
                // The next question (walking backwards) gets its own option list.
                sleMaps = new ArrayList<Map<String, Object>>();
                pending = "";
            } else if (isBigTilete(delHtml)) {
                String[] numAndScore = numScore(delHtml).split(",");
                Map<String, Object> bigTiMap = new HashMap<String, Object>();
                bigTiMap.put("bigTilete", bigTitleName(delHtml));
                bigTiMap.put("bigNum", numAndScore[0]);
                bigTiMap.put("bigScore", numAndScore[1]);
                bigTiMap.put("smalMaps", smalMaps);
                bigTiMaps.add(0, bigTiMap);
                smalMaps = new ArrayList<Map<String, Object>>();
                sleMaps = new ArrayList<Map<String, Object>>();
                pending = "";
            } else {
                pending = html + pending;
            }
        }
        return bigTiMaps;
    }

    /** Removes every HTML tag from the given fragment. */
    private static String stripTags(String html) {
        return TAG_PATTERN.matcher(html).replaceAll("");
    }

    /** Returns up to three characters following the first section numeral and its ideographic comma. */
    private static String bigTitleName(String delHtml) {
        Matcher m = BIG_TITLE_PATTERN.matcher(delHtml);
        int start = m.find() ? m.end() : 0;
        int end = Math.min(start + 3, delHtml.length());
        return delHtml.substring(start, end);
    }

    /**
     * Extracts the comma-separated numbers of a section heading, typically the
     * question count and the total score.
     *
     * @param delHtml heading text without HTML tags
     * @return the digits and commas of the text (full-width commas normalised to
     *         ASCII) when the first two comma-separated fields are numbers,
     *         otherwise "0,0"
     */
    public static String numScore(String delHtml) {
        if (delHtml == null) {
            return "0,0";
        }
        String s = NOT_DIGIT_OR_COMMA.matcher(delHtml).replaceAll("").replace('\uFF0C', ',');
        String[] parts = s.split(",");
        // Callers read fields 0 and 1, so both must exist and be numeric.
        if (parts.length < 2 || !isNumeric(parts[0]) || !isNumeric(parts[1])) {
            return "0,0";
        }
        return s;
    }

    /**
     * Extracts the score of a question: the first number enclosed in full-width
     * (Chinese) parentheses.
     *
     * @param delHtml question text without HTML tags
     * @return the score, or 0 when none is found or it does not fit in an int
     */
    public static int itemNum(String delHtml) {
        if (delHtml == null) {
            return 0;
        }
        Matcher matcher = ITEM_SCORE_PATTERN.matcher(delHtml);
        if (!matcher.find()) {
            return 0;
        }
        try {
            return Integer.parseInt(matcher.group(1));
        } catch (NumberFormatException tooLarge) {
            return 0;
        }
    }

    /**
     * Tells whether the string consists solely of one or more ASCII digits.
     *
     * @param str the string to test
     * @return false for null and for the empty string
     */
    public static boolean isNumeric(String str) {
        return str != null && DIGITS_PATTERN.matcher(str).matches();
    }

    /**
     * Tells whether the line starts with a question number followed by '-' or
     * the ideographic comma.
     *
     * @param str line text without HTML tags
     * @return true for a question (small title) line
     */
    public static boolean isTitele(String str) {
        return str != null && SMALL_TITLE_PATTERN.matcher(str).matches();
    }

    /**
     * Tells whether the line starts with Latin letters followed by '-', ':' or
     * a full-width colon.
     *
     * @param str line text without HTML tags
     * @return true for a choice-option line
     */
    public static boolean isSelecteTitele(String str) {
        return str != null && OPTION_PATTERN.matcher(str).matches();
    }

    /**
     * Tells whether the line contains a Chinese numeral (one to ten) followed by
     * the ideographic comma.
     *
     * @param str line text without HTML tags
     * @return true for a section (big title) line
     */
    public static boolean isBigTilete(String str) {
        return str != null && BIG_TITLE_PATTERN.matcher(str).find();
    }

    /** Formats a size given in half-points as points, keeping the half (21 -> "10.5"). */
    private static String formatHalfPoints(int halfPoints) {
        return (halfPoints / 2) + ((halfPoints % 2 != 0) ? ".5" : "");
    }

    /** Escapes the characters that are significant in HTML text and attribute values; null becomes "". */
    private static String escapeHtml(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            switch (ch) {
                case '&':
                    sb.append("&amp;");
                    break;
                case '<':
                    sb.append("&lt;");
                    break;
                case '>':
                    sb.append("&gt;");
                    break;
                case '"':
                    sb.append("&quot;");
                    break;
                case '\'':
                    sb.append("&#39;");
                    break;
                default:
                    sb.append(ch);
            }
        }
        return sb.toString();
    }

    /** Closes the resource if it is not null, ignoring any failure of close(). */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done when close() fails.
        }
    }
}
so 我们已经完成所有步骤。
文章出自:http://www.cnblogs.com/libaoting/p/wordToMap.html 可自由引用,但请注明来源,谢谢。