Word试卷文档模型化解析存储到数据库

最近在做一套在线考试系统,很多人反映新增试题比较麻烦(需要逐题录入),于是乎就做了一个试卷批量导入的功能。

poi实现word转html

模型化解析html

html转Map数组

Map数组(数组的操作处理不做说明)

1.导jar包。 

2.word试卷导入模板

链接:http://pan.baidu.com/s/1gfK6g5H

3.代码实现

  1 package com.web.onlinexam.util;
  2 
  3 import java.io.BufferedWriter;  
  4 import java.io.File;  
  5 import java.io.FileInputStream;  
  6 import java.io.FileNotFoundException;  
  7 import java.io.FileOutputStream;  
  8 import java.io.IOException;  
  9 import java.io.OutputStream;  
 10 import java.io.OutputStreamWriter;  
 11 import java.io.PrintWriter;
 12 import java.util.ArrayList;
 13 import java.util.Date;
 14 import java.util.HashMap;
 15 import java.util.LinkedList;
 16 import java.util.List;
 17 import java.util.Map;
 18 import java.util.regex.Matcher;
 19 import java.util.regex.Pattern;
 20 
 21 import org.apache.commons.lang.StringUtils;
 22 import org.apache.poi.hwpf.HWPFDocument;  
 23 import org.apache.poi.hwpf.model.PicturesTable;  
 24 import org.apache.poi.hwpf.usermodel.CharacterRun;  
 25 import org.apache.poi.hwpf.usermodel.Picture;  
 26 import org.apache.poi.hwpf.usermodel.Range;  
 27 import org.apache.poi.hwpf.usermodel.Paragraph;     
 28 import org.apache.poi.hwpf.usermodel.Table;     
 29 import org.apache.poi.hwpf.usermodel.TableCell;     
 30 import org.apache.poi.hwpf.usermodel.TableIterator;     
 31 import org.apache.poi.hwpf.usermodel.TableRow;  
 32 
 33 import com.common.util.DateFormatUtil;
 34 import com.common.util.FileUploadPathConfig;
 35 
/**
 * Model-based parser for Word exam-paper documents (class name: WordToHtml).
 *
 * @author <a href="mailto:thoslbt@163.com">Thos</a>
 * @version V1.0
 */
 47 public class WordToHtml {
 48 
 49     /**
 50      * 回车符ASCII码
 51      */
 52     private static final short ENTER_ASCII = 13;
 53 
 54     /**
 55      * 空格符ASCII码
 56      */
 57     private static final short SPACE_ASCII = 32;
 58 
 59     /**
 60      * 水平制表符ASCII码
 61      */
 62     private static final short TABULATION_ASCII = 9;
 63 
 64     public static String htmlText = "";
 65     public static String htmlTextTbl = "";
 66     public static int counter=0;
 67     public static int beginPosi=0;
 68     public static int endPosi=0;
 69     public static int beginArray[];
 70     public static int endArray[];
 71     public static String htmlTextArray[];
 72     public static boolean tblExist=false;
 73 
 74     public static final String inputFile="C:\\Users\\java\\Downloads\\111222.doc";
 75     public static final String htmlFile="E:/abc.html";
 76 
 77     public static void main(String argv[])
 78     {        
 79         try {
 80             getWordAndStyle(inputFile);
 81         } catch (Exception e) {
 82             e.printStackTrace();
 83         }
 84     }
 85 
 86     /**
 87      * word文档图片存储路径
 88      * @return
 89      */
 90     public static String wordImageFilePath(){
 91 
 92         return  FileUploadPathConfig.FILE_UPLOAD_BASE+"upload/wordImage/"+ DateFormatUtil.formatDate(new Date());
 93     }
 94 
 95     /**
 96      *  word文档图片Web访问路径
 97      * @return
 98      */
 99     public static String wordImgeWebPath(){
100 
101         return  "D:/var/e_learning/upload/wordImage/"+ DateFormatUtil.formatDate(new Date())+"/";
102     }
103 
104     /**
105      * 读取每个文字样式
106      * 
107      * @param fileName
108      * @throws Exception
109      */
110 
111 
112     public static void getWordAndStyle(String fileName) throws Exception {
113         FileInputStream in = new FileInputStream(new File(fileName));
114         HWPFDocument doc = new HWPFDocument(in);
115 
116         Range rangetbl = doc.getRange();//得到文档的读取范围   
117         TableIterator it = new TableIterator(rangetbl); 
118         int num=100;         
119 
120         beginArray=new int[num];
121         endArray=new int[num];
122         htmlTextArray=new String[num];
123 
124         // 取得文档中字符的总数
125         int length = doc.characterLength();
126         // 创建图片容器
127         PicturesTable pTable = doc.getPicturesTable();
128 
129         htmlText = "<html><head><title>" + doc.getSummaryInformation().getTitle() + "</title></head><body>";
130         // 创建临时字符串,好加以判断一串字符是否存在相同格式
131 
132         if(it.hasNext())
133         {
134             readTable(it,rangetbl);
135         }
136 
137         int cur=0;
138 
139         String tempString = "";
140         for (int i = 0; i < length - 1; i++) {
141             // 整篇文章的字符通过一个个字符的来判断,range为得到文档的范围
142             Range range = new Range(i, i + 1, doc);
143 
144             CharacterRun cr = range.getCharacterRun(0); 
145             
146             if(tblExist)
147             {
148                 if(i==beginArray[cur])
149                 {         
150                     htmlText+=tempString+htmlTextArray[cur];
151                     tempString="";
152                     i=endArray[cur]-1;
153                     cur++;
154                     continue;
155                 }
156             }
157             if (pTable.hasPicture(cr)) {
158                 htmlText +=  tempString ;                
159                 // 读写图片                
160                 readPicture(pTable, cr);
161                 tempString = "";                
162             } 
163             else {
164 
165                 Range range2 = new Range(i + 1, i + 2, doc);
166                 // 第二个字符
167                 CharacterRun cr2 = range2.getCharacterRun(0);
168                 char c = cr.text().charAt(0);
169 
170                 // 判断是否为空格符
171                 if (c == SPACE_ASCII)
172                     tempString += "&nbsp;";
173                 // 判断是否为水平制表符
174                 else if (c == TABULATION_ASCII)
175                     tempString += "&nbsp;&nbsp;&nbsp;&nbsp;";
176                 // 比较前后2个字符是否具有相同的格式
177                 boolean flag = compareCharStyle(cr, cr2);
178                 if (flag&&c !=ENTER_ASCII)
179                     tempString += cr.text();
180                 else {
181                     String fontStyle = "<span style='font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2
182                     + "pt;color:"+getHexColor(cr.getIco24())+";";
183 
184                     if (cr.isBold())
185                         fontStyle += "font-weight:bold;";
186                     if (cr.isItalic())
187                         fontStyle += "font-style:italic;";
188 
189                     htmlText += fontStyle + "' >" + tempString + cr.text();
190                     htmlText +="</span>";
191                     tempString = "";
192                 }
193                 // 判断是否为回车符
194                 if (c == ENTER_ASCII)
195                     htmlText += "<br/>";
196 
197             }
198         }
199 
200         htmlText += tempString+"</body></html>";
201         //生成html文件
202         writeFile(htmlText);
203         System.out.println("------------WordToHtml转换成功----------------");
204         //word试卷数据模型化
205         analysisHtmlString(htmlText);
206         System.out.println("------------WordToHtml模型化成功----------------");
207     }
208 
209     /**
210      * 读写文档中的表格
211      * 
212      * @param pTable
213      * @param cr
214      * @throws Exception
215      */
216     public static void readTable(TableIterator it, Range rangetbl) throws Exception {
217 
218         htmlTextTbl="";
219         //迭代文档中的表格  
220 
221         counter=-1;
222         while (it.hasNext()) 
223         { 
224             tblExist=true;
225             htmlTextTbl="";
226             Table tb = (Table) it.next();    
227             beginPosi=tb.getStartOffset() ;
228             endPosi=tb.getEndOffset();
229 
230             //System.out.println("............"+beginPosi+"...."+endPosi);
231             counter=counter+1;
232             //迭代行,默认从0开始
233             beginArray[counter]=beginPosi;
234             endArray[counter]=endPosi;
235 
236             htmlTextTbl+="<table border>";
237             for (int i = 0; i < tb.numRows(); i++) {      
238                 TableRow tr = tb.getRow(i);   
239 
240                 htmlTextTbl+="<tr>";
241                 //迭代列,默认从0开始   
242                 for (int j = 0; j < tr.numCells(); j++) {      
243                     TableCell td = tr.getCell(j);//取得单元格
244                     int cellWidth=td.getWidth();
245 
246                     //取得单元格的内容   
247                     for(int k=0;k<td.numParagraphs();k++){      
248                         Paragraph para =td.getParagraph(k);      
249                         String s = para.text().toString().trim();   
250                         if(s=="")
251                         {
252                             s=" ";
253                         }
254                         htmlTextTbl += "<td width="+cellWidth+ ">"+s+"</td>";
255                     }       
256                 }      
257             }   
258             htmlTextTbl+="</table>" ;    
259             htmlTextArray[counter]=htmlTextTbl;
260 
261         } //end while 
262     }    
263 
264     /**
265      * 读写文档中的图片
266      * 
267      * @param pTable
268      * @param cr
269      * @throws Exception
270      */
271     public static void readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
272         // 提取图片
273         Picture pic = pTable.extractPicture(cr, false);
274         // 返回POI建议的图片文件名
275         String afileName = pic.suggestFullFileName();
276 
277         File file = new File(wordImageFilePath());
278         System.out.println(file.mkdirs());
279         OutputStream out = new FileOutputStream(new File( wordImageFilePath()+ File.separator + afileName));
280         pic.writeImageContent(out);
281         htmlText += "<img src='"+wordImgeWebPath()+ afileName
282         + "' mce_src='"+wordImgeWebPath()+ afileName + "' />";
283     }
284 
285 
286     public static boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2) 
287     {
288         boolean flag = false;
289         if (cr1.isBold() == cr2.isBold() && cr1.isItalic() == cr2.isItalic() && cr1.getFontName().equals(cr2.getFontName()) 
290                 && cr1.getFontSize() == cr2.getFontSize()&& cr1.getColor() == cr2.getColor()) 
291         {
292             flag = true;
293         }
294         return flag;
295     }
296 
297     /*** 字体颜色模块start ********/
298     public static int red(int c) {  
299         return c & 0XFF;  
300     }  
301 
302     public static int green(int c) {  
303         return (c >> 8) & 0XFF;  
304     }  
305 
306     public static int blue(int c) {  
307         return (c >> 16) & 0XFF;  
308     }  
309 
310     public static int rgb(int c) {  
311         return (red(c) << 16) | (green(c) << 8) | blue(c);  
312     }  
313 
314     public static String rgbToSix(String rgb) {  
315         int length = 6 - rgb.length();  
316         String str = "";  
317         while (length > 0) {  
318             str += "0";  
319             length--;  
320         }  
321         return str + rgb;  
322     }  
323 
324 
325     public static String getHexColor(int color) {  
326         color = color == -1 ? 0 : color;  
327         int rgb = rgb(color);  
328         return "#" + rgbToSix(Integer.toHexString(rgb));  
329     }  
330     /** 字体颜色模块end ******/
331 
332     /**
333      * 写文件
334      * 
335      * @param s
336      */
337     public static void writeFile(String s) {
338         FileOutputStream fos = null;
339         BufferedWriter bw = null;
340         PrintWriter writer = null;
341         try {
342             File file = new File(htmlFile);
343             fos = new FileOutputStream(file);
344             bw = new BufferedWriter(new OutputStreamWriter(fos));
345             bw.write(s);
346             bw.close();
347             fos.close();
348             //编码转换
349             writer = new PrintWriter(file, "GB2312");
350             writer.write(s);
351             writer.flush();
352             writer.close();
353         } catch (FileNotFoundException fnfe) {
354             fnfe.printStackTrace();
355         } catch (IOException ioe) {
356             ioe.printStackTrace();
357         }
358 
359     }
360 
361     /**
362      * 分析html
363      * @param s
364      */
365     public static void analysisHtmlString(String s){
366 
367         String q[] = s.split("<br/>");
368 
369         LinkedList<String> list = new LinkedList<String>();
370 
371         //清除空字符
372         for (int i = 0; i < q.length; i++) {
373             if(StringUtils.isNotBlank(q[i].toString().replaceAll("</?[^>]+>","").trim())){
374 
375                 list.add(q[i].toString().trim());
376             }
377         }
378         String[] result = {};
379         String ws[]=list.toArray(result);
380         int singleScore = 0;
381         int multipleScore = 0;
382         int fillingScore = 0;
383         int judgeScore = 0;
384         int askScore = 0;
385         int singleNum = 0;
386         int multipleNum = 0;
387         int fillingNum = 0;
388         int judgeNum = 0;
389         int askNum = 0;
390         /***********试卷基础数据赋值*********************/
391         for (int i = 0; i < ws.length; i++) {
392             String delHtml=ws[i].toString().replaceAll("</?[^>]+>","").trim();//去除html
393             if(delHtml.contains("、单选题")){
394                 String numScore=numScore(delHtml);
395                 singleNum= Integer.parseInt(numScore.split(",")[0]) ;
396                 singleScore=Integer.parseInt(numScore.split(",")[1]) ;
397             }else if(delHtml.contains("、多择题")){
398                 String numScore=numScore(delHtml);
399                 multipleNum= Integer.parseInt(numScore.split(",")[0]) ;
400                 multipleScore=Integer.parseInt(numScore.split(",")[1]) ;
401             }else if(delHtml.contains("、填空题")){
402                 String numScore=numScore(delHtml);
403                 fillingNum= Integer.parseInt(numScore.split(",")[0]) ;
404                 fillingScore=Integer.parseInt(numScore.split(",")[1]) ;
405             }else if(delHtml.contains("、判断题")){
406                 String numScore=numScore(delHtml);
407                 judgeNum= Integer.parseInt(numScore.split(",")[0]) ;
408                 judgeScore=Integer.parseInt(numScore.split(",")[1]) ;
409             }else if(delHtml.contains("、问答题")){
410                 String numScore=numScore(delHtml);
411                 askNum= Integer.parseInt(numScore.split(",")[0]) ;
412                 askScore=Integer.parseInt(numScore.split(",")[1]) ;
413             }
414 
415         }
416         /**************word试卷数据模型化****************/
417         List<Map<String, Object>> bigTiMaps = new ArrayList<Map<String,Object>>();
418         List<Map<String, Object>> smalMaps = new ArrayList<Map<String,Object>>();
419         List<Map<String, Object>> sleMaps = new ArrayList<Map<String,Object>>();
420         String htmlText="";
421         int smalScore=0;
422         for (int j = ws.length-1; j>=0; j--) {
423             String html= ws[j].toString().trim();//html格式
424             String delHtml=ws[j].toString().replaceAll("</?[^>]+>","").trim();//去除html
425             if(!isSelecteTitele(delHtml)&&!isTitele(delHtml)&&!isBigTilete(delHtml)){//
426                 if(isTitele(delHtml)){
427                     smalScore=itemNum(delHtml);
428                 }
429                 htmlText=html+htmlText;
430             }else if(isSelecteTitele(delHtml)){//选择题选择项
431                 Map<String, Object> sleMap = new HashMap<String, Object>();//选择题选择项
432                 sleMap.put("seleteItem", delHtml.substring(0, 1));
433                 sleMap.put("seleteQuest", html+htmlText);
434                 sleMaps.add(sleMap);
435             }else if(isTitele(delHtml)){//小标题
436                 Map<String, Object> smalMap = new HashMap<String, Object>();//小标题
437                 smalMap.put("smalTilete", html+htmlText);
438                 smalMap.put("smalScore", smalScore>0?smalScore+"":itemNum(delHtml)+"");
439                 smalMap.put("sleMaps", sleMaps);
440                 smalMaps.add(smalMap);
441             }else if(isBigTilete(delHtml)){//大标题
442                 Map<String, Object> bigTiMap = new HashMap<String, Object>();//大标题
443                 bigTiMap.put("bigTilete", delHtml.substring(2, 5));
444                 bigTiMap.put("smalMaps", smalMaps);
445                 bigTiMaps.add(bigTiMap);
446             }    
447 
448         }
449         //System.out.println(bigTiMaps.toString());
450     }
451 
452     //获取大题-题目数量以及题目总计分数
453     public static String numScore(String delHtml){
454 
455         String regEx="[^0-9+,|,+^0-9]";   
456         Pattern p = Pattern.compile(regEx);   
457         Matcher m = p.matcher(delHtml);
458         String s=m.replaceAll("").trim();
459         if(StringUtils.isNotBlank(s)){
460             if(s.contains(",")){
461                 return s;
462             }else if(s.contains(",")){
463                 return s.replace(",", ",");
464             }else{
465                 return "0,0";
466             }
467         }else{
468             return "0,0";
469         }
470 
471     }
472     //获取每小题分数
473     public static int itemNum(String delHtml){
474         Pattern pattern = Pattern.compile("((.*?))"); //中文括号 
475         Matcher matcher = pattern.matcher(delHtml);
476         if (matcher.find()&&isNumeric(matcher.group(1))){
477             return Integer.parseInt(matcher.group(1));
478         }else {
479             return 0;
480         }
481     }
482     //判断Str是否是 数字
483     public static boolean isNumeric(String str){ 
484         Pattern pattern = Pattern.compile("[0-9]*"); 
485         return pattern.matcher(str).matches();    
486     } 
487     //判断Str是否存在小标题号
488     public static boolean isTitele(String str){
489         Pattern pattern = Pattern.compile("^([\\d]+[-\\、].*)"); 
490         return pattern.matcher(str).matches();
491     }
492     //判断Str是否是选择题选择项
493     public static boolean isSelecteTitele(String str){
494         Pattern pattern = Pattern.compile("^([a-zA-Z]+[-\\:].*)"); 
495         return pattern.matcher(str).matches();
496     }
497     //判断Str是否是大标题
498     public static boolean isBigTilete(String str){
499         boolean iso= false ;
500         if(str.contains("一、")){
501             iso=true;
502         }else if(str.contains("二、")){
503             iso=true;
504         }else if(str.contains("三、")){
505             iso=true;
506         }else if(str.contains("四、")){
507             iso=true;
508         }else if(str.contains("五、")){
509             iso=true;
510         }else if(str.contains("六、")){
511             iso=true;
512         }else if(str.contains("七、")){
513             iso=true;
514         }else if(str.contains("八、")){
515             iso=true;
516         }
517         return iso;
518     }
519 }
至此,我们已经完成了所有步骤。
文章出自:http://www.cnblogs.com/libaoting/p/wordToMap.html
可自由引用,但请注明来源,谢谢。 
posted @ 2015-03-02 16:27  礼拜天001  阅读(6268)  评论(8)  编辑  收藏  举报