在JAVA下实现数据库对word文件的读取与字段的提取

  实现excel中的数据导入数据库,在java或是C#下是不难实现的,即使想实现在txt中的数据导入也不是什么难事,但是,最近接了个任务,学校要建一个英文版的教学平台,然后各个学院的课程资料与简介什么的都是英文的,学校有20个学院,每个学院多的有两百来个科目,少的也有百八十种,可他偏偏就是个word,而且,做的格式很不规范。这可愁着我了,我首先想到的是POI,于是google了一下,原来真的很容易实现,这个后面的代码可以发上去,可以实现03版,和07版的。差别主要是jar包的问题。03的要3个jar包,07的需要7个jar包。

 1 * POI 读取 word 2003 和 word 2007 中文字内容的测试类<br />
2 * @createDate 2009-07-25
3 * @author Carl He
4 */
5 public class Test {
6 public static void main(String[] args) {
7 try {
8 ////word 2003: 图片不会被读取
9 InputStream is = new FileInputStream(new File("files\\2003.doc"));
10 WordExtractor ex = new WordExtractor(is);//is是WORD文件的InputStream
11 String text2003 = ex.getText();
12
13 System.out.println(text2003);
14 //对字符串进行分解
15
16 //word 2007 图片不会被读取, 表格中的数据会被放在字符串的最后
17 OPCPackage opcPackage = POIXMLDocument.openPackage("files\\2007.docx");
18 POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
19 String text2007 = extractor.getText();
20 System.out.println(text2007);
21
22 } catch (Exception e) {
23 e.printStackTrace();
24 }
25 }
26 }



  然后,重要的问题是,如何从word的字段中抓去文件才是关键,因为他们提供的word文件并不是excel,并不能直接导入,我还是果断上一个word文件吧,这样好理解:

Course Description of Biochemistry

Course Name: Biochemistry              Nature of CourseCompulsory course

Course Code: B1700025                 Total Credits: 5.0

Total Credit Hours80                 Lecture Hours80

Experimental Hours: 0                 Oriented Majors: Bioscience, Biotechnology

Prerequisite Courses:

Penner:                               Validator(s):

Briefing of Course Content:

Biochemistry is a science exploring the chemical compositions and chemical reactions during life activitiesof living organisms.  It is an important compulsive fundamental course for undergraduates majoring in bioscience and biotechnology. The main content of this course includes 1. The structure, function and the relationship between the structure and function of biological macromolecule such as protein and nucleic acid; 2. The metabolisms and regulation of biological macromolecules including carbohydrate, lipid, protein, nucleic acid etc.; 3. The transfer and expression of genetic information.

我把截取的java文件源代码也发一下吧。本来想用正则的,但是正则不大会用,只好从字符串方面下手来截取了,上代码共同交流。

 

 1 package qnit.course;
2
3 import java.io.File;
4 import java.util.ArrayList;
5
6 public class Directory {
7 private ArrayList nameList = new ArrayList();
8 private static String dirName = "d:\\EclipseWorkSpace\\Test\\files";
9
10 public void getSubFile(String FileName) {
11 File parentF = new File(FileName);
12 if (!parentF.exists()) {
13 System.out.println("文件或目录不存在");
14 return;
15 }
16 if (parentF.isFile()) {
17 nameList.add(parentF.getAbsoluteFile());
18 return;
19 }
20 String[] subFiles = parentF.list();
21 for (int i = 0; i < subFiles.length; i++) {
22 getSubFile(dirName + "/" + subFiles[i]);
23 }
24 }
25
26 public ArrayList getNameList() {
27 return nameList;
28 }
29 public static void main(String[] args){
30 Directory d=new Directory();
31 d.getSubFile("d:\\EclipseWorkSpace\\Test\\files");
32 }
33 }

 

package qnit.course;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

import org.apache.poi.hwpf.extractor.WordExtractor;

public class FileTool {

/**
* 列出目录
*
@throws Exception
*/
public static void listDirs()throws Exception{

File f=new File("d:\\EclipseWorkSpace\\Test\\files\\英文课程简介");
String[] dirs=f.list();
for(int i=0;i<dirs.length;i++){
// System.out.println(dirs[i]);
listFiles(getDeptID(dirs[i].trim()),dirs[i]);
}
}
public static void main(String[] args)throws Exception{
listDirs();

}
/**
* 列出文件名
*
@throws Exception
*/
public static void listFiles(int deptID,String dirName)throws Exception{

File f=new File("d:\\EclipseWorkSpace\\Test\\files\\英文课程简介\\"+dirName);
String[] files=f.list();
for(int i=0;i<files.length;i++){
//System.out.println("d:\\EclipseWorkSpace\\Test\\files\\英文课程简介\\"+dirName+"\\"+files[i]);
//列出文件名
String docName=files[i];
if(docName.length()>30)
docName=docName.substring(30, docName.length()-4);
//System.out.println(docName+"开始写入数据库");
//从word中读取信息

boolean res=insertCoursesIntoDB(deptID,docName,getDetailFromWord("d:\\EclipseWorkSpace\\Test\\files\\英文课程简介\\"+dirName+"\\"+files[i]));
if(res){
File file=new File("d:\\EclipseWorkSpace\\Test\\files\\英文课程简介\\"+dirName+"\\"+files[i]);
file.delete();}else{

System.out.println("导入错误的文件为:"+"d:\\EclipseWorkSpace\\Test\\files\\英文课程简介\\"+dirName+"\\"+files[i]);
System.out.println("----------------以上-------------------");
}
}
}
//将提出的数据写入数据库
public static boolean insertCoursesIntoDB(int deptID,String courseName,String courseDetail)throws Exception{
//显示内容
try{
//System.out.println(courseDetail);
//分析字符串
courseDetail=courseDetail.replace(":", ":");
String courseCode=courseDetail.substring(courseDetail.indexOf("Course Code:"), courseDetail.indexOf("Total Credits:")).trim();
if(courseCode.length()>12)
courseCode=courseCode.substring(12, courseCode.length()).trim();

String nature=courseDetail.substring(courseDetail.indexOf("Nature of Course:"), courseDetail.indexOf("Course Code:")).trim();
if(nature.length()>17)
nature=nature.substring(17, nature.length()).trim();

String totalCredit=courseDetail.substring(courseDetail.indexOf("Total Credits:"), courseDetail.indexOf("Total Credit Hours :")).trim();
if(totalCredit.length()>14)
totalCredit=totalCredit.substring(14, totalCredit.length()).trim();

String totalCreditHours=courseDetail.substring(courseDetail.indexOf("Total Credit Hours :"), courseDetail.indexOf("Lecture Hours:")).trim();
if(totalCreditHours.length()>19)
totalCreditHours=totalCreditHours.substring(19, totalCreditHours.length()).trim();

String lectureHours=courseDetail.substring(courseDetail.indexOf("Lecture Hours:"), courseDetail.indexOf("Experiment Hours:")).trim();
if(lectureHours.length()>14)
lectureHours=lectureHours.substring(14, lectureHours.length()).trim();

String experimentalHours=courseDetail.substring(courseDetail.indexOf("Experiment Hours:"), courseDetail.indexOf("Oriented Majors:")).trim();
if(experimentalHours.length()>19)
experimentalHours=experimentalHours.substring(19, experimentalHours.length()).trim();

String orientedMajors=courseDetail.substring(courseDetail.indexOf("Oriented Majors:"), courseDetail.indexOf("Prerequisite Course:")).trim();
if(orientedMajors.length()>16)
orientedMajors=orientedMajors.substring(16, orientedMajors.length()).trim();

String prerequisiteCourse=courseDetail.substring(courseDetail.indexOf("Prerequisite Course:"), courseDetail.indexOf("Penner:")).trim();
if(prerequisiteCourse.length()>23)
prerequisiteCourse=prerequisiteCourse.substring(23, prerequisiteCourse.length()).trim();

String penner=courseDetail.substring(courseDetail.indexOf("Penner:"), courseDetail.indexOf("Validators :")).trim();
if(penner.length()>7)
penner=penner.substring(7, penner.length()).trim();

String validator=courseDetail.substring(courseDetail.indexOf("Validators :"), courseDetail.indexOf("Briefing of Course Content:")).trim();
if(validator.length()>13)
validator=validator.substring(13, validator.length()).trim();

String content=courseDetail.substring(courseDetail.indexOf("Briefing of Course Content:"), courseDetail.length()).trim();
if(content.length()>27)
content=content.substring(27, content.length()).trim();



//写入数据库

// 从mysql数据库中读取正确信息
Connection conn = null;
Statement stmt = null;
ResultSet rs = null;
try {
conn = getConn();
stmt = conn.createStatement();
String sql="insert into course(deptId,name,courseCode,nature,totalCredits,totalCreditHours,lectureHours,experimentalHours,orientedMajors,prerequisiteCourse,penner,validator,content) values(" +
deptID+",'"+courseName+"','"+courseCode+"','"+nature+"','"+totalCredit+"','"+totalCreditHours+"','"+lectureHours+"','"+experimentalHours+"','"+orientedMajors+"','"+prerequisiteCourse+"','"+penner+"','"+validator+"','"+content+"')";
stmt.executeUpdate(sql);
} catch (Exception e) {

System.out.println("数据库错误"+e.getMessage());
return false;
} finally {
if (rs != null)
try {
rs.close();
} catch (Exception e) {
System.out.println(e.getMessage());
}
if (stmt != null)
try {
stmt.close();
} catch (Exception e) {
System.out.println(e.getMessage());
}
if (conn != null)
try {
conn.close();
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
return true;
}catch(Exception e){
System.out.println("字符转换截取错误:"+e.getMessage());
return false;
}

}


/**
* 读取word中的内容
*
@param fileName
*
@return
*
@throws Exception
*/
public static String getDetailFromWord(String fileName){
//获取word中的内容
String text="";
try{
InputStream is = new FileInputStream(new File(fileName));
WordExtractor ex = new WordExtractor(is);//is是WORD文件的InputStream
text = ex.getText();
}catch(Exception e){
System.out.println("错误文件为:"+fileName);
return null;
}
return text;
}
/**
* 获得单位ID
*
*
@param deptName
*
@return
*
@throws Exception
*/
public static int getDeptID(String deptName)throws Exception{
int deptID=0;
// 从mysql数据库中读取正确信息
Connection conn = null;
Statement stmt = null;
ResultSet rs = null;
try {
conn = getConn();
stmt = conn.createStatement();
rs=stmt.executeQuery("select id from dept where name='"+deptName+"'");
if(rs.next())
deptID=rs.getInt(1);
} catch (Exception e) {
System.out.println(e.getMessage());
} finally {
if (rs != null)
try {
rs.close();
} catch (Exception e) {
System.out.println(e.getMessage());
}
if (stmt != null)
try {
stmt.close();
} catch (Exception e) {
System.out.println(e.getMessage());
}
if (conn != null)
try {
conn.close();
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
return deptID;
}



public static Connection getConn() throws Exception{
Class.forName("com.mysql.jdbc.Driver");
DriverManager.registerDriver(new com.mysql.jdbc.Driver());
String dbUrl = "jdbc:mysql://210.44.**.**:3306/qauenDB?useUnicode=true&CharacterEncoding=utf8";
String dbUser = "***";
String dbPassword = "******";
return java.sql.DriverManager.getConnection(dbUrl, dbUser,dbPassword);
}

}



反正这次任务很麻烦,看起来,各个科目弄的很专业,很标准,但是,不同老师做的,一些标点符号还是整的很乱,有的是英文的标点,有的是中文的,有的是半角的,有的是全角的,造成了数据库里面全是乱码,到现在没发改了,全办公室集体上阵,一人分几个学院,挨条记录看,挨条记录改,纠结ing。。。。。。

 

 

 

 

 

 

 

 

 

 

 

 

posted @ 2012-04-08 09:52  陽倌_℡__  阅读(3239)  评论(1编辑  收藏  举报