初步分析美国科研项目申报项目
将XML转换为新格式的XML、excel、和SQlserver数据
package hander; /** * @author ZTX * @function:读取XML中字段到excel中 * @Configuration:将文件放在 Xml\data下,或者data改为您的数据文件夹置于 Xml\下 * @time:2013年11月12日19:37:45--2013年11月12日21:48:22--2013年11月13日17:28:45--2013年11月13日21:00:45 */ import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.Iterator; import java.util.List; import java.util.Map; import org.dom4j.*; import org.dom4j.io.OutputFormat; import org.dom4j.io.SAXReader; import org.dom4j.io.XMLWriter; import org.apache.poi.*; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; public class Read { public static String inputXmlDir;//要处理的xml文件的文件夹 public static String data="2006";//要处理的xml文件的文件夹名称默认是data public static boolean head=false;//表头是否写了 public static String pathPrefix="";//文件路径前缀 public static long recordNum=0;//记录批量 public static void main(String args[]){ PreparedStatement ppst = null ; ConnectSQLserver mssq=new ConnectSQLserver(true);//连sqlserver //建表 String reCreWord= "create table Project"+ " "+ "( "+ "AwardTitle varchar(200) null, "+ "Time varchar(1000) null, "+ "Organization varchar(200) null, "+ "ProgramOfficer varchar(100) null, "+ "AbstractNarration varchar(8000) null, "+ "AwardID varchar(10) null, "+ "PrincipalInvestigator varchar(60) null, "+ "EmailAddress varchar(40) null, "+ "Name varchar(80) null, "+ "CountryName varchar(60) null, "+ "StateName varchar(60) null, "+ "CityName varchar(60) null, "+ "ProgramReference varchar(500) null, "+ ") "; //插入 String inSQL="insert into Project(" + "AwardTitle,Time,Organization," + "ProgramOfficer,AbstractNarration," + "AwardID,PrincipalInvestigator,EmailAddress," + "Name,CountryName,StateName," + "CityName,ProgramReference)values(?,?,?,?,?,?,?,?,?,?,?,?,?);"; try { mssq.sta.execute(reCreWord); System.out.println("操作新表:"+inSQL); ppst=mssq.conn.prepareStatement(inSQL); } catch (SQLException e1) { e1.printStackTrace(); } // try { // //mssq.conn.setAutoCommit(false); // } catch (SQLException e1) { // e1.printStackTrace(); // } long start,end=0; //创建空表 Document creDocument = DocumentHelper.createDocument(); //添加空字段 //项目标题 Element root= creDocument.addElement("root"); Element element[ ] = new Element[13]; element[0] = root.addElement("AwardTitle"); element[1] = root.addElement("Time"); element[2]= root.addElement("Organization"); element[3] = root.addElement("ProgramOfficer"); element[4]= root.addElement("AbstractNarration"); element[5]= root.addElement("AwardID"); element[6]= root.addElement("PrincipalInvestigator"); element[7]= root.addElement("EmailAddress"); element[8] = root.addElement("Name"); element[9]= root.addElement("CountryName"); element[10] = root.addElement("StateName"); element[11]= root.addElement("CityName"); element[12]= root.addElement("ProgramReference"); SAXReader saxReader = new SAXReader(); //得到待处理的文件夹 //前缀 File getPrefix = new File("");// 空文件夹 用来得到 文件路径前缀 pathPrefix=getPrefix.getAbsolutePath();//得到 文件路径前缀 //组合 inputXmlDir=File.separator + data; inputXmlDir=pathPrefix+inputXmlDir;//文件夹绝对路径 System.out.println(inputXmlDir); File file = new File(data); //遍历文件夹下的每个文件 if(file.isDirectory()){ File[] files = file.listFiles(); start=System.currentTimeMillis(); Document document; for(int i = 0; i < files.length; i++){ System.out.println("第"+(i+1)+"篇"); try { document = saxReader.read(inputXmlDir+File.separator + files[i].getName()); //拿出一个xml去处理 try { String record[]=Handle( document,creDocument ,element); //System.out.println("record[0]:"+record[0].toString()); for( int k=0;k<record.length;k++ ){ ppst.setString(k+1, record[k]); //System.out.println(k); // System.out.println(record[k]); } ppst.execute(); } catch (SQLException e) { e.printStackTrace(); } } catch (DocumentException e) { e.printStackTrace(); } } end=System.currentTimeMillis()-start; System.out.println("所用时间:"+end/1000+"s"); } //输出xml结果文件 try{ XMLWriter output = new XMLWriter( new FileWriter( new File(inputXmlDir+File.separator+"result.xml") )); output.write( creDocument ); output.close(); } catch(IOException e){ System.out.println(e.getMessage()); } } //处理一个xml public static String[] Handle(Document document, Document creDocument, Element[] element) throws SQLException{ String [] record=new String [13]; //解析 Element root=document.getRootElement(); // 获取根节点 Iterator Award = root.elementIterator("Award"); //根节点#Award // 遍历Award节点: while (Award.hasNext()) { Element AwardEle = (Element) Award.next(); //项目标题 String AwardTitle = AwardEle.elementTextTrim("AwardTitle"); // Award#AwardTitle //添加到xml中 Element Title=element[0].addElement("Title");//AwardTitle的子标签 Title.setText(AwardTitle); record[0]=AwardTitle; //项目时间 String Time=""; String AwardEffectiveDate=AwardEle.elementTextTrim("AwardEffectiveDate"); // Award#AwardEffectiveDate String AwardExpirationDate=AwardEle.elementTextTrim("AwardExpirationDate"); // Award#AwardExpirationDate Time=AwardEffectiveDate +"-"+AwardExpirationDate; Element time=element[1].addElement("time"); time.setText(Time); record[1]=Time; //组织 String org=""; Iterator Organization = AwardEle.elementIterator("Organization");//Award#Organization // 遍历Organization节点: while (Organization.hasNext()) { Element OrganizationEle = (Element) Organization.next(); //组织合成 org= getChild(OrganizationEle,"Division","LongName")+">"+getChild(OrganizationEle,"Directorate","LongName"); } Element organization=element[2].addElement("organization"); organization.setText(org); record[2]=org; //项目主任 String proOffice=""; proOffice=getChild(AwardEle,"ProgramOfficer","SignBlockName"); Element officers=element[3].addElement("officers"); officers.setText(proOffice); record[3]=proOffice; //摘要叙述 String abs=""; abs=getChild(AwardEle,"","AbstractNarration"); Element abstracts=element[4].addElement("abstracts"); abstracts.setText(abs); record[4]=abs; //项目编号 String Id=""; Id=getChild(AwardEle,"","AwardID"); Element id=element[5].addElement("id"); id.setText(Id); record[5]=Id; //主要研究者 String Inv=""; Inv=getChild(AwardEle,"Investigator","FirstName")+" "+getChild(AwardEle,"Investigator","LastName"); if(getChild(AwardEle,"Investigator","RoleCode").equals("Principal Investigator")){ Element investigator=element[6].addElement("investigator"); investigator.setText(Inv); record[6]=Inv; } //主要研究者邮箱 String Em=""; Em=getChild(AwardEle,"Investigator","EmailAddress"); if(getChild(AwardEle,"Investigator","RoleCode").equals("Principal Investigator")){ Element email=element[7].addElement("email"); email.setText(Em); record[7]=Em; } //机构名称 String Name=""; Name=getChild(AwardEle,"Institution","Name"); Element name=element[8].addElement("name"); name.setText(Name); record[8]=Name; //机构国家 String Nation=""; Nation=getChild(AwardEle,"Institution","CountryName"); Element nation=element[9].addElement("nation"); nation.setText(Nation); record[9]=Nation; //机构州 String State=""; State=getChild(AwardEle,"Institution","StateName"); Element state=element[10].addElement("state"); state.setText(State); record[10]=State; //机构城市 String City=""; City=getChild(AwardEle,"Institution","CityName"); Element city=element[11].addElement("city"); city.setText(City); record[11]=City; //研究主题 String Ref=""; //(2)子节点的子元素 Iterator iters = AwardEle.elementIterator("ProgramReference"); //遍历mom父节点 while (iters.hasNext()){ Element itemEle = (Element) iters.next(); Ref+=itemEle.elementTextTrim("Text")+" ;"; // 拿到head下的子节点script下的字节点username的值 } Element reference=element[12].addElement("reference"); reference.setText(Ref); record[12]=Ref; } // ppst.addBatch(); // ppst.executeBatch(); // ppst.execute(); recordNum++; if(!head){//写表头 String heads[]=new String [13]; heads[0]="项目标题"; heads[1]="项目时间"; heads[2]="项目主任"; heads[3]="组织"; heads[4]="摘要叙述"; heads[5]="项目编号"; heads[6]="主要研究者"; heads[7]="主要研究者邮箱"; heads[8]="机构名称"; heads[9]="机构国家"; heads[10]="机构州"; heads[11]="机构城市"; heads[12]="研究主题"; //writeResult2Excel(heads); head=true; } //writeResult2Excel(record);//数据量大就没用 return record; } //写入excel一行 public static void writeResult2Excel(String [] record) { String DB=pathPrefix+File.separator+"data/results.xls"; HSSFWorkbook wb = getDBwb(DB);//据已有的DB 建立新HSSFWorkbook对象 HSSFSheet sheet = wb.getSheet("sheet1");//得到新的sheet对象 int rowNum=sheet.getLastRowNum();//获得总行数 int coloumNum=13;//总列数 HSSFRow rowTo =null; HSSFCell cellTo =null; rowTo = sheet.createRow(rowNum+1); for(int j=0;j<coloumNum;j++){//列循环 cellTo=rowTo.createCell(j); cellTo.setCellValue(record[j]); } //要把结果写到文件中去 writeBack2Excel(DB,wb); }//writeBack public static HSSFWorkbook getDBwb(String getDB) { HSSFWorkbook workbook=null; try { FileInputStream fis=new FileInputStream(getDB); workbook=new HSSFWorkbook(fis); } catch (IOException e) { e.printStackTrace(); } return workbook; } public static void writeBack2Excel(String DBurl,HSSFWorkbook wb) { FileOutputStream fileOut; try { fileOut = new FileOutputStream(DBurl); wb.write(fileOut);//把Workbook对象输出到文件中 fileOut.flush(); fileOut.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } private static String getChild(Element AwardEle,String mom,String kid) { String target=""; //(1)直接输出的是根节点的子元素 if(mom.equals("")){ target= AwardEle.elementTextTrim(kid); } //(2)子节点的子元素 Iterator iters = AwardEle.elementIterator(mom); //遍历mom父节点 while (iters.hasNext()){ Element itemEle = (Element) iters.next(); target= itemEle.elementTextTrim(kid); // 拿到head下的子节点script下的字节点username的值 } return target; } }