Fork me on GitHub

java解析超大xml(1G),一般数据挖掘dblp.xml文件的解析

在网上找了很多关于解析超大xml的例子,都说java自带的jar包中有相关的SAXParser类可以解析xml,但是试过很多次之后还是不行;dom4j.jar等也都不能解析数据量太大的xml,文件大概超过30M就会解析报错。

不过偶然看到过 xercesImpl.jar、sax2.jar、jaxen-1.1.1.jar 这几个 jar 包。

import java.io.IOException;
 
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import org.xml.sax.SAXException;
 
public class SAX {
 
    public static void main(String[] args) {
        try {
            SAXParserFactory factory=SAXParserFactory.newInstance();
            factory.setNamespaceAware(true);
            factory.setValidating(true);
            SAXParser parser=factory.newSAXParser();
            SAXparse p1=new SAXparse();
            parser.parse(new File("D:\\dblp.xml"), p1);
        } catch (ParserConfigurationException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (SAXException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
 
}

  或者

import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
 
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
 
/**
 *
 */
public class XMLParse {
    private String configName = "dblp_little.xml";
    private SAXReader saxReader;
    private Document doc;
    private Element root;
 
    /**
     */
    public XMLParse() {
//      InputStream in = Thread.currentThread().getContextClassLoader()
//              .getResourceAsStream(configName);
        saxReader = new SAXReader();
        try {
            doc = saxReader.read(configName);
        } catch (DocumentException e) {
            e.printStackTrace();
        }
        root = doc.getRootElement();
    }
 
    /**
     * get Data Type
     *
     * @throws IOException
     */
    public void getModelElement(String attribute) {
        FileWriter fileWriter = null;
        try {
            fileWriter = new FileWriter(attribute + ".txt");
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        List list = root.elements();
        Element model = null;
        List childList = null;
        Element modelEle = null;
        Element returnModel = null;
        String dataType = null;
        StringBuffer sb = new StringBuffer();
        int temp = 0;
            for (Iterator it = list.iterator(); it.hasNext();) {
                model = (Element) it.next();
                temp++;
                System.out.println("temp:"+temp);
                childList = model.elements();
                 
                     
                    for (Iterator ite = childList.iterator(); ite.hasNext();) {
                        modelEle = (Element) ite.next();
                        if (attribute.equals(modelEle.getName())) {
                            dataType = modelEle.getText();
                            dataType = dataType;
                            if (sb.length() > 1) {
                                sb.append(",");
                            }
                            sb.append(dataType);
                        }
                    }
                    dataType = sb.toString();
                    if (!"".equals(dataType)) { // 没有值的话,跳过往txt中写值
                        try {
                            fileWriter.write(dataType);
                            fileWriter.write("\r\n");
                        } catch (IOException e) {
                            // TODO Auto-generated catch block
                            e.printStackTrace();
                        }
                    }
                    sb.delete(0, sb.length());
                    try {
                        fileWriter.flush();
                    } catch (IOException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                     
                }
            try {
//              fileWriter.flush();
                fileWriter.close();
                System.out.println("xml解析成功,并成功写入到"+attribute+".txt 文件中!");
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            System.out.println("list.size:"+list.size());
    }
 
    public static void main(String[] args) {
        // TODO Auto-generated method stub
        String attribute = null;
        XMLParse parse = new XMLParse();
        attribute = "author";
        parse.getModelElement(attribute);
    }
}

  

posted @   symbolJerry²º¹³  阅读(2375)  评论(0编辑  收藏  举报
努力加载评论中...
点击右上角即可分享
微信分享提示