java解析超大xml(1G),一般数据挖掘dblp.xml文件的解析
在网上找了很多关于解析超大xml的例子,都说java自带的jar包中有相关的SAXParser类可以解析xml,但是试了好多次还是不行;dom4j.jar等也一样,都不能解析条数太多的xml,文件大概超过30M就会解析报错。
不过偶尔看到过有人使用xercesImpl.jar、sax2.jar、jaxen-1.1.1.jar这几个jar包,可以按下面的方式解析:
import java.io.IOException; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import java.io.File; import org.xml.sax.SAXException; public class SAX { public static void main(String[] args) { try { SAXParserFactory factory=SAXParserFactory.newInstance(); factory.setNamespaceAware( true ); factory.setValidating( true ); SAXParser parser=factory.newSAXParser(); SAXparse p1= new SAXparse(); parser.parse( new File( "D:\\dblp.xml" ), p1); } catch (ParserConfigurationException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (SAXException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } |
或者
import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.util.Iterator; import java.util.List; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; /** * */ public class XMLParse { private String configName = "dblp_little.xml" ; private SAXReader saxReader; private Document doc; private Element root; /** */ public XMLParse() { // InputStream in = Thread.currentThread().getContextClassLoader() // .getResourceAsStream(configName); saxReader = new SAXReader(); try { doc = saxReader.read(configName); } catch (DocumentException e) { e.printStackTrace(); } root = doc.getRootElement(); } /** * get Data Type * * @throws IOException */ public void getModelElement(String attribute) { FileWriter fileWriter = null ; try { fileWriter = new FileWriter(attribute + ".txt" ); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } List list = root.elements(); Element model = null ; List childList = null ; Element modelEle = null ; Element returnModel = null ; String dataType = null ; StringBuffer sb = new StringBuffer(); int temp = 0 ; for (Iterator it = list.iterator(); it.hasNext();) { model = (Element) it.next(); temp++; System.out.println( "temp:" +temp); childList = model.elements(); for (Iterator ite = childList.iterator(); ite.hasNext();) { modelEle = (Element) ite.next(); if (attribute.equals(modelEle.getName())) { dataType = modelEle.getText(); dataType = dataType; if (sb.length() > 1 ) { sb.append( "," ); } sb.append(dataType); } } dataType = sb.toString(); if (! 
"" .equals(dataType)) { // 没有值的话,跳过往txt中写值 try { fileWriter.write(dataType); fileWriter.write( "\r\n" ); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } sb.delete( 0 , sb.length()); try { fileWriter.flush(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } try { // fileWriter.flush(); fileWriter.close(); System.out.println( "xml解析成功,并成功写入到" +attribute+ ".txt 文件中!" ); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println( "list.size:" +list.size()); } public static void main(String[] args) { // TODO Auto-generated method stub String attribute = null ; XMLParse parse = new XMLParse(); attribute = "author" ; parse.getModelElement(attribute); } } |
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步