package testlucene; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.io.StringReader; import java.io.UnsupportedEncodingException; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.log4j.BasicConfigurator; import org.apache.log4j.Logger; import org.htmlcleaner.HtmlCleaner; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; public class SAXxhtml extends DefaultHandler { /** * Logger for this class */ private static final Logger logger = Logger.getLogger(SAXxhtml.class); public StringBuffer sb = new StringBuffer(); public boolean usable = true; private String sPath = ""; public SAXxhtml() { super(); // TODO Auto-generated constructor stub // PropertyConfigurator.configure("log4j.properties"); BasicConfigurator.configure(); } public void startElement(String namespaceURI, String localName, String rawName, Attributes atts) throws SAXException { if (rawName.equals("style") || rawName.equals("script")) { usable = false; } } // 解析完成后的统计工作 public void endDocument() throws SAXException { try { PrintWriter pw = new PrintWriter(new FileOutputStream(sPath)); pw.print(sb.toString()); pw.flush(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public void characters(char[] ch, int start, int length) { String charEncontered = new String(ch, start, length); /* * if (!charEncontered.startsWith("<!")||!charEncontered.startsWith("< * ")) { sb.append("\n"); sb.append(charEncontered); } */ if (usable) { sb.append(charEncontered); sb.append("\n"); } usable = true; } /** * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub /* * SAXParserFactory spf = SAXParserFactory.newInstance(); XMLReader * xmlReader = null; SAXParser saxParser = null; try { // * 创建一个解析器SAXParser对象 saxParser = spf.newSAXParser(); // * 得到SAXParser中封装的SAX XMLReader xmlReader = saxParser.getXMLReader(); * saxParser.parse(new File("d:/sina.xml"), new SAXxhtml()); } catch * (Exception ex) { logger.error("main(String[]) - " + ex, ex); * System.exit(1); } */ } @Override public void endElement(String arg0, String arg1, String arg2) throws SAXException { // TODO Auto-generated method stub super.endElement(arg0, arg1, arg2); } public void parse(String sPath, String Scontent) { this.sPath = sPath; try { // System.out.println(Scontent); HtmlCleaner hc = new HtmlCleaner(Scontent); hc.clean(); PrintWriter pw = new PrintWriter(new FileOutputStream("e:/tmpfile/tmp.txt")); pw.print(sb.toString()); pw.flush(); pw.close(); FileInputStream fis = new FileInputStream(new File("e:/tmpfile/tmp.txt")); String mid = hc.getBrowserCompactXmlAsString(); StringReader sr = new StringReader(mid); InputSource iSrc = new InputSource(sr); System.out.println(iSrc.toString()); SAXParserFactory spf = SAXParserFactory.newInstance(); XMLReader xmlReader = null; SAXParser saxParser = null; // 创建一个解析器SAXParser对象 saxParser = spf.newSAXParser(); // 得到SAXParser中封装的SAX XMLReader xmlReader = saxParser.getXMLReader(); saxParser.parse(fis, new SAXxhtml()); } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ParserConfigurationException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (SAXException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
具体思路是Html->xml,然后就可以用sax对xml解析,但是程序总调不通,有人能帮助解决一下么?