君子博学而日参省乎己 则知明而行无过矣

博客园 首页 新随笔 联系 订阅 管理
package testlucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that extracts the visible text of an HTML page.
 *
 * <p>The HTML is first converted to XML with HtmlCleaner, then parsed with a
 * SAX parser using this object as the content handler. Character data is
 * collected into {@link #sb}, one chunk per line, skipping everything inside
 * {@code <style>} and {@code <script>} elements. When the document ends the
 * collected text is written to the output path given to
 * {@link #parse(String, String)}.
 *
 * <p>Instances hold per-document state and are not safe for concurrent use.
 */
public class SAXxhtml extends DefaultHandler {
    /**
     * Logger for this class
     */
    private static final Logger logger = Logger.getLogger(SAXxhtml.class);

    /** Text collected from the current document; cleared at the start of each document. */
    public StringBuffer sb = new StringBuffer();

    /** {@code false} while the parser is inside a {@code <style>} or {@code <script>} element. */
    public boolean usable = true;

    /** Destination file for the extracted text; set by {@link #parse(String, String)}. */
    private String sPath = "";

    /**
     * Creates a handler and makes sure log4j has at least a default
     * configuration.
     */
    public SAXxhtml() {
        super();
        // Only install the default console appender when nothing has been
        // configured yet. Calling BasicConfigurator.configure() on every
        // construction attaches one more appender each time and makes every
        // log line appear multiple times.
        if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) {
            BasicConfigurator.configure();
        }
    }

    /**
     * Resets the per-document state so the same handler can be reused for
     * several documents without mixing their text.
     */
    @Override
    public void startDocument() throws SAXException {
        sb.setLength(0);
        usable = true;
    }

    /**
     * Stops text collection when a {@code <style>} or {@code <script>}
     * element is opened.
     */
    @Override
    public void startElement(String namespaceURI, String localName,
            String rawName, Attributes atts) throws SAXException {
        if (isIgnoredElement(rawName)) {
            usable = false;
        }
    }

    /**
     * Resumes text collection when a {@code <style>} or {@code <script>}
     * element is closed.
     *
     * <p>The flag must be restored here rather than in
     * {@link #characters(char[], int, int)}: a parser may deliver the content
     * of one element in several chunks, and an empty element delivers none.
     */
    @Override
    public void endElement(String arg0, String arg1, String arg2)
            throws SAXException {
        super.endElement(arg0, arg1, arg2);
        if (isIgnoredElement(arg2)) {
            usable = true;
        }
    }

    /**
     * Appends a chunk of character data, followed by a newline, unless the
     * parser is currently inside an ignored element.
     */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (usable) {
            sb.append(ch, start, length);
            sb.append("\n");
        }
    }

    /**
     * Writes the collected text to the output path once parsing is complete.
     *
     * <p>Failures are logged rather than thrown, so a bad output path does
     * not abort the caller.
     */
    @Override
    public void endDocument() throws SAXException {
        if (sPath == null || sPath.length() == 0) {
            logger.warn("endDocument() - no output path set, extracted text not written");
            return;
        }
        PrintWriter pw = null;
        try {
            pw = new PrintWriter(new FileOutputStream(sPath));
            pw.print(sb.toString());
            pw.flush();
            // PrintWriter swallows IOExceptions; checkError() is the only way
            // to notice a failed write.
            if (pw.checkError()) {
                logger.error("endDocument() - error while writing " + sPath);
            }
        } catch (FileNotFoundException e) {
            logger.error("endDocument() - cannot open " + sPath, e);
        } finally {
            if (pw != null) {
                pw.close();
            }
        }
    }

    /**
     * Entry point kept for compatibility; it intentionally does nothing.
     * Use {@link #parse(String, String)} to run an extraction.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // no-op
    }

    /**
     * Cleans the given HTML into XML, parses it with this handler and writes
     * the extracted text to {@code sPath}.
     *
     * <p>Errors are logged; no exception is propagated to the caller.
     *
     * @param sPath    path of the file that receives the extracted text
     * @param Scontent HTML content to process
     */
    public void parse(String sPath, String Scontent) {
        this.sPath = sPath;
        if (Scontent == null) {
            logger.warn("parse(String, String) - no content to parse for " + sPath);
            return;
        }
        try {
            HtmlCleaner hc = new HtmlCleaner(Scontent);
            hc.clean();
            String mid = hc.getBrowserCompactXmlAsString();

            SAXParserFactory spf = SAXParserFactory.newInstance();
            SAXParser saxParser = spf.newSAXParser();
            // Parse the cleaned XML directly from memory, with THIS object as
            // the handler, so that endDocument() sees the sPath set above and
            // the text ends up in this instance's buffer.
            // NOTE(review): the content normally comes from the web; consider
            // restricting external DTD/entity resolution on the factory.
            saxParser.parse(new InputSource(new StringReader(mid)), this);
        } catch (IOException e) {
            logger.error("parse(String, String) - I/O error while processing " + sPath, e);
        } catch (ParserConfigurationException e) {
            logger.error("parse(String, String) - cannot create SAX parser", e);
        } catch (SAXException e) {
            logger.error("parse(String, String) - cleaned XML could not be parsed for " + sPath, e);
        }
    }

    /** Returns whether text inside the element with this raw name must be skipped. */
    private static boolean isIgnoredElement(String rawName) {
        return "style".equals(rawName) || "script".equals(rawName);
    }
}

具体思路是Html->xml,然后就可以用sax对xml解析,但是程序总调不通,有人能帮助解决一下么?

转载自:http://tiantian911.iteye.com/blog/185225

posted on 2013-05-15 18:10  刺猬的温驯  阅读(341)  评论(0)  编辑  收藏  举报