君子博学而日参省乎己 则知明而行无过矣

博客园 首页 新随笔 联系 订阅 管理

The important point about parsing HTML in Java is to use a parser designed for it. While you can parse HTML with the default XML parser, doing so is brittle because that parser only accepts well-formed, strict XHTML.

TagSoup library

Hence, I highly recommend using the TagSoup library which slots right into the parsing framework but handles crappy HTML.

import java.net.URL;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;

/**
 * Minimal TagSoup example: prints the {@code src} attribute of every
 * {@code <img>} tag found on http://www.yahoo.com/.
 */
public class HTMLParseExample {

    /**
     * Fetches the page, parses it with the TagSoup SAX parser and prints one
     * {@code src} value per line to standard output.
     *
     * @param args ignored
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception {
        java.io.InputStream in =
            new URL("http://www.yahoo.com/").openConnection().getInputStream();
        try {
            SAXParserImpl.newInstance(null).parse(
                in,
                new DefaultHandler() {
                    @Override
                    public void startElement(String uri, String localName,
                                             String name, Attributes a)
                    {
                        if (!name.equalsIgnoreCase("img")) {
                            return;
                        }
                        // <img> tags without a src attribute yield null;
                        // skip them instead of printing the word "null"
                        String src = a.getValue("src");
                        if (src != null) {
                            System.out.println(src);
                        }
                    }
                }
            );
        } finally {
            // release the connection even when parsing fails part-way
            in.close();
        }
    }
}

Xerces

And here's a slightly more complex example (collect and print the text inside nested <p> tags), this time using the standard Java XML parser, Xerces, instead of TagSoup.

import java.net.URL;
import java.util.ArrayList;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Example for the standard Java SAX parser (Xerces): prints the text found
 * inside {@code <p>} elements, including nested ones, of an XHTML document.
 */
public class XHTMLParseExample {

    /**
     * Prints the text of every {@code <p>} element on http://www.w3.org/
     * to standard output.
     *
     * @param args ignored
     * @throws Exception if the page cannot be fetched or is not well-formed
     */
    public static void main(String[] args) throws Exception {
        java.io.InputStream in =
            new URL("http://www.w3.org/").openConnection().getInputStream();
        try {
            printParagraphs(in, System.out);
        } finally {
            // release the connection even when parsing fails part-way
            in.close();
        }
    }

    /**
     * Parses {@code in} with the default JAXP SAX parser and writes one
     * {@code "PARA: <text>"} line to {@code out} for every {@code <p>}
     * element. A nested {@code <p>} is printed before its enclosing one, and
     * its text is not included in the enclosing paragraph.
     *
     * @param in  the XHTML document; not closed by this method
     * @param out destination for the extracted paragraphs
     * @throws Exception if the document cannot be read or is not well-formed
     */
    public static void printParagraphs(java.io.InputStream in,
                                       java.io.PrintStream out) throws Exception {
        javax.xml.parsers.SAXParserFactory.newInstance().newSAXParser()
            .parse(in, new ParagraphPrinter(out));
    }

    /** SAX handler that keeps one text buffer per currently open {@code <p>}. */
    private static final class ParagraphPrinter extends DefaultHandler {
        private final java.io.PrintStream out;

        // used as a stack: the last element belongs to the innermost open <p>
        private final ArrayList<StringBuilder> open = new ArrayList<StringBuilder>();

        ParagraphPrinter(java.io.PrintStream out) {
            this.out = out;
        }

        @Override
        public void startElement(String uri, String localName,
                                 String name, Attributes a)
        {
            // push a buffer for every <p> tag
            if (name.equalsIgnoreCase("p")) {
                open.add(new StringBuilder());
            }
        }

        @Override
        public void endElement(String uri, String localName, String name)
        {
            // pop and print the buffer for every </p> tag; println keeps
            // consecutive paragraphs from running into each other
            if (name.equalsIgnoreCase("p")) {
                out.println("PARA: " + open.remove(open.size() - 1));
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) {
            // SAX may deliver one run of text in several chunks, so the chunks
            // are concatenated as-is; inserting a separator here could split
            // a word in the middle
            if (!open.isEmpty()) {
                open.get(open.size() - 1).append(ch, start, length);
            }
        }

        // If we don't include a fake resolveEntity() method, Xerces will try
        // to download the entity URI listed in its cached DTD:
        // http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
        // The same stub is returned for every external entity requested.
        @Override
        public InputSource resolveEntity(String publicId, String systemId)
            throws org.xml.sax.SAXException, java.io.IOException
        {
            final String fake = "<!ENTITY nbsp \" \">";
            return new InputSource(new java.io.StringReader(fake));
        }
    }
}
posted on 2013-05-14 03:35  刺猬的温驯  阅读(322)  评论(0)  编辑  收藏  举报