HtmlparseUtil.java

该类并不是一个通用的工具类,需要按自己的要求实现,这里只记录了Htmlparse.jar包的一些用法。仅此而已!
详细看这里:http://gundumw100.iteye.com/blog/704311

 
import java.util.*;  
import org.htmlparser.Node;  
import org.htmlparser.NodeFilter;  
import org.htmlparser.Parser;  
import org.htmlparser.filters.AndFilter;  
import org.htmlparser.filters.HasAttributeFilter;  
import org.htmlparser.filters.NodeClassFilter;  
import org.htmlparser.filters.TagNameFilter;  
import org.htmlparser.tags.BodyTag;  
import org.htmlparser.tags.LinkTag;  
import org.htmlparser.util.NodeList;  
import org.htmlparser.util.ParserException;  
  
/** 
 * httpclient与htmlparse对网页的解析 
 *  
 * @author Administrator 
 *  
 */  
/**
 * Demo of fetching pages with HttpClient and parsing them with the HtmlParser library.
 *
 * <p>NOTE: this is not a general-purpose utility class. It records example usages of
 * htmlparser.jar and must be adapted to your own requirements.
 *
 * @author Administrator
 */
public class HtmlparseUtil {
    /** HTTP helper used to fetch raw page content (declared elsewhere in this project). */
    WebHttpClient util = new WebHttpClient();

    /**
     * Collects every hyperlink on the page, mapping href to its cleaned anchor text.
     *
     * @param url     page to fetch
     * @param charset charset used both to fetch and to parse the page
     * @return map(href, text); empty if the fetch returns nothing or parsing fails
     */
    public Map<String, String> linkGet(String url, String charset) {
        String content = util.getWebContentByGet(url, charset);
        Map<String, String> linkMap = new HashMap<String, String>();
        if (content == null) {
            // Fetch failed: return an empty map instead of letting the parser NPE.
            return linkMap;
        }
        try {
            Parser parser = Parser.createParser(content, charset);
            // Keep only <a> tags.
            NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < list.size(); i++) {
                LinkTag link = (LinkTag) list.elementAt(i);
                linkMap.put(link.getLink(), this.processText(link.getLinkText()));
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return linkMap;
    }

    /**
     * Returns the content of the page's {@code <body>} tag.
     *
     * @param url     page to fetch
     * @param charset charset used both to fetch and to parse the page
     * @return body markup of the last matching {@code <body>} tag, or "" if none / on failure
     */
    public String bodyGet(String url, String charset) {
        String content = util.getWebContentByGet(url, charset);
        String body = "";
        if (content == null) {
            return body;
        }
        try {
            Parser parser = Parser.createParser(content, charset);
            // Keep only <body> tags.
            NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class);
            NodeList list = parser.extractAllNodesThatMatch(bodyFilter);
            for (int i = 0; i < list.size(); i++) {
                // If several <body> tags match, the last one wins (same as original code).
                body = ((BodyTag) list.elementAt(i)).getBody();
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return body;
    }

    /**
     * Scrapes several lottery-related fields from the page:
     * <ul>
     *   <li>"term"       — text of {@code <span class="term">}</li>
     *   <li>"start-time" — text of {@code <span class="start-time">}</li>
     *   <li>"end-time"   — text of {@code <span id="J_SingleEndTimeLabel">}</li>
     *   <li>"pre-term"   — chars 10..20 of the text of {@code <div class="box post">}</li>
     *   <li>"numbers"    — ",n1,n2,...|b1,b2,..." built from the J_AwardNumber spans</li>
     * </ul>
     * When a filter matches several nodes, the last match wins (as in the original code).
     *
     * @param url     page to fetch
     * @param charset charset used both to fetch and to parse the page
     * @return map of the fields above; missing keys mean the element was not found
     */
    public Map<String, String> termGet(String url, String charset) {
        String content = util.getWebContentByGet(url, charset);
        Map<String, String> map = new HashMap<String, String>();
        if (content == null) {
            return map;
        }
        try {
            // <span class="term"> : current term number.
            NodeList termNodes = parse(content, charset,
                    new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "term")));
            for (int i = 0; i < termNodes.size(); i++) {
                map.put("term", termNodes.elementAt(i).toPlainTextString());
            }

            // <span class="start-time">
            NodeList startNodes = parse(content, charset,
                    new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "start-time")));
            for (int i = 0; i < startNodes.size(); i++) {
                map.put("start-time", startNodes.elementAt(i).toPlainTextString());
            }

            // <span id="J_SingleEndTimeLabel">
            NodeList endNodes = parse(content, charset,
                    new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "J_SingleEndTimeLabel")));
            for (int i = 0; i < endNodes.size(); i++) {
                map.put("end-time", endNodes.elementAt(i).toPlainTextString());
            }

            // <div class="box post"> : previous term, embedded at chars 10..20 of the text.
            NodeList preNodes = parse(content, charset,
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "box post")));
            for (int i = 0; i < preNodes.size(); i++) {
                String temp = preNodes.elementAt(i).toPlainTextString().trim();
                // Guard: the original substring(10, 20) threw StringIndexOutOfBoundsException
                // (not caught by the ParserException handler) whenever the text had < 20 chars.
                if (temp.length() >= 20) {
                    temp = temp.substring(10, 20).trim();
                }
                map.put("pre-term", temp);
            }

            // Winning numbers, formatted ",n1,n2,...|b1,b2,..." (red part | blue part).
            // StringBuilder instead of StringBuffer: no synchronization is needed here.
            StringBuilder buffer = new StringBuilder();
            NodeList redNodes = parse(content, charset,
                    new HasAttributeFilter("class", "J_AwardNumber"));
            for (int i = 0; i < redNodes.size(); i++) {
                buffer.append(",").append(redNodes.elementAt(i).toPlainTextString());
            }
            buffer.append("|");
            NodeList blueNodes = parse(content, charset,
                    new HasAttributeFilter("class", "blue J_AwardNumber"));
            for (int i = 0; i < blueNodes.size(); i++) {
                buffer.append(blueNodes.elementAt(i).toPlainTextString()).append(",");
            }
            map.put("numbers", buffer.toString());
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return map;
    }

    /**
     * Parses {@code content} with a fresh Parser and returns the nodes matching {@code filter}.
     * A new Parser is created per call because a Parser instance can only be consumed once.
     *
     * @throws ParserException if the underlying parse fails
     */
    private NodeList parse(String content, String charset, NodeFilter filter) throws ParserException {
        return Parser.createParser(content, charset).parse(filter);
    }

    /**
     * Cleans anchor text: trims surrounding whitespace and strips literal "&nbsp;" entities.
     */
    private String processText(String content) {
        return content.trim().replaceAll("&nbsp;", "");
    }

    public static void main(String[] str) {

        String url = "http://caipiao.taobao.com/lottery/order/lottery_dlt.htm?type=1";
        HtmlparseUtil util = new HtmlparseUtil();
        Map<String, String> map = util.termGet(url, "gb2312");
        System.out.println("term=" + map.get("term"));// <span class="term">第<em>10074</em>期</span>
        System.out.println("start-time=" + map.get("start-time"));//
        System.out.println("end-time=" + map.get("end-time"));//
        System.out.println("pre-term=" + map.get("pre-term"));//
        System.out.println("numbers=" + map.get("numbers"));//

        /*
        Map<String, String> linkMap = util.linkGet(url, "gb2312");
        for (String s : linkMap.keySet()) {
            System.out.println(s + " = " + linkMap.get(s));
            // If the key is itself a link, fetch its <body> content too.
//          if (s.startsWith("http")) {
//              util.bodyGet(s, "gb2312");
//          }
        }
        */

    }

}

 

posted on 2013-06-24 15:00  little fat  阅读(387)  评论(0编辑  收藏  举报