HtmlCleaner 结合 XPath 用法(转载)

   // Example 1: download an article page, print its <title>, then print every
   // paragraph found inside the article-body container.
   // NOTE(review): new URL(...) throws the checked MalformedURLException, and the
   // HtmlCleaner calls presumably declare checked exceptions as well, so this
   // snippet must live inside a method that declares or handles them.
   HtmlCleaner cleaner = new HtmlCleaner();

   TagNode node = cleaner.clean(new URL("http://finance.sina.com.cn/money/nmetal/20091209/10157077895.shtml"));

   // Look up elements by tag name (recursive search): the page title.
   Object[] ns = node.getElementsByName("title", true);
   if (ns.length > 0) {
       System.out.println("title=" + ((TagNode) ns[0]).getText());
   }

   // Absolute path of the target in the page: /html/body/div[2]/div[4]/div/div/div/div[2]/p
   // Select every <p> directly under the <div> whose class is "blkContainerSblkCon".
   ns = node.evaluateXPath("//div[@class=\"blkContainerSblkCon\"]/p");
   for (Object match : ns) {
       String innerHtml = cleaner.getInnerHtml((TagNode) match);
       System.out.println("<p>" + innerHtml + "</p>");
   }

   // Print the first paragraph once as inner HTML and once as plain text.
   // Guarded because the XPath may match nothing, in which case ns is empty
   // and indexing ns[0] would throw ArrayIndexOutOfBoundsException.
   if (ns.length > 0) {
       TagNode firstParagraph = (TagNode) ns[0];
       System.out.println(cleaner.getInnerHtml(firstParagraph));
       System.out.println(firstParagraph.getText());
   }

  

 

   // Example 2: download an index page, print its <title>, then print the
   // absolute URL of every link found in the selected table.
   // NOTE(review): new URL(...) throws the checked MalformedURLException, and the
   // HtmlCleaner calls presumably declare checked exceptions as well, so this
   // snippet must live inside a method that declares or handles them.
   HtmlCleaner cleaner = new HtmlCleaner();
        String url = "http://finance.sina.com.cn/nmetal/hjfx.html";
        URL baseUrl = new URL(url);
        TagNode node = cleaner.clean(baseUrl);

        // Look up elements by tag name (recursive search): the page title.
        Object[] ns = node.getElementsByName("title", true);
        if (ns.length > 0) {
            System.out.println("title=" + ((TagNode) ns[0]).getText());
        }

        // Select every <a> inside the cells of the second <table> that sits under
        // any element whose class is "Frame-Row3-01-C".
        ns = node.evaluateXPath("//*[@class='Frame-Row3-01-C']/table[2]/tbody/tr/td/a");
        for (Object match : ns) {
            TagNode n = (TagNode) match;

            // Alternative: print the link text instead of the address.
//           String in = cleaner.getInnerHtml(n);
//           System.out.println(in);

            // Alternative: print the raw href exactly as written in the page.
//          System.out.println(n.getAttributeByName("href"));

            // An anchor without an href has nothing to resolve; passing null to
            // the URL constructor would throw and abort the whole loop, so skip it.
            String href = n.getAttributeByName("href");
            if (href == null) {
                continue;
            }

            // Resolve the (possibly relative) href against the page URL.
            System.out.println(new URL(baseUrl, href).toString());
        }

        // Further usage examples, left disabled:

        // Inner HTML and plain text of the first match (check ns.length first).
//        String in = cleaner.getInnerHtml((TagNode)ns[0]);
//        System.out.println(in);

//        System.out.println(((TagNode)ns[0]).getText());

//        System.out.println("ul/li:");
//        // Select by XPath.
//        ns = node.evaluateXPath("//div[@class='d_1']//li");
//        for(Object on : ns) {
//            TagNode n = (TagNode) on;
//            System.out.println("\ttext="+n.getText());
//        }
//        System.out.println("a:");
//        // Select by attribute value.
//        ns = node.getElementsByAttValue("name", "my_href", true, true);
//        for(Object on : ns) {
//            TagNode n = (TagNode) on;
//            System.out.println("\thref="+n.getAttributeByName("href")+", text="+n.getText());
//        }

本文转载于:http://gstarwd.iteye.com/blog/644502

XPath 参考教程:http://www.w3school.com.cn/xpath/xpath_syntax.asp

 

posted @ 2015-09-11 11:10  zhanggl  阅读(407)  评论(0)  编辑  收藏  举报