使用jsoup解析html页面内容案例

/**
 * Downloads one index page of the regulation library for the given category,
 * parses the first HTML table on it into {@code FaGui} entries and returns the
 * outcome via {@code ResultUtil.getResult}.
 *
 * <p>The page URL is {@code .../NewFaGuiKu/<type>/index.htm} when
 * {@code page <= 0} and {@code .../NewFaGuiKu/<type>/index_<page>.htm} otherwise.
 *
 * <p>For every table row the first cell supplies the title and the link, the
 * second cell the date and, when present, the third cell the value passed to
 * {@code setFwzh}. Rows that have fewer than two {@code td} cells or no link in
 * the first cell (e.g. header or spacer rows) are skipped instead of aborting
 * the whole request.
 *
 * @param type category path segment inserted into the URL
 * @param page page index; values {@code <= 0} select the first page
 * @return the value produced by {@code ResultUtil.getResult(no, msg, list, totalPage, page)},
 *         where {@code no} is 1 and {@code msg} is "SUCCESS" when the page was
 *         fetched and its table located, and {@code no} is 0 with a descriptive
 *         message otherwise
 */
public String getFaGuiKuTitles(String type, int page) {
    String href = "http://info.qd-n-tax.gov.cn/NewFaGuiKu/" + type + "/";
    String baseUrl = href + "index";

    int no = 0;
    String msg = "";

    if (page > 0) {
        baseUrl = baseUrl + "_" + page;
    }

    baseUrl += ".htm";

    int totalPage = 0;

    List<FaGui> list = new ArrayList<FaGui>();

    try {
        URL url = new URL(baseUrl);
        org.jsoup.nodes.Document doc = Jsoup.parse(url, 10000);

        // Locate the body of the first table without assuming it exists, so an
        // unexpected page layout yields an error result instead of an uncaught
        // IndexOutOfBoundsException.
        org.jsoup.select.Elements tables = doc.select("table");
        org.jsoup.select.Elements bodies = tables.isEmpty()
                ? tables
                : tables.get(0).select("tbody");

        if (bodies.isEmpty()) {
            msg = "获取税收法规库标题内容:页面结构异常,未找到列表表格,baseUrl" + baseUrl;
            log.error(msg, "getFaGuiKuTitles");
        } else {
            for (org.jsoup.nodes.Element row : bodies.get(0).select("tr")) {
                org.jsoup.select.Elements cols = row.select("td");
                if (cols.size() < 2) {
                    // Not enough cells to supply both title and date.
                    continue;
                }

                org.jsoup.select.Elements anchors = cols.get(0).select("a");
                if (anchors.isEmpty()) {
                    // No link in the title cell, so there is nothing to point the entry at.
                    continue;
                }

                FaGui fg = new FaGui();
                fg.setTitle(cols.get(0).text());
                fg.setDate(cols.get(1).text());

                if (cols.size() > 2) {
                    fg.setFwzh(cols.get(2).text());
                }

                // Resolve "./"-relative links against the category directory.
                // A literal prefix check is used on purpose: replaceFirst("./", ...)
                // treats "." as a regex wildcard and would also rewrite links such
                // as "../x" or "http://...".
                String link = anchors.get(0).attr("href");
                if (link.startsWith("./")) {
                    link = href + link.substring(2);
                }
                fg.setHref(link);

                list.add(fg);
            }

            // Paging info: the text between the first "(" and the next "," in the
            // pager element's HTML is taken as the total page count. When the pager
            // is missing or that text is not numeric, totalPage stays 0.
            org.jsoup.select.Elements pagers = doc.getElementsByClass("pager");
            if (!pagers.isEmpty()) {
                String pager = pagers.get(0).html();
                int start = pager.indexOf("(") + 1;
                int end = pager.indexOf(",", start);

                if (end > start) {
                    String total = pager.substring(start, end).trim();
                    if (total.matches("\\d+")) {
                        totalPage = Integer.parseInt(total);
                    }
                }
            }

            no = 1;
            msg = "SUCCESS";

            log.info("获取税收法规库标题内容", "getFaGuiKuTitles");
        }
    } catch (MalformedURLException ex) {
        Logger.getLogger(LocalServiceImpl.class.getName()).log(Level.SEVERE, null, ex);
        msg = "获取税收法规库标题内容:baseUrl"+baseUrl+"不可用,ex:"+ex;
        log.error(msg, "getFaGuiKuTitles");
    } catch (IOException ex) {
        Logger.getLogger(LocalServiceImpl.class.getName()).log(Level.SEVERE, null, ex);
        msg = "获取税收法规库标题内容:IO异常,ex:"+ex;
        log.error(msg, "getFaGuiKuTitles");
    }

    return ResultUtil.getResult(no, msg, list, totalPage, page);
}
posted @ 2015-05-06 13:15 yshy 阅读(643) 评论(0) 收藏举报
刷新页面返回顶部
YSHY

使用jsoup解析html页面内容案例

公告