使用jsoup解析html页面内容案例
public String getFaGuiKuTitles(String type, int page) { String href = "http://info.qd-n-tax.gov.cn/NewFaGuiKu/"+type+"/"; String baseUrl = href + "index"; int no = 0; String msg = ""; if(page>0){ baseUrl = baseUrl + "_"+page; } baseUrl += ".htm"; int totalPage = 0; List<FaGui> list = new ArrayList<FaGui>(); try { URL url = new URL(baseUrl); org.jsoup.nodes.Document doc = Jsoup.parse(url, 10000); org.jsoup.nodes.Element table = doc.select("table").get(0); org.jsoup.nodes.Element tbody = table.select("tbody").get(0); org.jsoup.select.Elements rows = tbody.select("tr"); int len = rows.size(); for (int i = 0; i < len; i++) { org.jsoup.select.Elements cols = rows.get(i).select("td"); FaGui fg = new FaGui(); fg.setTitle(cols.get(0).text()); fg.setDate(cols.get(1).text()); if(cols.size()>2){ fg.setFwzh(cols.get(2).text()); } org.jsoup.nodes.Element a = cols.get(0).select("a").get(0); fg.setHref(a.attr("href").replaceFirst("./", href)); list.add(fg); } //翻页信息 String pager = doc.getElementsByClass("pager").get(0).html(); int start = pager.indexOf("(")+1; int end = pager.indexOf(","); pager = pager.substring(start, end);//截取页面中的总页数 if(pager.matches("\\d+")){ totalPage = Integer.parseInt(pager); } no = 1; msg = "SUCCESS"; log.info("获取税收法规库标题内容", "getFaGuiKuTitles"); } catch (MalformedURLException ex) { Logger.getLogger(LocalServiceImpl.class.getName()).log(Level.SEVERE, null, ex); msg = "获取税收法规库标题内容:baseUrl"+baseUrl+"不可用,ex:"+ex; log.error(msg, "getFaGuiKuTitles"); } catch (IOException ex) { Logger.getLogger(LocalServiceImpl.class.getName()).log(Level.SEVERE, null, ex); msg = "获取税收法规库标题内容:IO异常,ex:"+ex; log.error(msg, "getFaGuiKuTitles"); } return ResultUtil.getResult(no, msg, list,totalPage,page); }