使用 jsoup 简单地爬取网页数据
/** * Project Name:JavaTest * File Name:BankOfChinaExchangeRate.java * Package Name:com.lee.javatest * Date:2016年7月22日下午1:34:09 * Copyright (c) 2016年7月22日, Pwenlee All Rights Reserved. * */ package com.lee.javatest; import java.io.Serializable; import java.math.BigDecimal; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.List; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * ClassName:BankOfChinaExchangeRate <br/> * Function: 中行外汇牌价. <br/> * Date: 2016年7月22日 下午1:34:09 <br/> * @author PwenLee * @version * @see */ public class BankOfChinaExchangeRate implements Serializable{ private static final Integer DEAFULT_PAGESIZE = 20; private static final long serialVersionUID = -913877619191789389L; /** * 货币名称 中文简体 */ private String currency; /** * 现汇买入价 */ private BigDecimal buyingRate; /** * 现钞买入价 */ private BigDecimal cashBuyingRate; /** * 现汇卖出价 */ private BigDecimal sellingRate; /** * 现钞卖出价 */ private BigDecimal cashSellingRate; /** * 外管局中间价 */ private BigDecimal SAFEMiddleRate; /** * 中行折算价 */ private BigDecimal bankConvertRate; /** * 发布时间 */ private String dateTime; public String getCurrency() { return currency; } public void setCurrency(String currency) { this.currency = currency; } public BigDecimal getBuyingRate() { return buyingRate; } public void setBuyingRate(BigDecimal buyingRate) { this.buyingRate = buyingRate; } public BigDecimal getCashBuyingRate() { return cashBuyingRate; } public void setCashBuyingRate(BigDecimal cashBuyingRate) { this.cashBuyingRate = 
cashBuyingRate; } public BigDecimal getSellingRate() { return sellingRate; } public void setSellingRate(BigDecimal sellingRate) { this.sellingRate = sellingRate; } public BigDecimal getCashSellingRate() { return cashSellingRate; } public void setCashSellingRate(BigDecimal cashSellingRate) { this.cashSellingRate = cashSellingRate; } public BigDecimal getSAFEMiddleRate() { return SAFEMiddleRate; } public void setSAFEMiddleRate(BigDecimal sAFEMiddleRate) { SAFEMiddleRate = sAFEMiddleRate; } public BigDecimal getBankConvertRate() { return bankConvertRate; } public void setBankConvertRate(BigDecimal bankConvertRate) { this.bankConvertRate = bankConvertRate; } public String getDateTime() { return dateTime; } public void setDateTime(String dateTime) { this.dateTime = dateTime; } /** * * BankOfChinaExchangeRate: * date:日期 例入“2016-07-22” * time:时间 例如“05:30:00” * BankOfChinaCurrencyCode 枚举类 * @author PwenLee * @param startDate * @param endDate * @param currencyCode * @return BankOfChinaExchangeRate */ public BankOfChinaExchangeRate (String date, String time, BankOfChinaCurrencyCode currencyCode){ List<String> context = getExchangeRate(date, time, currencyCode); this.currency = context.get(0); this.buyingRate = new BigDecimal(context.get(1)); this.cashBuyingRate = new BigDecimal(context.get(2)); this.sellingRate = new BigDecimal(context.get(3)); this.cashSellingRate = new BigDecimal(context.get(4)); this.SAFEMiddleRate = new BigDecimal(context.get(5)); this.bankConvertRate = new BigDecimal(context.get(6)); this.dateTime = context.get(7) + " " + context.get(8); } /** * 取当天凌晨05:30:00的数据 */ public BankOfChinaExchangeRate(){ SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd"); Date date=new Date(); String nowDate=sdf.format(date); List<String> context = getExchangeRate(nowDate, "05:30:00", BankOfChinaCurrencyCode.USD); this.currency = context.get(0); this.buyingRate = new BigDecimal(context.get(1)); this.cashBuyingRate = new BigDecimal(context.get(2)); this.sellingRate = new 
BigDecimal(context.get(3)); this.cashSellingRate = new BigDecimal(context.get(4)); this.SAFEMiddleRate = new BigDecimal(context.get(5)); this.bankConvertRate = new BigDecimal(context.get(6)); this.dateTime = context.get(7) + " " + context.get(8); } /** * 模拟请求url,返回html源码 * @author PwenLee * @param url * @return */ private static String GetHtml(String url) { String html = null; HttpClient httpClient = new DefaultHttpClient(); httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000); HttpGet httpGet = new HttpGet(url); try { HttpResponse httpResponse = httpClient.execute(httpGet); int resStatu = httpResponse.getStatusLine().getStatusCode(); if (resStatu == HttpStatus.SC_OK) { HttpEntity entity = httpResponse.getEntity(); if (entity != null) { html = EntityUtils.toString(entity, "utf-8"); } } } catch (Exception e) { //TODO 打成logger System.out.println("Connect " + url + " error"); e.printStackTrace(); } finally { httpClient.getConnectionManager().shutdown(); } return html; } private List<String> getExchangeRate(String date, String time, BankOfChinaCurrencyCode currencyCode){ Integer totalPage = totalPage(date, time, currencyCode); List<String> contextList = new ArrayList<String>(); if(totalPage <= 0){ //TODO logger return contextList; } String context = ""; for(int i=totalPage;i>=0;i--){ String url = "http://srh.bankofchina.com/search/whpj/search.jsp?erectDate="+date+"¬hing="+date+"&pjname="+currencyCode.getCode()+"&page="+i; String html = GetHtml(url); Document doc = Jsoup.parse(html); Elements linkElements = doc.getElementsByClass("BOC_main"); Elements datas = linkElements.get(0).getElementsByTag("tr"); for (Element ele : datas) { if(ele.text().indexOf(time) != -1){ context = ele.text(); break; } } if(context != ""){ //TODO 换成StringUtils.isNotBlank break; } } if(context == "") {//TODO 换成StringUtils.isBlank //TODO logger return contextList; }else{ contextList = Arrays.asList(context.split(" ")); } return contextList; } public static Integer 
totalPage(){ Integer totalPage = 0; try{ SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd"); Date date=new Date(); String nowDate=sdf.format(date); String url = "http://srh.bankofchina.com/search/whpj/search.jsp?erectDate="+nowDate+"¬hing="+nowDate+"&pjname="+BankOfChinaCurrencyCode.USD.getCode(); String html = GetHtml(url); //截取网页总条数变量 String stringTemp = html.substring(html.indexOf("m_nRecordCount = ")); //获取变量的值 String totalcount = stringTemp.substring(stringTemp.indexOf("m_nRecordCount = ")+"m_nRecordCount = ".length(),stringTemp.indexOf(";")); Integer totalnum = Integer.valueOf(totalcount); if(totalnum % DEAFULT_PAGESIZE == 0){ totalPage = totalnum/DEAFULT_PAGESIZE; }else{ totalPage = totalnum/DEAFULT_PAGESIZE+1; } }catch(Exception e){ //TODO 打成logger } return totalPage; } public static Integer totalPage(String date, String time, BankOfChinaCurrencyCode currencyCode){ Integer totalPage = 0; try{ String url = "http://srh.bankofchina.com/search/whpj/search.jsp?erectDate="+date+"¬hing="+date+"&pjname="+currencyCode.getCode(); String html = GetHtml(url); //截取网页总条数变量 String stringTemp = html.substring(html.indexOf("m_nRecordCount = ")); //获取变量的值 String totalcount = stringTemp.substring(stringTemp.indexOf("m_nRecordCount = ")+"m_nRecordCount = ".length(),stringTemp.indexOf(";")); Integer totalnum = Integer.valueOf(totalcount); if(totalnum % DEAFULT_PAGESIZE == 0){ totalPage = totalnum/DEAFULT_PAGESIZE; }else{ totalPage = totalnum/DEAFULT_PAGESIZE+1; } }catch(Exception e){ //TODO 打成logger } return totalPage; } @Override public String toString() { return "BankOfChinaExchangeRate [currency=" + currency + ", buyingRate=" + buyingRate + ", cashBuyingRate=" + cashBuyingRate + ", sellingRate=" + sellingRate + ", cashSellingRate=" + cashSellingRate + ", SAFEMiddleRate=" + SAFEMiddleRate + ", bankConvertRate=" + bankConvertRate + ", dateTime=" + dateTime + "]"; } }