HttpClient抓取带有压缩性质的网页
使用 HttpClient 抓取经过压缩(例如 gzip,响应头 Content-Encoding: gzip)的网页时,需要先对响应体进行解压,再按网页的字符集解码;如果缺少解压这一步,得到的内容就会是乱码。
package com.yangbo.examples; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; public class MobileInformationPconline { /** * 用正则表达式来提取抓取下来的html中的信息 * @throws HttpException * @throws IOException */ public String getHtmlContent(String htmlurl, String charset) throws IOException { StringBuffer sb = new StringBuffer(); String acceptEncoding = ""; /* 1.生成 HttpClinet 对象并设置参数 */ HttpClient httpClient = new HttpClient(); // 设置 Http 连接超时 5s httpClient.getHttpConnectionManager().getParams().setConnectionTimeout( 5000); GetMethod method = new GetMethod(htmlurl); // 设置 get 请求超时 5s method.getParams().getDoubleParameter(HttpMethodParams.SO_TIMEOUT, 10000); // 设置请求重试处理 method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,new DefaultHttpMethodRetryHandler()); int statusCode; try { statusCode = httpClient.executeMethod(method); // 判断访问的状态码 if (statusCode != HttpStatus.SC_OK) { return sb.toString(); } else { if (method.getResponseHeader("Content-Encoding") != null) acceptEncoding = method .getResponseHeader("Content-Encoding").getValue(); if (acceptEncoding.toLowerCase().indexOf("gzip") > -1) { // 建立gzip解压工作流 InputStream is; is = method.getResponseBodyAsStream(); GZIPInputStream gzin = new GZIPInputStream(is); InputStreamReader isr = new InputStreamReader(gzin, 
charset); // 设置读取流的编码格式,自定义编码 java.io.BufferedReader br = new java.io.BufferedReader(isr); String tempbf; while ((tempbf = br.readLine()) != null) { sb.append(tempbf); sb.append("\r\n"); } isr.close(); gzin.close(); //System.out.println(sb); } else { InputStreamReader isr; isr = new InputStreamReader( method.getResponseBodyAsStream(), charset); java.io.BufferedReader br = new java.io.BufferedReader(isr); String tempbf; while ((tempbf = br.readLine()) != null) { sb.append(tempbf); sb.append("\r\n"); } isr.close(); } } } catch (HttpException e1) { e1.printStackTrace(); } catch (IOException e1) { e1.printStackTrace(); } method.abort(); method.releaseConnection(); return sb.toString(); } public String getHtml(String url){ HttpClient httpClient=new HttpClient(); HttpMethod get=new GetMethod(url); String html=""; try { httpClient.executeMethod(get); BufferedReader reader=new BufferedReader(new InputStreamReader(get.getResponseBodyAsStream(),"GB2312")); String tmp=null; while((tmp=reader.readLine())!=null){ html+=tmp+"\r\n"; } } catch (HttpException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); }finally{ get.releaseConnection(); } return html; } public ArrayList<String> getMobileUrl(String html){ Pattern p = null; //正则表达式 Matcher m = null; //操作的字符串 p = Pattern.compile("<h3><a href=.*</a></h3>"); m = p.matcher(html); ArrayList<String> mobileUrl = new ArrayList<String>(); String mobileUrlString = null; while(m.find()){ mobileUrlString = "http://detail.zol.com.cn"+m.group().split("href=\"")[1].split("\"")[0]; mobileUrl.add(mobileUrlString); } return mobileUrl; } public void insertMobileInformation(String mobileModel,String mobileBrand,String netType){ Connection ct =null; PreparedStatement ps=null; ResultSet rs=null; try { //1、获取数据库链接 ct=SqlHelper.getConnection(); String[] parameters = {mobileModel,mobileBrand,netType}; String sql = "insert into mobile_information(mobileModel,mobileBrand,netType,updateTime) values (?,?,?,now())"; 
SqlHelper.executeUpdate(sql,parameters); } catch (Exception e) { e.printStackTrace(); }finally{ if(rs!=null){ try { rs.close(); } catch (Exception e) { e.printStackTrace(); } rs=null; } if(ps!=null){ try { ps.close(); } catch (Exception e) { e.printStackTrace(); } ps=null; } if(ct!=null){ try { ct.close(); } catch (Exception e) { e.printStackTrace(); } ct=null; } } } public static void main(String[] args) throws IOException{ String url = "http://product.pconline.com.cn/mobile/25s1.shtml"; String u = "http://product.pconline.com.cn/mobile/"; for(int i=0;i<125;i++){ url = u+i*25+"s1.shtml"; System.out.println(url); try { MobileInformationPconline mobileInformationRegex = new MobileInformationPconline(); String html = mobileInformationRegex.getHtmlContent(url, "gb2312"); int mobileCount = html.split("<a class=\"name\" href=\"").length-1; System.out.println(mobileCount); for(int j=1;j<=mobileCount;j++){ try { String mobileUrl = html.split("<a class=\"name\" href=\"")[j].split("\"")[0]; System.out.println(mobileUrl); String mobileModel = html.split("<a class=\"name\" href=\"")[j].split("target=\"_blank\">")[1].split("</a>")[0]; System.out.println(mobileModel); String netType=null; if(html.split("<a class=\"name\" href=\"")[j].contains("网络制式")){ netType = html.split("<a class=\"name\" href=\"")[j].split("网络制式:</i>")[1].split("</dd>")[0]; }else if(html.split("<a class=\"name\" href=\"")[j].contains("手机制式")){ netType = html.split("<a class=\"name\" href=\"")[j].split("手机制式:</i>")[1].split("</dd>")[0]; }else{ break; } System.out.println(netType); String mobileHtml = mobileInformationRegex.getHtmlContent(mobileUrl, "gb2312"); String mobileBrand = mobileHtml.split("<div class=\"crumb fl\">")[1].split("title=\"")[4].split("手机大全")[0]; System.out.println(mobileBrand); System.out.println(i*25+j); System.out.println(); mobileInformationRegex.insertMobileInformation(mobileModel,mobileBrand,netType); } catch (Exception e) { e.printStackTrace(); } } } catch (Exception e) { 
e.printStackTrace(); } } } }