HttpClient抓取带有压缩性质的网页

使用 HttpClient 抓取经过 gzip 压缩的网页时,需要先对响应体进行解压(解码);如果缺少这一步,得到的内容就会是乱码。

package com.yangbo.examples;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;


/**
 * Small crawler that downloads mobile-phone listing pages from
 * product.pconline.com.cn, pulls model / network type / brand out of the HTML
 * with plain string splitting, and stores each record through {@code SqlHelper}.
 *
 * <p>The HTTP helpers transparently decompress gzip-encoded responses; without
 * that step a compressed page would be decoded as garbage text.
 */
public class MobileInformationPconline {

    /** Line terminator appended after every line read from a response body. */
    private static final String LINE_SEPARATOR = "\r\n";

    /** Connection-establishment timeout, in milliseconds. */
    private static final int CONNECTION_TIMEOUT_MS = 5000;

    /** Socket read timeout (SO_TIMEOUT), in milliseconds. */
    private static final int SOCKET_TIMEOUT_MS = 10000;

    /**
     * Matches one {@code <h3><a href="...">...</a></h3>} link and captures the
     * href value. The reluctant {@code .*?} keeps several links on the same line
     * from being merged into a single match.
     */
    private static final Pattern MOBILE_LINK_PATTERN =
            Pattern.compile("<h3><a href=\"([^\"]*)\".*?</a></h3>");

    /** Marker that precedes every phone entry on a listing page. */
    private static final String ENTRY_MARKER = "<a class=\"name\" href=\"";

    /** Prefix of the listing-page URLs. */
    private static final String LIST_URL_PREFIX = "http://product.pconline.com.cn/mobile/";

    /** Number of listing pages visited by {@link #main(String[])}. */
    private static final int PAGE_COUNT = 125;

    /** Step between the numeric offsets embedded in consecutive listing URLs. */
    private static final int PAGE_OFFSET_STEP = 25;

    /**
     * Downloads a page and returns its text, decompressing the body first when
     * the server answers with a gzip {@code Content-Encoding}.
     *
     * <p>This is a best-effort call: I/O failures are printed to standard error
     * and whatever was read up to that point is returned.
     *
     * @param htmlurl URL of the page to fetch
     * @param charset name of the charset used to decode the response body
     * @return the page text with every line terminated by {@code "\r\n"}; the
     *         empty string when the status is not 200 or there is no body
     * @throws IOException declared for compatibility with existing callers;
     *         I/O errors are currently handled inside the method
     */
    public String getHtmlContent(String htmlurl, String charset)
            throws IOException {
        StringBuilder content = new StringBuilder();

        HttpClient httpClient = new HttpClient();
        httpClient.getHttpConnectionManager().getParams()
                .setConnectionTimeout(CONNECTION_TIMEOUT_MS);

        GetMethod method = new GetMethod(htmlurl);
        // The read timeout must be *set*; reading the parameter with a default
        // value leaves the request without any socket timeout.
        method.getParams().setSoTimeout(SOCKET_TIMEOUT_MS);
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());

        try {
            int statusCode = httpClient.executeMethod(method);
            if (statusCode != HttpStatus.SC_OK) {
                return content.toString();
            }

            InputStream body = method.getResponseBodyAsStream();
            if (body == null) {
                // No response body at all: nothing to decode.
                return content.toString();
            }
            if (isGzipEncoded(method)) {
                body = new GZIPInputStream(body);
            }
            readLinesInto(content, body, charset);
        } catch (IOException e) {
            // HttpException is an IOException, so this covers protocol errors too.
            e.printStackTrace();
        } finally {
            // Always give the connection back, including on the non-200 early
            // return and on unexpected runtime exceptions.
            method.abort();
            method.releaseConnection();
        }
        return content.toString();
    }

    /**
     * Downloads a page and decodes it as GB2312, without gzip handling and
     * without checking the HTTP status code.
     *
     * <p>I/O failures are printed to standard error and whatever was read up to
     * that point is returned.
     *
     * @param url URL of the page to fetch
     * @return the page text with every line terminated by {@code "\r\n"};
     *         the empty string when nothing could be read
     */
    public String getHtml(String url) {
        HttpClient httpClient = new HttpClient();
        HttpMethod get = new GetMethod(url);
        StringBuilder html = new StringBuilder();
        try {
            httpClient.executeMethod(get);
            InputStream body = get.getResponseBodyAsStream();
            if (body != null) {
                readLinesInto(html, body, "GB2312");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }
        return html.toString();
    }

    /**
     * Collects the targets of all {@code <h3><a href="...">...</a></h3>} links
     * found in the given HTML, each prefixed with
     * {@code http://detail.zol.com.cn}.
     *
     * <p>Links whose {@code href} value is not double-quoted are skipped.
     *
     * @param html HTML text to scan; {@code null} is treated as empty
     * @return the absolute URLs in document order, never {@code null}
     */
    public ArrayList<String> getMobileUrl(String html) {
        ArrayList<String> mobileUrl = new ArrayList<String>();
        if (html == null) {
            return mobileUrl;
        }
        Matcher m = MOBILE_LINK_PATTERN.matcher(html);
        while (m.find()) {
            mobileUrl.add("http://detail.zol.com.cn" + m.group(1));
        }
        return mobileUrl;
    }

    /**
     * Inserts one row into {@code mobile_information}; the {@code updateTime}
     * column is filled by the database's {@code now()}.
     *
     * <p>Failures are printed to standard error and otherwise ignored.
     *
     * @param mobileModel phone model name
     * @param mobileBrand phone brand name
     * @param netType     network type text
     */
    public void insertMobileInformation(String mobileModel, String mobileBrand, String netType) {
        Connection ct = null;
        try {
            // NOTE(review): executeUpdate is not handed this connection; the call is
            // kept because SqlHelper may depend on it internally - confirm
            // against SqlHelper.
            ct = SqlHelper.getConnection();
            String[] parameters = {mobileModel, mobileBrand, netType};
            String sql = "insert into mobile_information(mobileModel,mobileBrand,netType,updateTime) values (?,?,?,now())";
            SqlHelper.executeUpdate(sql, parameters);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (ct != null) {
                try {
                    ct.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Walks the listing pages, prints what it finds for every phone entry and
     * stores each phone whose detail page could be parsed.
     *
     * @param args ignored
     * @throws IOException declared for compatibility; failures are caught and
     *         printed per page / per entry
     */
    public static void main(String[] args) throws IOException {
        MobileInformationPconline crawler = new MobileInformationPconline();

        for (int i = 0; i < PAGE_COUNT; i++) {
            String url = LIST_URL_PREFIX + i * PAGE_OFFSET_STEP + "s1.shtml";
            System.out.println(url);

            try {
                String html = crawler.getHtmlContent(url, "gb2312");

                // Split once per page; element 0 is the text before the first entry.
                String[] entries = html.split(ENTRY_MARKER);
                int mobileCount = entries.length - 1;
                System.out.println(mobileCount);

                for (int j = 1; j <= mobileCount; j++) {
                    try {
                        String entry = entries[j];

                        String mobileUrl = entry.split("\"")[0];
                        System.out.println(mobileUrl);
                        String mobileModel = entry.split("target=\"_blank\">")[1].split("</a>")[0];
                        System.out.println(mobileModel);

                        String netType;
                        if (entry.contains("网络制式")) {
                            netType = entry.split("网络制式:</i>")[1].split("</dd>")[0];
                        } else if (entry.contains("手机制式")) {
                            netType = entry.split("手机制式:</i>")[1].split("</dd>")[0];
                        } else {
                            // NOTE(review): this abandons the rest of the page, not just
                            // this entry - confirm that "break" rather than "continue"
                            // is intended.
                            break;
                        }
                        System.out.println(netType);

                        String mobileHtml = crawler.getHtmlContent(mobileUrl, "gb2312");
                        String mobileBrand = mobileHtml.split("<div class=\"crumb fl\">")[1].split("title=\"")[4].split("手机大全")[0];
                        System.out.println(mobileBrand);
                        System.out.println(i * PAGE_OFFSET_STEP + j);
                        System.out.println();

                        crawler.insertMobileInformation(mobileModel, mobileBrand, netType);
                    } catch (Exception e) {
                        // A malformed entry must not stop the remaining entries.
                        e.printStackTrace();
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /** Tells whether the response declares a gzip {@code Content-Encoding}. */
    private static boolean isGzipEncoded(HttpMethod method) {
        if (method.getResponseHeader("Content-Encoding") == null) {
            return false;
        }
        String encoding = method.getResponseHeader("Content-Encoding").getValue();
        // Fixed locale: the default-locale toLowerCase() maps "I" to a dotless i
        // under Turkish locales, which would break the "gzip" comparison.
        return encoding != null
                && encoding.toLowerCase(java.util.Locale.ENGLISH).indexOf("gzip") > -1;
    }

    /**
     * Decodes {@code body} with {@code charset} and appends it line by line to
     * {@code target}, terminating each line with {@code "\r\n"}. The stream is
     * always closed; lines read before a failure stay in {@code target}.
     */
    private static void readLinesInto(StringBuilder target, InputStream body, String charset)
            throws IOException {
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(body, charset));
            String line;
            while ((line = reader.readLine()) != null) {
                target.append(line).append(LINE_SEPARATOR);
            }
        } finally {
            body.close();
        }
    }

}

 

posted on 2014-01-02 10:40  jingyunyb  阅读(532)  评论(0)  编辑  收藏  举报