HttpClient 4.3 的例子

package com.unbank.robotspider.util;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.ContentEncodingHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;

/**
 * Small helper that fetches the body of a web page with Apache HttpClient 4.3.
 *
 * <p>Every call creates (and closes) its own {@link CloseableHttpClient} and
 * cookie store, so calls do not share connections or cookies with each other.
 */
public class CrawlerRequest {

    private final static Logger logger = Logger.getLogger(CrawlerRequest.class);

    /** User-Agent sent by default; a caller-supplied header may override it. */
    private static final String USER_AGENT_CHROME = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4";

    /** Charset used to decode the body when the caller does not name one. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Timeout (ms) used when the caller passes a non-positive value. */
    private static final int DEFAULT_TIMEOUT_MS = 5000;

    /** Number of additional attempts made after the first request fails. */
    private static final int MAX_RETRIES = 5;

    /**
     * Fetches {@code url} with a GET request, a 2000 ms timeout and UTF-8 as
     * the fallback charset.
     *
     * @param url absolute URL of the page to fetch
     * @return the response body, or an empty string on any failure
     */
    public String getUrlRespHtml(String url) {
        return getUrlRespHtml(url, null, null, 2000, "utf-8");
    }

    /**
     * Fetches a page and returns its body as text.
     *
     * <p>A POST with a URL-encoded form body is sent when {@code postDict} is
     * non-null, otherwise a GET. If the request fails with an I/O error it is
     * retried up to {@value #MAX_RETRIES} more times, sleeping a random 1-6
     * seconds before each retry. Only a {@code 200 OK} response yields a body.
     *
     * @param pageUrl     absolute URL to request; it is re-assembled into a
     *                    {@link URI} so that spaces in the path or query are
     *                    escaped
     * @param headerDict  extra request headers, or {@code null} for none;
     *                    entries replace same-named defaults such as
     *                    {@code User-Agent}
     * @param postDict    form fields to POST, or {@code null} to send a GET
     * @param timeout     connect and socket timeout in milliseconds; values
     *                    {@code <= 0} fall back to {@value #DEFAULT_TIMEOUT_MS}
     * @param htmlCharset charset handed to {@link EntityUtils#toString} for
     *                    decoding the body; {@code null} or empty means UTF-8
     * @return the response body, or an empty string if the URL is invalid, the
     *         request ultimately fails, or the status is not {@code 200}
     */
    public String getUrlRespHtml(String pageUrl,
            List<NameValuePair> headerDict, List<NameValuePair> postDict,
            int timeout, String htmlCharset) {
        String respHtml = "";

        URI uri = toUri(pageUrl);
        if (uri == null) {
            // Already logged; without a target there is nothing to request.
            return respHtml;
        }

        HttpUriRequest request = buildRequest(uri, headerDict, postDict, timeout);
        if (request == null) {
            return respHtml;
        }

        String charset = htmlCharset;
        if (charset == null || charset.isEmpty()) {
            charset = DEFAULT_CHARSET;
        }

        // A fresh per-call cookie store lets cookies survive redirects inside
        // this one exchange without leaking into other calls.
        CookieStore cookieStore = new BasicCookieStore();
        HttpClientContext localContext = HttpClientContext.create();
        localContext.setCookieStore(cookieStore);

        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        try {
            response = executeWithRetry(httpClient, request, localContext, pageUrl);
            if (response != null
                    && response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                HttpEntity respEnt = response.getEntity();
                // A 200 response is not guaranteed to carry an entity.
                if (respEnt != null) {
                    respHtml = EntityUtils.toString(respEnt, charset);
                }
            }
        } catch (IOException ioe) {
            logger.info(pageUrl + "=====读取出错===" + ioe);
        } finally {
            // Close each resource independently so that a failure closing the
            // response cannot prevent the client from being closed.
            closeQuietly(response);
            closeQuietly(httpClient);
        }

        return respHtml;
    }

    /**
     * Parses {@code pageUrl} and rebuilds it as a {@link URI} so that illegal
     * characters such as spaces are escaped.
     *
     * <p>NOTE: the multi-argument {@code URI} constructor always quotes
     * {@code '%'}, so an already percent-encoded URL will be encoded twice.
     *
     * @return the escaped URI, or {@code null} (after logging) if the input is
     *         null or cannot be parsed
     */
    private static URI toUri(String pageUrl) {
        if (pageUrl == null) {
            logger.warn("Cannot fetch a null URL");
            return null;
        }
        try {
            URL url = new URL(pageUrl);
            // getAuthority() keeps user-info and an explicit port, which
            // getHost() alone would silently drop.
            return new URI(url.getProtocol(), url.getAuthority(), url.getPath(),
                    url.getQuery(), null);
        } catch (MalformedURLException e) {
            logger.warn("Malformed URL: " + pageUrl, e);
        } catch (URISyntaxException e) {
            logger.warn("URL cannot be converted to a URI: " + pageUrl, e);
        }
        return null;
    }

    /**
     * Creates the GET or POST request with timeouts, cookie policy and headers
     * applied.
     *
     * @return the configured request, or {@code null} (after logging) if the
     *         form body could not be encoded
     */
    private static HttpUriRequest buildRequest(URI uri,
            List<NameValuePair> headerDict, List<NameValuePair> postDict,
            int timeout) {
        int effectiveTimeout = timeout > 0 ? timeout : DEFAULT_TIMEOUT_MS;
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(effectiveTimeout)
                .setConnectTimeout(effectiveTimeout)
                .setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY).build();

        HttpUriRequest request;
        if (postDict != null) {
            HttpPost postReq = new HttpPost(uri);
            postReq.setConfig(requestConfig);
            try {
                postReq.setEntity(new UrlEncodedFormEntity(postDict, "UTF-8"));
            } catch (IOException e) {
                // Sending a POST without its body would be a different request.
                logger.warn("Cannot encode form body for " + uri, e);
                return null;
            }
            request = postReq;
        } else {
            HttpGet getReq = new HttpGet(uri);
            getReq.setConfig(requestConfig);
            request = getReq;
        }

        request.setHeader("User-Agent", USER_AGENT_CHROME);
        if (headerDict != null) {
            for (NameValuePair header : headerDict) {
                if (header != null && header.getName() != null) {
                    // setHeader (not addHeader) so callers can override defaults.
                    request.setHeader(header.getName(), header.getValue());
                }
            }
        }
        return request;
    }

    /**
     * Executes the request, retrying after I/O failures.
     *
     * @return the response of the first successful attempt, or {@code null} if
     *         every attempt failed or the thread was interrupted while waiting
     */
    private static CloseableHttpResponse executeWithRetry(
            CloseableHttpClient httpClient, HttpUriRequest request,
            HttpClientContext context, String pageUrl) {
        int totalAttempts = MAX_RETRIES + 1;
        for (int attempt = 1; attempt <= totalAttempts; attempt++) {
            if (attempt > 1) {
                try {
                    // Random 1-6 second back-off before each retry.
                    Thread.sleep(((int) (Math.random() * 6) + 1) * 1000L);
                } catch (InterruptedException ie) {
                    // Preserve the interrupt for the caller and stop retrying.
                    Thread.currentThread().interrupt();
                    logger.warn("Interrupted while waiting to retry " + pageUrl);
                    return null;
                }
            }
            try {
                return httpClient.execute(request, context);
            } catch (IOException e) {
                logger.warn(pageUrl + " request failed (attempt " + attempt
                        + " of " + totalAttempts + ")", e);
            } catch (RuntimeException e) {
                // Not a transient network problem; retrying would not help.
                logger.warn(pageUrl + " request failed unexpectedly", e);
                return null;
            }
        }
        return null;
    }

    /** Closes {@code resource} if non-null, logging instead of throwing. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.info(e);
        }
    }

}

 

posted @ 2014-06-17 17:33  杨桃  阅读(1211)  评论(0编辑  收藏  举报