crawler_基础之_java.net.HttpURLConnection 访问网络资源

Java 访问网络资源的方式 ,由底层到高层封装依次为: socket ==> java.net.HttpURLConnection ==> HttpClient

这次先阐述 java.net.HttpURLConnection 的方式 ,好处是不用导包 ,JDK 原生自带。

HtmlUtil 包含尝试重连(3次) ,编码识别,保存文件到磁盘

package com.cph.crawler.core.utils;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * HTML/HTTP helper methods built on {@link HttpURLConnection}: page download
 * with retry and charset detection, cookie collection, and saving a remote
 * resource to disk.<br>
 * Created 2012-09-22.<br>
 * <p>
 * Every request uses its own local connection object, so the static methods
 * do not share per-request state with each other.
 *
 * @author cphmvp
 */
public final class HtmlUtil {
    public final static Log LOG = LogFactory.getLog(HtmlUtil.class);

    /** Charset used to decode a page when neither the server nor the caller names a usable one. */
    static String defaultEncoding = "utf-8";

    /**
     * @deprecated no longer read or written by this class; each request now
     *             uses a local connection. Kept only so existing references
     *             within the package still compile.
     */
    @Deprecated
    static HttpURLConnection httpURLConnection = null;

    /**
     * @deprecated no longer read or written by this class; each request now
     *             uses a local URL. Kept only so existing references within
     *             the package still compile.
     */
    @Deprecated
    static URL urlModel = null;

    // Connect timeout in milliseconds.
    static int connectTimeout = 100000;
    // Read (response) timeout in milliseconds.
    static int readTimeout = 100000;

    /** User-Agent header sent when downloading pages. */
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)";

    /** Number of retries after the first failed attempt (4 attempts in total). */
    private static final int MAX_RETRIES = 3;

    /** Size of the copy buffer used when saving a download to disk. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Extracts the charset name from a Content-Type header value. Quotes
     * around the name are optional, e.g. {@code text/html; charset=GBK} and
     * {@code text/html; charset="utf-8"} both match.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "charset\\s*=\\s*[\"']?([^\"';\\s]+)", Pattern.CASE_INSENSITIVE);

    /** Matches {@code =value} up to the next {@code &}; group 1 is the value. */
    private static final Pattern PARAM_VALUE_PATTERN = Pattern.compile("=([^&]+)");

    /**
     * Downloads the resource at {@code url} and stores it at {@code savePath}.
     * <p>
     * If {@code savePath} already exists nothing is downloaded. Missing parent
     * directories are created. If the transfer fails, the partially written
     * file is removed so that a later call does not mistake it for a complete
     * download.
     *
     * @param url
     *            address of the picture (or any other resource) to download
     * @param savePath
     *            file path to save to
     * @throws IOException
     *             if the URL is malformed, the parent directory cannot be
     *             created, or the transfer fails
     */
    public static void downloadAndSavePictureToDisk(String url, String savePath)
            throws IOException {
        File target = new File(savePath);
        // Check before opening the connection so an existing file costs no request.
        if (target.exists()) {
            return;
        }
        File parent = target.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()
                && !parent.isDirectory()) {
            throw new IOException("cannot create directory [ " + parent + " ]");
        }

        HttpURLConnection connection = (HttpURLConnection) new URL(url)
                .openConnection();
        InputStream in = null;
        FileOutputStream out = null;
        boolean complete = false;
        try {
            connection.setConnectTimeout(connectTimeout);
            connection.setReadTimeout(readTimeout);
            in = connection.getInputStream();
            out = new FileOutputStream(target);
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
            // Close explicitly on the success path so a close failure is reported.
            out.close();
            complete = true;
        } finally {
            closeQuietly(out);
            closeQuietly(in);
            connection.disconnect();
            if (!complete && out != null && target.exists() && !target.delete()) {
                LOG.warn("could not remove partial download [ " + target + " ]");
            }
        }
    }

    /**
     * URL-encodes (UTF-8) every {@code =value} segment of {@code url}, where a
     * value runs from {@code =} to the next {@code &} or the end of the string.
     * <p>
     * Each value is replaced exactly where it was matched, so values containing
     * regex metacharacters, or values that also occur earlier in the URL, are
     * handled correctly.
     *
     * @param url
     *            the URL whose parameter values should be encoded
     * @return the URL with every parameter value encoded
     * @throws UnsupportedEncodingException
     *             declared by {@link URLEncoder#encode(String, String)}
     */
    private static String encodParamters(String url)
            throws UnsupportedEncodingException {
        Matcher matcher = PARAM_VALUE_PATTERN.matcher(url);
        StringBuffer encoded = new StringBuffer(url.length() + 16);
        while (matcher.find()) {
            String value = URLEncoder.encode(matcher.group(1), "utf-8");
            matcher.appendReplacement(encoded,
                    "=" + Matcher.quoteReplacement(value));
        }
        matcher.appendTail(encoded);
        return encoded.toString();
    }

    /**
     * Requests {@code url} and collects the {@code name=value} part of every
     * {@code Set-Cookie} response header (this is where a JSESSIONID would
     * appear).
     *
     * @param url
     *            the address to request
     * @return the cookies, each followed by {@code ;}, concatenated in header
     *         order; an empty string if there are none or the request could
     *         not be opened
     */
    public static String getSession(String url) {
        StringBuilder cookies = new StringBuilder();
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setConnectTimeout(connectTimeout);
            connection.setReadTimeout(readTimeout);
            String key;
            for (int i = 1; (key = connection.getHeaderFieldKey(i)) != null; i++) {
                if (!key.equalsIgnoreCase("set-cookie")) {
                    continue;
                }
                String cookie = connection.getHeaderField(i);
                if (cookie == null) {
                    continue;
                }
                // Drop attributes (Path, Expires, ...); a cookie without any has no ';'.
                int attributesStart = cookie.indexOf(';');
                if (attributesStart >= 0) {
                    cookie = cookie.substring(0, attributesStart);
                }
                cookies.append(cookie).append(';');
            }
        } catch (MalformedURLException e) {
            LOG.error("invalid url [ " + url + " ]", e);
        } catch (IOException e) {
            LOG.error("cannot open connection [ " + url + " ]", e);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return cookies.toString();
    }

    /**
     * Downloads a page and returns its source, retrying on failure.
     * <p>
     * Up to four attempts are made (one initial attempt plus three retries).
     * The first two attempts use {@code url} as given; the later ones use the
     * URL with its parameter values URL-encoded. The charset named in the
     * response's Content-Type header is used for decoding when the JVM
     * supports it; otherwise {@code encoding} is used.
     *
     * @param url
     *            the page address
     * @param encoding
     *            charset to fall back to when the response does not name a
     *            usable one; {@code null} means the class default
     * @return the page source with every line terminated by {@code \n}, or
     *         {@code null} if all attempts failed
     */
    public static StringBuffer downloadHtml(String url, String encoding) {
        String fallbackCharset = (encoding == null ? defaultEncoding : encoding);
        int tryNum = 0;
        while (true) {
            String requestUrl = url;
            try {
                if (tryNum > 1) {
                    // Plain URL failed twice: retry with encoded parameter values.
                    requestUrl = encodParamters(url);
                }
                return fetchPage(requestUrl, fallbackCharset);
            } catch (Exception e) {
                if (tryNum++ == MAX_RETRIES) {
                    LOG.error("download page error [ " + requestUrl + " ] ", e);
                    return null;
                }
                LOG.warn(tryNum + "次下载失败", e);
            }
        }
    }

    /**
     * Downloads a page and returns its source, retrying on failure. Equivalent
     * to {@link #downloadHtml(String, String)} with the class default encoding
     * as the fallback charset.
     *
     * @param url
     *            the page address
     * @return the page source with every line terminated by {@code \n}, or
     *         {@code null} if all attempts failed
     */
    public static StringBuffer downloadHtml(String url) {
        return downloadHtml(url, defaultEncoding);
    }

    /**
     * Performs a single download attempt. The content is collected in a buffer
     * created for this attempt, so a failed attempt never leaks partial
     * content into a later one.
     *
     * @param requestUrl
     *            the address to request
     * @param fallbackCharset
     *            charset used when the response does not name a usable one
     * @return the page source with every line terminated by {@code \n}
     * @throws IOException
     *             if the URL is malformed or the transfer fails
     */
    private static StringBuffer fetchPage(String requestUrl,
            String fallbackCharset) throws IOException {
        URL requested = new URL(requestUrl);
        HttpURLConnection connection = (HttpURLConnection) requested
                .openConnection();
        InputStream body = null;
        BufferedReader reader = null;
        try {
            connection.setConnectTimeout(connectTimeout);
            connection.setReadTimeout(readTimeout);
            connection.setRequestProperty("User-Agent", USER_AGENT);
            body = connection.getInputStream();

            // Only after the response has arrived can getURL() differ from the request.
            String finalUrl = connection.getURL().toString();
            if (!finalUrl.equals(requested.toString())) {
                LOG.info(requestUrl + "重定向后为" + finalUrl);
            }

            String charSet = resolveCharset(
                    connection.getHeaderField("Content-Type"), fallbackCharset);
            reader = new BufferedReader(new InputStreamReader(body, charSet));
            StringBuffer page = new StringBuffer();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append('\n');
            }
            return page;
        } finally {
            closeQuietly(reader);
            closeQuietly(body);
            connection.disconnect();
        }
    }

    /**
     * Chooses the charset for decoding a response.
     *
     * @param contentType
     *            the Content-Type header value, may be {@code null}
     * @param fallback
     *            charset to use when the header names none, or names one this
     *            JVM does not support
     * @return the charset name to decode with
     */
    private static String resolveCharset(String contentType, String fallback) {
        if (contentType == null) {
            return fallback;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(contentType);
        if (!matcher.find()) {
            return fallback;
        }
        String name = matcher.group(1);
        try {
            if (Charset.isSupported(name)) {
                return name;
            }
        } catch (IllegalArgumentException ignored) {
            // Syntactically illegal charset name: treated like an unsupported one.
        }
        LOG.warn("unsupported charset [ " + name + " ], using " + fallback);
        return fallback;
    }

    /**
     * Closes {@code resource} if it is not {@code null}, logging instead of
     * propagating a failure so that cleanup never masks the primary error.
     *
     * @param resource
     *            the resource to close, may be {@code null}
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            LOG.warn("failed to close resource", e);
        }
    }

}

 

                             

 

posted @ 2013-12-14 01:11  cphmvp  阅读(422)  评论(0编辑  收藏  举报
爬虫在线测试小工具: http://tool.haoshuju.cn/