java抓取https网页爬虫,解决Server returned HTTP response code: 403 for URL报错

java抓取https网页爬虫,解决Server returned HTTP response code: 403 for URL报错

关键是在忽略https的地方加上:connection.setRequestProperty("User-Agent", "Mozilla/4.76");

注意:需要加在new BufferedReader 前面才行,否则无效。

HttpsURLConnection.setDefaultHostnameVerifier(hv);

            connection = (HttpURLConnection) validationUrl.openConnection();
            //first set User-Agent to solve Server returned HTTP response code: 403 for URL
            connection.setRequestProperty("User-Agent", "Mozilla/4.76");
            
            final BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream()));

抓取的地方先调用忽略https的代码

//先调用下忽略https证书的再请求才可以
            HttpsUrlValidator.retrieveResponseFromServer(url);
            
            doc = Jsoup
                    .connect(url)
                    .header("User-Agent",rand_agents)

完整的HttpsUrlValidator.java代码如下:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;


public class HttpsUrlValidator {

    static HostnameVerifier hv = new HostnameVerifier() {
        public boolean verify(String urlHostName, SSLSession session) {
            System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                               + session.getPeerHost());
            return true;
        }
    };

    public final static String retrieveResponseFromServer(final String url) {
        HttpURLConnection connection = null;
        
        try {
            URL validationUrl = new URL(url);
            trustAllHttpsCertificates();
            HttpsURLConnection.setDefaultHostnameVerifier(hv);

            connection = (HttpURLConnection) validationUrl.openConnection();
            //first set User-Agent to solve Server returned HTTP response code: 403 for URL
            connection.setRequestProperty("User-Agent", "Mozilla/4.76");
            
            final BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream()));
            
            String line;
            final StringBuffer stringBuffer = new StringBuffer(255);

            synchronized (stringBuffer) {
                while ((line = in.readLine()) != null) {
                    stringBuffer.append(line);
                    stringBuffer.append("\n");
                }
                return stringBuffer.toString();
            }

        } catch (final IOException e) {
            System.out.println(e.getMessage());
            return null;
        } catch (final Exception e1){
            System.out.println(e1.getMessage());
            return null;
        }finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }
    
    public static void trustAllHttpsCertificates() throws Exception {
        javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1];
        javax.net.ssl.TrustManager tm = new miTM();
        trustAllCerts[0] = tm;
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext
                .getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc
                .getSocketFactory());
    }

    static class miTM implements javax.net.ssl.TrustManager,
            javax.net.ssl.X509TrustManager {
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            return null;
        }

        public boolean isServerTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        public boolean isClientTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        public void checkServerTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            return;
        }

        public void checkClientTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            return;
        }
    }

}

 

posted @ 2022-10-21 16:00  大自然的流风  阅读(2103)  评论(0编辑  收藏  举报