一个爬取https和http通用的工具类(JDK自带的URL的用法)

 

  今天用 Java 爬取天猫时遇到了 SSL 报错,于是从网上找了一个 HTTPS 和 HTTP 通用的爬取工具类。不过这个工具类有时爬到的数据不完整,在这一点上不得不说 Python 爬虫更厉害。

复制代码
package cn.qlq.craw.Jsoup;

import java.io.File;
import java.io.FileWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.X509TrustManager;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;  
  
public class HttpCommonUtil {  
      
    public static void trustEveryone() {   
        try {    
            HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {    
                public boolean verify(String hostname, SSLSession session) {    
                    return true;    
                }    
            });    
    
            SSLContext context = SSLContext.getInstance("TLS");    
            context.init(null, new X509TrustManager[] { new X509TrustManager() {    
                public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {    
                }    
    
                public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {    
                }    
    
                public X509Certificate[] getAcceptedIssuers() {    
                    return new X509Certificate[0];    
                }    
            } }, new SecureRandom());    
            HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());    
        } catch (Exception e) {    
            e.printStackTrace();    
        }    
    }    
    
    public static Object getHttpHeaders(URL url, int timeout) {    
        try {    
            trustEveryone();   
            Connection conn = HttpConnection.connect(url);    
            conn.timeout(timeout);    
            conn.header("Accept-Encoding", "gzip,deflate,sdch");    
            conn.header("Connection", "close");    
            conn.get();    
            //String result=conn.response().body();  
            Map<String, String> result = conn.response().headers();    
            result.put("title", conn.response().parse().title());    
            return result;   
        } catch (Exception e) {    
            e.printStackTrace();    
        }    
        return null;    
    }    
    
    public static Object getHttpBody(URL url, int timeout) {    
        try {    
            trustEveryone();   
            Connection conn = HttpConnection.connect(url);    
            conn.timeout(timeout);    
            conn.header("Accept-Encoding", "gzip,deflate,sdch");    
            conn.header("Connection", "close");    
            conn.get();    
            //String result=conn.response().body();  
//            String result = conn.response().body();    
            String result = conn.response().body();    
            File file = new File("C:\\Users\\liqiang\\Desktop\\实习\\python\\javaCrawPicture\\tianmao.html");
            if(!file.exists()){
                file.createNewFile();
            }else{
                file.delete();
            }
            file.createNewFile();
            Writer fileWriter = new FileWriter(file); 
            fileWriter.write(result);
            fileWriter.close();
            return result;   
        } catch (Exception e) {    
            e.printStackTrace();    
        }    
        return null;    
    }    
    
    
    
    public static void main(String[] args) {    
        try {    
            URL url = new URL("http", "www.tmall.com", -1, "");   
            System.out.println(getHttpBody(url, 100000));
        } catch (MalformedURLException e) {    
            e.printStackTrace();    
        }    
    }    
}  
复制代码

 

posted @   QiaoZhi  阅读(1202)  评论(0)  编辑  收藏  举报
编辑推荐:
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
阅读排行:
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· Obsidian + DeepSeek:免费 AI 助力你的知识管理,让你的笔记飞起来!
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
点击右上角即可分享
微信分享提示