一个同时支持 HTTPS 和 HTTP 的通用爬取工具类(基于 Jsoup,地址用 JDK 自带的 java.net.URL 表示)
今天用 Java 爬取天猫的时候遇到了 SSL 报错,所以从网上找了一个同时支持 HTTPS 和 HTTP 的通用爬取工具类。需要注意:该工具类通过信任所有证书、跳过主机名校验来绕过 SSL 报错,只适合学习和测试,不要用于生产环境。另外,有的时候此工具类爬到的数据不全,此处不得不说 Python 爬虫很厉害。
package cn.qlq.craw.Jsoup; import java.io.File; import java.io.FileWriter; import java.io.Writer; import java.net.MalformedURLException; import java.net.URL; import java.security.SecureRandom; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.Map; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSession; import javax.net.ssl.X509TrustManager; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.helper.HttpConnection; import org.jsoup.nodes.Document; public class HttpCommonUtil { public static void trustEveryone() { try { HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() { public boolean verify(String hostname, SSLSession session) { return true; } }); SSLContext context = SSLContext.getInstance("TLS"); context.init(null, new X509TrustManager[] { new X509TrustManager() { public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { } public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { } public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; } } }, new SecureRandom()); HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory()); } catch (Exception e) { e.printStackTrace(); } } public static Object getHttpHeaders(URL url, int timeout) { try { trustEveryone(); Connection conn = HttpConnection.connect(url); conn.timeout(timeout); conn.header("Accept-Encoding", "gzip,deflate,sdch"); conn.header("Connection", "close"); conn.get(); //String result=conn.response().body(); Map<String, String> result = conn.response().headers(); result.put("title", conn.response().parse().title()); return result; } catch (Exception e) { e.printStackTrace(); } return null; } public static Object getHttpBody(URL url, int timeout) { try { trustEveryone(); Connection conn = 
HttpConnection.connect(url); conn.timeout(timeout); conn.header("Accept-Encoding", "gzip,deflate,sdch"); conn.header("Connection", "close"); conn.get(); //String result=conn.response().body(); // String result = conn.response().body(); String result = conn.response().body(); File file = new File("C:\\Users\\liqiang\\Desktop\\实习\\python\\javaCrawPicture\\tianmao.html"); if(!file.exists()){ file.createNewFile(); }else{ file.delete(); } file.createNewFile(); Writer fileWriter = new FileWriter(file); fileWriter.write(result); fileWriter.close(); return result; } catch (Exception e) { e.printStackTrace(); } return null; } public static void main(String[] args) { try { URL url = new URL("http", "www.tmall.com", -1, ""); System.out.println(getHttpBody(url, 100000)); } catch (MalformedURLException e) { e.printStackTrace(); } } }
【当你用心写完每一篇博客之后,你会发现它比你用代码实现功能更有成就感!】
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· Obsidian + DeepSeek:免费 AI 助力你的知识管理,让你的笔记飞起来!
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了