HttpClient

视频

https://www.bilibili.com/video/BV1cE411u7RA?p=4

https://www.bilibili.com/video/BV1W54y1s7BZ?p=4

基础文本框架

https://www.ayulong.cn/blog/12

https支持

https://blog.csdn.net/Sakuraaaaaaa/article/details/107280162

1|0HttpClient

官网:http://hc.apache.org

1|1准备

1|0maven

<dependencies>
    <!-- HttpClient: https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.12</version>
    </dependency>

    <!-- Logging binding: https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
        <scope>test</scope>
    </dependency>
</dependencies>

1|0创建 slf4j 日志配置文件

在 resources 目录下创建 log4j.properties 文件, 并添加以下配置

# A1: appender that prints log output to the console
log4j.rootLogger=DEBUG,A1
log4j.logger.cn.itcast = DEBUG
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
# Line format: timestamp [thread] [logger]-[level] message
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n

1|0最简单的爬虫程序

在java的 cn.ayulong.crawler.test 中创建 CrawlerFirst类

package cn.ayulong.crawler.test; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import java.io.IOException; public class CrawlerFirst { public static void main(String[] args) throws IOException { // 1. 打开浏览器, 创建 HttpClient 对象 CloseableHttpClient httpClient = HttpClients.createDefault(); // 2. 输入网址, 发起get请求创建HttpGet对象 HttpGet httpGet = new HttpGet("http://www.baidu.com/"); // 3. 按回车, 发起请求, 返回响应 CloseableHttpResponse response = httpClient.execute(httpGet); // 4. 解析响应, 获取数据 // 判断状态码是否是 200 if (response.getStatusLine().getStatusCode() == 200) { HttpEntity httpEntity = response.getEntity(); String content = EntityUtils.toString(httpEntity, "utf-8"); System.out.println(content); } } } // 直接运行, 成功抓取到百度首页html并打印到控制台

注意: 此方法只能抓取 http 协议的页面, 如果想抓取 https 的页面, 可以参考本文开头“https支持”一节给出的链接。

1|2GET 请求

1|0无参无头

public class HttpGetTest { public static void main(String[] args) { // 可关闭的httpclient客户端,相当于你打开的一个浏览器 CloseableHttpClient httpClient = HttpClients.createDefault(); // 创建HttpGet对象, 设置url访问地址 HttpGet httpGet = new HttpGet("http://www.baidu.com"); CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpGet); // 解析响应 if (response.getStatusLine().getStatusCode() == 200) { // 获取响应结果 HttpEntity entity = response.getEntity(); // 对 HttpEntity 操作的工具类 //String content = EntityUtils.toString(response.getEntity(), "utf8"); String content = EntityUtils.toString(entity, StandardCharsets.UTF_8); System.out.println(content.length()); //确保流关闭 EntityUtils.consume(entity); } } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response if (response != null) { try { response.close(); } catch (IOException e) { e.printStackTrace(); } } if (httpClient != null) { try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } } }

1|0带请求头

public class HttpGetTest { public static void main(String[] args) { // 可关闭的httpclient客户端,相当于你打开的一个浏览器 CloseableHttpClient httpClient = HttpClients.createDefault(); // 创建HttpGet对象, 设置url访问地址 HttpGet httpGet = new HttpGet("http://www.baidu.com"); // 解决httpclient被认为不是真人行为 httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36") // 防盗链,url发生防盗链的网站的url httpGet.addHeader("Referer","https://www.baidu.com/") CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpGet); // 解析响应 if (response.getStatusLine().getStatusCode() == 200) { // 获取响应结果 HttpEntity entity = response.getEntity(); // 对 HttpEntity 操作的工具类 //String content = EntityUtils.toString(response.getEntity(), "utf8"); String content = EntityUtils.toString(entity, StandardCharsets.UTF_8); System.out.println(content.length()); //确保流关闭 EntityUtils.consume(entity); } } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response if (response != null) { try { response.close(); } catch (IOException e) { e.printStackTrace(); } } if (httpClient != null) { try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } } }

1|0请求带参数

public class HttpGetParamTest { public static void main(String[] args) throws Exception { // 创建HttpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); // 设置请求地址是: http://yun.itheima.com/search?keys=Java // 创建URIBuilder URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search"); // 设置参数 uriBuilder.setParameter("keys", "Java"); // 创建HttpGet对象, 设置url访问地址 HttpGet httpGet = new HttpGet(uriBuilder.build()); System.out.println("发起请求的信息: " + httpGet); CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpGet); // 解析响应 if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); } } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response try { response.close(); httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }

或者手动转

public class HttpGetParamTest { public static void main(String[] args) throws Exception { // 创建HttpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); String urlStr = "https://www.baidu.com"; String passwordParam = "12+abc 456|789"// // 做urlencode:如果是浏览器的话,浏览器会帮我们自动给做了,123%2Babc+456%7c789 passwordParam = URLEncoder.encode(passwordParam, StandardCharsets.UTF_8.name()); urlStr = "http://demo/?" + passwordParam HttpGet httpGet = new HttpGet(urlStr); System.out.println("发起请求的信息: " + httpGet); CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpGet); // 解析响应 if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); } } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response try { response.close(); httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }

1|0获取响应头以及相应的Content-Type

public class HttpGetParamTest { public static void main(String[] args) throws Exception { // 创建HttpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); String urlStr = "https://www.baidu.com"; HttpGet httpGet = new HttpGet(urlStr); System.out.println("发起请求的信息: " + httpGet); CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpGet); // 解析响应 if (response.getStatusLine().getStatusCode() == 200) { System.out.println("响应成功"); // 获取响应头 Header[] allHeaders = response.getAllHeaders(); // 这里会打印所有的响应头,包含 Content-Type for (Header header : allHeaders) { System.out.println("响应头" + header.getName() + "的值" + header.getValue()); } // 也可以从 getEntity() 中取得 HttpEntity entity = response.getEntity(); System.out.println("ContentType" + entity.getContentType()); // 对 HttpEntity 操作的工具类 String content = EntityUtils.toString(entity, StandardCharsets.UTF_8); System.out.println(content.length()); } } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response try { response.close(); httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }

1|0保存网络图片到本地

public class HttpGetParamTest { public static void main(String[] args) throws Exception { // 创建HttpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); String urlStr = "https://img2022.cnblogs.com/blog/1381066/202202/1381066-20220226153714038-1862164179.png"; HttpGet httpGet = new HttpGet(urlStr); // 可关闭的响应 CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpGet); HttpEntity entity = response.getEntity(); // image/jpg image/jpeg image/png image/图片的后缀 String contentType = entity.getContentType.getValue(); String suffix = '.jpg'; if (contentType.contains("jpg")||contentType.contains("jpeg")) { suffix = '.jpg'; }else if (contentType.contains("bmp")||contentType.contains("bitbmp")) { suffix = '.bmp'; }else if (contentType.contains("png")) { suffix = '.png'; }else if (contentType.contains("gif")) { suffix = '.gif'; } // 获取文件的字节流 byte[] bytes = EntityUtils.toByteArray(entity); String localAbsPath = "e:\\abbc" + suffix; FileOutputStream fos = new FileOutputStream(localAbsPath); fos.write(bytes); fos.close(); EntityUtils.consume(entity); } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response try { response.close(); httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }

1|0设置访问代理

public class HttpGetTest { public static void main(String[] args) { // 可关闭的httpclient客户端,相当于你打开的一个浏览器 CloseableHttpClient httpClient = HttpClients.createDefault(); // 创建HttpGet对象, 设置url访问地址 HttpGet httpGet = new HttpGet("http://www.baidu.com"); // 创建一个代理 String ip = "114.7.192.253"; int port = 8080; HttpHost proxy = new HttpHost(ip, port); // 对每一个请求进行配置,会覆盖全局的默认请求配置 RequestConfig requestConfig = RequestConfig.custom().setProxy(proxy).build(); httpGet.setConfig(requestConfig); CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpGet); // 解析响应 if (response.getStatusLine().getStatusCode() == 200) { // 获取响应结果 HttpEntity entity = response.getEntity(); // 对 HttpEntity 操作的工具类 //String content = EntityUtils.toString(response.getEntity(), "utf8"); String content = EntityUtils.toString(entity, StandardCharsets.UTF_8); System.out.println(content.length()); //确保流关闭 EntityUtils.consume(entity); } } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response if (response != null) { try { response.close(); } catch (IOException e) { e.printStackTrace(); } } if (httpClient != null) { try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } } }

1|3Post请求带参数

1|0application/x-www-form-urlencoded

这是默认的Content-type

使用 HttpClient 发送不带参数的 post 请求与 发送不带参数的 get 请求类似, 只是 HttpGet 要改为 HttpPost, 所以只记录带参数的 post 请求方式

public class HttpPostParamTest { public static void main(String[] args) throws Exception { // 创建HttpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); // 创建HttpPost对象, 设置url访问地址 HttpPost httpPost = new HttpPost("http://yun.itheima.com/search"); // 声明List集合, 封装表单中的参数 ArrayList<NameValuePair> params = new ArrayList<NameValuePair>(); params.add(new BasicNameValuePair("keys", "Java")); // 创建表单的Entity对象, 第一个参数就是封装好的表单数据, 第二个参数就是编码 UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8"); // 设置表单的Entity对象到 Post 请求中 httpPost.setEntity(formEntity); CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpPost); // 解析响应 if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); System.out.println(content.length()); } } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response try { response.close(); httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }

1|0application/json

public class HttpPostParamTest { public static void main(String[] args) throws Exception { // 创建HttpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); // 创建HttpPost对象, 设置url访问地址 HttpPost httpPost = new HttpPost("http://yun.itheima.com/search"); // string:是一个json字符串 JSONObject jsonObj = new JSONObject(); jsonObj.put("userName","mingzi"); jsonObj.put("password","abc123"); StringEntity jsonEntity = new StringEntity(jsonObj.toString(),Consts.UTF_8); // 也需要给 entity 设置一下内容类型 jsonEntity.setContentType(new BasicHeader("Content-Type","application/json; charset=utf-8")); // 设置entity的编码 jsonEntity.setContentEncoding(Consts.UTF_8.name()); httpPost.setEntity(jsonEntity); CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpPost); // 解析响应 if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); System.out.println(content.length()); } } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response try { response.close(); httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }

1|0multipart/form-data类型上传文件

1|4连接池

public class HttpClientPoolTest { public static void main(String[] args) { // 创建连接池管理器 PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager(); // 设置连接数 cm.setMaxTotal(100); // 设置每个主机的最大连接数 cm.setDefaultMaxPerRoute(10); // 使用连接池管理器发起请求 doGet(cm); doGet(cm); } private static void doGet(PoolingHttpClientConnectionManager cm) { // 不是每次创建新的HttpClient, 而是从连接池中获取 HttpClient 对象 CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build(); HttpGet httpGet = new HttpGet("http://www.itcast.cn"); CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpGet); // 解析响应 if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); System.out.println(content.length()); } } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response if (response != null) { try { response.close(); } catch (IOException e) { e.printStackTrace(); } // 不能关闭 HttpClient, 由连接池管理 HttpClient // httpClient. close(); } } } }

1|5请求参数配置

  1. setConnectTimeout:连接超时,单位 ms,即完成 TCP 三次握手的时间上限
  2. setSocketTimeout:读取超时,单位 ms,即连接建立后等待响应数据的最长时间(相邻两个数据包之间允许的最大间隔)
  3. setConnectionRequestTimeout:单位 ms,指的是从连接池中获取 connection 的超时时间
pubsetConnectionRequestTimeousetConnectionRequestTimeouttlic class HttpConfigTest { public static void main(String[] args) { // 创建HttpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); // 创建HttpGet对象, 设置url访问地址 HttpGet httpGet = new HttpGet("http://yun.itheima.com/search?keys=Java"); // 配置请求信息 RequestConfig config = RequestConfig.custom().setConnectTimeout(1000) // 创建连接的最长时间, 单位是毫秒 .setConnectionRequestTimeout(500) // 设置获取连接的最长时间, 单位是毫秒 .setSocketTimeout(10 * 1000) // 设置数据传输的最长时间, 单位是毫秒 .build(); // 给请求设置请求信息 httpGet.setConfig(config); CloseableHttpResponse response = null; try { // 使用 HttpClient 发起请求, 获取 response response = httpClient.execute(httpGet); // 解析响应 if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); System.out.println(content.length()); } } catch(IOException e) { e.printStackTrace(); } finally { // 关闭 response try { response.close(); httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }

2|0Jsoup

jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

jsoup的主要功能如下:

  1. 从一个URL,文件或字符串中解析HTML;
  2. 使用DOM或CSS选择器来查找、取出数据;
  3. 可操作HTML元素、属性、文本;( Jsoup一般用于解析爬到的数据并存储, 很少用到操作 )

2|1准备

先在pom中导入依赖, 搭建开发环境

<!-- Jsoup: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>

<!-- Testing: https://mvnrepository.com/artifact/junit/junit -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
</dependency>

<!-- File handling: https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>

<!-- String handling: https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.4</version>
</dependency>

2|2Jsoup解析URL

Jsoup可以直接输入url,它会发起请求并获取数据,封装为Document对象

public class JsoupFirstTest { @Test public void testUrl() throws Exception { // 解析url地址, 第一个参数是访问的url, 第二个参数时访问时候的超时时间 Document doc = Jsoup.parse(new URL("http://www.itcast.cn"), 1000); // 使用标签选择器, 获取title标签中的内容 String title = doc.getElementsByTag("title").first().text(); // 打印 System.out.println(title); } }

PS:虽然使用Jsoup可以替代HttpClient直接发起请求解析数据,但是往往不会这样用,因为实际的开发过程中,需要使用到多线程,连接池,代理等等方式,而jsoup对这些的支持并不是很好,所以我们一般把jsoup仅仅作为Html解析工具使用

2|3Jsoup解析字符串

@Test public void testString() throws Exception { // 使用工具类读取文件, 获取字符串 String content = FileUtils.readFileToString(new File("D:\\360MoveData\\Users\\Administrator\\Desktop\\test.html"), "utf8"); // 解析字符串 Document doc = Jsoup.parse(content); String title = doc.getElementsByTag("title").first().text(); System.out.println(title); } }

2|4Jsoup解析文件

@Test public void testFile() throws Exception { // 解析文件 Document doc = Jsoup.parse(new File("D:\\360MoveData\\Users\\Administrator\\Desktop\\test.html"), "utf8"); String title = doc.getElementsByTag("title").first().text(); System.out.println(title); }

所需 html文件

<!-- Sample page for the Jsoup examples: they look up id "city_bj", classes "class_a class_b", "s_name" and "city_con", the attribute "abc" and the link http://sh.itcast.cn. -->
<html> <head> <title>传智播客官网-一样的教育,不一样的品质</title> </head> <body> <div class="city"> <h3 id="city_bj">北京中心</h3> <fb:img src="/2018czgw/images/slogan.jpg" class="slogan"/> <div class="city_in"> <div class="city_con" style="display: none;"> <ul> <li id="test" class="class_a class_b"> <a href="http://www.itcast.cn" target="_blank"> <span class="s_name">北京</span> </a> </li> <li> <a href="http://sh.itcast.cn" target="_blank"> <span class="s_name">上海</span> </a> </li> <li> <a href="http://gz.itcast.cn" target="_blank"> <span abc="123" class="s_name">广州</span> </a> </li> <ul> <li>天津</li> </ul> </ul> </div> </div> </div> </body> </html>

2|5使用Dom方式遍历文档

@Test public void testDom() throws Exception { // 解析文件件, 获取Document对象 Document doc = Jsoup.parse(new File("D:\\360MoveData\\Users\\Administrator\\Desktop\\test.html"), "utf8"); // 1.根据id查询元素getElementById Element element = doc.getElementById("city_bj"); // 2.根据标签获取元素getElementsByTag Elements spans = doc.getElementsByTag("span"); // 3.根据class获取元素getElementsByClass Element a = doc.getElementsByClass("class_a class_b").first(); // 4.根据属性获取元素getElementsByAttribute Element abc = doc.getElementsByAttribute("abc").first(); // 5.根据属性与属性值筛选 Element href = doc.getElementsByAttributeValue("href", "http://sh.itcast.cn").first(); // 打印元素内容 System.out.println("获取到的元素内容是: " + element.text()); for (Element span : spans) { System.out.println(span.text()); } System.out.println(a.text()); System.out.println("abc.text() = " + abc.text()); System.out.println("href.text() = " + href.text()); }

2|6Selector选择器组合使用

@Test public void testSelector2() throws Exception { // 解析html文件, 获取Document对象 Document doc = Jsoup.parse(new File("D:\\360MoveData\\Users\\Administrator\\Desktop\\test.html"), "utf8"); //el#id: 元素+ID,比如: h3#city_bj //Element element = doc.select("h3#city_bj").first(); //el.class: 元素+class,比如: li.class_a //Element element = doc.select("li.class_a").first(); //el[attr]: 元素+属性名,比如: span[abc] //Element element = doc.select("span[abc]").first(); //任意组合: 比如:span[abc].s_name Element element = doc.select("span[abc].s_name").first(); //ancestor child: 查找某个元素下子元素,比如:.city_con li 查找"city_con"下的所有li //Elements elements = doc.select(".city_con li"); //parent > child: 查找某个父元素下的直接子元素,比如: // .city_con > ul > li 查找city_con第一级(直接子元素)的ul,再找所有ul下的第一级li //Elements elements = doc.select(".city_con > ul > li"); //parent > *: 查找某个父元素下所有直接子元素 Elements elements = doc.select(".city_con > ul > *"); // 打印 System.out.println("获取到的内容是: " + element.text()); for (Element ele : elements) { System.out.println("遍历的结果: " + ele.text()); } }

__EOF__

本文作者😎
本文链接https://www.cnblogs.com/dongye95/p/16122460.html
关于博主:评论和私信会在第一时间回复。或者直接私信我。
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!
声援博主:如果您觉得文章对您有帮助,可以点击文章右下角推荐一下。您的鼓励是博主的最大动力!
posted @   dongye95  阅读(122)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 单线程的Redis速度为什么快?
· 展开说说关于C#中ORM框架的用法!
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库
· SQL Server 2025 AI相关能力初探
· 为什么 退出登录 或 修改密码 无法使 token 失效
点击右上角即可分享
微信分享提示