httpClient爬虫

 1 package httpClient.client;
 2 
 3 import java.io.File;
 4 import java.io.IOException;
 5 import java.io.InputStream;
 6 import java.util.UUID;
 7 
 8 import org.apache.commons.io.FileUtils;
 9 import org.apache.http.HttpEntity;
10 import org.apache.http.client.ClientProtocolException;
11 import org.apache.http.client.methods.CloseableHttpResponse;
12 import org.apache.http.client.methods.HttpGet;
13 import org.apache.http.impl.client.CloseableHttpClient;
14 import org.apache.http.impl.client.HttpClients;
15 import org.apache.http.util.EntityUtils;
16 import org.jsoup.Jsoup;
17 import org.jsoup.nodes.Document;
18 import org.jsoup.nodes.Element;
19 import org.jsoup.select.Elements;
20 
21 public class HttpClinet {
22 
23     public static void main(String[] args) throws ClientProtocolException, IOException {
24         // 图片路径
25         String url = "https://www.mzitu.com/";
26         // 创建httpClient实例
27         CloseableHttpClient httpClient = HttpClients.createDefault();
28         HttpClinet t = new HttpClinet();
29         HttpEntity httpEntity = t.getEntity(httpClient, url);
30         String html = EntityUtils.toString(httpEntity, "UTF-8");
31         Document document = Jsoup.parse(html);
32         // 像js一样,通过标签获取title
33         // System.out.println(document.getElementsByTag("title").first());
34         // 像js一样,通过id 获取文章列表元素对象
35         Element postList = document.getElementById("pins");
36         // 像js一样,通过class 获取列表下的所有博客
37         Elements postItems = postList.select("li a");
38         // 循环处理每篇博客
39         String s = "0";
40         for (Element postItem : postItems) {
41             String urls = postItem.attr("href").trim();
42             if (!s.equals(urls)) {
43                 s = urls;
44                 HttpEntity httpEntitys = t.getEntity(httpClient, urls);
45                 String htmls = EntityUtils.toString(httpEntitys, "UTF-8");
46                 Document documents = Jsoup.parse(htmls);
47                 String postLists = documents.getElementsByClass("main-image").first().select("p a img").attr("src");
48                 if (postLists != null) {
49                     System.out.println(postLists);
50                     t.save(postLists, httpClient);
51                 }
52             }
53         }
54         t.close(httpClient);
55     }
56 
57     public void save(String url, CloseableHttpClient httpClient) throws ClientProtocolException, IOException {
58         String fileName = url.substring(url.lastIndexOf("."), url.length());
59         HttpEntity entity = this.getEntity(httpClient, url); // 获取返回实体
60         if (entity != null) {
61             System.out.println("Content-Type:" + entity.getContentType().getValue());
62             InputStream inputStream = entity.getContent();
63             // 文件复制,common io 包下,需要 引入依赖
64             FileUtils.copyToFile(inputStream, new File(UUID.randomUUID() + fileName));
65         }
66     }
67 
68     public void close(CloseableHttpClient httpClient) throws IOException {
69         if (httpClient != null) {
70             httpClient.close();
71         }
72     }
73 
74     public HttpEntity getEntity(CloseableHttpClient httpClient, String url) throws ClientProtocolException, IOException {
75         HttpGet httpGet = new HttpGet(url);
76         httpGet.setHeader("If-None-Match", "W/\"5cc2cd8f-2c58");
77         httpGet.setHeader("Referer", "http://www.mzitu.com/all/");
78         httpGet.setHeader("User-Agent",
79                 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36");
80         CloseableHttpResponse response = httpClient.execute(httpGet);
81         return response.getEntity();
82     }
83 }
pom.xml

 

<!-- Maven build descriptor for the HttpClient/jsoup image crawler. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>httpClient</groupId>
  <artifactId>client</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>client</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Pin the language level explicitly instead of relying on the
         compiler plugin's default, which varies between plugin versions. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- Unit testing (test scope only) -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>

    <!-- HTTP transport -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.8</version>
    </dependency>

    <!-- HTML parsing and element selection -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.12.1</version>
    </dependency>

    <!-- FileUtils, used to write the downloaded stream to disk -->
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.5</version>
    </dependency>
  </dependencies>
</project>

 

posted @ 2019-11-16 00:02  天葬  阅读(232)  评论(0编辑  收藏  举报