httpClient爬虫
package httpClient.client;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small image crawler: loads a listing page, follows every post link found under the
 * element with id {@code pins}, and saves the image found inside each post's
 * {@code main-image} element to the working directory.
 *
 * <p>The class name (including its spelling) is kept unchanged so existing callers still compile.
 */
public class HttpClinet {

    /**
     * Crawls the listing page and downloads one image per distinct post link.
     *
     * @param args unused
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException if a request fails or an image cannot be written
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // Listing page to crawl.
        String url = "https://www.mzitu.com/";
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpClinet crawler = new HttpClinet();
        try {
            Document listing = crawler.fetchDocument(httpClient, url);
            if (listing == null) {
                System.err.println("Empty response for " + url);
                return;
            }
            // The post list lives in the element with id "pins".
            Element postList = listing.getElementById("pins");
            if (postList == null) {
                System.err.println("No element with id 'pins' found at " + url);
                return;
            }
            Elements postLinks = postList.select("li a");
            // The same href can appear on several anchors; remember every link already
            // handled so each post is fetched once, wherever the duplicates appear.
            Set<String> visited = new HashSet<String>();
            for (Element postLink : postLinks) {
                String postUrl = postLink.attr("href").trim();
                if (postUrl.isEmpty() || !visited.add(postUrl)) {
                    continue;
                }
                Document postPage = crawler.fetchDocument(httpClient, postUrl);
                if (postPage == null) {
                    System.err.println("Empty response, skipped: " + postUrl);
                    continue;
                }
                Element mainImage = postPage.getElementsByClass("main-image").first();
                if (mainImage == null) {
                    System.err.println("No 'main-image' element, skipped: " + postUrl);
                    continue;
                }
                // attr() yields "" (never null) when no matching <img src> exists.
                String imageUrl = mainImage.select("p a img").attr("src");
                if (imageUrl.isEmpty()) {
                    continue;
                }
                System.out.println(imageUrl);
                crawler.save(imageUrl, httpClient);
            }
        } finally {
            // Close the client even when a request or a download fails.
            crawler.close(httpClient);
        }
    }

    /**
     * Downloads {@code url} and stores it in the working directory under a random UUID
     * file name that keeps the URL's file extension (if it has one).
     *
     * @param url        address of the resource to download
     * @param httpClient client used to perform the request
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException if the download or the file write fails
     */
    public void save(String url, CloseableHttpClient httpClient) throws ClientProtocolException, IOException {
        String extension = extensionOf(url);
        HttpEntity entity = this.getEntity(httpClient, url);
        if (entity == null) {
            return;
        }
        // The Content-Type header is optional, so guard before dereferencing it.
        if (entity.getContentType() != null) {
            System.out.println("Content-Type:" + entity.getContentType().getValue());
        }
        InputStream inputStream = entity.getContent();
        try {
            // FileUtils comes from commons-io; copyToFile leaves the source stream open.
            FileUtils.copyToFile(inputStream, new File(UUID.randomUUID() + extension));
        } finally {
            // Closing the content stream is what hands the connection back to the client.
            if (inputStream != null) {
                inputStream.close();
            }
        }
    }

    /**
     * Closes the given client; a {@code null} client is ignored.
     *
     * @param httpClient client to close, may be {@code null}
     * @throws IOException if closing fails
     */
    public void close(CloseableHttpClient httpClient) throws IOException {
        if (httpClient != null) {
            httpClient.close();
        }
    }

    /**
     * Issues a GET request for {@code url} with browser-like headers and returns the
     * response entity.
     *
     * <p>The response itself is not closed here: the caller must fully consume the returned
     * entity or close its content stream so the underlying connection is released.
     *
     * @param httpClient client used to perform the request
     * @param url        address to request
     * @return the response entity, or {@code null} when the response has no body
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException if the request fails
     */
    public HttpEntity getEntity(CloseableHttpClient httpClient, String url) throws ClientProtocolException, IOException {
        HttpGet httpGet = new HttpGet(url);
        // No If-None-Match header: a fixed ETag turns every request into a conditional one,
        // which a server may answer with a body-less 304 Not Modified.
        httpGet.setHeader("Referer", "http://www.mzitu.com/all/");
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36");
        CloseableHttpResponse response = httpClient.execute(httpGet);
        return response.getEntity();
    }

    /** Fetches {@code url} and parses the body as HTML; returns {@code null} when there is no body. */
    private Document fetchDocument(CloseableHttpClient httpClient, String url)
            throws ClientProtocolException, IOException {
        HttpEntity entity = getEntity(httpClient, url);
        if (entity == null) {
            return null;
        }
        // EntityUtils.toString reads the whole body and closes the content stream.
        return Jsoup.parse(EntityUtils.toString(entity, "UTF-8"));
    }

    /**
     * Returns the extension (leading dot included) of the last path segment of {@code url},
     * ignoring any query string or fragment, or {@code ""} when that segment has no dot.
     */
    private static String extensionOf(String url) {
        String path = url;
        int queryStart = path.indexOf('?');
        if (queryStart >= 0) {
            path = path.substring(0, queryStart);
        }
        int fragmentStart = path.indexOf('#');
        if (fragmentStart >= 0) {
            path = path.substring(0, fragmentStart);
        }
        int lastSlash = path.lastIndexOf('/');
        int lastDot = path.lastIndexOf('.');
        // A dot that precedes the last slash belongs to the host or a directory, not the file.
        if (lastDot > lastSlash) {
            return path.substring(lastDot);
        }
        return "";
    }
}
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>httpClient</groupId>
  <artifactId>client</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>client</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Pin the Java language level explicitly. Without these properties the build uses the
         maven-compiler-plugin's built-in default, which in older plugin versions is a level
         that current JDKs no longer accept. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- Unit testing (test scope only). -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <!-- HTTP client used to fetch pages and images. -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.8</version>
    </dependency>
    <!-- HTML parsing and CSS-style element selection. -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.12.1</version>
    </dependency>
    <!-- FileUtils, used to write downloaded streams to disk. -->
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.5</version>
    </dependency>
  </dependencies>
</project>
哇!又赚了一天人民币