一个简单粗暴的爬虫 - 必应今日美图

1. 必应今日美图

当使用bing搜索时，每天都会出现一幅美图。

搜索找到bing今日美图 http://bing.plmeizi.com/ (这里收集了一年多的今日美图) 收集者： http://leil.plmeizi.com/

目前共47页

url格式按 http://bing.plmeizi.com/?page=*

点进去就是我们要的图片和名称

2. 开始编码

使用简单的Jsoup进行爬虫，很简单，很好理解。

HtmlUtil

 1 package util;
 2 
 3 import java.io.IOException;
 4 
 5 import org.jsoup.Jsoup;
 6 import org.jsoup.nodes.Document;
 7 
 8 public class HtmlUtil {
 9     // 根据url从网络获取网页文本
10     public Document getHtmlTextByUrl(String url) {
11         Document doc = null;
12         try {
13             // doc = Jsoup.connect(url).timeout(5000000).get();
14             int i = (int) (Math.random() * 1000); // 做一个随机延时，防止网站屏蔽
15             while (i != 0) {
16                 i--;
17             }
18             doc = Jsoup.connect(url).data("query", "Java").userAgent("Mozilla").cookie("auth", "token").timeout(300000)
19                     .post();
20         } catch (IOException e) {
21             e.printStackTrace();
22             try {
23                 doc = Jsoup.connect(url).timeout(5000000).get();
24             } catch (IOException e1) {
25                 // TODO Auto-generated catch block
26                 e1.printStackTrace();
27             }
28         }
29         return doc;
30     }
31 }

GetPhoto

这个编码主要需要先分析html属性，按照属性取到我们需要的元素，元素值。

我是先拿到每个图的page页面url

然后到详情页面拿到图的url，并截取出图的名字。

然后将图保存到本地。

 1 package bing;
 2 
 3 import java.io.DataInputStream;
 4 import java.io.File;
 5 import java.io.FileOutputStream;
 6 import java.io.IOException;
 7 import java.net.URL;
 8 
 9 import org.jsoup.nodes.Document;
10 import org.jsoup.nodes.Element;
11 import org.jsoup.select.Elements;
12 
13 import util.HtmlUtil;
14 
15 /**
16  *
17  * @author loveincode
18  * @data Sep 29, 2017 1:15:00 PM
19  */
20 public class GetPhoto {
21 
22     public static void go(int startpage, int endpage) throws IOException {
23 
24         HtmlUtil htmlutil = new HtmlUtil();
25         // 获取图片的绝对路径
26         String url = "http://bing.plmeizi.com/?page=";
27         for (int i = startpage; i <= endpage; i++) {
28             String gourl = url + i + "";
29             Document dochtml = htmlutil.getHtmlTextByUrl(gourl);
30             Elements elements_a = dochtml.getElementsByClass("item");
31             for (int x = 0; x < elements_a.size(); x++) {
32                 String pyotopage = elements_a.get(x).attr("href");
33                 Document dochtml_photo = htmlutil.getHtmlTextByUrl(pyotopage);
34                 Element elements_picurl = dochtml_photo.getElementById("picurl");
35                 String picurl = elements_picurl.attr("href");
36                 Element elements_searchlink = dochtml_photo.getElementById("searchlink");
37                 String name = elements_searchlink.getElementsByTag("span").get(0).html();
38                 name = name.split("\\(")[0];
39 
40                 if (picurl.contains("jpg")) {
41                     // 下载图片
42                     URL url_pic = new URL(picurl);
43                     DataInputStream dataInputStream = new DataInputStream(url_pic.openStream());
44                     String imageName = name + ".jpg";
45                     FileOutputStream fileOutputStream = new FileOutputStream(new File("bing_pic/" + imageName));
46                     byte[] buffer = new byte[1024];
47                     int length;
48                     while ((length = dataInputStream.read(buffer)) > 0) {
49                         fileOutputStream.write(buffer, 0, length);
50                     }
51                     dataInputStream.close();
52                     fileOutputStream.close();
53                 }
54             }
55         }
56 
57     }
58 
59     public static void main(String[] args) throws IOException {
60         System.out.println("test");
61         go(1, 1);
62     }
63 
64 }

Mythread

 1 package bing;
 2 
 3 import java.io.IOException;
 4 
 5 public class Mythread extends Thread {
 6 
 7     private int startpage;
 8 
 9     private int endpage;
10 
11     public Mythread(int startpage, int endpage) {
12         this.startpage = startpage;
13         this.endpage = endpage;
14     }
15 
16     @SuppressWarnings("static-access")
17     @Override
18     public void run() {
19         GetPhoto getPhoto = new GetPhoto();
20         try {
21             getPhoto.go(startpage, endpage);
22         } catch (IOException e) {
23             // TODO Auto-generated catch block
24             e.printStackTrace();
25         }
26     }
27 }RUN

RUN

采用多线程，开启多个线程同时爬取图片

 1 package bing;
 2 
 3 import java.io.IOException;
 4 
 5 /**
 6  *
 7  * @author loveincode
 8  * @data Sep 29, 2017 1:55:57 PM
 9  */
10 public class RUN {
11 
12     public static void main(String[] args) throws IOException {
13 
14         long startTime = System.currentTimeMillis(); // 获取开始时间
15 
16         Mythread a1 = new Mythread(1, 5);
17         Mythread a2 = new Mythread(6, 10);
18         Mythread a3 = new Mythread(11, 15);
19         Mythread a4 = new Mythread(16, 20);
20         Mythread a5 = new Mythread(21, 25);
21         Mythread a6 = new Mythread(26, 30);
22         Mythread a7 = new Mythread(31, 35);
23         Mythread a8 = new Mythread(36, 40);
24         Mythread a9 = new Mythread(41, 45);
25         Mythread a10 = new Mythread(46, 47);
26 
27         a1.start();
28         a2.start();
29         a3.start();
30         a4.start();
31         a5.start();
32         a6.start();
33         a7.start();
34         a8.start();
35         a9.start();
36         a10.start();
37 
38         while (true) {
39             if (a1.isAlive() == false && a2.isAlive() == false && a3.isAlive() == false && a4.isAlive() == false
40                     && a5.isAlive() == false && a6.isAlive() == false && a7.isAlive() == false && a8.isAlive() == false
41                     && a9.isAlive() == false && a10.isAlive() == false) {
42                 long endTime = System.currentTimeMillis(); // 获取结束时间
43                 System.out.println("程序运行时间： " + (endTime - startTime) / 1000.0 + "s");
44                 break;
45             }
46         }
47     }
48 
49 }

执行 RUN

耗时76.962s 完成图片下载到本地。

成功

效果：

很高清吧

posted @ 2017-09-29 17:39 loveincode 阅读(5803) 评论(13) 收藏举报

刷新页面返回顶部

loveincode

To strive, to seek, to find, and not to yield.