一个简单粗暴的爬虫 - 必应今日美图
1. 必应今日美图
当使用bing搜索时,每天都会出现一幅美图。
搜索找到bing今日美图 http://bing.plmeizi.com/ (这里收集了一年多的今日美图) 收集者: http://leil.plmeizi.com/
目前共47页
url格式按 http://bing.plmeizi.com/?page=*
点进去就是我们要的图片和名称
2. 开始编码
使用简单的Jsoup进行爬虫,很简单,很好理解。
HtmlUtil
package util;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Downloads a web page and parses it into a Jsoup {@link Document}.
 */
public class HtmlUtil {

    /** Exclusive upper bound, in milliseconds, of the random pause before each request. */
    private static final int MAX_DELAY_MILLIS = 1000;

    /** Timeout in milliseconds for the first (POST) attempt. */
    private static final int POST_TIMEOUT_MILLIS = 300000;

    /** Timeout in milliseconds for the fallback (GET) attempt. */
    private static final int GET_TIMEOUT_MILLIS = 5000000;

    /**
     * Fetches the page at {@code url} and parses it.
     *
     * <p>A random pause of up to {@value #MAX_DELAY_MILLIS} ms is taken first so that
     * requests are spread out and the site is less likely to block the crawler. The page
     * is then requested with POST; if that fails with an {@link IOException} it is
     * requested once more with a plain GET.
     *
     * @param url address of the page to download
     * @return the parsed page, or {@code null} when both attempts failed
     */
    public Document getHtmlTextByUrl(String url) {
        pauseRandomly();

        try {
            // NOTE(review): the form data and cookie look like placeholder values;
            // they are kept so the request sent to the site stays the same.
            return Jsoup.connect(url)
                    .data("query", "Java")
                    .userAgent("Mozilla")
                    .cookie("auth", "token")
                    .timeout(POST_TIMEOUT_MILLIS)
                    .post();
        } catch (IOException e) {
            e.printStackTrace();
        }

        try {
            return Jsoup.connect(url).timeout(GET_TIMEOUT_MILLIS).get();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Sleeps for a random time below {@link #MAX_DELAY_MILLIS}.
     *
     * <p>The previous implementation counted a random integer down to zero, which
     * finishes almost instantly and therefore never throttled anything.
     */
    private static void pauseRandomly() {
        try {
            Thread.sleep((long) (Math.random() * MAX_DELAY_MILLIS));
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller and carry on without the pause.
            Thread.currentThread().interrupt();
        }
    }
}
GetPhoto
这个编码主要需要先分析html属性,按照属性取到我们需要的元素,元素值。
我是先拿到每个图的page页面url
然后到详情页面拿到图的url,图的名字截取。
然后将图保存到本地。
package bing;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import util.HtmlUtil;

/**
 * Crawls the listing pages of bing.plmeizi.com and saves every JPG picture found on the
 * linked detail pages into the {@code bing_pic} directory.
 *
 * <p>Originally written Sep 29, 2017 1:15:00 PM.
 *
 * @author loveincode
 */
public class GetPhoto {

    /** Listing URL; the page number is appended to it. */
    private static final String LIST_URL = "http://bing.plmeizi.com/?page=";

    /** Directory (relative to the working directory) that receives the pictures. */
    private static final String OUTPUT_DIR = "bing_pic";

    /**
     * Downloads the pictures linked from listing pages {@code startpage} to
     * {@code endpage}, both inclusive.
     *
     * <p>Pages or entries that cannot be fetched or do not have the expected structure
     * are reported on standard error and skipped.
     *
     * @param startpage first listing page to process
     * @param endpage last listing page to process
     * @throws IOException if the output directory cannot be created or a picture cannot
     *         be downloaded or written
     */
    public static void go(int startpage, int endpage) throws IOException {
        File outputDir = new File(OUTPUT_DIR);
        // The second isDirectory() check covers another thread creating it concurrently.
        if (!outputDir.isDirectory() && !outputDir.mkdirs() && !outputDir.isDirectory()) {
            throw new IOException("Cannot create output directory: " + outputDir.getAbsolutePath());
        }

        HtmlUtil htmlutil = new HtmlUtil();
        for (int page = startpage; page <= endpage; page++) {
            String listUrl = LIST_URL + page;
            Document listPage = htmlutil.getHtmlTextByUrl(listUrl);
            if (listPage == null) {
                System.err.println("Skipping listing page that could not be loaded: " + listUrl);
                continue;
            }
            // Each element with class "item" carries the detail-page link in its href.
            for (Element item : listPage.getElementsByClass("item")) {
                downloadPhoto(htmlutil, item.attr("href"), outputDir);
            }
        }
    }

    /** Reads one detail page and, if it links a JPG, stores it under the picture's title. */
    private static void downloadPhoto(HtmlUtil htmlutil, String detailUrl, File outputDir) throws IOException {
        if (detailUrl.isEmpty()) {
            System.err.println("Skipping item without a detail link");
            return;
        }
        Document detailPage = htmlutil.getHtmlTextByUrl(detailUrl);
        if (detailPage == null) {
            System.err.println("Skipping detail page that could not be loaded: " + detailUrl);
            return;
        }

        Element picLink = detailPage.getElementById("picurl");
        Element searchLink = detailPage.getElementById("searchlink");
        if (picLink == null || searchLink == null) {
            System.err.println("Skipping detail page with unexpected layout: " + detailUrl);
            return;
        }

        String picurl = picLink.attr("href");
        if (!picurl.contains("jpg")) {
            return;
        }

        Elements spans = searchLink.getElementsByTag("span");
        if (spans.isEmpty()) {
            System.err.println("Skipping picture without a title: " + detailUrl);
            return;
        }
        File target = new File(outputDir, toFileName(spans.get(0).html()) + ".jpg");

        // try-with-resources closes both streams even when the copy fails half-way.
        try (DataInputStream in = new DataInputStream(new URL(picurl).openStream());
                FileOutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[1024];
            int length;
            while ((length = in.read(buffer)) != -1) {
                out.write(buffer, 0, length);
            }
        }
    }

    /** Turns a picture title into a file name: text before the first "(", trimmed and sanitised. */
    private static String toFileName(String title) {
        int paren = title.indexOf('(');
        String name = (paren >= 0 ? title.substring(0, paren) : title).trim();
        // Characters that are path separators or not allowed in file names on common systems.
        name = name.replaceAll("[\\\\/:*?\"<>|]", "_");
        return name.isEmpty() ? "unnamed" : name;
    }

    /**
     * Manual smoke test: downloads the pictures of the first listing page.
     *
     * @param args ignored
     * @throws IOException if the download fails
     */
    public static void main(String[] args) throws IOException {
        System.out.println("test");
        go(1, 1);
    }

}
Mythread
package bing;

import java.io.IOException;

/**
 * Thread that downloads the pictures of one contiguous range of listing pages by
 * delegating to {@link GetPhoto#go(int, int)}.
 */
public class Mythread extends Thread {

    private final int startpage;

    private final int endpage;

    /**
     * @param startpage first listing page handled by this thread (inclusive)
     * @param endpage last listing page handled by this thread (inclusive)
     */
    public Mythread(int startpage, int endpage) {
        this.startpage = startpage;
        this.endpage = endpage;
    }

    /**
     * Runs the download for this thread's page range. An {@link IOException} ends the
     * range early and is reported on standard error; it is not propagated.
     */
    @Override
    public void run() {
        try {
            // go() is static, so no GetPhoto instance is needed.
            GetPhoto.go(startpage, endpage);
        } catch (IOException e) {
            System.err.println("Download of pages " + startpage + "-" + endpage + " failed");
            e.printStackTrace();
        }
    }
}
RUN
采用多线程,开启多个线程同时爬取图片
package bing;

import java.io.IOException;

/**
 * Entry point that downloads all listing pages in parallel and prints the elapsed time.
 *
 * <p>Originally written Sep 29, 2017 1:55:57 PM.
 *
 * @author loveincode
 */
public class RUN {

    /** Number of listing pages on the site at the time of writing. */
    private static final int TOTAL_PAGES = 47;

    /** Number of listing pages handed to each worker thread. */
    private static final int PAGES_PER_THREAD = 5;

    /**
     * Starts one {@link Mythread} per block of {@value #PAGES_PER_THREAD} pages
     * (1-5, 6-10, ..., 46-47), waits for all of them and prints the total run time.
     *
     * @param args ignored
     * @throws IOException declared for compatibility; not thrown by this method itself
     */
    public static void main(String[] args) throws IOException {

        long startTime = System.currentTimeMillis(); // start of the measurement

        int threadCount = (TOTAL_PAGES + PAGES_PER_THREAD - 1) / PAGES_PER_THREAD;
        Mythread[] workers = new Mythread[threadCount];
        for (int i = 0; i < threadCount; i++) {
            int firstPage = i * PAGES_PER_THREAD + 1;
            // The last block is shorter when the page count is not a multiple of the block size.
            int lastPage = Math.min(firstPage + PAGES_PER_THREAD - 1, TOTAL_PAGES);
            workers[i] = new Mythread(firstPage, lastPage);
        }
        for (Mythread worker : workers) {
            worker.start();
        }

        // join() blocks without consuming CPU, unlike polling isAlive() in a tight loop.
        try {
            for (Mythread worker : workers) {
                worker.join();
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            System.err.println("Interrupted while waiting for the downloads to finish");
            return;
        }

        long endTime = System.currentTimeMillis(); // end of the measurement
        System.out.println("程序运行时间: " + (endTime - startTime) / 1000.0 + "s");
    }

}
执行 RUN
耗时76.962s 完成图片下载到本地。
成功
效果:
很高清吧