爬取网站图片保存到本地
有时候我们需要在爬虫的时候将网站的图片保存到本地,这就需要我们先获取到图片的url,然后利用url再去下载图片到本地。
下面介绍两种简单的方法:
1.利用java自带的URLConnection
对于这种方式,我还没有找到可以携带cookie或者其他信息去下载图片的办法。
package cn.qlq.craw.Jsoup;

import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads an image from a URL and saves it to a local file using the JDK's
 * built-in {@link URLConnection}. This approach offers no way to attach
 * cookies or extra request data (see the Jsoup variant for that).
 *
 * @author liqiang
 */
public class UrlConnectionGetPicture {
    public static void main(String[] args) throws Exception {
        String url = "http://qiaoliqiang.cn/fileDown/zfb.bmp";
        String path = "C:\\Users\\liqiang\\Desktop\\实习\\python\\javaCrawPicture\\test.bmp";

        URLConnection conn = new URL(url).openConnection();
        // try-with-resources guarantees both streams are closed even if an
        // I/O error occurs mid-copy (the original leaked them on failure).
        try (InputStream inputStream = conn.getInputStream();
             OutputStream out = new FileOutputStream(path)) {
            byte[] buff = new byte[1024];
            int n;
            while ((n = inputStream.read(buff)) != -1) {
                out.write(buff, 0, n);
            }
        }
    }
}
补充:org.apache.commons.io.IOUtils可以简单地把一个InputStream中的内容读取并写入到另一个OutputStream,实现文件的拷贝,例如:
只用传两个参数,第一个传递InputStream,第二个传递OutputStream
package cn.qlq.craw.Jsoup; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.URL; import java.net.URLConnection; import org.apache.commons.io.IOUtils; public class IOutilsDownloadFile { public static void main(String[] args) throws IOException { String url = "http://qiaoliqiang.cn/fileDown/zfb.bmp"; URL url1 = new URL(url); URLConnection conn = url1.openConnection(); InputStream inputStream = conn.getInputStream(); String path = "C:\\Users\\liqiang\\Desktop\\test.bmp"; OutputStream outputStream = new FileOutputStream(path); // 利用IOutiks拷贝文件,简单快捷 IOUtils.copy(inputStream, outputStream); } }
2.利用Jsoup。此方法可以在下载图片的时候携带cookie或者携带一些额外的数据(重要)
下面是不携带cookie的方法:
package cn.qlq.craw.Jsoup; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import org.jsoup.Connection; import org.jsoup.Jsoup; /** * Jsoup下载图片并保存到本地 * @author liqiang * */ public class JsoupDoloadPicture { /** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { // TODO Auto-generated method stub String imageSrc = "http://newjwc.tyust.edu.cn/CheckCode.aspx"; Connection.Response response = Jsoup.connect(imageSrc).ignoreContentType(true).execute(); byte[] img = response.bodyAsBytes(); System.out.println(img.length); savaImage(img, "C:\\Users\\liqiang\\Desktop\\实习\\python\\javaCrawJWXT", "test.png"); } public static void savaImage(byte[] img,String filePath,String fileName) { BufferedOutputStream bos = null; FileOutputStream fos = null; File file = null; File dir = new File(filePath); try { //判断文件目录是否存在 if(!dir.exists() && dir.isDirectory()){ dir.mkdir(); } file = new File(filePath+"\\"+fileName); fos = new FileOutputStream(file); bos = new BufferedOutputStream(fos); bos.write(img); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally{ if(bos!=null){ try { bos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } if(fos!=null){ try { fos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } }
下面是携带cookie的方法,此方法可以在爬虫需要登录网站的时候带着cookie去获取验证码。
package cn.qlq.craw.JsoupCrawJWXT; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.util.Map; import org.jsoup.Connection; import org.jsoup.Jsoup; /** * Jsoup下载图片并保存到本地 * * @author liqiang * */ public class JsoupDoloadPicture { /** * 带着cookie下载验证码图片 * * @param url * @param cookies * @throws IOException */ public static void downloadImg(String url, Map<String, String> cookies) throws IOException { // TODO Auto-generated method stub Connection connect = Jsoup.connect(url); connect.cookies(cookies);// 携带cookies爬取图片 connect.timeout(5 * 10000); Connection.Response response = connect.ignoreContentType(true).execute(); byte[] img = response.bodyAsBytes(); System.out.println(img.length); // 读取文件存储位置 String directory = ResourcesUtil.getValue("path", "file"); savaImage(img, directory, "yzm.png"); } public static void savaImage(byte[] img, String filePath, String fileName) { BufferedOutputStream bos = null; FileOutputStream fos = null; File file = null; File dir = new File(filePath); try { // 判断文件目录是否存在 if (!dir.exists() && dir.isDirectory()) { dir.mkdir(); } file = new File(filePath + "\\" + fileName); fos = new FileOutputStream(file); bos = new BufferedOutputStream(fos); bos.write(img); System.out.println("验证码已经下载到:"+filePath); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { if (bos != null) { try { bos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } if (fos != null) { try { fos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } }
【当你用心写完每一篇博客之后,你会发现它比你用代码实现功能更有成就感!】