爬取网站图片保存到本地

  

  有时候我们需要在爬虫的时候将网站的图片保存到本地,这就需要我们先获取到图片的url,然后利用url再去下载图片到本地。

  下面介绍两种简单的方法:

1.利用java自带的URLConnection

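  使用这种方式时,我暂时还没有找到携带cookie或者其他信息去下载图片的办法(补充:其实可以在调用getInputStream()之前通过URLConnection.setRequestProperty("Cookie", "...")设置请求头来携带cookie)。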

package cn.qlq.craw.Jsoup;

import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Connection;
import org.jsoup.Jsoup;

/**
 * Downloads an image from a URL with {@link URLConnection} and saves it to a local file.
 *
 * @author liqiang
 */
public class UrlConnectionGetPicture {

    /**
     * Fetches the image at a fixed URL and writes its bytes to a fixed local path.
     *
     * @param args unused
     * @throws Exception if the URL is malformed, the connection fails, or the file cannot be
     *                   written
     */
    public static void main(String[] args) throws Exception {
        String url = "http://qiaoliqiang.cn/fileDown/zfb.bmp";
        String path = "C:\\Users\\liqiang\\Desktop\\实习\\python\\javaCrawPicture\\test.bmp";

        URLConnection conn = new URL(url).openConnection();
        // try-with-resources closes both streams even if the copy fails midway.
        // The input stream is opened first so a failed request does not create/truncate the file.
        try (InputStream inputStream = conn.getInputStream();
                OutputStream out = new FileOutputStream(path)) {
            byte[] buff = new byte[1024];
            int read;
            // read() returns -1 at end of stream; otherwise the number of bytes placed in buff
            while ((read = inputStream.read(buff)) != -1) {
                out.write(buff, 0, read);
            }
        }
    }

}

 

 

补充:org.apache.commons.io.IOUtils可以很方便地将一个InputStream中的数据拷贝到另一个OutputStream,从而实现文件的拷贝,例如:

  只需传两个参数:第一个是InputStream,第二个是OutputStream。

package cn.qlq.craw.Jsoup;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;

import org.apache.commons.io.IOUtils;

/**
 * Downloads a file from a URL and saves it locally, using commons-io
 * {@code IOUtils.copy} to transfer the bytes.
 */
public class IOutilsDownloadFile {

    /**
     * Fetches the file at a fixed URL and writes it to a fixed local path.
     *
     * @param args unused
     * @throws IOException if the connection fails or the file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String url = "http://qiaoliqiang.cn/fileDown/zfb.bmp";
        String path = "C:\\Users\\liqiang\\Desktop\\test.bmp";

        URLConnection conn = new URL(url).openConnection();
        // IOUtils.copy does not close either stream, so try-with-resources does it here;
        // otherwise the file handle and the connection stream would leak.
        try (InputStream inputStream = conn.getInputStream();
                OutputStream outputStream = new FileOutputStream(path)) {
            // Copy the whole input stream to the output stream with IOUtils
            IOUtils.copy(inputStream, outputStream);
        }
    }
}

 

 

 

 

 

 

 2.利用Jsoup。此方法可以在下载图片的时候携带cookie或者携带一些额外的数据(重要)

 下面是不携带cookie的方法:

package cn.qlq.craw.Jsoup;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
/**
 * Downloads an image with Jsoup and saves it to a local file.
 *
 * @author liqiang
 */
public class JsoupDoloadPicture {

    /**
     * Fetches a fixed image URL (without sending any cookies), prints the body length and
     * stores the body as {@code test.png}.
     *
     * @param args unused
     * @throws IOException if the HTTP request fails
     */
    public static void main(String[] args) throws IOException {
        String imageSrc = "http://newjwc.tyust.edu.cn/CheckCode.aspx";
        // The response is an image, not HTML, so Jsoup must be told not to reject its content type.
        Connection.Response response = Jsoup.connect(imageSrc).ignoreContentType(true).execute();
        byte[] img = response.bodyAsBytes();
        System.out.println(img.length);
        savaImage(img, "C:\\Users\\liqiang\\Desktop\\实习\\python\\javaCrawJWXT", "test.png");
    }

    /**
     * Writes image bytes to the file {@code fileName} inside the directory {@code filePath},
     * creating the directory (and any missing parents) first.
     *
     * <p>I/O failures are not rethrown; their stack trace is printed to standard error.
     *
     * @param img      raw bytes of the image
     * @param filePath directory the file is written into
     * @param fileName name of the file to create or overwrite
     */
    public static void savaImage(byte[] img, String filePath, String fileName) {
        File dir = new File(filePath);
        // A non-existent path is never a directory, so the previous guard
        // (!exists && isDirectory) could not fire and the directory was never created.
        if (!dir.exists()) {
            dir.mkdirs();
        }
        // File(parent, child) joins with the platform separator instead of a hard-coded "\\".
        File file = new File(dir, fileName);
        // Both streams are closed automatically, also when write() throws.
        try (FileOutputStream fos = new FileOutputStream(file);
                BufferedOutputStream bos = new BufferedOutputStream(fos)) {
            bos.write(img);
        } catch (IOException e) {
            // Covers FileNotFoundException too (e.g. the directory could not be created).
            e.printStackTrace();
        }
    }
}

 

 

 

 

 

 

 下面是携带cookie的方法,此方法可以在爬虫需要登录网站的时候带着cookie去获取验证码。

package cn.qlq.craw.JsoupCrawJWXT;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Jsoup;

/**
 * Downloads an image with Jsoup, sending cookies with the request, and saves it to a local file.
 *
 * @author liqiang
 */
public class JsoupDoloadPicture {

    /**
     * Downloads a captcha image while sending the given cookies, prints the body length and
     * saves the image as {@code yzm.png} in the directory returned by
     * {@code ResourcesUtil.getValue("path", "file")}.
     *
     * @param url     address of the image
     * @param cookies cookies sent with the request
     * @throws IOException if the HTTP request fails
     */
    public static void downloadImg(String url, Map<String, String> cookies) throws IOException {
        Connection connect = Jsoup.connect(url);
        connect.cookies(cookies); // send the cookies along with the image request
        connect.timeout(5 * 10000); // 50,000 (milliseconds per the Jsoup API)
        // The response is an image, not HTML, so Jsoup must be told not to reject its content type.
        Connection.Response response = connect.ignoreContentType(true).execute();
        byte[] img = response.bodyAsBytes();
        System.out.println(img.length);
        // Look up the directory the image is stored in
        String directory = ResourcesUtil.getValue("path", "file");
        savaImage(img, directory, "yzm.png");
    }

    /**
     * Writes image bytes to the file {@code fileName} inside the directory {@code filePath},
     * creating the directory (and any missing parents) first, and prints a confirmation
     * message once the bytes have been written.
     *
     * <p>I/O failures are not rethrown; their stack trace is printed to standard error.
     *
     * @param img      raw bytes of the image
     * @param filePath directory the file is written into
     * @param fileName name of the file to create or overwrite
     */
    public static void savaImage(byte[] img, String filePath, String fileName) {
        File dir = new File(filePath);
        // A non-existent path is never a directory, so the previous guard
        // (!exists && isDirectory) could not fire and the directory was never created.
        if (!dir.exists()) {
            dir.mkdirs();
        }
        // File(parent, child) joins with the platform separator instead of a hard-coded "\\".
        File file = new File(dir, fileName);
        // Both streams are closed automatically, also when write() throws.
        try (FileOutputStream fos = new FileOutputStream(file);
                BufferedOutputStream bos = new BufferedOutputStream(fos)) {
            bos.write(img);
            System.out.println("验证码已经下载到:"+filePath);
        } catch (IOException e) {
            // Covers FileNotFoundException too (e.g. the directory could not be created).
            e.printStackTrace();
        }
    }
}

 

posted @ 2018-04-19 23:47  QiaoZhi  阅读(1636)  评论(0编辑  收藏  举报