Java 爬虫：爬取豆瓣“请不要害羞”小组图片

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small command-line crawler for the Douban group "haixiuzu".
 *
 * <p>It fetches a discussion list page, extracts the topic links from it,
 * fetches every topic page, extracts the large-image links and downloads
 * each image into a local directory.
 *
 * <p>The class is written for single-threaded use: the shared
 * {@link SimpleDateFormat} instance is not thread-safe.
 *
 * Created by liwj on 2017/5/25.
 */
public class Spider {

    /** Matches the URL of a large topic image; literal dots are escaped and an id is required. */
    private static final String IMAGE_REG = "https://img1\\.doubanio\\.com/view/group_topic/large/public/p[0-9]+\\.jpg";
    /** Matches the URL of a topic page. */
    private static final String HTTP_REG = "https://www\\.douban\\.com/group/topic/[0-9]+/";
    /** Matches the numeric file name at the end of an image URL. */
    private static final String FILE_NAME = "[0-9]+\\.jpg";
    /** Compiled once; it is applied to every image URL. */
    private static final Pattern FILE_NAME_PATTERN = Pattern.compile(FILE_NAME);
    /** Timestamp format used for the console log lines. */
    private static final SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    /** Directory used when no target directory is passed on the command line. */
    private static final String DEFAULT_SAVE_PATH = "E:\\image\\";
    private static final int CONNECT_TIMEOUT_MS = 3 * 1000;
    /** Upper bound for a single blocking read so a stalled server cannot hang the crawl. */
    private static final int READ_TIMEOUT_MS = 10 * 1000;

    /**
     * Fetches the page source for the given URL.
     *
     * <p>The body is decoded as UTF-8 and its lines are concatenated without
     * line separators.
     *
     * @param url address of the page to fetch
     * @return the page source, or an empty string when the response has no
     *         body or the request fails (the failure is printed to stderr)
     */
    private static String getResultByUrl(String url) {
        HttpClient hc = new DefaultHttpClient();
        try {
            HttpGet httpget = new HttpGet(url);
            httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13");
            // NOTE(review): "utf-8" is a charset, not a content coding; kept as-is
            // so the request sent to the server does not change.
            httpget.setHeader("Accept-Encoding", "utf-8");
            HttpResponse response = hc.execute(httpget);
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            // Closing the reader also closes the entity's content stream, even
            // when reading fails half-way.
            try (BufferedReader br = new BufferedReader(new InputStreamReader(entity.getContent(), "utf-8"))) {
                StringBuilder buffer = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    buffer.append(line);
                }
                return buffer.toString();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Each call creates its own client, so its connections must be released here.
            hc.getConnectionManager().shutdown();
        }
        return "";
    }

    /**
     * Collects every substring of the page source that matches the given
     * regular expression (topic URLs or image URLs).
     *
     * @param reg  regular expression to search for
     * @param html page source to search; {@code null} is treated as empty
     * @return the matches in order of appearance, duplicates included
     */
    private static List<String> getAllUrl(String reg, String html) {
        List<String> urls = new ArrayList<String>();
        if (html == null) {
            return urls;
        }

        Matcher matcher = Pattern.compile(reg).matcher(html);
        while (matcher.find()) {
            urls.add(matcher.group());
        }
        return urls;
    }

    /**
     * Downloads a file and stores it in the given directory.
     *
     * @param fileUrl  address of the file to download
     * @param fileName name of the file to create inside {@code savePath}
     * @param savePath target directory; created when it does not exist
     * @throws Exception if the file name is empty, the directory cannot be
     *                   created, or the download or the write fails
     */
    private static void downloadFileFromUrl(String fileUrl, String fileName, String savePath) throws Exception {
        if (fileName == null || fileName.isEmpty()) {
            throw new IllegalArgumentException("No file name for " + fileUrl);
        }

        File saveDir = new File(savePath);
        // mkdirs() returns false both on failure and when the directory already
        // exists, hence the additional isDirectory() check.
        if (!saveDir.exists() && !saveDir.mkdirs() && !saveDir.isDirectory()) {
            throw new IOException("Cannot create directory " + saveDir);
        }
        // Resolving against the directory works with or without a trailing separator.
        File file = new File(saveDir, fileName);

        HttpURLConnection connection = (HttpURLConnection) new URL(fileUrl).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36");

        try (InputStream in = connection.getInputStream();
             OutputStream out = new FileOutputStream(file)) {
            byte[] bytes = new byte[1024];
            int len;
            while ((len = in.read(bytes)) != -1) {
                out.write(bytes, 0, len);
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Crawls the discussion list and downloads every image found in the
     * linked topics.
     *
     * @param args optional; {@code args[0]} overrides the directory the
     *             images are saved to
     */
    public static void main(String[] args) {
        final String savePath = (args != null && args.length > 0) ? args[0] : DEFAULT_SAVE_PATH;

        // "start" is the offset of the first topic on a list page; the bounds
        // below select only the page at offset 25.
        for (int page = 25; page <= 25; page += 25) {
            String url = "https://www.douban.com/group/haixiuzu/discussion?start=" + page;
            String html = getResultByUrl(url);
            List<String> webPages = getAllUrl(HTTP_REG, html);
            for (String webPage : webPages) {
                String webHtml = getResultByUrl(webPage);
                List<String> images = getAllUrl(IMAGE_REG, webHtml);
                for (String image : images) {
                    String fileName = "";
                    Matcher matcher = FILE_NAME_PATTERN.matcher(image);
                    if (matcher.find()) {
                        fileName = matcher.group();
                    }

                    try {
                        downloadFileFromUrl(image, fileName, savePath);
                        System.out.println(df.format(new Date()) + " 图片保存成功------[" + fileName + "]");
                    } catch (Exception e) {
                        // One failed image must not stop the crawl, but the cause is reported.
                        System.err.println(df.format(new Date()) + " 图片保存失败------[" + fileName + "] " + e);
                    }
                }
            }
        }
    }
}

 

posted on 2017-05-25 17:18  13rj1115  阅读(5346)  评论(0)  编辑  收藏  举报

导航