Java crawler: scraping images from the Douban 请不要害羞 (haixiuzu) group

The crawler below fetches a discussion-list page of the group, pulls the topic URLs out of the HTML with a regex, then extracts the image URLs from each topic page and downloads them to a local folder. Page requests go through Apache HttpClient 4.x (Maven artifact org.apache.httpcomponents:httpclient); the image downloads use plain HttpURLConnection.
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by liwj on 2017/5/25.
 */
public class Spider {

    // Image URLs inside a topic page, e.g. https://img1.doubanio.com/view/group_topic/large/public/p12345678.jpg
    private static final String IMAGE_REG = "(https://img1.doubanio.com/view/group_topic/large/public/p)[0-9]{0,}(.jpg)";
    // Topic URLs on the discussion list page, e.g. https://www.douban.com/group/topic/12345678/
    private static final String HTTP_REG = "(https://www.douban.com/group/topic/)[0-9]{0,}(/)";
    // Extracts the numeric file name (e.g. 12345678.jpg) from an image URL
    private static final String FILE_NAME = "[0-9]{0,}(.jpg)";

    private static final SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    /**
     * Fetch the HTML source of the page at the given URL.
     *
     * @param url page URL
     * @return page source, or an empty string on failure
     */
    private static String getResultByUrl(String url) {
        HttpClient hc = new DefaultHttpClient();
        try {
            HttpGet httpget = new HttpGet(url);
            httpget.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13");
            HttpResponse response = hc.execute(httpget);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                BufferedReader br = new BufferedReader(new InputStreamReader(entity.getContent(), "utf-8"));
                StringBuilder buffer = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    buffer.append(line);
                }
                br.close();
                return buffer.toString();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Collect every substring of the HTML that matches the given regex
     * (used for both topic URLs and image URLs).
     *
     * @param reg  regex to match
     * @param html page source
     * @return list of matched URLs
     */
    private static List<String> getAllUrl(String reg, String html) {
        List<String> urls = new ArrayList<String>();
        Matcher matcher = Pattern.compile(reg).matcher(html);
        while (matcher.find()) {
            urls.add(matcher.group());
        }
        return urls;
    }

    /**
     * Download a file and save it under savePath with the given file name.
     *
     * @param fileUrl  file URL
     * @param fileName local file name
     * @param savePath local directory (created if missing)
     */
    private static void downloadFileFromUrl(String fileUrl, String fileName, String savePath) throws Exception {
        // Open the connection
        URL url = new URL(fileUrl);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setConnectTimeout(3 * 1000);
        // Set the request header
        connection.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36");
        // Read the response body and write it to disk
        InputStream in = connection.getInputStream();
        File saveDir = new File(savePath);
        if (!saveDir.exists()) {
            saveDir.mkdirs();
        }
        File file = new File(saveDir, fileName);
        OutputStream out = new FileOutputStream(file);
        byte[] bytes = new byte[1024];
        int len;
        while ((len = in.read(bytes)) != -1) {
            out.write(bytes, 0, len);
        }
        out.close();
        in.close();
    }

    public static void main(String[] args) {
        // Each list page shows 25 topics; start=25 fetches the second page only.
        for (int page = 25; page <= 25; page += 25) {
            String url = "https://www.douban.com/group/haixiuzu/discussion?start=" + page;
            String html = getResultByUrl(url);
            // Topic URLs on the list page
            List<String> webPages = getAllUrl(HTTP_REG, html);
            for (String webPage : webPages) {
                String webHtml = getResultByUrl(webPage);
                // Image URLs inside each topic
                List<String> images = getAllUrl(IMAGE_REG, webHtml);
                for (String image : images) {
                    String fileName = "";
                    Matcher matcher = Pattern.compile(FILE_NAME).matcher(image);
                    if (matcher.find()) {
                        fileName = matcher.group();
                    }
                    try {
                        downloadFileFromUrl(image, fileName, "E:\\image\\");
                        System.out.println(df.format(new Date()) + " Image saved ------[" + fileName + "]");
                    } catch (Exception e) {
                        System.err.println(df.format(new Date()) + " Failed to save image ------[" + fileName + "]");
                    }
                }
            }
        }
    }
}
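As written, the outer loop in main visits a single list page (start=25, i.e. the second page of the group). Since each list page holds 25 topics, crawling several pages just means stepping start in increments of 25. Below is a minimal sketch of a drop-in replacement for main inside the same Spider class; the page count of 10 and the one-second pause between list pages are assumptions added here out of courtesy to the site, not part of the original listing.

public static void main(String[] args) throws InterruptedException {
    int pages = 10; // assumption: crawl the first 10 list pages (250 topics)
    for (int start = 0; start < pages * 25; start += 25) {
        String listUrl = "https://www.douban.com/group/haixiuzu/discussion?start=" + start;
        // Topic URLs on this list page
        List<String> webPages = getAllUrl(HTTP_REG, getResultByUrl(listUrl));
        for (String webPage : webPages) {
            // Image URLs inside each topic
            List<String> images = getAllUrl(IMAGE_REG, getResultByUrl(webPage));
            for (String image : images) {
                Matcher matcher = Pattern.compile(FILE_NAME).matcher(image);
                String fileName = matcher.find() ? matcher.group() : "unknown.jpg";
                try {
                    downloadFileFromUrl(image, fileName, "E:\\image\\");
                    System.out.println(df.format(new Date()) + " Image saved ------[" + fileName + "]");
                } catch (Exception e) {
                    System.err.println(df.format(new Date()) + " Failed to save image ------[" + fileName + "]");
                }
            }
        }
        Thread.sleep(1000); // pause between list pages to avoid hammering the site
    }
}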