Java 抓取一个网站页面的全部图片
直接上代码:
package com.jeecg.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the images referenced by the {@code <img>} tags of a single web page.
 *
 * <p>The page is fetched over HTTP, every {@code <img>} tag is located with a regular
 * expression, the absolute http/https {@code src} URLs ending in .jpeg/.jpg/.png/.gif are
 * extracted, and each image is saved to {@link #OUTPUT_DIR} as {@code <index><extension>}.
 */
public class CatchImage {

    /** Address of the page whose images are downloaded. */
    private static final String PAGE_URL = "http://news.163.com/";

    /** Character encoding used to decode the page body. */
    private static final Charset ENCODING = StandardCharsets.UTF_8;

    /** Directory the downloaded images are written to. */
    private static final String OUTPUT_DIR = "D:/imgPage/";

    /** Connect and read timeout, in milliseconds, applied to every request. */
    private static final int TIMEOUT_MILLIS = 2000;

    /** Size of the copy buffer used while saving an image. */
    private static final int BUFFER_SIZE = 1024;

    // Matches a whole <img ...> tag regardless of attribute order or letter case.
    private static final Pattern IMG_TAG_PATTERN =
            Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

    // Matches a quoted src attribute holding an absolute http(s) URL that ends in an image
    // extension; group 2 is the URL. The lookbehind rejects attributes such as data-src, and
    // the value cannot contain a quote, so a match can never run into the next attribute.
    private static final Pattern IMG_SRC_PATTERN = Pattern.compile(
            "(?<![\\w-])src\\s*=\\s*([\"'])(https?://[^\"']+?\\.(?:jpe?g|png|gif))\\1",
            Pattern.CASE_INSENSITIVE);

    // Matches the image extension at the end of a URL, ignoring a trailing query or fragment;
    // group 1 is the extension including the leading dot.
    private static final Pattern EXTENSION_PATTERN = Pattern.compile(
            "(\\.(?:jpe?g|png|gif))(?:[?#].*)?$", Pattern.CASE_INSENSITIVE);

    /**
     * Fetches {@link #PAGE_URL}, extracts its image URLs and downloads each image once.
     *
     * @param args unused
     * @throws Exception if the page or one of the images cannot be fetched or saved
     */
    public static void main(String[] args) throws Exception {
        CatchImage cm = new CatchImage();
        // Fetch the HTML text of the page.
        String html = cm.getHTML(PAGE_URL);
        // Collect the <img> tags.
        List<String> imgTags = cm.getImageUrl(html);
        // Extract the image addresses from the tags.
        List<String> imgSrc = cm.getImageSrc(imgTags);
        // Download the images (exactly once).
        cm.download(imgSrc);
    }

    /**
     * Fetches the body of a page with an HTTP GET request.
     *
     * @param oldLink address of the page to fetch
     * @return the page body with one {@code '\n'} after every line, or an empty string when
     *         the server does not answer with status 200
     * @throws IOException if the connection fails or the body cannot be read
     */
    private String getHTML(String oldLink) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(oldLink).openConnection();
        connection.setRequestMethod("GET");
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        try {
            int status = connection.getResponseCode();
            if (status != HttpURLConnection.HTTP_OK) {
                // Keep returning an empty page, but make the reason visible.
                System.err.println("Unexpected HTTP status " + status + " for " + oldLink);
                return "";
            }
            StringBuilder page = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), ENCODING))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // Keep a separator so tokens on adjacent lines are not fused together.
                    page.append(line).append('\n');
                }
            }
            return page.toString();
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Collects every {@code <img>} tag found in the given HTML.
     *
     * @param html HTML text to scan
     * @return the matched tags in document order; empty when there are none
     */
    private List<String> getImageUrl(String html) {
        List<String> listImgUrl = new ArrayList<String>();
        Matcher matcher = IMG_TAG_PATTERN.matcher(html);
        while (matcher.find()) {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }

    /**
     * Extracts the image address from each {@code <img>} tag.
     *
     * @param listImageUrl {@code <img>} tags as returned by {@link #getImageUrl(String)}
     * @return the absolute http/https {@code src} URLs ending in .jpeg/.jpg/.png/.gif, without
     *         the surrounding quotes; tags without such a URL contribute nothing
     */
    private List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<String>();
        for (String image : listImageUrl) {
            Matcher matcher = IMG_SRC_PATTERN.matcher(image);
            while (matcher.find()) {
                listImgSrc.add(matcher.group(2));
            }
        }
        return listImgSrc;
    }

    /**
     * Downloads every image to {@link #OUTPUT_DIR}, naming the files {@code 0.ext},
     * {@code 1.ext}, ... in list order, and prints the number of images saved.
     *
     * @param listImgSrc absolute image URLs
     * @throws IOException if the output directory cannot be created, or an image cannot be
     *         fetched or written
     */
    private void download(List<String> listImgSrc) throws IOException {
        File outputDir = new File(OUTPUT_DIR);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IOException("Cannot create output directory: " + outputDir);
        }

        int count = 0;
        byte[] buf = new byte[BUFFER_SIZE];
        for (String url : listImgSrc) {
            System.out.println(url);
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            // The extension comes from this URL itself, so it can never get out of step
            // with the file index.
            String target = OUTPUT_DIR + count + extensionOf(url);
            System.out.println("开始下载:" + url);
            try (InputStream in = connection.getInputStream();
                    FileOutputStream fo = new FileOutputStream(target)) {
                int length;
                while ((length = in.read(buf, 0, buf.length)) != -1) {
                    fo.write(buf, 0, length);
                }
            }
            System.out.println("下载完成");
            count++;
        }
        System.out.println(count);
    }

    /**
     * Returns the image extension (including the leading dot) that ends the given URL, or an
     * empty string when the URL does not end in a recognised image extension.
     */
    private static String extensionOf(String imageUrl) {
        Matcher matcher = EXTENSION_PATTERN.matcher(imageUrl);
        return matcher.find() ? matcher.group(1) : "";
    }
}