Java--多线程读取网络图片并保存在本地
本例用到了多线程、时间函数、网络流、文件读写和正则表达式,是一个综合性的实例。注意:在解析服务器返回的 html 内容(response)时,最好不要用正则表达式来匹配 html 文本里的特征,因为服务器返回的多个页面的文本内容不一定使用相同的模式。
package javatest; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.regex.Matcher; import java.util.regex.Pattern; class urlTest { public static void main(String[] args) throws IOException { //String url = "http://www.ik6.com/meinv/10000/index.html"; String dir = "d:\\result\\201601282"; int base = 40624; // 多线程方法,从网上下载多个图片并保存 ArrayList<Thread> threads = new ArrayList<Thread>(); urlTest test=new urlTest(); int threadCount=1;//开5个线程,用于下载 int themePerThread=1; Date start=new Date(); System.out.println("threads start.."); for (int i = 0; i < threadCount; i++) { Thread t = new Thread(test.new workerThread(dir, base, themePerThread)); threads.add(t); t.start(); base+=themePerThread; } for (Thread t : threads) { try { t.join();//让主线程等待此子线程执行完毕 } catch (InterruptedException e) { e.printStackTrace(); } } System.out.println("threads complete.."); Date end=new Date(); //计算总耗时 long diff = end.getTime() - start.getTime(); String info=String.format("it takes %f seconds to run.", diff / 1000.00); System.out.println(info); //单线程方法 // for (int themeCount = 0; themeCount < 200; themeCount++) // { // for (int pageIndex = 1; pageIndex <= 20; pageIndex++) // { // if (pageIndex==1) // {url = String.format( // "http://www.ik6.com/meinv/%d/index.html", base // + themeCount); // } // else // { // url = String.format( // "http://www.ik6.com/meinv/%d/index_%d.html", base // + themeCount, pageIndex); // } // // String data = GetResponseText(url); // if (!IsContentPage(data)) // break; // System.out.println(url); // ArrayList<String> imgUrls = GetImgUrls(data); // for (String imgUrl : imgUrls) // { // String imageSavedPath = String.format("%s\\%d_%d.jpg", dir,base+ // 
themeCount,pageIndex); // RetrieveImg2(imgUrl, imageSavedPath); // } // // } // } } public class workerThread implements Runnable { String dir = null; int base = 0; int themeCount = 0; int totalPage=0; int totalImg=0; public workerThread(String dir, int base, int themeCount) { this.dir = dir; this.base = base; this.themeCount = themeCount; } public void run() { String url=null; int pageNo=0; for (int themeIndex = 0; themeIndex < themeCount; themeIndex++) { for (int pageIndex = 1; pageIndex <= 50; pageIndex++) { pageNo=base+ themeIndex; if (pageIndex == 1) { url = String.format("http://www.ik6.com/meinv/%d/index.html", pageNo); } else { url = String.format( "http://www.ik6.com/meinv/%d/index_%d.html", pageNo, pageIndex); } String data = GetResponseText(url); if (!IsContentPage(data)) break; ArrayList<String> imgUrls = GetImgUrls(data); for (String imgUrl : imgUrls) { String imageSavedPath = String.format("%s\\%d_%d.jpg", dir, pageNo, pageIndex); RetrieveImg2(imgUrl, imageSavedPath); } } } } } //日期格式化 public static String GetTimeString() { Date dt = new Date(); SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss"); String s = df.format(dt); return s; } //通过特征判断 public static boolean IsContentPage(String pageContent) { return pageContent.indexOf("<center>") != -1; } public static ArrayList<String> GetImgUrls(String srcStr) { // 利用正则表达式,读取页面中所有图片的url // Pattern p1 = Pattern.compile("<center.+\n?.+\n?</center>"); // Pattern p2 = Pattern.compile("lazysrc=\"http\\:.+\\.jpg\""); // Matcher m = p2.matcher(srcStr); // ArrayList<String> imgUrls = new ArrayList<String>(); // while (m.find()) // { // String match = m.group(); // imgUrls.add(match.substring(match.indexOf("\"")+1,match.lastIndexOf("\""))); // } // return imgUrls; // 仅读取主题图片的url,为何不能匹配center? 
// Pattern p1 = Pattern.compile("<center.+\n*.+\n*</center>"); // Pattern p2 = Pattern.compile("lazysrc=\"http\\:.+\\.jpg\""); // Matcher m = p1.matcher(srcStr); // ArrayList<String> imgUrls = new ArrayList<String>(); // if (m.find()) // { // String matchCenter = m.group(); // Matcher m2 = p2.matcher(matchCenter); // while (m2.find()) // { // String matchImage = m2.group(); // imgUrls.add(matchImage.substring(matchImage.indexOf("\"") + 1, // matchImage.lastIndexOf("\""))); // } // } // return imgUrls; // 用字符串的indexOf方法找出所有图片的url srcStr = srcStr.substring(srcStr.indexOf("<center"), srcStr.indexOf("</center>")); // Pattern p2 = Pattern.compile("lazysrc=http\\:.+\\.jpg"); srcStr = srcStr.substring(srcStr.indexOf("src=")); srcStr = srcStr.substring(srcStr.indexOf("http"), srcStr.indexOf(".jpg") + 4); ArrayList<String> imgUrls = new ArrayList<String>(); imgUrls.add(srcStr); return imgUrls; } //通过url获取html页面 public static String GetResponseText(String url) { String response = null; try { URL _url = new URL(url); HttpURLConnection urlcon = (HttpURLConnection) _url .openConnection(); // 获取连接 InputStream is = urlcon.getInputStream(); BufferedReader buffer = new BufferedReader(new InputStreamReader( is, "utf-8")); StringBuffer sb = new StringBuffer(); String line = null; while ((line = buffer.readLine()) != null) { sb.append(line).append('\n'); // System.out.println(l); } response = sb.toString(); } catch (Exception e) { e.printStackTrace(); } return response; } //通过图片的url,获取图片并保存在本地.注意:此法有缺点 public static void RetrieveImg(String imgURL, String savepath) { try { File file = new File(savepath); if (file.exists()) { return; } else { file.createNewFile(); URL _url = new URL(imgURL); HttpURLConnection urlcon = (HttpURLConnection) _url .openConnection(); // urlcon.setRequestMethod("GET"); // 超时响应时间为5秒 // urlcon.setConnectTimeout(3 * 1000); // 获取连接 InputStream is = urlcon.getInputStream(); byte[] buffer = new byte[1024]; FileOutputStream out = new FileOutputStream(file); while 
(is.read(buffer) != -1) ; out.write(buffer);// 为何不行 is.close(); out.close(); } } catch (Exception e) { e.printStackTrace(); } } //通过图片的url,获取图片并保存在本地 public static void RetrieveImg2(String imgURL, String savepath) { try { File file = new File(savepath); if (file.exists()) { return; } else { file.createNewFile(); URL _url = new URL(imgURL); HttpURLConnection conn = (HttpURLConnection) _url .openConnection(); conn.setRequestMethod("GET"); // 超时响应时间为5秒 conn.setConnectTimeout(5 * 1000); // 通过输入流获取图片数据 InputStream inStream = conn.getInputStream(); byte[] data = readInputStream(inStream); // 写入到新文件当中 FileOutputStream out = new FileOutputStream(file); out.write(data); out.close(); } } catch (Exception e) { e.printStackTrace(); } } //将输入流的内容写入内存保存起来,以便稍后写入到文件当中 public static byte[] readInputStream(InputStream inStream) throws Exception { ByteArrayOutputStream outStream = new ByteArrayOutputStream(); byte[] buffer = new byte[1024]; int len = 0;// 关键,否则图片不完整,因为不知道写入多少 while ((len = inStream.read(buffer)) != -1) { outStream.write(buffer, 0, len); } inStream.close(); // 把outStream里的数据写入内存 return outStream.toByteArray(); } }