通过文本或 URL 扫描并下载文件
1 package com.xxxx; 2 3 import java.io.BufferedInputStream; 4 import java.io.BufferedReader; 5 import java.io.File; 6 import java.io.FileNotFoundException; 7 import java.io.FileOutputStream; 8 import java.io.IOException; 9 import java.io.InputStreamReader; 10 import java.net.MalformedURLException; 11 import java.net.URL; 12 import java.util.ArrayList; 13 import java.util.List; 14 import java.util.regex.Matcher; 15 import java.util.regex.Pattern; 16 17 public class GetImage { 18 19 public int getCharacterPosition(String string,int numb){ 20 //这里是获取"#"符号的位置 21 Matcher slashMatcher = Pattern.compile("/").matcher(string); 22 int mIdx = 0; 23 while(slashMatcher.find()) { 24 mIdx++; 25 //当"#"符号第二次出现的位置 26 if(mIdx == numb){ 27 break; 28 } 29 } 30 return slashMatcher.start(); 31 } 32 33 34 35 36 37 /** 38 * 下载文件(图片、压缩包等文件都可以下载) 39 * @param httpUrl 40 * eg:http://www.xxxx.com/uploadfiles/123.rar 41 */ 42 public void getHtmlFile(String httpUrl) { 43 URL url; 44 BufferedInputStream in; 45 FileOutputStream file; 46 try { 47 System.out.println("取网络文件"); 48 //获取子目录 49 String unitPath = httpUrl.substring(getCharacterPosition(httpUrl,3) ,httpUrl.lastIndexOf("/")); 50 String fileName = httpUrl.substring(httpUrl.lastIndexOf("/")); 51 String filePath = "F:\\FocuSimple"+unitPath+"\\"; 52 File up = new File(filePath); 53 if(!up.exists()){ //判断文件夹是否不存在 54 up.mkdirs(); 55 } 56 57 url = new URL(httpUrl); 58 59 in = new BufferedInputStream(url.openStream()); 60 61 file = new FileOutputStream(new File(filePath+fileName)); 62 int t; 63 while ((t = in.read()) != -1) { 64 file.write(t); 65 } 66 file.close(); 67 in.close(); 68 System.out.println("文件获取成功"); 69 } catch (MalformedURLException e) { 70 e.printStackTrace(); 71 } catch (FileNotFoundException e) { 72 e.printStackTrace(); 73 } catch (IOException e) { 74 e.printStackTrace(); 75 } 76 } 77 78 public String getHtmlCode(String httpUrl) throws IOException { 79 String content =""; 80 URL uu = new URL(httpUrl); // 创建URL类对象 81 BufferedReader ii 
= new BufferedReader(new InputStreamReader(uu 82 .openStream())); // //使用openStream得到一输入流并由此构造一个BufferedReader对象 83 String input; 84 while ((input = ii.readLine()) != null) { // 建立读取循环,并判断是否有读取值 85 content += input; 86 } 87 ii.close(); 88 return content; 89 } 90 public static List<String> getImageSrc(String htmlCode) { 91 List<String> imageSrcList = new ArrayList<String>(); 92 // Pattern p = Pattern.compile("<img\\b[^>]*\\bsrc\\b\\s*=\\s*('|\")?([^'\"\n\r\f>]+(\\.jpg|\\.bmp|\\.eps|\\.gif|\\.mif|\\.miff|\\.png|\\.tif|\\.tiff|\\.svg|\\.wmf|\\.jpe|\\.jpeg|\\.dib|\\.ico|\\.tga|\\.cut|\\.pic)\\b)[^>]*>", Pattern.CASE_INSENSITIVE); 93 Pattern p = Pattern.compile("src\\b\\s*=\\s*('|\")?([^'\"\n\r\f>]+(\\.jpg|\\.bmp|\\.eps|\\.gif|\\.mif|\\.miff|\\.png|\\.tif|\\.tiff|\\.svg|\\.wmf|\\.jpe|\\.jpeg|\\.dib|\\.ico|\\.tga|\\.cut|\\.pic)\\b)[^>]*", Pattern.CASE_INSENSITIVE); 94 Matcher m = p.matcher(htmlCode); 95 String quote = null; 96 String src = null; 97 while (m.find()) { 98 quote = m.group(1); 99 src = (quote == null || quote.trim().length() == 0) ? 
m.group(2).split("\\s+")[0] : m.group(2); 100 imageSrcList.add(src); 101 System.out.println("src"+src); 102 } 103 return imageSrcList; 104 } 105 106 public void get(String url,String text) throws IOException { 107 108 String searchImgReg = "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")"; 109 String searchImgReg2 = "(?x)(src|SRC|background|BACKGROUND)=('|\")(http://([\\w-]+\\.)+[\\w-]+(:[0-9]+)*(/[\\w-]+)*(/[\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")"; 110 String content = ""; 111 if(text == null){ 112 content = this.getHtmlCode(url); 113 }else{ 114 content = text; 115 } 116 System.out.println("内容:"+content); 117 118 Pattern p = Pattern.compile("src\\b\\s*=\\s*('|\")?([^'\"\n\r\f>]+(\\.jpg|\\.bmp|\\.eps|\\.gif|\\.mif|\\.miff|\\.png|\\.tif|\\.tiff|\\.svg|\\.wmf|\\.jpe|\\.jpeg|\\.dib|\\.ico|\\.tga|\\.cut|\\.pic)\\b)[^>]*", Pattern.CASE_INSENSITIVE); 119 Matcher m = p.matcher(content); 120 String quote = null; 121 String src = null; 122 while (m.find()) { 123 quote = m.group(1); 124 src = (quote == null || quote.trim().length() == 0) ? 
m.group(2).split("\\s+")[0] : m.group(2); 125 this.getHtmlFile(url+src); 126 } 127 128 Pattern pattern = Pattern.compile(searchImgReg); 129 Matcher matcher = pattern.matcher(content); 130 while (matcher.find()) { 131 System.out.println("图片路径1:"+matcher.group(3)); 132 this.getHtmlFile(url+matcher.group(3)); 133 134 } 135 136 pattern = Pattern.compile(searchImgReg2); 137 matcher = pattern.matcher(content); 138 while (matcher.find()) { 139 System.out.println("图片路径1:"+matcher.group(3)); 140 this.getHtmlFile(matcher.group(3)); 141 142 } 143 // searchImgReg = 144 // "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")"; 145 } 146 public static void main(String[] args) throws IOException { 147 GetImage gcp = new GetImage(); 148 gcp.get("http://www.123rf.com.cn/#baidu01",null); 149 gcp.get(null,"<img src=\"/images/ico/logo.png\">"); 150 gcp.getHtmlFile("http://www.xxxx.com/uploadfiles/123.rar"); 151 } 152 }
HLb`s签名:给自己一个拼搏的理由,好好地坚持下去。