通过文本或 URL 扫描并下载文件

  1 package com.xxxx;
  2 
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 16  
 17 public class GetImage { 
 18     
 19     public int getCharacterPosition(String string,int numb){
 20         //这里是获取"#"符号的位置
 21         Matcher slashMatcher = Pattern.compile("/").matcher(string);
 22         int mIdx = 0;
 23         while(slashMatcher.find()) {
 24            mIdx++;
 25            //当"#"符号第二次出现的位置
 26            if(mIdx == numb){
 27               break;
 28            }
 29         }
 30         return slashMatcher.start();
 31     }
 32     
 33     
 34     
 35     
 36     
 37     /**
 38      * 下载文件(图片、压缩包等文件都可以下载)
 39      * @param httpUrl
 40      * eg:http://www.xxxx.com/uploadfiles/123.rar
 41      */
 42     public void getHtmlFile(String httpUrl) { 
 43     URL url; 
 44     BufferedInputStream in; 
 45     FileOutputStream file; 
 46     try { 
 47        System.out.println("取网络文件"); 
 48        //获取子目录
 49        String unitPath = httpUrl.substring(getCharacterPosition(httpUrl,3) ,httpUrl.lastIndexOf("/"));
 50        String fileName = httpUrl.substring(httpUrl.lastIndexOf("/")); 
 51        String filePath = "F:\\FocuSimple"+unitPath+"\\";
 52        File up = new File(filePath);
 53         if(!up.exists()){    //判断文件夹是否不存在
 54             up.mkdirs();
 55         }
 56        
 57        url = new URL(httpUrl); 
 58       
 59        in = new BufferedInputStream(url.openStream()); 
 60       
 61        file = new FileOutputStream(new File(filePath+fileName)); 
 62        int t; 
 63        while ((t = in.read()) != -1) { 
 64        file.write(t); 
 65        } 
 66        file.close(); 
 67        in.close(); 
 68       System.out.println("文件获取成功"); 
 69     } catch (MalformedURLException e) { 
 70        e.printStackTrace(); 
 71     } catch (FileNotFoundException e) { 
 72       e.printStackTrace(); 
 73     } catch (IOException e) { 
 74        e.printStackTrace(); 
 75     } 
 76     } 
 77       
 78     public String getHtmlCode(String httpUrl) throws IOException { 
 79     String content =""; 
 80     URL uu = new URL(httpUrl); // 创建URL类对象 
 81     BufferedReader ii = new BufferedReader(new InputStreamReader(uu 
 82         .openStream())); // //使用openStream得到一输入流并由此构造一个BufferedReader对象 
 83     String input; 
 84     while ((input = ii.readLine()) != null) { // 建立读取循环，并判断是否有读取值 
 85        content += input; 
 86     } 
 87     ii.close(); 
 88     return content; 
 89     } 
 90     public static List<String> getImageSrc(String htmlCode) {
 91         List<String> imageSrcList = new ArrayList<String>();
 92 //        Pattern p = Pattern.compile("<img\\b[^>]*\\bsrc\\b\\s*=\\s*('|\")?([^'\"\n\r\f>]+(\\.jpg|\\.bmp|\\.eps|\\.gif|\\.mif|\\.miff|\\.png|\\.tif|\\.tiff|\\.svg|\\.wmf|\\.jpe|\\.jpeg|\\.dib|\\.ico|\\.tga|\\.cut|\\.pic)\\b)[^>]*>", Pattern.CASE_INSENSITIVE);
 93         Pattern p = Pattern.compile("src\\b\\s*=\\s*('|\")?([^'\"\n\r\f>]+(\\.jpg|\\.bmp|\\.eps|\\.gif|\\.mif|\\.miff|\\.png|\\.tif|\\.tiff|\\.svg|\\.wmf|\\.jpe|\\.jpeg|\\.dib|\\.ico|\\.tga|\\.cut|\\.pic)\\b)[^>]*", Pattern.CASE_INSENSITIVE);
 94         Matcher m = p.matcher(htmlCode);
 95         String quote = null;
 96         String src = null;
 97         while (m.find()) {
 98             quote = m.group(1);
 99             src = (quote == null || quote.trim().length() == 0) ? m.group(2).split("\\s+")[0] : m.group(2);
100             imageSrcList.add(src);
101             System.out.println("src"+src);
102         }
103         return imageSrcList;
104     }
105     
106     public void get(String url,String text) throws IOException { 
107       
108     String searchImgReg = "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")"; 
109     String searchImgReg2 = "(?x)(src|SRC|background|BACKGROUND)=('|\")(http://([\\w-]+\\.)+[\\w-]+(:[0-9]+)*(/[\\w-]+)*(/[\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")"; 
110     String content  = "";
111     if(text == null){
112         content = this.getHtmlCode(url);
113     }else{
114         content = text;
115     }
116     System.out.println("内容："+content); 
117     
118     Pattern p = Pattern.compile("src\\b\\s*=\\s*('|\")?([^'\"\n\r\f>]+(\\.jpg|\\.bmp|\\.eps|\\.gif|\\.mif|\\.miff|\\.png|\\.tif|\\.tiff|\\.svg|\\.wmf|\\.jpe|\\.jpeg|\\.dib|\\.ico|\\.tga|\\.cut|\\.pic)\\b)[^>]*", Pattern.CASE_INSENSITIVE);
119     Matcher m = p.matcher(content);
120     String quote = null;
121     String src = null;
122     while (m.find()) {
123         quote = m.group(1);
124         src = (quote == null || quote.trim().length() == 0) ? m.group(2).split("\\s+")[0] : m.group(2);
125         this.getHtmlFile(url+src); 
126     }
127     
128     Pattern pattern = Pattern.compile(searchImgReg); 
129     Matcher matcher = pattern.matcher(content); 
130     while (matcher.find()) { 
131        System.out.println("图片路径1："+matcher.group(3)); 
132       this.getHtmlFile(url+matcher.group(3)); 
133          
134     } 
135       
136     pattern = Pattern.compile(searchImgReg2); 
137     matcher = pattern.matcher(content); 
138     while (matcher.find()) { 
139        System.out.println("图片路径1："+matcher.group(3)); 
140       this.getHtmlFile(matcher.group(3)); 
141           
142     } 
143     // searchImgReg = 
144     // "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")"; 
145     } 
146     public static void main(String[] args) throws IOException { 
147         GetImage gcp = new GetImage(); 
148         gcp.get("http://www.123rf.com.cn/#baidu01",null); 
149         gcp.get(null,"<img src=\"/images/ico/logo.png\">"); 
150         gcp.getHtmlFile("http://www.xxxx.com/uploadfiles/123.rar");
151     } 
152 }
posted @ 2017-05-12 18:04 hlb 阅读(621) 评论(0) 收藏举报
刷新页面返回顶部
hlb

通过文本或url扫描下载文件

公告