通过网络得到html,并解析出其中网址(JAVA程序)

网络版程序:

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

/**
 * Downloads an HTML page over the network and collects every {@code https://} URL
 * that appears as the value of a {@code src="..."} attribute.
 */
public class TestIndex {

    /** Base address of the site that serves the page. */
    private String rootUrl = "http://localhost/apk/";
    /** Full address of the page to download and scan. */
    private String listUrl = rootUrl + "test-index.htm";
    /** URLs found by the most recent successful {@link #getData()} call. */
    private static List<String> imageUrlList = new ArrayList<String>();

    /**
     * Downloads the page, then prints the number of URLs found followed by one URL per line.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        TestIndex ti = new TestIndex();
        ti.getData();
        System.out.println(imageUrlList.size());
        for (String imageUrl : imageUrlList) {
            System.out.println(imageUrl);
        }
    }

    /**
     * Opens a connection to {@code urlStr} and returns its response stream.
     *
     * @param urlStr address to open
     * @return the response stream, or {@code null} if the address is malformed or the
     *         connection fails (the reason is reported on standard error)
     */
    private InputStream getNetInputStream(String urlStr) {
        try {
            URL url = new URL(urlStr);
            URLConnection conn = url.openConnection();
            conn.connect();
            return conn.getInputStream();
        } catch (java.io.IOException e) {
            // Best effort: report instead of silently swallowing, and let the caller see null.
            System.err.println("Cannot open " + urlStr + ": " + e);
            return null;
        }
    }

    /**
     * Downloads {@link #listUrl}, decodes it as UTF-8 and replaces the contents of
     * {@link #imageUrlList} with the URLs found in it. If the page cannot be opened or
     * read, the failure is reported on standard error and the list is left untouched.
     */
    private void getData() {
        InputStream is = getNetInputStream(listUrl);
        if (is == null) {
            return;
        }
        StringBuilder html = new StringBuilder();
        try {
            // Decode explicitly as UTF-8 so the result does not depend on the platform charset.
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                html.append(line);
            }
        } catch (java.io.IOException e) {
            System.err.println("Cannot read " + listUrl + ": " + e);
            return;
        } finally {
            try {
                is.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
        imageUrlList.clear();
        imageUrlList.addAll(extractImageUrls(html.toString()));
    }

    /**
     * Extracts every {@code https://} URL that is the value of a {@code src="..."} attribute.
     * Each URL ends at the attribute's closing double quote, so the result does not depend on
     * which attribute (if any) follows {@code src}. Package-private so it can be tested
     * without network access.
     *
     * @param html markup to scan; {@code null} is treated as empty
     * @return the URLs in document order; empty if none are found
     */
    static List<String> extractImageUrls(String html) {
        List<String> urls = new ArrayList<String>();
        if (html == null) {
            return urls;
        }
        final String startStr = "src=\"https://";
        final int urlOffset = 5; // length of src=" -- the URL itself starts right after it
        int index = 0;
        while (true) {
            int start = html.indexOf(startStr, index);
            if (start < 0) {
                break;
            }
            int urlStart = start + urlOffset;
            int end = html.indexOf('"', urlStart);
            if (end < 0) {
                // Unterminated attribute at the end of the input: nothing more to extract.
                break;
            }
            urls.add(html.substring(urlStart, end));
            index = end + 1;
        }
        return urls;
    }
}

本地版程序:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Reads an HTML page from the local file system and collects every {@code https://} URL
 * that appears as the value of a {@code src="..."} attribute.
 */
public class TestIndex_IO {

    /** Directory that contains the page. */
    private String rootUrl = "D:/Hixin/webandroid/";
    /** Full path of the page to read and scan. */
    private String listUrl = rootUrl + "test-index.htm";
    /** URLs found by the most recent successful {@link #getData()} call. */
    private static List<String> imageUrlList = new ArrayList<String>();

    /**
     * Reads the page, then prints the number of URLs found followed by one URL per line.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        TestIndex_IO ti = new TestIndex_IO();
        ti.getData();
        System.out.println(imageUrlList.size());
        for (String imageUrl : imageUrlList) {
            System.out.println(imageUrl);
        }
    }

    /**
     * Opens the file at {@code urlStr} for reading.
     *
     * @param urlStr path of the file to open
     * @return a stream over the file, or {@code null} if the file cannot be opened
     *         (the reason is reported on standard error)
     */
    private InputStream getNetInputStream(String urlStr) {
        try {
            return new FileInputStream(new File(urlStr));
        } catch (FileNotFoundException e) {
            System.err.println("Cannot open " + urlStr + ": " + e);
            return null;
        }
    }

    /**
     * Reads {@link #listUrl}, decodes it as UTF-8 and replaces the contents of
     * {@link #imageUrlList} with the URLs found in it. If the file cannot be opened or
     * read, the failure is reported on standard error and the list is left untouched.
     */
    private void getData() {
        InputStream is = getNetInputStream(listUrl);
        if (is == null) {
            return;
        }
        StringBuilder html = new StringBuilder();
        try {
            // Decode explicitly as UTF-8 so the result does not depend on the platform charset.
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                html.append(line);
            }
        } catch (java.io.IOException e) {
            System.err.println("Cannot read " + listUrl + ": " + e);
            return;
        } finally {
            try {
                is.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
        imageUrlList.clear();
        imageUrlList.addAll(extractImageUrls(html.toString()));
    }

    /**
     * Extracts every {@code https://} URL that is the value of a {@code src="..."} attribute.
     * Each URL ends at the attribute's closing double quote, so the result does not depend on
     * which attribute (if any) follows {@code src}. Package-private so it can be tested
     * without touching the file system.
     *
     * @param html markup to scan; {@code null} is treated as empty
     * @return the URLs in document order; empty if none are found
     */
    static List<String> extractImageUrls(String html) {
        List<String> urls = new ArrayList<String>();
        if (html == null) {
            return urls;
        }
        final String startStr = "src=\"https://";
        final int urlOffset = 5; // length of src=" -- the URL itself starts right after it
        int index = 0;
        while (true) {
            int start = html.indexOf(startStr, index);
            if (start < 0) {
                break;
            }
            int urlStart = start + urlOffset;
            int end = html.indexOf('"', urlStart);
            if (end < 0) {
                // Unterminated attribute at the end of the input: nothing more to extract.
                break;
            }
            urls.add(html.substring(urlStart, end));
            index = end + 1;
        }
        return urls;
    }
}


两个版本的差别仅仅在于 private InputStream getNetInputStream(String urlStr) 函数的实现。为避免中文乱码,建议在创建 InputStreamReader 时显式指定编码:InputStreamReader isr = new InputStreamReader(is,"utf-8"); 下面对比不指定编码与指定 utf-8 两种写法读到的字符数:

            // Read the whole stream line by line into one string. No charset is passed to
            // InputStreamReader here, so the bytes are decoded with the platform default charset.
            InputStream is = getNetInputStream(listUrl);
            InputStreamReader isr = new InputStreamReader(is);      
            BufferedReader br = new BufferedReader(isr);
            String s = null;
            String html="";
            while ((s = br.readLine()) != null)
            {
                // readLine() drops the line terminator, so lines are joined with no separator.
                html+=s;
            }
            // Print how many chars were decoded in total.
            System.out.println(html.length());
            is.close();

输出结果为:77300

            // Read the whole stream line by line into one string, decoding the bytes
            // explicitly as "utf-8" instead of relying on the platform default charset.
            InputStream is = getNetInputStream(listUrl);
            InputStreamReader isr = new InputStreamReader(is,"utf-8");  
            BufferedReader br = new BufferedReader(isr);
            String s = null;
            String html="";
            while ((s = br.readLine()) != null)
            {
                // readLine() drops the line terminator, so lines are joined with no separator.
                html+=s;
            }
            // Print how many chars were decoded in total.
            System.out.println(html.length());
            is.close();

输出结果为:77135

 

 // Target directory and file name for the saved copy of the page.
 private String writeUrl = "D:/newfile/new/new";
  private String fileName ="test-index.htm";
 // Create the target directory (including missing parents) if it does not exist yet.
 File f = new File(writeUrl);
            if(!f.exists()) {
                f.mkdirs();
            }
            File f1 = new File(f, fileName);
            FileOutputStream fos = new FileOutputStream(f1);
            // NOTE(review): no charset is passed, so the writer encodes with the platform default.
            OutputStreamWriter osw = new OutputStreamWriter(fos);  
          
           // Write the whole string, then flush and close the writer.
           // NOTE(review): html is presumably the string built by the reading code -- confirm.
           osw.write(html,0,html.length());
           osw.flush();  
           osw.close();

 

 


解析出htm文件中包含的网址。

 

结果:

20
https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRvQgUjsVDBncM3mVIgIyIuE87BnlyJUy2BNsAp8kUoTanrC_css5mVAw
https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcThd8cYjOTmCgYJZxX5ls-xpxaAlH1_yocOSCqI5_7OkL29SNtbCZ7q2Yoj
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTl-FzKmsppxuwzmTITGCv9uDxmrWr1pG0lw8mUD9wkWIloASxQeBEMnVjz
https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcQWbmiZJIXKHV2IoTBp7zSY6kD5g5VPzVtBTLJYYR5nwTtKi2-0_u93qL4e
https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSlrLi_GtVgUehU7coFe1eMdrJxPdvS42iTqXkla0g75s31NBfAq2u1LE4
https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSkrlyGxSs8Dr_7k3MUvoGq1vE45LgHZ0zEhIEdD9LLZiaoMcE7IAqn8ho
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTu__OUSJ4R4EKBu4jOi2ZAdHohpVQIBy3-SfnI8FYpN8wVC9kJG_aWuk_w
https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcR3Bf7YtsHJ813A5_wWzpxIy4MbEmqz5NLw3qv1nPxOZqVjH7QlY-qYSCg
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcToB4nJPqVwnzn0xeasnXyhxGgOqHXdypE6KZIMTfV9k52eIrE3iYsA6Ixm
https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTkKw0LpqdB2eQMUpwdQdvM9DTeNtq1mrvMNivoQtN37p3m0OPsx4ME9i4O
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSZGzMf_3hmdDktz91yp5ZQi-eGWLCenZ0U446sXT2nqYuwlWRI_V_BVIWi
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTQF-55T5GM3dLdaoafPdlIYK0ESNvM6-Bsb4-B2rQTeyD5gGoCKxokExM-
https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRoRjo4TFeXmx47zE6VH0ylcO0IQ2HBsOHYIMJCI9MsRyg_PF1WhHbqG76Q
https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcRrdegt1koEy51dLWrJAbVMJBlCEZ7fPl2mztYYM6onvxocRCq030Ft1gE
https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTtnQpte0uq9Ue9nsg25GeO1kw_-Hcn69ozTQkiMBHrXKwlANutyhwKD9XM
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRNRdxzmuFKABoGgyv2SC0gMticosL2LB3V1fBMOwNtVBZxHkyMw4IcWBFj
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQr40CEf75nWCj5dg-oeKtb9zK6mhktu7vnfoYAh5ioy34goC3c9ptDkQwP
https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcQUnyHrVEbppqhZnWnQrijhBFP0X34gRf7pKw6PdT4ggepB2k9g-p71sgGh
https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcR9Us9qblbTJaw47gULXCI8sHKN4I61gYsT2ijebtZzgsMDI8GmYqQpIIw
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSIrW-IbBZjM9Ztn60r9QE1_FIMjt494qGX12tqsLsibYPLuFVwyVSgz1I

 用正则表达式更简单:

 1 InputStream is = getNetInputStream(listUrl);
 2             InputStreamReader isr = new InputStreamReader(is);
 3             BufferedReader br = new BufferedReader(isr);
 4             String s = "";
 5             
 6             while ((s = br.readLine()) != null) // scan the input one line at a time
 7             {
 8                 Pattern p = Pattern.compile("src=\"https[^\"]+"); // src="https plus every char up to the next double quote; NOTE(review): recompiled for every line
 9                 Matcher m = p.matcher(s);
10                 while(m.find()) { // visit every match on this line
11                     System.out.println(m.group()); // prints the whole match, including the leading src="
12                 }
13             }

 

 1 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRvQgUjsVDBncM3mVIgIyIuE87BnlyJUy2BNsAp8kUoTanrC_css5mVAw
 2 src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcThd8cYjOTmCgYJZxX5ls-xpxaAlH1_yocOSCqI5_7OkL29SNtbCZ7q2Yoj
 3 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTl-FzKmsppxuwzmTITGCv9uDxmrWr1pG0lw8mUD9wkWIloASxQeBEMnVjz
 4 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcQWbmiZJIXKHV2IoTBp7zSY6kD5g5VPzVtBTLJYYR5nwTtKi2-0_u93qL4e
 5 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSlrLi_GtVgUehU7coFe1eMdrJxPdvS42iTqXkla0g75s31NBfAq2u1LE4
 6 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSkrlyGxSs8Dr_7k3MUvoGq1vE45LgHZ0zEhIEdD9LLZiaoMcE7IAqn8ho
 7 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTu__OUSJ4R4EKBu4jOi2ZAdHohpVQIBy3-SfnI8FYpN8wVC9kJG_aWuk_w
 8 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcR3Bf7YtsHJ813A5_wWzpxIy4MbEmqz5NLw3qv1nPxOZqVjH7QlY-qYSCg
 9 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcToB4nJPqVwnzn0xeasnXyhxGgOqHXdypE6KZIMTfV9k52eIrE3iYsA6Ixm
10 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTkKw0LpqdB2eQMUpwdQdvM9DTeNtq1mrvMNivoQtN37p3m0OPsx4ME9i4O
11 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSZGzMf_3hmdDktz91yp5ZQi-eGWLCenZ0U446sXT2nqYuwlWRI_V_BVIWi
12 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTQF-55T5GM3dLdaoafPdlIYK0ESNvM6-Bsb4-B2rQTeyD5gGoCKxokExM-
13 src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRoRjo4TFeXmx47zE6VH0ylcO0IQ2HBsOHYIMJCI9MsRyg_PF1WhHbqG76Q
14 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcRrdegt1koEy51dLWrJAbVMJBlCEZ7fPl2mztYYM6onvxocRCq030Ft1gE
15 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTtnQpte0uq9Ue9nsg25GeO1kw_-Hcn69ozTQkiMBHrXKwlANutyhwKD9XM
16 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRNRdxzmuFKABoGgyv2SC0gMticosL2LB3V1fBMOwNtVBZxHkyMw4IcWBFj
17 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQr40CEf75nWCj5dg-oeKtb9zK6mhktu7vnfoYAh5ioy34goC3c9ptDkQwP
18 src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcQUnyHrVEbppqhZnWnQrijhBFP0X34gRf7pKw6PdT4ggepB2k9g-p71sgGh
19 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcR9Us9qblbTJaw47gULXCI8sHKN4I61gYsT2ijebtZzgsMDI8GmYqQpIIw
20 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSIrW-IbBZjM9Ztn60r9QE1_FIMjt494qGX12tqsLsibYPLuFVwyVSgz1I

 

posted @ 2014-12-12 04:56  疾风剑  阅读(1629)  评论(0编辑  收藏  举报