通过网络得到html,并解析出其中网址(JAVA程序)
网络版程序:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

/**
 * Network version: downloads an HTML page and collects every https URL that
 * appears in a {@code src="..."} attribute.
 */
public class TestIndex {
    /** Literal that marks the beginning of an https {@code src} attribute. */
    private static final String SRC_PREFIX = "src=\"https://";
    /** Number of characters to skip to get past {@code src="} and reach the URL itself. */
    private static final int URL_OFFSET = "src=\"".length();

    private String rootUrl = "http://localhost/apk/";
    private String listUrl = rootUrl + "test-index.htm";
    private static List<String> imageUrlList = new ArrayList<String>();

    /** Fetches the page, then prints the number of URLs found followed by each URL. */
    public static void main(String[] args) {
        TestIndex ti = new TestIndex();
        ti.getData();
        System.out.println(imageUrlList.size());
        for (String imageUrl : imageUrlList) {
            System.out.println(imageUrl);
        }
    }

    /**
     * Opens a connection to the given address and returns its response stream.
     *
     * @param urlStr address of the page to download
     * @return the open stream, or {@code null} when the connection failed
     *         (the failure is reported on stderr)
     */
    private InputStream getNetInputStream(String urlStr) {
        try {
            URL url = new URL(urlStr);
            URLConnection conn = url.openConnection();
            conn.connect();
            return conn.getInputStream();
        } catch (IOException e) {
            // Report instead of silently swallowing, so a failed download is visible.
            System.err.println("Failed to open " + urlStr + ": " + e);
        }
        return null;
    }

    /**
     * Downloads the page at {@code listUrl} and replaces the content of
     * {@code imageUrlList} with the URLs found in it. When the page cannot be
     * opened or read, the list is left untouched.
     */
    private void getData() {
        InputStream is = getNetInputStream(listUrl);
        if (is == null) {
            return; // the failure was already reported by getNetInputStream
        }
        try {
            String html = readAll(is);
            List<String> urls = extractImageUrls(html);
            imageUrlList.clear();
            imageUrlList.addAll(urls);
        } catch (IOException e) {
            System.err.println("Failed to read " + listUrl + ": " + e);
        } finally {
            try {
                is.close();
            } catch (IOException e) {
                System.err.println("Failed to close " + listUrl + ": " + e);
            }
        }
    }

    /**
     * Reads the whole stream as UTF-8 text. Lines are joined without any
     * separator, exactly as the page lines were concatenated before.
     */
    private static String readAll(InputStream is) throws IOException {
        // An explicit charset keeps the result independent of the platform default.
        BufferedReader br = new BufferedReader(new InputStreamReader(is, "utf-8"));
        StringBuilder html = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            html.append(line);
        }
        return html.toString();
    }

    /**
     * Extracts every https URL found in a {@code src="..."} attribute.
     * Package-private (rather than private) only so it can be tested directly.
     *
     * @param html page text, may be {@code null}
     * @return the URLs in document order; empty when nothing matches
     */
    static List<String> extractImageUrls(String html) {
        List<String> urls = new ArrayList<String>();
        if (html == null) {
            return urls;
        }
        int from = 0;
        while (true) {
            int attrStart = html.indexOf(SRC_PREFIX, from);
            if (attrStart < 0) {
                break;
            }
            int urlStart = attrStart + URL_OFFSET;
            // The URL ends at the closing quote; do not rely on a following " width=".
            int urlEnd = html.indexOf('"', urlStart);
            if (urlEnd < 0) {
                break; // unterminated attribute: nothing more can be extracted
            }
            urls.add(html.substring(urlStart, urlEnd));
            from = urlEnd + 1;
        }
        return urls;
    }
}
本地版程序:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Local version: reads an HTML file from disk and collects every https URL
 * that appears in a {@code src="..."} attribute.
 */
public class TestIndex_IO {
    /** Literal that marks the beginning of an https {@code src} attribute. */
    private static final String SRC_PREFIX = "src=\"https://";
    /** Number of characters to skip to get past {@code src="} and reach the URL itself. */
    private static final int URL_OFFSET = "src=\"".length();

    private String rootUrl = "D:/Hixin/webandroid/";
    private String listUrl = rootUrl + "test-index.htm";
    private static List<String> imageUrlList = new ArrayList<String>();

    /** Reads the file, then prints the number of URLs found followed by each URL. */
    public static void main(String[] args) {
        TestIndex_IO ti = new TestIndex_IO();
        ti.getData();
        System.out.println(imageUrlList.size());
        for (String imageUrl : imageUrlList) {
            System.out.println(imageUrl);
        }
    }

    /**
     * Opens the file at the given path for reading.
     *
     * @param urlStr path of the HTML file
     * @return the open stream, or {@code null} when the file cannot be opened
     *         (the failure is reported on stderr)
     */
    private InputStream getNetInputStream(String urlStr) {
        try {
            return new FileInputStream(new File(urlStr));
        } catch (FileNotFoundException e) {
            System.err.println("Failed to open " + urlStr + ": " + e);
        }
        return null;
    }

    /**
     * Reads the file at {@code listUrl} and replaces the content of
     * {@code imageUrlList} with the URLs found in it. When the file cannot be
     * opened or read, the list is left untouched.
     */
    private void getData() {
        InputStream is = getNetInputStream(listUrl);
        if (is == null) {
            return; // the failure was already reported by getNetInputStream
        }
        try {
            String html = readAll(is);
            List<String> urls = extractImageUrls(html);
            imageUrlList.clear();
            imageUrlList.addAll(urls);
        } catch (IOException e) {
            System.err.println("Failed to read " + listUrl + ": " + e);
        } finally {
            try {
                is.close();
            } catch (IOException e) {
                System.err.println("Failed to close " + listUrl + ": " + e);
            }
        }
    }

    /**
     * Reads the whole stream as UTF-8 text. Lines are joined without any
     * separator, exactly as the file lines were concatenated before.
     */
    private static String readAll(InputStream is) throws IOException {
        // An explicit charset keeps the result independent of the platform default.
        BufferedReader br = new BufferedReader(new InputStreamReader(is, "utf-8"));
        StringBuilder html = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            html.append(line);
        }
        return html.toString();
    }

    /**
     * Extracts every https URL found in a {@code src="..."} attribute.
     * Package-private (rather than private) only so it can be tested directly.
     *
     * @param html page text, may be {@code null}
     * @return the URLs in document order; empty when nothing matches
     */
    static List<String> extractImageUrls(String html) {
        List<String> urls = new ArrayList<String>();
        if (html == null) {
            return urls;
        }
        int from = 0;
        while (true) {
            int attrStart = html.indexOf(SRC_PREFIX, from);
            if (attrStart < 0) {
                break;
            }
            int urlStart = attrStart + URL_OFFSET;
            // The URL ends at the closing quote; do not rely on a following " width=".
            int urlEnd = html.indexOf('"', urlStart);
            if (urlEnd < 0) {
                break; // unterminated attribute: nothing more can be extracted
            }
            urls.add(html.substring(urlStart, urlEnd));
            from = urlEnd + 1;
        }
        return urls;
    }
}
两个程序的差别仅在于 private InputStream getNetInputStream(String urlStr) 函数的实现。为避免中文乱码,建议在创建 InputStreamReader 时显式指定编码:InputStreamReader isr = new InputStreamReader(is,"utf-8"); 下面两段代码分别在不指定编码和指定 utf-8 的情况下读取同一个页面,并输出读到的 html 长度以作对比:
// Read the whole page with the platform default charset and print how many
// characters were read. Lines are joined without separators.
InputStream is = getNetInputStream(listUrl);
BufferedReader br = new BufferedReader(new InputStreamReader(is));
String html = "";
for (String line = br.readLine(); line != null; line = br.readLine()) {
    html = html + line;
}
System.out.println(html.length());
is.close();
输出结果为:77300
// Read the whole page decoding it as UTF-8 and print how many characters
// were read. Lines are joined without separators.
InputStream is = getNetInputStream(listUrl);
BufferedReader br = new BufferedReader(new InputStreamReader(is, "utf-8"));
String html = "";
for (String line = br.readLine(); line != null; line = br.readLine()) {
    html = html + line;
}
System.out.println(html.length());
is.close();
输出结果为:77135
private String writeUrl = "D:/newfile/new/new";
private String fileName = "test-index.htm";

// Save the downloaded html as writeUrl/fileName, creating the directory first if needed.
File f = new File(writeUrl);
if (!f.exists()) {
    f.mkdirs();
}
File f1 = new File(f, fileName);
// Write with an explicit charset so the saved file does not depend on the
// platform default encoding (same reason utf-8 is specified when reading).
OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(f1), "utf-8");
try {
    osw.write(html);
} finally {
    // close() flushes pending characters; doing it in finally releases the
    // file handle even when write() fails.
    osw.close();
}
解析出htm文件中包含的网址。
结果:
20
https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRvQgUjsVDBncM3mVIgIyIuE87BnlyJUy2BNsAp8kUoTanrC_css5mVAw
https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcThd8cYjOTmCgYJZxX5ls-xpxaAlH1_yocOSCqI5_7OkL29SNtbCZ7q2Yoj
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTl-FzKmsppxuwzmTITGCv9uDxmrWr1pG0lw8mUD9wkWIloASxQeBEMnVjz
https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcQWbmiZJIXKHV2IoTBp7zSY6kD5g5VPzVtBTLJYYR5nwTtKi2-0_u93qL4e
https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSlrLi_GtVgUehU7coFe1eMdrJxPdvS42iTqXkla0g75s31NBfAq2u1LE4
https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSkrlyGxSs8Dr_7k3MUvoGq1vE45LgHZ0zEhIEdD9LLZiaoMcE7IAqn8ho
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTu__OUSJ4R4EKBu4jOi2ZAdHohpVQIBy3-SfnI8FYpN8wVC9kJG_aWuk_w
https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcR3Bf7YtsHJ813A5_wWzpxIy4MbEmqz5NLw3qv1nPxOZqVjH7QlY-qYSCg
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcToB4nJPqVwnzn0xeasnXyhxGgOqHXdypE6KZIMTfV9k52eIrE3iYsA6Ixm
https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTkKw0LpqdB2eQMUpwdQdvM9DTeNtq1mrvMNivoQtN37p3m0OPsx4ME9i4O
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSZGzMf_3hmdDktz91yp5ZQi-eGWLCenZ0U446sXT2nqYuwlWRI_V_BVIWi
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTQF-55T5GM3dLdaoafPdlIYK0ESNvM6-Bsb4-B2rQTeyD5gGoCKxokExM-
https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRoRjo4TFeXmx47zE6VH0ylcO0IQ2HBsOHYIMJCI9MsRyg_PF1WhHbqG76Q
https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcRrdegt1koEy51dLWrJAbVMJBlCEZ7fPl2mztYYM6onvxocRCq030Ft1gE
https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTtnQpte0uq9Ue9nsg25GeO1kw_-Hcn69ozTQkiMBHrXKwlANutyhwKD9XM
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRNRdxzmuFKABoGgyv2SC0gMticosL2LB3V1fBMOwNtVBZxHkyMw4IcWBFj
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQr40CEf75nWCj5dg-oeKtb9zK6mhktu7vnfoYAh5ioy34goC3c9ptDkQwP
https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcQUnyHrVEbppqhZnWnQrijhBFP0X34gRf7pKw6PdT4ggepB2k9g-p71sgGh
https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcR9Us9qblbTJaw47gULXCI8sHKN4I61gYsT2ijebtZzgsMDI8GmYqQpIIw
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSIrW-IbBZjM9Ztn60r9QE1_FIMjt494qGX12tqsLsibYPLuFVwyVSgz1I
用正则表达式更简单:
1 InputStream is = getNetInputStream(listUrl); 2 InputStreamReader isr = new InputStreamReader(is); 3 BufferedReader br = new BufferedReader(isr); 4 String s = ""; 5 6 while ((s = br.readLine()) != null) 7 { 8 Pattern p = Pattern.compile("src=\"https[^\"]+"); 9 Matcher m = p.matcher(s); 10 while(m.find()) { 11 System.out.println(m.group()); 12 } 13 }
1 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRvQgUjsVDBncM3mVIgIyIuE87BnlyJUy2BNsAp8kUoTanrC_css5mVAw 2 src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcThd8cYjOTmCgYJZxX5ls-xpxaAlH1_yocOSCqI5_7OkL29SNtbCZ7q2Yoj 3 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTl-FzKmsppxuwzmTITGCv9uDxmrWr1pG0lw8mUD9wkWIloASxQeBEMnVjz 4 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcQWbmiZJIXKHV2IoTBp7zSY6kD5g5VPzVtBTLJYYR5nwTtKi2-0_u93qL4e 5 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSlrLi_GtVgUehU7coFe1eMdrJxPdvS42iTqXkla0g75s31NBfAq2u1LE4 6 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSkrlyGxSs8Dr_7k3MUvoGq1vE45LgHZ0zEhIEdD9LLZiaoMcE7IAqn8ho 7 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTu__OUSJ4R4EKBu4jOi2ZAdHohpVQIBy3-SfnI8FYpN8wVC9kJG_aWuk_w 8 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcR3Bf7YtsHJ813A5_wWzpxIy4MbEmqz5NLw3qv1nPxOZqVjH7QlY-qYSCg 9 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcToB4nJPqVwnzn0xeasnXyhxGgOqHXdypE6KZIMTfV9k52eIrE3iYsA6Ixm 10 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTkKw0LpqdB2eQMUpwdQdvM9DTeNtq1mrvMNivoQtN37p3m0OPsx4ME9i4O 11 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSZGzMf_3hmdDktz91yp5ZQi-eGWLCenZ0U446sXT2nqYuwlWRI_V_BVIWi 12 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTQF-55T5GM3dLdaoafPdlIYK0ESNvM6-Bsb4-B2rQTeyD5gGoCKxokExM- 13 src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRoRjo4TFeXmx47zE6VH0ylcO0IQ2HBsOHYIMJCI9MsRyg_PF1WhHbqG76Q 14 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcRrdegt1koEy51dLWrJAbVMJBlCEZ7fPl2mztYYM6onvxocRCq030Ft1gE 15 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTtnQpte0uq9Ue9nsg25GeO1kw_-Hcn69ozTQkiMBHrXKwlANutyhwKD9XM 16 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRNRdxzmuFKABoGgyv2SC0gMticosL2LB3V1fBMOwNtVBZxHkyMw4IcWBFj 17 
src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQr40CEf75nWCj5dg-oeKtb9zK6mhktu7vnfoYAh5ioy34goC3c9ptDkQwP 18 src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcQUnyHrVEbppqhZnWnQrijhBFP0X34gRf7pKw6PdT4ggepB2k9g-p71sgGh 19 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcR9Us9qblbTJaw47gULXCI8sHKN4I61gYsT2ijebtZzgsMDI8GmYqQpIIw 20 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSIrW-IbBZjM9Ztn60r9QE1_FIMjt494qGX12tqsLsibYPLuFVwyVSgz1I