网页抓取
// 根据书名获取相关图书的 ISBN 号:先抓取豆瓣读书的搜索结果页,得到每本书的详情页地址,再逐个抓取详情页并提取 ISBN。
package cn.edu.xmu.zgy;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;

/**
 * Minimal page scraper: downloads a page over HTTP and cuts out the text
 * segments that lie between a begin marker and an end marker.
 *
 * <p>{@link #main(String[])} uses it to look up a book title on Douban's book
 * search page, follow every result link and print the ISBN found on each
 * detail page.
 */
public class WebpageCapture {

    /**
     * Downloads {@code urls} and returns every segment that starts at an
     * occurrence of {@code begins} and stops right before the next occurrence
     * of {@code ends}.
     *
     * <p>Each returned segment still starts with the begin marker itself; the
     * end marker is not included. Scanning resumes directly after the end
     * marker, so adjacent segments are all found. Line breaks of the page are
     * dropped before matching, so markers never contain a newline.
     *
     * @param urls   address of the page to download (must be an HTTP(S) URL)
     * @param begins marker that opens a segment; must not be {@code null}
     * @param ends   marker that closes a segment; must not be {@code null} or empty
     * @return the segments in page order; empty when nothing matches
     * @throws IllegalArgumentException if a marker is {@code null} or {@code ends} is empty
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public ArrayList<String> captureHtml(String urls, String begins, String ends) throws Exception {
        if (begins == null || ends == null || ends.isEmpty()) {
            // An empty end marker would match at the segment start forever.
            throw new IllegalArgumentException("begins must be non-null and ends must be non-empty");
        }
        return extractSegments(fetchPage(urls), begins, ends);
    }

    /**
     * Downloads {@code http://www.kiees.cn/sf.php?wen=<postid>&channel=} and
     * prints the raw response to standard output.
     *
     * @param postid value placed in the {@code wen} query parameter
     * @throws Exception if the page cannot be read
     */
    public void captureJavascript(String postid) throws Exception {
        String strURL = "http://www.kiees.cn/sf.php?wen=" + postid + "&channel=";
        System.out.println("captureJavascript()的结果:\n" + fetchPage(strURL));
    }

    /** Reads the whole response body of {@code strURL} as UTF-8, without line breaks. */
    private static String fetchPage(String strURL) throws IOException {
        HttpURLConnection httpConn = (HttpURLConnection) new URL(strURL).openConnection();
        try (BufferedReader bufReader = new BufferedReader(
                new InputStreamReader(httpConn.getInputStream(), StandardCharsets.UTF_8))) {
            StringBuilder contentBuf = new StringBuilder();
            String line;
            while ((line = bufReader.readLine()) != null) {
                contentBuf.append(line);
            }
            return contentBuf.toString();
        } finally {
            // Release the underlying socket even when reading fails.
            httpConn.disconnect();
        }
    }

    /** Collects every {@code begins}...{@code ends} segment of {@code content}, begin marker included. */
    private static ArrayList<String> extractSegments(String content, String begins, String ends) {
        ArrayList<String> segments = new ArrayList<String>();
        int from = 0;
        while (true) {
            int beginIx = content.indexOf(begins, from);
            if (beginIx == -1) {
                break;
            }
            int endIx = content.indexOf(ends, beginIx);
            if (endIx == -1) {
                break;
            }
            segments.add(content.substring(beginIx, endIx));
            // Skip exactly the end marker; a fixed offset could run past the
            // end of the page or jump over the next segment.
            from = endIx + ends.length();
        }
        return segments;
    }

    /**
     * Demo: searches Douban for a fixed book title, prints the detail-page URL
     * of every hit and then the ISBN extracted from each detail page.
     */
    public static void main(String[] args) {
        WebpageCapture demo = new WebpageCapture();
        try {
            String ip = "算法导论";
            // The title is non-ASCII, so it must be percent-encoded in the query string.
            String strURL = "http://book.douban.com/subject_search?search_text="
                    + URLEncoder.encode(ip, "UTF-8") + "&cat=1001";
            String begin = "a class=\"nbg\" href=\"http://book.";
            // NOTE(review): marker value kept as published (two spaces before
            // "onclick"); confirm against the current search-page markup.
            String end = "/\" " + " onclick=\"";

            ArrayList<String> detailUrls = demo.captureHtml(strURL, begin, end);
            for (int i = 0; i < detailUrls.size(); i++) {
                String hit = detailUrls.get(i);
                // Drop the leading 'a class="nbg" href="' part, keep the URL.
                detailUrls.set(i, hit.substring(hit.indexOf("http:")));
                System.out.println(detailUrls.get(i));
            }

            begin = "ISBN:</span> ";
            end = "<br/>";
            for (String detailUrl : detailUrls) {
                ArrayList<String> isbnHits = demo.captureHtml(detailUrl, begin, end);
                if (isbnHits.isEmpty()) {
                    // A page without an ISBN must not abort the remaining lookups.
                    System.err.println("No ISBN found at " + detailUrl);
                    continue;
                }
                // Strip the begin marker that captureHtml leaves in front of the value.
                System.out.println(isbnHits.get(0).substring(begin.length()));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
参考链接:http://blog.csdn.net/zgyulongfei/article/details/7909006
网页图片下载:http://blog.csdn.net/oyzl68/article/details/9706373
代码很好
附带一个看不懂的抓取:http://www.oschina.net/code/snippet_1021353_35133?p=1