Requirement: download every PDF file linked from http://www3.tjcu.edu.cn/wangshangketang/yuanneike/guanlixue/sjxz.htm

The code is as follows:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnPerRouteBean;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class Crawler implements Runnable {
    public static String SAVE = "C:/Users/Administrator/Downloads"; // local directory the downloads are saved to
    private String url = ""; // address of the page to crawl

    public Crawler(String url) {
        this.url = url;
    }

    public Crawler() {}

    /**
     * @param url address of the page to crawl
     * @return the content of that page
     * @throws ClientProtocolException
     * @throws IOException
     */
    private String crawl(String url) throws ClientProtocolException, IOException {
        System.out.println("[INFO] Crawl From : " + url);
        HttpClient httpClient = new DefaultHttpClient();
        HttpGet httpGet = new HttpGet(url);
        HttpResponse httpResponse = httpClient.execute(httpGet);
        HttpEntity httpEntity = httpResponse.getEntity();
        InputStream inStream = httpEntity.getContent();
        StringBuilder content = new StringBuilder();
        byte[] bytes = new byte[1024 * 1000];
        int k;
        while ((k = inStream.read(bytes)) >= 0) { // read the whole page into memory
            content.append(new String(bytes, 0, k));
        }
        inStream.close();
        System.out.println(content);
        System.out.println("=========================================================================================");
        return content.toString();
    }

    public void run() {
        try {
            String prefix = this.url.substring(0, this.url.lastIndexOf("/"));
            String content = this.crawl(this.url); // fetch the page content
            Parser parser = new Parser(content);   // parse the page with HTMLParser
            NodeFilter filter;
            NodeList list;
            filter = new NodeClassFilter(LinkTag.class);
            filter = new AndFilter(filter, new NodeFilter() {
                public boolean accept(Node node) {
                    return ((LinkTag) node).isHTTPLink();
                }
            });
            list = parser.extractAllNodesThatMatch(filter);
            List<String> urlsList = new ArrayList<String>();
            for (int i = 0; i < list.size(); i++) {
                String[] array = list.elementAt(i).getText().split("\"");
                if (array.length > 1 && (array[1].endsWith(".pdf") || array[1].endsWith(".PDF"))) { // only download PDFs
                    String downloadUrl = prefix + "/" + array[1];
                    urlsList.add(downloadUrl); // build the list of addresses to download
                }
            }
            // From here on the files are downloaded, using one thread per request
            HttpParams params = new BasicHttpParams();
            //ConnManagerParams.setTimeout(params, 60000 * 3); // maximum wait for a connection
            ConnManagerParams.setMaxConnectionsPerRoute(params, new ConnPerRouteBean(50)); // concurrent connections per route
            //HttpConnectionParams.setConnectionTimeout(params, 60000 * 2); // connect timeout
            HttpConnectionParams.setSoTimeout(params, 60000 * 10); // socket read timeout

            SchemeRegistry schemeRegistry = new SchemeRegistry();
            schemeRegistry.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
            schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));
            ThreadSafeClientConnManager cm = new ThreadSafeClientConnManager(params, schemeRegistry);

            HttpClient httpClient = new DefaultHttpClient(cm, params);
            Thread[] threads = new Thread[urlsList.size()];
            int n = 0;
            for (String url : urlsList) {
                String path = Crawler.SAVE + url.substring(url.lastIndexOf("/"), url.length());
                // URL-encode the file name part so that non-ASCII names survive the request
                url = url.substring(0, url.lastIndexOf("/")) + "/"
                        + URLEncoder.encode(url.substring(url.lastIndexOf("/") + 1, url.length()), "UTF-8");
                HttpGet httpGet = new HttpGet(url);
                threads[n] = new Thread(new Downloader(httpClient, httpGet, url, path));
                n++;
            }
            for (Thread thread : threads) thread.start();
            for (Thread thread : threads) if (thread.isAlive()) thread.join();
        } catch (InterruptedException e) {
            System.out.println("[ERROR] Download InterruptedException : " + e.toString());
            //e.printStackTrace();
        } catch (ParserException e) {
            System.out.println("[ERROR] Parse ParserException : " + e.toString());
            //e.printStackTrace();
        } catch (ClientProtocolException e) {
            System.out.println("[ERROR] Crawl ClientProtocolException : " + e.toString());
            //e.printStackTrace();
        } catch (IOException e) {
            System.out.println("[ERROR] Crawl IOException : " + e.toString());
            //e.printStackTrace();
        }
    }
    public static void main(String[] args) {
        // entry point: set the page address to crawl here
        Crawler crawler = new Crawler("http://www3.tjcu.edu.cn/wangshangketang/yuanneike/guanlixue/sjxz.htm");
        Thread thread = new Thread(crawler);
        thread.start();
    }
}

// The Downloader class does the actual work of writing the network data to a file
class Downloader implements Runnable {
    private String url = "";
    private String path = "";
    private final HttpClient httpClient;
    private final HttpContext httpContext;
    private final HttpGet httpGet;

    /**
     * @param httpClient the HttpClient shared by the download threads
     * @param httpGet    the request for the resource to download
     * @param url        network address of the resource
     * @param path       local path the resource is saved to
     */
    public Downloader(HttpClient httpClient, HttpGet httpGet, String url, String path) {
        this.httpClient = httpClient;
        this.httpGet = httpGet;
        this.httpContext = new BasicHttpContext();
        this.path = path;
        this.url = url;
    }

    public void run() {
        System.out.println("[INFO] Download From : " + this.url);
        File file = new File(this.path);
        if (file.exists()) file.delete();
        try {
            // create the local file the data is written to
            file.createNewFile();
            FileOutputStream outStream = new FileOutputStream(this.path);

            // execute the request and get the response
            HttpResponse httpResponse = this.httpClient.execute(this.httpGet, this.httpContext);

            System.out.println("[STATUS] Download : " + httpResponse.getStatusLine() + " [FROM] " + this.path);

            HttpEntity httpEntity = httpResponse.getEntity();
            InputStream inStream = httpEntity.getContent();
            byte[] bytes = new byte[1024 * 1000];
            int k;
            while ((k = inStream.read(bytes)) >= 0) { // copy the network stream into the local file
                outStream.write(bytes, 0, k);
                outStream.flush();
            }
            inStream.close();
            outStream.close();
        } catch (IOException e) {
            this.httpGet.abort();
            System.out.println("[ERROR] Download IOException : " + e.toString() + " [FROM] : " + this.path);
            //e.printStackTrace();
        }
    }
}
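
As a side note, the Thread[] bookkeeping at the end of run() can be written more compactly with an ExecutorService from java.util.concurrent. The fragment below is only a sketch of that alternative, not part of the original program: it assumes the urlsList, httpClient, and Downloader pieces built above, the pool size of 5 is an arbitrary choice, and it needs additional imports for ExecutorService, Executors, and TimeUnit. Dropped into run()'s try block in place of the per-URL loop and the start/join loops, the existing catch clauses already cover the checked exceptions it can throw.

// Sketch only: a fixed-size thread pool instead of one manually managed Thread per URL.
// Assumes urlsList and httpClient from run() above; the pool size of 5 is arbitrary.
ExecutorService pool = Executors.newFixedThreadPool(5);
for (String url : urlsList) {
    String path = Crawler.SAVE + url.substring(url.lastIndexOf("/"));
    String encoded = url.substring(0, url.lastIndexOf("/")) + "/"
            + URLEncoder.encode(url.substring(url.lastIndexOf("/") + 1), "UTF-8");
    pool.submit(new Downloader(httpClient, new HttpGet(encoded), encoded, path));
}
pool.shutdown();                          // stop accepting new tasks
pool.awaitTermination(1, TimeUnit.HOURS); // wait for all downloads to finish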


 

posted on 2011-11-25 16:37 by Yakov