需求:从网址http://www3.tjcu.edu.cn/wangshangketang/yuanneike/guanlixue/sjxz.htm上下载所有的pdf文件

代码如下:

  1 import java.io.File;
2 import java.io.FileOutputStream;
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.URLEncoder;
6 import java.util.ArrayList;
7 import java.util.List;
8 import java.util.Timer;
9 import java.util.TimerTask;
10
11 import org.apache.http.HttpEntity;
12 import org.apache.http.HttpResponse;
13 import org.apache.http.client.ClientProtocolException;
14 import org.apache.http.client.HttpClient;
15 import org.apache.http.client.methods.HttpGet;
16 import org.apache.http.conn.ClientConnectionManager;
17 import org.apache.http.conn.params.ConnManagerParams;
18 import org.apache.http.conn.params.ConnPerRouteBean;
19 import org.apache.http.conn.scheme.PlainSocketFactory;
20 import org.apache.http.conn.scheme.Scheme;
21 import org.apache.http.conn.scheme.SchemeRegistry;
22 import org.apache.http.conn.ssl.SSLSocketFactory;
23 import org.apache.http.impl.client.DefaultHttpClient;
24 import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
25 import org.apache.http.params.BasicHttpParams;
26 import org.apache.http.params.HttpConnectionParams;
27 import org.apache.http.params.HttpParams;
28 import org.apache.http.protocol.BasicHttpContext;
29 import org.apache.http.protocol.HttpContext;
30
31 import org.htmlparser.Node;
32 import org.htmlparser.NodeFilter;
33 import org.htmlparser.Parser;
34 import org.htmlparser.filters.AndFilter;
35 import org.htmlparser.filters.NodeClassFilter;
36 import org.htmlparser.tags.LinkTag;
37 import org.htmlparser.util.NodeList;
38 import org.htmlparser.util.ParserException;
39
40 public class Crawler implements Runnable{
41 public static String SAVE="C:/Users/Administrator/Downloads";//下载保存路径
42 private String url="";//要抓取的网页地址
43 public Crawler(String url){
44 this.url=url;
45 }
46 public Crawler(){}
47 /**
48 *
49 * @param url 要抓取的网页的地址
50 * @return 这个对应的内容
51 * @throws ClientProtocolException
52 * @throws IOException
53 */
54 private String crawl(String url) throws ClientProtocolException, IOException{
55 System.out.println("[INFO] Crawl From : "+url);
56 HttpClient httpClient = new DefaultHttpClient();
57 HttpGet httpGet=new HttpGet(url);
58 HttpResponse httpResponse = httpClient.execute(httpGet);
59 HttpEntity httpEntity=httpResponse.getEntity();
60 InputStream inStream=httpEntity.getContent();
61 String content="";
62 while(true){
63 byte[] bytes=new byte[1024*1000];
64 int k=inStream.read(bytes);
65 if(k>=0)content=content+new String(bytes,0,k);
66 else break;
67 System.out.println(content);
68 System.out.println("=========================================================================================");
69 }
70 return content;
71 }
72
73 public void run(){
74 try {
75 String prefix=this.url.substring(0,this.url.lastIndexOf("/"));
76 String content=this.crawl(this.url);//抓取网页内容
77 Parser parser=new Parser(content); //使用HTMLParser对网页内容进行解析
78 NodeFilter filter;
79 NodeList list;
80 filter=new NodeClassFilter(LinkTag.class);
81 filter=new AndFilter(filter,new NodeFilter(){
82 public boolean accept(Node node) {
83 return ((LinkTag)node).isHTTPLink();
84 }});
85 list=parser.extractAllNodesThatMatch(filter);
86 List<String> urlsList =new ArrayList<String>();
87 for(int i=0;i<list.size();i++){
88 String[] array=list.elementAt(i).getText().split("\"");
89 if(array[1].endsWith(".pdf")||array[1].endsWith(".PDF")){//只下载pdf
90 String downloadUrl=new String(prefix+"/"+array[1]);
91 urlsList.add(downloadUrl);//生成需要下载的地址
92 }
93 }
94 //从这里开始是进行下载,使用了多线程执行请求
95 HttpParams params=new BasicHttpParams();
96 //ConnManagerParams.setTimeout(params, 60000*3); //设置连接最大等待时间
97 ConnManagerParams.setMaxConnectionsPerRoute(params, new ConnPerRouteBean(50));//设置并发数
98 //HttpConnectionParams.setConnectionTimeout(params, 60000*2); //设置连接超时时间
99 HttpConnectionParams.setSoTimeout(params, 60000*10);//设置读取超时时间
100
101 SchemeRegistry schemeRegistry=new SchemeRegistry();
102 schemeRegistry.register(new Scheme("http",PlainSocketFactory.getSocketFactory(),80));
103 schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));
104 ThreadSafeClientConnManager cm=new ThreadSafeClientConnManager(params,schemeRegistry);
105
106 HttpClient httpClient=new DefaultHttpClient(cm,params);
107 Thread[] threads=new Thread[urlsList.size()];
108 int n=0;
109 for(String url:urlsList){
110 String path=Crawler.SAVE+url.substring(url.lastIndexOf("/"), url.length());
111 url=url.substring(0, url.lastIndexOf("/"))+"/"+URLEncoder.encode(url.substring(url.lastIndexOf("/")+1,url.length()),"UTF-8");
112 HttpGet httpGet=new HttpGet(url);
113 threads[n]=new Thread(new Downloader(httpClient,httpGet,url,path));
114 n++;
115 }
116 for(Thread thread:threads)thread.start();
117 for(Thread thread:threads)if(thread.isAlive())thread.join();
118 }catch (InterruptedException e) {
119 System.out.println("[ERROR] Download InterruptedException : "+e.toString());
120 //e.printStackTrace();
121 } catch (ParserException e) {
122 System.out.println("[ERROR] Parse ParserException : "+e.toString());
123 //e.printStackTrace();
124 }catch (ClientProtocolException e) {
125 System.out.println("[ERROR] Crawl ClientProtocolException : "+e.toString());
126 //e.printStackTrace();
127 } catch (IOException e) {
128 System.out.println("[ERROR] Crawl IOException : "+e.toString());
129 //e.printStackTrace();
130 }
131 }
132 public static void main(String[] args) {
133 //入口程序
134 Crawler crawler=new Crawler("http://www3.tjcu.edu.cn/wangshangketang/yuanneike/guanlixue/sjxz.htm");//这里设定网页地址
135 Thread thread=new Thread(crawler);
136 thread.start();
137
138 }
139
140 }
141
142 //类Downloader真正的执行了写入网络数据到文件的步骤
143 class Downloader implements Runnable{
144 private String url="";
145 private String path="";
146 private final HttpClient httpClient;
147 private final HttpContext httpContext;
148 private final HttpGet httpGet;
149 /**
150 *
151 * @param httpClient 多个线程共享的HtppClient
152 * @param httpGet 要下载的HttpGet
153 * @param url 资源网络地址
154 * @param path 资源下载之后本地的保存路径
155 */
156 public Downloader(HttpClient httpClient,HttpGet httpGet,String url,String path){
157 this.httpClient=httpClient;
158 this.httpGet=httpGet;
159 this.httpContext=new BasicHttpContext();
160 this.path=path;
161 this.url=url;
162
163 }
164
165 public void run() {
166 System.out.println("[INFO] Download From : "+this.url);
167 File file=new File(this.path);
168 if(file.exists())file.delete();
169 try {
170 //使用file来写入本地数据
171 file.createNewFile();
172 FileOutputStream outStream = new FileOutputStream(this.path);
173
174 //执行请求,获得响应
175 HttpResponse httpResponse = this.httpClient.execute(this.httpGet,this.httpContext);
176
177 System.out.println("[STATUS] Download : "+httpResponse.getStatusLine()+" [FROM] "+this.path);
178
179 HttpEntity httpEntity=httpResponse.getEntity();
180 InputStream inStream=httpEntity.getContent();
181 while(true){//这个循环读取网络数据,写入本地文件
182 byte[] bytes=new byte[1024*1000];
183 int k=inStream.read(bytes);
184 if(k>=0){
185 outStream.write(bytes,0,k);
186 outStream.flush();
187 }
188 else break;
189 }
190 inStream.close();
191 outStream.close();
192 } catch (IOException e){
193 this.httpGet.abort();
194 System.out.println("[ERROR] Download IOException : "+e.toString()+" [FROM] : "+this.path);
195 //e.printStackTrace();
196 }
197 }
198
199 }


 

posted on 2011-11-25 16:37  Yakov  阅读(8781)  评论(2)  编辑  收藏  举报