代码改变世界

【搜索引擎Jediael开发笔记2】使用HttpClient下载网页至本地文件

2014-05-19 15:07  jediael  阅读(131)  评论(0编辑  收藏  举报

本文使用HttpClient根据url进行网页下载。其中

(1)HttpClient的相关知识请参见 HttpClient基础教程

(2)


package org.ljh.search.downloadpage;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.Scanner;

import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

//本类用于将指定url对应的网页下载至本地一个文件。
public class PageDownloader {

	public static void downloadPageByGetMethod(String url) throws IOException {

		// 1、通过HttpGet获取到response对象
		CloseableHttpClient httpClient = HttpClients.createDefault();
		// 注意,必需要加上http://的前缀,否则会报:Target host is null异常。
		HttpGet httpGet = new HttpGet(url);
		CloseableHttpResponse response = httpClient.execute(httpGet);

		InputStream is = null;
		if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
			try {
				// 2、获取response的entity。
				HttpEntity entity = response.getEntity();

				// 3、获取到InputStream对象,并对内容进行处理
				is = entity.getContent();

				String fileName = getFileName(url);
				saveToFile("D:\\tmp\\", fileName, is);
			} catch (ClientProtocolException e) {
				e.printStackTrace();
			} finally {

				if (is != null) {
					is.close();
				}
				if (response != null) {
					response.close();
				}
			}
		}
	}

	//将输入流中的内容输出到path指定的路径,fileName指定的文件名
	private static void saveToFile(String path, String fileName, InputStream is) {
		Scanner sc = new Scanner(is);
		Writer os = null;
		try {
			os = new PrintWriter(path + fileName);
			while (sc.hasNext()) {
				os.write(sc.nextLine());
			}
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (sc != null) {
				sc.close();
			}
			if (os != null) {
				try{
				os.flush();
				os.close();
				}catch(IOException e){
					e.printStackTrace();
					System.out.println("输出流关闭失败!");
				}
			}
		}
	}

	// 将url中的特殊字符用下划线代替
	private static String getFileName(String url) {
		url = url.substring(7);
		String fileName = url.replaceAll("[\\?:*|<>\"/]", "_") + ".html";
		return fileName;
	}

}