我的第一个web spider,在处理链接的时候有点小问题

import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A minimal breadth-first web spider: seeds a frontier of URLs, downloads
 * each page, extracts anchor links, and saves every fetched page to a
 * timestamp-named text file in the working directory.
 */
public class Crawler {

	// Compiled once (not per call). The original char class was [\"|>],
	// which wrongly treated '|' as a literal character to match.
	private static final Pattern LINK_PATTERN = Pattern.compile(
			"<a\\s+href\\s*=\\s*\"?(.*?)[\">]", Pattern.CASE_INSENSITIVE);

	// URLs discovered but not yet fetched (FIFO frontier).
	ArrayList<String> unVisitedURL = new ArrayList<>();
	// URLs already fetched, to avoid re-downloading.
	HashSet<String> VisitedURL = new HashSet<>();

	/**
	 * Downloads the page at {@code url} and returns its contents.
	 *
	 * @param url absolute URL of the page to fetch
	 * @return the page body (with line separators preserved), or
	 *         {@code null} if the download fails for any reason
	 */
	public String downloadURL(String url) {
		try {
			URL pageurl = new URL(url);
			// try-with-resources: the original leaked the reader on every
			// call. Charset is pinned to UTF-8 instead of the platform default.
			try (BufferedReader reader = new BufferedReader(new InputStreamReader(
					pageurl.openStream(), StandardCharsets.UTF_8))) {
				StringBuilder pageBuffer = new StringBuilder();
				String line;
				while ((line = reader.readLine()) != null) {
					// The original dropped line separators entirely.
					pageBuffer.append(line).append('\n');
				}
				return pageBuffer.toString();
			}
		} catch (Exception e) {
			System.out.println("Fail to download the page named " + url);
		}
		return null;
	}

	/**
	 * Extracts anchor hrefs from page source {@code s} and queues any new
	 * absolute http(s) links onto the unvisited frontier.
	 *
	 * @param s   HTML source to scan; ignored when {@code null}
	 * @param url base URL of the page, used to resolve root-relative links
	 */
	private void get_reexp(String s, String url) {
		if (s == null) {
			return; // download failed; nothing to scan
		}
		Matcher m = LINK_PATTERN.matcher(s);
		while (m.find()) {
			String link = m.group(1).trim();
			// Skip fragments, empty hrefs, mailto: and javascript: pseudo-links.
			if (link.isEmpty() || link.equals("#")
					|| link.contains("mailto") || link.contains("javascript")) {
				continue;
			}
			// Resolve root-relative links against the page URL. In the
			// original this branch was unreachable: it ran only after the
			// link was already required to start with "http:".
			if (link.startsWith("/")) {
				link = url + link;
			}
			// Equality checks via equals/contains — the original compared
			// strings with != (reference identity), which never fires.
			if (link.startsWith("http") && !VisitedURL.contains(link)
					&& !unVisitedURL.contains(link)) {
				unVisitedURL.add(link);
			}
		}
	}

	/**
	 * Writes {@code Content} to a file named by the current time in
	 * milliseconds, and reports the page size.
	 *
	 * @param Content page body to save (must not be null)
	 * @param url     source URL, used only for the log message
	 */
	private void savePage(String Content, String url) {
		// try-with-resources closes (and therefore flushes) the writer; the
		// original never closed it, so saved files could be left empty.
		try (FileWriter bw = new FileWriter(new File(System.currentTimeMillis() + ".txt"))) {
			bw.write(Content);
			System.out.println(url + "的大小是:" + Content.length());
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Crawls breadth-first from the seeded frontier until no unvisited URLs
	 * remain. The original recursed unconditionally, which both overflowed
	 * the stack on long crawls and threw NoSuchElementException once the
	 * frontier emptied; this version iterates and terminates cleanly.
	 */
	public void graspWeb() {
		while (!unVisitedURL.isEmpty()) {
			String url = unVisitedURL.remove(0);
			if (VisitedURL.contains(url)) {
				continue; // already fetched via another path
			}
			String content = downloadURL(url);
			if (content != null) {
				// Guard: the original NPE'd here when the download failed.
				get_reexp(content, url);
				savePage(content, url);
			}
			// Mark visited even on failure so we don't retry forever.
			VisitedURL.add(url);
		}
	}

	public static void main(String[] args) throws Exception {
		Crawler myCrawler = new Crawler();
		// Seed the frontier with a single start page.
		myCrawler.unVisitedURL.add("http://sina.cn");
		myCrawler.graspWeb();
	}

}

 

posted on 2012-03-03 16:49  _Clarence  阅读(127)  评论(0编辑  收藏  举报

导航