搜索网站的多线程爬虫

package GetUrls;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * Multi-threaded crawler for a single web site.
 *
 * <p>{@link #main} fetches {@code http://www.<sitename>.com}, then {@link #getrun()}
 * repeatedly fetches, on a pool of 10 threads, every link that was discovered but not
 * yet visited, until a wave discovers nothing new. For every page fetched successfully,
 * one tab-separated row is appended to {@code ./src/GetUrls/<sitename>report.txt}: the
 * URL followed by one 0/1 flag per entry of {@link #JsName}. Pages that fail are
 * collected in {@link #Error} and written to {@code <sitename>error.txt} at the end.
 *
 * <p>Thread-safety: the public static lists are plain {@code ArrayList}s; inside this
 * class every access to {@code AllUrls}, {@code get}, {@code tmp} and {@code Error} is
 * made while holding a private lock. {@code JsName} is filled by {@code main} before any
 * task is submitted and is only read afterwards.
 */
public class GetIt {
	/** Guards AllUrls, get, tmp and Error, which are shared with the pool threads. */
	private static final Object LOCK = new Object();
	/** Serializes appends to the report file so rows from different threads cannot interleave. */
	private static final Object REPORT_LOCK = new Object();
	/** Link fragments that disqualify a URL from being crawled (rules for the "smarter" site). */
	private static final String[] EXCLUDED_FRAGMENTS = {
		"/pl--", "/se--", "/sd--", "/sz--", "/cl--", "/scripts/"
	};

	/** Every URL that has been handed to {@link #getAll(String)}. */
	public static ArrayList<String> AllUrls=new ArrayList<String>();
	/** Links discovered during the current wave (no duplicates). */
	public static ArrayList<String> get=new ArrayList<String>();
	/** Site key; the crawl is restricted to {@code http://www.<sitename>...}. */
	public static String sitename="smarter";
	public static GetIt a=new GetIt();
	/** Strings looked up in each page's HTML, one per line of the jsname file. */
	public static ArrayList<String> JsName=new ArrayList<String>();
	/** URLs queued for the next wave. */
	public static ArrayList<String> tmp=new ArrayList<String>();
	/** One "url TAB message" entry per page whose processing threw. */
	public static ArrayList<String> Error=new ArrayList<String>();
	/** Pool used by the most recent wave of {@link #getrun()}. */
	public static ExecutorService p = null;

	/**
	 * Loads the JS names, crawls the site and writes the error log.
	 *
	 * @param args unused
	 * @throws IOException if the jsname file cannot be read or the error file cannot be written
	 */
	public static void main(String args[]) throws IOException{
		BufferedReader names = new BufferedReader(new FileReader(new File("./src/GetAllUrljs/jsname")));
		try {
			String line;
			while ((line = names.readLine()) != null) {
				JsName.add(line);
			}
		} finally {
			// The original never closed this reader.
			names.close();
		}

		String url="http://www."+sitename+".com";
		a.getAll(url);
		synchronized (LOCK) {
			for (String found : get) {
				if (!AllUrls.contains(found)) {
					tmp.add(found);
				}
			}
		}
		a.getrun();

		ArrayList<String> failures;
		synchronized (LOCK) {
			failures = new ArrayList<String>(Error);
		}
		if (!failures.isEmpty()) {
			BufferedWriter out = new BufferedWriter(
					new FileWriter(new File("./src/GetUrls/"+sitename+"error.txt"), true));
			try {
				for (String failure : failures) {
					out.write(failure);
					out.newLine();
				}
			} finally {
				out.close();
			}
		}
	}

	/**
	 * Crawls wave after wave: submits one {@code runer} per URL in {@link #tmp} to a fresh
	 * 10-thread pool, waits for the pool to finish, then queues every newly discovered,
	 * not-yet-visited link as the next wave. Returns when a wave discovers nothing new.
	 *
	 * <p>If the calling thread is interrupted while waiting, the pool is shut down
	 * immediately, the interrupt flag is restored and the method returns.
	 */
	public void getrun() {
		boolean more = true;
		// Iterative instead of one recursive call per wave.
		while (more) {
			ArrayList<String> wave;
			synchronized (LOCK) {
				get.clear();
				wave = new ArrayList<String>(tmp);
				tmp.clear();
			}

			p=Executors.newFixedThreadPool(10);
			for (String target : wave) {
				p.execute(new runer(target));
			}
			p.shutdown();
			try {
				// Block until the wave is done; the original spun on isTerminated()
				// while calling shutdown() over and over, burning a core.
				p.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
			} catch (InterruptedException e) {
				p.shutdownNow();
				Thread.currentThread().interrupt();
				return;
			}

			synchronized (LOCK) {
				for (String found : get) {
					if (!AllUrls.contains(found)) {
						tmp.add(found);
					}
				}
				more = !tmp.isEmpty();
			}
		}
	}

	/**
	 * Fetches one page, records its same-site links in {@link #get} and appends its row
	 * to the report file. Any exception raised while doing so is recorded in
	 * {@link #Error} as {@code url + "\t" + message} rather than propagated.
	 *
	 * @param url absolute URL of the page to fetch
	 * @throws IOException declared for callers that already catch it; failures inside
	 *         the fetch/report step are caught and logged to {@link #Error}
	 */
	public void getAll(String url) throws IOException {
		synchronized (LOCK) {
			AllUrls.add(url);
		}
		try{
			Document doc = Jsoup.connect(url).timeout(120000).get();
			Elements links=doc.select("a[href]");

			// Collect locally first so the shared list is locked only once per page.
			ArrayList<String> found = new ArrayList<String>();
			String sitePrefix = "http://www."+sitename;
			for(Element link :links){
				String href=link.attr("href");
				// Link filter for the "smarter" site. (An earlier variant for the "CSUS"
				// site excluded /topic/, /sst/, search.php, .jpg and /track/scripts/.)
				// The length test also skips an empty href, which used to make
				// substring(0, 1) throw and abort the whole page.
				if (href == null || href.length() <= 2 || isExcluded(href)) {
					continue;
				}
				if (href.startsWith("/")) {
					// Site-relative link: make it absolute.
					found.add(sitePrefix+".com"+href);
				} else if (href.indexOf(sitePrefix) >= 0) {
					// ">= 0", not "> 0": an absolute link normally starts with the
					// prefix (index 0), which the original test rejected.
					found.add(href);
				}
			}
			synchronized (LOCK) {
				for (String candidate : found) {
					if (!get.contains(candidate)) {
						get.add(candidate);
					}
				}
			}

			String html=doc.html();
			int station[]=new int[JsName.size()];
			for(int i=0;i<station.length;i++){
				// 1 when the name occurs in the page HTML (past index 0), else 0.
				station[i] = html.indexOf(JsName.get(i))>0 ? 1 : 0;
			}
			a.witer(url,station);
		}catch(Exception e){
			synchronized (LOCK) {
				Error.add(url+"\t"+e.getMessage());
			}
		}
	}

	/** Returns true when {@code href} contains any of {@link #EXCLUDED_FRAGMENTS}. */
	private static boolean isExcluded(String href) {
		for (String fragment : EXCLUDED_FRAGMENTS) {
			if (href.indexOf(fragment) >= 0) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Appends one report row: {@code url}, then every entry of {@code station}, all
	 * separated by tabs. Works for any number of flags; the original indexed exactly
	 * six and threw when the jsname file had fewer lines.
	 *
	 * @param url     page the row describes
	 * @param station one 0/1 flag per JS name
	 * @throws IOException if the report file cannot be opened or written
	 */
	private void witer(String url, int[] station) throws IOException {
		StringBuilder row = new StringBuilder().append(url);
		for (int flag : station) {
			row.append('\t').append(flag);
		}
		// A local writer under a lock: the original shared one static writer between
		// threads, so one thread could close the stream another was still writing to.
		synchronized (REPORT_LOCK) {
			BufferedWriter out = new BufferedWriter(
					new FileWriter(new File("./src/GetUrls/"+sitename+"report.txt"), true));
			try {
				out.write(row.toString());
				out.newLine();
			} finally {
				out.close();
			}
		}
	}
}

  

package GetUrls;

import java.io.IOException;

/**
 * Pool task that crawls a single URL by calling {@code getAll} on a fresh
 * {@code GetIt} instance.
 */
public class runer implements Runnable {
	/** Address this task fetches; assigned by the constructor. */
	String url;

	/**
	 * @param s URL to crawl when the task runs
	 */
	public runer(String s) {
		url = s;
	}

	/**
	 * Crawls {@link #url}. An {@code IOException} from the crawl is printed to
	 * standard error and otherwise ignored, so the task itself never fails.
	 */
	@Override
	public void run() {
		GetIt crawler = new GetIt();
		try {
			crawler.getAll(url);
		} catch (IOException failure) {
			failure.printStackTrace();
		}
	}

}

  

posted on 2013-07-16 17:23  IT Ⅳ  阅读(258)  评论(0)  编辑  收藏  举报

导航