搜索网站的多线程爬虫
package GetUrls;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Multi-threaded crawler for a single web site.
 *
 * <p>Starting from {@code http://www.<sitename>.com}, every fetched page is scanned for
 * {@code a[href]} links. Accepted links are crawled in rounds on a fixed pool of 10 threads
 * until a round discovers no URL that has not been visited yet. For every page one
 * tab-separated row is appended to {@code ./src/GetUrls/<sitename>report.txt}: the URL
 * followed by one 0/1 flag per entry of {@link #JsName}, telling whether that string occurs
 * in the page HTML. Failures are collected in {@link #Error} and appended to
 * {@code ./src/GetUrls/<sitename>error.txt} at the end of the run.
 *
 * <p>Thread-safety: {@link #get}, {@link #AllUrls} and {@link #Error} are written by worker
 * threads; this class guards every access of its own with a private lock. {@link #tmp} is
 * only touched by the thread driving {@link #getrun()}. {@link #JsName} is filled in
 * {@code main} before any worker starts and is only read afterwards.
 */
public class GetIt {

    /** Every URL that has been handed to {@link #getAll(String)} so far. */
    public static ArrayList<String> AllUrls = new ArrayList<String>();

    /** Links discovered during the current crawl round (duplicates removed). */
    public static ArrayList<String> get = new ArrayList<String>();

    /** Site name; the crawled host is {@code www.<sitename>.com}. */
    public static String sitename = "smarter";

    /** Shared instance used by {@code main} and by the crawl loop. */
    public static GetIt a = new GetIt();

    /** Strings looked up in every page's HTML, one per line of the jsname file. */
    public static ArrayList<String> JsName = new ArrayList<String>();

    /** URLs scheduled for the next crawl round. */
    public static ArrayList<String> tmp = new ArrayList<String>();

    /** One entry per failed page: {@code url + "\t" + exception message}. */
    public static ArrayList<String> Error = new ArrayList<String>();

    /** Thread pool of the crawl round that is currently running (or ran last). */
    public static ExecutorService p = null;

    /**
     * Path fragments that disqualify a link from being crawled (rule set for "smarter").
     * An earlier rule set, kept in the original source as commented-out code labelled CSUS,
     * excluded "/topic/", "/sst/", "search.php", ".jpg" and "/track/scripts/" instead.
     */
    private static final String[] EXCLUDED_FRAGMENTS = {
        "/pl--", "/se--", "/sd--", "/sz--", "/cl--", "/scripts/"
    };

    /** Guards {@link #get}, {@link #AllUrls} and {@link #Error}. */
    private static final Object STATE_LOCK = new Object();

    /** Serialises appends to the report file so rows from different workers never interleave. */
    private static final Object REPORT_LOCK = new Object();

    /**
     * Loads the script names, crawls the site and writes the error file if anything failed.
     *
     * @param args unused
     * @throws IOException if the jsname file cannot be read or the error file cannot be written
     */
    public static void main(String args[]) throws IOException {
        File jsNameFile = new File("./src/GetAllUrljs/jsname");
        try (BufferedReader in = new BufferedReader(new FileReader(jsNameFile))) {
            String line;
            while ((line = in.readLine()) != null) {
                JsName.add(line);
            }
        }

        String url = "http://www." + sitename + ".com";
        a.getAll(url);
        for (String found : get) {
            if (!AllUrls.contains(found)) {
                tmp.add(found);
            }
        }
        a.getrun();

        if (!Error.isEmpty()) {
            File errorFile = new File("./src/GetUrls/" + sitename + "error.txt");
            try (BufferedWriter out = new BufferedWriter(new FileWriter(errorFile, true))) {
                for (String entry : Error) {
                    out.write(entry);
                    out.newLine();
                }
            }
        }
    }

    /**
     * Crawls everything queued in {@link #tmp}, round after round, until a round yields no
     * unvisited URL.
     *
     * <p>Each round runs on a fresh fixed pool of 10 threads and blocks until the pool has
     * terminated. The rounds are a loop rather than recursion so that a deep site cannot
     * overflow the stack. If the calling thread is interrupted while waiting, the pool is
     * stopped, the interrupt flag is restored and the method returns early.
     */
    public void getrun() {
        do {
            synchronized (STATE_LOCK) {
                get.clear();
            }

            p = Executors.newFixedThreadPool(10);
            for (String pending : tmp) {
                p.execute(new runer(pending));
            }
            p.shutdown();
            tmp.clear();

            try {
                // Block instead of spinning on isTerminated().
                p.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
            } catch (InterruptedException e) {
                p.shutdownNow();
                Thread.currentThread().interrupt();
                return;
            }

            synchronized (STATE_LOCK) {
                for (String found : get) {
                    if (!AllUrls.contains(found)) {
                        tmp.add(found);
                    }
                }
            }
        } while (!tmp.isEmpty());
    }

    /**
     * Fetches one page, queues its crawlable links in {@link #get} and appends the page's
     * report row.
     *
     * <p>Any failure (network, parsing, file I/O) is recorded in {@link #Error} instead of
     * being propagated.
     *
     * @param url absolute URL of the page to fetch
     * @throws IOException declared for callers that already catch it; failures are recorded
     *         in {@link #Error} rather than thrown
     */
    public void getAll(String url) throws IOException {
        synchronized (STATE_LOCK) {
            AllUrls.add(url);
        }
        try {
            Document doc = Jsoup.connect(url).timeout(120000).get();
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                String target = resolveCrawlTarget(link.attr("href"));
                if (target == null) {
                    continue;
                }
                // contains + add must be atomic, otherwise two workers can queue the same URL.
                synchronized (STATE_LOCK) {
                    if (!get.contains(target)) {
                        get.add(target);
                    }
                }
            }

            String html = doc.html();
            int[] station = new int[JsName.size()];
            for (int i = 0; i < station.length; i++) {
                // "> 0" kept from the original: a blank JsName entry (indexOf == 0) counts as absent.
                station[i] = html.indexOf(JsName.get(i)) > 0 ? 1 : 0;
            }
            a.witer(url, station);
        } catch (Exception e) {
            synchronized (STATE_LOCK) {
                Error.add(url + "\t" + e.getMessage());
            }
        }
    }

    /**
     * Decides whether a link should be crawled and returns it in absolute form.
     *
     * <p>A link is accepted when it is longer than two characters, contains none of the
     * {@link #EXCLUDED_FRAGMENTS}, and either starts with "/" (site-relative, gets the site
     * root prepended) or contains {@code http://www.<sitename>}.
     *
     * @param href raw {@code href} attribute value; may be null or empty
     * @return the absolute URL to crawl, or {@code null} if the link must be skipped
     */
    private static String resolveCrawlTarget(String href) {
        if (href == null || href.length() <= 2 || isExcluded(href)) {
            return null;
        }
        // startsWith is safe on any length; the original substring(0, 1) threw on an empty href.
        if (href.startsWith("/")) {
            return "http://www." + sitename + ".com" + href;
        }
        // ">= 0", not "> 0": an absolute URL has the prefix at index 0.
        if (href.indexOf("http://www." + sitename) >= 0) {
            return href;
        }
        return null;
    }

    /** Returns true if the link contains any of the excluded path fragments. */
    private static boolean isExcluded(String href) {
        for (String fragment : EXCLUDED_FRAGMENTS) {
            if (href.indexOf(fragment) >= 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends one row to {@code ./src/GetUrls/<sitename>report.txt}: the URL followed by
     * every flag, tab-separated.
     *
     * <p>All flags are written, however many there are. The original wrote exactly six
     * columns and failed when fewer were available.
     *
     * @param url page URL, written as the first column
     * @param station one 0/1 flag per {@link #JsName} entry
     * @throws IOException if the report file cannot be opened or written
     */
    private void witer(String url, int[] station) throws IOException {
        StringBuilder row = new StringBuilder(url);
        for (int flag : station) {
            row.append('\t').append(flag);
        }

        File reportFile = new File("./src/GetUrls/" + sitename + "report.txt");
        synchronized (REPORT_LOCK) {
            try (BufferedWriter out = new BufferedWriter(new FileWriter(reportFile, true))) {
                out.write(row.toString());
                out.newLine();
            }
        }
    }
}
package GetUrls;

import java.io.IOException;

/**
 * Runnable task that crawls a single URL by delegating to {@code GetIt.getAll}.
 * Each task works through its own freshly created {@code GetIt} instance.
 */
public class runer implements Runnable {

    /** URL this task will crawl when it runs. */
    String url = null;

    /**
     * Creates a task for one URL.
     *
     * @param s the URL to crawl
     */
    public runer(String s) {
        url = s;
    }

    /** Crawls the URL; an {@link IOException} from the crawl is printed, not rethrown. */
    @Override
    public void run() {
        try {
            new GetIt().getAll(url);
        } catch (IOException failure) {
            failure.printStackTrace();
        }
    }
}