Java Basics: 9.4 Web Crawler
Automatically traverse the Web by following hyperlinks. The program below keeps two lists: pending URLs waiting to be visited and traversed URLs already visited. It repeatedly takes the first pending URL, prints it, and queues every unseen link found on that page, stopping after 100 pages (a leaner variant using a set and a queue is sketched after the listing).
package day11;

import java.util.ArrayList;
import java.util.Scanner;

public class WebCrawler {
    public static void main(String[] args) {
        Scanner input = new Scanner(System.in);
        System.out.print("Enter a URL (such as http://www.xxxx.com): ");
        String url = input.nextLine();
        crawler(url);
    }

    /** Crawl the Web starting from startingURL, visiting at most 100 pages. */
    public static void crawler(String startingURL) {
        ArrayList<String> listOfPendingURLs = new ArrayList<>();
        ArrayList<String> listOfTraversedURLs = new ArrayList<>();

        listOfPendingURLs.add(startingURL);
        int i = 0;
        // isEmpty() returns true when no pending URLs are left
        while (!listOfPendingURLs.isEmpty()
                && listOfTraversedURLs.size() < 100) {
            String urlString = listOfPendingURLs.remove(0); // take the first pending URL
            if (!listOfTraversedURLs.contains(urlString)) {
                listOfTraversedURLs.add(urlString);
                System.out.println("Crawl " + ++i + " " + urlString);

                // queue every link on this page that has not been visited yet
                for (String s : getSubURLs(urlString)) {
                    if (!listOfTraversedURLs.contains(s))
                        listOfPendingURLs.add(s);
                }
            }
        }
    }

    /** Return the URLs found in the page at urlString: links that start with
        "http:" and end at a double quote, as they appear in HTML attributes. */
    public static ArrayList<String> getSubURLs(String urlString) {
        ArrayList<String> list = new ArrayList<>();
        try {
            java.net.URL url = new java.net.URL(urlString);
            Scanner input = new Scanner(url.openStream());
            while (input.hasNext()) {
                String line = input.nextLine();
                int current = line.indexOf("http:"); // restart the search on every line
                while (current >= 0) { // >= 0, so a link at index 0 is not missed
                    int endIndex = line.indexOf("\"", current);
                    if (endIndex > 0) { // the closing quote marks the end of the URL
                        list.add(line.substring(current, endIndex));
                        current = line.indexOf("http:", endIndex);
                    }
                    else
                        current = -1; // no closing quote on this line: stop
                }
            }
        }
        catch (Exception ex) {
            System.out.println("Error: " + ex.getMessage());
        }
        return list;
    }
}
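A note on efficiency: ArrayList.contains is a linear scan, so the visited check above slows down as the crawl grows. Below is a minimal sketch of the same traversal using a HashSet for visited URLs and an ArrayDeque as the pending queue; the class name WebCrawler2 is illustrative, and it assumes the WebCrawler class above (with its getSubURLs method) is compiled in the same day11 package.

package day11;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

public class WebCrawler2 {
    public static void crawler(String startingURL) {
        Deque<String> pending = new ArrayDeque<>(); // FIFO queue of URLs to visit
        Set<String> traversed = new HashSet<>();    // O(1) contains/add, unlike ArrayList
        pending.add(startingURL);
        int i = 0;
        while (!pending.isEmpty() && traversed.size() < 100) {
            String urlString = pending.poll(); // take the next pending URL
            if (traversed.add(urlString)) {    // add() returns false if already visited
                System.out.println("Crawl " + ++i + " " + urlString);
                // reuse the link extractor from the listing above
                for (String s : WebCrawler.getSubURLs(urlString)) {
                    if (!traversed.contains(s))
                        pending.add(s);
                }
            }
        }
    }
}

Because new links are appended to the tail of the queue and taken from the head, this is a breadth-first traversal, just like the original; switching to push/pop on the Deque would make it depth-first with no other changes.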