用Java实现网络爬虫
MyCrawler.java
package WebCrawler;

import java.io.File;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Breadth-first web crawler that downloads every reachable page up to a
 * depth limit and stores each one as an {@code .html} file under
 * {@link #SAVEPATH}.
 */
public class MyCrawler {

    private static final String SAVEPATH = "C:" + File.separator + "downloadURL";

    /** Charset used when a page does not declare a usable one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Characters that may not appear in a file name (Windows rules, the strictest common set). */
    private static final Pattern ILLEGAL_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

    /**
     * Crawls breadth-first starting from the given seeds.
     *
     * <p>Each address is fetched at most once. Pages whose depth is greater
     * than {@code depth} are neither fetched nor saved; seeds created with the
     * single-argument {@code URL} constructor start at depth 1.
     *
     * @param urls  seed URLs; {@code null} or empty means nothing to do
     * @param depth maximum depth (inclusive) of pages to download
     */
    public void crawl(ArrayList<URL> urls, int depth) {
        if (urls == null || urls.isEmpty()) {
            return;
        }

        // FileOutputStream does not create missing directories, so make sure
        // the target directory exists before the first page is written.
        new File(SAVEPATH).mkdirs();

        Queue<URL> queue = new LinkedList<URL>();
        // Addresses already queued or fetched. Keyed by the address string so
        // that links extracted from pages (plain strings) can be checked
        // directly, and so that a link is never queued twice.
        Set<String> seen = new HashSet<String>();

        for (URL seed : urls) {
            if (seed != null && seed.toString() != null && seen.add(seed.toString())) {
                queue.add(seed);
            }
        }

        while (!queue.isEmpty()) {
            URL head = queue.poll();
            if (head.getDepth() > depth) {
                // Skip rather than stop: seeds may carry different depths, so
                // a later queue entry can still be within the limit.
                continue;
            }

            String address = head.toString();
            String page = HtmlParserTool.getPage(address);
            if (page == null || page.isEmpty()) {
                // Fetch failed or returned nothing: nothing to save or parse.
                continue;
            }

            // Save to disk.
            HtmlParserTool.writeToDisk(toFilePath(address), page, detectCharset(page));

            int childDepth = head.getDepth() + 1;
            if (childDepth > depth) {
                // Children would be discarded anyway; do not bother queueing them.
                continue;
            }
            for (String link : HtmlParserTool.extractLinks(page)) {
                if (seen.add(link)) {
                    queue.add(new URL(link, childDepth));
                }
            }
        }
    }

    /** Maps a page address to the path of the file it is saved in. */
    private static String toFilePath(String address) {
        String fileName = ILLEGAL_FILE_CHARS.matcher(address).replaceAll("_");
        return SAVEPATH + File.separator + fileName + ".html";
    }

    /**
     * Returns the charset declared by the page, or {@link #DEFAULT_CHARSET}
     * when the page declares none or declares one this JVM cannot encode.
     */
    private static String detectCharset(String page) {
        try {
            String declared = HtmlParserTool.getCharset(page);
            if (declared != null && Charset.isSupported(declared)) {
                return declared;
            }
        } catch (RuntimeException ignored) {
            // Charset sniffing may fail, or yield a syntactically illegal
            // name, when the page has no charset declaration. Fall back to
            // the default instead of aborting the whole crawl.
        }
        return DEFAULT_CHARSET;
    }

    public static void main(String[] args) throws Exception {
        ArrayList<URL> urls = new ArrayList<URL>();
        urls.add(new URL("http://www.baidu.com"));
        new MyCrawler().crawl(urls, 1);
    }
}
HtmlParserTool.java
package WebCrawler;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.visitors.NodeVisitor;

/**
 * Static helpers for the crawler: download a page, sniff its charset,
 * extract its absolute hyperlinks, and write it to disk.
 */
public class HtmlParserTool {

    /**
     * Matches a charset declaration in either form,
     * {@code <meta charset="utf-8">} or
     * {@code content="text/html; charset=utf-8"}.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:-]+)", Pattern.CASE_INSENSITIVE);

    /** Returns whether the string is an absolute http(s) address. */
    private static boolean isValidUrl(String url) {
        return url != null && (url.startsWith("http://") || url.startsWith("https://"));
    }

    /**
     * Extracts the hyperlinks contained in a page.
     *
     * @param content HTML text of the page
     * @return the distinct absolute http(s) links in document order; empty if
     *         the content is blank or cannot be parsed
     */
    public static ArrayList<String> extractLinks(String content) {
        // A LinkedHashSet removes duplicates in O(1) while keeping document order.
        final Set<String> links = new LinkedHashSet<>();
        if (content == null || content.trim().isEmpty()) {
            return new ArrayList<>();
        }
        try {
            Parser parser = new Parser(content);
            parser.visitAllNodesWith(new NodeVisitor() {
                @Override
                public void visitTag(Tag tag) {
                    if (tag instanceof LinkTag) {
                        String target = ((LinkTag) tag).getLink();
                        if (isValidUrl(target)) {
                            links.add(target);
                        }
                    }
                }
            });
        } catch (Exception e) {
            // Best effort: a page that cannot be parsed contributes whatever
            // links were collected before the failure.
            e.printStackTrace();
        }
        return new ArrayList<>(links);
    }

    /**
     * Finds the first charset declaration in the page.
     *
     * @param content HTML text of the page
     * @return the declared charset name (not validated), or {@code null} if
     *         the content is {@code null} or declares no charset
     */
    public static String getCharset(String content) {
        if (content == null) {
            return null;
        }
        Matcher declaration = CHARSET_PATTERN.matcher(content);
        return declaration.find() ? declaration.group(1) : null;
    }

    /**
     * Downloads a page and decodes it with the charset the page declares,
     * falling back to UTF-8 when none is declared or the name is unusable.
     *
     * @param url address of the page
     * @return the page text, or an empty string if the address is invalid or
     *         the download fails
     */
    public static String getPage(String url) {
        if (url == null) {
            return "";
        }
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            if (response.getEntity() == null) {
                return "";
            }
            byte[] body;
            try (InputStream in = response.getEntity().getContent()) {
                body = readFully(in);
            }
            // Sniff the declaration from an ISO-8859-1 view of the raw bytes:
            // that decoding maps every byte to one char and cannot fail, and
            // the declaration itself is plain ASCII. The bytes are then
            // decoded exactly once with the real charset.
            String declared = getCharset(new String(body, StandardCharsets.ISO_8859_1));
            return new String(body, resolveCharset(declared));
        } catch (IOException | IllegalArgumentException e) {
            // IllegalArgumentException: the address is not a valid URI.
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Writes page content to a file, creating missing parent directories.
     *
     * @param path    destination file path
     * @param content text to write
     * @param charset name of the charset to encode with; UTF-8 is used when
     *                it is {@code null}, illegal or unsupported
     */
    public static void writeToDisk(String path, String content, String charset) {
        if (path == null || content == null) {
            System.err.println("writeToDisk: nothing written, path or content is null");
            return;
        }
        File file = new File(path);
        File parent = file.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            System.err.println("writeToDisk: cannot create directory " + parent);
            return;
        }
        try (OutputStream out = new FileOutputStream(file)) {
            out.write(content.getBytes(resolveCharset(charset)));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Reads the stream to its end and returns everything that was read. */
    private static byte[] readFully(InputStream in) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int count;
        while ((count = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, count);
        }
        return buffer.toByteArray();
    }

    /** Looks up a charset by name, falling back to UTF-8 if the name is null, illegal or unsupported. */
    private static Charset resolveCharset(String name) {
        if (name == null) {
            return StandardCharsets.UTF_8;
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
            return StandardCharsets.UTF_8;
        }
    }
}
URL.java
package WebCrawler; public class URL { private String url; private int depth; public URL(String url) { this.url = url; this.depth = 1; } public URL(String url, int depth) { this.url = url; this.depth = depth; } public String toString() { return this.url; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public int getDepth() { return depth; } public void setDepth(int depth) { this.depth = depth; } }