Crawler: filtering out irrelevant pages and looping through an entire site

Related techniques: everything from the previous post, plus a queue.

Approach: add the root site to a queue, fetch and parse it with HttpClient to get the links it contains, and check whether each link is valid. If it is valid, check whether it is a target: if it is, handle it accordingly; if not, add it to the queue and parse the queue again. Repeating this process crawls the entire site.
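
Before the full code, here is a minimal sketch of that queue-driven loop. The CrawlLoopSketch class name is made up; it fetches pages with Jsoup's connect() instead of HttpClient, and it follows only links that resolve to sub-directories of the current page, which is a simplification of the suffix filter used below.

import java.util.LinkedList;
import java.util.Queue;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class CrawlLoopSketch {

    public static void main(String[] args) throws Exception {
        Queue<String> queue = new LinkedList<String>();
        queue.add("http://central.maven.org/maven2/HTTPClient/HTTPClient/"); // seed url, same as the code below

        while (!queue.isEmpty()) {
            String url = queue.poll();               // take the next url to crawl
            Document doc = Jsoup.connect(url).get(); // fetch and parse the page
            for (Element link : doc.select("a")) {
                String href = link.attr("abs:href"); // resolve the link to an absolute url
                if (href.endsWith(".jar")) {         // target found: handle it
                    System.out.println("target: " + href);
                } else if (href.startsWith(url) && href.endsWith("/") && !queue.contains(href)) {
                    queue.add(href);                 // a sub-directory listing: crawl it later
                }
            }
        }
    }
}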

 

package com.open111.crawler;

import java.io.IOException;
import java.util.LinkedList;
import java.util.Queue;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler entry class
 * @author user
 */
public class StartCrawler {

    public static String[] excludeUrl = new String[]{".pom", ".xml", ".md5", ".sha1", ".asc", ".gz", ".zip", "../"}; // url suffixes to filter out

    public static Queue<String> waitForCrawlerUrls = new LinkedList<String>(); // urls waiting to be crawled

    private static int total = 0;

    /**
     * Parse the content of a web page
     * @param webPageContent the HTML content of the page
     * @param realPath the base url of the page, used to build absolute links
     */
    public static void parseWebPage(String webPageContent, String realPath) {
        if ("".equals(webPageContent)) {
            return;
        }
        Document doc = Jsoup.parse(webPageContent);
        Elements links = doc.select("a"); // get all anchor elements
        for (int i = 0; i < links.size(); i++) {
            Element link = links.get(i);
            String url = link.attr("href");
            System.out.println("Extracted url: " + (realPath + url));
            boolean f = true;
            for (int j = 0; j < excludeUrl.length; j++) {
                if (url.endsWith(excludeUrl[j])) {
                    f = false;
                    break;
                }
            }
            if (f) { // a url we care about
                if (url.endsWith(".jar")) { // target found
                    total++;
                    System.out.println("Found target #" + total + ": " + (realPath + url));
                } else { // a url that still needs to be crawled
                    addUrl(realPath + url);
                }
            }
        }
    }

    /**
     * Add a url to the crawl queue, skipping it if it is already queued
     * @param url the url to add
     */
    private static void addUrl(String url) {
        if (url == null || "".equals(url)) {
            return;
        }
        if (!waitForCrawlerUrls.contains(url)) {
            waitForCrawlerUrls.add(url);
            System.out.println(url + " added to the crawl queue");
        }
    }

    /**
     * Take urls from the crawl queue, request each one, and parse the response
     */
    public static void parseUrl() {
        while (waitForCrawlerUrls.size() > 0) {
            String url = waitForCrawlerUrls.poll(); // take the first element from the queue

            CloseableHttpClient httpClient = HttpClients.createDefault(); // create an httpclient instance
            HttpGet httpGet = new HttpGet(url); // create an httpget instance
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                HttpEntity entity = response.getEntity(); // get the response entity
                System.out.println("Content type: " + entity.getContentType().getValue());
if("text/html".equals(entity.getContentType().getValue())){
String webPageContent=EntityUtils.toString(entity, "utf-8");
//System.out.println("网页内容:"+webPageContent);
parseWebPage(webPageContent,url);
}
            } catch (ClientProtocolException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            try {
                Thread.sleep(1000); // pause for 1 second between requests
                System.out.println("Sleeping for 1 second");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    private static void init() {
        addUrl("http://central.maven.org/maven2/HTTPClient/HTTPClient/");
        addUrl("http://central.maven.org/maven2/acegisecurity/acegi-security/");
        parseUrl();
    }

    public static void main(String[] args) {
        init();
    }
}
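
One detail worth noting: waitForCrawlerUrls.contains(url) only checks urls that are still waiting in the queue, so a page can be queued again after it has already been polled and crawled. Below is a minimal, self-contained sketch of one way to avoid that, assuming an extra set that remembers every url ever queued; the DedupSketch and crawledUrls names are made up.

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

// Demonstrates the dedup idea in isolation: the set remembers every url ever queued,
// so nothing is enqueued twice, even after it has been polled and crawled.
public class DedupSketch {

    private static Queue<String> waitForCrawlerUrls = new LinkedList<String>();
    private static Set<String> crawledUrls = new HashSet<String>(); // hypothetical field

    private static void addUrl(String url) {
        if (url == null || "".equals(url)) {
            return;
        }
        if (crawledUrls.add(url)) { // add() returns false if the url was seen before
            waitForCrawlerUrls.add(url);
            System.out.println(url + " added to the crawl queue");
        }
    }

    public static void main(String[] args) {
        addUrl("http://central.maven.org/maven2/HTTPClient/HTTPClient/");
        addUrl("http://central.maven.org/maven2/HTTPClient/HTTPClient/"); // ignored: already seen
        waitForCrawlerUrls.poll();
        addUrl("http://central.maven.org/maven2/HTTPClient/HTTPClient/"); // still ignored after polling
    }
}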

posted @ 2017-03-20 23:08  小拽A