MiseringThread.java 解析页面线程
http://injavawetrust.iteye.com
package com.iteye.injavawetrust.miner; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * 解析页面线程 * @author InJavaWeTrust * */ public class MiseringThread extends Thread { private static final Log LOG = LogFactory.getLog(MiseringThread.class); private MinerConfig config = null; public MiseringThread(MinerConfig config) { this.config = config; } @Override public void run() { while (!MinerMonitorThread.done) { misering(); } } private synchronized void misering() { Html html = MinerQueue.waitingMiseringPoll(); // 等待提取URL的分析页面出队列 if (null == html || MinerUtil.isBlank(html.getHtml())) { return; } //当前页面深度<爬取深度 取出当前页面全部URL if (html.getDepth() < config.getMaxDepth()) { LOG.info("MiseringThread获取页面[" + html.getUrl() + "]下所有URL。。。。。。 当前线程 [" + Thread.currentThread().getName() + "]"); Set<String> urls = MinerUtil.getAllUrl(html.getUrl()); for(String url : urls){ if(null == url || url.equals("")){ continue; } if(url.substring(url.length() - 1, url.length()).equals("/")){ url = url.substring(0, url.length() - 1); } MinerUrl minerUrl = new MinerUrl(); minerUrl.setUrl(url); minerUrl.setDepth(html.getDepth() + 1); // 爬取深度+1 // 判断URL列表是否包含关键字 if(!MinerUtil.checkKeys(url, config.getKeys())){ continue; } // 添加到待访问队列,每个URL只访问一次 MinerQueue.addUnVisited(minerUrl); // 将页面URL 添加到URL队列 保证每个URL只访问一次 MinerQueue.addUrlSet(minerUrl.getUrl()); } } } }