MinerHtmlThread.java 爬取页面线程
package com.iteye.injavawetrust.miner; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; /** * 爬取页面线程 * @author InJavaWetrust * */ public class MinerHtmlThread extends Thread { private static final Log LOG = LogFactory.getLog(MinerHtmlThread.class); private MinerConfig config = null; public MinerHtmlThread(MinerConfig config) { this.config = config; } @Override public void run() { while (!MinerMonitorThread.done) { minerHtml(); } } public synchronized void minerHtml() { MinerUrl minerUrl = MinerQueue.unVisitedPoll(); // 待访问出队列。 try { //判断当前页面爬取深度 if(null == minerUrl || MinerUtil.isBlank(minerUrl.getUrl()) || minerUrl.getDepth() > config.getMaxDepth()) { return; } //判断爬取页面URL是否包含http if("http".contains(minerUrl.getUrl())) { LOG.info("MinerHtmlThread当前爬取URL[" + minerUrl.getUrl() + "]没有http"); return; } LOG.info("MinerHtmlThread当前爬取页面[" + minerUrl.getUrl() + "]爬取深度[" + minerUrl.getDepth() + "] 当前线程 [" + Thread.currentThread().getName() + "]"); Connection conn = Jsoup.connect(minerUrl.getUrl()); conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13");//配置模拟浏览器 Document doc = conn.get(); String page = doc.html(); Html html = new Html(); html.setUrl(minerUrl.getUrl()); html.setHtml(page); html.setDepth(minerUrl.getDepth()); // 添加到存储队列 MinerQueue.addStore(html); // 已经爬取的页面 添加到等待提取URL的分析页面队列 MinerQueue.addWaitingMisering(html); } catch(Exception e) { LOG.info("MinerHtmlThread爬取页面失败 URL [" + minerUrl.getUrl() + "]"); LOG.info("MinerHtmlThreadError info [" + e.getMessage() + "]"); } } }