我们接下来分析与与BdbFrontier对象CrawlURI next()方法相关的方法
/** * Return the next CrawlURI eligible to be processed (and presumably * visited/fetched) by a a worker thread. * * Relies on the readyClassQueues having been loaded with * any work queues that are eligible to provide a URI. * * @return next CrawlURI eligible to be processed, or null if none available * * @see org.archive.crawler.framework.Frontier#next() */ protected CrawlURI findEligibleURI() { // wake any snoozed queues wakeQueues(); // consider rescheduled URIS checkFutures(); // find a non-empty ready queue, if any // TODO: refactor to untangle these loops, early-exits, etc! WorkQueue readyQ = null; findauri: while(true) { findaqueue: do { String key = readyClassQueues.poll(); if(key==null) { // no ready queues; try to activate one if(!getInactiveQueuesByPrecedence().isEmpty() && highestPrecedenceWaiting < getPrecedenceFloor()) { activateInactiveQueue(); continue findaqueue; } else { // nothing ready or readyable break findaqueue; } } readyQ = getQueueFor(key); if(readyQ==null) { // readyQ key wasn't in all queues: unexpected logger.severe("Key "+ key + " in readyClassQueues but not allQueues"); break findaqueue; } if(readyQ.getCount()==0) { // readyQ is empty and ready: it's exhausted readyQ.noteExhausted(); readyQ.makeDirty(); readyQ = null; continue; } if(!inProcessQueues.add(readyQ)) { // double activation; discard this and move on // (this guard allows other enqueuings to ready or // the various inactive-by-precedence queues to // sometimes redundantly enqueue a queue key) readyQ = null; continue; } // queue has gone 'in process' readyQ.considerActive(); readyQ.setWakeTime(0); // clear obsolete wake time, if any readyQ.setSessionBudget(getBalanceReplenishAmount()); readyQ.setTotalBudget(getQueueTotalBudget()); if (readyQ.isOverSessionBudget()) { deactivateQueue(readyQ); readyQ.makeDirty(); readyQ = null; continue; } if (readyQ.isOverTotalBudget()) { retireQueue(readyQ); readyQ.makeDirty(); readyQ = null; continue; } } while (readyQ == null); if (readyQ == null) { // no queues left in ready or readiable break findauri; } returnauri: while(true) { // loop left by explicit return or break on empty CrawlURI curi = null; curi = readyQ.peek(this); if(curi == null) { // should not reach logger.severe("No CrawlURI from ready non-empty queue " + readyQ.classKey + "\n" + readyQ.shortReportLegend() + "\n" + readyQ.shortReportLine() + "\n"); break returnauri; } // from queues, override names persist but not map source curi.setOverlayMapsSource(sheetOverlaysManager); // TODO: consider optimizations avoiding this recalc of // overrides when not necessary sheetOverlaysManager.applyOverlaysTo(curi); // check if curi belongs in different queue String currentQueueKey; try { KeyedProperties.loadOverridesFrom(curi); currentQueueKey = getClassKey(curi); } finally { KeyedProperties.clearOverridesFrom(curi); } if (currentQueueKey.equals(curi.getClassKey())) { // curi was in right queue, emit noteAboutToEmit(curi, readyQ); return curi; } // URI's assigned queue has changed since it // was queued (eg because its IP has become // known). Requeue to new queue. // TODO: consider synchronization on readyQ readyQ.dequeue(this,curi); doJournalRelocated(curi); curi.setClassKey(currentQueueKey); decrementQueuedCount(1); curi.setHolderKey(null); sendToQueue(curi); if(readyQ.getCount()==0) { // readyQ is empty and ready: it's exhausted // release held status, allowing any subsequent // enqueues to again put queue in ready // FIXME: tiny window here where queue could // receive new URI, be readied, fail not-in-process? inProcessQueues.remove(readyQ); readyQ.noteExhausted(); readyQ.makeDirty(); readyQ = null; continue findauri; } } } if(inProcessQueues.size()==0) { // Nothing was ready or in progress or imminent to wake; ensure // any piled-up pending-scheduled URIs are considered uriUniqFilter.requestFlush(); } // if truly nothing ready, wait a moment before returning null // so that loop in surrounding next() has a chance of getting something // next time if(getTotalEligibleInactiveQueues()==0) { try { Thread.sleep(1000); } catch (InterruptedException e) { // } } // nothing eligible return null; }
这个方法有点长,我们先看一下void wakeQueues() 方法
/** 唤醒snoozed queue中到时的队列 * Wake any queues sitting in the snoozed queue whose time has come. */ protected void wakeQueues() { DelayedWorkQueue waked; while((waked = snoozedClassQueues.poll())!=null) { WorkQueue queue = waked.getWorkQueue(this); queue.setWakeTime(0); queue.makeDirty(); reenqueueQueue(queue); } // also consider overflow (usually empty) if(!snoozedOverflow.isEmpty()) { synchronized(snoozedOverflow) { Iterator<DelayedWorkQueue> iter = snoozedOverflow.headMap(System.currentTimeMillis()).values().iterator(); while(iter.hasNext()) { DelayedWorkQueue dq = iter.next(); iter.remove(); snoozedOverflowCount.decrementAndGet(); WorkQueue queue = dq.getWorkQueue(this); queue.setWakeTime(0); queue.makeDirty(); reenqueueQueue(queue); } } } }
snoozedClassQueues.poll()方法是从休眠队列中取出时间到期的元素,重置睡眠时间为0,然后重置WorkQueue wq的队列归属(非活动状态队列或已经准备好被爬取的队列)
/** * Enqueue the given queue to either readyClassQueues or inactiveQueues, * as appropriate. * * @param wq */ protected void reenqueueQueue(WorkQueue wq) { //TODO:SPRINGY set overrides by queue? getQueuePrecedencePolicy().queueReevaluate(wq); if (logger.isLoggable(Level.FINE)) { logger.fine("queue reenqueued: " + wq.getClassKey()); } if(highestPrecedenceWaiting < wq.getPrecedence() || wq.getPrecedence() >= getPrecedenceFloor()) { // if still over budget, deactivate deactivateQueue(wq); } else { readyQueue(wq); } }
首先是重置队列的优先级,然后是将WorkQueue wq归入非活动状态队列或已经准备好被爬取的队列
deactivateQueue(wq)方法我们上文已经分析过(将WorkQueue wq加入非活动状态队列),这里看一下readyQueue(wq)方法
/** * Put the given queue on the readyClassQueues queue * @param wq */ protected void readyQueue(WorkQueue wq) { // assert Thread.currentThread() == managerThread; try { readyClassQueues.put(wq.getClassKey()); if(logger.isLoggable(Level.FINE)) { logger.log(Level.FINE, "queue readied: " + wq.getClassKey()); } } catch (InterruptedException e) { e.printStackTrace(); System.err.println("unable to ready queue "+wq); // propagate interrupt up throw new RuntimeException(e); } }
该方法是将WorkQueue wq加入已经准备好被爬取的队列readyClassQueues
重新回到void wakeQueues()方法,后面是从snoozedOverflow容器中取出休眠到期的队列(snoozedOverflow用Map类型存储着优先级与过载的休眠状态的队列(队列存储着key)[Map类型]),然后重置WorkQueue wq归入哪个队列
回到CrawlURI findEligibleURI()方法里面的void checkFutures()方法检测到延迟时间的CrawlURI对象,并且加入BDB数据库
/** * Check for any future-scheduled URIs now eligible for reenqueuing */ protected void checkFutures() { // assert Thread.currentThread() == managerThread; // TODO: consider only checking this every set interval if(!futureUris.isEmpty()) { synchronized(futureUris) { Iterator<CrawlURI> iter = futureUris.headMap(System.currentTimeMillis()) .values().iterator(); while(iter.hasNext()) { CrawlURI curi = iter.next(); curi.setRescheduleTime(-1); // unless again set elsewhere iter.remove(); futureUriCount.decrementAndGet(); receive(curi); } } } }
继续往下面看,String key = readyClassQueues.poll()方法为从已经准备好被爬取的队列readyClassQueues中取出队头元素(WorkQueue wq的classkey)
如果预备队列中不存在元素,则激活非活动状态队列inactiveQueues,将合适的WorkQueue wq放已经准备好被爬取的队列入readyClassQueues中
activateInactiveQueue()
/** * 激活非活动状态的队列 * Activate an inactive queue, if any are available. */ protected boolean activateInactiveQueue() { for (Entry<Integer, Queue<String>> entry: getInactiveQueuesByPrecedence().entrySet()) { int expectedPrecedence = entry.getKey(); Queue<String> queueOfWorkQueueKeys = entry.getValue(); while (true) { synchronized (getInactiveQueuesByPrecedence()) { String workQueueKey = queueOfWorkQueueKeys.poll(); if (workQueueKey == null) { break; } WorkQueue candidateQ = (WorkQueue) this.allQueues.get(workQueueKey); if (candidateQ.getPrecedence() > expectedPrecedence) { // queue demoted since placed; re-deactivate deactivateQueue(candidateQ); candidateQ.makeDirty(); continue; } updateHighestWaiting(expectedPrecedence); try { readyClassQueues.put(workQueueKey);//readyClassQueues存储着已经准备好被爬取的队列的key } catch (InterruptedException e) { throw new RuntimeException(e); } return true; } } } return false; }
更新非活动状态队列inactiveQueues中最高优先级的值(最小值)
/** * Recalculate the value of thehighest-precedence queue waiting * among inactive queues. * * @param startFrom start looking at this precedence value */ protected void updateHighestWaiting(int startFrom) { // probe for new highestWaiting for(int precedenceKey : getInactiveQueuesByPrecedence().tailMap(startFrom).keySet()) { if(!getInactiveQueuesByPrecedence().get(precedenceKey).isEmpty()) { highestPrecedenceWaiting = precedenceKey; return; } } // nothing waiting highestPrecedenceWaiting = Integer.MAX_VALUE; }
上面方法为从非活动状态队列inactiveQueues中获取大于指定值得队列元素集合,然后将highestPrecedenceWaiting值设置为非活动状态队列inactiveQueues中precedence最小的值(inactiveQueues是有序的)
---------------------------------------------------------------------------
本系列Heritrix 3.1.0 源码解析系本人原创
转载请注明出处 博客园 刺猬的温驯
本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/21/3033510.html