【Heritrix基础教程之4】开始一个爬虫抓取的全流程代码分析
在创建一个job后,就要开始job的运行,运行的全流程如下:
1、在界面上启动job
2、index.jsp
查看上述页面对应的源代码
<a href='"+request.getContextPath()+"/console/action.jsp?action=start'>Start</a>
3、action.jsp
// action.jsp scriptlet: read the "action" request parameter and forward the
// corresponding command to the crawl job handler, then bounce back to index.jsp.
String action = request.getParameter("action");
if (action != null) {
    // An action was requested; dispatch it to the handler.
    if (action.equalsIgnoreCase("start")) {
        // Tell handler to start crawl job
        handler.startCrawler();
    } else if (action.equalsIgnoreCase("stop")) {
        // Tell handler to stop crawl job
        handler.stopCrawler();
    } else if (action.equalsIgnoreCase("terminate")) {
        // Delete current job
        if (handler.getCurrentJob() != null) {
            handler.deleteJob(handler.getCurrentJob().getUID());
        }
    } else if (action.equalsIgnoreCase("pause")) {
        // Tell handler to pause crawl job
        handler.pauseJob();
    } else if (action.equalsIgnoreCase("resume")) {
        // Tell handler to resume crawl job
        handler.resumeJob();
    } else if (action.equalsIgnoreCase("checkpoint")) {
        // Checkpoint only makes sense while a job is loaded.
        if (handler.getCurrentJob() != null) {
            handler.checkpointJob();
        }
    }
}
// Always redirect back to the console front page, whether or not an
// action was performed.
response.sendRedirect(request.getContextPath() + "/index.jsp");
4、CrawlJobHandler.java(注意:CrawlJobHandler 是一个 Java 类,而不是 JSP 页面)
(1)
public void startCrawler() { running = true; if (pendingCrawlJobs.size() > 0 && isCrawling() == false) { // Ok, can just start the next job startNextJob(); } }
(2)
/**
 * Serializes job launches: if a previous launch thread is still alive,
 * waits for it to finish, then spawns a new "StartNextJob" thread that
 * runs {@link #startNextJobInternal()} so the caller (the web UI) is not
 * blocked while the crawl starts up.
 *
 * Fix over the original: the caught {@code InterruptedException} no longer
 * swallows the thread's interrupt status — it is restored via
 * {@code Thread.currentThread().interrupt()} so callers up the stack can
 * still observe the interruption.
 */
protected final void startNextJob() {
    synchronized (this) {
        if (startingNextJob != null) {
            try {
                // Wait for any in-flight launch to complete first.
                startingNextJob.join();
            } catch (InterruptedException e) {
                // Restore the interrupt flag instead of silently
                // discarding it, per the standard interruption contract.
                Thread.currentThread().interrupt();
                e.printStackTrace();
                return;
            }
        }
        // Launch asynchronously on a dedicated, named thread.
        startingNextJob = new Thread(new Runnable() {
            public void run() {
                startNextJobInternal();
            }
        }, "StartNextJob");
        startingNextJob.start();
    }
}
(3)
/**
 * Pops the first job off the pending queue, makes it the current job, and
 * starts its crawl. On an InitializationException the job is reloaded from
 * its on-disk state file, the current-job reference is cleared, and the
 * method recurses to try the next queued job (if any).
 */
protected void startNextJobInternal() {
    if (pendingCrawlJobs.size() == 0 || isCrawling()) {
        // No job ready or already crawling.
        return;
    }
    // Take the head of the (sorted) pending queue as the job to run.
    this.currentJob = (CrawlJob)pendingCrawlJobs.first();
    assert pendingCrawlJobs.contains(currentJob) :
        "pendingCrawlJobs is in an illegal state";
    pendingCrawlJobs.remove(currentJob);
    try {
        this.currentJob.setupForCrawlStart();
        // This is ugly but needed so I can clear the currentJob
        // reference in the crawlEnding and update the list of completed
        // jobs. Also, crawlEnded can startup next job.
        this.currentJob.getController().addCrawlStatusListener(this);
        // now, actually start
        this.currentJob.getController().requestCrawlStart();
    } catch (InitializationException e) {
        // Setup failed: reload the job from its persisted state file so
        // its failed status is reflected, then move on to the next job.
        loadJob(getStateJobFile(this.currentJob.getDirectory()));
        this.currentJob = null;
        startNextJobInternal(); // Load the next job if there is one.
    }
}
(4)requestCrawlStart 方法(位于 CrawlController 类中,由上面的 currentJob.getController().requestCrawlStart() 调用)
public void requestCrawlStart() { runProcessorInitialTasks(); sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING); String jobState; state = RUNNING; jobState = CrawlJob.STATUS_RUNNING; sendCrawlStateChangeEvent(this.state, jobState); // A proper exit will change this value. this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL; Thread statLogger = new Thread(statistics); statLogger.setName("StatLogger"); statLogger.start(); frontier.start(); }