【Heritrix基础教程之4】开始一个爬虫抓取的全流程代码分析

在创建一个job后，就要开始job的运行，运行的全流程如下：

1、在界面上启动job

2、index.jsp

查看 index.jsp 的源代码，其中 Start 链接的定义如下：

<a href='"+request.getContextPath()+"/console/action.jsp?action=start'>Start</a>

3、action.jsp

    // Dispatch on the "action" request parameter, compared case-insensitively.
    // Putting the constant first makes each comparison null-safe, so a missing
    // or unrecognised parameter simply matches no branch and nothing is done.
    String sAction = request.getParameter("action");
    if ("start".equalsIgnoreCase(sAction)) {
        // Ask the handler to start crawling.
        handler.startCrawler();
    } else if ("stop".equalsIgnoreCase(sAction)) {
        // Ask the handler to stop crawling.
        handler.stopCrawler();
    } else if ("terminate".equalsIgnoreCase(sAction)) {
        // Delete the current job, but only when there is one.
        if (handler.getCurrentJob() != null) {
            handler.deleteJob(handler.getCurrentJob().getUID());
        }
    } else if ("pause".equalsIgnoreCase(sAction)) {
        // Ask the handler to pause the job.
        handler.pauseJob();
    } else if ("resume".equalsIgnoreCase(sAction)) {
        // Ask the handler to resume the job.
        handler.resumeJob();
    } else if ("checkpoint".equalsIgnoreCase(sAction)) {
        // Checkpointing is only requested when a current job exists.
        if (handler.getCurrentJob() != null) {
            handler.checkpointJob();
        }
    }
    // Whatever happened above, redirect to index.jsp under the context path.
    response.sendRedirect(request.getContextPath() + "/index.jsp");

4、CrawlJobHandler.java

（1）

    /**
     * Sets the {@code running} flag and, when at least one job is pending and
     * no crawl is in progress, hands off to {@link #startNextJob()}.
     */
    public void startCrawler() {
        running = true;
        // Nothing to launch: either the pending queue is empty or a crawl is
        // already under way.
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            return;
        }
        startNextJob();
    }

（2）

    /**
     * Starts the next job on a freshly created thread named "StartNextJob".
     *
     * <p>While holding this object's monitor, the method first waits for any
     * previously created "StartNextJob" thread to finish, then creates and
     * starts a new thread whose only task is to call
     * {@code startNextJobInternal()}.
     *
     * <p>If the calling thread is interrupted while waiting, no new thread is
     * started and the caller's interrupt status is restored so that code
     * further up the stack can still observe the interruption.
     */
    protected final void startNextJob() {
        synchronized (this) {
            if (startingNextJob != null) {
                try {
                    // Do not overlap with a start that is still in progress.
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    // join() cleared the interrupt flag when it threw;
                    // re-assert it rather than silently losing the request.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }

（3）

   /**
    * Takes the first pending job, makes it the current job and asks its
    * controller to start crawling.
    *
    * <p>Returns without doing anything when there is no pending job or when
    * {@code isCrawling()} reports that a crawl is already in progress.
    *
    * <p>If setting up or starting the job throws an
    * {@code InitializationException}, the job is passed back through
    * {@code loadJob(...)}, {@code currentJob} is cleared, and the method calls
    * itself again to try the next pending job.
    */
   protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            // No job ready or already crawling.
            return;
        }
        // Promote the head of the pending collection to "current" and take it
        // out of the pending collection.
        this.currentJob = (CrawlJob)pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob) :
            "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);
        try {
            this.currentJob.setupForCrawlStart();
            // This is ugly but needed so I can clear the currentJob
            // reference in the crawlEnding and update the list of completed
            // jobs.  Also, crawlEnded can startup next job.
            this.currentJob.getController().addCrawlStatusListener(this);
            // now, actually start
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            // NOTE(review): presumably reloads the job from its on-disk state
            // file so it is filed under its post-failure status -- confirm
            // against loadJob()/getStateJobFile().
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            startNextJobInternal(); // Load the next job if there is one.
        }
    }

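（4）requestCrawlStart() 不在 CrawlJobHandler 中，而是定义在上一步 this.currentJob.getController() 返回的控制器对象（Heritrix 中为 CrawlController）上：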

    /**
     * Begins the crawl. In order, it runs the processors' initial tasks,
     * sends a STARTED event followed by a RUNNING event, presets the exit
     * status, starts the statistics runnable on a thread named "StatLogger",
     * and finally starts the frontier.
     */
    public void requestCrawlStart() {
        runProcessorInitialTasks();

        // First report STARTED with the job still pending, then switch the
        // state to RUNNING and report that as well.
        sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING);
        state = RUNNING;
        sendCrawlStateChangeEvent(this.state, CrawlJob.STATUS_RUNNING);

        // Start from "finished abnormally"; a proper exit will change this value.
        this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL;

        Thread statsThread = new Thread(statistics);
        statsThread.setName("StatLogger");
        statsThread.start();

        frontier.start();
    }

posted @ 2014-06-04 20:10 eagleGeek 阅读(329) 评论(0) 编辑收藏举报

刷新页面返回顶部

【Heritrix基础教程之4】开始一个爬虫抓取的全流程代码分析

公告