Spark源码(四)Master receive方法

    Master receive全部方法
        一.case ElectedLeader => ......
        二.case CompleteRecovery => ......
        三.case RevokedLeadership => ......
        四.case WorkerDecommissioning => ......
        五.case DecommissionWorkers => ......
        六.case RegisterWorker => ......
        七.case RegisterApplication => ......
        八.case ExecutorStateChanged => ......
        九.case DriverStateChanged => ......
        十.case Heartbeat => ......
        十一.case MasterChangeAcknowledged => ......
        十二.case WorkerSchedulerStateResponse => ......
        十三.case WorkerLatestState => ......
        十四.case UnregisterApplication => ......
        十五.case CheckForWorkerTimeOut => ......

九.DriverStateChanged 详解

    // Dispatch on the reported driver state: the four terminal states remove the driver,
    // every other state is rejected.
    state match {
        case DriverState.ERROR | DriverState.FINISHED | DriverState.KILLED | DriverState.FAILED =>
          removeDriver(driverId, state, exception)// terminal state: entry point of the real work
        case _ =>// any other DriverState (the author lists SUBMITTED, RUNNING, RELAUNCHING, UNKNOWN): an exception is thrown, not merely logged
          throw new Exception(s"Received unexpected state update for driver $driverId: $state")
    }
    // removeDriver was covered earlier in this series. Open question from the author: why is there
    // no worker.send(...) notifying the worker here?
    // NOTE(review): presumably the worker already knows the driver has ended, since this message
    // reports a state change that originated there — confirm against the Worker source.

十.Heartbeat 详解 整体还是比较简单的

    // Look the sender up by id; a known worker just gets its heartbeat time refreshed.
    idToWorker.get(workerId) match {
      case Some(workerInfo) =>
        workerInfo.lastHeartbeat = System.currentTimeMillis()/* registered worker: record the time (epoch millis) of this latest heartbeat */
      case None =>
        // Not in idToWorker: distinguish a worker still present in `workers` from one never seen.
        if (workers.map(_.id).contains(workerId)) {
          logWarning(s"Got heartbeat from unregistered worker $workerId." +
            " Asking it to re-register.")
          worker.send(ReconnectWorker(masterUrl))// ask the worker to reconnect to this master; how the worker reacts is not shown here
        } else {
          logWarning(s"Got heartbeat from unregistered worker $workerId." +
            " This worker was never registered, so ignoring the heartbeat.")
        }
    }

十一.MasterChangeAcknowledged 详解

    // Set the acknowledging application's state to WAITING (unknown apps are only logged),
    // then call completeRecovery() if canCompleteRecovery holds.
    idToApp.get(appId) match {
      case Some(app) =>
        logInfo("Application has been re-registered: " + appId)
        app.state = ApplicationState.WAITING
      case None =>
        logWarning("Master change ack from unknown app: " + appId)
    }
    if (canCompleteRecovery) { completeRecovery() }// completeRecovery was described earlier in this series

十二.WorkerSchedulerStateResponse 详解

    // Rebuild the master's bookkeeping for a worker from the executors and drivers it reports.
    // NOTE(review): the original author guessed this handles a worker that dropped offline and
    // re-requests resources without its old tasks having been killed; the trailing
    // completeRecovery() call suggests it is part of master recovery instead — confirm.
    idToWorker.get(workerId) match {
        case Some(worker) =>
          logInfo("Worker has been re-registered: " + workerId)
          worker.state = WorkerState.ALIVE
          // Keep only executors whose application is still known to this master.
          val validExecutors = execResponses.filter(
            exec => idToApp.get(exec.desc.appId).isDefined)
          for (exec <- validExecutors) {// re-attach every reported executor to its application and to this worker
            val (execDesc, execResources) = (exec.desc, exec.resources)
            val app = idToApp(execDesc.appId)
            val execInfo = app.addExecutor(
              worker, execDesc.cores, execResources, Some(execDesc.execId))
            worker.addExecutor(execInfo)
            worker.recoverResources(execResources)
            execInfo.copyState(execDesc)
          }
          for (driver <- driverResponses) {// re-attach every reported driver that the master still tracks
            val (driverId, driverResource) = (driver.driverId, driver.resources)
            // The lambda parameter below shadows the loop variable `driver`; inside the
            // closure it refers to the master's own driver record.
            drivers.find(_.id == driverId).foreach { driver =>
              driver.worker = Some(worker)
              driver.state = DriverState.RUNNING
              driver.withResources(driverResource)
              worker.recoverResources(driverResource)
              worker.addDriver(driver)
            }
          }
        case None =>
          logWarning("Scheduler state from unknown worker: " + workerId)
    }
    // Runs regardless of whether the worker was known.
    if (canCompleteRecovery) { completeRecovery() }

十三.WorkerLatestState 详解

    // Reconcile what the worker reports with the master's records: anything the worker lists
    // that the master does not have recorded on that worker is told to be killed.
    // Original author's note (speculative): this could be used to kill surplus tasks, e.g.
    // during slow-task detection.
    idToWorker.get(workerId) match {
        case Some(worker) =>
          for (exec <- executors) {// kill executors reported by the worker that the master has no matching record for
            val executorMatches = worker.executors.exists {
              case (_, e) => e.application.id == exec.appId && e.id == exec.execId
            }
            if (!executorMatches) {
              worker.endpoint.send(KillExecutor(masterUrl, exec.appId, exec.execId))
            }
          }

          for (driverId <- driverIds) {// kill drivers reported by the worker that the master has no matching record for
            val driverMatches = worker.drivers.exists { case (id, _) => id == driverId }
            if (!driverMatches) {
              worker.endpoint.send(KillDriver(driverId))
            }
          }
        case None =>
          logWarning("Worker state from unknown worker: " + workerId)
    }

十四.UnregisterApplication 详解 杀掉application

    // Log the request, then finish the application if it is known; unknown ids are ignored.
    logInfo(s"Received unregister request from application $applicationId")
    idToApp.get(applicationId).foreach(finishApplication)// per the author, this ends up in removeApplication (covered earlier in this series)

十五.CheckForWorkerTimeOut 详解

    //worker心跳不起作用了 就移除当前节点
    // onStart 方法里启动了一个定时调度,周期性 send 的就是 CheckForWorkerTimeOut 这条消息
    timeOutDeadWorkers()

    private def timeOutDeadWorkers(): Unit = {
        val currentTime = System.currentTimeMillis()
        val toRemove = workers.filter(_.lastHeartbeat < currentTime - workerTimeoutMs).toArray
        for (worker <- toRemove) {//其实还是没有使用hadoop所谓的SortedMap 直接循环干
          if (worker.state != WorkerState.DEAD) {
            val workerTimeoutSecs = TimeUnit.MILLISECONDS.toSeconds(workerTimeoutMs)
            logWarning("Removing %s because we got no heartbeat in %d seconds".format(
              worker.id, workerTimeoutSecs))
            removeWorker(worker, s"Not receiving heartbeat for $workerTimeoutSecs seconds")
          } else {
            if (worker.lastHeartbeat < currentTime - ((reaperIterations + 1) * workerTimeoutMs)) {
              workers -= worker 
            }
          }
        }
    }
posted @ 2022-09-12 09:39  Kotlin  阅读(19)  评论(0编辑  收藏  举报
Live2D