spark源码(十)Worker receive 方法

    一.case msg: RegisterWorkerResponse 
    二.case SendHeartbeat
    三.case WorkDirCleanup
    四.case MasterChanged
    五.case ReconnectWorker
    六.case LaunchExecutor
    七.case executorStateChanged: ExecutorStateChanged
    八.case KillExecutor(masterUrl, appId, execId)
    九.case LaunchDriver(driverId, driverDesc, resources_)
    十.case KillDriver(driverId)
    十一.case driverStateChanged @ DriverStateChanged(driverId, state, exception)
    十二.case ReregisterWithMaster
    十三.case ApplicationFinished(id)
    十四.case DecommissionWorker
    十五.case WorkerSigPWRReceived

十一. driverStateChanged @ DriverStateChanged(driverId, state, exception) 详解

    private[worker] def handleDriverStateChanged(driverStateChanged: DriverStateChanged): Unit = {
        val driverId = driverStateChanged.driverId
        val exception = driverStateChanged.exception
        val state = driverStateChanged.state
        state match {
          case DriverState.ERROR =>
            logWarning(s"Driver $driverId failed with unrecoverable exception: ${exception.get}")
          case DriverState.FAILED =>
            logWarning(s"Driver $driverId exited with failure")
          case DriverState.FINISHED =>
            logInfo(s"Driver $driverId exited successfully")
          case DriverState.KILLED =>
            logInfo(s"Driver $driverId was killed by user")
          case _ =>
            logDebug(s"Driver $driverId changed state to $state")
        }
        sendToMaster(driverStateChanged)
        val driver = drivers.remove(driverId).get
        finishedDrivers(driverId) = driver
            //finished drivers 保存最大个数配置处理
        trimFinishedDriversIfNecessary()
        //内存cpu 相关资源操作
        memoryUsed -= driver.driverDesc.mem
        coresUsed -= driver.driverDesc.cores
        //文件相关资源操作
        removeResourcesUsed(driver.resources)
    }

十二. ReregisterWithMaster 详解

    private def reregisterWithMaster(): Unit = {
      Utils.tryOrExit {
        connectionAttemptCount += 1
        if (registered) {
          cancelLastRegistrationRetry()
          //TOTAL_REGISTRATION_RETRIES = INITIAL_REGISTRATION_RETRIES + 10
        } else if (connectionAttemptCount <= TOTAL_REGISTRATION_RETRIES) {
          logInfo(s"Retrying connection to master (attempt # $connectionAttemptCount)")

          master match {
            case Some(masterRef) =>
              if (registerMasterFutures != null) {
                //取消  注册master失败 的
                registerMasterFutures.foreach(_.cancel(true))
              }
              //重新统一注册
              val masterAddress =
                if (preferConfiguredMasterAddress) masterAddressToConnect.get 
else masterRef.address
              registerMasterFutures = Array(registerMasterThreadPool.submit(
new Runnable {
                override def run(): Unit = {
                  try {
                    logInfo("Connecting to master " + masterAddress + "...")
                    val masterEndpoint = rpcEnv.setupEndpointRef(masterAddress, Master.ENDPOINT_NAME)
                    /*跳转到Master对象的RegisterWorker */
                    sendRegisterMessageToMaster(masterEndpoint)
                    //向master发送了一个RegisterWorker的消息
                  } catch {
                    case ie: InterruptedException => // Cancelled
                    case NonFatal(e) => logWarning(s"Failed to connect to master $masterAddress", e)
                  }
                }
              }))
            case None =>
              if (registerMasterFutures != null) {
                registerMasterFutures.foreach(_.cancel(true))
              }
              //上面方法有介绍的 向全部的master注册
              registerMasterFutures = tryRegisterAllMasters()
          }
          //TOTAL_REGISTRATION_RETRIES = INITIAL_REGISTRATION_RETRIES + 10
          if (connectionAttemptCount == INITIAL_REGISTRATION_RETRIES) {
            //如果当前注册不行 
            registrationRetryTimer.foreach(_.cancel(true))
            registrationRetryTimer = Some(
              forwardMessageScheduler.scheduleAtFixedRate(//再次注册
                () => Utils.tryLogNonFatalError { self.send(ReregisterWithMaster) },
                PROLONGED_REGISTRATION_RETRY_INTERVAL_SECONDS,
                PROLONGED_REGISTRATION_RETRY_INTERVAL_SECONDS,
                TimeUnit.SECONDS))
          }
        } else {//超过最大注册此时  算了放弃
          logError("All masters are unresponsive! Giving up.")
          System.exit(1)
        }
      }
    }

十三. ApplicationFinished(id) 详解

    // NOTE(review): the original author assumes the application's CPU/memory
    // accounting was already released before this point; that is not visible
    // in this excerpt, so confirm.
    // Record the application id as finished.
    finishedApps += id
    // Presumably cleans up the application's resource files when appropriate;
    // verify in maybeCleanupApplication.
    maybeCleanupApplication(id)

十四. DecommissionWorker 详解(Worker 将自己标记为下线/退役)

    private[deploy] def decommissionSelf(): Unit = {
      if (conf.get(config.DECOMMISSION_ENABLED) && !decommissioned) {
        decommissioned = true
        logInfo(s"Decommission worker $workerId.")
      } else if (decommissioned) {
        logWarning(s"Worker $workerId already started decommissioning.")
      } else {
        logWarning(s"Receive decommission request, but decommission feature is disabled.")
      }
    }

十五. WorkerSigPWRReceived 详解

    // First run the worker's own decommission logic.
    decommissionSelf()
    // Then send a WorkerDecommissioning message, carrying this worker's id and
    // its own reference, to the master.
    sendToMaster(WorkerDecommissioning(workerId, self))
posted @ 2022-09-12 22:59  Kotlin  阅读(21)  评论(0编辑  收藏  举报
Live2D