Spark 源码(八)Worker receive 方法

    一.case msg: RegisterWorkerResponse 
    二.case SendHeartbeat
    三.case WorkDirCleanup
    四.case MasterChanged
    五.case ReconnectWorker
    六.case LaunchExecutor
    七.case executorStateChanged: ExecutorStateChanged
    八.case KillExecutor(masterUrl, appId, execId)
    九.case LaunchDriver(driverId, driverDesc, resources_)
    十.case KillDriver(driverId)
    十一.case driverStateChanged @ DriverStateChanged(driverId, state, exception)
    十二.case ReregisterWithMaster
    十三.case ApplicationFinished(id)
    十四.case DecommissionWorker
    十五.case WorkerSigPWRReceived

六. LaunchExecutor 详解

    // Handles a request to launch an executor on this worker.
    // The request is ignored (with a warning) when it comes from a master other than the
    // active one, or when this worker has been decommissioned. Otherwise the executor's
    // directories are prepared, an ExecutorRunner is created and started, and the worker's
    // resource accounting is updated. Any Exception on that path is reported to the master
    // as ExecutorState.FAILED.
    if (masterUrl != activeMasterUrl) {
        logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
    } else if (decommissioned) {
        logWarning("Asked to launch an executor while decommissioned. Not launching executor.")
    } else {
        try {
          logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))
              // Working directory of this executor: <workDir>/<appId>/<execId>
              val executorDir = new File(workDir, appId + "/" + execId)
          if (!executorDir.mkdirs()) {
            throw new IOException("Failed to create directory " + executorDir)
          }
          // Reuse the local dirs already registered for this application, or create one
          // sub-directory under every local root dir. Roots that fail with an IOException
          // are skipped; if none succeeds the launch is aborted.
          val appLocalDirs = appDirectories.getOrElse(appId, {
            val localRootDirs = Utils.getOrCreateLocalRootDirs(conf)
            val dirs = localRootDirs.flatMap { dir =>
              try {
                // Create a sub-directory prefixed with "executor" and restrict its
                // permissions through Utils.chmod700.
                val appDir = Utils.createDirectory(dir, namePrefix = "executor")
                Utils.chmod700(appDir)
                Some(appDir.getAbsolutePath())
              } catch {
                case e: IOException =>
                  logWarning(s"${e.getMessage}. Ignoring this directory.")
                  None
              }
            }.toSeq
            if (dirs.isEmpty) {
              throw new IOException("No subfolder can be created in " +
                s"${localRootDirs.mkString(",")}.")
            }
            dirs
          })
          appDirectories(appId) = appLocalDirs
          // Everything above only prepares directories; the executor itself is created and
          // started below.
          val manager = new ExecutorRunner(appId,execId,
            appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
            cores_,memory_,self,workerId,webUi.scheme,host,webUi.boundPort,
            publicAddress,sparkHome,executorDir,workerUri,conf,
            appLocalDirs,ExecutorState.LAUNCHING,resources_)
          executors(appId + "/" + execId) = manager
          manager.start()//spawns the runner thread that builds and runs the executor command
          coresUsed += cores_//account for the cores and memory handed to this executor
          memoryUsed += memory_
          addResourcesUsed(resources_)//record resources_ as in use (see addResourcesUsed)
        } catch {
          case e: Exception =>
            logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
            // Roll back a runner that was registered before the failure occurred.
            if (executors.contains(appId + "/" + execId)) {
              executors(appId + "/" + execId).kill()
              executors -= appId + "/" + execId
            }
            sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
              Some(e.toString), None))/*report the launch failure to the master*/
        }
    }
  6.1 manager.start 详解
    private[worker] def start(): Unit = {
      workerThread = new Thread("ExecutorRunner for " + fullId) {
        override def run(): Unit = { fetchAndRunExecutor() }/*入口方法*/
      }
      workerThread.start()
      shutdownHook = ShutdownHookManager.addShutdownHook { () =>
        if (state == ExecutorState.LAUNCHING || state == ExecutorState.RUNNING) {
          state = ExecutorState.FAILED
        }
        killProcess(Some("Worker shutting down")) }//这才是给自己发送一个成功的消息  默认是成功的
    }
  6.1.1 fetchAndRunExecutor 详解
    private def fetchAndRunExecutor(): Unit = {
      try {
        //创建资源文件的 跳过
        val resourceFileOpt = prepareResourcesFile(SPARK_EXECUTOR_PREFIX, resources, executorDir)
        val arguments = appDesc.command.arguments ++ resourceFileOpt.map(f =>
          Seq("--resourcesFile", f.getAbsolutePath)).getOrElse(Seq.empty)
        val subsOpts = appDesc.command.javaOpts.map {
          Utils.substituteAppNExecIds(_, appId, execId.toString)
        }//修改java参数 替换appId execId
        val subsCommand = appDesc.command.copy(arguments = arguments, javaOpts = subsOpts)
        val builder = CommandUtils.buildProcessBuilder(subsCommand, new SecurityManager(conf),
          memory, sparkHome.getAbsolutePath, substituteVariables)
        val command = builder.command()
        val redactedCommand = Utils.redactCommandLineArgs(conf, command.asScala.toSeq)
          .mkString("\"""\" \"""\"")
        logInfo(s"Launch command: $redactedCommand")

        builder.directory(executorDir)
        builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
        builder.environment.put("SPARK_LAUNCH_WITH_SCALA""0")
        //UI界面的参数
        val baseUrl =
          if (conf.get(UI_REVERSE_PROXY)) {
            conf.get(UI_REVERSE_PROXY_URL.key, "").stripSuffix("/") +
              s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
          } else {
            s"$webUiScheme$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
          }
        builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
        builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

        process = builder.start()/*JVM命令启动 Executor*/
        val header = "Spark Executor Command: %s\n%s\n\n".format(
          redactedCommand, "=" * 40)

        val stdout = new File(executorDir, "stdout")//标准日志目录
        stdoutAppender = FileAppender(process.getInputStream, stdout, conf, true)

        val stderr = new File(executorDir, "stderr")//错误日志目录
        Files.write(header, stderr, StandardCharsets.UTF_8)
        stderrAppender = FileAppender(process.getErrorStream, stderr, conf, true)

        state = ExecutorState.RUNNING
        //发送给自己一个消息  上面的是发送给master的
        //给自己发送一个正在运行的消息
        worker.send(ExecutorStateChanged(appId, execId, state, None, None))
        val exitCode = process.waitFor()//一直等待执行完成 
        state = ExecutorState.EXITED
        val message = "Command exited with code " + exitCode
        //给自己发送一个正在退出的消息
        worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
      } catch {
        case interrupted: InterruptedException =>
          logInfo("Runner thread for executor " + fullId + " interrupted")
          state = ExecutorState.KILLED
          killProcess(None)//给自己发送一个kill消息
        case e: Exception =>
          logError("Error running executor", e)
          state = ExecutorState.FAILED
          killProcess(Some(e.toString))//给自己发送一个失败消息
      }
    }
  6.1.1.1 killProcess 详解
    private def killProcess(messageOption[String]): Unit = {
      var exitCodeOption[Int] = None
      if (process != null) {
        logInfo("Killing process!")
        if (stdoutAppender != null) {
          stdoutAppender.stop()
        }
        if (stderrAppender != null) {
          stderrAppender.stop()
        }
        exitCode = Utils.terminateProcess(process, EXECUTOR_TERMINATE_TIMEOUT_MS)
        if (exitCode.isEmpty) {
          logWarning("Failed to terminate process: " + process +
            ". This process will likely be orphaned.")
        }
      }
      try {
        //给自己发送一个消息。
        worker.send(ExecutorStateChanged(appId, execId, state, message, exitCode))
      } catch {
        case eIllegalStateException => logWarning(e.getMessage(), e)
      }
    }

七. executorStateChanged: ExecutorStateChanged 详解

    private[worker] def handleExecutorStateChanged(executorStateChanged: ExecutorStateChanged):
      Unit = {
      sendToMaster(executorStateChanged)
      val state = executorStateChanged.state
      if (ExecutorState.isFinished(state)) {
        val appId = executorStateChanged.appId
        val fullId = appId + "/" + executorStateChanged.execId
        val message = executorStateChanged.message
        val exitStatus = executorStateChanged.exitStatus
        executors.get(fullId) match {
          case Some(executor) =>
            logInfo("Executor " + fullId + " finished with state " + state +
              message.map(" message " + _).getOrElse("") +
              exitStatus.map(" exitStatus " + _).getOrElse(""))
            executors -= fullId
            finishedExecutors(fullId) = executor//回收资源
            trimFinishedExecutorsIfNecessary()
            coresUsed -= executor.cores
            memoryUsed -= executor.memory
            removeResourcesUsed(executor.resources)//删除配置文件

            if (CLEANUP_FILES_AFTER_EXECUTOR_EXIT) {
              //如果这就删除了,上面为什么还要检查定时删除呢
              //还是说就是提供两种方式
              //观察了一下上面移除的是整个app的文件
              //这只是单个executorId 的shuffle的文件
              shuffleService.executorRemoved(executorStateChanged.execId.toString, appId)
            }
          case None =>
            logInfo("Unknown Executor " + fullId + " finished with state " + state +
              message.map(" message " + _).getOrElse("") +
              exitStatus.map(" exitStatus " + _).getOrElse(""))
        }
        maybeCleanupApplication(appId)
      }
    }
  7.1 maybeCleanupApplication 详解
    private def maybeCleanupApplication(id: String): Unit = {
      //当前app在 完成的app列表里   当前节点包含这个app   
      //后面这个判断是啥意思啊 前面发送的消息也不是全部的worker都发送啊
      val shouldCleanup = finishedApps.contains(id) && !executors.values.exists(_.appId == id)
      if (shouldCleanup) {
        finishedApps -= id
        try {
          appDirectories.remove(id).foreach { dirList =>
            concurrent.Future {
              logInfo(s"Cleaning up local directories for application $id")
              dirList.foreach { dir =>
                Utils.deleteRecursively(new File(dir))
              }
            }(cleanupThreadExecutor).failed.foreach(e =>
              logError(s"Clean up app dir $dirList failed: ${e.getMessage}", e)
            )(cleanupThreadExecutor)
          }
        } catch {
          case _: RejectedExecutionException if cleanupThreadExecutor.isShutdown =>
            logWarning("Failed to cleanup application as executor pool was shutdown")
        }
        shuffleService.applicationRemoved(id)
      }
    }
posted @   Kotlin  阅读(31)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· DeepSeek “源神”启动!「GitHub 热点速览」
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 我与微信审核的“相爱相杀”看个人小程序副业
· C# 集成 DeepSeek 模型实现 AI 私有化(本地部署与 API 调用教程)
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求
Live2D
点击右上角即可分享
微信分享提示
西雅图
13:14发布
西雅图
13:14发布
4°
东风
3级
空气质量
相对湿度
92%
今天
多云
3°/15°
周四
4°/15°
周五
4°/14°