Spark 源码(八)Worker receive 方法
一.case msg: RegisterWorkerResponse
二.case SendHeartbeat
三.case WorkDirCleanup
四.case MasterChanged
五.case ReconnectWorker
六.case LaunchExecutor
七.case executorStateChanged: ExecutorStateChanged
八.case KillExecutor(masterUrl, appId, execId)
九.case LaunchDriver(driverId, driverDesc, resources_)
十.case KillDriver(driverId)
十一.case driverStateChanged @ DriverStateChanged(driverId, state, exception)
十二.case ReregisterWithMaster
十三.case ApplicationFinished(id)
十四.case DecommissionWorker
十五.case WorkerSigPWRReceived
六. LaunchExecutor 详解
// Body of the LaunchExecutor handler: validates the request, prepares the executor's
// directories, starts an ExecutorRunner and updates this worker's resource accounting.
// Any failure along the way is reported to the master as ExecutorState.FAILED.
if (masterUrl != activeMasterUrl) {
  // Only the currently active master may ask this worker to launch executors.
  logWarning(s"Invalid Master ($masterUrl) attempted to launch executor.")
} else if (decommissioned) {
  logWarning("Asked to launch an executor while decommissioned. Not launching executor.")
} else {
  // Key under which the runner is stored in `executors`.
  val executorKey = s"$appId/$execId"

  // Creates one "executor"-prefixed directory under every local root dir, skipping the
  // roots where creation fails; throws if none could be created at all.
  def createAppLocalDirs() = {
    val localRootDirs = Utils.getOrCreateLocalRootDirs(conf)
    val created = localRootDirs.flatMap { rootDir =>
      try {
        val appDir = Utils.createDirectory(rootDir, namePrefix = "executor")
        Utils.chmod700(appDir)
        Some(appDir.getAbsolutePath())
      } catch {
        case e: IOException =>
          logWarning(s"${e.getMessage}. Ignoring this directory.")
          None
      }
    }.toSeq
    if (created.isEmpty) {
      throw new IOException(
        s"No subfolder can be created in ${localRootDirs.mkString(",")}.")
    }
    created
  }

  try {
    logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

    // Working directory of this executor: <workDir>/<appId>/<execId>.
    val executorDir = new File(workDir, executorKey)
    if (!executorDir.mkdirs()) {
      throw new IOException(s"Failed to create directory $executorDir")
    }

    // Reuse the directories already recorded for this application, if any;
    // otherwise create them now (the default argument is only evaluated on a miss).
    val appLocalDirs = appDirectories.getOrElse(appId, createAppLocalDirs())
    appDirectories(appId) = appLocalDirs

    // Everything above only prepares directories; the runner below is what launches
    // the executor.
    val runner = new ExecutorRunner(
      appId,
      execId,
      appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
      cores_,
      memory_,
      self,
      workerId,
      webUi.scheme,
      host,
      webUi.boundPort,
      publicAddress,
      sparkHome,
      executorDir,
      workerUri,
      conf,
      appLocalDirs,
      ExecutorState.LAUNCHING,
      resources_)
    executors(executorKey) = runner
    runner.start()

    // Account for the resources now in use by the new executor.
    coresUsed += cores_
    memoryUsed += memory_
    addResourcesUsed(resources_)
  } catch {
    case e: Exception =>
      logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
      // If the runner was already registered, kill it and forget about it.
      executors.get(executorKey).foreach { registered =>
        registered.kill()
        executors -= executorKey
      }
      // Tell the master that the launch failed.
      sendToMaster(
        ExecutorStateChanged(appId, execId, ExecutorState.FAILED, Some(e.toString), None))
  }
}
6.1 manager.start 详解
/**
 * Starts the thread that launches and monitors the executor process, and registers a
 * shutdown hook that kills that process.
 */
private[worker] def start(): Unit = {
  // Entry point: the launch itself happens in fetchAndRunExecutor() on its own thread.
  val runnerThread = new Thread("ExecutorRunner for " + fullId) {
    override def run(): Unit = fetchAndRunExecutor()
  }
  workerThread = runnerThread
  runnerThread.start()

  shutdownHook = ShutdownHookManager.addShutdownHook { () =>
    // An executor that is still launching or running when the hook fires is marked as
    // FAILED before the process is killed; killProcess then reports that state.
    val stillActive = state == ExecutorState.LAUNCHING || state == ExecutorState.RUNNING
    if (stillActive) {
      state = ExecutorState.FAILED
    }
    killProcess(Some("Worker shutting down"))
  }
}
6.1.1 fetchAndRunExecutor 详解
/**
 * Builds the executor launch command, starts the process, redirects its output to the
 * `stdout` / `stderr` files in the executor directory and blocks until the process exits.
 * State transitions (RUNNING, EXITED, KILLED, FAILED) are sent to `worker` as
 * ExecutorStateChanged messages.
 */
private def fetchAndRunExecutor(): Unit = {
  try {
    // Optional resources file; when present its path is appended as --resourcesFile.
    val resourcesFile = prepareResourcesFile(SPARK_EXECUTOR_PREFIX, resources, executorDir)
    val resourceArgs = resourcesFile
      .map(file => Seq("--resourcesFile", file.getAbsolutePath))
      .getOrElse(Seq.empty)
    val launchArgs = appDesc.command.arguments ++ resourceArgs

    // Substitute the application id and executor id into the Java options.
    val javaOptsWithIds = appDesc.command.javaOpts.map { opt =>
      Utils.substituteAppNExecIds(opt, appId, execId.toString)
    }
    val launchCommand =
      appDesc.command.copy(arguments = launchArgs, javaOpts = javaOptsWithIds)

    val processBuilder = CommandUtils.buildProcessBuilder(
      launchCommand,
      new SecurityManager(conf),
      memory,
      sparkHome.getAbsolutePath,
      substituteVariables)
    val rawCommand = processBuilder.command()
    val redactedCommand = Utils
      .redactCommandLineArgs(conf, rawCommand.asScala.toSeq)
      .mkString("\"", "\" \"", "\"")
    logInfo(s"Launch command: $redactedCommand")

    processBuilder.directory(executorDir)
    val env = processBuilder.environment
    env.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
    env.put("SPARK_LAUNCH_WITH_SCALA", "0")

    // Log page URLs (web UI): routed through the reverse proxy when it is enabled,
    // otherwise pointing at this worker's web UI address.
    val logPageQuery = s"/logPage/?appId=$appId&executorId=$execId&logType="
    val baseUrl =
      if (conf.get(UI_REVERSE_PROXY)) {
        conf.get(UI_REVERSE_PROXY_URL.key, "").stripSuffix("/") +
          s"/proxy/$workerId" + logPageQuery
      } else {
        s"$webUiScheme$publicAddress:$webUiPort" + logPageQuery
      }
    env.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
    env.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

    // Launch the executor process.
    process = processBuilder.start()
    val header = "Spark Executor Command: %s\n%s\n\n".format(
      redactedCommand, "=" * 40)

    // Standard output of the process goes to <executorDir>/stdout.
    val stdoutFile = new File(executorDir, "stdout")
    stdoutAppender = FileAppender(process.getInputStream, stdoutFile, conf, true)

    // Standard error goes to <executorDir>/stderr; the header with the launch command
    // is written to the file before the appender is attached.
    val stderrFile = new File(executorDir, "stderr")
    Files.write(header, stderrFile, StandardCharsets.UTF_8)
    stderrAppender = FileAppender(process.getErrorStream, stderrFile, conf, true)

    // Report RUNNING to the worker.
    state = ExecutorState.RUNNING
    worker.send(ExecutorStateChanged(appId, execId, state, None, None))

    // Block until the executor process terminates, then report EXITED with its code.
    val exitCode = process.waitFor()
    state = ExecutorState.EXITED
    val exitMessage = s"Command exited with code $exitCode"
    worker.send(ExecutorStateChanged(appId, execId, state, Some(exitMessage), Some(exitCode)))
  } catch {
    case _: InterruptedException =>
      logInfo(s"Runner thread for executor $fullId interrupted")
      state = ExecutorState.KILLED
      // killProcess reports the KILLED state to the worker.
      killProcess(None)
    case e: Exception =>
      logError("Error running executor", e)
      state = ExecutorState.FAILED
      // killProcess reports the FAILED state, with the error text, to the worker.
      killProcess(Some(e.toString))
  }
}
6.1.1.1 killProcess 详解
/**
 * Stops the log appenders and terminates the executor process if one was started, then
 * sends the current `state` to `worker` as an ExecutorStateChanged message.
 *
 * @param message optional text passed along in the state-change message
 */
private def killProcess(message: Option[String]): Unit = {
  val exitCode: Option[Int] =
    if (process == null) {
      // No process was started, so there is no exit code to report.
      None
    } else {
      logInfo("Killing process!")
      Option(stdoutAppender).foreach(_.stop())
      Option(stderrAppender).foreach(_.stop())
      val terminated = Utils.terminateProcess(process, EXECUTOR_TERMINATE_TIMEOUT_MS)
      if (terminated.isEmpty) {
        logWarning(
          s"Failed to terminate process: $process. This process will likely be orphaned.")
      }
      terminated
    }

  try {
    // Report the state to the worker; an IllegalStateException from send is only logged.
    worker.send(ExecutorStateChanged(appId, execId, state, message, exitCode))
  } catch {
    case e: IllegalStateException => logWarning(e.getMessage(), e)
  }
}
七. executorStateChanged: ExecutorStateChanged 详解
/**
 * Forwards an executor state change to the master and, when the state is a finished one,
 * releases the executor's bookkeeping on this worker and checks whether the application's
 * local directories can be cleaned up.
 *
 * @param executorStateChanged the state-change message to handle
 */
private[worker] def handleExecutorStateChanged(
    executorStateChanged: ExecutorStateChanged): Unit = {
  sendToMaster(executorStateChanged)

  val state = executorStateChanged.state
  if (ExecutorState.isFinished(state)) {
    val appId = executorStateChanged.appId
    val fullId = s"$appId/${executorStateChanged.execId}"

    // Common tail of the log line: final state plus optional message and exit status.
    val outcome = s" finished with state $state" +
      executorStateChanged.message.fold("")(" message " + _) +
      executorStateChanged.exitStatus.fold("")(" exitStatus " + _)

    executors.get(fullId) match {
      case None =>
        logInfo("Unknown Executor " + fullId + outcome)

      case Some(finished) =>
        logInfo("Executor " + fullId + outcome)

        // Move the runner from the active map to the finished map.
        executors -= fullId
        finishedExecutors(fullId) = finished
        trimFinishedExecutorsIfNecessary()

        // Give back the resources the executor was holding.
        coresUsed -= finished.cores
        memoryUsed -= finished.memory
        removeResourcesUsed(finished.resources)

        if (CLEANUP_FILES_AFTER_EXECUTOR_EXIT) {
          // Per-executor notification to the shuffle service; application-wide cleanup
          // is handled separately by maybeCleanupApplication below.
          shuffleService.executorRemoved(executorStateChanged.execId.toString, appId)
        }
    }

    maybeCleanupApplication(appId)
  }
}
7.1 maybeCleanupApplication 详解
/**
 * Cleans up the local directories of an application once it is listed in `finishedApps`
 * and no executor of that application is left in this worker's `executors` map.
 *
 * @param id the application id
 */
private def maybeCleanupApplication(id: String): Unit = {
  // Keep the application while it is not marked finished, or while this worker still
  // tracks at least one executor that belongs to it.
  val stillNeeded = !finishedApps.contains(id) || executors.values.exists(_.appId == id)
  if (!stillNeeded) {
    finishedApps -= id
    try {
      appDirectories.remove(id).foreach { dirList =>
        // Delete the directories asynchronously on the cleanup executor.
        val deletion = concurrent.Future {
          logInfo(s"Cleaning up local directories for application $id")
          dirList.foreach(dir => Utils.deleteRecursively(new File(dir)))
        }(cleanupThreadExecutor)

        deletion.failed.foreach { e =>
          logError(s"Clean up app dir $dirList failed: ${e.getMessage}", e)
        }(cleanupThreadExecutor)
      }
    } catch {
      // Submitting work to an already shut down pool is only logged.
      case _: RejectedExecutionException if cleanupThreadExecutor.isShutdown =>
        logWarning("Failed to cleanup application as executor pool was shutdown")
    }
    shuffleService.applicationRemoved(id)
  }
}
搬砖多年终不得要领,遂载源码看之望得真经。
分类:
spark-core
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek “源神”启动!「GitHub 热点速览」
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 我与微信审核的“相爱相杀”看个人小程序副业
· C# 集成 DeepSeek 模型实现 AI 私有化(本地部署与 API 调用教程)
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求