Spark内核源码解析六:worker原理解析和源码解析
1、worker里面先找到launchDriver
// Worker message handler: the master asked this worker to launch a driver.
case LaunchDriver(driverId, driverDesc) =>
  logInfo(s"Asked to launch driver $driverId")
  // Pass the driver command through Worker.maybeUpdateSSLSettings before it reaches the runner.
  val runnerDesc =
    driverDesc.copy(command = Worker.maybeUpdateSSLSettings(driverDesc.command, conf))
  // DriverRunner launches the driver on its own thread (see DriverRunner.start).
  // NOTE(review): the original note says the runner also restarts a failed driver;
  // that logic is not visible here - confirm in DriverRunner.launchDriver.
  val runner = new DriverRunner(
    conf,
    driverId,
    workDir,
    sparkHome,
    runnerDesc,
    self,
    akkaUrl)
  drivers(driverId) = runner
  runner.start()
  // Book the cores and memory requested by the driver description against this worker.
  coresUsed += driverDesc.cores
  memoryUsed += driverDesc.mem
DriverRunner里面的start方法
/**
 * Spawns a thread named after the driver id that prepares the working directory,
 * fetches the user jar, launches the driver process and finally reports the
 * resulting state to the worker.
 */
def start() = {
  val runnerThread = new Thread("DriverRunner for " + driverId) {
    override def run(): Unit = {
      try {
        // Create the driver's working directory.
        val driverDir = createWorkingDirectory()
        // Download the user jar into the working directory; the result is its local path.
        val localJarFilename = downloadUserJar(driverDir)

        // Replaces the placeholders that may appear in the driver command.
        def expandPlaceholder(argument: String): String = argument match {
          case "{{WORKER_URL}}" => workerUrl
          case "{{USER_JAR}}" => localJarFilename
          case other => other
        }

        // TODO: If we add ability to submit multiple jars they should also be added here
        // Build the ProcessBuilder from the driver command and its memory setting.
        val builder = CommandUtils.buildProcessBuilder(
          driverDesc.command,
          driverDesc.mem,
          sparkHome.getAbsolutePath,
          expandPlaceholder)
        launchDriver(builder, driverDir, driverDesc.supervise)
      } catch {
        case e: Exception => finalException = Some(e)
      }

      // Derive the final state: a kill wins over an exception, which wins over the exit code.
      val state =
        if (killed) DriverState.KILLED
        else if (finalException.isDefined) DriverState.ERROR
        else if (finalExitCode == Some(0)) DriverState.FINISHED
        else DriverState.FAILED

      finalState = Some(state)

      // Report the final state (and any captured exception) to the worker.
      worker ! DriverStateChanged(driverId, state, finalException)
    }
  }
  runnerThread.start()
}
worker接收到DriverRunner发来的driver状态变化消息(DriverStateChanged)后,会将该消息转发给master,并清理本地缓存、释放该driver占用的内存和cpu
// Worker message handler for a driver state change: logs the new state, forwards the
// message to the master, then moves the driver from `drivers` to `finishedDrivers` and
// gives back the memory and cores it had been charged for.
case DriverStateChanged(driverId, state, exception) => {
  // Log at a level that matches the reported state.
  state match {
    case DriverState.ERROR =>
      // `exception.get` assumes an ERROR state always carries an exception.
      logWarning(s"Driver $driverId failed with unrecoverable exception: ${exception.get}")
    case DriverState.FAILED =>
      logWarning(s"Driver $driverId exited with failure")
    case DriverState.FINISHED =>
      logInfo(s"Driver $driverId exited successfully")
    case DriverState.KILLED =>
      logInfo(s"Driver $driverId was killed by user")
    case _ =>
      logDebug(s"Driver $driverId changed state to $state")
  }
  // Forward the same message to the master.
  master ! DriverStateChanged(driverId, state, exception)
  // NOTE(review): `.get` throws NoSuchElementException if driverId is not in `drivers`
  // - confirm that a state change can never arrive for an unknown driver.
  val driver = drivers.remove(driverId).get
  finishedDrivers(driverId) = driver
  // Release the resources that were added when the driver was launched.
  memoryUsed -= driver.driverDesc.mem
  coresUsed -= driver.driverDesc.cores
// NOTE(review): the closing brace of this case body is not part of the excerpt.
worker接收到启动executor消息后
首先master发送启动executor消息
/**
 * Registers `exec` on the given worker, asks that worker to launch it, and tells the
 * owning application's driver that an executor has been added.
 *
 * @param worker the worker that will host the executor
 * @param exec   description of the executor to launch
 */
def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = {
  logInfo(s"Launching executor ${exec.fullId} on worker ${worker.id}")

  // Record the executor on the worker's bookkeeping object.
  worker.addExecutor(exec)

  // Send the LaunchExecutor message to the worker's actor.
  val launchRequest = LaunchExecutor(
    masterUrl,
    exec.application.id,
    exec.id,
    exec.application.desc,
    exec.cores,
    exec.memory)
  worker.actor ! launchRequest

  // Send ExecutorAdded to the driver of the application that owns this executor.
  val addedNotice = ExecutorAdded(
    exec.id,
    worker.id,
    worker.hostPort,
    exec.cores,
    exec.memory)
  exec.application.driver ! addedNotice
}
// Worker message handler: launch an executor, but only when the request comes from
// the currently active master.
case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
  if (masterUrl == activeMasterUrl) {
    // Key under which this executor is tracked in `executors`; also its directory suffix.
    val fullId = appId + "/" + execId
    try {
      logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

      // Create the executor's working directory.
      val executorDir = new File(workDir, fullId)
      if (!executorDir.mkdirs()) {
        throw new IOException("Failed to create directory " + executorDir)
      }

      // Create local dirs for the executor. These are passed to the executor via the
      // SPARK_LOCAL_DIRS environment variable, and deleted by the Worker when the
      // application finishes. Directories already recorded for this app are reused.
      val appLocalDirs = appDirectories.getOrElse(appId, {
        Utils.getOrCreateLocalRootDirs(conf).map { dir =>
          Utils.createDirectory(dir).getAbsolutePath()
        }.toSeq
      })
      appDirectories(appId) = appLocalDirs

      // Create the ExecutorRunner and remember it before starting it.
      val runner = new ExecutorRunner(
        appId,
        execId,
        appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
        cores_,
        memory_,
        self,
        workerId,
        host,
        webUi.boundPort,
        publicAddress,
        sparkHome,
        executorDir,
        akkaUrl,
        conf,
        appLocalDirs,
        ExecutorState.LOADING)
      executors(fullId) = runner
      runner.start()

      // Charge the executor's resources to this worker and report its state to the master.
      coresUsed += cores_
      memoryUsed += memory_
      master ! ExecutorStateChanged(appId, execId, runner.state, None, None)
    } catch {
      case e: Exception =>
        logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
        // If the runner was already registered, kill it and drop it from the cache.
        executors.get(fullId).foreach { registered =>
          registered.kill()
          executors -= fullId
        }
        master ! ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
          Some(e.toString), None)
    }
  } else {
    logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
  }
/**
 * Starts the thread that runs `fetchAndRunExecutor()` and then registers a JVM
 * shutdown hook that calls `killProcess`.
 */
def start(): Unit = {
  val runnerThread = new Thread("ExecutorRunner for " + fullId) {
    override def run(): Unit = fetchAndRunExecutor()
  }
  workerThread = runnerThread
  runnerThread.start()

  // Shutdown hook: kill the executor process when the worker JVM is shutting down.
  val hook = new Thread() {
    override def run(): Unit = killProcess(Some("Worker shutting down"))
  }
  shutdownHook = hook
  Runtime.getRuntime.addShutdownHook(hook)
}
/**
 * Builds the executor launch command, starts the process, redirects its output to
 * files in the executor directory, waits for it to exit and reports the exit to the
 * worker. An interrupt or any other exception kills the process instead.
 */
def fetchAndRunExecutor(): Unit = {
  try {
    // Assemble the ProcessBuilder for the executor command.
    val builder = CommandUtils.buildProcessBuilder(
      appDesc.command,
      memory,
      sparkHome.getAbsolutePath,
      substituteVariables)
    val command = builder.command()
    // Quoted command line, used for the log entry and for the stderr header below.
    val quotedCommand = command.mkString("\"", "\" \"", "\"")
    logInfo("Launch command: " + quotedCommand)

    builder.directory(executorDir)
    val env = builder.environment
    env.put("SPARK_LOCAL_DIRS", appLocalDirs.mkString(","))
    // In case we are running this from within the Spark Shell, avoid creating a "scala"
    // parent process for the executor command
    env.put("SPARK_LAUNCH_WITH_SCALA", "0")

    // Add webUI log urls
    val baseUrl =
      s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
    env.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
    env.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

    // Launch the process.
    process = builder.start()
    val header = "Spark Executor Command: %s\n%s\n\n".format(quotedCommand, "=" * 40)

    // Redirect its stdout and stderr to files; stderr starts with the command header.
    val stdoutFile = new File(executorDir, "stdout")
    stdoutAppender = FileAppender(process.getInputStream, stdoutFile, conf)

    val stderrFile = new File(executorDir, "stderr")
    Files.write(header, stderrFile, UTF_8)
    stderrAppender = FileAppender(process.getErrorStream, stderrFile, conf)

    // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
    // or with nonzero exit code
    val exitCode = process.waitFor()
    state = ExecutorState.EXITED
    val message = "Command exited with code " + exitCode
    // Tell the worker that the executor's state changed.
    worker ! ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode))
  } catch {
    case interrupted: InterruptedException =>
      logInfo("Runner thread for executor " + fullId + " interrupted")
      state = ExecutorState.KILLED
      killProcess(None)
    case e: Exception =>
      logError("Error running executor", e)
      state = ExecutorState.FAILED
      killProcess(Some(e.toString))
  }
}
worker收到executor的状态变化消息后,会向master转发该executor状态变化消息
// Worker message handler for an executor state change: always relays the message to the
// master; for a finished state it also retires the executor locally and frees its resources.
case ExecutorStateChanged(appId, execId, state, message, exitStatus) =>
  master ! ExecutorStateChanged(appId, execId, state, message, exitStatus)
  val fullId = appId + "/" + execId
  if (ExecutorState.isFinished(state)) {
    // Optional trailing details shared by both log messages.
    val details = message.map(" message " + _).getOrElse("") +
      exitStatus.map(" exitStatus " + _).getOrElse("")
    val tracked = executors.get(fullId)
    val label = if (tracked.isDefined) "Executor " else "Unknown Executor "
    logInfo(label + fullId + " finished with state " + state + details)

    // Only an executor this worker knows about has bookkeeping to undo.
    tracked.foreach { executor =>
      executors -= fullId
      finishedExecutors(fullId) = executor
      coresUsed -= executor.cores
      memoryUsed -= executor.memory
    }
    maybeCleanupApplication(appId)
  }
/**
 * Launches `exec` on `worker`: records it on the worker, sends LaunchExecutor to the
 * worker's actor and sends ExecutorAdded to the application's driver.
 *
 * @param worker the worker chosen to run the executor
 * @param exec   the executor to launch
 */
def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = {
  logInfo("Launching executor %s on worker %s".format(exec.fullId, worker.id))
  // Add the executor to the worker's internal cache.
  worker.addExecutor(exec)

  // Message for the worker's actor: start this executor.
  val toWorker = LaunchExecutor(
    masterUrl, exec.application.id, exec.id, exec.application.desc, exec.cores, exec.memory)
  // Message for the application's driver: an executor has been added.
  val toDriver = ExecutorAdded(
    exec.id, worker.id, worker.hostPort, exec.cores, exec.memory)

  // The worker is told first, then the driver.
  worker.actor ! toWorker
  exec.application.driver ! toDriver
}