spark源码(三)Master receive方法
Master receive 方法的全部 case 分支
一.case ElectedLeader => ......
二.case CompleteRecovery => ......
三.case RevokedLeadership => ......
四.case WorkerDecommissioning => ......
五.case DecommissionWorkers => ......
六.case RegisterWorker => ......
七.case RegisterApplication => ......
八.case ExecutorStateChanged => ......
九.case DriverStateChanged => ......
十.case Heartbeat => ......
十一.case MasterChangeAcknowledged => ......
十二.case WorkerSchedulerStateResponse => ......
十三.case WorkerLatestState => ......
十四.case UnregisterApplication => ......
十五.case CheckForWorkerTimeOut => ......
三.RevokedLeadership 详解
// Handler body for RevokedLeadership: log the loss of leadership, then stop this master process.
logError("Leadership has been revoked -- master shutting down.")
System.exit(0) // Simple by design: the current master just exits the JVM with status 0.
// Author's note: despite what some articles claim, a plain exit is a perfectly graceful shutdown here.
四.WorkerDecommissioning 详解 单个worker节点下线(decommission,并非节点故障)
// Handler body for WorkerDecommissioning. `id` and `workerRef` are bound by the enclosing
// case pattern, which is not part of this excerpt.
if (state == RecoveryState.STANDBY) {// STANDBY: this master is the backup node, so it only replies MasterInStandby
workerRef.send(MasterInStandby)
} else {
// Look the worker up by id and decommission it; if the lookup yields nothing, nothing happens.
idToWorker.get(id).foreach(decommissionWorker)
}
4.1 decommissionWorker 详解
/**
 * Moves `worker` into the DECOMMISSIONED state and informs the driver of every executor
 * hosted on it.
 *
 * A worker that is already DECOMMISSIONED is left untouched and only a warning is logged.
 * Otherwise each executor on the worker is reported to its application's driver as
 * DECOMMISSIONED, marked DECOMMISSIONED locally, and removed from its application; finally
 * the worker is removed from the persistence engine.
 *
 * @param worker the worker to decommission
 */
private def decommissionWorker(worker: WorkerInfo): Unit = {
  if (worker.state == WorkerState.DECOMMISSIONED) {
    // Nothing to do: decommissioning has already happened for this worker.
    logWarning("Skipping decommissioning worker %s on %s:%d as worker is already decommissioned".
      format(worker.id, worker.host, worker.port))
  } else {
    logInfo("Decommissioning worker %s on %s:%d".format(worker.id, worker.host, worker.port))
    worker.setState(WorkerState.DECOMMISSIONED)
    worker.executors.values.foreach { executor =>
      logInfo("Telling app of decommission executors")
      // Tell the owning driver first, passing along the host of the decommissioned worker.
      executor.application.driver.send(ExecutorUpdated(
        executor.id,
        ExecutorState.DECOMMISSIONED,
        Some("worker decommissioned"),
        None,
        Some(worker.host)))
      executor.state = ExecutorState.DECOMMISSIONED
      // Drop the executor from its application's bookkeeping.
      executor.application.removeExecutor(executor)
    }
    persistenceEngine.removeWorker(worker)
  }
}
五.DecommissionWorkers 详解 批量下线 ids 中指定的 worker 节点
// Handler body for DecommissionWorkers. `ids` is bound by the enclosing case pattern,
// which is not part of this excerpt.
// The assertion fails if this master is in the STANDBY state.
assert(state != RecoveryState.STANDBY)
ids.foreach ( id =>
// Ids with no entry in idToWorker are skipped.
idToWorker.get(id).foreach { w =>
decommissionWorker(w)
// After the master-side bookkeeping, tell the worker itself to decommission.
w.endpoint.send(DecommissionWorker)
}
)
六.registerWorker 详解 注册节点
/**
 * Registers `worker` with this master.
 *
 * Any previously tracked worker with the same host and port that is in the DEAD state is
 * dropped from `workers` first. If another worker is still tracked under the same endpoint
 * address, it is replaced only when its state is UNKNOWN; in any other state the
 * registration is refused.
 *
 * @param worker the worker asking to be registered
 * @return true if the worker was added to the master's bookkeeping, false if a worker
 *         with the same address is already registered and not in the UNKNOWN state
 */
private def registerWorker(worker: WorkerInfo): Boolean = {
  // Collect the stale DEAD entries for this host:port before mutating `workers`.
  val deadDuplicates = workers.filter { known =>
    known.host == worker.host && known.port == worker.port && known.state == WorkerState.DEAD
  }
  deadDuplicates.foreach { stale =>
    workers -= stale
  }

  val workerAddress = worker.endpoint.address
  val accepted = addressToWorker.get(workerAddress) match {
    case Some(oldWorker) if oldWorker.state == WorkerState.UNKNOWN =>
      // The old entry is in the UNKNOWN state, so the new worker takes over its address.
      removeWorker(oldWorker, "Worker replaced by a new worker with same address")
      true
    case Some(_) =>
      logInfo("Attempted to re-register worker at same address: " + workerAddress)
      false
    case None =>
      true
  }

  if (accepted) {
    // The actual registration: record the worker in all three lookup structures.
    workers += worker
    idToWorker(worker.id) = worker
    addressToWorker(workerAddress) = worker
  }
  accepted
}
七.RegisterApplication 详解
// Handler body for RegisterApplication. `description` and `driver` are bound by the enclosing
// case pattern, which is not part of this excerpt.
if (state == RecoveryState.STANDBY) {
// ignore, don't send response
} else {
logInfo("Registering app " + description.name)
val app = createApplication(description, driver)// per the author's note: this only constructs a new ApplicationInfo object
registerApplication(app)/* registers the app's bookkeeping and resource information (author's note; helper not shown here) */
logInfo("Registered app " + description.name + " with ID " + app.id)
persistenceEngine.addApplication(app)
driver.send(RegisteredApplication(app.id, self))/* reply to the driver with a registration-success message */
schedule()
}
八.ExecutorStateChanged 详解
// Handler body for ExecutorStateChanged; the start of the enclosing case pattern is not part
// of this excerpt. It records the new executor state, forwards it to the driver, and cleans
// up finished executors.
(appId, execId, state, message, exitStatus) =>
// Resolve the executor through its application; either lookup may come back empty.
val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
execOption match {
case Some(exec) =>
val appInfo = idToApp(appId)
// Remember the previous state before overwriting it; it is checked further down.
val oldState = exec.state
exec.state = state
if (state == ExecutorState.RUNNING) {// on RUNNING: the previous state must be LAUNCHING, then the app's retry count is reset
assert(oldState == ExecutorState.LAUNCHING,
s"executor $execId state transfer from $oldState to RUNNING is illegal")
appInfo.resetRetryCount()
}
// Forward the state change to the application's driver.
exec.application.driver.send(ExecutorUpdated(execId, state, message, exitStatus, None))
if (ExecutorState.isFinished(state)) {// the executor reached a finished state: remove it
logInfo(s"Removing executor ${exec.fullId} because it is $state")
if (!appInfo.isFinished) {
appInfo.removeExecutor(exec)// drop the executor from the application (author's note: this only releases the bookkeeping entry)
}
exec.worker.removeExecutor(exec)
// Only an exit status of exactly Some(0) counts as a normal exit.
val normalExit = exitStatus == Some(0)
// Because && short-circuits, incrementRetryCount() is only called for an abnormal exit
// whose previous state was not DECOMMISSIONED.
if (!normalExit
&& oldState != ExecutorState.DECOMMISSIONED
&& appInfo.incrementRetryCount() >= maxExecutorRetries
&& maxExecutorRetries >= 0) { // the executor finished because of a failure and the retry limit is reached
val execs = appInfo.executors.values
// The application is removed only if none of its executors is still RUNNING.
if (!execs.exists(_.state == ExecutorState.RUNNING)) {
logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
s"${appInfo.retryCount} times; removing it")
removeApplication(appInfo, ApplicationState.FAILED)
}
}
}
// Runs for every known executor, whether or not it finished.
schedule()
case None =>
logWarning(s"Got status update for unknown executor $appId/$execId")
}
8.1 removeApplication 详解
/**
 * Removes `app` from the master's live bookkeeping and records it as completed.
 *
 * Does nothing when `app` is not contained in `apps`. Otherwise the application is dropped
 * from the lookup structures, appended to `completedApps`, its executors are killed, it is
 * marked finished with `state`, the driver is notified for any state other than FINISHED,
 * and every worker is sent ApplicationFinished.
 *
 * @param app   the application to remove
 * @param state the final state to mark the application with
 */
def removeApplication(app: ApplicationInfo, state: ApplicationState.Value): Unit = {
if (apps.contains(app)) {
logInfo("Removing app " + app.id)
// Drop the application from every live lookup structure.
apps -= app
idToApp -= app.id
endpointToApp -= app.driver
addressToApp -= app.driver.address
// Make room in completedApps once it holds retainedApplications entries: drop
// max(retainedApplications / 10, 1) entries from the front and remove their metrics sources.
if (completedApps.size >= retainedApplications) {
val toRemove = math.max(retainedApplications / 10, 1)
completedApps.take(toRemove).foreach { a =>
applicationMetricsSystem.removeSource(a.appSource)
}
completedApps.trimStart(toRemove)
}
completedApps += app
waitingApps -= app
// Kill every executor still recorded for this application.
for (exec <- app.executors.values) {
killExecutor(exec)
}
app.markFinished(state)
if (state != ApplicationState.FINISHED) {
app.driver.send(ApplicationRemoved(state.toString))// tell the driver its application was removed (skipped for FINISHED)
}
persistenceEngine.removeApplication(app)
schedule()
workers.foreach { w =>
w.endpoint.send(ApplicationFinished(app.id))// killExecutor ran per executor above; in addition every worker is told ApplicationFinished
}
}
}
8.1.1 killExecutor 详解
/**
 * Detaches `exec` from its worker, asks that worker to kill it, and marks it KILLED.
 *
 * @param exec the executor to kill
 */
private def killExecutor(exec: ExecutorDesc): Unit = {
  // Master-side bookkeeping first: the worker no longer tracks this executor.
  exec.worker.removeExecutor(exec)
  // The key step: the kill request sent to the worker that hosts the executor.
  val killRequest = KillExecutor(masterUrl, exec.application.id, exec.id)
  exec.worker.endpoint.send(killRequest)
  exec.state = ExecutorState.KILLED
}
搬砖多年终不得要领,遂载源码看之望得真经。
分类:
spark-core
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?