spark源码(四)Master receive方法
Master receive全部方法
一.case ElectedLeader => ......
二.case CompleteRecovery => ......
三.case RevokedLeadership => ......
四.case WorkerDecommissioning => ......
五.case DecommissionWorkers => ......
六.case RegisterWorker => ......
七.case RegisterApplication => ......
八.case ExecutorStateChanged => ......
九.case DriverStateChanged => ......
十.case Heartbeat => ......
十一.case MasterChangeAcknowledged => ......
十二.case WorkerSchedulerStateResponse => ......
十三.case WorkerLatestState => ......
十四.case UnregisterApplication => ......
十五.case CheckForWorkerTimeOut => ......
九.DriverStateChanged 详解
// Dispatch on the driver state carried by the DriverStateChanged message.
state match {
case DriverState.ERROR | DriverState.FINISHED | DriverState.KILLED | DriverState.FAILED =>
removeDriver(driverId, state, exception)// the four terminal states are handed to removeDriver, which does the real work
case _ =>// every other state (the original note lists SUBMITTED, RUNNING, RELAUNCHING, UNKNOWN) is rejected: an Exception is thrown, not merely logged
throw new Exception(s"Received unexpected state update for driver $driverId: $state")
}
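// removeDriver 上面其实是有的 但是有一个疑问:为什么没有worker.send(removeDriver)方法呢?原因是 DriverStateChanged 这条消息通常就是 Worker 在 driver 结束后上报给 Master 的,Worker 端已经知道该 driver 的最终状态并会自行清理,所以 Master 只需要更新自己的记录,不需要再反向通知 Worker。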
十.Heartbeat 详解 整体还是比较简单的
// Look the sender up by id; only a worker present in idToWorker gets its heartbeat recorded.
idToWorker.get(workerId) match {
case Some(workerInfo) =>
workerInfo.lastHeartbeat = System.currentTimeMillis()/* known worker: store the time of this heartbeat (a timestamp, not a duration) */
case None =>
if (workers.map(_.id).contains(workerId)) {
logWarning(s"Got heartbeat from unregistered worker $workerId." +
" Asking it to re-register.")
worker.send(ReconnectWorker(masterUrl))// id is still in `workers` but not in idToWorker: ask the sender to register again (worker-side handling of ReconnectWorker is not shown here)
} else {
logWarning(s"Got heartbeat from unregistered worker $workerId." +
" This worker was never registered, so ignoring the heartbeat.")
}
}
十一.MasterChangeAcknowledged 详解
// Set the acknowledging application's state to WAITING, then call completeRecovery() if canCompleteRecovery holds.
idToApp.get(appId) match {
case Some(app) =>
logInfo("Application has been re-registered: " + appId)
app.state = ApplicationState.WAITING
case None =>
// Unknown application id: only a warning is logged, no state is changed.
logWarning("Master change ack from unknown app: " + appId)
}
if (canCompleteRecovery) { completeRecovery() }// per the original note, completeRecovery was covered earlier in this series
十二.WorkerSchedulerStateResponse 详解
// Processes a worker's report of the executors (execResponses) and drivers (driverResponses) it holds:
// the worker is marked ALIVE and the reported executors/drivers are written back into the master's records.
// NOTE(review): the original note guessed this handles a worker that dropped offline and re-requests
// resources; the "re-registered" log text and the completeRecovery() call below suggest it belongs to
// master recovery instead -- confirm against the sender of this message.
idToWorker.get(workerId) match {
case Some(worker) =>
logInfo("Worker has been re-registered: " + workerId)
worker.state = WorkerState.ALIVE
// Keep only executors whose application id is still present in idToApp.
val validExecutors = execResponses.filter(
exec => idToApp.get(exec.desc.appId).isDefined)
for (exec <- validExecutors) {// record each reported executor again on both the app and the worker; no message is sent to the worker in this loop
val (execDesc, execResources) = (exec.desc, exec.resources)
val app = idToApp(execDesc.appId)
val execInfo = app.addExecutor(
worker, execDesc.cores, execResources, Some(execDesc.execId))
worker.addExecutor(execInfo)
worker.recoverResources(execResources)
execInfo.copyState(execDesc)
}
for (driver <- driverResponses) {// re-attach each reported driver that the master already tracks in `drivers`
val (driverId, driverResource) = (driver.driverId, driver.resources)
// The lambda parameter `driver` below shadows the loop variable of the same name.
drivers.find(_.id == driverId).foreach { driver =>
driver.worker = Some(worker)
driver.state = DriverState.RUNNING
driver.withResources(driverResource)
worker.recoverResources(driverResource)
worker.addDriver(driver)
}
}
case None =>
logWarning("Scheduler state from unknown worker: " + workerId)
}
if (canCompleteRecovery) { completeRecovery() }
十三.WorkerLatestState 详解
// Reconciles what the worker reports against the master's records for that worker: any reported
// executor or driver that the master does not have on record is told to stop.
// NOTE(review): the original note suggested this is for slow-task detection; nothing in this
// snippet shows that -- confirm the real trigger against the sender of this message.
idToWorker.get(workerId) match {
case Some(worker) =>
for (exec <- executors) {// kill every reported executor that is absent from worker.executors
val executorMatches = worker.executors.exists {
case (_, e) => e.application.id == exec.appId && e.id == exec.execId
}
if (!executorMatches) {
worker.endpoint.send(KillExecutor(masterUrl, exec.appId, exec.execId))
}
}
for (driverId <- driverIds) {// kill every reported driver that is absent from worker.drivers
val driverMatches = worker.drivers.exists { case (id, _) => id == driverId }
if (!driverMatches) {
worker.endpoint.send(KillDriver(driverId))
}
}
case None =>
logWarning("Worker state from unknown worker: " + workerId)
}
十四.UnregisterApplication 详解 杀掉application
// Log the request, then finish the application if its id is known; an unknown id is silently ignored.
logInfo(s"Received unregister request from application $applicationId")
idToApp.get(applicationId).foreach(finishApplication)// per the original note, finishApplication ends up in removeApplication (not shown here -- verify)
十五.CheckForWorkerTimeOut 详解
//worker 心跳超时(长时间没有收到心跳)后,就把这个 worker 节点移除
onStart 方法里面注册了一个定时调度任务,周期性 send 给 Master 自己的就是这条 CheckForWorkerTimeOut 消息
timeOutDeadWorkers()
/**
 * Sweeps the registered workers and handles every one whose last heartbeat is
 * older than `workerTimeoutMs`.
 *
 * A stale worker that is not yet DEAD is logged and passed to `removeWorker`.
 * A stale worker that is already DEAD is dropped from `workers` only once its
 * last heartbeat is older than `(reaperIterations + 1) * workerTimeoutMs`.
 */
private def timeOutDeadWorkers(): Unit = {
  val now = System.currentTimeMillis()
  val heartbeatCutoff = now - workerTimeoutMs
  // Snapshot the stale workers into an array first: the loop below removes from `workers`.
  // This is a plain linear scan; no sorted structure is used.
  val expired = workers.filter(_.lastHeartbeat < heartbeatCutoff).toArray
  expired.foreach { stale =>
    if (stale.state == WorkerState.DEAD) {
      // Already marked DEAD: keep it around until the longer grace period has elapsed.
      val purgeCutoff = now - ((reaperIterations + 1) * workerTimeoutMs)
      if (stale.lastHeartbeat < purgeCutoff) {
        workers -= stale
      }
    } else {
      val workerTimeoutSecs = TimeUnit.MILLISECONDS.toSeconds(workerTimeoutMs)
      logWarning("Removing %s because we got no heartbeat in %d seconds".format(
        stale.id, workerTimeoutSecs))
      removeWorker(stale, s"Not receiving heartbeat for $workerTimeoutSecs seconds")
    }
  }
}
搬砖多年终不得要领,遂载源码看之望得真经。