spark源码(六)Worker服务启动
一.main函数详解
/**
 * Worker JVM entry point: installs a process-wide uncaught-exception handler,
 * parses the command-line arguments, starts the worker's RpcEnv with its
 * Worker endpoint, and blocks until the RpcEnv terminates.
 */
def main(argStrings: Array[String]): Unit = {
  Thread.setDefaultUncaughtExceptionHandler(new SparkUncaughtExceptionHandler(
    exitOnUncaughtException = false))
  Utils.initDaemon(log) // logging/daemon setup
  val conf = new SparkConf
  val args = new WorkerArguments(argStrings, conf)
  // Start the RpcEnv and register the Worker endpoint on it.
  val rpcEnv = startRpcEnvAndEndpoint(args.host, args.port, args.webUiPort, args.cores,
    args.memory, args.masters, args.workDir, conf = conf,
    resourceFileOpt = conf.get(SPARK_WORKER_RESOURCE_FILE))
  val externalShuffleServiceEnabled = conf.get(config.SHUFFLE_SERVICE_ENABLED)
  // Defaults to a single worker instance per host.
  val sparkWorkerInstances = scala.sys.env.getOrElse("SPARK_WORKER_INSTANCES", "1").toInt
  // At most one external shuffle service can run per host, so reject the
  // combination of multiple worker instances with the external shuffle service.
  require(!externalShuffleServiceEnabled || sparkWorkerInstances <= 1,
    "Starting multiple workers on one host is failed because we may launch no more than one " +
    "external shuffle service on each host, please set spark.shuffle.service.enabled to " +
    "false or set SPARK_WORKER_INSTANCES to 1 to resolve the conflict.")
  rpcEnv.awaitTermination()
}
1.1 startRpcEnvAndEndpoint 详解
/**
 * Creates the worker's RpcEnv and registers a Worker endpoint on it.
 * Returns the RpcEnv so the caller can await its termination.
 */
def startRpcEnvAndEndpoint(
    host: String,
    port: Int,
    webUiPort: Int,
    cores: Int,
    memory: Int,
    masterUrls: Array[String],
    workDir: String,
    workerNumber: Option[Int] = None,
    conf: SparkConf = new SparkConf,
    resourceFileOpt: Option[String] = None): RpcEnv = {
  // Suffix the system name with the worker number when one is supplied.
  val systemName = SYSTEM_NAME + workerNumber.map(_.toString).getOrElse("")
  val securityManager = new SecurityManager(conf)
  val env = RpcEnv.create(systemName, host, port, conf, securityManager)
  // The worker actively connects out to the masters.
  val masterAddresses = masterUrls.map(RpcAddress.fromSparkURL)
  // Registering the endpoint runs Worker's constructor body and, once the
  // endpoint is wired up, its onStart()/receive() lifecycle methods.
  env.setupEndpoint(ENDPOINT_NAME, new Worker(env, webUiPort, cores, memory,
    masterAddresses, ENDPOINT_NAME, workDir, conf, securityManager, resourceFileOpt))
  env
}
二. 代码块 详解
// Constructor-body initialization (field setup omitted in this excerpt):
// optionally wire the SIGPWR signal to the worker-decommission flow.
if (!conf.get(config.DECOMMISSION_ENABLED)) {
  logInfo("Worker decommissioning not enabled, SIGPWR will result in exiting.")
} else {
  logInfo("Registering SIGPWR handler to trigger decommissioning.")
  SignalUtils.register("PWR", "Failed to register SIGPWR handler - " +
      "disabling worker decommission feature.") {
    // Forward the signal to this endpoint; handled in receive().
    self.send(WorkerSigPWRReceived)
    true
  }
}
// Resolve the Spark home directory from configuration: in test mode it comes
// from the mandatory spark.test.home system property, otherwise from the
// SPARK_HOME environment variable (falling back to the current directory).
private val sparkHome = {
  val homePath =
    if (sys.props.contains(IS_TESTING.key)) {
      assert(sys.props.contains("spark.test.home"), "spark.test.home is not set!")
      sys.props("spark.test.home")
    } else {
      sys.env.getOrElse("SPARK_HOME", ".")
    }
  new File(homePath)
}
// An executor's resources are reclaimed once it finishes, but executors in
// later stages still need to locate its shuffle output files — hence a
// long-lived service that keeps serving that data. Use the injected supplier
// when one is provided (e.g. for tests), otherwise build the default service.
private val shuffleService =
  Option(externalShuffleServiceSupplier)
    .map(_.get())
    .getOrElse(new ExternalShuffleService(conf, securityMgr))
三. onStart 方法详解
/**
 * Lifecycle hook invoked after this endpoint is registered with the RpcEnv:
 * prepares the work directory, shuffle service, resources and web UI, then
 * registers with the master(s) and starts the metrics system.
 */
override def onStart(): Unit = {
  assert(!registered)
  logInfo(s"Starting Spark worker $host:$port with $cores cores, " +
    s"${Utils.megabytesToString(memory)} RAM")
  logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}")
  logInfo(s"Spark home: $sparkHome")
  createWorkDir() // set up the local working directory
  startExternalShuffleService() // launch the shuffle service if enabled
  setupWorkerResources()
  // Bring up the worker web UI and record its externally visible URL.
  webUi = new WorkerWebUI(this, workDir, webUiPort)
  webUi.bind()
  workerWebUiUrl = s"${webUi.scheme}$publicAddress:${webUi.boundPort}"
  registerWithMaster() // announce this worker to the master(s)
  // Metrics: register the worker source and expose handlers on the web UI.
  metricsSystem.registerSource(workerSource)
  metricsSystem.start()
  metricsSystem.getServletHandlers.foreach(webUi.attachHandler)
}
1.1 registerWithMaster 详解
/**
 * Kicks off registration with every master, unless an attempt is already
 * scheduled. Also arms a fixed-rate timer that keeps sending
 * ReregisterWithMaster to this endpoint — a heartbeat-like retry loop.
 */
private def registerWithMaster(): Unit = {
  if (registrationRetryTimer.isDefined) {
    logInfo("Not spawning another attempt to register with the master, since there is an" +
      " attempt scheduled already.")
  } else {
    registered = false
    registerMasterFutures = tryRegisterAllMasters() // one async attempt per master
    connectionAttemptCount = 0
    // Periodically ask ourselves to re-register until a master accepts us.
    registrationRetryTimer = Some(forwardMessageScheduler.scheduleAtFixedRate(
      () => Utils.tryLogNonFatalError { Option(self).foreach(_.send(ReregisterWithMaster)) },
      INITIAL_REGISTRATION_RETRY_INTERVAL_SECONDS,
      INITIAL_REGISTRATION_RETRY_INTERVAL_SECONDS,
      TimeUnit.SECONDS))
  }
}
1.1.1 tryRegisterAllMasters 详解
/**
 * Submits one asynchronous registration task per known master address and
 * returns the resulting futures so they can later be cancelled.
 */
private def tryRegisterAllMasters(): Array[JFuture[_]] = {
  masterRpcAddresses.map { addr =>
    // Each registration attempt runs on the dedicated registration pool.
    registerMasterThreadPool.submit(new Runnable {
      override def run(): Unit = {
        try {
          logInfo(s"Connecting to master $addr...")
          // Resolve a reference to the remote Master endpoint.
          val master = rpcEnv.setupEndpointRef(addr, Master.ENDPOINT_NAME)
          // Ultimately sends RegisterWorker, handled by Master's receive().
          sendRegisterMessageToMaster(master)
        } catch {
          case ie: InterruptedException => // Cancelled
          case NonFatal(e) => logWarning(s"Failed to connect to master $addr", e)
        }
      }
    })
  }
}
搬砖多年终不得要领,遂载源码看之望得真经。