spark源码(一)Master服务启动
源码地址:"https://mirrors.tuna.tsinghua.edu.cn/apache/spark/spark-3.1.3/spark-3.1.3.tgz"
一.spark源码中基础概念的介绍:
1、RpcEnv:RPC 运行环境。RpcEndpoint 需要以名称注册到 RpcEnv 中才能接收消息,RpcEnv 负责把本地或远程发来的消息路由给对应的 RpcEndpoint,其地位类似于 SparkContext 之于 Spark 应用。
2、RpcEndpoint:RPC 中真正收发消息的类都需要实现这个接口,并按需实现其中的方法:onStart、receive(需要应答时为 receiveAndReply)、onStop。
3、Inbox:指令消息收件箱;Outbox:指令消息发件箱。
4、TransportClient:Netty 通信客户端,主要负责将相对应的 OutBox 中的数据发送给远程 TransportServer。
5、TransportServer:Netty 通信服务端,主要用于接收远程 RpcEndpoint 发送过来的消息。
二.Spark(Standalone 模式)启动时,会先启动 Master 和 Worker 节点,它们的入口分别是以下两个类的 main 方法:
org.apache.spark.deploy.master.Master.main()
org.apache.spark.deploy.worker.Worker.main()
三.Master服务启动 org.apache.spark.deploy.master.Master.main
1.main函数介绍
/**
 * Entry point of the standalone Master process: creates the configuration, parses the
 * startup arguments, starts the RPC environment together with the Master endpoint, and then
 * waits on the RPC environment via awaitTermination().
 *
 * @param argStrings raw command-line arguments passed to the Master
 */
def main(argStrings: Array[String]): Unit = {
// Install a JVM-wide default handler for uncaught exceptions (exitOnUncaughtException = false).
Thread.setDefaultUncaughtExceptionHandler(
new SparkUncaughtExceptionHandler(exitOnUncaughtException = false))
Utils.initDaemon(log)/* daemon initialisation; it is handed the logger */
/* Constructing SparkConf loads the "spark."-prefixed system properties (not OS environment variables) into SparkConf.settings (a ConcurrentHashMap according to the original note). */
val conf = new SparkConf
/* Parse the startup arguments; MasterArguments also receives conf so that it can keep updating it. */
val args = new MasterArguments(argStrings, conf)
/* Key call of this method - examined in detail later in the walkthrough. */
val (rpcEnv, _, _) = startRpcEnvAndEndpoint(args.host, args.port, args.webUiPort, conf)
rpcEnv.awaitTermination()
}
1.1 val conf = new SparkConf 详解
// Executable part of the SparkConf class body, i.e. what actually runs for `new SparkConf`.
// When loadDefaults is set, system properties are loaded with silent = false.
if (loadDefaults) {
loadFromSystemProperties(false)
}
1.1.1 loadFromSystemProperties(false) 详解
/**
 * Copies every system property whose key starts with "spark." into this SparkConf.
 *
 * @param silent forwarded unchanged to `set` for each copied entry
 * @return this SparkConf instance
 */
private[spark] def loadFromSystemProperties(silent: Boolean): SparkConf = {
// Only keys starting with "spark." are picked up. This is why Spark options must carry the
// "spark." prefix: properties without it are skipped here and therefore have no effect.
for ((key, value) <- Utils.getSystemProperties if key.startsWith("spark.")) {
set(key, value, silent)
}
this
}
// SparkConf also declares the following notable members (initialisers are omitted in this
// excerpt); the approach is a useful reference when building a configuration layer of your own.
// Map of deprecated configuration entries.
private val deprecatedConfigs: Map[String, DeprecatedConfig]
// Map from a current configuration key to its alternate (older) key names.
private val configsWithAlternatives = Map[String, Seq[AlternateConfig]]
1.2 val args = new MasterArguments(argStrings, conf) 详解
构造 MasterArguments 对象时,会依次执行以下代码:
parse(args.toList)/* parse the host/IP, port and web UI options; the method itself is shown in detail below */
/* Load the default Spark properties into conf and store the returned value back into propertiesFile. NOTE(review): presumably this falls back to the default spark-defaults.conf when no file was given and returns the path actually used - confirm against Utils.loadDefaultSparkProperties. */
propertiesFile = Utils.loadDefaultSparkProperties(conf, propertiesFile)
/* Up to this point only the port, host, UI and properties-file settings have been handled; nothing else has been parsed yet. */
1.2.1 parse(args.toList) 详解
// Recursively consumes the argument list. Only the host/IP, port, web UI port and
// properties-file options are recognised; --help and any unrecognised argument print the
// usage text via printUsageAndExit.
private def parse(args: List[String]): Unit = args match {
// --ip / -i: validate the value with Utils.checkHost and store it as host.
case ("--ip" | "-i") :: value :: tail =>
Utils.checkHost(value)
host = value
parse(tail)
// --host / -h: handled exactly like --ip.
case ("--host" | "-h") :: value :: tail =>
Utils.checkHost(value)
host = value
parse(tail)
// --port / -p: matches only when the next token is accepted by the IntParam extractor.
case ("--port" | "-p") :: IntParam(value) :: tail =>
port = value
parse(tail)
// --webui-port: same IntParam requirement as --port.
case "--webui-port" :: IntParam(value) :: tail =>
webUiPort = value
parse(tail)
// --properties-file: remember the given file name as-is.
case ("--properties-file") :: value :: tail =>
propertiesFile = value
parse(tail)
// --help: print usage, passing 0.
case ("--help") :: tail =>
printUsageAndExit(0)
// Every argument has been consumed: stop recursing.
case Nil => // No-op
// Anything else (unknown flag, missing or non-integer value): print usage, passing 1.
case _ =>
printUsageAndExit(1)
}
1.3 val (rpcEnv, _, _) = startRpcEnvAndEndpoint(args.host, args.port, args.webUiPort, conf) 详解
/**
 * Creates the RPC environment, registers a new Master endpoint in it and asks that endpoint
 * for its bound ports.
 *
 * @param host      host passed to RpcEnv.create
 * @param port      port passed to RpcEnv.create
 * @param webUiPort web UI port handed to the Master constructor
 * @param conf      Spark configuration shared by the security manager, RpcEnv and Master
 * @return the RpcEnv plus the webUIPort and restPort fields of the BoundPortsResponse
 */
def startRpcEnvAndEndpoint(host: String,port: Int,webUiPort: Int,conf: SparkConf): (RpcEnv, Int, Option[Int]) = {
val securityMgr = new SecurityManager(conf)// security-related setup; skipped in this walkthrough
val rpcEnv = RpcEnv.create(SYSTEM_NAME, host, port, conf, securityMgr)/* the container that hosts endpoints */
/* Set up the Master endpoint. Constructing Master runs its class body; the walkthrough then continues with Master.onStart, receive and electedLeader (leader election when several masters exist). */
val masterEndpoint = rpcEnv.setupEndpoint(ENDPOINT_NAME,
new Master(rpcEnv, rpcEnv.address, webUiPort, securityMgr, conf))
val portsResponse = masterEndpoint.askSync[BoundPortsResponse](BoundPortsRequest)/* synchronously ask the Master endpoint which ports it has bound */
(rpcEnv, portsResponse.webUIPort, portsResponse.restPort)
}
1.3.1 class Master的代码块 详解
// State initialised by the Master class body (runs when `new Master(...)` is evaluated).
private val forwardMessageThread =
ThreadUtils.newDaemonSingleThreadScheduledExecutor("master-forward-message-thread")/* single-threaded daemon scheduled executor used for forwarding messages */
private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)/* Hadoop configuration derived from conf */
// A def rather than a val: every call builds a fresh SimpleDateFormat.
private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
private val workerTimeoutMs = conf.get(WORKER_TIMEOUT) * 1000/* worker timeout in milliseconds (config value * 1000); spark.worker.timeout defaults to 60 s per the Spark docs, not 100 s */
private val retainedApplications = conf.get(RETAINED_APPLICATIONS)/* number of finished applications kept for the UI (default 200 per the Spark docs) */
private val retainedDrivers = conf.get(RETAINED_DRIVERS)/* number of finished drivers kept by the master (default 200 per the Spark docs) */
private val reaperIterations = conf.get(REAPER_ITERATIONS)/* per the Spark docs (spark.dead.worker.persistence): how many timeout intervals a dead worker is retained - an iteration count, not a number of workers */
private val recoveryMode = conf.get(RECOVERY_MODE)/* recovery mode of the Master itself (not of workers); matched in onStart */
// NOTE(review): presumably the maximum number of executor retries per application - confirm.
private val maxExecutorRetries = conf.get(MAX_EXECUTOR_RETRIES)
val workers = new HashSet[WorkerInfo]/* set of workers */
val idToApp = new HashMap[String, ApplicationInfo]/* application id -> application info */
private val waitingApps = new ArrayBuffer[ApplicationInfo]/* applications waiting to run */
val apps = new HashSet[ApplicationInfo]/* all applications */
private val idToWorker = new HashMap[String, WorkerInfo]/* worker id -> worker info */
private val addressToWorker = new HashMap[RpcAddress, WorkerInfo]/* RPC address -> worker info */
private val endpointToApp = new HashMap[RpcEndpointRef, ApplicationInfo]/* RPC endpoint reference -> application info */
private val addressToApp = new HashMap[RpcAddress, ApplicationInfo]/* RPC address -> application info */
private val completedApps = new ArrayBuffer[ApplicationInfo]/* completed applications */
// Counter; presumably used when generating application ids - confirm.
private var nextAppNumber = 0
private val drivers = new HashSet[DriverInfo]/* set of drivers */
// Completed drivers and drivers still waiting.
private val completedDrivers = new ArrayBuffer[DriverInfo]
private val waitingDrivers = new ArrayBuffer[DriverInfo]
// Counter; presumably used when generating driver ids - confirm.
private var nextDriverNumber = 0
// Validate the host of this master's RPC address during construction.
Utils.checkHost(address.host)
// Two metrics systems: one for the master instance, one for applications.
private val masterMetricsSystem =
MetricsSystem.createMetricsSystem(MetricsSystemInstances.MASTER, conf, securityMgr)
private val applicationMetricsSystem =
MetricsSystem.createMetricsSystem(MetricsSystemInstances.APPLICATIONS, conf, securityMgr)
private val masterSource = new MasterSource(this)/* MasterSource is constructed around this Master */
// Starts as null; assigned in onStart.
private var webUi: MasterWebUI = null
// Public address: the SPARK_PUBLIC_DNS environment value when set, otherwise address.host.
private val masterPublicAddress = {
val envVar = conf.getenv("SPARK_PUBLIC_DNS")
if (envVar != null) envVar else address.host
}
private val masterUrl = address.toSparkURL
// Left at its default here; assigned in onStart.
private var masterWebUiUrl: String = _
....................................
1.3.2 electedLeader 方法 选举Master
// Callback invoked when this master has been elected leader.
override def electedLeader(): Unit = {
// Sends an ElectedLeader message to this endpoint itself; it is then handled by the
// receive method, which is examined in detail later.
self.send(ElectedLeader)
}
1.3.3 onStart 方法 初始化RPC需要的属性 详解
// Start-up hook of the Master endpoint. It brings up the web UI, the optional REST server and
// the metrics systems, and selects the persistence engine / leader election agent. The key
// step for this walkthrough is the periodic task that triggers the worker-timeout check.
override def onStart(): Unit = {
webUi = new MasterWebUI(this, webUiPort)/* create the Master web UI on webUiPort; it is bound on the next line (the original note likens the mechanism to Spring MVC) */
webUi.bind()
masterWebUiUrl = webUi.webUrl
// With reverse proxying enabled, a configured proxy URL (trailing "/" stripped) replaces
// the web UI URL and is also published as the spark.ui.proxyBase system property.
if (reverseProxy) {
val uiReverseProxyUrl = conf.get(UI_REVERSE_PROXY_URL).map(_.stripSuffix("/"))
if (uiReverseProxyUrl.nonEmpty) {
System.setProperty("spark.ui.proxyBase", uiReverseProxyUrl.get)
masterWebUiUrl = uiReverseProxyUrl.get + "/"
}
webUi.addProxy()
}
/* Schedule a fixed-rate task: send CheckForWorkerTimeOut to self every workerTimeoutMs milliseconds, with an initial delay of 0. */
checkForWorkerTimeOutTask = forwardMessageThread.scheduleAtFixedRate(
() => Utils.tryLogNonFatalError { self.send(CheckForWorkerTimeOut) },
0, workerTimeoutMs, TimeUnit.MILLISECONDS)
// Optional REST submission server, created only when restServerEnabled is set.
if (restServerEnabled) {
val port = conf.get(MASTER_REST_SERVER_PORT)
restServer = Some(new StandaloneRestServer(address.host, port, conf, self, masterUrl))
}
// Start the REST server if one was created and keep the value returned by start().
restServerBoundPort = restServer.map(_.start())
// Register the master source, start both metrics systems, then attach their servlet
// handlers to the web UI.
masterMetricsSystem.registerSource(masterSource)
masterMetricsSystem.start()
applicationMetricsSystem.start()
masterMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
applicationMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
val serializer = new JavaSerializer(conf)/* high-availability related code from here on; not the focus of this walkthrough */
// Pick the persistence engine and leader election agent according to recoveryMode.
val (persistenceEngine_, leaderElectionAgent_) = recoveryMode match {
case "ZOOKEEPER" =>
logInfo("Persisting recovery state to ZooKeeper")
val zkFactory =
new ZooKeeperRecoveryModeFactory(conf, serializer)
(zkFactory.createPersistenceEngine(), zkFactory.createLeaderElectionAgent(this))/* ZooKeeper-based leader election */
case "FILESYSTEM" =>
val fsFactory =
new FileSystemRecoveryModeFactory(conf, serializer)
(fsFactory.createPersistenceEngine(), fsFactory.createLeaderElectionAgent(this))
// CUSTOM: load the factory class named by RECOVERY_MODE_FACTORY and instantiate it
// reflectively through its (SparkConf, Serializer) constructor.
case "CUSTOM" =>
val clazz = Utils.classForName(conf.get(RECOVERY_MODE_FACTORY))
val factory = clazz.getConstructor(classOf[SparkConf], classOf[Serializer])
.newInstance(conf, serializer)
.asInstanceOf[StandaloneRecoveryModeFactory]
(factory.createPersistenceEngine(), factory.createLeaderElectionAgent(this))
// Any other value. NOTE(review): judging by the class names this is a no-op persistence
// engine with this master always acting as leader - confirm.
case _ =>
(new BlackHolePersistenceEngine(), new MonarchyLeaderAgent(this))
}
persistenceEngine = persistenceEngine_
leaderElectionAgent = leaderElectionAgent_
}
搬砖多年终不得要领,遂载源码看之望得真经。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?