A simple Spark JobServer implemented through a template class
Before/after comparison:
Before: executing 13 nodes took 16 minutes.
After: the same 13 nodes took 3 minutes.
See the code and its comments below for the details of the logic.
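The post does not spell out the wire format of a job message, but from the fields the server reads ("class", plus "user", "expId" and "nodeId" inside "params"), a message presumably looks roughly like the sketch below. The class name and all values are hypothetical, and any job-specific arguments would ride along inside "params", since that whole object is handed to the job's main method as its single argument.

  // Illustrative job message (hypothetical class name and values);
  // jobWaitingQueue in the code below holds exactly such strings.
  val jobStr: String =
    """{
      |  "class": "xxx.main.job.SomeNodeJob",
      |  "params": { "user": "user1", "expId": 1001, "nodeId": 5 }
      |}""".stripMargin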
import java.util.concurrent.{ExecutorService, Executors, TimeUnit}

import akka.actor.{ActorSystem, Props}
import com.alibaba.fastjson.JSONObject
import xxx.listener.AddJobToQueueActor
import xxx.listener.bean.{AppStatusMessage, SparkContextStatusMessage}
import xxx.listener.utils.JSONUtil
import xxx.listener.utils.JmsUtils._
import xxx.main.SparkJob
import xxx.main.utils.JsonUtils
import com.typesafe.config.ConfigFactory
import org.apache.commons.lang.StringUtils
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{Logging, SparkConf, SparkContext}

import scala.collection.mutable.Queue

/**
 * Created by zpc on 2016/9/1.
 * JobServer implementation template.
 * Before: each task node was submitted to the Spark platform as a separate application, and starting
 *         the SparkContext and HiveContext alone took a large share of the time, roughly 40+ seconds.
 * After:  nodes belonging to the same user and requesting the same resources are sent via JMS messages
 *         to one shared SparkContext; by default 3 node tasks run in parallel.
 * How it works:
 * 1. A message placed on the queue describes one job: the SparkJob subclass to run and its parameters.
 *    Incoming jobs carry no dependency information; dependencies are handled on the sender side, which
 *    only sends the next node's job after its predecessor has finished.
 * 2. On the first submission the job parameters are taken from args. After startup a JMS listener is
 *    registered, and an actor appends every received job message to the waiting queue.
 * 3. The SparkJob subclasses (the classes that actually implement the node logic) are invoked via
 *    reflection. The SparkContext idle timeout defaults to 30 minutes.
 * 4. When a node finishes, a success message is sent to the web side; when it fails, the error log and
 *    an error message are sent instead.
 *    On normal exit a shutdown hook reports the SparkContext shutdown to the web side.
 *    When the process is killed, failure messages are sent for every job still in the waiting queue or
 *    in the running set.
 */
object ExecuteJobServer extends Logging {

  // queue of jobs waiting to be executed
  val jobWaitingQueue = new Queue[String]
  // set of jobs currently running
  val jobRunningSet = new scala.collection.mutable.HashSet[JSONObject]
  val timeout_mins = 30
  // timestamp of the most recent job activity, used for the idle timeout
  var lastRunTime = System.currentTimeMillis()

  // applicationId, user, expId and resource string associated with this SparkContext
  var appId: String = ""
  var user: String = ""
  var expId: Long = 0
  var resource: String = ""
  // JSON of the job currently being executed
  var jobJson: JSONObject = null

  def main(args: Array[String]): Unit = {

    // when the process is killed, send failure messages to the web side for running and waiting jobs
    Runtime.getRuntime().addShutdownHook(new HookMessage())
    // number of jobs that may run concurrently; can be raised if needed, tentatively 3
    val threadPool: ExecutorService = Executors.newFixedThreadPool(3)
    val sc = initSparkContext()
    val hiveContext = new HiveContext(sc)

    val list = JsonUtils.parseArray(args(0))
    val it = list.iterator
    while (it.hasNext) {
      val jobStr = it.next().toString
      if (expId == 0) {
        val json = JSONUtil.toJSONString(jobStr)
        val param = json.getJSONObject("params")
        appId = sc.applicationId
        user = param.getString("user")
        expId = param.getLongValue("expId")
        var driver_memory = ""
        var num_executors = "spark.executor.instances"
        var executor_memory = ""
        sc.getConf.getAll.foreach(x => {
          if (x._1 != null && "spark.executor.instances".equals(x._1)) {
            num_executors = x._2
          } else if (x._1 != null && "spark.executor.memory".equals(x._1)) {
            executor_memory = x._2.substring(0, x._2.length - 1)
          } else if (x._1 != null && "spark.driver.memory".equals(x._1)) {
            driver_memory = x._2.substring(0, x._2.length - 1)
          }
        })

        resource = driver_memory + num_executors + executor_memory
        logInfo("resource is : " + resource)
        // resource = param.getString("driver-memory") + param.getString("num-executors") + param.getString("executor-memory")
      }
      jobWaitingQueue.enqueue(jobStr)
    }

    /** 1. Start the listener on appId to receive JobMsg messages sent to the queue.
      * 2. Notify the web side via the queue that the SparkContext has started. **/
    val system = ActorSystem("mlp", ConfigFactory.load())
    val actor = system.actorOf(Props(new AddJobToQueueActor(appId, jobWaitingQueue)))
    createTopicListenerOfContextJobMsg("contextJobMsgListener", actor)
    informSparkContextStatus(true)

    while (jobWaitingQueue.size > 0 || !checkTimeOut()) {
      while (jobWaitingQueue.size > 0) {
        lastRunTime = System.currentTimeMillis()
        val jobStr = jobWaitingQueue.dequeue() //.replace("\\", "")
        logInfo("***** ExecuteJobServer jobStr ***** jobStr: " + jobStr)
        val json = JSONUtil.toJSONString(jobStr)
        jobRunningSet.add(json)
        threadPool.execute(new ThreadSparkJob(json, hiveContext, sc))
        jobJson = json
      }
      Thread.sleep(1000)
    }

    /** jobWaitingQueue no longer accepts messages */
    threadPool.shutdown()
    var loop = true
    do {
      // wait for all submitted jobs to finish
      loop = !threadPool.awaitTermination(2, TimeUnit.SECONDS) // blocks until every task in the pool has completed
    } while (loop)
  }

  def checkTimeOut(): Boolean = {
    val nowTime = System.currentTimeMillis()
    if (jobRunningSet.isEmpty && (nowTime - lastRunTime) / (1000 * 60) > timeout_mins) {
      return true
    } else {
      return false
    }
  }

  class ThreadSparkJob(json: JSONObject, hiveContext: HiveContext, ctx: SparkContext) extends Runnable {
    override def run() {

      try {
        val classStr = json.get("class").toString
        val argsStr = json.get("params").toString
        val obj: SparkJob = Class.forName(classStr).getMethod("self").invoke(null).asInstanceOf[SparkJob]
        // val obj: SparkJob = Class.forName(classStr).newInstance().asInstanceOf[SparkJob]
        obj.jobServer = true
        obj.failed = false
        obj.setContext(ctx)
        obj.setHiveContext(hiveContext)
        obj.main(Array(argsStr))
        // InformJobSuccess(json)
        logInfo("***** jobRunningSet remove job json***** json: " + json.toJSONString)
        jobRunningSet.remove(json)
        lastRunTime = System.currentTimeMillis()
      } catch {
        case oom: OutOfMemoryError => {
          informJobFailure(oom.toString, json)
          jobRunningSet.remove(json)
          logInfo("***** SparkContext go to stop, reason: " + oom.getMessage)
          hiveContext.sparkContext.stop()
          // on error, stop the SparkContext and exit the driver
          System.exit(1)
        }
        case ex: Exception => {
          informJobFailure(ex.toString, json)
          jobRunningSet.remove(json)
          if (ex.toString.contains("stopped SparkContext")) {
            logInfo("***** SparkContext go to stop, reason: " + ex.getMessage)
            hiveContext.sparkContext.stop()
            // on error, stop the SparkContext and exit the driver
            System.exit(1)
          }
        }
      }
    }

    def informJobFailure(errMsg: String, json: JSONObject) = {
      if (json != null) {
        val params = json.getJSONObject("params")
        val user = StringUtils.trimToEmpty(params.getString("user"))
        val expId = params.getLongValue("expId")
        val nodeId = params.getLongValue("nodeId")
        val message = new AppStatusMessage(user, expId, nodeId, "FAILURE", errMsg)
        logInfo("***** send informJobFailure message*****: expId: " + expId + " nodeId: " + nodeId)
        jobStatusTemplate send message
      }
    }
  }

  def initSparkContext(): SparkContext = {
    val conf = new SparkConf().setAppName("cbt-mlaas")
    new SparkContext(conf)
  }

  class HookMessage extends Thread {
    override def run() {
      var shouldInformStop = false
      informSparkContextStatus(false)
      while (jobWaitingQueue.size > 0) {
        val jobStr = jobWaitingQueue.dequeue() //.replace("\\", "")
        val json = JSONUtil.toJSONString(jobStr)
        informJobFailureInHook("SparkContext stopped, inform waiting job failed!", json)
        shouldInformStop = true
      }
      jobRunningSet.foreach(json => {
        informJobFailureInHook("SparkContext stopped, inform running job failed!", json)
        shouldInformStop = true
      })
      if (shouldInformStop) {
        informExpStop("SparkContext stopped, inform exp failed!", jobJson)
      }
    }

    def informJobFailureInHook(errMsg: String, json: JSONObject) = {
      if (json != null) {
        val params = json.getJSONObject("params")
        val user = StringUtils.trimToEmpty(params.getString("user"))
        val expId = params.getLongValue("expId")
        val nodeId = params.getLongValue("nodeId")
        val message = new AppStatusMessage(user, expId, nodeId, "FAILURE", errMsg)
        logInfo("***** send informJobFailure message*****: expId: " + expId + " nodeId: " + nodeId)
        jobStatusTemplate send message
      }
    }

    def informExpStop(errMsg: String, json: JSONObject) = {
      if (json != null) {
        val params = json.getJSONObject("params")
        val user = StringUtils.trimToEmpty(params.getString("user"))
        val expId = params.getLongValue("expId")
        val nodeId = params.getLongValue("nodeId")
        val message = new AppStatusMessage(user, expId, nodeId, "STOP", errMsg)
        logInfo("***** send informExpStop message*****: expId: " + expId + " nodeId: " + nodeId)
        jobStatusTemplate send message
      }
    }
  }

  def informSparkContextStatus(start: Boolean) = {
    val msg = new SparkContextStatusMessage(appId, start, user, expId, resource)
    logInfo("***** send sparkContext start message*****: appId: " + appId + " start: " + start)
    sparkContextStatusTemplate send msg
  }

}
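One piece the post does not include is the SparkJob base class itself. The template only relies on a small contract: every job class must expose a self accessor reachable via Class.forName(classStr).getMethod("self").invoke(null) (for example a companion-object method, whose static forwarder is what reflection finds), plus the jobServer and failed flags, the two context setters, and a main(Array[String]) entry point. A minimal sketch of that contract, where only those member names come from the code above and everything else is assumed:

  import org.apache.spark.SparkContext
  import org.apache.spark.sql.hive.HiveContext

  // Sketch of the SparkJob contract ExecuteJobServer relies on; implementations are assumptions.
  abstract class SparkJob {
    var jobServer: Boolean = false   // true when running inside the shared JobServer context
    var failed: Boolean = false      // concrete jobs can flag a business-level failure here

    protected var sc: SparkContext = _
    protected var hiveContext: HiveContext = _

    def setContext(ctx: SparkContext): Unit = { sc = ctx }
    def setHiveContext(hc: HiveContext): Unit = { hiveContext = hc }

    // args(0) carries the "params" JSON string of the job message
    def main(args: Array[String]): Unit
  }

  // Hypothetical concrete node job: the companion object's `self` compiles to a static
  // forwarder on the class, which is what getMethod("self").invoke(null) resolves to.
  class SomeNodeJob extends SparkJob {
    override def main(args: Array[String]): Unit = {
      // parse args(0) and run this node's logic on sc / hiveContext
    }
  }

  object SomeNodeJob {
    def self: SparkJob = new SomeNodeJob
  }

With such a base class in place, a node implementation only has to read its parameters out of args(0) and run its logic on the injected SparkContext and HiveContext.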