通过模板类简单实现Spark的JobServer

实验前后效果对比:

之前:执行13个节点,耗时16分钟

之后:同样13个节点,耗时3分钟

具体逻辑请参照代码及注释。

  1 import java.util.concurrent.{ExecutorService, Executors, TimeUnit}
  2 
  3 import akka.actor.{ActorSystem, Props}
  4 import com.alibaba.fastjson.JSONObject
  5 import xxx.listener.AddJobToQueueActor
  6 import xxx.listener.bean.{AppStatusMessage, SparkContextStatusMessage}
  7 import xxx.listener.utils.JSONUtil
  8 import xxx.listener.utils.JmsUtils._
  9 import xxx.main.SparkJob
 10 import xxx.main.utils.JsonUtils
 11 import com.typesafe.config.ConfigFactory
 12 import org.apache.commons.lang.StringUtils
 13 import org.apache.spark.sql.hive.HiveContext
 14 import org.apache.spark.{Logging, SparkConf, SparkContext}
 15 
 16 import scala.collection.mutable.Queue
 17 
 /**
  * Template implementation of a minimal Spark "job server".
  *
  * Background (from the original author's notes): submitting every workflow node as its own
  * Spark application spends roughly 40+ seconds per node just starting the SparkContext and
  * HiveContext. This driver instead keeps a single SparkContext alive and runs the nodes of
  * one user that request identical resources on it, three at a time by default.
  *
  * Flow:
  *  1. The initial jobs arrive as a JSON array in `args(0)`; each entry names the job class
  *     and carries its parameters. Jobs are treated as independent: per the author's notes,
  *     dependency ordering is handled by the sender, which only sends a node once its
  *     predecessors have finished.
  *  2. After start-up a JMS topic listener is registered together with an actor that receives
  *     `jobWaitingQueue`, so that later jobs can be appended to it.
  *  3. Every job is run by resolving its class reflectively (static `self` accessor) and
  *     calling its `main`. The server stops once it has been idle for `timeout_mins` minutes.
  *  4. Failures are reported to the web side with the error text. No success message is sent
  *     from this file (the call is commented out in the original); presumably the job reports
  *     success itself.
  *  5. On JVM shutdown the hook announces that the context stopped and marks every waiting or
  *     running job as failed.
  *
  * Originally created by zpc on 2016/9/1.
  */
object ExecuteJobServer extends Logging {

  /** Number of jobs that may run on the shared SparkContext at the same time. */
  private val MaxParallelJobs = 3

  // Jobs (raw JSON strings) waiting to be handed to the worker pool.
  // NOTE(review): this same instance is passed to AddJobToQueueActor in main, which presumably
  // enqueues from an actor thread. mutable.Queue is not thread-safe; the type is kept because
  // the actor's constructor receives it. Consider a concurrent queue on both sides.
  val jobWaitingQueue = new Queue[String]
  // Jobs handed to the pool that have not finished yet. Touched by the pool threads, the main
  // loop and the shutdown hook, so every access inside this object goes through the
  // synchronized helpers below.
  val jobRunningSet = new scala.collection.mutable.HashSet[JSONObject]
  val timeout_mins = 30
  // Time of the last dispatch or successful completion; written by pool threads, read by main.
  @volatile var lastRunTime = System.currentTimeMillis()

  // applicationId, user, expId and resource description of the shared SparkContext.
  @volatile var appId: String = ""
  @volatile var user: String = ""
  @volatile var expId: Long = 0
  @volatile var resource: String = ""
  // JSON of the most recently dispatched job (used by the shutdown hook for the STOP message).
  @volatile var jobJson: JSONObject = null

  /**
    * Entry point.
    *
    * @param args `args(0)` must hold the initial job list as a JSON array
    * @throws IllegalArgumentException if no argument is supplied
    */
  def main(args: Array[String]): Unit = {
    // Fail fast, before paying for SparkContext start-up, when the job list is missing.
    require(args.nonEmpty, "ExecuteJobServer expects the initial job list (JSON array) as args(0)")

    // On JVM shutdown, report every running or still-waiting job as failed to the web side.
    Runtime.getRuntime().addShutdownHook(new HookMessage())
    val threadPool: ExecutorService = Executors.newFixedThreadPool(MaxParallelJobs)
    val sc = initSparkContext()
    val hiveContext = new HiveContext(sc)
    // Set unconditionally so the listener below gets the real id even for an empty job list.
    appId = sc.applicationId

    val initialJobs = JsonUtils.parseArray(args(0)).iterator
    while (initialJobs.hasNext) {
      val jobStr = initialJobs.next().toString
      // The first job determines which user / experiment / resources this context serves.
      if (expId == 0) {
        val param = JSONUtil.toJSONString(jobStr).getJSONObject("params")
        user = param.getString("user")
        expId = param.getLongValue("expId")
        resource = describeResources(sc.getConf)
        logInfo("resource is : " + resource)
      }
      jobWaitingQueue.enqueue(jobStr)
    }

    // 1. Listen for job messages addressed to this appId and feed them into the queue.
    // 2. Tell the web side that the SparkContext is up.
    val system = ActorSystem("mlp", ConfigFactory.load())
    val actor = system.actorOf(Props(new AddJobToQueueActor(appId, jobWaitingQueue)))
    createTopicListenerOfContextJobMsg("contextJobMsgListener", actor)
    informSparkContextStatus(true)

    try {
      // Keep dispatching until the queue is drained and the idle timeout has expired.
      while (jobWaitingQueue.nonEmpty || !checkTimeOut()) {
        while (jobWaitingQueue.nonEmpty) {
          lastRunTime = System.currentTimeMillis()
          val jobStr = jobWaitingQueue.dequeue()
          logInfo("***** ExecuteJobServer jobStr ***** jobStr: " + jobStr)
          val json = JSONUtil.toJSONString(jobStr)
          markRunning(json)
          threadPool.execute(new ThreadSparkJob(json, hiveContext, sc))
          jobJson = json
        }
        Thread.sleep(1000)
      }

      // No further jobs are dispatched from here on; wait for the submitted ones to finish.
      threadPool.shutdown()
      while (!threadPool.awaitTermination(2, TimeUnit.SECONDS)) {}
    } finally {
      // Release the cluster resources and the actor threads so the driver JVM can exit.
      sc.stop()
      system.shutdown()
    }
  }

  /**
    * Builds the resource description sent to the web side by concatenating driver memory,
    * executor count and executor memory as found in the Spark configuration.
    *
    * The last character of each memory value is dropped (presumably the unit suffix, e.g.
    * "4g" becomes "4"). `dropRight` is used because it is safe on an empty value.
    */
  private def describeResources(conf: SparkConf): String = {
    val driverMemory = conf.getOption("spark.driver.memory").map(_.dropRight(1)).getOrElse("")
    // NOTE(review): the fallback is the key name itself, exactly as in the original code. That
    // looks like a slip for "" but is preserved because the web side may match on this string.
    val numExecutors =
      conf.getOption("spark.executor.instances").getOrElse("spark.executor.instances")
    val executorMemory = conf.getOption("spark.executor.memory").map(_.dropRight(1)).getOrElse("")
    driverMemory + numExecutors + executorMemory
  }

  /**
    * @return true when no job is running and more than `timeout_mins` minutes have passed
    *         since `lastRunTime`
    */
  def checkTimeOut(): Boolean = {
    val idleMillis = System.currentTimeMillis() - lastRunTime
    // Compare in milliseconds: integer division by minutes would round the idle time down and
    // delay the timeout by up to one minute.
    noJobsRunning && idleMillis > TimeUnit.MINUTES.toMillis(timeout_mins)
  }

  // --- synchronized access to jobRunningSet -------------------------------------------------

  private def markRunning(json: JSONObject): Unit =
    jobRunningSet.synchronized { jobRunningSet.add(json) }

  private def markFinished(json: JSONObject): Unit =
    jobRunningSet.synchronized { jobRunningSet.remove(json) }

  private def noJobsRunning: Boolean =
    jobRunningSet.synchronized { jobRunningSet.isEmpty }

  /** Copy of the running set, so callers can iterate without holding the lock. */
  private def runningJobsSnapshot: List[JSONObject] =
    jobRunningSet.synchronized { jobRunningSet.toList }

  /**
    * Sends an [[AppStatusMessage]] with the given status for the node described by `json`.
    * Nothing is sent (a warning is logged) when `json` or its "params" object is missing.
    *
    * @param status   status text placed in the message ("FAILURE" or "STOP" in this file)
    * @param logLabel label used in the log line
    * @param errMsg   error text forwarded to the web side
    * @param json     job description; user, expId and nodeId are read from its "params"
    */
  private def sendJobStatus(status: String, logLabel: String, errMsg: String,
                            json: JSONObject): Unit =
    Option(json).flatMap(job => Option(job.getJSONObject("params"))) match {
      case Some(params) =>
        val jobUser = StringUtils.trimToEmpty(params.getString("user"))
        val jobExpId = params.getLongValue("expId")
        val nodeId = params.getLongValue("nodeId")
        val message = new AppStatusMessage(jobUser, jobExpId, nodeId, status, errMsg)
        logInfo("***** send " + logLabel + " message*****: expId: " + jobExpId +
          "nodeId: " + nodeId)
        jobStatusTemplate.send(message)
      case None =>
        logWarning("Cannot send " + status + " status: job JSON or its \"params\" is missing")
    }

  /**
    * Runs one job on the shared contexts.
    *
    * @param json        job description with a "class" entry (class exposing a static `self`
    *                    accessor that returns the [[SparkJob]]) and a "params" entry
    * @param hiveContext shared HiveContext handed to the job
    * @param ctx         shared SparkContext handed to the job
    */
  class ThreadSparkJob(json: JSONObject, hiveContext: HiveContext, ctx: SparkContext)
    extends Runnable {

    override def run(): Unit = {
      try {
        val classStr = json.get("class").toString
        val argsStr = json.get("params").toString
        val obj: SparkJob =
          Class.forName(classStr).getMethod("self").invoke(null).asInstanceOf[SparkJob]
        obj.jobServer = true
        obj.failed = false
        obj.setContext(ctx)
        obj.setHiveContext(hiveContext)
        obj.main(Array(argsStr))
        logInfo("***** jobRunningSet remove job json***** json: " + json.toJSONString)
        markFinished(json)
        lastRunTime = System.currentTimeMillis()
      } catch {
        case oom: OutOfMemoryError =>
          failJob(oom)
          // The driver cannot be trusted after an OOM: stop the context and exit.
          stopContextAndExit(oom)
        case ex: Exception =>
          failJob(ex)
          // A stopped SparkContext cannot serve further jobs, so the driver exits as well.
          if (ex.toString.contains("stopped SparkContext")) stopContextAndExit(ex)
        case other: Throwable =>
          // Any other error (e.g. NoClassDefFoundError) must still clear the running set,
          // otherwise the idle timeout can never fire; then let it propagate.
          failJob(other)
          throw other
      }
    }

    /** Reports the failure and always removes the job from the running set. */
    private def failJob(cause: Throwable): Unit =
      try informJobFailure(cause.toString, json)
      finally markFinished(json)

    /** Stops the SparkContext and terminates the driver with exit code 1. */
    private def stopContextAndExit(cause: Throwable): Unit = {
      logInfo("***** SparkContext go to stop, reaseon: " + cause.getMessage)
      hiveContext.sparkContext.stop()
      System.exit(1)
    }

    /** Sends a FAILURE status for the given job; does nothing useful when `json` is null. */
    def informJobFailure(errMsg: String, json: JSONObject): Unit =
      sendJobStatus("FAILURE", "informJobFailure", errMsg, json)
  }

  /** Creates the SparkContext shared by all jobs of this server. */
  def initSparkContext(): SparkContext = {
    val conf = new SparkConf().setAppName("cbt-mlaas")
    new SparkContext(conf)
  }

  /**
    * Shutdown hook: announces that the SparkContext stopped, reports every waiting and running
    * job as failed and, if there was any such job, sends a STOP for the experiment of the most
    * recently dispatched job.
    */
  class HookMessage extends Thread {
    override def run(): Unit = {
      var shouldInformStop = false
      informSparkContextStatus(false)
      while (jobWaitingQueue.nonEmpty) {
        val json = JSONUtil.toJSONString(jobWaitingQueue.dequeue())
        informJobFailureInHook("SparkContext stopped, inform waiting job failed!", json)
        shouldInformStop = true
      }
      // Iterate over a snapshot: pool threads may still be removing entries concurrently.
      runningJobsSnapshot.foreach { json =>
        informJobFailureInHook("SparkContext stopped, inform running job failed!", json)
        shouldInformStop = true
      }
      if (shouldInformStop) {
        informExpStop("SparkContext stopped, inform exp failed!", jobJson)
      }
    }

    /** Sends a FAILURE status for a job that was waiting or running at shutdown. */
    def informJobFailureInHook(errMsg: String, json: JSONObject): Unit =
      sendJobStatus("FAILURE", "informJobFailure", errMsg, json)

    /** Sends a STOP status using the user / expId / nodeId found in `json`. */
    def informExpStop(errMsg: String, json: JSONObject): Unit =
      sendJobStatus("STOP", "informExpStop", errMsg, json)
  }

  /**
    * Tells the web side whether this server's SparkContext is up.
    *
    * @param start true after start-up, false when the context is going away
    */
  def informSparkContextStatus(start: Boolean) = {
    val msg = new SparkContextStatusMessage(appId, start, user, expId, resource)
    logInfo("***** send sparkContext start message*****: appId: " + appId + "start: " + start)
    sparkContextStatusTemplate.send(msg)
  }

}

 

posted on 2016-09-26 10:48  在大地画满窗子  阅读(1145)  评论(0编辑  收藏  举报