|NO.Z.00092|——————————|BigDataEnd|——|Hadoop&Spark.V08|——|Spark.v08|Spark 原理源码|作业执行原理&Stage划分&提交Task|

一、Stage划分

### --- 提交 Task

~~~     得到RDD中需要计算的partition
~~~     对于Shuffle类型的stage，需要判断stage中是否缓存了该结果；
~~~     对于Result类型的Final Stage，则判断计算Job中该partition是否已经计算完成。
~~~     这么做（没有直接提交全部tasks）的原因是：
~~~     当stage中某个task执行失败、而其他task执行成功时，
~~~     只需要找出这个失败的task对应的partition重新计算，而不是重新计算所有partition。

### --- 序列化task的binary

~~~     Executor可以通过广播变量得到它。每个task运行的时候首先会对它进行反序列化。
~~~     为每个需要计算的partition生成一个task
~~~     ShuffleMapStage对应的task全是ShuffleMapTask；ResultStage对应的全是ResultTask。
~~~     task继承Serializable，要确保task是可序列化的。
~~~     提交tasks
~~~     先用tasks来初始化一个 TaskSet 对象，再调用 TaskScheduler.submitTasks 提交

二、源码提交说明

### --- 源码提交说明

~~~     # 源码提交说明：DAGScheduler.scala
~~~     # 1085行~1234行
  /**
   * Submits the tasks of `stage` that still have to be computed.
   *
   * Steps, in order: find the missing partitions, resolve their preferred locations, start a
   * new stage attempt and post `SparkListenerStageSubmitted`, serialize and broadcast the task
   * binary, build one task per missing partition, and hand the resulting `TaskSet` to the task
   * scheduler. If one of the guarded steps (location lookup, serialization, task creation)
   * throws, the stage is aborted and removed from `runningStages`. If there is nothing to
   * compute, the stage is marked as finished and its waiting child stages are submitted.
   *
   * @param stage the stage whose missing partitions should be computed
   * @param jobId id of the active job whose properties are attached to the submitted tasks
   */
  private def submitMissingTasks(stage: Stage, jobId: Int): Unit = {
    logDebug("submitMissingTasks(" + stage + ")")
    // Get our pending tasks and remember them in our pendingTasks entry.
    // Clear the stage's pendingPartitions so the partitions submitted below are recorded afresh.
    stage.pendingPartitions.clear()
    // First figure out the indexes of partition ids to compute, i.e. the partitions of this
    // stage that have not been computed yet.
    val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()
    // Use the scheduling pool, job group, description, etc. from an ActiveJob associated
    // with this Stage.
    val properties = jobIdToActiveJob(jobId).properties
    // Record the stage as running.
    runningStages += stage
    // SparkListenerStageSubmitted should be posted before testing whether tasks are
    // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
    // will be posted, which should always come after a corresponding
    // SparkListenerStageSubmitted event.
    // Tell the output commit coordinator that this stage is starting, passing the highest
    // partition id of the stage.
    stage match {
      case s: ShuffleMapStage =>
        outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
      case s: ResultStage =>
        outputCommitCoordinator.stageStart(
          stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
    }
    // Resolve the preferred locations of every partition that still has to be computed.
    val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
      stage match {
        case _: ShuffleMapStage =>
          partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
        case s: ResultStage =>
          partitionsToCompute.map { id =>
            // For a result stage the task index is mapped to an RDD partition index first.
            val p = s.partitions(id)
            (id, getPreferredLocs(stage.rdd, p))
          }.toMap
      }
    } catch {
      // On failure still create a stage attempt and post the submitted event, so that the
      // abort below follows a matching SparkListenerStageSubmitted, then give up.
      case NonFatal(e) =>
        stage.makeNewStageAttempt(partitionsToCompute.size)
        listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
        abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
        runningStages -= stage
        return
    }
    // Start a new attempt of the stage.
    stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
    // Post the SparkListenerStageSubmitted event to the listener bus.
    listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
    // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
    // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we
    // broadcast the serialized copy of the RDD and for each task we will deserialize it, which
    // means each task gets a different copy of the RDD. This provides stronger isolation between
    // tasks that might modify state of objects referenced in their closures. This is necessary
    // in Hadoop where the JobConf/Configuration object is not thread-safe.
    // Stays null only on the failure paths below, all of which return before it is used.
    var taskBinary: Broadcast[Array[Byte]] = null
    try {
      // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
      // For ResultTask, serialize and broadcast (rdd, func).
      val taskBinaryBytes: Array[Byte] = stage match {
        // Serialize the stage's RDD together with its ShuffleDependency.
        case stage: ShuffleMapStage =>
          JavaUtils.bufferToArray(
            closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
        // Serialize the stage's RDD together with the function applied to its partitions.
        case stage: ResultStage =>
          JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
      }
      // Broadcast the serialized task binary.
      taskBinary = sc.broadcast(taskBinaryBytes)
    } catch {
      // In the case of a failure during serialization, abort the stage.
      case e: NotSerializableException =>
        abortStage(stage, "Task not serializable: " + e.toString, Some(e))
        runningStages -= stage
        // Abort execution
        return
      case NonFatal(e) =>
        abortStage(stage, s"Task serialization failed: $e\n${Utils.exceptionString(e)}", Some(e))
        runningStages -= stage
        return
    }

~~~     # 源码提交说明：DAGScheduler.scala
~~~     # 1085行~1234行

    // Build the sequence of tasks, one per partition that has to be computed.
    val tasks: Seq[Task[_]] = try {
      stage match {
        case stage: ShuffleMapStage => // one ShuffleMapTask per missing partition
          partitionsToCompute.map { id =>
            // Preferred locations of this partition
            val locs = taskIdToLocations(id)
            // The RDD partition to compute
            val part = stage.rdd.partitions(id)
            new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, stage.latestInfo.taskMetrics, properties, Option(jobId),
              Option(sc.applicationId), sc.applicationAttemptId)
          }
        case stage: ResultStage => // one ResultTask per missing partition
          partitionsToCompute.map { id =>
            // Map the task index to the RDD partition index it computes
            val p: Int = stage.partitions(id)
            // The RDD partition to compute
            val part = stage.rdd.partitions(p)
            // Preferred locations, keyed by task index
            val locs = taskIdToLocations(id)
            new ResultTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, id, properties, stage.latestInfo.taskMetrics,
              Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
          }
      }
    } catch {
      case NonFatal(e) =>
        // Give up submitting if task creation fails.
        abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
        runningStages -= stage
        return
    }
    if (tasks.nonEmpty) { // at least one task was created
      // logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
      // Record the submitted partitions as pending.
      stage.pendingPartitions ++= tasks.map(_.partitionId)
      logDebug("New pending partitions: " + stage.pendingPartitions)
      // Wrap the tasks in a TaskSet and hand it to the TaskScheduler.
      taskScheduler.submitTasks(new TaskSet(
        tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
      // Remember when this attempt was submitted.
      stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
    } else { // no task was created
      // Because we posted SparkListenerStageSubmitted earlier, we should mark
      // the stage as completed here in case there are no tasks to run
      markStageAsFinished(stage, None)
      val debugString = stage match {
        case stage: ShuffleMapStage =>
          s"Stage ${stage} is actually done; " +
            s"(available: ${stage.isAvailable}," +
            s"available outputs: ${stage.numAvailableOutputs}," +
            s"partitions: ${stage.numPartitions})"
        case stage : ResultStage =>
          s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
      }
      logDebug(debugString)
      // Submit the child stages that were waiting on this stage.
      submitWaitingChildStages(stage)
    }
  }

Walter Savage Landor: I strove with none, for none was worth my strife. Nature I loved and, next to Nature, Art: I warm'd both hands before the fire of life. It sinks, and I am ready to depart.

——W.S.Landor