Spark Kernel Source Code Analysis, Part 7: Job Trigger Flow (Principles and Source Code)
1. Every action triggers one job, as in the WordCount example below:
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-04 15:41
 */
object WordCountScala {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("hdfs://spark1:9000/spark.txt")
    val words = lines.flatMap { line => line.split(" ") }
    val pairs = words.map { word => (word, 1) }

    // RDD itself has no reduceByKey method, so calling reduceByKey on an RDD triggers an
    // implicit conversion: the compiler searches the implicit scope, finds the
    // rddToPairRDDFunctions() conversion defined for RDD, and wraps the RDD in a
    // PairRDDFunctions, which is where reduceByKey actually lives.
    val wordCounts = pairs.reduceByKey { _ + _ }

    wordCounts.foreach(wordCount => println(wordCount._1 + " appeared " + wordCount._2 + " times"))
  }
}
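The mechanism behind this implicit conversion can be shown with a small standalone sketch. The names MiniRDD, MiniPairFunctions and toPairFunctions below are hypothetical stand-ins, not Spark's real classes; they only mirror how rddToPairRDDFunctions wraps an RDD[(K, V)] in PairRDDFunctions so that reduceByKey becomes callable:

import scala.language.implicitConversions

// A generic container: like RDD, it has no reduceByKey of its own
class MiniRDD[T](val data: Seq[T]) {
  def map[U](f: T => U): MiniRDD[U] = new MiniRDD(data.map(f))
}

// Pair-only operations live in a separate wrapper, like PairRDDFunctions
class MiniPairFunctions[K, V](self: MiniRDD[(K, V)]) {
  def reduceByKey(f: (V, V) => V): MiniRDD[(K, V)] =
    new MiniRDD(self.data.groupBy(_._1).map { case (k, vs) => (k, vs.map(_._2).reduce(f)) }.toSeq)
}

object MiniRDD {
  // The compiler finds this conversion in the companion object's implicit scope,
  // analogous to rddToPairRDDFunctions
  implicit def toPairFunctions[K, V](rdd: MiniRDD[(K, V)]): MiniPairFunctions[K, V] =
    new MiniPairFunctions(rdd)
}

object ImplicitConversionDemo {
  def main(args: Array[String]): Unit = {
    val pairs = new MiniRDD(Seq(("hello", 1), ("world", 1), ("hello", 1)))
    val counts = pairs.reduceByKey(_ + _) // compiles only via the implicit conversion
    counts.data.foreach(println)
  }
}

In Spark 1.3 and later the real conversion lives in the RDD companion object, so no extra import is needed; older versions required import org.apache.spark.SparkContext._.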
1. textFile
// textFile calls hadoopFile, which creates a HadoopRDD whose elements are (key, value) pairs:
// the key is the byte offset of each line in the HDFS/text file, the value is the text line itself.
// map() is then called on the HadoopRDD to drop the key and keep only the value, yielding a
// MapPartitionsRDD whose elements are just the text lines.
def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = {
  assertNotStopped()
  hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
    minPartitions).map(pair => pair._2.toString).setName(path)
}
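To see those (offset, line) pairs before map drops the key, you can make the same hadoopFile call yourself. This is only a usage sketch; the local master and the sample path are assumptions carried over from the WordCount example:

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}

object HadoopFileDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HadoopFileDemo").setMaster("local[*]"))

    // Same call textFile makes internally: an RDD of (byte offset, text line) pairs
    val raw = sc.hadoopFile("hdfs://spark1:9000/spark.txt",
      classOf[TextInputFormat], classOf[LongWritable], classOf[Text], 2)

    // Hadoop reuses the Writable objects, so convert to plain types before collecting
    raw.map { case (offset, line) => (offset.get(), line.toString) }
      .take(5)
      .foreach { case (offset, line) => println(s"offset=$offset line=$line") }

    sc.stop()
  }
}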
2. foreach, which calls SparkContext's runJob method
// foreach is an action: it cleans the closure and hands it to SparkContext.runJob,
// to be applied to each partition's iterator
def foreach(f: T => Unit) {
  val cleanF = sc.clean(f)
  sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
}
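So foreach just cleans the closure and submits one function per partition through runJob. A minimal sketch of the same idea using one of SparkContext's public runJob overloads (the local-mode setup is an assumption):

import org.apache.spark.{SparkConf, SparkContext}

object RunJobDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("RunJobDemo").setMaster("local[2]"))
    val rdd = sc.parallelize(1 to 10, 2)

    // Roughly what rdd.foreach(println) does: one function applied to each partition's iterator
    sc.runJob(rdd, (iter: Iterator[Int]) => iter.foreach(println))

    // runJob can also return one value per partition, e.g. a per-partition element count
    val sizes: Array[Int] = sc.runJob(rdd, (iter: Iterator[Int]) => iter.size)
    println(sizes.mkString(", "))

    sc.stop()
  }
}

These public overloads all funnel into the internal runJob below, which forwards the job to the DAGScheduler.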
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    allowLocal: Boolean,
    resultHandler: (Int, U) => Unit) {
  if (stopped) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  // Hand the job off to the DAGScheduler that was created when the SparkContext was initialized
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal, resultHandler,
    localProperties.get)
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}
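The resultHandler: (Int, U) => Unit parameter is the callback through which each finished partition's result reaches the driver. Below is a sketch of the public overload that exposes it; treat the exact overload and the local-mode setup as assumptions for illustration:

import org.apache.spark.{SparkConf, SparkContext}

object ResultHandlerDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ResultHandlerDemo").setMaster("local[2]"))
    val rdd = sc.parallelize(Seq("a", "b", "c", "d", "e"), 2)

    // resultHandler is called on the driver with (partitionId, partitionResult)
    // as each partition finishes
    val counts = new Array[Int](rdd.partitions.length)
    sc.runJob(
      rdd,
      (iter: Iterator[String]) => iter.size,
      (partitionId: Int, count: Int) => counts(partitionId) = count)
    println(counts.mkString(", "))

    sc.stop()
  }
}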