Spark内核源码解析七:job触发流程原理解析和源码解析

1、一个action就会触发一个job,下面以WordCount程序为例:

package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-04 15:41
 */
/**
 * Word-count driver used to illustrate how an action triggers a Spark job.
 *
 * Reads a text file from HDFS, splits each line on single spaces, pairs every
 * word with the count 1, sums the counts per word, and prints one line per
 * distinct word. The final `foreach` is the action that triggers the job.
 */
object WordCountScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("hdfs://spark1:9000/spark.txt")
    val words = lines.flatMap { line => line.split(" ") }
    val pairs = words.map { word => (word, 1) }
    // RDD itself does not define reduceByKey, so calling it on an RDD triggers an
    // implicit conversion: the compiler searches the implicit scope, finds
    // rddToPairRDDFunctions() in RDD, and converts the RDD into PairRDDFunctions.
    val wordCounts = pairs.reduceByKey { _ + _ }
    // foreach is an action: this call is what actually submits the job.
    wordCounts.foreach { wordCount =>
      println(s"${wordCount._1} appeared ${wordCount._2} times")
    }
  }
}

1、textFile

 // 调用hadoopFile,会创建一个HadoopRDD,其中的元素其实是(key,value) pair
    // key是hdfs或者文本文件的每一行的offset,value就是文本行
    // 然后对HadoopRDD调用map()方法,会剔除key只保留value,会获得一个MapPartitionsRDD,内部元素其实就是一个个文本行
  /**
   * Reads the text file at `path` and returns its contents as an RDD of strings.
   *
   * Delegates to `hadoopFile` with `TextInputFormat`, `LongWritable` keys and
   * `Text` values, then keeps only the value of each (key, value) record,
   * converted with `toString`. The resulting RDD is named after `path`.
   *
   * @param path          location of the file to read
   * @param minPartitions partition-count hint forwarded to `hadoopFile`
   * @return an RDD holding the string form of each record's value
   */
  def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = {
    assertNotStopped()
    val keyedRecords = hadoopFile(
      path,
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      minPartitions)
    // Drop the key and keep only the textual value of every record.
    val textLines = keyedRecords.map(record => record._2.toString)
    textLines.setName(path)
  }

2、foreach,会调用sparkContext的runJob方法

  /**
   * Applies `f` to every element of this RDD.
   *
   * The function is first passed through `sc.clean`, then a job is submitted
   * via `sc.runJob` that runs the cleaned function over each partition's iterator.
   *
   * @param f function invoked once per element
   */
  def foreach(f: T => Unit): Unit = {
    val cleanedFn = sc.clean(f)
    // Per-partition work: walk the partition's iterator and apply the cleaned function.
    val applyToPartition = (partition: Iterator[T]) => partition.foreach(cleanedFn)
    sc.runJob(this, applyToPartition)
  }
/**
 * Runs `func` over the given partitions of `rdd` and hands each result to
 * `resultHandler`.
 *
 * After checking that this context has not been stopped, the call site is
 * captured, the function is cleaned, and the job is delegated to
 * `dagScheduler.runJob`. Once that call returns, the progress bar (if any) is
 * told to finish and `rdd.doCheckpoint()` is invoked.
 *
 * @param rdd           RDD the job is run on
 * @param func          function applied to each partition's iterator
 * @param partitions    partition indices, forwarded to the DAG scheduler
 * @param allowLocal    forwarded to the DAG scheduler unchanged
 * @param resultHandler callback forwarded to the DAG scheduler
 * @throws IllegalStateException if this context has been stopped
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    allowLocal: Boolean,
    resultHandler: (Int, U) => Unit): Unit = {
  if (stopped) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val jobCallSite = getCallSite
  val closureToRun = clean(func)
  logInfo(s"Starting job: ${jobCallSite.shortForm}")
  val lineageLoggingEnabled = conf.getBoolean("spark.logLineage", false)
  if (lineageLoggingEnabled) {
    logInfo(s"RDD's recursive dependencies:\n${rdd.toDebugString}")
  }

  // Hand the job over to the DAGScheduler held by this context.
  dagScheduler.runJob(
    rdd,
    closureToRun,
    partitions,
    jobCallSite,
    allowLocal,
    resultHandler,
    localProperties.get)
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}

 

posted on 2020-05-11 18:25  清浊  阅读(230)  评论(0)  编辑  收藏  举报