Spark Kernel Source Code Analysis, Part 7: Job Trigger Flow (Principles and Source Code)
1. Every action triggers one job, as in the WordCount example below:
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-04 15:41
 */
object WordCountScala {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("hdfs://spark1:9000/spark.txt")
    val words = lines.flatMap { line => line.split(" ") }
    val pairs = words.map { word => (word, 1) }

    // RDD itself has no reduceByKey method, so calling reduceByKey on an RDD triggers an
    // implicit conversion: the compiler searches the implicit scope, finds the
    // rddToPairRDDFunctions() conversion defined for RDD, and wraps the RDD in a
    // PairRDDFunctions, which is where reduceByKey actually lives.
    val wordCounts = pairs.reduceByKey { _ + _ }

    wordCounts.foreach(wordCount => println(wordCount._1 + " appeared " + wordCount._2 + " times"))
  }
}
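The mechanism behind this implicit conversion can be shown with a small standalone sketch. The names MiniRDD, MiniPairFunctions and toPairFunctions below are hypothetical stand-ins, not Spark's real classes; they only mirror how rddToPairRDDFunctions wraps an RDD[(K, V)] in PairRDDFunctions so that reduceByKey becomes callable:

import scala.language.implicitConversions

// A generic container: like RDD, it has no reduceByKey of its own
class MiniRDD[T](val data: Seq[T]) {
  def map[U](f: T => U): MiniRDD[U] = new MiniRDD(data.map(f))
}

// Pair-only operations live in a separate wrapper, like PairRDDFunctions
class MiniPairFunctions[K, V](self: MiniRDD[(K, V)]) {
  def reduceByKey(f: (V, V) => V): MiniRDD[(K, V)] =
    new MiniRDD(self.data.groupBy(_._1).map { case (k, vs) => (k, vs.map(_._2).reduce(f)) }.toSeq)
}

object MiniRDD {
  // The compiler finds this conversion in the companion object's implicit scope,
  // analogous to rddToPairRDDFunctions
  implicit def toPairFunctions[K, V](rdd: MiniRDD[(K, V)]): MiniPairFunctions[K, V] =
    new MiniPairFunctions(rdd)
}

object ImplicitConversionDemo {
  def main(args: Array[String]): Unit = {
    val pairs = new MiniRDD(Seq(("hello", 1), ("world", 1), ("hello", 1)))
    val counts = pairs.reduceByKey(_ + _) // compiles only via the implicit conversion
    counts.data.foreach(println)
  }
}

In Spark 1.3 and later the real conversion lives in the RDD companion object, so no extra import is needed; older versions required import org.apache.spark.SparkContext._.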
1. textFile
// textFile calls hadoopFile, which creates a HadoopRDD whose elements are (key, value) pairs:
// the key is the byte offset of each line in the HDFS/text file, the value is the text line itself.
// map() is then called on the HadoopRDD to drop the key and keep only the value, yielding a
// MapPartitionsRDD whose elements are just the text lines.
def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = {
  assertNotStopped()
  hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
    minPartitions).map(pair => pair._2.toString).setName(path)
}
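To see those (offset, line) pairs before map drops the key, you can make the same hadoopFile call yourself. This is only a usage sketch; the local master and the sample path are assumptions carried over from the WordCount example:

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}

object HadoopFileDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HadoopFileDemo").setMaster("local[*]"))

    // Same call textFile makes internally: an RDD of (byte offset, text line) pairs
    val raw = sc.hadoopFile("hdfs://spark1:9000/spark.txt",
      classOf[TextInputFormat], classOf[LongWritable], classOf[Text], 2)

    // Hadoop reuses the Writable objects, so convert to plain types before collecting
    raw.map { case (offset, line) => (offset.get(), line.toString) }
      .take(5)
      .foreach { case (offset, line) => println(s"offset=$offset line=$line") }

    sc.stop()
  }
}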
2. foreach, which calls SparkContext's runJob method
// foreach is an action: it cleans the closure and hands it to SparkContext.runJob,
// to be applied to each partition's iterator
def foreach(f: T => Unit) {
  val cleanF = sc.clean(f)
  sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
}
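So foreach just cleans the closure and submits one function per partition through runJob. A minimal sketch of the same idea using one of SparkContext's public runJob overloads (the local-mode setup is an assumption):

import org.apache.spark.{SparkConf, SparkContext}

object RunJobDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("RunJobDemo").setMaster("local[2]"))
    val rdd = sc.parallelize(1 to 10, 2)

    // Roughly what rdd.foreach(println) does: one function applied to each partition's iterator
    sc.runJob(rdd, (iter: Iterator[Int]) => iter.foreach(println))

    // runJob can also return one value per partition, e.g. a per-partition element count
    val sizes: Array[Int] = sc.runJob(rdd, (iter: Iterator[Int]) => iter.size)
    println(sizes.mkString(", "))

    sc.stop()
  }
}

These public overloads all funnel into the internal runJob below, which forwards the job to the DAGScheduler.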
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    allowLocal: Boolean,
    resultHandler: (Int, U) => Unit) {
  if (stopped) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  // Hand the job off to the DAGScheduler that was created when the SparkContext was initialized
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal, resultHandler,
    localProperties.get)
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}
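The resultHandler: (Int, U) => Unit parameter is the callback through which each finished partition's result reaches the driver. Below is a sketch of the public overload that exposes it; treat the exact overload and the local-mode setup as assumptions for illustration:

import org.apache.spark.{SparkConf, SparkContext}

object ResultHandlerDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ResultHandlerDemo").setMaster("local[2]"))
    val rdd = sc.parallelize(Seq("a", "b", "c", "d", "e"), 2)

    // resultHandler is called on the driver with (partitionId, partitionResult)
    // as each partition finishes
    val counts = new Array[Int](rdd.partitions.length)
    sc.runJob(
      rdd,
      (iter: Iterator[String]) => iter.size,
      (partitionId: Int, count: Int) => counts(partitionId) = count)
    println(counts.mkString(", "))

    sc.stop()
  }
}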