RDD 源码
1.介绍
弹性分布式数据集(RDD)
弹性: 存储弹性(可内存、可磁盘)
分布式: 数据存储不同的机器节点上
Spark的基本抽象,最小的数据单元
在这个数据集上,定义了基本操作(算子) map、filter
2.五个重要的属性
//分区信息 => A list of partitions protected def getPartitions: Array[Partition] //计算逻辑(计算每个分区) => A function for computing each split def compute(split: Partition, context: TaskContext): Iterator[T] //依赖信息 => A list of dependencies on other RDDs protected def getDependencies: Seq[Dependency[_]] = deps //分区器 @transient val partitioner: Option[Partitioner] = None //首选位置 protected def getPreferredLocations(split: Partition): Seq[String] = Nil
3.构造器
// 主构造器 abstract class RDD[T: ClassTag]( @transient private var _sc: SparkContext, @transient private var deps: Seq[Dependency[_]] ) // 辅助构造器 def this(@transient oneParent: RDD[_]) = this(oneParent.context, List(new OneToOneDependency(oneParent))) => 默认依赖关系为 OneToOneDependency
4.源码阅读
/*********TODO RDD 源码分析 (基于 Spark 3.0.0)*******************************************************/ abstract class RDD[T: ClassTag]( @transient private var _sc: SparkContext, @transient private var deps: Seq[Dependency[_]] ) extends Serializable with Logging { // ======================================================================= // 构造器 主构造器 + 辅助构造器 // ======================================================================= // TODO 辅助构造器 创建 OneToOneDependency 的RDD def this(@transient oneParent: RDD[_]) = this(oneParent.context, List(new OneToOneDependency(oneParent))) // ======================================================================= // 上下文对象 和 配置对象 // ======================================================================= // TODO 判断 SparkContext 对象是否为空,空时进行初始化 private def sc: SparkContext = { if (_sc == null) { throw new SparkException() } _sc } // TODO 成员属性 SparkContext-上下文对象 def sparkContext: SparkContext = sc // TODO 成员属性 SparkConf-配置对象 private[spark] def conf = sc.conf // ======================================================================= // 定义了一些抽象方法 => 由RDD的子类来实现具体功能 // ======================================================================= // TODO 每个切片的计算逻辑 @DeveloperApi def compute(split: Partition, context: TaskContext): Iterator[T] // TODO 获取 所有分区 protected def getPartitions: Array[Partition] // TODO 获取 所有依赖 protected def getDependencies: Seq[Dependency[_]] = deps // TODO 根据分区 获取该分区的 首选位置 protected def getPreferredLocations(split: Partition): Seq[String] = Nil // TODO 指定 分区器 @transient val partitioner: Option[Partitioner] = None // ======================================================================= // 定义了一些公共的方法和字段 (所有RDD实现类都可使用) // ======================================================================= // TODO 成员属性 SparkContext-上下文对象 def sparkContext: SparkContext = sc // TODO 成员属性 RDD 的唯一id(存储在 SparkContext) val id: Int = sc.newRddId() // TODO 成员属性 RDD 的名称 @transient var name: String = _ // TODO 设置 RDD 的名称 def setName(_name: String): this.type = { name = _name this } // ======================================================================= // 和 RDD 持久化(缓存) 相关的方法 // ======================================================================= /** * TODO 使用 指定的存储级别 来持久化RDD * * @param newLevel 指定存储级别( 存储级别类型查看 org.apache.spark.storage.StorageLevel) * @param allowOverride 是否使用 新级别覆盖原有级别 */ private def persist(newLevel: StorageLevel, allowOverride: Boolean): this.type = { } // TODO 使用指定的级别持久化RDD(只会保存第一次设置的值,检查点除外) def persist(newLevel: StorageLevel): this.type = { if (isLocallyCheckpointed) { // This means the user previously called localCheckpoint(), which should have already // marked this RDD for persisting. Here we should override the old storage level with // one that is explicitly requested by the user (after adapting it to use disk). persist(LocalRDDCheckpointData.transformStorageLevel(newLevel), allowOverride = true) } else { persist(newLevel, allowOverride = false) } } // TODO 持久化RDD(使用 MEMORY_ONLY 级别) def persist(): this.type = persist(StorageLevel.MEMORY_ONLY) // TODO 持久化RDD 等同于 persist def cache(): this.type = persist() /** * TODO 删除RDD的缓存(从内存或磁盘中) * * @param 是否阻塞所有的block,直到所有block被删除(默认为fase) * @return This RDD. */ def unpersist(blocking: Boolean = false): this.type = { logInfo(s"Removing RDD $id from persistence list") sc.unpersistRDD(id, blocking) storageLevel = StorageLevel.NONE this } // TODO 获取 RDD 存储级别(如果没set,则为NONE) def getStorageLevel: StorageLevel = storageLevel /** * Lock for all mutable state of this RDD (persistence, partitions, dependencies, etc.). We do * not use `this` because RDDs are user-visible, so users might have added their own locking on * RDDs; sharing that could lead to a deadlock. * * One thread might hold the lock on many of these, for a chain of RDD dependencies; but * because DAGs are acyclic, and we only ever hold locks for one path in that DAG, there is no * chance of deadlock. * * Executors may reference the shared fields (though they should never mutate them, * that only happens on the driver). */ private val stateLock = new Serializable {} // Our dependencies and partitions will be gotten by calling subclass's methods below, and will // be overwritten when we're checkpointed @volatile private var dependencies_ : Seq[Dependency[_]] = _ @volatile @transient private var partitions_ : Array[Partition] = _ /** An Option holding our checkpoint RDD, if we are checkpointed */ private def checkpointRDD: Option[CheckpointRDD[T]] = checkpointData.flatMap(_.checkpointRDD) // TODO 获取 RDD的 依赖列表(注意 当前rdd是否为checkpoint) final def dependencies: Seq[Dependency[_]] = { checkpointRDD.map(r => List(new OneToOneDependency(r))).getOrElse { if (dependencies_ == null) { stateLock.synchronized { if (dependencies_ == null) { dependencies_ = getDependencies } } } dependencies_ } } // TODO 获取 RDD 的分区数组 final def partitions: Array[Partition] = { checkpointRDD.map(_.partitions).getOrElse { if (partitions_ == null) { stateLock.synchronized { if (partitions_ == null) { partitions_ = getPartitions partitions_.zipWithIndex.foreach { case (partition, index) => require(partition.index == index, s"partitions($index).partition == ${partition.index}, but it should equal $index") } } } } partitions_ } } // TODO 获取 RDD 分区个数 @Since("1.6.0") final def getNumPartitions: Int = partitions.length // TODO 获取 RDD 分区的首选位置 final def preferredLocations(split: Partition): Seq[String] = { checkpointRDD.map(_.getPreferredLocations(split)).getOrElse { getPreferredLocations(split) } } /** * Internal method to this RDD; will read from cache if applicable, or otherwise compute it. * This should ''not'' be called by users directly, but is available for implementors of custom * subclasses of RDD. */ final def iterator(split: Partition, context: TaskContext): Iterator[T] = { if (storageLevel != StorageLevel.NONE) { getOrCompute(split, context) } else { computeOrReadCheckpoint(split, context) } } /** * Return the ancestors of the given RDD that are related to it only through a sequence of * narrow dependencies. This traverses the given RDD's dependency tree using DFS, but maintains * no ordering on the RDDs returned. */ private[spark] def getNarrowAncestors: Seq[RDD[_]] = { val ancestors = new mutable.HashSet[RDD[_]] def visit(rdd: RDD[_]): Unit = { val narrowDependencies = rdd.dependencies.filter(_.isInstanceOf[NarrowDependency[_]]) val narrowParents = narrowDependencies.map(_.rdd) val narrowParentsNotVisited = narrowParents.filterNot(ancestors.contains) narrowParentsNotVisited.foreach { parent => ancestors.add(parent) visit(parent) } } visit(this) // In case there is a cycle, do not include the root itself ancestors.filterNot(_ == this).toSeq } /** * Compute an RDD partition or read it from a checkpoint if the RDD is checkpointing. */ private[spark] def computeOrReadCheckpoint(split: Partition, context: TaskContext): Iterator[T] = { if (isCheckpointedAndMaterialized) { firstParent[T].iterator(split, context) } else { compute(split, context) } } /** * Gets or computes an RDD partition. Used by RDD.iterator() when an RDD is cached. */ private[spark] def getOrCompute(partition: Partition, context: TaskContext): Iterator[T] = { val blockId = RDDBlockId(id, partition.index) var readCachedBlock = true // This method is called on executors, so we need call SparkEnv.get instead of sc.env. SparkEnv.get.blockManager.getOrElseUpdate(blockId, storageLevel, elementClassTag, () => { readCachedBlock = false computeOrReadCheckpoint(partition, context) }) match { case Left(blockResult) => if (readCachedBlock) { val existingMetrics = context.taskMetrics().inputMetrics existingMetrics.incBytesRead(blockResult.bytes) new InterruptibleIterator[T](context, blockResult.data.asInstanceOf[Iterator[T]]) { override def next(): T = { existingMetrics.incRecordsRead(1) delegate.next() } } } else { new InterruptibleIterator(context, blockResult.data.asInstanceOf[Iterator[T]]) } case Right(iter) => new InterruptibleIterator(context, iter.asInstanceOf[Iterator[T]]) } } /** * Execute a block of code in a scope such that all new RDDs created in this body will * be part of the same scope. For more detail, see {{org.apache.spark.rdd.RDDOperationScope}}. * * Note: Return statements are NOT allowed in the given body. */ private[spark] def withScope[U](body: => U): U = RDDOperationScope.withScope[U](sc)(body) // ======================================================================= // TODO 转换算子 // ======================================================================= /** * Return a new RDD by applying a function to all elements of this RDD. */ def map[U: ClassTag](f: T => U): RDD[U] = withScope { val cleanF = sc.clean(f) new MapPartitionsRDD[U, T](this, (_, _, iter) => iter.map(cleanF)) } /** * Return a new RDD by first applying a function to all elements of this * RDD, and then flattening the results. */ def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U] = withScope { val cleanF = sc.clean(f) new MapPartitionsRDD[U, T](this, (_, _, iter) => iter.flatMap(cleanF)) } /** * Return a new RDD containing only the elements that satisfy a predicate. */ def filter(f: T => Boolean): RDD[T] = withScope { val cleanF = sc.clean(f) new MapPartitionsRDD[T, T]( this, (_, _, iter) => iter.filter(cleanF), preservesPartitioning = true) } /** * Return a new RDD containing the distinct elements in this RDD. */ def distinct(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope { def removeDuplicatesInPartition(partition: Iterator[T]): Iterator[T] = { // Create an instance of external append only map which ignores values. val map = new ExternalAppendOnlyMap[T, Null, Null]( createCombiner = _ => null, mergeValue = (a, b) => a, mergeCombiners = (a, b) => a) map.insertAll(partition.map(_ -> null)) map.iterator.map(_._1) } partitioner match { case Some(_) if numPartitions == partitions.length => mapPartitions(removeDuplicatesInPartition, preservesPartitioning = true) case _ => map(x => (x, null)).reduceByKey((x, _) => x, numPartitions).map(_._1) } } /** * Return a new RDD containing the distinct elements in this RDD. */ def distinct(): RDD[T] = withScope { distinct(partitions.length) } /** * Return a new RDD that has exactly numPartitions partitions. * * Can increase or decrease the level of parallelism in this RDD. Internally, this uses * a shuffle to redistribute data. * * If you are decreasing the number of partitions in this RDD, consider using `coalesce`, * which can avoid performing a shuffle. */ def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope { coalesce(numPartitions, shuffle = true) } /** * Return a new RDD that is reduced into `numPartitions` partitions. * * This results in a narrow dependency, e.g. if you go from 1000 partitions * to 100 partitions, there will not be a shuffle, instead each of the 100 * new partitions will claim 10 of the current partitions. If a larger number * of partitions is requested, it will stay at the current number of partitions. * * However, if you're doing a drastic coalesce, e.g. to numPartitions = 1, * this may result in your computation taking place on fewer nodes than * you like (e.g. one node in the case of numPartitions = 1). To avoid this, * you can pass shuffle = true. This will add a shuffle step, but means the * current upstream partitions will be executed in parallel (per whatever * the current partitioning is). * * @note With shuffle = true, you can actually coalesce to a larger number * of partitions. This is useful if you have a small number of partitions, * say 100, potentially with a few partitions being abnormally large. Calling * coalesce(1000, shuffle = true) will result in 1000 partitions with the * data distributed using a hash partitioner. The optional partition coalescer * passed in must be serializable. */ def coalesce(numPartitions: Int, shuffle: Boolean = false, partitionCoalescer: Option[PartitionCoalescer] = Option.empty) (implicit ord: Ordering[T] = null) : RDD[T] = withScope { require(numPartitions > 0, s"Number of partitions ($numPartitions) must be positive.") if (shuffle) { /** Distributes elements evenly across output partitions, starting from a random partition. */ val distributePartition = (index: Int, items: Iterator[T]) => { var position = new Random(hashing.byteswap32(index)).nextInt(numPartitions) items.map { t => // Note that the hash code of the key will just be the key itself. The HashPartitioner // will mod it with the number of total partitions. position = position + 1 (position, t) } } : Iterator[(Int, T)] // include a shuffle step so that our upstream tasks are still distributed new CoalescedRDD( new ShuffledRDD[Int, T, T]( mapPartitionsWithIndexInternal(distributePartition, isOrderSensitive = true), new HashPartitioner(numPartitions)), numPartitions, partitionCoalescer).values } else { new CoalescedRDD(this, numPartitions, partitionCoalescer) } } /** * Return a sampled subset of this RDD. * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] * with replacement: expected number of times each element is chosen; fraction must be greater * than or equal to 0 * @param seed seed for the random number generator * * @note This is NOT guaranteed to provide exactly the fraction of the count * of the given [[RDD]]. */ def sample( withReplacement: Boolean, fraction: Double, seed: Long = Utils.random.nextLong): RDD[T] = { require(fraction >= 0, s"Fraction must be nonnegative, but got ${fraction}") withScope { require(fraction >= 0.0, "Negative fraction value: " + fraction) if (withReplacement) { new PartitionwiseSampledRDD[T, T](this, new PoissonSampler[T](fraction), true, seed) } else { new PartitionwiseSampledRDD[T, T](this, new BernoulliSampler[T](fraction), true, seed) } } } /** * Randomly splits this RDD with the provided weights. * * @param weights weights for splits, will be normalized if they don't sum to 1 * @param seed random seed * * @return split RDDs in an array */ def randomSplit( weights: Array[Double], seed: Long = Utils.random.nextLong): Array[RDD[T]] = { require(weights.forall(_ >= 0), s"Weights must be nonnegative, but got ${weights.mkString("[", ",", "]")}") require(weights.sum > 0, s"Sum of weights must be positive, but got ${weights.mkString("[", ",", "]")}") withScope { val sum = weights.sum val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _) normalizedCumWeights.sliding(2).map { x => randomSampleWithRange(x(0), x(1), seed) }.toArray } } /** * Internal method exposed for Random Splits in DataFrames. Samples an RDD given a probability * range. * @param lb lower bound to use for the Bernoulli sampler * @param ub upper bound to use for the Bernoulli sampler * @param seed the seed for the Random number generator * @return A random sub-sample of the RDD without replacement. */ private[spark] def randomSampleWithRange(lb: Double, ub: Double, seed: Long): RDD[T] = { this.mapPartitionsWithIndex( { (index, partition) => val sampler = new BernoulliCellSampler[T](lb, ub) sampler.setSeed(seed + index) sampler.sample(partition) }, isOrderSensitive = true, preservesPartitioning = true) } /** * Return a fixed-size sampled subset of this RDD in an array * * @param withReplacement whether sampling is done with replacement * @param num size of the returned sample * @param seed seed for the random number generator * @return sample of specified size in an array * * @note this method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. */ def takeSample( withReplacement: Boolean, num: Int, seed: Long = Utils.random.nextLong): Array[T] = withScope { val numStDev = 10.0 require(num >= 0, "Negative number of elements requested") require(num <= (Int.MaxValue - (numStDev * math.sqrt(Int.MaxValue)).toInt), "Cannot support a sample size > Int.MaxValue - " + s"$numStDev * math.sqrt(Int.MaxValue)") if (num == 0) { new Array[T](0) } else { val initialCount = this.count() if (initialCount == 0) { new Array[T](0) } else { val rand = new Random(seed) if (!withReplacement && num >= initialCount) { Utils.randomizeInPlace(this.collect(), rand) } else { val fraction = SamplingUtils.computeFractionForSampleSize(num, initialCount, withReplacement) var samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() // If the first sample didn't turn out large enough, keep trying to take samples; // this shouldn't happen often because we use a big multiplier for the initial size var numIters = 0 while (samples.length < num) { logWarning(s"Needed to re-sample due to insufficient sample size. Repeat #$numIters") samples = this.sample(withReplacement, fraction, rand.nextInt()).collect() numIters += 1 } Utils.randomizeInPlace(samples, rand).take(num) } } } } /** * Return the union of this RDD and another one. Any identical elements will appear multiple * times (use `.distinct()` to eliminate them). */ def union(other: RDD[T]): RDD[T] = withScope { sc.union(this, other) } /** * Return the union of this RDD and another one. Any identical elements will appear multiple * times (use `.distinct()` to eliminate them). */ def ++(other: RDD[T]): RDD[T] = withScope { this.union(other) } /** * Return this RDD sorted by the given key function. */ def sortBy[K]( f: (T) => K, ascending: Boolean = true, numPartitions: Int = this.partitions.length) (implicit ord: Ordering[K], ctag: ClassTag[K]): RDD[T] = withScope { this.keyBy[K](f) .sortByKey(ascending, numPartitions) .values } /** * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * * @note This method performs a shuffle internally. */ def intersection(other: RDD[T]): RDD[T] = withScope { this.map(v => (v, null)).cogroup(other.map(v => (v, null))) .filter { case (_, (leftGroup, rightGroup)) => leftGroup.nonEmpty && rightGroup.nonEmpty } .keys } /** * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * * @note This method performs a shuffle internally. * * @param partitioner Partitioner to use for the resulting RDD */ def intersection( other: RDD[T], partitioner: Partitioner)(implicit ord: Ordering[T] = null): RDD[T] = withScope { this.map(v => (v, null)).cogroup(other.map(v => (v, null)), partitioner) .filter { case (_, (leftGroup, rightGroup)) => leftGroup.nonEmpty && rightGroup.nonEmpty } .keys } /** * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. Performs a hash partition across the cluster * * @note This method performs a shuffle internally. * * @param numPartitions How many partitions to use in the resulting RDD */ def intersection(other: RDD[T], numPartitions: Int): RDD[T] = withScope { intersection(other, new HashPartitioner(numPartitions)) } /** * Return an RDD created by coalescing all elements within each partition into an array. */ def glom(): RDD[Array[T]] = withScope { new MapPartitionsRDD[Array[T], T](this, (_, _, iter) => Iterator(iter.toArray)) } /** * Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of * elements (a, b) where a is in `this` and b is in `other`. */ def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)] = withScope { new CartesianRDD(sc, this, other) } /** * Return an RDD of grouped items. Each group consists of a key and a sequence of elements * mapping to that key. The ordering of elements within each group is not guaranteed, and * may even differ each time the resulting RDD is evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K](f: T => K)(implicit kt: ClassTag[K]): RDD[(K, Iterable[T])] = withScope { groupBy[K](f, defaultPartitioner(this)) } /** * Return an RDD of grouped elements. Each group consists of a key and a sequence of elements * mapping to that key. The ordering of elements within each group is not guaranteed, and * may even differ each time the resulting RDD is evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K]( f: T => K, numPartitions: Int)(implicit kt: ClassTag[K]): RDD[(K, Iterable[T])] = withScope { groupBy(f, new HashPartitioner(numPartitions)) } /** * Return an RDD of grouped items. Each group consists of a key and a sequence of elements * mapping to that key. The ordering of elements within each group is not guaranteed, and * may even differ each time the resulting RDD is evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K](f: T => K, p: Partitioner)(implicit kt: ClassTag[K], ord: Ordering[K] = null) : RDD[(K, Iterable[T])] = withScope { val cleanF = sc.clean(f) this.map(t => (cleanF(t), t)).groupByKey(p) } /** * Return an RDD created by piping elements to a forked external process. */ def pipe(command: String): RDD[String] = withScope { // Similar to Runtime.exec(), if we are given a single string, split it into words // using a standard StringTokenizer (i.e. by spaces) pipe(PipedRDD.tokenize(command)) } /** * Return an RDD created by piping elements to a forked external process. */ def pipe(command: String, env: Map[String, String]): RDD[String] = withScope { // Similar to Runtime.exec(), if we are given a single string, split it into words // using a standard StringTokenizer (i.e. by spaces) pipe(PipedRDD.tokenize(command), env) } /** * Return an RDD created by piping elements to a forked external process. The resulting RDD * is computed by executing the given process once per partition. All elements * of each input partition are written to a process's stdin as lines of input separated * by a newline. The resulting partition consists of the process's stdout output, with * each line of stdout resulting in one element of the output partition. A process is invoked * even for empty partitions. * * The print behavior can be customized by providing two functions. * * @param command command to run in forked process. * @param env environment variables to set. * @param printPipeContext Before piping elements, this function is called as an opportunity * to pipe context data. Print line function (like out.println) will be * passed as printPipeContext's parameter. * @param printRDDElement Use this function to customize how to pipe elements. This function * will be called with each RDD element as the 1st parameter, and the * print line function (like out.println()) as the 2nd parameter. * An example of pipe the RDD data of groupBy() in a streaming way, * instead of constructing a huge String to concat all the elements: * {{{ * def printRDDElement(record:(String, Seq[String]), f:String=>Unit) = * for (e <- record._2) {f(e)} * }}} * @param separateWorkingDir Use separate working directories for each task. * @param bufferSize Buffer size for the stdin writer for the piped process. * @param encoding Char encoding used for interacting (via stdin, stdout and stderr) with * the piped process * @return the result RDD */ def pipe( command: Seq[String], env: Map[String, String] = Map(), printPipeContext: (String => Unit) => Unit = null, printRDDElement: (T, String => Unit) => Unit = null, separateWorkingDir: Boolean = false, bufferSize: Int = 8192, encoding: String = Codec.defaultCharsetCodec.name): RDD[String] = withScope { new PipedRDD(this, command, env, if (printPipeContext ne null) sc.clean(printPipeContext) else null, if (printRDDElement ne null) sc.clean(printRDDElement) else null, separateWorkingDir, bufferSize, encoding) } /** * Return a new RDD by applying a function to each partition of this RDD. * * `preservesPartitioning` indicates whether the input function preserves the partitioner, which * should be `false` unless this is a pair RDD and the input function doesn't modify the keys. */ def mapPartitions[U: ClassTag]( f: Iterator[T] => Iterator[U], preservesPartitioning: Boolean = false): RDD[U] = withScope { val cleanedF = sc.clean(f) new MapPartitionsRDD( this, (_: TaskContext, _: Int, iter: Iterator[T]) => cleanedF(iter), preservesPartitioning) } /** * [performance] Spark's internal mapPartitionsWithIndex method that skips closure cleaning. * It is a performance API to be used carefully only if we are sure that the RDD elements are * serializable and don't require closure cleaning. * * @param preservesPartitioning indicates whether the input function preserves the partitioner, * which should be `false` unless this is a pair RDD and the input * function doesn't modify the keys. * @param isOrderSensitive whether or not the function is order-sensitive. If it's order * sensitive, it may return totally different result when the input order * is changed. Mostly stateful functions are order-sensitive. */ private[spark] def mapPartitionsWithIndexInternal[U: ClassTag]( f: (Int, Iterator[T]) => Iterator[U], preservesPartitioning: Boolean = false, isOrderSensitive: Boolean = false): RDD[U] = withScope { new MapPartitionsRDD( this, (_: TaskContext, index: Int, iter: Iterator[T]) => f(index, iter), preservesPartitioning = preservesPartitioning, isOrderSensitive = isOrderSensitive) } /** * [performance] Spark's internal mapPartitions method that skips closure cleaning. */ private[spark] def mapPartitionsInternal[U: ClassTag]( f: Iterator[T] => Iterator[U], preservesPartitioning: Boolean = false): RDD[U] = withScope { new MapPartitionsRDD( this, (_: TaskContext, _: Int, iter: Iterator[T]) => f(iter), preservesPartitioning) } /** * Return a new RDD by applying a function to each partition of this RDD, while tracking the index * of the original partition. * * `preservesPartitioning` indicates whether the input function preserves the partitioner, which * should be `false` unless this is a pair RDD and the input function doesn't modify the keys. */ def mapPartitionsWithIndex[U: ClassTag]( f: (Int, Iterator[T]) => Iterator[U], preservesPartitioning: Boolean = false): RDD[U] = withScope { val cleanedF = sc.clean(f) new MapPartitionsRDD( this, (_: TaskContext, index: Int, iter: Iterator[T]) => cleanedF(index, iter), preservesPartitioning) } /** * Return a new RDD by applying a function to each partition of this RDD, while tracking the index * of the original partition. * * `preservesPartitioning` indicates whether the input function preserves the partitioner, which * should be `false` unless this is a pair RDD and the input function doesn't modify the keys. * * `isOrderSensitive` indicates whether the function is order-sensitive. If it is order * sensitive, it may return totally different result when the input order * is changed. Mostly stateful functions are order-sensitive. */ private[spark] def mapPartitionsWithIndex[U: ClassTag]( f: (Int, Iterator[T]) => Iterator[U], preservesPartitioning: Boolean, isOrderSensitive: Boolean): RDD[U] = withScope { val cleanedF = sc.clean(f) new MapPartitionsRDD( this, (_: TaskContext, index: Int, iter: Iterator[T]) => cleanedF(index, iter), preservesPartitioning, isOrderSensitive = isOrderSensitive) } /** * Zips this RDD with another one, returning key-value pairs with the first element in each RDD, * second element in each RDD, etc. Assumes that the two RDDs have the *same number of * partitions* and the *same number of elements in each partition* (e.g. one was made through * a map on the other). */ def zip[U: ClassTag](other: RDD[U]): RDD[(T, U)] = withScope { zipPartitions(other, preservesPartitioning = false) { (thisIter, otherIter) => new Iterator[(T, U)] { def hasNext: Boolean = (thisIter.hasNext, otherIter.hasNext) match { case (true, true) => true case (false, false) => false case _ => throw new SparkException("Can only zip RDDs with " + "same number of elements in each partition") } def next(): (T, U) = (thisIter.next(), otherIter.next()) } } } /** * Zip this RDD's partitions with one (or more) RDD(s) and return a new RDD by * applying a function to the zipped partitions. Assumes that all the RDDs have the * *same number of partitions*, but does *not* require them to have the same number * of elements in each partition. */ def zipPartitions[B: ClassTag, V: ClassTag] (rdd2: RDD[B], preservesPartitioning: Boolean) (f: (Iterator[T], Iterator[B]) => Iterator[V]): RDD[V] = withScope { new ZippedPartitionsRDD2(sc, sc.clean(f), this, rdd2, preservesPartitioning) } def zipPartitions[B: ClassTag, V: ClassTag] (rdd2: RDD[B]) (f: (Iterator[T], Iterator[B]) => Iterator[V]): RDD[V] = withScope { zipPartitions(rdd2, preservesPartitioning = false)(f) } def zipPartitions[B: ClassTag, C: ClassTag, V: ClassTag] (rdd2: RDD[B], rdd3: RDD[C], preservesPartitioning: Boolean) (f: (Iterator[T], Iterator[B], Iterator[C]) => Iterator[V]): RDD[V] = withScope { new ZippedPartitionsRDD3(sc, sc.clean(f), this, rdd2, rdd3, preservesPartitioning) } def zipPartitions[B: ClassTag, C: ClassTag, V: ClassTag] (rdd2: RDD[B], rdd3: RDD[C]) (f: (Iterator[T], Iterator[B], Iterator[C]) => Iterator[V]): RDD[V] = withScope { zipPartitions(rdd2, rdd3, preservesPartitioning = false)(f) } def zipPartitions[B: ClassTag, C: ClassTag, D: ClassTag, V: ClassTag] (rdd2: RDD[B], rdd3: RDD[C], rdd4: RDD[D], preservesPartitioning: Boolean) (f: (Iterator[T], Iterator[B], Iterator[C], Iterator[D]) => Iterator[V]): RDD[V] = withScope { new ZippedPartitionsRDD4(sc, sc.clean(f), this, rdd2, rdd3, rdd4, preservesPartitioning) } def zipPartitions[B: ClassTag, C: ClassTag, D: ClassTag, V: ClassTag] (rdd2: RDD[B], rdd3: RDD[C], rdd4: RDD[D]) (f: (Iterator[T], Iterator[B], Iterator[C], Iterator[D]) => Iterator[V]): RDD[V] = withScope { zipPartitions(rdd2, rdd3, rdd4, preservesPartitioning = false)(f) } // Actions (launch a job to return a value to the user program) // ======================================================================= // TODO 行动算子 // ======================================================================= /** * Applies a function f to all elements of this RDD. */ def foreach(f: T => Unit): Unit = withScope { val cleanF = sc.clean(f) sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF)) } /** * Applies a function f to each partition of this RDD. */ def foreachPartition(f: Iterator[T] => Unit): Unit = withScope { val cleanF = sc.clean(f) sc.runJob(this, (iter: Iterator[T]) => cleanF(iter)) } /** * Return an array that contains all of the elements in this RDD. * * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. */ def collect(): Array[T] = withScope { val results = sc.runJob(this, (iter: Iterator[T]) => iter.toArray) Array.concat(results: _*) } /** * Return an iterator that contains all of the elements in this RDD. * * The iterator will consume as much memory as the largest partition in this RDD. * * @note This results in multiple Spark jobs, and if the input RDD is the result * of a wide transformation (e.g. join with different partitioners), to avoid * recomputing the input RDD should be cached first. */ def toLocalIterator: Iterator[T] = withScope { def collectPartition(p: Int): Array[T] = { sc.runJob(this, (iter: Iterator[T]) => iter.toArray, Seq(p)).head } partitions.indices.iterator.flatMap(i => collectPartition(i)) } /** * Return an RDD that contains all matching values by applying `f`. */ def collect[U: ClassTag](f: PartialFunction[T, U]): RDD[U] = withScope { val cleanF = sc.clean(f) filter(cleanF.isDefinedAt).map(cleanF) } /** * Return an RDD with the elements from `this` that are not in `other`. * * Uses `this` partitioner/partition size, because even if `other` is huge, the resulting * RDD will be <= us. */ def subtract(other: RDD[T]): RDD[T] = withScope { subtract(other, partitioner.getOrElse(new HashPartitioner(partitions.length))) } /** * Return an RDD with the elements from `this` that are not in `other`. */ def subtract(other: RDD[T], numPartitions: Int): RDD[T] = withScope { subtract(other, new HashPartitioner(numPartitions)) } /** * Return an RDD with the elements from `this` that are not in `other`. */ def subtract( other: RDD[T], p: Partitioner)(implicit ord: Ordering[T] = null): RDD[T] = withScope { if (partitioner == Some(p)) { // Our partitioner knows how to handle T (which, since we have a partitioner, is // really (K, V)) so make a new Partitioner that will de-tuple our fake tuples val p2 = new Partitioner() { override def numPartitions: Int = p.numPartitions override def getPartition(k: Any): Int = p.getPartition(k.asInstanceOf[(Any, _)]._1) } // Unfortunately, since we're making a new p2, we'll get ShuffleDependencies // anyway, and when calling .keys, will not have a partitioner set, even though // the SubtractedRDD will, thanks to p2's de-tupled partitioning, already be // partitioned by the right/real keys (e.g. p). this.map(x => (x, null)).subtractByKey(other.map((_, null)), p2).keys } else { this.map(x => (x, null)).subtractByKey(other.map((_, null)), p).keys } } /** * Reduces the elements of this RDD using the specified commutative and * associative binary operator. */ def reduce(f: (T, T) => T): T = withScope { val cleanF = sc.clean(f) val reducePartition: Iterator[T] => Option[T] = iter => { if (iter.hasNext) { Some(iter.reduceLeft(cleanF)) } else { None } } var jobResult: Option[T] = None val mergeResult = (_: Int, taskResult: Option[T]) => { if (taskResult.isDefined) { jobResult = jobResult match { case Some(value) => Some(f(value, taskResult.get)) case None => taskResult } } } sc.runJob(this, reducePartition, mergeResult) // Get the final result out of our Option, or throw an exception if the RDD was empty jobResult.getOrElse(throw new UnsupportedOperationException("empty collection")) } /** * Reduces the elements of this RDD in a multi-level tree pattern. * * @param depth suggested depth of the tree (default: 2) * @see [[org.apache.spark.rdd.RDD#reduce]] */ def treeReduce(f: (T, T) => T, depth: Int = 2): T = withScope { require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.") val cleanF = context.clean(f) val reducePartition: Iterator[T] => Option[T] = iter => { if (iter.hasNext) { Some(iter.reduceLeft(cleanF)) } else { None } } val partiallyReduced = mapPartitions(it => Iterator(reducePartition(it))) val op: (Option[T], Option[T]) => Option[T] = (c, x) => { if (c.isDefined && x.isDefined) { Some(cleanF(c.get, x.get)) } else if (c.isDefined) { c } else if (x.isDefined) { x } else { None } } partiallyReduced.treeAggregate(Option.empty[T])(op, op, depth) .getOrElse(throw new UnsupportedOperationException("empty collection")) } /** * Aggregate the elements of each partition, and then the results for all the partitions, using a * given associative function and a neutral "zero value". The function * op(t1, t2) is allowed to modify t1 and return it as its result value to avoid object * allocation; however, it should not modify t2. * * This behaves somewhat differently from fold operations implemented for non-distributed * collections in functional languages like Scala. This fold operation may be applied to * partitions individually, and then fold those results into the final result, rather than * apply the fold to each element sequentially in some defined ordering. For functions * that are not commutative, the result may differ from that of a fold applied to a * non-distributed collection. * * @param zeroValue the initial value for the accumulated result of each partition for the `op` * operator, and also the initial value for the combine results from different * partitions for the `op` operator - this will typically be the neutral * element (e.g. `Nil` for list concatenation or `0` for summation) * @param op an operator used to both accumulate results within a partition and combine results * from different partitions */ def fold(zeroValue: T)(op: (T, T) => T): T = withScope { // Clone the zero value since we will also be serializing it as part of tasks var jobResult = Utils.clone(zeroValue, sc.env.closureSerializer.newInstance()) val cleanOp = sc.clean(op) val foldPartition = (iter: Iterator[T]) => iter.fold(zeroValue)(cleanOp) val mergeResult = (_: Int, taskResult: T) => jobResult = op(jobResult, taskResult) sc.runJob(this, foldPartition, mergeResult) jobResult } /** * Aggregate the elements of each partition, and then the results for all the partitions, using * given combine functions and a neutral "zero value". This function can return a different result * type, U, than the type of this RDD, T. Thus, we need one operation for merging a T into an U * and one operation for merging two U's, as in scala.TraversableOnce. Both of these functions are * allowed to modify and return their first argument instead of creating a new U to avoid memory * allocation. * * @param zeroValue the initial value for the accumulated result of each partition for the * `seqOp` operator, and also the initial value for the combine results from * different partitions for the `combOp` operator - this will typically be the * neutral element (e.g. `Nil` for list concatenation or `0` for summation) * @param seqOp an operator used to accumulate results within a partition * @param combOp an associative operator used to combine results from different partitions */ def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U = withScope { // Clone the zero value since we will also be serializing it as part of tasks var jobResult = Utils.clone(zeroValue, sc.env.serializer.newInstance()) val cleanSeqOp = sc.clean(seqOp) val cleanCombOp = sc.clean(combOp) val aggregatePartition = (it: Iterator[T]) => it.aggregate(zeroValue)(cleanSeqOp, cleanCombOp) val mergeResult = (_: Int, taskResult: U) => jobResult = combOp(jobResult, taskResult) sc.runJob(this, aggregatePartition, mergeResult) jobResult } /** * Aggregates the elements of this RDD in a multi-level tree pattern. * This method is semantically identical to [[org.apache.spark.rdd.RDD#aggregate]]. * * @param depth suggested depth of the tree (default: 2) */ def treeAggregate[U: ClassTag](zeroValue: U)( seqOp: (U, T) => U, combOp: (U, U) => U, depth: Int = 2): U = withScope { require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.") if (partitions.length == 0) { Utils.clone(zeroValue, context.env.closureSerializer.newInstance()) } else { val cleanSeqOp = context.clean(seqOp) val cleanCombOp = context.clean(combOp) val aggregatePartition = (it: Iterator[T]) => it.aggregate(zeroValue)(cleanSeqOp, cleanCombOp) var partiallyAggregated: RDD[U] = mapPartitions(it => Iterator(aggregatePartition(it))) var numPartitions = partiallyAggregated.partitions.length val scale = math.max(math.ceil(math.pow(numPartitions, 1.0 / depth)).toInt, 2) // If creating an extra level doesn't help reduce // the wall-clock time, we stop tree aggregation. // Don't trigger TreeAggregation when it doesn't save wall-clock time while (numPartitions > scale + math.ceil(numPartitions.toDouble / scale)) { numPartitions /= scale val curNumPartitions = numPartitions partiallyAggregated = partiallyAggregated.mapPartitionsWithIndex { (i, iter) => iter.map((i % curNumPartitions, _)) }.foldByKey(zeroValue, new HashPartitioner(curNumPartitions))(cleanCombOp).values } val copiedZeroValue = Utils.clone(zeroValue, sc.env.closureSerializer.newInstance()) partiallyAggregated.fold(copiedZeroValue)(cleanCombOp) } } /** * Return the number of elements in the RDD. */ def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum /** * Approximate version of count() that returns a potentially incomplete result * within a timeout, even if not all tasks have finished. * * The confidence is the probability that the error bounds of the result will * contain the true value. That is, if countApprox were called repeatedly * with confidence 0.9, we would expect 90% of the results to contain the * true count. The confidence must be in the range [0,1] or an exception will * be thrown. * * @param timeout maximum time to wait for the job, in milliseconds * @param confidence the desired statistical confidence in the result * @return a potentially incomplete result, with error bounds */ def countApprox( timeout: Long, confidence: Double = 0.95): PartialResult[BoundedDouble] = withScope { require(0.0 <= confidence && confidence <= 1.0, s"confidence ($confidence) must be in [0,1]") val countElements: (TaskContext, Iterator[T]) => Long = { (_, iter) => var result = 0L while (iter.hasNext) { result += 1L iter.next() } result } val evaluator = new CountEvaluator(partitions.length, confidence) sc.runApproximateJob(this, countElements, evaluator, timeout) } /** * Return the count of each unique value in this RDD as a local map of (value, count) pairs. * * @note This method should only be used if the resulting map is expected to be small, as * the whole thing is loaded into the driver's memory. * To handle very large results, consider using * * {{{ * rdd.map(x => (x, 1L)).reduceByKey(_ + _) * }}} * * , which returns an RDD[T, Long] instead of a map. */ def countByValue()(implicit ord: Ordering[T] = null): Map[T, Long] = withScope { map(value => (value, null)).countByKey() } /** * Approximate version of countByValue(). * * @param timeout maximum time to wait for the job, in milliseconds * @param confidence the desired statistical confidence in the result * @return a potentially incomplete result, with error bounds */ def countByValueApprox(timeout: Long, confidence: Double = 0.95) (implicit ord: Ordering[T] = null) : PartialResult[Map[T, BoundedDouble]] = withScope { require(0.0 <= confidence && confidence <= 1.0, s"confidence ($confidence) must be in [0,1]") if (elementClassTag.runtimeClass.isArray) { throw new SparkException("countByValueApprox() does not support arrays") } val countPartition: (TaskContext, Iterator[T]) => OpenHashMap[T, Long] = { (_, iter) => val map = new OpenHashMap[T, Long] iter.foreach { t => map.changeValue(t, 1L, _ + 1L) } map } val evaluator = new GroupedCountEvaluator[T](partitions.length, confidence) sc.runApproximateJob(this, countPartition, evaluator, timeout) } /** * Return approximate number of distinct elements in the RDD. * * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available * <a href="https://doi.org/10.1145/2452376.2452456">here</a>. * * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero (`sp` is greater * than `p`) would trigger sparse representation of registers, which may reduce the memory * consumption and increase accuracy when the cardinality is small. * * @param p The precision value for the normal set. * `p` must be a value between 4 and `sp` if `sp` is not zero (32 max). * @param sp The precision value for the sparse set, between 0 and 32. * If `sp` equals 0, the sparse representation is skipped. */ def countApproxDistinct(p: Int, sp: Int): Long = withScope { require(p >= 4, s"p ($p) must be >= 4") require(sp <= 32, s"sp ($sp) must be <= 32") require(sp == 0 || p <= sp, s"p ($p) cannot be greater than sp ($sp)") val zeroCounter = new HyperLogLogPlus(p, sp) aggregate(zeroCounter)( (hll: HyperLogLogPlus, v: T) => { hll.offer(v) hll }, (h1: HyperLogLogPlus, h2: HyperLogLogPlus) => { h1.addAll(h2) h1 }).cardinality() } /** * Return approximate number of distinct elements in the RDD. * * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice: * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available * <a href="https://doi.org/10.1145/2452376.2452456">here</a>. * * @param relativeSD Relative accuracy. Smaller values create counters that require more space. * It must be greater than 0.000017. */ def countApproxDistinct(relativeSD: Double = 0.05): Long = withScope { require(relativeSD > 0.000017, s"accuracy ($relativeSD) must be greater than 0.000017") val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt countApproxDistinct(if (p < 4) 4 else p, 0) } /** * Zips this RDD with its element indices. The ordering is first based on the partition index * and then the ordering of items within each partition. So the first item in the first * partition gets index 0, and the last item in the last partition receives the largest index. * * This is similar to Scala's zipWithIndex but it uses Long instead of Int as the index type. * This method needs to trigger a spark job when this RDD contains more than one partitions. * * @note Some RDDs, such as those returned by groupBy(), do not guarantee order of * elements in a partition. The index assigned to each element is therefore not guaranteed, * and may even change if the RDD is reevaluated. If a fixed ordering is required to guarantee * the same index assignments, you should sort the RDD with sortByKey() or save it to a file. */ def zipWithIndex(): RDD[(T, Long)] = withScope { new ZippedWithIndexRDD(this) } /** * Zips this RDD with generated unique Long ids. Items in the kth partition will get ids k, n+k, * 2*n+k, ..., where n is the number of partitions. So there may exist gaps, but this method * won't trigger a spark job, which is different from [[org.apache.spark.rdd.RDD#zipWithIndex]]. * * @note Some RDDs, such as those returned by groupBy(), do not guarantee order of * elements in a partition. The unique ID assigned to each element is therefore not guaranteed, * and may even change if the RDD is reevaluated. If a fixed ordering is required to guarantee * the same index assignments, you should sort the RDD with sortByKey() or save it to a file. */ def zipWithUniqueId(): RDD[(T, Long)] = withScope { val n = this.partitions.length.toLong this.mapPartitionsWithIndex { case (k, iter) => Utils.getIteratorZipWithIndex(iter, 0L).map { case (item, i) => (item, i * n + k) } } } /** * Take the first num elements of the RDD. It works by first scanning one partition, and use the * results from that partition to estimate the number of additional partitions needed to satisfy * the limit. * * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. * * @note Due to complications in the internal implementation, this method will raise * an exception if called on an RDD of `Nothing` or `Null`. */ def take(num: Int): Array[T] = withScope { val scaleUpFactor = Math.max(conf.get(RDD_LIMIT_SCALE_UP_FACTOR), 2) if (num == 0) { new Array[T](0) } else { val buf = new ArrayBuffer[T] val totalParts = this.partitions.length var partsScanned = 0 while (buf.size < num && partsScanned < totalParts) { // The number of partitions to try in this iteration. It is ok for this number to be // greater than totalParts because we actually cap it at totalParts in runJob. var numPartsToTry = 1L val left = num - buf.size if (partsScanned > 0) { // If we didn't find any rows after the previous iteration, quadruple and retry. // Otherwise, interpolate the number of partitions we need to try, but overestimate // it by 50%. We also cap the estimation in the end. if (buf.isEmpty) { numPartsToTry = partsScanned * scaleUpFactor } else { // As left > 0, numPartsToTry is always >= 1 numPartsToTry = Math.ceil(1.5 * left * partsScanned / buf.size).toInt numPartsToTry = Math.min(numPartsToTry, partsScanned * scaleUpFactor) } } val p = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt) val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, p) res.foreach(buf ++= _.take(num - buf.size)) partsScanned += p.size } buf.toArray } } /** * Return the first element in this RDD. */ def first(): T = withScope { take(1) match { case Array(t) => t case _ => throw new UnsupportedOperationException("empty collection") } } /** * Returns the top k (largest) elements from this RDD as defined by the specified * implicit Ordering[T] and maintains the ordering. This does the opposite of * [[takeOrdered]]. For example: * {{{ * sc.parallelize(Seq(10, 4, 2, 12, 3)).top(1) * // returns Array(12) * * sc.parallelize(Seq(2, 3, 4, 5, 6)).top(2) * // returns Array(6, 5) * }}} * * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. * * @param num k, the number of top elements to return * @param ord the implicit ordering for T * @return an array of top elements */ def top(num: Int)(implicit ord: Ordering[T]): Array[T] = withScope { takeOrdered(num)(ord.reverse) } /** * Returns the first k (smallest) elements from this RDD as defined by the specified * implicit Ordering[T] and maintains the ordering. This does the opposite of [[top]]. * For example: * {{{ * sc.parallelize(Seq(10, 4, 2, 12, 3)).takeOrdered(1) * // returns Array(2) * * sc.parallelize(Seq(2, 3, 4, 5, 6)).takeOrdered(2) * // returns Array(2, 3) * }}} * * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. * * @param num k, the number of elements to return * @param ord the implicit ordering for T * @return an array of top elements */ def takeOrdered(num: Int)(implicit ord: Ordering[T]): Array[T] = withScope { if (num == 0) { Array.empty } else { val mapRDDs = mapPartitions { items => // Priority keeps the largest elements, so let's reverse the ordering. val queue = new BoundedPriorityQueue[T](num)(ord.reverse) queue ++= collectionUtils.takeOrdered(items, num)(ord) Iterator.single(queue) } if (mapRDDs.partitions.length == 0) { Array.empty } else { mapRDDs.reduce { (queue1, queue2) => queue1 ++= queue2 queue1 }.toArray.sorted(ord) } } } /** * Returns the max of this RDD as defined by the implicit Ordering[T]. * @return the maximum element of the RDD * */ def max()(implicit ord: Ordering[T]): T = withScope { this.reduce(ord.max) } /** * Returns the min of this RDD as defined by the implicit Ordering[T]. * @return the minimum element of the RDD * */ def min()(implicit ord: Ordering[T]): T = withScope { this.reduce(ord.min) } /** * @note Due to complications in the internal implementation, this method will raise an * exception if called on an RDD of `Nothing` or `Null`. This may be come up in practice * because, for example, the type of `parallelize(Seq())` is `RDD[Nothing]`. * (`parallelize(Seq())` should be avoided anyway in favor of `parallelize(Seq[T]())`.) * @return true if and only if the RDD contains no elements at all. Note that an RDD * may be empty even when it has at least 1 partition. */ def isEmpty(): Boolean = withScope { partitions.length == 0 || take(1).length == 0 } /** * Save this RDD as a text file, using string representations of elements. */ def saveAsTextFile(path: String): Unit = withScope { saveAsTextFile(path, null) } /** * Save this RDD as a compressed text file, using string representations of elements. */ def saveAsTextFile(path: String, codec: Class[_ <: CompressionCodec]): Unit = withScope { this.mapPartitions { iter => val text = new Text() iter.map { x => require(x != null, "text files do not allow null rows") text.set(x.toString) (NullWritable.get(), text) } }.saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path, codec) } /** * Save this RDD as a SequenceFile of serialized objects. */ def saveAsObjectFile(path: String): Unit = withScope { this.mapPartitions(iter => iter.grouped(10).map(_.toArray)) .map(x => (NullWritable.get(), new BytesWritable(Utils.serialize(x)))) .saveAsSequenceFile(path) } /** * Creates tuples of the elements in this RDD by applying `f`. */ def keyBy[K](f: T => K): RDD[(K, T)] = withScope { val cleanedF = sc.clean(f) map(x => (cleanedF(x), x)) } /** A private method for tests, to look at the contents of each partition */ private[spark] def collectPartitions(): Array[Array[T]] = withScope { sc.runJob(this, (iter: Iterator[T]) => iter.toArray) } /** * Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint * directory set with `SparkContext#setCheckpointDir` and all references to its parent * RDDs will be removed. This function must be called before any job has been * executed on this RDD. It is strongly recommended that this RDD is persisted in * memory, otherwise saving it on a file will require recomputation. */ def checkpoint(): Unit = RDDCheckpointData.synchronized { // NOTE: we use a global lock here due to complexities downstream with ensuring // children RDD partitions point to the correct parent partitions. In the future // we should revisit this consideration. if (context.checkpointDir.isEmpty) { throw new SparkException("Checkpoint directory has not been set in the SparkContext") } else if (checkpointData.isEmpty) { checkpointData = Some(new ReliableRDDCheckpointData(this)) } } /** * Mark this RDD for local checkpointing using Spark's existing caching layer. * * This method is for users who wish to truncate RDD lineages while skipping the expensive * step of replicating the materialized data in a reliable distributed file system. This is * useful for RDDs with long lineages that need to be truncated periodically (e.g. GraphX). * * Local checkpointing sacrifices fault-tolerance for performance. In particular, checkpointed * data is written to ephemeral local storage in the executors instead of to a reliable, * fault-tolerant storage. The effect is that if an executor fails during the computation, * the checkpointed data may no longer be accessible, causing an irrecoverable job failure. * * This is NOT safe to use with dynamic allocation, which removes executors along * with their cached blocks. If you must use both features, you are advised to set * `spark.dynamicAllocation.cachedExecutorIdleTimeout` to a high value. * * The checkpoint directory set through `SparkContext#setCheckpointDir` is not used. */ def localCheckpoint(): this.type = RDDCheckpointData.synchronized { if (conf.get(DYN_ALLOCATION_ENABLED) && conf.contains(DYN_ALLOCATION_CACHED_EXECUTOR_IDLE_TIMEOUT)) { logWarning("Local checkpointing is NOT safe to use with dynamic allocation, " + "which removes executors along with their cached blocks. If you must use both " + "features, you are advised to set `spark.dynamicAllocation.cachedExecutorIdleTimeout` " + "to a high value. E.g. If you plan to use the RDD for 1 hour, set the timeout to " + "at least 1 hour.") } // Note: At this point we do not actually know whether the user will call persist() on // this RDD later, so we must explicitly call it here ourselves to ensure the cached // blocks are registered for cleanup later in the SparkContext. // // If, however, the user has already called persist() on this RDD, then we must adapt // the storage level he/she specified to one that is appropriate for local checkpointing // (i.e. uses disk) to guarantee correctness. if (storageLevel == StorageLevel.NONE) { persist(LocalRDDCheckpointData.DEFAULT_STORAGE_LEVEL) } else { persist(LocalRDDCheckpointData.transformStorageLevel(storageLevel), allowOverride = true) } // If this RDD is already checkpointed and materialized, its lineage is already truncated. // We must not override our `checkpointData` in this case because it is needed to recover // the checkpointed data. If it is overridden, next time materializing on this RDD will // cause error. if (isCheckpointedAndMaterialized) { logWarning("Not marking RDD for local checkpoint because it was already " + "checkpointed and materialized") } else { // Lineage is not truncated yet, so just override any existing checkpoint data with ours checkpointData match { case Some(_: ReliableRDDCheckpointData[_]) => logWarning( "RDD was already marked for reliable checkpointing: overriding with local checkpoint.") case _ => } checkpointData = Some(new LocalRDDCheckpointData(this)) } this } /** * Return whether this RDD is checkpointed and materialized, either reliably or locally. */ def isCheckpointed: Boolean = isCheckpointedAndMaterialized /** * Return whether this RDD is checkpointed and materialized, either reliably or locally. * This is introduced as an alias for `isCheckpointed` to clarify the semantics of the * return value. Exposed for testing. */ private[spark] def isCheckpointedAndMaterialized: Boolean = checkpointData.exists(_.isCheckpointed) /** * Return whether this RDD is marked for local checkpointing. * Exposed for testing. */ private[rdd] def isLocallyCheckpointed: Boolean = { checkpointData match { case Some(_: LocalRDDCheckpointData[T]) => true case _ => false } } /** * Return whether this RDD is reliably checkpointed and materialized. */ private[rdd] def isReliablyCheckpointed: Boolean = { checkpointData match { case Some(reliable: ReliableRDDCheckpointData[_]) if reliable.isCheckpointed => true case _ => false } } /** * Gets the name of the directory to which this RDD was checkpointed. * This is not defined if the RDD is checkpointed locally. */ def getCheckpointFile: Option[String] = { checkpointData match { case Some(reliable: ReliableRDDCheckpointData[T]) => reliable.getCheckpointDir case _ => None } } /** * :: Experimental :: * Marks the current stage as a barrier stage, where Spark must launch all tasks together. * In case of a task failure, instead of only restarting the failed task, Spark will abort the * entire stage and re-launch all tasks for this stage. * The barrier execution mode feature is experimental and it only handles limited scenarios. * Please read the linked SPIP and design docs to understand the limitations and future plans. * @return an [[RDDBarrier]] instance that provides actions within a barrier stage * @see [[org.apache.spark.BarrierTaskContext]] * @see <a href="https://jira.apache.org/jira/browse/SPARK-24374">SPIP: Barrier Execution Mode</a> * @see <a href="https://jira.apache.org/jira/browse/SPARK-24582">Design Doc</a> */ @Experimental @Since("2.4.0") def barrier(): RDDBarrier[T] = withScope(new RDDBarrier[T](this)) // ======================================================================= // Other internal methods and fields // ======================================================================= private var storageLevel: StorageLevel = StorageLevel.NONE /** User code that created this RDD (e.g. `textFile`, `parallelize`). */ @transient private[spark] val creationSite = sc.getCallSite() /** * The scope associated with the operation that created this RDD. * * This is more flexible than the call site and can be defined hierarchically. For more * detail, see the documentation of {{RDDOperationScope}}. This scope is not defined if the * user instantiates this RDD himself without using any Spark operations. */ @transient private[spark] val scope: Option[RDDOperationScope] = { Option(sc.getLocalProperty(SparkContext.RDD_SCOPE_KEY)).map(RDDOperationScope.fromJson) } private[spark] def getCreationSite: String = Option(creationSite).map(_.shortForm).getOrElse("") private[spark] def elementClassTag: ClassTag[T] = classTag[T] private[spark] var checkpointData: Option[RDDCheckpointData[T]] = None // Whether to checkpoint all ancestor RDDs that are marked for checkpointing. By default, // we stop as soon as we find the first such RDD, an optimization that allows us to write // less data but is not safe for all workloads. E.g. in streaming we may checkpoint both // an RDD and its parent in every batch, in which case the parent may never be checkpointed // and its lineage never truncated, leading to OOMs in the long run (SPARK-6847). private val checkpointAllMarkedAncestors = Option(sc.getLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS)).exists(_.toBoolean) /** Returns the first parent RDD */ protected[spark] def firstParent[U: ClassTag]: RDD[U] = { dependencies.head.rdd.asInstanceOf[RDD[U]] } /** Returns the jth parent RDD: e.g. rdd.parent[T](0) is equivalent to rdd.firstParent[T] */ protected[spark] def parent[U: ClassTag](j: Int): RDD[U] = { dependencies(j).rdd.asInstanceOf[RDD[U]] } /** The [[org.apache.spark.SparkContext]] that this RDD was created on. */ def context: SparkContext = sc }
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 地球OL攻略 —— 某应届生求职总结
· 提示词工程——AI应用必不可少的技术
· 字符编码:从基础到乱码解决
· SpringCloud带你走进微服务的世界