combineByKey
import org.apache.spark.{SparkConf, SparkContext}

def test66: Unit = {
  val initialScores = Array(("Fred", 88.0), ("Fred", 95.0), ("Fred", 91.0),
                            ("Wilma", 93.0), ("Wilma", 95.0), ("Wilma", 98.0))
  val conf = new SparkConf().setAppName("wc").setMaster("local[2]")
  val sc = new SparkContext(conf)

  /*
  val a = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"), 3)
  val b = sc.parallelize(List(1,1,2,2,2,1,2,2,2), 3)
  val c = b.zip(a)
  val d = c.combineByKey(List(_),
                         (x: List[String], y: String) => y :: x,
                         (x: List[String], y: List[String]) => x ::: y)
  d.collect
  res16: Array[(Int, List[String])] = Array((1,List(cat, dog, turkey)), (2,List(gnu, rabbit, salmon, bee, bear, wolf)))
  */

  // Approach 1: collect each key's scores into a List, then average it.
  val resrdd0 = sc.makeRDD(initialScores)
    .combineByKey(List(_),
      (x: List[Double], y: Double) => y :: x,
      (x: List[Double], y: List[Double]) => x ::: y)
    .map(t => (t._1, t._2.sum / t._2.size))
  println(resrdd0.collect().toBuffer)

  // Approach 2: carry a (count, sum) accumulator instead of materializing a List.
  type MVType = (Int, Double)
  val resrdd = sc.makeRDD(initialScores)
    .combineByKey(score => (1, score),
      (x: MVType, y: Double) => (x._1 + 1, x._2 + y),
      (x: MVType, y: MVType) => (x._1 + y._1, x._2 + y._2))
    .map(t => (t._1, t._2._2 / t._2._1))
  println(resrdd.collect().toBuffer)

  // Approach 3: reduceByKey's limitation (its input and output types must match)
  // can be worked around with a map before and a map after.
  val resrdd2 = sc.makeRDD(initialScores)
    .map(t => (t._1, List(t._2)))
    .reduceByKey(_ ::: _)
    .map(t => (t._1, t._2.sum / t._2.size))
  println(resrdd2.collect().toBuffer)
}
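All three variants above compute the per-person average score; run locally, each println should show Fred at roughly 91.3 and Wilma at roughly 95.3 (key ordering may vary between runs). The (count, sum) accumulator variant is generally preferable to collecting a List per key, since it keeps the per-key state and the shuffled data small.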
Spark function walkthrough: combineByKey

combineByKey combines the values of each key using user-supplied aggregation functions, turning an input of type RDD[(K, V)] into an RDD[(K, C)].

Function signatures:

def combineByKey[C](createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C): RDD[(K, C)]
def combineByKey[C](createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, numPartitions: Int): RDD[(K, C)]
def combineByKey[C](createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true, serializer: Serializer = null): RDD[(K, C)]

The first two overloads are implemented in terms of the third: they use a HashPartitioner and pass null for the Serializer. The third overload lets us choose the partitioner and, if needed, the Serializer as well. combineByKey matters because familiar operations such as aggregateByKey, foldByKey, and reduceByKey are all built on top of it. By default it performs the combine step on the map side.
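To make that relationship concrete, here is a minimal sketch (assuming a SparkContext named sc is already available) that expresses a reduceByKey-style sum directly with combineByKey; the three arguments correspond to createCombiner, mergeValue, and mergeCombiners:

  val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

  // reduceByKey version
  val viaReduce = pairs.reduceByKey(_ + _)

  // The same aggregation spelled out with combineByKey
  val viaCombine = pairs.combineByKey(
    (v: Int) => v,                  // createCombiner: the first value seen for a key becomes the combiner
    (c: Int, v: Int) => c + v,      // mergeValue: fold further values into the per-partition combiner
    (c1: Int, c2: Int) => c1 + c2   // mergeCombiners: merge partial results across partitions
  )

  // Both should collect to something like Array((a,4), (b,3)); because mapSideCombine
  // defaults to true, the per-partition combiners are built before the shuffle.
  println(viaCombine.collect().toBuffer)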