Spark算子实现wordCount的十种方法
/**
 * Word count, variant 1: `groupBy`.
 *
 * Splits every line on a single space, groups identical words together and
 * uses the size of each group as the count.
 *
 * @param sc SparkContext used to build the sample RDD
 * @return an RDD of (word, occurrences). The RDD is returned rather than
 *         bound to a discarded local so that a caller can run an action
 *         (e.g. `collect()`) on it.
 */
def wordCount1(sc: SparkContext): RDD[(String, Int)] = {
  val lines = sc.makeRDD(List("hello scala", "hello spark"))
  val words: RDD[String] = lines.flatMap(_.split(" "))
  // Key each word by itself, so every group holds all occurrences of one word.
  val grouped: RDD[(String, Iterable[String])] = words.groupBy(word => word)
  grouped.mapValues(occurrences => occurrences.size)
}
/**
 * Word count, variant 2: `groupByKey`.
 *
 * Maps every word to `(word, 1)`, groups the ones by word and counts how
 * many values each key collected.
 *
 * @param sc SparkContext used to build the sample RDD
 * @return an RDD of (word, occurrences). The RDD is returned rather than
 *         bound to a discarded local so that a caller can run an action
 *         (e.g. `collect()`) on it.
 */
def wordCount2(sc: SparkContext): RDD[(String, Int)] = {
  val lines = sc.makeRDD(List("hello scala", "hello spark"))
  val words: RDD[String] = lines.flatMap(_.split(" "))
  val wordToOne: RDD[(String, Int)] = words.map((_, 1))
  val grouped: RDD[(String, Iterable[Int])] = wordToOne.groupByKey()
  // Every value is 1, so the number of grouped values equals the word count.
  grouped.mapValues(ones => ones.size)
}
/**
 * Word count, variant 3: `reduceByKey`.
 *
 * Maps every word to `(word, 1)` and sums the ones per word.
 *
 * @param sc SparkContext used to build the sample RDD
 * @return an RDD of (word, occurrences). The RDD is returned rather than
 *         bound to a discarded local so that a caller can run an action
 *         (e.g. `collect()`) on it.
 */
def wordCount3(sc: SparkContext): RDD[(String, Int)] = {
  val lines = sc.makeRDD(List("hello scala", "hello spark"))
  val words: RDD[String] = lines.flatMap(_.split(" "))
  val wordToOne: RDD[(String, Int)] = words.map((_, 1))
  wordToOne.reduceByKey(_ + _)
}
/**
 * Word count, variant 4: `aggregateByKey`.
 *
 * Maps every word to `(word, 1)` and aggregates per word starting from 0,
 * using addition both for folding a value into the accumulator and for
 * merging two accumulators.
 *
 * @param sc SparkContext used to build the sample RDD
 * @return an RDD of (word, occurrences). The RDD is returned rather than
 *         bound to a discarded local so that a caller can run an action
 *         (e.g. `collect()`) on it.
 */
def wordCount4(sc: SparkContext): RDD[(String, Int)] = {
  val lines = sc.makeRDD(List("hello scala", "hello spark"))
  val words: RDD[String] = lines.flatMap(_.split(" "))
  val wordToOne: RDD[(String, Int)] = words.map((_, 1))
  // First function: accumulator + value; second function: accumulator + accumulator.
  wordToOne.aggregateByKey(0)(_ + _, _ + _)
}
/**
 * Word count, variant 5: `foldByKey`.
 *
 * Maps every word to `(word, 1)` and folds the ones per word with addition,
 * starting from 0.
 *
 * @param sc SparkContext used to build the sample RDD
 * @return an RDD of (word, occurrences). The RDD is returned rather than
 *         bound to a discarded local so that a caller can run an action
 *         (e.g. `collect()`) on it.
 */
def wordCount5(sc: SparkContext): RDD[(String, Int)] = {
  val lines = sc.makeRDD(List("hello scala", "hello spark"))
  val words: RDD[String] = lines.flatMap(_.split(" "))
  val wordToOne: RDD[(String, Int)] = words.map((_, 1))
  wordToOne.foldByKey(0)(_ + _)
}
/**
 * Word count, variant 6: `combineByKey`.
 *
 * Maps every word to `(word, 1)` and combines per word: the first value of a
 * key is used as the initial combiner, and addition is used both for merging
 * a value into a combiner and for merging two combiners.
 *
 * @param sc SparkContext used to build the sample RDD
 * @return an RDD of (word, occurrences). The RDD is returned rather than
 *         bound to a discarded local so that a caller can run an action
 *         (e.g. `collect()`) on it.
 */
def wordCount6(sc: SparkContext): RDD[(String, Int)] = {
  val lines = sc.makeRDD(List("hello scala", "hello spark"))
  val words: RDD[String] = lines.flatMap(_.split(" "))
  val wordToOne: RDD[(String, Int)] = words.map((_, 1))
  // The combiner type is pinned to Int so inference does not depend on the
  // expected type of the surrounding expression.
  wordToOne.combineByKey[Int](
    (first: Int) => first,
    (acc: Int, value: Int) => acc + value,
    (left: Int, right: Int) => left + right
  )
}
/**
 * Word count, variant 7: `countByKey`.
 *
 * Maps every word to `(word, 1)` and counts the number of elements per key.
 * Unlike the transformation-based variants, the result is a local
 * `collection.Map`, not an RDD.
 *
 * @param sc SparkContext used to build the sample RDD
 * @return a map from word to its number of occurrences (returned instead of
 *         being bound to a discarded local)
 */
def wordCount7(sc: SparkContext): collection.Map[String, Long] = {
  val lines = sc.makeRDD(List("hello scala", "hello spark"))
  val words: RDD[String] = lines.flatMap(_.split(" "))
  val wordToOne: RDD[(String, Int)] = words.map((_, 1))
  wordToOne.countByKey()
}
/**
 * Word count, variant 8: `countByValue`.
 *
 * Counts how often each distinct word occurs directly on the word RDD; no
 * `(word, 1)` pairing is needed. The result is a local `collection.Map`,
 * not an RDD.
 *
 * @param sc SparkContext used to build the sample RDD
 * @return a map from word to its number of occurrences (returned instead of
 *         being bound to a discarded local)
 */
def wordCount8(sc: SparkContext): collection.Map[String, Long] = {
  val lines = sc.makeRDD(List("hello scala", "hello spark"))
  val words: RDD[String] = lines.flatMap(_.split(" "))
  words.countByValue()
}
/**
 * Word count, variant 9: `reduce`.
 *
 * Turns every word into a single-entry mutable map `word -> 1` and reduces
 * all of those maps into one by adding up the counts of equal words.
 *
 * @param sc SparkContext used to build the sample RDD
 * @return a mutable map from word to its number of occurrences (returned
 *         instead of being bound to a discarded local)
 */
def wordCount9(sc: SparkContext): mutable.Map[String, Long] = {
  val lines = sc.makeRDD(List("hello scala", "hello spark"))
  val words: RDD[String] = lines.flatMap(_.split(" "))
  val singletonMaps: RDD[mutable.Map[String, Long]] =
    words.map(word => mutable.Map[String, Long](word -> 1L))
  singletonMaps.reduce { (merged, other) =>
    // Fold the right-hand map into the left-hand one and reuse the left map
    // as the running result.
    other.foreach { case (word, count) =>
      merged.update(word, merged.getOrElse(word, 0L) + count)
    }
    merged
  }
}
/**
 * Word count, variant 10: `aggregate`.
 *
 * Starts from an empty mutable map, adds each word to it (incrementing the
 * word's counter) and merges the resulting maps into one.
 *
 * @param sc SparkContext used to build the sample RDD
 * @return a mutable map from word to its number of occurrences (returned
 *         instead of being bound to a discarded local)
 */
def wordCount10(sc: SparkContext): mutable.Map[String, Long] = {
  val lines = sc.makeRDD(List("hello scala", "hello spark"))
  val words: RDD[String] = lines.flatMap(_.split(" "))
  words.aggregate(mutable.Map[String, Long]())(
    (counts, word) => {
      // Add the word to the map: if it is already present its count is
      // incremented, otherwise a new entry starting at 1 is created.
      // `key -> value` is syntactic sugar for the tuple (key, value).
      counts += (word -> (counts.getOrElse(word, 0L) + 1L))
      counts
    },
    (merged, other) => {
      // Merge the two maps by adding the counts of `other` into `merged`.
      other.foreach { case (word, count) =>
        merged.update(word, merged.getOrElse(word, 0L) + count)
      }
      merged
    }
  )
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通