Spark算子实现wordCount的十种方法

// groupBy
  def wordCount1(sc:SparkContext) = {
    val rdd = sc.makeRDD(List("hello scala","hello spark"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val group: RDD[(String, Iterable[String])] = words.groupBy(word => word)
    val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
  }

  

// groupByKey
  def wordCount2(sc:SparkContext) = {
    val rdd = sc.makeRDD(List("hello scala","hello spark"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordToOne: RDD[(String, Int)] = words.map((_, 1))
    val groupByKey: RDD[(String, Iterable[Int])] = wordToOne.groupByKey()
    val wordCount: RDD[(String, Int)] = groupByKey.mapValues(iter => iter.size)
  }

  

// reduceByKey
 def wordCount3(sc:SparkContext) = {
   val rdd = sc.makeRDD(List("hello scala","hello spark"))
   val words: RDD[String] = rdd.flatMap(_.split(" "))
   val wordToOne: RDD[(String, Int)] = words.map((_, 1))
   val wordCount: RDD[(String, Int)] = wordToOne.reduceByKey(_ + _)
 }

  

// aggregateByKey
  def wordCount4(sc:SparkContext) = {
    val rdd = sc.makeRDD(List("hello scala","hello spark"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordToOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordToOne.aggregateByKey(0)(_ + _,_ + _)
  }

  

// foldByKey
  def wordCount5(sc:SparkContext) = {
    val rdd = sc.makeRDD(List("hello scala","hello spark"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordToOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordToOne.foldByKey(0)(_ + _)
  }

  

// combineByKey
  def wordCount6(sc:SparkContext) = {
    val rdd = sc.makeRDD(List("hello scala","hello spark"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordToOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordToOne.combineByKey(
      v => v,
      (x: Int, y: Int) => x + y,
      (x: Int, y: Int) => x + y
    )
  }

  

// countByKey
  def wordCount7(sc:SparkContext) = {
    val rdd = sc.makeRDD(List("hello scala","hello spark"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordToOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: collection.Map[String, Long] = wordToOne.countByKey()
  }

  

// countByValue
  def wordCount8(sc:SparkContext) = {
    val rdd = sc.makeRDD(List("hello scala","hello spark"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordCount: collection.Map[String, Long] = words.countByValue()
  }

  

// reduce
 def wordCount9(sc:SparkContext) = {
   val rdd = sc.makeRDD(List("hello scala","hello spark"))
   val words: RDD[String] = rdd.flatMap(_.split(" "))
   val mapWord: RDD[mutable.Map[String, Long]] = words.map(
     word => {
       mutable.Map[String, Long]((word, 1))
     }
   )
   val wordCount: mutable.Map[String, Long] = mapWord.reduce(
     (map1, map2) => {
       map2.foreach{
         case (word,count) => {
           val newCount = map1.getOrElse(word,0L) + count
           map1.update(word,newCount)
         }
       }
       map1
     }
   )
 }

  

// aggregate
  def wordCount10(sc:SparkContext) = {
    val rdd = sc.makeRDD(List("hello scala","hello spark"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
  
    val wordCount: mutable.Map[String, Long] = words.aggregate(mutable.Map[String, Long]())(
      (map, word) => {
        // 将单词添加到映射,如果不存在则创建新的键值对
        // `->` 是一个特殊的操作符,用于创建键值对。
        // key -> value 是一个用于创建元组 (key, value) 的语法糖
        //将下述键值对添加到映射 map 中。
        //如果 word 已存在于映射中,则更新其值;否则,添加新的键值对。
        map += (word -> (map.getOrElse(word, 0L) + 1))
        map
      },
      (map1, map2) => {
        // 将两个映射合并
        map2.foreach {
          case (word, count) => {
            val newCount = map1.getOrElse(word, 0L) + count
            map1.update(word, newCount)
          }
        }
        map1
      }
    )
  }

  

posted @   sunshin1  阅读(72)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通
点击右上角即可分享
微信分享提示