累加器的高级使用--实现wordcount

  • HighWordCountAccumulator.scala
package accumulator

import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable

/**
 * Custom Spark accumulator that counts how many times each word has been added.
 *
 * Extends [[AccumulatorV2]] with two type parameters: the first (`Array[String]`) is the
 * type of the data passed to `add`; the second (`mutable.Map[String, Long]`) is the type
 * returned by `value`.
 */
class HighWordCountAccumulator extends AccumulatorV2[Array[String], mutable.Map[String, Long]] {
  // Running total of occurrences per word. Left as a public var because callers read it directly.
  var wordCountMap: mutable.Map[String, Long] = mutable.Map[String, Long]()

  /** @return true when no word has been counted yet. */
  override def isZero: Boolean = wordCountMap.isEmpty

  /**
   * Creates an independent copy of this accumulator.
   *
   * The map is cloned rather than shared: handing the same mutable map to the copy would
   * make later updates to either accumulator visible in the other.
   */
  override def copy(): AccumulatorV2[Array[String], mutable.Map[String, Long]] = {
    val duplicate = new HighWordCountAccumulator()
    duplicate.wordCountMap = wordCountMap.clone()
    duplicate
  }

  /** Resets the accumulator to its empty state by installing a fresh map. */
  override def reset(): Unit = {
    // Reassign instead of clearing so a map previously handed out via `value` is left untouched.
    wordCountMap = mutable.Map[String, Long]()
  }

  /**
   * Adds one occurrence for every element of `v`.
   *
   * @param v the words to count; duplicates within the array are each counted
   */
  override def add(v: Array[String]): Unit =
    v.foreach { word =>
      wordCountMap.update(word, wordCountMap.getOrElse(word, 0L) + 1L)
    }

  /**
   * Folds the counts held by `other` into this accumulator, summing per word.
   *
   * @param other the accumulator whose counts are merged in; it is only read
   */
  override def merge(other: AccumulatorV2[Array[String], mutable.Map[String, Long]]): Unit =
    other.value.foreach { case (word, count) =>
      wordCountMap.update(word, wordCountMap.getOrElse(word, 0L) + count)
    }

  /** @return the backing word-count map (the live map, not a snapshot). */
  override def value: mutable.Map[String, Long] = wordCountMap
}
  • HighAccCode.scala
package accumulator

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Driver that counts words in a text file using [[HighWordCountAccumulator]].
 *
 * The input path may be supplied as the first program argument; when absent, the
 * original default `hdfs://node1:9000/wc.txt` is used.
 */
object HighAccCode {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("accumulator")
    val sc = new SparkContext(sparkConf)
    try {
      val inputPath = args.headOption.getOrElse("hdfs://node1:9000/wc.txt")
      val rdd: RDD[String] = sc.textFile(inputPath)

      val hwca = new HighWordCountAccumulator()
      sc.register(hwca)

      // Update the accumulator inside an action rather than a transformation:
      // updates made in transformations may be applied more than once if a task or
      // stage is re-executed, and this also avoids collecting every word to the driver
      // merely to force evaluation.
      rdd.foreach(line => hwca.add(line.split(" ")))

      // Read the result through the accumulator's public accessor.
      println(hwca.value)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
posted @ 2022-08-24 20:02  jsqup  阅读(17)  评论(0编辑  收藏  举报