Spark Framework: WordCount Example Implementation

package wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Spark01_WordCount {
  def main(args: Array[String]): Unit = {
    //TODO Establish the connection to Spark
    val sparConf = new SparkConf().setMaster("local").setAppName("WordCount")    //basic configuration
    val sc = new SparkContext(sparConf)

    //TODO Execute the business logic
    //1. Read the file and get the data line by line
    val lines: RDD[String] = sc.textFile("datas/1.txt")

    //2. Split each line into individual words
    //   Flattening: break the whole into individual pieces
    //   lines "hello world", "hello world" => hello, world, hello, world
    val words: RDD[String] = lines.flatMap(_.split(" "))

    //3. Group the data by word to make counting easier
    //   (hello, [hello, hello]), (world, [world, world])
    val wordGroup: RDD[(String, Iterable[String])] = words.groupBy(word => word)

    //4. Transform the grouped data
    //   (hello, [hello, hello]), (world, [world, world]) => (hello, 2), (world, 2)
    val wordToCount = wordGroup.map {
      case (word, list) => {
        (word, list.size)
      }
    }

    //5. Collect the result and print it to the console
    val array: Array[(String, Int)] = wordToCount.collect()
    array.foreach(println)

    //TODO Close the Spark connection
    sc.stop()
  }
}
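Before running this on an RDD, the same flatMap / groupBy / map pipeline can be tried on a plain Scala collection, which makes the flattening and grouping steps easier to see. A minimal local sketch (not from the original post; the sample lines simply stand in for the contents of datas/1.txt):

object LocalWordCount {
  def main(args: Array[String]): Unit = {
    // Hypothetical input lines, standing in for datas/1.txt
    val lines = List("hello world", "hello spark")

    // Flattening: each line is split into words and all words are merged into one list
    val words = lines.flatMap(_.split(" "))              // List(hello, world, hello, spark)

    // Group identical words, then count the size of each group
    val wordGroup = words.groupBy(word => word)          // Map(hello -> List(hello, hello), ...)
    val wordToCount = wordGroup.map { case (word, list) => (word, list.size) }

    wordToCount.foreach(println)                         // e.g. (hello,2), (world,1), (spark,1)
  }
}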

Method 2

package wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Spark02_WordCount {
  def main(args: Array[String]): Unit = {
    //TODO Establish a local connection to Spark
    val sparConf = new SparkConf().setMaster("local").setAppName("WordCount")
    val sc = new SparkContext(sparConf)   //Spark instance

    //TODO Execute the business logic
    //1. Read the file and get the data line by line
    val lines: RDD[String] = sc.textFile("datas/1.txt")

    //2. Split each line into individual words
    val words: RDD[String] = lines.flatMap(_.split(" "))

    //3. Pair each word with the count 1: hello => (hello, 1)
    val wordToOne = words.map(
      word => (word,1)
    )

    //4. Group the (word, 1) tuples by the word itself
    val wordGroup: RDD[(String, Iterable[(String, Int)])] = wordToOne.groupBy(
      t => t._1
    )

    //5. Reduce each group by summing the attached 1s
    //   (hello, [(hello,1), (hello,1)]) => (hello, 2)
    val wordToCount = wordGroup.map {
      case (word,list) => {
        list.reduce(
          (t1, t2) => {
            (t1._1, t1._2 + t2._2)
          }
        )
      }
    }

    //6. Collect the result and print it to the console
    val array: Array[(String, Int)] = wordToCount.collect()
    array.foreach(println)

    //TODO Close the Spark connection
    sc.stop()
  }
}
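The pattern match over the grouped values in Method 2 can also be written with mapValues, which keeps the key and only transforms the grouped (word, 1) tuples. The following variant is a sketch, not part of the original post; the object name Spark02_WordCount_MapValues is made up, and everything up to the grouping step is the same as above:

package wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Spark02_WordCount_MapValues {
  def main(args: Array[String]): Unit = {
    val sparConf = new SparkConf().setMaster("local").setAppName("WordCount")
    val sc = new SparkContext(sparConf)

    val lines: RDD[String] = sc.textFile("datas/1.txt")
    val words: RDD[String] = lines.flatMap(_.split(" "))
    val wordToOne: RDD[(String, Int)] = words.map(word => (word, 1))
    val wordGroup: RDD[(String, Iterable[(String, Int)])] = wordToOne.groupBy(t => t._1)

    // mapValues leaves the key untouched and only maps the grouped values:
    // sum the attached 1s of each group to get the word count
    val wordToCount: RDD[(String, Int)] = wordGroup.mapValues(list => list.map(_._2).sum)

    wordToCount.collect().foreach(println)

    sc.stop()
  }
}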

Method 3

package wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Spark03_WordCount {
  def main(args: Array[String]): Unit = {
    //TODO Establish a local connection to Spark
    val sparConf = new SparkConf().setMaster("local").setAppName("WordCount")
    val sc = new SparkContext(sparConf)   //Spark instance

    //TODO Execute the business logic
    //1. Read the file and get the data line by line
    val lines: RDD[String] = sc.textFile("datas/1.txt")

    //2. Split each line into individual words
    val words: RDD[String] = lines.flatMap(_.split(" "))

    //3. Pair each word with the count 1: hello => (hello, 1)
    val wordToOne = words.map(
      word => (word,1)
    )

    //4. Spark provides richer operators: grouping and aggregation can be done in one call
    //   reduceByKey: for tuples with the same key, reduce (aggregate) their values
    val wordToCount = wordToOne.reduceByKey(_+_)

    //5. Collect the result and print it to the console
    val array: Array[(String, Int)] = wordToCount.collect()
    array.foreach(println)

    //TODO Close the Spark connection
    sc.stop()
  }
}
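As an optional follow-up to Method 3 (not in the original post): since reduceByKey already yields (word, count) pairs, the result can be sorted by count before collecting, for example to print the most frequent words first. A minimal sketch that would sit inside Spark03_WordCount right after wordToCount is computed:

    // Sort by the count (descending) while still distributed, then collect
    val sorted: Array[(String, Int)] = wordToCount.sortBy(t => t._2, ascending = false).collect()
    sorted.foreach(println)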

Run screenshot: (image omitted)
