spark小案例实战1(scala + spark2 版本:wordcount+sort)
案例需求:
1、对文本文件内的每个单词都统计出其出现的次数。
2、按照每个单词出现次数的数量,降序排序。
步骤:
- 1.创建RDD
- 2.将文本进行拆分 (flatMap)
- 3.将拆分后的单词进行统计 (map,reduceByKey)
- 4.反转键值对 (map)
- 5.按键降序排序 (sortByKey(false))
- 6.再次反转键值对 (map)
- 7.打印输出(foreach)
import org.apache.spark.sql.SparkSession
/**
 * Counts how often each word occurs in a text file and prints the words
 * ordered by descending occurrence count.
 *
 * Usage: `SortWordCount [inputPath]`. When no argument is given the original
 * hard-coded sample file is read.
 */
object SortWordCount {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultInputPath = "D:\\Users\\Administrator\\Desktop\\spark.txt"

  /**
   * Runs the word-count-and-sort job on a local Spark session.
   *
   * @param args optional; `args(0)` is the path of the text file to analyse
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val spark = SparkSession.builder().appName("SortWordCount").master("local").getOrCreate()

    // try/finally so the session is released even when the job fails
    // (for example when the input file does not exist).
    try {
      val lines = spark.sparkContext.textFile(inputPath)

      // Split on any run of whitespace and drop empty tokens; splitting on a
      // single space would count "" as a word whenever spaces repeat or a
      // line starts with whitespace.
      val words = lines.flatMap(_.split("\\s+")).filter(_.nonEmpty)

      val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _)

      // sortByKey orders by key, so make the count the key, sort descending,
      // then swap back to (word, count).
      val countWord = wordCounts.map(_.swap)
      val sortedCountWord = countWord.sortByKey(ascending = false)
      val sortedWordCount = sortedCountWord.map(_.swap)

      // Bring the result to the driver before printing: RDD.foreach runs on
      // the executors, where neither the global order nor the driver's
      // stdout is guaranteed. Fine for this small example; for large results
      // prefer take(n) or saveAsTextFile.
      sortedWordCount.collect().foreach { case (word, count) =>
        println(s"""word "$word" appears $count times.""")
      }
    } finally {
      spark.stop()
    }
  }
}