day20 - Implementing WordCount with Spark
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object WordCount {
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration object
    val conf: SparkConf = new SparkConf().setAppName("wc").setMaster("local")
    // Create the Spark context
    val sc: SparkContext = new SparkContext(conf)
    // Set the log level
    sc.setLogLevel("WARN")
    // Read the input files
    val lines: RDD[String] = sc.textFile("datas/wc/input")
    // Split each line into words on spaces
    val words: RDD[String] = lines.flatMap(_.split(" "))
    // Turn each word into the pair (word, 1)
    val pairs: RDD[(String, Int)] = words.map((_, 1))
    // Use Spark's API to aggregate the values of identical keys
    val counts: RDD[(String, Int)] = pairs.reduceByKey(_ + _)
    // Collect the results to the driver and print them
    val tuples: Array[(String, Int)] = counts.collect()
    tuples.foreach(println)
    // Release the SparkContext resources
    sc.stop()
  }
}
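For reference, the same pipeline can also be written as a single chained expression. The sketch below additionally sorts the counts in descending order with sortBy and writes the result to disk with saveAsTextFile; the output path datas/wc/output is a hypothetical example, and that directory must not already exist when the job runs.

import org.apache.spark.{SparkConf, SparkContext}

object WordCountChained {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wc-chained").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")

    sc.textFile("datas/wc/input")          // read the input files
      .flatMap(_.split(" "))               // split each line into words
      .map((_, 1))                         // pair each word with 1
      .reduceByKey(_ + _)                  // sum the counts per word
      .sortBy(_._2, ascending = false)     // sort by count, descending
      .saveAsTextFile("datas/wc/output")   // hypothetical output path; must not exist yet

    sc.stop()
  }
}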
The required Maven dependency is:
<dependencies>
    <!-- Spark Core dependency -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.12</artifactId>
        <version>3.0.1</version>
    </dependency>
</dependencies>
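If the project uses sbt instead of Maven, an equivalent declaration (a sketch, assuming the project's scalaVersion is 2.12.x so that %% resolves to spark-core_2.12) would be:

// build.sbt (sketch, assuming scalaVersion := "2.12.x")
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.0.1"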