File path:
Code 1
package com.xiao.spark.core.wc

import org.apache.spark.{SparkConf, SparkContext}

object Spark01_WoldCount {

  def main(args: Array[String]): Unit = {
    // Establish the connection to the Spark framework
    val conf = new SparkConf().setMaster("local").setAppName("WordCount")
    val sc = new SparkContext(conf)

    // Business logic
    // Read the files under the directory, line by line
    val lines = sc.textFile("datas")

    // Split each line into words
    // Flattening: break the whole into individual elements
    val words = lines.flatMap(_.split(" "))

    // Group the data by word so it can be counted,
    // e.g. hello => (hello, hello, hello)
    val wordGroup = words.groupBy(word => word)

    // Transform the structure: (word, list) => (word, list.size)
    val wordCount = wordGroup.map {
      case (word, list) =>
        (word, list.size)
    }

    // Print the results
    wordCount.foreach(println)

    // Close the connection
    sc.stop()
  }
}
Result:
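The groupBy version shuffles every occurrence of a word into one group and only then counts the group's size. To make the intermediate shapes easier to see, here is a minimal, Spark-free sketch of the same transformation on a plain Scala collection; the two input lines are made up purely for illustration.

object GroupByCountSketch {

  def main(args: Array[String]): Unit = {
    // Hypothetical input, standing in for the lines read from "datas"
    val lines = List("hello spark", "hello scala")

    // Same steps as Code 1, but on a local collection instead of an RDD
    val words = lines.flatMap(_.split(" "))        // List(hello, spark, hello, scala)
    val wordGroup = words.groupBy(word => word)    // Map(hello -> List(hello, hello), spark -> List(spark), ...)
    val wordCount = wordGroup.map { case (word, list) => (word, list.size) }

    // Prints pairs such as (hello,2), (spark,1), (scala,1); the order is not guaranteed
    wordCount.foreach(println)
  }
}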
Code 2
package com.xiao.spark.core.wc

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Spark02_WoldCount {

  def main(args: Array[String]): Unit = {
    // Establish the connection to the Spark framework
    val conf = new SparkConf().setMaster("local").setAppName("WordCount")
    val sc = new SparkContext(conf)

    // Business logic
    // Read the files under the directory, line by line
    val lines = sc.textFile("datas")

    // Split each line into words
    // Flattening: break the whole into individual elements
    val words = lines.flatMap(_.split(" "))

    // Transform the structure: word => (word, 1)
    val wordToOne: RDD[(String, Int)] = words.map {
      word =>
        (word, 1)
    }

    // Sum the counts for each key
    val wordCount: RDD[(String, Int)] = wordToOne.reduceByKey(_ + _)

    // Print the results
    wordCount.foreach(println)

    // Close the connection
    sc.stop()
  }
}
Result:
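Compared with groupBy, reduceByKey merges the counts for each key inside every partition before the data is shuffled, so far less data moves across the network; that is why it is the preferred aggregation here. Note also that foreach(println) runs on the executors, so on a real cluster the output appears in executor logs; collecting the (small) result back to the driver first is the usual way to inspect it. Below is a hedged sketch of the same job written as a single chain; the object name Spark03_WordCount is hypothetical and the "datas" directory is assumed to exist as in the examples above.

package com.xiao.spark.core.wc

import org.apache.spark.{SparkConf, SparkContext}

object Spark03_WordCount {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("WordCount"))

    sc.textFile("datas")
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)   // pre-aggregates within each partition before the shuffle
      .collect()            // bring the small result set back to the driver
      .foreach(println)

    sc.stop()
  }
}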