Method 1
package wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Spark01_WordCount {

  def main(args: Array[String]): Unit = {
    // TODO Establish the connection to Spark
    val sparConf = new SparkConf().setMaster("local").setAppName("WordCount") // basic configuration
    val sc = new SparkContext(sparConf)

    // TODO Execute the business logic
    // 1. Read the file, obtaining the data one line at a time
    val lines: RDD[String] = sc.textFile("datas/1.txt")

    // 2. Split each line into individual words
    //    Flattening: break the whole into individual parts
    //    "hello world hello world" => hello, world, hello, world
    val words: RDD[String] = lines.flatMap(_.split(" "))

    // 3. Group the data by word so it can be counted
    //    => (hello, [hello, hello]), (world, [world, world])
    val wordGroup: RDD[(String, Iterable[String])] = words.groupBy(word => word)

    // 4. Transform the grouped data into (word, count) pairs
    //    (hello, [hello, hello]), (world, [world, world]) => (hello, 2), (world, 2)
    val wordToCount = wordGroup.map {
      case (word, list) => (word, list.size)
    }

    // 5. Collect the result to the driver and print it to the console
    val array: Array[(String, Int)] = wordToCount.collect()
    array.foreach(println)

    // TODO Close the Spark connection
    sc.stop()
  }
}
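Since wordGroup is a key-value RDD, step 4 can also be written with mapValues, which transforms only the value side of each pair and leaves the key untouched. A minimal sketch, reusing the wordGroup value from the listing above (wordToCountAlt is a hypothetical name):

// Equivalent to wordGroup.map { case (word, list) => (word, list.size) }:
// mapValues keeps each key and maps only its grouped values to their size.
val wordToCountAlt: RDD[(String, Int)] = wordGroup.mapValues(list => list.size)

mapValues avoids repeating the key in the output and makes it explicit that the keys, and therefore the partitioning, stay unchanged.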
Method 2
package wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Spark02_WordCount {

  def main(args: Array[String]): Unit = {
    // TODO Establish a local connection to Spark
    val sparConf = new SparkConf().setMaster("local").setAppName("WordCount")
    val sc = new SparkContext(sparConf) // Spark instance

    // TODO Execute the business logic
    // 1. Read the file, obtaining the data one line at a time
    val lines: RDD[String] = sc.textFile("datas/1.txt")

    // 2. Split each line into individual words
    val words: RDD[String] = lines.flatMap(_.split(" "))

    // 3. Pair each word with an initial count of 1
    val wordToOne = words.map(
      word => (word, 1)
    )

    // 4. Group the (word, 1) tuples by the word itself
    val wordGroup: RDD[(String, Iterable[(String, Int)])] = wordToOne.groupBy(
      t => t._1
    )

    // 5. Within each group, reduce the tuples by summing their counts
    val wordToCount = wordGroup.map {
      case (word, list) =>
        list.reduce(
          (t1, t2) => (t1._1, t1._2 + t2._2)
        )
    }

    // 6. Collect the result to the driver and print it to the console
    val array: Array[(String, Int)] = wordToCount.collect()
    array.foreach(println)

    // TODO Close the Spark connection
    sc.stop()
  }
}
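The group-then-reduce step above carries the word inside every tuple even though the key already holds it. A sketch of the same aggregation that sums only the counts, reusing wordToOne from the listing above (wordToCountAlt is a hypothetical name):

// Same result as steps 4-5: group by word, then sum the 1s per group.
val wordToCountAlt: RDD[(String, Int)] =
  wordToOne
    .groupBy(t => t._1)                     // RDD[(String, Iterable[(String, Int)])]
    .mapValues(list => list.map(_._2).sum)  // keep the key, sum the counts

Either way, groupBy shuffles every (word, 1) tuple across the cluster before anything is summed, which is exactly the inefficiency Method 3 removes.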
Method 3
package wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Spark03_WordCount {

  def main(args: Array[String]): Unit = {
    // TODO Establish a local connection to Spark
    val sparConf = new SparkConf().setMaster("local").setAppName("WordCount")
    val sc = new SparkContext(sparConf) // Spark instance

    // TODO Execute the business logic
    // 1. Read the file, obtaining the data one line at a time
    val lines: RDD[String] = sc.textFile("datas/1.txt")

    // 2. Split each line into individual words
    val words: RDD[String] = lines.flatMap(_.split(" "))

    // 3. Pair each word with an initial count of 1
    val wordToOne = words.map(
      word => (word, 1)
    )

    // 4. The Spark framework provides richer operators: grouping and
    //    aggregation can be done with a single method.
    //    reduceByKey: for records with the same key, reduce the values together
    val wordToCount = wordToOne.reduceByKey(_ + _)

    // 5. Collect the result to the driver and print it to the console
    val array: Array[(String, Int)] = wordToCount.collect()
    array.foreach(println)

    // TODO Close the Spark connection
    sc.stop()
  }
}
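Because each step returns an RDD, the whole job can also be chained into a single pipeline, optionally sorting by count before collecting. A sketch under the same sc and input path as above (topWords is a hypothetical name):

// One-expression variant of the same job, highest counts first.
val topWords: Array[(String, Int)] =
  sc.textFile("datas/1.txt")
    .flatMap(_.split(" "))
    .map(word => (word, 1))
    .reduceByKey(_ + _)
    .sortBy(_._2, ascending = false)
    .collect()
topWords.foreach(println)

Unlike groupBy, reduceByKey pre-aggregates within each partition before the shuffle (a map-side combine), so only partial sums travel across the network.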
Run output:
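All three programs print the same word counts. As an illustration only (the actual contents of datas/1.txt are not shown in the post), if the file held the two lines "hello world" and "hello spark", the console would print, in some order:

(hello,2)
(world,1)
(spark,1)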