SparkCore Series (2): RDD aggregation operations, and aggregation operations between RDDs
I: RDD aggregation operations
1.count
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setAppName("HelloWorld").setMaster("local")
val sc = new SparkContext(conf)
val dataLength = sc.textFile("/software/java/idea/data")
  .flatMap(x => x.split("\\|")).count() // analogous to an array's length
println(dataLength)
2.countByValue
val initialScores1: Array[(String, Double)] =
Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
val data1 = sc.parallelize(initialScores1)
println(data1.countByValue()) // counts occurrences, using each element itself as the key
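For reference, a quick sketch of what countByValue returns on the four pairs above: it brings a local Map back to the driver, keyed by the whole element.

// countByValue returns scala.collection.Map[T, Long] on the driver; the whole pair is the key
val counts = data1.countByValue()
counts.foreach(println) // e.g. ((A,88.0),1), ((B,95.0),1), ((C,91.0),1), ((D,93.0),1)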
3.reduce
val conf = new SparkConf().setAppName("HelloWorld").setMaster("local")
val sc = new SparkContext(conf)
val sum = sc.textFile("/software/java/idea/data")
  .flatMap(x => x.split("\\|")).map(x => x.toInt).reduce((x, y) => x + y) // analogous to summing an array
println(sum)
4.reduceByKey
sc.textFile("/software/java/idea/data")
  .flatMap(x => x.split("\\|")).map(x => (x.toInt, 1))
  .reduceByKey((x, y) => x + y)
  .collect()
  .foreach(println) // reduceByKey combines on the map side before the shuffle; widely used in real development
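A minimal, self-contained sketch of the same pattern on made-up data; because partial sums are merged on the map side, far less data crosses the shuffle than with groupByKey:

val words = sc.parallelize(Seq("a", "b", "a", "c", "b", "a"))
words.map(w => (w, 1))
  .reduceByKey(_ + _) // (a,3), (b,2), (c,1)
  .collect()
  .foreach(println)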
5.sortByKey
val conf = new SparkConf().setAppName("HelloWorld").setMaster("local")
val sc = new SparkContext(conf)
val data = sc.textFile("/software/java/idea/data")
  .flatMap(x => x.split("\\|")).map(x => (x.toInt, 1)).sortByKey(true) // true ascending, false descending
data.collect().foreach(println)
6.countByKey (rarely used in production)
val data = sc.textFile("/software/java/idea/data")
.flatMap(x=>x.split("\\|")).map(x=>(x.toInt,1))
.countByKey() // returns a local Map: key -> number of occurrences of that key
println(data)
7.collectAsMap (rarely used in production)
val data = sc.textFile("/software/java/idea/data")
.flatMap(x=>x.split("\\|")).map(x=>(x.toInt,1))
.collectAsMap() // returns the pairs as a local Map; duplicate keys keep only one value
println(data)
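One gotcha worth a sketch: with duplicate keys, collectAsMap silently keeps a single entry per key, so data can disappear.

val m = sc.parallelize(Seq((1, "a"), (1, "b"), (2, "c"))).collectAsMap()
println(m) // only one value survives for key 1, e.g. Map(2 -> c, 1 -> b)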
8.fold
val data = sc.textFile("/software/java/idea/data")
.flatMap(x=>x.split("\\|")).map(x=>x.toInt)
.fold(100)((x, y) => x + y) // aggregation with an initial (zero) value
println(data)
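Note that the zero value is applied once per partition and once more when the partition results are merged, so it is added in more than once. A sketch with explicit partitioning:

// 2 partitions: (100+1+2) + (100+3+4), merged with the zero once more: 100 + 103 + 107
val folded = sc.parallelize(1 to 4, 2).fold(100)(_ + _)
println(folded) // 310, not 110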
9.groupByKey
sc.textFile("/software/java/idea/data")
  .flatMap(x => x.split("\\|")).map(x => (x.toInt, 1))
  .groupByKey()
  .collect()
  .foreach(println) // the value is an Iterable; use this when you need to loop over all values per key
10.aggregate
// custom aggregation with a user-supplied zero value and two functions
// 1st argument: the zero value (2 here); it seeds seqOp in every partition and the final combine
// 2nd argument: seqOp, folds elements into the accumulator within a partition
// 3rd argument: combOp, merges the per-partition results
val sum = sc.textFile("/software/java/idea/data")
.flatMap(x=>x.split("\\|")).map(x=>x.toInt)
.aggregate(2)(pfun1,pfun2)
println(sum)
def pfun1(p1: Int, p2: Int): Int = { // seqOp: within a partition
  println("p1:" + p1 + " p2:" + p2)
  p1 * p2
}
def pfun2(p3: Int, p4: Int): Int = { // combOp: merge the per-partition results
  p3 + p4
} // multiplies within each partition, then sums the partition products

def pfun3(p1: (Int, Int), p2: Int): (Int, Int) = { // seqOp: accumulate (count, sum)
  (p1._1 + 1, p1._2 + p2)
}
def pfun4(p1: (Int, Int), p2: (Int, Int)): (Int, Int) = { // combOp: merge (count, sum) pairs
  (p1._1 + p2._1, p1._2 + p2._2)
}
val avg = sc.textFile("/software/java/idea/data")
.flatMap(x=>x.split("\\|")).map(x=>x.toInt)
.aggregate((0, 0))(pfun3, pfun4)
println(avg._2/avg._1) //avg
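The same average as a self-contained sketch on made-up numbers; the zero (0, 0) stands for (count, sum) and, as with fold, seeds every partition:

val nums = sc.parallelize(Seq(1, 2, 3, 4, 5))
val (cnt, total) = nums.aggregate((0, 0))(
  (acc, n) => (acc._1 + 1, acc._2 + n), // seqOp: fold one element into (count, sum)
  (a, b) => (a._1 + b._1, a._2 + b._2)  // combOp: merge per-partition accumulators
)
println(total.toDouble / cnt) // 3.0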
11.combineByKey
type MVType = (Int, Int)
sc.textFile("/software/java/idea/data")
  .flatMap(x => x.split("\\|")).map(x => (x.toInt, 1))
  .combineByKey(
    score => (score, 1),                                        // createCombiner: first value seen for a key in a partition -> (sum, count)
    (c1: MVType, newScore) => (c1._1 + newScore, c1._2 + 1),    // mergeValue: another value for a key already seen in this partition
    (c1: MVType, c2: MVType) => (c1._1 + c2._1, c1._2 + c2._2)  // mergeCombiners: merge results for the same key across partitions
  )
  .collect()
  .foreach(println) // each value is a (sum, count) pair per key
// similar in spirit to aggregate, but applied per key
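A self-contained sketch of the classic use case, per-key averages on made-up scores:

val scores = sc.parallelize(Seq(("A", 88.0), ("A", 92.0), ("B", 95.0)))
scores.combineByKey(
    (v: Double) => (v, 1),                                             // first value for a key in a partition
    (acc: (Double, Int), v: Double) => (acc._1 + v, acc._2 + 1),       // later values in the same partition
    (a: (Double, Int), b: (Double, Int)) => (a._1 + b._1, a._2 + b._2) // merge across partitions
  )
  .mapValues { case (sum, cnt) => sum / cnt }
  .collect()
  .foreach(println) // (A,90.0), (B,95.0)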
II: Aggregation operations between RDDs
1.union
val initialScores1 = Array(("A", 88.0), ("B", 95.0), ("C", 91.0))
val data1 = sc.parallelize(initialScores1)
val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
val data2 = sc.parallelize(initialScores2)
data1.union(data2).collect().foreach(println) // SQL UNION ALL (duplicates are kept)
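Note that Spark's union does not deduplicate, so it matches SQL UNION ALL; chain distinct() for SQL UNION semantics. A tiny sketch:

val u = sc.parallelize(Seq(1, 2)).union(sc.parallelize(Seq(2, 3)))
println(u.collect().mkString(", "))                   // 1, 2, 2, 3
println(u.distinct().collect().sorted.mkString(", ")) // 1, 2, 3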
2.intersection
val initialScores1 = Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
val data1 = sc.parallelize(initialScores1)
val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
val data2 = sc.parallelize(initialScores2)
data1.intersection(data2).collect().foreach(println) // SQL INTERSECT
3.join
val initialScores1: Array[(String, Double)] =
Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
val data1 = sc.parallelize(initialScores1)
val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
val data2 = sc.parallelize(initialScores2)
data1.join(data2).collect().foreach(println)
//SQL INNER JOIN
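For the data above only "D" appears on both sides; the result element type is (K, (V, W)):

val joined = data1.join(data2) // RDD[(String, (Double, Double))]
joined.collect().foreach(println) // (D,(93.0,93.0))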
4.subtract
val initialScores1 = Array(("A", 88.0), ("B", 95.0), ("C", 91.0),(
val data1 = sc.parallelize(initialScores1)
val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
val data2 = sc.parallelize(initialScores2)
data1.subtract(data2).collect().foreach(println) // SQL EXCEPT / LEFT ANTI: drops elements of data1 that also appear in data2
5.subtractByKey
val initialScores1: Array[(String, Double)] =
Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
val data1 = sc.parallelize(initialScores1)
val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
val data2 = sc.parallelize(initialScores2)
data1.subtractByKey(data2).collect().foreach(println)
// drops elements of data1 whose key appears in data2; the pair-RDD counterpart of subtract
6.rightOuterJoin
val initialScores1: Array[(String, Double)] =
Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
val data1 = sc.parallelize(initialScores1)
val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
val data2 = sc.parallelize(initialScores2)
data1.rightOuterJoin(data2).collect().foreach(println)
// right outer join: keeps every key of data2; left-side values arrive as Option
7.leftOuterJoin
val initialScores1: Array[(String, Double)] =
Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
val data1 = sc.parallelize(initialScores1)
val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
val data2 = sc.parallelize(initialScores2)
data1.leftOuterJoin(data2).collect().foreach(println)
// left outer join: keeps every key of data1; right-side values arrive as Option
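Since right-side values come back as Option, a common follow-up is to default the misses:

data1.leftOuterJoin(data2) // RDD[(String, (Double, Option[Double]))]
  .mapValues { case (l, r) => (l, r.getOrElse(0.0)) } // (A,(88.0,0.0)) ... (D,(93.0,93.0))
  .collect()
  .foreach(println)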
8.cartesian
val initialScores1 = Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D",
val data1 = sc.parallelize(initialScores1)
val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
val data2 = sc.parallelize(initialScores2)
data1.cartesian(data2).collect().foreach(println) // Cartesian product: every element of data1 paired with every element of data2, regardless of key
9.cogroup
val initialScores1: Array[(String, Double)] =
Array(("A", 88.0), ("B", 95.0), ("C", 91.0),("D", 93.0))
val data1 = sc.parallelize(initialScores1)
val initialScores2 = Array(("D", 93.0), ("E", 95.0), ("F", 98.0))
val data2 = sc.parallelize(initialScores2)
data1.cogroup(data2).collect().foreach(println)
// groups the values from both RDDs by key, one entry per distinct key (the building block of join)
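The result type is (K, (Iterable[V], Iterable[W])); converting the iterables to lists makes the output easier to read:

data1.cogroup(data2)
  .mapValues { case (l, r) => (l.toList, r.toList) }
  .collect()
  .foreach(println) // e.g. (A,(List(88.0),List())), (D,(List(93.0),List(93.0)))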
Years of hauling bricks and still no mastery, so I turn to the source code in hope of finding the true scriptures.