头代码
// Head code shared by every snippet in this article: logging setup plus
// SparkContext creation.
import org.apache.log4j.{Level, Logger}
// HashPartitioner is required by the repartitionAndSortWithinPartitions demo
// further down; the original head code omitted the import.
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

// Silence noisy framework logging so the demo output stays readable.
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

// Run locally on all available cores; no cluster is needed for these demos.
val conf = new SparkConf()
  .setAppName("rdd Test")
  .setMaster("local[*]")

val sc = new SparkContext(conf)
RDD创建
| |
// Create an RDD from an in-memory Scala collection.
| val rdd = sc.parallelize(Array(1, 2, 3, 4, 5)) |
| |
| |
// Create an RDD from a text file: explicit HDFS URI, or a bare path resolved
// against the default filesystem (presumably HDFS here — depends on config).
| val rdd = sc.textFile("hdfs://master:9000/words.txt") |
| val rdd = sc.textFile("/words.txt") |
| |
| |
// Create an RDD from a local Windows file, with and without the file:// scheme.
// NOTE(review): the repeated `val rdd` definitions shadow each other; each
// snippet is meant to be run on its own (e.g. in the spark-shell).
| val rdd = sc.textFile("file:///D:\\words.txt") |
| val rdd = sc.textFile("D:\\words.txt") |
| |
| |
// Print every element (runs on the executors; element order is not guaranteed).
| rdd.foreach(println) |
保存RDD数据
| |
// Save an RDD as text to HDFS (the path becomes a directory of part files).
| sc.parallelize(1 to 10).saveAsTextFile("hdfs://master:9000/sparkFile.txt") |
| |
| |
// Save to the local filesystem, with and without the file:// scheme.
| sc.parallelize(1 to 10).saveAsTextFile("file:///D:/sparkFile.txt") |
| sc.parallelize(1 to 10).saveAsTextFile("D:/sparkFile.txt") |
RDD数据类型转换
// collectAsMap turns a pair RDD into a local Map; later duplicate keys
// overwrite earlier ones, so only one value survives per key.
| val rdd = sc.parallelize(List((1, "a"), (1, "b"), (2, "c"), (2, "d"))) |
| |
| println(rdd.collectAsMap()) |
// keyBy derives a key from each element; here the key is the word's length.
| val rdd4 = sc.parallelize(List("dog", "wolf", "cat", "bear"), 2) |
| |
| println(rdd4.keyBy(x => x.length).collect.foreach(print)) |
// Sample output of the keyBy call above:
| (3,dog)(4,wolf)(3,cat)(4,bear) |
RDD逻辑操作方法
// Set-style operations on two overlapping RDDs.
| val rdd1 = sc.parallelize(Array(1, 2, 3, 4, 5)) |
| val rdd2 = sc.parallelize(Array(3, 4, 5, 6, 7)) |
| |
// union concatenates the two RDDs and keeps duplicates.
| println(rdd1.union(rdd2).collect.foreach(print)) |
| |
// intersection: elements present in both (3, 4, 5).
| println(rdd1.intersection(rdd2).collect.foreach(print)) |
| |
// subtract: elements of rdd1 that are not in rdd2 (1, 2).
| println(rdd1.subtract(rdd2).collect.foreach(print)) |
去重
| |
// distinct removes duplicate elements; works for any element type.
| sc.parallelize(List(1, 1, 2, 2, 3, 3)).distinct.foreach(println) |
| sc.parallelize(List("aa bb", "cc dd", "aa bb", "cc dd")).distinct().foreach(println) |
分组
| |
// Word-count style grouping: map each word to (word, 1), then group by key.
| sc.parallelize(List("a", "a", "b", "c", "c", "d", "e", "f")) |
| .map((_, 1)) |
| .groupByKey() |
| .foreach(println) |
| |
// Related pair-RDD merge operations (groupWith is an alias of cogroup);
// their output format is shown in the join section further down.
| RDD合并的: |
| |
| rdd1.cogroup(rdd2) |
| rdd1.groupWith(rdd2) |
// Sample output of the groupByKey call above (order is arbitrary):
| (d,CompactBuffer(1)) |
| (e,CompactBuffer(1)) |
| (a,CompactBuffer(1, 1)) |
| (b,CompactBuffer(1)) |
| (f,CompactBuffer(1)) |
| (c,CompactBuffer(1, 1)) |
| |
/** Per-partition (seqOp) function for the aggregateByKey demo below:
  * logs its arguments, then keeps the larger of the two values.
  *
  * @param a the running accumulator (starts at the zero value)
  * @param b the next value for the key in this partition
  * @return the maximum of the two
  */
def fun1(a: Int, b: Int): Int = {
  // Trace the call so the sample output shows how Spark applies the seqOp.
  println("fun1: " + a + " " + b)
  // Bare `max` is not in scope by default in Scala; qualify it as math.max.
  math.max(a, b)
}
/** Cross-partition (combOp) function for the aggregateByKey demo:
  * logs its arguments, then returns their sum.
  *
  * @param a one per-partition result
  * @param b another per-partition result
  * @return the sum of the two
  */
def fun2(a: Int, b: Int): Int = {
  val combined = a + b
  // Trace the call so the sample output shows when the combiner runs.
  println("fun2: " + a + " " + b)
  combined
}
// aggregateByKey(zero)(seqOp, combOp): fun1 (max) folds values within each of
// the two partitions starting from the zero value 3; fun2 (sum) then merges
// the per-partition results for each key.
| sc.parallelize(List((1, 1), (1, 2), (2, 1), (2, 3), (2, 4), (1, 7)), 2) |
| .aggregateByKey(3)(fun1, fun2) |
| .foreach(print) |
// Sample trace and result — note the zero value 3 re-appears once per key
// per partition:
| fun1: 3 1 |
| fun1: 3 2 |
| fun1: 3 1 |
| fun1: 3 3 |
| fun1: 3 4 |
| fun1: 3 7 |
| fun2: 3 4 |
| (2,7) |
| fun2: 3 7 |
| (1,10) |

累积运算(reduce)
// Classic word count: reduceByKey sums the 1s attached to each word.
| val wordMap = sc.parallelize(List("a", "a", "b", "c", "c", "d", "e", "f")).map((_, 1)) |
| |
| println(wordMap.foreach(print)) |
| |
| wordMap.reduceByKey(_ + _).foreach(println) |
// Sample output of the two actions above (order is arbitrary):
| (a,1)(a,1)(b,1)(c,1)(c,1)(d,1)(e,1)(f,1) |
| (d,1) |
| (e,1) |
| (a,2) |
| (b,1) |
| (f,1) |
| (c,2) |
| |
// reduce folds the whole RDD to a single value: 1+2+...+10 = 55.
| println(sc.parallelize(1 to 10).reduce(_+_)) |
// foldByKey folds the values of each key starting from the zero value "~".
// With 2 partitions the fold runs within each partition first, then the
// per-partition strings are concatenated.
| val rdd3 = sc.parallelize(List("dog", "wolf", "cat", "bear"), 2) |
| |
| rdd3.map(x => (x.length, x)) |
| .foldByKey("~")((x,y) => {println("fun",x,y); x + y}) |
| .collect |
| .foreach(println) |
// Sample trace: first the per-partition folds, then the cross-partition
// merges, then the final result:
| (fun,~,dog) |
| (fun,~,wolf) |
| |
| (fun,~,cat) |
| (fun,~,bear) |
| |
| (fun,~wolf,~bear) |
| (fun,~dog,~cat) |
| |
| (4,~wolf~bear) |
| (3,~dog~cat) |

筛选RDD元素
// filterByRange keeps pairs whose key falls within the inclusive range
// [lower, upper]; it requires an ordering on the key (String has one).
| val rdd1 = sc.parallelize(List(("a", 1), ("b", 2), ("c", 3), ("a", 4), ("b", 5), ("d", 6))) |
| |
| println(rdd1.filterByRange("a", "b").collect.foreach(print)) |
拆分RDD的Map
// flatMapValues splits each value and pairs every piece with the original key.
| val rdd2 = sc.parallelize(List(("fruit", "apple,banana,pear"), ("animal", "pig,cat,dog,tiger"))) |
| |
| rdd2.flatMapValues(_.split(",")).collect.foreach(println) |
// Sample output:
| (fruit,apple) |
| (fruit,banana) |
| (fruit,pear) |
| (animal,pig) |
| (animal,cat) |
| (animal,dog) |
| (animal,tiger) |
统计个数
| |
// count: total number of elements (10).
| println(sc.parallelize(1 to 10).count()) |
| |
| |
// countByKey: number of pairs per key, returned as a local Map.
| sc.parallelize(List((1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3))) |
| .countByKey() |
| .foreach(print) |
选取元素
| |
// first: the first element of the RDD.
| println(sc.parallelize(1 to 10).first()) |
| |
| |
// take(3): the first three elements.
| println(sc.parallelize(1 to 10).take(3).foreach(print)) |
| |
| |
// takeSample without replacement: asking for 7 of only 5 elements returns
// all 5, in random order.
| println(sc.parallelize(1 to 5).takeSample(false, 7).foreach(print)) |
| |
| |
// takeSample with replacement: duplicates are allowed, so all 7 come back.
| println(sc.parallelize(1 to 5).takeSample(true, 7).foreach(print)) |
| |
| |
// takeOrdered(12) on 10 elements returns all 10 in ascending order.
| println(sc.parallelize(1 to 10).takeOrdered(12).foreach(print)) |
// Sample outputs of the five actions above (the sampling lines are random):
| 1 |
| 123 |
| 42153 |
| 2233554 |
| 12345678910 |
排序
// sortByKey(false): sort the pairs by key, descending.
| sc.parallelize(List(1, 2, 5, 7, 3, 0)).map((_, "v")) |
| .sortByKey(false) |
| .foreach(print) |
| |
| |
// sortBy with a custom key extractor (the tuple's second field), descending.
| sc.parallelize(List(1, 2, 5, 7, 3, 0)).map(( "v",_)) |
| .sortBy(_._2,false) |
| .foreach(print) |
// Sample outputs:
| (7,v)(5,v)(3,v)(2,v)(1,v)(0,v) |
| |
| (v,7)(v,5)(v,3)(v,2)(v,1)(v,0) |
RDD合并
| |
// Two pair RDDs used by all the join demos below. Note that rdd2 mixes value
// shapes ((4,1) versus a plain Int), so its value type widens to Any.
| val rdd1 = sc.parallelize(List("a"-> 1,"b" -> 2,"c" -> 3)) |
| |
| val rdd2 = sc.parallelize(List(("a", (4,1)), ("b", 5), ("c", 6), ("d", 7))) |
| |
// Inner join: only keys present in both RDDs ("a", "b", "c").
| rdd1.join(rdd2).foreach(println) |
| |
| |
// Left outer join: every rdd1 key; the rdd2 side is wrapped in Option.
| rdd1.leftOuterJoin(rdd2).foreach(println) |
| |
| |
// Right outer join: every rdd2 key; the rdd1 side is wrapped in Option
// ("d" has no rdd1 match, hence None).
| rdd1.rightOuterJoin(rdd2).foreach(println) |
| |
// Sample outputs of the three joins above:
| (a,(1,(4,1))) |
| (b,(2,5)) |
| (c,(3,6)) |
| |
| (a,(1,Some((4,1)))) |
| (b,(2,Some(5))) |
| (c,(3,Some(6))) |
| |
| (d,(None,7)) |
| (a,(Some(1),(4,1))) |
| (b,(Some(2),5)) |
| (c,(Some(3),6)) |
| |
// cogroup groups the values from both RDDs per key; groupWith is an alias,
// so the two calls below print the same result (shown after this block).
| rdd1.cogroup(rdd2).foreach(println) |
| |
| |
| rdd1.groupWith(rdd2).foreach(println) |
| |
// Cartesian product: every combination of an rdd3 and an rdd4 element.
| val rdd3 = sc.parallelize(Array(1, 2, 3)) |
| val rdd4 = sc.parallelize(Array((4,5), (6,7))) |
| |
| rdd3.cartesian(rdd4).foreach(print) |
| (d,(CompactBuffer(),CompactBuffer(7))) |
| (a,(CompactBuffer(1),CompactBuffer((4,1)))) |
| (b,(CompactBuffer(2),CompactBuffer(5))) |
| (c,(CompactBuffer(3),CompactBuffer(6))) |
| |
| (d,(CompactBuffer(),CompactBuffer(7))) |
| (a,(CompactBuffer(1),CompactBuffer((4,1)))) |
| (b,(CompactBuffer(2),CompactBuffer(5))) |
| (c,(CompactBuffer(3),CompactBuffer(6))) |
| |
| (1,(4,5))(1,(6,7))(2,(4,5))(2,(6,7))(3,(4,5))(3,(6,7)) |
RDD分区
| |
| |
| |
// coalesce shrinks 4 partitions down to 3.
| sc.parallelize(1 to 10,4) |
| .coalesce(3) |
| .foreach(print) |
| |
| |
// repartition reshuffles the data; here 3 partitions become 2.
| sc.parallelize(1 to 10, 3) |
| .repartition(2) |
| .foreach(print) |
| |
| |
// repartitionAndSortWithinPartitions: hash-partition by key into 4 partitions
// and sort by key inside each partition.
// NOTE(review): HashPartitioner must be imported from org.apache.spark —
// confirm the head imports include it.
| sc.parallelize( |
| List((2, 3), (1, 3), (1, 2), (5, 4), (1, 4), (2, 4)), 5) |
| .repartitionAndSortWithinPartitions(new HashPartitioner(4) |
| ).foreach(print) |
| |
| 12 345 678910 |
| |
| 134679 25810 |
| |
| (1,3)(1,2)(1,4)(5,4) (2,3)(2,4) |
分区运算
// aggregate(zero)(seqOp, combOp) over 6 elements in 3 partitions.
| val rdd = sc.parallelize(Array( 2, 3, 4,5,6,7), 3) |
| |
| |
// With zero 0 this is a plain sum: 27.
| println(rdd.aggregate(0)(_ + _, _ + _)) |
| |
// With zero 1 the zero value is applied once per partition (3 times) AND once
// more when combining, so the result is 27 + 4 = 31 — see the trace below.
| println(rdd.aggregate(1)((x,y) => {println("fun1",x,y); x + y},(x,y) => {println("fun2",x,y); x + y})) |
| |
| println(rdd.aggregate(1)(_ + _, _ + _)) |
| 27 |
| (fun1,1,2) |
| (fun1,3,3) |
| (fun2,1,6) |
| |
| (fun1,1,4) |
| (fun1,5,5) |
| (fun2,7,10) |
| |
| (fun1,1,6) |
| (fun1,7,7) |
| (fun2,17,14) |
| 31 |
| 31 |

// mapPartitions transforms each partition's iterator as a whole; here every
// element is multiplied by 10.
| val rdd2 = sc.parallelize(List(1, 2, 3, 4, 5), 3) |
| |
| println(rdd2.mapPartitions(_.map(_ * 10)).collect.foreach(print)) |
【推荐】还在用 ECharts 开发大屏?试试这款永久免费的开源 BI 工具!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 一次Java后端服务间歇性响应慢的问题排查记录
· dotnet 源代码生成器分析器入门
· ASP.NET Core 模型验证消息的本地化新姿势
· 对象命名为何需要避免'-er'和'-or'后缀
· SQL Server如何跟踪自动统计信息更新?
· “你见过凌晨四点的洛杉矶吗?”--《我们为什么要睡觉》
· 编程神器Trae:当我用上后,才知道自己的创造力被低估了多少
· C# 从零开始使用Layui.Wpf库开发WPF客户端
· C#/.NET/.NET Core技术前沿周刊 | 第 31 期(2025年3.17-3.23)
· 接口重试的7种常用方案!