DataFrame API
package com.shujia.spark.sql

import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Window

object Demo3DataFrameApi {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("source")
      .master("local")
      .config("spark.sql.shuffle.partitions", "1")
      .getOrCreate()

    // import implicit conversions (enables the $"col" syntax)
    import spark.implicits._
    // import all Spark SQL functions
    import org.apache.spark.sql.functions._

    val student: DataFrame = spark.read.json("data/students.json")

    /**
      * show: behaves like an action
      */
    // shows the first 20 rows by default
    student.show()
    // show a specified number of rows
    student.show(100)
    // show full column contents without truncation
    student.show(false)

    /**
      * select
      */
    // using column objects
    student.select($"name", $"age" + 1 as "age").show()
    // using column names; expressions are not allowed this way
    student.select("name", "clazz").show()
    // using SQL expressions
    student.selectExpr("name", "age + 1 as age").show()
    student.select(expr("name"), expr("age + 1") as "age").show()

    /**
      * where
      */
    student.where($"age" > 23).show()
    student.where("age < 22").show()

    /**
      * group
      */
    student.groupBy($"clazz").count().show()
    student.groupBy("clazz").count().show()

    /**
      * agg: aggregation
      */
    student.agg(count($"id") as "num").show()

    student.groupBy("clazz")
      .agg(count("clazz") as "num")
    //  .show()

    student.groupBy("clazz")
      .agg(avg($"age") as "avgAge")
    //  .show()

    /**
      * join
      */
    val score: DataFrame = spark
      .read
      .format("csv")
      .option("sep", ",")
      .schema("id STRING,cid STRING,sco DOUBLE")
      .load("data/score.txt")

    // give the DataFrames aliases
    val stuAS: Dataset[Row] = student.as("stu")
    val scoAS: Dataset[Row] = score.as("sco")

    // specify the join condition explicitly
    val joinDF: DataFrame = stuAS.join(scoAS, $"stu.id" === $"sco.id", "inner")
    joinDF.show()

    // specify the join column
    stuAS.join(scoAS, List("id"), "inner").show()

    /**
      * sort
      */
    score
      .groupBy($"id")
      .agg(sum($"sco") as "sumSco")
      .sort($"sumSco".desc) // descending
      .select($"id", $"sumSco")
    //  .show()

    /**
      * over: window functions
      */
    // top 10 students by total score in each class
    student
      .join(score, "id")
      .groupBy($"id", $"clazz")
      .agg(sum($"sco") as "sumSco")
      .select($"id", $"clazz", $"sumSco",
        row_number() over Window.partitionBy($"clazz").orderBy($"sumSco".desc) as "r")
      .where($"r" <= 10)
    //  .show()

    /**
      * withColumn: add a column
      */
    student
      .join(score, "id")
      .groupBy($"id", $"clazz")
      .agg(sum($"sco") as "sumSco")
      .withColumn("r", row_number() over Window.partitionBy($"clazz").orderBy($"sumSco".desc))
      .where($"r" <= 10)
    //  .show()

    /**
      * sql
      */
    student.createOrReplaceTempView("student")
    score.createOrReplaceTempView("score")

    val resultDF: DataFrame = spark.sql(
      """
        |select * from (
        |select id,clazz,sumSco,row_number() over(partition by clazz order by sumSco desc) as r from (
        |select a.id,a.clazz,sum(b.sco) as sumSco from
        |student as a
        |join
        |score as b
        |on a.id=b.id
        |group by a.id,a.clazz) as c ) as d
        |where r<=10
        |""".stripMargin)

    //resultDF.show()

    /**
      * explode
      */
    val linesDF: DataFrame = spark.read
      .format("csv")
      .option("sep", "\t")
      .schema("lines STRING")
      .load("data/words.txt")

    linesDF
      .select(explode(split($"lines", ",")) as "word")
      .groupBy($"word")
      .agg(count($"word") as "c")
      .show()

    /**
      * map join
      *
      * tables around 100 MB or smaller are reasonable broadcast candidates
      */
    student.hint("broadcast").join(score, "id").show()
  }
}
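The original post does not include the input files under data/, so here is a hypothetical sketch of the shapes the code above expects; the field names come from the schemas and selects used in the demo, while every value is invented for illustration:

data/students.json  (one JSON object per line; fields id, name, age, clazz)
  {"id":"1001","name":"zhangsan","age":22,"clazz":"class01"}
data/score.txt      (comma-separated: id, cid, sco)
  1001,c01,98.0
data/words.txt      (one row per line, each a comma-separated list of words)
  java,spark,hadoop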
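A follow-up on the map join at the end: hint("broadcast") marks the hinted side as a broadcast candidate so Spark can run a map-side (broadcast hash) join instead of a shuffle join. A minimal sketch of two related idioms, assuming score is the small table; the functions.broadcast call and the spark.sql.autoBroadcastJoinThreshold config are standard Spark SQL APIs, not part of the original post:

    import org.apache.spark.sql.functions.broadcast

    // explicitly broadcast the small side of the join
    student.join(broadcast(score), "id").show()

    // Spark also broadcasts automatically below a size threshold
    // (the default is 10 MB; raise it if the small table is larger)
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 100L * 1024 * 1024)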