Learning from the Spark SQL examples (2)
UserDefinedUntypedAggregate.scala (the aggregation callbacks return Unit by default, and this cannot be changed)
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
object UserDefinedUntypedAggregate {
// $example on: untyped_custom_aggregation$
object MyAverage extends UserDefinedAggregateFunction {
//Data types of input arguments of this aggregate function
def inputSchema: StructType = StructType(StructField("inputColumn", LongType) :: Nil)
//Data types of values in the aggregation buffer
def bufferSchema: StructType = {
StructType(StructField("sum", LongType) :: StructField("count", LongType) :: Nil)
}
//The data type of the returned value
def dataType: DataType = DoubleType
//Whether this function always returns the same output on the identical input
def deterministic: Boolean = true
// """
// |Initializes the given aggregation buffer.
// |The buffer itself is a `Row` that in addition to
// |standard method like retrieving a value at an index (e.g., get(), getBoolean()),
// |providesthe opportunity to update its values.
// |Note that arrays andmaps inside the buffer are still ummutable.
// """
def initialize(buffer: MutableAggregationBuffer): Unit = {
buffer(0) = 0L
buffer(1) = 0L
}
//Updates the given aggregation buffer `buffer` with new input data from `input`
def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
//isNullAt() -> Checks whether the value at position i is null.
if (!input.isNullAt(0)) {
buffer(0) = buffer.getLong(0) + input.getLong(0)
buffer(1) = buffer.getLong(1) + 1
}
}
//Merges two aggregation buffers and stores the updated buffer values back to `buffer1`
def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
}
// Calculates the final result
def evaluate(buffer: Row): Double = buffer.getLong(0).toDouble / buffer.getLong(1)
}
// $example off: untyped_custom_aggregation$
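// Worked trace (added for illustration, not part of the original example):
// for the inputs 1L, 2L and 3L, the buffer evolves as
// (sum, count) = (0, 0) -> (1, 1) -> (3, 2) -> (6, 3),
// so evaluate() returns 6.0 / 3 = 2.0.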
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder()
.master("local")
.appName("Spark SQL user-defined DataFrames aggregation example")
.getOrCreate()
// $example on: untyped_custom_aggregation$
//Register the function to access it
spark.udf.register("myAverage", MyAverage)
val df = spark.read.json("/Users/hadoop/app/spark/examples/src/main/resources/employees.json")
df.createOrReplaceTempView("employees")
df.show()
val result = spark.sql("SELECT myAverage(salary) as average_salary FROM employees")
result.show()
spark.stop()
}
}
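As a side note, once the UDAF is registered it can also be invoked through the DataFrame API instead of a SQL string. The sketch below is only an illustration, not part of the original example: it reuses the `df` DataFrame from main() above and calls the registered function via callUDF from org.apache.spark.sql.functions.

import org.apache.spark.sql.functions.{callUDF, col}

// Aggregate over the whole DataFrame with the registered "myAverage" UDAF
val result2 = df.agg(callUDF("myAverage", col("salary")).as("average_salary"))
result2.show()

If employees.json is the sample file bundled with the Spark distribution (salaries 3000, 4500, 3500 and 4000), both queries should report an average_salary of 3750.0.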
