Spark SQL 中的 example 学习(3)

UserDefinedTypedAggregation.scala(用户自定义的类型安全聚合函数)


import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

/**
 * Example of a type-safe user-defined aggregation over a Spark `Dataset`.
 *
 * Reads employee records from a JSON file, shows them, and then computes and
 * shows the average salary using the custom [[MyAverage]] aggregator.
 */
object UserDefinedTypedAggregation {

  /** Input record: one employee row as decoded from the JSON source. */
  case class Employee(name: String, salary: Long)

  /**
   * Intermediate aggregation buffer: running salary total and number of rows seen.
   *
   * The fields are `var`s because [[MyAverage]] updates the buffer in place.
   */
  case class Average(var sum: Long, var count: Long)

  /**
   * Typed aggregator that turns a collection of [[Employee]] rows into the
   * mean of their `salary` values.
   */
  object MyAverage extends Aggregator[Employee, Average, Double] {

    /**
     * The zero value for this aggregation; it must satisfy `b + zero = b` for any `b`.
     *
     * Declared as a `def` so that every call yields a fresh buffer; this matters
     * because `reduce` and `merge` mutate the buffer they are given.
     */
    def zero: Average = Average(0L, 0L)

    /**
     * Folds one employee into the buffer.
     *
     * For performance the buffer is modified in place and returned instead of
     * constructing a new object.
     *
     * @param buffer   running total and count accumulated so far
     * @param employee the next input row
     * @return the same `buffer` instance, updated
     */
    def reduce(buffer: Average, employee: Employee): Average = {
      buffer.sum += employee.salary
      buffer.count += 1
      buffer
    }

    /**
     * Merges two intermediate buffers.
     *
     * @param b1 first partial result; updated in place and returned
     * @param b2 second partial result; only read
     * @return `b1` holding the combined sum and count
     */
    def merge(b1: Average, b2: Average): Average = {
      b1.sum += b2.sum
      b1.count += b2.count
      b1
    }

    /**
     * Transforms the final buffer into the output value.
     *
     * The division is done in `Double`, so a buffer whose `count` is zero
     * yields `NaN` rather than throwing.
     *
     * @param reducetion the fully merged buffer
     * @return `sum / count` as a `Double`
     */
    def finish(reducetion: Average): Double = reducetion.sum.toDouble / reducetion.count

    /** Specifies the encoder for the intermediate value type. */
    def bufferEncoder: Encoder[Average] = Encoders.product[Average]

    /** Specifies the encoder for the final output value type. */
    def outputEncoder: Encoder[Double] = Encoders.scalaDouble
  }

  /** Input file used when no path is supplied on the command line. */
  private val DefaultEmployeesPath: String =
    "/Users/hadoop/app/spark/examples/src/main/resources/employees.json"

  /**
   * Runs the example.
   *
   * @param args optional; `args(0)` is the path of the employees JSON file.
   *             When absent, [[DefaultEmployeesPath]] is used.
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultEmployeesPath)

    val spark = SparkSession
      .builder()
      .appName("Spark SQL user-defined Datasets aggregation example")
      .master("local")
      .getOrCreate()

    // Stop the session even if reading or aggregating fails, so it is not leaked.
    try {
      import spark.implicits._

      val ds = spark.read.json(inputPath).as[Employee]
      ds.show()

      // Turn the aggregator into a typed column and give the result column a name.
      val averageSalary = MyAverage.toColumn.name("average_salary")
      val result = ds.select(averageSalary)
      result.show()
    } finally {
      spark.stop()
    }
  }

}

屏幕快照 2019-05-14 03.57.12

posted @   BBBone  阅读(140)  评论(0)  编辑  收藏  举报
编辑推荐:
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
点击右上角即可分享
微信分享提示