自定义聚合函数(统计每种行为的触发次数排名前三的商品id)

package SparkSQL.fun.project
import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, DataTypes, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
/**
 * For every behavior type, finds the three goods ids that triggered that
 * behavior most often.
 *
 * Pipeline: read the behavior CSV -> count events per (behavior, goodsId) ->
 * rank goods inside each behavior by descending count -> keep ranks 1 to 3.
 */
object BehaviorCode2 {

  /**
   * Runs the job on a local Spark session and prints each intermediate result.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("project01").setMaster("local[*]")
    val session = SparkSession.builder().config(sparkConf).getOrCreate()
    try {
      // Malformed rows are dropped; column types are inferred so that the
      // typed getters below (getInt / getString) can be used on each row.
      val map = Map("mode" -> "dropMalformed", "inferSchema" -> "true")
      val frame = session.read.options(map).csv("G:\\shixunworkspace\\sparkcode\\src\\main\\java\\SparkSQL\\fun\\project\\b.csv")

      // CSV column order: "userId", "goodsId", "categoryId", "behavior", "time"
      import session.implicits._
      val frame1: Dataset[UserBehaviorBean] = frame.map { row =>
        UserBehaviorBean(row.getInt(0), row.getInt(1),
          row.getInt(2), row.getString(3), row.getInt(4))
      }
      val frame3 = frame1.toDF("userId", "goodsId", "categoryId", "behavior", "time")
      // createOrReplaceTempView: getOrCreate() may hand back an existing
      // session in which these view names are already registered.
      frame3.createOrReplaceTempView("tmp")

      // Step 1: number of events per (behavior, goodsId) pair.
      val frame2 = session.sql("select behavior, goodsId, count(*) count from tmp group by behavior, goodsId")
      frame2.show()
      frame2.createOrReplaceTempView("tmp1")

      // Step 2: rank goods within each behavior, most frequent first.
      // The window is partitioned by behavior only: tmp1 has exactly one row
      // per (behavior, goodsId), so partitioning by both columns would give
      // every row rn = 1 and make the rn <= 3 filter below a no-op.
      // Goods with equal counts receive consecutive ranks in unspecified order.
      val frame4 = session.sql("select behavior, goodsId, count, row_number() over(partition by behavior order by count desc) rn from tmp1")
      frame4.show()
      frame4.createOrReplaceTempView("temp2")

      // Step 3: keep the top three goods of every behavior.
      val frame5 = session.sql("select behavior, goodsId, count, rn from temp2 where rn <= 3")
      frame5.show()
    } finally {
      // Release the session even when reading or querying fails.
      session.stop()
    }
  }
}
posted @   jsqup  阅读(23)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?
点击右上角即可分享
微信分享提示