使用 Spark SQL 窗口函数 row_number()(统计每种行为的触发次数排名前三的商品id)
package SparkSQL.fun.project

import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, DataTypes, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

/**
 * Computes, for each behavior type, the top-3 goods ids ranked by how many
 * times that behavior was triggered on them.
 *
 * Reads user-behavior records from a CSV file whose columns (by position) are
 * (userId, goodsId, categoryId, behavior, time), counts occurrences per
 * (behavior, goodsId) pair, ranks goods within each behavior by that count
 * (highest first), and keeps the three highest-ranked goods per behavior.
 */
object BehaviorCode2 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("project01").setMaster("local[*]")
    val session = SparkSession.builder().config(sparkConf).getOrCreate()

    // "dropMalformed" silently skips unparseable rows; "inferSchema" types the columns.
    val readOptions = Map("mode" -> "dropMalformed", "inferSchema" -> "true")
    val frame = session.read.options(readOptions)
      .csv("G:\\shixunworkspace\\sparkcode\\src\\main\\java\\SparkSQL\\fun\\project\\b.csv")

    // CSV column order: userId, goodsId, categoryId, behavior, time
    import session.implicits._
    val beans: Dataset[UserBehaviorBean] = frame.map(row => {
      UserBehaviorBean(row.getInt(0), row.getInt(1), row.getInt(2), row.getString(3), row.getInt(4))
    })
    val behaviorDf = beans.toDF("userId", "goodsId", "categoryId", "behavior", "time")
    behaviorDf.createTempView("tmp")

    // Step 1: count how many times each (behavior, goodsId) pair occurs.
    val counts = session.sql(
      "select behavior, goodsId, count(*) count from tmp group by behavior, goodsId")
    counts.show()
    counts.createTempView("tmp1")

    // Step 2: rank goods within each behavior by count, highest first.
    // BUG FIX: the original partitioned by (behavior, goodsId) — each partition
    // then contained exactly one aggregated row, so row_number() was always 1
    // and the final "rn <= 3" filter kept every row. It also ordered by count
    // ascending, which would rank the LEAST-triggered goods first. Partition by
    // behavior only and order by count desc so rn = 1 is the most-triggered goods.
    val ranked = session.sql(
      "select behavior, goodsId, count, " +
        "row_number() over(partition by behavior order by count desc) rn from tmp1")
    ranked.show()
    ranked.createTempView("temp2")

    // Step 3: keep the top three goods per behavior.
    val top3 = session.sql("select behavior, goodsId, count, rn from temp2 where rn <= 3")
    top3.show()

    session.stop()
  }
}
本文来自博客园,作者:jsqup,转载请注明原文链接:https://www.cnblogs.com/jsqup/p/16659672.html
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?