计算分位数
按 id 分组后,对 cnt 取 4 个分位数(0.25、0.5、0.75、0.90)的方法:
# Demo: per-group approximate quantiles of `cnt` using Spark SQL's
# percentile_approx aggregate.
# NOTE(review): assumes `spark` is an active SparkSession and `F` is
# `pyspark.sql.functions`; neither is defined in this snippet -- confirm.

# Sample data: id=1 carries cnt 1..10; id=2 carries cnt 1, 10, 100.
df1 = spark.createDataFrame(
    [
        (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
        (1, 6), (1, 7), (1, 8), (1, 9), (1, 10),
        (2, 1), (2, 10), (2, 100),
    ],
    ['id', 'cnt'],
)

# One aggregate expression per quantile level. The `cnt_med_*` names are kept
# as in the original even though only the 0.5 level is the median.
cnt_med_1 = F.expr('percentile_approx(cnt, 0.25)')
cnt_med_2 = F.expr('percentile_approx(cnt, 0.5)')
cnt_med_3 = F.expr('percentile_approx(cnt, 0.75)')
cnt_med_4 = F.expr('percentile_approx(cnt, 0.90)')

# Aggregate per `id`: the maximum of `cnt` plus the four quantile columns,
# then print the resulting table.
df1.groupBy('id').agg(
    F.max('cnt').alias('max_cnt'),
    cnt_med_1.alias('cnt_med_1'),
    cnt_med_2.alias('cnt_med_2'),
    cnt_med_3.alias('cnt_med_3'),
    cnt_med_4.alias('cnt_med_4'),
).show()