hive 窗口函数(一)
窗口函数可以对数据进行分组,将数据按组分成一个窗口。利用窗口函数可以对窗口内的数据进行排序,聚合操作,还可以通过window子句让窗口上下滑动,非常灵活。
数据准备
create table cookie_sum ( id varchar(5), createtime varchar(20), pv int ) row format delimited fields terminated by ',' stored as orc tblproperties ("orc.compress" = "snappy"); insert into cookie_sum(id, createtime, pv) values ('1001', '2022-04-10', 1), ('1001', '2022-04-11', 5), ('1001', '2022-04-12', 7), ('1001', '2022-04-13', 3), ('1001', '2022-04-14', 2), ('1001', '2022-04-15', 4), ('1001', '2022-04-16', 4), ('1002', '2022-04-10', 2), ('1002', '2022-04-11', 3), ('1002', '2022-04-12', 5), ('1002', '2022-04-13', 6), ('1002', '2022-04-14', 3), ('1002', '2022-04-15', 9), ('1002', '2022-04-16', 7);
一、sum
select id, createtime, pv, sum(pv) over (partition by id order by createtime rows between unbounded preceding and current row ) p1, sum(pv) over (partition by id order by createtime ) p2, sum(pv) over (partition by id ) p3, sum(pv) over (partition by id order by createtime rows between 3 preceding and 1 following ) p4, sum(pv) over (partition by id order by createtime rows between 3 preceding and current row ) p5, sum(pv) over (partition by id order by createtime rows between current row and unbounded following) p6 from cookie_sum where id = '1001';
结果
说明
pv1 rows between unbounded preceding and current row : 分组内从起点到当前行的pv累积,如,11号的pv1=10号的pv+11号的pv, 12号=10号+11号+12号 pv2: 分组内从起点到当前行的pv累积 同上 pv3: 分组内(id='1001') 所有的pv累加 pv4 rows between 3 preceding and 1 following : 分组内当前行+往前3行+往后1行,如,14号=11号+12号+13号+14号+15号=5+7+3+2+4=21 pv5 between 3 preceding and current row : 分组内当前行+往前3行到当前行,如,14号=11号+12号+13号+14号 =5+7+3+2 =17 pv6 rows between current row and unbounded following: 分组内当前行+往后所有行,如,13号=13号+14号+15号+16号=3+2+4+4=13,14号=14号+15号+16号=2+4+4=10 ****************************** 如果不指定rows between,默认为从起点到当前行; 如果不指定order by,则将分组内所有值累加;关键是理解 ROWS BETWEEN含义,也叫做WINDOW子句: PRECEDING:往前 FOLLOWING:往后 CURRENT ROW:当前行 UNBOUNDED:起点, unbounded preceding 表示从前面的起点, unbounded following:表示到后面的终点 –其他AVG,MIN,MAX,和SUM用法一样。
二、avg
select id, createtime, pv, avg(pv) over (partition by id order by createtime rows between unbounded preceding and current row) as pv1, --起始行到当前行求平均值 avg(pv) over (partition by id order by createtime) as pv2, --起始行到当前行求平均值 avg(pv) over (partition by id) as pv3, avg(pv) over (partition by id order by createtime rows between 3 preceding and current row) as pv4,--往前3行到当前行求均值 avg(pv) over (partition by id order by createtime rows between 3 preceding and 1 following) as pv5,--往前3行到当前行的后1求均值 avg(pv) over (partition by id order by createtime rows between current row and unbounded following) as pv6--当前行到末行求均值 from cookie_sum where id='1001';
结果
三、max
select id, createtime, pv, max(pv) over (partition by id order by createtime rows between unbounded preceding and current row) as pv1, -- 默认为从起点到当前行求最大值 max(pv) over (partition by id order by createtime) as pv2, --从起点到当前行,结果同pv1 max(pv) over (partition by id) as pv3, --分组内所有行求最大值 max(pv) over (partition by id order by createtime rows between 3 preceding and current row) as pv4, --当前行+往前3行求最大值 max(pv) over (partition by id order by createtime rows between 3 preceding and 1 following) as pv5, --当前行+往前3行+往后1行求最大值 max(pv) over (partition by id order by createtime rows between current row and unbounded following) as pv6 --当前行+往后所有行求最大值 from cookie_sum where id='1001';
结果
四、min
select id, createtime, pv, min(pv) over (partition by id order by createtime rows between unbounded preceding and current row) as pv1, -- 默认为从起点到当前行 min(pv) over (partition by id order by createtime) as pv2, --从起点到当前行,结果同pv1 min(pv) over (partition by id) as pv3, --分组内所有行 min(pv) over (partition by id order by createtime rows between 3 preceding and current row) as pv4, --当前行+往前3行 min(pv) over (partition by id order by createtime rows between 3 preceding and 1 following) as pv5, --当前行+往前3行+往后1行 min(pv) over (partition by id order by createtime rows between current row and unbounded following) as pv6 --当前行+往后所有行 from cookie_sum where id='1001';
结果