统计用户连续登陆最大的天数
注意: 断一天 也算连续登陆
1、创建建表语句
create table demo( id string, dt string ) row format delimited fields terminated by '\t';
2、数据准备
1001 2021-08-01 1001 2021-08-02 1001 2021-08-03 1001 2021-08-05 1001 2021-08-06 1001 2021-08-07 1001 2021-08-10 1001 2021-08-12
3、数据导入
load data local inpath '/home/hadoop/demo.txt' into table demo;
4、sql 语句
select id, flag, datediff(max(dt), min(dt)) + 1 as days from( select id, dt, lag_dt, dt_diff, sum(if(dt_diff > 2, 1, 0)) over(partition by id order by dt) as flag from( select id, dt, lag_dt, datediff(dt, lag_dt) as dt_diff from( select id, dt, lag(dt, 1, '1970-01-01') over(partition by id order by dt) as lag_dt from demo ) t1 ) t2 ) t3 group by id, flag having datediff(max(dt), min(dt)) + 1 > 6
5、sql 解析
首先将日期下移一位,用于得到当前日期和上一个日期之间相差的天数
select id, dt, lag_dt, datediff(dt, lag_dt) as dt_diff from( select id, dt, lag(dt, 1, '1970-01-01') over(partition by id order by dt) as lag_dt from demo ) t1
针对相差的天数大于2的做等差数列,区别是否连续天数
select id, dt, lag_dt, dt_diff, sum(if(dt_diff > 2, 1, 0)) over(partition by id order by dt) as flag -- 根据连续的定义更改 dt_diff > 2 值就可以 from( select id, dt, lag_dt, datediff(dt, lag_dt) as dt_diff from( select id, dt, lag(dt, 1, '1970-01-01') over(partition by id order by dt) as lag_dt from demo ) t1 ) t2
分组聚合,查找连续登陆天数的最大值
select id, flag, datediff(max(dt), min(dt)) + 1 as days from( select id, dt, lag_dt, dt_diff, sum(if(dt_diff > 2, 1, 0)) over(partition by id order by dt) as flag from( select id, dt, lag_dt, datediff(dt, lag_dt) as dt_diff from( select id, dt, lag(dt, 1, '1970-01-01') over(partition by id order by dt) as lag_dt from demo ) t1 ) t2 ) t3 group by id, flag