Hive 刷题——连续签到领金币问题
需求描述
有如下数据
create table temp_user_log ( uid int comment '用户id', artical_id int comment '视频id', in_time string comment '进入时间', out_time string comment '离开时间', sign_in int comment '是否签到' ) stored as orc tblproperties ("orc.compress" = "snappy"); INSERT INTO temp_user_log(uid, artical_id, in_time, out_time, sign_in) VALUES (101, 0, '2021-07-07 10:00:00', '2021-07-07 10:00:09', 1), (101, 0, '2021-07-08 10:00:00', '2021-07-08 10:00:09', 1), (101, 0, '2021-07-09 10:00:00', '2021-07-09 10:00:42', 1), (101, 0, '2021-07-10 10:00:00', '2021-07-10 10:00:09', 1), (101, 0, '2021-07-11 23:59:55', '2021-07-11 23:59:59', 1), (101, 0, '2021-07-12 10:00:28', '2021-07-12 10:00:50', 1), (101, 0, '2021-07-13 10:00:28', '2021-07-13 10:00:50', 1), (101, 0, '2021-07-14 11:00:28', '2021-07-14 11:00:50', 1), (101, 0, '2021-07-15 11:59:28', '2021-07-16 00:01:20', 1), (102, 0, '2021-10-01 10:00:28', '2021-10-01 10:00:50', 1), (102, 0, '2021-10-02 10:00:01', '2021-10-02 10:01:50', 1), (102, 0, '2021-10-03 11:00:55', '2021-10-03 11:00:59', 1), (102, 0, '2021-10-04 11:00:45', '2021-10-04 11:00:55', 0), (102, 0, '2021-10-05 11:00:53', '2021-10-05 11:00:59', 1), (102, 0, '2021-10-06 11:00:45', '2021-10-06 11:00:55', 1);
场景逻辑说明:
artical_id-文章ID代表用户浏览的文章的ID,特殊情况artical_id-文章ID为0表示用户在非文章内容页(比如App内的列表页、活动页等)。注意:只有artical_id为0时sign_in值才有效。从2021年7月7日0点开始,用户每天签到可以领1金币,并可以开始累积签到天数,连续签到的第3、7天分别可额外领2、6金币。每连续签到7天后重新累积签到天数(即重置签到天数:连续第8天签到时记为新的一轮签到的第一天,领1金币)问题:计算每个用户2021年7月以来每月获得的金币数(该活动到10月底结束,11月1日开始的签到不再获得金币)。结果按月份、ID升序排序。注:如果签到记录的in_time-进入时间和out_time-离开时间跨天了,也只记作in_time对应的日期签到了。
分析
领取金币规则:
(1)只要用户签到就能获取一枚金币
(2)连续签到第三天和第七天分别额外获取2枚和6金币
(3)连续签到7天后进行重置,按照规则(1)和(2)进行新一轮领币
分析:本题的维度是用户和月份,也就是最终结果是按照这两个来分组
- 第一步:根据领金币的规则,我们容易想到按照签到的连续性先进行分组,将连续的进行标记分成一组
- 第二步:在每一个连续的组里进行row_number()标记(rn),目的是获取连续的第三天和第七个天特殊位置。
- 第三步:根据第二步连续的天数中是否有超过7天的,然后以此进行分割,再进行分组
参考实现
select uid, month, series_flag, series_7_flg, case when cnt >= 1 and cnt < 3 then cnt when cnt >= 3 and cnt < 7 then cnt + 2 else cnt + 2 + 6 end coin from (select uid, month, series_flag, series_7_flg, count(*) cnt from (select uid, in_time, sign_in, month, series_flag, rn, sum(if(rn % 7 = 0, 1, 0)) over (partition by uid,month,series_flag order by in_time rows between current row and unbounded following) series_7_flg from (select uid, in_time, sign_in, month, series_flag, row_number() over (partition by uid, series_flag,month) rn from (select uid, in_time, sign_in, month, sum(if(diff < 1, 1, 0)) over (partition by uid,month,series_sign_in_flag order by in_time) series_flag from (select uid, in_time, sign_in, month, datediff(to_date(in_time), to_date(lag_in_time)) diff, sum(if(sign_in != lag_sign_in, 1, 0)) over (partition by uid,month order by in_time) series_sign_in_flag from (select uid, in_time, sign_in, substr(in_time, 1, 7) month, lag(in_time, 1, in_time) over (partition by uid,substr(in_time, 1, 7) order by in_time) lag_in_time, lag(sign_in, 1, sign_in) over (partition by uid,substr(in_time, 1, 7) order by in_time) lag_sign_in from temp_user_log) t) t) i) i) i group by uid, month, series_flag, series_7_flg) i; select uid, month, sum(case when cnt >= 1 and cnt < 3 then cnt when cnt >= 3 and cnt < 7 then cnt + 2 else cnt + 2 + 6 end) coin from (select uid, month, series_flag, series_7_flg, count(*) cnt from (select uid, in_time, sign_in, month, series_flag, rn, sum(if(rn % 7 = 0, 1, 0)) over (partition by uid,month,series_flag order by in_time rows between current row and unbounded following) series_7_flg from (select uid, in_time, sign_in, month, series_flag, row_number() over (partition by uid, series_flag,month) rn from (select uid, in_time, sign_in, month, sum(if(diff < 1, 1, 0)) over (partition by uid,month,series_sign_in_flag order by in_time) series_flag from (select uid, in_time, sign_in, month, datediff(to_date(in_time), to_date(lag_in_time)) diff, sum(if(sign_in != lag_sign_in, 1, 0)) over (partition by uid,month order by in_time) series_sign_in_flag from (select uid, in_time, sign_in, substr(in_time, 1, 7) month, lag(in_time, 1, in_time) over (partition by uid,substr(in_time, 1, 7) order by in_time) lag_in_time, lag(sign_in, 1, sign_in) over (partition by uid,substr(in_time, 1, 7) order by in_time) lag_sign_in from temp_user_log where artical_id = 0 and substr(in_time, 1, 7) >= '2021-07' and substr(in_time, 1, 7) <= '2021-10') t) t where sign_in = 1) i) i) i group by uid, month, series_flag, series_7_flg) i group by uid, month;