hive 用户访问时长分析问题
需求描述
数据如下
问题:
- 用户总量,用户平均年龄,用户平均观看时长
- 每10岁一个分段,统计每个区间的用户总量,用户平均观看时长
- 每个用户最喜欢的节目
- 观看时长大于5min的用户总量,只要有一个节目用户观看时间小于5min就不能算
数据准备
create table temp_userlook_0305 ( view_date bigint, user_id string, age int, programid string, playtime int ) stored as orc tblproperties ('orc.compress' = 'snappy'); insert into temp_userlook_0305 values (20220221, 'u1', 30, 'a', 4), (20220221, 'u1', 30, 'b', 10), (20220221, 'u1', 30, 'a', 2), (20220221, 'u2', 23, 'c', 1), (20220222, 'u3', 26, 'd', 3), (20220223, 'u2', 23, 'a', 2); select * from temp_userlook_0305;
需求一、用户总量,用户平均年龄,用户平均观看时长
select count(user_id) user_cnt, avg(age) age_avg, avg(playtime_sum) playtime_sum_avg from (select user_id, age, sum(playtime) playtime_sum from temp_userlook_0305 group by user_id, age) t;
需求二、每10岁一个分段,统计每个区间的用户总量,用户平均观看时长
select count(user_id) user_cnt, avg(playtime_sum) playtime_sum_avg from (select user_id, age, sum(playtime) playtime_sum from temp_userlook_0305 group by user_id, age) t group by int(age / 10);
需求三、每个用户最喜欢的节目
select user_id, programid, age from (select *, dense_rank() over (partition by user_id order by playtime_sum desc) rn from (select user_id, programid, age, sum(playtime) playtime_sum from temp_userlook_0305 group by user_id, programid, age) i) o where rn = 1;
需求四、观看时长大于5min的用户总量,只要有一个节目用户观看时间小于5min就不能算
select count(distinct t.user_id) user_cnt from (select user_id from temp_userlook_0305) t left join (select user_id from temp_userlook_0305 where playtime < 5) t1 where t1.user_id is null;
create table temp_userlook_0305 ( view_date bigint, user_id string, age int, programid string, playtime int ) stored as orc tblproperties ('orc.compress' = 'snappy'); insert into temp_userlook_0305 values (20220221, 'u1', 30, 'a', 4), (20220221, 'u1', 30, 'b', 10), (20220221, 'u1', 30, 'a', 2), (20220221, 'u2', 23, 'c', 1), (20220222, 'u3', 26, 'd', 3), (20220223, 'u2', 23, 'a', 2); select * from temp_userlook_0305;