hive 用户访问时长分析问题

需求描述

数据如下

 

 问题:

  1. 用户总量,用户平均年龄,用户平均观看时长
  2. 每10岁一个分段,统计每个区间的用户总量,用户平均观看时长
  3. 每个用户最喜欢的节目
  4. 观看时长大于5min的用户总量,只要有一个节目用户观看时间小于5min就不能算

数据准备

create table temp_userlook_0305
(
    view_date bigint,
    user_id   string,
    age       int,
    programid string,
    playtime  int
) stored as orc
    tblproperties ('orc.compress' = 'snappy');
insert into temp_userlook_0305
values (20220221, 'u1', 30, 'a', 4),
       (20220221, 'u1', 30, 'b', 10),
       (20220221, 'u1', 30, 'a', 2),
       (20220221, 'u2', 23, 'c', 1),
       (20220222, 'u3', 26, 'd', 3),
       (20220223, 'u2', 23, 'a', 2);
select * from temp_userlook_0305;

需求一、用户总量,用户平均年龄,用户平均观看时长
select count(user_id) user_cnt, avg(age) age_avg, avg(playtime_sum) playtime_sum_avg
from (select user_id, age, sum(playtime) playtime_sum
      from temp_userlook_0305
      group by user_id, age) t;

 

 需求二、每10岁一个分段,统计每个区间的用户总量,用户平均观看时长

select count(user_id) user_cnt, avg(playtime_sum) playtime_sum_avg
from (select user_id, age, sum(playtime) playtime_sum
      from temp_userlook_0305
      group by user_id, age) t
group by int(age / 10);

 

 需求三、每个用户最喜欢的节目

select  user_id, programid, age
from (select *, dense_rank() over (partition by user_id order by playtime_sum desc) rn
      from (select user_id, programid, age, sum(playtime) playtime_sum
            from temp_userlook_0305
            group by user_id, programid, age) i) o
where rn = 1;

 

 需求四、观看时长大于5min的用户总量,只要有一个节目用户观看时间小于5min就不能算

select count(distinct t.user_id) user_cnt
from (select user_id
      from temp_userlook_0305) t
         left join
     (select user_id
      from temp_userlook_0305
      where playtime < 5) t1
where t1.user_id is null;

 

create table temp_userlook_0305 ( view_date bigint, user_id string, age int, programid string, playtime int ) stored as orc tblproperties ('orc.compress' = 'snappy'); insert into temp_userlook_0305 values (20220221, 'u1', 30, 'a', 4), (20220221, 'u1', 30, 'b', 10), (20220221, 'u1', 30, 'a', 2), (20220221, 'u2', 23, 'c', 1), (20220222, 'u3', 26, 'd', 3), (20220223, 'u2', 23, 'a', 2); select * from temp_userlook_0305;

posted @ 2022-03-07 17:19  晓枫的春天  阅读(225)  评论(0编辑  收藏  举报