Hive SQL题库-高级
第1题 同时在线人数问题
1.1 题目需求
现有各直播间的用户访问记录表(live_events)如下,表中每行数据表达的信息为,一个用户何时进入了一个直播间,又在何时离开了该直播间。
user_id(用户id) | live_id(直播间id) | in_datetime(进入直播间的时间) | out_datetime(离开直播间的时间) |
---|---|---|---|
100 | 1 | 2021-12-1 19:30:00 | 2021-12-1 19:53:00 |
100 | 2 | 2021-12-1 21:01:00 | 2021-12-1 22:00:00 |
101 | 1 | 2021-12-1 19:05:00 | 2021-12-1 20:55:00 |
现要求统计各直播间最大同时在线人数,期望结果如下:
live_id | max_user_count |
---|---|
1 | 4 |
2 | 3 |
3 | 2 |
1.2 数据准备
1)建表语句
drop table if exists live_events;
create table if not exists live_events
(
user_id int comment '用户id',
live_id int comment '直播id',
in_datetime string comment '进入直播间时间',
out_datetime string comment '离开直播间时间'
)
comment '直播间访问记录';
2)数据装载
INSERT overwrite table live_events
VALUES (100, 1, '2021-12-01 19:00:00', '2021-12-01 19:28:00'),
(100, 1, '2021-12-01 19:30:00', '2021-12-01 19:53:00'),
(100, 2, '2021-12-01 21:01:00', '2021-12-01 22:00:00'),
(101, 1, '2021-12-01 19:05:00', '2021-12-01 20:55:00'),
(101, 2, '2021-12-01 21:05:00', '2021-12-01 21:58:00'),
(102, 1, '2021-12-01 19:10:00', '2021-12-01 19:25:00'),
(102, 2, '2021-12-01 19:55:00', '2021-12-01 21:00:00'),
(102, 3, '2021-12-01 21:05:00', '2021-12-01 22:05:00'),
(104, 1, '2021-12-01 19:00:00', '2021-12-01 20:59:00'),
(104, 2, '2021-12-01 21:57:00', '2021-12-01 22:56:00'),
(105, 2, '2021-12-01 19:10:00', '2021-12-01 19:18:00'),
(106, 3, '2021-12-01 19:01:00', '2021-12-01 21:10:00');
1.3 代码实现
select
live_id,
max(user_count) max_user_count
from
(
select
user_id,
live_id,
sum(flag) over(partition by live_id order by event_time rows between unbounded preceding and current row) user_count
from
(
-- 登录标记1 登出标记-1
select user_id,
live_id,
in_datetime event_time,
1 flag
from live_events
union all
select user_id,
live_id,
out_datetime event_time,
-1 flag
from live_events
)t1
)t2
group by live_id;
第2题 会话划分问题
2.1 题目需求
现有页面浏览记录表(page_view_events)如下,表中有每个用户的每次页面访问记录。
user_id | page_id | view_timestamp |
---|---|---|
100 | home | 1659950435 |
100 | good_search | 1659950446 |
100 | good_list | 1659950457 |
100 | home | 1659950541 |
100 | good_detail | 1659950552 |
100 | cart | 1659950563 |
101 | home | 1659950435 |
101 | good_search | 1659950446 |
101 | good_list | 1659950457 |
101 | home | 1659950541 |
101 | good_detail | 1659950552 |
101 | cart | 1659950563 |
102 | home | 1659950435 |
102 | good_search | 1659950446 |
102 | good_list | 1659950457 |
103 | home | 1659950541 |
103 | good_detail | 1659950552 |
103 | cart | 1659950563 |
规定若同一用户的相邻两次访问记录时间间隔小于60s,则认为两次浏览记录属于同一会话。现有如下需求,为属于同一会话的访问记录增加一个相同的会话id字段,期望结果如下:
user_id | page_id | view_timestamp | session_id |
---|---|---|---|
100 | home | 1659950435 | 100-1 |
100 | good_search | 1659950446 | 100-1 |
100 | good_list | 1659950457 | 100-1 |
100 | home | 1659950541 | 100-2 |
100 | good_detail | 1659950552 | 100-2 |
100 | cart | 1659950563 | 100-2 |
101 | home | 1659950435 | 101-1 |
101 | good_search | 1659950446 | 101-1 |
101 | good_list | 1659950457 | 101-1 |
101 | home | 1659950541 | 101-2 |
101 | good_detail | 1659950552 | 101-2 |
101 | cart | 1659950563 | 101-2 |
102 | home | 1659950435 | 102-1 |
102 | good_search | 1659950446 | 102-1 |
102 | good_list | 1659950457 | 102-1 |
103 | home | 1659950541 | 103-1 |
103 | good_detail | 1659950552 | 103-1 |
2.2 数据准备
1)建表语句
drop table if exists page_view_events;
create table if not exists page_view_events
(
user_id int comment '用户id',
page_id string comment '页面id',
view_timestamp bigint comment '访问时间戳'
)
comment '页面访问记录';
2)数据装载
insert overwrite table page_view_events
values (100, 'home', 1659950435),
(100, 'good_search', 1659950446),
(100, 'good_list', 1659950457),
(100, 'home', 1659950541),
(100, 'good_detail', 1659950552),
(100, 'cart', 1659950563),
(101, 'home', 1659950435),
(101, 'good_search', 1659950446),
(101, 'good_list', 1659950457),
(101, 'home', 1659950541),
(101, 'good_detail', 1659950552),
(101, 'cart', 1659950563),
(102, 'home', 1659950435),
(102, 'good_search', 1659950446),
(102, 'good_list', 1659950457),
(103, 'home', 1659950541),
(103, 'good_detail', 1659950552),
(103, 'cart', 1659950563);
2.3 代码实现
session_id的设计有两部分user_id + 会话
非常巧妙的思路,先滑窗得到邻近差值,差值大于等于60s标记为1,小于60s标记为0,再开窗sum求和,遇到0不会算到一个会话中,遇到1则表示另一个新的会话。
关键:先找会话的起点
select user_id,
page_id,
view_timestamp,
concat(user_id, '-', sum(session_start_point) over (partition by user_id order by view_timestamp)) session_id
from (
select user_id,
page_id,
view_timestamp,
-- 1 大于等于60s
-- 0 小于60s
if(view_timestamp - lagts >= 60, 1, 0) session_start_point
from (
select user_id,
page_id,
view_timestamp,
lag(view_timestamp, 1, 0) over (partition by user_id order by view_timestamp) lagts
from page_view_events
) t1
) t2;
第3题 间断连续登录用户问题
3.1 题目需求
现有各用户的登录记录表(login_events)如下,表中每行数据表达的信息是一个用户何时登录了平台。
user_id | login_datetime |
---|---|
100 | 2021-12-01 19:00:00 |
100 | 2021-12-01 19:30:00 |
100 | 2021-12-02 21:01:00 |
现要求统计各用户最长的连续登录天数,间断一天也算作连续,例如:一个用户在1,3,5,6登录,则视为连续6天登录。期望结果如下:
user_id | max_day_count |
---|---|
100 | 3 |
101 | 6 |
102 | 3 |
104 | 3 |
105 | 1 |
3.2 数据准备
1)建表语句
drop table if exists login_events;
create table if not exists login_events
(
user_id int comment '用户id',
login_datetime string comment '登录时间'
)comment '直播间访问记录';
2)数据装载
INSERT overwrite table login_events
VALUES (100, '2021-12-01 19:00:00'),
(100, '2021-12-01 19:30:00'),
(100, '2021-12-02 21:01:00'),
(100, '2021-12-03 11:01:00'),
(101, '2021-12-01 19:05:00'),
(101, '2021-12-01 21:05:00'),
(101, '2021-12-03 21:05:00'),
(101, '2021-12-05 15:05:00'),
(101, '2021-12-06 19:05:00'),
(102, '2021-12-01 19:55:00'),
(102, '2021-12-01 21:05:00'),
(102, '2021-12-02 21:57:00'),
(102, '2021-12-03 19:10:00'),
(104, '2021-12-04 21:57:00'),
(104, '2021-12-02 22:57:00'),
(105, '2021-12-01 10:01:00');
3.3 代码实现
如果是真正连续的
select
user_id,
max(cnt) max_cnt
from
(
select
user_id,
diff,
count(*) cnt
from
(
select
user_id,
login_date,
date_sub(login_date,rk) diff
from
(
select
user_id,
login_date,
rank() over(partition by user_id order by login_date) rk
from
(
-- 按照用户和日期去重
select
distinct
user_id,
-- 只要日期
date_format(login_datetime,'yyyy-MM-dd') login_date
from login_events
)t1
)t2
)t3
group by user_id,diff
)t4
group by user_id;
间断连续
方法一
将间断连续转为真正连续,在用真正连续的方法(array explode)
select
user_id,
login_date,
new_login_date
from
(
select
user_id,
login_date,
if(datediff(next_login_date,login_date)=2,array(login_date,date_add(login_date,1)),array(login_date)) arr
from
(
select
user_id,
login_date,
lead(login_date, 1, '9999-12-31') over (partition by user_id order by login_date) next_login_date
from
(
select
distinct
user_id,
date_format(login_datetime,'yyyy-MM-dd') login_date
from login_events
)t1
)t2
)t3 lateral view explode(arr) temp as new_login_date
方法二
思路类似上一题的会话划分问题
间隔小于等于2的在一个连续(会话)内,大于2的属于另一个连续(会话)了
select
user_id,
-- 求出每个用户最大的连续天数
max(recent_days) max_day_count
from
(
select
user_id,
user_flag,
-- 按照分组求每个用户每次连续的天数(记得加1)
datediff(max(login_date),min(login_date)) + 1 recent_days
from
(
select
user_id,
login_date,
lag1_date,
-- 拼接用户和标签分组
concat(user_id,'_',flag) user_flag
from
(
-- 获取大于2的标签 为间断
select
user_id,
login_date,
lag1_date,
sum(if(datediff(login_date,lag1_date)>2,1,0)) over(partition by user_id order by login_date) flag
from
(
-- 获取上一次登录日期
select
user_id,
login_date,
lag(login_date,1,'1970-01-01') over(partition by user_id order by login_date) lag1_date
from
(
-- 按照用户和日期去重
select
distinct
user_id,
-- 只要日期
date_format(login_datetime,'yyyy-MM-dd') login_date
from login_events
)t1
)t2
)t3
)t4
group by user_id,user_flag
)t5
group by user_id;
第4题 日期交叉问题
4.1 题目需求
现有各品牌优惠周期表(promotion_info)如下,其记录了每个品牌的每个优惠活动的周期,其中同一品牌的不同优惠活动的周期可能会有交叉。
promotion_id | brand | start_date | end_date |
---|---|---|---|
1 | oppo | 2021-06-05 | 2021-06-09 |
2 | oppo | 2021-06-11 | 2021-06-21 |
3 | vivo | 2021-06-05 | 2021-06-15 |
现要求统计每个品牌的优惠总天数,若某个品牌在同一天有多个优惠活动,则只按一天计算。期望结果如下:
brand | promotion_day_count |
---|---|
vivo | 17 |
oppo | 16 |
redmi | 22 |
huawei | 22 |
4.2 数据准备
1)建表语句
drop table if exists promotion_info;
create table promotion_info
(
promotion_id string comment '优惠活动id',
brand string comment '优惠品牌',
start_date string comment '优惠活动开始日期',
end_date string comment '优惠活动结束日期'
) comment '各品牌活动周期表';
2)数据装载
insert overwrite table promotion_info
values (1, 'oppo', '2021-06-05', '2021-06-09'),
(2, 'oppo', '2021-06-11', '2021-06-21'),
(3, 'vivo', '2021-06-05', '2021-06-15'),
(4, 'vivo', '2021-06-09', '2021-06-21'),
(5, 'redmi', '2021-06-05', '2021-06-21'),
(6, 'redmi', '2021-06-09', '2021-06-15'),
(7, 'redmi', '2021-06-17', '2021-06-26'),
(8, 'huawei', '2021-06-05', '2021-06-26'),
(9, 'huawei', '2021-06-09', '2021-06-15'),
(10, 'huawei', '2021-06-17', '2021-06-21');
4.3 代码实现
方法一
如果没有交集的话,可以直接这么写
select
brand,
sum(diff)
from
(
select
brand,
datediff(end_date,start_date)+1 diff
from promotion_info
)t1
group by brand;
那是不是可以去掉交集部分,变成没有交集的,使用上面的无交集的思路
select
brand,
sum(datediff(end_date,start_date)+1) promotion_day_count
from
(
select
brand,
max_end_date,
-- 进行截断
if(max_end_date is null or start_date>max_end_date,start_date,date_add(max_end_date,1)) start_date,
end_date
from
(
select
brand,
start_date,
end_date,
-- 以开始时间排序
-- 窗口函数最大结束时间
max(end_date) over(partition by brand order by start_date rows between unbounded preceding and 1 preceding) max_end_date
from promotion_info
)t1
)t2
where end_date>start_date
group by brand;
方法二
展开,之后去重,再统计
但该方法会造成数据放大问题,要根据实际业务使用
select
brand,
count(distinct event_date)
from
(
select
promotion_id,
brand,
date_add(start_date,pos) event_date
from
(
select
promotion_id,
brand,
start_date,
end_date,
diff,
split(repeat(',',diff),',')
from
(
select
promotion_id,
brand,
start_date,
end_date,
datediff(end_date,start_date) diff
from promotion_info
)t1
)t2 lateral view posexplode(arr) temp as pos,item
)t3
group by brand;