Hive面试SQL总结

一、连续登陆问题

-- user_id, date, flag
-- data: 
1 2019-07-11 1 
1 2019-07-12 1 
1 2019-07-13 1 
1 2019-07-14 1 
1 2019-07-15 1 
1 2019-07-16 1 
1 2019-07-17 1 
1 2019-07-18 1 
2 2019-07-11 1 
2 2019-07-12 1 
2 2019-07-13 0 
2 2019-07-14 1 
2 2019-07-15 1 
2 2019-07-16 0 
2 2019-07-17 1 
3 2019-07-11 1 
3 2019-07-12 1 
3 2019-07-13 1 
3 2019-07-14 0 
3 2019-07-15 1 
3 2019-07-16 1 
3 2019-07-17 1 
3 2019-07-18 1

-- 建表
create table t_login(user_id string, dt string, flag string) row format delimited fields terminated by ' ';
load data local inpath '/opt/workspace/login.dat' into table t_login;

-- 求连续登陆3天的用户的起始时间和结束时间
with tmp as (
　　select *, date_sub(dt, rn) as start_day
　　from (
　　　　select *, row_number() over(partition by user_id order by dt) as rn
　　　　from t_login
　　　　where flag = '1'
　　) t
)
select user_id, start_day, count(1) as cnt, max(dt), min(dt)
from tmp 
group by user_id, start_day
having cnt >= 3
;

二、留存分析

-- 计算20211123的留存
with t1 as (
　　select distinct user_id
　　from t_marmot_login_event
　　where dt = '20211123'
), t2 as (
　　select distinct user_id
　　from t_marmot_login_event
　　where dt = '20211124'
)
select count(t1.user_id) as yesterday_cnt, count(t2.user_id) as today_cnt
from t1 left outer join t2
on t1.user_id = t2.user_id

-- 计算留存分析(2)
with tmp as (
    SELECT distinct view_id, distinct_id, p_dymd
    from dwd_marmot_click_event
    where p_dymd <= '${YEAR_MONTH_DAY}'
    and p_dymd >= date_sub('${YEAR_MONTH_DAY}', 30)
    order by p_dymd desc
)
select tab2.t1,
       count(distinct case when diff = 1 then tab2.distinct_id else null end ) as one_day,
       count(distinct case when diff = 2 then tab2.distinct_id else null end ) as two_day
from (
    select *, datediff(t1, t2) as diff
    from (
        select a.view_id, a.distinct_id, a.p_dymd t1, b.p_dymd t2
        from tmp a
        inner join tmp b
        on a.view_id = b.view_id
        and a.distinct_id = b.distinct_id
    ) tab1
) tab2

三、求每个科目前三名的同学，分数一样的并列，没有间隔，同时计算分差

--sid, sub, score
1 语文 78
2 语文 89
3 语文 76 
4 语文 81
1 数学 67
2 数学 77
3 数学 78
4 数学 71
1 英语 87
2 英语 81
3 英语 91
4 英语 79

-- 建表
create table t_score(uid string, sub string, score int) row format delimited fields terminated by ' ';
load data local inpath '/opt/workspace/score.dat' into table t_score;

-- 计算
with tmp as (
　　select *
　　from (
　　　　select *, dense_rank() over(partition by sub order by score desc) as rn
　　　　from t_score
　　) t
　　where rn <= 3
) 
select uid, sub, score, rn, nvl(score-lag_value, 0)
from (
　　select *, lag(score) over(partition by sub order by score desc) as lag_value
　　from tmp 
) t1

四、行列转换

-- 数据使用t_score表
-- 行转列： case when group by
select uid, 
　　sum(case when sub='语文' then score else 0 end) as a,
　　sum(case when sub='数学' then score else 0 end) as b,
　　sum(case when sub='英语' then score else 0 end) as c
from t_score
group by uid;

-- 列转行
select uid, concat_ws(",", collect_list(item))
from (
select uid, concat(sub, ":", score) as item
from t_score
) t
group by uid

-- lateral view explode

五、漏斗分析

-- 简单的漏斗: 常理A->B->C-D数量是依次递减的
select 
　　substr(server_time, 1, 10) as biz_day,
　　count(distinct case when event='A' then uid else null end) as a_cnt,
　　count(distinct case when event='B' then uid else null end) as b_cnt,
　　count(distinct case when event='C' then uid else null end) as c_cnt,
　　count(distinct case when event='D' then uid else null end) as d_cnt
from t_marmor_test
where p_dymd = '20211121'
and substr(server_time, 1, 10) = '2021-11-21'

-- 考虑完整的链路，不同事件之间间隔少于5min，同时不存在重复事件
-- 一般来说每次访问都有一个flow_id或者session_id
with t1 as (
　　select flow_id, event, ctime
　　from t_marmot_log
　　where p_dymd='20211121' and event='A'
), t2 as (
　　select flow_id, event, ctime
　　from t_marmot_log
　　where p_dymd='20211121' and event='B'
), t3 as (
　　select flow_id, evnet, ctime
　　from t_marmot_log
　　where p_dymd='20211121' and event='C'
), t4 as (
　　select flow_id, event, ctime
　　from t_marmor_log
　　where p_dymd=''20211121 and event='D'
 )
select 
　　count(t1.flow_id) as step1,
　　count(t2.flow_id) as step2,
　　count(t3.flow_id) as step3,
　　count(t4.flow_id) as step4
from t1 left outer join t2
on t1.flow_id = t2.flow_id and datediff('second', t1.ctime, t2.ctime) < 30 * 60
left outer join t3 
on t1.flow_id = t3.flow_id and datediff('second', t2.ctime, t3.ctime) < 30 * 60
left outer join t3
on t1.flow_id = t4.floe_id and datediff('second', t3.ctime, t4.ctime) < 30 * 60

posted @ 2021-11-23 22:43 Shydow 阅读(193) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

Shydow

Hive面试SQL总结

公告