Hive面试SQL总结

一、连续登陆问题

-- user_id, date, flag
-- data:
1 2019-07-11 1
1 2019-07-12 1
1 2019-07-13 1
1 2019-07-14 1
1 2019-07-15 1
1 2019-07-16 1
1 2019-07-17 1
1 2019-07-18 1
2 2019-07-11 1
2 2019-07-12 1
2 2019-07-13 0
2 2019-07-14 1
2 2019-07-15 1
2 2019-07-16 0
2 2019-07-17 1
3 2019-07-11 1
3 2019-07-12 1
3 2019-07-13 1
3 2019-07-14 0
3 2019-07-15 1
3 2019-07-16 1
3 2019-07-17 1
3 2019-07-18 1

-- 建表
create table t_login(user_id string, dt string, flag string) row format delimited fields terminated by ' ';
load data local inpath '/opt/workspace/login.dat' into table t_login;

-- 求连续登陆3天的用户的起始时间和结束时间
with tmp as (
  select *, date_sub(dt, rn) as start_day
  from (
    select *, row_number() over(partition by user_id order by dt) as rn
    from t_login
    where flag = '1'
  ) t
)
select user_id, start_day, count(1) as cnt, max(dt), min(dt)
from tmp
group by user_id, start_day
having cnt >= 3
;

 

二、留存分析

-- 计算20211123的留存
with t1 as (
  select distinct user_id
  from t_marmot_login_event
  where dt = '20211123'
), t2 as (
  select distinct user_id
  from t_marmot_login_event
  where dt = '20211124'
)
select count(t1.user_id) as yesterday_cnt, count(t2.user_id) as today_cnt
from t1 left outer join t2
on t1.user_id = t2.user_id

--
计算留存分析(2) with tmp as ( SELECT distinct view_id, distinct_id, p_dymd from dwd_marmot_click_event where p_dymd <= '${YEAR_MONTH_DAY}' and p_dymd >= date_sub('${YEAR_MONTH_DAY}', 30) order by p_dymd desc ) select tab2.t1, count(distinct case when diff = 1 then tab2.distinct_id else null end ) as one_day, count(distinct case when diff = 2 then tab2.distinct_id else null end ) as two_day from ( select *, datediff(t1, t2) as diff from ( select a.view_id, a.distinct_id, a.p_dymd t1, b.p_dymd t2 from tmp a inner join tmp b on a.view_id = b.view_id and a.distinct_id = b.distinct_id ) tab1 ) tab2

 

三、求每个科目前三名的同学,分数一样的并列,没有间隔,同时计算分差

--sid, sub, score
1 语文 78
2 语文 89
3 语文 76
4 语文 81
1 数学 67
2 数学 77
3 数学 78
4 数学 71
1 英语 87
2 英语 81
3 英语 91
4 英语 79

-- 建表
create table t_score(uid string, sub string, score int) row format delimited fields terminated by ' ';
load data local inpath '/opt/workspace/score.dat' into table t_score;

-- 计算
with tmp as (
  select *
  from (
    select *, dense_rank() over(partition by sub order by score desc) as rn
    from t_score
  ) t
  where rn <= 3
)
select uid, sub, score, rn, nvl(score-lag_value, 0)
from (
  select *, lag(score) over(partition by sub order by score desc) as lag_value
  from tmp
) t1

 

四、行列转换

-- 数据使用t_score表
-- 行转列: case when group by
select uid,
  sum(case when sub='语文' then score else 0 end) as a,
  sum(case when sub='数学' then score else 0 end) as b,
  sum(case when sub='英语' then score else 0 end) as c
from t_score
group by uid;

-- 列转行
select uid, concat_ws(",", collect_list(item))
from (
select uid, concat(sub, ":", score) as item
from t_score
) t
group by uid

-- lateral view explode

 

五、漏斗分析

-- 简单的漏斗: 常理A->B->C-D数量是依次递减的
select
  substr(server_time, 1, 10) as biz_day,
  count(distinct case when event='A' then uid else null end) as a_cnt,
  count(distinct case when event='B' then uid else null end) as b_cnt,
  count(distinct case when event='C' then uid else null end) as c_cnt,
  count(distinct case when event='D' then uid else null end) as d_cnt
from t_marmor_test
where p_dymd = '20211121'
and substr(server_time, 1, 10) = '2021-11-21'

-- 考虑完整的链路,不同事件之间间隔少于5min,同时不存在重复事件
-- 一般来说每次访问都有一个flow_id或者session_id
with t1 as (
  select flow_id, event, ctime
  from t_marmot_log
  where p_dymd='20211121' and event='A'
), t2 as (
  select flow_id, event, ctime
  from t_marmot_log
  where p_dymd='20211121' and event='B'
), t3 as (
  select flow_id, evnet, ctime
  from t_marmot_log
  where p_dymd='20211121' and event='C'
), t4 as (
  select flow_id, event, ctime
  from t_marmor_log
  where p_dymd=''20211121 and event='D'
)
select
  count(t1.flow_id) as step1,
  count(t2.flow_id) as step2,
  count(t3.flow_id) as step3,
  count(t4.flow_id) as step4
from t1 left outer join t2
on t1.flow_id = t2.flow_id and datediff('second', t1.ctime, t2.ctime) < 30 * 60
left outer join t3
on t1.flow_id = t3.flow_id and datediff('second', t2.ctime, t3.ctime) < 30 * 60
left outer join t3
on t1.flow_id = t4.floe_id and datediff('second', t3.ctime, t4.ctime) < 30 * 60

 

posted @ 2021-11-23 22:43  Shydow  阅读(193)  评论(0编辑  收藏  举报