hive 用户行为路径分析
需求
已知用户行为表 temp_actions_log, 大概字段有:(user_id 用户编号, op_id 操作编号, op_time 操作时间)
要求
- 统计每天符合以下条件的用户数:A操作之后是B操作,AB操作必须相邻。
- 统计用户行为序列为A-B-D的用户数
其中:A-B之间可以有任何其他浏览记录(如C,E等),B-D之间除了C记录可以有任何其他浏览记录(如A,E等)
数据准备
create table temp_actions_log ( user_id int, op_id string, op_time string ) row format delimited fields terminated by '\t' stored as orc tblproperties ("orc.compress" = "snappy");; insert into temp_actions_log(user_id, op_id, op_time) values (1, 'A', '2020-1-1 12:01:03'), (2, 'A', '2020-1-1 12:01:04'), (3, 'A', '2020-1-1 12:01:05'), (1, 'B', '2020-1-1 12:03:03'), (1, 'A', '2020-1-1 12:04:03'), (1, 'C', '2020-1-1 12:06:03'), (1, 'D', '2020-1-1 12:11:03'), (2, 'A', '2020-1-1 12:07:04'), (3, 'C', '2020-1-1 12:02:05'), (2, 'C', '2020-1-1 12:09:03'), (2, 'A', '2020-1-1 12:10:03'), (4, 'A', '2020-1-1 12:01:03'), (4, 'C', '2020-1-1 12:11:05'), (4, 'D', '2020-1-1 12:15:05'), (1, 'A', '2020-1-2 12:01:03'), (2, 'A', '2020-1-2 12:01:04'), (3, 'A', '2020-1-2 12:01:05'), (1, 'B', '2020-1-2 12:03:03'), (1, 'A', '2020-1-2 12:04:03'), (1, 'C', '2020-1-2 12:06:03'), (2, 'A', '2020-1-2 12:07:04'), (3, 'B', '2020-1-2 12:08:05'), (3, 'E', '2020-1-2 12:09:05'), (3, 'D', '2020-1-2 12:11:05'), (2, 'C', '2020-1-2 12:09:03'), (4, 'E', '2020-1-2 12:05:03'), (4, 'B', '2020-1-2 12:06:03'), (4, 'E', '2020-1-2 12:07:03'), (2, 'A', '2020-1-2 12:10:03');
需求一实现
先计算 前后的动作ID
select user_id, op_id, collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time) op_before, collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time desc ) op_after, to_date(op_time) as dt from temp_actions_log;
user_id
|
op_id
|
op_before
|
op_after
|
dt
|
1
|
C
|
["A","B","C"]
|
["C"]
|
2020/1/2
|
1
|
A
|
["A","B"]
|
["C","A"]
|
2020/1/2
|
1
|
B
|
["A","B"]
|
["C","A","B"]
|
2020/1/2
|
1
|
A
|
["A"]
|
["C","A","B"]
|
2020/1/2
|
2
|
A
|
["A","C"]
|
["A"]
|
2020/1/1
|
2
|
C
|
["A","C"]
|
["A","C"]
|
2020/1/1
|
2
|
A
|
["A"]
|
["A","C"]
|
2020/1/1
|
2
|
A
|
["A"]
|
["A","C"]
|
2020/1/1
|
3
|
C
|
["A","C"]
|
["C"]
|
2020/1/1
|
3
|
A
|
["A"]
|
["C","A"]
|
2020/1/1
|
4
|
E
|
["E","B"]
|
["E"]
|
2020/1/2
|
4
|
B
|
["E","B"]
|
["E","B"]
|
2020/1/2
|
4
|
E
|
["E"]
|
["E","B"]
|
2020/1/2
|
1
|
D
|
["A","B","C","D"]
|
["D"]
|
2020/1/1
|
1
|
C
|
["A","B","C"]
|
["D","C"]
|
2020/1/1
|
1
|
A
|
["A","B"]
|
["D","C","A"]
|
2020/1/1
|
1
|
B
|
["A","B"]
|
["D","C","A","B"]
|
2020/1/1
|
1
|
A
|
["A"]
|
["D","C","A","B"]
|
2020/1/1
|
2
|
A
|
["A","C"]
|
["A"]
|
2020/1/2
|
2
|
C
|
["A","C"]
|
["A","C"]
|
2020/1/2
|
2
|
A
|
["A"]
|
["A","C"]
|
2020/1/2
|
2
|
A
|
["A"]
|
["A","C"]
|
2020/1/2
|
3
|
D
|
["A","B","E","D"]
|
["D"]
|
2020/1/2
|
3
|
E
|
["A","B","E"]
|
["D","E"]
|
2020/1/2
|
3
|
B
|
["A","B"]
|
["D","E","B"]
|
2020/1/2
|
3
|
A
|
["A"]
|
["D","E","B","A"]
|
2020/1/2
|
4
|
D
|
["A","C","D"]
|
["D"]
|
2020/1/1
|
4
|
C
|
["A","C"]
|
["D","C"]
|
2020/1/1
|
4
|
A
|
["A"]
|
["D","C","A"]
|
2020/1/1
|
根据需求要求过滤结果
select user_id, dt from (select user_id, op_id, collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time) op_before, collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time desc ) op_after, to_date(op_time) as dt from temp_actions_log) i where op_id = 'B' and array_contains(op_before, 'A') and array_contains(op_after, 'D') and !array_contains(op_after, 'C')
user_id
|
dt
|
3
|
2020/1/2
|
计算结果
select dt, count(distinct user_id) cnt from (select user_id, dt from (select user_id, op_id, collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time) op_before, collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time desc ) op_after, to_date(op_time) as dt from temp_actions_log) i where op_id = 'B' and array_contains(op_before, 'A') and array_contains(op_after, 'D') and !array_contains(op_after, 'C')) i group by dt;
dt
|
cnt
|
2020/1/2
|
1
|
需求二实现
select user_id, op_id, lag(op_id, 1, op_id) over (partition by user_id, date_format(op_time, 'yyyy-MM-dd') order by op_time) as last_op_id, date_format(op_time, 'yyyy-MM-dd') as dt from temp_actions_log;
第一步
user_id
|
op_id
|
last_op_id
|
dt
|
2
|
A
|
A
|
2020/1/1
|
2
|
C
|
A
|
2020/1/1
|
2
|
A
|
C
|
2020/1/1
|
2
|
A
|
A
|
2020/1/2
|
2
|
A
|
A
|
2020/1/2
|
2
|
C
|
A
|
2020/1/2
|
2
|
A
|
C
|
2020/1/2
|
3
|
A
|
A
|
2020/1/1
|
3
|
C
|
A
|
2020/1/1
|
3
|
A
|
A
|
2020/1/2
|
3
|
B
|
A
|
2020/1/2
|
3
|
E
|
B
|
2020/1/2
|
3
|
D
|
E
|
2020/1/2
|
1
|
A
|
A
|
2020/1/1
|
1
|
B
|
A
|
2020/1/1
|
1
|
A
|
B
|
2020/1/1
|
1
|
C
|
A
|
2020/1/1
|
1
|
D
|
C
|
2020/1/1
|
1
|
A
|
A
|
2020/1/2
|
1
|
B
|
A
|
2020/1/2
|
1
|
A
|
B
|
2020/1/2
|
1
|
C
|
A
|
2020/1/2
|
4
|
A
|
A
|
2020/1/1
|
4
|
C
|
A
|
2020/1/1
|
4
|
D
|
C
|
2020/1/1
|
4
|
E
|
E
|
2020/1/2
|
4
|
B
|
E
|
2020/1/2
|
4
|
E
|
B
|
2020/1/2
|
第二步
select user_id, dt from (select user_id, op_id, lag(op_id, 1, op_id) over (partition by user_id, date_format(op_time, 'yyyy-MM-dd') order by op_time) as last_op, date_format(op_time, 'yyyy-MM-dd') as dt from temp_actions_log) t where op_id = 'B' and last_op = 'A';
user_id
|
dt
|
3
|
2020/1/2
|
1
|
2020/1/1
|
1
|
2020/1/2
|
第三步
select dt, count(distinct user_id) cnt from (select user_id, dt from (select user_id, op_id, lag(op_id, 1, op_id) over (partition by user_id, date_format(op_time, 'yyyy-MM-dd') order by op_time) as last_op, date_format(op_time, 'yyyy-MM-dd') as dt from temp_actions_log) o where op_id = 'B' and last_op = 'A') o group by dt;
dt
|
cnt
|
2020/1/1
|
1
|
2020/1/2
|
2
|
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· 一文读懂知识蒸馏
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下