hive 用户行为路径分析

需求

已知用户行为表 temp_actions_log, 大概字段有:(user_id 用户编号, op_id 操作编号, op_time 操作时间)

要求

  1. 统计每天符合以下条件的用户数:A操作之后是B操作,AB操作必须相邻。
  2. 统计用户行为序列为A-B-D的用户数
其中:A-B之间可以有任何其他浏览记录(如C,E等),B-D之间除了C记录可以有任何其他浏览记录(如A,E等)

数据准备

create table temp_actions_log
(
    user_id int,
    op_id   string,
    op_time string
) row format delimited fields terminated by '\t'
    stored as orc
    tblproperties ("orc.compress" = "snappy");;

insert into temp_actions_log(user_id, op_id, op_time)
values (1, 'A', '2020-1-1 12:01:03'),
       (2, 'A', '2020-1-1 12:01:04'),
       (3, 'A', '2020-1-1 12:01:05'),
       (1, 'B', '2020-1-1 12:03:03'),
       (1, 'A', '2020-1-1 12:04:03'),
       (1, 'C', '2020-1-1 12:06:03'),
       (1, 'D', '2020-1-1 12:11:03'),
       (2, 'A', '2020-1-1 12:07:04'),
       (3, 'C', '2020-1-1 12:02:05'),
       (2, 'C', '2020-1-1 12:09:03'),
       (2, 'A', '2020-1-1 12:10:03'),
       (4, 'A', '2020-1-1 12:01:03'),
       (4, 'C', '2020-1-1 12:11:05'),
       (4, 'D', '2020-1-1 12:15:05'),
       (1, 'A', '2020-1-2 12:01:03'),
       (2, 'A', '2020-1-2 12:01:04'),
       (3, 'A', '2020-1-2 12:01:05'),
       (1, 'B', '2020-1-2 12:03:03'),
       (1, 'A', '2020-1-2 12:04:03'),
       (1, 'C', '2020-1-2 12:06:03'),
       (2, 'A', '2020-1-2 12:07:04'),
       (3, 'B', '2020-1-2 12:08:05'),
       (3, 'E', '2020-1-2 12:09:05'),
       (3, 'D', '2020-1-2 12:11:05'),
       (2, 'C', '2020-1-2 12:09:03'),
       (4, 'E', '2020-1-2 12:05:03'),
       (4, 'B', '2020-1-2 12:06:03'),
       (4, 'E', '2020-1-2 12:07:03'),
       (2, 'A', '2020-1-2 12:10:03');

需求一实现

先计算 前后的动作ID

select user_id,
       op_id,
       collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time)       op_before,
       collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time desc ) op_after,
       to_date(op_time) as                                                                    dt
from temp_actions_log;
user_id
op_id
op_before
op_after
dt
1
C
["A","B","C"]
["C"]
2020/1/2
1
A
["A","B"]
["C","A"]
2020/1/2
1
B
["A","B"]
["C","A","B"]
2020/1/2
1
A
["A"]
["C","A","B"]
2020/1/2
2
A
["A","C"]
["A"]
2020/1/1
2
C
["A","C"]
["A","C"]
2020/1/1
2
A
["A"]
["A","C"]
2020/1/1
2
A
["A"]
["A","C"]
2020/1/1
3
C
["A","C"]
["C"]
2020/1/1
3
A
["A"]
["C","A"]
2020/1/1
4
E
["E","B"]
["E"]
2020/1/2
4
B
["E","B"]
["E","B"]
2020/1/2
4
E
["E"]
["E","B"]
2020/1/2
1
D
["A","B","C","D"]
["D"]
2020/1/1
1
C
["A","B","C"]
["D","C"]
2020/1/1
1
A
["A","B"]
["D","C","A"]
2020/1/1
1
B
["A","B"]
["D","C","A","B"]
2020/1/1
1
A
["A"]
["D","C","A","B"]
2020/1/1
2
A
["A","C"]
["A"]
2020/1/2
2
C
["A","C"]
["A","C"]
2020/1/2
2
A
["A"]
["A","C"]
2020/1/2
2
A
["A"]
["A","C"]
2020/1/2
3
D
["A","B","E","D"]
["D"]
2020/1/2
3
E
["A","B","E"]
["D","E"]
2020/1/2
3
B
["A","B"]
["D","E","B"]
2020/1/2
3
A
["A"]
["D","E","B","A"]
2020/1/2
4
D
["A","C","D"]
["D"]
2020/1/1
4
C
["A","C"]
["D","C"]
2020/1/1
4
A
["A"]
["D","C","A"]
2020/1/1

根据需求要求过滤结果

select user_id, dt
from (select user_id,
             op_id,
             collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time)       op_before,
             collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time desc ) op_after,
             to_date(op_time) as                                                                    dt
      from temp_actions_log) i
where op_id = 'B'
  and array_contains(op_before, 'A')
  and array_contains(op_after, 'D')
  and !array_contains(op_after, 'C')
user_id
dt
3
2020/1/2

计算结果

select dt, count(distinct user_id) cnt
from (select user_id, dt
      from (select user_id,
                   op_id,
                   collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time)       op_before,
                   collect_set(op_id) over (partition by user_id,to_date(op_time) order by op_time desc ) op_after,
                   to_date(op_time) as                                                                    dt
            from temp_actions_log) i
      where op_id = 'B'
        and array_contains(op_before, 'A')
        and array_contains(op_after, 'D')
        and !array_contains(op_after, 'C')) i
group by dt;
dt
cnt
2020/1/2
1

需求二实现

select user_id,
       op_id,
       lag(op_id, 1, op_id) over (partition by user_id, date_format(op_time, 'yyyy-MM-dd') order by op_time) as last_op_id,
       date_format(op_time, 'yyyy-MM-dd')                                                                    as dt
from temp_actions_log;

第一步

user_id
op_id
last_op_id
dt
2
A
A
2020/1/1
2
C
A
2020/1/1
2
A
C
2020/1/1
2
A
A
2020/1/2
2
A
A
2020/1/2
2
C
A
2020/1/2
2
A
C
2020/1/2
3
A
A
2020/1/1
3
C
A
2020/1/1
3
A
A
2020/1/2
3
B
A
2020/1/2
3
E
B
2020/1/2
3
D
E
2020/1/2
1
A
A
2020/1/1
1
B
A
2020/1/1
1
A
B
2020/1/1
1
C
A
2020/1/1
1
D
C
2020/1/1
1
A
A
2020/1/2
1
B
A
2020/1/2
1
A
B
2020/1/2
1
C
A
2020/1/2
4
A
A
2020/1/1
4
C
A
2020/1/1
4
D
C
2020/1/1
4
E
E
2020/1/2
4
B
E
2020/1/2
4
E
B
2020/1/2

第二步

select user_id, dt
from (select user_id,
             op_id,
             lag(op_id, 1, op_id)
                 over (partition by user_id, date_format(op_time, 'yyyy-MM-dd') order by op_time) as last_op,
             date_format(op_time, 'yyyy-MM-dd')                                                   as dt
      from temp_actions_log) t
where op_id = 'B'
  and last_op = 'A';
user_id
dt
3
2020/1/2
1
2020/1/1
1
2020/1/2

第三步

select dt, count(distinct user_id) cnt
from (select user_id, dt
      from (select user_id,
                   op_id,
                   lag(op_id, 1, op_id)
                       over (partition by user_id, date_format(op_time, 'yyyy-MM-dd') order by op_time) as last_op,
                   date_format(op_time, 'yyyy-MM-dd')                                                   as dt
            from temp_actions_log) o
      where op_id = 'B'
        and last_op = 'A') o
group by dt;
dt
cnt
2020/1/1
1
2020/1/2
2

posted @ 2022-02-11 12:17  晓枫的春天  阅读(397)  评论(0编辑  收藏  举报