Hive 刷题——银行可疑支付监测
场景说明
有一个支付流水表,关键字段:用户,交易时间,交易金额,现在规定:两个小时内交易次数大于2且交易总结金大于100000的用户为可疑用户,现在需要使用HiveSQL 进行监测
数据准备
CREATE TABLE transfer_log ( log_id INTEGER, log_ts TIMESTAMP, from_user VARCHAR(50), to_user VARCHAR(50), type VARCHAR(10), amount NUMERIC(10) ) stored as orc tblproperties ('orc.compress' = 'snappy'); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (1, '2023-06-02 10:31:40', '62221234567890', NULL, '存款', 50000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (2, '2023-06-02 10:32:15', '62221234567890', NULL, '存款', 100000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (3, '2023-06-03 08:14:29', '62221234567890', '62226666666666', '转账', 200000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (4, '2023-06-05 13:55:38', '62221234567890', '62226666666666', '转账', 150000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (5, '2023-06-07 20:00:31', '62221234567890', '62227777777777', '转账', 300000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (6, '2023-06-09 17:28:07', '62221234567890', '62227777777777', '转账', 500000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (7, '2023-06-10 07:46:02', '62221234567890', '62227777777777', '转账', 100000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (8, '2023-06-11 09:36:53', '62221234567890', NULL, '存款', 40000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (9, '2023-06-12 07:10:01', '62221234567890', '62228888888881', '转账', 10000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (10, '2023-06-12 07:11:12', '62221234567890', '62228888888882', '转账', 8000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (11, '2023-06-12 07:12:36', '62221234567890', '62228888888883', '转账', 5000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (12, '2023-06-12 07:13:55', '62221234567890', '62228888888884', '转账', 6000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (13, '2023-06-12 07:14:24', '62221234567890', '62228888888885', '转账', 7000); INSERT INTO transfer_log (log_id, log_ts, from_user, to_user, type, amount) VALUES (14, '2023-06-21 12:11:16', '62221234567890', '62228888888885', '转账', 70000)
解题思路
本题主要考察的是窗口函数,以及窗口函数内部窗口大小如何划分的事情
参考实现
第一步:先开窗计算每个用户两个小时内的交易次数和交易总金额
select from_user , sum(1) over (partition by from_user order by unix_timestamp(log_ts) range between 7200 preceding and current row) cnt , sum(amount) over (partition by from_user order by unix_timestamp(log_ts) range between 7200 preceding and current row) amount from transfer_log;
这样以来直接对这个结果进行过来就行了
select from_user from (select from_user , sum(1) over (partition by from_user order by unix_timestamp(log_ts) range between 7200 preceding and current row) cnt , sum(amount) over (partition by from_user order by unix_timestamp(log_ts) range between 7200 preceding and current row) amount from transfer_log) y where amount >= 50000 and cnt > 4 group by from_user;