hive 定位连续区间的起始位置和结束位置
需求
id 是上表temp_id_list_0209的主键, 表每一行包含日志表中的一个 ID,现将一些 ID 从 Logs 表中删除。编写一个 SQL 查询得到 Logs 表中的连续区间的开始数字和结束数字,将查询表按照 start_id 排序。
要求输出结果为
start_id
|
end_id
|
1
|
2
|
4
|
6
|
9
|
12
|
数据准备
create table temp_id_list_0209 ( id int ) row format delimited fields terminated by '\t' stored as orc tblproperties ("orc.compress" = "snappy"); insert into temp_id_list_0209 values (1), (2), (4), (5), (6), (9), (10), (11), (12) ; select * from temp_id_list_0209;
需求实现
第一步 :获取ID,借助函数构建差值
select id, lag(id, 1, id) over (order by id) df1, id - lag(id, 1, id) over (order by id) df2 from temp_id_list_0209;
id
|
df1
|
df2
|
1
|
1
|
0
|
2
|
1
|
1
|
4
|
2
|
2
|
5
|
4
|
1
|
6
|
5
|
1
|
9
|
6
|
3
|
10
|
9
|
1
|
11
|
10
|
1
|
12
|
11
|
1
|
第二步、根据第一步结果,创造分组条件
select id, sum(case when dif2 > 1 then 1 else 0 end) over (order by id) flag from (select id, id - lag(id, 1, id) over (order by id) dif2 from temp_id_list_0209) o;
id
|
flag
|
1
|
0
|
2
|
0
|
4
|
1
|
5
|
1
|
6
|
1
|
9
|
2
|
10
|
2
|
11
|
2
|
12
|
2
|
第三步、根据flag 分组
select min(id) start_id,max(id) end_id from (select id, sum(if(diff > 1, 1, 0)) over (order by id) as flag from (select id , id - lag(id, 1, id) over (order by id) as diff from temp_id_list_0209) o) t group by flag order by 1;
start_id
|
end_id
|
1
|
2
|
4
|
6
|
9
|
12
|