拉链表学习
拉链表
- 介绍:记录历史。记录一个事务从开始,一直到当前状态的所有变化的信息。
使用场景
- 表中的数据量很大。(每天都存储会占用很多空间)
- 表中的部分字段会被更新。
- 需要查看某一个时间点或者时间段的历史快照信息。
- 表中的记录变化的比例和频率不是很大。
拉链表优势
优势:既能获取最新的数据,也能添加筛选条件也获取历史的数据。
说明
ods为全量表
具体案例
1.新增的直接取过来
2.更新的保留前一天数据
3.不变的记录,取最后一次更新的数据
4.拉链结束的数据直接拿过来
5.删除的数据则保留前一日数据,并将记录结束日期记成更新日期
表结构
CREATE EXTERNAL TABLE if not exists ods_business(
id int COMMENT '自增主键'
,corp_id int COMMENT '公司ID'
,uid int COMMENT 'UID'
,resume_count int COMMENT '简历下载量'
,used_count int COMMENT '已使用简历下载量'
,update_date string COMMENT '更新时间'
,create_date string COMMENT '创建时间'
) partitioned by(dt string comment '分区日期')
row format delimited fields terminated by '\t'
stored as textfile;
CREATE EXTERNAL TABLE if not exists dwd_business (
id int COMMENT '自增主键'
,corp_id int COMMENT '公司ID'
,uid int COMMENT 'UID'
,resume_count int COMMENT '简历下载量'
,used_count int COMMENT '已使用简历下载量'
,update_date string COMMENT '更新时间'
,create_date string COMMENT '创建时间'
,record_start_date string COMMENT '记录有效开始时间'
,record_end_date string COMMENT '记录有效结束时间'
) partitioned by(dt string comment '分区日期')
row format delimited fields terminated by '\t'
stored as textfile;
CREATE EXTERNAL TABLE if not exists mid_business (
id int COMMENT '自增主键'
,corp_id int COMMENT '公司ID'
,uid int COMMENT 'UID'
,resume_count int COMMENT '简历下载量'
,used_count int COMMENT '已使用简历下载量'
,update_date string COMMENT '更新时间'
,create_date string COMMENT '创建时间'
,record_start_date string COMMENT '记录有效开始时间'
,record_end_date string COMMENT '记录有效结束时间'
) partitioned by(dt string comment '分区日期')
row format delimited fields terminated by '\t'
stored as textfile;
ODS表处理
INSERT overwrite table ods_business partition(dt='2023-10-22')
VALUES
(1, 1021, 102931, 15, 2, '2022-10-22', '2022-10-21'),
(2, 1001, 1342, 52, 23, '2022-10-22', '2022-10-21'),
(3, 1043, 3563, 33, 12, '2022-10-22', '2022-10-21'),
(4, 1212, 5745, 53, 15, '2022-10-22', '2022-10-21'),
(5, 15342, 7543, 7, 1, '2022-10-22', '2022-10-21'),
(6, 1153, 7486, 21, 4, '2022-10-22', '2022-10-21'),
(7, 1865, 8534, 14, 9, '2022-10-22', '2022-10-21'),
(8, 1453, 5423, 25, 15, '2022-10-22', '2022-10-21');
INSERT into table ods_business partition(dt='2023-10-23')
VALUES
(1, 1021, 102931, 15, 6, '2022-10-23', '2022-10-21'), -- 更新
(2, 1001, 1342, 52, 26, '2022-10-23', '2022-10-21'), -- 更新
(3, 1043, 3563, 33, 12, '2022-10-22', '2022-10-21'),
(4, 1212, 5745, 53, 15, '2022-10-22', '2022-10-21'),
(5, 15342, 7543, 7, 1, '2022-10-22', '2022-10-21'),
(6, 1153, 7486, 21, 4, '2022-10-22', '2022-10-21'),
(7, 1865, 8534, 14, 9, '2022-10-22', '2022-10-21'),
(8, 1453, 5423, 25, 15, '2022-10-22', '2022-10-21'),
(9, 3211, 6543, 19, 14, '2022-10-23', '2022-10-23'), -- 新增
(10, 1263, 1264, 45, 31, '2022-10-23', '2022-10-23'); -- 新增
INSERT OVERWRITE TABLE dwd_business partition(dt='2023-10-23')
select distinct
id, corp_id, uid, resume_count, used_count, update_date, create_date,
record_start_date, record_end_date
from mid_test_business
where dt='2023-10-23'
表处理
-- 1.新增的直接取过来 今天10月24号
INSERT OVERWRITE TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
t1.id,
t1.corp_id,
t1.uid,
t1.resume_count,
t1.used_count,
t1.update_date,
t1.create_date,
t1.create_date as record_start_date,
'3000-12-31' as record_end_date
from
(
select * from ods_business where dt='2023-10-23' -- 分区时间:
) t1
left join
(
select * from dwd_business
where dt = '2023-10-22' -- (分区时间-1):
and record_start_date <= '2023-10-23' -- 分区时间:
and record_end_date > '2023-10-23' -- 分区时间:
) t2
on t1.id = t2.id
where t2.id is null;
-- 2.更新的保留前一天数据
-- 先修改掉旧的结束日期
INSERT INTO TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
t1.id,
t2.corp_id,
t2.uid,
t2.resume_count,
t2.used_count,
t2.update_date,
t2.create_date,
t2.record_start_date as record_start_date, -- 老记录开始时间不变
t1.update_date as record_end_date -- 最后一次更新时间为此状态的结束时间
from
(
select * from ods_business where dt='2023-10-23' -- 分区时间:
) t1
inner join
(
select * from dwd_business
where dt = '2023-10-22' -- (分区时间-1):
and record_start_date <= '2023-10-23' -- 分区时间:
and record_end_date > '2023-10-23' -- 分区时间:
) t2
on t1.id = t2.id
where t1.update_date > t2.update_date;
-- 再加入新的结束日期
INSERT INTO TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
t1.id,
t1.corp_id,
t1.uid,
t1.resume_count,
t1.used_count,
t1.update_date,
t1.create_date,
t1.update_date as record_start_date, -- 新记录开始时间为最后一次记录时间***
'3000-12-31' as record_end_date
from
(
select * from ods_business where dt='2023-10-23' -- 分区时间:
) t1
inner join
(
select * from dwd_business
where dt = '2023-10-22' -- (分区时间-1):
and record_start_date <= '2023-10-23' -- 分区时间:
and record_end_date > '2023-10-23' -- 分区时间:
) t2
on t1.id = t2.id
where t1.update_date > t2.update_date;
-- 3.不变的记录,取的是最后一次修改的数据
INSERT INTO TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
t2.id,
t2.corp_id,
t2.uid,
t2.resume_count,
t2.used_count,
t2.update_date,
t2.create_date,
t2.update_date as record_start_date,
'3000-12-31' as record_end_date
from
(
select * from ods_business where dt='2023-10-23' -- 分区时间:
) t1
inner join
(
select * from dwd_business
where dt = '2023-10-22' -- (分区时间-1):
and record_start_date <= '2023-10-23' -- 分区时间:
and record_end_date > '2023-10-23' -- 分区时间:
) t2
on t1.id = t2.id
where t1.update_date = t2.update_date;
-- 4.删除的数据则保留前一日数据,并将记录结束日期记成更新日期
INSERT INTO TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
t1.id,
t1.corp_id,
t1.uid,
t1.resume_count,
t1.used_count,
t1.update_date,
t1.create_date,
t1.record_start_date as record_start_date,
t1.update_date as record_end_date
from
(
select * from dwd_business
where dt = '2023-10-22' -- (分区时间-1):
and record_start_date <= '2023-10-23' -- 分区时间:
and record_end_date > '2023-10-23' -- 分区时间:
) t1
left join
(
select *
from ods_business
where dt = '2023-10-23' -- 分区时间
) t2
on t1.id = t2.id
where t2.id is null;
-- 5. 拉链结束的数据直接拿过来
INSERT INTO TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
t1.id,
t1.corp_id,
t1.uid,
t1.resume_count,
t1.used_count,
t1.update_date,
t1.create_date,
t1.record_start_date as record_start_date,
t1.record_end_date as record_end_date
from
dwd_business t1
where dt = '2023-10-22' -- 分区时间:
and record_start_date < '2023-10-23'; -- 分区时间
其他实现方式
实现方式一: 五种情况一一实现
将【新增、删除、更新、不变、拉链结束】这五种情况,一一列举实现。
实现方式二: 将五种情况合并实现
【修改之后+不变+新增】存放到99991231最新分区
-- 修改之后 + 不变 + 新增
insert overwrite table dim_profit_center_dzip partition(ds)
select
if (
-- 看属性是否发生了变化
profit_center_old.profit_center_code = profit_center_new.BIC_ZPRFI_CTR
and profit_center_old.profit_center_short_desc = profit_center_new.TXTSH
and profit_center_old.profit_center_middle_desc = profit_center_new.TXTMD,
profit_center_old.profit_center_id, -- 属性没有发生变化,ID不变
concat('${bizdate}', profit_center_new.BIC_ZPRFI_CTR) -- 属性发生了变化,ID更改
) as profit_center_id, -- 利润中心ID
profit_center_new.BIC_ZPRFI_CTR, -- 利润中心编码
profit_center_new.TXTSH, -- 利润中心短描述
profit_center_new.TXTMD, -- 利润中心长描述
if (
-- 看属性是否发生了变化
profit_center_old.profit_center_code = profit_center_new.BIC_ZPRFI_CTR
and profit_center_old.profit_center_short_desc = profit_center_new.TXTSH
and profit_center_old.profit_center_middle_desc = profit_center_new.TXTMD,
profit_center_old.start_date, -- 属性没有发生变化,记录时间不变
'${bizdate}' -- 属性发生了变化,ID更改
) as start_date, -- 开始日期
'99991231' end_date,
'99991231' ds
from
(
select
profit_center_id,
profit_center_code,
profit_center_short_desc,
profit_center_middle_desc,
start_date
from dim_profit_center_dzip
where ds = '99991231'
) profit_center_old
right join
(
select
BIC_ZPRFI_CTR, -- profit_center_code
TXTSH, -- profit_center_short_desc
TXTMD -- profit_center_middle_desc
from
(
select
BIC_ZPRFI_CTR, -- profit_center_code
TXTSH, -- profit_center_short_desc
TXTMD, -- profit_center_middle_desc
row_number() over(partition by BIC_ZPRFI_CTR order by BIC_ZPRFI_CTR) rn
from ic_sap_lake.ods_sap_sys_bic_zeis_master_m_zprfi_ctr_ds_h -- 取昨天(最新)的数据
where BIC_ZPRFI_CTR is not null and BIC_ZPRFI_CTR <> ''
) profit_center_d
where profit_center_d.rn = 1 -- 做数据清理(对编码相同的多条数据,只取第一条数据)
) profit_center_new
on profit_center_old.profit_center_code = profit_center_new.BIC_ZPRFI_CTR
【修改+删除】存放到历史分区
-- 历史记录分区 修改、删除
select
profit_center_old.profit_center_id, -- 利润中心ID
profit_center_old.profit_center_code, -- 利润中心编码
profit_center_old.profit_center_short_desc, -- 利润中心短描述
profit_center_old.profit_center_middle_desc, -- 利润中心长描述
profit_center_old.start_date, -- 开始日期
'${bizdate - 1}' end_date,
'${bizdate - 1}' ds
from
(
select
profit_center_id,
profit_center_code,
profit_center_short_desc,
profit_center_middle_desc,
start_date
from dim_profit_center_dzip
where ds = '99991231'
) profit_center_old
left join
(
select
BIC_ZPRFI_CTR, -- profit_center_code
TXTSH, -- profit_center_short_desc
TXTMD -- profit_center_middle_desc
from
(
select
BIC_ZPRFI_CTR, -- profit_center_code
TXTSH, -- profit_center_short_desc
TXTMD, -- profit_center_middle_desc
row_number() over(partition by BIC_ZPRFI_CTR order by BIC_ZPRFI_CTR) rn
from ic_sap_lake.ods_sap_sys_bic_zeis_master_m_zprfi_ctr_ds_h -- 取昨天(最新)的数据
where BIC_ZPRFI_CTR is not null and BIC_ZPRFI_CTR <> ''
) profit_center_d
where profit_center_d.rn = 1 -- 做数据清理(对编码相同的多条数据,只取第一条数据)
) profit_center_new
on profit_center_old.profit_center_code = profit_center_new.BIC_ZPRFI_CTR
where profit_center_old.profit_center_code != profit_center_new.BIC_ZPRFI_CTR
or profit_center_old.profit_center_short_desc != profit_center_new.TXTSH
or profit_center_old.profit_center_middle_desc != profit_center_new.TXTMD
本文来自博客园,作者:jsqup,转载请注明原文链接:https://www.cnblogs.com/jsqup/p/17859423.html