拉链表学习

拉链表

  • 介绍:记录历史。记录一个事务从开始,一直到当前状态的所有变化的信息。

使用场景

  1. 表中的数据量很大。(每天都存储会占用很多空间)
  2. 表中的部分字段会被更新。
  3. 需要查看某一个时间点或者时间段的历史快照信息。
  4. 表中的记录变化的比例和频率不是很大。

拉链表优势

优势:既能获取最新的数据,也能添加筛选条件也获取历史的数据。

说明

ods为全量表

具体案例

1.新增的直接取过来
2.更新的保留前一天数据
3.不变的记录,取最后一次更新的数据
4.拉链结束的数据直接拿过来
5.删除的数据则保留前一日数据,并将记录结束日期记成更新日期

表结构

CREATE EXTERNAL TABLE if not exists ods_business(
id int COMMENT '自增主键'
,corp_id int COMMENT '公司ID'
,uid int COMMENT 'UID'
,resume_count int COMMENT '简历下载量'
,used_count int COMMENT '已使用简历下载量'
,update_date string COMMENT '更新时间'
,create_date string COMMENT '创建时间'
) partitioned by(dt string comment '分区日期')
row format delimited fields terminated by '\t'
stored as textfile;

CREATE EXTERNAL TABLE if not exists dwd_business (
id int COMMENT '自增主键'
,corp_id int COMMENT '公司ID'
,uid int COMMENT 'UID'
,resume_count int COMMENT '简历下载量'
,used_count int COMMENT '已使用简历下载量'
,update_date string COMMENT '更新时间'
,create_date string COMMENT '创建时间'
,record_start_date string COMMENT '记录有效开始时间'
,record_end_date string COMMENT '记录有效结束时间'
) partitioned by(dt string comment '分区日期')
row format delimited fields terminated by '\t'
stored as textfile;

CREATE EXTERNAL TABLE if not exists mid_business (
id int COMMENT '自增主键'
,corp_id int COMMENT '公司ID'
,uid int COMMENT 'UID'
,resume_count int COMMENT '简历下载量'
,used_count int COMMENT '已使用简历下载量'
,update_date string COMMENT '更新时间'
,create_date string COMMENT '创建时间'
,record_start_date string COMMENT '记录有效开始时间'
,record_end_date string COMMENT '记录有效结束时间'
) partitioned by(dt string comment '分区日期')
row format delimited fields terminated by '\t'
stored as textfile;

ODS表处理

INSERT overwrite table ods_business partition(dt='2023-10-22')
VALUES
(1, 1021, 102931, 15, 2, '2022-10-22', '2022-10-21'),
(2, 1001, 1342, 52, 23, '2022-10-22', '2022-10-21'),
(3, 1043, 3563, 33, 12, '2022-10-22', '2022-10-21'),
(4, 1212, 5745, 53, 15, '2022-10-22', '2022-10-21'),
(5, 15342, 7543, 7, 1, '2022-10-22', '2022-10-21'),
(6, 1153, 7486, 21, 4, '2022-10-22', '2022-10-21'),
(7, 1865, 8534, 14, 9, '2022-10-22', '2022-10-21'),
(8, 1453, 5423, 25, 15, '2022-10-22', '2022-10-21');

INSERT into table ods_business partition(dt='2023-10-23')
VALUES
(1, 1021, 102931, 15, 6, '2022-10-23', '2022-10-21'), -- 更新
(2, 1001, 1342, 52, 26, '2022-10-23', '2022-10-21'), -- 更新
(3, 1043, 3563, 33, 12, '2022-10-22', '2022-10-21'),
(4, 1212, 5745, 53, 15, '2022-10-22', '2022-10-21'),
(5, 15342, 7543, 7, 1, '2022-10-22', '2022-10-21'),
(6, 1153, 7486, 21, 4, '2022-10-22', '2022-10-21'),
(7, 1865, 8534, 14, 9, '2022-10-22', '2022-10-21'),
(8, 1453, 5423, 25, 15, '2022-10-22', '2022-10-21'),
(9, 3211, 6543, 19, 14, '2022-10-23', '2022-10-23'), -- 新增
(10, 1263, 1264, 45, 31, '2022-10-23', '2022-10-23'); -- 新增
INSERT OVERWRITE TABLE dwd_business partition(dt='2023-10-23')
select distinct
id, corp_id, uid, resume_count, used_count, update_date, create_date,
record_start_date, record_end_date
from mid_test_business
where dt='2023-10-23'

表处理

-- 1.新增的直接取过来 今天10月24号
INSERT OVERWRITE TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
  t1.id,
  t1.corp_id,
  t1.uid,
  t1.resume_count,
  t1.used_count,
  t1.update_date,
  t1.create_date,
  t1.create_date as record_start_date,
  '3000-12-31' as record_end_date
from
(
   select * from ods_business where dt='2023-10-23' -- 分区时间:
) t1
left join
(
select * from dwd_business
   where dt = '2023-10-22' -- (分区时间-1):
   and record_start_date <= '2023-10-23' -- 分区时间:
   and record_end_date > '2023-10-23' -- 分区时间:
) t2
on t1.id = t2.id
where t2.id is null;

-- 2.更新的保留前一天数据
-- 先修改掉旧的结束日期
INSERT INTO TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
t1.id,
   t2.corp_id,
   t2.uid,
   t2.resume_count,
   t2.used_count,
   t2.update_date,
   t2.create_date,
t2.record_start_date as record_start_date, -- 老记录开始时间不变
   t1.update_date as record_end_date -- 最后一次更新时间为此状态的结束时间
from
(
   select * from ods_business where dt='2023-10-23' -- 分区时间:
) t1
inner join
(
select * from dwd_business
   where dt = '2023-10-22' -- (分区时间-1):
   and record_start_date <= '2023-10-23' -- 分区时间:
   and record_end_date > '2023-10-23' -- 分区时间:
) t2
on t1.id = t2.id
where t1.update_date > t2.update_date;
-- 再加入新的结束日期
INSERT INTO TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
t1.id,
   t1.corp_id,
   t1.uid,
   t1.resume_count,
   t1.used_count,
   t1.update_date,
   t1.create_date,
t1.update_date as record_start_date, -- 新记录开始时间为最后一次记录时间***
   '3000-12-31' as record_end_date
from
(
   select * from ods_business where dt='2023-10-23' -- 分区时间:
) t1
inner join
(
select * from dwd_business
   where dt = '2023-10-22' -- (分区时间-1):
   and record_start_date <= '2023-10-23' -- 分区时间:
   and record_end_date > '2023-10-23' -- 分区时间:
) t2
on t1.id = t2.id
where t1.update_date > t2.update_date;

-- 3.不变的记录,取的是最后一次修改的数据
INSERT INTO TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
   t2.id,
   t2.corp_id,
   t2.uid,
   t2.resume_count,
   t2.used_count,
   t2.update_date,
   t2.create_date,
   t2.update_date as record_start_date,
   '3000-12-31' as record_end_date
from
(
   select * from ods_business where dt='2023-10-23' -- 分区时间:
) t1
inner join
(
select * from dwd_business
   where dt = '2023-10-22' -- (分区时间-1):
   and record_start_date <= '2023-10-23' -- 分区时间:
   and record_end_date > '2023-10-23' -- 分区时间:
) t2
on t1.id = t2.id
where t1.update_date = t2.update_date;

-- 4.删除的数据则保留前一日数据,并将记录结束日期记成更新日期
INSERT INTO TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
t1.id,
   t1.corp_id,
   t1.uid,
   t1.resume_count,
   t1.used_count,
   t1.update_date,
   t1.create_date,
   t1.record_start_date as record_start_date,
   t1.update_date as record_end_date
from
(
select * from dwd_business
   where dt = '2023-10-22' -- (分区时间-1):
   and record_start_date <= '2023-10-23' -- 分区时间:
   and record_end_date > '2023-10-23' -- 分区时间:
) t1
left join
(
select *
   from ods_business
   where dt = '2023-10-23' -- 分区时间
) t2
on t1.id = t2.id
where t2.id is null;

-- 5. 拉链结束的数据直接拿过来
INSERT INTO TABLE mid_business partition(dt='2023-10-23') -- 分区时间:
select
t1.id,
   t1.corp_id,
   t1.uid,
   t1.resume_count,
   t1.used_count,
   t1.update_date,
   t1.create_date,
   t1.record_start_date as record_start_date,
   t1.record_end_date as record_end_date
from
dwd_business t1
where dt = '2023-10-22' -- 分区时间:
and record_start_date < '2023-10-23'; -- 分区时间

其他实现方式

实现方式一: 五种情况一一实现

将【新增、删除、更新、不变、拉链结束】这五种情况,一一列举实现。

实现方式二: 将五种情况合并实现

【修改之后+不变+新增】存放到99991231最新分区

-- 修改之后 + 不变 + 新增
insert overwrite table dim_profit_center_dzip partition(ds)
select 
	if (
		-- 看属性是否发生了变化
		profit_center_old.profit_center_code = profit_center_new.BIC_ZPRFI_CTR
		and profit_center_old.profit_center_short_desc = profit_center_new.TXTSH
		and profit_center_old.profit_center_middle_desc = profit_center_new.TXTMD,
		profit_center_old.profit_center_id, -- 属性没有发生变化,ID不变
		concat('${bizdate}', profit_center_new.BIC_ZPRFI_CTR) -- 属性发生了变化,ID更改
	) as profit_center_id, -- 利润中心ID
	profit_center_new.BIC_ZPRFI_CTR, -- 利润中心编码
	profit_center_new.TXTSH, -- 利润中心短描述
	profit_center_new.TXTMD, -- 利润中心长描述
	if (
		-- 看属性是否发生了变化
		profit_center_old.profit_center_code = profit_center_new.BIC_ZPRFI_CTR
		and profit_center_old.profit_center_short_desc = profit_center_new.TXTSH
		and profit_center_old.profit_center_middle_desc = profit_center_new.TXTMD,
		profit_center_old.start_date, -- 属性没有发生变化,记录时间不变
		'${bizdate}' -- 属性发生了变化,ID更改
	) as start_date, -- 开始日期
	'99991231' end_date,
	'99991231' ds
from 
(
	select 
		profit_center_id,
		profit_center_code,
		profit_center_short_desc,
		profit_center_middle_desc,
		start_date
	from dim_profit_center_dzip
	where ds = '99991231'
) profit_center_old
right join 
(
	select 
		BIC_ZPRFI_CTR, -- profit_center_code
        TXTSH, -- profit_center_short_desc
        TXTMD -- profit_center_middle_desc
	from
	(
		select 
			BIC_ZPRFI_CTR, -- profit_center_code
			TXTSH, -- profit_center_short_desc
			TXTMD, -- profit_center_middle_desc
			row_number() over(partition by BIC_ZPRFI_CTR order by BIC_ZPRFI_CTR) rn
		from ic_sap_lake.ods_sap_sys_bic_zeis_master_m_zprfi_ctr_ds_h -- 取昨天(最新)的数据
		where BIC_ZPRFI_CTR is not null and BIC_ZPRFI_CTR <> ''
	) profit_center_d
	where profit_center_d.rn = 1 -- 做数据清理(对编码相同的多条数据,只取第一条数据)
) profit_center_new
on profit_center_old.profit_center_code = profit_center_new.BIC_ZPRFI_CTR

【修改+删除】存放到历史分区

-- 历史记录分区 修改、删除
select 
	profit_center_old.profit_center_id, -- 利润中心ID
	profit_center_old.profit_center_code, -- 利润中心编码
	profit_center_old.profit_center_short_desc, -- 利润中心短描述
	profit_center_old.profit_center_middle_desc, -- 利润中心长描述
	profit_center_old.start_date, -- 开始日期
	'${bizdate - 1}' end_date,
	'${bizdate - 1}' ds
from 
(
	select 
		profit_center_id,
		profit_center_code,
		profit_center_short_desc,
		profit_center_middle_desc,
		start_date
	from dim_profit_center_dzip
	where ds = '99991231'
) profit_center_old
left join 
(
	select 
		BIC_ZPRFI_CTR, -- profit_center_code
        TXTSH, -- profit_center_short_desc
        TXTMD -- profit_center_middle_desc
    from
	(
		select 
			BIC_ZPRFI_CTR, -- profit_center_code
			TXTSH, -- profit_center_short_desc
			TXTMD, -- profit_center_middle_desc
			row_number() over(partition by BIC_ZPRFI_CTR order by BIC_ZPRFI_CTR) rn
		from ic_sap_lake.ods_sap_sys_bic_zeis_master_m_zprfi_ctr_ds_h -- 取昨天(最新)的数据
		where BIC_ZPRFI_CTR is not null and BIC_ZPRFI_CTR <> ''
	) profit_center_d
	where profit_center_d.rn = 1 -- 做数据清理(对编码相同的多条数据,只取第一条数据)
) profit_center_new
on profit_center_old.profit_center_code = profit_center_new.BIC_ZPRFI_CTR
where profit_center_old.profit_center_code != profit_center_new.BIC_ZPRFI_CTR
or profit_center_old.profit_center_short_desc != profit_center_new.TXTSH
or profit_center_old.profit_center_middle_desc != profit_center_new.TXTMD
posted @ 2023-11-28 18:47  jsqup  阅读(11)  评论(0编辑  收藏  举报