数据量不小,不能全量存储, 数据缓慢变化的维度数据
拉链表的建表语句--全量表 通过有效起始时间<=时间<=有效结束时间来获取维度的全量切片数据
建表语句
drop table if exists dwd_dim_user_info_his;
create external table dwd_dim_user_info_his(
`id` string COMMENT '用户 id',
`name` string COMMENT '姓名',
`birthday` string COMMENT '生日',
`gender` string COMMENT '性别',
`email` string COMMENT '邮箱',
`user_level` string COMMENT '用户等级',
`create_time` string COMMENT '创建时间',
`operate_time` string COMMENT '操作时间',
`start_date` string COMMENT '有效开始日期',
`end_date` string COMMENT '有效结束日期'
) COMMENT '用户拉链表'
stored as parquet
location '/warehouse/gmall/dwd/dwd_dim_user_info_his/'
tblproperties ("parquet.compression"="lzo");
已有的一些数据表
drop table if exists ods_user_info;
create external table ods_user_info(
`id` string COMMENT '用户 id',
`name` string COMMENT '姓名',
`birthday` string COMMENT '生日',
`gender` string COMMENT '性别',
`email` string COMMENT '邮箱',
`user_level` string COMMENT '用户等级',
`create_time` string COMMENT '创建时间',
`operate_time` string COMMENT '操作时间'
) COMMENT '用户表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_user_info/';
数据示例:
20200101ods数据
id create_time operate_time
1 20200101 20200101
20200102 ods数据,产生修改的数据
1 20200101 20200102
20200103 ods数据,产生修改的数据
1 20200101 20200103
2 20200103 20200103
20200101拉链表
1 20200101 9999-99-99
20200102拉链表数据
1 20200101 20200101
1 20200102 9999-99-99
20200103拉链表数据
1 20200101 20200101
1 20200102 20200102
1 20200103 9999-99-99
2 20200103 9999-99-99
步骤一:初始化拉链表(以某一天为基准日期,这一天的数据都当成是新增数据)
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
'20200101' as start_date,
'9999-99-99' as end_date
from
ods_user_info
where
dt = '20200101'
步骤二:制作当日变动数据(新增、修改)
insert overwrite table dwd_dim_user_info_his_tmp --先导入临时表。再通过临时表导入正式表
select
user_his.id,
user_his.name,
user_his.birthday,
user_his.gender,
user_his.email,
user_his.user_level,
user_his.create_time,
user_his.operate_time,
user_his.start_date,
if(
update_user.end_date is not null
and user_his.end_date = '9999-99-99',
'20200101',
user_his.end_date
) as end_date
from
(
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
start_date,
end_date
from
dwd_dim_user_info_his
where
dt = '20200101'
) user_his
left join (
select
id,
name,
birthday,
gender,
email,
user_level,
from
ods_user_info
where
dt = '20200102'
) update_user on user_his.id = update_user.id
union all
select
id,
name,
birthday,
gender,
email,
user_level,
create_time,
operate_time,
'20200102' as start_date,
'9999-99-99' as end_date
from
ods_user_info
where
dt = '20200102'