|NO.Z.00050|——————————|^^ 案例 ^^|——|Hadoop&PB级数仓.V08|——|PB数仓.v08|拉链表实现|构建拉链表|

一、构建拉链表
### --- 拉链表的实现

~~~     userinfo(分区表) => userid、mobile、regdate => 
~~~     每日变更的数据(修改的+新增的) / 历史数据(第一天)
~~~     userhis(拉链表)=> 多了两个字段 start_date / end_date
二、拉链表的实现
### --- userinfo初始化(2020-06-20)。获取历史数据
~~~     userinfo初始化(2020-06-20)。获取历史数据

001,13551111111,2020-03-01,2020-06-20
002,13561111111,2020-04-01,2020-06-20
003,13571111111,2020-05-01,2020-06-20
004,13581111111,2020-06-01,2020-06-20
### --- 初始化拉链表(2020-06-20)。userinfo => userhis
~~~     初始化拉链表(2020-06-20)。userinfo => userhis

hive (default)> insert overwrite table test.userhis
select userid, mobile, regdate, dt as start_date, '9999-12-31' as end_date
from test.userinfo where dt='2020-06-20';
~~~     # 查看userhis下数据

hive (default)> select * from test.userhis;
userhis.userid  userhis.mobile  userhis.regdate userhis.start_date  userhis.end_date
001 13551111111 2020-03-01  2020-06-20  9999-12-31
002 13561111111 2020-04-01  2020-06-20  9999-12-31
003 13571111111 2020-05-01  2020-06-20  9999-12-31
004 13581111111 2020-06-01  2020-06-20  9999-12-31
### --- 次日新增数据(2020-06-21);获取新增数据
~~~     # 次日新增数据(2020-06-21);获取新增数据

002,13562222222,2020-04-01,2020-06-21
004,13582222222,2020-06-01,2020-06-21
005,13552222222,2020-06-21,2020-06-21
### --- 构建拉链表(userhis)(2020-06-21)【核心】 userinfo(2020-06-21) + userhis => userhis

~~~     # 构建拉链表(userhis)(2020-06-21)【核心】 userinfo(2020-06-21) + userhis => userhis
~~~     userinfo: 新增数据
~~~     userhis:历史数据
### --- 构建拉链表:处理新增数据
~~~     # 第一步:处理新增数据【userinfo】(处理逻辑与加载历史数据类似)

hive (default)>  select userid, mobile, regdate, dt as start_date, 
'9999-12-31' as end_date from test.userinfo where dt='2020-06-21';
~~~输出参数
userid  mobile  regdate start_date  end_date
002 13562222222 2020-04-01  2020-06-21  9999-12-31
004 13582222222 2020-06-01  2020-06-21  9999-12-31
005 13552222222 2020-06-21  2020-06-21  9999-12-31
~~~     # 第二步:处理历史数据【userhis】(历史包括两部分:变化的、未变化的)
~~~     变化的:start_date:不变;end_date:传入日期-1
~~~     未变化的:不做处理
~~~     观察数据

hive (default)>  select A.userid, B.userid, B.mobile, B.regdate, B.start_Date, B.end_date
from (select * from test.userinfo where dt='2020-06-21') 
A right join test.userhis B on A.userid= B.userid;
~~~输出参数
a.userid    b.userid    b.mobile    b.regdate   b.start_date    b.end_date
NULL    001 13551111111 2020-03-01  2020-06-20  9999-12-31
002 002 13561111111 2020-04-01  2020-06-20  9999-12-31
NULL    003 13571111111 2020-05-01  2020-06-20  9999-12-31
004 004 13581111111 2020-06-01  2020-06-20  9999-12-31
~~~     # 编写SQL,处理历史数据

hive (default)> select B.userid,
B.mobile,
B.regdate,
B.start_Date,
case when B.end_date='9999-12-31' and A.userid is not null
then date_add('2020-06-21', -1)
else B.end_date
end as end_date
from (select * from test.userinfo where dt='2020-06-21') 
A right join test.userhis 
B on A.userid=B.userid;
~~~输出参数
b.userid    b.mobile    b.regdate   b.start_date    end_date
001 13551111111 2020-03-01  2020-06-20  9999-12-31
002 13561111111 2020-04-01  2020-06-20  2020-06-20
003 13571111111 2020-05-01  2020-06-20  9999-12-31
004 13581111111 2020-06-01  2020-06-20  2020-06-20
~~~     # 最终的处理(新增+历史数据)

hive (default)> insert overwrite table test.userhis
select userid, mobile, regdate, dt as start_date, '9999-12-31' as end_date
from test.userinfo
where dt='2020-06-21'
union all
select B.userid,
B.mobile,
B.regdate,
B.start_Date,
case when B.end_date='9999-12-31' and A.userid is not null
then date_add('2020-06-21', -1)
else B.end_date
end as end_date
from (select * from test.userinfo where dt='2020-06-21') 
A right join test.userhis 
B on A.userid=B.userid;
~~~     # 查看新增+历史数据

hive (default)> select * from test.userhis;
userhis.userid  userhis.mobile  userhis.regdate userhis.start_date  userhis.end_date
001 13551111111 2020-03-01  2020-06-20  9999-12-31
002 13561111111 2020-04-01  2020-06-20  2020-06-20
003 13571111111 2020-05-01  2020-06-20  9999-12-31
004 13581111111 2020-06-01  2020-06-20  2020-06-20
002 13562222222 2020-04-01  2020-06-21  9999-12-31
004 13582222222 2020-06-01  2020-06-21  9999-12-31
005 13552222222 2020-06-21  2020-06-21  9999-12-31
### --- 第三日新增数据(2020-06-22);获取新增数据
~~~     # 第三日新增数据(2020-06-22);获取新增数据

004,13333333333,2020-06-01,2020-06-22
005,13533333333,2020-06-21,2020-06-22
006,13733333333,2020-06-22,2020-06-22
### --- 构建拉链表(2020-06-22) userinfo(2020-06-22) + userhis => userhis
~~~     # 处理新增数据

hive (default)> select userid, mobile, regdate, dt as start_date, 
'9999-12-31' as end_date from test.userinfo where dt='2020-06-22';
userid  mobile  regdate start_date  end_date
~~~输出参数
004 13333333333 2020-06-01  2020-06-22  9999-12-31
005 13533333333 2020-06-21  2020-06-22  9999-12-31
006 13733333333 2020-06-22  2020-06-22  9999-12-31
~~~     # 处理历史数据

hive (default)>  select A.userid, B.userid, B.mobile, B.regdate, B.start_Date, B.end_date
from (select * from test.userinfo where dt='2020-06-22') 
A right join test.userhis B on A.userid= B.userid;
~~~输出参数
a.userid    b.userid    b.mobile    b.regdate   b.start_date    b.end_date
NULL    001 13551111111 2020-03-01  2020-06-20  9999-12-31
NULL    002 13561111111 2020-04-01  2020-06-20  2020-06-20
NULL    003 13571111111 2020-05-01  2020-06-20  9999-12-31
004 004 13581111111 2020-06-01  2020-06-20  2020-06-20
NULL    002 13562222222 2020-04-01  2020-06-21  9999-12-31
004 004 13582222222 2020-06-01  2020-06-21  9999-12-31
005 005 13552222222 2020-06-21  2020-06-21  9999-12-31
~~~     # 编写sql,处理历史数据

hive (default)> select B.userid,
B.mobile,
B.regdate,
B.start_Date,
case when B.end_date='9999-12-31' and A.userid is not null
then date_add('2020-06-22', -1)
else B.end_date
end as end_date
from (select * from test.userinfo where dt='2020-06-22') 
A right join test.userhis 
B on A.userid=B.userid;
~~~输出参数
b.userid    b.mobile    b.regdate   b.start_date    end_date
001 13551111111 2020-03-01  2020-06-20  9999-12-31
002 13561111111 2020-04-01  2020-06-20  2020-06-20
003 13571111111 2020-05-01  2020-06-20  9999-12-31
004 13581111111 2020-06-01  2020-06-20  2020-06-20
002 13562222222 2020-04-01  2020-06-21  9999-12-31
004 13582222222 2020-06-01  2020-06-21  2020-06-21
005 13552222222 2020-06-21  2020-06-21  2020-06-21
~~~     # 最终的处理(新增+历史数据)

hive (default)> insert overwrite table test.userhis
select userid, mobile, regdate, dt as start_date, '9999-12-31' as end_date
from test.userinfo
where dt='2020-06-22'
union all
select B.userid,
B.mobile,
B.regdate,
B.start_Date,
case when B.end_date='9999-12-31' and A.userid is not null
then date_add('2020-06-22', -1)
else B.end_date
end as end_date
from (select * from test.userinfo where dt='2020-06-22') 
A right join test.userhis 
B on A.userid=B.userid;
~~~     # 查看新增+历史数据

hive (default)> select * from test.userhis;
~~~输出参数
userhis.userid  userhis.mobile  userhis.regdate userhis.start_date  userhis.end_date
001 13551111111 2020-03-01  2020-06-20  9999-12-31
002 13561111111 2020-04-01  2020-06-20  2020-06-20
003 13571111111 2020-05-01  2020-06-20  9999-12-31
004 13581111111 2020-06-01  2020-06-20  2020-06-20
002 13562222222 2020-04-01  2020-06-21  9999-12-31
004 13582222222 2020-06-01  2020-06-21  2020-06-21
005 13552222222 2020-06-21  2020-06-21  2020-06-21
004 13333333333 2020-06-01  2020-06-22  9999-12-31
005 13533333333 2020-06-21  2020-06-22  9999-12-31
006 13733333333 2020-06-22  2020-06-22  9999-12-31
### --- 第四日新增数据(2020-06-23)
~~~     # 构建拉链表(2020-06-22) userinfo(2020-06-22) + userhis => userhis
~~~     # 第四日新增数据(2020-06-23)

001,13554444444,2020-03-01,2020-06-23
003,13574444444,2020-05-01,2020-06-23
005,13555554444,2020-06-21,2020-06-23
007,18600744444,2020-06-23,2020-06-23
008,18600844444,2020-06-23,2020-06-23
~~~     # 处理新增数据

hive (default)> select userid, mobile, regdate, dt as start_date, 
'9999-12-31' as end_date from test.userinfo where dt='2020-06-23';
userid  mobile  regdate start_date  end_date
~~~输出参数
001 13554444444 2020-03-01  2020-06-23  9999-12-31
003 13574444444 2020-05-01  2020-06-23  9999-12-31
005 13555554444 2020-06-21  2020-06-23  9999-12-31
007 18600744444 2020-06-23  2020-06-23  9999-12-31
008 18600844444 2020-06-23  2020-06-23  9999-12-31
~~~     # 处理历史数据

hive (default)>  select A.userid, B.userid, B.mobile, B.regdate, B.start_Date, B.end_date
from (select * from test.userinfo where dt='2020-06-23') 
A right join test.userhis B on A.userid= B.userid;
~~~输出参数
a.userid    b.userid    b.mobile    b.regdate   b.start_date    b.end_date
001 001 13551111111 2020-03-01  2020-06-20  9999-12-31
NULL    002 13561111111 2020-04-01  2020-06-20  2020-06-20
003 003 13571111111 2020-05-01  2020-06-20  9999-12-31
NULL    004 13581111111 2020-06-01  2020-06-20  2020-06-20
NULL    002 13562222222 2020-04-01  2020-06-21  9999-12-31
NULL    004 13582222222 2020-06-01  2020-06-21  2020-06-21
005 005 13552222222 2020-06-21  2020-06-21  2020-06-21
NULL    004 13333333333 2020-06-01  2020-06-22  9999-12-31
005 005 13533333333 2020-06-21  2020-06-22  9999-12-31
NULL    006 13733333333 2020-06-22  2020-06-22  9999-12-31
~~~     # 编写sql,处理历史数据

hive (default)> select B.userid,
B.mobile,
B.regdate,
B.start_Date,
case when B.end_date='9999-12-31' and A.userid is not null
then date_add('2020-06-23', -1)
else B.end_date
end as end_date
from (select * from test.userinfo where dt='2020-06-23') 
A right join test.userhis 
B on A.userid=B.userid;
~~~输出参数
b.userid    b.mobile    b.regdate   b.start_date    end_date
001 13551111111 2020-03-01  2020-06-20  2020-06-22
002 13561111111 2020-04-01  2020-06-20  2020-06-20
003 13571111111 2020-05-01  2020-06-20  2020-06-22
004 13581111111 2020-06-01  2020-06-20  2020-06-20
002 13562222222 2020-04-01  2020-06-21  9999-12-31
004 13582222222 2020-06-01  2020-06-21  2020-06-21
005 13552222222 2020-06-21  2020-06-21  2020-06-21
004 13333333333 2020-06-01  2020-06-22  9999-12-31
005 13533333333 2020-06-21  2020-06-22  2020-06-22
006 13733333333 2020-06-22  2020-06-22  9999-12-31
~~~     # 最终的处理(新增+历史数据)

hive (default)> insert overwrite table test.userhis
select userid, mobile, regdate, dt as start_date, '9999-12-31' as end_date
from test.userinfo
where dt='2020-06-23'
union all
select B.userid,
B.mobile,
B.regdate,
B.start_Date,
case when B.end_date='9999-12-31' and A.userid is not null
then date_add('2020-06-23', -1)
else B.end_date
end as end_date
from (select * from test.userinfo where dt='2020-06-23') 
A right join test.userhis 
B on A.userid=B.userid;
~~~     # 查看新增+历史数据

hive (default)> select * from test.userhis;
~~~输出参数
userhis.userid  userhis.mobile  userhis.regdate userhis.start_date  userhis.end_date
001 13551111111 2020-03-01  2020-06-20  2020-06-22
002 13561111111 2020-04-01  2020-06-20  2020-06-20
003 13571111111 2020-05-01  2020-06-20  2020-06-22
004 13581111111 2020-06-01  2020-06-20  2020-06-20
002 13562222222 2020-04-01  2020-06-21  9999-12-31
004 13582222222 2020-06-01  2020-06-21  2020-06-21
005 13552222222 2020-06-21  2020-06-21  2020-06-21
004 13333333333 2020-06-01  2020-06-22  9999-12-31
005 13533333333 2020-06-21  2020-06-22  2020-06-22
006 13733333333 2020-06-22  2020-06-22  9999-12-31
001 13554444444 2020-03-01  2020-06-23  9999-12-31
003 13574444444 2020-05-01  2020-06-23  9999-12-31
005 13555554444 2020-06-21  2020-06-23  9999-12-31
007 18600744444 2020-06-23  2020-06-23  9999-12-31
008 18600844444 2020-06-23  2020-06-23  9999-12-31
三、通过脚本实现拉链表
### --- 创建拉链表运行脚本

[root@hadoop02 ~]# vim /data/yanqidw/data/userzipper.sh
#!/bin/bash

source /etc/profile

if [ -n "$1" ] ;
then
    do_date=$1
else
    do_date=`date -d "-1 day" +%F`
fi

sql="
insert overwrite table test.userhis
select userid, mobile, regdate, dt as start_date, '9999-12-31' as end_date
from test.userinfo
where dt='$do_date'
union all
        select B.userid,
        B.mobile,
        B.regdate,
        B.start_Date,
case when B.end_date='9999-12-31' and A.userid is not null
then date_add('$do_date', -1)
else B.end_date
end as end_date
from (select * from test.userinfo where dt='$do_date') 
A right join test.userhis 
B on A.userid=B.userid;
"

hive -e "$sql"
[root@hadoop02 ~]# sh /data/yanqidw/data/userzipper.sh
### --- 拉链表的使用:
~~~     # 查看拉链表中最新数据(2020-06-23以后的数据)

hive (default)> select * from test.userhis where end_date='9999-12-31';
~~~输出参数
002 13562222222 2020-04-01  2020-06-21  9999-12-31
004 13333333333 2020-06-01  2020-06-22  9999-12-31
006 13733333333 2020-06-22  2020-06-22  9999-12-31
001 13554444444 2020-03-01  2020-06-23  9999-12-31
003 13574444444 2020-05-01  2020-06-23  9999-12-31
005 13555554444 2020-06-21  2020-06-23  9999-12-31
007 18600744444 2020-06-23  2020-06-23  9999-12-31
008 18600844444 2020-06-23  2020-06-23  9999-12-31
~~~     # 查看拉链表中给定日期数据("2020-06-22")

hive (default)> select * from test.userhis where start_date <= '2020-06-22' 
and end_date >= '2020-06-22';
~~~输出参数
001 13551111111 2020-03-01  2020-06-20  2020-06-22
003 13571111111 2020-05-01  2020-06-20  2020-06-22
002 13562222222 2020-04-01  2020-06-21  9999-12-31
004 13333333333 2020-06-01  2020-06-22  9999-12-31
005 13533333333 2020-06-21  2020-06-22  2020-06-22
006 13733333333 2020-06-22  2020-06-22  9999-12-31
~~~     # 查看拉链表中给定日期数据("2020-06-21")

hive (default)> select * from test.userhis where start_date <= '2020-06-21' 
and end_date >= '2020-06-21';
~~~输出参数
001 13551111111 2020-03-01  2020-06-20  2020-06-22
003 13571111111 2020-05-01  2020-06-20  2020-06-22
002 13562222222 2020-04-01  2020-06-21  9999-12-31
004 13582222222 2020-06-01  2020-06-21  2020-06-21
005 13552222222 2020-06-21  2020-06-21  2020-06-21
~~~     # 查看拉链表中给定日期数据("2020-06-20")

hive (default)> select * from test.userhis where start_date <= '2020-06-20' 
and end_date >= '2020-06-20';
~~~输出参数
001 13551111111 2020-03-01  2020-06-20  2020-06-22
002 13561111111 2020-04-01  2020-06-20  2020-06-20
003 13571111111 2020-05-01  2020-06-20  2020-06-22
004 13581111111 2020-06-01  2020-06-20  2020-06-20

 
 
 
 
 
 
 
 
 

Walter Savage Landor:strove with none,for none was worth my strife.Nature I loved and, next to Nature, Art:I warm'd both hands before the fire of life.It sinks, and I am ready to depart
                                                                                                                                                   ——W.S.Landor

 

 

posted on   yanqi_vip  阅读(39)  评论(0编辑  收藏  举报

相关博文:
阅读排行:
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

导航

统计

点击右上角即可分享
微信分享提示