
1. Integrating Flink with the Hive catalog

  Flink's metadata needs to be stored in Hive, so we create a Hive catalog (which can be thought of as a database inside Flink).

-- Start the SQL client
sql-client.sh 

-- Create the Hive catalog
CREATE CATALOG hive_catalog WITH (
    'type' = 'hive',
    'default-database' = 'default',
    'hive-conf-dir' = '/usr/local/soft/hive-1.2.1/conf'
);
-- set the HiveCatalog as the current catalog of the session
USE CATALOG hive_catalog;


-- Create a flink_init.sql file and put the Hive catalog statements into it,
-- then start the client with sql-client.sh -i flink_init.sql (-i runs the given sql file at startup)
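
For reference, a minimal flink_init.sql consists of exactly the two statements above, so every session starts inside the Hive catalog:

-- flink_init.sql, run at startup with: sql-client.sh -i flink_init.sql
CREATE CATALOG hive_catalog WITH (
    'type' = 'hive',
    'default-database' = 'default',
    'hive-conf-dir' = '/usr/local/soft/hive-1.2.1/conf'
);
USE CATALOG hive_catalog;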

2. Creating the databases in the Hive catalog

  In an offline data warehouse, each database corresponds to a user, and each database maps to one directory in HDFS.

  In a real-time data warehouse, each layer is stored in a different place: ODS, DWD, and DWS live in Kafka, while the DIM and ADS data are stored in databases.

create database gma_ods;
create database gma_dwd;
create database gma_dws;
create database gma_dim;
create database gma_ads;
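
To sanity-check, listing the databases from the SQL client should now show the five layers:

-- run inside sql-client, after USE CATALOG hive_catalog
show databases;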

3. Data collection

# 1. Create the gma database in MySQL with the utf-8 character set
# 2. Edit the canal configuration
cd /usr/local/soft/canal/conf/example
# Edit the config file
vim instance.properties
# Add the dynamic-topic setting so that each table gets its own topic in kafka
canal.mq.dynamicTopic=gma\\..*


# 3. Enable the MySQL binlog
vim /etc/my.cnf 
# Add the following two settings to the config file
# They have to be placed under the [mysqld] section
# Enable binlog
log-bin=mysql-bin
# Use ROW mode
binlog-format=ROW
# Restart MySQL
service mysqld restart
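
# A quick check that binlog is really on (assumes the mysql client is on the PATH);
# the query should print log_bin | ON
mysql -uroot -p -e "show variables like 'log_bin';"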

# 4. Start canal
cd /usr/local/soft/canal/bin
./restart.sh

# 5. Create the tables in MySQL
# by running the sql file below
init_mysql_table.sql

# 6. Check whether the topics were created
kafka-topics.sh --list  --zookeeper  master:2181
# If the topics do not appear, find where the previous steps went wrong; after the tables are created the topics must exist

# 7. Load the data into MySQL
# by running the sql file below
load_data.sql

# 8. Consume the data with the kafka console consumer
kafka-console-consumer.sh --bootstrap-server  master:9092 --from-beginning --topic gma.base_category1
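
Each message arrives in Canal's JSON envelope. A simplified, illustrative example of what the consumer prints (real messages carry more fields, such as mysqlType and pkNames, and the values here are placeholders):

{"data":[{"id":"1","name":"..."}],"database":"gma","table":"base_category1","type":"INSERT","isDdl":false,"ts":1640000000000}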

4. The ODS layer

  • Create the ODS tables in Flink SQL

  • Example:
    # Enter the sql-client
    sql-client.sh -i flink_init.sql
    
    # Create all the tables of the ODS layer
    ods_mysql_kafka_base_category1: level-1 product category table
    ods_mysql_kafka_base_category2: level-2 product category table
    ods_mysql_kafka_base_category3: level-3 product category table
    ods_mysql_kafka_base_province: province configuration table
    ods_mysql_kafka_base_region: region configuration table
    ods_mysql_kafka_base_trademark: brand table
    ods_mysql_kafka_date_info: date configuration table
    ods_mysql_kafka_holiday_info: holiday table
    ods_mysql_kafka_holiday_year: holiday-to-date mapping table
    ods_mysql_kafka_order_detail: order detail table, one row per product in an order
    ods_mysql_kafka_order_info: order table, one row per order; it carries the order status
    ods_mysql_kafka_order_status_log: order status change log table
    ods_mysql_kafka_payment_info: payment flow table
    ods_mysql_kafka_sku_info: product (SKU) info table
    ods_mysql_kafka_user_info: user info table
  • Sample DDL (partial; one table is sketched below):
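
A minimal sketch of one ODS table definition, assuming the columns id and name for base_category1 and Canal's canal-json format (the column list must be adapted for each table):

CREATE TABLE gma_ods.ods_mysql_kafka_base_category1 (
    id BIGINT,
    name STRING
) WITH (
    'connector' = 'kafka',
    'topic' = 'gma.base_category1',
    'properties.bootstrap.servers' = 'master:9092',
    'properties.group.id' = 'gma_ods',
    'format' = 'canal-json'
);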

5. The DIM layer

    Store the dimension tables in a dedicated HBase namespace (the equivalent of a database).

# Enter the hbase shell
hbase shell
# Create the namespace
create_namespace 'gma_dim'
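
To confirm, listing the namespaces should now include gma_dim:

# still inside the hbase shell
list_namespace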

  1. Region dimension table

    Merge the province table and the region table into one region dimension table (dimension degeneration): read the two tables from Kafka, join them, and save the merged region dimension table to HBase.

-- 1. Create the table in hbase
create 'gma_dim.dim_region','info'

-- 2. Create the region dimension table in the DIM layer of the Flink real-time warehouse
-- Create the hbase sink table
CREATE TABLE gma_dim.dim_hbase_region (
 pro_id BIGINT,
 info ROW<pro_name STRING,region_id BIGINT,region_name STRING,area_code STRING>,
 PRIMARY KEY (pro_id) NOT ENFORCED
) WITH (
 'connector' = 'hbase-1.4',
 'table-name' = 'gma_dim.dim_region',
 'zookeeper.quorum' = 'master:2181'
);


-- 3. Flink SQL that joins the region table and the province table and writes the result into the region dimension table
insert into gma_dim.dim_hbase_region
select pro_id,ROW(pro_name,region_id,region_name,area_code) from (
select b.id as pro_id,b.name as pro_name,region_id,region_name,area_code from
gma_ods.ods_mysql_kafka_base_region /*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as a
inner join 
gma_ods.ods_mysql_kafka_base_province /*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as b
on
a.id=b.region_id
) as c;

-- 4. Check in hbase that the data arrived
scan 'gma_dim.dim_region'

 

  2. Product dimension table

    The product (SKU) table, the brand table, the spu table, and the level-3, level-2, and level-1 category tables are degenerated into a single product dimension table.

-- 1. Create the product dimension table in hbase
create 'gma_dim.dim_item_info','info'

-- 2. Create the product dimension table in the Flink real-time warehouse
CREATE TABLE gma_dim.dim_hbase_item_info (
 sku_id bigint,
 info ROW<spu_id bigint, price decimal(10,0), sku_name STRING, sku_desc STRING, weight decimal(10,2),
  tm_id bigint, tm_name STRING, category3_id bigint, category3_name STRING, category2_id bigint, category2_name STRING,
  category1_id bigint, category1_name STRING, sku_default_img STRING, create_time TIMESTAMP(3)>,
 PRIMARY KEY (sku_id) NOT ENFORCED
) WITH (
 'connector' = 'hbase-1.4',
 'table-name' = 'gma_dim.dim_item_info',
 'zookeeper.quorum' = 'master:2181',
 'sink.buffer-flush.max-rows'='0'
);


-- 3. Join the tables to build the product dimension table
insert into gma_dim.dim_hbase_item_info
select
sku_id,ROW(spu_id, price, sku_name, sku_desc, weight, tm_id, tm_name, category3_id, category3_name,
category2_id, category2_name, category1_id, category1_name, sku_default_img, create_time) as info
from (
select
a.id as sku_id,
a.spu_id,
a.price,
a.sku_name,
a.sku_desc,
a.weight,
a.tm_id,
b.tm_name,
a.category3_id,
c.name as category3_name,
d.id as category2_id,
d.name as category2_name,
e.id as category1_id,
e.name as category1_name,
a.sku_default_img,
a.create_time
from
gma_ods.ods_mysql_kafka_sku_info
/*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as a
inner join
gma_ods.ods_mysql_kafka_base_trademark
/*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as b
on a.tm_id=cast(b.tm_id as bigint)
inner join
gma_ods.ods_mysql_kafka_base_category3
/*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as c
on a.category3_id=c.id
inner join
gma_ods.ods_mysql_kafka_base_category2
/*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as d
on c.category2_id=d.id
inner join
gma_ods.ods_mysql_kafka_base_category1
/*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as e
on d.category1_id=e.id
) as f;
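
As with the region dimension table, a quick scan in the hbase shell confirms the sink is writing:

-- 4. Check in hbase that the data arrived
scan 'gma_dim.dim_item_info'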

  3. User dimension table

-- 1. Create the user dimension table in hbase
create 'gma_dim.dim_user_info','info'

-- 2. Create the user dimension table in the Flink real-time warehouse
CREATE TABLE gma_dim.dim_hbase_user_info (
 user_id BIGINT,
 info ROW<login_name STRING, nick_name STRING, passwd STRING, name STRING, phone_num STRING,
  email STRING, head_img STRING, user_level STRING, birthday DATE, gender STRING, create_time TIMESTAMP(3)>,
 PRIMARY KEY (user_id) NOT ENFORCED
) WITH (
 'connector' = 'hbase-1.4',
 'table-name' = 'gma_dim.dim_user_info',
 'zookeeper.quorum' = 'master:2181'
);

-- 3. Flink SQL that reads the ODS user info table and saves it to hbase to build the user dimension table
insert into gma_dim.dim_hbase_user_info
select id as user_id,ROW(login_name,nick_name,passwd,name,phone_num,email,head_img,user_level,birthday,gender,create_time) from
gma_ods.ods_mysql_kafka_user_info
/*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as a
;

6. The DWD layer

  1. Payment fact table

-- 1. Create the kafka sink table

CREATE TABLE gma_dwd.dwd_kafka_payment_info (
id bigint,
user_id BIGINT,
payment_time STRING,
payment_type STRING,
province_id BIGINT,
skus STRING,
payment_price decimal(10,2),
sku_num BIGINT,
proc_time as PROCTIME(),
 PRIMARY KEY (id) NOT ENFORCED -- set the unique primary key
) WITH (
  'connector' = 'upsert-kafka',
  'topic' = 'dwd_payment_info',
  'properties.bootstrap.servers' = 'master:9092',
  'key.format' = 'json',
  'value.format' = 'json'
);

-- 2. Join the payment flow table, the order table, and the order detail table to build the payment fact table

insert into gma_dwd.dwd_kafka_payment_info
select 
id,
cast(user_id as BIGINT) as user_id,
payment_time,
payment_type,
cast(province_id as BIGINT) as province_id,
LISTAGG(cast(sku_id as STRING)) as skus,
sum(order_price*sku_num) as payment_price,
sum(sku_num) as sku_num
from (
    select a.id,a.user_id,a.payment_time,a.payment_type,b.province_id,c.sku_id,c.order_price,cast(c.sku_num as bigint) as sku_num from 
    gma_ods.ods_mysql_kafka_payment_info
    /*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as a
    inner join 
    gma_ods.ods_mysql_kafka_order_info
    /*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as b
    on cast(a.order_id as bigint)=b.id
    inner join 
    gma_ods.ods_mysql_kafka_order_detail
    /*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as c
    on b.id=c.order_id
)
as d
group by 
id,user_id,payment_time,payment_type,province_id;


-- Consume from kafka
kafka-console-consumer.sh --bootstrap-server  master:9092,node1:9092,node2:9092 --from-beginning --topic dwd_payment_info

-- Query the data in flink
select * from gma_dwd.dwd_kafka_payment_info  /*+ OPTIONS('scan.startup.mode'='earliest-offset') */

  2. Order fact table

-- 1. Create the kafka sink table

CREATE TABLE gma_dwd.dwd_kafka_order_info (
id BIGINT,
consignee STRING,
consignee_tel STRING,
delivery_address STRING,
order_status STRING,
user_id BIGINT,
payment_way STRING,
create_time TIMESTAMP(3),
operate_time TIMESTAMP(3),
expire_time TIMESTAMP(3),
province_id BIGINT,
skus STRING,
total_amount decimal(10,2),
proc_time as PROCTIME(),
PRIMARY KEY (id) NOT ENFORCED -- set the unique primary key
) WITH (
  'connector' = 'upsert-kafka',
  'topic' = 'dwd_order_info',
  'properties.bootstrap.servers' = 'master:9092',
  'key.format' = 'json',
  'value.format' = 'json'
);


-- 2. Run the insert
insert into gma_dwd.dwd_kafka_order_info
select 
a.id,
a.consignee,
a.consignee_tel,
a.delivery_address,
a.order_status,
a.user_id,
a.payment_way,
a.create_time,
a.operate_time,
a.expire_time,
cast(a.province_id as bigint),
b.skus,
a.total_amount
from 
gma_ods.ods_mysql_kafka_order_info
/*+ OPTIONS('scan.startup.mode'='earliest-offset') */ as a
join 
(
    select order_id,LISTAGG(cast(sku_id as STRING)) as skus from 
    gma_ods.ods_mysql_kafka_order_detail
    /*+ OPTIONS('scan.startup.mode'='earliest-offset') */ 
    group by order_id
)
as b
on a.id=b.order_id;
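
The same checks used for the payment fact table apply here:

-- Consume from kafka
kafka-console-consumer.sh --bootstrap-server  master:9092 --from-beginning --topic dwd_order_info

-- Query the data in flink
select * from gma_dwd.dwd_kafka_order_info  /*+ OPTIONS('scan.startup.mode'='earliest-offset') */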