Flink SQL 同时 Join 多个 Mysql 表
Flink 版本: 1.14.3
- 主要测试一个任务中同时关联多个 MySql 中的表
MySQL 表
MySQL 1
两个mysql 表: lookup_join_config、lookup_join_config_2
表结构:
-- Dimension/config table used by the Flink lookup join (MySQL instance 1).
CREATE TABLE lookup_join_config
(
    id          INT AUTO_INCREMENT PRIMARY KEY,
    code        VARCHAR(10)                        NULL,
    value       VARCHAR(10)                        NULL,
    create_time DATETIME DEFAULT CURRENT_TIMESTAMP NULL,
    update_time DATETIME DEFAULT CURRENT_TIMESTAMP NULL
)
    COMMENT 'lookup join 的配置表';
-- Second dimension/config table on the same MySQL instance, used to test
-- joining more than one MySQL table in a single Flink job.
CREATE TABLE lookup_join_config_2
(
    id          INT AUTO_INCREMENT PRIMARY KEY,
    code        VARCHAR(10)                        NULL,
    value       VARCHAR(10)                        NULL,
    create_time DATETIME DEFAULT CURRENT_TIMESTAMP NULL,
    update_time DATETIME DEFAULT CURRENT_TIMESTAMP NULL
)
    COMMENT 'lookup join 的配置表';
数据lookup_join_config:
数据lookup_join_config_2:
MySQL 2
一个 MySQL 表: lookup_join_config
表结构:
-- Same config table, created on a second MySQL instance (MySQL 2) to show
-- that the Flink SQL is unchanged when the dimension tables live in
-- different databases.
CREATE TABLE lookup_join_config
(
    id          INT AUTO_INCREMENT PRIMARY KEY,
    code        VARCHAR(10)                        NULL,
    value       VARCHAR(10)                        NULL,
    create_time DATETIME DEFAULT CURRENT_TIMESTAMP NULL,
    update_time DATETIME DEFAULT CURRENT_TIMESTAMP NULL
)
    COMMENT 'lookup join 的配置表';
数据lookup_join_config:
Flink SQL
-- flink lookup mysql test
-- kafka source
-- Kafka source of user behavior events.
-- process_time (PROCTIME) is what the lookup joins bind to via
-- FOR SYSTEM_TIME AS OF; ts carries the event-time watermark.
DROP TABLE IF EXISTS user_log;
CREATE TABLE user_log (
    user_id      VARCHAR,
    item_id     VARCHAR,
    category_id VARCHAR,
    behavior    VARCHAR,
    ts          TIMESTAMP(3),
    process_time AS PROCTIME(),
    WATERMARK FOR ts AS ts - INTERVAL '5' SECOND
) WITH (
    'connector' = 'kafka',
    'topic' = 'user_log',
    'properties.bootstrap.servers' = 'localhost:9092',
    'properties.group.id' = 'user_log',
    'scan.startup.mode' = 'latest-offset',
    'format' = 'json'
);
-- JDBC dimension table over MySQL table lookup_join_config; used as the
-- first lookup-join source.
DROP TABLE IF EXISTS mysql_behavior_conf;
CREATE TEMPORARY TABLE mysql_behavior_conf (
    id          INT,
    code        STRING,
    `value`     STRING,
    update_time TIMESTAMP(3)
    -- ,primary key (id) not enforced
    -- ,WATERMARK FOR update_time AS update_time - INTERVAL '5' SECOND
) WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://localhost:3306/venn',
    'table-name' = 'lookup_join_config',
    'username' = 'root',
    'password' = '123456',
    -- NOTE: the scan.partition bounds also act as a row filter (id in [5, 99999])
    'scan.partition.column' = 'id',
    'scan.partition.num' = '5',
    'scan.partition.lower-bound' = '5',
    'scan.partition.upper-bound' = '99999',
    'lookup.cache.max-rows' = '28',
    'lookup.cache.ttl' = '5555' -- cache entry expires after this long without access (presumably ms — confirm against the JDBC connector docs)
);
-- Second JDBC dimension table for the multi-table lookup join.
-- FIX: the original WITH clause was identical to mysql_behavior_conf (same
-- 'url' and 'table-name' = 'lookup_join_config'), so the job joined the same
-- physical table twice instead of two different tables. Point this table at
-- lookup_join_config_2, which the setup above creates on MySQL 1. To test
-- across two MySQL instances instead, change only 'url' — the SQL structure
-- is the same either way.
DROP TABLE IF EXISTS mysql_behavior_conf_2;
CREATE TEMPORARY TABLE mysql_behavior_conf_2 (
    id          INT,
    code        STRING,
    `value`     STRING,
    update_time TIMESTAMP(3)
    -- ,primary key (id) not enforced
    -- ,WATERMARK FOR update_time AS update_time - INTERVAL '5' SECOND
) WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://localhost:3306/venn',
    'table-name' = 'lookup_join_config_2',
    'username' = 'root',
    'password' = '123456',
    -- NOTE: the scan.partition bounds also act as a row filter (id in [5, 99999])
    'scan.partition.column' = 'id',
    'scan.partition.num' = '5',
    'scan.partition.lower-bound' = '5',
    'scan.partition.upper-bound' = '99999',
    'lookup.cache.max-rows' = '28',
    'lookup.cache.ttl' = '5555' -- cache entry expires after this long without access (presumably ms — confirm against the JDBC connector docs)
);
-- set table.sql-dialect=hive;
-- print sink
-- Print sink: writes every enriched row to stdout for inspection.
DROP TABLE IF EXISTS print_sink;
CREATE TABLE print_sink (
    user_id       STRING,
    item_id       STRING,
    category_id   STRING,
    behavior      STRING,
    behavior_map  STRING,
    behavior_map2 STRING,
    ts            TIMESTAMP(3)
) WITH (
    'connector' = 'print'
);
-- Streaming job: enrich each user_log event with `value` from both JDBC
-- dimension tables via processing-time lookup joins.
INSERT INTO print_sink (user_id, item_id, category_id, behavior, behavior_map, behavior_map2, ts)
SELECT
    u.user_id,
    u.item_id,
    u.category_id,
    u.behavior,
    c1.`value`,
    c2.`value`,
    u.ts
FROM user_log AS u
LEFT JOIN mysql_behavior_conf FOR SYSTEM_TIME AS OF u.process_time AS c1
    ON u.behavior = c1.code
LEFT JOIN mysql_behavior_conf_2 FOR SYSTEM_TIME AS OF u.process_time AS c2
    ON u.behavior = c2.code
WHERE u.behavior IS NOT NULL;
DAG:
输入、输出
输入数据:
{"category_id":90,"user_id":"394221","item_id":"90138","behavior":"pv","ts":"2022-02-15 17:52:54.678"}
{"category_id":39,"user_id":"352182","item_id":"39985","behavior":"pv","ts":"2022-02-15 17:53:04.678"}
{"category_id":36,"user_id":"502031","item_id":"367","behavior":"buy","ts":"2022-02-15 17:53:14.678"}
{"category_id":93,"user_id":"993014","item_id":"93254","behavior":"pv","ts":"2022-02-15 17:53:24.678"}
{"category_id":65,"user_id":"729660","item_id":"6548","behavior":"pv","ts":"2022-02-15 17:53:34.678"}
{"category_id":78,"user_id":"263469","item_id":"78933","behavior":"pv","ts":"2022-02-15 17:53:44.678"}
输出数据:
+I[394221, 90138, 90, pv, 浏览, 浏览, 2022-02-15T17:52:54.678]
+I[352182, 39985, 39, pv, 浏览, 浏览, 2022-02-15T17:53:04.678]
+I[502031, 367, 36, buy, 下单, 下单, 2022-02-15T17:53:14.678]
+I[993014, 93254, 93, pv, 浏览, 浏览, 2022-02-15T17:53:24.678]
+I[729660, 6548, 65, pv, 浏览, 浏览, 2022-02-15T17:53:34.678]
+I[263469, 78933, 78, pv, 浏览, 浏览, 2022-02-15T17:53:44.678]
非 lookup join
-- Deliberately broken variant WITHOUT lookup join: the FOR SYSTEM_TIME AS OF
-- clauses are commented out, so the JDBC tables are read as bounded batch
-- sources (full TableSourceScan). Those source tasks FINISH after loading the
-- whole table, which prevents checkpointing unless checkpointing with
-- finished tasks is enabled — see the failure log and notes below.
INSERT INTO print_sink(user_id, item_id, category_id, behavior, behavior_map, behavior_map2, ts)
SELECT a.user_id, a.item_id, a.category_id, a.behavior, c.`value`, d.`value`, a.ts
FROM user_log a
left join mysql_behavior_conf --FOR SYSTEM_TIME AS OF a.process_time AS
c ON a.behavior = c.code
left join mysql_behavior_conf_2 --FOR SYSTEM_TIME AS OF a.process_time AS
d ON a.behavior = d.code
where a.behavior is not null;
- 非 lookup 的 source,mysql 表会使用 TableSourceScan 加载全表数据,加载完成后,算子就结束了,导致任务不能正常 Checkpoint
2022-02-15 17:57:09,748 INFO - Failed to trigger checkpoint for job fd4484a88094c8a8c1c9296a50655f0b because Some tasks of the job have already finished and checkpointing with finished tasks is not enabled. Failure reason: Not all required tasks are currently running.
- 注: 多个数据库不影响SQL结构
- 注: 注意参数 “scan.partition.lower-bound”,“scan.partition.upper-bound”,会作为查询条件过滤数据
- 注: Flink 1.14 后有个 beta 功能,任务中有部分 task 是 finish 状态也能做 checkpoint
官网: Checkpointing with parts of the graph finished (BETA)
完整 sql 参考: github sqlSubmit sql 文件: kafka_lookup_mysql.sql
欢迎关注Flink菜鸟公众号,会不定期更新Flink(开发技术)相关的推文