Flink Table SQL (6)
Basic code
Environment
Start the SQL client: sql-client.sh
CREATE TABLE employee_information (emp_id INT,name VARCHAR,dept_id INT)
WITH ('connector' = 'filesystem','path' = '/path/to/something.csv','format' = 'csv');
SELECT * FROM employee_information WHERE dept_id = 1; -- query statement
SELECT * FROM TABLE(TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES));
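All of the window examples that follow read from a Bid table with event-time attribute bidtime, which this post never declares. A minimal sketch of a matching DDL; the datagen connector and the exact column types are assumptions:
CREATE TABLE Bid (
    bidtime TIMESTAMP(3),
    price DECIMAL(10, 2),
    item STRING,
    supplier_id STRING,
    WATERMARK FOR bidtime AS bidtime - INTERVAL '1' SECOND -- declares bidtime as the event-time attribute
) WITH ('connector' = 'datagen');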
Windows
Window grouping functions supported in SQL (windowing TVFs)
SELECT window_start, window_end, SUM(price) FROM TABLE( TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY window_start, window_end; -- tumbling window
SELECT window_start, window_end, SUM(price) FROM TABLE( HOP(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '5' MINUTES, INTERVAL '10' MINUTES))
GROUP BY window_start, window_end; -- sliding (hop) window
SELECT window_start, window_end, SUM(price) FROM TABLE( CUMULATE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '2' MINUTES, INTERVAL '10' MINUTES))
GROUP BY window_start, window_end; -- cumulative window (not a session window)
There are also older group-window auxiliary functions for retrieving the window start and end timestamps as well as the time attribute.
Only the TUMBLE_* variants are listed here; the sliding and session windows are analogous (HOP_*, SESSION_*).
TUMBLE_START(time_attr, interval)
TUMBLE_END(time_attr, interval)
TUMBLE_ROWTIME(time_attr, interval)
TUMBLE_PROCTIME(time_attr, interval)
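A sketch showing how these auxiliary functions pair with the legacy TUMBLE group window in one query (same assumed Bid table as above):
SELECT
    TUMBLE_START(bidtime, INTERVAL '10' MINUTE) AS window_start,
    TUMBLE_END(bidtime, INTERVAL '10' MINUTE) AS window_end,
    SUM(price)
FROM Bid
GROUP BY TUMBLE(bidtime, INTERVAL '10' MINUTE); -- legacy equivalent of the TUMBLE TVF query above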
Window aggregation
SELECT window_start, window_end, supplier_id, SUM(price) as price FROM TABLE( TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY window_start, window_end, GROUPING SETS ((supplier_id), ());
SELECT window_start, window_end, supplier_id, SUM(price) as price FROM TABLE( TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY window_start, window_end, ROLLUP (supplier_id);
SELECT window_start, window_end, item, supplier_id, SUM(price) as price FROM TABLE( TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY window_start, window_end, CUBE (supplier_id, item);
Cascading window aggregation
CREATE VIEW window1 AS SELECT window_start, window_end, window_time as rowtime, SUM(price) as partial_price
FROM TABLE( TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '5' MINUTES))
GROUP BY supplier_id, window_start, window_end, window_time;
SELECT window_start, window_end, SUM(partial_price) as total_price
FROM TABLE(TUMBLE(TABLE window1, DESCRIPTOR(rowtime), INTERVAL '10' MINUTES))
GROUP BY window_start, window_end;
Group aggregation
SELECT COUNT(DISTINCT order_id) FROM Orders;
SELECT `user`, SUM(amount) FROM Orders GROUP BY `user` HAVING SUM(amount) > 50;
OVER windows: two ways to define the frame
RANGE BETWEEN INTERVAL '30' MINUTE PRECEDING AND CURRENT ROW
ROWS BETWEEN 10 PRECEDING AND CURRENT ROW
"select count(amount) over(PARTITION by user order by proctime rows between 2 preceding and current row) from orders "
"select id,count(id) over ow from data window ow as (partation by id order by st rows between 2 preceding and current row) "
Group windows, Table API vs. SQL:
datatable.window(Tumble.over(10.seconds).on($"ts").as("ts")).groupBy($"id", $"ts").select($"id", $"ts", $"id".count()) // Scala Table API
SELECT id, COUNT(id), TUMBLE_START(ts, INTERVAL '10' SECOND) FROM data GROUP BY id, TUMBLE(ts, INTERVAL '10' SECOND);
Joins
SELECT * FROM Orders INNER JOIN Product ON Orders.product_id = Product.id
SELECT * FROM Orders LEFT JOIN Product ON Orders.product_id = Product.id
SELECT * FROM Orders RIGHT JOIN Product ON Orders.product_id = Product.id
SELECT * FROM Orders FULL OUTER JOIN Product ON Orders.product_id = Product.id
SELECT * FROM Orders o, Shipments s WHERE o.id = s.order_id AND o.order_time BETWEEN s.ship_time - INTERVAL '4' HOUR AND s.ship_time
Temporal joins
CREATE TABLE orders (order_id STRING,price DECIMAL(32,2),currency STRING,order_time TIMESTAMP(3),
WATERMARK FOR order_time AS order_time ) WITH (/* ... */);
CREATE TABLE currency_rates (currency STRING,conversion_rate DECIMAL(32, 2),
update_time TIMESTAMP(3) METADATA FROM 'value.source.timestamp' VIRTUAL,
WATERMARK FOR update_time AS update_time,
PRIMARY KEY(currency) NOT ENFORCED
) WITH ('connector' = 'kafka','value.format' = 'debezium-json');
Event-time temporal join
SELECT order_id, price, currency, conversion_rate, order_time
FROM orders LEFT JOIN currency_rates FOR SYSTEM_TIME AS OF orders.order_time
ON orders.currency = currency_rates.currency;
Processing-time temporal join
SELECT o.amount, o.currency, r.rate, o.amount * r.rate
FROM Orders AS o JOIN LatestRates FOR SYSTEM_TIME AS OF o.proctime AS r
ON r.currency = o.currency
Lookup join
SELECT o.order_id, o.total, c.country, c.zip FROM Orders AS o
JOIN Customers FOR SYSTEM_TIME AS OF o.proc_time AS c
ON o.customer_id = c.id;
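For the lookup join above, the probe side needs a processing-time attribute and the build side must use a lookup-capable connector such as JDBC. A sketch; every name and connection option here is an assumption:
CREATE TABLE Orders (
    order_id BIGINT,
    customer_id BIGINT,
    total DECIMAL(10, 2),
    proc_time AS PROCTIME() -- processing-time attribute referenced by FOR SYSTEM_TIME AS OF
) WITH (...);
CREATE TABLE Customers (
    id BIGINT,
    country STRING,
    zip STRING
) WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://localhost:3306/customerdb',
    'table-name' = 'customers'
);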
Array expansion
SELECT order_id, tag FROM Orders CROSS JOIN UNNEST(tags) AS t (tag)
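UNNEST requires tags to be an array column; a sketch of the Orders declaration it assumes (types are an assumption):
CREATE TABLE Orders (
    order_id BIGINT,
    tags ARRAY<STRING> -- each array element becomes one output row after UNNEST
) WITH (...);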
Table function joins: inner and left outer
SELECT order_id, res FROM Orders, LATERAL TABLE(table_func(order_id)) t(res)
SELECT order_id, res FROM Orders LEFT OUTER JOIN LATERAL TABLE(table_func(order_id)) t(res) ON TRUE
Set operations on two tables
create view t1(s) as values ('c'), ('a'), ('b'), ('b'), ('c');
create view t2(s) as values ('d'), ('e'), ('a'), ('b'), ('b');
(SELECT s FROM t1) UNION (SELECT s FROM t2);
(SELECT s FROM t1) UNION ALL (SELECT s FROM t2);
(SELECT s FROM t1) INTERSECT (SELECT s FROM t2); -- rows present in both inputs, duplicates removed
(SELECT s FROM t1) INTERSECT ALL (SELECT s FROM t2); -- rows present in both inputs, duplicates kept
(SELECT s FROM t1) EXCEPT (SELECT s FROM t2); -- similar to NOT IN, duplicates removed
(SELECT s FROM t1) EXCEPT ALL (SELECT s FROM t2); -- similar to NOT IN, duplicates kept
SELECT `user`, amount FROM Orders WHERE product IN (SELECT product FROM NewProducts); -- IN
SELECT `user`, amount FROM Orders WHERE EXISTS (SELECT * FROM NewProducts WHERE NewProducts.product = Orders.product); -- EXISTS
Top-N
SELECT product_id, category, product_name, sales FROM (
SELECT *,ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales DESC) AS row_num
FROM ShopSales
)WHERE row_num <= 5
Window Top-N
SELECT * FROM (
SELECT *, ROW_NUMBER() OVER (PARTITION BY window_start, window_end ORDER BY price DESC) as rownum
FROM (
SELECT window_start, window_end, supplier_id, SUM(price) as price, COUNT(*) as cnt
FROM TABLE(TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY window_start, window_end, supplier_id
)
) WHERE rownum <= 3;
Deduplication
SELECT order_id, user, product, num FROM (
SELECT *,ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY proctime ASC) AS row_num FROM Orders
) WHERE row_num = 1
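The query above keeps the first row per key; ordering the time attribute descending keeps the latest row instead (a sketch over the same table):
SELECT order_id, `user`, product, num FROM (
    SELECT *, ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY proctime DESC) AS row_num FROM Orders
) WHERE row_num = 1 -- DESC retains the most recent row per order_id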
Pattern recognition (the SQL version of CEP)
SELECT T.aid, T.bid, T.cid FROM MyTable MATCH_RECOGNIZE (
PARTITION BY userid ORDER BY proctime MEASURES A.id AS aid,
B.id AS bid,C.id AS cid PATTERN (A B C)
DEFINE A AS name = 'a', B AS name = 'b', C AS name = 'c'
) AS T
PARTITION BY - defines logical partitions of the table, similar to a GROUP BY operation.
ORDER BY - specifies how incoming rows are ordered; this is required, because patterns depend on order.
MEASURES - defines the output of the clause, similar to a SELECT clause.
ONE ROW PER MATCH - output mode: defines how many rows each match should produce.
AFTER MATCH SKIP - specifies where the next match starts; this is also how you control how many distinct matches a single event can belong to.
PATTERN - builds the pattern to search for, using a syntax similar to regular expressions.
DEFINE - the conditions that the pattern variables must satisfy.
Pattern matching
https://nightlies.apache.org/flink/flink-docs-release-1.13/zh/docs/dev/table/sql/queries/match_recognize/
demo
SELECT * FROM Ticker MATCH_RECOGNIZE (
PARTITION BY symbol ORDER BY rowtime MEASURES
START_ROW.rowtime AS start_tstamp,
LAST(PRICE_DOWN.rowtime) AS bottom_tstamp,
LAST(PRICE_UP.rowtime) AS end_tstamp,
AVG(PRICE_UP.price) AS avgPrice
ONE ROW PER MATCH
AFTER MATCH SKIP TO LAST PRICE_UP
PATTERN (START_ROW PRICE_DOWN+ PRICE_UP)
DEFINE PRICE_DOWN AS
(LAST(PRICE_DOWN.price, 1) IS NULL AND PRICE_DOWN.price < START_ROW.price) OR
PRICE_DOWN.price < LAST(PRICE_DOWN.price, 1),
PRICE_UP AS PRICE_UP.price > LAST(PRICE_DOWN.price, 1)
) MR;
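A sketch of the Ticker table this demo assumes, with columns inferred from the query and connector options elided:
CREATE TABLE Ticker (
    symbol STRING,
    price DECIMAL(10, 2),
    rowtime TIMESTAMP(3),
    WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND -- MATCH_RECOGNIZE requires a time attribute to ORDER BY
) WITH (...);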
LAST maps to a field of the event that is the n-th-from-last element mapped to the variable; FIRST maps to a field of the event that is the n-th element mapped to the variable.
SKIP PAST LAST ROW - resume pattern matching at the row after the last row of the current match.
SKIP TO NEXT ROW - continue searching for a new match starting at the row after the starting row of the match.
SKIP TO LAST variable - resume pattern matching at the last row mapped to the specified pattern variable.
SKIP TO FIRST variable - resume pattern matching at the first row mapped to the specified pattern variable.
Common statements
CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)
DROP TABLE IF EXISTS Orders;DROP DATABASE IF EXISTS DB;DROP [TEMPORARY] VIEW IF EXISTS;DROP FUNCTION IF EXISTS
ALTER TABLE Orders RENAME TO NewOrders;ALTER TABLE Orders SET (key1=val1, key2=val2);
ALTER DATABASE Orders SET (key1=val1, key2=val2);
INSERT INTO RubberOrders PARTITION (date='2019-8-30') SELECT product from data;
INSERT OVERWRITE RubberOrders PARTITION (date='2019-8-30') SELECT product from data;
DESCRIBE Orders;DESC Orders
EXPLAIN PLAN FOR select * from data;
SHOW CATALOGS;USE CATALOG;SHOW DATABASES;USE DB;USE MODULES hive;SHOW FULL MODULES;
SHOW CATALOGS;SHOW CURRENT CATALOG;SHOW DATABASES;SHOW CURRENT DATABASE;
SHOW TABLES;SHOW VIEWS;SHOW FUNCTIONS;SHOW MODULES;SHOW FULL MODULES;
LOAD MODULE hive WITH ('hive-version' = '3.1.2');SHOW MODULES;
UNLOAD MODULE core;SHOW MODULES;
SET table.planner = blink;SET;
RESET table.planner; RESET; -- RESET with no key restores all settings to their defaults
System functions
https://nightlies.apache.org/flink/flink-docs-release-1.13/zh/docs/dev/table/functions/systemfunctions/
SQL configuration page
https://nightlies.apache.org/flink/flink-docs-release-1.13/zh/docs/dev/table/config/
Execution configuration page (for code)
https://nightlies.apache.org/flink/flink-docs-release-1.13/zh/docs/dev/execution/execution_configuration/
Modules
SHOW MODULES;SHOW FULL MODULES;LOAD MODULE hive WITH ('hive-version' = '...');
USE MODULES hive,core;UNLOAD MODULE hive;
Result display modes
Materializes results in memory and displays them as a regular paginated table:
SET sql-client.execution.result-mode=table;
Does not materialize or visualize results; instead produces the result stream of the continuous query as insert (+) and retract (-) rows:
SET sql-client.execution.result-mode=changelog;
Closest to a traditional database: prints the results directly to the screen in tabular form:
SET sql-client.execution.result-mode=tableau;
Settable parameters
SET table.planner = blink;
SET execution.runtime-mode = streaming;
SET sql-client.execution.result-mode = table;
SET sql-client.execution.max-table-result.rows = 10000;
SET parallelism.default = 1;
SET pipeline.auto-watermark-interval = 200;
SET pipeline.max-parallelism = 10;
SET table.exec.state.ttl=1000;
SET table.optimizer.join-reorder-enabled = true;
SET table.exec.spill-compression.enabled = true;
SET table.exec.spill-compression.block-size = 128kb;
SET execution.savepoint.path=/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab;
SET pipeline.name = 'kafka-to-hive'; -- set a custom job name
Execute a set of SQL statements
BEGIN STATEMENT SET;
INSERT INTO|OVERWRITE <select_statement>;
END;
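A sketch of a statement set that fans one source out to two sinks in a single job; all table names are assumptions:
BEGIN STATEMENT SET;
INSERT INTO sink_a SELECT * FROM source_t; -- both INSERTs are optimized and submitted together as one job
INSERT INTO sink_b SELECT id, name FROM source_t;
END;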
Versioned table
CREATE TABLE product_changelog (product_id STRING,product_name STRING,product_price DECIMAL(10, 4),
update_time TIMESTAMP(3) METADATA FROM 'value.source.timestamp' VIRTUAL,
PRIMARY KEY(product_id) NOT ENFORCED, -- (1) define the primary key constraint
WATERMARK FOR update_time AS update_time -- (2) define event time via the watermark
) WITH ('connector' = 'kafka','topic' = 'products',
'scan.startup.mode' = 'earliest-offset',
'properties.bootstrap.servers' = 'localhost:9092',
'value.format' = 'debezium-json');
Versioned view
CREATE TABLE RatesHistory (currency_time TIMESTAMP(3), currency STRING, rate DECIMAL(38, 10),
WATERMARK FOR currency_time AS currency_time -- define event time
) WITH ('connector' = 'kafka', 'topic' = 'rates', 'scan.startup.mode' = 'earliest-offset',
'properties.bootstrap.servers' = 'localhost:9092', 'format' = 'json'); -- a regular append-only stream
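The heading promises a versioned view, but only the append-only source table is shown. Following the deduplication pattern from the Flink docs, the view itself looks like this (the view name is a sketch):
CREATE VIEW versioned_rates AS
SELECT currency, rate, currency_time FROM (
    SELECT *, ROW_NUMBER() OVER (PARTITION BY currency ORDER BY currency_time DESC) AS rownum
    FROM RatesHistory
) WHERE rownum = 1; -- Flink infers currency as the key and currency_time as the version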
Regular table
CREATE TABLE LatestRates (currency STRING,fam1 ROW<rate DOUBLE>)
WITH ('connector' = 'hbase-1.4','table-name' = 'rates','zookeeper.quorum' = 'localhost:2181');
CEP
// CEP searches for patterns of events within event streams; a Pattern is built from one or more simple events.
// Individual pattern: a single pattern definition. Combined pattern: many individual patterns chained together (strict or relaxed contiguity). Pattern group: a pattern sequence nested as the condition inside an individual pattern.
// Individual patterns are singletons (match once) or looping patterns (match several times); quantifiers control the loop count (greedy for greedy matching); conditions are given with .where / .or / .until.
// oneOrMore should be bounded with until; iterative conditions via .where((value, ctx) => ...) give access to context state.
// next(): the following event must match immediately; followedBy(): relaxed contiguity; followedByAny(): previously matched events may be reused.
// notNext(): forbid an event from directly following the previous one; notFollowedBy(): forbid an event from occurring between two other events.
// Every pattern sequence must start with begin() and cannot end with notFollowedBy(); NOT patterns cannot be optional; time constraints can also be attached (within).
import org.apache.flink.cep.scala.pattern.Pattern
import org.apache.flink.cep.scala.CEP
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.scala.createTypeInformation
import java.util
// https://nightlies.apache.org/flink/flink-docs-release-1.15/docs/libs/cep/
case class UserBehavior(userId: Long, itemId: Long, timestamp: Long)
val cepdata = sourceTestEnv.fromElements[UserBehavior](
  UserBehavior(1L, 1L, 1663828406000L),
  UserBehavior(2L, 2L, 1663828416000L),
  UserBehavior(3L, 3L, 1663828426000L))
val pattern1 = Pattern.begin[UserBehavior]("first")
  .next("second").followedBy("end")
CEP.pattern(cepdata, pattern1)
import org.apache.flink.cep.PatternSelectFunction
class CepSelectFunction extends PatternSelectFunction[UserBehavior, Long] { // handles out-of-order data too
  override def select(pattern: util.Map[String, util.List[UserBehavior]]): Long = {
    // the map keys are the pattern-variable names defined above
    val firstEvent = pattern.get("first").iterator().next()
    val secondEvent = pattern.get("second").iterator().next()
    1L
  }
}
val pattern2 = Pattern.begin[UserBehavior]("first") // .where(_.login == "false")
  .followedBy("second") // .where(_.login == "false")
  .within(Time.seconds(30)) // e.g. an order must be paid within the window after creation, otherwise it times out
import org.apache.flink.cep.PatternTimeoutFunction
class OrderTimeOutSelect extends PatternTimeoutFunction[UserBehavior, Long] {
  override def timeout(pattern: util.Map[String, util.List[UserBehavior]], timeoutTimestamp: Long): Long = {
    // timeoutTimestamp is the instant at which the partial match timed out
    val firstEvent = pattern.get("first").iterator().next()
    1L
  }
}
After years of moving bricks without ever grasping the craft, I turn to reading the source code in hopes of finding the true scriptures.