FlinkSql指定时间语义
FlinkSql指定时间语义
FlinkSql在建表时指定时间语义,根据建表方式和时间语义的不同进行记录
1.从DataStream流建表+process time语义
因为是process time所以不需要指定watermark的延迟生成时间,故可以直接在创建table对象时最后一列增加一个字段即可
- 举例
public static void main(String[] args) throws Exception {
    // Streaming environment; parallelism 1 keeps the printed output in a stable order.
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    // Read raw sensor lines and parse them as "id,timestamp,temperature" records.
    DataStreamSource<String> source = env.readTextFile("D:\\workspace21\\myflink\\src\\main\\resources\\sensors.txt");
    DataStream<SensorReading> readings = source.map(line -> {
        String[] fields = line.split(",");
        return new SensorReading(fields[0], Double.valueOf(fields[2]), Long.valueOf(fields[1]));
    });

    StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
    // "pt.proctime" appends an extra processing-time column; the name "pt" is arbitrary.
    // No watermark setup is needed because processing time never lags.
    Table table = tableEnv.fromDataStream(readings, "id,temperature as temp,timestamp,pt.proctime");
    table.printSchema();
    tableEnv.toAppendStream(table, Row.class).print("api");
    env.execute();
}
- 此时打印表的
Schema
可以看到表最后增加了一列
root
|-- id: STRING
|-- temp: DOUBLE
|-- timestamp: BIGINT
|-- pt: TIMESTAMP(3) *PROCTIME*
2.使用connect+format+schema建表+process time语义
- 举例
public static void main(String[] args) throws Exception {
    // A StreamExecutionEnvironment is required for any streaming job.
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Quick-create uses the default planner; which planner is default depends on the Flink version.
    StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

    // 2. Connect to the file source and register it as a table.
    String filePath = "D:\\workspace21\\myflink\\src\\main\\resources\\sensors.txt";
    tableEnv.connect(new FileSystem().path(filePath))
            // withFormat tells Flink how to parse each incoming record, e.g. CSV split on ','.
            .withFormat(new Csv())
            // withSchema declares the table columns; they must follow the parsed field order.
            .withSchema(new Schema()
                    .field("id", DataTypes.STRING())
                    .field("time", DataTypes.BIGINT())
                    .field("temp", DataTypes.DOUBLE())
                    // Append one extra column intended as processing time.
                    .field("pt", DataTypes.TIMESTAMP(3)
                            // NOTE(review): older Flink versions have no Schema#proctime(),
                            // so the proctime marker stays commented out here.
                            //.processTime()
                    )
            )
            .createTemporaryTable("inputTable");

    Table inputTable = tableEnv.from("inputTable");
    inputTable.printSchema();
    // BUG FIX: the converted stream was discarded (no sink attached), so the job
    // produced no observable output. Print it so the rows actually appear.
    tableEnv.toAppendStream(inputTable, Row.class).print();
    env.execute();
}
3.使用DDL方式建表+process time语义
// Process-time semantics via DDL: the computed column "pt AS PROCTIME()"
// appends a processing-time attribute to the table, equivalent to pt.proctime
// in the Table API. The 'connector.*'/'format.*' keys are the legacy
// (pre-1.11) descriptor-style WITH options.
String sinkDDL =
"create table dataTable (" +
" id varchar(20) not null, " +
" ts bigint, " +
" temperature double, " +
" pt AS PROCTIME() " +
") with (" +
" 'connector.type' = 'filesystem', " +
" 'connector.path' = '/sensor.txt', " +
" 'format.type' = 'csv')";
// sqlUpdate executes the DDL (deprecated in newer Flink versions in favor of executeSql).
tableEnv.sqlUpdate(sinkDDL);
4.从DataStream流建表+event time语义
事件时间语义 和watermark在生成table之前就定义了,建表时使用.rowtime
- 举例
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//指定时间语义
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
DataStreamSource<String> dataStreamSource = env.readTextFile("D:\\workspace21\\myflink\\src\\main\\resources\\sensors.txt");
DataStream<SensorReading> mapDataStream = dataStreamSource.map(el -> {
String[] split = el.split(",");
return new SensorReading(split[0], Double.valueOf(split[2]), Long.valueOf(split[1]));
}).assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<SensorReading>(Time.seconds(3)) {
@Override
public long extractTimestamp(SensorReading element) {
return element.getTimestamp() * 1000L;
}
});
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
//event time 使用 rt.rowtime 声明 这个时候rt已经不是原来的timestamp的LONG类型的时间戳了 而是TIMESTAMP(3)
`Table tableApi = tableEnv.fromDataStream(mapDataStream, "id,temperature as tp,rt.rowtime,timestamp as ts");`
tableApi.printSchema();
tableEnv.toAppendStream(tableApi, Row.class).print("api");
env.execute();
}
schema
如下:
root
|-- id: STRING
|-- tp: DOUBLE
|-- rt: TIMESTAMP(3) *ROWTIME*
|-- timestamp: BIGINT
- 数据如下:
api> sensor_1,37.9,2021-01-31 11:35:07.0,1612092907
api> sensor_2,50.1,2021-01-31 11:34:15.0,1612092855
api> sensor_3,23.7,2021-01-31 11:34:58.0,1612092898
api> sensor_4,15.3,2021-01-31 11:35:17.0,1612092917
5.使用connect+format+schema建表+eventtime语义
- 举例
public static void main(String[] args) throws Exception {
    // A StreamExecutionEnvironment is required for any streaming job.
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    // Quick-create uses the default planner; which planner is default depends on the Flink version.
    StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

    // 2. Connect to the file source and register it as a table.
    String filePath = "D:\\workspace21\\myflink\\src\\main\\resources\\sensors.txt";
    tableEnv.connect(new FileSystem().path(filePath))
            // withFormat tells Flink how to parse each incoming record, e.g. CSV split on ','.
            .withFormat(new Csv())
            // withSchema declares the table columns; they must follow the parsed field order.
            .withSchema(new Schema()
                    .field("id", DataTypes.STRING())
                    .field("ts", DataTypes.BIGINT())
                    .field("temp", DataTypes.DOUBLE())
                    // Append an event-time column; unlike proctime, the watermark
                    // extraction and generation strategy must be declared.
                    .rowtime(new Rowtime()
                            .timestampsFromField("ts") // extract the event timestamp from the ts field
                            .watermarksPeriodicBounded(1000) // watermark delayed by 1 second
                    )
            )
            .createTemporaryTable("inputTable");

    Table inputTable = tableEnv.from("inputTable");
    inputTable.printSchema();
    // BUG FIX: the converted stream was discarded (no sink attached), so the job
    // produced no observable output. Print it so the rows actually appear.
    tableEnv.toAppendStream(inputTable, Row.class).print();
    env.execute();
}
6.使用DDL方式建表+event time语义
- 举例
// Event-time semantics via DDL: "rt" is computed from the BIGINT seconds field
// ts (FROM_UNIXTIME -> string, TO_TIMESTAMP -> TIMESTAMP), and the WATERMARK
// clause declares rt as the rowtime attribute with a 1-second bounded delay.
String sinkDDL=
"create table dataTable (" +
" id varchar(20) not null, " +
" ts bigint, " +
" temperature double, " +
" rt AS TO_TIMESTAMP( FROM_UNIXTIME(ts) ), " +
" watermark for rt as rt - interval '1' second" +
") with (" +
" 'connector.type' = 'filesystem', " +
" 'connector.path' = '/sensor.txt', " +
" 'format.type' = 'csv')";
// sqlUpdate executes the DDL (deprecated in newer Flink versions in favor of executeSql).
tableEnv.sqlUpdate(sinkDDL);