Flink的高级应用:Watermark理论
Time/Watermark
时间分类
EventTime的重要性和Watermark的引入
代码演示-开发版-掌握
https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/event_timestamps_watermarks.html
package com.pzb.watermaker;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import java.time.Duration;
import java.util.Random;
import java.util.UUID;
/**
 * Demonstrates event-time window computation combined with watermarks, which
 * tolerates a bounded amount of out-of-order / late-arriving data.
 *
 * <p>The job generates one random {@link Order} per second whose event time lags
 * the wall clock by 0-4 seconds, assigns watermarks with a 3 second
 * out-of-orderness bound, and sums {@code money} per user over 5 second
 * tumbling event-time windows.
 */
public class WatermakerDemo01 {

    /**
     * Builds and runs the streaming job.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if job execution fails
     */
    public static void main(String[] args) throws Exception {
        // Step 0: environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        // Step 1: source
        DataStreamSource<Order> orderDS = env.addSource(new SourceFunction<Order>() {
            // volatile: cancel() is invoked from a different thread than the one
            // executing run(), so the write must be visible to the emitting loop.
            private volatile boolean flag = true;

            @Override
            public void run(SourceContext<Order> ctx) throws Exception {
                Random random = new Random();
                while (flag) {
                    String orderId = UUID.randomUUID().toString();
                    int userId = random.nextInt(2);
                    int money = random.nextInt(101);
                    // Simulate a random delay of 0-4 seconds between event time and arrival.
                    long eventTime = System.currentTimeMillis() - random.nextInt(5) * 1000;
                    // Emit the record downstream.
                    ctx.collect(new Order(orderId, userId, money, eventTime));
                    Thread.sleep(1000);
                }
            }

            @Override
            public void cancel() {
                flag = false;
            }
        });

        // Step 2: transformation
        // Legacy API:
        /*DataStream<Order> watermakerDS = orderDS.assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<Order>(Time.seconds(3)) {// max allowed lateness / out-of-orderness
                    @Override
                    public long extractTimestamp(Order element) {
                        return element.eventTime;
                        // Tells Flink which field is the event time; Flink then computes:
                        // watermark = current max event time - max allowed lateness / out-of-orderness
                    }
                });*/

        // NOTE: the code below uses the new API introduced in Flink 1.12.
        // Requirement: every 5s, compute each user's total order amount over the last 5s,
        // using event-time windows plus watermarks.
        //env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);// EventTime is already the default in the new version
        // By default a watermark is generated every 200 ms.
        // watermark = current max event time - max allowed lateness / out-of-orderness
        SingleOutputStreamOperator<Order> orderDSWithWatermark = orderDS.assignTimestampsAndWatermarks(
                // maxOutOfOrderness: the maximum tolerated out-of-orderness / lateness
                WatermarkStrategy.<Order>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                        // Select the field that carries the event time.
                        .withTimestampAssigner((order, timestamp) -> order.getEventTime())
        );

        SingleOutputStreamOperator<Order> result = orderDSWithWatermark.keyBy(Order::getUserId)
                // Any event-time based window (such as TumblingEventTimeWindows) needs watermarks.
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                .sum("money");

        // Step 3: sink
        result.print();

        // Step 4: execute
        env.execute();
    }

    /** Order record; accessors and constructors are generated by Lombok. */
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public static class Order {
        private String orderId;
        private Integer userId;
        private Integer money;
        /** Event time as epoch milliseconds. */
        private Long eventTime;
    }
}
代码验证
package com.pzb.windows;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.commons.lang.time.FastDateFormat;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.eventtime.*;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.UUID;
/**
 * Demonstrates event-time window computation combined with watermarks, which
 * tolerates a bounded amount of out-of-order / late-arriving data.
 *
 * <p>This verification variant uses a hand-written {@link WatermarkGenerator}
 * that prints every emitted watermark, and a {@link WindowFunction} that prints
 * the event times of the records contained in each fired window, so the
 * relationship between watermarks and window firing can be observed.
 */
public class WatermakerDemo01 {

    /**
     * Builds and runs the streaming job.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if job execution fails
     */
    public static void main(String[] args) throws Exception {
        FastDateFormat df = FastDateFormat.getInstance("HH:mm:ss");

        // Step 0: environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        // Step 1: source
        DataStreamSource<Order> orderDS = env.addSource(new SourceFunction<Order>() {
            // volatile: cancel() is invoked from a different thread than the one
            // executing run(), so the write must be visible to the emitting loop.
            private volatile boolean flag = true;

            @Override
            public void run(SourceContext<Order> ctx) throws Exception {
                Random random = new Random();
                while (flag) {
                    String orderId = UUID.randomUUID().toString();
                    int userId = random.nextInt(2);
                    int money = random.nextInt(101);
                    // Simulate a random delay of 0-4 seconds between event time and arrival.
                    long eventTime = System.currentTimeMillis() - random.nextInt(5) * 1000;
                    ctx.collect(new Order(orderId, userId, money, eventTime));
                    Thread.sleep(1000);
                }
            }

            @Override
            public void cancel() {
                flag = false;
            }
        });

        // Step 2: transformation
        /*DataStream<Order> watermakerDS = orderDS
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy.<Order>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                                .withTimestampAssigner((event, timestamp) -> event.getEventTime())
                );*/
        // In real development, simply use the built-in strategy above.
        // For learning and testing, the strategy can be implemented by hand as below.
        DataStream<Order> watermakerDS = orderDS
                .assignTimestampsAndWatermarks(
                        new WatermarkStrategy<Order>() {
                            @Override
                            public WatermarkGenerator<Order> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
                                return new WatermarkGenerator<Order>() {
                                    // Key and event time of the most recent record; used only for the debug print.
                                    private int userId = 0;
                                    private long eventTime = 0L;
                                    // Maximum tolerated lateness / out-of-orderness, in milliseconds.
                                    private final long outOfOrdernessMillis = 3000;
                                    // Largest event time seen so far. The initial value is chosen so that
                                    // (maxTimestamp - outOfOrdernessMillis - 1) equals Long.MIN_VALUE
                                    // instead of underflowing.
                                    private long maxTimestamp = Long.MIN_VALUE + outOfOrdernessMillis + 1;

                                    /** Called once for every incoming record. */
                                    @Override
                                    public void onEvent(Order event, long eventTimestamp, WatermarkOutput output) {
                                        userId = event.userId;
                                        eventTime = event.eventTime;
                                        maxTimestamp = Math.max(maxTimestamp, eventTimestamp);
                                    }

                                    /**
                                     * Emits the watermark. Called every 200 ms by default; the interval can be
                                     * changed, e.g. env.getConfig().setAutoWatermarkInterval(5000L) for 5000 ms.
                                     */
                                    @Override
                                    public void onPeriodicEmit(WatermarkOutput output) {
                                        // watermark = current max event time - max allowed lateness / out-of-orderness
                                        Watermark watermark = new Watermark(maxTimestamp - outOfOrdernessMillis - 1);
                                        System.out.println("key:" + userId + ",系统时间:" + df.format(System.currentTimeMillis()) + ",事件时间:" + df.format(eventTime) + ",水印时间:" + df.format(watermark.getTimestamp()));
                                        output.emitWatermark(watermark);
                                    }
                                };
                            }
                        }.withTimestampAssigner((event, timestamp) -> event.getEventTime())
                );

        // At this point the stream carries watermarks, so window computation can follow.
        // Requirement: every 5s, compute each user's total order amount within 5s
        // (a time-based tumbling window).
        /* DataStream<Order> result = watermakerDS
                .keyBy(Order::getUserId)
                //.timeWindow(Time.seconds(5), Time.seconds(5))
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                .sum("money");*/
        // In real development, use the code above for the business computation.
        // For learning and testing, the code below prints more detail: the window bounds and
        // the event times of the records in each window at the moment the window fires.
        DataStream<String> result = watermakerDS
                .keyBy(Order::getUserId)
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                // Apply the function to the data in each window.
                // WindowFunction<IN, OUT, KEY, W extends Window>:
                // input type, output type, key type, window type.
                .apply(new WindowFunction<Order, String, Integer, TimeWindow>() {
                    /** Invoked when a window fires for the given key. */
                    @Override
                    public void apply(Integer key, TimeWindow window, Iterable<Order> input, Collector<String> out) throws Exception {
                        // Collect the formatted event times of all records that belong to this window.
                        List<String> eventTimeList = new ArrayList<>();
                        for (Order order : input) {
                            Long eventTime = order.eventTime;
                            eventTimeList.add(df.format(eventTime));
                        }
                        String outStr = String.format("key:%s,窗口开始结束:[%s~%s),属于该窗口的事件时间:%s",
                                key.toString(), df.format(window.getStart()), df.format(window.getEnd()), eventTimeList);
                        out.collect(outStr);
                    }
                });

        // Step 3: sink
        result.print();

        // Step 4: execute
        env.execute();
    }

    /** Order record; accessors and constructors are generated by Lombok. */
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public static class Order {
        private String orderId;
        private Integer userId;
        private Integer money;
        /** Event time as epoch milliseconds. */
        private Long eventTime;
    }
}