Flink 1.13.3 watermark + window: getting started

1.1 POM file

 <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <flink.version>1.13.3</flink.version>
        <hadoop.version>2.9.2</hadoop.version>
        <scala.binary.version>2.11</scala.binary.version>
        <scala.version>2.11.12</scala.version>
        <slf4j.version>1.7.30</slf4j.version>
    </properties>

    <dependencies>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-hadoop-compatibility_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

1.2 Code implementation

package com.lew.timedemo;

import org.apache.flink.api.common.eventtime.*;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.util.Iterator;

/**
 * @Author gcwel
 * @Description
 * @Date 2021/11/2
 */
public class Demo1 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //set the interval at which watermarks are emitted automatically
        env.getConfig().setAutoWatermarkInterval(1000L);
        env.setParallelism(1);
        DataStreamSource<String> sourceDs = env.socketTextStream("gcw1", 10086);
        //parse input lines of the form "key,eventTimestamp" (e.g. 01,1635867066000)
        SingleOutputStreamOperator<Tuple2<String, Long>> mapDs = sourceDs.map(new MapFunction<String, Tuple2<String, Long>>() {
            @Override
            public Tuple2<String, Long> map(String value) throws Exception {
                String[] split = value.split(",");
                return Tuple2.of(split[0], Long.valueOf(split[1]));
            }
        });
        //assign event-time timestamps and generate watermarks periodically
        SingleOutputStreamOperator<Tuple2<String, Long>> watermarks = mapDs.assignTimestampsAndWatermarks(new WatermarkStrategy<Tuple2<String, Long>>() {
            @Override
            public WatermarkGenerator<Tuple2<String, Long>> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
                return new WatermarkGenerator<Tuple2<String, Long>>() {
                    private long maxTimeStamp = 0L;
                    private long maxOutOfOrderness = 3000L; //maximum allowed out-of-orderness (3 s)

                    @Override
                    public void onEvent(Tuple2<String, Long> event, long eventTimestamp, WatermarkOutput output) {
                        //called once for every incoming record; track the max event time seen
                        maxTimeStamp = Math.max(maxTimeStamp, event.f1);
                    }

                    @Override
                    public void onPeriodicEmit(WatermarkOutput output) {
                        //called every autoWatermarkInterval; emit max event time minus the allowed lateness
                        output.emitWatermark(new Watermark(maxTimeStamp - maxOutOfOrderness));
                    }
                };
            }
        }.withTimestampAssigner(((element, recordTimestamp) -> element.f1)));

        //count records per key in 4 s event-time tumbling windows
        watermarks.keyBy(x -> x.f0).window(TumblingEventTimeWindows.of(Time.seconds(4)))
                .apply(new WindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
                    @Override
                    public void apply(String s, TimeWindow window, Iterable<Tuple2<String, Long>> input, Collector<String> out) throws Exception {
                        Iterator<Tuple2<String, Long>> iterator = input.iterator();
                        int count = 0;
                        while (iterator.hasNext()) {
                            count++;
                            iterator.next();
                        }
                        out.collect(window.getStart() + "->" + window.getEnd() + " " + s + ":" + count);
                    }
                }).print();
        env.execute();
    }


}
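A side note on the window function: WindowFunction buffers every record of a window before apply runs, while a plain count needs only one accumulator. A minimal incremental alternative, sketched under the assumption that only the count is wanted (the class name CountAgg is illustrative, not from the original code):

import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.java.tuple.Tuple2;

//incremental count: keeps a single long per key/window instead of buffering records
public class CountAgg implements AggregateFunction<Tuple2<String, Long>, Long, Long> {
    @Override
    public Long createAccumulator() {
        return 0L;
    }

    @Override
    public Long add(Tuple2<String, Long> value, Long accumulator) {
        return accumulator + 1;
    }

    @Override
    public Long getResult(Long accumulator) {
        return accumulator;
    }

    @Override
    public Long merge(Long a, Long b) {
        return a + b;
    }
}

It would replace the apply(...) call with .aggregate(new CountAgg()); to keep the window start/end in the output as above, the two-argument aggregate(AggregateFunction, ProcessWindowFunction) overload combines both.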

1.3 Understanding the behavior

This example sets three things:

  • env.getConfig().setAutoWatermarkInterval(1000L): emit a watermark automatically every second (the runtime calls onPeriodicEmit at this interval)

  • TumblingEventTimeWindows.of(Time.seconds(4)): a 4-second event-time tumbling window

  • private long maxOutOfOrderness = 3000L: a maximum allowed out-of-orderness of 3 s (an equivalent built-in strategy is sketched right after this list)
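Together, the custom generator plus timestamp assigner reproduce what Flink 1.13 already ships as a bounded-out-of-orderness strategy. A sketch of the built-in equivalent, assuming the same 3 s bound (it additionally needs import java.time.Duration):

SingleOutputStreamOperator<Tuple2<String, Long>> watermarks = mapDs.assignTimestampsAndWatermarks(
        WatermarkStrategy
                .<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                .withTimestampAssigner((element, recordTimestamp) -> element.f1));

One small difference: the built-in generator emits maxTimestamp - 3000 - 1 and starts from a safe initial value, so it also sidesteps the underflow described in section 1.4.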

Test data (fed line by line through the socket source, e.g. with nc -lk 10086 on host gcw1):

01,1635867066000
01,1635867067000
01,1635867068000
01,1635867069000
01,1635867070000
01,1635867071000

When the last record, 01,1635867071000, is processed, the window [1635867064000, 1635867068000) fires; here it counts 2 records, since only 1635867066000 and 1635867067000 fall inside it. By default, later data belonging to that window is dropped, but custom handling is possible, as the sketch below shows.
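A minimal sketch of such custom handling, routing late records to a side output instead of silently dropping them (the tag name "late-data" and the 2 s allowedLateness value are illustrative choices, not from the original code):

import org.apache.flink.util.OutputTag;

OutputTag<Tuple2<String, Long>> lateTag = new OutputTag<Tuple2<String, Long>>("late-data") {};

SingleOutputStreamOperator<String> result = watermarks.keyBy(x -> x.f0)
        .window(TumblingEventTimeWindows.of(Time.seconds(4)))
        .allowedLateness(Time.seconds(2))    //keep fired windows open 2 s longer
        .sideOutputLateData(lateTag)         //anything later still ends up in the side output
        .apply(new WindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
            @Override
            public void apply(String s, TimeWindow window, Iterable<Tuple2<String, Long>> input, Collector<String> out) {
                int count = 0;
                for (Tuple2<String, Long> ignored : input) {
                    count++;
                }
                out.collect(window.getStart() + "->" + window.getEnd() + " " + s + ":" + count);
            }
        });
result.print();
result.getSideOutput(lateTag).print("late");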

The tumbling window divides each minute into 4-second slices, left-closed and right-open.

For example, the minute 2021-11-02 23:31 is divided into:

[2021-11-02 23:31:00 , 2021-11-02 23:31:04)
[2021-11-02 23:31:04 , 2021-11-02 23:31:08)
....
[2021-11-02 23:31:56 , 2021-11-02 23:32:00)

[1635867064000, 1635867068000) corresponds to [2021-11-02 23:31:04, 2021-11-02 23:31:08).
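The alignment does not depend on the first record; it can be checked with the formula Flink uses internally in TimeWindow.getWindowStartWithOffset (the offset is 0 for a plain tumbling window). A small standalone check for the first test record:

public class WindowStartCheck {
    public static void main(String[] args) {
        long timestamp = 1635867066000L;  //first test record
        long offset = 0L;
        long size = 4000L;                //4 s tumbling window
        long start = timestamp - (timestamp - offset + size) % size;
        System.out.println("[" + start + ", " + (start + size) + ")");
        //prints [1635867064000, 1635867068000)
    }
}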

Since the maximum allowed out-of-orderness is 3 s, firing the window [1635867064000, 1635867068000) requires a record with timestamp >= 1635867068000 + 3000 = 1635867071000: only then does the emitted watermark (maxTimeStamp - 3000) reach the window end.

1.4 A pitfall hit while writing the code

//initially written as Long.MIN_VALUE, which made new Watermark(maxTimeStamp - maxOutOfOrderness)
//wrap out of range; the failure is silent, which made it hard to track down
//private long maxTimeStamp = Long.MIN_VALUE;

//after troubleshooting, changed to
 private long maxTimeStamp = 0L;
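The wrap-around is easy to reproduce in plain Java, without Flink:

public class UnderflowCheck {
    public static void main(String[] args) {
        long maxTimeStamp = Long.MIN_VALUE;
        long maxOutOfOrderness = 3000L;
        //the subtraction wraps past Long.MIN_VALUE to a huge positive value,
        //i.e. a watermark near Long.MAX_VALUE that silently fires every window
        System.out.println(maxTimeStamp - maxOutOfOrderness);  //9223372036854772808
    }
}

This is also why Flink's built-in BoundedOutOfOrdernessWatermarks initializes its internal maximum to Long.MIN_VALUE + outOfOrdernessMillis + 1 rather than to Long.MIN_VALUE.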