利用Flink提供的接口产生实时数据源
package com.kong.flink; import org.apache.flink.streaming.api.functions.source.SourceFunction; import java.util.Arrays; import java.util.List; import java.util.Random; /** * 利用 Flink 提供的自定义 Source 功能来实现一个自定义的实时数据源 * 实现SourceFunction接口,重写run()和cancel() * 也可以生成自己定义的类数据,这里简单的随机获取word */ public class MyStreamingSource implements SourceFunction<String> { private volatile boolean isRunning = true; @Override public void run(SourceContext<String> sourceContext) throws Exception { List<String> list = Arrays.asList("flink", "ksw", "hbase", "spark"); while (isRunning) { int i = new Random().nextInt(4); sourceContext.collect(list.get(i)); //每秒产生一条数据 Thread.sleep(1000); } } @Override public void cancel() { isRunning = false; } }
流wordcount示例
package com.kong.flink; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.windowing.time.Time; /** * org.apache.flink.api.common.InvalidProgramException: Specifying keys via field positions is only valid for tuple data types * 通过下标指定数据,只支持tuple类型 */ public class FlinkStreamingDemo2 { public static void main(String[] args) throws Exception { //创建执行环境 StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); //使用我们定义的流数据 DataStreamSource<String> source = streamEnv.addSource(new MyStreamingSource2()).setParallelism(1); //map操作,转换一下数据格式,跟spark中map一样 SingleOutputStreamOperator<WordWithCount> windowCounts = source .map(word -> new WordWithCount(word, 1L)).keyBy("word") .timeWindow(Time.seconds(5), Time.seconds(3)).sum("count"); windowCounts.print().setParallelism(1); //定义一个job名字 streamEnv.execute("job1"); } public static class WordWithCount { public String word; public long count; public WordWithCount() { } public WordWithCount(String word, long count) { this.word = word; this.count = count; } @Override public String toString() { return "WordWithCount{" + "word='" + word + '\'' + ", count=" + count + '}'; } } }