WordCount with Flink's three APIs
Official documentation reference: https://ci.apache.org/projects/flink/flink-docs-release-1.10/#api-references
Importing the Maven dependencies
Note that if you write the program in Scala, the dependencies to import are different from the Java ones.
Maven Dependencies

You can add the following dependencies to your pom.xml to include Apache Flink in your project. These dependencies include a local execution environment and thus support local testing.

Scala API: To use the Scala API, replace the flink-java artifact id with flink-scala_2.11 and flink-streaming-java_2.11 with flink-streaming-scala_2.11.

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.11</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>1.8.2</version>
</dependency>
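For reference, after making the substitutions described above, the Scala dependency block would look roughly like this (a sketch assuming the same 1.8.2 version and Scala 2.11 builds):

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-scala_2.11</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>1.8.2</version>
</dependency>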
Batch WordCount example (DataSet API)
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public class WordCount {
    // Batch processing example
    public static void main(String[] args) throws Exception {
        String inputPath = "E:\\flink\\words.txt";
        String outputPath = "E:\\flink\\result";
        // Get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Read the input file
        DataSet<String> text = env.readTextFile(inputPath);
        DataSet<Tuple2<String, Integer>> counts =
                // split up the lines in pairs (2-tuples) containing: (word, 1)
                text.flatMap(new Tokenizer())
                        // group by the tuple field "0" and sum up tuple field "1"
                        .groupBy(0)  // group by the first tuple field
                        .sum(1);     // sum the second tuple field
        // setParallelism sets the parallelism, similar to Spark. Without it, the sink
        // runs multi-threaded and produces multiple output files.
        counts.writeAsCsv(outputPath, "\n", " ").setParallelism(1);
        env.execute("Batch WordCount Example");
    }

    // A user-defined function; instead of defining it here, it could also be
    // written inline in the flatMap() call above
    public static class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            // normalize and split the line
            String[] tokens = value.toLowerCase().split(",");
            for (String token : tokens) {
                if (token.length() > 0) {
                    // wrap into a Tuple2
                    out.collect(new Tuple2<String, Integer>(token, 1));
                }
            }
        }
    }
}
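As the comment notes, the Tokenizer can also be written inline. A minimal sketch of that variant with a Java 8 lambda, where 'text' is the DataSet<String> from the example above; the explicit returns(...) hint is needed because type erasure hides the Tuple2 type parameters from Flink:

// additional import needed:
// import org.apache.flink.api.common.typeinfo.Types;
DataSet<Tuple2<String, Integer>> counts = text
        .flatMap((String value, Collector<Tuple2<String, Integer>> out) -> {
            for (String token : value.toLowerCase().split(",")) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<>(token, 1));
                }
            }
        })
        // declare the output type explicitly, since the lambda's generics are erased
        .returns(Types.TUPLE(Types.STRING, Types.INT))
        .groupBy(0)
        .sum(1);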
Streaming WordCount example (DataStream API)
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

/**
 * Sliding-window computation.
 * Word data is simulated through a socket,
 * and Flink computes the counts over it.
 */
public class SocketWindowWordCount {
    public static void main(String[] args) throws Exception {
        // Get the socket port number
        int port;
        try {
            ParameterTool parameterTool = ParameterTool.fromArgs(args);
            port = parameterTool.getInt("port");
        } catch (Exception e) {
            System.out.println("No port set. Using default port 9000");
            port = 9000;
        }
        // Get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        String hostname = "master01.hadoop.mobile.cn";
        String delimiter = "\n";
        DataStreamSource<String> text = env.socketTextStream(hostname, port, delimiter);
        // As in Spark, use the flatMap operator;
        // the input is a String, the output is a custom WordWithCount object
        DataStream<WordWithCount> windowCounts = text.flatMap(new FlatMapFunction<String, WordWithCount>() {
            public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
                String[] splits = value.split(" ");
                for (String word : splits) {
                    out.collect(new WordWithCount(word, 1L));
                }
            }
        }).keyBy("word")
                // window size of 10 seconds, sliding every 5 seconds:
                // every 5 seconds, emit the counts over the previous 10 seconds
                .timeWindow(Time.seconds(10), Time.seconds(5))
                .sum("count");
        // Print the results to the console with parallelism 1
        windowCounts.print().setParallelism(1);
        System.out.println(System.currentTimeMillis());
        env.execute("Socket window count");
    }

    public static class WordWithCount {
        public String word;
        public long count;

        public WordWithCount() {}

        public WordWithCount(String word, long count) {
            this.word = word;
            this.count = count;
        }

        @Override
        public String toString() {
            return "WordWithCount{" +
                    "word='" + word + '\'' +
                    ", count=" + count +
                    '}';
        }
    }
}
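A side note on the argument parsing: instead of the try/catch, ParameterTool also has overloads that take a default value, so the lookup can be written more compactly. A small sketch; the --host parameter is a hypothetical addition, not part of the original example. To feed the socket during a local test, something like nc -lk 9000 on the target host works.

ParameterTool params = ParameterTool.fromArgs(args);
int port = params.getInt("port", 9000);                          // falls back to 9000 when --port is absent
String host = params.get("host", "master01.hadoop.mobile.cn");   // hypothetical --host parameter with a default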
About the keyBy operator:
/**
 * Partitions the operator state of a {@link DataStream} using field expressions.
 * A field expression is either the name of a public field or a getter method with parentheses
 * of the {@link DataStream}'s underlying type. A dot can be used to drill
 * down into objects, as in {@code "field1.getInnerField2()" }.
 *
 * @param fields
 *            One or more field expressions on which the state of the {@link DataStream} operators will be
 *            partitioned.
 * @return The {@link DataStream} with partitioned state (i.e. KeyedStream)
 *
 * keyBy is used for grouping. It takes varargs, so one or more key fields can be specified.
 * A key can be given directly by field name, but the field must be public; otherwise the
 * following exception is thrown:
 *   Exception in thread "main" org.apache.flink.api.common.InvalidProgramException: This type (GenericType<SocketWindowWordCount.WordWithCount>) cannot be used as key.
 *       at org.apache.flink.api.common.operators.Keys$ExpressionKeys.<init>(Keys.java:330)
 *       at org.apache.flink.streaming.api.datastream.DataStream.keyBy(DataStream.java:337)
 *       at SocketWindowWordCount.main(SocketWindowWordCount.java:41)
 * A key can also be obtained through a getter method.
 **/
public KeyedStream<T, Tuple> keyBy(String... fields) {
    return keyBy(new Keys.ExpressionKeys<>(fields, getType()));
}
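Besides field-expression strings, keyBy also accepts a KeySelector, which is type-safe and sidesteps the public-field requirement entirely. A minimal sketch that would replace .keyBy("word") in the streaming example above (requires import org.apache.flink.api.java.functions.KeySelector):

.keyBy(new KeySelector<WordWithCount, String>() {
    @Override
    public String getKey(WordWithCount value) {
        return value.word;  // extract the key in plain Java instead of a field-expression string
    }
})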
Flink Table & SQL example
package com.kong.flink;

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.BatchTableEnvironment;

import java.util.ArrayList;

public class FlinkSqlWordCount {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Create a TableEnvironment
        BatchTableEnvironment tableEnv = BatchTableEnvironment.create(env);
        // Wrap the words into objects
        String words = "hello,flink,hello,ksw";
        ArrayList<WordCount> list = new ArrayList<>();
        String[] split = words.split(",");
        for (String word : split) {
            list.add(new WordCount(word, 1L));
        }
        // Build a DataSet, similar to parallelizing a collection into an RDD in Spark
        DataSet<WordCount> inputDataSet = env.fromCollection(list);
        // Convert the DataSet into a Table:
        // * @param dataSet The {@link DataSet} to be converted.
        // * @param fields  The field names of the resulting {@link Table}.
        // The first argument is the DataSet to convert; the second is the Table's field names
        Table table = tableEnv.fromDataSet(inputDataSet, "word,frequency");
        table.printSchema();
        tableEnv.createTemporaryView("WordCount", table);
        // tableEnv.createTemporaryView("wordCount", inputDataSet, "word,count");
        Table table1 = tableEnv.sqlQuery(
                "select word as word, sum(frequency) as frequency from WordCount GROUP BY word");
        DataSet<WordCount> resultDataSet = tableEnv.toDataSet(table1, WordCount.class);
        resultDataSet.printToErr();
    }

    public static class WordCount {
        public String word;
        // The field cannot be named "count": that is a Flink SQL reserved keyword. See:
        // https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/table/sql/index.html#reserved-keywords
        public long frequency;

        // The no-arg constructor is required for a POJO type; without it you get:
        // org.apache.flink.table.api.ValidationException: Too many fields referenced from an atomic type.
        // See: https://ci.apache.org/projects/flink/flink-docs-release-1.10/zh/dev/api_concepts.html#pojo
        public WordCount() {
        }

        public WordCount(String word, long frequency) {
            this.word = word;
            this.frequency = frequency;
        }

        @Override
        public String toString() {
            return word + ", " + frequency;
        }
    }
}
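The same aggregation can also be expressed through the Table API's expression DSL instead of a SQL string. A minimal sketch, assuming the 'table' and 'tableEnv' variables from the example above and the string-expression flavor of the Java Table API in Flink 1.10:

// Table API equivalent of the SQL query above
Table aggregated = table
        .groupBy("word")                              // group by the word column
        .select("word, frequency.sum as frequency");  // per-word sum of the frequency column
DataSet<WordCount> result = tableEnv.toDataSet(aggregated, WordCount.class);
result.printToErr();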