WordCount with Flink's three APIs
Official documentation reference: https://ci.apache.org/projects/flink/flink-docs-release-1.10/#api-references
Importing the Maven dependencies
Note that if you write the program in Scala, the dependencies to import are different from the Java ones.
Maven Dependencies

You can add the following dependencies to your pom.xml to include Apache Flink in your project. These dependencies include a local execution environment and thus support local testing.

Scala API: To use the Scala API, replace the flink-java artifact id with flink-scala_2.11 and flink-streaming-java_2.11 with flink-streaming-scala_2.11.

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.11</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>1.8.2</version>
</dependency>
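For reference, after making the substitutions described above, the Scala dependency block would look roughly like this (a sketch assuming the same 1.8.2 version and Scala 2.11 builds):

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-scala_2.11</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>1.8.2</version>
</dependency>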
Batch WordCount example (DataSet API)
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public class WordCount {
    // Batch processing example
    public static void main(String[] args) throws Exception {
        String inputPath = "E:\\flink\\words.txt";
        String outputPath = "E:\\flink\\result";
        // Get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Read the input file
        DataSet<String> text = env.readTextFile(inputPath);
        DataSet<Tuple2<String, Integer>> counts =
                // split up the lines in pairs (2-tuples) containing: (word, 1)
                text.flatMap(new Tokenizer())
                        // group by the tuple field "0" and sum up tuple field "1"
                        .groupBy(0)  // group by the first tuple field
                        .sum(1);     // sum the second tuple field
        // setParallelism sets the parallelism, similar to Spark. Without it, the sink
        // runs multi-threaded and produces multiple output files.
        counts.writeAsCsv(outputPath, "\n", " ").setParallelism(1);
        env.execute("Batch WordCount Example");
    }

    // A user-defined function; instead of defining it here, it could also be
    // written inline in the flatMap() call above
    public static class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            // normalize and split the line
            String[] tokens = value.toLowerCase().split(",");
            for (String token : tokens) {
                if (token.length() > 0) {
                    // wrap into a Tuple2
                    out.collect(new Tuple2<String, Integer>(token, 1));
                }
            }
        }
    }
}
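As the comment notes, the Tokenizer can also be written inline. A minimal sketch of that variant with a Java 8 lambda, where 'text' is the DataSet<String> from the example above; the explicit returns(...) hint is needed because type erasure hides the Tuple2 type parameters from Flink:

// additional import needed:
// import org.apache.flink.api.common.typeinfo.Types;
DataSet<Tuple2<String, Integer>> counts = text
        .flatMap((String value, Collector<Tuple2<String, Integer>> out) -> {
            for (String token : value.toLowerCase().split(",")) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<>(token, 1));
                }
            }
        })
        // declare the output type explicitly, since the lambda's generics are erased
        .returns(Types.TUPLE(Types.STRING, Types.INT))
        .groupBy(0)
        .sum(1);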
Streaming WordCount example (DataStream API)
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

/**
 * Sliding-window computation.
 * Word data is simulated through a socket,
 * and Flink computes the counts over it.
 */
public class SocketWindowWordCount {
    public static void main(String[] args) throws Exception {
        // Get the socket port number
        int port;
        try {
            ParameterTool parameterTool = ParameterTool.fromArgs(args);
            port = parameterTool.getInt("port");
        } catch (Exception e) {
            System.out.println("No port set. Using default port 9000");
            port = 9000;
        }
        // Get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        String hostname = "master01.hadoop.mobile.cn";
        String delimiter = "\n";
        DataStreamSource<String> text = env.socketTextStream(hostname, port, delimiter);
        // As in Spark, use the flatMap operator;
        // the input is a String, the output is a custom WordWithCount object
        DataStream<WordWithCount> windowCounts = text.flatMap(new FlatMapFunction<String, WordWithCount>() {
            public void flatMap(String value, Collector<WordWithCount> out) throws Exception {
                String[] splits = value.split(" ");
                for (String word : splits) {
                    out.collect(new WordWithCount(word, 1L));
                }
            }
        }).keyBy("word")
                // window size of 10 seconds, sliding every 5 seconds:
                // every 5 seconds, emit the counts over the previous 10 seconds
                .timeWindow(Time.seconds(10), Time.seconds(5))
                .sum("count");
        // Print the results to the console with parallelism 1
        windowCounts.print().setParallelism(1);
        System.out.println(System.currentTimeMillis());
        env.execute("Socket window count");
    }

    public static class WordWithCount {
        public String word;
        public long count;

        public WordWithCount() {}

        public WordWithCount(String word, long count) {
            this.word = word;
            this.count = count;
        }

        @Override
        public String toString() {
            return "WordWithCount{" +
                    "word='" + word + '\'' +
                    ", count=" + count +
                    '}';
        }
    }
}
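A side note on the argument parsing: instead of the try/catch, ParameterTool also has overloads that take a default value, so the lookup can be written more compactly. A small sketch; the --host parameter is a hypothetical addition, not part of the original example. To feed the socket during a local test, something like nc -lk 9000 on the target host works.

ParameterTool params = ParameterTool.fromArgs(args);
int port = params.getInt("port", 9000);                          // falls back to 9000 when --port is absent
String host = params.get("host", "master01.hadoop.mobile.cn");   // hypothetical --host parameter with a default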
About the keyBy operator:
/**
 * Partitions the operator state of a {@link DataStream} using field expressions.
 * A field expression is either the name of a public field or a getter method with parentheses
 * of the {@link DataStream}'s underlying type. A dot can be used to drill
 * down into objects, as in {@code "field1.getInnerField2()" }.
 *
 * @param fields
 *            One or more field expressions on which the state of the {@link DataStream} operators will be
 *            partitioned.
 * @return The {@link DataStream} with partitioned state (i.e. KeyedStream)
 *
 * keyBy is used for grouping. It takes varargs, so one or more key fields can be specified.
 * A key can be given directly by field name, but the field must be public; otherwise the
 * following exception is thrown:
 *   Exception in thread "main" org.apache.flink.api.common.InvalidProgramException: This type (GenericType<SocketWindowWordCount.WordWithCount>) cannot be used as key.
 *       at org.apache.flink.api.common.operators.Keys$ExpressionKeys.<init>(Keys.java:330)
 *       at org.apache.flink.streaming.api.datastream.DataStream.keyBy(DataStream.java:337)
 *       at SocketWindowWordCount.main(SocketWindowWordCount.java:41)
 * A key can also be obtained through a getter method.
 **/
public KeyedStream<T, Tuple> keyBy(String... fields) {
    return keyBy(new Keys.ExpressionKeys<>(fields, getType()));
}
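Besides field-expression strings, keyBy also accepts a KeySelector, which is type-safe and sidesteps the public-field requirement entirely. A minimal sketch that would replace .keyBy("word") in the streaming example above (requires import org.apache.flink.api.java.functions.KeySelector):

.keyBy(new KeySelector<WordWithCount, String>() {
    @Override
    public String getKey(WordWithCount value) {
        return value.word;  // extract the key in plain Java instead of a field-expression string
    }
})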
Flink Table & SQL example
package com.kong.flink;

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.BatchTableEnvironment;

import java.util.ArrayList;

public class FlinkSqlWordCount {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Create a TableEnvironment
        BatchTableEnvironment tableEnv = BatchTableEnvironment.create(env);
        // Wrap the words into objects
        String words = "hello,flink,hello,ksw";
        ArrayList<WordCount> list = new ArrayList<>();
        String[] split = words.split(",");
        for (String word : split) {
            list.add(new WordCount(word, 1L));
        }
        // Build a DataSet, similar to parallelizing a collection into an RDD in Spark
        DataSet<WordCount> inputDataSet = env.fromCollection(list);
        // Convert the DataSet into a Table:
        // * @param dataSet The {@link DataSet} to be converted.
        // * @param fields  The field names of the resulting {@link Table}.
        // The first argument is the DataSet to convert; the second is the Table's field names
        Table table = tableEnv.fromDataSet(inputDataSet, "word,frequency");
        table.printSchema();
        tableEnv.createTemporaryView("WordCount", table);
        // tableEnv.createTemporaryView("wordCount", inputDataSet, "word,count");
        Table table1 = tableEnv.sqlQuery(
                "select word as word, sum(frequency) as frequency from WordCount GROUP BY word");
        DataSet<WordCount> resultDataSet = tableEnv.toDataSet(table1, WordCount.class);
        resultDataSet.printToErr();
    }

    public static class WordCount {
        public String word;
        // The field cannot be named "count": that is a Flink SQL reserved keyword. See:
        // https://ci.apache.org/projects/flink/flink-docs-release-1.10/dev/table/sql/index.html#reserved-keywords
        public long frequency;

        // The no-arg constructor is required for a POJO type; without it you get:
        // org.apache.flink.table.api.ValidationException: Too many fields referenced from an atomic type.
        // See: https://ci.apache.org/projects/flink/flink-docs-release-1.10/zh/dev/api_concepts.html#pojo
        public WordCount() {
        }

        public WordCount(String word, long frequency) {
            this.word = word;
            this.frequency = frequency;
        }

        @Override
        public String toString() {
            return word + ", " + frequency;
        }
    }
}
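The same aggregation can also be expressed through the Table API's expression DSL instead of a SQL string. A minimal sketch, assuming the 'table' and 'tableEnv' variables from the example above and the string-expression flavor of the Java Table API in Flink 1.10:

// Table API equivalent of the SQL query above
Table aggregated = table
        .groupBy("word")                              // group by the word column
        .select("word, frequency.sum as frequency");  // per-word sum of the frequency column
DataSet<WordCount> result = tableEnv.toDataSet(aggregated, WordCount.class);
result.printToErr();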