Implementing WordCount with Flink
pom.xml
<properties>
    <flink.version>1.13.0</flink.version>
    <java.version>1.8</java.version>
    <scala.binary.version>2.12</scala.binary.version>
    <slf4j.version>1.7.30</slf4j.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>${slf4j.version}</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>${slf4j.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-to-slf4j</artifactId>
        <version>2.14.0</version>
    </dependency>
</dependencies>
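Of these dependencies, flink-runtime-web is only needed if you want Flink's web UI while running jobs locally from the IDE. A minimal sketch of creating such an environment (the class name and the port choice are assumptions for illustration; 8081 is Flink's usual web UI port):

import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.RestOptions;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class LocalWebUIEnv {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumed port choice; any free port works
        conf.setString(RestOptions.BIND_PORT, "8081");
        // Local environment that also serves the web UI at http://localhost:8081
        StreamExecutionEnvironment env =
                StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        // ... define and execute a job as in the examples below ...
    }
}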
Flink batch WordCount
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

/**
 * Batch WordCount
 */
public class BatchWordCount {
    public static void main(String[] args) throws Exception {
        // 1. Create the batch execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // 2. Read the input file line by line
        DataSource<String> lineDS = env.readTextFile("input/word.txt");
        // 3. Split each line into words and wrap each word as a (word, 1) tuple
        FlatMapOperator<String, Tuple2<String, Long>> wordOne = lineDS
                .flatMap((String line, Collector<Tuple2<String, Long>> out) -> {
                    String[] words = line.split(" ");
                    for (String word : words) {
                        out.collect(Tuple2.of(word, 1L));
                    }
                })
                // Java lambdas erase generic types, so declare the result type explicitly
                .returns(Types.TUPLE(Types.STRING, Types.LONG));
        // 4. Group by the first tuple field (the word)
        UnsortedGrouping<Tuple2<String, Long>> groupedData = wordOne.groupBy(0);
        // 5. Sum the second tuple field (the count)
        AggregateOperator<Tuple2<String, Long>> sum = groupedData.sum(1);
        // 6. Print the result
        sum.print();
    }
}
Prepare some sample data, then run the program to see the result.
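For example, with a hypothetical input/word.txt containing:

hello world
hello flink

the job prints each word with its total count, along these lines (the ordering of lines may differ):

(world,1)
(hello,2)
(flink,1)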
Flink bounded-stream WordCount
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Bounded-stream WordCount
 */
public class BoundedStreamWordCount {
    public static void main(String[] args) throws Exception {
        // Create the stream execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Read the input file as a (bounded) stream
        DataStreamSource<String> lineDS = env.readTextFile("input/word.txt");
        // Split each line into (word, 1) tuples
        SingleOutputStreamOperator<Tuple2<String, Long>> wordOne = lineDS
                .flatMap((String line, Collector<Tuple2<String, Long>> out) -> {
                    String[] words = line.split(" ");
                    for (String word : words) {
                        out.collect(Tuple2.of(word, 1L));
                    }
                })
                .returns(Types.TUPLE(Types.STRING, Types.LONG));
        // Key the stream by the word (the first tuple field)
        KeyedStream<Tuple2<String, Long>, String> dataGroup = wordOne.keyBy(data -> data.f0);
        // Sum the counts and print
        dataGroup.sum(1).print();
        // Submit and start the job
        env.execute();
    }
}
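Running this against the same word.txt shows the key difference from the batch job: print() emits an updated running total every time a record arrives, and each output line is prefixed with the index of the parallel subtask that produced it. With the hypothetical two-line file above, the output looks roughly like the following (the subtask numbers depend on your parallelism and on how each key hashes, but all records of the same word go to the same subtask):

2> (world,1)
3> (hello,1)
3> (hello,2)
5> (flink,1)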
Flink unbounded-stream WordCount
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Unbounded-stream WordCount.
 * Run org.wdh01.wc.StreamWordCount with program arguments:
 * --host hadoop103 --port 9999
 */
public class StreamWordCount {
    public static void main(String[] args) throws Exception {
        // Read host & port from the program arguments
        ParameterTool parameterTool = ParameterTool.fromArgs(args);
        String host = parameterTool.get("host");
        int port = parameterTool.getInt("port");

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Read an unbounded stream of lines from a socket
        DataStreamSource<String> socketTextStream = env.socketTextStream(host, port);
        socketTextStream
                .flatMap((String line, Collector<Tuple2<String, Long>> out) -> {
                    for (String word : line.split(" ")) {
                        out.collect(Tuple2.of(word, 1L));
                    }
                })
                .returns(Types.TUPLE(Types.STRING, Types.LONG))
                .keyBy(data -> data.f0)
                .sum(1)
                .print();
        env.execute();
    }
}
Note: the nc service must be running before you launch the application; otherwise the program fails immediately with a connection error.
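With the argument values used above, first start netcat in listen mode on hadoop103:

nc -lk 9999

then launch StreamWordCount. Every line typed into the nc session is split and counted as it arrives (-l makes nc listen as a server, -k keeps the port open across connections).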