1. Flink Basics
Flink word count programs
1. DataSet (batch) mode
The pom.xml file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>deng.com</groupId>
    <artifactId>flink_demo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
    </dependencies>
</project>
The WordCount program
package com.deng;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public class WordCount {
    public static void main(String[] args) throws Exception {
        // Create the batch execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(8);

        // Read data from a file
        String inputPath = "C:\\Users\\侠客云\\IdeaProjects\\flink_demo\\src\\main\\resources\\hello.txt";
        DataSet<String> inputDataSet = env.readTextFile(inputPath);

        // Process the data set: split each line on spaces and turn it into (word, 1) tuples
        DataSet<Tuple2<String, Integer>> resultSets = inputDataSet.flatMap(new MyFlatMaper())
                .groupBy(0) // group by the first field (the word)
                .sum(1);    // sum the second field (the count)
        resultSets.print();
    }

    // Custom class implementing the FlatMapFunction interface
    public static class MyFlatMaper implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
            // Split the line on spaces
            String[] words = s.split(" ");
            // Wrap each word into a (word, 1) tuple
            for (String word : words) {
                collector.collect(new Tuple2<>(word, 1));
            }
        }
    }
}
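As a hypothetical example of what to expect: if hello.txt contained the two lines "hello flink" and "hello world", the job would print tuples such as (hello,2), (flink,1) and (world,1). Because the result is grouped by word, the order in which the groups are printed is not guaranteed.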
2. DataStream (streaming) mode
The StreamWordCount program
package com.deng;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
public class StreamWordCount {
public static void main(String[] args) throws Exception {
// 1. Create the stream execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(8);
// 2. Read the data
// Reading from a file (kept here for reference, commented out):
// String inputPath="C:\\Users\\侠客云\\IdeaProjects\\flink_demo\\src\\main\\resources\\hello.txt";
// DataStream<String> inPutDataStream = env.readTextFile(inputPath);
// Use ParameterTool to read config options from the program arguments
ParameterTool parameterTool = ParameterTool.fromArgs(args);
String host = parameterTool.get("host");
int port = parameterTool.getInt("port");
// Read data from a socket text stream
// DataStream<String> inPutDataStream =env.socketTextStream("hadoop102",7777);
DataStream<String> inPutDataStream =env.socketTextStream(host,port);
DataStream<Tuple2<String, Integer>> resultStream = inPutDataStream.flatMap(new MyFlatMaper())
.keyBy(0)
.sum(1);
resultStream.print();
// Execute the job
env.execute();
}
// Custom class implementing the FlatMapFunction interface
public static class MyFlatMaper implements FlatMapFunction<String, Tuple2<String,Integer>> {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
// Split the line on spaces
String[] words=s.split(" ");
// Wrap each word into a (word, 1) tuple
for (String word : words) {
collector.collect(new Tuple2<>(word,1));
}
}
}
}
How to run the program with arguments in a test environment:
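One common way (assuming an IDE such as IDEA) is to set the Program arguments in the run configuration to --host hadoop102 --port 7777; ParameterTool.fromArgs parses such --key value pairs, and the host/port values here simply reuse the ones from the commented-out socketTextStream line above. Then start a socket on that host, for example with nc -lk 7777, and type some space-separated words. Each input line is split, counted and printed; with parallelism 8 the printed lines are prefixed by the index of the subtask that produced them.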
3. Reading data from a collection
package com.deng.sourceTest;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class SourceTestCollection {
    public static void main(String[] args) throws Exception {
        // Get the stream execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Set the parallelism
        env.setParallelism(4);

        DataStream<Integer> integerDataStream = env.fromElements(1, 2, 34, 52, 16700);
        integerDataStream.print("int=》");

        env.execute("deng_flink_job");
    }
}
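The example above uses env.fromElements, which takes the elements directly as varargs. Flink also provides env.fromCollection, which accepts an existing java.util.Collection. Below is a minimal sketch of that variant; the class name SourceTestFromCollection and the job name are made up for illustration:

package com.deng.sourceTest;

import java.util.Arrays;
import java.util.List;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class SourceTestFromCollection {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Build an ordinary Java collection and use it as a bounded source
        List<Integer> data = Arrays.asList(1, 2, 34, 52, 16700);
        DataStream<Integer> stream = env.fromCollection(data);
        stream.print("collection=>");

        env.execute("deng_flink_collection_job");
    }
}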
4. Reading data from Kafka
Add the dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>deng.com</groupId>
    <artifactId>flink_demo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.11.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.12</artifactId>
            <version>0.11.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>
    </dependencies>
</project>
Related code:
package com.deng.sourceTest;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import org.apache.kafka.clients.consumer.ConsumerConfig;

import java.util.Properties;

public class SourceKafkaTest {
    public static void main(String[] args) throws Exception {
        // 1. Get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Optionally set the parallelism
        env.setParallelism(1);

        // Kafka configuration
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", "hadoop102:9092");
        prop.setProperty("group.id", "deng");
        // Enable auto commit
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true);
        // Auto commit interval: 1s
        prop.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, 1000);
        // Key and value deserializers
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

        DataStreamSource<String> dataStream = env.addSource(
                new FlinkKafkaConsumer011<String>("first", new SimpleStringSchema(), prop));
        dataStream.print();

        env.execute("deng_flink_job");
    }
}
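With this configuration the job consumes the topic first from the broker at hadoop102:9092, and SimpleStringSchema turns each record value into a plain string. Assuming a Kafka 0.11 installation on that host, one way to test is to send a few lines with the console producer, for example bin/kafka-console-producer.sh --broker-list hadoop102:9092 --topic first, and watch them appear in the job's output.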
5. Custom SourceFunction
package com.deng.flink;

import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.HashMap;
import java.util.Random;

class SensorReading {
    private String key;
    private Double temp;
    private Long t;

    public SensorReading(String k, Long time, Double tp) {
        key = k;
        temp = tp;
        t = time;
    }

    @Override
    public String toString() {
        return "SensorReading{" +
                "key='" + key + '\'' +
                ", temp=" + temp +
                ", t=" + t +
                '}';
    }
}

public class SourceTestUDF {
    public static void main(String[] args) throws Exception {
        // 1. Get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Optionally set the parallelism
        env.setParallelism(1);

        DataStreamSource<SensorReading> sensorReadingDataStreamSource = env.addSource(new MySensorSource());
        sensorReadingDataStreamSource.print();

        env.execute();
    }

    // Custom SourceFunction
    public static class MySensorSource implements SourceFunction<SensorReading> {
        // Flag controlling whether the source keeps running
        private boolean running = true;

        @Override
        public void run(SourceContext<SensorReading> ctx) throws Exception {
            // Random number generator
            Random random = new Random();
            // Initial temperatures for 10 sensors
            HashMap<String, Double> sensorTempMap = new HashMap<>();
            for (int i = 0; i < 10; ++i) {
                sensorTempMap.put("sensor_" + i, 60 + random.nextGaussian() * 20);
            }
            while (running) {
                for (String key : sensorTempMap.keySet()) {
                    // Random fluctuation around the current temperature
                    Double nextTemp = sensorTempMap.get(key) + random.nextGaussian();
                    sensorTempMap.put(key, nextTemp);
                    ctx.collect(new SensorReading(key, System.currentTimeMillis(), nextTemp));
                }
                // Control the emission rate
                Thread.sleep(1000L);
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }
}
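A plain SourceFunction like MySensorSource always runs with parallelism 1. If the generator should run in several parallel instances, one option is to implement ParallelSourceFunction instead. The sketch below is a hypothetical variant (not part of the original program) that could sit next to MySensorSource inside SourceTestUDF; each parallel subtask would then emit its own readings:

// Hypothetical sketch: a parallel variant of the sensor source
public static class MyParallelSensorSource
        implements org.apache.flink.streaming.api.functions.source.ParallelSourceFunction<SensorReading> {
    private volatile boolean running = true;

    @Override
    public void run(SourceContext<SensorReading> ctx) throws Exception {
        java.util.Random random = new java.util.Random();
        while (running) {
            // Each parallel instance picks a random sensor and emits one reading per second
            ctx.collect(new SensorReading("sensor_" + random.nextInt(10),
                    System.currentTimeMillis(), 60 + random.nextGaussian() * 20));
            Thread.sleep(1000L);
        }
    }

    @Override
    public void cancel() {
        running = false;
    }
}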
If you have any questions, feel free to add WeChat 18179641802 to discuss.