1. Flink Basics

Flink word count programs

1. DataSet (batch) mode

The pom.xml file:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>deng.com</groupId>
    <artifactId>flink_demo</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>

    </dependencies>

</project>

The WordCount program:

package com.deng;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public class WordCount {
    public static void main(String[] args) throws Exception {
        // Create the batch execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(8);
        // Read the input data from a file
        String inputPath = "C:\\Users\\侠客云\\IdeaProjects\\flink_demo\\src\\main\\resources\\hello.txt";
        DataSet<String> inputDataSet = env.readTextFile(inputPath);
        // Split each line on spaces, emit (word, 1) tuples, then aggregate
        DataSet<Tuple2<String, Integer>> resultSets = inputDataSet.flatMap(new MyFlatMaper())
                .groupBy(0) // group by position 0, the word
                .sum(1);    // sum position 1, the count
        resultSets.print();
    }

    // Custom class implementing the FlatMapFunction interface
    public static class MyFlatMaper implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
            // Split the line on spaces
            String[] words = s.split(" ");
            // Wrap every word into a (word, 1) tuple
            for (String word : words) {
                collector.collect(new Tuple2<>(word, 1));
            }
        }
    }
}
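With a hypothetical hello.txt such as

hello world
hello flink

the job prints one (word, count) tuple per distinct word, for example (hello,2), (world,1) and (flink,1). Note that with parallelism 8 the order of the printed tuples is not deterministic.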

2. DataStream (streaming) mode

The StreamWordCount program:

package com.deng;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class StreamWordCount {
    public static void main(String[] args) throws Exception {
        // 1. Create the stream execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(8);

        // 2. Read the input data
        // From a file:
        // String inputPath = "C:\\Users\\侠客云\\IdeaProjects\\flink_demo\\src\\main\\resources\\hello.txt";
        // DataStream<String> inPutDataStream = env.readTextFile(inputPath);

        // Use ParameterTool to read the configuration from the program arguments
        ParameterTool parameterTool = ParameterTool.fromArgs(args);
        String host = parameterTool.get("host");
        int port = parameterTool.getInt("port");

        // Read the data from a socket text stream
        // DataStream<String> inPutDataStream = env.socketTextStream("hadoop102", 7777);
        DataStream<String> inPutDataStream = env.socketTextStream(host, port);

        DataStream<Tuple2<String, Integer>> resultStream = inPutDataStream.flatMap(new MyFlatMaper())
                .keyBy(0)
                .sum(1);
        resultStream.print();

        // Launch the job
        env.execute();
    }

    // Custom class implementing the FlatMapFunction interface
    public static class MyFlatMaper implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
            // Split the line on spaces
            String[] words = s.split(" ");
            // Wrap every word into a (word, 1) tuple
            for (String word : words) {
                collector.collect(new Tuple2<>(word, 1));
            }
        }
    }
}

How to run the program with its command-line arguments in a test environment:
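A minimal setup, assuming the hadoop102 host and port 7777 used in the comments above (jar name derived from the pom.xml coordinates):

# on hadoop102: open a socket text source first
nc -lk 7777

# submit the job with the host/port passed as program arguments
flink run -c com.deng.StreamWordCount flink_demo-1.0-SNAPSHOT.jar --host hadoop102 --port 7777

When running inside the IDE instead, put "--host hadoop102 --port 7777" into the run configuration's program arguments. Every line typed into the nc session is then split, counted and printed by the job.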

3. Reading data from a collection

package com.deng.sourceTest;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class SourceTestCollection {

    public static void main(String[] args) throws Exception {
        // Get the stream execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Set the parallelism
        env.setParallelism(4);
        // Build a stream directly from a few elements
        DataStream<Integer> integerDataStream = env.fromElements(1, 2, 34, 52, 16700);
        integerDataStream.print("int=>");

        env.execute("deng_flink_job");
    }
}
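fromElements builds the stream from individual values; for an existing Java collection there is the fromCollection variant. A minimal sketch using the same environment as above (additionally needs import java.util.Arrays;):

DataStream<Integer> listDataStream = env.fromCollection(Arrays.asList(1, 2, 34, 52, 16700));
listDataStream.print("list=>");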

4. Reading data from Kafka

Add the Kafka dependencies (the full pom.xml after the change is shown):

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>deng.com</groupId>
    <artifactId>flink_demo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.11.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.12</artifactId>
            <version>0.11.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.12</artifactId>
            <version>1.10.1</version>
        </dependency>


    </dependencies>

</project>
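Note: the _2.12 suffix of flink-connector-kafka-0.11_2.12 is the Scala version and must match the flink-streaming-java_2.12 dependency, while the 0.11 part targets the Kafka 0.11 client API, matching the kafka-clients 0.11.0.0 dependency above.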

The code:

package com.deng.sourceTest;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import org.apache.kafka.clients.consumer.ConsumerConfig;

import java.util.Properties;

public class SourceKafkaTest {
    public static void main(String[] args) throws Exception {
        // 1. Get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Optionally set the parallelism
        env.setParallelism(1);
        // 3. Kafka consumer configuration
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", "hadoop102:9092");
        prop.setProperty("group.id", "deng");
        // Enable auto commit, with a 1 s auto-commit interval
        prop.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
        prop.setProperty(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000");
        // Key/value deserializers
        prop.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        // Start from the earliest offset when there is no committed offset
        prop.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

        // 4. Consume the "first" topic as a stream of strings
        DataStreamSource<String> dataStream = env.addSource(new FlinkKafkaConsumer011<String>("first", new SimpleStringSchema(), prop));
        dataStream.print();
        env.execute("deng_flink_job");
    }
}
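To test the job, produce a few messages into the topic; assuming a Kafka 0.11 installation on hadoop102:

bin/kafka-console-producer.sh --broker-list hadoop102:9092 --topic first

Each line typed into the producer then shows up in the job's standard output.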

5. Custom SourceFunction

package com.deng.flink;

import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.HashMap;
import java.util.Random;

class SensorReading {
    private String key;
    private Double temp;
    private Long t;

    public SensorReading(String k, Long time, Double tp) {
        key =k;
        temp=tp;
        t=time;
    }

    @Override
    public String toString() {
        return "SensorReading{" +
                "key='" + key + '\'' +
                ", temp=" + temp +
                ", t=" + t +
                '}';
    }
}

public class SourceTestUDF {
    public static void main(String[] args) throws Exception {

        // 1. Get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Optionally set the parallelism
        env.setParallelism(1);
        DataStreamSource<SensorReading> sensorReadingDataStreamSource = env.addSource(new MySensorSource());
        sensorReadingDataStreamSource.print();
        env.execute();
    }

    // Custom SourceFunction
    public static class MySensorSource implements SourceFunction<SensorReading> {
        // Flag that keeps the source running until cancel() is called
        private boolean running = true;

        @Override
        public void run(SourceContext<SensorReading> ctx) throws Exception {
            // Random number generator
            Random random = new Random();
            // Initialize the temperatures of 10 sensors
            HashMap<String, Double> sensorTempMap = new HashMap<>();
            for (int i = 0; i < 10; ++i) {
                sensorTempMap.put("sensor_" + i, 60 + random.nextGaussian() * 20);
            }
            while (running) {
                for (String key : sensorTempMap.keySet()) {
                    // Random walk around the current temperature
                    Double nextTemp = sensorTempMap.get(key) + random.nextGaussian();
                    sensorTempMap.put(key, nextTemp);
                    ctx.collect(new SensorReading(key, System.currentTimeMillis(), nextTemp));
                }
                // Throttle the output rate
                Thread.sleep(1000L);
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }

}
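One caveat: as written, SensorReading is not a Flink POJO (it lacks a no-arg constructor and getters/setters), so Flink falls back to generic serialization. That is fine for printing, but keyed operations on its fields are easier with a proper POJO; a sketch of the additions (hypothetical, mirroring the fields above):

// no-arg constructor required for Flink POJO types
public SensorReading() {}

public String getKey() { return key; }
public void setKey(String key) { this.key = key; }
// ...analogous getters/setters for temp and t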

