Three Ways for Flink to Consume Kafka Data
Kafka is a distributed message queue: a high-throughput, easily scalable messaging system. The way a message queue transports data matches the stream-processing model exactly, so Kafka and Flink are a natural pair, the twin stars of today's stream processing. In modern real-time applications, an architecture where Kafka collects and transports the data and Flink does the analysis and computation has become the first choice for many companies.
Slightly regrettably, connecting to Kafka is fairly involved, and Flink's core API does not ship a ready-made source for it; on our own we would have to fall back on the generic addSource approach and implement a SourceFunction. Fortunately Kafka and Flink really do fit together well, so the Flink project provides the connector flink-connector-kafka, which already implements a consumer, FlinkKafkaConsumer: the SourceFunction used to read data from Kafka. This post walks through several common ways for Flink to consume Kafka data.
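To use FlinkKafkaConsumer, the connector first has to be on the job's classpath. A minimal sketch of the Maven dependency, assuming a Flink 1.13.x build with the Scala 2.12 suffix (pick the version and suffix that match your own cluster):

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.12</artifactId>
    <version>1.13.2</version>
</dependency>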
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.config.StartupMode;

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

public class MyKafkaSource0824 {

    /**
     * Way 1: default mode.
     *
     * @param env   execution environment
     * @param topic Kafka topic
     * @return a FlinkKafkaConsumer for the given topic
     */
    public static FlinkKafkaConsumer<String> createKafkaConsumer(StreamExecutionEnvironment env, String topic) {
        // Consumer configuration
        Properties prop = new Properties();
        // Kafka broker address (bootstrap servers)
        prop.setProperty("bootstrap.servers", "hadoop103:9092");
        // Consumer group
        prop.setProperty("group.id", "consumer-group");
        prop.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty("auto.offset.reset", "latest");
        FlinkKafkaConsumer<String> stringFlinkKafkaConsumer =
                new FlinkKafkaConsumer<>(topic, new SimpleStringSchema(), prop);
        // Commit offsets back to Kafka when checkpoints complete
        stringFlinkKafkaConsumer.setCommitOffsetsOnCheckpoints(true);
        return stringFlinkKafkaConsumer;
    }

    /**
     * Way 2: explicitly set the position to start consuming from.
     *
     * @param env   execution environment
     * @param topic Kafka topic
     * @param sm    startup mode (EARLIEST / LATEST)
     * @return a FlinkKafkaConsumer that starts from the given position
     */
    public static FlinkKafkaConsumer<String> createKafkaConsumerSerDe(StreamExecutionEnvironment env, String topic, StartupMode sm) {
        // Consumer configuration
        Properties prop = new Properties();
        // Kafka broker address (bootstrap servers)
        prop.setProperty("bootstrap.servers", "hadoop103:9092");
        // Consumer group
        prop.setProperty("group.id", "consumer-group");
        prop.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty("auto.offset.reset", "latest");
        FlinkKafkaConsumer<String> stringFlinkKafkaConsumer =
                new FlinkKafkaConsumer<>(topic, new SimpleStringSchema(), prop);
        // Set the Kafka start position
        if (sm.equals(StartupMode.EARLIEST)) {
            stringFlinkKafkaConsumer.setStartFromEarliest();
        } else if (sm.equals(StartupMode.LATEST)) {
            stringFlinkKafkaConsumer.setStartFromLatest();
        }
        // Commit offsets back to Kafka when checkpoints complete
        stringFlinkKafkaConsumer.setCommitOffsetsOnCheckpoints(true);
        return stringFlinkKafkaConsumer;
    }

    /**
     * Way 3: consume multiple topics with one consumer.
     *
     * @param env execution environment
     * @return a FlinkKafkaConsumer that reads from several topics
     */
    public static FlinkKafkaConsumer<String> multiTopicKafkaConsumerSerDe(StreamExecutionEnvironment env) {
        // Consumer configuration
        Properties prop = new Properties();
        // Kafka broker address (bootstrap servers)
        prop.setProperty("bootstrap.servers", "hadoop103:9092");
        // Consumer group
        prop.setProperty("group.id", "consumer-group");
        prop.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty("auto.offset.reset", "latest");
        // List of topics to subscribe to
        List<String> topics = Arrays.asList("lhc", "tbg");
        FlinkKafkaConsumer<String> flinkKafkaConsumer =
                new FlinkKafkaConsumer<>(topics, new SimpleStringSchema(), prop);
        // Commit offsets back to Kafka when checkpoints complete
        flinkKafkaConsumer.setCommitOffsetsOnCheckpoints(true);
        return flinkKafkaConsumer;
    }

    public static void main(String[] args) throws Exception {
        // Get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Parallelism 1 for easy testing; in production set it to the number of Kafka partitions
        env.setParallelism(1);
        // Kafka topic
        String topic = "lhc";
        // Way 1: basic mode
        //FlinkKafkaConsumer<String> kafkaConsumer = MyKafkaSource0824.createKafkaConsumer(env, topic);
        // Way 2: custom start position
        //FlinkKafkaConsumer<String> kafkaConsumerSerDe = MyKafkaSource0824.createKafkaConsumerSerDe(env, topic, StartupMode.LATEST);
        // Way 3: multiple topics
        FlinkKafkaConsumer<String> kafkaConsumerSerDe = MyKafkaSource0824.multiTopicKafkaConsumerSerDe(env);
        env.addSource(kafkaConsumerSerDe).print();
        env.execute();
    }
}
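One detail worth calling out: setCommitOffsetsOnCheckpoints(true) only has an effect when checkpointing is enabled on the environment. With checkpointing on, the consumer commits the offsets stored in its checkpointed state back to Kafka when a checkpoint completes and ignores the periodic auto-commit settings; with checkpointing off, it falls back to the Kafka client's enable.auto.commit / auto.commit.interval.ms behavior. A minimal sketch for the main method above (the 5-second interval is only an illustrative value):

// Enable checkpointing so the consumer commits offsets to Kafka on checkpoint completion
env.enableCheckpointing(5000);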