Hudi: Flink consumes Kafka and writes incremental data to Hudi in real time (Java)
Steps
The walkthrough has two parts: (I) integrate Flink SQL with Kafka in the SQL client and verify that topic data can be queried in real time, and (II) implement a Java program that consumes a Kafka topic and writes the incremental data into a Hudi table.
I. Flink SQL integration with Kafka

1. Create a topic named flink-topic (one partition, one replica); a programmatic sketch is shown below.
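For reference, here is a minimal, hypothetical sketch of creating the topic with the Kafka AdminClient. The class name CreateFlinkTopic is mine, not from the original post; a one-off kafka-topics.sh command achieves the same thing.

import java.util.Collections;
import java.util.Properties;
import org.apache.kafka.clients.admin.AdminClient;
import org.apache.kafka.clients.admin.AdminClientConfig;
import org.apache.kafka.clients.admin.NewTopic;

// Hypothetical helper, not part of the original post
public class CreateFlinkTopic {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        try (AdminClient admin = AdminClient.create(props)) {
            // flink-topic with 1 partition and replication factor 1, as in the step above
            admin.createTopics(Collections.singletonList(new NewTopic("flink-topic", 1, (short) 1)))
                 .all().get();
        }
    }
}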
2. Download flink-sql-connector-kafka_2.12-1.13.1.jar and place it under flink/lib.
3. Start the SQL client, specifying the connector jar:
./sql-client.sh embedded -j ../lib/flink-sql-connector-kafka_2.12-1.13.1.jar shell
Set the result display mode: set execution.result-mode=tableau;
4. Create a table mapped to the Kafka topic.
The data in the Kafka topic is in CSV format with three fields: user_id, item_id and behavior. When consuming from Kafka, start from the latest offset.
CREATE TABLE test_kafka(
  `user_id` BIGINT,
  `item_id` BIGINT,
  `behavior` STRING
) WITH(
  'connector' = 'kafka',
  'topic' = 'flink-topic',
  'properties.bootstrap.servers' = 'localhost:9092',
  'properties.group.id' = 'test-group-10001',
  'scan.startup.mode' = 'latest-offset',
  'format' = 'csv'
);

Flink SQL> select * from test_kafka;
+----+----------------------+----------------------+--------------------------------+
| op |              user_id |              item_id |                       behavior |
+----+----------------------+----------------------+--------------------------------+
5. Write data into Kafka
kafka-console-producer.sh --broker-list localhost:9092 --topic flink-topic
1001,90001,click
1001,90001,browser
1001,90001,click
1002,90002,click
1002,90003,click
1003,90001,order
1004,90001,order
MacBook-Pro:bin FengZhen$ kafka-console-producer.sh --broker-list localhost:9092 --topic flink-topic
>1001,90001,click
>1001,90001,browser
>1001,90001,click
>1002,90002,click
>1002,90003,click
>1003,90001,order
>1004,90001,order

The data can be queried and processed in real time:

Flink SQL> select * from test_kafka;
+----+----------------------+----------------------+--------------------------------+
| op |              user_id |              item_id |                       behavior |
+----+----------------------+----------------------+--------------------------------+
| +I |                 1001 |                90001 |                          click |
| +I |                 1001 |                90001 |                        browser |
| +I |                 1001 |                90001 |                          click |
| +I |                 1002 |                90002 |                          click |
| +I |                 1002 |                90003 |                          click |
| +I |                 1003 |                90001 |                          order |
| +I |                 1004 |                90001 |                          order |
II. Code implementation
package com.zhen.hudi;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

import static org.apache.flink.table.api.Expressions.$;

/**
 * @Author FengZhen
 * @Date 3/9/22 10:17 PM
 * @Description Based on the Flink SQL connectors: consume data from a Kafka topic in real time,
 * transform it, and continuously write it into a Hudi table.
 */
public class FlinkSQLHudiDemo {

    public static void main(String[] args) {
        // 1. Create the stream execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
        // Set parallelism to 1
        env.setParallelism(1);
        // TODO: Hudi commits data on checkpoints, so checkpointing must be enabled for incremental writes
        env.enableCheckpointing(5 * 1000);

        EnvironmentSettings settings = EnvironmentSettings
                .newInstance()
                .inStreamingMode() // streaming mode
                .build();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);

        // 2. Create the input table. TODO: consume data from Kafka
        tableEnv.executeSql(
                "CREATE TABLE order_kafka_source(\n" +
                        " `orderId` STRING,\n" +
                        " `userId` STRING,\n" +
                        " `orderTime` STRING,\n" +
                        " `ip` STRING,\n" +
                        " `orderMoney` DOUBLE,\n" +
                        " `orderStatus` INT\n" +
                        ")\n" +
                        "WITH(\n" +
                        " 'connector' = 'kafka',\n" +
                        " 'topic'='order-topic',\n" +
                        " 'properties.bootstrap.servers' = 'localhost:9092',\n" +
                        " 'properties.group.id' = 'gid-1001',\n" +
                        " 'scan.startup.mode' = 'latest-offset',\n" +
                        " 'format' = 'json',\n" +
                        " 'json.fail-on-missing-field' = 'false',\n" +
                        " 'json.ignore-parse-errors' = 'true'\n" +
                        ")\n"
        );

        // 3. Transform the data, either with SQL or with the Table API
        Table etlTable = tableEnv
                .from("order_kafka_source")
                // Add the field Hudi uses to merge records (precombine field): a timestamp
                // taken from the leading characters of orderId (yyyyMMddHHmmssSSS)
                .addColumns(
                        $("orderId").substring(0, 17).as("ts")
                )
                // Add the Hudi partition field, e.g. "orderTime": 2022-03-09 22:21:13.124 -> 2022-03-09
                .addColumns(
                        $("orderTime").substring(0, 10).as("partition_day")
                );
        tableEnv.createTemporaryView("view_order", etlTable);

        // 4. Create the output table. TODO: map it to the Hudi table, specifying the table name,
        //    storage path, field names and other options
        tableEnv.executeSql(
                "CREATE TABLE order_hudi_sink(\n" +
                        " `orderId` STRING PRIMARY KEY NOT ENFORCED,\n" +
                        " `userId` STRING,\n" +
                        " `orderTime` STRING,\n" +
                        " `ip` STRING,\n" +
                        " `orderMoney` DOUBLE,\n" +
                        " `orderStatus` INT,\n" +
                        " `ts` STRING,\n" +
                        " `partition_day` STRING\n" +
                        ")\n" +
                        "PARTITIONED BY (partition_day)\n" +
                        "WITH(\n" +
                        " 'connector' = 'hudi',\n" +
                        " 'path'='hdfs://localhost:9000/hudi-warehouse/flink_hudi_order',\n" +
                        " 'table.type' = 'MERGE_ON_READ',\n" +
                        " 'write.operation' = 'upsert',\n" +
                        " 'hoodie.datasource.write.recordkey.field' = 'orderId',\n" +
                        " 'write.precombine.field' = 'ts',\n" +
                        " 'write.tasks' = '1'\n" +
                        ")\n"
        );

        // 5. Write the data into the output table via a sub-query
        //    (executeSql submits the INSERT job; no explicit env.execute() is needed)
        tableEnv.executeSql(
                "INSERT INTO order_hudi_sink " +
                        "SELECT orderId, userId, orderTime, ip, orderMoney, orderStatus, ts, partition_day FROM view_order"
        );
    }

}
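Once the job has completed a few checkpoints, the results can be verified by reading the Hudi table back. The sketch below is a hypothetical companion class (FlinkSQLHudiReadDemo is not part of the original post) that registers the same path as a Hudi source with streaming read enabled and prints incoming rows; it assumes the Hudi Flink connector options 'read.streaming.enabled' and 'read.streaming.check-interval' available in the Hudi version used with Flink 1.13.

package com.zhen.hudi;

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

/**
 * Hypothetical verification sketch (not from the original post):
 * read the Hudi table written by FlinkSQLHudiDemo and print its rows.
 */
public class FlinkSQLHudiReadDemo {
    public static void main(String[] args) {
        TableEnvironment tableEnv = TableEnvironment.create(
                EnvironmentSettings.newInstance().inStreamingMode().build());

        // Register the same Hudi path as a source, with streaming read enabled
        tableEnv.executeSql(
                "CREATE TABLE order_hudi_source(\n" +
                        " `orderId` STRING PRIMARY KEY NOT ENFORCED,\n" +
                        " `userId` STRING,\n" +
                        " `orderTime` STRING,\n" +
                        " `ip` STRING,\n" +
                        " `orderMoney` DOUBLE,\n" +
                        " `orderStatus` INT,\n" +
                        " `ts` STRING,\n" +
                        " `partition_day` STRING\n" +
                        ")\n" +
                        "PARTITIONED BY (partition_day)\n" +
                        "WITH(\n" +
                        " 'connector' = 'hudi',\n" +
                        " 'path' = 'hdfs://localhost:9000/hudi-warehouse/flink_hudi_order',\n" +
                        " 'table.type' = 'MERGE_ON_READ',\n" +
                        " 'read.streaming.enabled' = 'true',\n" +
                        " 'read.streaming.check-interval' = '4'\n" +
                        ")"
        );

        // Continuously print rows as new commits land in the Hudi table
        tableEnv.executeSql("SELECT * FROM order_hudi_source").print();
    }
}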
Kafka data generator utility class (Scala)
package com.zhen.hudi.streaming

import java.util.Properties

import org.apache.commons.lang3.time.FastDateFormat
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.json4s.jackson.Json

import scala.util.Random

/**
 * Order entity (case class)
 *
 * @param orderId     order ID
 * @param userId      user ID
 * @param orderTime   order date and time
 * @param ip          IP address the order was placed from
 * @param orderMoney  order amount
 * @param orderStatus order status
 */
case class OrderRecord(
                        orderId: String,
                        userId: String,
                        orderTime: String,
                        ip: String,
                        orderMoney: Double,
                        orderStatus: Int
                      )

/**
 * @Author FengZhen
 * @Date 3/3/22 9:54 PM
 * @Description
 * Simulates order data and sends it to a Kafka topic.
 * Each message in the topic is a String containing JSON-formatted data.
 * Conversion: an OrderRecord instance is serialized to a JSON string (using the json4s library).
 */
object MockOrderProducer {

  def main(args: Array[String]): Unit = {

    var producer: KafkaProducer[String, String] = null
    try {
      // 1. Kafka producer configuration
      val props = new Properties()
      props.put("bootstrap.servers", "localhost:9092")
      props.put("acks", "1")
      props.put("retries", "3")
      props.put("key.serializer", classOf[StringSerializer].getName)
      props.put("value.serializer", classOf[StringSerializer].getName)

      // 2. Create the KafkaProducer with the configuration
      producer = new KafkaProducer[String, String](props)

      // Random number generator
      val random: Random = new Random()
      // Order status: 0 = open, 1 = cancelled, 2 = closed, 3 = completed
      val allStatus = Array(0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

      while (true) {
        // Number of orders generated per loop iteration
        val batchNumber: Int = random.nextInt(1) + 1
        (1 to batchNumber).foreach { number =>
          val currentTime: Long = System.currentTimeMillis()
          val orderId: String = s"${getDate(currentTime)}%06d".format(number)
          val userId: String = s"${1 + random.nextInt(5)}%08d".format(random.nextInt(1000))
          val orderTime: String = getDate(currentTime, format = "yyyy-MM-dd HH:mm:ss.SSS")
          val orderMoney: String = s"${5 + random.nextInt(500)}.%02d".format(random.nextInt(100))
          val orderStatus: Int = allStatus(random.nextInt(allStatus.length))

          // 3. Build the order record
          val orderRecord: OrderRecord = OrderRecord(
            orderId, userId, orderTime, getRandomIp, orderMoney.toDouble, orderStatus
          )
          // Convert to a JSON string
          val orderJson = new Json(org.json4s.DefaultFormats).write(orderRecord)
          println(orderJson)

          // 4. Build the ProducerRecord, keyed by orderId
          val record = new ProducerRecord[String, String]("order-topic", orderId, orderJson)
          // 5. Send the data to the topic
          producer.send(record)
        }
        Thread.sleep(random.nextInt(500) + 5000)
      }
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (null != producer) producer.close()
    }
  }

  /** ================= Get the given time as a formatted string ================= */
  def getDate(time: Long, format: String = "yyyyMMddHHmmssSSS"): String = {
    val fastFormat: FastDateFormat = FastDateFormat.getInstance(format)
    val formatDate: String = fastFormat.format(time) // format the date
    formatDate
  }

  /** ================= Get a random IP address ================= */
  def getRandomIp: String = {
    // IP ranges
    val range: Array[(Int, Int)] = Array(
      (607649792, 608174079), // 36.56.0.0-36.63.255.255
      (1038614528, 1039007743), // 61.232.0.0-61.237.255.255
      (1783627776, 1784676351), // 106.80.0.0-106.95.255.255
      (2035023872, 2035154943), // 121.76.0.0-121.77.255.255
      (2078801920, 2079064063), // 123.232.0.0-123.235.255.255
      (-1950089216, -1948778497), // 139.196.0.0-139.215.255.255
      (-1425539072, -1425014785), // 171.8.0.0-171.15.255.255
      (-1236271104, -1235419137), // 182.80.0.0-182.92.255.255
      (-770113536, -768606209), // 210.25.0.0-210.47.255.255
      (-569376768, -564133889) // 222.16.0.0-222.95.255.255
    )
    // Random index into the IP ranges
    val random = new Random()
    val index = random.nextInt(10)
    val ipNumber: Int = range(index)._1 + random.nextInt(range(index)._2 - range(index)._1)
    // Convert the Int IP address to IPv4 dotted notation
    number2IpString(ipNumber)
  }

  /** ================= Convert an Int IPv4 address to its string form ================= */
  def number2IpString(ip: Int): String = {
    val buffer: Array[Int] = new Array[Int](4)
    buffer(0) = (ip >> 24) & 0xff
    buffer(1) = (ip >> 16) & 0xff
    buffer(2) = (ip >> 8) & 0xff
    buffer(3) = ip & 0xff
    // Return the IPv4 address
    buffer.mkString(".")
  }
}
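Before starting the Flink job, it can help to confirm that the generator is actually producing JSON messages on order-topic. Below is a minimal, hypothetical check in Java (the class name OrderTopicCheck and the group id 'order-topic-check' are mine; kafka-console-consumer.sh works just as well):

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;

// Hypothetical verification helper, not part of the original post
public class OrderTopicCheck {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        props.put(ConsumerConfig.GROUP_ID_CONFIG, "order-topic-check");
        props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singletonList("order-topic"));
            // Poll once and print whatever JSON order records are already in the topic
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(5));
            records.forEach(r -> System.out.println(r.key() + " -> " + r.value()));
        }
    }
}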