Flink CDC采集MySQL binlog日志实时写入ClickHouse
一、DataStream API
这里以通过Flink CDC采集MySQL的binlog日志实时写入ClickHouse中:
package com.shydow; import com.alibaba.fastjson.JSONObject; import com.bangdao.bigdata.schema.CustomDebeziumDeserializationSchema; import com.bangdao.bigdata.sink.CustomClickHouseSink; import com.bangdao.bigdata.utils.GetStreamExecutionEnvironment; import com.ververica.cdc.connectors.mysql.MySqlSource; import com.ververica.cdc.connectors.mysql.table.StartupOptions; import com.ververica.cdc.debezium.DebeziumSourceFunction; import com.ververica.cdc.debezium.StringDebeziumDeserializationSchema; import org.apache.commons.lang3.StringUtils; import org.apache.flink.api.java.utils.ParameterTool; import org.apache.flink.contrib.streaming.state.RocksDBStateBackend; import org.apache.flink.shaded.guava18.com.google.common.collect.Lists; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction; import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows; import org.apache.flink.streaming.api.windowing.time.Time; import org.apache.flink.streaming.api.windowing.windows.TimeWindow; import org.apache.flink.util.Collector; import org.apache.flink.util.OutputTag; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.Properties; /** * @author shydow * @date 2021-08-18 */ public class CdcLauncher { public static void main(String[] args) throws Exception { ParameterTool parameterTool = ParameterTool.fromArgs(args); String s = parameterTool.get("propertiesPath"); Properties properties = new Properties(); try { InputStream resourceAsStream = null; if (StringUtils.isEmpty(s)) { resourceAsStream = CdcLauncher.class.getClassLoader().getResourceAsStream("CdcLauncher.properties"); } else { resourceAsStream = new FileInputStream(new File(s)); } properties.load(resourceAsStream); } catch (IOException e) { throw new IOException(e); } StreamExecutionEnvironment env = GetStreamExecutionEnvironment.getEnv(); env.setStateBackend(new RocksDBStateBackend("file:///tmp/ckp")); DebeziumSourceFunction<JSONObject> sourceFunction = MySqlSource.<JSONObject>builder() .hostname(properties.getProperty("HOSTNAME")) .port(Integer.parseInt(properties.getProperty("PORT"))) .databaseList(properties.getProperty("DATABASE_LIST")) .tableList(properties.getProperty("TABLE_LIST").split(",")) .username(properties.getProperty("USERNAME")) .password(properties.getProperty("PASSWORD")) .startupOptions(StartupOptions.initial()) .deserializer(new CustomDebeziumDeserializationSchema()) .build(); DataStreamSource<JSONObject> source = env.addSource(sourceFunction); // 按照库和表进行分流 OutputTag<JSONObject> t1 = new OutputTag<JSONObject>("output-tag-one") { }; OutputTag<JSONObject> t2 = new OutputTag<JSONObject>("output-tag-two") { }; SingleOutputStreamOperator<Object> processStream = source.process(new ProcessFunction<JSONObject, Object>() { @Override public void processElement(JSONObject jsonObject, Context context, Collector<Object> collector) throws Exception { if ("cdc_test_1".equals(jsonObject.getString("cdc_mysql_table"))) { context.output(t1, jsonObject); } else if ("cdc_test".equals(jsonObject.getString("cdc_mysql_table"))) { context.output(t2, jsonObject); } } }); DataStream<JSONObject> sourceOne = processStream.getSideOutput(t1); DataStream<JSONObject> sourceTwo = processStream.getSideOutput(t2); // sink ClickHouse sourceOne .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(10))) .apply(new AllWindowFunction<JSONObject, List<JSONObject>, TimeWindow>() { @Override public void apply(TimeWindow timeWindow, Iterable<JSONObject> iterable, Collector<List<JSONObject>> collector) throws Exception { ArrayList<JSONObject> list = Lists.newArrayList(iterable); collector.collect(list); } }) .addSink(new CustomClickHouseSink(properties, "ods.cdc_test_1")).setParallelism(1); sourceTwo .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(10))) .apply(new AllWindowFunction<JSONObject, List<JSONObject>, TimeWindow>() { @Override public void apply(TimeWindow timeWindow, Iterable<JSONObject> iterable, Collector<List<JSONObject>> collector) throws Exception { ArrayList<JSONObject> list = Lists.newArrayList(iterable); collector.collect(list); } }) .addSink(new CustomClickHouseSink(properties, "ods.cdc_test")).setParallelism(1); env.execute(); } }
package com.shydow.schema; import com.alibaba.fastjson.JSONObject; import com.ververica.cdc.debezium.DebeziumDeserializationSchema; import org.apache.flink.api.common.typeinfo.BasicTypeInfo; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.util.Collector; import org.apache.kafka.connect.data.Field; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.source.SourceRecord; import java.util.List; /** * @author shydow * @date 2021-08-18 */ public class CustomDebeziumDeserializationSchema implements DebeziumDeserializationSchema<JSONObject> { @Override public void deserialize(SourceRecord record, Collector<JSONObject> out) throws Exception { Struct dataRecord = (Struct) record.value(); Struct afterStruct = dataRecord.getStruct("after"); Struct beforeStruct = dataRecord.getStruct("before"); /* todo 1,同时存在 beforeStruct 跟 afterStruct数据的话,就代表是update的数据 2,只存在 beforeStruct 就是delete数据 3,只存在 afterStruct数据 就是insert数据 */ JSONObject mainJson = new JSONObject(); JSONObject dataJson = new JSONObject(); JSONObject pkJson = new JSONObject(); String canal_type = ""; List<Field> fieldsList = null; if (afterStruct != null && beforeStruct != null) { canal_type = "update"; fieldsList = afterStruct.schema().fields(); //todo 字段与值 for (Field field : fieldsList) { String fieldName = field.name(); Object fieldValue = afterStruct.get(fieldName); dataJson.put(fieldName, fieldValue); } } else if (afterStruct != null) { canal_type = "insert"; fieldsList = afterStruct.schema().fields(); for (Field field : fieldsList) { String fieldName = field.name(); Object fieldValue = afterStruct.get(fieldName); dataJson.put(fieldName, fieldValue); } } else if (beforeStruct != null) { canal_type = "detele"; fieldsList = beforeStruct.schema().fields(); for (Field field : fieldsList) { String fieldName = field.name(); Object fieldValue = beforeStruct.get(fieldName); dataJson.put(fieldName, fieldValue); } } //todo 拿到databases table信息 Struct source = dataRecord.getStruct("source"); Object db = source.get("db"); Object table = source.get("table"); Object ts_ms = source.get("ts_ms"); mainJson.put("cdc_mysql_database", db); mainJson.put("cdc_mysql_table", table); mainJson.put("cdc_canal_ts", ts_ms); mainJson.put("cdc_canal_type", canal_type); //todo 拿到topic String topic = record.topic(); //todo 主键字段 Struct pk = (Struct) record.key(); List<Field> pkFieldList = pk.schema().fields(); int partitionerNum = 0; for (Field field : pkFieldList) { Object pkValue = pk.get(field.name()); pkJson.put(field.name(), pkValue); partitionerNum += pkValue.hashCode(); } int hash = Math.abs(partitionerNum) % 3; mainJson.put("cdc_pk_hashcode", hash); mainJson.put("cdc_pk", pkJson); mainJson.put("cdc_data", dataJson); out.collect(mainJson); } @Override public TypeInformation<JSONObject> getProducedType() { return BasicTypeInfo.of(JSONObject.class); } }
package com.shydow.sink; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONObject; import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.Statement; import java.util.ArrayList; import java.util.List; import java.util.Properties; import java.util.stream.Collectors; /** * @author shydow * @date 2021-08-19 */ public class CustomClickHouseSink extends RichSinkFunction<List<JSONObject>> { private Properties properties; private String tableName; private Connection conn; private PreparedStatement preparedStatement; public CustomClickHouseSink(Properties properties, String tableName) { this.properties = properties; this.tableName = tableName; } @Override public void open(Configuration parameters) throws Exception { super.open(parameters); // 创建数据库语句 String createDBStr = "create database if not exists " + tableName.split("\\.")[0]; // 创建表语句,这里使用ClickHouse中自带的Nested Structures格式,下游自己解析 String createTableStr = "create table if not exists " + tableName + " (record Nested(k String, v String)," + "create_time DateTime DEFAULT now()) ENGINE = MergeTree PARTITION BY toDate(create_time) ORDER BY tuple()"; // 插入语句 String insertStr = "insert into " + tableName + "(record.k, record.v) values(?, ?)"; Class.forName("ru.yandex.clickhouse.ClickHouseDriver"); conn = DriverManager.getConnection(properties.getProperty("CLICKHOUSE_URL"), properties.getProperty("CLICKHOUSE_USER"), ""); Statement statement = conn.createStatement(); // 执行创建数据库 statement.execute(createDBStr); // 执行创建表 statement.execute(createTableStr); preparedStatement = conn.prepareStatement(insertStr); } @Override public void invoke(List<JSONObject> values, Context context) throws Exception { if (!values.isEmpty()) { for (JSONObject value : values) { // 取出data部分,时间戳,操作类型 String cdcData = value.getString("cdc_data"); // json字符串 String ts = value.getString("cdc_canal_ts"); String canalType = value.getString("cdc_canal_type"); JSONObject parseObject = JSON.parseObject(cdcData); parseObject.put("canal_time", ts); parseObject.put("canal_type", canalType); // 开始插入 List<String> keys = new ArrayList<>(parseObject.keySet()); List<String> v = parseObject.values().stream().map(Object::toString).collect(Collectors.toList()); preparedStatement.setObject(1, keys); preparedStatement.setObject(2, v); preparedStatement.addBatch(); } preparedStatement.executeBatch(); } } @Override public void close() throws Exception { super.close(); if (preparedStatement != null) { preparedStatement.close(); } if (conn != null) { conn.close(); } } }