Flink CDC采集MySQL binlog日志实时写入ClickHouse

一、DataStream API

  这里以通过Flink CDC采集MySQL的binlog日志实时写入ClickHouse中:

package com.shydow;

import com.alibaba.fastjson.JSONObject;
import com.bangdao.bigdata.schema.CustomDebeziumDeserializationSchema;
import com.bangdao.bigdata.sink.CustomClickHouseSink;
import com.bangdao.bigdata.utils.GetStreamExecutionEnvironment;
import com.ververica.cdc.connectors.mysql.MySqlSource;
import com.ververica.cdc.connectors.mysql.table.StartupOptions;
import com.ververica.cdc.debezium.DebeziumSourceFunction;
import com.ververica.cdc.debezium.StringDebeziumDeserializationSchema;
import org.apache.commons.lang3.StringUtils;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend;
import org.apache.flink.shaded.guava18.com.google.common.collect.Lists;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

/**
 * @author shydow
 * @date 2021-08-18
 */

public class CdcLauncher {

    public static void main(String[] args) throws Exception {

        ParameterTool parameterTool = ParameterTool.fromArgs(args);
        String s = parameterTool.get("propertiesPath");
        Properties properties = new Properties();
        try {
            InputStream resourceAsStream = null;
            if (StringUtils.isEmpty(s)) {
                resourceAsStream = CdcLauncher.class.getClassLoader().getResourceAsStream("CdcLauncher.properties");
            } else {
                resourceAsStream = new FileInputStream(new File(s));
            }
            properties.load(resourceAsStream);
        } catch (IOException e) {
            throw new IOException(e);
        }

        StreamExecutionEnvironment env = GetStreamExecutionEnvironment.getEnv();
        env.setStateBackend(new RocksDBStateBackend("file:///tmp/ckp"));

        DebeziumSourceFunction<JSONObject> sourceFunction = MySqlSource.<JSONObject>builder()
                .hostname(properties.getProperty("HOSTNAME"))
                .port(Integer.parseInt(properties.getProperty("PORT")))
                .databaseList(properties.getProperty("DATABASE_LIST"))
                .tableList(properties.getProperty("TABLE_LIST").split(","))
                .username(properties.getProperty("USERNAME"))
                .password(properties.getProperty("PASSWORD"))
                .startupOptions(StartupOptions.initial())
                .deserializer(new CustomDebeziumDeserializationSchema())
                .build();
        DataStreamSource<JSONObject> source = env.addSource(sourceFunction);

        // 按照库和表进行分流
        OutputTag<JSONObject> t1 = new OutputTag<JSONObject>("output-tag-one") {
        };
        OutputTag<JSONObject> t2 = new OutputTag<JSONObject>("output-tag-two") {
        };
        SingleOutputStreamOperator<Object> processStream = source.process(new ProcessFunction<JSONObject, Object>() {
            @Override
            public void processElement(JSONObject jsonObject, Context context, Collector<Object> collector) throws Exception {
                if ("cdc_test_1".equals(jsonObject.getString("cdc_mysql_table"))) {
                    context.output(t1, jsonObject);
                } else if ("cdc_test".equals(jsonObject.getString("cdc_mysql_table"))) {
                    context.output(t2, jsonObject);
                }
            }
        });
        DataStream<JSONObject> sourceOne = processStream.getSideOutput(t1);
        DataStream<JSONObject> sourceTwo = processStream.getSideOutput(t2);

        // sink ClickHouse
        sourceOne
                .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(10)))
                .apply(new AllWindowFunction<JSONObject, List<JSONObject>, TimeWindow>() {
                    @Override
                    public void apply(TimeWindow timeWindow, Iterable<JSONObject> iterable, Collector<List<JSONObject>> collector) throws Exception {
                        ArrayList<JSONObject> list = Lists.newArrayList(iterable);
                        collector.collect(list);
                    }
                })
                .addSink(new CustomClickHouseSink(properties, "ods.cdc_test_1")).setParallelism(1);

        sourceTwo
                .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(10)))
                .apply(new AllWindowFunction<JSONObject, List<JSONObject>, TimeWindow>() {
                    @Override
                    public void apply(TimeWindow timeWindow, Iterable<JSONObject> iterable, Collector<List<JSONObject>> collector) throws Exception {
                        ArrayList<JSONObject> list = Lists.newArrayList(iterable);
                        collector.collect(list);
                    }
                })
                .addSink(new CustomClickHouseSink(properties, "ods.cdc_test")).setParallelism(1);

        env.execute();
    }
}
package com.shydow.schema;

import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;

import java.util.List;

/**
 * @author shydow
 * @date 2021-08-18
 */
public class CustomDebeziumDeserializationSchema implements DebeziumDeserializationSchema<JSONObject> {


    @Override
    public void deserialize(SourceRecord record, Collector<JSONObject> out) throws Exception {
        Struct dataRecord = (Struct) record.value();

        Struct afterStruct = dataRecord.getStruct("after");
        Struct beforeStruct = dataRecord.getStruct("before");
        /*
          todo 1,同时存在 beforeStruct 跟 afterStruct数据的话,就代表是update的数据
               2,只存在 beforeStruct 就是delete数据
               3,只存在 afterStruct数据 就是insert数据
         */

        JSONObject mainJson = new JSONObject();
        JSONObject dataJson = new JSONObject();
        JSONObject pkJson = new JSONObject();

        String canal_type = "";
        List<Field> fieldsList = null;
        if (afterStruct != null && beforeStruct != null) {
            canal_type = "update";
            fieldsList = afterStruct.schema().fields();
            //todo 字段与值
            for (Field field : fieldsList) {
                String fieldName = field.name();
                Object fieldValue = afterStruct.get(fieldName);
                dataJson.put(fieldName, fieldValue);
            }
        } else if (afterStruct != null) {
            canal_type = "insert";
            fieldsList = afterStruct.schema().fields();
            for (Field field : fieldsList) {
                String fieldName = field.name();
                Object fieldValue = afterStruct.get(fieldName);
                dataJson.put(fieldName, fieldValue);
            }
        } else if (beforeStruct != null) {
            canal_type = "detele";
            fieldsList = beforeStruct.schema().fields();
            for (Field field : fieldsList) {
                String fieldName = field.name();
                Object fieldValue = beforeStruct.get(fieldName);
                dataJson.put(fieldName, fieldValue);
            }
        }

        //todo 拿到databases table信息
        Struct source = dataRecord.getStruct("source");
        Object db = source.get("db");
        Object table = source.get("table");
        Object ts_ms = source.get("ts_ms");

        mainJson.put("cdc_mysql_database", db);
        mainJson.put("cdc_mysql_table", table);
        mainJson.put("cdc_canal_ts", ts_ms);
        mainJson.put("cdc_canal_type", canal_type);

        //todo 拿到topic
        String topic = record.topic();
        //todo 主键字段
        Struct pk = (Struct) record.key();
        List<Field> pkFieldList = pk.schema().fields();
        int partitionerNum = 0;
        for (Field field : pkFieldList) {
            Object pkValue = pk.get(field.name());
            pkJson.put(field.name(), pkValue);
            partitionerNum += pkValue.hashCode();
        }
        int hash = Math.abs(partitionerNum) % 3;
        mainJson.put("cdc_pk_hashcode", hash);
        mainJson.put("cdc_pk", pkJson);
        mainJson.put("cdc_data", dataJson);

        out.collect(mainJson);
    }

    @Override
    public TypeInformation<JSONObject> getProducedType() {
        return BasicTypeInfo.of(JSONObject.class);
    }
}
package com.shydow.sink;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.stream.Collectors;

/**
 * @author shydow
 * @date 2021-08-19
 */
public class CustomClickHouseSink extends RichSinkFunction<List<JSONObject>> {

    private Properties properties;
    private String tableName;
    private Connection conn;
    private PreparedStatement preparedStatement;

    public CustomClickHouseSink(Properties properties, String tableName) {
        this.properties = properties;
        this.tableName = tableName;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        // 创建数据库语句
        String createDBStr = "create database if not exists " + tableName.split("\\.")[0];

        // 创建表语句,这里使用ClickHouse中自带的Nested Structures格式,下游自己解析
        String createTableStr = "create table if not exists " + tableName + " (record Nested(k String, v String)," +
                "create_time DateTime DEFAULT now()) ENGINE = MergeTree PARTITION BY toDate(create_time) ORDER BY tuple()";

        // 插入语句
        String insertStr = "insert into " + tableName + "(record.k, record.v) values(?, ?)";

        Class.forName("ru.yandex.clickhouse.ClickHouseDriver");
        conn = DriverManager.getConnection(properties.getProperty("CLICKHOUSE_URL"), properties.getProperty("CLICKHOUSE_USER"), "");
        Statement statement = conn.createStatement();
        // 执行创建数据库
        statement.execute(createDBStr);
        // 执行创建表
        statement.execute(createTableStr);
        preparedStatement = conn.prepareStatement(insertStr);
    }

    @Override
    public void invoke(List<JSONObject> values, Context context) throws Exception {
        if (!values.isEmpty()) {
            for (JSONObject value : values) {
                // 取出data部分,时间戳,操作类型
                String cdcData = value.getString("cdc_data"); // json字符串
                String ts = value.getString("cdc_canal_ts");
                String canalType = value.getString("cdc_canal_type");
                JSONObject parseObject = JSON.parseObject(cdcData);
                parseObject.put("canal_time", ts);
                parseObject.put("canal_type", canalType);
                //  开始插入
                List<String> keys = new ArrayList<>(parseObject.keySet());
                List<String> v = parseObject.values().stream().map(Object::toString).collect(Collectors.toList());
                preparedStatement.setObject(1, keys);
                preparedStatement.setObject(2, v);
                preparedStatement.addBatch();

            }
            preparedStatement.executeBatch();
        }
    }

    @Override
    public void close() throws Exception {
        super.close();
        if (preparedStatement != null) {
            preparedStatement.close();
        }
        if (conn != null) {
            conn.close();
        }
    }
}

 

posted @ 2021-12-06 14:50  Shydow  阅读(1636)  评论(0编辑  收藏  举报