Flink之Mysql数据CDC

知识点:

https://github.com/ververica/flink-cdc-connectors  //官网地址
https://blog.csdn.net/u012551524/article/details/108985945

1、依赖

  <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.16</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba.ververica</groupId>
            <!-- add the dependency matching your database -->
            <artifactId>flink-connector-mysql-cdc</artifactId>
            <version>1.0.0</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.25</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
        </dependency>

2、处理类

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import com.alibaba.ververica.cdc.debezium.StringDebeziumDeserializationSchema;
import com.alibaba.ververica.cdc.connectors.mysql.MySQLSource;
/**
 * Minimal MySQL CDC job: reads change events from the binlog, renders every
 * record as a String and prints it to stdout from a local environment that
 * is created with the web UI enabled.
 *
 * @program: Flink1.11
 * @description:
 * @author: yang
 * @create: 2021-01-11 17:41
 */
public class MySqlBinlogSourceExample {
    public static void main(String[] args) throws Exception {
        SourceFunction<String> binlogSource = buildBinlogSource();

        StreamExecutionEnvironment env =
                StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration());

        // The print sink uses parallelism 1 to keep message ordering.
        env.addSource(binlogSource)
                .print()
                .setParallelism(1);

        env.execute("test");
    }

    /**
     * Builds the MySQL CDC source used by this example.
     *
     * @return a source function emitting each change record as a String
     */
    private static SourceFunction<String> buildBinlogSource() {
        return MySQLSource.<String>builder()
                .hostname("localhost")
                .port(3306)
                // Only the database list is set; no table list is configured,
                // so (per the upstream example) all tables of "test" are monitored.
                .databaseList("test")
                .username("root")
                .password("root")
                // Converts each SourceRecord to its String form.
                .deserializer(new StringDebeziumDeserializationSchema())
                .build();
    }
}

3、binlog结果

修改:同时有before和after
SourceRecord{
    sourcePartition={server=mysql-binlog-source}, 
    sourceOffset={ts_sec=1610362335, file=mysql-bin.000004, pos=233445691, row=1, server_id=1, event=2}
} 
ConnectRecord
{topic='mysql-binlog-source.test.weblog', kafkaPartition=null, key=Struct{id=5}, keySchema=Schema{mysql_binlog_source.test.weblog.Key:STRUCT}, value=Struct{before=Struct{id=5,url=5,method=5,ip=5,args=5,create_time=1610390670000},after=Struct{id=5,url=5555,method=5555,ip=5555,args=5555,create_time=1610390670000},source=Struct{version=1.2.0.Final,connector=mysql,name=mysql-binlog-source,ts_ms=1610362335000,db=test,table=weblog,server_id=1,file=mysql-bin.000004,pos=233445826,row=0,thread=944986},op=u,ts_ms=1610362335615}, valueSchema=Schema{mysql_binlog_source.test.weblog.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)
}


增加:只有after
SourceRecord{sourcePartition={server=mysql-binlog-source}, sourceOffset={file=mysql-bin.000004, pos=233455303}} 
ConnectRecord
{topic='mysql-binlog-source.test.weblog', kafkaPartition=null, key=Struct{id=7}, keySchema=Schema{mysql_binlog_source.test.weblog.Key:STRUCT}, value=Struct{after=Struct{id=7,url=7,method=7,ip=7,args=7,create_time=1610391478000},source=Struct{version=1.2.0.Final,connector=mysql,name=mysql-binlog-source,ts_ms=0,snapshot=last,db=test,table=weblog,server_id=0,file=mysql-bin.000004,pos=233455303,row=0},op=c,ts_ms=1610362692061}, valueSchema=Schema{mysql_binlog_source.test.weblog.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}


删除:只有before
SourceRecord{sourcePartition={server=mysql-binlog-source}, sourceOffset={ts_sec=1610362743, file=mysql-bin.000004, pos=233456891, row=1, server_id=1, event=2}} ConnectRecord{topic='mysql-binlog-source.test.weblog', kafkaPartition=null, key=Struct{id=1}, keySchema=Schema{mysql_binlog_source.test.weblog.Key:STRUCT}, value=Struct{before=Struct{id=1,url=1,method=1,ip=1,args=1,create_time=1603115590000},source=Struct{version=1.2.0.Final,connector=mysql,name=mysql-binlog-source,ts_ms=1610362743000,db=test,table=weblog,server_id=1,file=mysql-bin.000004,pos=233457026,row=0,thread=944986},op=d,ts_ms=1610362744527}, valueSchema=Schema{mysql_binlog_source.test.weblog.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}

4、如果需要将数据进行etl,解析数据,然后自定义实现sink

 

5、测试代码(除第1节的依赖外,还需要引入 fastjson、flink-connector-kafka 以及 flink-table 相关依赖)

 

import com.alibaba.fastjson.JSONObject;
import com.alibaba.ververica.cdc.connectors.mysql.MySQLSource;
import com.alibaba.ververica.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;

/**
 * Streams MySQL binlog changes through {@link CdcDeserializationSchema}, keeps
 * the non-delete events of the {@code user} table, strips the CDC metadata
 * keys and writes the remaining business columns as JSON strings to Kafka and
 * to stdout.
 *
 * @program: FlinkSql
 * @description:
 * @author: yang
 * @create: 2021-06-18 15:38
 */
public class MySqlBinlogSourceExample {
    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if the job fails; the failure is propagated instead of
     *                   being printed and swallowed, so the process does not
     *                   exit as if the job had succeeded
     */
    public static void main(String[] args) throws Exception {
        // Step 1: read the MySQL binlog.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // NOTE(review): host and credentials are hard-coded for this demo;
        // load them from configuration before using this outside a sandbox.
        SourceFunction<JSONObject> sourceFunction = MySQLSource.<JSONObject>builder()
                .hostname("hadoop101")
                .port(3306)
                .databaseList("test")
                .username("root")
                .password("yang156122")
                // Step 2: decode every change record into a JSON object.
                // Sample: {"canal_type":"insert","create_time":1622815639000,"name":"user0","id":1,
                //          "canal_ts":0,"age":1.0,"canal_database":"test","canal_table":"user","pk_hashcode":1}
                .deserializer(new CdcDeserializationSchema())
                .build();
        DataStreamSource<JSONObject> jsonDataStreamSource = env.addSource(sourceFunction);

        // Step 3: keep only non-delete events of the "user" table.
        SingleOutputStreamOperator<JSONObject> userData = jsonDataStreamSource.filter(data ->
                "user".equals(data.getString(CdcDeserializationSchema.KEY_TABLE))
                        && !CdcDeserializationSchema.TYPE_DELETE.equals(
                                data.getString(CdcDeserializationSchema.KEY_TYPE)));

        // Step 4: keep only the business columns.
        // create_time is forwarded unchanged; an earlier draft formatted it as
        // "yyyy-MM-dd HH:mm:ss" after subtracting 28800000 ms, but that
        // conversion is disabled.
        SingleOutputStreamOperator<String> resultData = userData.map(data -> {
            data.remove(CdcDeserializationSchema.KEY_DATABASE);
            data.remove(CdcDeserializationSchema.KEY_TABLE);
            data.remove(CdcDeserializationSchema.KEY_TYPE);
            data.remove(CdcDeserializationSchema.KEY_TS);
            data.remove(CdcDeserializationSchema.KEY_PK_HASH);
            return data.toJSONString();
        });

        // Step 5: sinks.
        // 5.1 Kafka sink (uat-datacenter1)
        FlinkKafkaProducer<String> kafkaSink = new FlinkKafkaProducer<>("hadoop103:9092",
                "user_changelog_json", new SimpleStringSchema());
        resultData.addSink(kafkaSink);

        // 5.2 print the records
        resultData.print("最终数据===>");

        env.execute("测试mysql-cdc");
    }
}

/**
 * Turns a change record into a flat {@link JSONObject}: the columns of the row
 * image plus the metadata keys {@code canal_database}, {@code canal_table},
 * {@code canal_ts}, {@code canal_type} and {@code pk_hashcode}.
 *
 * <p>The event type is derived from which row images are present:
 * both {@code before} and {@code after} means update, only {@code after}
 * means insert, only {@code before} means delete.
 */
class CdcDeserializationSchema implements DebeziumDeserializationSchema<JSONObject> {
    private static final long serialVersionUID = -3168848963265670603L;

    /** Metadata key holding the source database name. */
    static final String KEY_DATABASE = "canal_database";
    /** Metadata key holding the source table name. */
    static final String KEY_TABLE = "canal_table";
    /** Metadata key holding the {@code ts_ms} value of the record's source block. */
    static final String KEY_TS = "canal_ts";
    /** Metadata key holding the event type (one of the {@code TYPE_*} values). */
    static final String KEY_TYPE = "canal_type";
    /** Metadata key holding the primary-key bucket, in {@code [0, PK_HASH_BUCKETS)}. */
    static final String KEY_PK_HASH = "pk_hashcode";

    static final String TYPE_INSERT = "insert";
    static final String TYPE_UPDATE = "update";
    static final String TYPE_DELETE = "delete";

    /** Number of buckets the primary-key hash is folded into. */
    private static final int PK_HASH_BUCKETS = 3;

    public CdcDeserializationSchema() {
    }

    /**
     * Decodes one change record and emits at most one JSON object.
     *
     * <p>Records without a value, and records carrying neither a
     * {@code before} nor an {@code after} image, are skipped because they
     * contain no row data to forward.
     *
     * @param record the change record to decode
     * @param out    collector receiving the decoded JSON object
     * @throws Exception if the record does not have the expected structure
     */
    @Override
    public void deserialize(SourceRecord record, Collector<JSONObject> out) throws Exception {
        Struct dataRecord = (Struct) record.value();
        if (dataRecord == null) {
            // No payload to decode (presumably a tombstone-style record).
            return;
        }

        Struct afterStruct = dataRecord.getStruct("after");
        Struct beforeStruct = dataRecord.getStruct("before");

        String canalType;
        Struct rowImage;
        if (afterStruct != null && beforeStruct != null) {
            System.out.println("这是修改数据");
            canalType = TYPE_UPDATE;
            rowImage = afterStruct;
        } else if (afterStruct != null) {
            System.out.println( "这是新增数据");
            canalType = TYPE_INSERT;
            rowImage = afterStruct;
        } else if (beforeStruct != null) {
            System.out.println( "这是删除数据");
            canalType = TYPE_DELETE;
            rowImage = beforeStruct;
        } else {
            // Neither image present: nothing meaningful to emit.
            System.out.println("一脸蒙蔽了");
            return;
        }

        JSONObject logJson = new JSONObject();
        copyFields(rowImage, logJson);

        // Database / table / timestamp come from the record's "source" block.
        Struct source = dataRecord.getStruct("source");
        logJson.put(KEY_DATABASE, source.get("db"));
        logJson.put(KEY_TABLE, source.get("table"));
        logJson.put(KEY_TS, source.get("ts_ms"));
        logJson.put(KEY_TYPE, canalType);

        System.out.println("topic = " + record.topic());

        logJson.put(KEY_PK_HASH, primaryKeyBucket((Struct) record.key()));
        out.collect(logJson);
    }

    /** Copies every field of {@code row} into {@code target}, keyed by field name. */
    private static void copyFields(Struct row, JSONObject target) {
        for (Field field : row.schema().fields()) {
            String fieldName = field.name();
            target.put(fieldName, row.get(fieldName));
        }
    }

    /**
     * Folds the hash codes of all primary-key values into a bucket index.
     *
     * @param pk the record key, or {@code null} when the record has no key
     * @return a value in {@code [0, PK_HASH_BUCKETS)}; {@code 0} when there is no key
     */
    private static int primaryKeyBucket(Struct pk) {
        if (pk == null) {
            return 0;
        }
        int hashSum = 0;
        for (Field field : pk.schema().fields()) {
            Object pkValue = pk.get(field.name());
            if (pkValue != null) {
                hashSum += pkValue.hashCode();
            }
        }
        // Take the remainder before abs(): Math.abs(Integer.MIN_VALUE) is
        // still negative, which would otherwise yield a negative bucket.
        return Math.abs(hashSum % PK_HASH_BUCKETS);
    }

    /**
     * @return the type information Flink uses for the emitted {@link JSONObject}s
     */
    @Override
    public TypeInformation<JSONObject> getProducedType() {
        return TypeInformation.of(JSONObject.class);
    }
}

 

posted @ 2021-01-11 18:51  小白啊小白,Fighting  阅读(6084)  评论(2编辑  收藏  举报