Flink: MySQL Data CDC
References:
https://github.com/ververica/flink-cdc-connectors // official project repository
https://blog.csdn.net/u012551524/article/details/108985945
1. Dependencies
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.16</version>
</dependency>
<dependency>
    <groupId>com.alibaba.ververica</groupId>
    <!-- add the dependency matching your database -->
    <artifactId>flink-connector-mysql-cdc</artifactId>
    <version>1.0.0</version>
</dependency>
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.25</version>
</dependency>
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.25</version>
</dependency>
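The list above references a ${flink.version} Maven property, and the test code in section 5 additionally uses fastjson and the Flink Kafka connector, which are not listed. A minimal sketch of the missing pieces, assuming Flink 1.11.x (the version the example class targets) and Scala 2.11 to match the _2.11 artifacts; all version numbers here are assumptions, adjust them to your environment:

<properties>
    <!-- assumed version; any 1.11.x release should match flink-connector-mysql-cdc 1.0.0 -->
    <flink.version>1.11.2</flink.version>
</properties>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.75</version>
</dependency>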
2. Processing class
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import com.alibaba.ververica.cdc.debezium.StringDebeziumDeserializationSchema;
import com.alibaba.ververica.cdc.connectors.mysql.MySQLSource;

/**
 * @program: Flink1.11
 * @description:
 * @author: yang
 * @create: 2021-01-11 17:41
 */
public class MySqlBinlogSourceExample {
    public static void main(String[] args) throws Exception {
        SourceFunction<String> sourceFunction = MySQLSource.<String>builder()
                .hostname("localhost")
                .port(3306)
                .databaseList("test") // monitor all tables under the test database
                .username("root")
                .password("root")
                .deserializer(new StringDebeziumDeserializationSchema()) // converts SourceRecord to String
                .build();

        StreamExecutionEnvironment env =
                StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration());

        env.addSource(sourceFunction)
                .print().setParallelism(1); // use parallelism 1 for the sink to keep message ordering

        env.execute("test");
    }
}
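Before running this, the MySQL server must have row-based binary logging enabled, otherwise the connector has no change stream to read. A minimal sketch of the relevant my.cnf settings (the values are illustrative):

[mysqld]
server-id        = 1          # any unique, non-zero server id
log-bin          = mysql-bin  # enable the binary log
binlog_format    = ROW        # CDC/Debezium needs row-level events
binlog_row_image = FULL       # keep full before/after row images

The connecting user (root in the example) also needs the REPLICATION SLAVE and REPLICATION CLIENT privileges.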
3. Binlog output
Update: both before and after are present
SourceRecord{sourcePartition={server=mysql-binlog-source}, sourceOffset={ts_sec=1610362335, file=mysql-bin.000004, pos=233445691, row=1, server_id=1, event=2}}
ConnectRecord{topic='mysql-binlog-source.test.weblog', kafkaPartition=null, key=Struct{id=5}, keySchema=Schema{mysql_binlog_source.test.weblog.Key:STRUCT}, value=Struct{before=Struct{id=5,url=5,method=5,ip=5,args=5,create_time=1610390670000},after=Struct{id=5,url=5555,method=5555,ip=5555,args=5555,create_time=1610390670000},source=Struct{version=1.2.0.Final,connector=mysql,name=mysql-binlog-source,ts_ms=1610362335000,db=test,table=weblog,server_id=1,file=mysql-bin.000004,pos=233445826,row=0,thread=944986},op=u,ts_ms=1610362335615}, valueSchema=Schema{mysql_binlog_source.test.weblog.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}

Insert: only after is present
SourceRecord{sourcePartition={server=mysql-binlog-source}, sourceOffset={file=mysql-bin.000004, pos=233455303}}
ConnectRecord{topic='mysql-binlog-source.test.weblog', kafkaPartition=null, key=Struct{id=7}, keySchema=Schema{mysql_binlog_source.test.weblog.Key:STRUCT}, value=Struct{after=Struct{id=7,url=7,method=7,ip=7,args=7,create_time=1610391478000},source=Struct{version=1.2.0.Final,connector=mysql,name=mysql-binlog-source,ts_ms=0,snapshot=last,db=test,table=weblog,server_id=0,file=mysql-bin.000004,pos=233455303,row=0},op=c,ts_ms=1610362692061}, valueSchema=Schema{mysql_binlog_source.test.weblog.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}

Delete: only before is present
SourceRecord{sourcePartition={server=mysql-binlog-source}, sourceOffset={ts_sec=1610362743, file=mysql-bin.000004, pos=233456891, row=1, server_id=1, event=2}}
ConnectRecord{topic='mysql-binlog-source.test.weblog', kafkaPartition=null, key=Struct{id=1}, keySchema=Schema{mysql_binlog_source.test.weblog.Key:STRUCT}, value=Struct{before=Struct{id=1,url=1,method=1,ip=1,args=1,create_time=1603115590000},source=Struct{version=1.2.0.Final,connector=mysql,name=mysql-binlog-source,ts_ms=1610362743000,db=test,table=weblog,server_id=1,file=mysql-bin.000004,pos=233457026,row=0,thread=944986},op=d,ts_ms=1610362744527}, valueSchema=Schema{mysql_binlog_source.test.weblog.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}
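Besides checking which of before/after is present, every record's Debezium envelope also carries an op field that identifies the change type directly (u = update, c = create/insert, d = delete, as visible in the dumps above). A minimal sketch of reading it inside a deserialize(SourceRecord record, ...) implementation such as the CdcDeserializationSchema in section 5 below:

// assumed to run inside deserialize(SourceRecord record, Collector<...> out)
Struct value = (Struct) record.value();      // the Debezium envelope
String op = value.getString("op");           // "c", "u" or "d"
Struct before = value.getStruct("before");   // null for inserts
Struct after = value.getStruct("after");     // null for deletes
Struct source = value.getStruct("source");   // db, table, binlog position, ts_ms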
4. If the data needs ETL, parse the change events and then implement a custom sink (see the sketch below).
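As an illustration, a custom sink that receives the parsed JSONObject change events (as produced by the CdcDeserializationSchema in section 5 below) could look roughly like this. MyCdcSink, saveToDb and deleteFromDb are hypothetical names not taken from the original post, and the actual write logic depends on the target system:

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

public class MyCdcSink extends RichSinkFunction<JSONObject> {
    @Override
    public void open(Configuration parameters) throws Exception {
        // open connections / clients here (e.g. JDBC, HBase, Elasticsearch)
    }

    @Override
    public void invoke(JSONObject value, Context context) throws Exception {
        // route by the operation type written by the deserializer
        String type = value.getString("canal_type");
        if ("delete".equals(type)) {
            // deleteFromDb(value);   // hypothetical helper
        } else {
            // saveToDb(value);       // hypothetical helper
        }
    }

    @Override
    public void close() throws Exception {
        // release connections here
    }
}

It would be attached in place of (or alongside) the Kafka sink, e.g. jsonDataStreamSource.addSink(new MyCdcSink());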
5. Test code
import com.alibaba.fastjson.JSONObject;
import com.alibaba.ververica.cdc.connectors.mysql.MySQLSource;
import com.alibaba.ververica.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;

/**
 * @program: FlinkSql
 * @description:
 * @author: yang
 * @create: 2021-06-18 15:38
 */
public class MySqlBinlogSourceExample {
    public static void main(String[] args) {
        //todo 1. read the MySQL binlog
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        SourceFunction<JSONObject> sourceFunction = MySQLSource.<JSONObject>builder()
                .hostname("hadoop101")
                .port(3306)
                .databaseList("test")
                .username("root")
                .password("yang156122")
                //todo 2. deserialize each change event into JSON
                //sample record: {"canal_type":"insert","create_time":1622815639000,"name":"user0","id":1,"canal_ts":0,"age":1.0,"canal_database":"test","canal_table":"user","pk_hashcode":1}
                .deserializer(new CdcDeserializationSchema())
                .build();

        DataStreamSource<JSONObject> jsonDataStreamSource = env.addSource(sourceFunction);

        //todo 3. keep only changes of the user table and drop deletes
        SingleOutputStreamOperator<JSONObject> userData = jsonDataStreamSource.filter(data ->
                "user".equals(data.getString("canal_table"))
                        && !"delete".equals(data.getString("canal_type")));

        //todo 4. keep only the business fields
        String strDateFormat = "yyyy-MM-dd HH:mm:ss";
        SimpleDateFormat dateFormat = new SimpleDateFormat(strDateFormat);
        SingleOutputStreamOperator<String> resultData = userData.map(data -> {
            //optional timestamp handling
            //String create_time = data.getString("create_time");
            //data.put("create_time", dateFormat.format(new Date(Long.valueOf(create_time) - 28800000)));
            data.remove("canal_database");
            data.remove("canal_table");
            data.remove("canal_type");
            data.remove("canal_ts");
            data.remove("pk_hashcode");
            return data.toJSONString();
        });

        //todo 5. sinks
        //5.1 Kafka sink
        FlinkKafkaProducer<String> kafkaSink =
                new FlinkKafkaProducer<>("hadoop103:9092", "user_changelog_json", new SimpleStringSchema());
        resultData.addSink(kafkaSink);
        //5.2 print the result
        resultData.print("final data ===>");

        try {
            env.execute("mysql-cdc-test");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

class CdcDeserializationSchema implements DebeziumDeserializationSchema<JSONObject> {
    private static final long serialVersionUID = -3168848963265670603L;

    public CdcDeserializationSchema() {
    }

    @Override
    public void deserialize(SourceRecord record, Collector<JSONObject> out) throws Exception {
        Struct dataRecord = (Struct) record.value();
        Struct afterStruct = dataRecord.getStruct("after");
        Struct beforeStruct = dataRecord.getStruct("before");
        /*
         * 1. both beforeStruct and afterStruct present -> update
         * 2. only beforeStruct present                 -> delete
         * 3. only afterStruct present                  -> insert
         */
        JSONObject logJson = new JSONObject();
        String canal_type = "";
        List<Field> fieldsList = null;
        if (afterStruct != null && beforeStruct != null) {
            System.out.println("this is an update");
            canal_type = "update";
            fieldsList = afterStruct.schema().fields();
            //todo field names and values
            for (Field field : fieldsList) {
                String fieldName = field.name();
                Object fieldValue = afterStruct.get(fieldName);
                logJson.put(fieldName, fieldValue);
            }
        } else if (afterStruct != null) {
            System.out.println("this is an insert");
            canal_type = "insert";
            fieldsList = afterStruct.schema().fields();
            //todo field names and values
            for (Field field : fieldsList) {
                String fieldName = field.name();
                Object fieldValue = afterStruct.get(fieldName);
                logJson.put(fieldName, fieldValue);
            }
        } else if (beforeStruct != null) {
            System.out.println("this is a delete");
            canal_type = "delete";
            fieldsList = beforeStruct.schema().fields();
            //todo field names and values
            for (Field field : fieldsList) {
                String fieldName = field.name();
                Object fieldValue = beforeStruct.get(fieldName);
                logJson.put(fieldName, fieldValue);
            }
        } else {
            System.out.println("unexpected record: neither before nor after is present");
        }

        //todo database / table / timestamp metadata
        Struct source = dataRecord.getStruct("source");
        Object db = source.get("db");
        Object table = source.get("table");
        Object ts_ms = source.get("ts_ms");
        logJson.put("canal_database", db);
        logJson.put("canal_table", table);
        logJson.put("canal_ts", ts_ms);
        logJson.put("canal_type", canal_type);

        //todo topic
        String topic = record.topic();
        System.out.println("topic = " + topic);

        //todo primary key fields
        Struct pk = (Struct) record.key();
        List<Field> pkFieldList = pk.schema().fields();
        int partitionerNum = 0;
        for (Field field : pkFieldList) {
            Object pkValue = pk.get(field.name());
            partitionerNum += pkValue.hashCode();
        }
        int hash = Math.abs(partitionerNum) % 3;
        logJson.put("pk_hashcode", hash);
        out.collect(logJson);
    }

    @Override
    public TypeInformation<JSONObject> getProducedType() {
        return TypeInformation.of(JSONObject.class);
    }
}
This article is from 博客园 (cnblogs). Author: 小白啊小白,Fighting. Please include the original link when reposting: https://www.cnblogs.com/ywjfx/p/14263718.html