Ubuntu部署Canal同步Mysql到ES
下载安装包7.16.3
mysql:https://mirrors.tuna.tsinghua.edu.cn/mysql/yum/mysql-8.0-community-el7-x86_64/
es:https://mirrors.tuna.tsinghua.edu.cn/elasticstack/yum/elastic-7.x/7.16.3/
kibana:https://mirrors.tuna.tsinghua.edu.cn/elasticstack/yum/elastic-7.x/7.16.3/
elasticsearch下载地址:https://www.elastic.co/cn/downloads/past-releases#elasticsearch
kibana下载地址:https://www.elastic.co/cn/downloads/past-releases#kibana
canal下载地址:https://github.com/alibaba/canal/releases
安装es
es配置文件
root@es:/es7# grep -v ^# app/es7/config/elasticsearch.yml cluster.name: es7-test node.name: node-1 path.data: /es7 path.logs: /es7/logs bootstrap.memory_lock: true network.host: 0.0.0.0 discovery.seed_hosts: ["172.25.0.129"] cluster.initial_master_nodes: ["172.25.0.129"] #在文件中末尾处添加下面参数,开启x-pack验证 xpack.security.enabled: true xpack.license.self_generated.type: basic xpack.security.transport.ssl.enabled: true #关闭geoip数据库的更新,要不然启动es会报错 ingest.geoip.downloader.enabled: false
设置 elasticsearch
账号密码
cd app/es7/bin/
./elasticsearch-setup-passwords interactive
安装ik分词器
下载地址:https://github.com/medcl/elasticsearch-analysis-ik/releases
root@es:/es7/app/es7/plugins/ik# ll total 1440 drwxr-xr-x 3 es es 4096 Jun 19 18:22 ./ drwxr-xr-x 3 es es 4096 Jun 19 18:12 ../ -rw-r--r-- 1 es es 263965 Jan 18 18:46 commons-codec-1.9.jar -rw-r--r-- 1 es es 61829 Jan 18 18:46 commons-logging-1.2.jar drwxr-xr-x 2 es es 4096 Jan 18 18:45 config/ -rw-r--r-- 1 es es 54596 Jan 19 11:54 elasticsearch-analysis-ik-7.16.3.jar -rw-r--r-- 1 es es 736658 Jan 18 18:46 httpclient-4.5.2.jar -rw-r--r-- 1 es es 326724 Jan 18 18:46 httpcore-4.4.4.jar -rw-r--r-- 1 es es 1807 Jan 19 11:54 plugin-descriptor.properties -rw-r--r-- 1 es es 125 Jan 19 11:54 plugin-security.policy
启动
root@es:/opt/canal/admin/bin# netstat -ntpl Active Internet connections (only servers) Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name tcp 0 0 127.0.0.53:53 0.0.0.0:* LISTEN 1139/systemd-resolv tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN 1234/sshd: /usr/sbi tcp 0 0 0.0.0.0:8089 0.0.0.0:* LISTEN 7899/java tcp 0 0 127.0.0.1:6010 0.0.0.0:* LISTEN 2644/sshd: root@pts tcp 0 0 127.0.0.1:6011 0.0.0.0:* LISTEN 2996/sshd: root@pts tcp 0 0 0.0.0.0:5601 0.0.0.0:* LISTEN 1178/node tcp 0 0 0.0.0.0:11110 0.0.0.0:* LISTEN 7792/java tcp 0 0 0.0.0.0:11111 0.0.0.0:* LISTEN 7792/java tcp 0 0 0.0.0.0:11112 0.0.0.0:* LISTEN 7792/java tcp 0 0 0.0.0.0:8081 0.0.0.0:* LISTEN 7702/java tcp6 0 0 :::9300 :::* LISTEN 4386/java tcp6 0 0 :::22 :::* LISTEN 1234/sshd: /usr/sbi tcp6 0 0 ::1:6010 :::* LISTEN 2644/sshd: root@pts tcp6 0 0 ::1:6011 :::* LISTEN 2996/sshd: root@pts tcp6 0 0 :::9000 :::* LISTEN 7446/java tcp6 0 0 :::9200 :::* LISTEN 4386/java
安装kibana
#打开kibana端口:5601 server.port: 5601 #设置kibana服务host:127.0.0.1 server.host: "127.0.0.1" #配置kibana连接es的地址和端口 elasticsearch.hosts: ["http://127.0.0.1:9200"] #配置kibana连接es的验证用户和密码 elasticsearch.username: "elastic" elasticsearch.password: "123456" #设置中文编码 i18n.locale: "zh-CN" #在末尾处添加以下属性,开启es对kibana的xpack认证 xpack.monitoring.ui.container.elasticsearch.enabled: true
下载和安装canal
canal-server
、canal-adapter
、canal-admin
配置MySQL
[mysqld] #设置server_id server_id=1 #开启二进制日志功能 log-bin=mall-mysql-bin #设置使用的二进制日志格式(mixed,statement,row) binlog_format=row
拥有从库权限的账号,用于订阅binlog
,这里创建的账号为canal
;
mysql> create user canal identified by 'passwd'; mysql> grant all on *.* to 'canal'@'%'; mysql> flush privileges;
创建一个用于测试的数据库:canal
mysql> CREATE DATABASE canal; mysql> CREATE DATABASE IF NOT EXISTS canal default charset utf8 COLLATE utf8_general_ci;
然后创建一张数据库表 product
mysql> use canal; mysql> CREATE TABLE `product` ( `id` bigint(20) NOT NULL AUTO_INCREMENT, `title` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `sub_title` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, `price` decimal(10, 2) NULL DEFAULT NULL, `pic` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, PRIMARY KEY (`id`) USING BTREE ) ENGINE = InnoDB AUTO_INCREMENT = 2 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
配置 canal-server (canal-deploy)
canal-server(canal-deploy):可以直接监听MySQL的binlog,把自己伪装成MySQL的从库,只负责接收数据,并不做处理。
进入 conf/example
路径,修改配置文件 instance.properties
,主要是修改数据库相关配置
root@es:/opt/canal/deployer/conf/example# grep -v ^# instance.properties canal.instance.gtidon=false canal.instance.master.address=172.25.0.128:3306 canal.instance.master.journal.name= canal.instance.master.position= canal.instance.master.timestamp= canal.instance.master.gtid= canal.instance.rds.accesskey= canal.instance.rds.secretkey= canal.instance.rds.instanceId= canal.instance.tsdb.enable=true canal.instance.dbUsername=canal canal.instance.dbPassword=passwd canal.instance.connectionCharset = UTF-8 canal.instance.enableDruid=false canal.instance.filter.regex=.*\\..* canal.instance.filter.black.regex=mysql\\.slave_.* canal.mq.topic=example canal.mq.partition=0
配置 canal-adapter
修改配置文件conf/application.yml
root@es:/opt/canal/adapter/conf# grep -v ^# application.yml server: port: 8081 spring: jackson: date-format: yyyy-MM-dd HH:mm:ss time-zone: GMT+8 default-property-inclusion: non_null canal.conf: mode: tcp #tcp kafka rocketMQ rabbitMQ flatMessage: true zookeeperHosts: syncBatchSize: 1000 retries: timeout: accessKey: secretKey: consumerProperties: # canal tcp consumer canal.tcp.server.host: 127.0.0.1:11111 canal.tcp.zookeeper.hosts: canal.tcp.batch.size: 500 canal.tcp.username: canal.tcp.password: # kafka consumer kafka.bootstrap.servers: 127.0.0.1:9092 kafka.enable.auto.commit: false kafka.auto.commit.interval.ms: 1000 kafka.auto.offset.reset: latest kafka.request.timeout.ms: 40000 kafka.session.timeout.ms: 30000 kafka.isolation.level: read_committed kafka.max.poll.records: 1000 # rocketMQ consumer rocketmq.namespace: rocketmq.namesrv.addr: 127.0.0.1:9876 rocketmq.batch.size: 1000 rocketmq.enable.message.trace: false rocketmq.customized.trace.topic: rocketmq.access.channel: rocketmq.subscribe.filter: # rabbitMQ consumer rabbitmq.host: rabbitmq.virtual.host: rabbitmq.username: rabbitmq.password: rabbitmq.resource.ownerId: srcDataSources: defaultDS: url: jdbc:mysql://172.25.0.128:3306/canal?useUnicode=true # canal为授权的database名称 username: canal password: passwd canalAdapters: - instance: example # canal instance Name or mq topic name groups: - groupId: g1 outerAdapters: - name: logger - name: es7 hosts: http://172.25.0.129:9200 # 127.0.0.1:9200 for rest mode properties: mode: rest # or rest security.auth: elastic:passwd # only used for rest mode cluster.name: es7-test
额外
配置普通用户同步
创建账号 create user 'customer_test'@'%' IDENTIFIED BY 'Read429..'; 授权 GRANT INSERT,SELECT,REPLICATION SLAVE,REPLICATION CLIENT ON *.* TO 'customer_test'@'%'; 修改密码规则 ALTER USER 'customer_test'@'%' IDENTIFIED WITH mysql_native_password BY 'Read429..'; 刷新 flush privileges; 密码尽量不使用@
修改 canal-adapter/conf/es7/mytest_user.yml
文件,用于配置MySQL
中的表与Elasticsearch
中索引的映射关系
root@es:/opt/canal/adapter/conf/es7# cat mytest_user.yml dataSourceKey: defaultDS destination: example groupId: g1 esMapping: _index: mytest_user # 索引要跟es创建的对应 _id: _id # upsert: true # pk: id # sql: "select a.id as _id, a.name, a.role_id, b.role_name, # a.c_time from user a # left join role b on b.id=a.role_id" sql: "SELECT # 查询sql p.id AS _id, p.title, p.sub_title, p.price, p.pic FROM product p" # objFields: # _labels: array:; etlCondition: "where a.c_time>={}" commitBatch: 3000
详解:
dataSourceKey: defaultDS destination: comment groupId: g1 esMapping: _index: commentrepeatedsj _id: comment_md5 upsert: true pk: commentMd5 sql: "SELECT comment_md5 AS commentMd5, topic_msg_id AS topicMsgId, create_time AS createTime FROM comment_repeated_sj_0" objFields: _labels: array:; etlCondition: "where a.c_time>={}" commitBatch: 300 这是一个数据同步任务的配置文件,相比之前的配置文件,增加了一些参数,下面逐个解释一下: dataSourceKey: 数据源的标识,这里设置为 defaultDS,表示使用默认数据源。 destination: 目标数据的类型,这里设置为 comment,表示将数据同步到名为 comment 的目标数据源中。 groupId: 数据同步任务的分组标识,这里设置为 g1。 esMapping: Elasticsearch 索引的映射配置,包括索引名称 _index、主键名称 _id、操作类型 upsert、主键名称 pk、SQL 查询语句 sql 等。 _index: Elasticsearch 索引的名称,这里设置为 commentrepeatedsj。 _id: Elasticsearch 索引的主键名称,这里设置为 comment_md5,表示使用数据中的 comment_md5 字段作为主键。 upsert: 数据的操作类型,这里设置为 true,表示进行 upsert 操作,即如果数据已存在则更新,否则插入。 pk: 数据源中的主键名称,这里设置为 commentMd5,表示将数据源中的 comment_md5 字段与 Elasticsearch 索引中的 _id 字段进行映射。 sql: 数据源的 SQL 查询语句,用于从源数据源中获取数据。 etlCondition: 数据的过滤条件,这里表示只同步 c_time 大于等于某个值的数据。 commitBatch: 数据同步的批次大小,这里设置为 300。 #objFields: 对象字段的映射配置,这里是一个注释,表示不进行对象字段映射。如果需要进行对象字段映射,可以将注释去掉,并设置对应的映射关系。 总的来说,这个配置文件定义了一个数据同步任务,将源数据源中的数据同步到 Elasticsearch 中,其中包括了数据源的标识、目标数据类型、分组标识、Elasticsearch 索引的映射配置、数据过滤条件和同步批次等信息,同时还指定了主键映射关系和进行 upsert 操作。
额外需求
需要同步id与_id,同时字段和mysql内置字段冲突
dataSourceKey: defaultDS destination: example_1222 groupId: g1 esMapping: # 字段映射 _index: hotsearchdata id: id # 字段映射 _id: _id # 字段映射 upsert: true pk: id sql: "select id AS id, # 字段映射 id AS _id, # 字段映射 `key`, title, author, author_id AS authorId, source_url AS sourceUrl, create_time AS createTime, publish_time AS publishTime, update_time AS updateTime, content, cover_img AS coverImg, video_url AS videoUrl, source, comment_count AS commentCount, transmit_count AS transmitCount, like_count AS likeCount, share_count AS shareCount, read_count AS readCount from hot_search_data" # objFields: # _labels: array:; etlCondition: "where a.id>={}" commitBatch: 3000
报错:
ERROR Config: comment_repeated_sj_0.yml esMapping._id or esMapping.pk
这个错误提示是在执行数据同步任务时出现的,它表示配置文件中缺少 esMapping._id 或 esMapping.pk 字段,导致无法确定主键字段,从而无法进行数据同步操作。
在 Elasticsearch 中,每个文档都必须有一个唯一的标识符,称为 _id。在数据同步任务中,需要将源数据源中的某个字段映射到 Elasticsearch 索引中的 _id 字段,以确保每个文档有一个唯一的标识符。
通常情况下,可以在配置文件中将 esMapping._id 或 esMapping.pk 字段设置为源数据源中的某个唯一字段,例如数据库表中的主键字段。如果配置文件中没有设置这两个字段,Elasticsearch 将无法确定文档的唯一标识符,从而无法进行数据同步操作。
因此,您需要检查配置文件中是否已经设置了 esMapping._id 或 esMapping.pk 字段,并确保这些字段映射到了源数据源中的某个唯一字段。如果配置文件中确实没有设置这些字段,则需要添加它们并设置正确的映射关系,以便数据同步任务能够正常执行。
配置canal-admin
source /root/canal_manager.sql
修改配置文件conf/application.yml
,按如下配置即可,主要是修改数据源配置和canal-admin
的管理账号配置,注意需要用一个有读写权限的数据库账号,比如管理账号root:root
;
root@es:/opt/canal/admin/conf# grep -v ^# application.yml server: port: 8089 spring: jackson: date-format: yyyy-MM-dd HH:mm:ss time-zone: GMT+8 spring.datasource: address: 172.25.0.128:3306 database: canal_manager username: canal password: passwd driver-class-name: com.mysql.jdbc.Driver url: jdbc:mysql://${spring.datasource.address}/${spring.datasource.database}?useUnicode=true&characterEncoding=UTF-8&useSSL=false hikari: maximum-pool-size: 30 minimum-idle: 1 canal: adminUser: admin adminPasswd: admin
测试集群健康状态和ik分词器
root@es:/es7/app/es7/plugins/ik# curl -k -u elastic:密码 -XGET http://192.168.20.133:9200/_cluster/health?pretty -i HTTP/1.1 200 OK X-elastic-product: Elasticsearch content-type: application/json; charset=UTF-8 content-length: 463 { "cluster_name" : "es7-test", "status" : "green", "timed_out" : false, "number_of_nodes" : 1, "number_of_data_nodes" : 1, "active_primary_shards" : 20, "active_shards" : 20, "relocating_shards" : 0, "initializing_shards" : 0, "unassigned_shards" : 0, "delayed_unassigned_shards" : 0, "number_of_pending_tasks" : 0, "number_of_in_flight_fetch" : 0, "task_max_waiting_in_queue_millis" : 0, "active_shards_percent_as_number" : 100.0 }
ik
root@es:/es7/app/es7/plugins/ik# curl -k -u elastic:密码 -XGET http://192.168.20.133:9200/_analyze -i -H 'Content-Type: application/json' -d '{"text":"这个月工资减半","tokenizer": "ik_max_word"}' HTTP/1.1 200 OK X-elastic-product: Elasticsearch content-type: application/json; charset=UTF-8 content-length: 501 {"tokens":[{"token":"这个","start_offset":0,"end_offset":2,"type":"CN_WORD","position":0},{"token":"个月","start_offset":1,"end_offset":3,"type":"CN_WORD","position":1},{"token":"月工资","start_offset":2,"end_offset":5,"type":"CN_WORD","position":2},{"token":"月工","start_offset":2,"end_offset":4,"type":"CN_WORD","position":3},{"token":"工资","start_offset":3,"end_offset":5,"type":"CN_WORD","position":4},{"token":"减半","start_offset":5,"end_offset":7,"type":"CN_WORD","position":5}]}r
创建es索引
PUT canal_product { "settings":{ "number_of_shards":10, "number_of_replicas":0, "analysis":{ "analyzer":{ "ik":{ "tokenizer":"ik_max_word" } } } }, "mappings":{ "properties":{ "_id":{ "type":"integer" }, "title":{ "type":"text" }, "sub_title":{ "type":"text" }, "price":{ "type":"integer" }, "pic":{ "type":"text" } } } }
在MySQL数据库中创建数据
# 之后使用如下SQL语句在数据库中创建一条记录 INSERT INTO product ( id, title, sub_title, price, pic ) VALUES ( 5, '小米8', ' 全面屏游戏智能手机 6GB+64GB', 1999.00, NULL );
查询
GET canal_product/_search
root@es:/opt/canal/adapter/bin# curl -X POST localhost:8081/etl/es7/mytest_user.yml {"succeeded":true,"resultMessage":"导入ES 数据:5 条"}