基于DNS(Consul)高可用
DNS
推荐从Bind-DLZ入手,资料多
可控制度更好(查询DNS记录SQL可定制)
据说性能差
可控制度更好(查询DNS记录SQL可定制)
据说性能差
Bind-DLZ
https://www.cnblogs.com/saneri/p/8178065.html
PowerDNS
SQL schema设置规范
性能比Bind-DLZ好
https://www.cnblogs.com/saneri/p/8178065.html
PowerDNS
SQL schema设置规范
性能比Bind-DLZ好
coredns 和k8s结合比较多
nacos 阿里开源,含DNS和服务发现
监控程序:
主从结构,支持GTID
监控逻辑:
按分组取出来机器节点
按分组取出来机器节点
master:
尝试连接成功 ok 保持
失败 进行从库选举
尝试连接成功 ok 保持
失败 进行从库选举
slave:
检查是不是在线online
在线的:
连接成功,复制是不是正常,不正常下线,检验延迟
下线更新cmdb,dns records
下线的:
连接成功,复制正常,不延迟 上线
上线更新cmdb,dns records
检查是不是在线online
在线的:
连接成功,复制是不是正常,不正常下线,检验延迟
下线更新cmdb,dns records
下线的:
连接成功,复制正常,不延迟 上线
上线更新cmdb,dns records
从库选举:
获取从库列表
获取从库列表
故障切换:
确认所有节点都复制中断,判断复制完成
对比,所有节点是不是复制到一个位置
通过对比各节点获取到的GTID,判断所有节点是否同步到同一个位置;如果不是,选举GTID最靠前(数据最新)的节点做master
如果同步位置都一样,则根据cmdb中定义的level选择,取level最大的那个节点
新的主节点选举成功后,其他节点执行change master指向新的主节点
更新cmdb中的角色:oldmaster->slave,选举出来的节点->master;并更改新主节点的read_only(关闭只读)
更新dns_records
确认所有节点都复制中断,判断复制完成
对比,所有节点是不是复制到一个位置
通过对比各节点获取到的GTID,判断所有节点是否同步到同一个位置;如果不是,选举GTID最靠前(数据最新)的节点做master
如果同步位置都一样,则根据cmdb中定义的level选择,取level最大的那个节点
新的主节点选举成功后,其他节点执行change master指向新的主节点
更新cmdb中的角色:oldmaster->slave,选举出来的节点->master;并更改新主节点的read_only(关闭只读)
更新dns_records
在线切换:
在oldmaster上开启super_read_only和read_only,并干掉(kill)业务连接
获取oldmaster中的show master status信息
获取从节点中的show slave status对比,确认都同步完成
按cmdb中的level或是指定的节点为新master
更新重做master/slave架构
更新cmdb
更新dns
记录log
在oldmaster上开启super_read_only和read_only,并干掉(kill)业务连接
获取oldmaster中的show master status信息
获取从节点中的show slave status对比,确认都同步完成
按cmdb中的level或是指定的节点为新master
更新重做master/slave架构
更新cmdb
更新dns
记录log
一定要提高英文阅读能力
[root@mydb1 ~]# wget https://releases.hashicorp.com/consul/1.4.0/consul_1.4.0_linux_amd64.zip
[root@mydb1 ~]# mkdir -p /opt/consul /opt/consul/conf /data/consul /data/consul/shell/
[root@mydb2 ~]# mkdir -p /opt/consul /opt/consul/conf /data/consul /data/consul/shell/
[root@mydb3 ~]# mkdir -p /opt/consul /opt/consul/conf /data/consul /data/consul/shell/
[root@mydb1 ~]# unzip consul_1.4.0_linux_amd64.zip
将consul拷贝至/opt/consul目录
[root@mydb1 ~]# cat /opt/consul/conf/server.json
{
"data_dir": "/data/consul",
"enable_script_checks": true,
"datacenter": "dc1",
"log_level": "INFO",
"server": true,
"bootstrap_expect": 3,
"ui":true
}
[root@mydb1 consul]# ./consul agent -config-dir=/opt/consul/conf > /data/consul/consul.log &
[root@mydb2 consul]# ./consul agent -config-dir=/opt/consul/conf > /data/consul/consul.log &
[root@mydb3 consul]# ./consul agent -config-dir=/opt/consul/conf > /data/consul/consul.log &
[root@mydb2 consul]# ./consul join 192.168.1.101
[root@mydb3 consul]# ./consul join 192.168.1.101
[root@mydb1 consul]# ./consul members
Node Address Status Type Build Protocol DC Segment
mydb1 192.168.1.101:8301 alive server 1.4.0 2 dc1 <all>
mydb2 192.168.1.102:8301 alive server 1.4.0 2 dc1 <all>
mydb3 192.168.1.103:8301 alive server 1.4.0 2 dc1 <all>
[root@mydb1 consul]# ./consul catalog nodes
Node ID Address DC
mydb1 52514e74 192.168.1.101 dc1
mydb2 aebbf0b2 192.168.1.102 dc1
mydb3 0e179069 192.168.1.103 dc1
Node ID Address DC
mydb1 52514e74 192.168.1.101 dc1
mydb2 aebbf0b2 192.168.1.102 dc1
mydb3 0e179069 192.168.1.103 dc1
# dig @127.0.0.1 -p 8600 mydb1.node.consul
# dig @127.0.0.1 -p 8600 mydb2.node.consul
# dig @127.0.0.1 -p 8600 mydb3.node.consul
[root@mydb1 consul]# ./consul operator raft list-peers
Node ID Address State Voter RaftProtocol
mydb1 52514e74-d063-cfe3-1d58-55fda9fc2451 192.168.1.101:8300 leader true 3
mydb2 aebbf0b2-09ad-f396-4c21-3f9ee40a16da 192.168.1.102:8300 follower true 3
mydb3 0e179069-7360-3866-d9a6-7ea60c540c04 192.168.1.103:8300 follower true 3
[root@mydb1 consul]# ./consul kv put id 11
Success! Data written to: id
[root@mydb1 consul]# ./consul kv get id
11
[root@mydb2 consul]# ./consul kv get id
11
[root@mydb3 consul]# ./consul kv get id
11
consul是用Raft来实现分布式一致性的
[root@mydb1 ~]# cat /opt/consul/conf/r-test-mgr-ser.json
{
"service": {
"name": "r-test-3306-mydb-ser",
"tags": ["测试-3306"],
"address": "192.168.1.101",
"meta": {
"meta": "for my service"
},
"port": 3306,
"enable_tag_override": false,
"checks": [
{
"args": ["/data/consul/shell/check_mysql_mgr_slave.sh"],
"interval": "1s"
}
]
}
}
[root@mydb1 ~]# cat /opt/consul/conf/w-test-mgr-ser.json
{
"service": {
"name": "w-test-3306-mydb-ser",
"tags": ["测试-3306"],
"address": "192.168.1.101",
"meta": {
"meta": "for my service"
},
"port": 3306,
"enable_tag_override": false,
"checks": [
{
"args": ["/data/consul/shell/check_mysql_mgr_master.sh"],
"interval": "10s"
}
]
}
}
注意:在mydb2、mydb3上需要把配置文件中的address调整为各自节点的IP
检测脚本如下
[root@mydb1 ~]# cat /data/consul/shell/check_mysql_mgr_master.sh
#!/bin/bash
#
# Health check for the "write" (primary) role of a MySQL Group Replication
# member, meant to be run as a Consul script check.
#
# Exit status:
#   0 - MySQL answers, this member is ONLINE and it is the group primary
#   2 - MySQL is unreachable, the member is not ONLINE, or it is a secondary
host="192.168.1.101"
port=3306
user="dba_user"
password="msds007"
# NOTE(review): the password is passed on the command line, so it is visible
# in the process list; consider an option file or login path instead.
# The client command is kept in an array so every argument stays one word.
mysql_cmd=(/usr/local/mysql/bin/mysql "-u$user" "-h$host" -P "$port" "-p$password")

# Is MySQL alive? Probe first so a dead server costs one connection attempt
# instead of three.
value=$("${mysql_cmd[@]}" -Nse "select 1")
if [[ -z "$value" ]]; then
  echo "mysql $port is down....."
  exit 2
fi

# NOTE(review): newer MySQL releases may expose the primary through
# replication_group_members.MEMBER_ROLE instead of this status variable;
# confirm against the server version in use.
primary_member=$("${mysql_cmd[@]}" -Nse "select variable_value from performance_schema.global_status WHERE VARIABLE_NAME= 'group_replication_primary_member'")
server_uuid=$("${mysql_cmd[@]}" -Nse "select variable_value from performance_schema.global_variables where VARIABLE_NAME='server_uuid';")

# Is this member ONLINE in the group? The expansion is quoted so that an empty
# result (member not listed, or query failed) counts as "not online" instead
# of breaking the test and slipping through.
node_state=$("${mysql_cmd[@]}" -Nse "select MEMBER_STATE from performance_schema.replication_group_members where MEMBER_ID='$server_uuid'")
if [[ "$node_state" != "ONLINE" ]]; then
  echo "MySQL $port state is not online...."
  exit 2
fi

# Is this member the primary? The right-hand side is quoted to force a literal
# string comparison rather than a glob match.
if [[ "$server_uuid" == "$primary_member" ]]; then
  echo "MySQL $port Instance is master ........"
  exit 0
else
  echo "MySQL $port Instance is slave ........"
  exit 2
fi
[root@mydb1 ~]# cat /data/consul/shell/check_mysql_mgr_slave.sh
#!/bin/bash
#
# Health check for the "read" (secondary) role of a MySQL Group Replication
# member, meant to be run as a Consul script check.
#
# Exit status:
#   0 - MySQL answers, this member is ONLINE and is either a secondary or the
#       only member of the group (a lone primary also serves reads)
#   2 - MySQL is unreachable, the member is not ONLINE, or it is the primary
#       of a group that has other members
host="192.168.1.101"
port=3306
user="dba_user"
password="msds007"
# NOTE(review): the password is passed on the command line, so it is visible
# in the process list; consider an option file or login path instead.
# The client command is kept in an array so every argument stays one word.
mysql_cmd=(/usr/local/mysql/bin/mysql "-u$user" "-h$host" -P "$port" "-p$password")

# Is MySQL alive? Probe first so a dead server costs one connection attempt
# instead of three.
value=$("${mysql_cmd[@]}" -Nse "select 1")
if [[ -z "$value" ]]; then
  echo "mysql $port is down....."
  exit 2
fi

# NOTE(review): newer MySQL releases may expose the primary through
# replication_group_members.MEMBER_ROLE instead of this status variable;
# confirm against the server version in use.
primary_member=$("${mysql_cmd[@]}" -Nse "select variable_value from performance_schema.global_status WHERE VARIABLE_NAME= 'group_replication_primary_member'")
server_uuid=$("${mysql_cmd[@]}" -Nse "select variable_value from performance_schema.global_variables where VARIABLE_NAME='server_uuid';")

# Is this member ONLINE in the group? The expansion is quoted so that an empty
# result (member not listed, or query failed) counts as "not online"; unquoted,
# the broken test was skipped and the member was reported as a healthy slave.
node_state=$("${mysql_cmd[@]}" -Nse "select MEMBER_STATE from performance_schema.replication_group_members where MEMBER_ID='$server_uuid'")
if [[ "$node_state" != "ONLINE" ]]; then
  echo "MySQL $port state is not online...."
  exit 2
fi

# Is this member a secondary? The right-hand side is quoted to force a literal
# string comparison rather than a glob match.
if [[ "$server_uuid" != "$primary_member" ]]; then
  echo "MySQL $port Instance is slave ........"
  exit 0
else
  # NOTE(review): this counts every row in the membership table, whatever its
  # MEMBER_STATE; confirm whether only ONLINE members should be counted.
  node_num=$("${mysql_cmd[@]}" -Nse "select count(*) from performance_schema.replication_group_members")
  # With no secondary at all, the primary also registers for the slave role.
  if [[ "$node_num" -eq 1 ]]; then
    echo "MySQL $port Instance is slave ........"
    exit 0
  else
    echo "MySQL $port Instance is master ........"
    exit 2
  fi
fi
注意:在mydb2、mydb3上需要把检测脚本中的host调整为各自节点的IP
{
"service": {
"name": "r-test-3306-mydb-ser",
"tags": ["测试-3306"],
"address": "192.168.1.101",
"meta": {
"meta": "for my service"
},
"port": 3306,
"enable_tag_override": false,
"checks": [
{
"args": ["/data/consul/shell/check_mysql_mgr_slave.sh"],
"interval": "1s"
}
]
}
}
[root@mydb1 ~]# cat /opt/consul/conf/w-test-mgr-ser.json
{
"service": {
"name": "w-test-3306-mydb-ser",
"tags": ["测试-3306"],
"address": "192.168.1.101",
"meta": {
"meta": "for my service"
},
"port": 3306,
"enable_tag_override": false,
"checks": [
{
"args": ["/data/consul/shell/check_mysql_mgr_master.sh"],
"interval": "10s"
}
]
}
}
注意:在mydb2、mydb3上需要把配置文件中的address调整为各自节点的IP
检测脚本如下
[root@mydb1 ~]# cat /data/consul/shell/check_mysql_mgr_master.sh
#!/bin/bash
#
# Health check for the "write" (primary) role of a MySQL Group Replication
# member, meant to be run as a Consul script check.
#
# Exit status:
#   0 - MySQL answers, this member is ONLINE and it is the group primary
#   2 - MySQL is unreachable, the member is not ONLINE, or it is a secondary
host="192.168.1.101"
port=3306
user="dba_user"
password="msds007"
# NOTE(review): the password is passed on the command line, so it is visible
# in the process list; consider an option file or login path instead.
# The client command is kept in an array so every argument stays one word.
mysql_cmd=(/usr/local/mysql/bin/mysql "-u$user" "-h$host" -P "$port" "-p$password")

# Is MySQL alive? Probe first so a dead server costs one connection attempt
# instead of three.
value=$("${mysql_cmd[@]}" -Nse "select 1")
if [[ -z "$value" ]]; then
  echo "mysql $port is down....."
  exit 2
fi

# NOTE(review): newer MySQL releases may expose the primary through
# replication_group_members.MEMBER_ROLE instead of this status variable;
# confirm against the server version in use.
primary_member=$("${mysql_cmd[@]}" -Nse "select variable_value from performance_schema.global_status WHERE VARIABLE_NAME= 'group_replication_primary_member'")
server_uuid=$("${mysql_cmd[@]}" -Nse "select variable_value from performance_schema.global_variables where VARIABLE_NAME='server_uuid';")

# Is this member ONLINE in the group? The expansion is quoted so that an empty
# result (member not listed, or query failed) counts as "not online" instead
# of breaking the test and slipping through.
node_state=$("${mysql_cmd[@]}" -Nse "select MEMBER_STATE from performance_schema.replication_group_members where MEMBER_ID='$server_uuid'")
if [[ "$node_state" != "ONLINE" ]]; then
  echo "MySQL $port state is not online...."
  exit 2
fi

# Is this member the primary? The right-hand side is quoted to force a literal
# string comparison rather than a glob match.
if [[ "$server_uuid" == "$primary_member" ]]; then
  echo "MySQL $port Instance is master ........"
  exit 0
else
  echo "MySQL $port Instance is slave ........"
  exit 2
fi
[root@mydb1 ~]# cat /data/consul/shell/check_mysql_mgr_slave.sh
#!/bin/bash
#
# Health check for the "read" (secondary) role of a MySQL Group Replication
# member, meant to be run as a Consul script check.
#
# Exit status:
#   0 - MySQL answers, this member is ONLINE and is either a secondary or the
#       only member of the group (a lone primary also serves reads)
#   2 - MySQL is unreachable, the member is not ONLINE, or it is the primary
#       of a group that has other members
host="192.168.1.101"
port=3306
user="dba_user"
password="msds007"
# NOTE(review): the password is passed on the command line, so it is visible
# in the process list; consider an option file or login path instead.
# The client command is kept in an array so every argument stays one word.
mysql_cmd=(/usr/local/mysql/bin/mysql "-u$user" "-h$host" -P "$port" "-p$password")

# Is MySQL alive? Probe first so a dead server costs one connection attempt
# instead of three.
value=$("${mysql_cmd[@]}" -Nse "select 1")
if [[ -z "$value" ]]; then
  echo "mysql $port is down....."
  exit 2
fi

# NOTE(review): newer MySQL releases may expose the primary through
# replication_group_members.MEMBER_ROLE instead of this status variable;
# confirm against the server version in use.
primary_member=$("${mysql_cmd[@]}" -Nse "select variable_value from performance_schema.global_status WHERE VARIABLE_NAME= 'group_replication_primary_member'")
server_uuid=$("${mysql_cmd[@]}" -Nse "select variable_value from performance_schema.global_variables where VARIABLE_NAME='server_uuid';")

# Is this member ONLINE in the group? The expansion is quoted so that an empty
# result (member not listed, or query failed) counts as "not online"; unquoted,
# the broken test was skipped and the member was reported as a healthy slave.
node_state=$("${mysql_cmd[@]}" -Nse "select MEMBER_STATE from performance_schema.replication_group_members where MEMBER_ID='$server_uuid'")
if [[ "$node_state" != "ONLINE" ]]; then
  echo "MySQL $port state is not online...."
  exit 2
fi

# Is this member a secondary? The right-hand side is quoted to force a literal
# string comparison rather than a glob match.
if [[ "$server_uuid" != "$primary_member" ]]; then
  echo "MySQL $port Instance is slave ........"
  exit 0
else
  # NOTE(review): this counts every row in the membership table, whatever its
  # MEMBER_STATE; confirm whether only ONLINE members should be counted.
  node_num=$("${mysql_cmd[@]}" -Nse "select count(*) from performance_schema.replication_group_members")
  # With no secondary at all, the primary also registers for the slave role.
  if [[ "$node_num" -eq 1 ]]; then
    echo "MySQL $port Instance is slave ........"
    exit 0
  else
    echo "MySQL $port Instance is master ........"
    exit 2
  fi
fi
注意:在mydb2、mydb3上需要把检测脚本中的host调整为各自节点的IP
[root@mydb1 consul]# ./consul agent -config-dir=/opt/consul/conf > /data/consul/consul.log &
[root@mydb2 consul]# ./consul agent -config-dir=/opt/consul/conf > /data/consul/consul.log &
[root@mydb3 consul]# ./consul agent -config-dir=/opt/consul/conf > /data/consul/consul.log &
[root@mydb2 consul]# ./consul join 192.168.1.101
[root@mydb3 consul]# ./consul join 192.168.1.101
[root@mydb1 consul]# ./consul members
# dig @127.0.0.1 -p 8600 w-test-3306-mydb-ser.service.consul
# dig @127.0.0.1 -p 8600 r-test-3306-mydb-ser.service.consul
# dig @127.0.0.1 -p 8600 r-test-3306-mydb-ser.service.consul
Consul使用手册
http://www.liangxiansen.cn/2017/04/06/consul/
http://www.liangxiansen.cn/2017/04/06/consul/