postgres repmgr 自动故障转移 手动故障转移

yum源

rm -f /etc/yum.repos.d
wget -O /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-vault-8.5.2111.repo

yum clean all
yum makecache

安装部分常用工具

yum -y groupinstall "Development Tools"
yum -y install coreutils glib2 lrzsz dstat sysstat e4fsprogs xfsprogs readline-devel zlib-devel gcc gcc-c++ make 
yum -y install cmake smartmontools flex bison perl-devel perl-ExtUtils-* openldap-devel jadetex bzip2 openssl-devel 
yum -y install pam-devel libxml2-devel libxslt-devel tcl-devel lz4 clang-devel libicu-devel llvm-devel 

yum -y install systemtap-sdt-devel perl krb5-devel pam libuuid libuuid-devel systemd-devel tcl lz4-devel 

yum -y install python3 python3-devel libzstd-devel libselinux-devel pcre-devel tree psmisc  yum-utils libffi-devel ncurses-devel

创建postgres用户和组

groupadd -g 60000 postgres
useradd -u 60000 -g postgres postgres
echo "postgres" |passwd --stdin postgres

目录结构创建与权限调整

mkdir -p /postgresql/pg16 /pgdata/data /backup /arch
chmod -R 775 /postgresql
chown -R postgres:postgres /postgresql
chmod -R 775 /backup
chown -R postgres:postgres /backup
chmod -R 775 /pgdata
chown -R postgres:postgres /pgdata
chmod -R 775 /arch
chown -R postgres:postgres /arch

数据库源码编译与安装

下载地址 https://www.postgresql.org/ftp/source/v16.2/

cd /postgresql/pg16/
wget https://ftp.postgresql.org/pub/source/v16.2/postgresql-16.2.tar.gz
tar -zxvf postgresql-16.2.tar.gz
cd postgresql-16.2/
chmod +x configure
./configure --prefix=/postgresql/pg16 --with-pgport=5432 --with-openssl --with-perl  --with-blocksize=8 
make world && make install-world

更改软件目录和数据目录的所有权

chown -R postgres. /postgresql/pg16 /pgdata /backup /arch

设置环境变量

su - postgres
vi .bashrc
# 添加如下内容至.bashrc
if [ -f /etc/bashrc ]; then
 . /etc/bashrc
fi
export PATH=/postgresql/pg16/bin/:$PATH
export PGDATA=/pgdata/data
export PGDATABASE=postgres
export PGUSER=postgres
export PGPORT=6543
export PGLOCALEDIR=/postgresql/pg16/share/locale
alias pg_ctl='LD_LIBRARY_PATH=/postgresql/pg16/lib pg_ctl'





#重新编译
source ~/.bashrc

初始化主节点

su - postgres
initdb -D /pgdata/data -E UTF8 --locale=zh_CN.utf8

# 创建用户(随后根据提示输入密码)
/postgresql/pg16/bin/createuser -P -s -e postgres
/postgresql/pg16/bin/createuser -P -s -e repl

编辑pg_hba.conf文件以启用信任或scram-sha-256认证方式:

su - postgres
vi $PGDATA/pg_hba.conf
host    replication     all             192.168.0.92            trust
host    replication     all             192.168.0.93            trust
host    replication     all             192.168.0.94            trust
host    replication     all             192.168.0.95            trust

postgresql.conf

shared_preload_libraries = 'repmgr'
listen_addresses = '*'
port = 6543
wal_level = replica
wal_log_hints = on

启动服务

su - postgres
pg_ctl start

获取最新repmgr

https://www.repmgr.org/

yum install curl-devel
yum install json-c-devel



mkdir /repmgr
chown -R postgres:postgres /repmgr
su - postgres
cd /repmgr
tar -zxvf repmgr-5.4.1.tar.gz
cd repmgr-5.4.1/
./configure USE_PGXS=1 PG_CONFIG=/postgresql/pg16/bin/pg_config
make && make install

配置免密登录

su - postgres
ssh-keygen -t rsa
一路回车直到完成
--配置所有节点
ssh-copy-id -i ~/.ssh/id_rsa.pub -p 22 postgres@192.168.0.92

创建repmgr目录

su - postgres
mkdir /postgresql/pg16/repmgr /pgdata/repmgr_log

repmgr主节点

su - postgres
mkdir /postgresql/pg16/repmgr /pgdata/repmgr_log

vim /postgresql/pg16/repmgr/repmgr.conf
注意修改ip
添加内容

# 节点标识设置
node_id = 1
node_name = pg01
# 数据库连接信息
conninfo = 'host=192.168.0.92 user=postgres dbname=repmgr port=6543 connect_timeout=2'
# 数据目录路径
data_directory = '/pgdata/data'
# PostgreSQL二进制目录路径
pg_bindir = '/postgresql/pg16/bin'
# 日志级别与日志文件位置
log_level = info
log_file = '/pgdata/repmgr_log/repmgr.log'
# 故障切换策略设定
failover = automatic
# 提升从库为新主库的命令
promote_command = '/postgresql/pg16/bin/repmgr standby promote -f /postgresql/pg16/repmgr/repmgr.conf --log-to-file'
# 从库跟随主库命令
follow_command = '/postgresql/pg16/bin/repmgr standby follow -f /postgresql/pg16/repmgr/repmgr.conf --log-to-file --upstream-node-id=%n'
# 监控历史记录存储选项
monitoring_history = true
# 监控间隔时间(秒)
monitor_interval_secs = 5
# 重连尝试次数
reconnect_attempts = 3
# 两次重连之间的间隔时间(秒)
reconnect_interval = 5
# repmgrd服务启动命令
repmgrd_service_start_command = '/postgresql/pg16/bin/repmgrd repmgrd -f /postgresql/pg16/repmgr/repmgr.conf --pid-file /tmp/repmgrd.pid'
# repmgrd服务停止命令
repmgrd_service_stop_command = 'kill cat /tmp/repmgrd.pid'

从节点配置(/postgresql/pg16/repmgr/repmgr.conf)

# 节点标识设置
node_id = 2
node_name = pg02
# 数据库连接信息
conninfo = 'host=192.168.0.93 user=postgres dbname=repmgr port=6543 connect_timeout=2'
# 数据目录路径
data_directory = '/pgdata/data'
# PostgreSQL二进制目录路径
pg_bindir = '/postgresql/pg16/bin'
# 日志级别与日志文件位置
log_level = info
log_file = '/pgdata/repmgr_log/repmgr.log'
# 故障切换策略设定
failover = automatic
# 提升从库为新主库的命令
promote_command = '/postgresql/pg16/bin/repmgr standby promote -f /postgresql/pg16/repmgr/repmgr.conf --log-to-file'
# 从库跟随主库命令
follow_command = '/postgresql/pg16/bin/repmgr standby follow -f /postgresql/pg16/repmgr/repmgr.conf --log-to-file --upstream-node-id=%n'
# 监控历史记录存储选项
monitoring_history = true
# 监控间隔时间(秒)
monitor_interval_secs = 5
# 重连尝试次数
reconnect_attempts = 3
# 两次重连之间的间隔时间(秒)
reconnect_interval = 5
# repmgrd服务启动命令
repmgrd_service_start_command = '/postgresql/pg16/bin/repmgrd repmgrd -f /postgresql/pg16/repmgr/repmgr.conf --pid-file /tmp/repmgrd.pid'
# repmgrd服务停止命令
repmgrd_service_stop_command = 'kill cat /tmp/repmgrd.pid'

添加repmgr所需要的数据库(postgre主数据库操作)

su - postgres
createdb repmgr;

repmgr注册主节点(repmgr主操作)

su - postgres
# 注册主节点
repmgr -f /postgresql/pg16/repmgr/repmgr.conf master register
# 显示集群状态信息
repmgr -f /postgresql/pg16/repmgr/repmgr.conf cluster show

从库克隆数据

su - postgres
pg_ctl stop
rm -rf /pgdata/data/
repmgr -h 192.168.0.92 --replication-user=postgres -d repmgr -D /pgdata/data -f /postgresql/pg16/repmgr/repmgr.conf standby clone --upstream-node-id=1

启动从库

pg_ctl start

注册从库到集群并验证集群状态

su - postgres
repmgr -f /postgresql/pg16/repmgr/repmgr.conf standby register -F
repmgr -f /postgresql/pg16/repmgr/repmgr.conf cluster show

repmgr守护进程(所有节点)

su - postgres
repmgr daemon start

查询repmgr守护进程(repmgrd)的状态确认其正常运行:

repmgr -f /postgresql/pg16/repmgr/repmgr.conf daemon status -v

手动故障转移

主从角色切换

repmgr -f /postgresql/pg16/repmgr/repmgr.conf standby switchover -U postgres

其他从节点从跟随新主

repmgr -f /postgresql/pg16/repmgr/repmgr.conf standby follow

原主节点重新加入集群

# 连接新主节点的数据库
# 需要先停止故障的主节点
pg_ctl stop
# 此命令会自动尝试启动并在启动过程中持续ping服务,此过程中可能会提示无法ping通服务,耐心等待即可
repmgr -f /postgresql/pg16/repmgr/repmgr.conf node rejoin -d "host=192.168.0.92 user=postgres dbname=repmgr port=6543 connect_timeout=2" --force-rewind

自动故障转移

初始化一个新的主数据库

su - postgres
initdb -D /pgdata/data -E UTF8 --locale=zh_CN.utf8

postgresql.conf添加
shared_preload_libraries = 'repmgr'


pg_ctl start

# 创建用户(随后根据提示输入密码)
/postgresql/pg16/bin/createuser -P -s -e postgres
/postgresql/pg16/bin/createuser -P -s -e repl

createdb repmgr;
# 主子节点启动
repmgrd -f /postgresql/pg16/repmgr/repmgr.conf --pid-file=/tmp/repmgrd.pid

停止repmgrd

kill `cat /tmp/repmgrd.pid`

witness配置 ( /postgresql/pg16/repmgr/repmgr.conf )

部署repmgr过程和主、从节点一致,此处省略

# 节点标识设置
node_id = 4
node_name = witness
# 数据库连接信息
conninfo = 'host=192.168.0.95 user=postgres dbname=repmgr port=6543 connect_timeout=2'
# 数据目录路径
data_directory = '/pgdata/data'
# PostgreSQL二进制目录路径
pg_bindir = '/postgresql/pg16/bin'
# 日志级别与日志文件位置
log_level = info
log_file = '/pgdata/repmgr_log/repmgr.log'
# 故障切换策略设定
failover = automatic
# 提升从库为新主库的命令
promote_command = '/postgresql/pg16/bin/repmgr standby promote -f /postgresql/pg16/repmgr/repmgr.conf --log-to-file'
# 从库跟随主库命令
follow_command = '/postgresql/pg16/bin/repmgr standby follow -f /postgresql/pg16/repmgr/repmgr.conf --log-to-file --upstream-node-id=%n'
# 监控历史记录存储选项
monitoring_history = true
# 监控间隔时间(秒)
monitor_interval_secs = 5
# 重连尝试次数
reconnect_attempts = 3
# 两次重连之间的间隔时间(秒)
reconnect_interval = 5
# repmgrd服务启动命令
repmgrd_service_start_command = '/postgresql/pg16/bin/repmgrd repmgrd -f /postgresql/pg16/repmgr/repmgr.conf --pid-file /tmp/repmgrd.pid'
# repmgrd服务停止命令
repmgrd_service_stop_command = 'kill cat /tmp/repmgrd.pid'

注册witness

repmgr -f /postgresql/pg16/repmgr/repmgr.conf witness register -h 192.168.0.93 -drepmgr -Upostgres

参考

PostgreSQL 16.1与Repmgr实现高可用集群

重启repmgr
repmgr -f /postgresql/pg16/repmgr/repmgr.conf node service --action=restart

移除从库
repmgr -f /postgresql/pg16/repmgr/repmgr.conf standby unregister -F

查询节点状态
repmgr -f /postgresql/pg16/repmgr/repmgr.conf daemon status -v

repmgr -f /postgresql/pg16/repmgr/repmgr.conf service pause --dry-run
repmgr -f /postgresql/pg16/repmgr/repmgr.conf service unpause --dry-run

注意事项

如果是计划内的服务中断,需要先停止repmgrd,随后再停止postgres,避免自动执行了故障转移

常用命令

repmgr primary register  安装pg的repmgr扩展并注册为主节点
repmgr primary unregister 注销不活动的主节点
repmgr standby clone 从其他节点复制数据到从节点
repmgr standby register 注册从节点(添加从的信息到repmgr元数据)
repmgr standby unregister repmgr  元数据中移除从的信息
repmgr standby promote 将从提升为主
repmgr standby follow  将从跟随新主
repmgr standby switchover 将从提升为主并将主降级为从
repmgr witness register  注册一个观察节点
repmgr witness unregister  移除一个观察节点
repmgr node status  显示节点的基本信息和复制状态
repmgr node check  从复制的角度对节点进行健康监测
repmgr node rejoin   重新加入一个失效节点到集群
repmgr cluster show  显示所有集群中注册的节点信息
repmgr cluster matrix 在所有节点运行show并汇总
repmgr cluster crosscheck  在节点间两两交叉监测连接
repmgr cluster event 输出时间记录
repmgr cluster cleanup   清理监控历史
repmgr service status  节点状态


repmgr service pause --dry-run    暂停服务
repmgr service unpause --dry-run     启动服务

keepalived

创建目录、修改权限
su -
mkdir /keepalived
chmod -R 775 /keepalived
chown -R postgres:postgres /keepalived

VIP(基于脚本自动切换)

基于ip addr自动设置虚拟IP
/usr/bin/sudo /sbin/ip addr add 192.168.0.110/24 dev enp0s3
/usr/bin/sudo /sbin/ip addr del 192.168.0.110/24 dev enp0s3

需要确保用户具有sudo权限

添加权限
chmod u+x /usr/sbin/ip
chmod u+x /usr/sbin/arping
chmod u+s /sbin/ip

创建脚本 /postgres/repmgr_promote_command.sh

#!/bin/bash
  
set -o xtrace
PGCANDIDATES=(192.168.0.92 192.168.0.93 192.168.0.94 192.168.0.95)
# HOSTNAME=`hostname -i`
VIP=192.168.0.110
GW=192.168.1.1
DEVICE=enp0s3
STEP1="Remove the VIP on all nodes"
STEP2="Check if vip still online"
STEP3="Promte primary DB"
STEP4="Add vip on new primary DB"
STEP5="Clear ARP Cache"


###step1. Remove the VIP on all nodes
for candidate in "${PGCANDIDATES[@]}"; do
echo $candidate
ssh -p '22' postgres@$candidate -tt  "
ip addr del $VIP/24 dev $DEVICE
exit
"
done

####clear arp cache
/sbin/arping -I $DEVICE  -b -c 5 $GW
#/usr/bin/sudo /sbin/arping -I $DEVICE -s $VIP -b -c 5 $GW

#/usr/bin/sudo /sbin/arping -I enp0s3 -s 192.168.0.92 -b -c 5 192.168.1.1
#/usr/bin/sudo /sbin/arping -I enp0s3 -b -c 5 192.168.1.1



###step2.Check if vip still online
/usr/bin/ping -c1 -w1 -t5 ${VIP}
if [ $? -eq 0 ]; then
    echo repmgr_promote_command.sh: ${STEP2} : ${VIP} failed !!!  
    exit 1
fi


###step4.Add vip on new primary DB
/sbin/ip addr add $VIP/24 dev $DEVICE
if [ $? -ne 0 ]; then
    echo repmgr_promote_command.sh: ${STEP4} on ${HOSTNAME} failed !!!  
    exit 1
fi


####step5.Clear arp cache
/sbin/arping -I $DEVICE -s $VIP -b -c 3 $GW
if [ $? -ne 0 ]; then
    echo repmgr_promote_command.sh: ${STEP5} on ${HOSTNAME} failed !!!  
    exit 1
fi

修改/postgresql/pg16/repmgr/repmgr.conf
在原有promote_command基础上添加此脚本

promote_command = '/postgresql/pg16/bin/repmgr standby promote -f /postgresql/pg16/repmgr/repmgr.conf --log-to-file && /test/repmgt_promote_command.sh '

重启repmgr、repmgrd

repmgr  -f /postgresql/pg16/repmgr/repmgr.conf   node service --action=restart

repmgrd -f /postgresql/pg16/repmgr/repmgr.conf --pid-file=/tmp/repmgrd.pid
kill `cat /tmp/repmgrd.pid`
posted @ 2024-05-10 10:41  Hey,Coder!  阅读(69)  评论(1编辑  收藏  举报