orchestrator Raft Cluster Deployment
This post briefly describes how to deploy orchestrator as a Raft cluster. The deployment itself is simple; most of the effort goes into getting the configuration file right. The sample configuration here is tailored to our own environment,
so if you use it yourself, adjust it to fit your situation.
The walkthrough covers three parts: the configuration file, the daemon (init) script, and the proxy configuration (the official recommendation is to put a proxy in front so that all requests land on the leader).
Machine information:
192.168.1.100
192.168.1.101
192.168.1.102
Three machines in total; one orchestrator instance is deployed on each of them.
1. Configuration file
{ "Debug": true, "EnableSyslog": false, "ListenAddress": ":3000", "AgentsServerPort": ":3001", "MySQLTopologyUser": "orc_client_user", "MySQLTopologyPassword": "orc_client_user_password", "MySQLTopologyCredentialsConfigFile": "", "MySQLTopologySSLPrivateKeyFile": "", "MySQLTopologySSLCertFile": "", "MySQLTopologySSLCAFile": "", "MySQLTopologySSLSkipVerify": true, "MySQLTopologyUseMutualTLS": false, # 这里我使用的是sqlite数据库,如果使用mysql数据库请自己修改 "BackendDB": "sqlite3", "SQLite3DataFile": "/export/orc-sqlite3.db", "MySQLOrchestratorHost": "192.168.1.100", "MySQLOrchestratorPort": 3358, "MySQLOrchestratorDatabase": "orchestrator", "MySQLOrchestratorUser": "orchestrator_rw", "MySQLOrchestratorPassword": "orchestrator_pwd", "MySQLOrchestratorCredentialsConfigFile": "", "MySQLOrchestratorSSLPrivateKeyFile": "", "MySQLOrchestratorSSLCertFile": "", "MySQLOrchestratorSSLCAFile": "", "MySQLOrchestratorSSLSkipVerify": true, "MySQLOrchestratorUseMutualTLS": false, "MySQLConnectTimeoutSeconds": 5, "DefaultInstancePort": 3306, "SkipOrchestratorDatabaseUpdate": false, "SlaveLagQuery": "", "DiscoverByShowSlaveHosts": true, "InstancePollSeconds": 30, "UnseenInstanceForgetHours": 240, "SnapshotTopologiesIntervalHours": 0, "InstanceBulkOperationsWaitTimeoutSeconds": 10, "HostnameResolveMethod": "none", "MySQLHostnameResolveMethod": "none", "SkipBinlogServerUnresolveCheck": true, "ExpiryHostnameResolvesMinutes": 60, "RejectHostnameResolvePattern": "", "ReasonableReplicationLagSeconds": 10, "ProblemIgnoreHostnameFilters": [], "VerifyReplicationFilters": false, "ReasonableMaintenanceReplicationLagSeconds": 20, "CandidateInstanceExpireMinutes": 60, "AuditLogFile": "/tmp/orchestrator-audit.log", "AuditToSyslog": false, "RemoveTextFromHostnameDisplay": ".mydomain.com:3358", "ReadOnly": false, "AuthenticationMethod": "", "HTTPAuthUser": "", "HTTPAuthPassword": "", "AuthUserHeader": "", "PowerAuthUsers": [ "*" ], "ClusterNameToAlias": { "127.0.0.1": "test suite" }, "DetectClusterAliasQuery": "SELECT value FROM _vt.local_metadata WHERE name='ClusterAlias'", "DetectClusterDomainQuery": "", "DetectInstanceAliasQuery": "SELECT value FROM _vt.local_metadata WHERE name='Alias'", "DetectPromotionRuleQuery": "SELECT value FROM _vt.local_metadata WHERE name='PromotionRule'", "DataCenterPattern": "", "PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com", "DetectDataCenterQuery": "SELECT value FROM _vt.local_metadata where name='DataCenter'", "PromotionIgnoreHostnameFilters": [], "DetectSemiSyncEnforcedQuery": "SELECT @@global.rpl_semi_sync_master_wait_no_slave AND @@global.rpl_semi_sync_master_timeout > 1000000", "ServeAgentsHttp": false, "AgentsUseSSL": false, "AgentsUseMutualTLS": false, "AgentSSLSkipVerify": false, "AgentSSLPrivateKeyFile": "", "AgentSSLCertFile": "", "AgentSSLCAFile": "", "AgentSSLValidOUs": [], "UseSSL": false, "UseMutualTLS": false, "SSLSkipVerify": false, "SSLPrivateKeyFile": "", "SSLCertFile": "", "SSLCAFile": "", "SSLValidOUs": [], "StatusEndpoint": "/api/status", "StatusSimpleHealth": true, "StatusOUVerify": false, "AgentPollMinutes": 60, "UnseenAgentForgetHours": 6, "StaleSeedFailMinutes": 60, "SeedAcceptableBytesDiff": 8192, "PseudoGTIDPattern": "drop view if exists .*?`_pseudo_gtid_hint__", "PseudoGTIDMonotonicHint": "asc:", "DetectPseudoGTIDQuery": "", "BinlogEventsChunkSize": 10000, "SkipBinlogEventsContaining": [], "ReduceReplicationAnalysisCount": true, "FailureDetectionPeriodBlockMinutes": 60, "RecoveryPeriodBlockMinutes": 60, "RecoveryPeriodBlockSeconds": 3600, 
"RecoveryIgnoreHostnameFilters": [ ".*" ], "RecoverMasterClusterFilters": [ ".*" ], "RecoverIntermediateMasterClusterFilters": [ "_intermediate_master_pattern_" ], "OnFailureDetectionProcesses": [ "echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log" ], "PreFailoverProcesses": [ "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log" ], "PostFailoverProcesses": [ "echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log" ], "PostUnsuccessfulFailoverProcesses": [ "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log", "curl -d '{\"isSuccessful\": {isSuccessful}, \"failureType\": \"{failureType}\", \"failureDescription\": \"{failureDescription}\", \"failedHost\": \"{failedHost}\", \"failedPort\": {failedPort}, \"failureCluster\": \"{failureCluster}\", \"failureClusterAlias\": \"{failureClusterAlias}\", \"failureClusterDomain\": \"{failureClusterDomain}\", \"countSlaves\": {countSlaves}, \"countReplicas\": {countReplicas}, \"isDowntimed\": {isDowntimed}, \"autoMasterRecovery\": {autoMasterRecovery}, \"autoIntermediateMasterRecovery\": {autoIntermediateMasterRecovery}, \"orchestratorHost\": \"{orchestratorHost}\", \"recoveryUID\": \"{recoveryUID}\", \"lostSlaves\": \"{lostSlaves}\", \"lostReplicas\": \"{lostReplicas}\", \"slaveHosts\": \"{slaveHosts}\", \"replicaHosts\": \"{replicaHosts}\"}' http://test.domain.com/api/failover" ], "PostMasterFailoverProcesses": [ "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log", "curl -d '{\"isSuccessful\": {isSuccessful}, \"failureType\": \"{failureType}\", \"failureDescription\": \"{failureDescription}\", \"failedHost\": \"{failedHost}\", \"failedPort\": {failedPort}, \"failureCluster\": \"{failureCluster}\", \"failureClusterAlias\": \"{failureClusterAlias}\", \"failureClusterDomain\": \"{failureClusterDomain}\", \"countSlaves\": {countSlaves}, \"countReplicas\": {countReplicas}, \"isDowntimed\": {isDowntimed}, \"autoMasterRecovery\": {autoMasterRecovery}, \"autoIntermediateMasterRecovery\": {autoIntermediateMasterRecovery}, \"orchestratorHost\": \"{orchestratorHost}\", \"recoveryUID\": \"{recoveryUID}\", \"successorHost\": \"{successorHost}\", \"successorPort\": {successorPort}, \"lostSlaves\": \"{lostSlaves}\", \"lostReplicas\": \"{lostReplicas}\", \"slaveHosts\": \"{slaveHosts}\", \"successorAlias\": \"{successorAlias}\",\"replicaHosts\": \"{replicaHosts}\"}' http://test.domain.com/api/failover" ], "PostIntermediateMasterFailoverProcesses": [ "echo 'Recovered from {failureType} on {failureCluster}. 
Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log" ], "CoMasterRecoveryMustPromoteOtherCoMaster": true, "DetachLostSlavesAfterMasterFailover": true, "ApplyMySQLPromotionAfterMasterFailover": true, "MasterFailoverLostInstancesDowntimeMinutes": 0, "PostponeSlaveRecoveryOnLagMinutes": 0, "OSCIgnoreHostnameFilters": [], "GraphiteAddr": "", "GraphitePath": "", "GraphiteConvertHostnameDotsToUnderscores": true, "RaftEnabled": true, "RaftDataDir": "/export/Data/orchestrator", "RaftBind": "192.168.1.100", "RaftAdvertise": "192.168.1.100", "DefaultRaftPort": 10008, "RaftNodes": [ "192.168.1.100", "192.168.1.101", "192.168.1.102", ] }
Only the configuration for 192.168.1.100 is listed above; the other two machines use essentially the same file. The main thing to change is the values of RaftBind and RaftAdvertise, which must point to each node's own address; adjust the remaining parameters to your own needs.
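If you manage the files by hand, a small helper like the one below can produce the per-node configs from the .100 file. This is only a convenience sketch (the output file names are my own choice); editing the two fields manually works just as well.

# Generate configs for the other two nodes by rewriting RaftBind/RaftAdvertise.
for ip in 192.168.1.101 192.168.1.102; do
    sed -e 's/"RaftBind": "192.168.1.100"/"RaftBind": "'"$ip"'"/' \
        -e 's/"RaftAdvertise": "192.168.1.100"/"RaftAdvertise": "'"$ip"'"/' \
        orchestrator.conf.json > orchestrator.conf.json."$ip"
done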
Once the configuration is ready, package the orchestrator program and place it under the /export/App/orchestrator directory.
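A minimal sketch of the layout, using the paths assumed by the config and init script in this post; where you unpack the orchestrator release from is up to you:

mkdir -p /export/App/orchestrator /export/Logs/orchestrator /export/Data/orchestrator
# Copy the orchestrator binary and its resources/ directory (used by the web UI)
# from wherever the release was unpacked, plus the config file prepared above.
cp -a orchestrator resources /export/App/orchestrator/
cp orchestrator.conf.json /export/App/orchestrator/orchestrator.conf.json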
2. SQLite database file
Because the configuration uses SQLite as the backend database, the SQLite data file needs to be placed at the corresponding path (SQLite3DataFile, i.e. /export/orc-sqlite3.db here).
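There is nothing special to seed; to my understanding orchestrator initializes the SQLite schema itself on first start, so it is enough to make sure the configured path exists and is writable by the user running the daemon:

# Path taken from SQLite3DataFile in the config above.
touch /export/orc-sqlite3.db
chown root:root /export/orc-sqlite3.db   # adjust to the user the daemon runs as
chmod 640 /export/orc-sqlite3.db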
3. Daemon script
With the configuration file in place, the next step is a daemon (init) script so the process can be restarted easily when it fails.
The script is as follows:
#!/bin/bash
# orchestrator daemon
# chkconfig: 345 20 80
# description: orchestrator daemon
# processname: orchestrator
# Script credit: http://werxltd.com/wp/2012/01/05/simple-init-d-script-template/

# Installation path; this must match the directory where the orchestrator binary was placed.
DAEMON_PATH="/export/App/orchestrator"

DAEMON=orchestrator
DAEMONOPTS="-config /export/App/orchestrator/orchestrator.conf.json --verbose http"

NAME=orchestrator
DESC="orchestrator: MySQL replication management and visualization"
PIDFILE=/var/run/$NAME.pid
SCRIPTNAME=/etc/init.d/$NAME

# Limit the number of file descriptors (and sockets) used by
# orchestrator. This setting should be fine in most cases but a
# large busy environment may reach this limit. If exceeded expect
# to see errors of the form:
# 2017-06-12 02:33:09 ERROR dial tcp 10.1.2.3:3306: connect: cannot assign requested address
# To avoid touching this script you can use /etc/orchestrator_profile
# to increase this limit.
ulimit -n 16384

# initially a noop but can be adjusted by modifying orchestrator_profile
# - see https://github.com/github/orchestrator/issues/227 for more details.
post_start_daemon_hook () {
    # by default do nothing
    :
}

# Start the orchestrator daemon in the background.
# Note: stdout/stderr are redirected into /export/Logs/orchestrator, so look in
# that directory when you need the logs.
start_daemon () {
    # start up daemon in the background
    $DAEMON_PATH/$DAEMON $DAEMONOPTS >> /export/Logs/orchestrator/${NAME}.log 2>&1 &
    # collect and print PID of started process
    echo $!
    # space for optional processing after starting orchestrator
    # - redirect stdout to stderr to prevent this corrupting the pid info
    post_start_daemon_hook 1>&2
}

# The file /etc/orchestrator_profile can be used to inject pre-service execution
# scripts, such as exporting variables or whatever. It's yours!
#[ -f /etc/orchestrator/orchestrator_profile ] && . /etc/orchestrator/orchestrator_profile

case "$1" in
start)
    printf "%-50s" "Starting $NAME..."
    cd $DAEMON_PATH
    PID=$(start_daemon)
    #echo "Saving PID" $PID " to " $PIDFILE
    if [ -z $PID ]; then
        printf "%s\n" "Fail"
        exit 1
    elif [ -z "$(ps axf | awk '{print $1}' | grep ${PID})" ]; then
        printf "%s\n" "Fail"
        exit 1
    else
        echo $PID > $PIDFILE
        printf "%s\n" "Ok"
    fi
    ;;
status)
    printf "%-50s" "Checking $NAME..."
    if [ -f $PIDFILE ]; then
        PID=$(cat $PIDFILE)
        if [ -z "$(ps axf | awk '{print $1}' | grep ${PID})" ]; then
            printf "%s\n" "Process dead but pidfile exists"
            exit 1
        else
            echo "Running"
        fi
    else
        printf "%s\n" "Service not running"
        exit 1
    fi
    ;;
stop)
    printf "%-50s" "Stopping $NAME"
    PID=$(cat $PIDFILE)
    cd $DAEMON_PATH
    if [ -f $PIDFILE ]; then
        kill -TERM $PID
        rm -f $PIDFILE
        # Wait for orchestrator to stop otherwise restart may fail.
        # (The newly restarted process may be unable to bind to the
        # currently bound socket.)
        while ps -p $PID >/dev/null 2>&1; do
            printf "."
            sleep 1
        done
        printf "\n"
        printf "Ok\n"
    else
        printf "%s\n" "pidfile not found"
        exit 1
    fi
    ;;
restart)
    $0 stop
    $0 start
    ;;
reload)
    PID=$(cat $PIDFILE)
    cd $DAEMON_PATH
    if [ -f $PIDFILE ]; then
        kill -HUP $PID
        printf "%s\n" "Ok"
    else
        printf "%s\n" "pidfile not found"
        exit 1
    fi
    ;;
*)
    echo "Usage: $0 {status|start|stop|restart|reload}"
    exit 1
esac
If the application is placed in a different directory, just adjust the corresponding paths in the script; nothing else needs to change.
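To have it managed as a regular service, install the script under /etc/init.d (matching the SCRIPTNAME variable it already defines). A sketch, assuming the script was saved locally as orchestrator.init:

cp orchestrator.init /etc/init.d/orchestrator
chmod +x /etc/init.d/orchestrator
chkconfig --add orchestrator   # enabled by the "# chkconfig: 345 20 80" header above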
Once everything is in place, start orchestrator on all three machines and check whether leader election works correctly.
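One way to check the election from the command line is to hit the HTTP API on each node; /api/leader-check is the same endpoint the HAProxy health check below uses, and to my understanding only the current Raft leader answers it with HTTP 200:

service orchestrator start                       # run on each of the three nodes
for ip in 192.168.1.100 192.168.1.101 192.168.1.102; do
    echo -n "$ip: "
    curl -s -o /dev/null -w "%{http_code}\n" "http://$ip:3000/api/leader-check"
done
# Exactly one node should report 200. The /api/status endpoint configured above
# can also be queried on every node for basic health.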
4. HAProxy proxy
Install haproxy:
yum install haproxy
Configuration file:
#---------------------------------------------------------------------
# Example configuration for a possible web application.  See the
# full configuration options online.
#
#   http://haproxy.1wt.eu/download/1.4/doc/configuration.txt
#
#---------------------------------------------------------------------

#---------------------------------------------------------------------
# Global settings
#---------------------------------------------------------------------
global
    # to have these messages end up in /var/log/haproxy.log you will
    # need to:
    #
    # 1) configure syslog to accept network log events.  This is done
    #    by adding the '-r' option to the SYSLOGD_OPTIONS in
    #    /etc/sysconfig/syslog
    #
    # 2) configure local2 events to go to the /var/log/haproxy.log
    #    file. A line like the following can be added to
    #    /etc/sysconfig/syslog
    #
    #    local2.*    /var/log/haproxy.log
    #
    log         127.0.0.1 local2

    chroot      /var/lib/haproxy
    pidfile     /var/run/haproxy.pid
    maxconn     4000
    user        haproxy
    group       haproxy
    daemon

    # turn on stats unix socket
    stats socket /var/lib/haproxy/stats

#---------------------------------------------------------------------
# common defaults that all the 'listen' and 'backend' sections will
# use if not designated in their block
#---------------------------------------------------------------------
defaults
    mode                    http
    log                     global
    option                  httplog
    option                  dontlognull
    option http-server-close
    option forwardfor       except 127.0.0.0/8
    option                  redispatch
    retries                 3
    timeout http-request    10s
    timeout queue           1m
    timeout connect         10s
    timeout client          1m
    timeout server          1m
    timeout http-keep-alive 10s
    timeout check           10s
    maxconn                 3000

#---------------------------------------------------------------------
# main frontend which proxys to the backends
#---------------------------------------------------------------------
#frontend  main *:5000
#    acl url_static       path_beg       -i /static /images /javascript /stylesheets
#    acl url_static       path_end       -i .jpg .gif .png .css .js
#
#    use_backend static          if url_static
#    default_backend             app

#---------------------------------------------------------------------
# static backend for serving up images, stylesheets and such
#---------------------------------------------------------------------
#backend static
#    balance     roundrobin
#    server      static 127.0.0.1:4331 check

#---------------------------------------------------------------------
# round robin balancing between the various backends
#---------------------------------------------------------------------
#backend app
#    balance     roundrobin
#    server  app1 127.0.0.1:5001 check
#    server  app2 127.0.0.1:5002 check
#    server  app3 127.0.0.1:5003 check
#    server  app4 127.0.0.1:5004 check

listen orchestrator
    stats enable
    bind *:9888
    mode http
    stats refresh 30s
    stats uri /admin
    bind 0.0.0.0:80 process 1
    bind 0.0.0.0:80 process 2
    bind 0.0.0.0:80 process 3
    bind 0.0.0.0:80 process 4
    option httpchk GET /api/leader-check
    maxconn 20000
    balance first
    retries 1
    timeout connect 1000
    timeout check 300
    timeout server 30s
    timeout client 30s
    default-server port 3000 fall 1 inter 1000 rise 1 downinter 1000 on-marked-down shutdown-sessions weight 10
    server 192.168.1.100 192.168.1.100:3000 check
    server 192.168.1.101 192.168.1.101:3000 check
    server 192.168.1.102 192.168.1.102:3000 check
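Before reloading, the configuration can be syntax-checked with haproxy itself (assuming the default config path from the yum install):

haproxy -c -f /etc/haproxy/haproxy.cfg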
Restart the haproxy service and check that it is running:
service haproxy restart
service haproxy status
After restarting, check the stats page (configured above on port 9888, URI /admin).
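A quick end-to-end check through the proxy; the address below is only a placeholder for whichever machine runs haproxy:

# Replace with the address of the machine running haproxy.
HAPROXY_HOST=192.168.1.100
# Requests on port 80 are forwarded to whichever orchestrator node currently
# passes the /api/leader-check health check, i.e. the Raft leader.
curl -i "http://$HAPROXY_HOST/api/leader-check"
# The stats page configured above is served at http://$HAPROXY_HOST:9888/admin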