监控系统告警脚本集合
告警系统对接原则:以脚本执行的返回值为准。返回值为0表示正常,返回值为1表示异常;出现异常时,根据预配置内容发出告警短信或邮件。
mysql拨测监控告警
#!/bin/bash
# MySQL liveness probe: runs "select 1" against the local instance and maps
# the answer to the exit status (0 = healthy, 1 = unhealthy).
probe_output=$(/apps/svr/mysql_3306/bin/mysql -uuserAndPassword -puserAndPassword -h127.0.0.1 -N -e "select 1" 2>/dev/null)

# Anything other than a numeric 1 (including empty output) counts as a failure.
if [[ $probe_output -eq 1 ]]; then
  message="select 1 is OK"
  status=0
else
  message="ERROR,select 1 is not OK"
  status=1
fi

echo "$message"
exit "$status"
mysql连接数超过90%告警
#!/bin/bash
# Alert when MySQL connection usage reaches 90% of max_connections.
# Exit status: 0 = usage below the threshold,
#              1 = usage at/above the threshold, or usage could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
threshold=90

# Print the value column of a single-row "SHOW ..." statement.
query_value() {
  "$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -N -e "$1" 2>/dev/null \
    | awk '{print $2}'
}

# Threads_connected is the number of currently open connections, which is the
# figure max_connections limits (Threads_running only counts active threads).
connected=$(query_value "show global status like 'Threads_connected'")
max_conn=$(query_value "show global variables like 'max_connections'")

# Without two sane integers no ratio can be computed; report abnormal rather
# than silently passing.
if ! [[ $connected =~ ^[0-9]+$ && $max_conn =~ ^[1-9][0-9]*$ ]]; then
  echo "ERROR,failed to read Threads_connected/max_connections"
  exit 1
fi

# Integer comparison: connected / max_conn < threshold / 100.
# (A string comparison such as [[ "100.00%" < "90%" ]] is lexicographic and
# would treat 100% as below 90%.)
if (( connected * 100 < max_conn * threshold )); then
  echo "连接数正常"
  exit 0
else
  echo "当前连接数超过90%,告警!"
  exit 1
fi
mysql主从状态监控异常告警
#!/bin/bash
# Replication health check for a MySQL slave.
#   - Alerts (exit 1) when Slave_IO_Running or Slave_SQL_Running is not "Yes".
#   - Alerts (exit 1) when Seconds_Behind_Master is greater than 60 seconds.
#   - A host that returns no slave status (master or standalone) exits 0.
# The monitoring account needs the "replication client" privilege, e.g.:
#   grant replication client on *.* to repl@'%';
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
max_delay=60
DATE=$(date +"%Y-%m-%d %H:%M:%S") # current date and time

# Query once and reuse the output for every field.
slave_status=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" \
  -S /apps/run/mysql_3306/mysql.sock -e "show slave status\G" 2>/dev/null)

# No output at all: this host is a master or a standalone instance.
# NOTE(review): a failed connection also produces no output and lands here.
if [[ -z $slave_status ]]; then
  echo "Master"
  exit 0
fi

# Print the value of the "\G" row whose label is exactly $1, so the result
# does not depend on which other rows happen to be present or on their order.
slave_field() {
  awk -v key="$1:" '$1 == key {print $2; exit}' <<<"$slave_status"
}

io_running=$(slave_field Slave_IO_Running)
sql_running=$(slave_field Slave_SQL_Running)
seconds_behind=$(slave_field Seconds_Behind_Master)

if [[ $io_running == "Yes" && $sql_running == "Yes" ]]; then
  # Both replication threads are running.
  echo "[$DATE] [INFO] Master-slave synchronization is running!"
else
  # Replication is broken: alert.
  echo "[$DATE] [ERROR] Master-slave synchronization is not running!"
  exit 1
fi

# Check the replication delay; NULL or empty values are skipped instead of
# being fed to a numeric test.
if [[ $seconds_behind =~ ^[0-9]+$ ]] && (( 10#$seconds_behind > max_delay )); then
  echo "[$DATE] [ERROR] Master-slave synchronization delay time is greater than 60 seconds!"
  exit 1
fi

exit 0
mysql集群未提交长事务监控异常告警
#!/bin/bash
# Long-running transaction check: counts InnoDB transactions that have been
# open for more than 60 seconds and alerts (exit 1) when more than 50 exist.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
DATE=$(date +"%Y-%m-%d %H:%M:%S")

# One result row per transaction that has been open for more than 60 seconds.
long_trx_sql='select a.id as conn_id, time_to_sec(timediff(now(),b.trx_started)) as trx_open_seconds from information_schema.processlist a right outer join information_schema.innodb_trx b on a.id = b.trx_mysql_thread_id where time_to_sec(timediff(now(),b.trx_started))>60;'

# The header row carries the column name trx_open_seconds, so it is filtered
# out before the remaining rows are counted.
open_trx=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -e "$long_trx_sql" 2>/dev/null \
  | grep -v trx_open_seconds \
  | wc -l)

if [ "$open_trx" -le 50 ]; then
  echo "[$DATE] [INFO] 事务超过60秒未提交数量: $open_trx"
  exit 0
fi

echo "[$DATE] [WARNING] 事务超过60秒未提交数量超过50个!"
exit 1
mysql缓存命中率
#!/bin/bash
# InnoDB buffer pool read hit ratio check.
#   ratio = (1 - Innodb_buffer_pool_reads / Innodb_buffer_pool_read_requests) * 100
# Exit status: 0 = ratio is at least 99,
#              1 = ratio is below 99, or the counters could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
min_ratio=99
DATE=$(date +"%Y-%m-%d %H:%M:%S")
Innodb_buffer_read_hit_ratio=0

# Read both counters with one query and store the ratio (two decimals) in
# Innodb_buffer_read_hit_ratio. Returns non-zero when the counters are
# missing or read_requests is zero (the ratio is undefined in that case).
getBufferRatio() {
  local status requests reads
  status=$("/data01/svr/mysql_${port}/bin/mysql" -u"${user}" -p"${password}" -h"${host}" \
    --port="${port}" -N -e "show global status like 'Innodb_buffer_pool_read%';" 2>/dev/null)
  # Match the variable names exactly; the LIKE pattern also returns the
  # read-ahead counters.
  requests=$(awk '$1 == "Innodb_buffer_pool_read_requests" {print $2}' <<<"$status")
  reads=$(awk '$1 == "Innodb_buffer_pool_reads" {print $2}' <<<"$status")
  if ! [[ $requests =~ ^[1-9][0-9]*$ && $reads =~ ^[0-9]+$ ]]; then
    return 1
  fi
  Innodb_buffer_read_hit_ratio=$(awk -v r="$reads" -v q="$requests" \
    'BEGIN{printf "%.2f\n", (1 - r / q) * 100}')
}

if ! getBufferRatio; then
  echo "[$DATE] [ERROR] failed to read InnoDB buffer pool counters"
  exit 1
fi

echo "$(date "+%Y-%m-%d_%H:%M:%S") $Innodb_buffer_read_hit_ratio"

# Compare on the integer part of the ratio.
if [ "${Innodb_buffer_read_hit_ratio%.*}" -lt "$min_ratio" ]; then
  echo "[$DATE] [WARNING] buffer命中率低于99!"
  exit 1
else
  echo "[$DATE] [INFO] buffer命中率: $Innodb_buffer_read_hit_ratio"
  exit 0
fi
MySQL锁表监控告警
#!/bin/bash
# Row lock wait check: alerts (exit 1) when Innodb_row_lock_current_waits is
# greater than 0, or when the counter cannot be read. Exit 0 otherwise.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
DATE=$(date +"%Y-%m-%d %H:%M:%S")

lock_waits=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -N \
  -e "SHOW STATUS LIKE 'Innodb_row_lock_current_waits'" 2>/dev/null \
  | awk '$1 == "Innodb_row_lock_current_waits" {print $2}')

# An empty or non-numeric value means the query failed; without this guard
# the numeric test below would error out and fall through to "normal".
if ! [[ $lock_waits =~ ^[0-9]+$ ]]; then
  echo "[$DATE] [ERROR] failed to read Innodb_row_lock_current_waits"
  exit 1
fi

if [ "$lock_waits" -gt 0 ]; then
  echo "[$DATE] [WARNING] 出现锁表!!"
  exit 1
else
  echo "[$DATE] [INFO] 锁表检查正常。"
  exit 0
fi
QPS大于10000告警
#!/bin/bash
# QPS check: Questions / Uptime as reported by "mysqladmin status", i.e. the
# average number of statements per second since the server started.
# Exit status: 0 = QPS is at most 10000,
#              1 = QPS is above 10000, or the counters could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
max_qps=10000
DATE=$(date +"%Y-%m-%d %H:%M:%S")

# A single call keeps Uptime and Questions consistent with each other.
status_line=$("$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" status 2>/dev/null)

# Print the value that follows the label "$1:" in the status line.
status_field() {
  awk -v label="$1:" '{for (i = 1; i < NF; i++) if ($i == label) {print $(i + 1); exit}}' \
    <<<"$status_line"
}

Uptime=$(status_field Uptime)
Questions=$(status_field Questions)

# Guard against a failed query and against dividing by a zero uptime.
if ! [[ $Uptime =~ ^[1-9][0-9]*$ && $Questions =~ ^[0-9]+$ ]]; then
  echo "[$DATE] [ERROR] failed to read Uptime/Questions from mysqladmin status"
  exit 1
fi

Ratio=$(awk -v q="$Questions" -v u="$Uptime" 'BEGIN{printf "%.2f\n", q / u}')

# Compare on the integer part of the ratio.
if [ "${Ratio%.*}" -gt "$max_qps" ]; then
  echo "[$DATE] [WARNING] QPS大于10000!"
  exit 1
else
  echo "[$DATE] [INFO] 当前QPS为: $Ratio"
  exit 0
fi
TPS大于4000告警
#!/bin/bash
# TPS check: (Com_commit + Com_rollback) / Uptime, i.e. the average number of
# transactions per second since the server started.
# Exit status: 0 = TPS is at most 4000,
#              1 = TPS is above 4000, or the counters could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
max_tps=4000
DATE=$(date +"%Y-%m-%d %H:%M:%S")

# Run mysqladmin with the shared connection options.
run_mysqladmin() {
  "$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" "$@" 2>/dev/null
}

Uptime=$(run_mysqladmin status | awk '{print $2}')

# One extended-status call serves both counters. Rows look like
# "| Com_commit | 123 |", so the name is field 2 and the value is field 4.
extended_status=$(run_mysqladmin extended-status)
rollback=$(awk '$2 == "Com_rollback" {print $4}' <<<"$extended_status")
commit=$(awk '$2 == "Com_commit" {print $4}' <<<"$extended_status")

# Guard against a failed query and against dividing by a zero uptime.
if ! [[ $Uptime =~ ^[1-9][0-9]*$ && $rollback =~ ^[0-9]+$ && $commit =~ ^[0-9]+$ ]]; then
  echo "[$DATE] [ERROR] failed to read Uptime/Com_commit/Com_rollback"
  exit 1
fi

TpsRatio=$(awk -v r="$rollback" -v c="$commit" -v u="$Uptime" 'BEGIN{printf "%.2f\n", (r + c) / u}')

# Compare on the integer part of the ratio.
if [ "${TpsRatio%.*}" -gt "$max_tps" ]; then
  echo "[$DATE] [WARNING] TPS大于4000!"
  exit 1
else
  echo "[$DATE] [INFO] 当前TPS为: $TpsRatio"
  exit 0
fi
haproxy后端状态监测
#!/bin/bash
# HAProxy backend check: fetches the status page, keeps the rows of the
# listed backends and counts how many of them contain DOWN.
# Exit 0 only when exactly two such rows exist, exit 1 otherwise.
# NOTE(review): presumably two backends are expected to be DOWN in this
# deployment - confirm the expected count.
stats_url='http://admin:admin@10.172.95.97:2000/status'
backend_pattern='order01|base|cust|idservice|irsc|sec|upc|ewe|ftpgw '

down_rows=$(curl -s "$stats_url" | grep -E "$backend_pattern" | grep -c DOWN)

case $down_rows in
  2) exit 0 ;;
  *) exit 1 ;;
esac
JAVA内存溢出告警-OutOfMemory
#!/bin/bash
# Java OutOfMemoryError check: scans the last 1000 lines of today's service
# log. Exit 0 when no java.lang.OutOfMemoryError line is found, exit 1
# otherwise.

# Today's log file name ends in MMDD. $(...) is required here: backticks do
# not nest, so an inner `date` would never be substituted into the path.
log_file="/apps/logs/svc/svc-node01-order01-$(date +%m%d).log"

# A missing log file simply yields a count of 0.
result=$(tail -n 1000 "$log_file" 2>/dev/null | grep -c 'java.lang.OutOfMemoryError')

if [[ $result = 0 ]]; then
  exit 0
else
  exit 1
fi
redis集群状态异常告警
#!/bin/bash
# Redis cluster state check: exit 0 when "cluster info" contains exactly one
# cluster_state line that also contains "ok", exit 1 otherwise.
state_ok_lines=$(/apps/svr/ids/redis-cli -c -p 11001 -h 127.0.0.1 cluster info \
  | grep cluster_state \
  | grep -c ok)

case $state_ok_lines in
  1) exit 0 ;;
  *) exit 1 ;;
esac
redis节点状态监测告警
#!/bin/bash
# Redis cluster node count check: exit 0 when cluster_known_nodes equals the
# expected number of nodes (6), exit 1 otherwise.
expected_nodes=6

# CLUSTER INFO lines are terminated with CRLF; the carriage return must be
# stripped or the value would be "6\r" and never compare equal.
known_nodes=$(/apps/svr/ids/redis-cli -c -p 11001 -h 127.0.0.1 cluster info \
  | tr -d '\r' \
  | awk -F ":" '$1 == "cluster_known_nodes" {print $2}')

if [[ $known_nodes == "$expected_nodes" ]]; then
  exit 0
else
  exit 1
fi
redis_slot异常告警
#!/bin/bash
# Redis slot check: exit 0 when cluster_slots_ok equals 16384 (every hash
# slot is served), exit 1 otherwise.
expected_slots=16384

# CLUSTER INFO lines are terminated with CRLF; the carriage return must be
# stripped or the value would be "16384\r" and never compare equal.
slots_ok=$(/apps/svr/ids/redis-cli -c -p 11001 -h 127.0.0.1 cluster info \
  | tr -d '\r' \
  | awk -F ":" '$1 == "cluster_slots_ok" {print $2}')

if [[ $slots_ok == "$expected_slots" ]]; then
  exit 0
else
  exit 1
fi
keepalivedVIP连通性监控告警
#!/bin/sh
# Keepalived VIP reachability check.
# Collects the IPv4 addresses found in keepalived.conf and pings each one.
# Exit 0 when every address answers without packet loss, exit 1 otherwise.
conf=/etc/keepalived/keepalived.conf

# Lines containing "real", "smtp_server", 127.0.0.1 or '#' are skipped before
# the addresses are extracted.
# Plain "sort -u" is used on purpose: with -n, addresses that share the same
# numeric prefix (e.g. 10.172.x.x) compare as equal and -u would drop all
# but one of them.
VIPS=$(grep -v real "$conf" | grep -v smtp_server | grep -v 127.0.0.1 | grep -v '#' \
  | grep -E -o "\b([0-9]{1,3}\.){3}[0-9]{1,3}\b" | sort -u)

num=0
for ips in $VIPS; do
  # Take the integer in front of "% packet loss" from ping's summary line.
  # Empty output (ping failed) or a fractional percentage does not match and
  # is therefore counted as a failure.
  loss=$(ping -w 2 -c 3 "${ips}" 2>/dev/null \
    | sed -n 's/.* \([0-9][0-9]*\)% packet loss.*/\1/p')
  if [ "$loss" != "0" ]; then
    # POSIX arithmetic; "let" is not available in every /bin/sh.
    num=$((num + 1))
  fi
done

if [ "$num" -eq 0 ]; then
  exit 0
else
  exit 1
fi
keepalivedVIP丢失告警
#!/bin/bash
# VIP loss check: compares the current number of IPv4 addresses on this host
# with the count stored by a previous run in /tmp/check_vip.log.
#   - No stored count yet: store the current count and finish.
#   - Counts equal: print "vip ok" and exit 0.
#   - Counts differ: store the new count and exit 1.
baseline_file=/tmp/check_vip.log

# Number of "inet" lines from "ip a", excluding loopback and IPv6 lines.
count_ipv4() {
  ip a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | wc -l
}

ip_count=$(count_ipv4)

# First run: record the baseline; the script ends with that command's status.
if [ ! -f "$baseline_file" ]; then
  count_ipv4 > "$baseline_file"
  exit
fi

vip_count=$(sudo cat "$baseline_file")

if [ "$ip_count" != "$vip_count" ]; then
  echo "$ip_count" > "$baseline_file"
  exit 1
fi

echo "vip ok"
exit 0
keepalived脑裂预警
说明:脑裂的判定需要结合多个节点的状态,而监控脚本应尽量保持简单,因此这里只在keepalived备节点上做监控:一旦发现VIP切换到备节点即发出告警,再由人工介入检查是否出现脑裂。
#!/bin/bash
# Split-brain early warning for a keepalived backup node: alerts (exit 1) as
# soon as any IPv4 address with a /32 prefix is present on this host.

# Match only "inet <address>/32" entries. A bare "grep 32" would also hit
# MAC addresses, MTU values or any address that merely contains "32".
result=$(ip addr | grep -c -E 'inet [0-9.]+/32')

if [[ $result = 0 ]]; then
  exit 0
else
  echo "keepalived从节点出现32位的vip,可能出现脑裂现象"
  exit 1
fi
rocketmq集群节点数量监控告警
#!/bin/bash
# RocketMQ cluster size check: counts the lines printed by
# "mqadmin clusterList" (minus lines containing "Version") and alerts
# (exit 1) when fewer than 4 remain.
DATE=$(date +"%Y-%m-%d %H:%M:%S")
declare -x JAVA_HOME="/apps/tools/jdk"
min_nodes=4

# NOTE(review): "grep -v Version" presumably drops the column header row -
# confirm against the mqadmin output of the deployed version.
NumCluster=$(sudo -E /apps/svr/mqbroker/rocketmq/bin/mqadmin clusterList -n 127.0.0.1:9876 \
  | grep -v Version \
  | wc -l)

# Alert when the cluster has FEWER nodes than expected, matching the message.
if [ "$NumCluster" -lt "$min_nodes" ]; then
  echo "[$DATE] [WARNING] rocketMQ集群节点小于4个!"
  exit 1
else
  echo "[$DATE] [INFO] rocketMQ集群节点数量为: $NumCluster"
  exit 0
fi
rocketmq消息数量异常告警
#!/bin/bash
# RocketMQ topic count check: counts the topics from "mqadmin topicList"
# whose names contain both "Blue" and "CIDC" and do not contain "RETRY".
# Alerts (exit 1) when fewer than 50 are found.
DATE=$(date +"%Y-%m-%d %H:%M:%S")
export JAVA_HOME=/apps/tools/jdk
export JAVA_BIN=/apps/tools/jdk/bin

topic_count=$(sudo -E /apps/svr/mqbroker/rocketmq/bin/mqadmin topicList -n 127.0.0.1:9876 2>/dev/null \
  | grep Blue \
  | grep CIDC \
  | grep -v RETRY \
  | wc -l)

if [ "$topic_count" -ge 50 ]; then
  echo "[$DATE] [INFO] rocketMQ消息topic数量当前为: $topic_count"
  exit 0
fi

echo "[$DATE] [WARNING] rocketMQ消息主题小于50个!"
exit 1
logstash日志报错告警
#!/bin/bash
# Logstash log check: exit 1 when any of the last 2000 lines of the logstash
# log contains "error", exit 0 otherwise.
log_file=/apps/logs/logstash/logstash-plain.log

error_lines=$(tail -2000 "$log_file" | grep -c 'error')

if [[ $error_lines != 0 ]]; then
  exit 1
fi
exit 0
elasticsearch集群个数异常告警
#!/bin/bash
# Elasticsearch node count check: exit 0 when number_of_nodes reported by
# the cluster health API equals 4, exit 1 otherwise (including when the API
# cannot be reached).
expected_nodes=4

# The pretty-printed line looks like:   "number_of_nodes" : 4,
# Keep only the digits of the value so that the surrounding space and comma
# cannot break the comparison. The URL is quoted because it contains "?".
result=$(curl -sS 'http://10.172.95.1:9201/_cluster/health?pretty' \
  | awk -F ":" '/"number_of_nodes"/ {gsub(/[^0-9]/, "", $2); print $2; exit}')

if [[ $result == "$expected_nodes" ]]; then
  exit 0
else
  exit 1
fi
elasticsearch数据节点个数异常告警
#!/bin/bash
# Elasticsearch data node count check: exit 0 when number_of_data_nodes
# reported by the cluster health API equals 4, exit 1 otherwise (including
# when the API cannot be reached).
expected_data_nodes=4

# The pretty-printed line looks like:   "number_of_data_nodes" : 4,
# Keep only the digits of the value so that the surrounding space and comma
# cannot break the comparison. The URL is quoted because it contains "?".
result=$(curl -sS 'http://10.172.95.1:9201/_cluster/health?pretty' \
  | awk -F ":" '/"number_of_data_nodes"/ {gsub(/[^0-9]/, "", $2); print $2; exit}')

if [[ $result == "$expected_data_nodes" ]]; then
  exit 0
else
  exit 1
fi
elasticsearch_java内存使用已超过48G
#!/bin/bash
# Elasticsearch JVM heap check: alerts (exit 1) when heap_used_in_bytes from
# the cluster stats API exceeds 48G, or when the value cannot be read.
# Exit 0 when usage is within the limit.
# 51539607552 = 48G
heap_limit=51539607552

# The pretty-printed line looks like:   "heap_used_in_bytes" : 123456,
# Keep only the digits of the value. The URL is quoted because it contains "?".
heap_used=$(curl -sS 'http://10.172.95.1:9201/_cluster/stats?pretty' \
  | awk -F ":" '/"heap_used_in_bytes"/ {gsub(/[^0-9]/, "", $2); print $2; exit}')

# No usable number means the check itself failed: report abnormal.
if ! [[ $heap_used =~ ^[0-9]+$ ]]; then
  exit 1
fi

# Numeric comparison; ">" inside [[ ]] would compare the values as strings.
if (( heap_used > heap_limit )); then
  exit 1
else
  exit 0
fi
zookeeper日志告警
#!/bin/bash
# Log check: exit 1 when any of the last 2000 lines of the log file contains
# "error", exit 0 otherwise.
# NOTE(review): this script sits under the ZooKeeper heading but reads the
# logstash log - presumably a copy-paste slip; confirm the intended log path.
log_file=/apps/logs/logstash/logstash-plain.log

error_lines=$(tail -2000 "$log_file" | grep -c 'error')

case $error_lines in
  0) exit 0 ;;
  *) exit 1 ;;
esac
zookeeper集群follower-mode变更告警
#!/bin/bash
# ZooKeeper role check: exit 0 when "zkServer.sh status" prints no line
# containing "follower", exit 1 when at least one such line is printed.
follower_lines=$(/apps/sh/zk/zkServer.sh status | grep -c follower)

if [[ $follower_lines != 0 ]]; then
  exit 1
fi
exit 0
zookeeper集群leader-mode变更告警
#!/bin/bash
# ZooKeeper role check: exit 0 when "zkServer.sh status" prints no line
# containing "leader", exit 1 when at least one such line is printed.
leader_lines=$(/apps/sh/zk/zkServer.sh status | grep -c leader)

if [[ $leader_lines != 0 ]]; then
  exit 1
fi
exit 0
api接口层面监控告警
#!/bin/bash
# API-level probe: sends one request to the gateway and exits 0 only when
# exactly one line of the response contains "OK"; exits 1 otherwise.
endpoint='http://10.172.95.186:8000/emop?appId=600006&method=SYAN_UNHQ_queryOfferStatus&channelTypeId=0&flowdId=202006091314501278181&format=json&status=1%0A'
payload='{ "productType": "vm"}'

# -w "\n" terminates the response body with a newline before it is scanned.
ok_lines=$(curl --location --request GET "$endpoint" \
  --header 'Content-Type: text/plain' \
  --data "$payload" \
  -w "\n" | grep -c OK)

case $ok_lines in
  1) exit 0 ;;
  *) exit 1 ;;
esac