如来神掌第二式第三招----Shell应用案例之主机监控

###############################################################################
# Name : Mahavairocana                                                                                                                                           
# Author : Mahavairocana                                                                                                                                         
# QQ : 10353512                                                                                                                                                    
# WeChat : shenlan-qianlan                                                                                                                                      
# Blog : http://www.cnblogs.com/Mahavairocana/                                                                                                       
# Description : You are welcome to reprint, or hyperlinks to indicate the                                                                        
#                    source of the article, as well as author information.
###############################################################################

1、菜鸟版

1、查看主机网卡流量

#!/bin/bash
# network
# Print the inbound/outbound throughput of eth0 in an endless loop.
#
# Each round samples the RX/TX byte counters twice, 2 seconds apart.
# Conversion: bytes over 2 s -> kbit/s is bytes * 8 / 1024 / 2 = bytes / 256.
#
# NOTE(review): the parsing assumes the legacy net-tools ifconfig layout in
# which line 8 reads "RX bytes:<n> (...)  TX bytes:<n> (...)" - confirm on
# the target host; newer ifconfig versions print a different layout.
while : ; do
  time=$(date "+%m-%d %k:%M")

  rx_before=$(ifconfig eth0 | sed -n "8p" | awk '{print $2}' | cut -c7-)
  tx_before=$(ifconfig eth0 | sed -n "8p" | awk '{print $6}' | cut -c7-)
  sleep 2
  rx_after=$(ifconfig eth0 | sed -n "8p" | awk '{print $2}' | cut -c7-)
  tx_after=$(ifconfig eth0 | sed -n "8p" | awk '{print $6}' | cut -c7-)

  # Default to 0 so that an unparsable sample does not abort the arithmetic.
  rx_result=$(( (${rx_after:-0} - ${rx_before:-0}) / 256 ))
  tx_result=$(( (${tx_after:-0} - ${tx_before:-0}) / 256 ))

  echo "$time Now_In_Speed: ${rx_result}kbps Now_OUt_Speed: ${tx_result}kbps"
  sleep 2
done

2、系统状况监控

#!/bin/sh
#systemstat.sh
# Append one snapshot of CPU, memory, disk usage and the number of
# connections involving 219.238.148.30:80 to history files under ./temp.
ip=192.168.1.227   # not referenced below; kept from the original script

# The history files live in ./temp; create it so the redirections cannot fail.
mkdir -p ./temp

# -b (batch mode) makes top emit plain text when its output is piped.
top -b -n 2 | grep "Cpu" >> ./temp/cpu.txt
free -m | grep "Mem" >> ./temp/mem.txt
df -k | grep "sda1" >> ./temp/drive_sda1.txt
#df -k | grep sda2 >> ./temp/drive_sda2.txt
df -k | grep "/mnt/storage_0" >> ./temp/mnt_storage_0.txt
df -k | grep "/mnt/storage_pic" >> ./temp/mnt_storage_pic.txt

time=$(date "+%m.%d %k:%M")
# -F: match the address literally (the dots are not regex wildcards).
connect=$(netstat -na | grep -cF "219.238.148.30:80")
echo "$time $connect" >> ./temp/connect_count.txt

3、监控主机的磁盘空间,当使用空间超过90%就通过发mail来发警告

#!/bin/bash
#monitor available disk space
# Send a warning mail when the root filesystem is at least 90% full.

# Print the usage percentage of the root filesystem as a bare integer.
# df -P guarantees one line per filesystem, so the value is always in
# column 5 of the second line.
root_usage_percent() {
  df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }'
}

SPACE=$(root_usage_percent)
# Treat an unreadable value as 0 so the test below never sees an empty operand.
if [ "${SPACE:-0}" -ge 90 ]
then
  # The published script lost the mail command and kept only the address;
  # the subject and body wording here are reconstructed.
  echo "Disk space on root at ${SPACE}% used" | mail -s "Disk warning" fty89@163.com
fi

4、监控CPU和内存的使用情况

#!/bin/bash
#script to capture system statistics
# Append one CSV record "date,time,users,load,free,idle" to $OUTFILE.
OUTFILE=/home/xu/capstats.csv
DATE=$(date +%m/%d/%Y)
# %M and %S are minutes and seconds; the original %m and %s printed the
# month and the epoch time instead.
TIME=$(date +%k:%M:%S)
TIMEOUT=$(uptime)
VMOUT=$(vmstat 1 2)

# Logged-in users: the number just before the word "user(s)". A fixed field
# index is unreliable because the "up ..." part of uptime varies in length.
USERS=$(printf '%s\n' "$TIMEOUT" \
  | awk '{ for (i = 2; i <= NF; i++) if ($i ~ /^users?,?$/) { print $(i - 1); exit } }')

# Second of the three load averages (5-minute), which is what field 9
# selected in the uptime layout where field 4 is the user count.
LOAD=$(printf '%s\n' "$TIMEOUT" \
  | sed 's/.*load average: *//' \
  | awk -F', *' '{ print $2 }')

# Second data line of vmstat, i.e. the sample taken after the 1 s interval.
# "$VMOUT" must be quoted, otherwise the lines collapse into one.
SAMPLE=$(printf '%s\n' "$VMOUT" | awk '/[0-9]/ && ++n == 2')
FREE=$(printf '%s\n' "$SAMPLE" | awk '{ print $4 }')
IDLE=$(printf '%s\n' "$SAMPLE" | awk '{ print $15 }')

echo "$DATE,$TIME,$USERS,$LOAD,$FREE,$IDLE" >> "$OUTFILE"

5、全方位监控主机

#!/bin/bash
# check_xu.sh
# Start background collectors (top, sar, vmstat, iostat) that each take
# COUNT samples DELAY seconds apart and write them to
# /home/oslog/host_<YYYYMMDD>/<HH>/.
# Intended crontab entry (once per hour; 60 samples x 60 s fill the hour):
# 0 * * * * /home/check_xu.sh
DAT="$(date +%Y%m%d)"
HOUR="$(date +%H)"
DIR="/home/oslog/host_${DAT}/${HOUR}"
DELAY=60
COUNT=60
# create the output directory if it does not exist yet
if ! test -d "${DIR}"
then
  /bin/mkdir -p "${DIR}"
fi
# general check
export TERM=linux
/usr/bin/top -b -d "${DELAY}" -n "${COUNT}" > "${DIR}/top_${DAT}.log" 2>&1 &
# cpu check
/usr/bin/sar -u "${DELAY}" "${COUNT}" > "${DIR}/cpu_${DAT}.log" 2>&1 &
#/usr/bin/mpstat -P 0 ${DELAY} ${COUNT} > ${DIR}/cpu_0_${DAT}.log 2>&1 &
#/usr/bin/mpstat -P 1 ${DELAY} ${COUNT} > ${DIR}/cpu_1_${DAT}.log 2>&1 &
# memory check
/usr/bin/vmstat "${DELAY}" "${COUNT}" > "${DIR}/vmstat_${DAT}.log" 2>&1 &
# I/O check
/usr/bin/iostat "${DELAY}" "${COUNT}" > "${DIR}/iostat_${DAT}.log" 2>&1 &
# network check
/usr/bin/sar -n DEV "${DELAY}" "${COUNT}" > "${DIR}/net_${DAT}.log" 2>&1 &
#/usr/bin/sar -n EDEV ${DELAY} ${COUNT} > ${DIR}/net_edev_${DAT}.log 2>&1 &

2、进阶版

#!/bin/bash
#################################################
# Host health monitoring script
# (monitors: memory, CPU, disk, network interface)
#
# V1.0 Writen by: MR.G Date:2012-03-20
#
# For every host listed in config.ini the checks below are run over ssh and
# one "Ok"/"Failed" line per check is appended to <YYYY-MM-DD>.log in the
# current directory. The mail alerts are disabled (commented out), exactly
# as in the published version.
# bash is required: the script uses [[ ]] and &>.
##################################################

export LANG=C

# Administrator mailbox (only referenced by the disabled mail alerts)
Email=zhangxiaogang@8tgame.com

# Timestamp written on every log line; taken once when the script starts
time=$(date "+%Y-%m-%d %H:%M:%S")

# Log file
log=$(date +%Y-%m-%d).log

# Path of the configuration file (whitespace-separated host names / IPs)
config=config.ini

# Append one timestamped line ($1) to the log file.
log_line() {
  echo "$time $1" >> "$log"
}

# Print "1" when host $1 runs a root-owned httpd whose parent is PID 1.
apache_parent() {
  ssh "$1" ps -ef | awk '/httpd/ && $3 == 1 && $1 == "root" { print $3; exit }'
}

# Print the final HTTP status code returned by http://$1/.
# -O /dev/null keeps wget from saving index.html into the working directory;
# only the last status line is used so that redirects do not yield two codes.
http_status() {
  wget -o /dev/stdout -O /dev/null "http://$1/" \
    | awk '/HTTP/ { code = $6 } END { print code }'
}

# Succeed when host $1 has a TCP socket listening on port 3306
# (IPv4 or IPv6; the exact match excludes ports such as 33060).
mysql_listening() {
  ssh "$1" netstat -na \
    | awk '$NF == "LISTEN" && $4 ~ /:3306$/ { found = 1 } END { exit !found }'
}

# Print the number of logged-in users reported by uptime on host $1.
# The value is located relative to the word "user(s)" because the length of
# the "up ..." part of the uptime line varies.
logged_in_users() {
  ssh "$1" uptime \
    | awk '{ for (i = 2; i <= NF; i++) if ($i ~ /^users?,?$/) { print $(i - 1); exit } }'
}

if [[ -f $config && -s $config ]]; then

  for ip in $(cat "$config"); do
    # -----------------------------------------------------------------------
    # Check that the host is reachable over ssh; skip all other checks if not.
    # -----------------------------------------------------------------------
    if ! ssh "$ip" pwd &> /dev/null; then
      #echo "主机:$ip的SSH无法登陆,请及时处理!" | mail -s "$ip SSH状态异常" $Email
      log_line "$ip 的SSH状态检查完毕,状态:Failed."
      continue
    fi
    log_line "$ip 的SSH状态检查完毕,状态:Ok."

    # -----------------------------------------------------------------------
    # Synchronise the host clock
    # -----------------------------------------------------------------------
    # NTP server
    server="ntp.fudan.edu.cn"

    ssh "$ip" /usr/sbin/ntpdate -s "$server"
    ssh "$ip" /usr/sbin/hwclock --systohc

    # -----------------------------------------------------------------------
    # Check outbound connectivity by pinging a public site from the host
    # (hosts behind an ICMP filter will report Failed).
    # -----------------------------------------------------------------------
    # Site to ping
    site=www.baidu.com

    if ! ssh "$ip" ping -c3 "$site" > /dev/null; then
      #echo "主机:$ip无法ping通,请及时处理!" | mail -s "$ip 网络状态警告" $Email
      log_line "$ip 的网络状态检查完毕,状态:Failed."
    else
      log_line "$ip 的网络状态检查完毕,状态:Ok."
    fi

    # -----------------------------------------------------------------------
    # Check disk usage of every /dev/* filesystem; Failed at 90% or more.
    # -----------------------------------------------------------------------
    # Threshold (percent)
    space_warn="90"

    # df -P columns: filesystem, blocks, used, available, capacity, mount.
    ssh "$ip" df -P | grep "^/dev" | while read -r space_name _ space_used _ space_per _; do
      space_per=${space_per%\%}
      if [ "${space_per:-0}" -ge "$space_warn" ]; then
        #echo "主机:$ip的$space_name分区仅剩下$space_used M,使用率为$space_per,已超过指定阀值,请及时处理!" | mail -s "$ip 磁盘空间警告" $Email
        log_line "$ip 的${space_name}分区检查完毕,状态:Failed."
      else
        log_line "$ip 的${space_name}分区检查完毕,状态:Ok."
      fi
    done

    # -----------------------------------------------------------------------
    # Check swap usage; Failed at 80% or more.
    # As in the original, nothing is logged while no swap is in use.
    # -----------------------------------------------------------------------
    # Threshold (percent)
    swap_warn=80

    # "free -m" Swap line: label, total, used, free. One ssh call so that
    # all three numbers come from the same snapshot.
    swap_line=$(ssh "$ip" free -m | grep "Swap")
    swap_total=$(echo "$swap_line" | awk '{print $2}')
    swap_used=$(echo "$swap_line" | awk '{print $3}')
    swap_free=$(echo "$swap_line" | awk '{print $4}')

    if [ "${swap_used:-0}" -ne 0 ] && [ "${swap_total:-0}" -gt 0 ]; then
      # Multiply first: integer division of used/total would always give 0.
      swap_per=$(( swap_used * 100 / swap_total ))
      if [ "$swap_per" -ge "$swap_warn" ]; then
        #echo "主机:$ip的Swap交换分区仅剩下$swap_free M,使用率为$swap_per,已超过指定阀值,请及时处理!" | mail -s "$ip 内存使用警告" $Email
        log_line "${ip}的Swap分区检查完毕,状态:Failed."
      else
        log_line "${ip}的Swap分区检查完毕,状态:Ok."
      fi
    fi

    # -----------------------------------------------------------------------
    # Check CPU usage; Failed at 80% or more.
    # NOTE(review): the parsing expects the older top summary format in which
    # field 5 looks like "99.5%id," - confirm against the top version on the
    # monitored hosts. An unparsable value ends up logged as Ok.
    # -----------------------------------------------------------------------
    # Threshold (percent)
    cpu_warn=80

    cpu_free=$(ssh "$ip" top -b -n 1 | grep "Cpu" | awk '{print $5}' | sed 's/%id,//g')
    cpu_used=$(echo "100 - $cpu_free" | bc)

    if [ "$(echo "$cpu_used >= $cpu_warn" | bc)" = "1" ]; then
      #echo "主机:$ip的CPU使用率为$cpu_used%,已超过指定阀值,请及时处理!" | mail -s "$ip CPU使用警告" $Email
      log_line "$ip 的CPU状态检查完毕,状态:Failed."
    else
      log_line "$ip 的CPU状态检查完毕,状态:Ok."
    fi

    # -----------------------------------------------------------------------
    # Check the number of logged-in users; Failed when more than 3.
    # -----------------------------------------------------------------------
    # Threshold
    users_max=4

    users_now=$(logged_in_users "$ip")

    if [ "${users_now:-0}" -ge "$users_max" ]; then
      #echo "$ip登陆的用户数已经达到了${users_now}个,已超过指定的阀值,请及时处理!" | mail -s "$ip 用户数报警" $Email
      log_line "${ip}的用户数检查完毕,状态:Failed."
    else
      log_line "$ip 的用户数检查完毕,状态:Ok."
    fi

    # -----------------------------------------------------------------------
    # Check the 15-minute load average per core; Failed at 0.7 or more.
    # -----------------------------------------------------------------------
    # Threshold
    load_warn=0.7

    cpu_num=$(ssh "$ip" cat /proc/cpuinfo | grep -c "model name")
    # Guard against a division by zero when no "model name" line is found.
    [ "${cpu_num:-0}" -ge 1 ] || cpu_num=1

    # The 15-minute value is the last field of uptime and has no trailing comma.
    load_num=$(ssh "$ip" uptime | awk '{print $NF}')

    load_average=$(echo "scale=2;$load_num/$cpu_num" | bc)

    if [ "$(echo "$load_average >= $load_warn" | bc)" = "1" ]; then
      #echo "$ip 15分钟单核的平均负载已经达到$load_average,已超过指定的阀值,请及时处理!" | mail -s "$ip 平均负载报警" $Email
      log_line "$ip 的平均负载检查完毕,状态:Failed."
    else
      log_line "$ip 的平均负载检查完毕,状态:Ok."
    fi

    # -----------------------------------------------------------------------
    # Check the number of established TCP connections; Failed at 8000 or more.
    # -----------------------------------------------------------------------
    # Threshold
    conns_warn=8000

    ip_conns=$(ssh "$ip" netstat -an | grep tcp | grep -c EST)

    if [ "${ip_conns:-0}" -ge "$conns_warn" ]; then
      #echo "$ip 的IP连接数已经达到$ip_conns,已超过指定的阀值,请及时处理!" | mail -s "$ip IP连接数" $Email
      log_line "$ip 的IP连接数检查完毕,状态:Failed."
    else
      log_line "$ip 的IP连接数检查完毕,状态:Ok."
    fi

    # -----------------------------------------------------------------------
    # Check Apache: the master process must exist and http://<host>/ must
    # return 200. One restart is attempted when the process is missing.
    # -----------------------------------------------------------------------
    httpd=$(apache_parent "$ip")

    if [ "$httpd" != "1" ]; then
      log_line "$ip Apache状态异常,尝试重启进程……"
      ssh "$ip" /etc/init.d/httpd restart &> /dev/null
      # Give the service time to come up before looking again.
      sleep 100
      httpd=$(apache_parent "$ip")
    fi

    # Ok only when the process is present AND the page is served; a process
    # that is still missing after the restart is a failure.
    if [ "$httpd" = "1" ] && [ "$(http_status "$ip")" = "200" ]; then
      log_line "$ip 的Apache状态检查完毕,状态:Ok."
    else
      #echo "主机:$ip 的Apache服务已经没有响应,请及时处理!" | mail -s "$ip Apache服务警告" $Email
      log_line "$ip 的Apache状态检查完毕,状态:Failed."
    fi

    # -----------------------------------------------------------------------
    # Check MySQL by looking for a listener on port 3306; one restart is
    # attempted when it is missing (table locks are not considered).
    # -----------------------------------------------------------------------
    if mysql_listening "$ip"; then
      log_line "$ip 的MySQL状态检查完毕,状态:Ok."
    else
      log_line "$ip MySQL状态异常,尝试重启进程……"
      ssh "$ip" /etc/init.d/mysqld restart &> /dev/null
      if mysql_listening "$ip"; then
        log_line "$ip 的MySQL状态检查完毕,状态:Ok."
      else
        #echo "主机:$ip 的MySQL服务已经没有响应,请及时处理!" | mail -s "$ip MySQL服务警告" $Email
        log_line "$ip 的MySQL状态检查完毕,状态:Failed."
      fi
    fi

    # -----------------------------------------------------------------------
    # Check the throughput of eth0 on the host; Failed when either direction
    # reaches the threshold.
    # -----------------------------------------------------------------------
    # Threshold in KB/s
    speed_warn=10240

    # Both samples are taken on the monitored host in a single ssh session,
    # so the interval is exactly the remote "sleep 1". The published version
    # ran ifconfig locally and therefore measured the monitoring machine.
    read_counters='cat /sys/class/net/eth0/statistics/rx_bytes /sys/class/net/eth0/statistics/tx_bytes'
    counters=$(ssh "$ip" "$read_counters; sleep 1; $read_counters")
    read -r recv_before send_before recv_after send_after <<< "${counters//$'\n'/ }"

    send_speed=$(( (${send_after:-0} - ${send_before:-0}) / 1024 ))
    recv_speed=$(( (${recv_after:-0} - ${recv_before:-0}) / 1024 ))

    if [[ $send_speed -ge $speed_warn || $recv_speed -ge $speed_warn ]]; then
      # echo "$ip 的网卡流速为$send_speed Kb/s(上行)/$recv_speed Kb/s(下行),已超过指定的阀值,请及时处理!" | mail -s "$ip 网卡流速报警" $Email
      log_line "$ip 的网卡流速检查完毕,状态:Failed."
    else
      log_line "$ip 的网卡流速检查完毕,状态:Ok."
    fi

  done
else
  echo "配置文件不存在或内容为空,请检查!"

fi

3、日志监控

需要准备环境 
1: rsync 安装包 (yum 安装 编译安装均可)
2:防火墙开放相应端口
3:sendEmail 客户端:上传到 /usr/bin/ 后添加执行权限(sendEmail 安装包见附件)

服务端执行命令
# Create the daemon's secrets file ("user:password") under the name used by
# the "secrets file" setting in rsyncd.conf, then restrict it to the owner.
echo "work:work" > /etc/rsync.pas && chmod 600 /etc/rsync.pas

1,编辑配置文件/etc/rsyncd.conf如下
# Global settings shared by all modules.
[global]
uid = root
gid = root
use chroot = yes
max connections = 50
pid file = /var/run/rsyncd.pid
lock file = /var/run/rsyncd.lock
log file = /var/log/rsyncd.log
transfer logging = yes
log format = %t %a %m %f %b
syslog facility = local3
timeout = 300


# One write-only module per client host. In every module:
#  - "path = /$path" is a placeholder; replace it with the real directory.
#  - "auth users" must name a user present in the secrets file. That file
#    holds "work:work" and the clients connect as work@..., so the user is
#    "work" (the published "log" could never authenticate).
#  - "hosts allow" limits the module to the one client it belongs to.
[1.1]
read only = false
write only = yes
path = /$path
comment = log
auth users = work
secrets file = /etc/rsync.pas
hosts allow = 10.1.1.1

[1.2]
read only = false
write only = yes
path = /$path
comment = log
auth users = work
secrets file = /etc/rsync.pas
hosts allow = 10.1.1.2

[1.21]
read only = false
write only = yes
path = /$path
comment = log
auth users = work
secrets file = /etc/rsync.pas
hosts allow = 10.1.1.21

[1.22]
read only = false
write only = yes
path = /$path
comment = log
auth users = work
secrets file = /etc/rsync.pas
hosts allow = 10.1.1.22



巡检脚本
#!/bin/bash
# Daily inspection of the logs collected under $Path/<host-ip>/.
# 1. Probe port 80 on every TLM/TP host.
# 2. Per host, warn when the Tomcat process has more than 4000 open file
#    descriptors and collect today's/yesterday's non-INFO log lines.
# 3. Mail the collected report, then delete it.
Path=/var/log/
Time=$(date "+%Y-%m-%d")
Ytime=$(date -d yesterday "+%Y-%m-%d")
Tlmip="10.1.8.1 10.1.8.2"
Tpip="10.1.8.21 10.1.8.22"
ID=tomcatserver.pid
Tlmlog="tlm.log tlm-trace.log catalina-${Ytime}.out catalina-${Time}.out"
Tplog="tp.log tp-trace.log catalina-${Ytime}.out catalina-${Time}.out"
Errorlog="$Path""$Time-error.log"
Contacts="a@vmware.com,b@vmware.com"  ### mail recipients, separated by ASCII commas ###

#######################################
# Inspect one group of hosts.
# Globals:   Path, ID, Time, Ytime, Errorlog (read)
# Arguments: $1 - space-separated host IPs
#            $2 - space-separated log file names to scan for each host
# Outputs:   appends findings to $Errorlog
#######################################
inspect_hosts() {
  local hosts=$1 logs=$2
  local host name pid fd_count

  for host in $hosts; do
    # NOTE(review): the pid file is read from the synced log directory but
    # /proc is the local one - this only counts the right process when Tomcat
    # runs on this machine; confirm the intended setup.
    pid=$(cat "$Path$host/$ID" 2> /dev/null)
    fd_count=0
    if [ -n "$pid" ]; then
      fd_count=$(ls "/proc/$pid/fd" 2> /dev/null | wc -l)
    fi
    if [ "$fd_count" -gt 4000 ]; then
      echo "$host 连接数超过4000,请查看!!!" >> "$Errorlog"
    fi

    for name in $logs; do
      [ -f "$Path$host/$name" ] || continue
      # Drop INFO lines and stack-trace noise, keep lines dated today or yesterday.
      grep -v -e INFO -e vmext -e "at com" -e "at sun" -e "at org" \
           -e "at java" -e "Caused by" -e "more" -- "$Path$host/$name" \
        | grep -E "$Time|$Ytime" >> "$Errorlog"
    done
  done
}

# Make sure the report exists even when nothing is found, so the mail step
# always has a message file.
: >> "$Errorlog"

############################################ service check ##############################################
# The list is deliberately unquoted: each IP must be probed separately.
for I in $Tlmip $Tpip; do
  if ! nc -v -w 10 -z "$I" 80; then
    echo "$I service abnormity" >> "$Errorlog"
  fi
done

############################################# inspect TLM #############################################
inspect_hosts "$Tlmip" "$Tlmlog"

############################################## inspect TP #############################################
inspect_hosts "$Tpip" "$Tplog"

################################ mail the collected report #####################################
# The Chinese words are placeholders to fill in: -f sender address,
# -s SMTP server and port, -xu / -xp SMTP account and password.
/usr/bin/sendEmail -t "$Contacts" -f 抄送账号 -s smtp地址以及端口:25 -xu 发件箱账号 -xp  发件箱密码 -o message-file="$Errorlog" -u "巡检报错信息"

rm -f -- "$Errorlog"


# Append a crontab entry for root that runs at 08:55 every day.
# NOTE(review): "/path/*.sh" looks like a placeholder for the inspection
# script's real path - replace it before use.
echo "55 8 * * * /path/*.sh" >> /var/spool/cron/root








客户端分别执行命令
# Create the client-side password file and restrict it to the owner.
# Unlike the daemon's secrets file, the file given to rsync --password-file
# must contain only the password, not "user:password".
echo "work" > /etc/rsync.pas && chmod 600 /etc/rsync.pas

vim /path/rsync.sh 
#!/bin/sh
# Push the local Tomcat log directory to the rsync daemon on the jump host.
# Replace the three placeholders before use: 服务端口 = daemon port,
# 跳板机IP = jump host IP, 模块 = the module defined for this client in the
# server's rsyncd.conf (1.1, 1.2, 1.21 or 1.22).
rsync --port=服务端口 -aP --bwlimit 3000 /opt/vmware/instances/myserver/logs/ work@跳板机IP::模块 --password-file=/etc/rsync.pas

# Make the script created above executable (same path as the cron entry),
# then schedule it for 08:50 every day in root's crontab.
chmod a+x /path/rsync.sh
echo "50 8 * * * /path/rsync.sh" >> /var/spool/cron/root

 

posted on 2018-01-10 23:05  Mahavairocana  阅读(268)  评论(0)  编辑  收藏  举报

导航