磁盘坏道检测脚本
磁盘坏道检测脚本
说明: hp服务器有一定概率执行命令会卡住，因此脚本中的硬件查询命令均通过 timeout 限时执行。
#!/bin/bash
# utf-8
# Note: non-HP servers obtain RAID information through MegaCli.
# Note: HP servers obtain RAID information through hpacucli.

# --- RPM package file names ---
# HP controller command-line tools.
HP_SSACLI_COMMAND_RPM_NAME='hpacucli-9.40-12.0.x86_64.rpm'
HP_SSACLI_COMMAND_RPM_NAME2='hpssacli-2.30-6.0.x86_64.rpm'
# Packages installed on every non-HP server.
LIB_UTILS_RPM_NAME='Lib_Utils-1.00-09.noarch.rpm'
OTHER_MEGACLI_COMMAND_RPM_NAME='MegaCli-8.02.21-1.noarch.rpm'

# --- Other basic constants ---
# Base URL the RPM files are downloaded from.
DOWNLOAD_BASE_URL='http://xxxxxx/CentOS/app/xxxx'
# Local directory the downloaded RPM files are stored in.
RPM_BACKAGE_STORAGE_LOCATION='/tmp'
# Result descriptions used by the health-check functions.
CORRECT_DISK_DESCRIPTION='all disk is ok'
BAD_DISK_DESCRIPTION='disk has error'
#######################################
# Install the packages this script depends on through yum.
# Arguments: none
# Outputs:   none (all yum output is discarded)
# Returns:   exit status of yum
#######################################
function install_dependency() {
  # The pattern is quoted so that yum, not the shell, expands it. Unquoted,
  # a file named dmidecode<something> in the current directory would replace
  # the pattern before yum ever saw it.
  yum install 'dmidecode*' -y &>/dev/null
}
# Download one RPM into the local cache directory unless a usable (non-empty)
# copy is already there. Returns non-zero when the download fails.
function _install_rpm_fetch() {
  local rpm_name=$1
  local target="${RPM_BACKAGE_STORAGE_LOCATION}/${rpm_name}"

  # -s rather than -f: a zero-byte file left behind by an earlier failed
  # download must not be treated as a cached package.
  if [[ -s "${target}" ]]; then
    return 0
  fi

  if ! timeout 60s wget "${DOWNLOAD_BASE_URL}/${rpm_name}" -O "${target}" &>/dev/null; then
    # wget -O creates the output file even when the transfer fails or is
    # killed by the timeout; remove it so the next run retries the download.
    rm -f -- "${target}"
    return 1
  fi
}

#######################################
# Download (if not cached) and install the RAID command-line tools for one
# server family. Installation is best-effort: all tool output is discarded
# and a failed download only skips the affected package.
# Globals:   RPM_BACKAGE_STORAGE_LOCATION, DOWNLOAD_BASE_URL,
#            HP_SSACLI_COMMAND_RPM_NAME, HP_SSACLI_COMMAND_RPM_NAME2,
#            LIB_UTILS_RPM_NAME, OTHER_MEGACLI_COMMAND_RPM_NAME (all read)
# Arguments: $1 - "HP" or "Other"; any other value is a no-op
# Outputs:   none
# Returns:   1 when the cache directory cannot be entered, 0 otherwise
# Side effects: changes the working directory to the cache directory; for
#            "Other", creates MegaCli64 symlinks in /bin and /sbin.
#######################################
function install_rpm() {
  local -a packages=()
  local pkg

  case "$1" in
    "HP")
      packages=("${HP_SSACLI_COMMAND_RPM_NAME}" "${HP_SSACLI_COMMAND_RPM_NAME2}")
      ;;
    "Other")
      packages=("${LIB_UTILS_RPM_NAME}" "${OTHER_MEGACLI_COMMAND_RPM_NAME}")
      ;;
    *)
      return 0
      ;;
  esac

  # Fetch every package first, then install them in the listed order.
  for pkg in "${packages[@]}"; do
    _install_rpm_fetch "${pkg}"
  done

  cd "${RPM_BACKAGE_STORAGE_LOCATION}" || return 1

  for pkg in "${packages[@]}"; do
    # Skip packages whose download failed; rpm could not install them anyway.
    if [[ -s "${RPM_BACKAGE_STORAGE_LOCATION}/${pkg}" ]]; then
      rpm -Uvh "${RPM_BACKAGE_STORAGE_LOCATION}/${pkg}" &>/dev/null
    fi
  done

  if [[ "$1" == "Other" ]]; then
    # Expose MegaCli64 under /bin and /sbin; errors (for example an already
    # existing link) are deliberately ignored.
    ln -s /opt/MegaRAID/MegaCli/MegaCli64 /bin/MegaCli64 2>/dev/null
    ln -s /opt/MegaRAID/MegaCli/MegaCli64 /sbin/MegaCli64 2>/dev/null
  fi
  return 0
}
#######################################
# Health check for non-HP servers: collect RAID information through
# MegaCli64 and print the disks that report media/other errors.
# Every external query is wrapped in `timeout`.
# Globals:   CORRECT_DISK_DESCRIPTION, BAD_DISK_DESCRIPTION (read)
#            server_ip, disk_usage_percent, raid_level,
#            Current_Access_Policy, Firmware_State,
#            Predictive_Failure_Count, BBU_status, Media_Type, Media_Size,
#            raid_Memory, disk_bad_sectors (written)
# Arguments: none
# Outputs:   when any error counter is above zero, one stdout line per
#            affected disk: the "Slot Number" line immediately followed by
#            its non-zero error-count line(s). Nothing otherwise.
# Returns:   0
#######################################
function other_server_hardware_health_check() {
  local ld_info pd_list pd_list_media bad_count disk_bad_info

  server_ip=$(timeout 20s ifconfig | grep Bcast | grep 'inet addr:' | grep -v '127.0.0.1' | head -n 1 | awk '{print $2}' | awk -F: '{print $2}')
  disk_usage_percent=$(timeout 20s df -h | grep /export | grep -v 'mnt' | awk '{print $5}')

  # Byte-identical MegaCli64 invocations are run once and their output reused.
  ld_info=$(timeout 20s MegaCli64 -LDInfo -Lall -aALL)
  raid_level=$(grep 'RAID Level' <<<"${ld_info}")
  Current_Access_Policy=$(grep 'Current Cache Policy' <<<"${ld_info}")
  BBU_status=$(grep 'Current Cache Policy' <<<"${ld_info}")

  pd_list=$(timeout 20s MegaCli64 -pdlist -aALL)
  # Highest error counter over all disks; numeric sort so that 10 ranks
  # above 9.
  bad_count=$(grep -E "Media Error|Other Error" <<<"${pd_list}" | awk -F ': ' '{print($2)}' | sort -rn | head -n 1)

  Firmware_State=$(grep -E 'Slot Number|Firmware state' <<<"${pd_list}" | grep -B 1 'Firmware state: [^O]')
  if [ -z "$Firmware_State" ]; then
    Firmware_State="other_server_Firmware_State_NULL"
  fi

  Predictive_Failure_Count=$(timeout 20s MegaCli64 -PDList -aALL | grep -E 'Slot Number|Predictive Failure Count:' | grep -B 1 'Predictive Failure Count: [^0]')
  if [ -z "$Predictive_Failure_Count" ]; then
    Predictive_Failure_Count="other_server_Predictive_Failure_Count_NULL"
  fi

  # Build one line per failing disk directly in awk. A grep -B1 / sed 'N'
  # pairing breaks as soon as two failing disks are separated by a healthy
  # one, because grep then inserts "--" group separators.
  disk_bad_info=$(awk '
    function emit() { if (errs != "") print slot errs; errs = "" }
    /Slot Number/ { emit(); slot = $0; next }
    /Media Error|Other Error/ && /Count: [^0]/ { errs = errs $0 }
    END { emit() }
  ' <<<"${pd_list}")

  pd_list_media=$(timeout 30s MegaCli64 -PDList -aAll)
  Media_Type=$(grep 'Media Type' <<<"${pd_list_media}" | sort -u | sed 's/ //g')
  Media_Size=$(grep 'Raw Size' <<<"${pd_list_media}" | sort -u | sed 's/ //g')
  Media_Type=$Media_Type' '$Media_Size

  raid_Memory=$(timeout 20s MegaCli64 -Cfgdsply -aALL | grep -E 'Memory' | head -1 | sed 's/ //g' | sed 's/Memory/CacheSize/g')

  # An empty or non-numeric counter (tool missing or timed out) is treated as
  # "no error" instead of triggering a test-expression error.
  bad_count=${bad_count//[[:space:]]/}
  if [[ "${bad_count}" =~ ^[0-9]+$ ]] && (( 10#${bad_count} > 0 )); then
    disk_bad_sectors=${BAD_DISK_DESCRIPTION}
    # Print the result.
    printf '%s\n' "${disk_bad_info}"
  else
    disk_bad_sectors=${CORRECT_DISK_DESCRIPTION}
  fi
}
#######################################
# Reset the local RPM database up front so that later rpm package
# installations do not run into problems: remove the rpm lock files, the
# __db.00* files and the yum pid file, then rebuild the database.
# Arguments: none
# Returns:   exit status of `rpm --rebuilddb`
#######################################
function rpm_localdb_rebuild() {
  local lock_file

  # Lock files under the rpm database directory.
  for lock_file in /var/lib/rpm/.dbenv.lock /var/lib/rpm/.rpm.lock; do
    /bin/rm -f "${lock_file}"
  done

  # The glob is intentionally unquoted so it matches every __db.00N entry.
  /bin/rm -rf /var/lib/rpm/__db.00*

  # yum pid file.
  /bin/rm -f /var/run/yum.pid

  rpm --rebuilddb
}
#######################################
# Health check for HP servers: collect RAID information through
# hpssacli/hpacucli and print the physical drives whose status line does not
# contain "OK". Every external query is wrapped in `timeout`.
# Globals:   CORRECT_DISK_DESCRIPTION, BAD_DISK_DESCRIPTION (read)
#            raid_slot_id (read, optional; controller slot, defaults to 0)
#            server_ip, disk_usage_percent, raid_level,
#            Current_Access_Policy, Firmware_State,
#            Predictive_Failure_Count, BBU_status, Media_Type, raid_Memory,
#            disk_bad_sectors (written)
# Arguments: none
# Outputs:   the non-OK physicaldrive lines on stdout when at least one
#            exists; nothing otherwise.
# Returns:   0
#######################################
function hp_server_hardware_health_check() {
  # Nothing in this script assigns raid_slot_id; with it empty the tool was
  # asked for "slot=" and every drive looked healthy. Fall back to slot 0,
  # the slot the RAID-level query below already hard-codes.
  # NOTE(review): confirm slot 0 is right for multi-controller machines.
  local slot_id="${raid_slot_id:-0}"
  local pd_show ctrl_detail bad_drive_count disk_bad_info

  server_ip=$(timeout 20s ifconfig | grep Bcast | grep 'inet addr:' | grep -v '127.0.0.1' | head -n 1 | awk '{print $2}' | awk -F: '{print $2}')
  disk_usage_percent=$(timeout 20s df -h | grep /export | grep -v 'mnt' | awk '{print $5}')
  raid_level=$(timeout 20s hpssacli ctrl slot=0 ld all show | grep RAID | awk -F ',' '{print $2}')
  Current_Access_Policy=$(timeout 20s hpssacli ctrl all show config detail | grep -i cache | grep 'Wait for Cache Room' | awk -F ':' '{print $2}')
  Firmware_State=$(timeout 20s hpssacli ctrl all show config | grep physicaldrive | awk -F ',' '{print $2 $3 $4}' | sort | uniq -c)
  Predictive_Failure_Count="HP_server_Predictive_Failure_Count_NULL"

  # Byte-identical hpacucli invocations are run once and their output reused.
  pd_show=$(timeout 20s hpacucli ctrl "slot=${slot_id}" physicaldrive all show)
  ctrl_detail=$(timeout 20s hpacucli ctrl all show config detail)

  disk_bad_info=$(grep 'physicaldrive' <<<"${pd_show}" | grep -v 'OK')
  bad_drive_count=$(grep 'physicaldrive' <<<"${pd_show}" | grep -vc 'OK')

  BBU_status=$(grep -i 'No-Battery Write Cache' <<<"${ctrl_detail}")
  Media_Type=$(grep 'physicaldrive' <<<"${pd_show}" | awk -F ',' '{print $2 $3 $4}' | sort -u | sed 'N;s/\n//g' | sed 's/ //g')
  raid_Memory=$(grep -i cache <<<"${ctrl_detail}" | grep 'Total Cache Size' | sed 'N;s/\n//g' | sed 's/ //g')

  # Disk verdict: zero non-OK drives means healthy.
  if [[ "${bad_drive_count}" == "0" ]]; then
    disk_bad_sectors=${CORRECT_DISK_DESCRIPTION}
  else
    disk_bad_sectors=${BAD_DISK_DESCRIPTION}
    # Print the result. The sed step joins each pair of consecutive lines,
    # which is the output format this function has always produced.
    # NOTE(review): joining two drives onto one line may be unintended.
    echo "$disk_bad_info" | sed 'N;s/\n//g'
  fi
}
#######################################
# Entry point: skip machines without a /sbin/init process, prepare the rpm
# environment, detect the server vendor and run the matching health check.
# Arguments: none
# Outputs:   "server is docker,return" when no /sbin/init process is found;
#            otherwise whatever the selected health check prints.
# Returns:   exits the script with status 0 when no /sbin/init process is
#            found; otherwise the status of the selected health check.
#######################################
function main() {
  #server_vendor=`dmidecode -s system-manufacturer|sed '/^#/d'|awk '{print $1}'`
  # Count /sbin/init processes. The [/] bracket keeps grep from matching its
  # own command line, so the result no longer depends on whether the grep
  # process happens to be visible in the ps snapshot.
  local init_count
  init_count=$(ps -ef | grep -c '[/]sbin/init')
  if [[ "${init_count}" == "0" ]]; then
    echo "server is docker,return"
    exit 0
  fi

  # Rebuild the local rpm database.
  rpm_localdb_rebuild
  # Install dependencies.
  install_dependency

  # Server vendor: first word of the manufacturer string, with lines that
  # start with '#' dropped from the dmidecode output.
  local server_vendor
  server_vendor=$(dmidecode -s system-manufacturer | sed '/^#/d' | awk '{print $1}')
  case "${server_vendor}" in
    "HP" | "Hewlett-Packard")
      install_rpm "HP"
      hp_server_hardware_health_check
      ;;
    *)
      install_rpm "Other"
      other_server_hardware_health_check
      ;;
  esac
}
# Script entry point: run the health check.
main
原创:做时间的朋友