磁盘坏道检测脚本

磁盘坏道检测脚本

说明: HP服务器有一定概率执行命令会卡住，因此脚本中的检测命令都通过 timeout 限制了执行时间。

#!/bin/bash
# utf-8
# Note: non-HP server models read RAID information through MegaCli
# Note: HP servers read RAID information through hpacucli

# Installation package file names (appended to DOWNLOAD_BASE_URL when downloading)
HP_SSACLI_COMMAND_RPM_NAME="hpacucli-9.40-12.0.x86_64.rpm"
HP_SSACLI_COMMAND_RPM_NAME2="hpssacli-2.30-6.0.x86_64.rpm"
LIB_UTILS_RPM_NAME="Lib_Utils-1.00-09.noarch.rpm"
OTHER_MEGACLI_COMMAND_RPM_NAME="MegaCli-8.02.21-1.noarch.rpm"

# Other basic constants
# Base URL the RPM files are downloaded from (the host part is a placeholder in this copy)
DOWNLOAD_BASE_URL="http://xxxxxx/CentOS/app/xxxx"
# Directory the downloaded RPM files are stored in
# (the name is misspelled "BACKAGE"; it is kept because install_rpm references it)
RPM_BACKAGE_STORAGE_LOCATION="/tmp"
# Status strings the health-check functions assign to disk_bad_sectors
CORRECT_DISK_DESCRIPTION="all disk is ok"
BAD_DISK_DESCRIPTION="disk has error"

# Install the packages this script depends on (dmidecode, used by main to read
# the system manufacturer). All yum output is discarded.
# Returns: the exit status of yum.
function install_dependency(){
  # The pattern is quoted so that yum receives it literally. Unquoted, the shell
  # would replace it with any file in the current directory whose name starts
  # with "dmidecode", and yum would be asked to install that file name instead.
  yum install 'dmidecode*' -y &>/dev/null
}

# Fetch one RPM into RPM_BACKAGE_STORAGE_LOCATION unless a usable copy is already there.
# Globals:   DOWNLOAD_BASE_URL, RPM_BACKAGE_STORAGE_LOCATION (read)
# Arguments: $1 - RPM file name, appended to DOWNLOAD_BASE_URL
# Returns:   0 when a non-empty file is in place afterwards, 1 when the download failed
function _download_rpm_if_missing() {
  local rpm_name=$1
  local target="${RPM_BACKAGE_STORAGE_LOCATION}/${rpm_name}"
  local partial="${target}.part"

  # "wget -O" creates the output file even when the transfer fails, so a
  # zero-length file is treated as "not downloaded yet" rather than as a package.
  [[ -s "${target}" ]] && return 0
  rm -f -- "${target}"

  # Download under a temporary name and rename only on success, so a failed or
  # timed-out transfer can never be mistaken for a complete package on the next run.
  if timeout 60s wget "${DOWNLOAD_BASE_URL}/${rpm_name}" -O "${partial}" &>/dev/null; then
    mv -f -- "${partial}" "${target}"
  else
    rm -f -- "${partial}"
    return 1
  fi
}

# Download and install the RAID command line tools for one server family.
# Globals:   HP_SSACLI_COMMAND_RPM_NAME, HP_SSACLI_COMMAND_RPM_NAME2,
#            LIB_UTILS_RPM_NAME, OTHER_MEGACLI_COMMAND_RPM_NAME,
#            RPM_BACKAGE_STORAGE_LOCATION (read)
# Arguments: $1 - "HP" installs hpacucli + hpssacli; "Other" installs Lib_Utils +
#                 MegaCli and links MegaCli64 into /bin and /sbin; anything else is a no-op
# Returns:   non-zero when a package could not be downloaded or the storage
#            directory cannot be entered; rpm/ln results are best-effort and ignored
function install_rpm() {
  local -a rpm_names=()
  local rpm_name
  local status=0

  case "$1" in
  "HP")
    rpm_names=("${HP_SSACLI_COMMAND_RPM_NAME}" "${HP_SSACLI_COMMAND_RPM_NAME2}")
    ;;
  "Other")
    rpm_names=("${LIB_UTILS_RPM_NAME}" "${OTHER_MEGACLI_COMMAND_RPM_NAME}")
    ;;
  *)
    return 0
    ;;
  esac

  for rpm_name in "${rpm_names[@]}"; do
    _download_rpm_if_missing "${rpm_name}" || status=1
  done

  # Kept from the original: the rest of the script runs with the storage
  # directory as its working directory.
  # NOTE(review): MegaCli is reported to write its log file into the current
  # directory - confirm before removing this cd.
  cd "${RPM_BACKAGE_STORAGE_LOCATION}" || return 1

  for rpm_name in "${rpm_names[@]}"; do
    # Skip packages whose download failed; rpm would only fail on the missing file.
    if [[ -s "${RPM_BACKAGE_STORAGE_LOCATION}/${rpm_name}" ]]; then
      # rpm -U also fails when the package is already installed, so its status is ignored.
      rpm -Uvh "${RPM_BACKAGE_STORAGE_LOCATION}/${rpm_name}" &>/dev/null
    fi
  done

  if [[ "$1" == "Other" ]]; then
    # Make MegaCli64 reachable through PATH; errors (e.g. link already exists) are expected.
    ln -s /opt/MegaRAID/MegaCli/MegaCli64 /bin/MegaCli64 2>/dev/null
    ln -s /opt/MegaRAID/MegaCli/MegaCli64 /sbin/MegaCli64 2>/dev/null
  fi

  return "${status}"
}

# Health check for non-HP servers, based on MegaCli64 output.
#
# Prints one line per drive slot that has a non-zero "Media Error" or
# "Other Error" counter: the "Slot Number" line followed directly (no separator)
# by the offending counter line(s). Prints nothing when every counter is zero
# or when MegaCli64 returned no data.
#
# Globals read:    CORRECT_DISK_DESCRIPTION, BAD_DISK_DESCRIPTION
# Globals written: server_ip, disk_usage_percent, raid_level, Current_Access_Policy,
#                  disk_bad_sectors, Firmware_State, Predictive_Failure_Count,
#                  BBU_status, Media_Type, Media_Size, raid_Memory
#                  (only disk_bad_sectors drives the report; the others are not
#                  read again in this function and are kept for compatibility)
function other_server_hardware_health_check() {
  local pd_list ld_info disk_bad_info max_error_count

  #echo 'start check server '
  server_ip=$(timeout 20s ifconfig | grep Bcast | grep 'inet addr:' | grep -v '127.0.0.1' | head -n 1 | awk '{print $2}' | awk -F: '{print $2}')
  disk_usage_percent=$(timeout 20s df -h | grep /export | grep -v 'mnt' | awk '{print $5}')

  # Query each MegaCli view once and reuse the text; every extra invocation is
  # another chance for the controller tool to stall until the timeout fires.
  # 30s is the longest limit the individual physical-drive queries used before.
  pd_list=$(timeout 30s MegaCli64 -PDList -aALL)
  ld_info=$(timeout 20s MegaCli64 -LDInfo -Lall -aALL)

  raid_level=$(grep 'RAID Level' <<<"${ld_info}")
  Current_Access_Policy=$(grep 'Current Cache Policy' <<<"${ld_info}")
  BBU_status=${Current_Access_Policy}

  # Slots whose firmware state does not start with "O", each with its slot line.
  Firmware_State=$(grep -E 'Slot Number|Firmware state' <<<"${pd_list}" | grep -B 1 'Firmware state: [^O]')
  if [ -z "$Firmware_State" ]; then
    Firmware_State="other_server_Firmware_State_NULL"
  fi

  # Slots whose predictive failure count does not start with "0".
  Predictive_Failure_Count=$(grep -E 'Slot Number|Predictive Failure Count:' <<<"${pd_list}" | grep -B 1 'Predictive Failure Count: [^0]')
  if [ -z "$Predictive_Failure_Count" ]; then
    Predictive_Failure_Count="other_server_Predictive_Failure_Count_NULL"
  fi

  Media_Type=$(grep 'Media Type' <<<"${pd_list}" | sort -u | sed 's/      //g')
  Media_Size=$(grep 'Raw Size' <<<"${pd_list}" | sort -u | sed 's/      //g')
  Media_Type=$Media_Type' '$Media_Size
  raid_Memory=$(timeout 20s MegaCli64 -Cfgdsply -aALL | grep -E 'Memory' | head -1 | sed 's/      //g' | sed 's/Memory/CacheSize/g')

  # Highest error counter, compared numerically. awk always prints a number
  # (0 for empty input), so the comparison below cannot fail on an empty value.
  max_error_count=$(awk -F ': ' '
    /Media Error|Other Error/ { count = $2 + 0; if (count > max) max = count }
    END { print max + 0 }
  ' <<<"${pd_list}")

  # Group counter lines under the slot they belong to. Doing this per slot keeps
  # the pairing correct when several slots are affected or when one slot has
  # both counters set.
  disk_bad_info=$(awk '
    function emit() { if (errors != "") print slot errors; errors = "" }
    /Slot Number/ { emit(); slot = $0; next }
    (/Media Error/ || /Other Error/) && /Count: [^0]/ { errors = errors $0 }
    END { emit() }
  ' <<<"${pd_list}")

  # Print the result
  if (( ${max_error_count:-0} > 0 )); then
    disk_bad_sectors=${BAD_DISK_DESCRIPTION}
    printf '%s\n' "${disk_bad_info}"
  else
    disk_bad_sectors=${CORRECT_DISK_DESCRIPTION}
  fi
}

# The local rpm database is reset up front so that the rpm package installs
# performed later in the script do not run into problems: leftover lock files,
# the __db.00* files and the yum pid file are removed, then the database is rebuilt.
function rpm_localdb_rebuild(){
  local rpm_db_dir=/var/lib/rpm
  local lock_name

  for lock_name in .dbenv.lock .rpm.lock; do
    /bin/rm -f "${rpm_db_dir}/${lock_name}"
  done
  # The glob must stay unquoted so the shell expands it.
  /bin/rm -rf "${rpm_db_dir}"/__db.00*
  /bin/rm -f /var/run/yum.pid
  rpm --rebuilddb
}

# Health check for HP servers, based on hpacucli / hpssacli output.
#
# Prints the physical-drive lines whose text does not contain "OK"; prints
# nothing when every listed drive is OK or when the tool returned no drives.
#
# Globals read:    CORRECT_DISK_DESCRIPTION, BAD_DISK_DESCRIPTION,
#                  raid_slot_id (optional; controller slot to query)
# Globals written: server_ip, disk_usage_percent, raid_level, Current_Access_Policy,
#                  disk_bad_sectors, Firmware_State, Predictive_Failure_Count,
#                  BBU_status, Media_Type, raid_Memory
#                  (only disk_bad_sectors drives the report; the others are not
#                  read again in this function and are kept for compatibility)
function hp_server_hardware_health_check() {
  local pd_lines ctrl_detail disk_bad_info
  local -a pd_query

  # raid_slot_id is not assigned anywhere in this script. With it empty the
  # query used to be "ctrl slot= physicaldrive all show", which lists no drives,
  # so the check always reported a healthy array. Honour the variable when the
  # caller provides it, otherwise list the drives of all controllers.
  if [[ -n "${raid_slot_id:-}" ]]; then
    pd_query=(ctrl "slot=${raid_slot_id}" physicaldrive all show)
  else
    pd_query=(ctrl all show config)
  fi

  #has_command=$(rpm -qa 2>/dev/null | grep -i 'hpssacli' | wc -l)
  #echo 'start check DBserver '
  server_ip=$(timeout 20s ifconfig | grep Bcast | grep 'inet addr:' | grep -v '127.0.0.1' | head -n 1 | awk '{print $2}' | awk -F: '{print $2}')
  disk_usage_percent=$(timeout 20s df -h | grep /export | grep -v 'mnt' | awk '{print $5}')
  raid_level=$(timeout 20s hpssacli ctrl slot=0 ld all show | grep RAID | awk -F ',' '{print $2}')
  Current_Access_Policy=$(timeout 20s hpssacli ctrl all show config detail | grep -i cache | grep 'Wait for Cache Room' | awk -F ':' '{print $2}')
  Firmware_State=$(timeout 20s hpssacli ctrl all show config | grep physicaldrive | awk -F ',' '{print $2 $3 $4}' | sort | uniq -c)
  Predictive_Failure_Count="HP_server_Predictive_Failure_Count_NULL"

  # Run each hpacucli query once and reuse the text; the tool is known to stall
  # on some HP machines, so fewer invocations means fewer timeouts to wait for.
  pd_lines=$(timeout 20s hpacucli "${pd_query[@]}" | grep 'physicaldrive')
  ctrl_detail=$(timeout 20s hpacucli ctrl all show config detail)

  # Every drive line that does not report OK.
  disk_bad_info=$(grep -v 'OK' <<<"${pd_lines}")
  BBU_status=$(grep -i 'No-Battery Write Cache' <<<"${ctrl_detail}")
  Media_Type=$(awk -F ',' '{print $2 $3 $4}' <<<"${pd_lines}" | sort -u | sed 'N;s/\n//g' | sed 's/      //g')
  raid_Memory=$(grep -i cache <<<"${ctrl_detail}" | grep 'Total Cache Size' | sed 'N;s/\n//g' | sed 's/   //g')

  # disk sector judge + response print
  if [[ -n "${disk_bad_info}" ]]; then
    disk_bad_sectors=${BAD_DISK_DESCRIPTION}
    # Output format kept from the original: consecutive pairs of lines are joined.
    printf '%s\n' "${disk_bad_info}" | sed 'N;s/\n//g'
  else
    disk_bad_sectors=${CORRECT_DISK_DESCRIPTION}
  fi
}

# Entry point: skip containers, prepare the rpm tooling, then run the
# vendor-specific RAID health check.
# Outputs: "server is docker,return" when no /sbin/init process is found;
#          otherwise whatever the selected health check prints.
# Exits the script with status 0 in the container case.
function main() {
  local init_process_count server_vendor

  #server_vendor=`dmidecode -s system-manufacturer|sed '/^#/d'|awk '{print $1}'`
  # Count the processes whose command line contains /sbin/init. The "[/]" in
  # the pattern matches the same text but keeps grep's own command line from
  # matching, so the count no longer depends on whether ps happens to see the
  # grep process (the old "count == 1 means only grep matched" test was racy).
  # NOTE(review): a host whose init is not shown as /sbin/init is still treated
  # as a container, exactly as before - confirm this is intended.
  init_process_count=$(ps -ef | grep -c '[/]sbin/init')

  if [[ "${init_process_count}" == "0" ]]; then
    echo "server is docker,return"
    exit 0
  fi
  # Rebuild the local rpm database
  rpm_localdb_rebuild
  # Install dependencies
  install_dependency
  # Server vendor: first word of the manufacturer string, comment lines dropped
  server_vendor=$(dmidecode -s system-manufacturer | sed '/^#/d' | awk '{print $1}')

  case "${server_vendor}" in
  "HP"|"Hewlett-Packard")
    install_rpm "HP"
    hp_server_hardware_health_check
    ;;
  *)
    install_rpm "Other"
    other_server_hardware_health_check
    ;;
  esac
}

# Script entry: run the check when the file is executed.
main

posted @ 2023-09-26 15:58  SpecialSpeculator  阅读(16)  评论(0编辑  收藏  举报