利用定时任务监控 k8s 集群容器状态和容器日志中的 error 并发送告警

背景介绍

利用定时任务监控 k8s 集群的容器状态以及容器日志中的 error,并在发现异常时通过企业微信机器人和邮件发送告警通知。

脚本说明

#!/bin/bash
# Description: check whether the Pod logs of the last 10 minutes contain the
# error / exception keywords, and report the result.

PATH=$PATH:/usr/local/bin

# Lower bound handed to `kubectl logs --since-time`. The value carries a "Z"
# (UTC) suffix, so it is formatted with `date -u` rather than by subtracting a
# hard-coded 8 hours from local time, which is only right on UTC+8 hosts.
QueryTime=$(date -u +"%Y-%m-%dT%H:%M:%S" -d '-10 minutes').0000000Z


# Start of the time window shown in notifications (local time).
startTime=$(date +"%Y-%m-%dT%H:%M:%S" -d '-10 minutes')

# End of the time window shown in notifications (local time).
endTime=$(date +"%Y-%m-%dT%H:%M:%S")

# Services to inspect. Each entry is used as the value of the `app=` label
# selector when listing Pods and Deployments.
DeploymentNames=(
  maorong-bank-attachment maorong-bank-standard maorong-auth
  maorong-file maorong-gateway maorong-system
  vchain-afterloan vchain-auth vchain-credit vchain-finance
  vchain-gateway vchain-iot vchain-iot-sdk-client vchain-living
  vchain-operating vchain-sic-third vchain-base vchain-inloan
  vchain-preloan vchain-sic-cfca vchain-sic-msg vchain-sic-blockchain
)

# WeCom (企业微信) robot webhook URLs.
# NOTE(review): the `key=` values are credentials; consider reading them from
# the environment or a protected file instead of keeping them in the script.
logRobotUrl="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=ebfdaf5094f3f0"
livenssRobotUrl="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=2ab65b7d-7898ad8"

# Namespace the monitored services run in.
Namespace="haian-prod"

# File collecting one summary line per Pod whose logs matched.
pods_log_file="/var/log/checklogs/pods_log.txt"

# Archive holding the detailed log excerpts. The timestamp comes from `date`
# (the previous `data` typo made the name collapse to /tmp/logs-.zip).
logs_zipfile=/tmp/logs-$(date +"%Y-%m-%dT%H%M%S").zip
pods_log_dir="/var/log/checklogs/logs"

# File collecting the deployments whose replica counts do not match.
deployments_status_file="/var/log/checklogs/deployments_status.txt"

# File collecting the liveness state of every service.
livenessState_file="/var/log/checklogs/liveness_state.txt"

# Mail recipients. Expanded unquoted by the senders so that each
# whitespace-separated address becomes its own argument.
mailToUser="
  changli.tang@vonechain.com \
  jiaojiao.zou@vonechain.com
"

# Mail carbon-copy addresses (comma separated).
mailCcUser="bin.zhou@vonechain.com,"

# Health-check targets, one per entry, written as
#   <display alias>_<service name>_<gateway path segment>
# get_liveness_state splits on "_": text before the first "_" is the alias,
# the second field is the service name, text after the last "_" is the path.
# NOTE(review): the last entry contains a double underscore; the split still
# yields the same service name and path, so it is left untouched here.
health_api=(
  认证中心_vchain-auth_auth
  基础服务_vchain-base-biz_base
  贷前服务_vchain-preloan-biz_preloan
  贷中服务_vchain-inloan-biz_inloan
  贷后服务_vchain-afterloan-biz_afterloan
  运营服务_vchain-operating-biz_operating
  区块链服务_vchain-sic-blockchain-biz_blockchain
  第三方服务_vchain-sic-third-biz_third
  物联网服务_vchain-iot-biz_iot
  短信服务_vchain-sic-msg-biz_msg
  cfca认证服务_vchain-sic-cfca-biz_cfca
  物联网sdk服务_vchain-iot-sdk-client-biz_iot-sdk
  融资核心服务_vchain-finance-biz_finance
  预授信场景服务_vchain-credit-biz_credit
  活体质押融资服务_vchain-living-biz_living
  银行接口服务_bank-standard_bank-standard
  银行接口附件服务_bank-attachment__bank-attachment
)

# vchain gateway ingress settings.
# NOTE(review): INGRESS_HOST, SCHEMA_PORT, GATEWAY_DOMAIN and RESOLVE are not
# referenced anywhere else in the visible script.
INGRESS_HOST="192.168.10.50"
SCHEMA_PORT="80"
GATEWAY_DOMAIN="prodvchain-gateway.ntmaorong.com"
RESOLVE=${GATEWAY_DOMAIN}:${SCHEMA_PORT}:${INGRESS_HOST}

# Cluster IP and port of the vchain gateway Service. The namespace follows
# ${Namespace} (falling back to the former hard-coded value) so the lookups
# stay consistent with the rest of the script.
VCHAIN_GATEWAY_SVC_ADDRESS=$(kubectl get svc -n "${Namespace:-haian-prod}" -l app=vchain-gateway -o jsonpath='{.items[0].spec.clusterIP}')
VCHAIN_GATEWAY_SVC_PORT=$(kubectl get svc -n "${Namespace:-haian-prod}" -l app=vchain-gateway -o jsonpath='{.items[0].spec.ports[?(@.name == "tcp-9999")].port}')

# Cluster IP and port of the maorong gateway Service.
MAORONG_GATEWAY_SVC_ADDRESS=$(kubectl get svc -n "${Namespace:-haian-prod}" -l app=maorong-gateway -o jsonpath='{.items[0].spec.clusterIP}')
MAORONG_GATEWAY_SVC_PORT=$(kubectl get svc -n "${Namespace:-haian-prod}" -l app=maorong-gateway -o jsonpath='{.items[0].spec.ports[?(@.name == "tcp-8080")].port}')

# Make sure every output directory and record file exists before use.
# ${pods_log_dir} and the directory of ${livenessState_file} are included:
# tail_pods_log redirects per-Pod excerpts into the former, and the latter was
# previously touched without its directory being ensured.
for record_dir in \
  "${pods_log_file%/*}" \
  "${deployments_status_file%/*}" \
  "${livenessState_file%/*}" \
  "${pods_log_dir}"; do
  [[ -d "${record_dir}" ]] || mkdir -p -- "${record_dir}"
done
for record_file in \
  "${pods_log_file}" \
  "${deployments_status_file}" \
  "${livenessState_file}"; do
  [[ -f "${record_file}" ]] || touch -- "${record_file}"
done
unset record_dir record_file

# grep (basic regex) pattern that marks a log line as an error.
PATTERN='\[ERROR'
# grep (basic regex) alternatives; lines matching any of them are dropped
# before the error pattern is applied.
IGNORE_PATTERN='验证码不正确\|Invalid cookie\|验证码过期或不存在\|供应商未网签银行'

# Scan the recent logs of every Pod behind each monitored service.
#
# For each app label in DeploymentNames: list its Pods, fetch each Pod's logs
# since QueryTime, drop lines matching IGNORE_PATTERN, and keep lines matching
# PATTERN together with 100 lines of context on each side. The excerpt goes to
# ${pods_log_dir}/<pod>.log and one summary line per matching Pod is appended
# to ${pods_log_file}.
#
# Globals read: DeploymentNames, Namespace, QueryTime, startTime, endTime,
#               PATTERN, IGNORE_PATTERN, pods_log_dir, pods_log_file
function tail_pods_log(){
    local item pod
    local -a podsName

    echo "日志开始时间为: ${startTime}, 截止时间为: ${endTime}"
    echo ""

    # The excerpts are redirected into this directory; if it is missing every
    # redirection fails and no error is ever recorded.
    mkdir -p -- "${pods_log_dir}"

    for item in "${DeploymentNames[@]}";do
        # Names of the Pods carrying this app label (space separated).
        read -r -a podsName <<< "$(kubectl get pods -n "${Namespace}" -l "app=${item}" -o jsonpath='{range .items[*]}{.metadata.name}{" "}{end}')"

        # No Pod found: skip rather than call `kubectl logs` with an empty name.
        if [[ ${#podsName[@]} -eq 0 ]];then
            echo "未查询到服务 [${item}] 的 Pod, 跳过."
            continue
        fi

        for pod in "${podsName[@]}";do
            echo "正在查询 Pod [${pod}] 的日志..."
            # The pipeline status is that of the last grep: 0 only on a match.
            if kubectl logs -n "${Namespace}" --since-time="$QueryTime" "${pod}" \
                | grep -v "$IGNORE_PATTERN" \
                | grep -A100 -B100 "${PATTERN}" > "${pods_log_dir}/${pod}.log" 2>&1;then
                echo -e "\033[1;33;41m查询到 error 或者 exception 关键字的日志。查询日志时间范围为 ${startTime} ~ ${endTime}, Pod 名称为: ${pod}, 命名空间为: ${Namespace}.\e[0m"
                echo "时间范围: [\`${startTime}\` ~ \`${endTime}\`]; pod 名称: [\`${pod}\`]; 命名空间: [\`${Namespace}\`]" >> "${pods_log_file}"
            else
                echo "未查询到错误日志."
            fi
        done
    done
}


# Query the actuator health endpoint of every service in health_api and append
# one line per service to ${livenessState_file}.
#
# Entries whose path is "bank-standard" or "bank-attachment" are queried via
# the maorong gateway and judged by the top-level ".status"; all others go
# through the vchain gateway and are judged by
# ".components.livenessState.status". Any value other than "UP" (including an
# empty or unparsable response) is recorded as DOWN.
#
# Globals read: health_api, Namespace, livenessState_file,
#               VCHAIN_GATEWAY_SVC_ADDRESS, VCHAIN_GATEWAY_SVC_PORT,
#               MAORONG_GATEWAY_SVC_ADDRESS, MAORONG_GATEWAY_SVC_PORT
function get_liveness_state(){
    local item aliasName serverName apiUrl baseUrl statusFilter livenessState

    # The "\n>" is written literally (no echo -e); presumably it is meant as an
    # escape sequence once the text is embedded in the webhook payload.
    echo "查询的命名空间为: [<font color='warning'>${Namespace}</font>]\n>" >> "${livenessState_file}"
    for item in "${health_api[@]}";do
        # Entry layout: <alias>_<service name>_<path>.
        aliasName=${item%%_*}
        serverName=${item#*_}
        serverName=${serverName%%_*}
        apiUrl=${item##*_}
        echo "正在查询服务 [${aliasName} - ${serverName}] 的状态..."
        if [[ "${apiUrl}" != "bank-standard" && "${apiUrl}" != "bank-attachment" ]];then
            baseUrl="http://${VCHAIN_GATEWAY_SVC_ADDRESS}:${VCHAIN_GATEWAY_SVC_PORT}"
            statusFilter='.components.livenessState.status // empty'
        else
            baseUrl="http://${MAORONG_GATEWAY_SVC_ADDRESS}:${MAORONG_GATEWAY_SVC_PORT}"
            statusFilter='.status // empty'
        fi
        # --max-time keeps an unresponsive service from hanging the cron run;
        # jq -r prints the bare string so no quote stripping is needed.
        livenessState=$(curl -s --max-time 10 "${baseUrl}/${apiUrl}/actuator/health" | jq -r "${statusFilter}")

        if [[ "${livenessState}" != "UP" ]];then
            echo -e "\033[1;33;41m服务 [${aliasName} - ${serverName}] 服务异常,状态为非 UP!!\e[0m"
            echo "serverName: [<font color='warning'>${serverName}</font>]; status: [<font color='comment'>DOWN</font>]; aliasName: [<font color='warning'>${aliasName}</font>]" >> "${livenessState_file}"
        else
            echo -e "服务 \033[1;33;32m[${aliasName} - ${serverName}] \e[0m存活状态为 \033[1;33;32m${livenessState}.\e[0m"
            echo "serverName: [<font color='warning'>${serverName}</font>]; status: [<font color='info'>${livenessState}</font>]; aliasName: [<font color='warning'>${aliasName}</font>]" >> "${livenessState_file}"
        fi
    done
}


# Compare the desired and available replica counts of every monitored
# deployment and append the mismatching ones to ${deployments_status_file}.
#
# The desired count is read from .spec.replicas (the configured target), not
# from .status.replicas, so a deployment that could not create all of its Pods
# is still reported. A missing availableReplicas field is treated as 0.
#
# Globals read: DeploymentNames, Namespace, deployments_status_file
function get_deployment_status(){
    local item replicas availableReplicas

    for item in "${DeploymentNames[@]}";do
        echo "正在查询服务 [${item}] 的状态..."
        # One call returns "<desired>,<available>"; either side may be empty.
        IFS=, read -r replicas availableReplicas <<< "$(kubectl get deployment -n "${Namespace}" -l "app=${item}" -o jsonpath='{.items[0].spec.replicas},{.items[0].status.availableReplicas}')"

        # Nothing came back (no such deployment or kubectl failed): no verdict.
        if [[ -z "${replicas}" ]];then
            echo "未查询到服务 [${item}] 的 deployment, 跳过."
            continue
        fi
        availableReplicas=${availableReplicas:-0}

        if [[ "${availableReplicas}" != "${replicas}" ]];then
            echo -e "\033[1;33;41m${item}服务状态异常,可用副本数不等于目标副本数!\e[0m"
            echo "服务名: ${item}; 可用副本数: ${availableReplicas}; 目标副本数: ${replicas}" >> "${deployments_status_file}"
        else
            echo -e "\033[1;33;32m${item} \e[0m服务状态正常. 可用副本数: ${availableReplicas}. 目标副本数: ${replicas}"
        fi
    done
}

# Mail the content of a record file, followed by a hint on how to query logs.
#
# Arguments:
#   $1 - mail subject
#   $2 - record file; its content is the mail body and it is also attached
#   $3 - timestamp shown in the `--since-time=` hint
#   $4 - carbon-copy address list (may be empty)
#   $5 - whitespace-separated recipient addresses
send_email(){
    local subject=$1 record_file=$2 since_time=$3 cc=$4
    local record_content=""
    local -a recipients mail_args

    # Intentional word splitting: one argument per recipient address.
    # shellcheck disable=SC2206
    recipients=( ${5} )

    mail_args=(-s "${subject}")
    if [[ -r "${record_file}" ]];then
        record_content=$(cat -- "${record_file}")
        mail_args+=(-a "${record_file}")
    fi
    # Only pass -c with a value; an empty cc would otherwise make -c swallow
    # the first recipient.
    if [[ -n "${cc}" ]];then
        mail_args+=(-c "${cc}")
    fi

    # printf '%s' keeps backslashes in the recorded lines literal.
    printf '%s\n' \
        "生产环境产生错误日志,相关信息如下:" \
        "${record_content}" \
        "查询日志时使用命令如下:" \
        "kubectl logs -n haian-prod --timestamps --since-time=${since_time} \${PodName}" \
        | mail "${mail_args[@]}" "${recipients[@]}"
}

# Collect the Pod error logs and, when any were found, notify the WeCom robot
# and send a mail with the zipped log excerpts attached.
#
# Steps: run tail_pods_log, delete empty excerpt files, zip what is left, send
# both notifications, then reset the record file and remove the archive and
# the excerpts that were just sent.
#
# Globals read: pods_log_file, pods_log_dir, logs_zipfile, logRobotUrl,
#               Namespace, QueryTime, mailCcUser, mailToUser
function send_pods_log_messages(){
    local data log_path
    local -a mail_args

    # Produce the record file and the per-Pod excerpts.
    tail_pods_log

    data=$(cat "${pods_log_file}")

    # Remove empty excerpts. The size test must use the full path: testing the
    # bare file name (relative to the working directory) made every excerpt
    # look empty, so nothing was ever left to attach.
    for log_path in "${pods_log_dir}"/*;do
        [[ -e "${log_path}" ]] || continue
        if [[ ! -s "${log_path}" ]];then
            rm -rf -- "${log_path}"
        fi
    done

    # Archive the remaining excerpts.
    if [[ -n "$(ls -A "${pods_log_dir}" 2>/dev/null)" ]];then
        if zip -r "${logs_zipfile}" "${pods_log_dir}";then
            echo "文件压缩完成"
        else
            echo "文件压缩失败"
        fi
    else
        echo "${pods_log_dir} 目录为空"
    fi

    if [[ -n "${data}" ]];then
        # Notify the WeCom robot.
        # NOTE(review): the payload is assembled by string concatenation, so
        # quotes or raw newlines inside $data end up unescaped in the JSON.
        curl "${logRobotUrl}" -H 'Content-Type: application/json' -d '{
          "msgtype": "markdown",
          "markdown": {
            "content": "<font color='warning'>发现错误日志。相关信息为:</font>
                        > '"$data"' \n>
                        > <font color='warning'>**查询日志时使用命令如下:** </font>
                        > `kubectl logs -n '"${Namespace}"' --timestamps --since-time=\"'"${QueryTime}"'\" ${PodName} `\n>
                        > <font color='info'>将以上命令中的 PodName 替换为实际的 Pod 名称即可</font>"
          }
        }'

        # Send the mail; attach the archive only if it was created and pass -c
        # only when a cc list is configured.
        mail_args=(-s "项目-生产环境-错误日志")
        if [[ -e "${logs_zipfile}" ]];then
            mail_args+=(-a "${logs_zipfile}")
        fi
        if [[ -n "${mailCcUser}" ]];then
            mail_args+=(-c "${mailCcUser}")
        fi
        # ${mailToUser} is left unquoted on purpose: one argument per address.
        # shellcheck disable=SC2086
        printf '%s\n' \
            "生产环境产生错误日志,相关信息如下:" \
            "${data}" \
            "查询日志时使用命令如下:" \
            "kubectl logs -n ${Namespace} --timestamps --since-time=${QueryTime} \${PodName}" \
            | mail "${mail_args[@]}" ${mailToUser}

        # Reset the record file and drop the excerpts that were just sent, so
        # logs of Pods that no longer exist are not attached again next run.
        : > "${pods_log_file}"
        rm -rf -- "${pods_log_dir:?}"/*
    else
        echo "不存在错误日志"
    fi

    # The archive is single-use; never leave it behind in /tmp.
    rm -f -- "${logs_zipfile}"
}


# Report deployments whose replica counts do not match to the WeCom robot.
#
# Runs get_deployment_status, then reads ${deployments_status_file}. With no
# recorded content it only prints an "all normal" line; otherwise it posts the
# content as a markdown message and resets the file.
function send_deployments_status_messages(){
    local payload

    # Refresh the status record file.
    get_deployment_status

    data=$(cat ${deployments_status_file})

    # Nothing recorded: every service is healthy, no message is sent.
    if [[ -z $data ]];then
        echo -e "\033[1;33;32m所有服务状态正常.\e[0m"
        return
    fi

    payload='{
          "msgtype": "markdown",
          "markdown": {
            "content": "<font color='warning'>当前 deployment 服务状态信息如下:</font>
                       > '"$data"' \n>
                       > <font color='warning'>**查询deployment 状态使用命令如下:** </font>
                       > `kubectl get deployment -n ${Namespace} ${DeloymentName}` \n>
                       > <font color='info'>将以上命令中的变量替换为实际的值即可</font>"
          }
        }'
    curl ${logRobotUrl} -H 'Content-Type: application/json' -d "${payload}"

    # Mail delivery is currently disabled:
    #echo -e "项目生产环境 Deployment 状态信息如下: \
    #  \n`cat ${deployments_status_file}`" \
    #  | mail -s "项目-生产环境-Deployment 状态" -a ${deployments_status_file} -c ${mailCcUser} ${mailToUser}

    # Reset the record file (it is left holding a single newline).
    printf '\n' > ${deployments_status_file}
}


# Post the liveness state of all services to the WeCom robot.
#
# Runs get_liveness_state, sends the content of ${livenessState_file} as a
# markdown message (unconditionally), then resets the file.
function send_liveness_stats_messages(){
    local payload

    # Refresh the liveness record file.
    get_liveness_state

    data=$(cat ${livenessState_file})
    payload='{
      "msgtype": "markdown",
       "markdown": {
         "content": "<font color='warning'>当前服务存活状态信息为:</font>
                    > '"$data"' \n>"
      }
    }'
    curl ${livenssRobotUrl} -H 'Content-Type: application/json' -d "${payload}"

    # Mail delivery is currently disabled:
    #echo -e "项目生产环境服务存活状态信息如下: \
    #  \n`cat ${livenessState_file}`" \
    #  | mail -s "项目-生产环境-服务状态" -a ${livenessState_file} -c ${mailCcUser} ${mailToUser}

    # Reset the record file (it is left holding a single newline).
    printf '\n' > ${livenessState_file}
}


# Print the list of supported command-line options to stdout.
function Usage(){
    printf '%s\n' \
        "$0 可用参数说明如下:" \
        "  -l: 查询 Pods 日志" \
        "  -d: 查询 Deployment 副本状态" \
        "  -s: 查询服务存活状态" \
        "  -h: 帮助信息"
}

# Command-line entry point: each option triggers one check.
# Running without any option prints the help text instead of silently doing
# nothing.
if [[ $# -eq 0 ]];then
    Usage
fi

# The leading ":" selects getopts' silent mode: an unknown option sets arg to
# "?" and OPTARG to the offending character, so it can be reported below.
while getopts ":ldsh" arg
do
    case "$arg" in
        l)
            echo "查询 pods 日志"
            send_pods_log_messages
            ;;
        d)
            echo "查询 Deployment 副本状态"
            send_deployments_status_messages
            ;;
        s)
            echo "查询服务的存活状态"
            send_liveness_stats_messages
            ;;
        h)
            Usage
            ;;
        *)
            # Plain ASCII quotes: the former typographic quotes were printed
            # literally as part of the message.
            echo "参数错误: -${OPTARG}"
            Usage
            exit 2
            ;;
    esac
done

发邮件

#!/bin/bash
# Send a notification mail (stand-alone test script).

# Timestamp shown in the `--since-time=` hint. It carries a "Z" (UTC) suffix,
# so it is formatted with `date -u` rather than by subtracting a hard-coded
# 8 hours from local time, which is only right on UTC+8 hosts.
QueryTime=$(date -u +"%Y-%m-%dT%H:%M:%S" -d '-10 minutes').0000000Z
pods_log_file="test.txt"

# Mail recipient.
EMAIL_RECIVER="bin.zhou@vonechain.com"

# Mail carbon-copy address (disabled).
#EMAIL_CC="jiaojiao.zou@vonechain.com,"

# Mail body file.
EMAIL_CONTENT="./test.txt"

#send_email(){
#    echo -e "生产环境产生错误日志,相关信息如下: \
#        \n`cat ${2}`
#        \n查询日志时使用命令如下: \
#        \nkubectl logs -n haian-prod --timestamps --since-time=${3} \${PodName}" \
#        | mail -s "${1}" -a ${4} -c ${5} ${6}
#}

# Send a mail whose body is the given text.
#
# Arguments:
#   $1 - mail subject
#   $2 - body text; backslash escapes such as \n are interpreted
#   $3 - unused (kept so existing call sites keep their argument positions)
#   $4 - attachment path (may be empty)
#   $5 - carbon-copy address list (may be empty)
#   $6 - recipient address
send_email(){
  local -a mail_args=(-s "$1")

  # Pass -a / -c only with a value; an empty one would make the option swallow
  # the argument that follows it.
  if [[ -n "${4}" ]]; then
    mail_args+=(-a "${4}")
  fi
  if [[ -n "${5}" ]]; then
    mail_args+=(-c "${5}")
  fi

  # %b interprets backslash escapes the way `echo -e` did.
  printf '%b\n' "$2" | mail "${mail_args[@]}" "${6}"
}

# Mail the content of ${pods_log_file} (also attached) to ${EMAIL_RECIVER},
# followed by a hint on how to query the logs.
subject="项目-生产环境-产生错误日志"
if [[ -r "${pods_log_file}" ]]; then
  # printf '%s' keeps backslashes in the file content literal; all expansions
  # are quoted so a subject or path containing spaces stays one argument.
  printf '%s\n' \
    "生产环境产生错误日志,相关信息如下:" \
    "$(cat -- "${pods_log_file}")" \
    "查询日志时使用命令如下:" \
    "kubectl logs -n haian-prod --timestamps --since-time='${QueryTime}' \${PodName}" \
    "" \
    | mail -s "${subject}" -a "${pods_log_file}" "${EMAIL_RECIVER}"
else
  # Without this guard an empty path would make `cat` wait on stdin.
  echo "日志记录文件不存在或不可读: ${pods_log_file}" >&2
fi
#send_email ${subject} ${pods_log_file} ${QueryTime} ${pods_log_file} ${EMAIL_CC} ${EMAIL_RECIVER}
#echo -e ${content} | mail -s ${subject} -a ${pods_log_file} ${EMAIL_RECIVER}

配置定时任务

# At minute 0 of every second hour: check deployment replica status (-d).
# Output of each run overwrites /tmp/crontab_deployment_status.log.
0 */2 * * * /root/scripts/checklogs.sh -d >/tmp/crontab_deployment_status.log 2>&1
# At minute 0 of every second hour: check service liveness (-s).
# Output of each run overwrites /tmp/crontab_liveness_status.log.
0 */2 * * * /root/scripts/checklogs.sh -s >/tmp/crontab_liveness_status.log 2>&1
# Every 10 minutes: scan Pod logs (-l).
# Output of each run overwrites /tmp/crontab_checklogs.log.
*/10 * * * * /root/scripts/checklogs.sh -l >/tmp/crontab_checklogs.log 2>&1

效果

posted @ 2023-12-14 09:37  邹姣姣  阅读(61)  评论(0)  编辑  收藏  举报