利用定时任务监控 k8s 集群容器状态和容器日志中的 error 并发送告警
背景介绍
利用定时任务监控 k8s 集群的容器状态以及容器日志中的 error,并发送告警;告警通知可以通过企业微信或邮件发送。
脚本说明
#!/bin/bash
# Description: scan the Pod logs of the last 10 minutes for error keywords
# and report the findings.
PATH=$PATH:/usr/local/bin
# Lower bound handed to `kubectl logs --since-time`, formatted as a UTC
# timestamp with a trailing "Z". `date -u` produces UTC directly, so the value
# is right whatever the host time zone is (a fixed "-8 hours" offset is only
# correct on UTC+8 hosts).
QueryTime=$(date -u +"%Y-%m-%dT%H:%M:%S" -d '-10 minutes').0000000Z
# Start of the window in local time; only shown in notifications.
startTime=$(date +"%Y-%m-%dT%H:%M:%S" -d '-10 minutes')
# End of the window in local time; only shown in notifications.
endTime=$(date +"%Y-%m-%dT%H:%M:%S")
# Services to inspect; each name is used as the value of the `app` label
# selector in the kubectl queries.
DeploymentNames=()
# maorong services
DeploymentNames+=(
  maorong-bank-attachment
  maorong-bank-standard
  maorong-auth
  maorong-file
  maorong-gateway
  maorong-system
)
# vchain services
DeploymentNames+=(
  vchain-afterloan
  vchain-auth
  vchain-credit
  vchain-finance
  vchain-gateway
  vchain-iot
  vchain-iot-sdk-client
  vchain-living
  vchain-operating
  vchain-sic-third
  vchain-base
  vchain-inloan
  vchain-preloan
  vchain-sic-cfca
  vchain-sic-msg
  vchain-sic-blockchain
)
# WeCom (企业微信) robot webhook URLs.
# NOTE(review): the webhook keys are stored in plain text here — consider
# reading them from the environment or a root-only file.
logRobotUrl="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=ebfdaf5094f3f0"
livenssRobotUrl="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=2ab65b7d-7898ad8"
# Namespace the monitored services run in.
Namespace="haian-prod"
# Record of the Pods whose logs matched; its content becomes the alert body.
pods_log_file="/var/log/checklogs/pods_log.txt"
# Zip archive of the matched log excerpts, attached to the alert mail.
# The timestamp is produced by `date` (it used to be misspelled `data`, which
# left the file name without a timestamp).
logs_zipfile=/tmp/logs-$(date +"%Y-%m-%dT%H%M%S").zip
# Directory holding one log excerpt per Pod.
pods_log_dir="/var/log/checklogs/logs"
# Record of deployments whose replica counts do not match.
deployments_status_file="/var/log/checklogs/deployments_status.txt"
# Record of the service liveness probes.
livenessState_file="/var/log/checklogs/liveness_state.txt"
# Mail recipients, whitespace separated. Callers expand this variable
# unquoted so that every address becomes its own argument.
mailToUser="
changli.tang@vonechain.com \
jiaojiao.zou@vonechain.com
"
# Mail carbon-copy recipients.
mailCcUser="bin.zhou@vonechain.com,"
# Health-check targets. Every entry has exactly three "_"-separated fields:
#   <display alias>_<server name>_<route prefix used in the health URL>
# get_liveness_state takes the alias from the text before the first "_", the
# server name from the second field and the route from the text after the
# last "_". Entries whose route is bank-standard or bank-attachment are probed
# through the maorong gateway, all others through the vchain gateway.
health_api=(
  认证中心_vchain-auth_auth
  基础服务_vchain-base-biz_base
  贷前服务_vchain-preloan-biz_preloan
  贷中服务_vchain-inloan-biz_inloan
  贷后服务_vchain-afterloan-biz_afterloan
  运营服务_vchain-operating-biz_operating
  区块链服务_vchain-sic-blockchain-biz_blockchain
  第三方服务_vchain-sic-third-biz_third
  物联网服务_vchain-iot-biz_iot
  短信服务_vchain-sic-msg-biz_msg
  cfca认证服务_vchain-sic-cfca-biz_cfca
  物联网sdk服务_vchain-iot-sdk-client-biz_iot-sdk
  融资核心服务_vchain-finance-biz_finance
  预授信场景服务_vchain-credit-biz_credit
  活体质押融资服务_vchain-living-biz_living
  银行接口服务_bank-standard_bank-standard
  银行接口附件服务_bank-attachment_bank-attachment
)
# vchain gateway ingress settings.
INGRESS_HOST="192.168.10.50"
SCHEMA_PORT="80"
GATEWAY_DOMAIN="prodvchain-gateway.ntmaorong.com"
# <domain>:<port>:<ip> triple.
# NOTE(review): looks like an argument for `curl --resolve`, but nothing in
# the visible script reads it — confirm before removing.
RESOLVE="${GATEWAY_DOMAIN}:${SCHEMA_PORT}:${INGRESS_HOST}"
# Cluster IP and port of the vchain gateway Service. The lookups follow the
# configured Namespace (falling back to haian-prod when it is unset) instead
# of repeating the namespace literal.
VCHAIN_GATEWAY_SVC_ADDRESS=$(kubectl get svc -n "${Namespace:-haian-prod}" -l app=vchain-gateway -o jsonpath='{.items[0].spec.clusterIP}')
VCHAIN_GATEWAY_SVC_PORT=$(kubectl get svc -n "${Namespace:-haian-prod}" -l app=vchain-gateway -o jsonpath='{.items[0].spec.ports[?(@.name == "tcp-9999")].port}')
# Cluster IP and port of the maorong gateway Service.
MAORONG_GATEWAY_SVC_ADDRESS=$(kubectl get svc -n "${Namespace:-haian-prod}" -l app=maorong-gateway -o jsonpath='{.items[0].spec.clusterIP}')
MAORONG_GATEWAY_SVC_PORT=$(kubectl get svc -n "${Namespace:-haian-prod}" -l app=maorong-gateway -o jsonpath='{.items[0].spec.ports[?(@.name == "tcp-8080")].port}')
# Create every directory and state file the checks write to.
# pods_log_dir is included on purpose: tail_pods_log redirects one excerpt per
# Pod into that directory, and the redirection fails if it does not exist.
# Globals read: pods_log_file, deployments_status_file, livenessState_file,
#               pods_log_dir
ensure_state_paths(){
  local dir file
  for dir in "${pods_log_file%/*}" "${deployments_status_file%/*}" \
    "${livenessState_file%/*}" "${pods_log_dir}"; do
    # Skip unset settings rather than calling mkdir with an empty path.
    [[ -n "${dir}" ]] || continue
    [[ -d "${dir}" ]] || mkdir -p -- "${dir}"
  done
  for file in "${pods_log_file}" "${deployments_status_file}" "${livenessState_file}"; do
    [[ -n "${file}" ]] || continue
    [[ -f "${file}" ]] || touch -- "${file}"
  done
}
ensure_state_paths
# grep pattern selecting the log lines to report: a literal "[ERROR".
PATTERN='\[ERROR'
# Messages that are filtered out before matching; alternatives are joined
# with "\|".
IGNORE_PATTERN='验证码不正确\|Invalid cookie\|验证码过期或不存在\|供应商未网签银行'
# Scan the recent logs of every Pod behind the configured deployments.
# Globals read: DeploymentNames, Namespace, QueryTime, startTime, endTime,
#               PATTERN, IGNORE_PATTERN, pods_log_dir, pods_log_file
# Side effects: writes <pods_log_dir>/<pod>.log for each Pod (matching lines
#               plus 100 lines of context before and after) and appends one
#               summary line per matching Pod to pods_log_file.
function tail_pods_log(){
  local item pod
  local -a podsName
  echo "日志开始时间为: ${startTime}, 截止时间为: ${endTime}"
  echo ""
  for item in "${DeploymentNames[@]}"; do
    # The jsonpath prints the Pod names separated by spaces; read -a splits
    # them into an array (empty output gives an empty array).
    read -r -a podsName <<<"$(kubectl get pods -n "${Namespace}" -l "app=${item}" -o jsonpath='{range .items[*]}{.metadata.name}{" "}{end}')"
    # No Pod carries this label: skip, instead of running `kubectl logs` with
    # an empty Pod name and writing a bogus ".log" excerpt.
    if (( ${#podsName[@]} == 0 )); then
      echo "服务 [${item}] 未查询到 Pod, 跳过."
      continue
    fi
    # One loop handles a single Pod and several Pods alike.
    for pod in "${podsName[@]}"; do
      echo "正在查询 Pod [${pod}] 的日志..."
      # The pipeline's status is that of the last grep (pipefail is not
      # enabled in this script): success means an error line was found.
      if kubectl logs -n "${Namespace}" --since-time="${QueryTime}" "${pod}" \
        | grep -v "${IGNORE_PATTERN}" \
        | grep "${PATTERN}" -A100 -B100 > "${pods_log_dir}/${pod}.log" 2>&1; then
        echo -e "\033[1;33;41m查询到 error 或者 exception 关键字的日志。查询日志时间范围为 ${startTime} ~ ${endTime}, Pod 名称为: ${pod}, 命名空间为: ${Namespace}.\e[0m"
        echo "时间范围: [\`${startTime}\` ~ \`${endTime}\`]; pod 名称: [\`${pod}\`]; 命名空间: [\`${Namespace}\`]" >> "${pods_log_file}"
      else
        echo "未查询到错误日志."
      fi
    done
  done
}
# Probe the actuator health endpoint of every service listed in health_api and
# record one line per service in livenessState_file.
# Globals read: health_api, Namespace, livenessState_file,
#               VCHAIN_GATEWAY_SVC_ADDRESS, VCHAIN_GATEWAY_SVC_PORT,
#               MAORONG_GATEWAY_SVC_ADDRESS, MAORONG_GATEWAY_SVC_PORT
function get_liveness_state(){
  local item aliasName serverName apiUrl healthUrl jqFilter livenessState
  # echo runs without -e, so the two characters "\n" are stored literally;
  # the file content is later embedded in the JSON body sent to the robot.
  echo "查询的命名空间为: [<font color='warning'>${Namespace}</font>]\n>" >> "${livenessState_file}"
  for item in "${health_api[@]}"; do
    # Entry layout: <alias>_<server name>_<route>.
    aliasName=${item%%_*}
    serverName=${item#*_}
    serverName=${serverName%%_*}
    apiUrl=${item##*_}
    echo "正在查询服务 [${aliasName} - ${serverName}] 的状态..."
    case "${apiUrl}" in
      bank-standard|bank-attachment)
        # Bank services sit behind the maorong gateway and expose the overall
        # status at the top level of the health document.
        healthUrl="http://${MAORONG_GATEWAY_SVC_ADDRESS}:${MAORONG_GATEWAY_SVC_PORT}/${apiUrl}/actuator/health"
        jqFilter='.status'
        ;;
      *)
        healthUrl="http://${VCHAIN_GATEWAY_SVC_ADDRESS}:${VCHAIN_GATEWAY_SVC_PORT}/${apiUrl}/actuator/health"
        jqFilter='.components.livenessState.status'
        ;;
    esac
    # Bounded timeouts keep one unresponsive endpoint from hanging the cron
    # job; a probe that fails or times out yields a non-"UP" value and is
    # reported as DOWN below.
    livenessState=$(curl -s --connect-timeout 5 --max-time 10 "${healthUrl}" | jq -r "${jqFilter}")
    if [[ "${livenessState}" != "UP" ]]; then
      echo -e "\033[1;33;41m服务 [${aliasName} - ${serverName}] 服务异常,状态为非 UP!!\e[0m"
      # Anything other than UP is recorded as a fixed DOWN, not the raw probe
      # value (which may be empty or "null").
      echo "serverName: [<font color='warning'>${serverName}</font>]; status: [<font color='comment'>DOWN</font>]; aliasName: [<font color='warning'>${aliasName}</font>]" >> "${livenessState_file}"
    else
      echo -e "服务 \033[1;33;32m[${aliasName} - ${serverName}] \e[0m存活状态为 \033[1;33;32m${livenessState}.\e[0m"
      echo "serverName: [<font color='warning'>${serverName}</font>]; status: [<font color='info'>${livenessState}</font>]; aliasName: [<font color='warning'>${aliasName}</font>]" >> "${livenessState_file}"
    fi
  done
}
# Compare the available replica count of every deployment with its desired
# replica count and record the mismatches in deployments_status_file.
# Globals read: DeploymentNames, Namespace, deployments_status_file
function get_deployment_status(){
  local item status availableReplicas replicas
  for item in "${DeploymentNames[@]}"; do
    echo "正在查询服务 [${item}] 的状态..."
    # A single query returns both numbers, so they describe the same moment.
    # The target count is taken from spec.replicas (the desired number), and
    # "/" is used as separator so an absent availableReplicas stays an empty
    # first field instead of being swallowed as whitespace.
    if ! status=$(kubectl get deployment -n "${Namespace}" -l "app=${item}" -o jsonpath='{.items[0].status.availableReplicas}{"/"}{.items[0].spec.replicas}'); then
      # A deployment that cannot be queried must not be reported as healthy.
      echo -e "\033[1;33;41m${item}服务状态查询失败!\e[0m"
      echo "服务名: ${item}; 状态查询失败" >> "${deployments_status_file}"
      continue
    fi
    IFS=/ read -r availableReplicas replicas <<<"${status}"
    # A field missing from the output is treated as zero replicas.
    availableReplicas=${availableReplicas:-0}
    replicas=${replicas:-0}
    if [[ "${availableReplicas}" != "${replicas}" ]]; then
      echo -e "\033[1;33;41m${item}服务状态异常,可用副本数不等于目标副本数!\e[0m"
      echo "服务名: ${item}; 可用副本数: ${availableReplicas}; 目标副本数: ${replicas}" >> "${deployments_status_file}"
    else
      echo -e "\033[1;33;32m${item} \e[0m服务状态正常. 可用副本数: ${availableReplicas}. 目标副本数: ${replicas}"
    fi
  done
}
# Mail an error-log report: the body inlines the record file and shows the
# kubectl command for fetching the logs; the record file is also attached.
# Arguments:
#   $1 - mail subject
#   $2 - record file (inlined in the body and attached)
#   $3 - value shown for --since-time in the suggested command
#   $4 - carbon-copy address(es); -c is omitted when empty
#   $5 - recipient addresses, whitespace separated
send_email(){
  local subject=$1 record_file=$2 since_time=$3 cc=$4
  local -a recipients mail_args
  # Split the recipient list on any whitespace (including newlines); read
  # returns non-zero at end of input, which is expected here.
  read -r -d '' -a recipients <<<"$5" || true
  mail_args=(-s "${subject}" -a "${record_file}")
  if [[ -n "${cc}" ]]; then
    mail_args+=(-c "${cc}")
  fi
  # printf '%s' copies the record verbatim, so backslashes in it are not
  # reinterpreted as escape sequences.
  printf '%s\n%s\n\n%s\n%s\n' \
    "生产环境产生错误日志,相关信息如下: " \
    "$(cat -- "${record_file}")" \
    "查询日志时使用命令如下: " \
    "kubectl logs -n ${Namespace:-haian-prod} --timestamps --since-time=${since_time} \${PodName}" \
    | mail "${mail_args[@]}" "${recipients[@]}"
}
# Run the Pod log scan and publish the findings to the WeCom robot and by mail.
# Globals read: pods_log_file, pods_log_dir, logs_zipfile, logRobotUrl,
#               Namespace, QueryTime, mailCcUser, mailToUser
# Side effects: zips the excerpts into logs_zipfile, posts to the robot, sends
#               a mail, then resets the record file and removes the archive
#               and the excerpts that were sent.
function send_pods_log_messages(){
  local data excerpt
  local -a mailArgs
  # Produce the record file and the per-Pod excerpts.
  tail_pods_log
  data=$(cat "${pods_log_file}")
  # Drop excerpts that captured nothing. Each one is tested by its full path:
  # testing the bare file name checks the working directory instead, which
  # makes every excerpt look empty and deletes them all.
  for excerpt in "${pods_log_dir}"/*; do
    [[ -f "${excerpt}" ]] || continue
    [[ -s "${excerpt}" ]] || rm -f -- "${excerpt}"
  done
  # Archive whatever excerpts remain.
  if [[ -n "$(ls -A "${pods_log_dir}")" ]]; then
    if zip -r "${logs_zipfile}" "${pods_log_dir}"; then
      echo "文件压缩完成"
    else
      echo "文件压缩失败"
    fi
  else
    echo "${pods_log_dir} 目录为空"
  fi
  if [[ "${data}" != "" ]]; then
    # Post to the WeCom robot.
    # NOTE(review): the JSON body is assembled by string concatenation, so a
    # double quote or backslash inside the record would break it — verify
    # against the robot API before changing the payload.
    curl "${logRobotUrl}" -H 'Content-Type: application/json' -d '{
"msgtype": "markdown",
"markdown": {
"content": "<font color='warning'>发现错误日志。相关信息为:</font>
> '"$data"' \n>
> <font color='warning'>**查询日志时使用命令如下:** </font>
> `kubectl logs -n '"${Namespace}"' --timestamps --since-time=\"'"${QueryTime}"'\" ${PodName} `\n>
> <font color='info'>将以上命令中的 PodName 替换为实际的 Pod 名称即可</font>"
}
}'
    # Send the mail; the archive is attached only when it was created.
    mailArgs=(-s "项目-生产环境-错误日志")
    if [[ -e "${logs_zipfile}" ]]; then
      mailArgs+=(-a "${logs_zipfile}")
    fi
    mailArgs+=(-c "${mailCcUser}")
    # mailToUser is expanded unquoted on purpose: it holds several
    # whitespace-separated addresses. printf '%s' keeps the record verbatim.
    # shellcheck disable=SC2086
    printf '%s\n%s \n%s\n%s\n' \
      "生产环境产生错误日志,相关信息如下: " \
      "${data}" \
      "查询日志时使用命令如下: " \
      "kubectl logs -n ${Namespace} --timestamps --since-time=${QueryTime} \${PodName}" \
      | mail "${mailArgs[@]}" ${mailToUser}
    # Reset the record and remove what has been sent, so the next run does
    # not archive stale excerpts again.
    echo "" > "${pods_log_file}"
    rm -rf -- "${logs_zipfile}"
    rm -f -- "${pods_log_dir}"/*.log
  else
    echo "不存在错误日志"
  fi
}
# Check the deployment replica status and, when any mismatch was recorded,
# post the record to the WeCom robot.
# Globals read: deployments_status_file, logRobotUrl
function send_deployments_status_messages(){
  local data
  # Produce the deployment status record.
  get_deployment_status
  data=$(cat "${deployments_status_file}")
  if [[ "${data}" != "" ]]; then
    # ${Namespace} and ${DeploymentName} sit inside single quotes, so they
    # are sent literally as placeholders for the reader to fill in (the
    # second placeholder used to be misspelled "DeloymentName").
    curl "${logRobotUrl}" -H 'Content-Type: application/json' -d '{
"msgtype": "markdown",
"markdown": {
"content": "<font color='warning'>当前 deployment 服务状态信息如下:</font>
> '"$data"' \n>
> <font color='warning'>**查询deployment 状态使用命令如下:** </font>
> `kubectl get deployment -n ${Namespace} ${DeploymentName}` \n>
> <font color='info'>将以上命令中的变量替换为实际的值即可</font>"
}
}'
    # Mail delivery is disabled for this report; previous invocation:
    #echo -e "项目生产环境 Deployment 状态信息如下: \
    # \n`cat ${deployments_status_file}`" \
    # | mail -s "项目-生产环境-Deployment 状态" -a ${deployments_status_file} -c ${mailCcUser} ${mailToUser}
    # Reset the record for the next run.
    echo "" > "${deployments_status_file}"
  else
    echo -e "\033[1;33;32m所有服务状态正常.\e[0m"
  fi
}
# Probe the services and post the liveness record to the WeCom robot.
# Globals read: livenessState_file, livenssRobotUrl
function send_liveness_stats_messages(){
  local report payload
  # Collect the liveness data first.
  get_liveness_state
  report=$(cat "${livenessState_file}")
  # Markdown message body, with the record spliced into the content field.
  payload='{
"msgtype": "markdown",
"markdown": {
"content": "<font color='warning'>当前服务存活状态信息为:</font>
> '"${report}"' \n>"
}
}'
  curl "${livenssRobotUrl}" -H 'Content-Type: application/json' -d "${payload}"
  # Mail delivery is disabled for this report; previous invocation:
  #echo -e "项目生产环境服务存活状态信息如下: \
  # \n`cat ${livenessState_file}`" \
  # | mail -s "项目-生产环境-服务状态" -a ${livenessState_file} -c ${mailCcUser} ${mailToUser}
  # Reset the record for the next run.
  echo "" > "${livenessState_file}"
}
# Print the list of supported command-line options to stdout.
Usage() {
  printf '%s\n' \
    "$0 可用参数说明如下:" \
    " -l: 查询 Pods 日志" \
    " -d: 查询 Deployment 副本状态" \
    " -s: 查询服务存活状态" \
    " -h: 帮助信息"
}
# Entry point: run the checks selected on the command line, in the order given.
#   -l  Pod log scan
#   -d  Deployment replica status
#   -s  service liveness
#   -h  help
# Prints the usage text when called without options. Returns 1 on an unknown
# option, 0 otherwise.
main(){
  local arg
  local OPTIND=1
  if (( $# == 0 )); then
    Usage
    return 0
  fi
  # The leading ":" selects getopts' silent mode: an unknown option is
  # delivered as "?" and handled by the default arm below.
  while getopts ":ldsh" arg; do
    case "${arg}" in
      l)
        echo "查询 pods 日志"
        send_pods_log_messages
        ;;
      d)
        echo "查询 Deployment 副本状态"
        send_deployments_status_messages
        ;;
      s)
        echo "查询服务的存活状态"
        send_liveness_stats_messages
        ;;
      h)
        Usage
        ;;
      *)
        # Plain ASCII quotes: typographic quotes are not shell quoting and
        # would be printed as part of the message.
        echo "参数错误" >&2
        Usage
        return 1
        ;;
    esac
  done
}
main "$@"
发邮件
#!/bin/bash
# Stand-alone helper that mails the error-log report.
# UTC timestamp of ten minutes ago, shown as the --since-time value in the
# mail body. `date -u` gives UTC on any host; a fixed "-8 hours" offset is
# only correct on UTC+8 hosts.
QueryTime=$(date -u +"%Y-%m-%dT%H:%M:%S" -d '-10 minutes').0000000Z
# Report file that is inlined in the mail body and attached.
pods_log_file="test.txt"
# Mail recipient.
EMAIL_RECIVER="bin.zhou@vonechain.com"
# Mail carbon-copy recipient (disabled).
#EMAIL_CC="jiaojiao.zou@vonechain.com,"
# Mail body file.
# NOTE(review): not referenced by the visible code — confirm before removing.
EMAIL_CONTENT="./test.txt"
#send_email(){
# echo -e "生产环境产生错误日志,相关信息如下: \
# \n`cat ${2}`
# \n查询日志时使用命令如下: \
# \nkubectl logs -n haian-prod --timestamps --since-time=${3} \${PodName}" \
# | mail -s "${1}" -a ${4} -c ${5} ${6}
#}
# Send a mail whose body is the text in $2; backslash escapes in it (such as
# \n) are expanded, as with `echo -e`.
# Arguments:
#   $1 - subject
#   $2 - body text
#   $3 - unused by this function
#   $4 - attachment path
#   $5 - carbon-copy address(es)
#   $6 - recipient addresses, whitespace separated
send_email(){
  local -a recipients
  # Split the recipient list on whitespace; read returns non-zero at end of
  # input, which is expected here.
  read -r -d '' -a recipients <<<"$6" || true
  # Quoting keeps a subject or attachment path containing spaces in one
  # argument and preserves the whitespace inside the body.
  printf '%b\n' "$2" | mail -s "$1" -a "$4" -c "$5" "${recipients[@]}"
}
subject="项目-生产环境-产生错误日志"
# Build the body with printf '%s' so the report file is copied verbatim
# (backslashes in it are not reinterpreted), and quote every argument so the
# subject and the attachment path each stay a single word.
printf '%s\n%s \n%s\n%s\n\n' \
  "生产环境产生错误日志,相关信息如下: " \
  "$(cat -- "${pods_log_file}")" \
  "查询日志时使用命令如下: " \
  "kubectl logs -n haian-prod --timestamps --since-time='${QueryTime}' \${PodName}" \
  | mail -s "${subject}" -a "${pods_log_file}" "${EMAIL_RECIVER}"
#send_email ${subject} ${pods_log_file} ${QueryTime} ${pods_log_file} ${EMAIL_CC} ${EMAIL_RECIVER}
#echo -e ${content} | mail -s ${subject} -a ${pods_log_file} ${EMAIL_RECIVER}
配置定时任务
# Every 2 hours, on the hour: Deployment replica status check (-d).
# Output replaces /tmp/crontab_deployment_status.log on each run.
0 */2 * * * /root/scripts/checklogs.sh -d >/tmp/crontab_deployment_status.log 2>&1
# Every 2 hours, on the hour: service liveness check (-s).
0 */2 * * * /root/scripts/checklogs.sh -s >/tmp/crontab_liveness_status.log 2>&1
# Every 10 minutes: Pod error-log scan (-l); the interval matches the
# 10-minute window the script queries.
*/10 * * * * /root/scripts/checklogs.sh -l >/tmp/crontab_checklogs.log 2>&1