openshift 容器云从入门到崩溃之九《容器监控-报警》
容器状态监控
主要是监控POD的状态,包括重启、不健康等等,这些状态k8s api本身会报出来,再配合zabbix报警
导入zabbix模板关联上oc master主机
<?xml version="1.0" encoding="UTF-8"?> <zabbix_export> <version>3.2</version> <date>2019-02-27T07:33:05Z</date> <groups> <group> <name>Templates</name> </group> </groups> <templates> <template> <template>OC Pods</template> <name>OC Pods</name> <description/> <groups> <group> <name>Templates</name> </group> </groups> <applications> <application> <name>restartCount</name> </application> <application> <name>RunningStatus</name> </application> </applications> <items/> <discovery_rules> <discovery_rule> <name>OC Pods Discover</name> <type>0</type> <snmp_community/> <snmp_oid/> <key>oc.pod.status[discover,discover]</key> <delay>300</delay> <status>0</status> <allowed_hosts/> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <delay_flex/> <params/> <ipmi_sensor/> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <filter> <evaltype>0</evaltype> <formula/> <conditions/> </filter> <lifetime>7</lifetime> <description/> <item_prototypes> <item_prototype> <name>Pod {#POD_NAME} Get Status</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.pod.status[{#POD_NAME},get_status]</key> <delay>300</delay> <history>7</history> <trends>0</trends> <status>0</status> <value_type>4</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>RunningStatus</name> </application> 
</applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>Pod {#POD_NAME} Restarts</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.pod.status[{#POD_NAME},restarts]</key> <delay>300</delay> <history>7</history> <trends>0</trends> <status>0</status> <value_type>4</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>restartCount</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>Pod {#POD_NAME} Running</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.pod.status[{#POD_NAME},running]</key> <delay>300</delay> <history>7</history> <trends>0</trends> <status>0</status> <value_type>4</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>RunningStatus</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> </item_prototypes> <trigger_prototypes> <trigger_prototype> 
<expression>{OC Pods:oc.pod.status[{#POD_NAME},running].str(Running_true)}=0 and {OC Pods:oc.pod.status[{#POD_NAME},running].str(Pod deleted)}=0</expression> <recovery_mode>0</recovery_mode> <recovery_expression/> <name>Pod {#POD_NAME} Not Running</name> <correlation_mode>0</correlation_mode> <correlation_tag/> <url/> <status>0</status> <priority>1</priority> <description/> <type>0</type> <manual_close>1</manual_close> <dependencies/> <tags/> </trigger_prototype> <trigger_prototype> <expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning)}=1</expression> <recovery_mode>1</recovery_mode> <recovery_expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning,#3)}=0</recovery_expression> <name>Pod {#POD_NAME} restarted Warning</name> <correlation_mode>0</correlation_mode> <correlation_tag/> <url/> <status>0</status> <priority>1</priority> <description/> <type>0</type> <manual_close>1</manual_close> <dependencies/> <tags/> </trigger_prototype> </trigger_prototypes> <graph_prototypes/> <host_prototypes/> </discovery_rule> </discovery_rules> <httptests/> <macros/> <templates/> <screens/> </template> </templates> </zabbix_export>
zabbix客户端配置
修改zabbix_agentd.conf
Timeout=30
UserParameter=oc.pod.status[*],/data/app/zabbix/etc/oc_pod_monitor.sh $1 $2
oc_pod_monitor.sh内容
#!/bin/bash
#
# oc_pod_monitor.sh - Zabbix UserParameter helper that reports OpenShift pod
# state through the Kubernetes REST API.
#
# Usage: oc_pod_monitor.sh <pod> <check>
#   <pod>    "discover", a bare pod name, or "namespace=pod" as printed by the
#            discover check.
#   <check>  discover    refresh the pod cache and print Zabbix LLD JSON
#            get_status  download the pod's status document for later checks
#            restarts    "Warning ..." when the restart total grew, else "Normal ..."
#            running     "<phase>_true" or "<phase>_false" (all containers ready?)
#
# Every check prints one text value on stdout.

# Fill these in before deploying: API bearer token and master host[:port].
TOKEN=""
ENDPOINT=""
# Directory holding the cached API responses and per-pod restart counters.
WORKSPACE="/data/tmp/oc_monitor"

# Print a usage line on stderr.
usage() {
  printf 'Usage: %s <pod|discover> <discover|get_status|restarts|running>\n' \
    "${0##*/}" >&2
  return 2
}

# Print the pod name: the text after the last '=' of $1, or $1 itself.
pod_name_of() {
  printf '%s\n' "${1##*=}"
}

# Print the namespace part of a "namespace=pod" key, or nothing for a bare name.
namespace_hint_of() {
  case "$1" in
    *=*) printf '%s\n' "${1%=*}" ;;
    *) printf '\n' ;;
  esac
}

# Print the namespace of pod $1 according to the cached pod list.
# $2 (optional) restricts the match to that namespace. The comparison is an
# exact string match, so "web-1" never matches "web-10". Prints nothing when
# the pod is not in the cache or the cache is missing/unreadable.
lookup_namespace() {
  local pod=$1 hint=${2:-}
  jq -r --arg pod "$pod" --arg ns "$hint" '
    .items[] | .metadata
    | select(.name == $pod and ($ns == "" or .namespace == $ns))
    | .namespace
  ' "$WORKSPACE/all_pods.json" 2>/dev/null | head -n 1
}

# Download URL $1 into file $2. The response is written to a temporary file
# next to the target and renamed into place, so a check running at the same
# time never reads a half-written document.
# NOTE(review): -k disables TLS certificate verification (kept from the
# original); prefer --cacert with the cluster CA.
fetch_to_file() {
  local url=$1 dest=$2 tmp
  tmp=$(mktemp "${dest}.XXXXXX" 2>/dev/null) || tmp=$dest
  curl -k \
    -H "Authorization: Bearer $TOKEN" \
    -H 'Accept: application/json' \
    "$url" 2>/dev/null > "$tmp"
  if [[ "$tmp" != "$dest" ]]; then
    mv -f -- "$tmp" "$dest"
  fi
}

# Read values from stdin (one per line) and print them as Zabbix low-level
# discovery JSON, using $1 as the macro name.
render_discovery() {
  local macro=$1 value first=1
  printf '{\n\t"data":[\n'
  while IFS= read -r value; do
    [[ -n "$value" ]] || continue
    (( first )) || printf ',\n'
    first=0
    printf '\t\t{\n\t\t\t"%s":"%s"}' "$macro" "$value"
  done
  (( first )) || printf '\n'
  printf '\t]\n}\n'
}

# Refresh the pod cache and print one "namespace=pod" discovery entry per pod.
# Pods whose NAME contains build, deploy or debug are skipped. Name and
# namespace are taken from the same JSON object in a single jq pass.
discover_pods() {
  fetch_to_file "https://$ENDPOINT/api/v1/pods" "$WORKSPACE/all_pods.json"
  jq -r '.items[] | .metadata | "\(.name) \(.namespace)"' \
    "$WORKSPACE/all_pods.json" 2>/dev/null \
    | awk '$1 !~ /build|deploy|debug/ { print $2 "=" $1 }' \
    | render_discovery '{#POD_NAME}'
}

# Download the status document of pod $2 in namespace $1 and report whether
# the API answered with a 404.
refresh_pod_status() {
  local file="$WORKSPACE/$1-$2.status"
  fetch_to_file "https://${ENDPOINT}/api/v1/namespaces/$1/pods/$2/status" "$file"
  if grep -qs '"code": 404' "$file"; then
    echo "Pod_Status=NotFound"
  else
    echo "Success"
  fi
}

# Print the content of status file $1, creating an empty one when missing.
read_status() {
  [[ -f "$1" ]] || echo "" > "$1"
  cat -- "$1"
}

# Print the sum of the first field of every line in file $1 (0 when empty).
sum_lines() {
  awk '{ s += $1 } END { print s + 0 }' "$1" 2>/dev/null
}

# Compare the restart total in status JSON $1 with the counts stored in file
# $2, store the new per-container counts, and print the verdict. All
# containers are summed, not only the first two.
report_restarts() {
  local status=$1 file=$2 previous current
  if [[ ! -f "$file" ]]; then
    # First time this pod is seen: flag it and start counting from zero.
    echo "Warning New Pod"
    echo "0" > "$file"
    return 0
  fi
  previous=$(sum_lines "$file")
  # jq diagnostics are discarded; a failure just leaves an empty counter file.
  printf '%s\n' "$status" \
    | jq -r '.status |.containerStatuses |.[] |.restartCount' > "$file" 2>/dev/null
  current=$(sum_lines "$file")
  if (( current > previous )); then
    echo "Warning restart_count=$current"
  else
    echo "Normal restart_count=$current"
  fi
}

# Print "<phase>_true" when no container reports ready=false, otherwise
# "<phase>_false".
report_running() {
  local status=$1 phase suffix="_true"
  phase=$(jq -r '.status |.phase' <<<"$status" 2>/dev/null)
  if jq -r '.status |.containerStatuses |.[] |.ready' <<<"$status" 2>/dev/null \
    | grep -q false; then
    suffix="_false"
  fi
  echo "${phase}${suffix}"
}

# Entry point: $1 = pod key, $2 = check name.
main() {
  local target=${1:-} check=${2:-}
  local pod namespace="" status
  pod=$(pod_name_of "$target")
  mkdir -p "$WORKSPACE"

  # Every per-pod check needs the pod to be present in the cached pod list.
  if [[ "$pod" != "discover" ]]; then
    namespace=$(lookup_namespace "$pod" "$(namespace_hint_of "$target")")
    if [[ -z "$namespace" ]]; then
      echo "Pod deleted"
      return 0
    fi
  fi

  case "$check" in
    discover)
      discover_pods
      return 0
      ;;
    get_status)
      refresh_pod_status "$namespace" "$pod"
      return 0
      ;;
  esac

  status=$(read_status "$WORKSPACE/${namespace}-${pod}.status")
  if [[ -z "$status" ]]; then
    # No status downloaded yet: report a value that does not raise an alert.
    echo "Running_true Pod_Status=Null"
    return 0
  elif grep -q '"code": 404' <<<"$status"; then
    # Pod is gone but the cached pod list has not been refreshed yet.
    echo "Pod_Status=NotFound"
    return 0
  elif [[ "$(jq -r '.status |.phase' <<<"$status" 2>/dev/null)" == "Pending" ]]; then
    echo "Pending"
    return 0
  fi

  case "$check" in
    restarts) report_restarts "$status" "$WORKSPACE/${namespace}-${pod}.restartCount" ;;
    running) report_running "$status" ;;
    *) echo "Error parameters" ;;
  esac
  return 0
}

# With no arguments only the usage line is printed; nothing else runs.
if (( $# == 0 )); then
  usage
else
  main "$@"
fi
这样POD重启或者新建都会报出来
集群NODE节点监控
主要监控node节点的不健康状态,还有lvm卷容量监控
导入zabbix模板关联上oc master主机
<?xml version="1.0" encoding="UTF-8"?> <zabbix_export> <version>3.2</version> <date>2019-02-27T07:47:32Z</date> <groups> <group> <name>Templates</name> </group> </groups> <templates> <template> <template>OC Node Status</template> <name>OC Node Status</name> <description/> <groups> <group> <name>Templates</name> </group> </groups> <applications> <application> <name>oc_node</name> </application> </applications> <items/> <discovery_rules> <discovery_rule> <name>OC Nodes Discover</name> <type>0</type> <snmp_community/> <snmp_oid/> <key>oc.node.status[discover,discover]</key> <delay>60</delay> <status>0</status> <allowed_hosts/> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <delay_flex/> <params/> <ipmi_sensor/> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <filter> <evaltype>0</evaltype> <formula/> <conditions/> </filter> <lifetime>7</lifetime> <description/> <item_prototypes> <item_prototype> <name>Node {#NODE_NAME} DiskPressure</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.node.status[{#NODE_NAME},DiskPressure]</key> <delay>30</delay> <history>7</history> <trends>0</trends> <status>1</status> <value_type>4</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>oc_node</name> </application> </applications> <valuemap/> <logtimefmt/> 
<application_prototypes/> </item_prototype> <item_prototype> <name>Node {#NODE_NAME} Get Status</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.node.status[{#NODE_NAME},get_status]</key> <delay>30</delay> <history>7</history> <trends>0</trends> <status>0</status> <value_type>4</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications/> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>Node {#NODE_NAME} MemoryPressure</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.node.status[{#NODE_NAME},MemoryPressure]</key> <delay>30</delay> <history>7</history> <trends>0</trends> <status>1</status> <value_type>4</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>oc_node</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>Node {#NODE_NAME} Ready</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> 
<key>oc.node.status[{#NODE_NAME},node_ready]</key> <delay>30</delay> <history>7</history> <trends>0</trends> <status>0</status> <value_type>4</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>oc_node</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>Node {#NODE_NAME} CPU Limits</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.node.status[{#NODE_NAME},node_resources,cpu_limits]</key> <delay>120</delay> <history>7</history> <trends>0</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units>%</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>oc_node</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>Node {#NODE_NAME} CPU Requests</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.node.status[{#NODE_NAME},node_resources,cpu_requests]</key> <delay>120</delay> <history>7</history> 
<trends>0</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units>%</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>oc_node</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>Node {#NODE_NAME} Memory Limits</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.node.status[{#NODE_NAME},node_resources,memory_limits]</key> <delay>120</delay> <history>7</history> <trends>0</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units>%</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>oc_node</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>Node {#NODE_NAME} Memory Requests</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.node.status[{#NODE_NAME},node_resources,memory_requests]</key> <delay>120</delay> <history>7</history> <trends>0</trends> <status>0</status> <value_type>3</value_type> 
<allowed_hosts/> <units>%</units> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>oc_node</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> <item_prototype> <name>Node {#NODE_NAME} OutOfDisk</name> <type>0</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>oc.node.status[{#NODE_NAME},OutOfDisk]</key> <delay>30</delay> <history>7</history> <trends>0</trends> <status>1</status> <value_type>4</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>oc_node</name> </application> </applications> <valuemap/> <logtimefmt/> <application_prototypes/> </item_prototype> </item_prototypes> <trigger_prototypes> <trigger_prototype> <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,cpu_limits].last()}>150</expression> <recovery_mode>0</recovery_mode> <recovery_expression/> <name>Node {#NODE_NAME} CPU Limits 150%</name> <correlation_mode>0</correlation_mode> <correlation_tag/> <url/> <status>0</status> <priority>1</priority> <description/> <type>0</type> 
<manual_close>1</manual_close> <dependencies/> <tags/> </trigger_prototype> <trigger_prototype> <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,cpu_requests].last()}>100</expression> <recovery_mode>0</recovery_mode> <recovery_expression/> <name>Node {#NODE_NAME} CPU Requests 100%</name> <correlation_mode>0</correlation_mode> <correlation_tag/> <url/> <status>0</status> <priority>2</priority> <description/> <type>0</type> <manual_close>1</manual_close> <dependencies/> <tags/> </trigger_prototype> <trigger_prototype> <expression>{OC Node Status:oc.node.status[{#NODE_NAME},DiskPressure].str(DiskPressure_False)}=0</expression> <recovery_mode>0</recovery_mode> <recovery_expression/> <name>Node {#NODE_NAME} DiskPressure</name> <correlation_mode>0</correlation_mode> <correlation_tag/> <url/> <status>1</status> <priority>5</priority> <description/> <type>0</type> <manual_close>1</manual_close> <dependencies/> <tags/> </trigger_prototype> <trigger_prototype> <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,memory_limits].last()}>150</expression> <recovery_mode>0</recovery_mode> <recovery_expression/> <name>Node {#NODE_NAME} Memory Limits 150%</name> <correlation_mode>0</correlation_mode> <correlation_tag/> <url/> <status>0</status> <priority>1</priority> <description/> <type>0</type> <manual_close>1</manual_close> <dependencies/> <tags/> </trigger_prototype> <trigger_prototype> <expression>{OC Node Status:oc.node.status[{#NODE_NAME},MemoryPressure].str(MemoryPressure_False)}=0</expression> <recovery_mode>0</recovery_mode> <recovery_expression/> <name>Node {#NODE_NAME} MemoryPressure</name> <correlation_mode>0</correlation_mode> <correlation_tag/> <url/> <status>1</status> <priority>5</priority> <description/> <type>0</type> <manual_close>1</manual_close> <dependencies/> <tags/> </trigger_prototype> <trigger_prototype> <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,memory_requests].last()}>95</expression> 
<recovery_mode>0</recovery_mode> <recovery_expression/> <name>Node {#NODE_NAME} Memory Requests 95%</name> <correlation_mode>0</correlation_mode> <correlation_tag/> <url/> <status>0</status> <priority>2</priority> <description/> <type>0</type> <manual_close>1</manual_close> <dependencies/> <tags/> </trigger_prototype> <trigger_prototype> <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_ready].str(Ready_True)}=0</expression> <recovery_mode>0</recovery_mode> <recovery_expression/> <name>Node {#NODE_NAME} Not Ready</name> <correlation_mode>0</correlation_mode> <correlation_tag/> <url/> <status>0</status> <priority>5</priority> <description/> <type>0</type> <manual_close>1</manual_close> <dependencies/> <tags/> </trigger_prototype> <trigger_prototype> <expression>{OC Node Status:oc.node.status[{#NODE_NAME},OutOfDisk].str(OutOfDisk_False)}=0</expression> <recovery_mode>0</recovery_mode> <recovery_expression/> <name>Node {#NODE_NAME} OutOfDisk</name> <correlation_mode>0</correlation_mode> <correlation_tag/> <url/> <status>1</status> <priority>5</priority> <description/> <type>0</type> <manual_close>1</manual_close> <dependencies/> <tags/> </trigger_prototype> </trigger_prototypes> <graph_prototypes/> <host_prototypes/> </discovery_rule> </discovery_rules> <httptests/> <macros/> <templates/> <screens/> </template> </templates> </zabbix_export>
zabbix客户端配置
修改zabbix_agentd.conf
Timeout=30
UserParameter=oc.node.status[*],/data/app/zabbix/etc/oc_node_monitor.sh $1 $2 $3
oc_node_monitor.sh的内容
#!/bin/bash
#
# oc_node_monitor.sh - Zabbix UserParameter helper that reports OpenShift node
# health and resource allocation.
#
# Usage: oc_node_monitor.sh <node> <check> [metric]
#   <check>  discover        print Zabbix LLD JSON listing every node
#            get_status      download the node document used by the checks below
#            OutOfDisk | MemoryPressure | DiskPressure
#                            print "<Condition>_<status>"
#            node_ready      print "Ready_<status>"
#            node_resources  print an allocation percentage; [metric] is one of
#                            cpu_requests, cpu_limits, memory_requests,
#                            memory_limits
#
# node_resources reads <WORKSPACE>/<node>.resources, which this script does not
# create; it has to be written by a separate job.

# Fill these in before deploying: API bearer token and master host[:port].
TOKEN=""
ENDPOINT=""
# Directory holding the cached node documents and the *.resources files.
WORKSPACE="/data/tmp/oc_monitor"

# Print a usage line on stderr.
usage() {
  printf 'Usage: %s <node|discover> <check> [metric]\n' "${0##*/}" >&2
  return 2
}

# GET URL $1 from the API and print the response body.
# NOTE(review): -k disables TLS certificate verification (kept from the
# original); prefer --cacert with the cluster CA.
api_get() {
  curl -k \
    -H "Authorization: Bearer $TOKEN" \
    -H 'Accept: application/json' \
    "$1" 2>/dev/null
}

# Download URL $1 into file $2 via a temporary file and a rename, so a check
# running at the same time never reads a half-written document.
fetch_to_file() {
  local url=$1 dest=$2 tmp
  tmp=$(mktemp "${dest}.XXXXXX" 2>/dev/null) || tmp=$dest
  api_get "$url" > "$tmp"
  if [[ "$tmp" != "$dest" ]]; then
    mv -f -- "$tmp" "$dest"
  fi
}

# Read values from stdin (one per line) and print them as Zabbix low-level
# discovery JSON, using $1 as the macro name.
render_discovery() {
  local macro=$1 value first=1
  printf '{\n\t"data":[\n'
  while IFS= read -r value; do
    [[ -n "$value" ]] || continue
    (( first )) || printf ',\n'
    first=0
    printf '\t\t{\n\t\t\t"%s":"%s"}' "$macro" "$value"
  done
  (( first )) || printf '\n'
  printf '\t]\n}\n'
}

# Print one discovery entry per node name returned by the API.
discover_nodes() {
  api_get "https://$ENDPOINT/api/v1/nodes" \
    | jq -r '.items|.[]|.metadata|.name' 2>/dev/null \
    | render_discovery '{#NODE_NAME}'
}

# Download the document of node $1 and report how the download went.
refresh_node_status() {
  local file="$WORKSPACE/$1.status"
  fetch_to_file "https://${ENDPOINT}/api/v1/nodes/$1" "$file"
  if grep -qs '"code": 404' "$file"; then
    echo "Node_Status=NotFound"
  elif [[ -z "$(cat -- "$file" 2>/dev/null)" ]]; then
    echo "Node_Status=null"
  else
    echo "Success"
  fi
}

# Print the status ("True"/"False"/"Unknown") of the condition whose type is
# $2 for node $1. The condition is selected by its type field rather than by
# its position in the list, so the result does not depend on how many
# conditions the cluster reports or in which order. Prints nothing when the
# condition or the cached document is missing.
node_condition() {
  jq -r --arg type "$2" \
    '.status.conditions[] | select(.type == $type) | .status' \
    "$WORKSPACE/$1.status" 2>/dev/null | head -n 1
}

# Print "<label>_<status>". $1 = label, $2 = healthy status, $3 = observed
# status. An empty observed status is reported as the healthy one, so missing
# data does not raise an alert.
format_condition() {
  printf '%s_%s\n' "$1" "${3:-$2}"
}

# Print the column of the *.resources line that holds metric $1; fail for an
# unknown metric.
resource_column() {
  case "$1" in
    cpu_requests) echo 2 ;;
    cpu_limits) echo 4 ;;
    memory_requests) echo 6 ;;
    memory_limits) echo 8 ;;
    *) return 1 ;;
  esac
}

# Print the digits found in column $2 of the first line of file $1.
read_percent() {
  awk -v col="$2" 'NR == 1 { v = $col; gsub(/[^0-9]/, "", v); print v }' \
    "$1" 2>/dev/null
}

# Print the allocation percentage of metric $2 for node $1, or 0 when it cannot
# be read. Prints nothing for an unknown metric.
node_resource_percent() {
  local file="$WORKSPACE/$1.resources" column value
  # The file may be empty while it is being regenerated; give the writer a
  # moment before reading it.
  if [[ -z "$(read_percent "$file" 2)" ]]; then
    sleep 1
  fi
  column=$(resource_column "$2") || return 0
  value=$(read_percent "$file" "$column")
  # 10# keeps values with leading zeros from being parsed as octal.
  printf '%d\n' "$(( 10#${value:-0} ))"
}

# Entry point: $1 = node name, $2 = check name, $3 = resource metric.
main() {
  local node=${1:-} check=${2:-} metric=${3:-}
  mkdir -p "$WORKSPACE"
  case "$check" in
    discover)
      discover_nodes
      ;;
    get_status)
      refresh_node_status "$node"
      ;;
    OutOfDisk)
      # Node is out of disk space.
      format_condition OutOfDisk False "$(node_condition "$node" OutOfDisk)"
      ;;
    MemoryPressure)
      # Node is under memory pressure.
      format_condition MemoryPressure False "$(node_condition "$node" MemoryPressure)"
      ;;
    DiskPressure)
      # Node is under disk pressure.
      format_condition DiskPressure False "$(node_condition "$node" DiskPressure)"
      ;;
    node_ready)
      # Node is ready to run pods.
      format_condition Ready True "$(node_condition "$node" Ready)"
      ;;
    node_resources)
      node_resource_percent "$node" "$metric"
      ;;
  esac
  return 0
}

# With no arguments only the usage line is printed; nothing else runs.
if (( $# == 0 )); then
  usage
else
  main "$@"
fi
crontab -e
*/2 * * * * /data/scripts/oc_master_crontab.sh >/dev/null 2>&1
oc_master_crontab.sh内容
#!/bin/bash
#
# oc_master_crontab.sh - for every node listed by "oc get node", save the line
# that precedes "Events" in "oc describe node" output to
# /data/tmp/oc_monitor/<node>.resources.
#
# The explicit bash shebang matters: the script relies on bash-only syntax and
# would otherwise be interpreted by whatever /bin/sh cron uses.

# Directory that receives one <node>.resources file per node.
OC_MONITOR_DIR="/data/tmp/oc_monitor"

# Print the node names, one per line (first column of "oc get node", header
# line skipped).
list_nodes() {
  oc get node | awk 'NR > 1 { print $1 }'
}

# Filter "oc describe node" output on stdin down to the line(s) directly above
# any line containing "Events".
allocated_totals() {
  grep -B 1 "Events" | grep -v "Events"
}

# Write the filtered describe output of node $1 to $2/<node>.resources.
# The data goes to a temporary file first and is renamed into place, so a
# reader never sees the file empty while "oc describe" is still running.
write_node_resources() {
  local node=$1 dir=$2 tmp
  tmp=$(mktemp "$dir/.${node}.resources.XXXXXX") || return 1
  oc describe node "$node" | allocated_totals > "$tmp"
  # mktemp creates the file owner-only; make it readable before publishing it.
  chmod 644 "$tmp"
  mv -f -- "$tmp" "$dir/${node}.resources"
}

# Entry point: refresh the resources file of every node.
main() {
  local node
  if ! command -v oc >/dev/null 2>&1; then
    printf '%s: oc CLI not found in PATH\n' "${0##*/}" >&2
    return 1
  fi
  mkdir -p "$OC_MONITOR_DIR" || return 1
  # The node list is read on fd 3 so that oc inside the loop cannot consume it.
  while IFS= read -r node <&3; do
    [[ -n "$node" ]] || continue
    write_node_resources "$node" "$OC_MONITOR_DIR"
  done 3< <(list_nodes)
  # NOTE(review): world-writable permissions on all of /data/tmp are kept from
  # the original (now applied once instead of once per node), presumably so
  # another account can create files under the monitor directory. Confirm the
  # consumer's user and narrow this to that directory with a tighter mode.
  chmod -R 777 /data/tmp/
}

main "$@"