Kubernetes Prometheus rule

告警规则


常用告警规则配置

  • alerts

    # CPU alert rules
    groups:
    - name: CpuAlertRule
      rules:
      # Pod CPU: fires (severity warning) when any of the three per-pod CPU
      # series stays above 80 for two minutes. twocore:pod is divided by 2 and
      # squarecore:pod by 4 before the comparison (presumably the number of
      # cores granted to those workloads; TODO confirm).
      - alert: PodCPU告警
        expr: >-
          onecore:pod > 80 or twocore:pod / 2 > 80 or squarecore:pod / 4 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          description: 'CPU使用率大于80%'
          value: '{{$value}}%'
          # Disabled alternative summary, kept verbatim for reference:
          #summary: 'CPU使用率大于80%,当前值为{{.Value}}%,CPU使用率: {{ printf `ceil(100 - ((avg by (instance)(irate(node_cpu_seconds_total{mode="idle",instance="%s"}[1m]))) *100))` $labels.instance | query | first | value }}%'
      # Node CPU: 100 minus the average idle-mode irate (5m window) per
      # kubernetes_node, scaled to a percentage and rounded; fires above 80
      # for two minutes.
      - alert: NodeCPU告警
        expr: >-
          round(100-avg(irate(node_cpu_seconds_total{mode="idle"}[5m]))by(kubernetes_node)*100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          description: 'CPU使用率大于80%'
          value: '{{$value}}%'
          # Disabled alternative summary, kept verbatim for reference:
          #summary: 'CPU使用率大于80%,当前值为{{.Value}}%,CPU使用率: {{ printf `ceil(100 - ((avg by (instance)(irate(node_cpu_seconds_total{mode="idle",instance="%s"}[1m]))) *100))` $labels.instance | query | first | value }}%'
    
    # Disk alert rules
    - name: DiskAlertRule
      rules:
      # Pod disk: container filesystem usage converted from bytes to GiB,
      # divided by 10 and scaled to a percentage (presumably a fixed 10 GiB
      # budget per container; TODO confirm). Series with an empty container
      # label or container "POD" are excluded. Fires above 85 for one minute.
      - alert: Pod磁盘告警
        expr: >-
          round(container_fs_usage_bytes{container=~".+",container!~"POD"}/1024/1024/1024/10*100) > 85
        for: 1m
        labels:
          severity: warning
        annotations:
          description: '磁盘使用率大于85%'
          value: '{{$value}}%'
      # Node disk: (1 - available/size) as a rounded percentage for
      # filesystems whose fstype matches ext.+ or nfs.+, skipping mountpoints
      # that contain "docker". Fires above 85 for one minute.
      - alert: Node磁盘告警
        expr: >-
          round((1- node_filesystem_avail_bytes{fstype=~"ext.+|nfs.+",mountpoint!~".*docker.*"}/node_filesystem_size_bytes{fstype=~"ext.+|nfs.+",mountpoint!~".*docker.*"})*100) > 85
        for: 1m
        labels:
          severity: warning
        annotations:
          description: '磁盘使用率大于85%'
          value: '{{$value}}%'
    
    # Memory alert rules
    - name: MemAlertRule
      rules:
      # Pod memory: usage / limit as a rounded percentage; fires above 85 for
      # two minutes. Excludes empty container labels, "POD", containers whose
      # name ends in "reload", and pods whose name starts with "csi".
      # - Both selectors now use the same exclusion regex (the denominator
      #   previously had ".reload" instead of ".+reload").
      # - The "> 0" filter drops series whose limit is 0, so the division can
      #   no longer produce +Inf and fire a permanent false alert. A limit of
      #   0 is presumably how containers without a memory limit are reported;
      #   confirm against your cAdvisor version.
      - alert: Pod内存告警
        expr: round(container_memory_usage_bytes{container=~".+",container!~"POD|.+reload",pod!~"^csi.+"}/(container_spec_memory_limit_bytes{container=~".+",container!~"POD|.+reload",pod!~"^csi.+"} > 0)*100) > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "内存使用率大于85%"
          value: "{{$value}}%"
      # Node memory: 100 - MemAvailable/MemTotal as a rounded percentage;
      # fires above 80 for two minutes. The description text now states the
      # same 80% threshold the expression uses (it previously said 85%).
      - alert: Node内存告警
        expr: round(100-((node_memory_MemAvailable_bytes*100)/node_memory_MemTotal_bytes)) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          description: "内存使用率大于80%"
          value: "{{$value}}%"
    
    # Unexpected pod restarts
    - name: PodRestartAlertRule
      rules:
      # Fires when a container's restart counter grew within the last minute.
      # The metric is a counter (*_total), so increase() is used instead of
      # delta(): delta() is meant for gauges and does not account for counter
      # resets.
      # NOTE(review): a 1m window needs at least two samples inside it, so it
      # only works with a scrape interval well below one minute — confirm.
      - alert: Pod重启告警
        expr: increase(kube_pod_container_status_restarts_total[1m]) > 0
        for: 1s
        labels:
          severity: warning
        annotations:
          description: "Pod发生意外重启事件"
    
    # JVM old-generation occupancy (CMS old GC)
    - name: PodJvmOldGCAlertRule
      rules:
      # Used / max bytes of every memory pool whose name ends in "Old Gen",
      # as a rounded percentage; fires above 89 for five seconds.
      - alert: PodJvmCMSOldGC
        expr: >-
          round((jvm_memory_pool_bytes_used{pool=~".+Old Gen"}/jvm_memory_pool_bytes_max{pool=~".+Old Gen"})*100) > 89
        for: 5s
        labels:
          severity: warning
        annotations:
          description: 'Pod堆内存触发CMSOldGC'
          value: '{{$value}}%'
    
    # Abnormal pod instance
    - name: ContainerInstanceAlertRule
      rules:
      # Fires when a container is running but not ready for 20 seconds
      # (running = 1, ready = 0, so running - ready = 1).
      # The operands were previously reversed (ready - running > 0), which
      # requires a container that is ready without running and therefore
      # never matched.
      - alert: Pod实例异常
        expr: kube_pod_container_status_running - kube_pod_container_status_ready > 0
        for: 20s
        labels:
          severity: warning
        annotations:
          description: "Container实例异常"
    
    # Container terminated by the OOM killer
    - name: ContainerOOMAlertRule
      rules:
      # Fires when any container reports the termination reason "OOMKilled".
      - alert: Pod实例OOM
        expr: >-
          kube_pod_container_status_terminated_reason{reason="OOMKilled"} > 0
        for: 1s
        labels:
          severity: warning
        annotations:
          description: 'Container实例OOM'
    
    # Container evicted
    - name: ContainerEvictionAlertRule
      rules:
      # Fires when any container reports the termination reason "Evicted".
      - alert: Pod实例驱逐
        expr: >-
          kube_pod_container_status_terminated_reason{reason="Evicted"} > 0
        for: 1s
        labels:
          severity: warning
        annotations:
          description: 'Container实例驱逐'
    
    # RabbitMQ memory alerts
    - name: MQMemoryAlertRule
      rules:
      # Fires when a RabbitMQ node (job name containing "rabbitmq") reports
      # its memory alarm flag as 1.
      - alert: MQ内存水位线
        expr: rabbitmq_node_mem_alarm{job=~".*rabbitmq.*"} == 1
        for: 1s
        labels:
          severity: warning
        annotations:
          description: "RabbitMQ内存高水位线告警"
          # Plain Prometheus template, like the {{$value}} annotations in the
          # other rules. The previous Helm-style escape ({{`...`}}) would be
          # rendered by Prometheus as the literal template text instead of
          # the instance label.
          summary: "RabbitMQ {{ $labels.instance }} High Memory Alarm is going off.  Which means the node hit highwater mark and has cut off network connectivity, see RabbitMQ WebUI"
      # Fires when average used/limit memory per (node, kubernetes_namespace),
      # as a rounded percentage, stays above 90 for ten seconds.
      - alert: MQ内存使用告警
        expr: round(avg(rabbitmq_node_mem_used{job=~".*rabbitmq.*"} / rabbitmq_node_mem_limit{job=~".*rabbitmq.*"})by(node,kubernetes_namespace)*100) > 90
        for: 10s
        labels:
          severity: warning
        annotations:
          description: "RabbitMQ使用告警"
          value: "{{$value}}%"
          # The aggregation keeps only the node and kubernetes_namespace
          # labels, so the summary uses $labels.node; $labels.instance would
          # always be empty here.
          summary: "RabbitMQ {{ $labels.node }} Memory Usage > 90%"
    
    # Java process inside a pod is down
    - name: PodJavaProcessAlertRule
      rules:
      # Sums the "up" series of the kubernetes-pods-jvm job per container and
      # pod name; fires when that sum stays at 0 for ten seconds.
      - alert: PodJava进程异常
        expr: >-
          sum(up{job="kubernetes-pods-jvm"})by(kubernetes_container_name,kubernetes_pod_name) == 0
        for: 10s
        labels:
          severity: warning
        annotations:
          description: 'PodJava进程异常'
          summary: '赶快看看吧,顶不住了'
  • recording_rules

    # Recording rules for per-container CPU usage.
    # Each rule takes the 5m irate of container_cpu_usage_seconds_total,
    # multiplies it by 100, sums it by (pod, container, instance, namespace,
    # name) and rounds the result. The three rules differ only in which
    # containers they select.
    # NOTE(review): "pord-ingress" appears in both name lists; it may be a
    # typo for "prod-ingress" — verify against the real container name.
    groups:
    - name: CpuRecordRules
      rules:
      # Every container except: empty container label, "POD", and the
      # containers selected by twocore:pod and squarecore:pod below.
      - record: onecore:pod
        expr: >-
          round(sum by(pod, container, instance, namespace, name) (irate(container_cpu_usage_seconds_total{container!~"|POD|prod-xianxiang-edu-loan|prod-xy-fund|prod-common-callcenter|prod-risk-service|prod-qn-web-api|prod-xc-fund|prod-xc-user|sys-ingress|etcd|prod-qn-mp|prod-xc-common|prod-xianxiang-zuul|prod-qn-user|prod-xc-riskapi|prod-common-message|prod-common-trust-service|prod-xc-collection|kube-controller-manager|prod-qn-risk|prod-xy-zuul|metrics-server|prod-nflow-manager|kube-scheduler|prod-qn-gateway|prod-xc-pay|coredns|kube-apiserver|prod-qn-oms|prod-common-service|prod-nfsp-service|pord-ingress|prod-qn-cms|prod-internal-ingress|prod-xc-loan|prod-rabbitmq|prometheus-server"}[5m]) * 100))
      # Explicitly listed containers (the alert rule divides this series
      # by 2).
      - record: twocore:pod
        expr: >-
          round(sum by(pod, container, instance, namespace, name) (irate(container_cpu_usage_seconds_total{container=~"prod-xianxiang-edu-loan|prod-xy-fund|prod-common-callcenter|prod-risk-service|prod-qn-web-api|prod-xc-fund|prod-xc-user|sys-ingress|etcd|prod-qn-mp|prod-xc-common|prod-xianxiang-zuul|prod-qn-user|prod-xc-riskapi|prod-common-message|prod-common-trust-service|prod-xc-collection|kube-controller-manager|prod-qn-risk|prod-xy-zuul|metrics-server|prod-nflow-manager|kube-scheduler|prod-qn-gateway|prod-xc-pay|coredns|kube-apiserver|prod-qn-oms|prod-common-service|prod-nfsp-service|pord-ingress|prod-qn-cms|prod-internal-ingress|prod-xc-loan"}[5m]) * 100))
      # prod-rabbitmq and prometheus-server only (the alert rule divides this
      # series by 4).
      - record: squarecore:pod
        expr: >-
          round(sum by(pod, container, instance, namespace, name) (irate(container_cpu_usage_seconds_total{container=~"prod-rabbitmq|prometheus-server"}[5m]) * 100))

     

posted @ 2022-06-12 11:44  MacoPlus  阅读(616)  评论(0)  编辑  收藏  举报