# prometheus.rules template

# Prometheus alerting rules.
#
# Every rule group is listed under ONE top-level `groups` key. Repeating the
# key is a duplicate mapping key: strict YAML loaders reject the file and
# lenient ones silently keep only the last group.
groups:
  # Host-level alerts. Most expressions are multiplied by node_uname_info
  # (joined on `instance`) purely to copy the `nodename` label onto the
  # result so the annotations can print it.
  - name: 服务器告警
    rules:
      # Scrape target has reported up == 0 for 3 minutes.
      - alert: 服务器宕机告警
        expr: up == 0
        for: 3m
        annotations:
          summary: "Alerting {{$labels.instance}}宕机!"
          description: "环境{{$labels.job}} 服务器{{$labels.instance}}已宕机!"

      # CPU usage (100 - idle%) above 85% for 5 minutes.
      # `job` is kept in the aggregation because the summary template reads
      # {{$labels.job}}; aggregating by `instance` alone would drop it.
      - alert: cpu使用率过高告警
        expr: (100 - (avg by (instance, job) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) * on(instance) group_left(nodename) (node_uname_info) > 85
        for: 5m
        annotations:
          summary: "Alerting环境{{$labels.job}} {{$labels.instance}}({{$labels.nodename}})CPU使用率过高!"
          description: '服务器{{$labels.instance}}({{$labels.nodename}})CPU使用率超过85%(目前使用:{{printf "%.2f" $value}}%)'

      # 1-minute load average divided by the number of CPUs, above 4 for
      # 3 minutes. The count over mode="system" series yields one per CPU.
      - alert: 系统负载过高
        expr: (node_load1 / count without (cpu, mode) (node_cpu_seconds_total{mode="system"})) * on(instance) group_left(nodename) (node_uname_info) > 4
        for: 3m
        annotations:
          summary: "Alerting环境{{$labels.job}} {{$labels.instance}}({{$labels.nodename}})系统负载过高!"
          description: '{{$labels.instance}}({{$labels.nodename}})当前负载超标率 {{printf "%.2f" $value}}'

      # Memory usage (100 - available%) above 88% for 3 minutes.
      # The description text states the same 88% that the expression uses.
      - alert: 内存不足告警
        expr: (100 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100) * on(instance) group_left(nodename) (node_uname_info) > 88
        for: 3m
        annotations:
          summary: "Alerting环境{{$labels.job}} {{$labels.instance}}({{$labels.nodename}})内存使用率过高!"
          description: '服务器{{$labels.instance}}({{$labels.nodename}})内存使用率超过88%(目前使用:{{printf "%.2f" $value}}%)'

      # ext4/xfs filesystem usage above 85% for 3 minutes.
      # The description text states the same 85% that the expression uses.
      - alert: 硬盘空间不足告警
        expr: (100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100)) * on(instance) group_left(nodename) (node_uname_info) > 85
        for: 3m
        annotations:
          summary: "Alerting环境{{$labels.job}} {{$labels.instance}}({{$labels.nodename}})硬盘使用率过高!"
          description: '服务器{{$labels.instance}}({{$labels.nodename}})硬盘使用率超过85%(目前使用:{{printf "%.2f" $value}}%)'

      # More than 1000 established TCP connections for 2 minutes.
      # Earlier variant without the nodename join:
      #   expr: node_netstat_Tcp_CurrEstab > 1000
      - alert: TCP连接数
        expr: (node_netstat_Tcp_CurrEstab) * on(instance) group_left(nodename) (node_uname_info) > 1000
        for: 2m
        annotations:
          summary: "Alerting环境{{$labels.job}} {{$labels.instance}} TCP ESTABLISHED连接数过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED当前连接数{{ $value }}"

  # Container restart alert: the restart counter increased within the last
  # 5 minutes, sustained for 2 minutes.
  # Earlier variants based on cAdvisor's container_last_seen:
  #   expr: container_last_seen{image!="",id!=""} == 0
  #   expr: container_last_seen{id!=""} == 0
  - name: Docker_pod
    rules:
      - alert: Docker_pod_Down
        expr: rate(kube_pod_container_status_restarts_total[5m]) > 0
        for: 2m
        annotations:
          description: "{{ $labels.job }}环境报警 {{ $labels.container }}  {{ $labels.pod }} 重启"
          summary: 'Instance {{ $labels.pod }} 重启中'

  # Pod stuck in a non-Running phase for 2 minutes.
  # The metric is compared directly: rate() requires a range vector, and the
  # original expression passed it an instant vector, which is not valid PromQL.
  # NOTE(review): phase!="Running" also matches "Succeeded" (completed) pods;
  # confirm whether those should be excluded.
  # Earlier variant:
  #   expr: container_last_seen{image!="",id!=""} == 0
  - name: Docker_pod2
    rules:
      - alert: Docker_pod_Down
        expr: kube_pod_status_phase{phase!="Running"} > 0
        for: 2m
        annotations:
          description: "{{ $labels.job }}环境报警 {{ $labels.container }}  {{ $labels.pod }} 启动失败"
          summary: 'Instance {{ $labels.pod }} 启动失败'
# posted @ 2023-12-13 16:16  A学无止境A  阅读(58)  评论(0)  编辑  收藏  举报