5.Prometheus设定报警规则

1.主机及服务存活状态设置报警
2.内存使用率设置报警
3.cpu繁忙百分比设置报警
4.cpu iowait报警
5.disk 使用率百分比报警
6.网卡流量监控报警

1.主机及服务存活状态设置报警


1.1定义主机标签

 # Scrape job with one static target; this list item belongs under the
 # `scrape_configs:` key of prometheus.yml. All keys of the item must line up
 # with `job_name`, otherwise the YAML does not parse.
 - job_name: pre-yzfs-node-47.100.70.42
   static_configs:
     - targets: ['172.16.40.153:19100']
       # Extra labels attached to every series scraped from this target.
       labels:
         node_name: cloud-pre-47.100.70.42
         cluster_name: pre

1.2设定规则

groups:
# Liveness rules: alert when a scrape target stops answering.
- name: 实例存活告警规则
  rules:
  - alert: 实例存活告警
    # `up` is 0 when the most recent scrape of a target failed. Matching every
    # job (rather than only job="prometheus" / job="node") also covers scrape
    # jobs that carry a custom job_name.
    expr: up == 0
    for: 1m
    labels:
      user: prometheus
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.instance }} is down"
      description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
      value: "{{ $value }}"

2.内存使用率设置报警

公式:
(total内存-(free内存+buffer内存+cache内存))/total内存*100
通过公式计算出使用内存的百分比

groups:
# Memory rules: used memory as a percentage of total memory.
- name: 内存告警规则
  rules:
  - alert: "内存使用率告警"
    # used% = (MemTotal - (MemFree + Buffers + Cached)) / MemTotal * 100
    expr: >-
      (node_memory_MemTotal_bytes
      - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes))
      / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Identify the host by its `instance` label and quote the same 80%
      # threshold that the expression actually uses.
      summary: "服务器: {{ $labels.instance }} 内存报警"
      description: "{{ $labels.instance }} 内存资源利用率大于80%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"

3.cpu繁忙百分比设置报警

公式:
(1-空闲状态cpu时间/所有状态cpu时间)*100

groups:
# CPU rules: busy percentage = 100 - idle percentage, averaged over all cores.
- name: CPU报警规则
  rules:
  - alert: CPU使用率告警
    # avg by (instance) collapses the per-core idle rates into one value per
    # host; the parentheses make the evaluation order explicit.
    expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 70
    # Alternative based on increase():
    # expr: (1-sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance)/sum(increase(node_cpu_seconds_total[1m])) by (instance))*100 > 70
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Name the affected host via `instance`; `alertname` would only repeat
      # the alert's own name.
      summary: "服务器: {{ $labels.instance }} CPU报警"
      description: "服务器: {{ $labels.instance }} CPU使用超过70%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"

4.cpu iowait报警

公式:
(cpu_iowait_time[1m]/cpu_total[1m])*100

groups:
# CPU iowait rules: share of CPU time spent waiting for I/O.
- name: CPU Iowait 报警规则
  rules:
  - alert: CPU Iowait 报警
    # iowait% = iowait CPU seconds / all CPU seconds over the last minute.
    # NOTE(review): the 1m window needs at least two scrapes inside it;
    # confirm the scrape interval is well below one minute.
    expr: >-
      (sum by (instance) (increase(node_cpu_seconds_total{mode="iowait"}[1m]))
      / sum by (instance) (increase(node_cpu_seconds_total[1m]))) * 100 > 60
    for: 30s
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Name the affected host via `instance` instead of the alert's own name.
      summary: "服务器: {{ $labels.instance }} CPU Iowait 报警"
      description: "服务器: {{ $labels.instance }} CPU Iowait 超过60%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"

5.disk 使用率百分比报警

公式:
(disk_total_size-disk_avail_size)/disk_total_size *100

groups:
# Disk rules: used space as a percentage of filesystem size.
- name: 磁盘报警规则
  rules:
  - alert: 磁盘使用率告警
    # used% = (size - avail) / size * 100, evaluated per filesystem series.
    expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Report the host (`instance`) and fill in the device name that the
      # description announces.
      summary: "服务器: {{ $labels.instance }} 磁盘报警"
      description: "服务器:{{ $labels.instance }},磁盘设备: {{ $labels.device }} 使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
      value: "{{ $value }}"

6.网卡流量监控报警

公式:
(进网流量[1m]+出网流量[1m])/1024(KB)/1024(MB)

groups:
# Network rules: combined receive + transmit throughput per interface.
- name: 网卡流量监控
  rules:
  - alert: 网卡流量
    # Both directions use the same 1m window so the two rates are comparable.
    # Label regexes are fully anchored, so virtual bridges need `virbr.*`
    # (`virbr*` would not match virbr0) and loopback is simply `lo`.
    # Dividing bytes/s by 1024 twice gives MiB/s.
    expr: >-
      (irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[1m])
      + irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[1m]))
      / 1024 / 1024 > 4
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Name the host and interface; the closing parenthesis is now balanced.
      summary: "服务器: {{ $labels.instance }} 网卡流量报警"
      description: "服务器:{{ $labels.instance }},网卡 {{ $labels.device }} 流量超过4M!(当前值: {{ $value }}M)"
      value: "{{ $value }}"

总配置文件

[root@iZuf6fzcihc5izn2c1vz9yZ rules]# cat node_status.yml 
groups:
# Liveness rules: alert when a scrape target stops answering.
- name: 实例存活告警规则
  rules:
  - alert: 实例存活告警
    # `up` is 0 when the most recent scrape of a target failed. Matching every
    # job (rather than only job="prometheus" / job="node") also covers scrape
    # jobs that carry a custom job_name.
    expr: up == 0
    for: 1m
    labels:
      user: prometheus
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.instance }} is down"
      description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
      value: "{{ $value }}"

# Memory rules: used memory as a percentage of total memory.
- name: 内存告警规则
  rules:
  - alert: "内存使用率告警"
    # used% = (MemTotal - (MemFree + Buffers + Cached)) / MemTotal * 100
    expr: >-
      (node_memory_MemTotal_bytes
      - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes))
      / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Identify the host by its `instance` label and quote the same 80%
      # threshold that the expression actually uses.
      summary: "服务器: {{ $labels.instance }} 内存报警"
      description: "{{ $labels.instance }} 内存资源利用率大于80%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"

# CPU rules: busy percentage = 100 - idle percentage, averaged over all cores.
- name: CPU报警规则
  rules:
  - alert: CPU使用率告警
    # avg by (instance) collapses the per-core idle rates into one value per
    # host; the parentheses make the evaluation order explicit.
    expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 70
    # Alternative based on increase():
    # expr: (1-sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance)/sum(increase(node_cpu_seconds_total[1m])) by (instance))*100 > 70
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Name the affected host via `instance`; `alertname` would only repeat
      # the alert's own name.
      summary: "服务器: {{ $labels.instance }} CPU报警"
      description: "服务器: {{ $labels.instance }} CPU使用超过70%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"

# CPU iowait rules: share of CPU time spent waiting for I/O.
- name: CPU Iowait 报警规则
  rules:
  - alert: CPU Iowait 报警
    # iowait% = iowait CPU seconds / all CPU seconds over the last minute.
    # NOTE(review): the 1m window needs at least two scrapes inside it;
    # confirm the scrape interval is well below one minute.
    expr: >-
      (sum by (instance) (increase(node_cpu_seconds_total{mode="iowait"}[1m]))
      / sum by (instance) (increase(node_cpu_seconds_total[1m]))) * 100 > 60
    for: 30s
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Name the affected host via `instance` instead of the alert's own name.
      summary: "服务器: {{ $labels.instance }} CPU Iowait 报警"
      description: "服务器: {{ $labels.instance }} CPU Iowait 超过60%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"

# Disk rules: used space as a percentage of filesystem size.
- name: 磁盘报警规则
  rules:
  - alert: 磁盘使用率告警
    # used% = (size - avail) / size * 100, evaluated per filesystem series.
    expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Report the host (`instance`) and fill in the device name that the
      # description announces.
      summary: "服务器: {{ $labels.instance }} 磁盘报警"
      description: "服务器:{{ $labels.instance }},磁盘设备: {{ $labels.device }} 使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
      value: "{{ $value }}"

# Network rules: combined receive + transmit throughput per interface.
- name: 网卡流量监控
  rules:
  - alert: 网卡流量
    # Both directions use the same 1m window so the two rates are comparable.
    # Label regexes are fully anchored, so virtual bridges need `virbr.*`
    # (`virbr*` would not match virbr0) and loopback is simply `lo`.
    # Dividing bytes/s by 1024 twice gives MiB/s.
    expr: >-
      (irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[1m])
      + irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[1m]))
      / 1024 / 1024 > 4
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Name the host and interface; the closing parenthesis is now balanced.
      summary: "服务器: {{ $labels.instance }} 网卡流量报警"
      description: "服务器:{{ $labels.instance }},网卡 {{ $labels.device }} 流量超过4M!(当前值: {{ $value }}M)"
      value: "{{ $value }}"

# Load rules: 15-minute load average per host.
- name: 系统15分钟负载报警规则
  rules:
  - alert: 系统15分钟负载告警
    # The group and alert are named for the 15-minute load, so query
    # node_load15 rather than the 5-minute node_load5.
    # NOTE(review): 5.6 presumably corresponds to 70% of an 8-core host;
    # confirm the threshold against the real core count.
    expr: node_load15 > 5.6
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      # Load average is an absolute number, not a percentage, so the value is
      # printed without a % sign; the host is named via `instance`.
      summary: "服务器: {{ $labels.instance }} 系统负载报警"
      description: "服务器:{{ $labels.instance }},系统15分钟平均负载超过5.6!(当前值: {{ $value }})"
      value: "{{ $value }}"

posted @ 2023-01-24 20:38  老夫聊发少年狂88  阅读(842)  评论(0)  编辑  收藏  举报