Prometheus告警规则

blackbox_rules.yml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
groups:
- name: blackbox_alert
  rules:
  # Fires when a probe has reported probe_success == 0 continuously for
  # 5 minutes; the probed target is taken from the `instance` label.
  - alert: blackbox_alert
    expr: 'probe_success == 0'
    for: 5m
    annotations:
      description: '请尽快检测'
      summary: '接口/主机/端口 {{ $labels.instance }}  无法联通'
    labels:
      severity: critical
 
  - alert: "ssl证书过期警告"
    expr: (probe_ssl_earliest_cert_expiry - time())/86400 <30
    for: 1h
    labels:
      severity: warn
    annotations:
      description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
      summary: "ssl证书过期警告"

  k8s_rules.yml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
groups:
- name: node.rules
  rules:
  # JobDown: watches job status. If a target's metrics cannot be scraped
  # for 5 minutes the alert is sent on to Alertmanager.
  # `up` is 0 when the target is unhealthy and 1 when it is healthy.
  - alert: JobDown
    expr: 'up == 0'
    # Duration: the condition must hold for 5 minutes before firing.
    for: 5m
    annotations:
      description: 'Instance:{{ $labels.instance }}, Job {{ $labels.job }} stop '
      summary: 'Job: {{ $labels.job }} down'
    labels:
      cluster: k8s
      severity: error
  - alert: PodDown
    expr: kube_pod_container_status_running != 1 
    for: 2s
    labels:
      severity: warning
      cluster: k8s
    annotations:
      summary: 'Container: {{ $labels.container }} down'
      description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} is not running'
  - alert: PodReady
    expr: kube_pod_container_status_ready != 1 
    for: 5m   #Ready持续5分钟,说明启动有问题
    labels:
      severity: warning
      cluster: k8s
    annotations:
      summary: 'Container: {{ $labels.container }} ready'
      description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} always ready for 5 minitue'
  - alert: PodRestart
    expr: changes(kube_pod_container_status_restarts_total[30m])>0 #最近30分钟pod重启
    for: 2s
    labels:
      severity: warning
      cluster: k8s
    annotations:
      summary: 'Container: {{ $labels.container }} restart'
      description: 'namespace: {{ $labels.namespace }}, pod: {{ $labels.pod }} restart {{ $value }} times'

  node_rules.yml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
groups:
# Group name.
- name: 主机状态-监控告警
  rules:
  # Host liveness: `up` has been 0 for 60 seconds.
  - alert: 主机存活告警
    # Expression evaluated against the metrics to decide whether to alert.
    expr: 'up == 0'
    # How long the condition must hold before the alert fires.
    for: 60s
    # Alert body; adjust as needed.
    annotations:
      summary: '{{ $labels.instance }} 宕机超过1分钟!'
    # Custom alert labels.
    labels:
      severity: warning
 
  - alert: 主机CPU使用率告警
    expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 80
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "CPU近15分钟使用率大于80%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
 
  - alert: 主机内存使用率告警
    expr: (1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes))) * 100 > 85
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "内存利用率大于85%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
 
 
# 磁盘利用>80%
  - alert: 主机磁盘使用率告警
    expr: 100 - node_filesystem_free_bytes{fstype=~"xfs|ext4"} / node_filesystem_size_bytes{fstype=~"xfs|ext4"} * 100 > 80
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "磁盘使用率大于80%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
 
  - alert: 数据目录可用量
    expr:  node_filesystem_avail_bytes{mountpoint="/data",fstype=~"ext4|xfs"}  /1073741824  < 2
    for: 1m
    labels:
      status: 严重
    annotations:
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      mountpoint: "{{$labels.mountpoint}}"
      summary: "实例在data挂载点磁盘可用量小于2G!, 当前可用: {{ $value }}G"
 
  - alert: 根目录可用量
    expr:  node_filesystem_avail_bytes{mountpoint="/",fstype=~"ext4|xfs"}  /1073741824  < 2
    for: 1m
    labels:
      status: 严重
    annotations:
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      mountpoint: "{{$labels.mountpoint}}"
      summary: "实例在root挂载点磁盘可用量小于2G!, 当前可用: {{ $value }}G"
 
  - alert: TCP连接数
    expr: node_netstat_Tcp_CurrEstab > 10000
    for: 2m
    labels:
      severity: 严重告警
    annotations:
      summary: " TCP_ESTABLISHED过高!"
      description: "{{$labels.instance}} TCP_ESTABLISHED大于100%,当前使用率{{ $value }}%."
 
  - alert: 主机Tcp TimeWait数量过多告警
    expr: node_sockstat_TCP_tw >= 5000
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Tcp TimeWait数量大于5000, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
 
  - alert: 主机iowait较高
    expr: (sum(increase(node_cpu_seconds_total{mode='iowait'}[5m]))by(instance)) / (sum(increase(node_cpu_seconds_total[5m]))by(instance))  *100 >= 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "CPU ioWait近5分钟占比大于等于10%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
 
  - alert: 磁盘IO性能
    expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance,job)* 100 > 90
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入磁盘IO使用率过高,请尽快处理!"
      description: "{{$labels.instance}} 流入磁盘IO大于90%,当前使用率{{ $value }}%."
 
  - alert: 主机磁盘读过大
    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50*1024 *1024
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "磁盘读过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"
 
# 写入 > 50MB/s
  - alert: 主机磁盘写过大
    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50 * 1024 * 1024
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "磁盘写过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"
 
  - alert: 网络流入
    expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance,job)) / 100) > 102400
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入网络带宽过高,请尽快处理!"
      description: "{{$labels.instance}} 流入网络带宽持续5分钟高于100M. RX带宽使用量{{$value}}."
  
  - alert: 网络流出
    expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance,job)) / 100) > 102400
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理!"
      description: "{{$labels.instance}} 流出网络带宽持续5分钟高于100M. RX带宽使用量{$value}}."
 
  - alert: 系统15分钟负载告警
    expr: node_load5 > 5.6
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      summary: "服务器: {{$labels.alertname}} 系统负载报警"
      description: "服务器:{{$labels.alertname}},系统负载: 使用超过70%!当前值: {{ $value }}%)"
      value: "{{ $value }}"

  

posted @   骑白马de唐僧  阅读(184)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· AI技术革命,工作效率10个最佳AI工具
点击右上角即可分享
微信分享提示