Prometheus告警规则
blackbox_rules.yml
# Alerting rules evaluated against probe_* metrics.
groups:
  - name: blackbox_alert
    rules:
      # A probed target has reported probe_success == 0 continuously for
      # 5 minutes.
      - alert: blackbox_alert
        expr: probe_success == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: '接口/主机/端口 {{ $labels.instance }} 无法联通'
          description: '请尽快检测'

      # (expiry timestamp - now) / 86400 turns the remaining seconds into
      # days; the alert fires once fewer than 30 days remain and that has
      # held for 1 hour.
      - alert: ssl证书过期警告
        expr: (probe_ssl_earliest_cert_expiry - time())/86400 <30
        for: 1h
        labels:
          severity: warn
        annotations:
          summary: 'ssl证书过期警告'
          description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
k8s_rules.yml
# Alerting rules for scrape jobs and Kubernetes pod/container state.
# Every rule carries the label cluster=k8s.
groups:
  - name: node.rules
    rules:
      # Watches scrape-job health: if a target's metrics cannot be fetched
      # for 5 minutes, the alert is sent on to Alertmanager.
      - alert: JobDown
        expr: up == 0  # 0 = unhealthy, 1 = healthy
        for: 5m  # condition must hold for 5 minutes before firing
        labels:
          severity: error
          cluster: k8s
        annotations:
          summary: "Job: {{ $labels.job }} down"
          description: "Instance:{{ $labels.instance }}, Job {{ $labels.job }} stop "

      # Container is not in the running state.
      # NOTE(review): `for: 2s` is probably shorter than the rule evaluation
      # interval, so this fires almost immediately - confirm that is intended.
      - alert: PodDown
        expr: kube_pod_container_status_running != 1
        for: 2s
        labels:
          severity: warning
          cluster: k8s
        annotations:
          summary: 'Container: {{ $labels.container }} down'
          description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} is not running'

      # Container has failed its readiness state (ready != 1) for 5 minutes,
      # which points at a start-up problem. The annotation text previously
      # claimed the pod WAS ready; it now matches the expression.
      - alert: PodReady
        expr: kube_pod_container_status_ready != 1
        for: 5m
        labels:
          severity: warning
          cluster: k8s
        annotations:
          summary: 'Container: {{ $labels.container }} not ready'
          description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} has not been ready for 5 minutes'

      # Restart counter changed within the last 30 minutes.
      # NOTE(review): $value is the result of changes(), i.e. how many times
      # the counter value changed between samples - presumably close to, but
      # not guaranteed equal to, the number of restarts.
      - alert: PodRestart
        expr: changes(kube_pod_container_status_restarts_total[30m])>0
        for: 2s
        labels:
          severity: warning
          cluster: k8s
        annotations:
          summary: 'Container: {{ $labels.container }} restart'
          description: 'namespace: {{ $labels.namespace }}, pod: {{ $labels.pod }} restart {{ $value }} times'
node_rules.yml
# Host-level alerting rules (liveness, CPU, memory, disk, TCP, network,
# load) evaluated against node_* metrics.
groups:
  - name: 主机状态-监控告警  # group name
    rules:
      - alert: 主机存活告警  # alert name
        expr: up == 0  # expression that decides whether to alert
        for: 60s  # how long the condition must hold before firing
        labels:  # custom alert labels
          severity: warning
        annotations:  # alert text; adapt as needed
          summary: "{{ $labels.instance }} 宕机超过1分钟!"

      # CPU usage = 100 - idle%, averaged per instance.
      - alert: 主机CPU使用率告警
        expr: >-
          100 - (avg by (instance)
          (irate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100 > 80
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "CPU近15分钟使用率大于80%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"

      # Memory usage = (1 - available / total) * 100.
      - alert: 主机内存使用率告警
        expr: >-
          (1 - (node_memory_MemAvailable_bytes /
          (node_memory_MemTotal_bytes))) * 100 > 85
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "内存利用率大于85%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"

      # Disk usage > 80% on xfs/ext4 filesystems.
      - alert: 主机磁盘使用率告警
        expr: >-
          100 - node_filesystem_free_bytes{fstype=~"xfs|ext4"}
          / node_filesystem_size_bytes{fstype=~"xfs|ext4"} * 100 > 80
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率大于80%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"

      # Available space on /data below 2 GiB (1073741824 bytes = 1 GiB).
      # NOTE(review): this rule and the next use the label key `status`
      # while every other rule here uses `severity`; routing that matches on
      # `severity` will not see them - confirm which key is intended.
      - alert: 数据目录可用量
        expr: >-
          node_filesystem_avail_bytes{mountpoint="/data",fstype=~"ext4|xfs"}
          / 1073741824 < 2
        for: 1m
        labels:
          status: 严重
        annotations:
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          mountpoint: "{{$labels.mountpoint}}"
          summary: "实例在data挂载点磁盘可用量小于2G!, 当前可用: {{ $value }}G"

      # Available space on / below 2 GiB.
      - alert: 根目录可用量
        expr: >-
          node_filesystem_avail_bytes{mountpoint="/",fstype=~"ext4|xfs"}
          / 1073741824 < 2
        for: 1m
        labels:
          status: 严重
        annotations:
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          mountpoint: "{{$labels.mountpoint}}"
          summary: "实例在root挂载点磁盘可用量小于2G!, 当前可用: {{ $value }}G"

      # The expression compares a raw connection count with 10000, so the
      # description reports a count (it previously called it a percentage).
      - alert: TCP连接数
        expr: node_netstat_Tcp_CurrEstab > 10000
        for: 2m
        labels:
          severity: 严重告警
        annotations:
          summary: " TCP_ESTABLISHED过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED大于10000,当前连接数{{ $value }}."

      # $value is a socket count, so no percent sign in the summary.
      - alert: 主机Tcp TimeWait数量过多告警
        expr: node_sockstat_TCP_tw >= 5000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Tcp TimeWait数量大于5000, 实例: {{ $labels.instance }},当前值:{{ $value }}"

      # iowait share = iowait CPU seconds / all CPU seconds over 5 minutes.
      - alert: 主机iowait较高
        expr: >-
          (sum(increase(node_cpu_seconds_total{mode='iowait'}[5m])) by (instance))
          / (sum(increase(node_cpu_seconds_total[5m])) by (instance))
          * 100 >= 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU ioWait近5分钟占比大于等于10%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"

      - alert: 磁盘IO性能
        expr: >-
          avg(irate(node_disk_io_time_seconds_total[1m])) by (instance,job)
          * 100 > 90
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高,请尽快处理!"
          description: "{{$labels.instance}} 流入磁盘IO大于90%,当前使用率{{ $value }}%."

      # Read throughput > 50 * 1024 * 1024 bytes/s.
      - alert: 主机磁盘读过大
        expr: >-
          sum by (instance) (rate(node_disk_read_bytes_total[2m]))
          > 50 * 1024 * 1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘读过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"

      # Write throughput > 50MB/s.
      - alert: 主机磁盘写过大
        expr: >-
          sum by (instance) (rate(node_disk_written_bytes_total[2m]))
          > 50 * 1024 * 1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘写过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"

      # Label regexes are fully anchored, so the previous glob-style
      # `virbr*|lo*` did not exclude devices such as virbr0; it is now
      # `virbr.*|lo`.
      # NOTE(review): the `/ 100 > 102400` threshold works out to roughly
      # 10 MB/s, which does not obviously equal the "100M" in the
      # description - confirm the intended unit.
      - alert: 网络流入
        expr: >-
          ((sum(rate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m]))
          by (instance,job)) / 100) > 102400
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高,请尽快处理!"
          description: "{{$labels.instance}} 流入网络带宽持续5分钟高于100M. RX带宽使用量{{$value}}."

      # Same device filter and threshold as the inbound rule. The description
      # template was missing a brace (`{$value}}` rendered literally) and
      # said RX for transmit traffic; both are fixed.
      - alert: 网络流出
        expr: >-
          ((sum(rate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m]))
          by (instance,job)) / 100) > 102400
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理!"
          description: "{{$labels.instance}} 流出网络带宽持续5分钟高于100M. TX带宽使用量{{$value}}."

      # The annotations identify the server, so they now use the instance
      # label instead of alertname; $value is a load average, so the stray
      # "%)" suffix is dropped.
      # NOTE(review): the alert name says 15-minute load but the expression
      # reads node_load5 - confirm which one is intended.
      - alert: 系统15分钟负载告警
        expr: node_load5 > 5.6
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          summary: "服务器: {{$labels.instance}} 系统负载报警"
          description: "服务器:{{$labels.instance}},系统负载: 使用超过70%!当前值: {{ $value }}"
          value: "{{ $value }}"
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· AI技术革命,工作效率10个最佳AI工具