prometheus监控kubernetes容器
prometheus.yaml
# Prometheus self-monitoring rules (25 rules).
#
# Every rule attaches a `severity` label and two annotations: a one-line
# `summary` and a `description` that echoes the firing value and label set.
# `for: 0m` means the alert fires on the first evaluation where `expr`
# returns a result.
groups:
  - name: 普罗米修斯-监控告警  # alerting rule group name
    rules:  # rule definitions
      # 1.1.1. Prometheus job missing
      # The "prometheus" job has no `up` series at all.
      - alert: PrometheusJobMissing
        expr: absent(up{job="prometheus"})
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus job missing (instance {{ $labels.instance }})
          description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.2. Prometheus target missing
      # A scrape target is down; its exporter may have crashed.
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing (instance {{ $labels.instance }})
          description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.3. Prometheus all targets missing
      # A job has no live target left.
      # `sum` is used instead of `count`: `count by (job) (up)` counts series
      # regardless of their value and is never 0, so `count(...) == 0` could
      # not fire. `sum` adds the 0/1 values and is 0 when every target is down.
      - alert: PrometheusAllTargetsMissing
        expr: sum by (job) (up) == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus all targets missing (instance {{ $labels.instance }})
          description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.4. Prometheus configuration reload failure
      - alert: PrometheusConfigurationReloadFailure
        expr: prometheus_config_last_reload_successful != 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
          description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.5. Prometheus too many restarts
      # More than two restarts within 15 minutes; possibly crash-looping.
      - alert: PrometheusTooManyRestarts
        expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus too many restarts (instance {{ $labels.instance }})
          description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.6. Prometheus AlertManager configuration reload failure
      - alert: PrometheusAlertmanagerConfigurationReloadFailure
        expr: alertmanager_config_last_reload_successful != 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
          description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.7. Prometheus AlertManager config not synced
      # More than one distinct config hash is reported across instances.
      - alert: PrometheusAlertmanagerConfigNotSynced
        expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
          description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.8. Prometheus AlertManager E2E dead man switch
      # Always-firing alert (`vector(1)`), used as an end-to-end test of the
      # Prometheus -> Alertmanager path.
      - alert: PrometheusAlertmanagerE2eDeadManSwitch
        expr: vector(1)
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
          description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.9. Prometheus not connected to alertmanager
      - alert: PrometheusNotConnectedToAlertmanager
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
          description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.10. Prometheus rule evaluation failures
      # Failed evaluations can lead to silently ignored alerts.
      - alert: PrometheusRuleEvaluationFailures
        expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.11. Prometheus template text expansion failures
      - alert: PrometheusTemplateTextExpansionFailures
        expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.12. Prometheus rule evaluation slow
      # A rule group took longer to evaluate than its scheduled interval.
      - alert: PrometheusRuleEvaluationSlow
        expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
          description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.13. Prometheus notifications backlog
      # The notification queue has not been empty for 10 minutes.
      - alert: PrometheusNotificationsBacklog
        expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
          description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.14. Prometheus AlertManager notification failing
      - alert: PrometheusAlertmanagerNotificationFailing
        expr: rate(alertmanager_notifications_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
          description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.15. Prometheus target empty
      # Service discovery returned no targets.
      - alert: PrometheusTargetEmpty
        expr: prometheus_sd_discovered_targets == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target empty (instance {{ $labels.instance }})
          description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.16. Prometheus target scraping slow
      - alert: PrometheusTargetScrapingSlow
        expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
          description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.17. Prometheus large scrape
      # Many scrapes exceeded the configured sample limit.
      - alert: PrometheusLargeScrape
        expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus large scrape (instance {{ $labels.instance }})
          description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.18. Prometheus target scrape duplicate
      # Samples rejected: same timestamp, different values.
      - alert: PrometheusTargetScrapeDuplicate
        expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
          description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.19. Prometheus TSDB checkpoint creation failures
      - alert: PrometheusTsdbCheckpointCreationFailures
        expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.20. Prometheus TSDB checkpoint deletion failures
      - alert: PrometheusTsdbCheckpointDeletionFailures
        expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.21. Prometheus TSDB compactions failed
      - alert: PrometheusTsdbCompactionsFailed
        expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.22. Prometheus TSDB head truncations failed
      - alert: PrometheusTsdbHeadTruncationsFailed
        expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.23. Prometheus TSDB reload failures
      - alert: PrometheusTsdbReloadFailures
        expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.24. Prometheus TSDB WAL corruptions
      - alert: PrometheusTsdbWalCorruptions
        expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.1.25. Prometheus TSDB WAL truncations failed
      - alert: PrometheusTsdbWalTruncationsFailed
        expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
windows.yaml
# 1.5. Windows Server: prometheus-community/windows_exporter (5 rules).
#
# The group was previously named "Docker容器-监控告警", a leftover from the
# Docker rule file; it is renamed so the group name describes its contents.
groups:
  - name: Windows服务器-监控告警  # alerting rule group name
    rules:  # rule definitions
      # 1.5.1. Windows Server collector error
      # A windows_exporter collector reported an unsuccessful run.
      - alert: WindowsServerCollectorError
        expr: windows_exporter_collector_success == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Windows Server collector Error (instance {{ $labels.instance }})
          description: "Collector {{ $labels.collector }} was not successful\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.5.2. Windows Server service status
      # A Windows service is not in the "ok" status.
      - alert: WindowsServerServiceStatus
        expr: windows_service_status{status="ok"} != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Windows Server service Status (instance {{ $labels.instance }})
          description: "Windows Service state is not OK\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.5.3. Windows Server CPU usage
      # Non-idle CPU time, averaged per instance over 2 minutes, above 80%.
      - alert: WindowsServerCpuUsage
        expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Windows Server CPU Usage (instance {{ $labels.instance }})
          description: "CPU Usage is more than 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.5.4. Windows Server memory usage
      # Used physical memory above 90%.
      - alert: WindowsServerMemoryUsage
        expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Windows Server memory Usage (instance {{ $labels.instance }})
          description: "Memory usage is more than 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.5.5. Windows Server disk space usage
      # Used logical-disk space above 80%. Both sides are divided by
      # 1024 / 1024, which cancels out in the ratio.
      - alert: WindowsServerDiskSpaceUsage
        expr: >-
          100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 )
          / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Windows Server disk Space Usage (instance {{ $labels.instance }})
          description: "Disk usage is more than 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
node-exporter.yaml
# Host and hardware: node-exporter (25 rules in this file).
#
# Fixes applied relative to the previous revision:
#   * "inodes will fill in 24 hours" joined mountpoint="/" series against
#     node_filesystem_readonly{mountpoint="/rootfs"} ON (..., mountpoint),
#     which can never match; the read-only guard now uses mountpoint="/".
#   * Descriptions that contradicted their own expression (disk space and
#     inodes: 10% vs `< 20`; context switching: 1000/s vs `> 15000`) now
#     state the threshold the expression actually uses.
#   * The receive-throughput description no longer repeats its sentence.
#   * The interface-saturation description uses the `device` label, as the
#     other network rules in this group do, instead of `interface`.
groups:
  - name: 主机节点-监控告警  # alerting rule group name
    rules:  # rule definitions
      # 1.2.1. Host out of memory
      # Available memory below 10% for 10 minutes.
      - alert: 主机内存
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 10m  # condition must hold for 10 minutes before firing
        labels:
          severity: warning
        annotations:
          summary: 主机内存不足 (instance {{ $labels.instance }})
          description: "节点内存已满(<10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.2. Host memory under memory pressure
      # High rate of major page faults.
      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.3. Host unusual network throughput in
      # Receive rate summed per instance above 100 MB/s.
      - alert: 网卡接收数据
        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机网络吞吐量 (instance {{ $labels.instance }})
          description: "主机网络接口可能接收的数据太多 (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.4. Host unusual network throughput out
      # Transmit rate summed per instance above 100 MB/s.
      - alert: 网卡发送数据
        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机网络吞吐量 (instance {{ $labels.instance }})
          description: "主机网络接口可能发送太多数据 (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.5. Host unusual disk read rate
      # Read rate summed per instance above 50 MB/s.
      - alert: 主机磁盘异常读取
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘读取率 (instance {{ $labels.instance }})
          description: "磁盘可能读取了太多数据 (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.6. Host unusual disk write rate
      # Write rate summed per instance above 50 MB/s.
      - alert: 主机异常磁盘写入
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
          description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.7. Host out of disk space
      # Less than 20% available on a writable filesystem.
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # The same rule using "node_filesystem_free_bytes" will fire when the
      # disk fills for non-root users.
      - alert: 主机磁盘空间
        expr: >-
          (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20
          and ON (instance, device, mountpoint)
          node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘空间不足 (instance {{ $labels.instance }})
          description: "磁盘快满了 (< 20% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.8. Host disk will fill in 24 hours
      # Less than 20% available AND a linear projection of the last hour
      # reaches zero within 24 hours, on a writable filesystem.
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # The same rule using "node_filesystem_free_bytes" will fire when the
      # disk fills for non-root users.
      - alert: 主机磁盘将在24小时内填满
        expr: >-
          (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20
          and ON (instance, device, mountpoint)
          predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0
          and ON (instance, device, mountpoint)
          node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘将占用24小时 (instance {{ $labels.instance }})
          description: "文件系统预计将在未来24小时内以当前写入速率耗尽空间\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.9. Host out of inodes
      # Less than 20% free inodes on the writable root filesystem.
      - alert: 主机inodes
        expr: >-
          node_filesystem_files_free{mountpoint="/"} / node_filesystem_files{mountpoint="/"} * 100 < 20
          and ON (instance, device, mountpoint)
          node_filesystem_readonly{mountpoint="/"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机已用inode(instance {{ $labels.instance }})
          description: "磁盘的可用索引节点快用完了 (< 20% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.10. Host inodes will fill in 24 hours
      # Less than 10% free inodes on "/" AND a linear projection of the last
      # hour reaches zero within 24 hours. The read-only guard must select the
      # same mountpoint ("/") as the left-hand side, because the join is
      # ON (instance, device, mountpoint).
      - alert: 主机inode将在24小时内用完
        expr: >-
          node_filesystem_files_free{mountpoint="/"} / node_filesystem_files{mountpoint="/"} * 100 < 10
          and
          predict_linear(node_filesystem_files_free{mountpoint="/"}[1h], 24 * 3600) < 0
          and ON (instance, device, mountpoint)
          node_filesystem_readonly{mountpoint="/"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机索引节点将在24小时内用完 (instance {{ $labels.instance }})
          description: "文件系统预计将在未来24小时内以当前写入速率耗尽inode\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.11. Host unusual disk read latency
      # Average time per completed read above 100 ms; the second clause
      # excludes devices with no reads (division by zero).
      - alert: 主机磁盘读取延迟
        expr: >-
          rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
          and
          rate(node_disk_reads_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘读取延迟 (instance {{ $labels.instance }})
          description: "磁盘延迟正在增长 (读取操作 > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.12. Host unusual disk write latency
      # Average time per completed write above 100 ms; the second clause
      # excludes devices with no writes (division by zero).
      - alert: 主机磁盘写入延迟
        expr: >-
          rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1
          and
          rate(node_disk_writes_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘写入延迟 (instance {{ $labels.instance }})
          description: "磁盘延迟正在增长 (写入操作 > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.13. Host high CPU load
      # 100 minus the per-instance average idle percentage, above 80%.
      # mode="idle" is CPU time spent idle, excluding time waiting on I/O.
      - alert: 主机CPU高负载
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 主机高负载 (instance {{ $labels.instance }})
          description: "CPU负载为 > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.14. Host CPU steal noisy neighbor
      # CPU steal above 10%. mode="steal" is time the hypervisor gave to other
      # guests while this one wanted to run. A noisy neighbor may be hurting
      # VM performance, or a spot instance may be out of credit.
      - alert: HostCpuStealNoisyNeighbor
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: "CPU窃取>10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.15. Host context switching
      # Context switches per second per CPU above 15000. The threshold is
      # arbitrary and depends on the nature of the application.
      # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
      - alert: 主机上下文切换
        expr: >-
          (rate(node_context_switches_total[5m]))
          / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 15000
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching (instance {{ $labels.instance }})
          description: "Context switching is growing on node (> 15000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.16. Host swap is filling up
      # Swap usage above 80%.
      - alert: 主机交换分区
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机交换已满 (instance {{ $labels.instance }})
          description: "主机交换分区 (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.17. Host systemd service crashed
      - alert: systemd服务崩溃
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 主机systemd服务崩溃 (instance {{ $labels.instance }})
          description: "systemd服务崩溃\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.22. Host kernel version deviations
      # More than one distinct kernel release (major.minor.patch prefix) is
      # reported across hosts for 6 hours.
      - alert: 主机内核
        expr: >-
          count(sum(label_replace(node_uname_info, "kernel", "$1", "release",
          "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Host kernel version deviations (instance {{ $labels.instance }})
          description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.23. Host OOM kill detected
      - alert: 检测到OOM杀死
        expr: increase(node_vmstat_oom_kill[1m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 检测到主机OOM终止 (instance {{ $labels.instance }})
          description: "检测到OOM杀死\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.26. Host network receive errors
      # Receive errors exceed 1% of received packets.
      # NOTE(review): the description mentions "the last five minutes" and
      # prints the value with "%.0f", but the expression is a ratio over a
      # 2m window - confirm the intended wording.
      - alert: 主机网络接收错误
        expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机网络接收错误 (instance {{ $labels.instance }})
          description: "主机 {{ $labels.instance }} 接口 {{ $labels.device }} 在过去五分钟内收到错误遇到 {{ printf \"%.0f\" $value }} .\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.27. Host network transmit errors
      # Transmit errors exceed 1% of transmitted packets.
      # NOTE(review): same wording caveat as the receive-errors rule above
      # in this group (ratio over 2m vs "five minutes" / "%.0f").
      - alert: 主机网络传输错误
        expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机网络传输错误 (instance {{ $labels.instance }})
          description: "主机 {{ $labels.instance }} 接口 {{ $labels.device }} 在过去五分钟内收到错误遇到 {{ printf \"%.0f\" $value }} \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.28. Host network interface saturated
      # Combined receive + transmit rate above 80% of the reported link
      # speed; devices whose name starts with "tap" are excluded.
      - alert: 主机网络接口
        expr: >-
          (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m])
          + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m]))
          / node_network_speed_bytes{device!~"^tap.*"} > 0.8
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 主机网络接口饱和 (instance {{ $labels.instance }})
          description: "网络接口 \"{{ $labels.device }}\" 在 \"{{ $labels.instance }}\" 已经超负荷了.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.29. Host conntrack limit
      # Conntrack entries above 80% of the limit.
      - alert: 连接数接近极限
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机连接数接近极限 (instance {{ $labels.instance }})
          description: "主机连接数接近极限\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.30. Host clock skew
      # Offset beyond +/-50 ms and not shrinking back toward zero.
      - alert: 时钟偏移
        expr: >-
          (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0)
          or
          (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机时间偏移 (instance {{ $labels.instance }})
          description: "检测到时钟偏移。时钟不同步.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.2.31. Host clock not synchronising
      - alert: 主机时间不同步
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机时间不同步 (instance {{ $labels.instance }})
          description: "时钟不同步。\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
docker.yaml
# Docker containers: google/cAdvisor (5 rules in this file).
#
# Fixes applied relative to the previous revision:
#   * The memory rule had no comparison operator, so its expression returned
#     a sample for every container and the alert fired permanently. It now
#     fires above 80% and ignores containers whose limit is 0 (no limit set),
#     which would otherwise divide by zero.
#   * The CPU rule measured container_cpu_system_seconds_total (kernel-mode
#     time only) while reporting "CPU usage"; it now uses
#     container_cpu_usage_seconds_total.
#   * The CPU rule listed the Bsc_host_ip grouping label twice.
groups:
  - name: Docker容器-监控告警  # alerting rule group name
    rules:  # rule definitions
      # 1.3.1. Container killed
      # A container has not been seen for more than 60 seconds.
      - alert: ContainerKilled
        expr: time() - container_last_seen > 60
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Container killed (instance {{ $labels.instance }})
          description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.3.2. Container CPU usage
      # Container CPU usage above 80% of one core. The name=~".+" matcher
      # drops series with an empty name (cAdvisor itself can consume a lot of
      # CPU and would otherwise keep this alert firing).
      # NOTE(review): `instance` is not in the grouping clause, so
      # {{ $labels.instance }} renders empty in the summary - confirm whether
      # it should be added to `by (...)`.
      - alert: 容器cpu使用量
        expr: >-
          sum(rate(container_cpu_usage_seconds_total{name=~".+"}[1m]))
          by (name,Bsc_cloud_region_env_type,Bsc_project_name,Bsc_host_ip,Bsc_k8s_namespace,Bsc_k8s_pod_name,Bsc_k8s_pod_ip)
          * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 容器cpu使用量 (instance {{ $labels.instance }})
          description: "容器cpu使用量达到80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Container memory usage
      # Working set above 80% of the configured memory limit. The `> 0`
      # filter removes containers without a limit before dividing.
      - alert: 容器内存使用率
        expr: (container_memory_working_set_bytes / (container_spec_memory_limit_bytes > 0)) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 容器内存使用率
          description: "容器内存使用率是 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.3.5. Container Volume IO usage
      # Container volume I/O usage above 80%.
      - alert: 容器磁盘使用量
        expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container Volume IO usage (instance {{ $labels.instance }})
          description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.3.6. Container high throttle rate
      # The container is being CPU-throttled.
      - alert: ContainerHighThrottleRate
        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container high throttle rate (instance {{ $labels.instance }})
          description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
blackbox.yaml
# 1.4. Blackbox: prometheus/blackbox_exporter (8 rules).
#
# Each rule carries a `severity` label plus `summary` / `description`
# annotations; the description echoes the firing value and label set.
groups:
  - name: Blackbox黑匣子-监控告警  # alerting rule group name
    rules:  # rule definitions
      # 1.4.1. Blackbox probe failed
      - alert: BlackboxProbeFailed
        expr: probe_success == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe failed (instance {{ $labels.instance }})
          description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.4.2. Blackbox slow probe
      # Probe duration averaged over 1 minute exceeds 1 second.
      - alert: BlackboxSlowProbe
        expr: avg_over_time(probe_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox slow probe (instance {{ $labels.instance }})
          description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.4.3. Blackbox probe HTTP failure
      # HTTP status code is outside 200-399.
      - alert: BlackboxProbeHttpFailure
        expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
          description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.4.4. Blackbox SSL certificate will expire soon (warning tier)
      # Earliest certificate expiry is less than 30 days away.
      - alert: BlackboxSslCertificateWillExpireSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
          description: "SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.4.5. Blackbox SSL certificate will expire soon (critical tier)
      # Same alert name as the 30-day rule, with a 3-day threshold and a
      # higher severity.
      - alert: BlackboxSslCertificateWillExpireSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
          description: "SSL certificate expires in 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.4.6. Blackbox SSL certificate expired
      - alert: BlackboxSslCertificateExpired
        expr: probe_ssl_earliest_cert_expiry - time() <= 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
          description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.4.7. Blackbox probe slow HTTP
      # HTTP request duration averaged over 1 minute exceeds 1 second.
      - alert: BlackboxProbeSlowHttp
        expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
          description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 1.4.8. Blackbox probe slow ping
      # ICMP probe duration averaged over 1 minute exceeds 1 second.
      - alert: BlackboxProbeSlowPing
        expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox probe slow ping (instance {{ $labels.instance }})
          description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
kube-state-metrics.yaml
# 5.1. Kubernetes: kube-state-metrics
# The upstream rule set is numbered 5.1.1 - 5.1.33 (33 rules); only a subset is
# included below, and rules 5.1.31 - 5.1.33 are kept commented out.
#
# Layout: one rule group containing a list of alerting rules. Each rule has
#   alert       - alert name
#   expr        - PromQL expression; the alert is active while it returns samples
#   for         - how long the expression must keep matching before firing
#   labels      - extra labels attached to the alert (severity here)
#   annotations - templated summary/description text
groups:
  # Group name under which these alerting rules are listed.
  - name: Docker容器-监控告警
    # Alerting rules belonging to this group.
    rules:
      # 5.1.1. Kubernetes Node ready
      # Node {{ $labels.node }} has been unready for a long time
      - alert: 节点断开连接
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 节点断开连接 (instance {{ $labels.instance }})
          description: "节点 {{ $labels.node }} 已经很长时间没有联系上了\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.2. Kubernetes memory pressure
      # {{ $labels.node }} has MemoryPressure condition
      - alert: k8s节点内存有压力
        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: k8s节点内存有压力 (instance {{ $labels.instance }})
          description: "{{ $labels.node }} 是否存在内存有压力\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.3. Kubernetes disk pressure
      # {{ $labels.node }} has DiskPressure condition
      - alert: k8s节点磁盘有压力
        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: k8s节点存在磁盘有压力 (instance {{ $labels.instance }})
          description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.4. Kubernetes out of disk
      # {{ $labels.node }} has OutOfDisk condition
      # NOTE(review): verify that the monitored cluster still reports the
      # OutOfDisk node condition; if it does not, this rule never fires.
      - alert: k8s磁盘不足
        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: K8s磁盘空间不足 (instance {{ $labels.instance }})
          description: "{{ $labels.node }} 磁盘空间不足\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.5. Kubernetes out of capacity
      # {{ $labels.node }} is out of capacity
      # Running pods per node as a percentage of the node's allocatable pods;
      # the "0 * kube_pod_info" join only attaches the node label to each pod.
      - alert: 容量不足
        expr: sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(pod, namespace) group_left(node) (0 * kube_pod_info)) / sum(kube_node_status_allocatable_pods) by (node) * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes 容量不足 (instance {{ $labels.instance }})
          description: "{{ $labels.node }} 容量不足\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.6. Kubernetes container oom killer
      # Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
      # Matches containers whose restart counter grew in the last 10 minutes
      # and whose last termination reason was OOMKilled for that whole window.
      - alert: 十分钟容器被kill的次数
        expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 十分钟pod被kill的次数 (instance {{ $labels.instance }})
          description: "过去10分钟内容器 {{ $labels.container }} 在pod {{ $labels.namespace }}/{{ $labels.pod }} 被杀死了 {{ $value }} \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.7. Kubernetes Job failed
      # Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete
      - alert: job 未能完成
        expr: kube_job_status_failed > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes Job 未完成 (instance {{ $labels.instance }})
          description: "Job {{$labels.namespace}}/{{$labels.exported_job}} 未能完成\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.9. Kubernetes PersistentVolumeClaim pending
      # PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending
      - alert: k8s volumeclaim 已挂起
        expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: k8s PersistentVolumeClaim 已挂起 (instance {{ $labels.instance }})
          description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} 已挂起\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.12. Kubernetes PersistentVolume error
      # Persistent volume is in bad state
      - alert: 永久卷处于错误状态
        expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: K8s 永久卷处于错误状态 (instance {{ $labels.instance }})
          description: "永久卷处于错误状态\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.13. Kubernetes StatefulSet down
      # A StatefulSet went down
      - alert: k8s 状态集
        expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes 状态集 down (instance {{ $labels.instance }})
          description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.17. Kubernetes Pod not healthy
      # Pod has been in a non-ready state for longer than 15 minutes.
      # The 15m:1m subquery requires the pod to be Pending/Unknown/Failed at
      # every 1-minute step of the window, which is why "for" is 0m.
      - alert: POd 亚健康状态
        expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: k8s Pod not healthy (instance {{ $labels.instance }})
          description: "Pod已处于非就绪状态超过15分钟。\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.18. Kubernetes pod crash looping
      # Pod {{ $labels.pod }} is crash looping
      # NOTE(review): a 1m range needs at least two scrapes inside it; confirm
      # the scrape interval is short enough for increase() to return data.
      - alert: K8s Pod CrashLooping
        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
          description: "Pod {{ $labels.pod }} 崩溃循环\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.21. Kubernetes StatefulSet replicas mismatch
      # A StatefulSet does not match the expected number of replicas.
      - alert: 状态集与副本的预期数量不匹配
        expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes 状态集副本不匹配 (instance {{ $labels.instance }})
          description: "状态集与副本的预期数量不匹配.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.23. Kubernetes StatefulSet generation mismatch
      # A StatefulSet has failed but has not been rolled back.
      - alert: K8s状态集生成失配
        expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes 状态集生成失配 (instance {{ $labels.instance }})
          description: "状态集已失败,但尚未被回滚。\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.28. Kubernetes job slow completion
      # Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time
      - alert: K8s Job 缓慢完成
        expr: kube_job_spec_completions - kube_job_status_succeeded > 0
        for: 12h
        labels:
          severity: critical
        annotations:
          summary: Kubernetes job 完成缓慢 (instance {{ $labels.instance }})
          description: "K8s Job {{ $labels.namespace }}/{{ $labels.job_name }} 未及时完成.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.30. Kubernetes API client errors
      # Kubernetes API client is experiencing high error rate
      # Percentage of client requests answered with a 4xx/5xx code, per
      # (instance, job), over the last minute.
      - alert: K8s API客户端错误
        expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes API客户端错误 (instance {{ $labels.instance }})
          description: "Kubernetes API客户端遇到高错误率\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The three rules below are intentionally disabled (commented out).

      # 5.1.31. Kubernetes client certificate expires next week
      # A client certificate used to authenticate to the apiserver is expiring next week.
      # - alert: KubernetesClientCertificateExpiresNextWeek
      #   expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
      #   for: 0m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }})
      #     description: "用于向apiserver进行身份验证的客户端证书将于下周过期。\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.32. Kubernetes client certificate expires soon
      # A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
      # - alert: KubernetesClientCertificateExpiresSoon
      #   expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
      #   for: 0m
      #   labels:
      #     severity: critical
      #   annotations:
      #     summary: Kubernetes client certificate expires soon (instance {{ $labels.instance }})
      #     description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5.1.33. Kubernetes API server latency
      # Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
      # - alert: KubernetesApiServerLatency
      #   expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1
      #   for: 2m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: Kubernetes API server latency (instance {{ $labels.instance }})
      #     description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
注:未测试,谨慎使用