prometheus 监控项

此处记录 Prometheus 的告警监控项(规则文件 rules.yml),所用 exporter 为 node_exporter。

vim rules.yml
groups:
# Availability group: a single rule that watches the scrape health of the
# targets in the "node" job.
- name: node
  rules:
  - alert: server_status
    # Matches series of the `up` metric for job "node" whose value is 0;
    # the condition must hold for 15s before the alert fires.
    expr: 'up{job="node"} == 0'
    for: 15s
    labels:
      severity: critical
    annotations:
      summary: ' node_exporter is down'
- name: cluster
  rules:
  # Warns when a host's average CPU utilisation across all cores exceeds 90%.
  - alert: CPU
    # Utilisation is computed as (1 - idle fraction) * 100. The per-core idle
    # rate is averaged by instance so that one series (and at most one alert)
    # is produced per host instead of one per CPU core, matching how the
    # iowait rule in this group aggregates.
    expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m]))) * 100 > 90
    for: 5s
    labels:
      severity: warning
    annotations:
      # After aggregation only the `instance` label remains, so the summary
      # reports that label to identify the affected host.
      summary: " cpu利用率超过 90%,{{ $labels.instance }}当前值: {{ $value }}%"
#  - alert: LOAD1
#    expr: node_load5 > Logical_CPU_core_total*0.3 or node_load1 > Logical_CPU_core_total*0.4 or node_load15 >  Logical_CPU_core_total*0.2
#    for: 5s
#    labels:
#      severity: 'critical'
#    annotations:
#      summary: " load过高 当前值为 {{ $value }}"
  # Warns when node_load1 is greater than three times Logical_CPU_core_total.
  # NOTE(review): Logical_CPU_core_total is not defined in this file; it is
  # presumably a custom per-host metric. Confirm it exists and carries the same
  # label set as node_load1, otherwise the comparison matches nothing.
  - alert: LOAD1
    expr: 'node_load1 > Logical_CPU_core_total*3'
    for: 5s
    labels:
      severity: warning
    annotations:
      summary: ' load1>cpu*3 当前值为 {{ $value }}'
  # Warns when node_load5 is greater than twice Logical_CPU_core_total.
  # NOTE(review): Logical_CPU_core_total is not defined in this file; it is
  # presumably a custom per-host metric. Confirm it exists and carries the same
  # label set as node_load5, otherwise the comparison matches nothing.
  - alert: LOAD5
    expr: 'node_load5 > Logical_CPU_core_total*2'
    for: 5s
    labels:
      severity: warning
    annotations:
      summary: ' load5>cpu*2 当前值为 {{ $value }}'
  # Warns when node_load15 is greater than twice Logical_CPU_core_total.
  # NOTE(review): Logical_CPU_core_total is not defined in this file; it is
  # presumably a custom per-host metric. Confirm it exists and carries the same
  # label set as node_load15, otherwise the comparison matches nothing.
  - alert: LOAD15
    expr: 'node_load15 >  Logical_CPU_core_total*2'
    for: 5s
    labels:
      severity: warning
    annotations:
      summary: ' load15>cpu*2 当前值为 {{ $value }}'
  # Critical when the xfs/ext4 filesystem mounted at "/" is more than 80% used.
  - alert: space_root
    # Used percentage = (1 - available bytes / total bytes) * 100.
    expr: '(1-node_filesystem_avail_bytes{fstype=~"xfs|ext4",mountpoint="/"}/node_filesystem_size_bytes{fstype=~"xfs|ext4",mountpoint="/"})*100 > 80'
    for: 5s
    labels:
      severity: critical
    annotations:
      summary: ' /下空间使用率大于80%  当前值为{{ $value }}% '
  # Critical when the xfs/ext4 filesystem mounted at "/data" is more than 80% used.
  - alert: space_data
    # Used percentage = (1 - available bytes / total bytes) * 100.
    expr: '(1-node_filesystem_avail_bytes{fstype=~"xfs|ext4",mountpoint="/data"}/node_filesystem_size_bytes{fstype=~"xfs|ext4",mountpoint="/data"})*100 > 80'
    for: 5s
    labels:
      severity: critical
    annotations:
      summary: ' /data空间使用率大于80% 当前值为{{ $value }}% '
  # Warns when the transmit rate on device eth0 exceeds 10 (in units of
  # 1048576 bytes per second).
  - alert: upload_rate
    # 1-minute per-second rate of transmitted bytes, divided by 1048576.
    expr: 'rate(node_network_transmit_bytes_total{device="eth0"}[1m])/1048576 > 10'
    for: 5s
    labels:
      severity: warning
    annotations:
      summary: ' 上传速率大于10M 当前值为{{ $value }}M'
  # Warns when the receive rate on device eth0 exceeds 10 (in units of
  # 1048576 bytes per second).
  - alert: download_rate
    # 1-minute per-second rate of received bytes, divided by 1048576.
    expr: 'rate(node_network_receive_bytes_total{device="eth0"}[1m])/1048576 > 10'
    for: 5s
    labels:
      severity: warning
    annotations:
      summary: ' 下载速率大于10M 当前值为{{ $value }}M '
  # Critical when more than 50% of the inodes on the xfs/ext4 filesystem
  # mounted at "/" are in use.
  - alert: inode_size
    # Used percentage = (1 - free inodes / total inodes) * 100.
    expr: '(1-node_filesystem_files_free{fstype=~"xfs|ext4",mountpoint="/"}/node_filesystem_files{fstype=~"xfs|ext4",mountpoint="/"})*100 > 50'
    for: 5s
    labels:
      severity: critical
    annotations:
      summary: ' /下inode使用率大于50% 当前值为{{ $value }}% '
  # Warns when memory usage exceeds 80%.
  - alert: Memory_usage
    # Used percentage = (1 - MemAvailable / MemTotal) * 100.
    expr: '(1-(node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes)*100 > 80'
    for: 5s
    labels:
      severity: warning
    annotations:
      summary: '内存使用率大于80% 当前值为{{ $value }}% '
  # Critical when the share of CPU time spent in iowait, averaged per instance,
  # exceeds 50%.
  - alert: iowait
    # 5-minute iowait rate averaged by instance, scaled to a percentage.
    expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100) > 50'
    for: 5s
    labels:
      severity: critical
    annotations:
      summary: 'cpu iowait大于50% 当前值为{{ $value }}% '
  # Critical when the procs_zombie metric is above 20.
  # NOTE(review): procs_zombie has no node_ prefix and is not defined in this
  # file; presumably a custom metric. Confirm its source before relying on it.
  - alert: procs_zombie
    expr: 'procs_zombie > 20'
    for: 5s
    labels:
      severity: critical
    annotations:
      summary: ' procs_zombie 大于20 当前值为{{ $value }} '
  # Critical when the logined_users_total metric is above 25.
  # NOTE(review): logined_users_total has no node_ prefix and is not defined in
  # this file; presumably a custom metric. Confirm its source before relying on it.
  - alert: logined_users
    expr: 'logined_users_total > 25'
    for: 5s
    labels:
      severity: critical
    annotations:
      summary: 'logined_users 大于25 当前值为{{ $value }} '
posted @   huandada  阅读(2740)  评论(3)  编辑  收藏  举报
编辑推荐:
· 用 C# 插值字符串处理器写一个 sscanf
· Java 中堆内存和栈内存上的数据分布和特点
· 开发中对象命名的一点思考
· .NET Core内存结构体系(Windows环境)底层原理浅谈
· C# 深度学习:对抗生成网络(GAN)训练头像生成模型
阅读排行:
· 为什么说在企业级应用开发中,后端往往是效率杀手?
· 本地部署DeepSeek后,没有好看的交互界面怎么行!
· 趁着过年的时候手搓了一个低代码框架
· 用 C# 插值字符串处理器写一个 sscanf
· 推荐一个DeepSeek 大模型的免费 API 项目!兼容OpenAI接口!

阅读目录(Content)

此页目录为空

点击右上角即可分享
微信分享提示