Prometheus之主机告警rules配置

prometheus.yml

# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
           - 10.0.9.15:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "rules/*.yaml"
  - "alert_rules/*.yaml"


# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]

  - job_name: "node"
    static_configs:
      - targets: ["localhost:9100"]
        labels:
          appname: "prometheus-01"

hosts.yaml

 /usr/local/prometheus/rules/hosts.yaml

groups:
- name: Host
  rules:
  - alert: HostMemory Usage
    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 >  90
    for: 1m
    labels:
      name: Memory
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: "宿主机内存使用率超过90%."
      value: "{{ $value }}%"
  - alert: HostCPU Usage
    expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.8
    for: 1m
    labels:
      name: CPU
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: "宿主机CPU使用率超过80%."
      value: "{{ $value }}%"
  - alert: HostLoad
    expr: node_load5 > 20
    for: 1m
    labels:
      name: Load
      severity: Warning
    annotations:
      summary: "{{ $labels.appname }} "
      description: " 主机负载5分钟超过20."
      value: "{{ $value }}"
  - alert: HostFilesystem Usage
    expr: (node_filesystem_size_bytes-node_filesystem_free_bytes)/node_filesystem_size_bytes*100>80
    for: 1m
    labels:
      name: Disk
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: " 宿主机 [ {{ $labels.mountpoint }} ]分区使用超过80%."
      value: "{{ $value }}%"
  - alert: HostDiskio writes
    expr: irate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
    for: 1m
    labels:
      name: Diskio
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高."
      value: "{{ $value }}iops"
  - alert: HostDiskio reads
    expr: irate(node_disk_reads_completed_total{job=~"Host"}[1m]) > 10
    for: 1m
    labels:
      name: Diskio
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均读取IO负载较高."
      value: "{{ $value }}iops"
  - alert: HostNetwork_receive
    expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576  > 10
    for: 1m
    labels:
      name: Network_receive
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: " 宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过10Mbps."
      value: "{{ $value }}3Mbps"
  - alert: hostNetwork_transmit
    expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576  > 10
    for: 1m
    labels:
      name: Network_transmit
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: " 宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过10Mbps."
      value: "{{ $value }}10Mbps"

告警实例

示例

groups:

- name: NodeExporter

  rules:
    - alert: HostOutOfMemory
      expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5'
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: Host out of memory (instance {{ $labels.instance }})
        description: "Node memory is filling up (< 5% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


    - alert: HostOutOfDiskSpace < 20% left
    #  expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
      expr: |
        (
            (node_filesystem_avail_bytes{instance!="21.40.80.240:9100"} * 100) / node_filesystem_size_bytes{instance!="21.40.80.240:9100"} < 20 or
            (node_filesystem_avail_bytes{module!="ZCM"} * 100) / node_filesystem_size_bytes{module!="ZCM"} < 20
        ) and ON (instance, device, mountpoint) (node_filesystem_readonly{instance!="21.40.80.240:9100"} == 0 and node_filesystem_readonly{module!="ZCM"} == 0)
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host out of disk space (instance {{ $labels.instance }})
        description: "Disk is almost full (< 20% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


    - alert: HostOutOfDiskSpace < 10% left
    #  expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 20 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
      expr: |
          (node_filesystem_avail_bytes{module="ZCM"} * 100) / node_filesystem_size_bytes{module="ZCM"} < 10
          and ON (instance, device, mountpoint) node_filesystem_readonly{module="ZCM"} ==0 
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host out of disk space (instance {{ $labels.instance }})
        description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

        
    - alert: HostOutOfInodes
      expr: 'node_filesystem_files_free / node_filesystem_files * 100 < 20 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host out of inodes (instance {{ $labels.instance }})
        description: "Disk is almost running out of available inodes (< 20% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostHighCpuLoad
      expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host high CPU load (instance {{ $labels.instance }})
        description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

 
    - alert: HostCpuStealNoisyNeighbor
      expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
        description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


    - alert: HostSystemdServiceCrashed
      expr: 'node_systemd_unit_state{state="failed"} == 1'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host systemd service crashed (instance {{ $labels.instance }})
        description: "systemd service crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostNodeOvertemperatureAlarm
      expr: 'node_hwmon_temp_crit_alarm_celsius == 1'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Host node overtemperature alarm (instance {{ $labels.instance }})
        description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostRaidArrayGotInactive
      expr: 'node_md_state{state="inactive"} > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Host RAID array got inactive (instance {{ $labels.instance }})
        description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostRaidDiskFailure
      expr: 'node_md_disks{state="failed"} > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host RAID disk failure (instance {{ $labels.instance }})
        description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostRaidMembersDiskRemoveOrOtherError
      expr: node_md_disks{state="active"}%2==1
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Host RAID disk remove or other error (instance {{ $labels.instance }})
        description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostOomKillDetected
      expr: 'increase(node_vmstat_oom_kill[1m]) > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host OOM kill detected (instance {{ $labels.instance }})
        description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostEdacCorrectableErrorsDetected
      expr: 'increase(node_edac_correctable_errors_total[1m]) > 5'
      for: 5m
      labels:
        severity: info
      annotations:
        summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostEdacUncorrectableErrorsDetected
      expr: 'node_edac_uncorrectable_errors_total > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostNetworkReceiveErrors
      expr: 'rate(node_network_receive_errs_total[5m]) / rate(node_network_receive_packets_total[5m]) > 0.01'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host Network Receive Errors (instance {{ $labels.instance }})
        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostNetworkTransmitErrors
      expr: 'rate(node_network_transmit_errs_total[5m]) / rate(node_network_transmit_packets_total[5m]) > 0.01'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host Network Transmit Errors (instance {{ $labels.instance }})
        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostNetworkReceiveHighUtil
      expr: '100*(rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*|^.*\\..*"}[2m]) / (node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*|^.*\\..*"})) > 80'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host unusual network throughput in (instance {{ $labels.instance }})
        description: "Host network interfaces receiving utils > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostNetworkTransmitHighUtil
      expr: '100*(rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*|^.*\\..*"}[2m]) / (node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*|^.*\\..*"})) > 80'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host unusual network throughput out (instance {{ $labels.instance }})
        description: "Host network interfaces sending utils > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostNetworkBondDegraded
      expr: '(node_bonding_active - node_bonding_slaves) != 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Bond Degraded (instance {{ $labels.instance }})
        description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostNetworkBondMemberMissed
      expr: 'node_bonding_active !=2 or node_bonding_slaves !=2'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host Network Bond Member messed (instance {{ $labels.instance }})
        description: "Bond \"{{ $labels.device }}\" member messed on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostConntrackLimit
      expr: 'node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host conntrack limit (instance {{ $labels.instance }})
        description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostClockSkew
      expr: '(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host clock skew (instance {{ $labels.instance }})
        description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostClockNotSynchronising
      expr: 'min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host clock not synchronising (instance {{ $labels.instance }})
        description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostReboot
      expr: '(time() - node_boot_time_seconds) < 86400'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host detect reboot (instance {{ $labels.instance }})
        description: "{{ $labels.instance }} detect reboot < 1 day.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

zookeeper_exporter_rules.yml

  groups:
  - name: zk-alert-example
    rules:
    - alert: create too many znodes
      expr: znode_count > 1000000
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} create too many znodes"
        description: "{{ $labels.instance }} of job {{$labels.job}} create too many znodes: [{{ $value }}]."

    - alert: create too many connections
      expr: num_alive_connections > 50 # suppose we use the default maxClientCnxns: 60
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} create too many connections"
        description: "{{ $labels.instance }} of job {{$labels.job}} create too many connections: [{{ $value }}]."

    - alert: znode total occupied memory is too big
      expr: approximate_data_size /1024 /1024 > 1 * 1024 # more than 1024 MB(1 GB)
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} znode total occupied memory is too big"
        description: "{{ $labels.instance }} of job {{$labels.job}} znode total occupied memory is too big: [{{ $value }}] MB."

    - alert: set too many watch
      expr: watch_count > 30000
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} set too many watch"
        description: "{{ $labels.instance }} of job {{$labels.job}} set too many watch: [{{ $value }}]."

    - alert: a leader election happens
      expr: increase(election_time_count[5m]) > 0
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} a leader election happens"
        description: "{{ $labels.instance }} of job {{$labels.job}} a leader election happens: [{{ $value }}]."

    - alert: open too many files
      expr: open_file_descriptor_count > 300
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} open too many files"
        description: "{{ $labels.instance }} of job {{$labels.job}} open too many files: [{{ $value }}]."

    - alert: fsync time is too long
      expr: rate(fsynctime_sum[1m]) > 100
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} fsync time is too long"
        description: "{{ $labels.instance }} of job {{$labels.job}} fsync time is too long: [{{ $value }}]."

    - alert: take snapshot time is too long
      expr: rate(snapshottime_sum[5m]) > 100
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} take snapshot time is too long"
        description: "{{ $labels.instance }} of job {{$labels.job}} take snapshot time is too long: [{{ $value }}]."

    - alert: avg latency is too high
      expr: avg_latency > 100
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Instance {{ $labels.instance }} avg latency is too high"
        description: "{{ $labels.instance }} of job {{$labels.job}} avg latency is too high: [{{ $value }}]."

    - alert: JvmMemoryFillingUp
      expr: jvm_memory_bytes_used / jvm_memory_bytes_max{area="heap"} > 0.8
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "JVM memory filling up (instance {{ $labels.instance }})"
        description: "JVM memory is filling up (> 80%)\n labels: {{ $labels }}  value = {{ $value }}\n"

kafka_exporter_rules.yml

groups:

- name: PrometheusCommunityKafkaExporter-kafka

  rules:

    - alert: kafka消费积压告警
      expr: |
           sum(kafka_consumergroup_lag{instance=~"xxxx:9308",topic=~"xxx",job="kafka_exporter",module="日志",consumergroup="syslog-consumer"}) by (consumergroup, topic) > 100000
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: kafka消费积压 topic(syslog)消费积压 (instance {{ $labels.instance }})
        description: "日志中心kafka消费积压 topic(syslog)消费积压 >= 100000\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

elasticsearch_exporter_rules.yml

groups:

- name: PrometheusCommunityElasticsearchExporter-my

  rules:

    - alert: ElasticsearchThreadPoolQueueCountHigh
      expr: |
            sum(elasticsearch_thread_pool_queue_count{cluster=~"ilog_cluster", type=~"search"}) by (name) >= 600
            unless
            absent(elasticsearch_thread_pool_queue_count{cluster=~"ilog_cluster", type=~"search"})
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch Thread Pool Queue Count Too High (instance {{ $labels.instance }})
        description: "The thread pool queue count >= 300\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

 

posted @ 2021-11-24 15:18  小吉猫  阅读(535)  评论(0)    收藏  举报