Alertmanager对接Prometheus实战

3种启动方式

二进制-Alertmanager官网下载

https://prometheus.io/download/#alertmanager

# Download the v0.24.0 release tarball.
wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
# Extract it into /usr/local/.
tar -xf alertmanager-0.24.0.linux-amd64.tar.gz -C /usr/local/
# The extracted directory lives under /usr/local/, so switch there before renaming it.
cd /usr/local/
mv alertmanager-0.24.0.linux-amd64 alertmanager
# All remaining commands use paths relative to the install directory.
cd alertmanager
# Edit the configuration file.
vi alertmanager.yml
# Validate the configuration before starting the service.
./amtool check-config alertmanager.yml
# Start Alertmanager in the background so it picks up the edited configuration.
nohup ./alertmanager &

docker-启动

# Run Alertmanager v0.24.0 detached, publish port 9093, and bind-mount the host config file.
docker run -d \
  --name alertmanager \
  -p 9093:9093 \
  -v /usr/local/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  prom/alertmanager:v0.24.0

docker-compose-启动

# Create and start the services from the compose file in the current directory, detached (-d).
docker-compose up -d

# Compose definition that runs Alertmanager v0.24.0 as a single container.
version: '3'
services:
  app:
    image: prom/alertmanager:v0.24.0
    container_name: alertmanager
    restart: unless-stopped
    ports:
      # Publish container port 9093 on host port 9093.
      - "9093:9093"
    volumes:
      # Mount the local alertmanager.yml at /etc/alertmanager/alertmanager.yml in the container.
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml

启动成功效果

访问:http://192.168.0.52:9093

配置文件

alertmanager.yml

注意:用于发送告警邮件的邮箱需要先开启 SMTP 服务

alertmanager.yml

global:
  # Time after which an alert that carries no end time is declared resolved
  # if it has not been updated.
  resolve_timeout: 5m
  # Outgoing SMTP server (host:port) and sender identity.
  smtp_smarthost: 'smtp.163.com:465'
  smtp_from: 'test1009122179@163.com'
  smtp_auth_username: 'test1009122179@163.com'
  # Placeholder: replace with the real SMTP credential.
  smtp_auth_password: 'XXXXXX'
  # Do not require STARTTLS for the SMTP connection.
  smtp_require_tls: false

route:
  # Alerts that share these label values are batched into one notification group.
  group_by: ['alertname']
  # How long to wait before sending the first notification for a new group,
  # so alerts arriving within this window are sent together.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group
  # that has already been notified.
  group_interval: 30m
  # How long to wait before re-sending a notification for alerts that are still firing.
  repeat_interval: 8h
  # Receiver used by this (root) route.
  receiver: 'email'

receivers:
  - name: 'email'
    email_configs:
      # Recipient address; separate multiple addresses with commas.
      - to: 'guyouyin@163.com'

inhibit_rules:
  # Mute 'warning' alerts while a 'critical' alert with the same
  # alertname, dev and instance labels is firing.
  # Uses the matcher syntax; source_match/target_match are deprecated.
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'dev', 'instance']

prometheus.yml

在 prometheus.yml 配置文件中配置 Alertmanager 的地址(9093 端口),并加载报警规则文件

修改配置后需要重启 Prometheus 才能生效

prometheus.yml

# Alertmanager instances that Prometheus sends alerts to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.0.52:9093'

# Rule files are loaded once and then evaluated periodically according to the
# global 'evaluation_interval'.
rule_files:
  - 'rules/*_rules.yml'
  - 'rules/*_alerts.yml'

rules/node_rules.yml--报警规则配置文件

node_rules.yml

# Recording rules that precompute host usage percentages for the alert rules.
groups:
  - name: node_rules
    # interval: 15s
    rules:
      # CPU: 100 minus the average idle rate (as a percentage), grouped by nodename.
      # NOTE(review): the aggregation is by `nodename`; confirm the scraped
      # node_cpu_seconds_total series actually carry that label.
      - record: instance:cpu_used
        expr: 100 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (nodename) * 100
        labels:
          metric_type: CPU_monitor
      # Memory: 100 minus the available/total ratio (as a percentage).
      - record: instance:memory_used
        expr: 100 - (node_memory_MemAvailable_bytes)/(node_memory_MemTotal_bytes) * 100
        labels:
          metric_type: Memory_monitor
      # Disk: used / (avail + used) as a percentage, where used = size - free;
      # the maximum across series is kept per instance.
      - record: instance:partition_used
        expr: max((node_filesystem_size_bytes{}-node_filesystem_free_bytes{}) *100/(node_filesystem_avail_bytes {}+(node_filesystem_size_bytes{}-node_filesystem_free_bytes{})))by(instance)
        labels:
          metric_type: Partition_monitor

rules/node_alerts.yml--报警规则配置文件

node_alerts.yml

# Alerting rules evaluated against the recorded instance:* usage series.
groups:
  - name: node_alerts
    rules:
      # Fires when recorded CPU usage stays above 60% for one minute.
      - alert: cpu_used
        expr: instance:cpu_used > 60
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: '主机 {{ $labels.nodename }} 的 CPU使用率持续1分钟超出阈值,当前为 {{humanize $value}} %'

      # Fires when recorded memory usage stays above 80% for one minute.
      # NOTE(review): the summary relies on a `nodename` label being present on
      # instance:memory_used; confirm the scrape targets set it.
      - alert: memory_used
        expr: instance:memory_used > 80
        for: 1m
        labels:
          # Added for consistency with cpu_used, so severity-based inhibit rules can match.
          severity: warning
        annotations:
          summary: '主机 {{ $labels.nodename }} 的 内存 使用率持续1分钟超出阈值,当前为 {{humanize $value}} %'

      # Fires when recorded disk usage stays above 90% for one minute.
      - alert: partition_used
        expr: instance:partition_used > 90
        for: 1m
        labels:
          # Added for consistency with cpu_used; raise to critical if that fits your policy.
          severity: warning
        annotations:
          # instance:partition_used is aggregated by(instance), so `instance`
          # is the label that identifies the host here.
          summary: '主机 {{ $labels.instance }} 的 磁盘使用率已达到{{humanize $value}}%,即将超出当前可用空间,请及时扩容!'

prometheus-docker-compose.yml

# Compose definition that runs Prometheus v2.36.2 as a single container.
version: '3'
services:
  app:
    image: prom/prometheus:v2.36.2
    container_name: prometheus
    restart: unless-stopped
    ports:
      # Publish container port 9090 on host port 9090.
      - "9090:9090"
    volumes:
      # - /etc/localtime:/etc/localtime:ro
      # Main configuration file.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      # Host directory mounted at /prometheus in the container.
      - ./data:/prometheus
      # Rule files matched by the rules/*.yml globs in prometheus.yml.
      - ./rules:/etc/prometheus/rules

启动成功效果

访问:http://192.168.0.52:9090/alerts

收到报警

posted @ 2022-07-12 17:54  Jeff的技术栈  阅读(237)  评论(0)  编辑  收藏  举报
回顶部