Alertmanager对接Prometheus实战
目录
启动3种方式
二进制-Alertmanager官网下载
https://prometheus.io/download/#alertmanager
# Download the release tarball
wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
# Unpack it under /usr/local/
tar -xf alertmanager-0.24.0.linux-amd64.tar.gz -C /usr/local/
# Rename the unpacked directory; it lives in /usr/local/ because of the -C above
cd /usr/local/
mv alertmanager-0.24.0.linux-amd64 alertmanager
# Start Alertmanager in the background from its own directory,
# so that the relative paths below (./alertmanager, alertmanager.yml, ./amtool) resolve
cd alertmanager
nohup ./alertmanager &
# Edit the configuration file
vi alertmanager.yml
# Validate the configuration
./amtool check-config alertmanager.yml
docker-启动
# Run Alertmanager v0.24.0 detached, publishing port 9093 and mounting the host config file into the container
docker run -d \
  --name alertmanager \
  -p 9093:9093 \
  -v /usr/local/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  prom/alertmanager:v0.24.0
docker-compose-启动
# Run from the directory that holds the compose file; -d starts the containers in the background
docker-compose up -d
# Compose file that runs Alertmanager v0.24.0 as a single service.
version: '3'
services:
  app:
    container_name: alertmanager
    image: prom/alertmanager:v0.24.0
    ports:
      # host:container
      - "9093:9093"
    restart: unless-stopped
    volumes:
      # Mount the local alertmanager.yml over the config path inside the container
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
启动成功效果
访问:http://192.168.0.52:9093
配置文件
alertmanager.yml
注意:发件邮箱需要先开启 SMTP 服务(163 邮箱需使用授权码作为 smtp_auth_password)
alertmanager.yml
# Alertmanager configuration: send every alert by e-mail through the 163.com SMTP server.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:465'
  smtp_from: 'test1009122179@163.com'
  smtp_auth_username: 'test1009122179@163.com'
  # Placeholder - supply the real SMTP password / authorization code and keep it out of version control
  smtp_auth_password: 'XXXXXX'
  # Do not require STARTTLS.
  # NOTE(review): port 465 is expected to use implicit TLS regardless of this flag - confirm against the Alertmanager email docs
  smtp_require_tls: false
route:
  # Group alerts that share the same alert name
  group_by: ['alertname']
  # Wait 10s before the first notification of a new group, so alerts arriving together are batched
  group_wait: 10s
  # Wait 30m before notifying about new alerts added to a group that has already been notified
  group_interval: 30m
  # Re-send the notification for alerts that are still firing every 8h
  repeat_interval: 8h
  # Default receiver
  receiver: 'email'
receivers:
  - name: 'email'
    email_configs:
      # Recipient address(es), comma-separated
      - to: 'guyouyin@163.com'
inhibit_rules:
  # Mute 'warning' alerts while a 'critical' alert with the same
  # alertname, dev and instance label values is firing
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
prometheus.yml
在 prometheus.yml 中配置 Alertmanager 的地址(9093 端口)并加载报警规则文件;修改后需要重启 Prometheus 使配置生效
prometheus.yml
# Alertmanager instance(s) that Prometheus sends alerts to
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.0.52:9093'
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
# Relative globs are resolved against the directory that contains this config file.
rule_files:
  - "rules/*_rules.yml"
  - "rules/*_alerts.yml"
rules/node_rules.yml--记录规则(recording rules)配置文件,供下面的报警规则引用
node_rules.yml
# Recording rules that precompute CPU, memory and disk usage percentages.
groups:
  - name: node_rules
    # interval: 15s
    rules:
      # CPU: 100 minus the average idle rate (as a percentage), grouped by nodename.
      # NOTE(review): node_cpu_seconds_total does not carry a 'nodename' label by default;
      # this assumes the targets are relabeled to add it - otherwise group by (instance). Confirm.
      - record: instance:cpu_used
        expr: 100 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (nodename) * 100
        labels:
          metric_type: CPU_monitor
      # Memory: 100 minus MemAvailable as a percentage of MemTotal
      - record: instance:memory_used
        expr: 100 - (node_memory_MemAvailable_bytes)/(node_memory_MemTotal_bytes) * 100
        labels:
          metric_type: Memory_monitor
      # Disk: per instance, the highest used percentage across its filesystems,
      # computed as (size - free) * 100 / (avail + (size - free))
      - record: instance:partition_used
        expr: max((node_filesystem_size_bytes{}-node_filesystem_free_bytes{}) *100/(node_filesystem_avail_bytes {}+(node_filesystem_size_bytes{}-node_filesystem_free_bytes{})))by(instance)
        labels:
          metric_type: Partition_monitor
rules/node_alerts.yml--报警规则配置文件
node_alerts.yml
# Alerting rules built on the instance:* recording rules from node_rules.yml.
groups:
  - name: node_alerts
    rules:
      # CPU usage above 60% for 1 minute
      - alert: cpu_used
        expr: instance:cpu_used > 60
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: 主机 {{ $labels.nodename }} 的 CPU使用率持续1分钟超出阈值,当前为 {{humanize $value}} %
      # Memory usage above 80% for 1 minute
      - alert: memory_used
        expr: instance:memory_used > 80
        for: 1m
        # severity added so every alert in this group carries the label, like cpu_used
        labels:
          severity: warning
        annotations:
          summary: 主机 {{ $labels.nodename }} 的 内存 使用率持续1分钟超出阈值,当前为 {{humanize $value}} %
      # Disk usage above 90% for 1 minute
      - alert: partition_used
        expr: instance:partition_used > 90
        for: 1m
        # severity added so every alert in this group carries the label, like cpu_used
        labels:
          severity: warning
        annotations:
          # instance:partition_used is aggregated by(instance), so 'instance' is the
          # only host label left on the series; 'nodename' would render empty here
          summary: 主机 {{ $labels.instance }} 的 磁盘使用率已达到{{humanize $value}}%,即将超出当前可用空间,请及时扩容!
prometheus-docker-compose.yml
# Compose file that runs Prometheus v2.36.2 as a single service.
version: '3'
services:
  app:
    container_name: prometheus
    image: prom/prometheus:v2.36.2
    ports:
      # host:container
      - "9090:9090"
    restart: unless-stopped
    volumes:
      # - /etc/localtime:/etc/localtime:ro
      # Main configuration file
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      # Data directory kept on the host.
      # NOTE(review): ./data presumably must be writable by the user the container runs as - confirm
      - ./data:/prometheus
      # Rule files matched by the rule_files globs in prometheus.yml
      - ./rules:/etc/prometheus/rules
启动成功效果
访问:http://192.168.0.52:9090/alerts
收到报警
选择了IT,必定终身学习