准备:一套已经部署好的 prometheus 和 alertmanager
1. alertmanager服务简单邮件告警设置。
vim /export/service/alertmanager/alertmanager.yml
# Alertmanager configuration: send every alert by e-mail through QQ Mail's
# SMTP service to a single default receiver.
global:
  # SMTP relay, written as host:port.
  smtp_smarthost: 'smtp.qq.com:587'
  # Address placed in the From header of the notification mail.
  # NOTE(review): the (redacted) address differs from smtp_auth_username
  # below; confirm both refer to the same mailbox.
  smtp_from: 7414xxx2@qq.com
  # Sender account used for SMTP authentication.
  smtp_auth_username: 74142xxx@qq.com
  # Sender identity used for SMTP authentication (same account here).
  smtp_auth_identity: 74142xxx@qq.com
  # Mailbox password. For QQ Mail this is the service-specific secret
  # (authorization code), not the normal login password.
  # Do not commit a real secret to version control.
  smtp_auth_password: lxxx****uxgxxx

route:
  # Labels used to batch alerts into one notification; the alert rules have
  # to attach these labels for the grouping to take effect.
  # NOTE(review): the host rules in this guide aggregate `by (instance)` and
  # set no `node` label - confirm `node` is the intended grouping key.
  group_by: ['node']
  # Wait before sending the first notification for a new group.
  group_wait: 30s
  # Wait before notifying about new alerts added to an existing group.
  group_interval: 5m
  # Wait before re-sending a notification that is still firing.
  repeat_interval: 1h
  # Receiver used when no child route matches.
  receiver: 'default-receiver'

receivers:
  # The default receiver referenced by the route above.
  - name: 'default-receiver'
    email_configs:
      # Mailbox that receives the alert mails.
      - to: 'yanghz_2013@163.com'
        # Also send a mail when the alert is resolved.
        send_resolved: true

# Inhibition avoids duplicate notifications: while a `critical` alert is
# firing, `warning` alerts with identical alertname/dev/instance are muted.
# NOTE(review): the rules in this guide use `severity: page`, so this rule
# does not apply to them as written - confirm the intended severities.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
service alertmanager restart #重启服务。
设置prometheus告警规则
[root@test165 prometheus]# pwd
/export/service/prometheus
[root@test165 prometheus]# ls
console_libraries consoles data LICENSE nohup.out NOTICE prometheus prometheus.yml promtool rules
[root@test165 prometheus]# ls rules/
hoststats-alert.rules
[root@test165 prometheus]# vim rules/hoststats-alert.rules
# Host-level alerting rules evaluated by Prometheus against node_exporter
# metrics. Each rule must hold for 1 minute (`for: 1m`) before it fires.
groups:
  - name: hostStatsAlert
    rules:
      # Non-idle CPU share per instance above 85%.
      # Uses node_cpu_seconds_total (all CPU modes), the same metric as the
      # test rule below; node_cpu_guest_seconds_total only counts guest time.
      - alert: hostCpuUsageAlert
        expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance) > 0.85
        # Equivalent expression for node_exporter releases older than 0.16,
        # where the metric was still called node_cpu:
        # expr: sum(avg without (cpu)(irate(node_cpu{mode!='idle'}[5m]))) by (instance) > 0.85
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"

      # Test-only copy of the CPU rule with the threshold lowered from 0.85
      # to 0.55 so that an alert mail is easy to trigger and verify.
      - alert: hostCpuUsageAlert_test
        expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance) > 0.55
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 55% (current value: {{ $value }})"

      # Used memory share (total minus available, over total) above 85%.
      # Metric names carry the `_bytes` suffix, matching the node_exporter
      # naming generation of node_cpu_seconds_total used above.
      # For node_exporter releases older than 0.16 drop the suffix:
      # expr: (node_memory_MemTotal - node_memory_MemAvailable)/node_memory_MemTotal > 0.85
      - alert: hostMemUsageAlert
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.85
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} MEM usage high"
          description: "{{ $labels.instance }} MEM usage above 85% (current value: {{ $value }})"
测试:在 node_exporter 节点上使 CPU 耗尽。
[root@test166 node_exporter]# cat /dev/zero >/dev/null
等待一会儿,会收到告警邮件,证明测试成功。

浙公网安备 33010602011771号