Prometheus企业微信告警
自己注册一个企业微信,进入管理控制台。
在应用管理中点击创建应用
创建机器人
发送消息测试
发送消息测试,我这里可以正常收到消息
找到企业ID
找到机器人的AgentId
点击查看,把Secret保存下来
查看下部门ID
添加微信告警配置
[root@harbor harbor]# vim /apps/alertmanager/alertmanager.yml
- name: "wechat"
wechat_configs:
- corp_id: wwbf878f48d2348b76 #企业ID
#to_user: '@all' #所有人
to_party: 2 #部门ID
agent_id: 1000002 #机器人ID
api_secret: V7WnlfwUiSrUzRKTUCh6sgvKVd_UTZ_pBnGGuwUxLFQ #Secret
send_resolved: true
把发送告警的服务器IP添加到企业微信应用的"企业可信IP"中,设置完约1分钟后生效。
重启
systemctl restart alertmanager.service
去企业微信查看告警。
告警分类发送,根据标签匹配分类发送告警
标签为 severity: critical 的告警走邮件
标签为 service: magedu-pods 的告警发给钉钉
其余未匹配的告警都发给企业微信(默认接收人)
[root@harbor apps]# cat prometheus/rules/yzy_rules.yml
groups:
- name: alertmanager_pod.rules
rules:
- alert: Pod_all_cpu_usage
expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 1
for: 2m
labels:
#severity: critical
service: magedu-pods
annotations:
description: 容器 {{ $labels.name }} CPU 资源利用率大于10% , (current value is {{ $value }})
summary: Dev CPU 负载告警
- alert: Pod_all_memory_usage
#expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10% #内存大于10%
expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2147483648 #内存大于 2G
for: 2m
labels:
severity: critical
annotations:
description: 容器 {{ $labels.name }} Memory资源利用率大于 2G,(current value is {{ $value }})
summary: Dev Memory 负载告警
- alert: Pod_all_network_receive_usage
expr: sum by (name) (irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1
for: 2m
labels:
severity: critical
annotations:
description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})
- alert: node内存可用大小
expr: node_memory_MemFree_bytes < 4*1024*1024*1024 #故意写错的
for: 2m
labels:
severity: info
annotations:
description: node节点的可用内存小于4G
[root@harbor apps]# cat alertmanager/alertmanager.yml
global:
resolve_timeout: 1m
smtp_smarthost: 'smtp.qq.com:465'
smtp_from: '760478xxx@qq.com'
smtp_auth_username: '760478xxx@qq.com'
smtp_auth_password: 'sxcpymhdrkenbegd'
smtp_hello: '@qq.com'
smtp_require_tls: false
route:
group_by: ['alertname']
group_wait: 1s
group_interval: 5s
repeat_interval: 10s
receiver: 'wechat'
#添加路由信息
routes:
- receiver: web.hook #critical级别的消息发给邮件
group_wait: 10s
match_re:
severity: critical
- receiver: dingding.alertname #service: magedu-pods级别的消息发给钉钉
group_wait: 10s
match_re:
service: magedu-pods
receivers:
- name: 'wechat'
wechat_configs:
- corp_id: 'wwbf878f48d2348b76'
to_party: '2'
agent_id: '1000002'
api_secret: 'V7WnlfwUiSrUzRKTUCh6sgvKVd_UTZ_pBnGGuwUxLFQ'
send_resolved: true
- name: 'dingding.alertname'
webhook_configs:
- url: 'http://10.211.55.26:8060/dingtalk/alertname/send' #配置dingtalk的地址和端口
send_resolved: true
- name: 'web.hook'
email_configs:
- to: '1500120xxxx@163.com'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
设置微信告警模板
[root@harbor alertmanager]# cat message_template.templ
{{ define "wechat.default.message" }}
{{ range $i, $alert := .Alerts }}
===alertmanager 微信监控报警===
告警状态: {{ .Status }}
告警级别: {{ $alert.Labels.severity }}
告警类型: {{ $alert.Labels.alertname }}
告警应用: {{ $alert.Annotations.summary }}
故障主机: {{ $alert.Labels.instance }}
告警主题: {{ $alert.Annotations.summary }}
触发阀值: {{ $alert.Annotations.value}}
告警详情: {{ $alert.Annotations.description }}
触发时间: {{ $alert.StartsAt.Format "2006-01-02 15:04:05" }}
===========end=========
{{ end }}
{{ end }}
配置alertmanager.yml文件,配置完后重启