alertmanager实现自定义WebHook及路由(route)分组实战

                                              作者:尹正杰

版权声明:原创作品,谢绝转载!否则将追究法律责任。

一.alertmanager环境部署

1.alertmanager核心功能

deduplicating:
	Prometheus产生同一条报警发送给多个alertmanager去重后再发送。
	
grouping:
	告警可以分组处理,同一个组共享等待时长等参数,可以做告警聚合。
	
route:
	路由匹配树,可以理解为告警订阅。
	
silencing:
	灵活的告警静默,如按tag。
	
inhibition:
	如果某些其它报警已经触发,则抑制另一些警报的通知。例如机器down时,其上的进程down告警不再触发。

HA:
	基于gossip可以实现alertmanager的高可用性。

2.部署alertmanager

	1.下载alertmanager
[root@alertmanager44 ~]# wget https://github.com/prometheus/alertmanager/releases/download/v0.27.0/alertmanager-0.27.0.linux-amd64.tar.gz


	2.创建工作目录
[root@alertmanager44 ~]# mkdir -pv /yinzhengjie/{softwares,data}


	3.解压软件包
[root@alertmanager44 ~]# tar xf alertmanager-0.27.0.linux-amd64.tar.gz -C /yinzhengjie/softwares/

	4.编写启动脚本
[root@alertmanager44 ~]# cat > /lib/systemd/system/alertmanager.service <<EOF
[Unit]
Description=yinzhengjie's alertmanager Server
Documentation=https://www.cnblogs.com/yinzhengjie
After=network.target

[Service]
ExecStart=/yinzhengjie/softwares/alertmanager-0.27.0.linux-amd64/alertmanager --config.file=/yinzhengjie/softwares/alertmanager-0.27.0.linux-amd64/alertmanager.yml --storage.path=/yinzhengjie/data/alertmanager

[Install]
WantedBy=multi-user.target
EOF


	5.启动服务
[root@alertmanager44 ~]# systemctl daemon-reload 
[root@alertmanager44 ~]# 
[root@alertmanager44 ~]# systemctl enable --now alertmanager.service 
Created symlink /etc/systemd/system/multi-user.target.wants/alertmanager.service → /lib/systemd/system/alertmanager.service.
[root@alertmanager44 ~]# 


	6.访问alertmanager的WebUI
http://10.0.0.44:9093/

3.默认配置文件解读

# 定义的是全局配置
global:
  # 如果一个告警不包括EndsAt,经过此时间后,如果尚未更新警报,则可以将警报声明为已恢复。
  # 这对Prometheus的报警没有任何影响,因为它们始终包含EndsAt
  resolve_timeout: 5m
  # 默认的HTTP客户端配置,当下面的receiver(如webhook)未单独配置http_config时使用此配置。
  http_config:
    follow_redirects: true
    enable_http2: true
  # SMTP配置。
  smtp_hello: localhost
  smtp_require_tls: true
  # 几个默认支持地址,企业常用的是企业微信,钉钉告警,邮件告警等媒介。
  pagerduty_url: https://events.pagerduty.com/v2/enqueue
  opsgenie_api_url: https://api.opsgenie.com/
  wechat_api_url: https://qyapi.weixin.qq.com/cgi-bin/
  victorops_api_url: https://alert.victorops.com/integrations/generic/20131114/alert/
  telegram_api_url: https://api.telegram.org
  webex_api_url: https://webexapis.com/v1/messages
# 配置路由  
route:
  # 代表路由树的默认receiver,如果匹配不中路由就走默认的路由。
  receiver: web.hook
  # 定义分组所依据的标签,这些标签值相同的告警会被聚合到同一组
  group_by:
  - alertname
  continue: false
  # 代表新的报警最小聚合时间,第一次来的时候最短间隔。
  group_wait: 30s
  # 代表同一个组里面告警聚合时间,同一个group_by里面不同tag的聚合时间。
  group_interval: 5m
  # 代表同一个报警(label完全相同)的最小发送间隔。
  repeat_interval: 1h
# 配置抑制告警规则  
inhibit_rules:
  # 当存在severity为critical的告警时,抑制alertname、dev、instance标签值均相同的warning告警。
- source_match:
    severity: critical
  target_match:
    severity: warning
  equal:
  - alertname
  - dev
  - instance
# 接收者配置  
receivers:
- name: web.hook
  webhook_configs:
  - send_resolved: true
    http_config:
      follow_redirects: true
      enable_http2: true
    url: <secret>
    url_file: ""
    max_alerts: 0
# 定义文本模板    
templates: []

二.编写go程序充当告警触发端和接收端

1 发送告警的接口

接口地址:
	https://prometheus.io/docs/alerting/latest/clients/
	
依赖的公共库:
	https://github.com/prometheus/common

2 发送消息到alertmanager

package main

import (
	"bytes"
	"encoding/json"
	"io/ioutil"
	"log"
	"net/http"

	"github.com/prometheus/common/model"
)

// AlertSend builds a single test alert and POSTs it, encoded as a JSON array,
// to the Alertmanager endpoint given by alertmanagerAddress (for example
// "http://10.0.0.44:9093/api/v2/alerts").
//
// Nothing is returned: the response status, headers and body are logged on
// success. Any failure (JSON encoding, request construction, transport, or
// reading the response body) is logged and ends the call early.
func AlertSend(alertmanagerAddress string) {
	labels := model.LabelSet{
		"alertname": "杰哥讲运维,报警测试",
		"group":     "yinzhengjie",
		"severity":  "2",
		"job":       "yinzhengjie-k8s",
	}

	annotations := model.LabelSet{
		"value": "2024",
	}

	// The Alert fields mirror the client payload documented at
	// https://prometheus.io/docs/alerting/latest/clients/
	alerts := []*model.Alert{
		&model.Alert{
			Labels:       labels,
			Annotations:  annotations,
			GeneratorURL: "https://www.yinzhengjie.com:9090",
		},
	}

	jsonStr, err := json.Marshal(alerts)
	if err != nil {
		log.Printf("json.marshal.err|err:%v", err)
		return
	}

	// http.NewRequest returns a nil request on error (e.g. an unparsable URL);
	// bail out before touching req so we do not dereference nil.
	req, err := http.NewRequest("POST", alertmanagerAddress, bytes.NewBuffer(jsonStr))
	if err != nil {
		log.Printf("http.new.request.err|url:%v|err:%v", alertmanagerAddress, err)
		return
	}
	req.Header.Set("Content-Type", "application/json")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		log.Printf("http.post.requst.err|url:%v|err:%v", alertmanagerAddress, err)
		return
	}
	defer resp.Body.Close()

	log.Printf("response Status:%v", resp.Status)
	log.Printf("response Headers:%v", resp.Header)

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Printf("http.read.body.err|url:%v|err:%v", alertmanagerAddress, err)
		return
	}
	log.Printf("response Body:%v", string(body))
}

// main sends one test alert to the Alertmanager v2 alerts endpoint.
func main() {
	const alertmanagerAddress = "http://10.0.0.44:9093/api/v2/alerts"
	AlertSend(alertmanagerAddress)
}

3.接收告警端代码

3.1 修改alertmanager的配置文件

[root@alertmanager44 ~]# cat /yinzhengjie/softwares/alertmanager-0.27.0.linux-amd64/alertmanager.yml 
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://10.0.0.41:8888/alert'
        http_config: {}
        max_alerts: 0
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
[root@alertmanager44 ~]# 

3.2 源代码

package main

import (
	"errors"
	"fmt"
	"log"
	"net/http"
    "flag"
	"github.com/gin-gonic/gin"
	"github.com/prometheus/alertmanager/notify/webhook"
)

// AlertReceiveFunc is the gin handler that receives Alertmanager webhook
// notifications.
//
// It decodes the JSON request body into a webhook.Message, logs the overall
// status and the number of alerts, logs every alert individually, and replies
// with HTTP 200 and the JSON string "ok". A body that cannot be decoded, or
// that carries no notification payload, is answered with HTTP 400 and a JSON
// object of the form {"error": "invalid args"}.
func AlertReceiveFunc(c *gin.Context) {
	var msg webhook.Message

	// An error value has no exported fields and would be serialised as "{}",
	// so the reply carries its message text instead.
	errInvalidArgs := errors.New("invalid args")

	// ShouldBindJSON only reports the error; per the gin documentation BindJSON
	// would already abort with 400 itself, before the reply written below.
	if err := c.ShouldBindJSON(&msg); err != nil {
		log.Printf("alertReceive|bind.err:%v", err)
		c.JSON(http.StatusBadRequest, gin.H{"error": errInvalidArgs.Error()})
		return
	}

	// NOTE(review): webhook.Message embeds *template.Data; a body without any
	// payload fields (e.g. "{}") appears to leave it nil, which would make the
	// msg.Status access below panic — confirm against the alertmanager version
	// in use.
	if msg.Data == nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": errInvalidArgs.Error()})
		return
	}

	baseMsg := fmt.Sprintf("状态: %s|报警条数: %d", msg.Status, len(msg.Alerts))

	log.Printf("alertReceive|baseMsg:%+v", baseMsg)

	total := len(msg.Alerts)
	for i, alert := range msg.Alerts {
		// Print "index/total"; without a separator the two numbers run together.
		log.Printf("detail|%d/%d|alert:%+v", i+1, total, alert)
	}

	c.JSON(http.StatusOK, "ok")
}

// main starts the webhook receiver: it registers AlertReceiveFunc on
// POST /alert and serves on the address given by the -addr flag
// (default ":8888").
func main() {
	listenAddr := flag.String("addr", ":8888", "WebUI expose port")
	flag.Parse()

	r := gin.Default()

	r.POST("/alert", AlertReceiveFunc)

	// Run returns an error when the server cannot start or stops
	// unexpectedly; report it and exit non-zero instead of dropping it.
	if err := r.Run(*listenAddr); err != nil {
		log.Fatalf("alert-receive|listen:%v|err:%v", *listenAddr, err)
	}
}

3.3 测试结果验证

如上图所示,我们可以成功看到日志的输出信息。

三.alertmanager分组功能

1.启动3个webhook测试终端

如上图所示,我们启动3个不同的终端测试,用于模拟3个不同的Webhook。

[root@node-exporter42 yinzhengjie]# ./alert-receive --addr :6666

[root@node-exporter42 yinzhengjie]# ./alert-receive --addr :7777

[root@node-exporter42 yinzhengjie]# ./alert-receive --addr :8888

2.alertmanager配置分组

分组说明:
    - alertmanager可以根据设置的路由将告警可以分组处理,发送给对应的接收端;
    - 我们可以定义三个接收组:
        - sre_system
            定义系统组,接收所有服务器机器告警。

        - sre_dba
            定义数据库组,接收数据库服务器告警。

        - sre_bigdata
            定义大数据组,接收大数据集群服务器告警。
 
            
alertmanager配置案例:
[root@prometheus-server31 ~]# cat /yinzhengjie/softwares/alertmanager-0.27.0.linux-amd64/alertmanager.yml 
route:
  group_by: ['alertname']
  group_wait: 1s
  group_interval: 3s
  repeat_interval: 1m
  # 默认发给"sre_system"组用户
  receiver: 'sre_system'
  continue: false
  # 配置子路由
  routes:
    - receiver: 'sre_dba'
      match_re:
        job: yinzhengjie_mysqld_exporter
      # 建议将continue的值设置为true,表示当前路由匹配成功后,仍会继续向下匹配后续的同级路由;
      # 这样做的目的是让消息同时也发给最后的系统组(sre_system)。
      continue: true
    - receiver: 'sre_bigdata'
      match_re:
        job: yinzhengjie_bigdata_exporter
      continue: true
    - receiver: 'sre_system'
      match_re:
        job: .*
      continue: true
receivers:
  - name: 'sre_system'
    webhook_configs:
      - url: 'http://10.0.0.42:6666/alert'
        http_config: {}
        max_alerts: 0
        send_resolved: true
  - name: 'sre_dba'
    webhook_configs:
      - url: 'http://10.0.0.42:7777/alert'
        http_config: {}
        max_alerts: 0
        send_resolved: true
  - name: 'sre_bigdata'
    webhook_configs:
      - url: 'http://10.0.0.42:8888/alert'
        http_config: {}
        max_alerts: 0
        send_resolved: true
[root@prometheus-server31 ~]# 

3.修改Prometheus的配置文件

[root@prometheus-server31 ~]# vim /yinzhengjie/softwares/prometheus-2.53.2.linux-amd64/prometheus.yml 
...
global:
  scrape_interval: 3s
...
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093      
...
rule_files:
  - "/yinzhengjie/softwares/prometheus-2.53.2.linux-amd64/yinzhengjie_rules.yml"  
 
...
scrape_configs:
  ...
  - job_name: "yinzhengjie_mysqld_exporter"
    static_configs:
      - targets: ["10.0.0.42:9100","10.0.0.43:9100"]

  - job_name: "yinzhengjie_bigdata_exporter"
    static_configs:
      - targets: ["10.0.0.31:9100","10.0.0.41:9100"]     

4.定义规则

	1.编写规则文件
[root@prometheus-server31 ~]# cat /yinzhengjie/softwares/prometheus-2.53.2.linux-amd64/yinzhengjie_rules.yml 
groups:
- name: xixi
  rules:
    - alert: yinzhengjie_mysqld_exporter-alert
      expr: node_boot_time_seconds{instance="10.0.0.42:9100", job="yinzhengjie_mysqld_exporter"} > 0
      labels:
        severity: critical
        blog: "https://www.cnblogs.com/yinzhengjie"
      annotations:
        summary: DBA机器异常

- name: haha
  rules:
    - alert: yinzhengjie_bigdata_exporter-alert
      expr: node_boot_time_seconds{instance="10.0.0.41:9100", job="yinzhengjie_bigdata_exporter"} > 0
      labels:
        severity: warning
        auther: 尹正杰
      annotations:
        summary: 大数据集群机器异常
[root@prometheus-server31 ~]# 



	2.检查配置文件是否正确
[root@prometheus-server31 ~]# /yinzhengjie/softwares/prometheus-2.53.2.linux-amd64/promtool check config /yinzhengjie/softwares/prometheus-2.53.2.linux-amd64/prometheus.yml 
Checking /yinzhengjie/softwares/prometheus-2.53.2.linux-amd64/prometheus.yml
  SUCCESS: 1 rule files found
 SUCCESS: /yinzhengjie/softwares/prometheus-2.53.2.linux-amd64/prometheus.yml is valid prometheus config file syntax

Checking /yinzhengjie/softwares/prometheus-2.53.2.linux-amd64/yinzhengjie_rules.yml
  SUCCESS: 2 rules found

[root@prometheus-server31 ~]# 

5.测试生效

如上图所示,我们可以直接测试重启一下Prometheus和alertmanager服务,观察webhook接收的数据分组情况。

[root@prometheus-server31 ~]# systemctl restart prometheus-server.service alertmanager.service  
posted @ 2024-11-12 02:04  尹正杰  阅读(47)  评论(0)  编辑  收藏  举报