Prometheus Alertmanager 集成钉钉告警

Prometheus Alertmanager 集成钉钉告警

安装 Prometheus、Alertmanager

1. 使用 docker-compose 集成 Prometheus 和 Alertmanager

cat docker-compose.yml

# Compose stack that runs Prometheus and Alertmanager side by side.
# Both services use host networking, so they reach each other via localhost.
version: "3"
services:
  prometheus:
    image: prom/prometheus:v2.35.0
    container_name: prometheus
    hostname: prometheus
    volumes:
      # Mount the whole config directory instead of the single file, so that
      # everything placed under ./prometheus is visible in /etc/prometheus.
      # - "./prometheus.yml:/etc/prometheus/prometheus.yml"
      - "./prometheus:/etc/prometheus"
      # Share the host timezone; read-only because the container must never
      # modify the host's file.
      - "/etc/localtime:/etc/localtime:ro"
      # TSDB data directory kept on the host.
      - "./data:/prometheus"
    restart: on-failure
    network_mode: "host"
    logging:
      driver: "json-file"
      options:
        tag: prometheus
    # NOTE: the previous `cap_add: [ALL]` was removed. Granting every Linux
    # capability is unnecessary for this service and widens the attack surface.
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      # Enables the admin HTTP API.
      - "--web.enable-admin-api"
      # Enables the lifecycle HTTP endpoints (e.g. config reload).
      - "--web.enable-lifecycle"
  alertmanager:
    image: prom/alertmanager:v0.24.0
    container_name: alertmanager
    hostname: alertmanager
    restart: on-failure
    network_mode: "host"
    logging:
      driver: "json-file"
      options:
        # Was "prometheus" (copy-paste); tag the logs with this service's name.
        tag: alertmanager
    # NOTE: `cap_add: [ALL]` removed here as well, for the same reason.
    volumes:
      # Holds config.yml and the notification template.
      - "./alertmanager/:/etc/alertmanager/"
    command:
      - "--config.file=/etc/alertmanager/config.yml"
      - "--storage.path=/alertmanager"

2.准备Prometheus配置文件

prometheus.yml

cat prometheus.yml

# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Was "alertmanagers:9093", a host name that is defined nowhere.
            # Alertmanager runs with host networking in docker-compose.yml,
            # so it is reached through localhost.
            - "localhost:9093"

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/etc/prometheus/rules/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations: Prometheus itself plus three jobs whose targets are
# discovered from files.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]

  # File-based service discovery: targets are read from the matching files
  # and re-read every `refresh_interval`.
  - job_name: "actuator_health"
    metrics_path: "/actuator/prometheus"
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "./service_endpoint*.yml"

  - job_name: "docker"
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "./docker_endpoint*.yml"

  - job_name: "node-exporter"
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - "./node-exporter*.yml"

service_endpoint_all.yml

cat service_endpoint_all.yml

# File-SD target groups matched by the "./service_endpoint*.yml" glob.
# NOTE(review): "ip" looks like a placeholder for the real host - confirm.
- labels:
    servicename: sname01
  targets: ["ip:20006"]
- labels:
    servicename: sname01
  targets: ["ip:20005"]
node-exporter-all.yml

cat node-exporter-all.yml

# File-SD target groups matched by the "./node-exporter*.yml" glob.
# Each group attaches a `hostname` label to its target.
- labels:
    hostname: node-01
  targets:
    - "ip:7100"
- labels:
    hostname: node-02
  targets:
    - "ip:7100"
- labels:
    hostname: node-03
  targets:
    - "ip:7100"
docker_endpoint_all.yml

cat docker_endpoint_all.yml

# File-SD target groups matched by the "./docker_endpoint*.yml" glob.
# Each group attaches a `hostname` label to its target.
- labels:
    hostname: env-mid
  targets:
    - "ip:7080"
- labels:
    hostname: env-ap-02
  targets:
    - "ip:7080"
- labels:
    hostname: env-ap-01
  targets:
    - "ip:7080"

3.准备告警规则

service_alter.yml

cat service_alter.yml # 此规则的 labels 和 annotations 将用于下面的告警模板

# Rule group with a single alert that fires when a scraped service of the
# "actuator_health" job reports up == 0 for 10 seconds.
groups:
  - name: Service_Down
    rules:
      - alert: 服务下线通知
        # Variant without the exclusion:
        # expr: up{job="actuator_health"}==0
        # One servicename is excluded from the check.
        expr: 'up{job="actuator_health",servicename!="iot-aircraft_192.168.0.22"}==0'
        for: 10s
        # Extra labels attached to the alert; `sname` copies the target's
        # servicename label.
        labels:
          user: prometheus
          severity: warning
          env: prod
          sname: '{{ $labels.servicename }}'
        # Free-form texts carried with the alert.
        annotations:
          summary: '{{ $labels.servicename }} 服务下线'
          description: '{{ $labels.servicename }} of job {{ $labels.job }} has been Down.'
          title: '{{ $labels.servicename }} 服务状态告警'
[root@prometheus rules]# 

4. 准备 Alertmanager 配置文件

config.yml

cat config.yml

global:
  # An alert that is not re-sent within this time is treated as resolved.
  resolve_timeout: 1m

# Custom notification template
templates:
  - "/etc/alertmanager/dingtalk.tmpl"

route:
  # Default receiver
  receiver: "devops"
  # Label(s) used to group alerts into one notification.
  # Was ['Service_Down'], which is the name of the rule group, not a label on
  # the alerts; group_by expects label names.
  group_by: ["alertname"]
  # How long to wait after the first alert of a new group, so that other
  # alerts of the same group can be sent together.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group that
  # has already been notified.
  group_interval: 10s
  # How long to wait before re-sending a notification that is still firing;
  # keeps identical notifications from being repeated too often.
  repeat_interval: 1h
  # Child routes; alerts that match none of them use the default receiver.
  routes:
    - receiver: "devops"
      group_wait: 10s
      # `match` is deprecated; `matchers` is the supported form since
      # Alertmanager v0.22.
      matchers:
        - team="DevOps"

receivers:
  # The name must match the target name configured in the dingtalk webhook.
  - name: "devops"
    webhook_configs:
      - url: "http://192.168.0.28:8060/dingtalk/devops/send"
        # Also notify when the alert is resolved
        send_resolved: true

启动容器

docker-compose up -d

安装dingtalk

1.下载安装包

# Install under /opt
cd /opt
# Download the v2.1.0 linux-amd64 release archive from GitHub
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
# Unpack the archive
tar xvf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
# Rename the unpacked directory to a version-independent name
mv prometheus-webhook-dingtalk-2.1.0.linux-amd64 prometheus-webhook-dingtalk

2.配置系统服务托管

## cat /usr/lib/systemd/system/dingtalk.service

# systemd unit that runs prometheus-webhook-dingtalk from /opt.
[Unit]
# Was misspelled "Descripton", which systemd ignores as an unknown key.
Description=dingtalk
Documentation=https://github.com/timonwong/prometheus-webhook-dingtalk/
After=network.target

[Service]
# Restart the process when it exits with a failure.
Restart=on-failure
WorkingDirectory=/opt/prometheus-webhook-dingtalk
# --web.enable-ui turns on the web UI.
ExecStart=/opt/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/opt/prometheus-webhook-dingtalk/config.yml --web.enable-ui

[Install]
WantedBy=multi-user.target

# Start from the command line
# Reload unit files so systemd sees dingtalk.service
systemctl daemon-reload
# Start the service automatically at boot
systemctl enable dingtalk.service
# Start it now and show its state
systemctl start dingtalk.service
systemctl status dingtalk.service
# Check that a TCP listener exists on port 8060
ss -tnl | grep 8060

3.准备配置文件

配置模板路径:

/opt/prometheus-webhook-dingtalk/config.example.yml

复制模板:

cp /opt/prometheus-webhook-dingtalk/config.example.yml /opt/prometheus-webhook-dingtalk/config.yml

修改配置模板: cat config.yml

## Request timeout
# timeout: 5s

## Uncomment following line in order to write template from scratch (be careful!)
# no_builtin_template: true

## Customizable templates path
# templates:
#   - contrib/templates/legacy/template.tmpl
# Custom DingTalk message template
templates:
  - /opt/prometheus/alertmanager/dingtalk.tmpl

## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
# default_message:
#   title: '{{ template "legacy.title" . }}'
#   text: '{{ template "legacy.content" . }}'

## Targets, previously was known as "profiles"
# Alert groups; more than one target can be added.
targets:
  devops:
    # SECURITY: the access token is a credential. Put your own robot's token
    # here and never publish or commit the real value.
    url: "https://oapi.dingtalk.com/robot/send?access_token=<your-access-token>"
    # Placeholder for the robot's signing secret.
    secret: 钉钉key
    # Title and body are rendered from the custom template's definitions.
    message:
      title: '{{ template "ops.title" . }}'
      text: '{{ template "ops.content" . }}'

4.准备自定义消息模板

注意:此模板变量与告警规则相关联

{{/* __subject: renders "[STATUS]" with the status upper-cased; when the status is "firing", ":<number of firing alerts>" is appended inside the brackets. */}}
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}


{{/* __alert_list: for each alert passed in, renders a markdown section with the "title" annotation, the env, severity, instance and sname labels, the "description" annotation and the start time. The start time is shifted by 28800e9 ns (8 hours) before formatting; presumably this converts UTC to UTC+8 - confirm. */}}
{{ define "__alert_list" }}{{ range . }}
---
**告警名称**: {{ index .Annotations "title" }}

**告警环境**: {{ .Labels.env }}

**告警级别**: {{ .Labels.severity }}

**告警主机**: {{ .Labels.instance }}

**告警服务**: {{ .Labels.sname }}

**告警信息**: {{ index .Annotations "description" }}

**告警时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}

{{/* __resolved_list: same fields as the firing list, plus the end time of each alert. Both times are shifted by 28800e9 ns (8 hours) before formatting; presumably UTC to UTC+8 - confirm. */}}
{{ define "__resolved_list" }}{{ range . }}
---
**告警名称**: {{ index .Annotations "title" }}

**告警环境**: {{ .Labels.env }}

**告警级别**: {{ .Labels.severity }}

**告警主机**: {{ .Labels.instance }}

**告警服务**: {{ .Labels.sname }}

**告警信息**: {{ index .Annotations "description" }}

**告警时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}

**恢复时间**: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}


{{/* ops.title: message title; delegates to "__subject". */}}
{{ define "ops.title" }}
{{ template "__subject" . }}
{{ end }}

{{/* ops.content: message body. Renders a section for firing alerts and a section for resolved alerts; each section is emitted only when it has at least one alert, and shows an icon, a count line and the per-alert list. */}}
{{ define "ops.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
![警报 图标](https://pgy-tuchuang.oss-cn-beijing.aliyuncs.com/img/image-20230905113652805.png)
**====侦测到{{ .Alerts.Firing | len  }}个故障====**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}

{{ if gt (len .Alerts.Resolved) 0 }}
![警报 图标](https://pgy-tuchuang.oss-cn-beijing.aliyuncs.com/img/image-20230905113720198.png)
**====恢复{{ .Alerts.Resolved | len  }}个故障====**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}

{{/* ops.link.title / ops.link.content: aliases that delegate to ops.title and ops.content. */}}
{{/* NOTE(review): the two template calls on the last two lines sit outside any define block; they look like leftovers from debugging - confirm whether they are needed. */}}
{{ define "ops.link.title" }}{{ template "ops.title" . }}{{ end }}
{{ define "ops.link.content" }}{{ template "ops.content" . }}{{ end }}
{{ template "ops.title" . }}
{{ template "ops.content" . }}

模板可以在 dingtalk 插件的 UI 界面(http://alertmanager:8060/ui)中调试;开启方法是在启动参数中添加 --web.enable-ui。

image-20220517152218057

5.配置完成后重启服务并检查服务状态

# Restart the service so that the new configuration and template are loaded
systemctl restart dingtalk.service
# Verify that it came back up
systemctl status dingtalk.service
posted @ 2022-05-17 15:26  蒲公英PGY  阅读(580)  评论(0编辑  收藏  举报