prometheus监控+alertmanager告警

prometheus监控+alertmanager告警

 

配置告警规则

  1、创建规则目录

# Create the directory that will hold the alerting rule files; -p makes the command safe to re-run.
mkdir -p /usr/local/prometheus/rules

  

  2、编写告警规则文件。

  vim /usr/local/prometheus/rules/rule.yml

# Add the following configuration
groups:
# Pod readiness alerts.
- name: instance-abnormal
  rules:
  # Fires when a pod has reported not-ready for 60s while its containers have
  # a restart count of zero. Matching on (namespace, pod) prevents pods that
  # share a name across namespaces from being matched against each other.
  - alert: POD新增告警!
    expr: |
      kube_pod_status_ready{condition="true"} == 0
      and on(namespace, pod)
      kube_pod_container_status_restarts_total == 0
    for: 60s
    labels:
      name: instance
      severity: Warning
      # Overwrite the instance label with the pod name.
      instance: "{{ $labels.pod }}"
    annotations:
      summary: "k8s集群告警!"
      description: "{{ $labels.pod }} 为新增POD!"

  # Fires when a pod has reported not-ready for 60s and at least one of its
  # containers has a non-zero restart count. Matching on (namespace, pod)
  # prevents pods that share a name across namespaces from cross-matching.
  - alert: POD重启告警!
    expr: |
      kube_pod_status_ready{condition="true"} == 0
      and on(namespace, pod)
      kube_pod_container_status_restarts_total > 0
    for: 60s
    labels:
      name: instance
      severity: Critical
    annotations:
      summary: "k8s集群POD重启!"
      description: "{{ $labels.pod }} 正在重启!"

# Node availability alerts.
- name: instance-down
  rules:
  # Fires when the series for a node's Ready condition with status "true"
  # has been 0 for one minute.
  - alert: k8s集群节点down!
    expr: kube_node_status_condition{condition="Ready", status="true"} == 0
    for: 1m
    labels:
      severity: Critical
    annotations:
      summary: 'k8s集群{{ $labels.node }}节点down!'
      description: '{{ $labels.node }} 节点不可用,请尽快检查!'

# Resource usage alerts.
- name: resource-status
  rules:
  # Per-pod CPU usage rate over the last minute divided by the pod's summed
  # CPU limits; fires when the ratio stays above 0.8 for one minute.
  - alert: POD cpu使用率过高!
    expr: >-
      sum by (namespace, pod) (rate(container_cpu_usage_seconds_total{name!=""}[1m]))
      /
      sum by (namespace, pod) (kube_pod_container_resource_limits{resource="cpu"})
      > 0.8
    for: 60s
    labels:
      severity: Warning
    annotations:
      summary: 'CPU使用率超过80%!'
      description: '{{ $labels.pod }} CPU使用率超过80%,已超过1分钟,请检查!'

  # Per-pod memory usage divided by the pod's summed memory limits; fires when
  # the ratio stays above 0.8 for one minute.
  # container_memory_usage_bytes is a gauge, so it is summed directly: rate()
  # is only meaningful for counters and would compare bytes-per-second of
  # growth against a byte limit.
  - alert: POD内存使用率过高!
    expr: |
      sum by (pod, namespace) (container_memory_usage_bytes{name!=""}) /
      sum by (pod, namespace) (kube_pod_container_resource_limits{resource="memory"}) > 0.8
    for: 1m
    labels:
      severity: Warning
    annotations:
      summary: "内存使用率超过80%!"
      description: "{{ $labels.pod }} 内存使用率超过80%,已超过1分钟,请检查!"

  # Host CPU utilisation in percent: 100 minus the average idle percentage
  # across the host's CPUs over the last minute. The "> 80" comparison is
  # required; without it the expression returns a value for every host and the
  # alert fires permanently. Averaging "by (instance)" yields one series per
  # host and keeps the instance label available to the annotations.
  # NOTE(review): assumes hosts are distinguished by the "instance" label —
  # confirm against the scrape configuration.
  - alert: 主机cpu使用率过高!
    expr: |
      100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 80
    for: 1m
    labels:
      severity: Warning
    annotations:
      summary: "{{ $labels.instance }} CPU使用率超过80%!"
      description: "{{ $labels.instance }} CPU使用率超过80%,已超过1分钟,请检查!"

  # Host memory utilisation in percent, computed from the host-level
  # available/total memory metrics instead of the pod-level container metrics;
  # fires when usage stays above 80% for one minute.
  # NOTE(review): assumes hosts are distinguished by the "instance" label —
  # confirm against the scrape configuration.
  - alert: 主机内存使用率过高!
    expr: |
      (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 80
    for: 1m
    labels:
      severity: Warning
    annotations:
      summary: "{{ $labels.instance }} 内存使用率超过80%!"
      description: "{{ $labels.instance }} 内存使用率超过80%,已超过1分钟,请检查!"

  # Host filesystem utilisation in percent, computed per mount point from the
  # filesystem available/size metrics instead of the pod-level memory metrics;
  # fires when usage stays above 80% for one minute.
  # Pseudo filesystems are excluded so only real disks are evaluated.
  # NOTE(review): assumes hosts are distinguished by the "instance" label and
  # that the fstype exclusion list suits the environment — confirm.
  - alert: 主机磁盘使用率过高!
    expr: |
      (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs"}
         / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs"}) * 100 > 80
    for: 1m
    labels:
      severity: Warning
    annotations:
      summary: "{{ $labels.instance }} 磁盘使用率超过80%!"
      description: "{{ $labels.instance }} 挂载点 {{ $labels.mountpoint }} 磁盘使用率超过80%,已超过1分钟,请检查!"

  根据自己的需求配置。

 

  3、新增prometheus配置。

  vim /usr/local/prometheus/prometheus.yml

# Add the following configuration

# Rule files to load.
rule_files:
  - '/usr/local/prometheus/rules/rule.yml'

# Alertmanager target; replace the placeholder with the real host address.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'xxx.xxx.xxx.xxx:9093'

 

  4、重新加载prometheus(该接口需要prometheus启动时带有 --web.enable-lifecycle 参数,否则请直接重启prometheus服务)

# Send a POST to the local Prometheus reload endpoint.
curl --request POST http://localhost:9090/-/reload

 

部署alertmanager

  1、下载alertmanager

# Download the Alertmanager v0.26.0 linux-amd64 release archive into the current directory.
wget https://github.com/prometheus/alertmanager/releases/download/v0.26.0/alertmanager-0.26.0.linux-amd64.tar.gz

  

  2、解压、移动到安装目录、配置版本软链接。

# Unpack the release archive.
tar --extract --gzip --file=alertmanager-0.26.0.linux-amd64.tar.gz
# Move the unpacked directory to a versioned install location.
mv alertmanager-0.26.0.linux-amd64 /opt/alertmanager-0.26.0
# Expose the install under a version-independent path through a symbolic link.
ln --symbolic /opt/alertmanager-0.26.0 /usr/local/alertmanager

 

  3、配置systemd管理

  vim /usr/lib/systemd/system/alertmanager.service

# systemd unit that runs Alertmanager as a service.
[Unit]
Description=Alertmanager Service
After=network.target

[Service]
# Data is stored under the install directory; the config file is alertmanager.yml.
ExecStart=/usr/local/alertmanager/alertmanager \
--storage.path=/usr/local/alertmanager/data \
--config.file=/usr/local/alertmanager/alertmanager.yml
# Restart the process automatically after an abnormal exit so alert delivery
# does not stay down until someone notices.
Restart=on-failure
RestartSec=5s

[Install]
WantedBy=multi-user.target

 

   4、启动alertmanager,设置为开机启动

# Make systemd read the newly created unit file before using it.
systemctl daemon-reload
# Start Alertmanager now and enable it at boot.
systemctl start alertmanager
systemctl enable alertmanager

 

  配置邮件告警

  1、修改alertmanager.yml配置,配置邮箱告警。

  vim /usr/local/alertmanager/alertmanager.yml

# Replace the file content with the following
global:
  # SMTP server address; before configuring, check that SMTP is enabled for
  # the mailbox and that port 25 is reachable.
  smtp_smarthost: "smtp.139.com:25"
  # Sender address.
  smtp_from: "xxxxxxxx@139.com"
  # Mailbox user.
  smtp_auth_username: "xxxxxxxx"
  # Mailbox password. This must be the client authorization code generated
  # when SMTP is enabled; it expires and then has to be reset in the mail system.
  smtp_auth_password: "xxxxxxxx"
  # Whether to use an encrypted connection; the default is true.
  smtp_require_tls: false

route:
  # Receiver name; must match a name under receivers.
  receiver: email
  # Grouping labels.
  group_by:
    - alertname
  # Wait 30s so other alerts arriving in that window are sent together; after
  # sending, group_interval must elapse before the next notification.
  group_wait: 30s
  # Interval between two notifications.
  group_interval: 5m
  # Interval for re-sending the same alert.
  repeat_interval: 1h

receivers:
  # Receiver name.
  - name: "email"
    email_configs:
      # Recipient address.
      - to: "xxxxxxxx@qq.com"

 

  2、重启alertmanager

# Restart Alertmanager so it runs with the edited alertmanager.yml.
systemctl restart alertmanager

 

 配置钉钉告警

  1、下载钉钉通知系统工具

# Download the prometheus-webhook-dingtalk v2.1.0 linux-amd64 release archive into the current directory.
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz

 

  2、解压、移动至安装目录,创建软链接

# Unpack the release archive.
tar --extract --gzip --file=prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
# Move the unpacked directory to a versioned install location.
mv prometheus-webhook-dingtalk-2.1.0.linux-amd64 /opt/prometheus-webhook-dingtalk-2.1.0
# Expose the install under a version-independent path through a symbolic link.
ln --symbolic /opt/prometheus-webhook-dingtalk-2.1.0 /usr/local/prometheus-webhook-dingtalk

 

  3、创建钉钉告警模板

# Create the template directory; -p makes the command safe to re-run.
mkdir -p /usr/local/prometheus-webhook-dingtalk/templates
# Open the template file for editing.
vim /usr/local/prometheus-webhook-dingtalk/templates/service.tmpl

  在service.tmpl文件中加入以下配置

{{/* Top-level invocation of the "service.title" template defined below. */}}
{{ template "service.title" . }}

{{/* "service.title": notification title; delegates to "__subject". */}}
{{ define "service.title" }}
{{ template "__subject" . }}
{{ end }}

{{/* "__subject": the upper-cased status in square brackets, followed by ":<count of firing alerts>" when the status is "firing". */}}
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}

{{/* "service.content": message body. Emits a section for firing alerts and a section for resolved alerts, each only when its list is non-empty. */}}
{{ template "service.content" . }}
{{ define "service.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
========监控到{{ .Alerts.Firing | len  }}个故障========
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
========已恢复{{ .Alerts.Resolved | len  }}个故障========
{{ template "__resolved_list" .Alerts.Resolved }}
---
{{ end }}
{{ end }}

{{/* "__alert_list": one entry per alert in the list passed in, showing alertname, severity, status, summary, description and start time. 28800e9 is added to StartsAt before formatting — presumably 8 hours in nanoseconds to display UTC+8; confirm. */}}
{{ define "__alert_list" }}{{ range . }}
---
    **告警类型**: {{ .Labels.alertname }}
    **告警级别**: {{ .Labels.severity }}
    **告警状态**: {{ .Status }}
    **告警主题**: {{ .Annotations.summary }}
    **告警详情**: {{ .Annotations.description }}
    **触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}

{{/* "__resolved_list": same fields as "__alert_list" plus the end time (EndsAt), shifted by the same 28800e9 offset. */}}
{{ define "__resolved_list" }}{{ range . }}
---
    **告警类型**: {{ .Labels.alertname }}
    **告警级别**: {{ .Labels.severity }}
    **告警状态**: {{ .Status }}
    **告警主题**: {{ .Annotations.summary }}
    **告警详情**: {{ .Annotations.description }}
    **触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    **恢复时间**: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}

 

  4、修改配置文件

# Start from the bundled example configuration.
cp /usr/local/prometheus-webhook-dingtalk/config.example.yml /usr/local/prometheus-webhook-dingtalk/config.yml 
# Open the copied configuration for editing.
vim /usr/local/prometheus-webhook-dingtalk/config.yml 

  修改config.yml文件

# Template files to load.
templates:
  - "/usr/local/prometheus-webhook-dingtalk/templates/*.tmpl"

# Webhook targets; the key (webhook1) names the target.
targets:
  webhook1:
    # DingTalk robot endpoint; replace the access_token placeholder.
    url: "https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxx"
    # Title and text are rendered from the named templates.
    message:
      title: '{{ template "service.title" . }}'
      text: '{{ template "service.content" . }}'

  xxxxxxxx为钉钉机器人的token

 

  5、配置systemd管理脚本

  vim /usr/lib/systemd/system/dingtalk.service

# systemd unit that runs prometheus-webhook-dingtalk as a service.
# Save it as /usr/lib/systemd/system/dingtalk.service: the systemctl commands
# that follow refer to the unit by the name "dingtalk".
[Unit]
Description=prometheus webhook dingtalk
After=network.target

[Service]
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk \
--config.file=/usr/local/prometheus-webhook-dingtalk/config.yml
# Restart the process automatically after an abnormal exit so DingTalk
# notifications do not stay down until someone notices.
Restart=on-failure
RestartSec=5s

[Install]
WantedBy=multi-user.target

 

  6、启动服务,设置为开机启动

# Make systemd read the newly created unit file before using it.
systemctl daemon-reload
# Start the DingTalk webhook service now and enable it at boot.
systemctl start dingtalk
systemctl enable dingtalk

 

  7、修改alertmanager配置,告警信息同时发送到邮箱和钉钉

  vim  /usr/local/alertmanager/alertmanager.yml

  修改为以下配置

global:
  resolve_timeout: 5m

route:
  receiver: "default"
  group_by:
    - "alertname"
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 2h
  routes:
    # continue: true keeps matching the routes that follow.
    - receiver: "email"
      continue: true
    # Further receivers can be appended in the same way.
    - receiver: "dingding"
      continue: true

receivers:
  - name: "default"

  - name: "email"
    email_configs:
      # Mailbox that receives the alert e-mails.
      - to: "xxxxxxx@qq.com"
        # Mailbox that sends the alert e-mails.
        from: "xxxxxxxx@139.com"
        smarthost: "smtp.139.com:25"
        auth_username: "xxxxxxxx"
        auth_password: "xxxxxxxx"
        require_tls: false
        # Also send a message when the alert is resolved.
        send_resolved: true

  - name: "dingding"
    webhook_configs:
      # Address of the prometheus-webhook-dingtalk service.
      - url: "http://xxx.xxx.xxx.xxx:8060/dingtalk/webhook1/send"
        # Also send a notification when the alert is resolved.
        send_resolved: true

 

  8、重启alertmanager

# Restart Alertmanager so it runs with the edited alertmanager.yml.
systemctl restart alertmanager

 

posted @ 2024-03-07 17:21  难止汗  阅读(175)  评论(0编辑  收藏  举报