Grafana+Prometheus+Consul+Alertmanager监控
简介
Grafana数据展示
Prometheus数据源,采集数据
Consul服务注册中心
AlertManager企业微信告警
Grafana
安装
Ubuntu and Debian
sudo apt-get install -y adduser libfontconfig1
wget https://dl.grafana.com/enterprise/release/grafana-enterprise_9.0.1_amd64.deb
sudo dpkg -i grafana-enterprise_9.0.1_amd64.deb
Standalone Linux Binaries
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-9.0.1.linux-amd64.tar.gz
tar -zxvf grafana-enterprise-9.0.1.linux-amd64.tar.gz
Red Hat, CentOS, RHEL, and Fedora
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-9.0.1-1.x86_64.rpm
sudo yum install grafana-enterprise-9.0.1-1.x86_64.rpm
OpenSUSE and SUSE
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-9.0.1-1.x86_64.rpm
sudo rpm -i --nodeps grafana-enterprise-9.0.1-1.x86_64.rpm
启动
systemctl start grafana-server
Consul
安装
# Fetch the Consul 1.12.2 release archive for linux/amd64
wget https://releases.hashicorp.com/consul/1.12.2/consul_1.12.2_linux_amd64.zip
# Extract the archive; the `consul` binary lands in the current directory
unzip consul_1.12.2_linux_amd64.zip
# Put the binary on the PATH under /usr/local/bin/
cp consul /usr/local/bin/
# Make sure the installed binary is executable
chmod +x /usr/local/bin/consul
# Create the config directory (/etc/consul.d/) and the data directories (/data/consul/...)
mkdir -pv /etc/consul.d/ \
  && mkdir -pv /data/consul/ \
  && mkdir -pv /data/consul/shell
编辑配置文件
- vim /etc/consul.d/server.json
{
  "datacenter": "dc1",
  "node_name": "consul-server",
  "server": true,
  "bootstrap_expect": 1,

  "data_dir": "/data/consul",

  "bind_addr": "192.168.254.40",
  "client_addr": "192.168.254.40",
  "ui": true,

  "start_join": ["192.168.254.40"],
  "retry_join": ["192.168.254.40"],
  "retry_interval": "10s",
  "rejoin_after_leave": true,

  "log_level": "INFO",
  "enable_debug": false,
  "enable_syslog": true,
  "syslog_facility": "local0"
}
启动Consul服务
nohup consul agent -config-dir=/etc/consul.d > /data/consul/consul.log &
注册和删除服务
# Register a service with the Consul agent via PUT /v1/agent/service/register.
# Every Chinese token below is a placeholder to replace with a real value:
# service ID, service name, service address, service port, service tag, Consul address and port.
# The single quotes around the port placeholder close and reopen the shell-quoted payload,
# so the port is spliced in without JSON quotes (i.e. as a number).
# The registered check polls http://<address>:<port>/metrics every 5s.
curl -X PUT -d '{"id": "服务ID标识","name": "服务名称","address": "服务访问地址","port": '服务访问端口',"tags": ["服务标签"],"checks": [{"http": "http://服务访问地址:服务访问端口/metrics", "interval": "5s"}]}' http://Consul地址:端口/v1/agent/service/register
# Deregister a service: the last path segment is the service ID used at registration.
curl -X PUT http://Consul地址:端口/v1/agent/service/deregister/服务ID
Prometheus
安装
#下载安装包
从 https://prometheus.io/download/ 下载 prometheus-2.31.1.linux-amd64.tar.gz
#解压
tar zxvf prometheus-2.31.1.linux-amd64.tar.gz
编辑配置文件
- vim prometheus.yml
# Prometheus server configuration (prometheus.yml).
# Nesting matters: every key below must stay indented under its parent section.

# Global defaults applied to every scrape job unless overridden per job.
global:
  scrape_interval: 5s  # Scrape targets every 5 seconds. Default is every 1 minute.
  evaluation_interval: 5s  # Evaluate rules every 5 seconds. The default is every 1 minute.
  # scrape_timeout is left unset (the global default is 10s).
  # NOTE(review): the timeout may not exceed scrape_interval, so confirm the
  # effective value with a 5s interval.

# Alertmanager configuration
# NOTE: the target is commented out, as in the stock config. Prometheus only
# delivers alerts to Alertmanager once an address is listed under `targets`.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

scrape_configs:
  # Job 1: Prometheus scraping itself.
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      # Address of this host and the port Prometheus is reachable on.
      - targets: ['192.168.254.40:9090']

  # Job 2: targets discovered from the Consul agent.
  - job_name: 'consul-prometheus'
    consul_sd_configs:
      - server: '192.168.254.40:8500'
        # No service filter: an empty list retrieves targets for all registered services.
        services: []
    relabel_configs:
      # Rewrite the `instance` label to the host part of __address__, dropping the port.
      - source_labels: ['__address__']
        regex: '(.*):.*'
        target_label: 'instance'
        replacement: '${1}'
启动Prometheus
nohup ./prometheus --web.enable-lifecycle --web.enable-admin-api --config.file=prometheus.yml >/dev/null &
重新加载配置(热加载,无需重启进程;依赖启动参数 --web.enable-lifecycle)
curl -XPOST http://Prometheus地址:端口/-/reload
Alertmanager
安装
#下载安装包
从 https://prometheus.io/download/ 下载 alertmanager-0.24.0.linux-amd64.tar.gz
#解压
tar zxvf alertmanager-0.24.0.linux-amd64.tar.gz
编辑配置文件
# Alertmanager configuration; save as /opt/consul/alertmanager.yml (e.g. vim /opt/consul/alertmanager.yml).
# Nesting matters: every key below must stay indented under its parent section.
global:
  resolve_timeout: 5m

# Notification template files loaded by Alertmanager.
templates:
  - '/opt/consul/wechat.tmpl'  # WeCom (enterprise WeChat) alert template

# Single root route: everything is grouped by alert name and sent to 'wechat'.
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 1m
  repeat_interval: 120m
  receiver: 'wechat'

receivers:
  - name: 'wechat'
    wechat_configs:
      - corp_id: 'xxxxxxxxxx'  # WeCom corp ID
        agent_id: '123456'  # ID of the WeCom application used for alerting
        api_secret: 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'  # Secret of that WeCom application
        to_user: '@all'  # Recipients of the alert message; @all means every user
        # NOTE: with `false`, no notification is sent when an alert resolves, so the
        # recovery section of wechat.tmpl is never used. Set to `true` to get recovery messages.
        send_resolved: false
编辑告警模板 vim /opt/consul/wechat.tmpl
{{/*
  wechat.default.message: body of the WeCom notification.
  Renders a firing section and a recovery section. Each section iterates only
  its own alerts (.Alerts.Firing / .Alerts.Resolved), so a notification that
  carries both kinds does not list every alert under both headers. The header
  (alert name and severity) is printed once, taken from the first alert of
  the section.
*/}}
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
{{- if eq $index 0 -}}
**********告警通知**********
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
{{- end }}
=====================
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.description }}
故障时间: {{ $alert.StartsAt.Local }}
{{ if gt (len $alert.Labels.instance) 0 -}}故障实例: {{ $alert.Labels.instance }}{{- end -}}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
{{- if eq $index 0 -}}
**********恢复通知**********
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
{{- end }}
=====================
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.description }}
故障时间: {{ $alert.StartsAt.Local }}
恢复时间: {{ $alert.EndsAt.Local }}
{{ if gt (len $alert.Labels.instance) 0 -}}故障实例: {{ $alert.Labels.instance }}{{- end -}}
{{- end }}
{{- end }}
{{- end }}
启动Alertmanager
# Start Alertmanager in the background from its install directory.
# --config.file points at the config this guide writes to /opt/consul/alertmanager.yml;
# a relative ./alertmanager.yml would load the stock file shipped in the install directory instead.
nohup ./alertmanager --config.file=/opt/consul/alertmanager.yml --storage.path=/opt/alertmanager-0.24.0.linux-amd64/data/ --log.level=debug &
效果展示