监控工具 - 使用Docker快速创建Prometheus-Grafana-Alertmanager监控系统
Prometheus
相关命令
docker network create monitoring
mkdir -p /etc/prometheus
vim /etc/prometheus/prometheus.yml
docker run -itd --name prometheus \
--net=monitoring \
-p 9090:9090 \
--restart always \
-v /etc/prometheus:/etc/prometheus \
-v prometheus-data:/prometheus \
prom/prometheus:v2.53.2
配置文件
/etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
运行实例记录
[root@k8s-sample ~]# docker network create monitoring
d622c0cbdd342bb819aa896c057782ac44ec359bcd3b7f9b30bd1cd0064dfc1d
[root@k8s-sample ~]#
[root@k8s-sample ~]# mkdir -p /etc/prometheus
[root@k8s-sample ~]# vim /etc/prometheus/prometheus.yml
[root@k8s-sample ~]# cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker run -itd --name prometheus --net=monitoring -p 9090:9090 --restart always -v /etc/prometheus:/etc/prometheus -v prometheus-data:/prometheus prom/prometheus:v2.53.2
060917136c37c3e5f7c12866e25ab828aecfdc031e1bebb92c153c58e24a9051
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
060917136c37 prom/prometheus:v2.53.2 "/bin/prometheus --c…" 6 seconds ago Up 5 seconds 0.0.0.0:9090->9090/tcp, :::9090->9090/tcp prometheus
[root@k8s-sample ~]#
可直接登录“http://<服务器IP>:9090”(本例为 http://192.168.16.170:9090)访问 Prometheus 的 Web UI。
Grafana
相关命令
docker run -d --name=grafana \
--net=monitoring \
-p 3000:3000 \
--restart always \
-v grafana-data:/var/lib/grafana \
grafana/grafana
运行实例记录
[root@k8s-sample ~]# docker run -d --name=grafana \
--net=monitoring \
-p 3000:3000 \
--restart always \
-v grafana-data:/var/lib/grafana \
grafana/grafana
3e2ed40167581e3c0d836a9b6155a8b0bc37012a7b8e67baa45b2fdd474b0865
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
3e2ed4016758 grafana/grafana "/run.sh" 5 seconds ago Up 5 seconds 0.0.0.0:3000->3000/tcp, :::3000->3000/tcp grafana
b0e8d55c2f2c prom/prometheus:v2.53.2 "/bin/prometheus --c…" 6 minutes ago Up 6 minutes 0.0.0.0:9090->9090/tcp, :::9090->9090/tcp prometheus
[root@k8s-sample ~]#
[root@k8s-sample ~]#
登录“http://<服务器IP>:3000”(本例为 http://192.168.16.170:3000)进入 Grafana 页面。
添加数据源:Home --> Connections --> Data sources --> Add data source --> prometheus --> Connection 填写 Prometheus 的访问地址 http://<服务器IP>:9090(Grafana 与 Prometheus 同在 monitoring 网络中,也可填写 http://prometheus:9090)
Exporter
Node Exporter
部署 Node Exporter
部署完成后,浏览器访问 http://<服务器IP>:9100/metrics 可以查看 Node Exporter 采集的指标数据。
[root@k8s-sample ~]# tar -xzvf node_exporter-1.8.2.linux-amd64.tar.gz -C /opt
node_exporter-1.8.2.linux-amd64/
node_exporter-1.8.2.linux-amd64/NOTICE
node_exporter-1.8.2.linux-amd64/node_exporter
node_exporter-1.8.2.linux-amd64/LICENSE
[root@k8s-sample ~]#
[root@k8s-sample ~]# cd /opt/
[root@k8s-sample opt]# ln -sv node_exporter-1.8.2.linux-amd64 node_exporter
'node_exporter' -> 'node_exporter-1.8.2.linux-amd64'
[root@k8s-sample opt]#
[root@k8s-sample opt]# useradd prometheus && echo "prometheus:prometheus"|chpasswd && chage -M 99999 prometheus
[root@k8s-sample opt]#
[root@k8s-sample opt]# chown -R prometheus:prometheus /opt/node_exporter-1.8.2.linux-amd64/
[root@k8s-sample opt]#
[root@k8s-sample opt]# ll /opt |grep node_exporter
lrwxrwxrwx 1 root root 31 Oct 18 22:34 node_exporter -> node_exporter-1.8.2.linux-amd64
drwxr-xr-x 2 prometheus prometheus 56 Jul 14 19:58 node_exporter-1.8.2.linux-amd64
[root@k8s-sample opt]#
[root@k8s-sample opt]# cd
[root@k8s-sample ~]# vim /usr/lib/systemd/system/node_exporter.service
[root@k8s-sample ~]# cat /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network-online.target
[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/opt/node_exporter/node_exporter
Restart=on-failure
[Install]
WantedBy=multi-user.target
[root@k8s-sample ~]#
[root@k8s-sample ~]# systemctl daemon-reload
[root@k8s-sample ~]# systemctl enable node_exporter.service
Created symlink /etc/systemd/system/multi-user.target.wants/node_exporter.service → /usr/lib/systemd/system/node_exporter.service.
[root@k8s-sample ~]# systemctl start node_exporter.service
[root@k8s-sample ~]# systemctl status node_exporter.service
● node_exporter.service - node_exporter
Loaded: loaded (/usr/lib/systemd/system/node_exporter.service; enabled; preset: disable>
Active: active (running) since Fri 2024-10-18 22:36:19 CST; 7s ago
Docs: https://prometheus.io/
Main PID: 8177 (node_exporter)
Tasks: 5 (limit: 48820)
Memory: 4.7M
CPU: 9ms
CGroup: /system.slice/node_exporter.service
└─8177 /opt/node_exporter/node_exporter
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.348Z caller=node_expo>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.349Z caller=tls_confi>
Oct 18 22:36:19 k8s-sample node_exporter[8177]: ts=2024-10-18T14:36:19.349Z caller=tls_confi>
lines 1-21/21 (END)
[root@k8s-sample ~]#
Prometheus添加监控指标
添加完成后,可以在Web UI页面导航栏的 Status 中选择 Targets 查看监控目标。
[root@k8s-sample ~]# vim /etc/prometheus/prometheus.yml
[root@k8s-sample ~]# cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: "linux-server"
metrics_path: "/metrics" # 指标接口路径,默认/metrics
scheme: http # 连接协议,默认http
static_configs:
- targets: ["192.168.16.170:9100"]
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker exec -it prometheus kill -HUP 1
[root@k8s-sample ~]#
Grafana 导入仪表盘
Grafana页面 --> 左侧菜单栏Dashboards --> New --> New dashboard --> Import a dashboard --> 输入仪表盘ID 12633
--> Load加载 --> 设置仪表盘名称和数据源 --> Import完成导入 --> Dashboards看到对应的仪表盘页面。
cAdvisor Exporter
[root@k8s-sample ~]# docker pull swr.cn-north-4.myhuaweicloud.com/ddn-k8s/gcr.io/cadvisor/cadvisor-amd64:v0.49.1
v0.49.1: Pulling from ddn-k8s/gcr.io/cadvisor/cadvisor-amd64
619be1103602: Pull complete
3b8469b194b8: Pull complete
6361eeb1639c: Pull complete
4f4fb700ef54: Pull complete
902eccca70f3: Pull complete
Digest: sha256:00ff3424f13db8d6d62778253e26241c45a8d53343ee09944a474bf88d3511ac
Status: Downloaded newer image for swr.cn-north-4.myhuaweicloud.com/ddn-k8s/gcr.io/cadvisor/cadvisor-amd64:v0.49.1
swr.cn-north-4.myhuaweicloud.com/ddn-k8s/gcr.io/cadvisor/cadvisor-amd64:v0.49.1
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker tag swr.cn-north-4.myhuaweicloud.com/ddn-k8s/gcr.io/cadvisor/cadvisor-amd64:v0.49.1 gcr.io/cadvisor/cadvisor-amd64:v0.49.1
[root@k8s-sample ~]# docker rmi swr.cn-north-4.myhuaweicloud.com/ddn-k8s/gcr.io/cadvisor/cadvisor-amd64:v0.49.1
[root@k8s-sample ~]# docker images |grep cadvisor
gcr.io/cadvisor/cadvisor-amd64 v0.49.1 c02cf39d3dba 7 months ago 80.8MB
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker images |grep cadvisor
gcr.io/cadvisor/cadvisor-amd64 v0.49.1 c02cf39d3dba 7 months ago 80.8MB
[root@k8s-sample ~]# docker run -d --name=cadvisor \
--publish=8080:8080 \
--restart always \
--volume=/:/rootfs:ro \
--volume=/var/run:/var/run:ro \
--volume=/sys:/sys:ro \
--volume=/var/lib/docker/:/var/lib/docker:ro \
--volume=/dev/disk/:/dev/disk:ro \
--detach=true \
--privileged \
--device=/dev/kmsg \
gcr.io/cadvisor/cadvisor-amd64:v0.49.1
56e4af8073bc960dfeffadb9e962c4107ae482d88cb3e29a651ba4c443962ba0
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
56e4af8073bc gcr.io/cadvisor/cadvisor-amd64:v0.49.1 "/usr/bin/cadvisor -…" 7 seconds ago Up 5 seconds (health: starting) 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp cadvisor
279f91ec6f9f prom/alertmanager "/bin/alertmanager -…" 47 hours ago Up 22 minutes 0.0.0.0:9093->9093/tcp, :::9093->9093/tcp alertmanager
3e2ed4016758 grafana/grafana "/run.sh" 2 days ago Up 22 minutes 0.0.0.0:3000->3000/tcp, :::3000->3000/tcp grafana
b0e8d55c2f2c prom/prometheus:v2.53.2 "/bin/prometheus --c…" 2 days ago Up 22 minutes 0.0.0.0:9090->9090/tcp, :::9090->9090/tcp prometheus
[root@k8s-sample ~]#
直接访问如下页面
- http://<服务器IP>:8080 查看cadvisor的相关信息
- http://<服务器IP>:8080/metrics 查看采集的指标数据
在Prometheus添加监控指标
添加完成后,可以在Web UI页面导航栏的 Status 中选择 Targets 查看监控目标。
[root@k8s-sample ~]# vim /etc/prometheus/prometheus.yml
[root@k8s-sample ~]# cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.16.170:9093
rule_files:
- "./rules/linux-server.yml"
- "./rules/general.yml"
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: "linux-server"
metrics_path: "/metrics" # 指标接口路径,默认/metrics
scheme: http # 连接协议,默认http
static_configs:
- targets: ["192.168.16.170:9100"]
- job_name: "docker-server"
static_configs:
- targets: ["192.168.16.170:8080"]
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker exec -it prometheus kill -HUP 1
[root@k8s-sample ~]#
Grafana导入仪表盘
Grafana页面 --> 左侧菜单栏Dashboards --> New --> New dashboard --> Import a dashboard --> 输入仪表盘ID 14282
--> Load加载 --> 设置仪表盘名称和数据源 --> Import完成导入 --> Dashboards看到对应的仪表盘页面。
Alertmanager
相关命令
mkdir -p /etc/alertmanager
vim /etc/alertmanager/alertmanager.yml
docker run -d --name=alertmanager \
--net=monitoring \
-v /etc/alertmanager:/etc/alertmanager \
-p 9093:9093 \
--restart always \
prom/alertmanager
编写配置文件
/etc/alertmanager/alertmanager.yml
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.163.com:25'
smtp_from: 'test@163.com'
smtp_auth_username: 'test@163.com'
smtp_auth_password: 'XXXXXX'
smtp_require_tls: false
route:
receiver: 'default-receiver'
group_by: [alertname]
group_wait: 1m
group_interval: 5m
repeat_interval: 30m
receivers:
- name: 'default-receiver'
email_configs:
- to: 'test@yeah.net'
send_resolved: true
[root@k8s-sample ~]# vim /etc/alertmanager/alertmanager.yml
[root@k8s-sample ~]# cat /etc/alertmanager/alertmanager.yml
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.163.com:25'
smtp_from: 'test@163.com'
smtp_auth_username: 'test@163.com'
smtp_auth_password: 'XXXXXX'
smtp_require_tls: false
route:
receiver: 'default-receiver'
group_by: [alertname]
group_wait: 1m
group_interval: 5m
repeat_interval: 30m
receivers:
- name: 'default-receiver'
email_configs:
- to: 'test@yeah.net'
send_resolved: true
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker run -d --name=alertmanager --net=monitoring -v /etc/alertmanager:/etc/alertmanager -p 9093:9093 --restart always prom/alertmanager
279f91ec6f9fe6f154e99b1d110e754361ad7f2c20066967b290990d72b395a0
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
279f91ec6f9f prom/alertmanager "/bin/alertmanager -…" 5 seconds ago Up 4 seconds 0.0.0.0:9093->9093/tcp, :::9093->9093/tcp alertmanager
3e2ed4016758 grafana/grafana "/run.sh" 2 hours ago Up 2 hours 0.0.0.0:3000->3000/tcp, :::3000->3000/tcp grafana
b0e8d55c2f2c prom/prometheus:v2.53.2 "/bin/prometheus --c…" 2 hours ago Up 2 hours 0.0.0.0:9090->9090/tcp, :::9090->9090/tcp prometheus
[root@k8s-sample ~]#
更新Prometheus配置文件,指定alertmanager的访问地址
[root@k8s-sample ~]# vim /etc/prometheus/prometheus.yml
[root@k8s-sample ~]# cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.16.170:9093
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: "linux-server"
metrics_path: "/metrics" # 指标接口路径,默认/metrics
scheme: http # 连接协议,默认http
static_configs:
- targets: ["192.168.16.170:9100"]
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker exec -it prometheus kill -HUP 1
[root@k8s-sample ~]#
可以直接登录“http://<服务器IP>:9093”(本例为 http://192.168.16.170:9093)访问 Alertmanager 的 Web 页面。
Alertmanager 告警规则
相关命令
mkdir -p /etc/prometheus/rules
vim /etc/prometheus/rules/linux-server.yml
vim /etc/prometheus/rules/general.yml
vim /etc/prometheus/prometheus.yml
docker exec -it prometheus kill -HUP 1
创建告警规则文件(主机资源使用率)
groups: # 告警规则组
- name: Linux-Server # 告警规则组名称
rules: # 规则
- alert: HighCPUUsage # 告警名称
expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[2m])) by (instance) * 100) > 80 # 触发告警的表达式
for: 2m # 定义触发告警的持续时间
labels: # 告警事件的标签
severity: warning # 定义告警级别
annotations:
summary: "{{ $labels.instance }} CPU使用率超过80%"
description: "{{ $labels.instance }} CPU使用率超过80%,当前值: {{ $value }}"
- alert: HighMemoryUsage
expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: "{{ $labels.instance }} 内存使用率超过80%"
description: "{{ $labels.instance }} 内存使用率超过80%,当前值: {{ $value }}"
- alert: HighDiskSpaceUsage
expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: "{{ $labels.instance }} {{ $labels.mountpoint }} 分区使用率超过80%"
description: "{{ $labels.instance }} {{ $labels.mountpoint }} 分区使用率超过80%,当前值: {{ $value }}"
创建告警规则文件(监控目标无法连接)
groups:
- name: General
rules:
- alert: InstanceDown
expr: up == 0 # "up"是内置指标,1表示存活状态,0表示无法连接
for: 1m
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }} 连接失败"
description: "{{ $labels.instance }} 连接失败,可能是服务器故障!"
更新Prometheus的配置文件
[root@k8s-sample ~]# vim /etc/prometheus/prometheus.yml
[root@k8s-sample ~]# cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.16.170:9093
rule_files:
- "./rules/linux-server.yml" # 相对路径
- "./rules/general.yml" # 相对路径
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: "linux-server"
metrics_path: "/metrics" # 指标接口路径,默认/metrics
scheme: http # 连接协议,默认http
static_configs:
- targets: ["192.168.16.170:9100"]
[root@k8s-sample ~]#
[root@k8s-sample ~]# docker exec -it prometheus kill -HUP 1
[root@k8s-sample ~]#
查看告警规则信息,在如下页面均可以查看到已定义告警规则的相关信息
- 告警页面 http://<服务器IP>:9090/alerts
- 规则页面 http://<服务器IP>:9090/rules
- 配置页面 http://<服务器IP>:9090/config
测试与验证
通过压力测试工具stress模拟cpu使用率过载告警。
告警触发后,可以在 http://<服务器IP>:9090/alerts 页面查看告警状态(持续满足条件超过 for 指定的时间后由 Pending 变为 Firing),Alertmanager 会按照配置向接收邮箱发送告警邮件。
[root@k8s-sample ~]# dnf install -y epel-release && dnf install stress -y
[root@k8s-sample ~]# stress --version
stress 1.0.4
[root@k8s-sample ~]#
[root@k8s-sample ~]# stress --cpu 8
stress: info: [41378] dispatching hogs: 8 cpu, 0 io, 0 vm, 0 hdd
^C
[root@k8s-sample ~]#
自定义告警内容模版
自定义告警内容可以更直观显示关键信息,提高可读性。
- 在 /etc/alertmanager 目录下创建 .tmpl 结尾的模版文件
- Alertmanager配置文件中,通过templates字段指定告警模版文件的路径,在接收者配置中指定模版名称
- Alertmanager重新加载配置文件
行动是绝望的解药!
欢迎转载和引用,但请在明显处保留原文链接和原作者信息!
本博客内容多为个人工作与学习的记录,少数内容来自于网络并略有修改,已尽力标明原文链接和转载说明。如有冒犯,即刻删除!
以所舍,求所得,有所获,方所成。