Prometheus之部署node_exporter
下载node_exporter
root@k8s-master-01:~# wget https://github.com/prometheus/node_exporter/releases/download/v1.2.2/node_exporter-1.2.2.linux-amd64.tar.gz
安装node_exporter
root@k8s-master-01:~# tar xf node_exporter-1.2.2.linux-amd64.tar.gz -C /usr/local/
root@k8s-master-01:~# ln -sv /usr/local/node_exporter-1.2.2.linux-amd64/ /usr/local/node_exporter
'/usr/local/node_exporter' -> '/usr/local/node_exporter-1.2.2.linux-amd64/'
验证安装版本
root@k8s-master-01:~# /usr/local/node_exporter/node_exporter --version
node_exporter, version 1.2.2 (branch: HEAD, revision: 26645363b486e12be40af7ce4fc91e731a33104e)
build user: root@b9cb4aa2eb17
build date: 20210806-13:44:18
go version: go1.16.7
platform: linux/amd64
创建node-exporter.service文件
# Install the systemd unit that runs node_exporter.
# - The unit file is named node-exporter.service so that it matches the
#   `systemctl enable/start/status node-exporter` commands used afterwards.
# - `>` (not `>>`) makes the step repeatable: re-running it overwrites the
#   unit instead of appending a second copy to the same file.
# - The quoted 'EOF' delimiter disables shell expansion inside the here-doc,
#   so $MAINPID is written literally and substituted by systemd, not by the
#   shell (where it is unset and would expand to an empty string).
# - systemd directive names are case-sensitive: the key is `Type`, not `type`.
cat > /lib/systemd/system/node-exporter.service << 'EOF'
[Unit]
Description=node_exporter
Documentation=https://prometheus.io
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/node_exporter/node_exporter --collector.ntp --collector.mountstats --collector.systemd --collector.tcpstat
ExecReload=/bin/kill -HUP $MAINPID
TimeoutStopSec=20s
Restart=always
[Install]
WantedBy=multi-user.target
EOF
node-exporter开机启动
root@k8s-master-01:~# systemctl enable node-exporter
Created symlink /etc/systemd/system/multi-user.target.wants/node-exporter.service → /lib/systemd/system/node-exporter.service.
root@k8s-master-01:~# systemctl start node-exporter
root@k8s-master-01:~# systemctl status node-exporter
● node-exporter.service - node_exporter
Loaded: loaded (/lib/systemd/system/node-exporter.service; enabled; vendor preset: enabled)
Active: active (running) since Tue 2021-11-16 14:54:04 CST; 2s ago
Docs: https://prometheus.io
Main PID: 270390 (node_exporter)
Tasks: 4 (limit: 2245)
Memory: 2.6M
CGroup: /system.slice/node-exporter.service
└─270390 /usr/local/node_exporter/node_exporter --collector.ntp --collector.mountstats --collector.systemd --collector.tcpstat
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=thermal_zone
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=time
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=timex
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=udp_queues
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=uname
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=vmstat
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=xfs
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=zfs
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:199 msg="Listening on" address=:9100
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.883Z caller=tls_config.go:191 msg="TLS is disabled." http2=false
访问node-exporter web界面
Prometheus采集node指标数据
修改prometheus.yml文件
root@prometheus-01:~# cat /usr/local/prometheus/prometheus.yml
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["192.168.174.103:9090"]
  # Scrape job for node_exporter, which listens on port 9100.
  - job_name: "prometheus-node"
    static_configs:
      - targets: ['192.168.174.100:9100']
重启Prometheus服务
root@prometheus-01:~# systemctl restart prometheus
登录Prometheus web界面验证node节点状态
Prometheus 验证node节点监控数据
Node Exporter的指标
GitHub地址:https://github.com/prometheus/node_exporter
启动内置指标
/usr/local/node_exporter/node_exporter --collector.ntp --collector.mountstats --collector.systemd --collector.tcpstat
禁用内置指标
/usr/local/node_exporter/node_exporter --no-collector.ntp --no-collector.mountstats --no-collector.systemd --no-collector.tcpstat
常用指标
- node_cpu_seconds_total
- node_memory_MemTotal_bytes
- node_filesystem_size_bytes{mountpoint=PATH}
- node_systemd_unit_state{name=}
- node_vmstat_pswpin #系统从磁盘换入(swap in)到内存的页数,为累计计数器,可用rate()计算每秒换入的页数;
- node_vmstat_pswpout #系统从内存换出(swap out)到磁盘的页数,为累计计数器,可用rate()计算每秒换出的页数;
CPU使用率
每台主机cpu在5分钟内的平均使用率: (1-avg(rate(node_cpu_seconds_total{mode='idle'}[5m]))by(instance))*100
CPU 饱和度
跟踪CPU的平均负载就能获取到相关主机的CPU饱和度。实际上,它是将主机上的CPU数量考虑在内的一段时间内的平均运行队列长度。
平均负载少于cpu的数量是正常情况,而长时间内超过cpu数量则表示cpu已然饱和;
内存使用率
- node_memory_MemTotal_bytes
- node_memory_MemFree_bytes
- node_memory_Buffers_bytes
- node_memory_Cached_bytes
node_exporter dashboard
推荐模板
推荐模板ID 16098
导入模板
添加最近7天P99资源使用率规则
16098.yaml
groups: # A new rule file must start with this line; omit it when appending to an existing rule file.
  - name: node_usage_record_rules
    interval: 1m
    rules:
      # CPU usage in percent per (job, instance): 100 * (1 - idle fraction).
      - record: cpu:usage:rate1m
        expr: (1 - avg(irate(node_cpu_seconds_total{mode="idle"}[3m])) by (job,instance)) * 100
      # Memory usage in percent: 100 * (1 - MemAvailable / MemTotal).
      - record: mem:usage:rate1m
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
prometheus.yml
# Glob patterns of the rule files to load.
rule_files:
  - "rules/*.yaml"
  - "alert_rules/*.yaml"
查看dashboard
常见问题
Couldn't get SNTP reply
# echo "allow 127/8" >> /etc/chrony/chrony.conf
# systemctl restart chrony.service node-exporter