Prometheus之部署node_exporter

下载node_exporter

root@k8s-master-01:~# wget https://github.com/prometheus/node_exporter/releases/download/v1.2.2/node_exporter-1.2.2.linux-amd64.tar.gz

安装node_exporter

root@k8s-master-01:~# tar xf node_exporter-1.2.2.linux-amd64.tar.gz -C /usr/local/
root@k8s-master-01:~# ln -sv /usr/local/node_exporter-1.2.2.linux-amd64/ /usr/local/node_exporter
'/usr/local/node_exporter' -> '/usr/local/node_exporter-1.2.2.linux-amd64/'

验证安装版本

root@k8s-master-01:~# /usr/local/node_exporter/node_exporter --version
node_exporter, version 1.2.2 (branch: HEAD, revision: 26645363b486e12be40af7ce4fc91e731a33104e)
  build user:       root@b9cb4aa2eb17
  build date:       20210806-13:44:18
  go version:       go1.16.7
  platform:         linux/amd64

创建node-exporter.service文件

cat > /lib/systemd/system/node-exporter.service << 'EOF'
[Unit]
Description=node_exporter
Documentation=https://prometheus.io
After=network.target

[Service]
Type=simple
ExecStart=/usr/local/node_exporter/node_exporter --collector.ntp --collector.mountstats --collector.systemd --collector.tcpstat
ExecReload=/bin/kill -HUP $MAINPID
TimeoutStopSec=20s
Restart=always


[Install]
WantedBy=multi-user.target
EOF

node-exporter开机启动

root@k8s-master-01:~# systemctl enable node-exporter
Created symlink /etc/systemd/system/multi-user.target.wants/node-exporter.service → /lib/systemd/system/node-exporter.service.

root@k8s-master-01:~# systemctl start node-exporter
root@k8s-master-01:~# systemctl status node-exporter
● node-exporter.service - node_exporter
     Loaded: loaded (/lib/systemd/system/node-exporter.service; enabled; vendor preset: enabled)
     Active: active (running) since Tue 2021-11-16 14:54:04 CST; 2s ago
       Docs: https://prometheus.io
   Main PID: 270390 (node_exporter)
      Tasks: 4 (limit: 2245)
     Memory: 2.6M
     CGroup: /system.slice/node-exporter.service
             └─270390 /usr/local/node_exporter/node_exporter --collector.ntp --collector.mountstats --collector.systemd --collector.tcpstat

Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=thermal_zone
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=time
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=timex
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=udp_queues
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=uname
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=vmstat
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=xfs
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:115 collector=zfs
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.882Z caller=node_exporter.go:199 msg="Listening on" address=:9100
Nov 16 14:54:04 k8s-master-01 node_exporter[270390]: level=info ts=2021-11-16T06:54:04.883Z caller=tls_config.go:191 msg="TLS is disabled." http2=false

访问node-exporter web界面

Prometheus采集node指标数据

修改prometheus.yml文件

root@prometheus-01:~# cat /usr/local/prometheus/prometheus.yml 

scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["192.168.174.103:9090"]
  - job_name: "prometheus-node"
    static_configs:
      - targets: ['192.168.174.100:9100']

重启Prometheus服务

root@prometheus-01:~# systemctl restart prometheus

登录Prometheus web界面验证node节点状态

Prometheus 验证node节点监控数据

Node Exporter的指标

GitHub地址:https://github.com/prometheus/node_exporter

启动内置指标

/usr/local/node_exporter/node_exporter --collector.ntp --collector.mountstats --collector.systemd --collector.tcpstat

禁用内置指标

/usr/local/node_exporter/node_exporter --no-collector.ntp --no-collector.mountstats --no-collector.systemd --no-collector.tcpstat

常用指标

  • node_cpu_seconds_total
  • node_memory_MemTotal_bytes
  • node_filesystem_size_bytes{mountpoint="PATH"}
  • node_systemd_unit_state{name=}
  • node_vmstat_pswpin #系统累计从磁盘换入(swap in)内存的页数,为计数器,配合rate()可得每秒换入页数;
  • node_vmstat_pswpout  #系统累计从内存换出(swap out)到磁盘的页数,为计数器,配合rate()可得每秒换出页数;

CPU使用率

每台主机cpu在5分钟内的平均使用率: (1-avg(rate(node_cpu_seconds_total{mode='idle'}[5m]))by(instance))*100

CPU 饱和度

跟踪CPU的平均负载就能获取到相关主机的CPU饱和度,实际上,它是将主机上的CPU数量考虑在内的一段时间内的平均运行队列长度。

平均负载少于cpu的数量是正常情况,而长时间内超过cpu数量则表示cpu已然饱和;

内存使用率

  • node_memory_MemTotal_bytes
  • node_memory_MemFree_bytes
  • node_memory_Buffers_bytes
  • node_memory_Cached_bytes

node_exporter dashboard

推荐模板

推荐模板ID 16098

导入模板

添加最近7天P99资源使用率规则

16098.yaml

groups:   #新rule文件需要加这行开头,追加旧的rule文件则不需要。
- name: node_usage_record_rules
  interval: 1m
  rules:
  - record: cpu:usage:rate1m
    expr: (1 - avg(irate(node_cpu_seconds_total{mode="idle"}[3m])) by (job,instance)) * 100
  - record: mem:usage:rate1m
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100

prometheus.yml

rule_files:
  - "rules/*.yaml"
  - "alert_rules/*.yaml"

查看dashboard

常见问题

Couldn't get SNTP reply

# echo "allow 127/8" >> /etc/chrony/chrony.conf
# systemctl restart chrony.service node-exporter
posted @ 2021-11-16 15:53  小吉猫  阅读(779)  评论(0编辑  收藏  举报