linux服务器常用监控指标
1、环境准备
1.1、安装docker和docker-compose环境
https://www.cnblogs.com/hg-super-man/p/10908220.html
2、安装node_exporter
https://prometheus.io/download/
2.1 二进制安装node_exporter
# Download the node_exporter v1.6.1 release tarball
wget https://github.com/prometheus/node_exporter/releases/download/v1.6.1/node_exporter-1.6.1.linux-amd64.tar.gz
# Unpack it into the current directory
tar -zxvf node_exporter-1.6.1.linux-amd64.tar.gz
# Make sure the parent directory exists; mv fails if /opt/prometheus is missing
mkdir -p /opt/prometheus
# Move the unpacked directory to /opt/prometheus/node_exporter
mv node_exporter-1.6.1.linux-amd64 /opt/prometheus/node_exporter
# Create the prometheus system account only if it does not exist yet
# (the chown below and User=/Group= in the systemd unit both require it)
id -u prometheus >/dev/null 2>&1 || useradd --system --user-group --no-create-home --shell /sbin/nologin prometheus
# Give the prometheus user ownership of the install directory
chown -R prometheus:prometheus /opt/prometheus/node_exporter
# Write the systemd unit for node_exporter.
# The quoted heredoc delimiter keeps the body literal (no shell expansion).
tee /etc/systemd/system/node_exporter.service > /dev/null << 'EOF'
[Unit]
Description=node_exporter
Documentation=https://prometheus.io
After=network.target
[Service]
Type=simple
User=prometheus
Group=prometheus
Restart=on-failure
ExecStart=/opt/prometheus/node_exporter/node_exporter
[Install]
WantedBy=multi-user.target
EOF
# Reload systemd so it picks up the new unit file, then start node_exporter
systemctl daemon-reload
systemctl start node_exporter
# Check that the service is running
systemctl status node_exporter
# Enable the service at boot
systemctl enable node_exporter
2.1.1、 访问地址
http://服务器IP:9100/metrics
2.1.2、修改prometheus配置
在scrape_configs这行下面添加如下配置(注意保持缩进)
vi /opt/prometheus/prometheus/prometheus.yml
  # node_exporter scrape job.
  # Keep this indentation: the job is a list item under the existing
  # "scrape_configs:" key and must line up with the other "- job_name" entries.
  - job_name: "node_exporter"
    scrape_interval: 15s
    static_configs:
      - targets: ["localhost:9100"]
        labels:
          # Overrides the default instance label (host:port) with a readable name
          instance: Prometheus服务器
2.1.3、重载prometheus
# Ask the Prometheus server on localhost:9090 to reload its configuration.
# NOTE(review): this endpoint presumably requires Prometheus to be started
# with --web.enable-lifecycle — confirm against the Prometheus setup.
curl -X POST http://localhost:9090/-/reload
2.2 docker安装node_exporter
# Create the project directory (not a directory named docker-compose.yml),
# including missing parents, then create/edit the compose file inside it
mkdir -p ~/docker/prometheus
vi ~/docker/prometheus/docker-compose.yml
version: '3.3'
services:
  node_exporter:
    image: prom/node-exporter:v1.6.1
    container_name: node_exporter
    restart: always
    volumes:
      # Share the host's timezone and expose host /proc and /sys read-only
      - /etc/localtime:/etc/localtime:ro
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
    command:
      # Read metrics from the host mounts instead of the container's own /proc and /sys
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # mount-points-exclude replaces the deprecated ignored-mount-points flag;
      # "$$" is the compose escape for a literal "$" in the regex
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker)($$|/)'
    ports:
      - '9100:9100'
2.2.1、 检查
# Print the compose file to verify what was saved
cat ~/docker/prometheus/docker-compose.yml
2.2.2 启动
# Change into the directory that holds docker-compose.yml
cd ~/docker/prometheus
# Create and start the services in the background (-d = detached)
docker-compose up -d
2.2.3、 访问地址
http://服务器IP:9100/metrics
2.2.4、修改prometheus配置
在scrape_configs这行下面添加如下配置(注意保持缩进)
vi /opt/prometheus/prometheus/prometheus.yml
  # node_exporter scrape job.
  # Keep this indentation: the job is a list item under the existing
  # "scrape_configs:" key and must line up with the other "- job_name" entries.
  - job_name: "node_exporter"
    scrape_interval: 15s
    static_configs:
      - targets: ["localhost:9100"]
        labels:
          instance: Prometheus服务器
      # Replace "ip" with the address of the remote host running node_exporter
      - targets: ["ip:9100"]
        labels:
          instance: test服务器
2.2.5、重载prometheus
# Ask the Prometheus server on localhost:9090 to reload its configuration.
# NOTE(review): this endpoint presumably requires Prometheus to be started
# with --web.enable-lifecycle — confirm against the Prometheus setup.
curl -X POST http://localhost:9090/-/reload
3、常用监控指标
cpu
内存
磁盘
文件系统
网络
4、触发器设置
4.1、配置
# Write the alert rules file.
# ">" is used instead of ">>" so that re-running this command (or running it
# when alert.yml already exists) does not append a second top-level "groups:"
# key, which would make the rules file invalid.
# The quoted "EOF" delimiter stops the shell from expanding $labels / $value.
cat > prometheus/alert.yml <<"EOF"
groups:
  - name: Prometheus alert
    rules:
      # Fire when any scrape target has been unreachable for more than 30s
      - alert: 服务告警
        expr: up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          instance: "{{ $labels.instance }}"
          description: "{{ $labels.job }} 服务已关闭"
  - name: node-exporter
    rules:
      # Available memory below 10% of total
      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "主机内存不足,实例:{{ $labels.instance }}"
          description: "内存可用率<10%,当前值:{{ $value }}"
      # High rate of major page faults
      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "内存压力过大,实例:{{ $labels.instance }}"
          description: "节点内存压力大。重大页面错误率高,当前值:{{ $value }}"
      # Inbound traffic above 100 MB/s per instance
      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "异常流入网络吞吐量,实例:{{ $labels.instance }}"
          description: "网络流入流量 > 100 MB/s,当前值:{{ $value }}"
      # Outbound traffic above 100 MB/s per instance
      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "异常流出网络吞吐量,实例:{{ $labels.instance }}"
          description: "网络流出流量 > 100 MB/s,当前值:{{ $value }}"
      # Disk reads above 50 MB/s per instance
      - alert: HostUnusualDiskReadRate
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "异常磁盘读取,实例:{{ $labels.instance }}"
          description: "磁盘读取 > 50 MB/s,当前值:{{ $value }}"
      # Disk writes above 50 MB/s per instance
      - alert: HostUnusualDiskWriteRate
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "异常磁盘写入,实例:{{ $labels.instance }}"
          description: "磁盘写入 > 50 MB/s,当前值:{{ $value }}"
      # Less than 10% space left on a writable filesystem
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "磁盘空间不足告警,实例:{{ $labels.instance }}"
          description: "剩余磁盘空间 < 10%,当前值:{{ $value }}"
      # Below 10% free and the 1h trend predicts zero free space within 24h
      - alert: HostDiskWillFillIn24Hours
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "磁盘空间将在24小时内耗尽,实例:{{ $labels.instance }}"
          description: "以当前写入速率预计磁盘空间在 24 小时内耗尽,当前值:{{ $value }}"
      # Less than 10% free inodes on the root filesystem
      - alert: HostOutOfInodes
        expr: node_filesystem_files_free{mountpoint = "/"} / node_filesystem_files{mountpoint = "/"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint = "/"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "磁盘Inodes不足,实例:{{ $labels.instance }}"
          description: "剩余磁盘 inodes < 10%,当前值:{{ $value }}"
      # Average read latency above 100ms while reads are actually happening
      - alert: HostUnusualDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "异常磁盘读取延迟,实例:{{ $labels.instance }}"
          description: "磁盘读取延迟 > 100ms,当前值:{{ $value }}"
      # Average write latency above 100ms while writes are actually happening
      - alert: HostUnusualDiskWriteLatency
        expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "异常磁盘写入延迟,实例:{{ $labels.instance }}"
          description: "磁盘写入延迟 > 100ms,当前值:{{ $value }}"
      # 1-minute load average above 4
      - alert: high_load
        expr: node_load1 > 4
        for: 2m
        labels:
          severity: page
        annotations:
          summary: "CPU1分钟负载过高,实例:{{ $labels.instance }}"
          description: "CPU1分钟负载>4,已经持续2分钟,当前值:{{ $value }}"
      # NOTE(review): despite the name, this rule fires on HIGH CPU usage (> 80%);
      # the name is kept unchanged so existing alert routing is not affected
      - alert: HostCpuIsUnderUtilized
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode= "idle"}[2m])) * 100) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "CPU负载过高,实例:{{ $labels.instance }}"
          description: "CPU负载 > 80%,当前值:{{ $value }}"
      # CPU steal time above 10%
      - alert: HostCpuStealNoisyNeighbor
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "CPU窃取率异常,实例:{{ $labels.instance }}"
          description: "CPU窃取率 > 10%,嘈杂的邻居正在扼杀 VM 性能,或者Spot实例可能失去信用,当前值:{{ $value }}"
      # Swap usage above 80%
      - alert: HostSwapIsFillIngUp
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "磁盘swap空间使用率异常,实例:{{ $labels.instance }}"
          description: "磁盘swap空间使用率 > 80%,当前值:{{ $value }}"
      # More than 1% of received packets are errors
      - alert: HostNetworkReceiveErrors
        expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "异常网络接收错误,实例:{{ $labels.instance }}"
          description: "网卡 {{ $labels.device }} 在过去2分钟接收 {{ $value }} 个错误"
      # More than 1% of transmitted packets are errors
      - alert: HostNetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "异常网络传输错误,实例:{{ $labels.instance }}"
          description: "网卡 {{ $labels.device }} 在过去2分钟传输 {{ $value }} 个错误"
      # Combined rx+tx rate above 80% of the reported link speed (tap devices excluded)
      - alert: HostNetworkInterfaceSaturated
        expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "异常网络接口饱和,实例:{{ $labels.instance }}"
          description: "网卡 {{ $labels.device }} 正在超载,当前值 {{ $value }}"
      # Conntrack table more than 80% full
      - alert: HostConntrackLimit
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "异常连接数,实例:{{ $labels.instance }}"
          description: "连接数过大,当前值 {{ $value }}"
      # Clock offset beyond +/-50ms and not converging back towards zero
      - alert: HostClockSkew
        expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >=0 ) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <=0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "异常时钟偏差,实例:{{ $labels.instance }}"
          description: "检测到时钟偏差,时钟不同步,当前值 {{ $value }}"
      # Sync status 0 for the last minute and max error at or above 16s
      - alert: HostClockNotSynchronising
        expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "时钟不同步,实例:{{ $labels.instance }}"
          description: "时钟不同步"
EOF
4.2、检查
# Print the rules file to verify what was written
cat prometheus/alert.yml
4.3、检查配置
# Run promtool inside the "prometheus" container to validate /etc/prometheus/prometheus.yml
docker exec -it prometheus promtool check config /etc/prometheus/prometheus.yml
4.4、 重新加载配置
# Ask the Prometheus server on localhost:9090 to reload its configuration.
# NOTE(review): this endpoint presumably requires Prometheus to be started
# with --web.enable-lifecycle — confirm against the Prometheus setup.
curl -X POST http://localhost:9090/-/reload
5、grafana展示node_exporter的数据
因为我们在安装prometheus时,已经在grafana上添加了prometheus的数据源,并导入了id为1860的模板,所以就不需要再导入了,直接查看即可