Assignment 10 - 20230917
1. Deploy a Redis Cluster on Kubernetes with data persistence based on a storage class
1.1 Prepare the NFS backend storage
root@k8s-ha1-238:~# mkdir -pv /data/k8sdata/myserver/{redis0,redis1,redis2,redis3,redis4,redis5}
mkdir: created directory '/data/k8sdata/myserver/redis0'
mkdir: created directory '/data/k8sdata/myserver/redis1'
mkdir: created directory '/data/k8sdata/myserver/redis2'
mkdir: created directory '/data/k8sdata/myserver/redis3'
mkdir: created directory '/data/k8sdata/myserver/redis4'
mkdir: created directory '/data/k8sdata/myserver/redis5'
root@k8s-ha1-238:~# cat /etc/exports
/data/k8sdata *(rw,no_root_squash)
root@k8s-ha1-238:~# systemctl restart nfs-server.service
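The manifests below reference the storage class managed-nfs-storage, so an NFS dynamic provisioner (for example nfs-subdir-external-provisioner) is assumed to be deployed already. As a rough sketch, the StorageClass might look like the following; the provisioner value is an assumption and must match the provisioner actually installed:
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: managed-nfs-storage
provisioner: k8s-sigs.io/nfs-subdir-external-provisioner  # assumption: must match the deployed provisioner name
reclaimPolicy: Retain                                      # matches the Retain reclaim policy seen on the PVs in 1.3
parameters:
  archiveOnDelete: "false"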
1.2 Create the ConfigMap
# Prepare the redis.conf file
root@k8s-ha2-deploy-239:~/redis-case/redis-cluster# cat redis.conf
bind 0.0.0.0
protected-mode no
save 3600 1
save 300 100
save 60 10000
save 5 1
stop-writes-on-bgsave-error no
rdbcompression yes
rdbchecksum yes
oom-score-adj no
oom-score-adj-values 0 200 800
requirepass "123456"
masterauth "123456"
logfile "/var/log/redis_6379.log"
appendonly yes
appenddirname "appendonlydir"
appendfilename "appendonly.aof"
appendfsync everysec
auto-aof-rewrite-percentage 100
auto-aof-rewrite-min-size 64mb
aof-load-truncated yes
aof-use-rdb-preamble yes
aof-timestamp-enabled no
cluster-enabled yes
cluster-config-file /var/lib/redis/nodes.conf
cluster-node-timeout 5000
dir /var/lib/redis
port 6379
# Create the ConfigMap
root@k8s-ha2-deploy-239:~/redis-case/redis-cluster# kubectl create configmap redis-conf --from-file=redis.conf -n myserver
configmap/redis-conf created
root@k8s-ha2-deploy-239:~/redis-case/redis-cluster# kubectl get configmaps -n myserver
NAME DATA AGE
kube-root-ca.crt 1 62d
redis-conf 1 11s
# Inspect the ConfigMap
root@k8s-ha2-deploy-239:~/redis-case/redis-cluster# kubectl describe configmaps redis-conf -n myserver
1.3 Create the Redis cluster
root@k8s-ha2-deploy-239:~/redis-case/redis-cluster# cat redis-cluster.yaml
apiVersion: v1
kind: Service
metadata:
  name: redis
  namespace: myserver
  labels:
    app: redis
spec:
  selector:
    app: redis
    appCluster: redis-cluster
  ports:
  - name: redis
    port: 6379
  clusterIP: None
---
apiVersion: v1
kind: Service
metadata:
  name: redis-access
  namespace: myserver
  labels:
    app: redis
spec:
  type: NodePort
  selector:
    app: redis
    appCluster: redis-cluster
  ports:
  - name: redis-access
    protocol: TCP
    port: 6379
    targetPort: 6379
    nodePort: 31379
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: redis
  namespace: myserver
spec:
  serviceName: redis
  replicas: 6
  selector:
    matchLabels:
      app: redis
      appCluster: redis-cluster
  template:
    metadata:
      labels:
        app: redis
        appCluster: redis-cluster
    spec:
      terminationGracePeriodSeconds: 20
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                - key: app
                  operator: In
                  values:
                  - redis
              topologyKey: kubernetes.io/hostname
      containers:
      - name: redis
        image: registry.cn-hangzhou.aliyuncs.com/zhangshijie/redis:v7.2.0
        command:
        - "redis-server"
        args:
        - "/etc/redis/redis.conf"
        - "--protected-mode"
        - "no"
        resources:
          requests:
            cpu: "500m"
            memory: "500Mi"
        ports:
        - containerPort: 6379
          name: redis
          protocol: TCP
        - containerPort: 16379
          name: cluster
          protocol: TCP
        volumeMounts:
        - name: conf
          mountPath: /etc/redis
        - name: data
          mountPath: /var/lib/redis
      volumes:
      - name: conf
        configMap:
          name: redis-conf
          items:
          - key: redis.conf
            path: redis.conf
  volumeClaimTemplates:
  - metadata:
      name: data
      namespace: myserver
    spec:
      accessModes: [ "ReadWriteOnce" ]
      storageClassName: managed-nfs-storage
      resources:
        requests:
          storage: 5Gi
root@k8s-ha2-deploy-239:~/redis-case/redis-cluster# kubectl apply -f redis-cluster.yaml
service/redis created
service/redis-access created
statefulset.apps/redis created
# The StatefulSet creates the Pods one by one, in order
root@k8s-ha2-deploy-239:~/redis-case/redis-cluster# kubectl get pod -n myserver
NAME READY STATUS RESTARTS AGE
redis-0 1/1 Running 0 2m17s
redis-1 1/1 Running 0 2m13s
redis-2 1/1 Running 0 2m9s
redis-3 1/1 Running 0 117s
redis-4 1/1 Running 0 113s
redis-5 1/1 Running 0 110s
# PVs and PVCs are dynamically provisioned through the storage class
root@k8s-ha2-deploy-239:~/redis-case/redis-cluster# kubectl get pv
NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE
pvc-073524b0-5630-444f-b216-f5aa2d609844 5Gi RWO Retain Bound myserver/data-redis-0 managed-nfs-storage 27s
pvc-768bf3e4-757a-4bb6-bdaf-f7450cde0d42 5Gi RWO Retain Bound myserver/data-redis-5 managed-nfs-storage 1s
pvc-8c446d4b-48bc-4cf8-bd1a-cfae0da64cf2 5Gi RWO Retain Bound myserver/data-redis-2 managed-nfs-storage 20s
pvc-b5276395-645d-42a0-bf48-4144cb086edf 5Gi RWO Retain Bound myserver/data-redis-3 managed-nfs-storage 8s
pvc-cd461dd3-b764-4ddd-b7d7-e3f867cee36d 5Gi RWO Retain Bound myserver/data-redis-1 managed-nfs-storage 24s
pvc-f6ef87bf-ba51-4ac7-8780-58fd434f9fb9 5Gi RWO Retain Bound myserver/data-redis-4 managed-nfs-storage 4s
root@k8s-ha2-deploy-239:~/redis-case/redis-cluster# kubectl get pvc -n myserver
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
data-redis-0 Bound pvc-073524b0-5630-444f-b216-f5aa2d609844 5Gi RWO managed-nfs-storage 34s
data-redis-1 Bound pvc-cd461dd3-b764-4ddd-b7d7-e3f867cee36d 5Gi RWO managed-nfs-storage 30s
data-redis-2 Bound pvc-8c446d4b-48bc-4cf8-bd1a-cfae0da64cf2 5Gi RWO managed-nfs-storage 26s
data-redis-3 Bound pvc-b5276395-645d-42a0-bf48-4144cb086edf 5Gi RWO managed-nfs-storage 14s
data-redis-4 Bound pvc-f6ef87bf-ba51-4ac7-8780-58fd434f9fb9 5Gi RWO managed-nfs-storage 10s
data-redis-5 Bound pvc-768bf3e4-757a-4bb6-bdaf-f7450cde0d42 5Gi RWO managed-nfs-storage 7s
1.4 Initialize the Redis cluster
1.4.1 Initialize the cluster
# Switch apt inside the pod to a domestic (USTC) mirror
root@k8s-ha2-deploy-239:~# kubectl exec -it -n myserver redis-0 bash
root@redis-0:/etc/apt/sources.list.d# sed -i 's|http://deb.debian.org/debian|http://mirrors.ustc.edu.cn/debian|g' debian.sources
root@redis-0:/etc/apt/sources.list.d# sed -i 's|http://deb.debian.org/debian Bullseye-updates|http://mirrors.ustc.edu.cn/debian Bullseye-updates|g' debian.sources
root@redis-0:/etc/apt/sources.list.d# apt update
root@redis-0:/etc/apt/sources.list.d# apt install dnsutils iputils-ping net-tools vim procps
# Create the cluster with the three masters only; the replicas will be added manually afterwards
root@redis-0:~# redis-cli -a 123456 --no-auth-warning --cluster create `dig +short redis-0.redis.myserver.svc.cluster.local`:6379 `dig +short redis-1.redis.myserver.svc.cluster.local`:6379 `dig +short redis-2.redis.myserver.svc.cluster.local`:6379
>>> Performing hash slots allocation on 3 nodes...
Master[0] -> Slots 0 - 5460
Master[1] -> Slots 5461 - 10922
Master[2] -> Slots 10923 - 16383
M: 4417a35979b6ad08195d5e71526b472b9ed24577 10.200.21.78:6379
slots:[0-5460] (5461 slots) master
M: b0737229c37ccd75afb00918d7f6794d4515bdb7 10.200.19.87:6379
slots:[5461-10922] (5462 slots) master
M: 5c1806127661d60730516ebc014294981e9fe867 10.200.143.36:6379
slots:[10923-16383] (5461 slots) master
Can I set the above configuration? (type 'yes' to accept): yes
>>> Nodes configuration updated
>>> Assign a different config epoch to each node
>>> Sending CLUSTER MEET messages to join the cluster
Waiting for the cluster to join
.
>>> Performing Cluster Check (using node 10.200.21.78:6379)
M: 4417a35979b6ad08195d5e71526b472b9ed24577 10.200.21.78:6379
slots:[0-5460] (5461 slots) master
M: b0737229c37ccd75afb00918d7f6794d4515bdb7 10.200.19.87:6379
slots:[5461-10922] (5462 slots) master
M: 5c1806127661d60730516ebc014294981e9fe867 10.200.143.36:6379
slots:[10923-16383] (5461 slots) master
[OK] All nodes agree about slots configuration.
>>> Check for open slots...
>>> Check slots coverage...
[OK] All 16384 slots covered.
1.4.2 Inspect the newly created cluster
# View the cluster state
root@redis-0:~# redis-cli -a 123456 --no-auth-warning cluster info
cluster_state:ok
cluster_slots_assigned:16384
cluster_slots_ok:16384
cluster_slots_pfail:0
cluster_slots_fail:0
cluster_known_nodes:3
cluster_size:3
cluster_current_epoch:3
cluster_my_epoch:1
cluster_stats_messages_ping_sent:84
cluster_stats_messages_pong_sent:80
cluster_stats_messages_sent:164
cluster_stats_messages_ping_received:78
cluster_stats_messages_pong_received:84
cluster_stats_messages_meet_received:2
cluster_stats_messages_received:164
total_cluster_links_buffer_limit_exceeded:0
# View the cluster nodes
root@redis-0:~# redis-cli -a 123456 --no-auth-warning cluster nodes
b0737229c37ccd75afb00918d7f6794d4515bdb7 10.200.19.87:6379@16379 master - 0 1710749430467 2 connected 5461-10922
5c1806127661d60730516ebc014294981e9fe867 10.200.143.36:6379@16379 master - 0 1710749431470 3 connected 10923-16383
4417a35979b6ad08195d5e71526b472b9ed24577 10.200.21.78:6379@16379 myself,master - 0 1710749430000 1 connected 0-5460
# Run a cluster check
root@redis-0:~# redis-cli -a 123456 --no-auth-warning --cluster check `dig +short redis-0.redis.myserver.svc.cluster.local`:6379
10.200.21.78:6379 (4417a359...) -> 0 keys | 5461 slots | 0 slaves.
10.200.19.87:6379 (b0737229...) -> 0 keys | 5462 slots | 0 slaves.
10.200.143.36:6379 (5c180612...) -> 0 keys | 5461 slots | 0 slaves.
[OK] 0 keys in 3 masters.
0.00 keys per slot on average.
>>> Performing Cluster Check (using node 10.200.21.78:6379)
M: 4417a35979b6ad08195d5e71526b472b9ed24577 10.200.21.78:6379
slots:[0-5460] (5461 slots) master
M: b0737229c37ccd75afb00918d7f6794d4515bdb7 10.200.19.87:6379
slots:[5461-10922] (5462 slots) master
M: 5c1806127661d60730516ebc014294981e9fe867 10.200.143.36:6379
slots:[10923-16383] (5461 slots) master
[OK] All nodes agree about slots configuration.
>>> Check for open slots...
>>> Check slots coverage...
[OK] All 16384 slots covered.
1.4.3 Add the replica nodes to the masters
# Add redis-3 as a replica of redis-0
root@redis-0:~# redis-cli -a 123456 --no-auth-warning --cluster add-node `dig +short redis-3.redis.myserver.svc.cluster.local`:6379 `dig +short redis-0.redis.myserver.svc.cluster.local`:6379 --cluster-slave --cluster-master-id 4417a35979b6ad08195d5e71526b472b9ed24577
# Add redis-4 as a replica of redis-1
root@redis-0:~# redis-cli -a 123456 --no-auth-warning --cluster add-node `dig +short redis-4.redis.myserver.svc.cluster.local`:6379 `dig +short redis-1.redis.myserver.svc.cluster.local`:6379 --cluster-slave --cluster-master-id b0737229c37ccd75afb00918d7f6794d4515bdb7
# Add redis-5 as a replica of redis-2
root@redis-0:~# redis-cli -a 123456 --no-auth-warning --cluster add-node `dig +short redis-5.redis.myserver.svc.cluster.local`:6379 `dig +short redis-2.redis.myserver.svc.cluster.local`:6379 --cluster-slave --cluster-master-id 5c1806127661d60730516ebc014294981e9fe867
# Check the cluster status again
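The check from 1.4.2 can be repeated to confirm that each master now has one replica (output omitted here):
root@redis-0:~# redis-cli -a 123456 --no-auth-warning --cluster check `dig +short redis-0.redis.myserver.svc.cluster.local`:6379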
1.5 Prepare to write test data
root@k8s-master1-230:~# kubectl run -it ubuntu2204 --image=ubuntu:22.04 --restart=Never -n myserver bash
If you don't see a command prompt, try pressing enter.
root@ubuntu2204:/# apt update
root@ubuntu2204:/# apt install python3-pip vim
root@ubuntu2204:/# mkdir ~/.pip
root@ubuntu2204:/# vim ~/.pip/pip.conf
root@ubuntu2204:/# cat ~/.pip/pip.conf
[global]
index-url = https://mirrors.aliyun.com/pypi/simple/
[install]
trusted-host=mirrors.aliyun.com
root@ubuntu2204:/# pip3 install redis-py-cluster
root@ubuntu2204:/# cat redis-cluster.py
#!/usr/bin/env python
import sys
from rediscluster import RedisCluster

def init_redis():
    startup_nodes = [
        {'host': 'redis-0.redis.myserver.svc.cluster.local', 'port': 6379},
        {'host': 'redis-1.redis.myserver.svc.cluster.local', 'port': 6379},
        {'host': 'redis-2.redis.myserver.svc.cluster.local', 'port': 6379},
        {'host': 'redis-3.redis.myserver.svc.cluster.local', 'port': 6379},
        {'host': 'redis-4.redis.myserver.svc.cluster.local', 'port': 6379},
        {'host': 'redis-5.redis.myserver.svc.cluster.local', 'port': 6379},
    ]
    try:
        conn = RedisCluster(startup_nodes=startup_nodes,
                            decode_responses=True, password='123456')
        print('OK,Connect success!', conn)
        for i in range(210, 510):
            conn.set("key%s" % i, "value%s" % i)
            data = conn.get("key%s" % i)
            print(data)
    except Exception as e:
        import traceback
        traceback.print_exc()
        print("connect error", str(e))
        sys.exit(1)

init_redis()
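With the dependencies installed, the test data is written simply by running the script inside the pod (run output not captured here):
root@ubuntu2204:/# python3 redis-cluster.py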
1.6 Verify the data was written
root@redis-0:~# redis-cli -a 123456 --no-auth-warning --cluster call redis-0.redis.myserver.svc.cluster.local:6379 dbsize
>>> Calling dbsize
redis-0.redis.myserver.svc.cluster.local:6379: 101
10.200.19.88:6379: 106
10.200.143.37:6379: 93
10.200.143.36:6379: 93
10.200.19.87:6379: 106
10.200.21.79:6379: 101
2. Summary of Prometheus components and the data collection flow
Prometheus components:
- Prometheus server: the main service; it accepts external HTTP requests, scrapes metric data, stores it, and answers queries
- Prometheus targets: the monitoring targets (discovered statically or dynamically) from which data is scraped
- Prometheus alerting: calls the Alertmanager component to deliver alert notifications
- Pushgateway: a collection proxy that receives metrics pushed by clients for later scraping
- Data visualization and export: data visualization and data export (web UI, Grafana, HTTP API)
Prometheus data collection flow:
- The Prometheus server obtains its targets from the static configuration or through service discovery
- It sends HTTP/HTTPS requests to the targets' metrics URLs
- Each target accepts the request and returns its metric data
- The Prometheus server stores the data and evaluates it against the alerting rules; if a rule fires, the alerting action is triggered in addition to storing the data, otherwise the data is only stored
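As a quick illustration of the pull model, a target's metrics endpoint can be fetched directly with curl, for example against one of the node-exporter instances configured later in section 3.2:
curl -s http://192.168.119.171:9100/metrics | head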
3. Deploy Prometheus and node-exporter with apt or from binaries, and collect node-exporter data via static configuration
Overview of Prometheus installation methods:
# Install with apt or yum
apt install prometheus
# Install from the official binary release
https://prometheus.io/download
# Run directly from the Docker image, or orchestrate it with docker-compose
https://prometheus.io/docs/prometheus/latest/installation
# Deploy on Kubernetes with the Prometheus Operator
https://github.com/prometheus-operator/kube-prometheus
3.1 Deploy Prometheus and node_exporter from the binary releases
3.1.1 Deploy Prometheus
# Download the release tarball first
# https://prometheus.io/download/#prometheus
root@ubuntu18-server11:~# ls
prometheus-2.45.0.linux-amd64.tar.gz
root@ubuntu18-server11:~# mkdir /apps
root@ubuntu18-server11:~# tar xvf prometheus-2.45.0.linux-amd64.tar.gz -C /apps/
root@ubuntu18-server11:/apps# ln -sv prometheus-2.45.0.linux-amd64/ prometheus
root@ubuntu18-server11:/apps# cd prometheus
root@ubuntu18-server11:/apps/prometheus# ll
total 227328
drwxr-xr-x 4 1001 123 4096 Jun 23 2023 ./
drwxr-xr-x 3 root root 4096 Mar 19 06:53 ../
drwxr-xr-x 2 1001 123 4096 Jun 23 2023 console_libraries/
drwxr-xr-x 2 1001 123 4096 Jun 23 2023 consoles/
-rw-r--r-- 1 1001 123 11357 Jun 23 2023 LICENSE
-rw-r--r-- 1 1001 123 3773 Jun 23 2023 NOTICE
-rwxr-xr-x 1 1001 123 119846310 Jun 23 2023 prometheus*
-rw-r--r-- 1 1001 123 934 Jun 23 2023 prometheus.yml
-rwxr-xr-x 1 1001 123 112896008 Jun 23 2023 promtool* # used to validate the Prometheus configuration file, query metrics data, etc.
root@ubuntu18-server11:/apps/prometheus# ./promtool check config prometheus.yml
Checking prometheus.yml
SUCCESS: prometheus.yml is valid prometheus config file syntax
# Prepare the systemd service file
root@ubuntu18-server11:/apps/prometheus# cat /lib/systemd/system/prometheus.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/apps/prometheus/
ExecStart=/apps/prometheus/prometheus --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle
[Install]
WantedBy=multi-user.target
root@ubuntu18-server11:/apps/prometheus# systemctl daemon-reload && systemctl enable --now prometheus.service
Created symlink /etc/systemd/system/multi-user.target.wants/prometheus.service → /lib/systemd/system/prometheus.service.
root@ubuntu18-server11:/apps/prometheus# systemctl status prometheus.service
● prometheus.service - Prometheus Server
Loaded: loaded (/lib/systemd/system/prometheus.service; enabled; vendor preset: enabled)
Active: active (running) since Tue 2024-03-19 07:09:21 UTC; 7s ago
Docs: https://prometheus.io/docs/introduction/overview/
Main PID: 46169 (prometheus)
Tasks: 7 (limit: 4625)
CGroup: /system.slice/prometheus.service
└─46169 /apps/prometheus/prometheus --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle
Commonly used Prometheus startup flags:
--config.file="prometheus.yml"            # configuration file to load
--web.listen-address="0.0.0.0:9090"       # listen address
--storage.tsdb.path="data/"               # data storage directory
--storage.tsdb.retention.size=            # maximum total size of storage blocks to retain; units B, KB, MB, GB, TB, PB, EB
--storage.tsdb.retention.time=            # how long to retain data, 15 days by default
--query.timeout=2m                        # maximum time a single query may take
--query.max-concurrency=20                # maximum number of concurrent queries
--web.read-timeout=5m                     # maximum idle/read timeout for connections
--web.max-connections=512                 # maximum number of simultaneous connections
--web.enable-lifecycle                    # enable reloading the configuration and shutdown via the HTTP API
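Because --web.enable-lifecycle is enabled in the service file above, the configuration can be reloaded without restarting the process:
curl -X POST http://localhost:9090/-/reload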
3.1.2 Deploy node_exporter
# Download URL
https://prometheus.io/download/#node_exporter
root@ubuntu18-server11:~# ls
node_exporter-1.6.1.linux-amd64.tar.gz prometheus-2.45.0.linux-amd64.tar.gz
root@ubuntu18-server11:~# tar xvf node_exporter-1.6.1.linux-amd64.tar.gz -C /apps/
root@ubuntu18-server11:~# cd /apps/
root@ubuntu18-server11:/apps# ln -sv node_exporter-1.6.1.linux-amd64/ node_exporter
'node_exporter' -> 'node_exporter-1.6.1.linux-amd64/'
root@ubuntu18-server11:/apps# cd node_exporter
root@ubuntu18-server11:/apps/node_exporter# ls
LICENSE node_exporter NOTICE
# Prepare the systemd service file
root@ubuntu18-server11:/apps/node_exporter# cat /lib/systemd/system/node-exporter.service
[Unit]
Description=Prometheus Node Exporter
After=network.target
[Service]
ExecStart=/apps/node_exporter/node_exporter
[Install]
WantedBy=multi-user.target
root@ubuntu18-server11:/apps/node_exporter# systemctl daemon-reload && systemctl enable --now node-exporter.service
Created symlink /etc/systemd/system/multi-user.target.wants/node-exporter.service → /lib/systemd/system/node-exporter.service.
root@ubuntu18-server11:/apps/node_exporter# systemctl status node-exporter.service
● node-exporter.service - Prometheus Node Exporter
Loaded: loaded (/lib/systemd/system/node-exporter.service; enabled; vendor preset: enabled)
Active: active (running) since Tue 2024-03-19 07:40:28 UTC; 4s ago
Main PID: 46607 (node_exporter)
Tasks: 4 (limit: 4625)
CGroup: /system.slice/node-exporter.service
└─46607 /apps/node_exporter/node_exporter
Prometheus data model basics:
metrics: a metric, identified by its metric name; one monitored item's data expressed in key/value form
labels: labels, used to distinguish series that share the same metric name; one metric can carry several different labels, for example:
node_network_receive_multicast_total{device="eth0"} 0
node_network_receive_multicast_total{device="lo"} 0
samples: a sample, the data actually stored in the TSDB; each sample consists of three parts:
- the metric (metric name plus labels)
- the value (the measured data)
- the timestamp (when the sample was written)
series: a time series, made up of a sequence of samples over time
3.2 Collect node-exporter data via static configuration
node-exporter must already be installed on the nodes to be monitored.
# Edit the Prometheus configuration file and add a new job
root@ubuntu18-server11:/apps/prometheus# vim /apps/prometheus/prometheus.yml
  - job_name: "prometheus-node"
    static_configs:
    - targets: ['192.168.119.171:9100','192.168.119.172:9100','192.168.119.173:9100']
root@ubuntu18-server11:/apps/prometheus# systemctl daemon-reload && systemctl restart prometheus.service
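The new targets can then be verified in the web UI under Status --> Targets, or through the HTTP API:
curl -s http://192.168.119.171:9090/api/v1/targets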
4. View node-exporter metrics in Grafana
4.1 Install Grafana
root@ubuntu18-server11:~# apt-get install -y adduser libfontconfig1 musl
root@ubuntu18-server11:~# wget https://dl.grafana.com/enterprise/release/grafana-enterprise_9.3.1_amd64.deb
root@ubuntu18-server11:~# dpkg -i grafana-enterprise_9.3.1_amd64.deb
# Edit the configuration file
root@ubuntu18-server11:~# vim /etc/grafana/grafana.ini
[server]
# Protocol (http, https, h2, socket)
protocol = http
# The ip address to bind to, empty will bind to all interfaces
http_addr = 0.0.0.0
# The http port to use
http_port = 3000
root@ubuntu18-server11:~# systemctl daemon-reload && systemctl enable --now grafana-server.service
4.2 Add the data source and display the metrics
Add the data source: Configuration --> Data sources --> Add data source --> Prometheus
Import a dashboard template (for node-exporter data, for example the "Node Exporter Full" dashboard, ID 1860)
5. Summary of Prometheus data types and basic PromQL usage
Prometheus provides a functional query language, PromQL, that lets users select and aggregate time series data in real time. The result of an expression can be shown as a graph, displayed as a table in the Prometheus expression browser, or consumed by external systems through the HTTP API.
https://prometheus.io/docs/prometheus/latest/querying/basics/
5.1 Prometheus data types
5.1.1 Expression result types
Instant vector: a set of time series that all share the same timestamp, with a single sample per series. For example, node_memory_MemFree_bytes queries the currently free memory; the expression returns only the latest sample of each matching series, and such an expression is called an instant vector expression.
root@ubuntu18-server11:~# curl 'http://192.168.119.171:9090/api/v1/query' --data 'query=node_memory_MemFree_bytes' --data time=1710898611
{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"node_memory_MemFree_bytes","instance":"192.168.119.171:9100","job":"prometheus-node"},"value":[1710898611,"3075489792"]},{"metric":{"__name__":"node_memory_MemFree_bytes","instance":"192.168.119.172:9100","job":"prometheus-node"},"value":[1710898611,"3591737344"]},{"metric":{"__name__":"node_memory_MemFree_bytes","instance":"192.168.119.173:9100","job":"prometheus-node"},"value":[1710898611,"3344560128"]}]}}
Range vector: a set of time series containing a range of data points over a given time window for each series, e.g. the last day's NIC traffic trend or the free bytes of a node's filesystems over the last 5 minutes.
root@ubuntu18-server11:~# curl 'http://192.168.119.171:9090/api/v1/query' --data 'query=node_memory_MemFree_bytes{instance="192.168.119.173:9100"}[5m]' --data time=1710899640
{"status":"success","data":{"resultType":"matrix","result":[{"metric":{"__name__":"node_memory_MemFree_bytes","instance":"192.168.119.173:9100","job":"prometheus-node"},"values":[[1710899346.190,"3341963264"],[1710899361.194,"3341963264"],[1710899376.200,"3341963264"],[1710899391.202,"3341963264"],[1710899406.201,"3341963264"],[1710899421.203,"3341963264"],[1710899436.202,"3341930496"],[1710899451.192,"3341930496"],[1710899466.191,"3341930496"],[1710899481.191,"3341963264"],[1710899496.189,"3341963264"],[1710899511.189,"3341963264"],[1710899526.190,"3341963264"],[1710899541.190,"3341963264"],[1710899556.191,"3341963264"],[1710899571.190,"3341963264"],[1710899586.199,"3341963264"],[1710899601.201,"3341963264"],[1710899616.189,"3341963264"],[1710899631.191,"3341963264"]]}]}}
scalar: a plain floating-point value. For example, query node_load1 to obtain an instant vector, then use the built-in scalar() function to turn a single-element vector into a scalar, e.g. scalar(sum(node_load1)).
root@ubuntu18-server11:~# curl 'http://192.168.119.171:9090/api/v1/query' --data 'query=scalar(sum(node_load1{instance="192.168.119.172:9100"}))' --data time=1710899640
{"status":"success","data":{"resultType":"scalar","result":[1710899640,"0.07"]}}
string: a plain string value; currently unused.
5.1.2 Metric types
Counter: a counter represents a cumulative value that only increases (unless the process restarts), e.g. total disk I/O operations, total NGINX/API requests, or total packets passing through a NIC.
Gauge: a gauge represents a value that can go up and down arbitrarily, e.g. bandwidth rate, CPU load, memory utilisation, or the number of active nginx connections.
Histogram: a cumulative histogram. A histogram samples observations over time (typically request durations or response sizes) and counts them in buckets; each bucket is cumulative, i.e. it holds all observations up to and including its upper bound. It can be used for things like request success rate, packet-loss rate, or per-day statistics such as ELK's count of visiting IPs for the current day. An example PromQL query over histogram buckets is shown after the Summary example below.
A cumulative histogram metric consists of several series:
- the number of observations (a counter), with the _count suffix
- the sum of all observed values, with the _sum suffix
- the bucket counters, holding the cumulative count of observations up to each bucket, with the _bucket suffix
Summary: a summary calculates quantiles on the client side over a sliding window (the most recent 10 minutes by default; the window is configurable). A quantile splits the sorted samples into consecutive intervals of equal probability; the common case is quartiles, which divide the samples into four intervals covering 0% to 100%, expressed as values from 0 to 1. Quantiles give a quick picture of how the data is distributed.
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 2.0659e-05
go_gc_duration_seconds{quantile="0.25"} 2.3545e-05
go_gc_duration_seconds{quantile="0.5"} 2.4567e-05
go_gc_duration_seconds{quantile="0.75"} 2.658e-05
go_gc_duration_seconds{quantile="1"} 5.5615e-05
go_gc_duration_seconds_sum 0.005243445 # sum of all observed values
go_gc_duration_seconds_count 194 # number of observations
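For Histogram metrics, the usual query pattern is histogram_quantile() over a rate of the _bucket series. A sketch using Prometheus' own HTTP request duration histogram (the metric name is taken from Prometheus' self-exported metrics and is assumed here):
histogram_quantile(0.95, rate(prometheus_http_request_duration_seconds_bucket[5m]))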
5.2 Basic PromQL usage
5.2.1 node-exporter metric data format
# Without labels
metric_name metric_value
# TYPE node_load15 gauge
node_load15 0
# With one label
metric_name{label1_name="label1-value"} metric_value
# TYPE node_network_receive_bytes_total counter
node_network_receive_bytes_total{device="eth0"} 3.5795987e+07
node_network_receive_bytes_total{device="lo"} 3.2632258e+07
# With multiple labels
metric_name{label1_name="label1-value",labelN_name="labelN-value"} metric_value
# TYPE node_filesystem_files_free gauge
node_filesystem_files_free{device="/dev/mapper/ubuntu--vg-ubuntu--lv",fstype="ext4",mountpoint="/"} 1.489365e+06
node_filesystem_files_free{device="/dev/sda2",fstype="ext4",mountpoint="/boot"} 65227
node_filesystem_files_free{device="lxcfs",fstype="fuse.lxcfs",mountpoint="/var/lib/lxcfs"} 0
node_filesystem_files_free{device="tmpfs",fstype="tmpfs",mountpoint="/run"} 498082
node_filesystem_files_free{device="tmpfs",fstype="tmpfs",mountpoint="/run/lock"} 501653
node_filesystem_files_free{device="tmpfs",fstype="tmpfs",mountpoint="/run/user/0"} 501644
Examples of querying metric data with PromQL:
node_memory_MemTotal_bytes # total memory of each node
node_memory_MemFree_bytes # free memory of each node
node_memory_MemTotal_bytes{instance="192.168.119.171:9100"} # total memory of a specific node, selected by label
node_memory_MemFree_bytes{instance="192.168.119.171:9100"} # free memory of a specific node, selected by label
node_disk_io_time_seconds_total{device="sda"} # total time in seconds the given disk has spent doing I/O
node_filesystem_free_bytes{device="/dev/sda1",fstype="xfs",mountpoint="/"} # free space of the given filesystem
5.2.2 Label matching
= select labels exactly equal to the given string (exact match)
!= select labels not equal to the given string (negation)
=~ select labels matching the given regular expression
!~ select labels not matching the given regular expression
Examples
# Exact match
node_load1{job='prometheus-node',instance='10.243.20.50:9100'}
# Negation
node_load1{job='prometheus-node',instance!='10.243.20.50:9100'}
# Regular-expression match
node_load1{instance=~'192.168.119.*:9100'}
# Negated regular-expression match
node_load1{instance!~'192.168.119.*:9100'}
5.2.3 Time ranges
A time range can be attached to a metric query.
s, m, h, d, w, y — seconds, minutes, hours, days, weeks, years
Instant vector expression, returning the latest data:
node_load1{}
Range vector expression, returning all nodes' data within the given window, relative to now:
node_load1{}[5m]
Range vector expression, returning a specific node's data within the given window:
node_load1{instance='192.168.119.171:9100'}[5m]
5.2.4 Operators
Arithmetic operators can be applied to metric data:
+ addition
- subtraction
* multiplication
/ division
% modulo
^ power
Examples
# Convert free memory to MB
node_memory_MemFree_bytes{instance='10.243.20.50:9100'}/1024/1024
# Total bytes read plus bytes written for a disk
node_disk_read_bytes_total{instance='192.168.119.171:9100',device="sda"} + node_disk_written_bytes_total{instance='192.168.119.171:9100',device="sda"}
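Operators are often combined with aggregation functions; for example, a commonly used expression for memory usage percentage (assuming node-exporter exposes node_memory_MemAvailable_bytes, as it does on recent kernels):
(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100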
5.2.5 Aggregation operators
1. Maximum, minimum, average
# max() maximum, min() minimum, avg() average
# The largest received-bytes counter per node
max(node_network_receive_bytes_total) by (instance)
# The maximum 5-minute receive rate per device on each node, for example:
max(rate(node_network_receive_bytes_total[5m])) by (instance, device)
2. Sum of the values
# sum()
sum(prometheus_http_requests_total)
3. Count the number of returned series
# count()
count(node_os_version)
{} 4 # can be used, for example, to count nodes or Pods
4. count_values(): counts how many series share each distinct value and turns that value into a new label
# count_values()
# Count the number of nodes per OS version
count_values("node_version",node_os_version)
{node_version="7"} 1
{node_version="18.04"} 3
5. abs(): returns the absolute value
# abs()
abs(sum(prometheus_http_requests_total{handler="/metrics"}))
6. absent(): returns nothing if the metric has data, and returns 1 if it has no data (useful for alerting on missing metrics)
# absent()
absent(sum(prometheus_http_requests_total{handler="/metricss"}))
{} 1 # the metric name here is intentionally wrong, so 1 is returned
7. Standard deviation and variance
# stddev() standard deviation
stddev(prometheus_http_requests_total) # the larger the deviation, the more the data fluctuates
# stdvar() variance
8. Sorting
# topk(): the N series with the largest sample values
topk(6,prometheus_http_requests_total)
# bottomk(): the N series with the smallest sample values
bottomk(6,prometheus_http_requests_total)
9. rate and irate
# rate() computes the per-second average rate of increase of a counter over the given window, e.g. rate(node_network_receive_bytes_total[5m])
# irate() computes the per-second rate from only the last two samples in the window, so it reacts faster to spikes, e.g. irate(node_network_receive_bytes_total[5m])
10. by and without
# by keeps only the listed labels when aggregating, e.g. sum(rate(node_network_receive_bytes_total[5m])) by (instance)
# without drops the listed labels and keeps all the others, e.g. sum(prometheus_http_requests_total) without (handler)
6. Collect metrics with Prometheus Pushgateway
The Pushgateway is used to collect short-lived or ad-hoc metrics. It does not pull data itself; clients must actively push their data to it. The Pushgateway can run on a separate node: custom monitoring scripts push the desired metrics to the Pushgateway API, and Prometheus Server then scrapes them from the Pushgateway. In other words, the Pushgateway has no scraping capability of its own and only passively waits for clients to push data.
6.1 Deploy the Pushgateway
Download URL: https://prometheus.io/download/#pushgateway
root@ubuntu18-server12:~# ls
pushgateway-1.6.2.linux-amd64.tar.gz
root@ubuntu18-server12:~# tar xf pushgateway-1.6.2.linux-amd64.tar.gz -C /apps/
root@ubuntu18-server12:~# ln -sv /apps/pushgateway-1.6.2.linux-amd64/ /apps/pushgateway
'/apps/pushgateway' -> '/apps/pushgateway-1.6.2.linux-amd64/'
root@ubuntu18-server12:~# cat /etc/systemd/system/pushgateway.service
[Unit]
Description=Prometheus pushgateway
After=network.target
[Service]
ExecStart=/apps/pushgateway/pushgateway
[Install]
WantedBy=multi-user.target
root@ubuntu18-server12:~# systemctl daemon-reload && systemctl enable --now pushgateway.service
Persisting the pushed metrics to disk
root@ubuntu18-server12:~# /apps/pushgateway/pushgateway --help
usage: pushgateway [<flags>]
The Pushgateway
Flags:
-h, --[no-]help Show context-sensitive help (also try --help-long and --help-man).
--[no-]web.systemd-socket Use systemd socket activation listeners instead of port listeners (Linux only).
--web.listen-address=:9091 ...
Addresses on which to expose metrics and web interface. Repeatable for multiple addresses.
--web.config.file="" [EXPERIMENTAL] Path to configuration file that can enable TLS or authentication. See:
https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md
--web.telemetry-path="/metrics"
Path under which to expose metrics.
--web.external-url= The URL under which the Pushgateway is externally reachable.
--web.route-prefix="" Prefix for the internal routes of web endpoints. Defaults to the path of --web.external-url.
--[no-]web.enable-lifecycle
Enable shutdown via HTTP request.
--[no-]web.enable-admin-api
Enable API endpoints for admin control actions.
--persistence.file="" File to persist metrics. If empty, metrics are only kept in memory.
--persistence.interval=5m The minimum interval at which to write out the persistence file.
--[no-]push.disable-consistency-check
Do not check consistency of pushed metrics. DANGEROUS.
--log.level=info Only log messages with the given severity or above. One of: [debug, info, warn, error]
--log.format=logfmt Output format of log messages. One of: [logfmt, json]
--[no-]version Show application version.
# Edit the service file and add the persistence flags
root@ubuntu18-server12:~# mkdir -p /data/pushgateway
root@ubuntu18-server12:/data/pushgateway# cat /etc/systemd/system/pushgateway.service
[Unit]
Description=Prometheus pushgateway
After=network.target
[Service]
ExecStart=/apps/pushgateway/pushgateway --persistence.file="/data/pushgateway/pushgateway.data" --persistence.interval=5m
[Install]
WantedBy=multi-user.target
root@ubuntu18-server12:~# systemctl daemon-reload
root@ubuntu18-server12:~# systemctl restart pushgateway.service
6.2 Verify data collection
# The Pushgateway listens on port 9091; the metrics it holds can be scraped from http://192.168.119.172:9091/metrics
6.2.1 Push a single metric from a client
Data is pushed to the Pushgateway through its HTTP API. The default URL format is:
http://<ip>:9091/metrics/job/<JOBNAME>{/<LABEL_NAME>/<LABEL_VALUE>}
<JOBNAME> is required and is the job name; any number of label pairs can follow. Usually an instance/<INSTANCE_NAME> label is added to make it easy to tell which node each metric came from.
Example
# Push a metric named mytest_metric with value 2022 under the job mytest_job
root@ubuntu18-server13:~# echo "mytest_metric 2022" | curl --data-binary @- http://192.168.119.172:9091/metrics/job/mytest_job
Besides the mytest_metric metric itself, the Pushgateway attaches push_time_seconds and push_failure_time_seconds to every pushed group; these are generated automatically and record the time of the last successful push and the last failed push, respectively.
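The pushed metric can be confirmed on the Pushgateway's own metrics endpoint, for example:
root@ubuntu18-server13:~# curl -s http://192.168.119.172:9091/metrics | grep mytest_metric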
6.2.2 Configure Prometheus to scrape the Pushgateway
# Edit the configuration file and add a job for the Pushgateway
root@ubuntu18-server11:~# vim /apps/prometheus/prometheus.yml
root@ubuntu18-server11:~# systemctl restart prometheus.service
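The job added to prometheus.yml is not shown above; a minimal sketch of what such a job typically looks like for this setup (honor_labels: true keeps the job/instance labels pushed by the clients instead of overwriting them with the Pushgateway's address):
  - job_name: "pushgateway"
    honor_labels: true
    static_configs:
    - targets: ["192.168.119.172:9091"]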
6.2.2.1 Push multiple metrics from a client
Method 1
root@ubuntu18-server11:~# cat << EOF | curl --data-binary @- http://192.168.119.172:9091/metrics/job/test_job/instance/192.168.119.173
> node_memory_usage 43111123
> node_memory_total 102341238312
> EOF
Method 2
# Collect and push custom data with a script
root@ubuntu18-server13:~# cat memory_monitor.sh
#!/bin/bash
total_memory=$(free | awk '/Mem/{print $2}')
used_memory=$(free | awk '/Mem/{print $3}')
job_name='custom_memory_monitor'
instance_name=`ifconfig eth0 | grep -w inet | awk '{print $2}'`
pushgateway_server='http://192.168.119.172:9091/metrics/job'
cat << EOF | curl --data-binary @- ${pushgateway_server}/${job_name}/instance/${instance_name}
# TYPE custom_memory_total gauge
custom_memory_total $total_memory
# TYPE custom_memory_used gauge
custom_memory_used $used_memory
EOF
root@ubuntu18-server13:~# bash memory_monitor.sh
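To collect the data continuously, the script would normally be scheduled, for example with cron (the path and interval here are assumptions):
* * * * * /bin/bash /root/memory_monitor.sh >/dev/null 2>&1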
6.2.3 Delete metric data
# Delete from the command line or from the web UI
root@ubuntu18-server13:~# curl -X DELETE http://192.168.119.172:9091/metrics/job/mytest_job
7. Collect metrics with Prometheus federation
Prometheus federation architecture:
Prometheus is deployed on nodes 161, 162 and 163; node 161 federates (collects) the data already gathered by the Prometheus servers on 162 and 163, and the result is displayed in Grafana.
7.1 Deploy the Prometheus and node_exporter services
# Node 161
root@ubuntu18-server:~# systemctl status prometheus.service
● prometheus.service - Prometheus Server
Loaded: loaded (/etc/systemd/system/prometheus.service; enabled; vendor preset: enabled)
Active: active (running) since Mon 2024-03-25 01:53:53 UTC; 7min ago
Docs: https://prometheus.io/docs/introduction/overview/
Main PID: 1261 (prometheus)
Tasks: 9 (limit: 4623)
CGroup: /system.slice/prometheus.service
└─1261 /apps/prometheus/prometheus --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle
# Node 162 (node 163 is set up the same way)
root@ubuntu18-server2:~# systemctl status prometheus.service
● prometheus.service - Prometheus Server
Loaded: loaded (/etc/systemd/system/prometheus.service; enabled; vendor preset: enabled)
Active: active (running) since Mon 2024-03-25 01:53:55 UTC; 8min ago
Docs: https://prometheus.io/docs/introduction/overview/
Main PID: 1399 (prometheus)
Tasks: 9 (limit: 4623)
CGroup: /system.slice/prometheus.service
└─1399 /apps/prometheus/prometheus --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle
root@ubuntu18-server2:~# systemctl status node-exporter.service
● node-exporter.service - Prometheus Node Exporter
Loaded: loaded (/etc/systemd/system/node-exporter.service; enabled; vendor preset: enabled)
Active: active (running) since Mon 2024-03-25 01:53:55 UTC; 8min ago
Main PID: 1391 (node_exporter)
Tasks: 5 (limit: 4623)
CGroup: /system.slice/node-exporter.service
└─1391 /apps/node_exporter/node_exporter
7.2 Configure the federated nodes to collect node-exporter data
# Modify the configuration file on node 162
root@ubuntu18-server2:~# vim /apps/prometheus/prometheus.yml
  - job_name: "prometheus-node1"
    static_configs:
    - targets: ["192.168.119.162:9100"]
root@ubuntu18-server2:~# systemctl daemon-reload && systemctl restart prometheus.service
# Modify the configuration file on node 163
root@ubuntu18-server3:~# vim /apps/prometheus/prometheus.yml
  - job_name: "prometheus-node2"
    static_configs:
    - targets: ["192.168.119.163:9100"]
root@ubuntu18-server3:~# systemctl daemon-reload && systemctl restart prometheus.service
7.3 Collect the federated nodes' data on node 161
root@ubuntu18-server:~# vim /apps/prometheus/prometheus.yml
  - job_name: "prometheus-federa-162"
    scrape_interval: 10s
    honor_labels: true
    metrics_path: "/federate"
    params:
      'match[]':
        - '{job="prometheus"}'        # data the federated node collects about itself
        - '{job="prometheus-node1"}'  # data the federated node collects from other hosts
    static_configs:
    - targets: ["192.168.119.162:9090"]
  - job_name: "prometheus-federa-163"
    scrape_interval: 10s
    honor_labels: true
    metrics_path: "/federate"
    params:
      'match[]':
        - '{job="prometheus"}'        # data the federated node collects about itself
        - '{job="prometheus-node2"}'  # data the federated node collects from other hosts
    static_configs:
    - targets: ["192.168.119.163:9090"]
root@ubuntu18-server:~# systemctl restart prometheus.service
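The federated data can then be verified on node 161, for example by querying one of the federated series through the API (node 161's address is assumed to be 192.168.119.161, following the naming above):
curl -s 'http://192.168.119.161:9090/api/v1/query' --data 'query=node_load1'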