Prometheus (Part 1)

Official site: https://prometheus.io/

1. Installing a Prometheus environment with the Operator and with binaries

1.1 Operator-based deployment

The Operator deployment is based on pre-written YAML files and can deploy Prometheus server, Alertmanager, Grafana, node-exporter, and the other components in one batch.

Prerequisite: a deployed Kubernetes cluster

https://github.com/prometheus-operator/kube-prometheus

1.1.1 Download the project files

# Download and extract
wget https://github.com/prometheus-operator/kube-prometheus/archive/refs/heads/main.zip
unzip main.zip
cd kube-prometheus-main/manifests

1.1.2 Check the images required by the YAML files

Some of the images cannot be pulled directly, so download them in advance.

[root@k8s-deploy manifests]#grep -R 'image: ' ./*
./alertmanager-alertmanager.yaml: image: quay.io/prometheus/alertmanager:v0.25.0
./blackboxExporter-deployment.yaml: image: quay.io/prometheus/blackbox-exporter:v0.23.0
./blackboxExporter-deployment.yaml: image: jimmidyson/configmap-reload:v0.5.0
./blackboxExporter-deployment.yaml: image: quay.io/brancz/kube-rbac-proxy:v0.14.0
./grafana-deployment.yaml: image: grafana/grafana:9.3.6
./kubeStateMetrics-deployment.yaml: image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.8.0
./kubeStateMetrics-deployment.yaml: image: quay.io/brancz/kube-rbac-proxy:v0.14.0
./kubeStateMetrics-deployment.yaml: image: quay.io/brancz/kube-rbac-proxy:v0.14.0
./nodeExporter-daemonset.yaml: image: quay.io/prometheus/node-exporter:v1.5.0
./nodeExporter-daemonset.yaml: image: quay.io/brancz/kube-rbac-proxy:v0.14.0
./prometheus-prometheus.yaml: image: quay.io/prometheus/prometheus:v2.42.0
./prometheusAdapter-deployment.yaml: image: registry.k8s.io/prometheus-adapter/prometheus-adapter:v0.10.0
./prometheusOperator-deployment.yaml: image: quay.io/prometheus-operator/prometheus-operator:v0.63.0
./prometheusOperator-deployment.yaml: image: quay.io/brancz/kube-rbac-proxy:v0.14.0

1.1.3 Pull the images

docker pull grafana/grafana:9.3.6
docker pull jimmidyson/configmap-reload:v0.5.0
docker pull quay.io/brancz/kube-rbac-proxy:v0.14.0
docker pull quay.io/prometheus/alertmanager:v0.25.0
docker pull quay.io/prometheus/blackbox-exporter:v0.23.0
docker pull quay.io/prometheus/node-exporter:v1.5.0
docker pull quay.io/prometheus/prometheus:v2.42.0
docker pull quay.io/prometheus-operator/prometheus-operator:v0.63.0
docker pull registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.8.0
docker pull registry.k8s.io/prometheus-adapter/prometheus-adapter:v0.10.0
# If some images cannot be pulled directly, search Docker Hub for equivalent images
docker pull bitnami/kube-state-metrics:2.8.0
docker pull v5cn/prometheus-adapter:v0.10.0
#docker pull bitnami/kube-rbac-proxy:0.14.0

1.1.4 Push the images to the local Harbor registry

# Tag the images according to what was actually pulled
docker tag grafana/grafana:9.3.6 harbor.chu.net/baseimages/grafana:9.3.6
docker tag jimmidyson/configmap-reload:v0.5.0 harbor.chu.net/baseimages/configmap-reload:v0.5.0
docker tag quay.io/brancz/kube-rbac-proxy:v0.14.0 harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0
docker tag quay.io/prometheus/alertmanager:v0.25.0 harbor.chu.net/baseimages/alertmanager:v0.25.0
docker tag quay.io/prometheus/blackbox-exporter:v0.23.0 harbor.chu.net/baseimages/blackbox-exporter:v0.23.0
docker tag quay.io/prometheus/node-exporter:v1.5.0 harbor.chu.net/baseimages/node-exporter:v1.5.0
docker tag quay.io/prometheus/prometheus:v2.42.0 harbor.chu.net/baseimages/prometheus:v2.42.0
docker tag quay.io/prometheus-operator/prometheus-operator:v0.63.0 harbor.chu.net/baseimages/prometheus-operator:v0.63.0
## If the original images cannot be pulled, tag the substitute images instead
#docker tag registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.8.0 harbor.chu.net/baseimages/kube-state-metrics:v2.8.0
#docker tag registry.k8s.io/prometheus-adapter/prometheus-adapter:v0.10.0 harbor.chu.net/baseimages/prometheus-adapter:v0.10.0
docker tag bitnami/kube-state-metrics:2.8.0 harbor.chu.net/baseimages/kube-state-metrics:v2.8.0
docker tag v5cn/prometheus-adapter:v0.10.0 harbor.chu.net/baseimages/prometheus-adapter:v0.10.0
# Push the images
docker push harbor.chu.net/baseimages/grafana:9.3.6
docker push harbor.chu.net/baseimages/configmap-reload:v0.5.0
docker push harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0
docker push harbor.chu.net/baseimages/alertmanager:v0.25.0
docker push harbor.chu.net/baseimages/blackbox-exporter:v0.23.0
docker push harbor.chu.net/baseimages/node-exporter:v1.5.0
docker push harbor.chu.net/baseimages/prometheus:v2.42.0
docker push harbor.chu.net/baseimages/prometheus-operator:v0.63.0
docker push harbor.chu.net/baseimages/kube-state-metrics:v2.8.0
docker push harbor.chu.net/baseimages/prometheus-adapter:v0.10.0

1.1.5 Update the image names in the YAML files

sed -i 's@quay.io/prometheus/alertmanager:v0.25.0@harbor.chu.net/baseimages/alertmanager:v0.25.0@g' alertmanager-alertmanager.yaml
sed -i 's@quay.io/prometheus/blackbox-exporter:v0.23.0@harbor.chu.net/baseimages/blackbox-exporter:v0.23.0@g' blackboxExporter-deployment.yaml
sed -i 's@jimmidyson/configmap-reload:v0.5.0@harbor.chu.net/baseimages/configmap-reload:v0.5.0@g' blackboxExporter-deployment.yaml
sed -i 's@quay.io/brancz/kube-rbac-proxy:v0.14.0@harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0@g' blackboxExporter-deployment.yaml
sed -i 's@grafana/grafana:9.3.6@harbor.chu.net/baseimages/grafana:9.3.6@g' grafana-deployment.yaml
sed -i 's@registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.8.0@harbor.chu.net/baseimages/kube-state-metrics:v2.8.0@g' kubeStateMetrics-deployment.yaml
sed -i 's@quay.io/brancz/kube-rbac-proxy:v0.14.0@harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0@g' kubeStateMetrics-deployment.yaml
sed -i 's@quay.io/prometheus/node-exporter:v1.5.0@harbor.chu.net/baseimages/node-exporter:v1.5.0@g' nodeExporter-daemonset.yaml
sed -i 's@quay.io/brancz/kube-rbac-proxy:v0.14.0@harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0@g' nodeExporter-daemonset.yaml
sed -i 's@quay.io/prometheus/prometheus:v2.42.0@harbor.chu.net/baseimages/prometheus:v2.42.0@g' prometheus-prometheus.yaml
sed -i 's@registry.k8s.io/prometheus-adapter/prometheus-adapter:v0.10.0@harbor.chu.net/baseimages/prometheus-adapter:v0.10.0@g' prometheusAdapter-deployment.yaml
sed -i 's@quay.io/prometheus-operator/prometheus-operator:v0.63.0@harbor.chu.net/baseimages/prometheus-operator:v0.63.0@g' prometheusOperator-deployment.yaml
sed -i 's@quay.io/brancz/kube-rbac-proxy:v0.14.0@harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0@g' prometheusOperator-deployment.yaml

Check the updated image references:

[root@k8s-deploy manifests]#grep -R 'image: ' ./*
./alertmanager-alertmanager.yaml: image: harbor.chu.net/baseimages/alertmanager:v0.25.0
./blackboxExporter-deployment.yaml: image: harbor.chu.net/baseimages/blackbox-exporter:v0.23.0
./blackboxExporter-deployment.yaml: image: harbor.chu.net/baseimages/configmap-reload:v0.5.0
./blackboxExporter-deployment.yaml: image: harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0
./grafana-deployment.yaml: image: harbor.chu.net/baseimages/grafana:9.3.6
./kubeStateMetrics-deployment.yaml: image: harbor.chu.net/baseimages/kube-state-metrics:v2.8.0
./kubeStateMetrics-deployment.yaml: image: harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0
./kubeStateMetrics-deployment.yaml: image: harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0
./nodeExporter-daemonset.yaml: image: harbor.chu.net/baseimages/node-exporter:v1.5.0
./nodeExporter-daemonset.yaml: image: harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0
./prometheus-prometheus.yaml: image: harbor.chu.net/baseimages/prometheus:v2.42.0
./prometheusAdapter-deployment.yaml: image: harbor.chu.net/baseimages/prometheus-adapter:v0.10.0
./prometheusOperator-deployment.yaml: image: harbor.chu.net/baseimages/prometheus-operator:v0.63.0
./prometheusOperator-deployment.yaml: image: harbor.chu.net/baseimages/kube-rbac-proxy:v0.14.0

1.1.6 Apply the manifests

# Create the CRDs and the monitoring namespace first
kubectl apply --server-side -f setup/
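# Optionally wait until the CRDs are established before the next apply
# (the upstream kube-prometheus README suggests this step)
kubectl wait \
  --for condition=Established \
  --all CustomResourceDefinition \
  --namespace=monitoring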
# Then create the components
kubectl apply -f ./

Check pod status:

[root@k8s-deploy manifests]#kubectl get pod -n monitoring
NAME READY STATUS RESTARTS AGE
alertmanager-main-0 2/2 Running 1 (27s ago) 51s
alertmanager-main-1 2/2 Running 1 (45s ago) 51s
alertmanager-main-2 2/2 Running 1 (46s ago) 51s
blackbox-exporter-85f8d5786b-pp4sc 3/3 Running 0 70s
grafana-ddfb4c79b-5l2sx 1/1 Running 0 67s
kube-state-metrics-5768c678b8-9wgrp 3/3 Running 0 65s
node-exporter-6glzk 2/2 Running 0 65s
node-exporter-85xbk 2/2 Running 0 65s
node-exporter-98gt7 2/2 Running 0 65s
node-exporter-lx6cl 2/2 Running 0 65s
node-exporter-m74nh 2/2 Running 0 65s
node-exporter-x9q8m 2/2 Running 0 65s
prometheus-adapter-856b98ffc5-8nn69 1/1 Running 0 62s
prometheus-adapter-856b98ffc5-mmmr4 1/1 Running 0 62s
prometheus-k8s-0 2/2 Running 0 49s
prometheus-k8s-1 2/2 Running 0 49s
prometheus-operator-5c7945d6cd-rznx7 2/2 Running 0 61s

Check the services; the default type is ClusterIP:

[root@k8s-deploy manifests]#kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
alertmanager-main ClusterIP 10.100.37.77 <none> 9093/TCP,8080/TCP 93s
alertmanager-operated ClusterIP None <none> 9093/TCP,9094/TCP,9094/UDP 74s
blackbox-exporter ClusterIP 10.100.23.226 <none> 9115/TCP,19115/TCP 92s
grafana ClusterIP 10.100.165.17 <none> 3000/TCP 89s
kube-state-metrics ClusterIP None <none> 8443/TCP,9443/TCP 88s
node-exporter ClusterIP None <none> 9100/TCP 87s
prometheus-adapter ClusterIP 10.100.20.101 <none> 443/TCP 84s
prometheus-k8s ClusterIP 10.100.244.224 <none> 9090/TCP,8080/TCP 85s
prometheus-operated ClusterIP None <none> 9090/TCP 71s
prometheus-operator ClusterIP None <none> 8443/TCP 83s

Network policies are created by default; they can be deleted first and adjusted later according to actual requirements.

[root@k8s-deploy manifests]#for i in `ls |grep network`;do kubectl delete -f $i;done
networkpolicy.networking.k8s.io "alertmanager-main" deleted
networkpolicy.networking.k8s.io "blackbox-exporter" deleted
networkpolicy.networking.k8s.io "grafana" deleted
networkpolicy.networking.k8s.io "kube-state-metrics" deleted
networkpolicy.networking.k8s.io "node-exporter" deleted
networkpolicy.networking.k8s.io "prometheus-k8s" deleted
networkpolicy.networking.k8s.io "prometheus-adapter" deleted
networkpolicy.networking.k8s.io "prometheus-operator" deleted

1.1.7 Verify the Prometheus web UI

To access it from a client browser, change the service type in prometheus-service.yaml to NodePort.

apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 2.42.0
  name: prometheus-k8s
  namespace: monitoring
spec:
  type: NodePort              # add the NodePort type
  ports:
  - name: web
    port: 9090
    targetPort: web
    nodePort: 39090           # set the node port
  - name: reloader-web
    port: 8080
    targetPort: reloader-web
    nodePort: 38080           # set the node port
  selector:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: kube-prometheus
  sessionAffinity: ClientIP

Apply the updated service and check the node ports exposed by prometheus-k8s:

[root@k8s-deploy manifests]#kubectl apply -f prometheus-service.yaml
[root@k8s-deploy manifests]#kubectl get svc -n monitoring|grep prometheus
prometheus-adapter ClusterIP 10.100.20.101 <none> 443/TCP 15m
prometheus-k8s NodePort 10.100.244.224 <none> 9090:39090/TCP,8080:38080/TCP 15m
prometheus-operated ClusterIP None <none> 9090/TCP 15m
prometheus-operator ClusterIP None <none> 8443/TCP 15m

Access it in a browser.

Check the Status pages.

1.1.8 Verify the Grafana web UI

To access it from a client browser, change the service type in grafana-service.yaml to NodePort.

apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/component: grafana
    app.kubernetes.io/name: grafana
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 9.3.6
  name: grafana
  namespace: monitoring
spec:
  type: NodePort              # add the NodePort type
  ports:
  - name: http
    port: 3000
    targetPort: http
    nodePort: 33000           # set the node port
  selector:
    app.kubernetes.io/component: grafana
    app.kubernetes.io/name: grafana
    app.kubernetes.io/part-of: kube-prometheus

Apply the updated service:

kubectl apply -f grafana-service.yaml
[root@k8s-deploy manifests]#kubectl get svc -n monitoring|grep grafa
grafana NodePort 10.100.165.17 <none> 3000:33000/TCP 44m

Access it in a browser; the default username and password are admin / admin.

Enter the home page.

1.2 Binary deployment

https://github.com/prometheus

# Prometheus is deployed on a server in the k8s cluster
prometheus-server1/k8s-master1 10.0.0.11

1.2.1 Download the Prometheus server binary

Download page: https://github.com/prometheus/prometheus/releases

mkdir /apps
cd /apps
wget https://github.com/prometheus/prometheus/releases/download/v2.40.7/prometheus-2.40.7.linux-amd64.tar.gz
tar -xvf prometheus-2.40.7.linux-amd64.tar.gz
ln -s /apps/prometheus-2.40.7.linux-amd64 /apps/prometheus

1.2.2 Start the Prometheus service

  1. Create the systemd service file
cat >>/etc/systemd/system/prometheus.service <<EOF
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/apps/prometheus/
ExecStart=/apps/prometheus/prometheus --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle
[Install]
WantedBy=multi-user.target
EOF

Note: --web.enable-lifecycle enables dynamic configuration reloading; the configuration file can then be reloaded with curl -X POST http://localhost:9090/-/reload.

Reference for Prometheus startup flags: https://www.cnblogs.com/lifuqiang/articles/17007950.html
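Before triggering a reload it is worth validating the configuration with promtool, which ships in the same tarball. A minimal sketch, assuming the layout from section 1.2.1:

# Validate the configuration file
/apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
# Ask the running server to reload it (requires --web.enable-lifecycle)
curl -X POST http://localhost:9090/-/reload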

  2. Start the service
systemctl daemon-reload
systemctl enable --now prometheus.service
  3. Verify the status
# Check the service status
[root@k8s-master1 apps]#systemctl is-active prometheus
active
# Check the listening port
[root@k8s-master1 apps]#netstat -nltp|grep 9090
tcp6 0 0 :::9090 :::* LISTEN 1965/prometheus

1.2.3 Verify the Prometheus web UI

2. Collecting metrics with node-exporter and cAdvisor

2.1 node-exporter

Install node-exporter on each k8s node (as a binary or as a DaemonSet) to collect host-level monitoring metrics from the node hosts; the default listening port is 9100.

2.1.1 Deploy node-exporter as a DaemonSet

Note: if node-exporter has already been deployed in the k8s environment by other means, stop it first or change its listening port to avoid port conflicts.

2.1.1.1 Pull the node-exporter image

docker pull prom/node-exporter:v1.3.1
docker tag prom/node-exporter:v1.3.1 harbor.chu.net/baseimages/node-exporter:v1.3.1
docker push harbor.chu.net/baseimages/node-exporter:v1.3.1

2.1.1.2 Write the YAML manifest

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    k8s-app: node-exporter
spec:
  selector:
    matchLabels:
      k8s-app: node-exporter
  template:
    metadata:
      labels:
        k8s-app: node-exporter
    spec:
      tolerations:                 # tolerate the master NoSchedule taint
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
      containers:
      - image: harbor.chu.net/baseimages/node-exporter:v1.3.1   # prom/node-exporter:v1.3.1
        imagePullPolicy: Always    # image pull policy (or IfNotPresent)
        name: prometheus-node-exporter
        ports:
        - containerPort: 9100
          hostPort: 9100           # port exposed on the host
          protocol: TCP
          name: metrics
        volumeMounts:
        - mountPath: /host/proc
          name: proc
        - mountPath: /host/sys
          name: sys
        - mountPath: /host
          name: rootfs
        args:
        - --path.procfs=/host/proc
        - --path.sysfs=/host/sys
        - --path.rootfs=/host
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: sys
        hostPath:
          path: /sys
      - name: rootfs
        hostPath:
          path: /
      hostNetwork: true
      hostPID: true
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: "true"
  labels:
    k8s-app: node-exporter
  name: node-exporter
  namespace: monitoring
spec:
  type: NodePort
  ports:
  - name: http
    port: 9100
    nodePort: 39100
    protocol: TCP
  selector:
    k8s-app: node-exporter

2.1.1.3 Apply the manifest

kubectl create ns monitoring
kubectl apply -f daemonset-deploy-node-exporter.yaml

Check the status:

# Check pod status
[root@k8s-deploy ~]#kubectl get pod -n monitoring -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
node-exporter-7s7kf 1/1 Running 0 3m21s 10.0.0.43 10.0.0.43 <none> <none>
node-exporter-hjk6c 1/1 Running 0 3m21s 10.0.0.13 10.0.0.13 <none> <none>
node-exporter-qn8w7 1/1 Running 5 (115s ago) 3m21s 10.0.0.42 10.0.0.42 <none> <none>
node-exporter-qx9kg 1/1 Running 0 3m21s 10.0.0.41 10.0.0.41 <none> <none>
node-exporter-rcszx 1/1 Running 0 3m21s 10.0.0.12 10.0.0.12 <none> <none>
node-exporter-x8hft 1/1 Running 0 3m21s 10.0.0.11 10.0.0.11 <none> <none>
# Check the service
[root@k8s-deploy ~]#kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
node-exporter NodePort 10.100.100.181 <none> 9100:39100/TCP 3m39s
# Listening port on the host
[root@k8s-master3 ~]#netstat -ntlp|grep 9100
tcp6 0 0 :::9100 :::* LISTEN 1854108/node_export

2.1.1.4 Verify the node-exporter web page

Access <node IP>:39100 in a browser.

2.1.1.5 Verify the node-exporter metrics

https://knowledge.zhaoweiguo.com/build/html/cloudnative/prometheus/metrics/kubernetes-nodes.html

Access the service at <node IP>:39100/metrics

Or access <node IP>:9100/metrics on the host directly

Common metrics:

node_boot_time     total running time since the system booted
node_cpu           system CPU usage
node_disk*         disk I/O
node_filesystem*   filesystem usage
node_load1         system load (1 minute)
node_memory*       memory usage
node_network*      network bandwidth metrics
go_*               Go runtime metrics of node-exporter itself
process_*          process metrics of the node-exporter process itself
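A couple of PromQL expressions built from these metrics, as a sketch (metric names as exposed by node-exporter v1.x):

# CPU utilization per node, in percent (everything that is not idle)
100 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100
# Memory utilization per node, in percent
(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100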

2.1.2 Collect node-exporter data with the Prometheus server

2.1.2.1 Add the node scrape configuration

[root@k8s-master1 apps]#cat /apps/prometheus/prometheus.yml
# Global configuration
global:
  scrape_interval: 15s        # scrape interval, default 1m
  evaluation_interval: 15s    # rule evaluation interval, default 1m
  # scrape_timeout: 10s       # scrape timeout, default 10s; it must not be greater than scrape_interval, otherwise Prometheus reports an error
# Alerting configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093
# Rule files
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
# Scrape targets
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  # Add the node scrape configuration
  - job_name: "prometheus-node"
    static_configs:           # static configuration
      - targets: ["10.0.0.11:9100","10.0.0.12:9100","10.0.0.13:9100","10.0.0.41:9100","10.0.0.42:9100","10.0.0.43:9100"]   # node addresses and ports

2.1.2.2 Restart the service

systemctl restart prometheus.service
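After the restart, the scrape status can also be checked from the command line through the Prometheus HTTP API, in addition to the Status > Targets page in the web UI. A sketch, assuming jq is installed:

# List every target of the prometheus-node job together with its health
curl -s http://localhost:9090/api/v1/targets | \
  jq -r '.data.activeTargets[] | select(.labels.job=="prometheus-node") | "\(.labels.instance) \(.health)"'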

2.1.2.3 Verify the scrape status on the Prometheus server

2.1.2.4 Verify the node data

2.2 cAdvisor

cAdvisor (Container Advisor) collects information about all containers running on a host and also provides a basic query UI and an HTTP interface, which makes it easy for other components such as Prometheus to scrape the data. cAdvisor monitors the containers on a node in real time and collects performance data, including CPU usage, memory usage, network throughput, and filesystem usage.

https://github.com/google/cadvisor

2.2.1 Deploy cAdvisor as a DaemonSet

2.2.1.1 Pull the cAdvisor image

docker pull registry.cn-hangzhou.aliyuncs.com/zhangshijie/cadvisor-amd64:v0.39.3
docker tag registry.cn-hangzhou.aliyuncs.com/zhangshijie/cadvisor-amd64:v0.39.3 harbor.chu.net/baseimages/cadvisor-amd64:v0.39.3
docker push harbor.chu.net/baseimages/cadvisor-amd64:v0.39.3

2.2.1.2 Write the YAML manifest

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: cadvisor
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: cAdvisor
  template:
    metadata:
      labels:
        app: cAdvisor
    spec:
      tolerations:               # tolerate the master NoSchedule taint
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
      hostNetwork: true
      restartPolicy: Always      # restart policy
      containers:
      - name: cadvisor
        #image: registry.cn-hangzhou.aliyuncs.com/zhangshijie/cadvisor-amd64:v0.39.3   # replace with the actual image
        image: harbor.chu.net/baseimages/cadvisor-amd64:v0.39.3
        imagePullPolicy: Always  # image pull policy
        ports:
        - containerPort: 8080
        volumeMounts:
        - name: root
          mountPath: /rootfs
        - name: run
          mountPath: /var/run
        - name: sys
          mountPath: /sys
        - name: docker
          mountPath: /var/lib/containerd
      volumes:
      - name: root
        hostPath:
          path: /
      - name: run
        hostPath:
          path: /var/run
      - name: sys
        hostPath:
          path: /sys
      - name: docker
        hostPath:
          path: /var/lib/containerd   # default containerd data directory; for docker it is /var/lib/docker

2.2.1.3 Apply the manifest

kubectl create ns monitoring
kubectl apply -f daemonset-deploy-cadvisor.yaml
# Check pods
[root@k8s-deploy case]#kubectl get pod -n monitoring -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
cadvisor-4tf8k 1/1 Running 0 14s 10.0.0.43 10.0.0.43 <none> <none>
cadvisor-bmq2c 1/1 Running 0 14s 10.0.0.12 10.0.0.12 <none> <none>
cadvisor-l5zg6 1/1 Running 0 14s 10.0.0.41 10.0.0.41 <none> <none>
cadvisor-lhzrb 1/1 Running 0 14s 10.0.0.42 10.0.0.42 <none> <none>
cadvisor-pkht7 1/1 Running 0 14s 10.0.0.13 10.0.0.13 <none> <none>
cadvisor-ww5p9 1/1 Running 0 14s 10.0.0.11 10.0.0.11 <none> <none>
# Check the listening port on the host
[root@k8s-node1 apps]#netstat -ntlp|grep 8080
tcp6 0 0 :::8080 :::* LISTEN 3795698/cadvisor

2.2.1.4 Verify the web page and metrics

  • Access <node IP>:8080 in a browser to view the web page.

  • Access <node IP>:8080/metrics to view the metric data (see the command-line sketch below).
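The same data can also be pulled quickly from the command line; a sketch against one of the nodes used above:

# Show a few CPU usage samples exposed by cAdvisor
curl -s http://10.0.0.41:8080/metrics | grep '^container_cpu_usage_seconds_total' | head -n 5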

2.2.2 Collect cAdvisor data with the Prometheus server

2.2.2.1 cAdvisor metrics

Metric name                              Type     Meaning
container_cpu_load_average_10s           gauge    average CPU load of the container over the last 10s
container_cpu_usage_seconds_total        counter  cumulative CPU time consumed by the container per CPU core (seconds)
container_cpu_system_seconds_total       counter  cumulative system CPU time consumed (seconds)
container_cpu_user_seconds_total         counter  cumulative user CPU time consumed (seconds)
container_fs_usage_bytes                 gauge    filesystem usage inside the container (bytes)
container_fs_limit_bytes                 gauge    total filesystem capacity available to the container (bytes)
container_fs_reads_bytes_total           counter  cumulative bytes read by the container
container_fs_writes_bytes_total          counter  cumulative bytes written by the container
container_memory_max_usage_bytes         gauge    maximum memory usage of the container (bytes)
container_memory_usage_bytes             gauge    current memory usage of the container (bytes)
container_spec_memory_limit_bytes        gauge    memory limit of the container
machine_memory_bytes                     gauge    total memory of the host
container_network_receive_bytes_total    counter  cumulative bytes received over the container network
container_network_transmit_bytes_total   counter  cumulative bytes transmitted over the container network

Once cAdvisor samples are being collected normally, container CPU usage and related values can be computed with the following expressions:

  • Container CPU usage

    sum(irate(container_cpu_usage_seconds_total{image!=""}[1m])) without(cpu)
  • Container memory usage (bytes)

    container_memory_usage_bytes{image!=""}
  • Container network receive rate (bytes/second)

    sum(rate(container_network_receive_bytes_total{image!=""}[1m])) without (interface)
  • Container network transmit rate (bytes/second)

    sum(rate(container_network_transmit_bytes_total{image!=""}[1m])) without (interface)
  • Container filesystem read rate (bytes/second)

    sum(rate(container_fs_reads_bytes_total{image!=""}[1m])) without (device)
  • Container filesystem write rate (bytes/second)

    sum(rate(container_fs_writes_bytes_total{image!=""}[1m])) without (device)

Commonly used cAdvisor container monitoring queries:

  1. Network traffic

    # bytes received by container networks (over 1 minute), filtered by name with name=~".+"
    sum(rate(container_network_receive_bytes_total{name=~".+"}[1m])) by (name)
    # bytes transmitted by container networks (over 1 minute), filtered by name with name=~".+"
    sum(rate(container_network_transmit_bytes_total{name=~".+"}[1m])) by (name)

  2. Container CPU

    # cumulative system CPU time of all containers (over 1 minute)
    sum(rate(container_cpu_system_seconds_total[1m]))
    # system CPU time of each container (over 1 minute)
    sum(irate(container_cpu_system_seconds_total{image!=""}[1m])) without (cpu)
    # CPU usage of each container (%)
    sum(rate(container_cpu_usage_seconds_total{name=~".+"}[1m])) by (name)*100
    # total CPU usage of all containers (%)
    sum(sum(rate(container_cpu_usage_seconds_total{name=~".+"}[1m])) by (name)*100)

2.2.2.2 Add the cAdvisor scrape configuration

#cat /apps/prometheus/prometheus.yml
......
scrape_configs:
......
  # Add the cAdvisor targets
  - job_name: "prometheus-cadvisor"
    static_configs:
      - targets: ["10.0.0.11:8080","10.0.0.12:8080","10.0.0.13:8080","10.0.0.41:8080","10.0.0.42:8080","10.0.0.43:8080"]

Restart the service:

systemctl restart prometheus.service

2.2.2.3 Verify the Prometheus scrape status

2.2.2.4 Verify cAdvisor data

sum(rate(container_cpu_usage_seconds_total{name=~".+"}[1m])) by (name)*100

3. Displaying Prometheus node and pod data with Grafana

Official site: https://grafana.com/grafana

The Grafana server (10.0.0.62) is deployed separately from the Prometheus server.

3.1 Binary deployment of Grafana

Download: https://grafana.com/grafana/download

Download mirror (China): https://mirrors.tuna.tsinghua.edu.cn/grafana/

Installation guide: https://grafana.com/docs/grafana/latest/setup-grafana/installation/

3.1.1 Download and install

wget https://mirrors.tuna.tsinghua.edu.cn/grafana/apt/pool/main/g/grafana-enterprise/grafana-enterprise_9.3.0_amd64.deb
apt update
apt-get install -y adduser libfontconfig1
dpkg -i grafana-enterprise_9.3.0_amd64.deb

3.1.2 Edit the Grafana configuration file

vim /etc/grafana/grafana.ini
......
# Configure the protocol, bind address, and port
[server]
protocol = http
http_addr = 10.0.0.62
http_port = 3000

3.1.3 Start the service

systemctl enable grafana-server.service
systemctl restart grafana-server.service

Check the port:

[root@grafana opt]#netstat -ntlp|grep 3000
tcp 0 0 10.0.0.62:3000 0.0.0.0:* LISTEN 5268/grafana-server

3.1.4 Verify the Grafana web UI

  1. Log in at http://10.0.0.62:3000

  2. Enter the home page

3.1.5 Add a data source

Select Prometheus as the data source type.

Set the data source name and the URL of the Prometheus server.

3.2 Display monitoring data

Dashboard templates: https://grafana.com/grafana/dashboards/

3.2.1 Display node data

3.2.1.1 Find a template

3.2.1.2 Review the template details and download it

3.2.1.3 Import the template

Dashboards -> Import

A template can be imported by uploading its JSON file, loading its template ID (which downloads it automatically), or pasting the JSON content.

Select the data source.

3.2.1.4 Display node monitoring data

Go to the home page, select the corresponding dashboard, and view the monitoring data.

3.2.2 Display pod data

3.2.2.1 Find a template

3.2.2.2 Review the template details and download it

3.2.2.3 Import the template

3.2.2.4 Display pod monitoring data


4. Prometheus service discovery

4.1 Service discovery mechanism

By default Prometheus pulls monitoring data: it periodically scrapes metrics from target hosts, and each scraped target must expose an HTTP endpoint from which Prometheus fetches the metric data. With this model the set of scrape targets is determined by the jobs configured in scrape_configs, so new services cannot be detected dynamically; whenever nodes or components are added, the Prometheus configuration has to be edited by hand and Prometheus restarted, which is inconvenient. Dynamic service discovery solves this: it automatically discovers new endpoints in the cluster and adds them to the configuration. Through service discovery Prometheus obtains the list of targets to monitor and then polls those targets for metric data.

4.2 Relabeling

Prometheus relabeling can dynamically rewrite, add, or override the metadata labels of a target before it is scraped.

After Prometheus discovers targets from the Kubernetes API, every discovered target carries some raw metadata labels. The default labels include:

__address__: the target address in <host>:<port> format
__scheme__: the scheme of the target service address, HTTP or HTTPS
__metrics_path__: the metrics path of the target service

4.2.1 Purpose of relabeling

To make monitoring metrics easier to identify and later use for graphing, alerting, and so on, Prometheus supports modifying the labels of discovered targets. Relabeling can happen at two stages:

configuration -> relabel_configs -> scrape -> metric_relabel_configs -> TSDB

  • relabel_configs

    Applied before a target is scraped (for example, to redefine label information such as the target IP or port before data collection). relabel_configs can add or modify labels, and can also be used to scrape only specific targets or to filter targets out.

  • metric_relabel_configs

    Applied after a target has been scraped, i.e. on metric data that has already been collected, metric_relabel_configs performs the final relabeling and filtering.

4.2.2 label

  • source_labels

    the source labels, i.e. the label names before relabel processing

  • target_label

    the new label name produced after the action is applied

  • regex

    a value or regular expression matched against the value of the source labels

  • replacement

    the value written to target_label after replacement; capture groups of regex can be referenced, e.g. $1:$2

4.2.3 action

https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config

  • replace

    replace the label value: regex is matched against the value of the source labels, and replacement (which may reference the matched capture groups) provides the new value

  • keep

    scrape only the instances that match the regex; targets whose source_labels do not match the regex are dropped, i.e. only successfully matched instances are collected

  • drop

    do not scrape the instances that match the regex; targets whose source_labels match the regex are dropped, i.e. only instances that do not match are collected

  • hashmod

    compute the hash of the source_labels modulo a configured modulus; this can be used to classify targets or shard them, for example:

scrape_configs:
- job_name: ip_job
  relabel_configs:
  - source_labels: [__address__]
    modulus: 4
    target_label: __ip_hash
    action: hashmod
  - source_labels: [__ip_hash]
    regex: ^1$
    action: keep

  • labelmap

    match all label names against regex, then copy the values of the matching labels to new label names built from replacement, which can reference the capture groups (${1}, ${2}, ...)

  • labelkeep

    match all label names against regex; labels that do not match are removed from the label set

  • labeldrop

    match all label names against regex; labels that do match are removed from the label set

4.3 Service discovery types

Prometheus can obtain scrape targets in several ways, such as static configuration and dynamic service discovery. Prometheus currently supports many service discovery mechanisms; the commonly used ones are:

static configuration, file-based service discovery, DNS service discovery, Consul service discovery, and Kubernetes API service discovery.

More details: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#configuration-file

4.3.1 Static configuration

With static configuration the monitored targets are specified directly in the Prometheus configuration file; whenever a new target instance needs to be monitored, the configuration file has to be edited by hand to add the target.

Prometheus server configuration (YAML) example:

scrape_configs:
- job_name: "staic_test"        # job name
  # metrics_path: "/metrics"    # default URI
  # scheme: http                # default scheme
  static_configs:               # static configuration
  - targets: ["10.0.0.11:8080","10.0.0.12:8080","10.0.0.13:8080"]   # target endpoint addresses

4.3.2 File-based service discovery

Monitoring targets are discovered from the specified files.

Prometheus server configuration (YAML) example:

scrape_configs:
# file-based service discovery
- job_name: 'file_sd_test'
  scrape_interval: 10s          # scrape interval
  file_sd_configs:
  - files:                      # YAML and JSON files are supported
    - /data/prometheus/static_conf/*.yml
    refresh_interval: 10s       # how often the files are re-read

Content of a YAML file under /data/prometheus/static_conf/:

- targets: ['10.0.0.11:39100','10.0.0.12:39100']
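Because the files are re-read every refresh_interval, targets can be added or removed without editing prometheus.yml or restarting the server. A minimal sketch (the file name nodes.yml is just an example):

cat >> /data/prometheus/static_conf/nodes.yml <<'EOF'
- targets: ['10.0.0.13:39100']
  labels:
    env: test
EOF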

4.3.3 DNS service discovery

DNS-based service discovery allows a set of DNS names to be configured; these names are queried periodically to discover the target list, and they must be resolvable to IPs by the configured DNS servers.

This discovery mechanism only supports basic DNS A, AAAA and SRV record queries.

A record: resolves a name to an IPv4 address
AAAA record: resolves a name to an IPv6 address
SRV record: records which host provides which service, in the form service.protocol.domain (e.g. _example-server._tcp.www.mydns.com)

Prometheus server configuration (YAML) example:

scrape_configs:
- job_name: 'dns_sd_test'
  scrape_interval: 10s          # scrape interval
  dns_sd_configs:
  - names: ["node1.example.com","node2.example.com"]   # DNS names
    type: A
    port: 9100
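For SRV records the port comes from the record itself, so type and port can be omitted (the type defaults to SRV). A sketch, assuming a hypothetical SRV record _node-exporter._tcp.example.com exists:

scrape_configs:
- job_name: 'dns_srv_sd_test'
  dns_sd_configs:
  - names: ["_node-exporter._tcp.example.com"]   # type defaults to SRV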

4.3.4 Consul service discovery

https://www.consul.io/

Consul is an open-source tool written in Go, aimed at distributed, service-oriented systems. It provides service registration, service discovery and configuration management, along with health checking and consistency guarantees.

Consul is a distributed key/value store commonly used for service registration and discovery. With Consul-based discovery Prometheus keeps watching the Consul servers, and whenever the services registered in Consul change, Prometheus automatically picks up all target resources registered in Consul.

Prometheus server configuration (YAML) example:

scrape_configs:
- job_name: 'consul_sd_test'
  honor_labels: true
  metrics_path: "/metrics"
  scheme: http
  consul_sd_configs:
  - server: 10.0.0.11:8500
    services: []                # names of the services to discover; empty means all services
  - server: 10.0.0.12:8500
    services: []
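For a quick test, a service can be registered through the Consul HTTP API and Prometheus will then pick it up automatically. A sketch, assuming a Consul agent on 10.0.0.11:8500 and a node-exporter running on 10.0.0.41:9100:

curl -X PUT http://10.0.0.11:8500/v1/agent/service/register \
  -H 'Content-Type: application/json' \
  -d '{"ID": "node-exporter-41", "Name": "node-exporter", "Address": "10.0.0.41", "Port": 9100}'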

Parameter notes:

honor_labels controls how Prometheus handles conflicts between labels that already exist in the scraped data and the labels Prometheus attaches on the server side (the "job" and "instance" labels, manually configured target labels, and labels generated by service discovery).

If honor_labels is set to "true", label conflicts are resolved by keeping the label values from the scraped data and ignoring the conflicting server-side labels. In addition, if the scraped data contains the label but its value is empty, the local Prometheus label value is used; if the scraped data does not contain the label but Prometheus has it configured, the configured value is used.

If honor_labels is set to "false", label conflicts are resolved by renaming the conflicting labels in the scraped data to exported_<original-label> (e.g. exported_instance, exported_job) and then attaching the server-side labels.

4.3.5 Service discovery via the Kubernetes API

With Kubernetes API based service discovery, Prometheus talks to the Kubernetes API and dynamically discovers all monitorable target resources deployed in Kubernetes.

In Kubernetes, Prometheus integrates with the Kubernetes API and mainly supports five discovery roles: node, service, pod, endpoints and ingress. Different roles suit different scenarios: node is suitable for host-related resources, such as the status of Kubernetes components running on a node and the containers running on it; service and ingress suit black-box monitoring scenarios, such as monitoring service availability and quality; endpoints and pod can both be used to obtain monitoring data from Pod instances, such as applications deployed by users or administrators that expose Prometheus metrics.

Prometheus server configuration example:

...
scrape_configs:
- job_name: "kubernetes_sd_test"
  scheme: http
  kubernetes_sd_configs:
  - role: node

5. Discovering kubernetes-apiserver and CoreDNS in Prometheus

https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config

5.1 Target discovery modes

In Kubernetes, Prometheus integrates with the Kubernetes API and mainly supports five discovery roles: node, service, pod, endpoints and ingress. Different roles suit different scenarios: node is suitable for host-related resources, such as the status of Kubernetes components running on a node and the containers running on it; service and ingress suit black-box monitoring scenarios, such as monitoring service availability and quality; endpoints and pod can both be used to obtain monitoring data from Pod instances, such as applications deployed by users or administrators that expose Prometheus metrics.

node

The node role discovers the address and port of every node in the cluster, with the port defaulting to the Kubelet's HTTP port. The target address defaults to the first existing address of the Kubernetes node object, in the address type order NodeInternalIP, NodeExternalIP, NodeLegacyHostIP and NodeHostName.

Purpose: monitoring host-level metrics of the k8s nodes.

Available labels:

__meta_kubernetes_node_name: name of the node
__meta_kubernetes_node_label_<labelname>: a label of the node, where <labelname> is the label name
__meta_kubernetes_node_labelpresent_<labelname>: true if the label exists
__meta_kubernetes_node_annotation_<annotationname>: an annotation of the node, where <annotationname> is the annotation name
__meta_kubernetes_node_annotationpresent_<annotationname>: true if the annotation exists
__meta_kubernetes_node_address_<address_type>: node address of each type, for example:
  __meta_kubernetes_node_address_Hostname="test-k8s-node1"
  __meta_kubernetes_node_address_InternalIP="10.0.0.11"
instance: the node name obtained from the apiserver

service

The service role discovers the IP and port of each service and uses them as a target. This is useful for black-box monitoring.

In other words, whichever pod the Service currently routes to is the one whose data is reported. It is rarely used; it mainly helps to check whether the business behind a Service is healthy.

Available labels:

__meta_kubernetes_namespace: namespace of the service
__meta_kubernetes_service_annotation_<annotationname>: an annotation of the service
__meta_kubernetes_service_annotationpresent_<annotationname>: true if the annotation exists
__meta_kubernetes_service_cluster_ip: cluster IP of the service
__meta_kubernetes_service_external_name: external name of the service
__meta_kubernetes_service_label_<labelname>: a label of the service
__meta_kubernetes_service_labelpresent_<labelname>: true if the label exists
__meta_kubernetes_service_name: name of the service
__meta_kubernetes_service_port_name: port name of the service
__meta_kubernetes_service_port_protocol: port protocol of the service
__meta_kubernetes_service_type: type of the service

pod

The pod role discovers all pods and uses the pod IP as the target. If a pod has multiple ports or multiple containers, one target is generated per port (for example, with ports 80 and 443 and pod IP 10.0.244.22, both 10.0.244.22:80 and 10.0.244.22:443 become scrape targets).
If a container declares no port, a port-free target is created per container so that a port can be added manually via relabeling.

Available labels:

__meta_kubernetes_namespace: namespace of the pod
__meta_kubernetes_pod_name: name of the pod
__meta_kubernetes_pod_ip: IP of the pod
__meta_kubernetes_pod_label_<labelname>: a label of the pod
__meta_kubernetes_pod_labelpresent_<labelname>: true if the label exists
__meta_kubernetes_pod_annotation_<annotationname>: an annotation of the pod
__meta_kubernetes_pod_annotationpresent_<annotationname>: true if the annotation exists
__meta_kubernetes_pod_container_init: true if the container is an init container
__meta_kubernetes_pod_container_name: name of the container
__meta_kubernetes_pod_container_port_name: port name of the container
__meta_kubernetes_pod_container_port_number: port number of the container
__meta_kubernetes_pod_container_port_protocol: port protocol of the container
__meta_kubernetes_pod_ready: readiness of the pod, true or false
__meta_kubernetes_pod_phase: lifecycle phase of the pod: Pending, Running, Succeeded, Failed or Unknown
__meta_kubernetes_pod_node_name: name of the node the pod is scheduled on
__meta_kubernetes_pod_host_ip: IP of the node the pod is scheduled on
__meta_kubernetes_pod_uid: UID of the pod
__meta_kubernetes_pod_controller_kind: kind of the pod controller: ReplicaSet, DaemonSet, Job, StatefulSet...
__meta_kubernetes_pod_controller_name: name of the pod controller

Endpoints

The endpoints role discovers all targets from the listed endpoints (ep) objects.

Available labels:

__meta_kubernetes_namespace: namespace of the endpoints object
__meta_kubernetes_endpoints_name: name of the endpoints object
For all targets discovered directly from the endpoints list, the following labels are attached:
__meta_kubernetes_endpoint_hostname: hostname of the endpoint
__meta_kubernetes_endpoint_node_name: name of the node hosting the endpoint
__meta_kubernetes_endpoint_ready: readiness of the endpoint, true or false
__meta_kubernetes_endpoint_port_name: port name of the endpoint
__meta_kubernetes_endpoint_port_protocol: port protocol of the endpoint
__meta_kubernetes_endpoint_address_target_kind: kind of the endpoint address target, e.g. Pod
__meta_kubernetes_endpoint_address_target_name: name of the endpoint address target, e.g. the pod name
If the endpoints belong to a service, all labels of the service role are attached.
If the endpoint is backed by a pod, all labels of the pod role (listed above) are attached.
For example, if an endpoints object is created manually and points to a pod, the Prometheus labels will include all pod-role labels of that pod.

Ingress

The ingress role discovers a target for each path of each ingress. This is generally useful for black-box monitoring. The address is set to the host specified in the ingress.

Available labels:

__meta_kubernetes_namespace: namespace of the ingress
__meta_kubernetes_ingress_name: name of the ingress
__meta_kubernetes_ingress_label_<labelname>: a label of the ingress
__meta_kubernetes_ingress_labelpresent_<labelname>: true if the label exists
__meta_kubernetes_ingress_annotation_<annotationname>: an annotation of the ingress
__meta_kubernetes_ingress_annotationpresent_<annotationname>: true if the annotation exists
__meta_kubernetes_ingress_scheme: scheme of the ingress, https if TLS is configured, default http
__meta_kubernetes_ingress_path: path specified in the ingress, default /

Example

Discover and monitor the metrics of all pods behind all Services in the prometheus namespace:

...
- job_name: prometheus-monitor
  honor_timestamps: true
  scrape_interval: 1m
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  kubernetes_sd_configs:
  - role: endpoints
    namespaces:
      names:
      - prometheus
  relabel_configs:
  - source_labels: [__meta_kubernetes_service_name]
    separator: ;
    regex: prometheus-headless
    replacement: $1
    action: keep
  - source_labels: [__meta_kubernetes_pod_container_name]
    separator: ;
    regex: prometheus
    replacement: $1
    action: keep
  - source_labels: [__meta_kubernetes_namespace]
    separator: ;
    regex: (.*)
    target_label: namespace
    replacement: $1
    action: replace

Discovery flow: find all Services in the prometheus namespace (a Service is registered in DNS and exposes its ports, so the port does not need to be specified); since the Service carries an endpoints list, all pod+port combinations can be found and, together with metrics_path, assembled into http://pod:port/metrics, so the metrics of all pods are monitored.

role: endpoints means that pods are found through their Service.

5.2 apiserver service discovery

The apiserver is the most central Kubernetes component, so monitoring it is essential; its metrics can be obtained directly through the kubernetes service.

5.2.1 Create RBAC rules

apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
apiVersion: v1
kind: Secret
type: kubernetes.io/service-account-token
metadata:
  name: monitoring-token
  namespace: monitoring
  annotations:
    kubernetes.io/service-account.name: "prometheus"
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  - services
  - endpoints
  - pods
  - nodes/proxy
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - "extensions"
  resources:
  - ingresses
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  - nodes/metrics
  verbs:
  - get
- nonResourceURLs:
  - /metrics
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitoring

Check:

[root@k8s-deploy ~]#kubectl get svc
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kubernetes ClusterIP 10.100.0.1 <none> 443/TCP 38d
[root@k8s-deploy ~]#kubectl get ep
NAME ENDPOINTS AGE
kubernetes 10.0.0.11:6443,10.0.0.12:6443,10.0.0.13:6443 38d
# Check the service account
[root@k8s-deploy ~]#kubectl get sa -n monitoring
NAME SECRETS AGE
default 0 2d2h
prometheus 0 4m4s
# Check the secret
[root@k8s-deploy ~]#kubectl get secrets -n monitoring
NAME TYPE DATA AGE
monitoring-token kubernetes.io/service-account-token 3 4m53s

5.2.2 Prepare the files

  • token
# Extract the token
kubectl describe secrets monitoring-token -n monitoring|grep "token:"|awk '{print $2}' > k8s.token
# Copy the file to the Prometheus server; create the directory there first with: mkdir -p /apps/certs
scp k8s.token 10.0.0.61:/apps/certs/
  • TLS certificate
# Copy the k8s ca.pem (or ca.crt) file to the Prometheus server
[root@prometheus-server1 ~]#scp 10.0.0.11:/etc/kubernetes/ssl/ca.pem /apps/certs
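Before wiring the token into Prometheus, it can be checked against the apiserver from the Prometheus server; a sketch, assuming the master VIP https://10.0.0.10:6443 used later in this section (add -k if the CA file does not cover the VIP):

# A 200 response with version info means token and CA file are usable
curl -s --cacert /apps/certs/ca.pem \
  -H "Authorization: Bearer $(cat /apps/certs/k8s.token)" \
  https://10.0.0.10:6443/version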

5.2.3 Write the configuration

  • Prometheus server deployed inside the k8s cluster
- job_name: "kubernetes-apiserver"
  scheme: https
  kubernetes_sd_configs:
  - role: endpoints
  tls_config:                   # scraping over HTTPS requires the TLS certificate
    ca_file: /apps/certs/ca.pem
  bearer_token_file: /apps/certs/k8s.token
  relabel_configs:
  - source_labels: [__meta_kubernetes_namespace,__meta_kubernetes_service_name,__meta_kubernetes_endpoint_port_name]
    regex: default;kubernetes;https
    action: keep
  • Prometheus server deployed outside the k8s cluster
- job_name: 'kubernetes-apiservers-monitor'
  kubernetes_sd_configs:
  - role: endpoints
    api_server: https://10.0.0.10:6443   # k8s master VIP
    tls_config:
      ca_file: /apps/certs/ca.pem
    bearer_token_file: /apps/certs/k8s.token
  scheme: https
  tls_config:
    ca_file: /apps/certs/ca.pem
  bearer_token_file: /apps/certs/k8s.token
  relabel_configs:
  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
    action: keep
    regex: default;kubernetes;https

Matching notes:

The relabel rule matches targets whose namespace is default, whose service name is kubernetes and whose port name is https; matching targets are kept. The source_labels act as the keys and the regex provides the corresponding values.

5.2.4 Verify apiserver service discovery

Check the apiserver endpoints:

[root@k8s-deploy ~]#kubectl get ep
NAME ENDPOINTS AGE
kubernetes 10.0.0.11:6443,10.0.0.12:6443,10.0.0.13:6443 38d

5.2.5 apiserver metrics

The apiserver is the entry point of the k8s cluster; every request goes through it, so monitoring apiserver metrics can be used to judge the health of the cluster.

apiserver_request_total

Query the number of apiserver requests per verb over the last 10 minutes:

sum(rate(apiserver_request_total[10m])) by (resource,subresource,verb)
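Alongside the request rate, the share of server-side errors is often watched as well; a sketch (the exact code label values depend on the apiserver version):

# Fraction of apiserver requests answered with a 5xx code over the last 10 minutes
sum(rate(apiserver_request_total{code=~"5.."}[10m])) / sum(rate(apiserver_request_total[10m]))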

Label replacement

- job_name: 'kubernetes-service-endpoints'   # job name
  kubernetes_sd_configs:
  - role: endpoints                          # endpoints discovery
    api_server: https://10.0.0.10:6443
    tls_config:
      ca_file: /apps/certs/ca.pem
    bearer_token_file: /apps/certs/k8s.token
  scheme: https
  tls_config:
    ca_file: /apps/certs/ca.pem
  bearer_token_file: /apps/certs/k8s.token
  relabel_configs:                           # relabeling configuration
  # keep matching targets, then continue
  - source_labels: [__meta_kubernetes_namespace,__meta_kubernetes_service_name,__meta_kubernetes_endpoint_port_name]
    action: keep
    regex: default;kubernetes;https
  # rename __meta_kubernetes_namespace to kubernetes_namespace
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace

Before the replacement

After the replacement

The __meta_kubernetes_namespace label has been replaced by kubernetes_namespace.

annotation_prometheus_io_scrape

In k8s, with this Prometheus discovery rule the target to be discovered must carry an annotation matching annotation_prometheus_io_scrape=true; only targets where this annotation matches are kept for monitoring, after which the data is scraped and labels are rewritten, for example the annotation_prometheus_io_scheme label to http or https:

- job_name: 'kubernetes-test'                # job name
  kubernetes_sd_configs:
  - role: endpoints                          # endpoints discovery
    api_server: https://10.0.0.10:6443
    tls_config:
      ca_file: /apps/certs/ca.pem
    bearer_token_file: /apps/certs/k8s.token
  scheme: https
  tls_config:
    ca_file: /apps/certs/ca.pem
  bearer_token_file: /apps/certs/k8s.token
  relabel_configs:                           # relabeling configuration
  # keep targets whose annotation_prometheus_io_scrape value is true, then continue
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
    action: keep
    regex: true
  # write annotation_prometheus_io_scheme into __scheme__
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
    action: replace
    target_label: __scheme__
    regex: (https?)              # matches http or https; other values are not replaced
  - source_labels: [__scheme__]
    action: replace
    target_label: __scheme__
    regex: https
    replacement: http
  # write annotation_prometheus_io_path into __metrics_path__
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)                  # a path of one or more characters (. matches any single character except \n, + means one or more)
  # address discovery, i.e. label rewriting of __address__
  - source_labels: [__address__,__meta_kubernetes_service_annotation_prometheus_io_port]
    action: replace
    target_label: __address__
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2           # format: address:port
  # map the labels matched by regex and apply them
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)   # match the label names with a regex
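For a Service to be picked up by this job it needs the corresponding annotations, which surface as the __meta_kubernetes_service_annotation_prometheus_io_* labels used above. A minimal sketch with a hypothetical example Service:

apiVersion: v1
kind: Service
metadata:
  name: example-app              # hypothetical application
  namespace: default
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/scheme: "http"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "8080"
spec:
  selector:
    app: example-app
  ports:
  - name: http
    port: 8080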

5.3 CoreDNS service discovery

5.3.1 Write the configuration

- job_name: 'kubernetes-service-endpoints'
  kubernetes_sd_configs:
  - role: endpoints
    api_server: https://10.0.0.10:6443   # k8s master VIP
    tls_config:
      ca_file: /apps/certs/ca.pem
    bearer_token_file: /apps/certs/k8s.token
  scheme: https
  tls_config:
    ca_file: /apps/certs/ca.pem
  bearer_token_file: /apps/certs/k8s.token
  relabel_configs:
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
    action: keep
    regex: true
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
    action: replace
    target_label: __scheme__
    regex: (https?)
  - source_labels: [__scheme__]
    action: replace
    target_label: __scheme__
    regex: https
    replacement: http
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
    action: replace
    target_label: __address__
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    action: replace
    target_label: kubernetes_service_name

5.3.2 Check the CoreDNS service status

[root@k8s-deploy ~]#kubectl describe svc kube-dns -n kube-system
Name: kube-dns
Namespace: kube-system
Labels: addonmanager.kubernetes.io/mode=Reconcile
k8s-app=kube-dns
kubernetes.io/cluster-service=true
kubernetes.io/name=CoreDNS
Annotations: prometheus.io/port: 9153 # annotation used by Prometheus to discover the metrics port
prometheus.io/scrape: true # annotation used by Prometheus to decide whether to scrape this target
Selector: k8s-app=kube-dns
Type: ClusterIP
IP Family Policy: SingleStack
IP Families: IPv4
IP: 10.100.0.2
IPs: 10.100.0.2
Port: dns 53/UDP
TargetPort: 53/UDP
Endpoints: 10.200.107.218:53,10.200.169.133:53,10.200.36.97:53
Port: dns-tcp 53/TCP
TargetPort: 53/TCP
Endpoints: 10.200.107.218:53,10.200.169.133:53,10.200.36.97:53
Port: metrics 9153/TCP
TargetPort: 9153/TCP
Endpoints: 10.200.107.218:9153,10.200.169.133:9153,10.200.36.97:9153
Session Affinity: None
Events: <none>

5.3.3 Verify service discovery

Change the replica count of the coredns Deployment so that the endpoints change, and verify that the newly added pod is discovered automatically.

[root@k8s-master3 ~]#kubectl get deploy -n kube-system
NAME READY UP-TO-DATE AVAILABLE AGE
calico-kube-controllers 1/1 1 1 39d
coredns 3/3 3 3 37d
[root@k8s-master3 ~]#kubectl scale deployment coredns --replicas=4 -n kube-system
deployment.apps/coredns scaled
[root@k8s-master3 ~]#kubectl get deploy -n kube-system
NAME READY UP-TO-DATE AVAILABLE AGE
calico-kube-controllers 1/1 1 1 39d
coredns 4/4 4 4 37d

The newly added pod is discovered automatically.

Note: since the Prometheus server here is deployed inside the k8s cluster, it can reach the ClusterIP directly; if Prometheus is deployed outside the cluster, the service type needs to be changed to NodePort.

5.3.4 Display the monitoring data in Grafana

Dashboard template ID: 14981
