k8s Monitoring: Deploying and Installing Prometheus
Deploy the monitoring pod (node-exporter). Manifest: https://github.com/ruidongchenxi/k8s-ack/blob/main/node-export.yaml
[root@k8s-master cka]# cat node-export.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitor-sa
  labels:
    name: node-exporter
spec:
  selector:
    matchLabels:
      name: node-exporter
  template:
    metadata:
      labels:
        name: node-exporter
    spec:
      hostPID: true
      hostIPC: true
      hostNetwork: true
      containers:
      - name: node-exporter
        image: prom/node-exporter:v0.16.0
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 9100
        resources:
          requests:
            cpu: 0.15
        securityContext:
          privileged: true
        args:
        - --path.procfs
        - /host/proc
        - --path.sysfs
        - /host/sys
        - --collector.filesystem.ignored-mount-points
        - '^/(sys|proc|dev|host|etc)($|/)'
        volumeMounts:
        - name: dev
          mountPath: /host/dev
        - name: proc
          mountPath: /host/proc
        - name: sys
          mountPath: /host/sys
        - name: rootfs
          mountPath: /rootfs
      tolerations:
      - key: "node-role.kubernetes.io/master"
        operator: "Exists"
        effect: "NoSchedule"
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: dev
        hostPath:
          path: /dev
      - name: sys
        hostPath:
          path: /sys
      - name: rootfs
        hostPath:
          path: /
[root@k8s-master cka]# kubectl create ns monitor-sa
namespace/monitor-sa created
[root@k8s-master cka]# kubectl apply -f node-export.yaml
daemonset.apps/node-exporter created
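Because the DaemonSet runs with hostNetwork: true and tolerates the master taint, one exporter pod should land on every node and listen on port 9100 of the node itself. A quick sanity check (a minimal sketch; pod names and node count will differ in your cluster):

kubectl get daemonset -n monitor-sa
kubectl get pods -n monitor-sa -o wide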
View the CPU metrics
[root@k8s-master cka]# curl http://192.168.10.50:9100/metrics | grep cpu
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 63228  100 63228    0     0  5437k      0 --:--:-- --:--:-- --:--:-- 5613k
# HELP go_memstats_gc_cpu_fraction The fraction of this program's available CPU time used by the GC since the program started.
# TYPE go_memstats_gc_cpu_fraction gauge
go_memstats_gc_cpu_fraction 4.159327529117454e-06
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="0",mode="user"} 0
node_cpu_guest_seconds_total{cpu="1",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="1",mode="user"} 0
node_cpu_guest_seconds_total{cpu="2",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="2",mode="user"} 0
node_cpu_guest_seconds_total{cpu="3",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="3",mode="user"} 0
# HELP node_cpu_seconds_total Seconds the cpus spent in each mode.
# TYPE node_cpu_seconds_total counter
node_cpu_seconds_total{cpu="0",mode="idle"} 82923.84
node_cpu_seconds_total{cpu="0",mode="iowait"} 46.33
node_cpu_seconds_total{cpu="0",mode="irq"} 0
node_cpu_seconds_total{cpu="0",mode="nice"} 0.04
node_cpu_seconds_total{cpu="0",mode="softirq"} 7.79
node_cpu_seconds_total{cpu="0",mode="steal"} 0
node_cpu_seconds_total{cpu="0",mode="system"} 51.35
node_cpu_seconds_total{cpu="0",mode="user"} 58.6
node_cpu_seconds_total{cpu="1",mode="idle"} 82673.51
node_cpu_seconds_total{cpu="1",mode="iowait"} 1.07
node_cpu_seconds_total{cpu="1",mode="irq"} 0
node_cpu_seconds_total{cpu="1",mode="nice"} 0.04
node_cpu_seconds_total{cpu="1",mode="softirq"} 7.89
node_cpu_seconds_total{cpu="1",mode="steal"} 0
node_cpu_seconds_total{cpu="1",mode="system"} 70.05
node_cpu_seconds_total{cpu="1",mode="user"} 80.02
node_cpu_seconds_total{cpu="2",mode="idle"} 82655.69
node_cpu_seconds_total{cpu="2",mode="iowait"} 1.05
node_cpu_seconds_total{cpu="2",mode="irq"} 0
node_cpu_seconds_total{cpu="2",mode="nice"} 0.11
node_cpu_seconds_total{cpu="2",mode="softirq"} 7.9
node_cpu_seconds_total{cpu="2",mode="steal"} 0
node_cpu_seconds_total{cpu="2",mode="system"} 72.63
node_cpu_seconds_total{cpu="2",mode="user"} 87.56
node_cpu_seconds_total{cpu="3",mode="idle"} 82495.3
node_cpu_seconds_total{cpu="3",mode="iowait"} 0.94
node_cpu_seconds_total{cpu="3",mode="irq"} 0
node_cpu_seconds_total{cpu="3",mode="nice"} 0.04
node_cpu_seconds_total{cpu="3",mode="softirq"} 11.34
node_cpu_seconds_total{cpu="3",mode="steal"} 0
node_cpu_seconds_total{cpu="3",mode="system"} 77.3
node_cpu_seconds_total{cpu="3",mode="user"} 80.99
# HELP node_memory_Percpu_bytes Memory information field Percpu_bytes.
# TYPE node_memory_Percpu_bytes gauge
node_memory_Percpu_bytes 3.407872e+07
node_scrape_collector_duration_seconds{collector="cpu"} 0.000650834
node_scrape_collector_success{collector="cpu"} 1
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 0.22
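The progress meter at the top of that output comes from curl itself, not from node-exporter; passing -s suppresses it, and anchoring the grep avoids matching unrelated HELP/TYPE lines. A slightly cleaner variant of the same check (a sketch using the lab node IP from this post):

curl -s http://192.168.10.50:9100/metrics | grep '^node_cpu_seconds_total'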
Create the ServiceAccount the Prometheus pod will run as
[root@k8s-master cka]# kubectl create serviceaccount monitor -n monitor-sa
serviceaccount/monitor created
[root@k8s-master cka]# kubectl create clusterrolebinding monitor-clusterrolebinding -n monitor-sa --clusterrole=cluster-admin --serviceaccount=monitor-sa:monitor
clusterrolebinding.rbac.authorization.k8s.io/monitor-clusterrolebinding created
[root@k8s-master cka]# kubectl create clusterrolebinding monitor-clusterrolebinding-1 --clusterrole=cluster-admin --user=system:serviceaccount:monitor:monitor-sa -n monitor-sa
clusterrolebinding.rbac.authorization.k8s.io/monitor-clusterrolebinding-1 created
[root@k8s-master cka]# kubectl get clusterrolebinding | grep clusterrolebinding-1
monitor-clusterrolebinding-1   ClusterRole/cluster-admin   17s
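Two notes on the bindings above: cluster-admin is far broader than Prometheus needs, and the --user form expects system:serviceaccount:<namespace>:<name>, so the second binding references a subject that does not exist (namespace and name are swapped); the first binding, created with --serviceaccount=monitor-sa:monitor, is the one doing the work. For anything beyond a lab, a narrower ClusterRole covering only what the kubernetes_sd scrape jobs use is the usual approach; a sketch with a hypothetical role name, not part of the original setup:

kubectl apply -f - <<'EOF'
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus-scrape        # hypothetical name, used only for this example
rules:
- apiGroups: [""]
  resources: ["nodes", "nodes/proxy", "nodes/metrics", "services", "endpoints", "pods"]
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
EOF
kubectl create clusterrolebinding prometheus-scrape --clusterrole=prometheus-scrape --serviceaccount=monitor-sa:monitor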
Create the data directory on the worker nodes
[root@k8s-node1 ~]# mkdir /data
[root@k8s-node1 ~]# chmod 777 /data
[root@k8s-node2 ~]# mkdir /data && chmod 777 /data
Create the Prometheus configuration (ConfigMap)
---
kind: ConfigMap
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus-config
  namespace: monitor-sa
data:
  prometheus.yml: |            # the Prometheus configuration file itself
    global:                    # global settings
      scrape_interval: 15s     # how often targets are scraped
      scrape_timeout: 10s      # scrape timeout
      evaluation_interval: 1m  # how often rules are evaluated
    scrape_configs:
    - job_name: 'kubernetes-node'   # scrape job: node-exporter discovered via the node role
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
    - job_name: 'kubernetes-node-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-apiserver'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
[root@k8s-master cka]# kubectl apply -f prometheus-cfg.yaml
configmap/prometheus-config created
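The ConfigMap only stores text, so a YAML or relabel mistake will not surface until the Prometheus pod starts. The promtool binary shipped in the prom/prometheus image can validate the rendered file up front; a sketch, assuming Docker is available on the host and using /tmp/prometheus.yml as a scratch path (on a containerd-only node the same idea works with any local promtool binary):

kubectl get configmap prometheus-config -n monitor-sa -o jsonpath='{.data.prometheus\.yml}' > /tmp/prometheus.yml
docker run --rm -v /tmp/prometheus.yml:/etc/prometheus/prometheus.yml \
  --entrypoint promtool prom/prometheus:v2.2.1 check config /etc/prometheus/prometheus.yml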
Deploy Prometheus via a Deployment
[root@k8s-master cka]# cat prometheus-deploy.yaml
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-server
  namespace: monitor-sa
  labels:
    app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
      component: server
    #matchExpressions:
    #- {key: app, operator: In, values: [prometheus]}
    #- {key: component, operator: In, values: [server]}
  template:
    metadata:
      labels:
        app: prometheus
        component: server
      annotations:
        prometheus.io/scrape: 'false'
    spec:
      nodeName: k8s-node1        # pin the pod to this node
      serviceAccountName: monitor
      containers:
      - name: prometheus
        image: prom/prometheus:v2.2.1
        imagePullPolicy: IfNotPresent
        command:
        - prometheus
        - --config.file=/etc/prometheus/prometheus.yml
        - --storage.tsdb.path=/prometheus
        - --storage.tsdb.retention=720h
        - --web.enable-lifecycle
        ports:
        - containerPort: 9090
          protocol: TCP
        volumeMounts:
        - mountPath: /etc/prometheus
          name: prometheus-config
        - mountPath: /prometheus/
          name: prometheus-storage-volume
      volumes:
      - name: prometheus-config
        configMap:
          name: prometheus-config
      - name: prometheus-storage-volume
        hostPath:
          path: /data
          type: Directory
[root@k8s-master cka]# kubectl apply -f prometheus-deploy.yaml
deployment.apps/prometheus-server created
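The pod is pinned to k8s-node1 with nodeName, and the hostPath volume uses type: Directory, so /data must already exist on that node (which is why the directory was created above). Before exposing the server it is worth confirming the rollout finished and that the mounted config was loaded; a minimal check, exact log lines will vary:

kubectl rollout status deployment/prometheus-server -n monitor-sa
kubectl logs -n monitor-sa deploy/prometheus-server --tail=20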
Deploy the Service
[root@k8s-master cka]# cat prometheus-svc.yaml
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitor-sa
  labels:
    app: prometheus
spec:
  type: NodePort
  ports:
  - port: 9090
    targetPort: 9090
    protocol: TCP
  selector:
    app: prometheus
    component: server
[root@k8s-master cka]# kubectl apply -f prometheus-svc.yaml
service/prometheus created
Check the pods and the Service
[root@k8s-master cka]# kubectl get svc -n monitor-sa
NAME         TYPE       CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE
prometheus   NodePort   10.110.21.221   <none>        9090:30194/TCP   94s
[root@k8s-master cka]# kubectl get pod -n monitor-sa
NAME                                 READY   STATUS    RESTARTS   AGE
node-exporter-8q9vb                  1/1     Running   0          86m
node-exporter-fv8n8                  1/1     Running   0          86m
node-exporter-xjzdc                  1/1     Running   0          86m
prometheus-server-5b5bb44bb5-7xcn7   1/1     Running   0          7m54s
Open http://192.168.10.50:30194/targets in a browser to check the scrape targets.
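Targets that report UP on the /targets page can be queried right away, either in the web UI or through the HTTP API. As an example, per-node CPU utilisation derived from the node_cpu_seconds_total counter seen earlier (a sketch; 30194 is the NodePort assigned above and will differ per cluster):

curl -s -G http://192.168.10.50:30194/api/v1/query \
  --data-urlencode 'query=100 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100'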
Adding the following annotations to a Service makes Prometheus discover and scrape it (via the kubernetes-service-endpoints job above):
apiVersion: v1
kind: Service
metadata:
  annotations:                     # add the two annotations below
    prometheus.io/port: "9153"
    prometheus.io/scrape: "true"
  creationTimestamp: "2023-10-16T19:47:18Z"
  labels:
    k8s-app: kube-dns
    kubernetes.io/cluster-service: "true"
    kubernetes.io/name: CoreDNS
  name: kube-dns
  namespace: kube-system
  resourceVersion: "236"
  uid: 7162ceef-a1a2-4da8-a4da-d387e619170d
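The same two annotations can also be added non-interactively instead of through kubectl edit; a one-line alternative (CoreDNS already exposes its metrics on port 9153):

kubectl annotate svc kube-dns -n kube-system prometheus.io/scrape="true" prometheus.io/port="9153" --overwrite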
Hot-reloading the configuration
kube-dns   ClusterIP   10.96.0.10   <none>   53/UDP,53/TCP,9153/TCP   23h
[root@k8s-master cka]# kubectl edit svc -n kube-system kube-dns
# Please edit the object below. Lines beginning with a '#' will be ignored,
# and an empty file will abort the edit. If an error occurs while saving this file will be
# reopened with the relevant failures.
#
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/port: "9153"
    prometheus.io/scrape: "true"
  creationTimestamp: "2023-10-16T19:47:18Z"
  labels:
    ...
service/kube-dns edited
Reload the configuration
[root@k8s-master cka]# kubectl get pod -n monitor-sa -owide
NAME                                 READY   STATUS    RESTARTS   AGE   IP              NODE         NOMINATED NODE   READINESS GATES
node-exporter-8q9vb                  1/1     Running   0          22h   192.168.10.51   k8s-node1    <none>           <none>
node-exporter-fv8n8                  1/1     Running   0          22h   192.168.10.50   k8s-master   <none>           <none>
node-exporter-xjzdc                  1/1     Running   0          22h   192.168.10.52   k8s-node2    <none>           <none>
prometheus-server-5b5bb44bb5-7xcn7   1/1     Running   0          21h   10.244.36.65    k8s-node1    <none>           <none>
[root@k8s-master cka]# curl -X POST http://10.244.36.65:9090/-/reload
[root@k8s-master cka]# kubectl edit svc -n kube-system kube-dns
service/kube-dns edited
[root@k8s-master cka]# curl -X POST http://10.244.36.65:9090/-/reload
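The /-/reload endpoint only works because the Deployment starts Prometheus with --web.enable-lifecycle; without that flag the POST is rejected and the pod would have to be restarted instead. If the pod IP is not reachable from where you are working, the same endpoint is also available through the NodePort (a sketch using the lab address above):

curl -X POST http://192.168.10.50:30194/-/reload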
Even grass can grow out of a crack in the stone; so can you.