Deploying Prometheus on Kubernetes with OpenEBS Local Persistent Storage
Environment
The environment used in this walkthrough:
Kubernetes 1.24.3
OpenEBS 3.3.0 (openebs.io/version: 3.3.0)
Install OpenEBS
kubectl apply -f https://openebs.github.io/charts/openebs-operator.yaml

[root@CentOS8 prometheus]# kubectl get sc
NAME               PROVISIONER        RECLAIMPOLICY   VOLUMEBINDINGMODE      ALLOWVOLUMEEXPANSION   AGE
openebs-device     openebs.io/local   Delete          WaitForFirstConsumer   false                  150m
openebs-hostpath   openebs.io/local   Delete          WaitForFirstConsumer   false                  150m

[root@CentOS8 prometheus]# kubectl get all -n openebs
NAME                                                READY   STATUS    RESTARTS   AGE
pod/openebs-localpv-provisioner-dd977fdd5-7h94d     1/1     Running   0          152m
pod/openebs-ndm-85v85                               1/1     Running   0          152m
pod/openebs-ndm-cluster-exporter-866c974856-sgksr   1/1     Running   0          152m
pod/openebs-ndm-node-exporter-hxp2q                 1/1     Running   0          152m
pod/openebs-ndm-operator-85886744bb-pbt9p           1/1     Running   0          152m

NAME                                           TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)    AGE
service/openebs-ndm-cluster-exporter-service   ClusterIP   None         <none>        9100/TCP   152m
service/openebs-ndm-node-exporter-service      ClusterIP   None         <none>        9101/TCP   152m

NAME                                       DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR   AGE
daemonset.apps/openebs-ndm                 1         1         1       1            1           <none>          152m
daemonset.apps/openebs-ndm-node-exporter   1         1         1       1            1           <none>          152m

NAME                                           READY   UP-TO-DATE   AVAILABLE   AGE
deployment.apps/openebs-localpv-provisioner    1/1     1            1           152m
deployment.apps/openebs-ndm-cluster-exporter   1/1     1            1           152m
deployment.apps/openebs-ndm-operator           1/1     1            1           152m

NAME                                                      DESIRED   CURRENT   READY   AGE
replicaset.apps/openebs-localpv-provisioner-dd977fdd5     1         1         1       152m
replicaset.apps/openebs-ndm-cluster-exporter-866c974856   1         1         1       152m
replicaset.apps/openebs-ndm-operator-85886744bb           1         1         1       152m

Create the PVC for persistent storage

[root@CentOS8 prometheus]# kubectl create ns prometheus
namespace/prometheus created

[root@CentOS8 prometheus]# cat prometheus-pvc.yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: prometheus-data-db   # PVC name; changing it is not recommended (the Deployment below references it)
  namespace: prometheus
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi                       # requested PVC size
  storageClassName: openebs-hostpath      # my StorageClass; adjust to your environment

# Create the PVC
[root@CentOS8 prometheus]# kubectl apply -f prometheus-pvc.yaml
persistentvolumeclaim/prometheus-data-db created

# Check the PVC status
[root@CentOS8 prometheus]# kubectl get pvc -n prometheus
NAME                 STATUS   VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS       AGE
prometheus-data-db   Bound    pvc-fad4f75b-b103-4a27-bd73-a5adb64f7308   50Gi       RWO            openebs-hostpath   142m
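If you want to see where openebs-hostpath actually stores the data on the node, the quick check below may help. This is a sketch of my own, not from the original steps: /var/openebs/local is the default BasePath of the bundled openebs-hostpath StorageClass and will differ if you customized it, and because the StorageClass uses WaitForFirstConsumer the PV only appears once a pod has consumed the claim.

# Find the PV bound to the claim and print its hostPath on the node
kubectl get pv $(kubectl get pvc prometheus-data-db -n prometheus -o jsonpath='{.spec.volumeName}') -o yaml | grep -A 2 'path:'
# On the node itself (default BasePath; adjust if customized)
ls -l /var/openebs/local/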
Install Prometheus
First, the ConfigMap needs to be created:
[root@CentOS8 prometheus]# cat prometheus.configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: prometheus
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 15s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
      - targets: ['localhost:9090']
    - job_name: 'kubernetes-node'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
    - job_name: kubernetes-nodes-cadvisor
      scrape_interval: 10s
      scrape_timeout: 10s
      scheme: https   # remove if you want to scrape metrics on the insecure port
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
      metric_relabel_configs:
      - action: replace
        source_labels: [id]
        regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
        target_label: rkt_container_name
        replacement: '${2}-${1}'
      - action: replace
        source_labels: [id]
        regex: '^/system\.slice/(.+)\.service$'
        target_label: systemd_service_name
        replacement: '${1}'
    - job_name: kube-state-metrics
      static_configs:
      - targets: ['kube-state-metrics.prometheus.svc.cluster.local:8080']
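Note that the kubernetes-service-endpoints job above only keeps targets whose Service carries the prometheus.io/scrape annotation, and optionally honors prometheus.io/scheme, prometheus.io/path, and prometheus.io/port. As an illustration (my-app and my-namespace are placeholders, not part of this article), any Service can opt in like this:

# Opt an arbitrary Service into the kubernetes-service-endpoints job
kubectl annotate service my-app -n my-namespace \
  prometheus.io/scrape=true \
  prometheus.io/port=8080 \
  prometheus.io/path=/metrics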
Next, set up the RBAC rules; without them the Prometheus pod has no permission to discover and scrape cluster resources (nodes, services, endpoints, pods) through the API server:
[root@CentOS8 prometheus]# cat prometheus-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  - services
  - endpoints
  - pods
  - nodes/proxy
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  - nodes/metrics
  verbs:
  - get
- nonResourceURLs:
  - /metrics
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: prometheus
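Before moving on, you can sanity-check the binding by impersonating the ServiceAccount with kubectl auth can-i; a quick verification of my own, not in the original article:

# Each of these should print "yes" if the ClusterRoleBinding took effect
kubectl auth can-i list pods --as=system:serviceaccount:prometheus:prometheus
kubectl auth can-i watch endpoints --as=system:serviceaccount:prometheus:prometheus
kubectl auth can-i get nodes --subresource=proxy --as=system:serviceaccount:prometheus:prometheus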
Create the Prometheus container, running it as a Deployment:
[root@CentOS8 prometheus]# cat prometheus.deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      nodeSelector:   # pin Prometheus to the master node (note: nodeSelector belongs in the pod template spec, not the Deployment spec)
        node-role.kubernetes.io/master: ""
      serviceAccountName: prometheus
      containers:
      # - image: prom/prometheus:latest   # official image
      - image: registry.cn-hangzhou.aliyuncs.com/urbancabin/prometheus:latest
        name: prometheus
        command:
        - "/bin/prometheus"
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/prometheus"
        - "--storage.tsdb.retention=30d"
        - "--web.enable-admin-api"   # enable the admin HTTP API, which includes features such as deleting time series
        - "--web.enable-lifecycle"   # enable hot reload: POST to localhost:9090/-/reload takes effect immediately
        ports:
        - containerPort: 9090
          protocol: TCP
          name: http
        volumeMounts:
        - mountPath: "/prometheus"
          subPath: prometheus
          name: data
        - mountPath: "/etc/prometheus"
          name: config-volume
        resources:
          requests:
            cpu: 100m
            memory: 512Mi
          limits:
            cpu: 100m
            memory: 512Mi
      securityContext:
        runAsUser: 0
      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: prometheus-data-db
      - configMap:
          name: prometheus-config
        name: config-volume
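Because --web.enable-lifecycle is set, configuration changes can be hot-reloaded instead of restarting the pod. A minimal sketch (promtool ships in the official prom/prometheus image; I assume the mirrored image keeps it):

# Validate the mounted config inside the running pod before reloading
kubectl exec -n prometheus deploy/prometheus -- promtool check config /etc/prometheus/prometheus.yml
# Trigger the hot reload (give the updated ConfigMap a minute to propagate into the pod first)
kubectl port-forward -n prometheus deploy/prometheus 9090:9090 &
curl -X POST http://localhost:9090/-/reload
kill %1   # stop the background port-forward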
Next, we create a Service for Prometheus. (I will split Grafana and Prometheus later, so I use NodePort here; with an Ingress you could use ClusterIP instead.)
[root@CentOS8 prometheus]# cat prometheus-svc.yaml
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    app: prometheus
spec:
  selector:
    app: prometheus
  type: NodePort
  ports:
  - name: web
    port: 9090
    targetPort: http
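Since the manifest does not pin a nodePort, Kubernetes assigns one from the 30000-32767 range. To look up what was assigned (a small helper, not from the original):

kubectl get svc prometheus -n prometheus -o jsonpath='{.spec.ports[0].nodePort}'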
Next, deploy node-exporter to the Kubernetes nodes as a DaemonSet:
[root@CentOS8 prometheus]# cat prometheus-node.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: prometheus
  labels:
    name: node-exporter
spec:
  selector:
    matchLabels:
      name: node-exporter
  template:
    metadata:
      labels:
        name: node-exporter
    spec:
      hostPID: true
      hostIPC: true
      hostNetwork: true
      containers:
      - name: node-exporter
        # image: prom/node-exporter:v0.16.0   # official image
        image: registry.cn-hangzhou.aliyuncs.com/urbancabin/node-exporter:v0.16.0
        ports:
        - containerPort: 9100
        resources:
          requests:
            cpu: 0.15
        securityContext:
          privileged: true
        args:
        - --path.procfs
        - /host/proc
        - --path.sysfs
        - /host/sys
        - --collector.filesystem.ignored-mount-points
        - '^/(sys|proc|dev|host|etc)($|/)'
        volumeMounts:
        - name: dev
          mountPath: /host/dev
        - name: proc
          mountPath: /host/proc
        - name: sys
          mountPath: /host/sys
        - name: rootfs
          mountPath: /rootfs
      tolerations:
      - key: "node-role.kubernetes.io/master"
        operator: "Exists"
        effect: "NoSchedule"
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: dev
        hostPath:
          path: /dev
      - name: sys
        hostPath:
          path: /sys
      - name: rootfs
        hostPath:
          path: /
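Because the DaemonSet uses hostNetwork and tolerates the master taint, node-exporter listens on port 9100 of every node, including the master. You can spot-check it directly from a node:

# Run on any node; should print the first lines of node-exporter's metrics
curl -s http://localhost:9100/metrics | head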
Once all the configuration files are in place, simply run kubectl apply -f . in the directory.
Check the pod status:
[root@CentOS8 prometheus]# kubectl get all -n prometheus
NAME                                      READY   STATUS    RESTARTS       AGE
pod/grafana-7556f7d7df-nzkmt              1/1     Running   0              153m
pod/kube-state-metrics-5f8785787d-cw8ln   1/1     Running   0              170m
pod/node-exporter-pln47                   1/1     Running   0              18h
pod/prometheus-584b95655f-h4krl           1/1     Running   9 (4m6s ago)   3h32m

NAME                         TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)             AGE
service/grafana              NodePort    10.108.237.208   <none>        3000:30820/TCP      162m
service/kube-state-metrics   ClusterIP   10.96.242.162    <none>        8080/TCP,8081/TCP   170m
service/prometheus           NodePort    10.96.81.46      <none>        9090:32440/TCP      18h

NAME                           DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR   AGE
daemonset.apps/node-exporter   1         1         1       1            1           <none>          18h

NAME                                 READY   UP-TO-DATE   AVAILABLE   AGE
deployment.apps/grafana              1/1     1            1           162m
deployment.apps/kube-state-metrics   1/1     1            1           170m
deployment.apps/prometheus           1/1     1            1           18h

NAME                                            DESIRED   CURRENT   READY   AGE
replicaset.apps/grafana-7556f7d7df              1         1         1       162m
replicaset.apps/kube-state-metrics-5f8785787d   1         1         1       170m
replicaset.apps/prometheus-584b95655f           1         1         1       18h
Access Prometheus to verify it works.

kube-state-metrics
About kube-state-metrics
kube-state-metrics listens to the API server and generates state metrics for resource objects such as Deployments, Nodes, and Pods. Note that kube-state-metrics only exposes these metrics; it does not store them, so we use Prometheus to scrape and store the data. Its focus is business-level metadata about Deployments, Pods, replica status, and so on: How many replicas were scheduled? How many are currently available? How many Pods are running/stopped/terminated? How many times has a Pod restarted? How many jobs are running?
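Once everything is wired up, those questions map directly onto kube-state-metrics series. The sketch below issues standard kube-state-metrics queries through the Prometheus HTTP API; localhost:32440 is the Prometheus NodePort from my environment, so substitute your own:

# How many replicas of each Deployment are currently available?
curl -s 'http://localhost:32440/api/v1/query?query=kube_deployment_status_replicas_available'
# How many Pods are in each phase (Running/Pending/Succeeded/Failed)?
curl -s 'http://localhost:32440/api/v1/query?query=sum(kube_pod_status_phase)%20by%20(phase)'
# How many times has each container restarted?
curl -s 'http://localhost:32440/api/v1/query?query=kube_pod_container_status_restarts_total'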
The YAML file is as follows.
The namespace used locally is prometheus.
[root@CentOS8 prometheus]# cat kube-metrics-deployment.yaml
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: 2.4.2
  name: kube-state-metrics
  namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: 2.4.2
  name: kube-state-metrics
rules:
- apiGroups:
  - ""
  resources:
  - configmaps
  - secrets
  - nodes
  - pods
  - services
  - resourcequotas
  - replicationcontrollers
  - limitranges
  - persistentvolumeclaims
  - persistentvolumes
  - namespaces
  - endpoints
  verbs:
  - list
  - watch
- apiGroups:
  - apps
  resources:
  - statefulsets
  - daemonsets
  - deployments
  - replicasets
  verbs:
  - list
  - watch
- apiGroups:
  - batch
  resources:
  - cronjobs
  - jobs
  verbs:
  - list
  - watch
- apiGroups:
  - autoscaling
  resources:
  - horizontalpodautoscalers
  verbs:
  - list
  - watch
- apiGroups:
  - authentication.k8s.io
  resources:
  - tokenreviews
  verbs:
  - create
- apiGroups:
  - authorization.k8s.io
  resources:
  - subjectaccessreviews
  verbs:
  - create
- apiGroups:
  - policy
  resources:
  - poddisruptionbudgets
  verbs:
  - list
  - watch
- apiGroups:
  - certificates.k8s.io
  resources:
  - certificatesigningrequests
  verbs:
  - list
  - watch
- apiGroups:
  - storage.k8s.io
  resources:
  - storageclasses
  - volumeattachments
  verbs:
  - list
  - watch
- apiGroups:
  - admissionregistration.k8s.io
  resources:
  - mutatingwebhookconfigurations
  - validatingwebhookconfigurations
  verbs:
  - list
  - watch
- apiGroups:
  - networking.k8s.io
  resources:
  - networkpolicies
  - ingresses
  verbs:
  - list
  - watch
- apiGroups:
  - coordination.k8s.io
  resources:
  - leases
  verbs:
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: 2.4.2
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: prometheus
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: "true"   # make this Service scrapable; without this annotation the kubernetes-service-endpoints job will not pick up these metrics (the key must be prometheus.io/scrape to match the relabel rule)
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: 2.4.2
  name: kube-state-metrics
  namespace: prometheus
spec:
  # clusterIP: None   # left commented out so the Service is reachable through a ClusterIP
  ports:
  - name: http-metrics
    port: 8080
    targetPort: http-metrics
  - name: telemetry
    port: 8081
    targetPort: telemetry
  selector:
    app.kubernetes.io/name: kube-state-metrics
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: 2.4.2
  name: kube-state-metrics
  namespace: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
  template:
    metadata:
      labels:
        app.kubernetes.io/component: exporter
        app.kubernetes.io/name: kube-state-metrics
        app.kubernetes.io/version: 2.4.2
    spec:
      nodeName: centos8   # pin the pod to the master node (centos8 in my environment)
      tolerations:        # tolerate the master-node taint
      - key: "node-role.kubernetes.io/master"
        operator: "Exists"
        effect: "NoSchedule"
      automountServiceAccountToken: true
      containers:
      # - image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.2
      - image: anjia0532/google-containers.kube-state-metrics.kube-state-metrics:v2.4.2
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
        name: kube-state-metrics
        ports:
        - containerPort: 8080
          name: http-metrics
        - containerPort: 8081
          name: telemetry
        readinessProbe:
          httpGet:
            path: /
            port: 8081
          initialDelaySeconds: 5
          timeoutSeconds: 5
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
          readOnlyRootFilesystem: true
          runAsUser: 65534
      serviceAccountName: kube-state-metrics
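To confirm kube-state-metrics itself is serving data before Prometheus scrapes it, a quick port-forward works; this check is my addition, not from the original steps:

kubectl port-forward -n prometheus svc/kube-state-metrics 8080:8080 &
# Spot-check one of the standard series
curl -s http://localhost:8080/metrics | grep '^kube_pod_status_phase' | head
kill %1   # stop the background port-forward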
[root@CentOS8 prometheus]# kubectl get pod -n prometheus
NAME                                  READY   STATUS    RESTARTS        AGE
grafana-7556f7d7df-nzkmt              1/1     Running   0               156m
kube-state-metrics-5f8785787d-cw8ln   1/1     Running   0               173m
node-exporter-pln47                   1/1     Running   0               18h
prometheus-584b95655f-h4krl           1/1     Running   9 (7m14s ago)   3h35m
Check that the service is returning metrics data:
[root@CentOS8 prometheus]# curl http://xxx:32440/metrics
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 3.5208e-05
go_gc_duration_seconds{quantile="0.25"} 0.000213094
go_gc_duration_seconds{quantile="0.5"} 0.094531627
go_gc_duration_seconds{quantile="0.75"} 0.105158749
go_gc_duration_seconds{quantile="1"} 0.303282049
go_gc_duration_seconds_sum 1.715240602
go_gc_duration_seconds_count 23
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 93
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.17.5"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
......
Grafana
Next, install Grafana. This again follows the earlier article, except that Grafana is now backed by persistent storage.
[root@CentOS8 prometheus]# cat grafana-pvc.yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: grafana-data
  namespace: prometheus
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
  storageClassName: openebs-hostpath
After creating the PVC, we create the Deployment and Service.
[root@CentOS8 prometheus]# cat grafana-svc.yaml
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: prometheus
  labels:
    app: grafana
spec:
  type: NodePort
  ports:
  - port: 3000
  selector:
    app: grafana
# Deployment
[root@CentOS8 prometheus]# cat grafana-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: prometheus
  labels:
    app: grafana
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
      - name: grafana
        # image: grafana/grafana   # official image
        image: registry.cn-hangzhou.aliyuncs.com/urbancabin/grafana:latest
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 3000
          name: grafana
        env:
        - name: GF_SECURITY_ADMIN_USER
          value: admin
        - name: GF_SECURITY_ADMIN_PASSWORD
          value: abcdocker
        readinessProbe:
          failureThreshold: 10
          httpGet:
            path: /api/health
            port: 3000
            scheme: HTTP
          initialDelaySeconds: 60
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 30
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /api/health
            port: 3000
            scheme: HTTP
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          limits:
            cpu: 300m
            memory: 1024Mi
          requests:
            cpu: 300m
            memory: 1024Mi
        volumeMounts:
        - mountPath: /var/lib/grafana
          subPath: grafana
          name: storage
      securityContext:
        fsGroup: 472
        runAsUser: 472
      volumes:
      - name: storage
        persistentVolumeClaim:
          claimName: grafana-data
Apply all the Grafana YAML files:
[root@CentOS8 grafana]# kubectl apply -f .
Check Grafana:
[root@CentOS8 prometheus]# kubectl get all -n prometheus | grep grafana
pod/grafana-7556f7d7df-nzkmt             1/1        Running          0         163m
service/grafana                          NodePort   10.108.237.208   <none>    3000:30820/TCP   172m
deployment.apps/grafana                  1/1        1                1         172m
replicaset.apps/grafana-7556f7d7df       1          1                1         172m
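Grafana exposes a health endpoint at /api/health, the same path the probes in the Deployment use; checking it from outside is a quick way to confirm the NodePort works (30820 is the port from my environment, substitute your own):

curl -s http://localhost:30820/api/health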
Configure Grafana
Only a few key parts of the Grafana configuration are covered here.
Set the Prometheus data source:
http://prometheus.prometheus.svc.cluster.local:9090
The middle prometheus in this URL is the namespace: the address follows the pattern http://<service>.<namespace>.svc.cluster.local:<port>, so change it if your namespace differs. If this is unfamiliar, review how Service DNS works.
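To verify that the data source URL resolves inside the cluster, you can run a throwaway DNS lookup pod; a sketch of my own (busybox:1.28 is chosen because nslookup is known to work reliably in that tag):

kubectl run dns-test -n prometheus --rm -it --restart=Never --image=busybox:1.28 -- \
  nslookup prometheus.prometheus.svc.cluster.local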

Recommended Kubernetes monitoring dashboards
https://grafana.com/grafana/dashboards/15661
https://grafana.com/grafana/dashboards/6417
https://grafana.com/grafana/dashboards/16098
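If the Grafana UI cannot reach grafana.com, the dashboard JSON can be fetched up front and imported manually; this relies on grafana.com's public download endpoint, which I assume is still available in this form:

# Download dashboard 15661 for offline import (repeat for 6417 and 16098)
curl -sL https://grafana.com/api/dashboards/15661/revisions/latest/download -o dashboard-15661.json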


