k8s prometheus平台部署相关组件

k8s-prometheus平台部署相关组件

1. K8s-prometheus平台部署相关组件

  • prometheus-deployment.yaml #部署Prometheus

  • **prometheus-configmap.yaml** #Prometheus配置文件,主要配置Kubernetes服务发现

  • prometheus-rules.yaml #Prometheus告警规则

  • grafana.yaml #可视化展示

  • node-exporter.yml #采集节点资源,通过DaemonSet方式部署,并声明让Prometheus收集

  • kube-state-metrics.yaml #采集K8s资源,并声明让Prometheus收集

  • alertmanager-configmap.yaml #配置文件,配置发件人和收件人

  • alertmanager-deployment.yaml #部署Alertmanager告警组件

2. 案例部署

  • 配置文件编写

    [root@k8s-master prometheus]# cat prometheus-configmap.yaml 
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: prometheus-config
      namespace: ops
    data:
      # Main Prometheus configuration. The Deployment mounts this ConfigMap at
      # /etc/config and starts Prometheus with --config.file=/etc/config/prometheus.yml.
      prometheus.yml: |
        # Alerting rule files to load (provided by the prometheus-rules ConfigMap).
        rule_files:
        - /etc/config/rules/*.rules

        scrape_configs:
        # Prometheus scraping its own metrics endpoint.
        - job_name: prometheus
          static_configs:
          - targets:
            - localhost:9090

        # Kubernetes API servers: of all discovered endpoints, keep only the
        # "kubernetes" Service in the "default" namespace on its "https" port.
        - job_name: kubernetes-apiservers
          kubernetes_sd_configs:
          - role: endpoints
          relabel_configs:
          - action: keep
            regex: default;kubernetes;https
            source_labels:
            - __meta_kubernetes_namespace
            - __meta_kubernetes_service_name
            - __meta_kubernetes_endpoint_port_name
          scheme: https
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            # NOTE(review): insecure_skip_verify disables server certificate
            # verification, so ca_file is effectively not enforced. Confirm this
            # is intended before using outside a lab cluster.
            insecure_skip_verify: true
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # Kubelet metrics of every node.
        - job_name: kubernetes-nodes-kubelet
          kubernetes_sd_configs:
          - role: node  # discover the nodes of the cluster
          relabel_configs:
          # Copy every node label onto the target: the part of the label name
          # captured by (.+) becomes the new label name, the value is unchanged.
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          scheme: https
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            insecure_skip_verify: true
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # Container metrics (cAdvisor) exposed by the kubelet of every node.
        - job_name: kubernetes-nodes-cadvisor
          kubernetes_sd_configs:
          - role: node
          relabel_configs:
          # Copy every node label onto the target: the part of the label name
          # captured by (.+) becomes the new label name, the value is unchanged.
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          # The metrics are actually served at https://NodeIP:10250/metrics/cadvisor,
          # so replace the default metrics URL path.
          - target_label: __metrics_path__
            replacement: /metrics/cadvisor
          scheme: https
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            insecure_skip_verify: true
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # Pods behind Services, opted in through Service annotations.
        - job_name: kubernetes-service-endpoints
          kubernetes_sd_configs:
          - role: endpoints  # discover Pods through the Endpoints of each Service
          relabel_configs:
          # Do not scrape Services without the annotation prometheus.io/scrape: "true".
          - action: keep
            regex: true
            source_labels:
            - __meta_kubernetes_service_annotation_prometheus_io_scrape
          # Take the scrape scheme (http or https) from prometheus.io/scheme.
          - action: replace
            regex: (https?)
            source_labels:
            - __meta_kubernetes_service_annotation_prometheus_io_scheme
            target_label: __scheme__
          # Take the metrics URL path from prometheus.io/path.
          - action: replace
            regex: (.+)
            source_labels:
            - __meta_kubernetes_service_annotation_prometheus_io_path
            target_label: __metrics_path__
          # Rewrite the target address to use the port from prometheus.io/port.
          - action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            source_labels:
            - __address__
            - __meta_kubernetes_service_annotation_prometheus_io_port
            target_label: __address__
          # Copy every Service label onto the target: the part captured by (.+)
          # becomes the new label name, the value is unchanged.
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          # Add the namespace as a label.
          - action: replace
            source_labels:
            - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          # Add the Service name as a label.
          - action: replace
            source_labels:
            - __meta_kubernetes_service_name
            target_label: kubernetes_name

        # Pods opted in directly through Pod annotations.
        - job_name: kubernetes-pods
          kubernetes_sd_configs:
          - role: pod   # discover every Pod as a target
          relabel_configs:
          # Do not scrape Pods without the annotation prometheus.io/scrape: "true".
          - action: keep
            regex: true
            source_labels:
            - __meta_kubernetes_pod_annotation_prometheus_io_scrape
          # Take the scrape scheme (http or https) from prometheus.io/scheme,
          # mirroring the kubernetes-service-endpoints job. Pods without the
          # annotation keep the default scheme.
          - action: replace
            regex: (https?)
            source_labels:
            - __meta_kubernetes_pod_annotation_prometheus_io_scheme
            target_label: __scheme__
          # Take the metrics URL path from prometheus.io/path.
          - action: replace
            regex: (.+)
            source_labels:
            - __meta_kubernetes_pod_annotation_prometheus_io_path
            target_label: __metrics_path__
          # Rewrite the target address to use the port from prometheus.io/port.
          - action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            source_labels:
            - __address__
            - __meta_kubernetes_pod_annotation_prometheus_io_port
            target_label: __address__
          # Copy every Pod label onto the target: the part captured by (.+)
          # becomes the new label name, the value is unchanged.
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          # Add the namespace as a label.
          - action: replace
            source_labels:
            - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          # Add the Pod name as a label.
          - action: replace
            source_labels:
            - __meta_kubernetes_pod_name
            target_label: kubernetes_pod_name

        # Alerts are sent to the "alertmanager" Service on port 80.
        alerting:
          alertmanagers:
          - static_configs:
              - targets: ["alertmanager:80"]
    
    [root@k8s-master prometheus]# cat prometheus-deployment.yaml
    # Prometheus server plus a config-reload sidecar, backed by the "prometheus" PVC.
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: prometheus
      namespace: ops
      labels:
        k8s-app: prometheus
    spec:
      replicas: 1
      # Recreate terminates the old Pod before the new one starts, so a rollout
      # never has two Prometheus instances opening the same data directory.
      strategy:
        type: Recreate
      selector:
        matchLabels:
          k8s-app: prometheus
      template:
        metadata:
          labels:
            k8s-app: prometheus
        spec:
          serviceAccountName: prometheus
          initContainers:
          # Hands ownership of the data volume to 65534:65534 before the server
          # starts (presumably the UID/GID the prometheus image runs as — confirm).
          - name: "init-chown-data"
            image: "busybox:latest"
            imagePullPolicy: "IfNotPresent"
            command: ["chown", "-R", "65534:65534", "/data"]
            volumeMounts:
            - name: prometheus-data
              mountPath: /data
              subPath: ""
          containers:
            # Sidecar: watches the mounted ConfigMap directories and calls the
            # Prometheus reload endpoint when their content changes.
            - name: prometheus-server-configmap-reload
              image: "jimmidyson/configmap-reload:v0.1"
              imagePullPolicy: "IfNotPresent"
              args:
                - --volume-dir=/etc/config
                # Also watch the alert rules so edits to the prometheus-rules
                # ConfigMap are picked up without restarting the Pod.
                - --volume-dir=/etc/rules
                - --webhook-url=http://localhost:9090/-/reload
              volumeMounts:
                - name: config-volume
                  mountPath: /etc/config
                  readOnly: true
                # Mounted outside /etc/config on purpose: the sidecar only needs
                # to observe changes, not to share the server's path layout.
                - name: prometheus-rules
                  mountPath: /etc/rules
                  readOnly: true
              resources:
                limits:
                  cpu: 10m
                  memory: 10Mi
                requests:
                  cpu: 10m
                  memory: 10Mi

            - name: prometheus-server
              image: "prom/prometheus:v2.20.0"
              imagePullPolicy: "IfNotPresent"
              args:
                - --config.file=/etc/config/prometheus.yml
                - --storage.tsdb.path=/data
                - --web.console.libraries=/etc/prometheus/console_libraries
                - --web.console.templates=/etc/prometheus/consoles
                # Enables the /-/reload endpoint that the sidecar calls.
                - --web.enable-lifecycle
              ports:
                - containerPort: 9090
              readinessProbe:
                httpGet:
                  path: /-/ready
                  port: 9090
                initialDelaySeconds: 30
                timeoutSeconds: 30
              livenessProbe:
                httpGet:
                  path: /-/healthy
                  port: 9090
                initialDelaySeconds: 30
                timeoutSeconds: 30
              resources:
                limits:
                  cpu: 500m
                  memory: 1500Mi
                requests:
                  cpu: 200m
                  memory: 1000Mi

              volumeMounts:
                - name: config-volume
                  mountPath: /etc/config
                - name: prometheus-data
                  mountPath: /data
                  subPath: ""
                # Matches rule_files (/etc/config/rules/*.rules) in prometheus.yml.
                - name: prometheus-rules
                  mountPath: /etc/config/rules
          volumes:
            - name: config-volume
              configMap:
                name: prometheus-config
            - name: prometheus-rules
              configMap:
                name: prometheus-rules
            - name: prometheus-data
              persistentVolumeClaim:
                claimName: prometheus
    ---
    # Storage claim referenced by the Deployment's "prometheus-data" volume.
    kind: PersistentVolumeClaim
    apiVersion: v1
    metadata:
      namespace: ops
      name: prometheus
    spec:
      accessModes: ["ReadWriteMany"]
      storageClassName: managed-nfs-storage
      resources:
        requests:
          storage: 10Gi
    ---
    # Publishes port 9090 of the Pods labelled k8s-app=prometheus as NodePort 30090.
    kind: Service
    apiVersion: v1
    metadata:
      namespace: ops
      name: prometheus
    spec:
      type: NodePort
      selector:
        k8s-app: prometheus
      ports:
        - name: http
          protocol: TCP
          port: 9090
          targetPort: 9090
          nodePort: 30090
    ---
    # Identity used by the Prometheus Pod (serviceAccountName in the Deployment).
    kind: ServiceAccount
    apiVersion: v1
    metadata:
      namespace: ops
      name: prometheus
    ---
    # Cluster-wide read permissions granted to Prometheus.
    kind: ClusterRole
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      name: prometheus
    rules:
      # Read and watch the core-group objects listed here.
      - apiGroups: [""]
        resources: ["nodes", "nodes/metrics", "services", "endpoints", "pods"]
        verbs: ["get", "list", "watch"]
      # ConfigMaps may only be fetched individually.
      - apiGroups: [""]
        resources: ["configmaps"]
        verbs: ["get"]
      # Non-resource endpoint /metrics.
      - nonResourceURLs: ["/metrics"]
        verbs: ["get"]
    ---
    # Grants the "prometheus" ClusterRole to the ServiceAccount ops/prometheus.
    kind: ClusterRoleBinding
    apiVersion: rbac.authorization.k8s.io/v1
    metadata:
      name: prometheus
    subjects:
      - kind: ServiceAccount
        namespace: ops
        name: prometheus
    roleRef:
      kind: ClusterRole
      name: prometheus
      apiGroup: rbac.authorization.k8s.io
    
    [root@k8s-master prometheus]# cat prometheus-rules.yaml 
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: prometheus-rules
      namespace: ops
    data:
      # Each key is mounted as a file under /etc/config/rules by the Deployment;
      # prometheus.yml loads every file there that matches *.rules.
      general.rules: |
        groups:
        - name: general.rules
          rules:
          # A scrape target has reported up == 0 for one minute.
          - alert: InstanceDown
            expr: up == 0
            for: 1m
            labels:
              severity: error
            annotations:
              summary: "Instance {{ $labels.instance }} 停止工作"
              description: "{{ $labels.instance }} job {{ $labels.job }} 已经停止1分钟以上."

      node.rules: |
        groups:
        - name: node.rules
          rules:
          # Node metrics use the node_exporter >= 0.16 names (with the _bytes
          # suffix), in line with node_cpu_seconds_total used below.

          # Used space of ext4/xfs filesystems above 80%.
          - alert: NodeFilesystemUsage
            expr: |
              100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} /
              node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
              description: "{{ $labels.instance }}: {{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }})"

          # Memory in use (total minus free, cached and buffers) above 80%.
          - alert: NodeMemoryUsage
            expr: |
              100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) /
              node_memory_MemTotal_bytes * 100 > 80
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Instance {{ $labels.instance }} 内存使用率过高"
              description: "{{ $labels.instance }}内存使用大于80% (当前值: {{ $value }})"

          # Non-idle CPU share per instance above 60%.
          - alert: NodeCPUUsage
            expr: |
              100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 60
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Instance {{ $labels.instance }} CPU使用率过高"
              description: "{{ $labels.instance }}CPU使用大于60% (当前值: {{ $value }})"

          # The node's Ready condition has not been "true" for one minute.
          - alert: KubeNodeNotReady
            expr: |
              kube_node_status_condition{condition="Ready",status="true"} == 0
            for: 1m
            labels:
              severity: error
            annotations:
              message: '{{ $labels.node }} 已经超过1分钟没有准备好了.'

      pod.rules: |
        groups:
        - name: pod.rules
          rules:
          # CPU usage summed per Pod above 80 (percent of one core).
          - alert: PodCPUUsage
            expr: |
              sum(rate(container_cpu_usage_seconds_total{image!=""}[1m]) * 100) by (pod, namespace) > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} CPU使用大于80% (当前值: {{ $value }})"

          # RSS above 80% of the memory limit; "!= +inf" drops Pods whose limit
          # is 0, where the division yields +Inf.
          - alert: PodMemoryUsage
            expr: |
              sum(container_memory_rss{image!=""}) by(pod, namespace) /
              sum(container_spec_memory_limit_bytes{image!=""}) by(pod, namespace) * 100 != +inf > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 内存使用大于80% (当前值: {{ $value }})"

          # Inbound traffic above 30000 K/s; the rate is divided by 1000, so the
          # value is in kilobytes per second.
          - alert: PodNetworkReceive
            expr: |
              sum(rate(container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod,namespace)  > 30000
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 入口流量大于30MB/s (当前值: {{ $value }}K/s)"

          # Outbound traffic above 30000 K/s (same unit as PodNetworkReceive).
          - alert: PodNetworkTransmit
            expr: |
              sum(rate(container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod,namespace) > 30000
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 出口流量大于30MB/s (当前值: {{ $value }}K/s)"

          # The container restart counter changed within the last minute.
          - alert: PodRestart
            expr: |
              sum(changes(kube_pod_container_status_restarts_total[1m])) by (pod,namespace) > 0
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod重启 (当前值: {{ $value }})"

          # Pod is in phase Failed.
          - alert: PodFailed
            expr: |
              sum(kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
            for: 5s
            labels:
              severity: error
            annotations:
              summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Failed (当前值: {{ $value }})"

          # Pod has been in phase Pending for one minute.
          - alert: PodPending
            expr: |
              sum(kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
            for: 1m
            labels:
              severity: error
            annotations:
              summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Pending (当前值: {{ $value }})"
    
  • 创建命名空间

    [root@k8s-master prometheus]# kubectl create namespace ops
    namespace/ops created
    
  • 部署prometheus服务

    [root@k8s-master prometheus]# kubectl apply -f prometheus-configmap.yaml 
    configmap/prometheus-config created
    [root@k8s-master prometheus]# kubectl apply -f prometheus-deployment.yaml 
    deployment.apps/prometheus created
    persistentvolumeclaim/prometheus created
    service/prometheus created
    serviceaccount/prometheus created
    clusterrole.rbac.authorization.k8s.io/prometheus created
    clusterrolebinding.rbac.authorization.k8s.io/prometheus created
    [root@k8s-master prometheus]# kubectl apply -f prometheus-rules.yaml 
    
  • 验证prometheus服务是否启动

    [root@k8s-master prometheus]# kubectl get pods -n ops
    NAME                          READY   STATUS    RESTARTS   AGE
    prometheus-859dbbc5f7-rlsqp   2/2     Running   0          4h8m
    
  • 浏览器验证
    image
    image

posted @ 2021-11-12 11:24  七月流星雨  阅读(140)  评论(0)  编辑  收藏  举报