Installing Prometheus
I. Prerequisites
1. Create a prometheus namespace in the k8s cluster for service isolation, so the monitoring stack cannot affect other workloads.
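A minimal way to do this, assuming kubectl is already configured against the target cluster:
kubectl create namespace prometheus
kubectl get namespace prometheus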
II. Setting Up Persistent Storage
1. Install NFS. First check whether it is already installed:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# rpm -qa | grep nfs
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# rpm -qa | grep rpcbind
Install NFS:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# yum -y install nfs-utils rpcbind
Create the NFS data directory:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# mkdir -p /nfs/kubernetes/prometheus
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# chmod 777 /nfs/kubernetes/prometheus
Configure the export (the parent directory /nfs/kubernetes is exported):
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# echo "/nfs/kubernetes *(rw,no_root_squash,sync)" >>/etc/exports
[root@iZwz95iaf9ikzcszlcw8qpZ mnt]# cat /etc/exports
Reload the export configuration:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# exportfs -r
Start and enable NFS:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl start rpcbind
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl enable rpcbind
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl start nfs
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl enable nfs
Check the RPC service registration:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# rpcinfo -p localhost
Verify the export and test-mount it on a client (replace <NFS-server-IP> with the address of the NFS server):
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# showmount -e <NFS-server-IP>
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# mount -t nfs <NFS-server-IP>:/nfs/kubernetes/ /nfs/kubernetes/prometheus/
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# df -h
III. Deploying Prometheus on Kubernetes
Create a directory to hold the YAML manifests:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# mkdir -p /opt/prometheus/prometheus
1. Create the Prometheus ConfigMap. Note that the kube-state-metrics scrape job and the kube_* alert rules below assume kube-state-metrics is already deployed as a Service in the prometheus namespace.
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus.configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: prometheus
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 15s
    alerting:
      alertmanagers:
        - static_configs:
            - targets: ["localhost:9093"]
    rule_files:
      - /etc/prometheus/*.rules
    scrape_configs:
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']
      - job_name: 'kube-state-metrics'
        static_configs:
          - targets: ['kube-state-metrics.prometheus.svc.cluster.local:8080']
      - job_name: 'kubernetes-node'
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          # Rewrite the kubelet address (port 10250) to the node-exporter port 9003
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:9003'
            target_label: __address__
            action: replace
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          # Scrape cAdvisor through the API server proxy
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
  # Pod and node alerting rules (loaded via the *.rules glob above)
  pod.rules: |
    groups:
      - name: pod.rules
        rules:
          - alert: K8sClusterNodeNotReady
            expr: |
              kube_node_status_condition{condition="Ready",status!="true"} == 1
            for: 30s
            labels:
              severity: warning
            annotations:
              summary: "Node: {{ $labels.node }} status: NotReady"
          - alert: PodCPUUsage
            expr: |
              sum(irate(container_cpu_usage_seconds_total{image!="",container!="POD",container!=""}[1m])) by (pod,namespace) / (sum(container_spec_cpu_quota{image!="",container!="POD",container!=""}/100000) by (pod,namespace)) * 100 > 90
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} CPU usage above 90% (current value: {{ $value }})"
          - alert: PodMemoryUsage
            expr: |
              sum(container_memory_rss{container!="POD",container!="alertmanager",image!="",pod!=""}) by (pod,namespace) / sum(container_spec_memory_limit_bytes{container!="",container!="POD"}) by (pod,namespace) * 100 != +inf > 90
            for: 2m
            labels:
              severity: error
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} memory usage above 90% (current value: {{ $value }})"
          - alert: PodRestart
            expr: |
              sum(increase(kube_pod_container_status_restarts_total{}[1m])) by (namespace,pod) > 0
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} restarted (current value: {{ $value }})"
          - alert: PodFailed
            expr: |
              sum(kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
            for: 5s
            labels:
              severity: error
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in Failed state (current value: {{ $value }})"
          - alert: PodPending
            expr: |
              sum(kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
            for: 1m
            labels:
              severity: error
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in Pending state (current value: {{ $value }})"
          - alert: PodErrImagePull
            expr: |
              sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ErrImagePull"}) == 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in ErrImagePull state (current value: {{ $value }})"
          - alert: PodImagePullBackOff
            expr: |
              sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"}) == 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in ImagePullBackOff state (current value: {{ $value }})"
          - alert: PodCrashLoopBackOff
            expr: |
              sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) == 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in CrashLoopBackOff state (current value: {{ $value }})"
          - alert: PodInvalidImageName
            expr: |
              sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="InvalidImageName"}) == 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in InvalidImageName state (current value: {{ $value }})"
          - alert: PodCreateContainerConfigError
            expr: |
              sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CreateContainerConfigError"}) == 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in CreateContainerConfigError state (current value: {{ $value }})"
Apply the manifest:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus.configmap.yaml
configmap/prometheus-config created
Check the result:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl get configmaps -n prometheus |grep prometheus
2. Create the Prometheus Deployment (Prometheus plus an Alertmanager sidecar container)
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus.deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
        - image: prom/prometheus:v2.24.1
          name: prometheus
          command:
            - "/bin/prometheus"
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=40d"
            - "--web.enable-admin-api"  # enables the admin HTTP API, e.g. deleting time series
            - "--web.enable-lifecycle"  # enables hot reload via POST to localhost:9090/-/reload
          ports:
            - containerPort: 9090
              protocol: TCP
              name: http
          volumeMounts:
            - mountPath: "/prometheus"
              subPath: prometheus
              name: data
            - mountPath: "/etc/prometheus"
              name: config-volume
          resources:
            requests:
              cpu: 4000m
              memory: 8192Mi
            limits:
              cpu: 8000m
              memory: 16384Mi
        - name: alertmanager
          image: prom/alertmanager:v0.22.0
          imagePullPolicy: IfNotPresent
          args:
            - "--config.file=/etc/alertmanager/config.yml"
            - "--storage.path=/alertmanager/data"
          ports:
            - containerPort: 9093
              name: alertmanager  # port names must be unique within the Pod
          volumeMounts:
            - mountPath: "/etc/alertmanager"
              name: alertcfg
          resources:
            requests:
              cpu: 2000m
              memory: 4096Mi
            limits:
              cpu: 4000m
              memory: 8192Mi
      securityContext:
        runAsUser: 0
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: prometheus
        - name: config-volume
          configMap:
            name: prometheus-config
        - name: alertcfg
          configMap:
            name: alertmanager
3. Create the Prometheus PV and PVC
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-volume.yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    server: 10.11.33.57               # NFS server address
    path: /nfs/kubernetes/prometheus  # NFS export path
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus
  namespace: prometheus
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
Apply the manifest:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-volume.yaml
persistentvolume/prometheus created
persistentvolumeclaim/prometheus created
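Before starting the Deployment, confirm that the static PV bound to the claim; both should report STATUS Bound:
kubectl get pv prometheus
kubectl get pvc prometheus -n prometheus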
4. Create the RBAC resources
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - services
      - endpoints
      - pods
      - nodes/proxy
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - configmaps
      - nodes/metrics
    verbs:
      - get
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: prometheus
Apply the manifest:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-rbac.yaml
serviceaccount/prometheus created
clusterrole.rbac.authorization.k8s.io/prometheus created
clusterrolebinding.rbac.authorization.k8s.io/prometheus created
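A quick sanity check that the ServiceAccount received the expected permissions, using kubectl's ServiceAccount impersonation (both commands should print yes):
kubectl auth can-i list nodes --as=system:serviceaccount:prometheus:prometheus
kubectl auth can-i get configmaps --as=system:serviceaccount:prometheus:prometheus -n prometheus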
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus.deploy.yaml
Check the Pod status. Note that the Deployment also mounts the alertmanager ConfigMap, which is created in the Feishu alerting section below; the Pod cannot start until that ConfigMap exists.
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl get pod -n prometheus |grep prometheus
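Once the Pod is Running, the --web.enable-lifecycle flag set in the Deployment allows configuration changes (e.g. an edited ConfigMap) to be reloaded without restarting the container. A minimal sketch using a temporary port-forward; any route to port 9090 works:
kubectl -n prometheus port-forward deploy/prometheus 9090:9090 &
curl -X POST http://localhost:9090/-/reload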
5. Create the Prometheus Service, exposing Prometheus on 9090 and Alertmanager on 9093 via NodePort
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-svc.yaml
apiVersion: v1
kind: Service
metadata:
  namespace: prometheus
  name: prometheus
  labels:
    app: prometheus
spec:
  type: NodePort
  selector:
    app: prometheus
  ports:
    - name: http
      port: 9090
    - name: alertmanager
      port: 9093
      targetPort: 9093
Apply the YAML (the Service can also be created manually from the k8s dashboard):
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-svc.yaml
service/prometheus created
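The NodePort assigned to each port can be read back from the Service; the Prometheus UI is then reachable at http://<node-IP>:<nodePort>:
kubectl get svc prometheus -n prometheus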
IV. Monitoring Kubernetes Cluster Nodes and Applications
One node-exporter Pod runs on every node; when nodes are added to or removed from the cluster, the DaemonSet scales automatically.
1. Create the node-exporter DaemonSet
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-node-exporter.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: prometheus
  labels:
    name: prometheus-node-exporter
    k8s-app: node-exporter
spec:
  selector:
    matchLabels:
      name: prometheus-node-exporter
  template:
    metadata:
      labels:
        name: prometheus-node-exporter
        app: node-exporter
    spec:
      # affinity:
      #   nodeAffinity:
      #     requiredDuringSchedulingIgnoredDuringExecution:
      #       nodeSelectorTerms:
      #         - matchExpressions:
      #             - key: type
      #               operator: NotIn
      #               values:
      #                 - virtual-kubelet
      hostPID: true
      hostIPC: true
      hostNetwork: true
      containers:
        - name: prometheus-node-exporter
          image: prom/node-exporter:v1.1.0
          ports:
            - containerPort: 9003
          resources:
            requests:
              cpu: 0.15
          securityContext:
            privileged: true
          args:
            - --web.listen-address
            - ":9003"
            - --path.procfs
            - /host/proc
            - --path.sysfs
            - /host/sys
            - --collector.filesystem.ignored-mount-points
            - '^/(sys|proc|dev|host|etc)($|/)'
          volumeMounts:
            - name: dev
              mountPath: /host/dev
            - name: proc
              mountPath: /host/proc
            - name: sys
              mountPath: /host/sys
            - name: rootfs
              mountPath: /rootfs
      tolerations:
        - key: "node-role.kubernetes.io/master"
          operator: "Exists"
          effect: "NoSchedule"
        # If your nodes are tainted, add matching tolerations such as:
        - key: "dedicated"        # taint key
          operator: "Exists"
          effect: "NoExecute"     # taint effect
        - key: "eci"
          operator: "Exists"
          effect: "NoSchedule"
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: dev
          hostPath:
            path: /dev
        - name: sys
          hostPath:
            path: /sys
        - name: rootfs
          hostPath:
            path: /
Apply the DaemonSet:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-node-exporter.yaml
daemonset.apps/prometheus-node-exporter created
Check the node-exporter Pods:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl get pod -n prometheus -o wide|grep node
Because node-exporter collects host-level metrics while running inside a container, the Pod spec above needs hostPID, hostIPC, hostNetwork and a privileged security context.
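Because the DaemonSet uses hostNetwork, each exporter listens directly on its node's IP at port 9003. A quick spot check (replace <node-IP> with a real node address):
curl -s http://<node-IP>:9003/metrics | head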
Installing Grafana
I. Prerequisites
The prometheus namespace and the NFS persistent-storage setup are the same as in the Prometheus section above; Grafana reuses that namespace and the NFS export created there, so those steps are not repeated here.
II. Deploying Grafana on Kubernetes
1. Create the Grafana Deployment
[root@iZwz913wwcdk2r7whthcrsZ grafana]# vim grafana_deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: prometheus
  labels:
    app: grafana
    k8s-app: grafana
spec:
  selector:
    matchLabels:
      k8s-app: grafana
      app: grafana
  revisionHistoryLimit: 10
  template:
    metadata:
      labels:
        app: grafana
        k8s-app: grafana
    spec:
      containers:
        - name: grafana
          image: grafana/grafana:7.5.2
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 3000
              name: grafana
          env:
            - name: GF_SECURITY_ADMIN_USER
              value: admin
            - name: GF_SECURITY_ADMIN_PASSWORD
              value: bwgfs127127
          readinessProbe:
            failureThreshold: 10
            httpGet:
              path: /api/health
              port: 3000
              scheme: HTTP
            initialDelaySeconds: 60
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 30
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /api/health
              port: 3000
              scheme: HTTP
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              cpu: 8000m
              memory: 16384Mi
            requests:
              cpu: 4000m
              memory: 8192Mi
          volumeMounts:
            - mountPath: /var/lib/grafana
              subPath: grafana
              name: storage
      securityContext:
        fsGroup: 472
        runAsUser: 0
      volumes:
        - name: storage
          persistentVolumeClaim:
            claimName: grafana
2. Create the Grafana PV and PVC
[root@iZwz913wwcdk2r7whthcrsZ grafana]# vim grafana_volume.yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: grafana
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    server: 10.11.33.57               # NFS server address
    path: /nfs/kubernetes/prometheus  # NFS export path (Grafana writes to the grafana subPath)
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana
  namespace: prometheus
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
3. Create the Grafana Service
[root@iZwz913wwcdk2r7whthcrsZ grafana]# vim grafana_svc.yaml
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: prometheus
  labels:
    app: grafana
spec:
  type: NodePort
  ports:
    - port: 3000
  selector:
    app: grafana
4. Create the Grafana chown Job, which changes ownership of the NFS-backed data directory to Grafana's UID (472) so the grafana user can write to it
[root@iZwz913wwcdk2r7whthcrsZ grafana]# vim grafana_job.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: grafana-chown
  namespace: prometheus
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: grafana-chown
          command: ["chown", "-R", "472:0", "/var/lib/grafana"]
          image: busybox
          imagePullPolicy: IfNotPresent
          volumeMounts:
            - name: storage
              subPath: grafana
              mountPath: /var/lib/grafana
      volumes:
        - name: storage
          persistentVolumeClaim:
            claimName: grafana
5. Start the Grafana services:
[root@iZwz913wwcdk2r7whthcrsZ grafana]# kubectl create -f grafana_volume.yaml
persistentvolume/grafana created
persistentvolumeclaim/grafana created
[root@iZwz913wwcdk2r7whthcrsZ grafana]# kubectl create -f grafana_job.yaml
job.batch/grafana-chown created
[root@iZwz913wwcdk2r7whthcrsZ grafana]# kubectl apply -f grafana_deployment.yaml
deployment.apps/grafana created
[root@iZwz913wwcdk2r7whthcrsZ grafana]# kubectl create -f grafana_svc.yaml
Check the result:
[root@iZwz913wwcdk2r7whthcrsZ grafana]# kubectl get pod,svc -n prometheus |grep grafana
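After logging in at http://<node-IP>:<grafana-nodePort> with the admin credentials from the Deployment env, add Prometheus as a data source. The in-cluster URL follows from the Service name and namespace defined above. A hypothetical provisioning file, an alternative to configuring the data source in the UI, placed under /etc/grafana/provisioning/datasources/ if mounted into the container:
# grafana-datasource.yaml (hypothetical file name)
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus.prometheus.svc.cluster.local:9090
    isDefault: true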
Configuring Feishu Alerts for Prometheus
Alerts are routed through a PrometheusAlert instance (reachable at 47.106.204.250:8080 in this setup), which relays them to the Feishu bot webhook.
1. Configure the Alertmanager ConfigMap. Its name must match the alertmanager ConfigMap mounted by the Prometheus Deployment above:
[root@iZwz9ixflmggrbc154y075Z prometheus]# vim prometheus-alert-conf.yaml
kind: ConfigMap
apiVersion: v1
metadata:
  name: alertmanager
  namespace: prometheus
data:
  config.yml: |-
    global:
      resolve_timeout: 30s
    route:
      receiver: webhook
      group_wait: 10s
      group_interval: 20s
      repeat_interval: 30m
      group_by: [alertname]
      routes:
        - receiver: webhook
          group_wait: 10s
          match:
            team: node
    receivers:
      - name: webhook
        webhook_configs:
          # Alerts are forwarded to PrometheusAlert, which relays them to the Feishu bot
          - url: 'http://47.106.204.250:8080/prometheusalert?type=fs&tpl=prometheus-fs&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/327baeb5-2c41-4924-aab8-948ab9b4a92c&at=zhangsan@xxx.com'
            send_resolved: true
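The configuration can be checked before applying it. A sketch, assuming a local amtool binary and the config.yml block saved to a file:
amtool check-config config.yml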
2. Apply the Alertmanager configuration:
[root@iZwz9ixflmggrbc154y075Z prometheus]# kubectl create -f prometheus-alert-conf.yaml
3. Verify that Alertmanager is running.
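Since Alertmanager runs as a second container in the prometheus Pod, the Pod should report 2/2 ready once the ConfigMap exists; its readiness endpoint can also be probed through a temporary port-forward:
kubectl get pod -n prometheus -l app=prometheus
kubectl -n prometheus port-forward deploy/prometheus 9093:9093 &
curl -s http://localhost:9093/-/ready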
4. Alert template. This is the prometheus-fs template referenced by the tpl parameter in the webhook URL, configured in PrometheusAlert:
{{ range $k,$v := .alerts }}
{{- if eq $v.status "firing" }}
**[Pod Status] Alert**
Alert name: {{ $v.labels.alertname }}
Namespace: {{ $v.labels.namespace }}
Service: {{ $v.annotations.summary }}
Severity: {{ $v.labels.severity }}
Fired at: {{ GetCSTtime $v.startsAt }}
{{ else }}
**[Pod Status] Resolved**
Alert name: {{ $v.labels.alertname }}
Namespace: {{ $v.labels.namespace }}
Service: {{ $v.annotations.summary }}
Severity: {{ $v.labels.severity }}
Fired at: {{ GetCSTtime $v.startsAt }}
Resolved at: {{ GetCSTtime $v.endsAt }}
{{ end }}
{{ end }}