Installing Prometheus
I. Prerequisites
1. Create a prometheus namespace in the k8s cluster for service isolation, so the monitoring stack cannot affect other workloads.
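A minimal way to do this, assuming kubectl is already configured against the target cluster:
kubectl create namespace prometheus
kubectl get namespace prometheus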
II. Setting Up Persistent Storage
1. Install NFS. First check whether it is already installed:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# rpm -qa | grep nfs
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# rpm -qa | grep rpcbind
Install NFS:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# yum -y install nfs-utils rpcbind
Create the NFS data directory:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# mkdir -p /nfs/kubernetes/prometheus
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# chmod 777 /nfs/kubernetes/prometheus
Configure the export (the parent directory /nfs/kubernetes is exported):
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# echo "/nfs/kubernetes *(rw,no_root_squash,sync)" >>/etc/exports
[root@iZwz95iaf9ikzcszlcw8qpZ mnt]# cat /etc/exports
Reload the export configuration:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# exportfs -r
Start and enable NFS:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl start rpcbind
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl enable rpcbind
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl start nfs
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# systemctl enable nfs
Check the RPC service registration:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# rpcinfo -p localhost
Verify the export and test-mount it on a client (replace <NFS-server-IP> with the address of the NFS server):
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# showmount -e <NFS-server-IP>
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# mount -t nfs <NFS-server-IP>:/nfs/kubernetes/ /nfs/kubernetes/prometheus/
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# df -h
III. Deploying Prometheus on Kubernetes
Create a directory to hold the YAML manifests:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# mkdir -p /opt/prometheus/prometheus
1. Create the Prometheus ConfigMap. Note that the kube-state-metrics scrape job and the kube_* alert rules below assume kube-state-metrics is already deployed as a Service in the prometheus namespace.
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus.configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: prometheus
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 15s
    alerting:
      alertmanagers:
        - static_configs:
            - targets: ["localhost:9093"]
    rule_files:
      - /etc/prometheus/*.rules
    scrape_configs:
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']
      - job_name: 'kube-state-metrics'
        static_configs:
          - targets: ['kube-state-metrics.prometheus.svc.cluster.local:8080']
      - job_name: 'kubernetes-node'
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          # Rewrite the kubelet address (port 10250) to the node-exporter port 9003
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:9003'
            target_label: __address__
            action: replace
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          # Scrape cAdvisor through the API server proxy
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
  # Pod and node alerting rules (loaded via the *.rules glob above)
  pod.rules: |
    groups:
      - name: pod.rules
        rules:
          - alert: K8sClusterNodeNotReady
            expr: |
              kube_node_status_condition{condition="Ready",status!="true"} == 1
            for: 30s
            labels:
              severity: warning
            annotations:
              summary: "Node: {{ $labels.node }} status: NotReady"
          - alert: PodCPUUsage
            expr: |
              sum(irate(container_cpu_usage_seconds_total{image!="",container!="POD",container!=""}[1m])) by (pod,namespace) / (sum(container_spec_cpu_quota{image!="",container!="POD",container!=""}/100000) by (pod,namespace)) * 100 > 90
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} CPU usage above 90% (current value: {{ $value }})"
          - alert: PodMemoryUsage
            expr: |
              sum(container_memory_rss{container!="POD",container!="alertmanager",image!="",pod!=""}) by (pod,namespace) / sum(container_spec_memory_limit_bytes{container!="",container!="POD"}) by (pod,namespace) * 100 != +inf > 90
            for: 2m
            labels:
              severity: error
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} memory usage above 90% (current value: {{ $value }})"
          - alert: PodRestart
            expr: |
              sum(increase(kube_pod_container_status_restarts_total{}[1m])) by (namespace,pod) > 0
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} restarted (current value: {{ $value }})"
          - alert: PodFailed
            expr: |
              sum(kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
            for: 5s
            labels:
              severity: error
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in Failed state (current value: {{ $value }})"
          - alert: PodPending
            expr: |
              sum(kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
            for: 1m
            labels:
              severity: error
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in Pending state (current value: {{ $value }})"
          - alert: PodErrImagePull
            expr: |
              sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ErrImagePull"}) == 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in ErrImagePull state (current value: {{ $value }})"
          - alert: PodImagePullBackOff
            expr: |
              sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"}) == 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in ImagePullBackOff state (current value: {{ $value }})"
          - alert: PodCrashLoopBackOff
            expr: |
              sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) == 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in CrashLoopBackOff state (current value: {{ $value }})"
          - alert: PodInvalidImageName
            expr: |
              sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="InvalidImageName"}) == 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in InvalidImageName state (current value: {{ $value }})"
          - alert: PodCreateContainerConfigError
            expr: |
              sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CreateContainerConfigError"}) == 1
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Namespace: {{ $labels.namespace }} | Pod: {{ $labels.pod }} is in CreateContainerConfigError state (current value: {{ $value }})"
Apply the manifest:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus.configmap.yaml
configmap/prometheus-config created
Check the result:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl get configmaps -n prometheus |grep prometheus
2. Create the Prometheus Deployment (Prometheus plus an Alertmanager sidecar container)
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus.deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
        - image: prom/prometheus:v2.24.1
          name: prometheus
          command:
            - "/bin/prometheus"
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=40d"
            - "--web.enable-admin-api"  # enables the admin HTTP API, e.g. deleting time series
            - "--web.enable-lifecycle"  # enables hot reload via POST to localhost:9090/-/reload
          ports:
            - containerPort: 9090
              protocol: TCP
              name: http
          volumeMounts:
            - mountPath: "/prometheus"
              subPath: prometheus
              name: data
            - mountPath: "/etc/prometheus"
              name: config-volume
          resources:
            requests:
              cpu: 4000m
              memory: 8192Mi
            limits:
              cpu: 8000m
              memory: 16384Mi
        - name: alertmanager
          image: prom/alertmanager:v0.22.0
          imagePullPolicy: IfNotPresent
          args:
            - "--config.file=/etc/alertmanager/config.yml"
            - "--storage.path=/alertmanager/data"
          ports:
            - containerPort: 9093
              name: alertmanager  # port names must be unique within the Pod
          volumeMounts:
            - mountPath: "/etc/alertmanager"
              name: alertcfg
          resources:
            requests:
              cpu: 2000m
              memory: 4096Mi
            limits:
              cpu: 4000m
              memory: 8192Mi
      securityContext:
        runAsUser: 0
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: prometheus
        - name: config-volume
          configMap:
            name: prometheus-config
        - name: alertcfg
          configMap:
            name: alertmanager
3. Create the Prometheus PV and PVC
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-volume.yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    server: 10.11.33.57               # NFS server address
    path: /nfs/kubernetes/prometheus  # NFS export path
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus
  namespace: prometheus
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
Apply the manifest:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-volume.yaml
persistentvolume/prometheus created
persistentvolumeclaim/prometheus created
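Before starting the Deployment, confirm that the static PV bound to the claim; both should report STATUS Bound:
kubectl get pv prometheus
kubectl get pvc prometheus -n prometheus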
4. Create the RBAC resources
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - services
      - endpoints
      - pods
      - nodes/proxy
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - configmaps
      - nodes/metrics
    verbs:
      - get
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: prometheus
Apply the manifest:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-rbac.yaml
serviceaccount/prometheus created
clusterrole.rbac.authorization.k8s.io/prometheus created
clusterrolebinding.rbac.authorization.k8s.io/prometheus created
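A quick sanity check that the ServiceAccount received the expected permissions, using kubectl's ServiceAccount impersonation (both commands should print yes):
kubectl auth can-i list nodes --as=system:serviceaccount:prometheus:prometheus
kubectl auth can-i get configmaps --as=system:serviceaccount:prometheus:prometheus -n prometheus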
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus.deploy.yaml
Check the Pod status. Note that the Deployment also mounts the alertmanager ConfigMap, which is created in the Feishu alerting section below; the Pod cannot start until that ConfigMap exists.
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl get pod -n prometheus |grep prometheus
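Once the Pod is Running, the --web.enable-lifecycle flag set in the Deployment allows configuration changes (e.g. an edited ConfigMap) to be reloaded without restarting the container. A minimal sketch using a temporary port-forward; any route to port 9090 works:
kubectl -n prometheus port-forward deploy/prometheus 9090:9090 &
curl -X POST http://localhost:9090/-/reload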
5. Create the Prometheus Service, exposing Prometheus on 9090 and Alertmanager on 9093 via NodePort
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-svc.yaml
apiVersion: v1
kind: Service
metadata:
  namespace: prometheus
  name: prometheus
  labels:
    app: prometheus
spec:
  type: NodePort
  selector:
    app: prometheus
  ports:
    - name: http
      port: 9090
    - name: alertmanager
      port: 9093
      targetPort: 9093
Apply the YAML (the Service can also be created manually from the k8s dashboard):
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-svc.yaml
service/prometheus created
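The NodePort assigned to each port can be read back from the Service; the Prometheus UI is then reachable at http://<node-IP>:<nodePort>:
kubectl get svc prometheus -n prometheus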
IV. Monitoring Kubernetes Cluster Nodes and Applications
One node-exporter Pod runs on every node; when nodes are added to or removed from the cluster, the DaemonSet scales automatically.
1. Create the node-exporter DaemonSet
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# vim prometheus-node-exporter.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: prometheus
  labels:
    name: prometheus-node-exporter
    k8s-app: node-exporter
spec:
  selector:
    matchLabels:
      name: prometheus-node-exporter
  template:
    metadata:
      labels:
        name: prometheus-node-exporter
        app: node-exporter
    spec:
      # affinity:
      #   nodeAffinity:
      #     requiredDuringSchedulingIgnoredDuringExecution:
      #       nodeSelectorTerms:
      #         - matchExpressions:
      #             - key: type
      #               operator: NotIn
      #               values:
      #                 - virtual-kubelet
      hostPID: true
      hostIPC: true
      hostNetwork: true
      containers:
        - name: prometheus-node-exporter
          image: prom/node-exporter:v1.1.0
          ports:
            - containerPort: 9003
          resources:
            requests:
              cpu: 0.15
          securityContext:
            privileged: true
          args:
            - --web.listen-address
            - ":9003"
            - --path.procfs
            - /host/proc
            - --path.sysfs
            - /host/sys
            - --collector.filesystem.ignored-mount-points
            - '^/(sys|proc|dev|host|etc)($|/)'
          volumeMounts:
            - name: dev
              mountPath: /host/dev
            - name: proc
              mountPath: /host/proc
            - name: sys
              mountPath: /host/sys
            - name: rootfs
              mountPath: /rootfs
      tolerations:
        - key: "node-role.kubernetes.io/master"
          operator: "Exists"
          effect: "NoSchedule"
        # If your nodes are tainted, add matching tolerations such as:
        - key: "dedicated"        # taint key
          operator: "Exists"
          effect: "NoExecute"     # taint effect
        - key: "eci"
          operator: "Exists"
          effect: "NoSchedule"
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: dev
          hostPath:
            path: /dev
        - name: sys
          hostPath:
            path: /sys
        - name: rootfs
          hostPath:
            path: /
Apply the DaemonSet:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl create -f prometheus-node-exporter.yaml
daemonset.apps/prometheus-node-exporter created
Check the node-exporter Pods:
[root@iZwz913wwcdk2r7whthcrsZ prometheus]# kubectl get pod -n prometheus -o wide|grep node
Because node-exporter collects host-level metrics while running inside a container, the Pod spec above needs hostPID, hostIPC, hostNetwork and a privileged security context.
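Because the DaemonSet uses hostNetwork, each exporter listens directly on its node's IP at port 9003. A quick spot check (replace <node-IP> with a real node address):
curl -s http://<node-IP>:9003/metrics | head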
Installing Grafana
I. Prerequisites
The prometheus namespace and the NFS persistent-storage setup are the same as in the Prometheus section above; Grafana reuses that namespace and the NFS export created there, so those steps are not repeated here.
II. Deploying Grafana on Kubernetes
1. Create the Grafana Deployment
[root@iZwz913wwcdk2r7whthcrsZ grafana]# vim grafana_deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: prometheus
  labels:
    app: grafana
    k8s-app: grafana
spec:
  selector:
    matchLabels:
      k8s-app: grafana
      app: grafana
  revisionHistoryLimit: 10
  template:
    metadata:
      labels:
        app: grafana
        k8s-app: grafana
    spec:
      containers:
        - name: grafana
          image: grafana/grafana:7.5.2
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 3000
              name: grafana
          env:
            - name: GF_SECURITY_ADMIN_USER
              value: admin
            - name: GF_SECURITY_ADMIN_PASSWORD
              value: bwgfs127127
          readinessProbe:
            failureThreshold: 10
            httpGet:
              path: /api/health
              port: 3000
              scheme: HTTP
            initialDelaySeconds: 60
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 30
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /api/health
              port: 3000
              scheme: HTTP
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              cpu: 8000m
              memory: 16384Mi
            requests:
              cpu: 4000m
              memory: 8192Mi
          volumeMounts:
            - mountPath: /var/lib/grafana
              subPath: grafana
              name: storage
      securityContext:
        fsGroup: 472
        runAsUser: 0
      volumes:
        - name: storage
          persistentVolumeClaim:
            claimName: grafana
2. Create the Grafana PV and PVC
[root@iZwz913wwcdk2r7whthcrsZ grafana]# vim grafana_volume.yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: grafana
spec:
  capacity:
    storage: 100Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    server: 10.11.33.57               # NFS server address
    path: /nfs/kubernetes/prometheus  # NFS export path (Grafana writes to the grafana subPath)
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana
  namespace: prometheus
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
3. Create the Grafana Service
[root@iZwz913wwcdk2r7whthcrsZ grafana]# vim grafana_svc.yaml
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: prometheus
  labels:
    app: grafana
spec:
  type: NodePort
  ports:
    - port: 3000
  selector:
    app: grafana
4. Create the Grafana chown Job, which changes ownership of the NFS-backed data directory to Grafana's UID (472) so the grafana user can write to it
[root@iZwz913wwcdk2r7whthcrsZ grafana]# vim grafana_job.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: grafana-chown
  namespace: prometheus
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: grafana-chown
          command: ["chown", "-R", "472:0", "/var/lib/grafana"]
          image: busybox
          imagePullPolicy: IfNotPresent
          volumeMounts:
            - name: storage
              subPath: grafana
              mountPath: /var/lib/grafana
      volumes:
        - name: storage
          persistentVolumeClaim:
            claimName: grafana
5. Start the Grafana services:
[root@iZwz913wwcdk2r7whthcrsZ grafana]# kubectl create -f grafana_volume.yaml
persistentvolume/grafana created
persistentvolumeclaim/grafana created
[root@iZwz913wwcdk2r7whthcrsZ grafana]# kubectl create -f grafana_job.yaml
job.batch/grafana-chown created
[root@iZwz913wwcdk2r7whthcrsZ grafana]# kubectl apply -f grafana_deployment.yaml
deployment.apps/grafana created
[root@iZwz913wwcdk2r7whthcrsZ grafana]# kubectl create -f grafana_svc.yaml
Check the result:
[root@iZwz913wwcdk2r7whthcrsZ grafana]# kubectl get pod,svc -n prometheus |grep grafana
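After logging in at http://<node-IP>:<grafana-nodePort> with the admin credentials from the Deployment env, add Prometheus as a data source. The in-cluster URL follows from the Service name and namespace defined above. A hypothetical provisioning file, an alternative to configuring the data source in the UI, placed under /etc/grafana/provisioning/datasources/ if mounted into the container:
# grafana-datasource.yaml (hypothetical file name)
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus.prometheus.svc.cluster.local:9090
    isDefault: true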
Configuring Feishu Alerts for Prometheus
Alerts are routed through a PrometheusAlert instance (reachable at 47.106.204.250:8080 in this setup), which relays them to the Feishu bot webhook.
1. Configure the Alertmanager ConfigMap. Its name must match the alertmanager ConfigMap mounted by the Prometheus Deployment above:
[root@iZwz9ixflmggrbc154y075Z prometheus]# vim prometheus-alert-conf.yaml
kind: ConfigMap
apiVersion: v1
metadata:
  name: alertmanager
  namespace: prometheus
data:
  config.yml: |-
    global:
      resolve_timeout: 30s
    route:
      receiver: webhook
      group_wait: 10s
      group_interval: 20s
      repeat_interval: 30m
      group_by: [alertname]
      routes:
        - receiver: webhook
          group_wait: 10s
          match:
            team: node
    receivers:
      - name: webhook
        webhook_configs:
          # Alerts are forwarded to PrometheusAlert, which relays them to the Feishu bot
          - url: 'http://47.106.204.250:8080/prometheusalert?type=fs&tpl=prometheus-fs&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/327baeb5-2c41-4924-aab8-948ab9b4a92c&at=zhangsan@xxx.com'
            send_resolved: true
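The configuration can be checked before applying it. A sketch, assuming a local amtool binary and the config.yml block saved to a file:
amtool check-config config.yml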
2. Apply the Alertmanager configuration:
[root@iZwz9ixflmggrbc154y075Z prometheus]# kubectl create -f prometheus-alert-conf.yaml
3. Verify that Alertmanager is running.
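Since Alertmanager runs as a second container in the prometheus Pod, the Pod should report 2/2 ready once the ConfigMap exists; its readiness endpoint can also be probed through a temporary port-forward:
kubectl get pod -n prometheus -l app=prometheus
kubectl -n prometheus port-forward deploy/prometheus 9093:9093 &
curl -s http://localhost:9093/-/ready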
4. Alert template. This is the prometheus-fs template referenced by the tpl parameter in the webhook URL, configured in PrometheusAlert:
{{ range $k,$v := .alerts }}
{{- if eq $v.status "firing" }}
**[Pod Status] Alert**
Alert name: {{ $v.labels.alertname }}
Namespace: {{ $v.labels.namespace }}
Service: {{ $v.annotations.summary }}
Severity: {{ $v.labels.severity }}
Fired at: {{ GetCSTtime $v.startsAt }}
{{ else }}
**[Pod Status] Resolved**
Alert name: {{ $v.labels.alertname }}
Namespace: {{ $v.labels.namespace }}
Service: {{ $v.annotations.summary }}
Severity: {{ $v.labels.severity }}
Fired at: {{ GetCSTtime $v.startsAt }}
Resolved at: {{ GetCSTtime $v.endsAt }}
{{ end }}
{{ end }}