# Overview
Grafana Loki is a log aggregation tool, and it is the core of a fully featured logging stack.
Loki is a datastore optimized for efficiently holding log data. The efficient indexing of log data distinguishes Loki from other logging systems.
Unlike other logging systems, a Loki index is built from labels, leaving the original log message itself unindexed.
An agent (also called a client) acquires logs, turns the logs into streams, and pushes the streams to Loki through an HTTP API. The Promtail agent is designed for Loki installations, but many other agents integrate seamlessly with Loki.
Loki is a horizontally scalable, highly available, multi-tenant log aggregation system inspired by Prometheus. It is designed to be very cost-effective and easy to operate. It does not index the contents of the logs, but rather a set of labels for each log stream.
Compared to other log aggregation systems, Loki:
- does not full-text index the logs. By storing compressed, unstructured logs and only indexing metadata, Loki is simpler to operate and cheaper to run.
- indexes and groups log streams using the same labels you already use with Prometheus, enabling you to seamlessly switch between metrics and logs using those same labels.
- is an especially good fit for storing Kubernetes Pod logs. Metadata such as Pod labels is automatically scraped and indexed.
- has native support in Grafana (requires Grafana v6.0 or later).
A Loki-based logging stack consists of three components:
- promtail is the agent, responsible for gathering logs and sending them to Loki.
- loki is the main server, responsible for storing logs and processing queries.
- Grafana is used for querying and displaying the logs.
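To make the stream model concrete, here is a minimal sketch of what any agent does under the hood: it attaches a label set to a batch of lines and POSTs them to Loki's push API. The endpoint path is the real Loki API; the address localhost:3100 and the labels app/env are illustrative assumptions only.

```bash
# Push one hand-made log line to Loki; only the label set {app, env} is indexed,
# the line itself is stored compressed.
curl -X POST http://localhost:3100/loki/api/v1/push \
  -H "Content-Type: application/json" \
  -d '{
    "streams": [{
      "stream": { "app": "demo", "env": "test" },
      "values": [[ "'$(date +%s%N)'", "hello from curl" ]]
    }]
  }'
```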
# Configuring Promtail
# Official reference: https://grafana.com/docs/loki/latest/clients/promtail/configuration/#kubernetes_sd_config
# The Loki log-alerting supplement is at the bottom of this article
# Deploying Loki
1. Create the loki namespace
```yaml
[root@master1 loki]# cat loki-ns.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: loki
```
```bash
kubectl apply -f loki-ns.yaml
```
2. Create the corresponding RBAC permissions and ServiceAccount
```yaml
[root@master1 loki]# cat loki-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: loki
  namespace: loki
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: loki
  namespace: loki
rules:
- apiGroups:
  - extensions
  resourceNames:
  - loki
  resources:
  - podsecuritypolicies
  verbs:
  - use
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: loki
  namespace: loki
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: loki
subjects:
- kind: ServiceAccount
  name: loki
```
```bash
kubectl apply -f loki-rbac.yaml
```
3. Create a ConfigMap that holds the Loki configuration file
```yaml
[root@master1 loki]# cat loki-cm.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki
  namespace: loki
  labels:
    app: loki
data:
  loki.yaml: |
    auth_enabled: false
    ingester:
      chunk_idle_period: 3m         # How long a chunk that has not reached the maximum block size may go without updates before it is flushed
      chunk_block_size: 262144
      chunk_retain_period: 1m       # How long a chunk is kept in memory after it has been flushed
      max_transfer_retries: 0       # Number of times to try and transfer chunks when leaving before falling back to flushing to the store. Zero = no transfers are done.
      wal:
        enabled: true
        dir: /data/loki/wal
      lifecycler:                   # Configures the ingester lifecycle and where it registers for discovery
        ring:
          kvstore:
            store: inmemory         # Backend store for the ring; consul, etcd, and inmemory are supported
          replication_factor: 1     # Number of ingesters to write to and read from; at least 1 (defaults to 3 for redundancy and resilience)
    limits_config:
      enforce_metric_name: false
      reject_old_samples: true            # Whether old samples are rejected
      reject_old_samples_max_age: 168h    # Maximum age beyond which old samples are rejected
    schema_config:                  # Configures which index schemas apply from which point in time
      configs:
      - from: 2022-07-21            # Date the index schema takes effect. If this is the only schema_config, use a date in the past; otherwise use the date you want to switch schemas
        store: boltdb-shipper       # Store used for the index, e.g. cassandra, bigtable, dynamodb, or boltdb
        object_store: filesystem    # Store used for chunks, e.g. gcs, s3, inmemory, filesystem, cassandra; defaults to the value of store if omitted
        schema: v11
        index:                      # Configures how the index is updated and stored
          prefix: index_            # Prefix for all period tables
          period: 24h               # Table period
    server:
      http_listen_port: 3100
    storage_config:                 # Configures one or more stores for the index and chunks
      boltdb_shipper:
        active_index_directory: /data/loki/boltdb-shipper-active
        cache_location: /data/loki/boltdb-shipper-cache
        cache_ttl: 24h
        shared_store: filesystem
      filesystem:
        directory: /data/loki/chunks
    chunk_store_config:             # Configures how chunks are cached and how long to wait before saving them to the store
      max_look_back_period: 0s      # Limits how far back queries may look; disabled by default; should be less than or equal to table_manager.retention_period
    table_manager:
      retention_deletes_enabled: true   # Switch for log retention; enables table retention deletes
      retention_period: 48h             # Log retention period; must be a multiple of the index/chunk table period
    compactor:
      working_directory: /data/loki/boltdb-shipper-compactor
      shared_store: filesystem
```
```bash
kubectl apply -f loki-cm.yaml
```
4. Deploy the Loki service
```bash
kubectl apply -f loki.yaml
```
```yaml
[root@master1 loki]# cat loki.yaml
apiVersion: v1
kind: Service
metadata:
  name: loki
  namespace: loki
  labels:
    app: loki
spec:
  type: ClusterIP
  ports:
  - port: 3100
    protocol: TCP
    name: http-metrics
    targetPort: http-metrics
  selector:
    app: loki
---
apiVersion: v1
kind: Service
metadata:
  name: loki-outer
  namespace: loki
  labels:
    app: loki
spec:
  type: NodePort
  ports:
  - port: 3100
    protocol: TCP
    name: http-metrics
    targetPort: http-metrics
    nodePort: 32537
  selector:
    app: loki
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: loki
  namespace: loki
  labels:
    app: loki
spec:
  podManagementPolicy: OrderedReady
  replicas: 1
  selector:
    matchLabels:
      app: loki
  serviceName: loki
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: loki
    spec:
      serviceAccountName: loki
      initContainers: []
      containers:
      - name: loki
        image: 192.168.24.33:32800/base/loki:2.6.1
        imagePullPolicy: IfNotPresent
        args:
        - -config.file=/etc/loki/loki.yaml
        volumeMounts:
        - name: config
          mountPath: /etc/loki
        - name: storage
          mountPath: /data
        ports:
        - name: http-metrics
          containerPort: 3100
          protocol: TCP
        livenessProbe:
          httpGet:
            path: /ready
            port: http-metrics
            scheme: HTTP
          initialDelaySeconds: 45
          timeoutSeconds: 1
          periodSeconds: 10
          successThreshold: 1
          failureThreshold: 3
        readinessProbe:
          httpGet:
            path: /ready
            port: http-metrics
            scheme: HTTP
          initialDelaySeconds: 45
          timeoutSeconds: 1
          periodSeconds: 10
          successThreshold: 1
          failureThreshold: 3
        securityContext:
          readOnlyRootFilesystem: true
      terminationGracePeriodSeconds: 4800
      volumes:
      - name: config
        configMap:
          defaultMode: 420
          name: loki
      - emptyDir: {}
        name: storage
```
# Check that the service is running properly
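A quick way to check, assuming the manifests above were applied unchanged (replace `<node-ip>` with the address of any cluster node):

```bash
# Wait for the StatefulSet pod to become Ready
kubectl get pods -n loki -w

# Loki answers "ready" on its readiness endpoint once it is serving requests;
# 32537 is the NodePort exposed by the loki-outer Service above
curl http://<node-ip>:32537/ready

# The configuration Loki actually loaded can be read back from /config
curl http://<node-ip>:32537/config
```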
# Deploying Promtail to collect Kubernetes cluster logs
1. Create a ConfigMap that holds the Promtail configuration file
```bash
kubectl apply -f loki-promtail-configmap.yaml
```
```yaml
[root@master1 loki]# cat loki-promtail-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-promtail
  namespace: loki
  labels:
    app: promtail
data:
  promtail.yaml: |
    client:                       # Configures how Promtail connects to the Loki instance
      url: http://loki:3100/loki/api/v1/push
      backoff_config:             # Configures how requests to Loki are retried when they fail
        max_period: 5m
        max_retries: 10
        min_period: 500ms
      batchsize: 1048576          # Maximum batch size, in bytes, to send to Loki
      batchwait: 1s               # Maximum time to wait before sending a batch, even if the maximum batch size has not been reached
      external_labels: {}         # Static labels added to every log sent to Loki
      timeout: 10s                # Maximum time to wait for the server to respond to a request
    positions:                    # Where the file read offsets are stored
      filename: /run/promtail/positions.yaml
    server:                       # Server listen port
      http_listen_port: 3101
    target_config:
      sync_period: 10s            # Update the positions file every 10s
    # The scrape_configs block configures how Promtail scrapes logs from a series of targets using the specified methods
    scrape_configs:
    - job_name: kubernetes-pods-name
      pipeline_stages:            # Describes how logs from the targets are transformed
      - docker: {}                # Parses log content in the Docker container format; selected by name, defined with an empty object
      # Kubernetes SD configurations retrieve scrape targets from the Kubernetes REST API
      # and always stay synchronized with the cluster state
      kubernetes_sd_configs:
      - role: pod                 # role can discover targets of type: pod, service, node, endpoints, ingress
      relabel_configs:            # Label rewriting rules
      - source_labels:            # Source labels on the target
        - __meta_kubernetes_pod_label_name
        # Label to replace; with no action configured, the default is replace
        # (the value of __meta_kubernetes_pod_label_name replaces the value of the target label __service__)
        target_label: __service__
      - source_labels:
        - __meta_kubernetes_pod_node_name
        target_label: __host__
      - action: drop
        regex: ''
        source_labels:
        - __service__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - action: replace
        replacement: $1
        separator: /
        source_labels:
        - __meta_kubernetes_namespace
        - __service__
        target_label: job
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_name
        target_label: pod
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_container_name
        target_label: container
      - replacement: /var/log/pods/*$1/*.log
        separator: /
        source_labels:
        - __meta_kubernetes_pod_uid
        - __meta_kubernetes_pod_container_name
        target_label: __path__
    - job_name: kubernetes-pods-app
      pipeline_stages:
      - docker: {}
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - action: drop
        regex: .+
        source_labels:
        - __meta_kubernetes_pod_label_name
      - source_labels:
        - __meta_kubernetes_pod_label_app
        target_label: __service__
      - source_labels:
        - __meta_kubernetes_pod_node_name
        target_label: __host__
      - action: drop
        regex: ''
        source_labels:
        - __service__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - action: replace
        replacement: $1
        separator: /
        source_labels:
        - __meta_kubernetes_namespace
        - __service__
        target_label: job
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_name
        target_label: pod
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_container_name
        target_label: container
      - replacement: /var/log/pods/*$1/*.log
        separator: /
        source_labels:
        - __meta_kubernetes_pod_uid
        - __meta_kubernetes_pod_container_name
        target_label: __path__
    - job_name: kubernetes-pods-direct-controllers
      pipeline_stages:
      - docker: {}
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - action: drop
        regex: .+
        separator: ''
        source_labels:
        - __meta_kubernetes_pod_label_name
        - __meta_kubernetes_pod_label_app
      - action: drop
        regex: '[0-9a-z-.]+-[0-9a-f]{8,10}'
        source_labels:
        - __meta_kubernetes_pod_controller_name
      - source_labels:
        - __meta_kubernetes_pod_controller_name
        target_label: __service__
      - source_labels:
        - __meta_kubernetes_pod_node_name
        target_label: __host__
      - action: drop
        regex: ''
        source_labels:
        - __service__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - action: replace
        replacement: $1
        separator: /
        source_labels:
        - __meta_kubernetes_namespace
        - __service__
        target_label: job
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_name
        target_label: pod
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_container_name
        target_label: container
      - replacement: /var/log/pods/*$1/*.log
        separator: /
        source_labels:
        - __meta_kubernetes_pod_uid
        - __meta_kubernetes_pod_container_name
        target_label: __path__
    - job_name: kubernetes-pods-indirect-controller
      pipeline_stages:
      - docker: {}
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - action: drop
        regex: .+
        separator: ''
        source_labels:
        - __meta_kubernetes_pod_label_name
        - __meta_kubernetes_pod_label_app
      - action: keep
        regex: '[0-9a-z-.]+-[0-9a-f]{8,10}'
        source_labels:
        - __meta_kubernetes_pod_controller_name
      - action: replace
        regex: '([0-9a-z-.]+)-[0-9a-f]{8,10}'
        source_labels:
        - __meta_kubernetes_pod_controller_name
        target_label: __service__
      - source_labels:
        - __meta_kubernetes_pod_node_name
        target_label: __host__
      - action: drop
        regex: ''
        source_labels:
        - __service__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - action: replace
        replacement: $1
        separator: /
        source_labels:
        - __meta_kubernetes_namespace
        - __service__
        target_label: job
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_name
        target_label: pod
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_container_name
        target_label: container
      - replacement: /var/log/pods/*$1/*.log
        separator: /
        source_labels:
        - __meta_kubernetes_pod_uid
        - __meta_kubernetes_pod_container_name
        target_label: __path__
    - job_name: kubernetes-pods-static
      pipeline_stages:
      - docker: {}
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - action: drop
        regex: ''
        source_labels:
        - __meta_kubernetes_pod_annotation_kubernetes_io_config_mirror
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_label_component
        target_label: __service__
      - source_labels:
        - __meta_kubernetes_pod_node_name
        target_label: __host__
      - action: drop
        regex: ''
        source_labels:
        - __service__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - action: replace
        replacement: $1
        separator: /
        source_labels:
        - __meta_kubernetes_namespace
        - __service__
        target_label: job
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_name
        target_label: pod
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_container_name
        target_label: container
      - replacement: /var/log/pods/*$1/*.log
        separator: /
        source_labels:
        - __meta_kubernetes_pod_annotation_kubernetes_io_config_mirror
        - __meta_kubernetes_pod_container_name
        target_label: __path__
```
2. Configure RBAC permissions for Promtail
```bash
kubectl apply -f loki-promtail-rbac.yaml
```
```yaml
[root@master1 loki]# cat loki-promtail-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: loki-promtail
  labels:
    app: promtail
  namespace: loki
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  labels:
    app: promtail
  name: promtail-clusterrole
  namespace: loki
rules:
- apiGroups: [""]   # "" indicates the core API group
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "watch", "list"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: promtail-clusterrolebinding
  labels:
    app: promtail
  namespace: loki
subjects:
- kind: ServiceAccount
  name: loki-promtail
  namespace: loki
roleRef:
  kind: ClusterRole
  name: promtail-clusterrole
  apiGroup: rbac.authorization.k8s.io
```
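As an optional sanity check, impersonation can confirm the binding took effect before the DaemonSet is deployed:

```bash
# Should print "yes" if the ClusterRoleBinding above is active
kubectl auth can-i list pods --all-namespaces \
  --as=system:serviceaccount:loki:loki-promtail
```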
3. Deploy the Promtail service
```bash
kubectl apply -f loki-promtail.yaml
```
```yaml
[root@master1 loki]# cat loki-promtail.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: loki-promtail
  namespace: loki
  labels:
    app: promtail
spec:
  selector:
    matchLabels:
      app: promtail
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: promtail
    spec:
      serviceAccountName: loki-promtail
      containers:
      - name: promtail
        image: 192.168.24.33:32800/base/promtail:2.6.1
        imagePullPolicy: IfNotPresent
        args:
        - -config.file=/etc/promtail/promtail.yaml
        - -client.url=http://loki:3100/loki/api/v1/push
        env:
        - name: HOSTNAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        volumeMounts:
        - mountPath: /etc/promtail
          name: config
        - mountPath: /run/promtail
          name: run
        - mountPath: /var/lib/docker/containers
          name: docker
          readOnly: true
        - mountPath: /var/log/pods
          name: pods
          readOnly: true
        ports:
        - containerPort: 3101
          name: http-metrics
          protocol: TCP
        securityContext:
          readOnlyRootFilesystem: true
          runAsGroup: 0
          runAsUser: 0
        readinessProbe:
          failureThreshold: 5
          httpGet:
            path: /ready
            port: http-metrics
            scheme: HTTP
          initialDelaySeconds: 10
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 1
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
        operator: Exists
      volumes:
      - name: config
        configMap:
          defaultMode: 420
          name: loki-promtail
      - name: run
        hostPath:
          path: /run/promtail
          type: ""
      - name: docker
        hostPath:
          path: /var/lib/docker/containers
      - name: pods
        hostPath:
          path: /var/log/pods
```
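Once the DaemonSet is running, each Promtail pod serves a small status UI on the HTTP port configured earlier (3101); one hedged way to confirm that pod log files are actually being tailed:

```bash
# Forward a local port to one promtail pod and inspect its discovered targets
kubectl -n loki port-forward daemonset/loki-promtail 3101:3101 &
curl -s http://localhost:3101/targets
```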
# Check that all services in the loki namespace are healthy
```
[root@master1 loki]# kubectl get all -n loki
NAME                      READY   STATUS    RESTARTS   AGE
pod/loki-0                1/1     Running   0          36h
pod/loki-promtail-759dv   1/1     Running   0          35h
pod/loki-promtail-f9v5f   1/1     Running   0          35h
pod/loki-promtail-j4wdm   1/1     Running   0          35h

NAME                 TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE
service/loki         ClusterIP   10.100.102.10   <none>        3100/TCP         36h
service/loki-outer   NodePort    10.99.36.139    <none>        3100:32537/TCP   36h

NAME                           DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR   AGE
daemonset.apps/loki-promtail   3         3         3       3            3           <none>          35h

NAME                    READY   AGE
statefulset.apps/loki   1/1     36h
```
# Configuring Grafana to connect to Loki
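In the Grafana UI this is Configuration → Data sources → Add data source → Loki, with the URL pointing at the loki Service. As a sketch, the same data source could be file-provisioned; this assumes Grafana runs inside the same cluster (from outside, point url at the NodePort, http://<node-ip>:32537, instead):

```yaml
# Hypothetical provisioning file, e.g. /etc/grafana/provisioning/datasources/loki.yaml
apiVersion: 1
datasources:
- name: Loki
  type: loki
  access: proxy
  url: http://loki.loki.svc:3100   # <service>.<namespace>.svc, resolvable in-cluster
```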
# Verifying Loki data
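Besides browsing in Grafana Explore, the ingested data can be checked directly against Loki's HTTP API; a hedged example reusing the NodePort from above:

```bash
# Which values has Loki indexed for the "app" label?
curl -s "http://<node-ip>:32537/loki/api/v1/label/app/values"

# Fetch recent log lines from promtail's own pods (last hour by default)
curl -G -s "http://<node-ip>:32537/loki/api/v1/query_range" \
  --data-urlencode 'query={namespace="loki", app="promtail"}' \
  --data-urlencode 'limit=10'
```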
# Loki log alerting test
Prerequisites
1. Enable the ruler configuration in Loki (the following is an example)
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki
  namespace: loki
  labels:
    app: loki
data:
  loki.yaml: |
    ruler:
      alertmanager_url: http://alertmanager-service.ns-monitor:9093  # Alertmanager address
      enable_alertmanager_v2: true
      enable_api: true            # Enables the Loki rules API
      enable_sharding: true       # Shards the rules, supporting multiple ruler instances
      ring:                       # Consistent-hash ring for the ruler service, used for multiple instances and sharding
        kvstore:
          store: inmemory
      rule_path: /data/loki/tmp_rules   # Temporary storage path for rule files
      storage:                    # Rule storage; local and object storage are the main options
        type: local
        local:
          directory: /data/loki/rules   # Storage path for rule files
      flush_period: 1m            # Rule load period
    auth_enabled: false
    ingester:
      ...                         # The rest of the configuration is omitted; it is identical to the ConfigMap above
```
2. Configure the alerting-rule YAML. Ignore the second alert, [test-prod-error-Log]; it exists because /var/log/message was also shipped to Loki to catch error messages there.
```yaml
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: lokirule
  namespace: loki
  labels:
    app: lokirule
data:
  ruler.yaml: |
    groups:
    - name: test-error-info
      rules:
      - alert: export-server-Error-Log
        expr: |
          sum by(app,job,message) (count_over_time({app="tomcat"} |~ "error:|Error:|info" | regexp "(?P<message>.{0,150})"[15m])) > 0
        for: 10m
        labels:
          severity: log
        annotations:
          description: "Error log \r\n >Message: {{ $labels.message }} \r\n >App: {{ $labels.app }} \r\n >Job: {{ $labels.job }}"
      - alert: test-prod-error-Log
        expr: |
          sum by(app,job,message) (count_over_time({job="message"} |~ "error:|Error" | regexp "(?P<message>.{0,150})"[15m])) > 0
        for: 10m
        labels:
          severity: log
        annotations:
          description: "Error log \r\n >Message: {{ $labels.message }} \r\n >App: {{ $labels.app }} \r\n >Job: {{ $labels.job }}"
```
A brief explanation of the alerting rule configuration
```yaml
groups:
- name: <string>                  # Group name
  rules:
  - alert: <string>               # Alert name
    expr: <string>                # LogQL query expression
    [ for: <duration> | default = 0s ]   # How long the condition must hold before the alert fires
    labels:                       # Custom labels on the alert event
      [ <labelname>: <tmpl_string> ]
    annotations:                  # Annotations on the alert event
      [ <labelname>: <tmpl_string> ]
```
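Because expr is ordinary LogQL, a rule expression can be dry-run as an instant query before it is wired into the ruler; a sketch against the NodePort used earlier:

```bash
# Evaluate the first alert's expression once; any non-empty result means it would fire
curl -G -s "http://<node-ip>:32537/loki/api/v1/query" \
  --data-urlencode 'query=sum by(app,job,message) (count_over_time({app="tomcat"} |~ "error:|Error:|info" | regexp "(?P<message>.{0,150})"[15m]))'
```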
3. Mount the rules into the designated directory (only the volume-related parts are shown here)
```yaml
...
        volumeMounts:
        - name: config
          mountPath: /etc/loki
        - name: storage
          mountPath: /data
        - name: lokirule
          mountPath: /data/loki/rules
...
      volumes:
      - name: config
        configMap:
          name: loki
      - emptyDir: {}
        name: storage
      - name: lokirule
        configMap:
          name: lokirule
```
After the configuration is complete, restart the service for it to take effect.
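One way to do that, assuming the StatefulSet defined earlier; since enable_api: true was set, the loaded rule groups can then be read back through the ruler API:

```bash
# Restart Loki so the updated ConfigMaps are picked up
kubectl -n loki rollout restart statefulset/loki

# List the rule groups the ruler has loaded
curl http://<node-ip>:32537/loki/api/v1/rules
```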
# Manually append an error line under the Tomcat pod's log directory to test alerting
```bash
cd /var/log/pods/default_mytomcat-5f97c868bd-f4d2b_1f4ea8f9-feb7-4a73-8621-678663053058/mytomcat
[root@node1 mytomcat]# ls
0.log
[root@node1 mytomcat]# echo "error: message test 2022-08-12-18-01 this is Error message" >> 0.log
```
# Wait for the alert email to arrive
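Instead of only waiting for the email, the pending and firing alerts can also be inspected on Alertmanager itself, using the address from the ruler configuration above; this assumes the command runs somewhere that can resolve the in-cluster service name:

```bash
curl -s http://alertmanager-service.ns-monitor:9093/api/v2/alerts
```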
# This concludes the log alerting test