Python 抓取 Prometheus 容器数据，并实现监控报警
import json
import math
from datetime import datetime

import pytz
import requests


class Monitoring(object):
    """Poll Prometheus for per-pod CPU/memory usage and push alerts to a webhook.

    Usage is expressed as a percentage of the configured resource limit.
    Series whose usage exceeds the threshold are reported through a WeCom
    (qyapi.weixin.qq.com) group-bot webhook as a markdown message.
    """

    # Seconds to wait on any HTTP call. Without a timeout a single stalled
    # connection would block the whole monitoring run forever.
    REQUEST_TIMEOUT = 10

    # Group-bot webhook that receives the alerts.
    # NOTE(review): the key is embedded in source; consider loading it from
    # configuration or the environment instead.
    WEBHOOK_URL = r"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxxxaf216f7"

    def __init__(self):
        # Kubernetes namespaces to scan, one Prometheus query pair per namespace.
        self.namespace_list = [
            "apollo",
            "bhpc-admin-nginx",
            "bluehelix",
            "broker",
            "cert-manager",
            "chainnode",
            "clear",
            "elastic-system",
            "exchange",
            "gateway",
            "kube-node-lease",
            "kube-public",
            "kube-system",
            "log",
            "wallet",
            "rc",
        ]
        # Prometheus instant-query endpoint.
        self.api_url = 'https://prometheus.doex.io/api/v1/query'
        # Alert thresholds, in percent of the resource limit.
        self.cpu_threshold = 80
        self.mem_threshold = 80

    def _query_usage(self, query, threshold, usage_key):
        """Run an instant query and return the series whose value exceeds *threshold*.

        Args:
            query: PromQL expression yielding one percentage per (container, pod).
            threshold: Percentage above which a series is reported.
            usage_key: Key under which the usage value is stored in each
                returned dict ("cpu_usage" or "mem_usage").

        Returns:
            A list of ``{"container": <pod name>, usage_key: <float>}`` dicts.
            The list is empty when the request fails, the response is
            malformed, or nothing is over the threshold.
        """
        try:
            response = requests.get(
                url=self.api_url,
                params={'query': query},
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            # A network failure for one namespace must not abort the others.
            print('请求失败:', exc)
            return []

        if response.status_code != 200:
            print('请求失败:', response.status_code)
            return []

        try:
            result = response.json()['data']['result']
        except (ValueError, KeyError, TypeError) as exc:
            # Body was not JSON or did not have the expected data/result shape.
            print('请求失败:', exc)
            return []

        if not result:
            print('找不到指定的容器或者没有配置资源limit')
            return []

        over_threshold = []
        for series in result:
            try:
                # The alert's "container" field carries the pod label.
                pod_name = series['metric']['pod']
                usage = float(series['value'][1])
            except (KeyError, IndexError, TypeError, ValueError):
                # Skip a malformed series rather than losing the whole batch.
                continue
            # A missing/zero limit makes the division yield +Inf or NaN;
            # neither is a meaningful percentage, so ignore them.
            if math.isfinite(usage) and usage > threshold:
                over_threshold.append({"container": pod_name, usage_key: usage})
        return over_threshold

    def get_cpu(self, namespace):
        """Return pods in *namespace* whose CPU usage exceeds ``cpu_threshold``.

        Args:
            namespace: Namespace name; interpolated into a ``=~`` regex matcher.

        Returns:
            A list of ``{"container": <pod name>, "cpu_usage": <float percent>}``.
        """
        query = f'sum(irate(container_cpu_usage_seconds_total{{container !="",container!="POD",namespace=~"{namespace}"}}[2m])) by (container, pod) / (sum(container_spec_cpu_quota{{container !="",container!="POD",namespace=~"{namespace}"}}/100000) by (container, pod)) * 100'
        return self._query_usage(query, self.cpu_threshold, "cpu_usage")

    def get_mem(self, namespace):
        """Return pods in *namespace* whose memory usage exceeds ``mem_threshold``.

        Args:
            namespace: Namespace name; interpolated into a ``=~`` regex matcher.

        Returns:
            A list of ``{"container": <pod name>, "mem_usage": <float percent>}``.
        """
        query = f'sum (container_memory_working_set_bytes{{container !="",container!="POD",namespace=~"{namespace}"}}) by (container, pod)/ sum(container_spec_memory_limit_bytes{{container !="",container!="POD",namespace=~"{namespace}"}}) by (container, pod) * 100'
        return self._query_usage(query, self.mem_threshold, "mem_usage")

    def send_alert(self, container):
        """Post a markdown alert for one over-threshold entry to the webhook.

        Args:
            container: A dict as produced by ``get_cpu`` / ``get_mem``. An
                entry with a "cpu_usage" value is reported as a CPU alert;
                anything else is reported as a memory alert.

        Returns:
            None. The outcome is printed.
        """
        # Ask for the time directly in the target zone instead of converting a
        # naive local time, so the result does not depend on the host's TZ.
        current_time = datetime.now(pytz.timezone('Asia/Shanghai'))
        timestamp = current_time.strftime('%Y-%m-%d %H:%M:%S')
        name = container.get("container")

        # Compare against None so a legitimate 0.0 reading is still treated as CPU.
        if container.get("cpu_usage") is not None:
            usage_line = f'**当前cpu:** {container.get("cpu_usage")}%'
            threshold = self.cpu_threshold
            description = f'{name} CPU 使用率超过阈值。'
        else:
            usage_line = f'**当前内存:** {container.get("mem_usage")}%'
            threshold = self.mem_threshold
            description = f'{name} 内存 使用率超过阈值。'

        # One field per line so the heading and each item render separately.
        markdown_text = "\n".join([
            '# io环境报警通知:',
            f'**容器名称:** {name}',
            usage_line,
            '**报警级别:** 警告',
            f'**报警阈值:** {threshold}%',
            f'**报警时间:** {timestamp}',
            f'**问题描述:** {description}',
        ])

        headers = {'Content-Type': 'application/json'}
        data = {
            'msgtype': 'markdown',
            'markdown': {'content': markdown_text},
        }

        try:
            res = requests.post(
                url=self.WEBHOOK_URL,
                headers=headers,
                json=data,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            # A failed alert must not stop the remaining alerts from being sent.
            print('请求失败:', exc)
            return

        try:
            errcode = res.json().get("errcode")
        except (ValueError, AttributeError):
            # Non-JSON or non-object body: fall through and print the raw text.
            errcode = None

        if errcode == 0:
            print("发功成功")
        else:
            print(res.text)


def main():
    """Scan every configured namespace and send an alert per offending pod."""
    monitoring = Monitoring()
    for namespace in monitoring.namespace_list:
        container_cpu_list = monitoring.get_cpu(namespace)
        container_mem_list = monitoring.get_mem(namespace)
        # CPU monitoring
        for container_cpu in container_cpu_list:
            print(container_cpu)
            monitoring.send_alert(container_cpu)
        # Memory monitoring
        for container_mem in container_mem_list:
            print(container_mem)
            monitoring.send_alert(container_mem)


if __name__ == '__main__':
    main()
报警示例图