Python 抓取 Prometheus 容器数据,并实现监控报警
import json
import math
from datetime import datetime

import pytz
import requests

# Seconds to wait for Prometheus / the webhook before giving up, so that a
# stalled endpoint cannot hang the whole monitoring run.
REQUEST_TIMEOUT = 10


class Monitoring(object):
    """Poll Prometheus for container CPU/memory usage and send webhook alerts.

    Usage is expressed as a percentage of the container's configured limit
    (the queries divide by ``container_spec_cpu_quota`` and
    ``container_spec_memory_limit_bytes``).  Containers whose usage exceeds
    the configured threshold are reported through a markdown webhook message.
    """

    def __init__(self):
        # Kubernetes namespaces that are scanned on every run.
        self.namespace_list = [
            "apollo",
            "bhpc-admin-nginx",
            "bluehelix",
            "broker",
            "cert-manager",
            "chainnode",
            "clear",
            "elastic-system",
            "exchange",
            "gateway",
            "kube-node-lease",
            "kube-public",
            "kube-system",
            "log",
            "wallet",
            "rc",
        ]
        self.api_url = 'https://prometheus.doex.io/api/v1/query'
        # Alert thresholds, in percent of the container limit.
        self.cpu_threshold = 80
        self.mem_threshold = 80

    @staticmethod
    def _selector(namespace):
        """Return the PromQL label selector shared by all queries."""
        return f'{{container !="",container!="POD",namespace=~"{namespace}"}}'

    @staticmethod
    def _filter_over_threshold(result, threshold, usage_key):
        """Return ``{"container": pod, usage_key: usage}`` for hot containers.

        Samples that lack a ``pod`` label or a parsable value are skipped
        instead of aborting the run.  Non-finite values (a division by a
        zero or missing limit yields ``+Inf``/``NaN``) are never reported.
        """
        alerts = []
        for sample in result:
            try:
                pod_name = sample['metric']['pod']
                usage = float(sample['value'][1])
            except (KeyError, IndexError, TypeError, ValueError):
                continue
            if math.isfinite(usage) and usage > threshold:
                alerts.append({"container": pod_name, usage_key: usage})
        return alerts

    def _query_usage(self, query, threshold, usage_key):
        """Run an instant query and return the containers above *threshold*.

        Any transport error, non-200 status or malformed payload is printed
        and results in an empty list, so one failing namespace does not stop
        the remaining ones from being checked.
        """
        try:
            response = requests.get(
                url=self.api_url,
                params={'query': query},
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            print('请求失败:', exc)
            return []

        if response.status_code != 200:
            print('请求失败:', response.status_code)
            return []

        try:
            result = response.json()['data']['result']
        except (ValueError, KeyError, TypeError) as exc:
            print('请求失败:', exc)
            return []

        if not result:
            print('找不到指定的容器或者没有配置资源limit')
            return []

        return self._filter_over_threshold(result, threshold, usage_key)

    def get_cpu(self, namespace):
        """Return containers in *namespace* whose CPU usage exceeds the threshold.

        Each item is ``{"container": <pod name>, "cpu_usage": <percent>}``.
        """
        selector = self._selector(namespace)
        query = (
            f'sum(irate(container_cpu_usage_seconds_total{selector}[2m]))'
            f' by (container, pod) / (sum(container_spec_cpu_quota{selector}'
            f'/100000) by (container, pod)) * 100'
        )
        return self._query_usage(query, self.cpu_threshold, "cpu_usage")

    def get_mem(self, namespace):
        """Return containers in *namespace* whose memory usage exceeds the threshold.

        Each item is ``{"container": <pod name>, "mem_usage": <percent>}``.
        """
        selector = self._selector(namespace)
        query = (
            f'sum (container_memory_working_set_bytes{selector})'
            f' by (container, pod)/ sum(container_spec_memory_limit_bytes'
            f'{selector}) by (container, pod) * 100'
        )
        return self._query_usage(query, self.mem_threshold, "mem_usage")

    def _build_markdown(self, container, current_time):
        """Render the markdown alert body for a CPU or memory alert dict."""
        name = container.get("container")
        # Test key presence rather than truthiness so the alert type is
        # decided by the dict's shape, not by the numeric value.
        if "cpu_usage" in container:
            current_label, subject = "cpu", "CPU"
            usage, threshold = container.get("cpu_usage"), self.cpu_threshold
        else:
            current_label, subject = "内存", "内存"
            usage, threshold = container.get("mem_usage"), self.mem_threshold

        lines = [
            "# io环境报警通知:",
            f"**容器名称:** {name}",
            f"**当前{current_label}:** {usage}%",
            "**报警级别:** 警告",
            f"**报警阈值:** {threshold}%",
            f"**报警时间:** {current_time.strftime('%Y-%m-%d %H:%M:%S')}",
            f"**问题描述:** {name} {subject} 使用率超过阈值。",
        ]
        return "\n".join(lines)

    def send_alert(self, container):
        """Post a markdown alert for *container* to the webhook.

        *container* is one of the dicts returned by ``get_cpu``/``get_mem``.
        Delivery failures are printed, never raised.
        """
        current_time = datetime.now(pytz.timezone('Asia/Shanghai'))
        markdown_text = self._build_markdown(container, current_time)

        # NOTE(review): the webhook key is embedded in source; consider
        # loading it from configuration or the environment instead.
        url = r"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxxxaf216f7"
        headers = {'Content-Type': 'application/json'}
        data = {
            'msgtype': 'markdown',
            'markdown': {'content': markdown_text},
        }

        try:
            res = requests.post(
                url=url, headers=headers, json=data, timeout=REQUEST_TIMEOUT
            )
        except requests.RequestException as exc:
            print('请求失败:', exc)
            return

        try:
            errcode = res.json().get("errcode")
        except (ValueError, AttributeError):
            # Body was not a JSON object; show it verbatim for diagnosis.
            print(res.text)
            return

        if errcode == 0:
            print("发功成功")
        else:
            print(res.text)


def main():
    """Check every configured namespace and alert on each hot container."""
    monitoring = Monitoring()
    for namespace in monitoring.namespace_list:
        container_cpu_list = monitoring.get_cpu(namespace)
        container_mem_list = monitoring.get_mem(namespace)

        # CPU alerts
        for container_cpu in container_cpu_list:
            print(container_cpu)
            monitoring.send_alert(container_cpu)

        # Memory alerts
        for container_mem in container_mem_list:
            print(container_mem)
            monitoring.send_alert(container_mem)


if __name__ == '__main__':
    main()
报警示例图
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构