Python 抓取 Prometheus 容器数据,并实现监控报警

import json
import math
import pytz
import requests
from datetime import datetime

class Monitoring(object):
    """Find containers over their CPU/memory threshold and alert via webhook.

    ``get_cpu`` and ``get_mem`` run an instant query against the Prometheus
    HTTP API for one namespace and return the pods whose usage, expressed as
    a percentage of the configured quota/limit, exceeds the threshold.
    ``send_alert`` posts one such entry as a markdown message to the webhook.
    """

    def __init__(self):
        # Namespaces the script entry point iterates over.
        self.namespace_list = [
            "apollo", "bhpc-admin-nginx", "bluehelix", "broker",
            "cert-manager", "chainnode", "clear", "elastic-system",
            "exchange", "gateway", "kube-node-lease", "kube-public",
            "kube-system", "log", "wallet", "rc",
        ]
        # Prometheus instant-query endpoint.
        self.api_url = 'https://prometheus.doex.io/api/v1/query'
        # Alert thresholds, in percent of the container's quota/limit.
        self.cpu_threshold = 80
        self.mem_threshold = 80
        # Webhook that receives the markdown alerts.
        # NOTE(review): the key is embedded in source; consider loading it
        # from configuration instead.
        self.webhook_url = r"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxxxaf216f7"
        # Seconds to wait on each HTTP call; requests applies no timeout by
        # default, so a stalled endpoint would otherwise block the whole run.
        self.request_timeout = 10

    def get_cpu(self, namespace):
        """Return pods in ``namespace`` whose CPU usage exceeds the threshold.

        Args:
            namespace: Namespace name; it is inserted into a ``=~`` (regex)
                label matcher of the query.

        Returns:
            A list of ``{"container": <pod name>, "cpu_usage": <percent>}``
            dicts. The list is empty when nothing is over the threshold or
            when the query fails.
        """
        query = f'sum(irate(container_cpu_usage_seconds_total{{container !="",container!="POD",namespace=~"{namespace}"}}[2m])) by (container, pod) / (sum(container_spec_cpu_quota{{container !="",container!="POD",namespace=~"{namespace}"}}/100000) by (container, pod)) * 100'
        return self._query_over_threshold(query, self.cpu_threshold, "cpu_usage")

    def get_mem(self, namespace):
        """Return pods in ``namespace`` whose memory usage exceeds the threshold.

        Args:
            namespace: Namespace name; it is inserted into a ``=~`` (regex)
                label matcher of the query.

        Returns:
            A list of ``{"container": <pod name>, "mem_usage": <percent>}``
            dicts. The list is empty when nothing is over the threshold or
            when the query fails.
        """
        query = f'sum (container_memory_working_set_bytes{{container !="",container!="POD",namespace=~"{namespace}"}}) by (container, pod)/ sum(container_spec_memory_limit_bytes{{container !="",container!="POD",namespace=~"{namespace}"}}) by (container, pod) * 100'
        return self._query_over_threshold(query, self.mem_threshold, "mem_usage")

    def send_alert(self, container):
        """Post a markdown alert for one over-threshold entry to the webhook.

        Args:
            container: A dict as returned by ``get_cpu`` or ``get_mem``. An
                entry carrying ``cpu_usage`` produces the CPU message, any
                other entry produces the memory message.

        Request failures and non-JSON replies are printed instead of raised,
        so one failed delivery does not stop the remaining alerts.
        """
        # Alert times are always reported in Asia/Shanghai.
        now = datetime.now(pytz.timezone('Asia/Shanghai'))
        markdown_text = self._format_alert(container, now.strftime('%Y-%m-%d %H:%M:%S'))

        headers = {
            'Content-Type': 'application/json'
        }
        data = {
            'msgtype': 'markdown',
            'markdown': {
                'content': markdown_text
            }
        }
        try:
            res = requests.post(url=self.webhook_url, headers=headers, json=data,
                                timeout=self.request_timeout)
            errcode = res.json().get("errcode")
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a reply body that is not valid JSON.
            print('请求失败:', exc)
            return
        if errcode == 0:
            print("发功成功")
        else:
            print(res.text)

    def _query_over_threshold(self, query, threshold, usage_key):
        """Run one instant query and return the series above ``threshold``."""
        try:
            response = requests.get(url=self.api_url, params={'query': query},
                                    timeout=self.request_timeout)
        except requests.RequestException as exc:
            print('请求失败:', exc)
            return []
        if response.status_code != 200:
            print('请求失败:', response.status_code)
            return []
        try:
            result = response.json()['data']['result']
        except (ValueError, KeyError, TypeError) as exc:
            # Body is not JSON, or does not have the expected data/result shape.
            print('请求失败:', exc)
            return []
        if not result:
            print('找不到指定的容器或者没有配置资源limit')
            return []
        return self._over_threshold(result, threshold, usage_key)

    @staticmethod
    def _over_threshold(result, threshold, usage_key):
        """Keep the series of a query result whose value is finite and above ``threshold``."""
        alerts = []
        for series in result:
            try:
                usage = float(series['value'][1])
            except (KeyError, IndexError, TypeError, ValueError):
                # Malformed sample: nothing to compare, skip it.
                continue
            # Inf/NaN appear when the divisor (quota/limit) is zero; they are
            # not real usage figures and must not raise an alert.
            if not math.isfinite(usage) or usage <= threshold:
                continue
            metric = series.get('metric') or {}
            # Fall back rather than drop an over-threshold series that lacks
            # the pod label.
            name = metric.get('pod') or metric.get('container') or 'unknown'
            alerts.append({"container": name, usage_key: usage})
        return alerts

    def _format_alert(self, container, timestamp):
        """Render the markdown alert body for one entry at ``timestamp``."""
        name = container.get("container")
        if container.get("cpu_usage") is not None:
            label, subject = "cpu", "CPU"
            usage, threshold = container.get("cpu_usage"), self.cpu_threshold
        else:
            label, subject = "内存", "内存"
            usage, threshold = container.get("mem_usage"), self.mem_threshold
        return f'''
        # io环境报警通知:
        **容器名称:** {name}
        **当前{label}:** {usage}%
        **报警级别:** 警告
        **报警阈值:** {threshold}%
        **报警时间:** {timestamp}
        **问题描述:**
        {name} {subject} 使用率超过阈值。
        '''

if __name__ == '__main__':
    # Scan every configured namespace; both queries run before any alert is
    # sent, then CPU alerts go out ahead of memory alerts.
    monitoring = Monitoring()
    for namespace in monitoring.namespace_list:
        cpu_alerts = monitoring.get_cpu(namespace)
        mem_alerts = monitoring.get_mem(namespace)
        for alert in cpu_alerts + mem_alerts:
            print(alert)
            monitoring.send_alert(alert)

 




报警示例图

 

posted @ 2023-06-15 11:27  力王7314  阅读(259)  评论(0)  编辑  收藏  举报