Monitoring CPU, load, memory, traffic, connections, disk space, and disk I/O using only the Python standard library.
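
The script follows the Nagios plugin convention: each check prints one line of the form "STATUS: info |perfdata" and exits with code 0 (OK) or 2 (CRITICAL), so it can be called directly from Nagios or NRPE. The rate-based checks (cpu, traffic, diskio) keep the previous sample in a small JSON cache file under /tmp and only report "Buffer creation" on the first run. As a rough illustration (the check_linux.py file name and the numbers are made up for this example, not from the original post), a CPU check might look like:

    ./check_linux.py --mode=cpu --critical=90
    OK: CPU(s) average usage is: 12.34% |'cpu0'=12.34%;;;0;100 'total_cpu_avg'=12.34%;0:90;0:90;0;100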

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from optparse import OptionParser
import os, sys, pwd, time, re, json

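# Format a raw number with decimal (SI) suffixes: K, M, G, T. The triple-quoted
# block inside the function is the unused binary (1024-based) variant.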
def unit_convert(data):
    if data < 10**3:
        return str(round(float(data), 2))
    elif data < 10**6:
        return str(round(float(data) / 10**3, 2)) + 'K'
    elif data < 10**9:
        return str(round(float(data) / 10**6, 2)) + 'M'
    elif data < 10**12:
        return str(round(float(data) / 10**9, 2)) + 'G'
    else:
        return str(round(float(data) / 10**12, 2)) + 'T'
    '''
    if data < 2**10:
        return str(round(float(data), 2))
    elif data > 2**10 and data < 2**20:
        return str(round(float(data) / 2**10, 2)) + 'K'
    elif data > 2**20 and data < 2**30:
        return str(round(float(data) / 2**20, 2)) + 'M'
    elif data > 2**30 and data < 2**40:
        return str(round(float(data) / 2**30, 2)) + 'G'
    elif data > 2**40 and data < 2**50:
        return str(round(float(data) / 2**40, 2)) + 'T'
    '''

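# Print a Nagios-style result line ("STATUS: info |perfdata") and exit with the
# matching return code. Only OK (0) and CRITICAL (2) are used here; WARNING (1)
# and UNKNOWN (3) are not implemented.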
def nagios_handle(status, status_info, perf_data):
    STATUS = {'0': 'OK',  '2': 'CRITICAL'}
    if isinstance(perf_data, str):
        print STATUS[str(status)] + status_info + ' |' + perf_data
    else:
        print STATUS[str(status)] + status_info + ' |' + ' '.join(sorted(perf_data))
    sys.exit(status)

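# CPU usage is computed from two samples of /proc/stat: the per-CPU counters are
# cached in /tmp between runs, and usage = 100 - 100 * delta_idle / delta_total,
# where total is the sum of the sampled user, system, idle and iowait columns.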
class CPUCollector():
    ''' CPU Collector'''
    def get_stats(self):
        cpu = {}
        total_cpu = 0

        with open('/proc/stat', 'r') as f:
            for line in f:
                m = re.match('^(cpu\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)', line)
                if m:
                    name = m.group(1)
                    cpu[name] = {}
                    cpu[name]['idle'], cpu[name]['system'], cpu[name]['user'], cpu[name]['iowait'] = \
                        int(m.group(5)), int(m.group(4)), int(m.group(2)), int(m.group(6))
                    total_cpu += 1

        data = {
            'timestamp': int(time.mktime(time.localtime())),
            'total_cpu': total_cpu,
            'cpu': cpu,
        }

        return data

    def check_status(self, threshold):
        cache_file = '/tmp/cache_cpu_' + pwd.getpwuid(os.getuid())[0]
        perf_data = []
        avg_cpu = 0

        if not os.path.exists(cache_file):
            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)

            nagios_handle(0, ': Buffer creation...', '')

        else:
            with open(cache_file, 'r') as f:
                old = json.load(f)

            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)

            for name in new['cpu'].keys():
                delta_total = (new['cpu'][name]['idle'] + new['cpu'][name]['system'] + new['cpu'][name]['user'] + new['cpu'][name]['iowait']) - \
                    (old['cpu'][name]['idle'] + old['cpu'][name]['system'] + old['cpu'][name]['user'] + old['cpu'][name]['iowait'])
                delta_idle = new['cpu'][name]['idle'] - old['cpu'][name]['idle']
                cpu_usage = 100 - (100.0 * delta_idle / delta_total)

                perf_data.append('\'%s\'=%.2f%%;;;0;100' % (name, cpu_usage))
                avg_cpu += cpu_usage

            avg_cpu = avg_cpu / new['total_cpu']
            perf_data.append('\'total_cpu_avg\'=%.2f%%;0:%d;0:%d;0;100' % (avg_cpu, threshold, threshold))

            status = 0 if avg_cpu < threshold else 2
            status_info = ': CPU(s) average usage is: %.2f%%' % avg_cpu

            nagios_handle(status, status_info, perf_data)


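# Load average check: reads the 1/5/15 minute values from /proc/loadavg and goes
# CRITICAL when the 5-minute load exceeds (number of cores * 8).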
class LoadCollector():
    '''Load Collector'''
    def get_stats(self):
        with open('/proc/loadavg', 'r') as f:
            load1, load5, load15 = [float(i) for i in f.readline().split()[0:3]]

        return { 'load1': load1, 'load5': load5, 'load15': load15 }

    def get_cpu_core(self):
        core = []
        with open('/proc/cpuinfo', 'r') as f:
            for line in f:
                if line.startswith('core id'):
                    core.append(int(line.split(':')[1]))

        # fall back to 1 if /proc/cpuinfo has no 'core id' lines (e.g. some VMs)
        return len(set(core)) or 1

    def check_status(self):
        perf_data = []
        stats = self.get_stats()
        threshold = self.get_cpu_core() * 8

        status = 0 if stats['load5'] < threshold else 2
        status_info = ': Load average: %s, %s, %s' % (stats['load1'], stats['load5'], stats['load15'])
        for name, value in stats.items():
            perf_data.append('\'%s\'=%s;0:%d;0:%d;0;' %(name, value, threshold, threshold))

        nagios_handle(status, status_info, perf_data)

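# Memory check: parses /proc/meminfo (values are in kB, converted to bytes) and
# treats "used" as MemTotal - (MemFree + Buffers + Cached), i.e. the same
# "-/+ buffers/cache" figure that free(1) reports.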
class MemoryCollector():
    '''Memory Collector'''
    def get_stats(self):
        with open('/proc/meminfo', 'r') as f:
            for line in f:
                m = re.match('^(MemTotal|MemFree|Buffers|Cached):\s+(\d+) kB', line)
                if m:
                    if m.group(1) == 'MemTotal':
                        total = float(m.group(2)) * 1024
                    elif m.group(1) == 'MemFree':
                        free = float(m.group(2)) * 1024
                    elif m.group(1) == 'Buffers':
                        buffers = float(m.group(2)) * 1024
                    elif m.group(1) == 'Cached':
                        cached = float(m.group(2)) * 1024

        return total, free, buffers, cached

    def check_status(self, threshold):
        total, free, buffers, cached = self.get_stats()
        used = total - (free + buffers + cached) 
        percent = round(100.0 * used / total, 2)

        status = 0 if percent < threshold else 2
        status_info = ': Ram Used (+buffers/cache): %sB (%.2f%%), Buffer: %sB, Cached: %sB, Total: %sB' \
            % (unit_convert(used), percent, unit_convert(buffers), unit_convert(cached), unit_convert(total))
        perf_data = '\'cached\'=%d;;;0; \'buffer\'=%d;;;0; \'used\'=%d;0:%d;0:%d;0;%d' % \
            (cached, buffers, used, total*threshold/100, total*threshold/100, total)
        
        nagios_handle(status, status_info, perf_data)

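# Connection check: counts TCP sockets by state from /proc/net/tcp and
# /proc/net/tcp6 and goes CRITICAL when the total number of sockets exceeds the
# --critical threshold.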
class ConnectionsCollector():
    '''Connections Collector'''
    # hex connection state codes used in /proc/net/tcp and /proc/net/tcp6
    TCP_STATES = {
        '01': 'ESTABLISHED', '02': 'SYN_SENT',   '03': 'SYN_RECV',
        '04': 'FIN_WAIT1',   '05': 'FIN_WAIT2',  '06': 'TIME_WAIT',
        '07': 'CLOSE',       '08': 'CLOSE_WAIT', '09': 'LAST_ACK',
        '0A': 'LISTEN',      '0B': 'CLOSING',
    }

    def get_stats(self):
        data = dict.fromkeys(self.TCP_STATES.values(), 0)
        data['ERROR'] = 0

        for path in ('/proc/net/tcp', '/proc/net/tcp6'):
            if not os.path.exists(path):
                continue
            with open(path, 'r') as f:
                for line in f:
                    m = re.match('\s*(\d+):\s+(\S*):(\S*)\s+(\S*):(\S*)\s+(\S*)\s+', line)
                    if m:
                        state = m.group(6)
                        data[self.TCP_STATES.get(state, 'ERROR')] += 1

        return data

    def check_status(self, threshold):
        perf_data = []
        stats = self.get_stats()
        total = sum(stats.values())
        
        status = 0 if total < threshold else 2
        status_info = ': Total connections: %d' % total
        for name, value in stats.items():
            perf_data.append('\'%s\'=%d;;;0;' % (name.lower(), value))
        
        nagios_handle(status, status_info, perf_data)

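# Traffic check: /proc/net/dev exposes cumulative byte counters per interface;
# the per-second rate is the difference between the cached sample and the current
# one, multiplied by 8 to report bits per second.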
class TrafficCollector():
    '''Traffic Collector'''
    def list_interface(self):
        with open('/proc/net/dev', 'r') as f:
            for line in f:
                m = re.match('\s*(\S+):\s*(.*)', line)
                if m:
                    print m.group(1)

    def get_stats(self):
        nic = {}
        with open('/proc/net/dev', 'r') as f:
            for line in f:
                m = re.match('\s*(\S+):\s*(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+', line)
                if m:
                    name = m.group(1)
                    nic[name] = {}
                    nic[name]['in'], nic[name]['out'] = int(m.group(2)), int(m.group(10))

        data = {
            'timestamp': int(time.mktime(time.localtime())),
            'nic': nic,
        }

        return data

    def check_status(self, interface, speed, percent):
        max_speed = speed * 10**6
        threshold = percent / 100.0 * max_speed

        cache_file = '/tmp/cache_traffic_' + pwd.getpwuid(os.getuid())[0]

        if not os.path.exists(cache_file):
            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)

            nagios_handle(0, ': Buffer creation...', '')

        else:
            with open(cache_file, 'r') as f:
                old = json.load(f)

            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)

            if new['nic'][interface]['in'] > old['nic'][interface]['in']:
                traffic_in = (new['nic'][interface]['in'] - old['nic'][interface]['in']) / float(new['timestamp'] - old['timestamp']) * 8
            else:
                traffic_in = (old['nic'][interface]['in'] - new['nic'][interface]['in']) / float(new['timestamp'] - old['timestamp']) * 8

            if new['nic'][interface]['out'] > old['nic'][interface]['out']:
                traffic_out = (new['nic'][interface]['out'] - old['nic'][interface]['out']) / float(new['timestamp'] - old['timestamp']) * 8
            else:
                traffic_out = (old['nic'][interface]['out'] - new['nic'][interface]['out']) / float(new['timestamp'] - old['timestamp']) * 8

            status = 0 if (traffic_in < threshold) and (traffic_out < threshold) else 2

            status_info = ': Interface %s Traffic In : %sb/s (%.2f%%), Out : %sb/s (%.2f%%)' \
                % (interface, unit_convert(traffic_in), traffic_in / max_speed * 100, unit_convert(traffic_out), traffic_out / max_speed * 100)

            perf_data = '\'traffic_in\'=%.2fb/s;;0:%.1f;0;%.1f \'traffic_out\'=%.2fb/s;;0:%.1f;0;%.1f' \
                % (traffic_in, threshold, max_speed, traffic_out, threshold, max_speed)

            nagios_handle(status, status_info, perf_data)

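# Disk space check: uses os.statvfs() on the mount point; the --critical
# threshold is the minimum free space in (decimal) GB.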
class DiskSpaceCollector():
    '''Disk Space Collector'''
    def list_mountpoint(self):
        with open('/proc/mounts', 'r') as f:
            for line in f:
                m = re.match('/dev/(sd[a-z][0-9]|md\S+)\s+(/\S*)\s+(\S+)\s(.*)', line)
                if m:
                    print m.group(1), m.group(2), m.group(3)

    def get_stats(self, mountpoint):
        disk = os.statvfs(mountpoint)
        free = (disk.f_bavail * disk.f_frsize)
        total = (disk.f_blocks * disk.f_frsize)
        used = (disk.f_blocks - disk.f_bfree) * disk.f_frsize
        percent = 100.0 * used / total
        return total, used, free, percent

    def check_status(self, mountpoint, threshold):
        total, used, free, percent = self.get_stats(mountpoint)
        threshold = threshold * 10**9

        status = 0 if free > threshold else 2
        status_info = ': Storage \'%s\' Total: %sB Used: %sB (%.2f%%) Free: %sB (%.2f%%)' \
            % (mountpoint, unit_convert(total), unit_convert(used), percent, unit_convert(free), 100 - percent)
        perf_data = '\'free\'=%dB;@0:%d;@0:%d;0;%d' % (free, threshold, threshold, total)
        
        nagios_handle(status, status_info, perf_data)

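# Disk I/O check: /proc/diskstats counts sectors (512 bytes each) read/written
# and the milliseconds spent on I/O; %util is derived iostat-style from the
# change in the "time spent doing I/Os" field relative to the elapsed time.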
class DiskIOCollector():
    '''Disk IO Collector'''
    def list_partition(self):
        with open('/proc/diskstats', 'r') as f:
            for line in f:
                m = re.match('\s*\d+\s+\d+\s+(sd[a-z]|md\d+)\s+(.*)', line)
                if m:
                    print m.group(1)

    def get_stats(self):
        cpu = {}
        cpu['total_cpu'] = 0
        disk = {}

        with open('/proc/stat', 'r') as f:
            for line in f:
                m = re.match('^cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)', line)
                n = re.match('cpu\d+', line)
                if m:
                    cpu['idle'], cpu['system'], cpu['user'], cpu['iowait'] = \
                        int(m.group(4)), int(m.group(3)), int(m.group(1)), int(m.group(5))
                if n:
                    cpu['total_cpu'] += 1

        with open('/proc/diskstats', 'r') as f:
            for line in f:
                m = re.match('^\s*(\d+)\s+(\d+)\s+(sd[a-z]|md\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+', line)
                if m:
                    name = m.group(3)
                    disk[name] = {}
                    disk[name]['read_sector'], disk[name]['write_sector'], disk[name]['read_ms'], disk[name]['write_ms'], disk[name]['ms_ticks'] = \
                        int(m.group(6)), int(m.group(10)), int(m.group(7)), int(m.group(11)), int(m.group(13))

        data = {
            'timestamp': int(time.mktime(time.localtime())),
            'cpu': cpu,
            'disk': disk
        }

        return data

    def check_status(self, partition, threshold):
        bytes_per_sector = 512.0
        interrupt_frequency = 1000.0

        cache_file = '/tmp/cache_diskio_' + pwd.getpwuid(os.getuid())[0]

        if not os.path.exists(cache_file):
            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)

            nagios_handle(0, ': Buffer creation...', '')

        else:
            with open(cache_file, 'r') as f:
                old = json.load(f)

            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)
        
            read_bytes = (new['disk'][partition]['read_sector'] - old['disk'][partition]['read_sector']) * bytes_per_sector / (new['timestamp'] - old['timestamp'])
            write_bytes = (new['disk'][partition]['write_sector'] - old['disk'][partition]['write_sector']) * bytes_per_sector / (new['timestamp'] - old['timestamp'])
            read_ms = new['disk'][partition]['read_ms'] - old['disk'][partition]['read_ms']
            write_ms = new['disk'][partition]['write_ms'] - old['disk'][partition]['write_ms']

            delta_ms = ((new['cpu']['idle'] + new['cpu']['system'] + new['cpu']['user'] + new['cpu']['iowait']) - \
                (old['cpu']['idle'] + old['cpu']['system'] + old['cpu']['user'] + old['cpu']['iowait'])) * \
                interrupt_frequency / old['cpu']['total_cpu'] / 100

            utils = 100 * (new['disk'][partition]['ms_ticks'] - old['disk'][partition]['ms_ticks']) / delta_ms
            utils = 100 if utils > 100 else utils


            status = 0 if utils < threshold else 2
            perf_data = '\'readio\'=%sB/s;;;0; \'writeio\'=%sB/s;;;0; \'readtime\'=%dms;;;0; \'writetime\'=%dms;;;0; \'utils\'=%.2f%%;;;0;100' \
                % (read_bytes, write_bytes, read_ms, write_ms, utils)
            status_info = ': Partition %s Read I/O: %sB/s, Write I/O: %sB/s, Write Time: %dms, Read Time: %dms, %%Utils: %.2f%%' \
                % (partition, unit_convert(read_bytes), unit_convert(write_bytes), read_ms, write_ms, utils)
        
            nagios_handle(status, status_info, perf_data)


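# Command-line entry point: --mode selects the collector; thresholds are passed
# with --critical (percent, count, or GB depending on the mode).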
if __name__ == '__main__':
    usage = ''' %prog [options] arg1 arg2

    example: 
    . %prog --mode=cpu --critical=98 (unit: %)
    . %prog --mode=memory --critical=98 (unit: %)
    . %prog --mode=load
    . %prog --mode=connections --critical=30000 (unit: int)
    . %prog --mode=traffic --interface=em1 --speed=1000 --critical=90 (unit: %)
    . %prog --mode=diskspace --name=/opt --critical=5 (unit: GB)
    . %prog --mode=diskio --name=sda --critical=95 (unit: %)

    . %prog --mode=traffic --list-interface
    . %prog --mode=diskspace --list-partition
    . %prog --mode=diskio --list-disk
    '''
    parser = OptionParser(usage)

    parser.add_option('--mode', action='store', type='string', dest='mode', help='check mode')
    parser.add_option('--name', action='store', type='string', dest='name', help='check item name')
    parser.add_option('--critical', action='store', type='int', dest='critical', help='Threshold critical')  
    parser.add_option('--speed', action='store', type='int', dest='speed', help='interface max speed, unit is Mb')
    parser.add_option('--list-interface', action='store_true', dest='interfaces', help='list all NIC interfaces')  
    parser.add_option('--list-partition', action='store_true', dest='partitions', help='list all partitions')
    parser.add_option('--list-disk', action='store_true', dest='disks', help='list all disks')

    (options, args) = parser.parse_args() 

    if options.mode == 'cpu' and options.critical in range(1, 100+1):
        cpu = CPUCollector()
        cpu.check_status(options.critical)

    if options.mode == 'load':
        load = LoadCollector()
        load.check_status()

    if options.mode == 'memory' and options.critical in range(1, 100+1):
        memory = MemoryCollector()
        memory.check_status(options.critical)

    if options.mode == 'connections' and options.critical:
        connections = ConnectionsCollector()
        connections.check_status(options.critical)

    if options.mode == 'traffic' and options.name and options.speed and options.critical:
        traffic = TrafficCollector()
        traffic.check_status(options.name, options.speed, options.critical)
    
    if options.interfaces:
        traffic = TrafficCollector()
        traffic.list_interface()

    if options.mode == 'diskspace' and options.name and options.critical:
        diskspace = DiskSpaceCollector()
        diskspace.check_status(options.name, options.critical)

    if options.partitions:
        diskspace = DiskSpaceCollector()
        diskspace.list_mountpoint()

    if options.mode == 'diskio' and options.name and options.critical:
        diskio = DiskIOCollector()
        diskio.check_status(options.name, options.critical)

    if options.disks:
        diskio = DiskIOCollector()
        diskio.list_partition()
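
To hook the script into Nagios it only needs to be executable and registered as a command. A minimal NRPE sketch (the check_linux.py name and the paths are assumptions for illustration, not part of the original post):

    command[check_cpu]=/usr/bin/python /usr/local/nagios/libexec/check_linux.py --mode=cpu --critical=90
    command[check_load]=/usr/bin/python /usr/local/nagios/libexec/check_linux.py --mode=load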