只使用 Python 标准库,监控 cpu, load, memory, traffic, connection, diskspace, diskio
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from optparse import OptionParser
import os, sys, pwd, time, re, json
def unit_convert(data):
    """Format a numeric value as a short human-readable string with SI suffixes.

    Uses decimal (powers of 10) units: K, M, G, T, P. Values below 1000 are
    returned without a suffix. The original version used strict '>' on both
    sides of each range, so values exactly at a boundary (e.g. 10**3) and
    values >= 10**15 fell through and returned None; ranges are now closed.
    """
    value = float(data)
    for limit, divisor, suffix in (
        (10**3, 1, ''),
        (10**6, 10**3, 'K'),
        (10**9, 10**6, 'M'),
        (10**12, 10**9, 'G'),
        (10**15, 10**12, 'T'),
    ):
        if value < limit:
            return str(round(value / divisor, 2)) + suffix
    # Anything >= 10**15 previously returned None; report petas instead.
    return str(round(value / 10**15, 2)) + 'P'
def nagios_handle(status, status_info, perf_data):
    """Emit a Nagios plugin status line with perfdata and exit with `status`.

    status: Nagios exit code (0 = OK, 2 = CRITICAL are the codes used here).
    status_info: human-readable summary appended to the status word.
    perf_data: either a preformatted perfdata string, or an iterable of
        perfdata tokens which are sorted and space-joined.

    Never returns: always calls sys.exit(status).
    """
    STATUS = {'0': 'OK', '2': 'CRITICAL'}
    # print() works identically in Python 2 and 3 for a single argument;
    # the original used Python-2-only print statements.
    if isinstance(perf_data, str):
        perf = perf_data
    else:
        perf = ' '.join(sorted(perf_data))
    print(STATUS[str(status)] + status_info + ' |' + perf)
    sys.exit(status)
class CPUCollector():
    '''CPU collector: per-core and average usage derived from /proc/stat deltas.

    The previous sample is cached as JSON in a per-user file under /tmp; the
    first run only seeds the cache and reports OK.
    '''
    def get_stats(self):
        """Sample /proc/stat and return per-core jiffy counters and core count.

        /proc/stat per-core fields are: cpuN user nice system idle iowait ...
        ('nice' is intentionally ignored, matching the original behavior).
        """
        cpu = {}
        total_cpu = 0
        with open('/proc/stat', 'r') as f:
            for line in f:
                m = re.match(r'^(cpu\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)', line)
                if m:
                    name = m.group(1)
                    cpu[name] = {
                        'user': int(m.group(2)),
                        'system': int(m.group(4)),
                        'idle': int(m.group(5)),
                        'iowait': int(m.group(6)),
                    }
                    total_cpu += 1
        return {
            'timestamp': int(time.mktime(time.localtime())),
            'total_cpu': total_cpu,
            'cpu': cpu,
        }

    def check_status(self, threshold):
        """Report per-core and average CPU usage; CRITICAL when the average
        reaches `threshold` percent. Never returns (nagios_handle exits)."""
        cache_file = '/tmp/cache_cpu_' + pwd.getpwuid(os.getuid())[0]
        perf_data = []
        avg_cpu = 0
        if not os.path.exists(cache_file):
            # First run: seed the cache, no usable delta yet.
            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)
            nagios_handle(0, ': Buffer creation...', '')
        else:
            with open(cache_file, 'r') as f:
                old = json.load(f)
            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)
            for name in new['cpu'].keys():
                if name not in old['cpu']:
                    continue  # CPU appeared since the last sample; no delta available
                new_c, old_c = new['cpu'][name], old['cpu'][name]
                fields = ('idle', 'system', 'user', 'iowait')
                delta_total = sum(new_c[k] for k in fields) - sum(old_c[k] for k in fields)
                delta_idle = new_c['idle'] - old_c['idle']
                # Guard against a zero-length interval (two runs in the same jiffy).
                cpu_usage = 100 - (100.0 * delta_idle / delta_total) if delta_total else 0.0
                perf_data.append('\'%s\'=%.2f%%;;;0;100' % (name, cpu_usage))
                avg_cpu += cpu_usage
            avg_cpu = avg_cpu / (new['total_cpu'] or 1)
            perf_data.append('\'total_cpu_avg\'=%.2f%%;0:%d;0:%d;0;100' % (avg_cpu, threshold, threshold))
            status = 0 if avg_cpu < threshold else 2
            status_info = ': CPU(s) average usage is: %.2f%%' % avg_cpu
            nagios_handle(status, status_info, perf_data)
            # NOTE: the original re-dumped the cache here, after nagios_handle;
            # nagios_handle always sys.exit()s, so that code was unreachable
            # and has been removed (the cache is already refreshed above).
class LoadCollector():
    '''Load collector: 1/5/15-minute load averages from /proc/loadavg.'''

    def get_stats(self):
        """Return the three load averages as floats keyed load1/load5/load15.

        Returned as floats (the original kept them as strings, which made the
        threshold comparison in check_status meaningless under Python 2 and a
        TypeError under Python 3).
        """
        with open('/proc/loadavg', 'r') as f:
            load1, load5, load15 = f.readline().split()[0:3]
        return {'load1': float(load1), 'load5': float(load5), 'load15': float(load15)}

    def get_cpu_core(self):
        """Count distinct physical 'core id' values in /proc/cpuinfo.

        NOTE(review): unique core ids under-count on multi-socket hosts, since
        sockets reuse the same core ids — confirm target hardware. Falls back
        to the logical-CPU count when 'core id' is absent (e.g. some VMs), so
        the threshold never collapses to zero. The original also leaked the
        file handle; a with-block closes it now.
        """
        cores = set()
        processors = 0
        with open('/proc/cpuinfo', 'r') as f:
            for line in f:
                if line.startswith('core id'):
                    cores.add(int(line.split(':')[1]))
                elif line.startswith('processor'):
                    processors += 1
        return len(cores) or processors

    def check_status(self):
        """OK while the 5-minute load is below 8x the core count, else CRITICAL.

        Fixes the original comparison, which was inverted ('> threshold' gave
        status 0) and compared a string against an int.
        """
        perf_data = []
        stats = self.get_stats()
        threshold = self.get_cpu_core() * 8
        status = 0 if stats['load5'] < threshold else 2
        status_info = ': Load average: %s, %s, %s' % (stats['load1'], stats['load5'], stats['load15'])
        for name, value in stats.items():
            perf_data.append('\'%s\'=%s;0:%d;0:%d;0;' % (name, value, threshold, threshold))
        nagios_handle(status, status_info, perf_data)
class MemoryCollector():
    '''Memory collector: RAM usage derived from /proc/meminfo.'''

    def get_stats(self):
        """Return (total, free, buffers, cached) in bytes from /proc/meminfo.

        Fields are initialized to 0 so a kernel that omits one of them (the
        original raised NameError in that case) still yields a usable tuple.
        """
        total = free = buffers = cached = 0.0
        with open('/proc/meminfo', 'r') as f:
            for line in f:
                m = re.match(r'^(MemTotal|MemFree|Buffers|Cached):\s+(\d+) kB', line)
                if m:
                    value = float(m.group(2)) * 1024  # kB -> bytes
                    key = m.group(1)
                    if key == 'MemTotal':
                        total = value
                    elif key == 'MemFree':
                        free = value
                    elif key == 'Buffers':
                        buffers = value
                    elif key == 'Cached':
                        cached = value
        return total, free, buffers, cached

    def check_status(self, threshold):
        """CRITICAL when used RAM (excluding buffers/cache) reaches `threshold` %."""
        total, free, buffers, cached = self.get_stats()
        used = total - (free + buffers + cached)
        percent = round(100.0 * used / total, 2) if total else 0.0
        status = 0 if percent < threshold else 2
        status_info = ': Ram Used (+buffers/cache): %sB (%.2f%%), Buffer: %sB, Cached: %sB, Total: %sB' \
            % (unit_convert(used), percent, unit_convert(buffers), unit_convert(cached), unit_convert(total))
        # NOTE(review): the warn/crit bounds scale `used`, not `total`; this
        # looks like it should be total*threshold/100 — kept as-is pending
        # confirmation against the dashboards that consume this perfdata.
        perf_data = '\'cached\'=%d;;;0; \'buffer\'=%d;;;0; \'used\'=%d;0:%d;0:%d;0;%d' % \
            (cached, buffers, used, used*threshold/100, used*threshold/100, total)
        nagios_handle(status, status_info, perf_data)
class ConnectionsCollector():
    '''Connections collector: TCP state counts from /proc/net/tcp6 and /proc/net/tcp.

    Fixes the original `state is '00'` comparisons: `is` tests object
    identity, not equality, so ERROR, CLOSE_WAIT, LAST_ACK, LISTEN and
    CLOSING were never counted. The two near-identical parsing loops are
    collapsed into a state table plus one helper.
    '''

    # /proc/net/tcp 'st' field (hex) -> human-readable TCP state name.
    STATES = {
        '00': 'ERROR', '01': 'ESTABLISHED', '02': 'SYN_SENT', '03': 'SYN_RECV',
        '04': 'FIN_WAIT1', '05': 'FIN_WAIT2', '06': 'TIME_WAIT', '07': 'CLOSE',
        '08': 'CLOSE_WAIT', '09': 'LAST_ACK', '0A': 'LISTEN', '0B': 'CLOSING',
    }
    # Columns: sl local_addr:port rem_addr:port st tx:rx tr:when retrnsmt uid rest
    _LINE_RE = re.compile(r'\s*(\d+):\s+(\S*):(\S*)\s+(\S*):(\S*)\s+(\S*)\s+(\S*):(\S*)\s+(\S*):(\S*)\s+(\S*)\s+(\S*)\s+(.*)')

    def _tally(self, lines, data):
        """Accumulate per-state counts from an iterable of /proc/net/tcp lines."""
        for line in lines:
            m = self._LINE_RE.match(line)
            if m:
                name = self.STATES.get(m.group(6))
                if name:
                    data[name] += 1

    def get_stats(self):
        """Return a dict of connection counts keyed by TCP state name.

        Both /proc/net/tcp6 and /proc/net/tcp are existence-checked (the
        original opened tcp6 unconditionally and crashed on IPv4-only hosts).
        """
        data = dict.fromkeys(self.STATES.values(), 0)
        for path in ('/proc/net/tcp6', '/proc/net/tcp'):
            if os.path.exists(path):
                with open(path, 'r') as f:
                    self._tally(f, data)
        return data

    def check_status(self, threshold):
        """CRITICAL when the total connection count reaches `threshold`."""
        stats = self.get_stats()
        total = sum(stats.values())
        status = 0 if total < threshold else 2
        status_info = ': Total connections: %d' % total
        perf_data = ['\'%s\'=%d;;;0;' % (name.lower(), value) for name, value in stats.items()]
        nagios_handle(status, status_info, perf_data)
class TrafficCollector():
    '''Traffic collector: NIC throughput computed from /proc/net/dev deltas.

    The previous sample is cached as JSON in a per-user file under /tmp; the
    first run only seeds the cache and reports OK.
    '''

    def list_interface(self):
        """Print the name of every interface listed in /proc/net/dev."""
        with open('/proc/net/dev', 'r') as f:
            for line in f:
                m = re.match(r'\s*(\S+):\s*(.*)', line)
                if m:
                    print(m.group(1))

    def get_stats(self):
        """Return per-interface cumulative rx/tx byte counters and a timestamp."""
        nic = {}
        with open('/proc/net/dev', 'r') as f:
            for line in f:
                # /proc/net/dev: iface: rx_bytes ... (8 rx fields) tx_bytes ... (8 tx fields)
                m = re.match(r'\s*(\S+):\s*(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+', line)
                if m:
                    name = m.group(1)
                    nic[name] = {'in': int(m.group(2)), 'out': int(m.group(10))}
        return {
            'timestamp': int(time.mktime(time.localtime())),
            'nic': nic,
        }

    def check_status(self, interface, speed, percent):
        """CRITICAL when in or out traffic exceeds `percent` % of `speed` (Mb/s).

        Never returns (nagios_handle exits). Renamed the original local `max`,
        which shadowed the builtin.
        """
        max_bps = speed * 10**6                 # link speed in bits/s (decimal Mb)
        threshold = percent / 100.0 * max_bps
        cache_file = '/tmp/cache_traffic_' + pwd.getpwuid(os.getuid())[0]
        if not os.path.exists(cache_file):
            # First run: seed the cache, no usable delta yet.
            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)
            nagios_handle(0, ': Buffer creation...', '')
        else:
            with open(cache_file, 'r') as f:
                old = json.load(f)
            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)
            elapsed = float(new['timestamp'] - old['timestamp'])
            if elapsed <= 0:
                # Two runs within the same second: the original divided by zero.
                nagios_handle(0, ': Sampling interval too short...', '')
            # bytes delta -> bits/s. abs() reproduces the original if/else,
            # which reported the positive difference either way (e.g. after a
            # counter reset).
            traffic_in = abs(new['nic'][interface]['in'] - old['nic'][interface]['in']) / elapsed * 8
            traffic_out = abs(new['nic'][interface]['out'] - old['nic'][interface]['out']) / elapsed * 8
            status = 0 if (traffic_in < threshold) and (traffic_out < threshold) else 2
            status_info = ': Interface %s Traffic In : %sb/s (%.2f%%), Out : %sb/s (%.2f%%)' \
                % (interface, unit_convert(traffic_in), traffic_in / max_bps * 100, unit_convert(traffic_out), traffic_out / max_bps * 100)
            perf_data = '\'traffic_in\'=%.2fb/s;;0:%.1f;0;%.1f \'traffic_out\'=%.2fb/s;;0:%.1f;0;%.1f' \
                % (traffic_in, threshold, max_bps, traffic_out, threshold, max_bps)
            nagios_handle(status, status_info, perf_data)
class DiskSpaceCollector():
    '''Disk space collector: filesystem usage via os.statvfs.'''

    def list_mountpoint(self):
        """Print device, mountpoint and fstype for /dev/sd*/md* entries in /proc/mounts."""
        with open('/proc/mounts', 'r') as f:
            for line in f:
                m = re.match(r'/dev/(sd[a-z][0-9]|md\S+)\s+(/\S*)\s+(\S+)\s(.*)', line)
                if m:
                    # Single formatted string keeps output identical on py2 and py3.
                    print('%s %s %s' % (m.group(1), m.group(2), m.group(3)))

    def get_stats(self, mountpoint):
        """Return (total, used, free, percent) for `mountpoint`, sizes in bytes.

        `free` uses f_bavail (space available to unprivileged users) while
        `used` derives from f_bfree, so used + free may not equal total on
        filesystems with reserved blocks.
        """
        disk = os.statvfs(mountpoint)
        free = disk.f_bavail * disk.f_frsize
        total = disk.f_blocks * disk.f_frsize
        used = (disk.f_blocks - disk.f_bfree) * disk.f_frsize
        # Guard pseudo-filesystems that report zero blocks.
        percent = 100.0 * used / total if total else 0.0
        return total, used, free, percent

    def check_status(self, mountpoint, threshold):
        """CRITICAL when free space on `mountpoint` drops to `threshold` GB or less."""
        total, used, free, percent = self.get_stats(mountpoint)
        threshold = threshold * 10**9  # GB -> bytes (decimal units, matching unit_convert)
        status = 0 if free > threshold else 2
        status_info = ': Storage \'%s\' Total: %sB Used: %sB (%.2f%%) Free: %sB (%.2f%%)' \
            % (mountpoint, unit_convert(total), unit_convert(used), percent, unit_convert(free), 100 - percent)
        perf_data = '\'free\'=%dB;@0:%d;@0:%d;0;%d' % (free, threshold, threshold, total)
        nagios_handle(status, status_info, perf_data)
class DiskIOCollector():
    '''Disk I/O collector: throughput and utilization from /proc/diskstats deltas.

    The previous sample is cached as JSON in a per-user file under /tmp.
    '''

    def list_partition(self):
        """Print every sdX / mdN device found in /proc/diskstats."""
        with open('/proc/diskstats', 'r') as f:
            for line in f:
                m = re.match(r'\s*\d+\s+\d+\s+(sd[a-z]|md\d+)\s+(.*)', line)
                if m:
                    print(m.group(1))

    def get_stats(self):
        """Sample aggregate CPU jiffies, CPU count and per-disk I/O counters."""
        cpu = {'total_cpu': 0}
        disk = {}
        with open('/proc/stat', 'r') as f:
            for line in f:
                # Aggregate line: 'cpu  user nice system idle iowait ...'
                m = re.match(r'^cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)', line)
                if m:
                    cpu['user'], cpu['system'], cpu['idle'], cpu['iowait'] = \
                        int(m.group(1)), int(m.group(3)), int(m.group(4)), int(m.group(5))
                elif re.match(r'cpu\d+', line):
                    cpu['total_cpu'] += 1
        with open('/proc/diskstats', 'r') as f:
            for line in f:
                m = re.match(r'^\s*(\d+)\s+(\d+)\s+(sd[a-z]|md\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+', line)
                if m:
                    name = m.group(3)
                    disk[name] = {
                        'read_sector': int(m.group(6)),
                        'write_sector': int(m.group(10)),
                        'read_ms': int(m.group(7)),
                        'write_ms': int(m.group(11)),
                        'ms_ticks': int(m.group(13)),
                    }
        return {
            'timestamp': int(time.mktime(time.localtime())),
            'cpu': cpu,
            'disk': disk,
        }

    def check_status(self, partition, threshold):
        """CRITICAL when `partition` utilization exceeds `threshold` percent.

        Never returns (nagios_handle exits).
        """
        bytes_per_sector = 512.0
        interrupt_frequency = 1000.0  # ms per second; jiffy delta below assumes USER_HZ=100
        cache_file = '/tmp/cache_diskio_' + pwd.getpwuid(os.getuid())[0]
        if not os.path.exists(cache_file):
            # First run: seed the cache, no usable delta yet.
            new = self.get_stats()
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)
            nagios_handle(0, ': Buffer creation...', '')
        else:
            with open(cache_file, 'r') as f:
                old = json.load(f)
            new = self.get_stats()
            # Refresh the cache BEFORE reporting: nagios_handle() always exits,
            # so the original post-report dump was unreachable and the cached
            # sample was never updated (deltas grew since the first run).
            with open(cache_file, 'w') as f:
                json.dump(new, f, sort_keys=True)
            elapsed = new['timestamp'] - old['timestamp']
            if elapsed <= 0:
                # Two runs within the same second: the original divided by zero.
                nagios_handle(0, ': Sampling interval too short...', '')
            new_d, old_d = new['disk'][partition], old['disk'][partition]
            read_bytes = (new_d['read_sector'] - old_d['read_sector']) * bytes_per_sector / elapsed
            write_bytes = (new_d['write_sector'] - old_d['write_sector']) * bytes_per_sector / elapsed
            read_ms = new_d['read_ms'] - old_d['read_ms']
            write_ms = new_d['write_ms'] - old_d['write_ms']
            # Wall-clock busy window in ms, derived from the CPU jiffy delta.
            delta_ms = ((new['cpu']['idle'] + new['cpu']['system'] + new['cpu']['user'] + new['cpu']['iowait']) -
                        (old['cpu']['idle'] + old['cpu']['system'] + old['cpu']['user'] + old['cpu']['iowait'])) * \
                interrupt_frequency / old['cpu']['total_cpu'] / 100
            utils = 100 * (new_d['ms_ticks'] - old_d['ms_ticks']) / delta_ms if delta_ms else 0.0
            utils = min(utils, 100)
            status = 0 if utils < threshold else 2
            perf_data = '\'readio\'=%sB/s;;;0; \'writeio\'=%sB/s;;;0; \'readtime\'=%dms;;;0; \'writetime\'=%dms;;;0; \'utils\'=%.2f%%;;;0;100' \
                % (read_bytes, write_bytes, read_ms, write_ms, utils)
            # Labels fixed: the original printed read_ms under "Write Time" and
            # write_ms under "Read Time".
            status_info = ': Partition %s Read I/O: %sB/s, Write I/O: %sB/s, Read Time: %dms, Write Time: %dms, %%Utils: %.2f%%' \
                % (partition, unit_convert(read_bytes), unit_convert(write_bytes), read_ms, write_ms, utils)
            nagios_handle(status, status_info, perf_data)
if __name__ == '__main__':
    # Usage examples; 'disksio' typos fixed to match the actual 'diskio' mode.
    usage = ''' %prog [options] arg1 arg2
    example:
    . %prog --mode=cpu --critical=98 (unit: %)
    . %prog --mode=memory --critical=98 (unit: %)
    . %prog --mode=load
    . %prog --mode=connections --critical=30000 (unit: int)
    . %prog --mode=traffic --interface=em1 --speed=1000 --critical=90 (unit: %)
    . %prog --mode=diskspace --name=/opt --critical=5 (unit: GB)
    . %prog --mode=diskio --name=sda --critical=95 (unit: %)
    . %prog --mode=traffic --list-interface
    . %prog --mode=diskspace --list-partition
    . %prog --mode=diskio --list-disk
    '''
    parser = OptionParser(usage)
    parser.add_option('--mode', action='store', type='string', dest='mode', help='check mode')
    parser.add_option('--name', action='store', type='string', dest='name', help='check item name')
    parser.add_option('--critical', action='store', type='int', dest='critical', help='Threshold critical')
    parser.add_option('--speed', action='store', type='int', dest='speed', help='interface max speed, unit is Mb')
    parser.add_option('--list-interface', action='store_true', dest='interfaces', help='list all NIC interfaces')
    parser.add_option('--list-partition', action='store_true', dest='partitions', help='list all partitions')
    parser.add_option('--list-disk', action='store_true', dest='disks', help='list all disks')
    (options, args) = parser.parse_args()

    # Percentage thresholds must be in 1..100. The explicit None check keeps
    # this safe on Python 3, where None < int raises TypeError (the original
    # `in range(1, 101)` worked but rebuilt the range per check).
    percent_ok = options.critical is not None and 1 <= options.critical <= 100

    if options.mode == 'cpu' and percent_ok:
        CPUCollector().check_status(options.critical)
    if options.mode == 'load':
        LoadCollector().check_status()
    if options.mode == 'memory' and percent_ok:
        MemoryCollector().check_status(options.critical)
    if options.mode == 'connections' and options.critical:
        ConnectionsCollector().check_status(options.critical)
    if options.mode == 'traffic' and options.name and options.speed and options.critical:
        TrafficCollector().check_status(options.name, options.speed, options.critical)
    if options.interfaces:
        TrafficCollector().list_interface()
    if options.mode == 'diskspace' and options.name and options.critical:
        DiskSpaceCollector().check_status(options.name, options.critical)
    if options.partitions:
        DiskSpaceCollector().list_mountpoint()
    if options.mode == 'diskio' and options.name and options.critical:
        DiskIOCollector().check_status(options.name, options.critical)
    if options.disks:
        DiskIOCollector().list_partition()
标签:
python
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构
· AI与.NET技术实操系列(六):基于图像分类模型对图像进行分类