[root@Admin etc]# pwd /usr/local/nagios/etc [root@Admin etc]# ll total 144 -rw-rw-r-- 1 nagios nagios 11641 Dec 24 00:58 cgi.cfg -rw-r--r-- 1 root root 42 Dec 28 23:04 htpasswd.users //保存用户名和密码 -rw-rw-r-- 1 nagios nagios 44859 Dec 23 22:46 nagios.cfg //nagios的主配置文件 -rw-r--r-- 1 nagios nagios 7198 Dec 28 21:03 nrpe.cfg //可以在里边定义命令和插件的对应关系 -rw-r--r-- 1 root root 7216 Dec 28 20:55 nrpe.cfg.ori drwxrwxr-x 3 nagios nagios 4096 Dec 29 00:02 objects -rw-rw---- 1 nagios nagios 1340 Dec 22 19:38 resource.cfg //定义变量的,比如一些宏定义变量 [root@Admin etc]# cd objects/ [root@Admin objects]# pwd /usr/local/nagios/etc/objects [root@Admin objects]# ll total 112 -rw-rw-r-- 1 nagios nagios 8046 Dec 24 19:45 commands.cfg //定义命令 -rw-rw-r-- 1 nagios nagios 2166 Dec 24 19:41 contacts.cfg //定义联系人和组 -rw-r--r-- 1 nagios nagios 2098 Dec 26 20:09 hosts.cfg //定义主机 -rw-r--r-- 1 root root 1870 Dec 23 23:32 hosts.cfg.ori -rw-rw-r-- 1 nagios nagios 5403 Dec 22 19:38 localhost.cfg -rw-rw-r-- 1 nagios nagios 3124 Dec 22 19:38 printer.cfg //打印机的配置文件 drwxr-xr-x 2 nagios nagios 4096 Dec 28 23:52 services //默认是没有这个目录的,可以在下边定义一些特殊的服务 -rw-r--r-- 1 nagios nagios 900 Dec 26 20:08 services.cfg //定义服务 -rw-rw-r-- 1 nagios nagios 3293 Dec 22 19:38 switch.cfg //交换机的配置文件 -rw-rw-r-- 1 nagios nagios 10812 Dec 22 19:38 templates.cfg //模板配置文件 -rw-rw-r-- 1 nagios nagios 3208 Dec 22 19:38 timeperiods.cfg //配置周期 -rw-rw-r-- 1 nagios nagios 4019 Dec 22 19:38 windows.cfg //监控windows主机的配置文件 [root@Admin objects]#
...... # 'check_nrpe' command definition define command{ command_name check_nrpe command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$ } ### 'check_weburl' command definition define command{ command_name check_weburl command_line $USER1$/check_http $ARG1$ -w 10 -c 30 } ......
这里边定义了很多的命令,这些命令对应着相应的插件,可以在services.cfg中引用这些命令,这些命令后边可以跟很多的参数,可以通过获取帮助来查看这些参数 [root@Admin libexec]# pwd /usr/local/nagios/libexec [root@Admin libexec]# ./check_nrpe --help NRPE Plugin for Nagios Copyright (c) 1999-2008 Ethan Galstad ( Version: 2.12 Last Modified: 03-10-2008 License: GPL v2 with exemptions (-l for more info) SSL/TLS Available: Anonymous DH Mode, OpenSSL 0.9.6 or higher required Usage: check_nrpe -H <host> [-n] [-u] [-p <port>] [-t <timeout>] [-c <command>] [-a <arglist...>] Options: -n = Do no use SSL -u = Make socket timeouts return an UNKNOWN state instead of CRITICAL <host> = The address of the host running the NRPE daemon [port] = The port on which the daemon is running (default=5666) [timeout] = Number of seconds before connection times out (default=10) [command] = The name of the command that the remote daemon should run [arglist] = Optional arguments that should be passed to the command. Multiple arguments should be separated by a space. If provided, this must be the last option supplied on the command line. 这里边定义了这么多的命令,该如何使用呢?
define service{ use generic-service host_name 145-apache service_description blog_url check_command check_weburl!-I //叹号是一个分隔符,前边是命令,后边就是参数 max_check_attempts 3 normal_check_interval 2 retry_check_interval 1 check_period 24x7 notification_interval 30 notification_period 24x7 notification_options w,u,c,r contact_groups admins }
# You can specify individual object config files as shown below:
cfg_file=/usr/local/nagios/etc/objects/hosts.cfg 自定义的配置文件
cfg_dir=/usr/local/nagios/etc/objects/services 自定义的目录,该目录下的所有的配置文件都会生效
...... 在这里可以定义被监控主机的名字,IP地址,引用了哪一个模板 define host{ use linux-server //引用了linux-server模板,模板是在templates.cfg中定义的,在下边会介绍这个模板中都有啥 host_name 141-MySQL alias 141-MySQL address } ..... # Define an optional hostgroup for Linux machines 定义主机分组,不同的主机使用逗号隔开 define hostgroup{ hostgroup_name linux-servers ; The name of the hostgroup alias Linux Servers ; Long name of the group members 145-apache,129-nagios-server //同一个分组的不同主机使用“,”分隔 } ......
咱们在这个templates.cfg搜索一下linux-server这个模板,看看都有啥 define host{ name linux-server 模板名称 use generic-host 又引用了generic-host模板,下边会介绍这个模板中都有啥 check_period 24x7 检测的周期,注意24x7只是一个字符串,并不是7天乘24小时的意思,这个参数是在timeperiods.cfg中定义的 check_interval 5 检查间隔 retry_interval 1 故障之后重新检查的间隔 max_check_attempts 3 故障后最大尝试检测次数 check_command check-host-alive 使用什么命令 下面会介绍这个是个什么命令 notification_period 24x7 使用什么通知周期 notification_interval 300 故障之后,报警的间隔 notification_options d,u,r 主机状态通知项 contact_groups admins 联系人组 register 0 }
....省略.... # 'check-host-alive' command definition define command{ command_name check-host-alive command_line $USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5 //调用的是check_ping插件,因为check_ping插件是使用C语言开发的,所以没法直接看源码,但是稍微想想也知道它使用了ping命令来检测主机的死活 } ....省略.....
define host{ name generic-host ; The name of this host template notifications_enabled 1 ; Host notifications are enabled event_handler_enabled 1 ; Host event handler is enabled flap_detection_enabled 1 ; Flap detection is enabled failure_prediction_enabled 1 ; Failure prediction is enabled process_perf_data 1 ; Process performance data retain_status_information 1 ; Retain status information across program restarts retain_nonstatus_information 1 ; Retain non-status information across program restarts notification_period 24x7 ; Send host notifications at any time register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL HOST, JUST A TEMPLATE! }
define timeperiod{ timeperiod_name 24x7 //名字,不管放假不放假,周末不周末都要通知 alias 24 Hours A Day, 7 Days A Week //小名 sunday 00:00-24:00 monday 00:00-24:00 tuesday 00:00-24:00 wednesday 00:00-24:00 thursday 00:00-24:00 friday 00:00-24:00 saturday 00:00-24:00 } # 'workhours' timeperiod definition define timeperiod{ timeperiod_name workhours //这个就比较人性化,在工作日的时候通知,周一至周五 alias Normal Work Hours monday 09:00-17:00 tuesday 09:00-17:00 wednesday 09:00-17:00 thursday 09:00-17:00 friday 09:00-17:00 }
define service { use generic-service //使用的模板,下面会介绍这个模板中都有什么东西 host_name 141-MySQL,140-LB1 //该服务针对哪些主机,不同的主机之间使用逗号隔开 service_description Check Load //服务的描述,这个参数在定义服务分组的时候会用到的 check_command check_nrpe!check_load //指定命令,check_nrpe!是被动模式的服务 contact_groups admins //指定联系人组 }
这个是自定义的一个主动服务 define service{ use generic-service host_name 145-apache service_description check_port check_command check_weburl!check_port_80 max_check_attempts 3 normal_check_interval 2 retry_check_interval 1 check_period 24x7 notification_interval 30 notification_period 24x7 notification_options w,u,c,r contact_groups admins }
define service{ name generic-service ; The 'name' of this service template active_checks_enabled 1 ; Active service checks are enabled passive_checks_enabled 1 ; Passive service checks are enabled/accepted parallelize_check 1 ; Active service checks should be parallelized (disabling this can lead to major performance problems) obsess_over_service 1 ; We should obsess over this service (if necessary) check_freshness 0 ; Default is to NOT check service 'freshness' notifications_enabled 1 ; Service notifications are enabled event_handler_enabled 1 ; Service event handler is enabled flap_detection_enabled 1 ; Flap detection is enabled failure_prediction_enabled 1 ; Failure prediction is enabled process_perf_data 1 ; Process performance data retain_status_information 1 ; Retain status information across program restarts retain_nonstatus_information 1 ; Retain non-status information across program restarts is_volatile 0 ; The service is not volatile check_period 24x7 ; The service can be checked at any time of the day max_check_attempts 3 ; Re-check the service up to 3 times in order to determine its final (hard) state normal_check_interval 10 ; Check the service every 10 minutes under normal conditions retry_check_interval 2 ; Re-check the service every two minutes until a hard state can be determined contact_groups admins ; Notifications get sent out to everyone in the 'admins' group notification_options w,u,c,r ; Send notifications about warning, unknown, critical, and recovery events notification_interval 60 ; Re-notify about service problems every hour notification_period 24x7 ; Notifications can be sent out at any time register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE! }