hadoop-ansible/roles/nagios-server/templates/hadoop-cluster/services.cfg.j2

323 lines
13 KiB
Django/Jinja

# Runs the check_host command for every host in the hadoop-cluster hostgroup.
# NOTE(review): the leading "0" in the description presumably makes this entry
# sort first in the service list - confirm it is intentional.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster
    service_description     0 is alive
    check_command           check_host
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Runs check_ntp_time on every host in the hadoop-cluster hostgroup.
# The restart-service event handler is invoked with the argument "ntp".
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster
    service_description     NTP status
    check_command           check_ntp_time
    event_handler           restart-service!ntp
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Runs check_ssh on every host in the hadoop-cluster hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster
    service_description     ssh service
    check_command           check_ssh
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Runs the load_average command on every host in the hadoop-cluster hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster
    service_description     load average
    check_command           load_average
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Runs the users command on every host in the hadoop-cluster hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster
    service_description     users
    check_command           users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Runs the processes command on every host in the hadoop-cluster hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster
    service_description     processes num.
    check_command           processes
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Runs the zombie_processes command on every host in the hadoop-cluster hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster
    service_description     zombie processes
    check_command           zombie_processes
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Runs the network_interfaces command on every host in the hadoop-cluster hostgroup.
# Notifications go to the hadoop-managers and hadoop-users contact groups.
# notes_url links to the plugin page on Nagios Exchange.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster
    service_description     Network interfaces status
    check_command           network_interfaces
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
    notes_url               http://exchange.nagios.org/directory/Plugins/Network-Protocols/SNMP/Advanced-Network-Interface-Check--2D-check_netint--2F-check_snmp_netint/details
}
# Runs check_root_disk on every host in the hadoop-cluster hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster
    service_description     root disk
    check_command           check_root_disk
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Runs check_data_disk, but only on hosts in the hadoop-worker-nodes hostgroup.
# Notifications go to the hadoop-managers and hadoop-users contact groups.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-worker-nodes
    service_description     data disk
    check_command           check_data_disk
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Runs check_gmond on every host in the hadoop-cluster-metrics hostgroup.
# Notifications go to the hadoop-managers and hadoop-users contact groups.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster-metrics
    service_description     ganglia gmond collector
    check_command           check_gmond
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Runs check_system_pp on every host in the hadoop-cluster hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hadoop-cluster
    service_description     basic services
    check_command           check_system_pp
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Web interfaces: jobtracker
# Calls check_webui with the argument "jobtracker" on the mapred-jobtracker hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          mapred-jobtracker
    service_description     Jobtracker web interface
    check_command           check_webui!jobtracker
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Web interfaces: jobtracker (HA hostgroup)
# Calls check_webui with the argument "jobtracker_ha" on the mapred-jobtracker-ha
# hostgroup. The description is the same as the one used for mapred-jobtracker.
define service {
    use                     generic-hadoop-service
    hostgroup_name          mapred-jobtracker-ha
    service_description     Jobtracker web interface
    check_command           check_webui!jobtracker_ha
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Web interfaces: HDFS namenode
# Calls check_webui with the argument "namenode" on the hdfs-namenode hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hdfs-namenode
    service_description     Namenode web interface
    check_command           check_webui!namenode
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Web interfaces: HBase master
# Calls check_webui with the argument "hbase" on the hbase-master hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hbase-master
    service_description     Hbase master web interface
    check_command           check_webui!hbase
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# HBase status
# Runs hadoop_check_hbase_status on the hbase-master hostgroup.
# The contact_groups override is left commented out, so contacts come from
# whatever the generic-hadoop-service template provides.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hbase-master
    service_description     Hbase status
    check_command           hadoop_check_hbase_status
    notification_interval   0 ; set > 0 if you want to be renotified
    # contact_groups hadoop-managers,hadoop-users
}
# MapReduce tasktrackers
# Runs hadoop_check_tasktracker on the mapred-jobtracker hostgroup.
# NOTE(review): the event handler argument names the tasktracker service while
# the check is attached to the jobtracker hostgroup - confirm that
# restart-service acts on the intended hosts.
define service {
    use                     generic-hadoop-service
    hostgroup_name          mapred-jobtracker
    service_description     Mapreduce tasktrackers status
    check_command           hadoop_check_tasktracker
    event_handler           restart-service!hadoop-0.20-mapreduce-tasktracker
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# HDFS datanodes
# Runs hadoop_check_datanode on the hdfs-namenode hostgroup.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hdfs-namenode
    service_description     HDFS datanodes status
    check_command           hadoop_check_datanode
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# HDFS blocks status
# Calls check_hdfs_blocks on the hdfs-namenode hostgroup with three arguments:
# the hdfs_nn_http_port template variable, then 1 and 1.
# NOTE(review): the two trailing values are presumably the warning and critical
# thresholds - confirm against the check_hdfs_blocks command definition.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hdfs-namenode
    service_description     HDFS blocks status
    check_command           check_hdfs_blocks!{{ hdfs_nn_http_port }}!1!1
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# HDFS capacity
# Calls check_hdfs_capacity on the hdfs-namenode hostgroup. All three arguments
# come from template variables: hdfs_nn_http_port, hdfs_warn and hdfs_crit.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hdfs-namenode
    service_description     HDFS capacity
    check_command           check_hdfs_capacity!{{ hdfs_nn_http_port }}!{{ hdfs_warn }}!{{ hdfs_crit }}
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# HDFS RPC queue latency
# Calls check_rpcq_latency on the hdfs-namenode hostgroup with the
# hdfs_nn_http_port template variable, the literal "NameNode", then 3 and 5.
# NOTE(review): 3 and 5 are presumably the warning and critical thresholds -
# confirm units against the check_rpcq_latency command definition.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hdfs-namenode
    service_description     HDFS RPC queue latency
    check_command           check_rpcq_latency!{{ hdfs_nn_http_port }}!NameNode!3!5
    notification_interval   0 ; set > 0 if you want to be renotified
}
# HDFS namenode directories status
# Calls check_name_dir_status on the hdfs-namenode hostgroup, passing the
# hdfs_nn_http_port template variable as its only argument.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hdfs-namenode
    service_description     HDFS namenode directories status
    check_command           check_name_dir_status!{{ hdfs_nn_http_port }}
    notification_interval   0 ; set > 0 if you want to be renotified
}
# HDFS journal nodes
# Calls check_hadoop_http_service on the hdfs-journal hostgroup, passing the
# hdfs_journal_http_port template variable as its only argument.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hdfs-journal
    service_description     HDFS HA journal
    check_command           check_hadoop_http_service!{{ hdfs_journal_http_port }}
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# HBase thrift
# Calls check_hbase_thrift on the hbase-thrift hostgroup, passing the
# hbase_thrift_port template variable as its only argument.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hbase-thrift
    service_description     HBase thrift
    check_command           check_hbase_thrift!{{ hbase_thrift_port }}
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Zookeeper data #
# Server state. check_zookeeper requires warning and critical arguments, so
# 1 and 1 are supplied, but they are not used for this key.
define service {
    use                     generic-hadoop-service
    hostgroup_name          zookeeper
    service_description     Zookeeper server state
    check_command           check_zookeeper!zk_server_state!1!1
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Zookeeper outstanding requests
# Calls check_zookeeper for the key zk_outstanding_requests with the values 20 and 50.
define service {
    use                     generic-hadoop-service
    hostgroup_name          zookeeper
    service_description     Zookeeper outstanding requests
    check_command           check_zookeeper!zk_outstanding_requests!20!50
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Zookeeper average latency
# Calls check_zookeeper for the key zk_avg_latency with the values 100 and 500.
define service {
    use                     generic-hadoop-service
    hostgroup_name          zookeeper
    service_description     Zookeeper average latency
    check_command           check_zookeeper!zk_avg_latency!100!500
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Zookeeper ephemerals count
# Calls check_zookeeper for the key zk_ephemerals_count with the values 3000 and 5000.
define service {
    use                     generic-hadoop-service
    hostgroup_name          zookeeper
    service_description     Zookeeper ephemerals
    check_command           check_zookeeper!zk_ephemerals_count!3000!5000
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Zookeeper watch count
# Calls check_zookeeper for the key zk_watch_count with the values 1000 and 2000.
define service {
    use                     generic-hadoop-service
    hostgroup_name          zookeeper
    service_description     Zookeeper watch count
    check_command           check_zookeeper!zk_watch_count!1000!2000
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Zookeeper open file descriptors
# Calls check_zookeeper for the key zk_open_file_descriptor_count with the
# values 800 and 950.
define service {
    use                     generic-hadoop-service
    hostgroup_name          zookeeper
    service_description     Zookeeper open file descriptors
    check_command           check_zookeeper!zk_open_file_descriptor_count!800!950
    notification_interval   0 ; set > 0 if you want to be renotified
}
# HBase regionservers health status
# Calls check_hadoop_http_service on the hbase-region-servers hostgroup.
# The port argument is read from the hbase_regionserver_http_port variable when
# it is defined and falls back to the previously fixed value 60030 otherwise,
# matching the other checks in this file that take their ports from variables.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hbase-region-servers
    service_description     HBASE regionserver health status
    check_command           check_hadoop_http_service!{{ hbase_regionserver_http_port | default(60030) }}
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# HDFS datanodes health status
# Calls check_hadoop_http_service on the hdfs-datanodes hostgroup.
# The port argument is read from the hdfs_datanode_http_port variable when it
# is defined and falls back to the previously fixed value 50075 otherwise,
# matching the other checks in this file that take their ports from variables.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hdfs-datanodes
    service_description     HDFS datanode health status
    check_command           check_hadoop_http_service!{{ hdfs_datanode_http_port | default(50075) }}
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# MapReduce tasktrackers health status
# Calls check_hadoop_http_service on the mapred-tasktrackers hostgroup.
# The port argument is read from the mapred_tasktracker_http_port variable when
# it is defined and falls back to the previously fixed value 50060 otherwise,
# matching the other checks in this file that take their ports from variables.
define service {
    use                     generic-hadoop-service
    hostgroup_name          mapred-tasktrackers
    service_description     Map Reduce tasktracker health status
    check_command           check_hadoop_http_service!{{ mapred_tasktracker_http_port | default(50060) }}
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}
# Hue interface
# Calls check_hadoop_http_service on the hue hostgroup.
# The port argument is read from the hue_http_port variable when it is defined
# and falls back to the previously fixed value 8888 otherwise, matching the
# other checks in this file that take their ports from variables.
define service {
    use                     generic-hadoop-service
    hostgroup_name          hue
    service_description     Hue interface
    check_command           check_hadoop_http_service!{{ hue_http_port | default(8888) }}
    contact_groups          hadoop-managers,hadoop-users
    notification_interval   0 ; set > 0 if you want to be renotified
}