zulip/servers/puppet/modules/humbug/files/nagios3/conf.d/services.cfg

196 lines
7.5 KiB
INI
Raw Normal View History

# HTTP check for every host in the "web" hostgroup.
# Notifications go to the paging_admins contact group.
define service {
    use                     generic-service
    hostgroup_name          web
    service_description     HTTP
    check_command           check_http
    contact_groups          paging_admins
}
# SSH check for every host in the "all" hostgroup.
# Notifications go to the paging_admins contact group.
define service {
    use                     generic-service
    hostgroup_name          all
    service_description     SSH
    check_command           check_ssh
    contact_groups          paging_admins
}
# Disk check for every host in the "all" hostgroup.
# The "!"-separated values are passed to check_remote_disk as $ARG1$..$ARG4$;
# presumably SSH port, warning threshold, critical threshold and path --
# confirm against the command definition.
define service {
    use                     generic-service
    hostgroup_name          all
    service_description     remote disk
    check_command           check_remote_disk!22!20%!10%!/
    contact_groups          paging_admins
}
# Load check for every host in the "all" hostgroup.
# The "!"-separated values are passed to check_remote_load as $ARG1$..$ARG3$;
# presumably SSH port, warning load averages and critical load averages --
# confirm against the command definition.
define service {
    use                     generic-service
    hostgroup_name          all
    service_description     remote load
    check_command           check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
    contact_groups          paging_admins
}
# Zephyr mirror forwarding check, bound to the single host "zmirror".
# Notifications go to the paging_admins contact group.
define service {
    use                     generic-service
    host_name               zmirror
    service_description     zephyr mirror forwarding
    check_command           check_zephyr_mirror_forwarding!22
    normal_check_interval   2
    # Note: the actual check is run via cron, so retry_check_interval
    # should always equal normal_check_interval.
    retry_check_interval    2
    max_check_attempts      2
    contact_groups          paging_admins
}
# User zephyr mirror liveness check, bound to the single host "app".
# Notifications go to the (non-paging) admins contact group.
define service {
    use                     generic-service
    host_name               app
    service_description     user zephyr mirror liveness
    check_command           check_user_zephyr_mirror_liveness!22
    normal_check_interval   2
    # NOTE(review): retry_check_interval differs from normal_check_interval
    # here; if this check is also driven by cron, the two should match --
    # confirm.
    retry_check_interval    1
    max_check_attempts      2
    contact_groups          admins
}
# Debian package update check for every host in the "all" hostgroup.
# max_check_attempts is 1, so a single failing result is final (no retries).
define service {
    use                     generic-service
    hostgroup_name          all
    service_description     Debian update availability
    check_command           check_debian_packages!22
    max_check_attempts      1
    contact_groups          admins
}
# NTP time check for every host in the "all" hostgroup.
# Notifications go to the admins contact group.
define service {
    use                     generic-service
    hostgroup_name          all
    service_description     Check NTP time
    check_command           check_ntp_time!22
    contact_groups          admins
}
# Feedback bot check, bound to the single host "bots".
define service {
    use                     generic-service
    host_name               bots
    service_description     Check feedback bot
    check_command           check_feedback_bot!22
    # Feedback isn't lost, it just isn't reported through Humbug
    # if the bot is down, so don't page.
    contact_groups          admins
}
# Send/receive time check, bound to the single host "staging".
# Notifications go to the admins contact group.
define service {
    use                     generic-service
    host_name               staging
    service_description     Check send receive time
    check_command           check_send_receive_time!22
    contact_groups          admins
}
## check_postgres.pl services
# check_postgres "autovac_freeze" action, bound to the single host "postgres".
define service {
    use                     generic-service
    host_name               postgres
    service_description     Check postgres autovac_freeze
    check_command           check_postgres!22!autovac_freeze
    contact_groups          admins
}
# check_postgres "backends" action, bound to the single host "postgres".
define service {
    use                     generic-service
    host_name               postgres
    service_description     Check postgres backends
    check_command           check_postgres!22!backends
    contact_groups          admins
}
# check_postgres "connection" action, bound to the single host "postgres".
define service {
    use                     generic-service
    host_name               postgres
    service_description     Check postgres connection
    check_command           check_postgres!22!connection
    contact_groups          admins
}
# check_postgres "hitratio" action, bound to the single host "postgres".
define service {
    use                     generic-service
    host_name               postgres
    service_description     Check postgres hitratio
    check_command           check_postgres!22!hitratio
    contact_groups          admins
}
# check_postgres "locks" action, bound to the single host "postgres".
define service {
    use                     generic-service
    host_name               postgres
    service_description     Check postgres locks
    check_command           check_postgres!22!locks
    contact_groups          admins
}
# check_postgres "query_time" action, bound to the single host "postgres".
# The trailing "20 seconds" / "40 seconds" arguments are presumably the
# warning and critical thresholds -- confirm against the definition of
# check_postgres_alert_args.
define service {
    use                     generic-service
    host_name               postgres
    service_description     Check postgres query_time
    check_command           check_postgres_alert_args!22!query_time!20 seconds!40 seconds
    contact_groups          admins
}
# check_postgres "sequence" action, bound to the single host "postgres".
define service {
    use                     generic-service
    host_name               postgres
    service_description     Check postgres sequence
    check_command           check_postgres!22!sequence
    contact_groups          admins
}
# check_postgres "timesync" action, bound to the single host "postgres".
define service {
    use                     generic-service
    host_name               postgres
    service_description     Check postgres timesync
    check_command           check_postgres!22!timesync
    contact_groups          admins
}
# check_postgres "txn_idle" action, bound to the single host "postgres".
# The trailing "20 seconds" / "40 seconds" arguments are presumably the
# warning and critical thresholds -- confirm against the definition of
# check_postgres_alert_args.
define service {
    use                     generic-service
    host_name               postgres
    service_description     Check postgres txn_idle
    check_command           check_postgres_alert_args!22!txn_idle!20 seconds!40 seconds
    contact_groups          admins
}
# check_postgres "txn_time" action, bound to the single host "postgres".
# The trailing "20 seconds" / "40 seconds" arguments are presumably the
# warning and critical thresholds -- confirm against the definition of
# check_postgres_alert_args.
define service {
    use                     generic-service
    host_name               postgres
    service_description     Check postgres txn_time
    check_command           check_postgres_alert_args!22!txn_time!20 seconds!40 seconds
    contact_groups          admins
}
# process_user_activity check for every host in the "frontends" hostgroup.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     process_user_activity bot
    check_command           check_process_user_activity!22
    # Activity isn't lost if this isn't running, but it is critical
    # that it arrives, to avoid falsely telling MIT folks their mirrors
    # are down. Switch this to paging_admins after a week with no false
    # alerts. This service is also responsible for active/idle status.
    contact_groups          admins
}
# Public-stream subscription sync check for the "zmirror" hostgroup.
# NOTE(review): "zmirror" is referenced as a single host by the
# "zephyr mirror forwarding" service but as a hostgroup here -- confirm
# that both a host and a hostgroup with this name are defined.
# NOTE(review): "subscriptons" in service_description is misspelled; it is
# left unchanged because the description identifies the service. Confirm
# nothing references it before renaming.
define service {
    use                     generic-service
    hostgroup_name          zmirror
    service_description     zmirror subscriptons syncing
    check_command           check_sync_public_streams!22
    # Change this to paging_admins after a week.
    contact_groups          admins
}