nagios: Reorder service checks.

This commit is contained in:
Alex Vandiver 2022-06-13 15:46:24 -07:00 committed by Alex Vandiver
parent eaaa2fbff8
commit 93bcb86345
2 changed files with 330 additions and 284 deletions

View File

@ -1,50 +1,55 @@
define service {
use generic-service
hostgroup_name web
service_description HTTPS
check_command check_https_status
contact_groups page_admins
}
### SSH
define service {
use generic-service
hostgroup_name pageable_servers
service_description SSH - pageable
check_command check_ssh
contact_groups page_admins
}
define service {
use generic-service
hostgroup_name flaky_servers
service_description SSH for flaky machines
check_command check_ssh
normal_check_interval 2
retry_check_interval 2
max_check_attempts 5
contact_groups admins
}
define service {
use generic-service
hostgroup_name not_pageable_servers
service_description SSH
check_command check_ssh
contact_groups admins
}
define service {
use generic-service
hostgroup_name pageable_servers
service_description remote disk - pageable
check_command check_ssh
contact_groups page_admins
}
define service {
use generic-service
service_description SSH
hostgroup_name not_pageable_servers
check_command check_ssh
contact_groups admins
}
define service {
use generic-service
service_description SSH for flaky machines
hostgroup_name flaky_servers
check_command check_ssh
normal_check_interval 2
retry_check_interval 2
max_check_attempts 5
contact_groups admins
}
### Disk usage
define service {
use generic-service
service_description Disk usage - pageable
hostgroup_name pageable_servers
check_command check_remote_disk!20%!10%
contact_groups page_admins
}
define service {
use generic-service
service_description Disk usage
hostgroup_name not_pageable_servers
check_command check_remote_disk!20%!10%
contact_groups admins
}
define service {
use generic-service
service_description Disk usage for flaky machines
hostgroup_name flaky_servers
service_description remote disk for flaky machines
check_command check_remote_disk!20%!10%
normal_check_interval 2
retry_check_interval 2
@ -52,35 +57,40 @@ define service {
contact_groups admins
}
### Swap
define service {
use generic-service
hostgroup_name not_pageable_servers
service_description remote disk
check_command check_remote_disk!20%!10%
service_description swap
hostgroup_name all
check_command check_remote_swap!22!80%!50%
contact_groups admins
}
### Load
define service {
use generic-service
service_description Load
hostgroup_name pageable_servers
check_command check_remote_load!22!14.0,12.0,10.0!18.0,16.0,14.0
contact_groups admins
}
define service {
use generic-service
service_description Load
hostgroup_name not_pageable_servers
service_description remote load
check_command check_remote_load!22!7.0,6.0,5.0!10.0,8.0,6.0
contact_groups admins
}
define service {
use generic-service
hostgroup_name pageable_servers
service_description remote load - pageable
check_command check_remote_load!22!14.0,12.0,10.0!18.0,16.0,14.0
contact_groups admins
}
define service {
use generic-service
service_description Load for flaky machines
hostgroup_name flaky_servers
service_description remote load for flaky machines
check_command check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
normal_check_interval 2
retry_check_interval 2
@ -88,11 +98,62 @@ define service {
contact_groups admins
}
### System updates
define service {
use generic-service
service_description Debian update availability
hostgroup_name all
check_command check_debian_packages!22
contact_groups admins
}
### NTP
define service {
use generic-service
service_description Check AWS NTP time
hostgroup_name aws_host
check_command check_ntp_time!22!169.254.169.123
max_check_attempts 3
contact_groups admins
}
define service {
use generic-service
service_description Check non-AWS NTP time
hostgroup_name non_aws_host
check_command check_ntp_time!22!pool.ntp.org
max_check_attempts 5
contact_groups admins
}
### Service groups
#### Web
define service {
use generic-service
service_description HTTPS
hostgroup_name web
check_command check_https_status
contact_groups page_admins
}
#### zmirror / zmirrorp
define service {
use generic-service
service_description zephyr mirror forwarding
check_command check_zephyr_mirror_forwarding!22
hostgroup_name zmirror_main
check_command check_zephyr_mirror_forwarding!22
normal_check_interval 2
# Note: the actual check is run via cron, so retry_check_interval
# should always equal normal_check_interval.
@ -101,185 +162,11 @@ define service {
contact_groups page_admins
}
define service {
use generic-service
service_description user zephyr mirror liveness
check_command check_user_zephyr_mirror_liveness!22
hostgroup_name prod_frontends
contact_groups admins
}
define service {
use generic-service
hostgroup_name zmirrorp
service_description Check personal zephyr mirrors
check_command check_personal_zephyr_mirrors!22
contact_groups admins
}
define service {
use generic-service
service_description Debian update availability
check_command check_debian_packages!22
hostgroup_name all
contact_groups admins
}
define service {
use generic-service
service_description Check AWS NTP time
check_command check_ntp_time!22!169.254.169.123
max_check_attempts 3
hostgroup_name aws_host
contact_groups admins
}
define service {
use generic-service
service_description Check non-AWS NTP time
check_command check_ntp_time!22!pool.ntp.org
max_check_attempts 5
hostgroup_name non_aws_host
contact_groups admins
}
define service {
use generic-service
service_description Check send receive time
check_command check_send_receive_time!22
max_check_attempts 2
hostgroup_name frontends
contact_groups page_admins
}
define service {
use generic-service
service_description Check analytics state
check_command check_analytics_state!22
max_check_attempts 2
hostgroup_name prod_frontends
contact_groups admins
}
## check_postgres services
define service {
use generic-service
service_description Check PostgreSQL autovac_freeze
check_command check_postgres_alert_args!zulip!nagios!autovac_freeze!101%!105%
hostgroup_name postgresql_primary
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL backends
check_command check_postgres!zulip!nagios!backends
hostgroup_name postgresql
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL connection
check_command check_postgres!zulip!nagios!connection
hostgroup_name postgresql
contact_groups page_admins
}
define service {
use generic-service
service_description Check PostgreSQL disabled triggers
check_command check_postgres!zulip!nagios!disabled_triggers
hostgroup_name postgresql
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL hitratio
check_command check_postgres!zulip!nagios!hitratio
hostgroup_name postgresql
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL locks
check_command check_postgres_alert_args!zulip!nagios!locks!400!600
hostgroup_name postgresql
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL query_time
check_command check_postgres_alert_args!zulip!nagios!query_time!20 seconds!40 seconds
hostgroup_name postgresql
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL sequence
check_command check_postgres!zulip!nagios!sequence
hostgroup_name postgresql
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL timesync
check_command check_postgres!zulip!nagios!timesync
hostgroup_name postgresql
contact_groups admins
}
# define service {
# use generic-service
# service_description Check PostgreSQL txn_idle
# check_command check_postgres_alert_args!zulip!nagios!txn_idle!20 seconds!40 seconds
# hostgroup_name postgresql
# contact_groups admins
# }
define service {
use generic-service
service_description Check PostgreSQL txn_time
check_command check_postgres_alert_args!zulip!nagios!txn_time!20 seconds!40 seconds
hostgroup_name postgresql
contact_groups admins
}
define service {
use generic-service
service_description Check FTS update log length
check_command check_fts_update_log
hostgroup_name postgresql
contact_groups admins
}
define service{
use generic-service
service_description Check PostgreSQL replication lag
check_command check_postgresql_replication_lag
hostgroup_name postgresql
contact_groups admins
}
define service {
use generic-service
service_description Check last PostgreSQL backup time
check_command check_postgresql_backup
hostgroup_name postgresql
contact_groups admins
}
define service {
use generic-service
service_description zmirror subscriptions syncing
check_command check_sync_public_streams!22
hostgroup_name zmirror_main
check_command check_sync_public_streams!22
normal_check_interval 2
retry_check_interval 2
max_check_attempts 5
@ -288,212 +175,351 @@ define service {
define service {
use generic-service
service_description Check redis service
check_command check_redis_ssh!22
max_check_attempts 3
hostgroup_name frontends, redis
service_description Check personal zephyr mirrors
hostgroup_name zmirrorp
check_command check_personal_zephyr_mirrors!22
contact_groups admins
}
#### Application frontends
define service {
use generic-service
service_description Check send receive time
hostgroup_name frontends
check_command check_send_receive_time!22
max_check_attempts 2
contact_groups page_admins
}
define service {
use generic-service
service_description Check RabbitMQ queue sizes
check_command check_rabbitmq_queues!22
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups page_admins
}
define service {
use generic-service
service_description Check for queue worker errors.
check_command check_queue_worker_errors!22
hostgroup_name frontends
service_description user zephyr mirror liveness
hostgroup_name prod_frontends
check_command check_user_zephyr_mirror_liveness!22
contact_groups admins
}
define service {
use generic-service
service_description Check analytics state
hostgroup_name prod_frontends
check_command check_analytics_state!22
max_check_attempts 2
contact_groups admins
}
#### PostgreSQL
define service {
use generic-service
service_description Check PostgreSQL connection
hostgroup_name postgresql
check_command check_postgres!zulip!nagios!connection
contact_groups page_admins
}
define service {
use generic-service
service_description Check PostgreSQL autovac_freeze
hostgroup_name postgresql_primary
check_command check_postgres_alert_args!zulip!nagios!autovac_freeze!101%!105%
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL backends
hostgroup_name postgresql
check_command check_postgres!zulip!nagios!backends
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL disabled triggers
hostgroup_name postgresql
check_command check_postgres!zulip!nagios!disabled_triggers
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL hitratio
hostgroup_name postgresql
check_command check_postgres!zulip!nagios!hitratio
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL locks
hostgroup_name postgresql
check_command check_postgres_alert_args!zulip!nagios!locks!400!600
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL query_time
hostgroup_name postgresql
check_command check_postgres_alert_args!zulip!nagios!query_time!20 seconds!40 seconds
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL sequence
hostgroup_name postgresql
check_command check_postgres!zulip!nagios!sequence
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL timesync
hostgroup_name postgresql
check_command check_postgres!zulip!nagios!timesync
contact_groups admins
}
# define service {
# use generic-service
# service_description Check PostgreSQL txn_idle
# hostgroup_name postgresql
# check_command check_postgres_alert_args!zulip!nagios!txn_idle!20 seconds!40 seconds
# contact_groups admins
# }
define service {
use generic-service
service_description Check PostgreSQL txn_time
hostgroup_name postgresql
check_command check_postgres_alert_args!zulip!nagios!txn_time!20 seconds!40 seconds
contact_groups admins
}
define service {
use generic-service
service_description Check FTS update log length
hostgroup_name postgresql
check_command check_fts_update_log
contact_groups admins
}
define service {
use generic-service
service_description Check PostgreSQL replication lag
hostgroup_name postgresql
check_command check_postgresql_replication_lag
contact_groups admins
}
define service {
use generic-service
service_description Check last PostgreSQL backup time
hostgroup_name postgresql
check_command check_postgresql_backup
contact_groups admins
}
#### Redis
define service {
use generic-service
service_description Check redis service
# Both redis and frontends hostgroups, since frontends SSH proxy redis to themselves
hostgroup_name frontends, redis
check_command check_redis_ssh!22
max_check_attempts 3
contact_groups page_admins
}
#### RabbitMQ / queue workers
define service {
use generic-service
service_description Check RabbitMQ queue sizes
hostgroup_name frontends
check_command check_rabbitmq_queues!22
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
contact_groups page_admins
}
define service {
use generic-service
service_description Check RabbitMQ notify_tornado consumers
hostgroup_name frontends
check_command check_rabbitmq_consumers!notify_tornado
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups page_admins
}
define service {
use generic-service
service_description Check RabbitMQ user_activity_interval consumers
hostgroup_name frontends
check_command check_rabbitmq_consumers!user_activity_interval
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check RabbitMQ user_presence consumers
hostgroup_name frontends
check_command check_rabbitmq_consumers!user_presence
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check RabbitMQ invites consumers
hostgroup_name frontends
check_command check_rabbitmq_consumers!invites
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check RabbitMQ digest email consumers
hostgroup_name frontends
check_command check_rabbitmq_consumers!digest_emails
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check RabbitMQ email mirror consumers
hostgroup_name frontends
check_command check_rabbitmq_consumers!email_mirror
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check RabbitMQ missedmessage mobile notifications consumers
hostgroup_name frontends
check_command check_rabbitmq_consumers!missedmessage_mobile_notifications
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check RabbitMQ missedmessage email consumers
hostgroup_name frontends
check_command check_rabbitmq_consumers!missedmessage_emails
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check RabbitMQ user activity consumers
hostgroup_name frontends
check_command check_rabbitmq_consumers!user_activity
# Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check embedded_bots queue processor
hostgroup_name frontends
check_command check_remote_arg_string!manage.py process_queue --queue_name=embedded_bots!1:1!1:1
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check missedmessage_emails queue processor
hostgroup_name frontends
check_command check_remote_arg_string!manage.py process_queue --queue_name=missedmessage_emails!1:1!1:1
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check email_senders queue processor
hostgroup_name frontends
check_command check_remote_arg_string!manage.py process_queue --queue_name=email_senders!1:1!1:1
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check outgoing webhooks queue processor
hostgroup_name frontends
check_command check_remote_arg_string!manage.py process_queue --queue_name=outgoing_webhooks!1:1!1:1
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check deferred_work queue processor
hostgroup_name frontends
check_command check_remote_arg_string!manage.py process_queue --queue_name=deferred_work!1:1!1:1
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check worker memory usage
check_command check_worker_memory
hostgroup_name frontends
check_command check_worker_memory
contact_groups admins
}
define service {
use generic-service
hostgroup_name all
service_description swap
check_command check_remote_swap!22!80%!50%
service_description Check for queue worker errors.
hostgroup_name frontends
check_command check_queue_worker_errors!22
contact_groups admins
}
### Smokescreen
define service {
use generic-service
service_description Check Smokescreen proxy
check_command check_proxy_status
hostgroup_name smokescreen
check_command check_proxy_status
contact_groups page_admins
}
define service {
use generic-service
service_description Check desktop APT repository
host_name nagios
check_command check_apt_repo_status!download.zulip.com!/desktop/apt
contact_groups admins
}
define service {
use generic-service
service_description Check chat.fhir.org cert
host_name nagios
check_command check_ssl_certificate!chat.fhir.org
contact_groups admins
}

View File

@ -34,9 +34,29 @@ define service{
check_command check_load!7.0!6.0!5.0!10.0!8.0!6.0
}
### External resources, only run on localhost
define service{
use generic-service
host_name nagios
service_description Check Camo is operational
check_command check_camo!<%= @nagios_camo_check_host %>!<%= @nagios_camo_check_path %>!6!12
}
define service {
use generic-service
service_description Check desktop APT repository
host_name nagios
check_command check_apt_repo_status!download.zulip.com!/desktop/apt
contact_groups admins
}
define service {
use generic-service
service_description Check chat.fhir.org cert
host_name nagios
check_command check_ssl_certificate!chat.fhir.org
contact_groups admins
}