mirror of https://github.com/zulip/zulip.git
nagios: Reorder service checks.
This commit is contained in:
parent
eaaa2fbff8
commit
93bcb86345
|
@ -1,50 +1,55 @@
|
||||||
define service {
|
### SSH
|
||||||
use generic-service
|
|
||||||
hostgroup_name web
|
|
||||||
service_description HTTPS
|
|
||||||
check_command check_https_status
|
|
||||||
contact_groups page_admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
hostgroup_name pageable_servers
|
|
||||||
service_description SSH - pageable
|
service_description SSH - pageable
|
||||||
check_command check_ssh
|
|
||||||
contact_groups page_admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
hostgroup_name flaky_servers
|
|
||||||
service_description SSH for flaky machines
|
|
||||||
check_command check_ssh
|
|
||||||
normal_check_interval 2
|
|
||||||
retry_check_interval 2
|
|
||||||
max_check_attempts 5
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
hostgroup_name not_pageable_servers
|
|
||||||
service_description SSH
|
|
||||||
check_command check_ssh
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
hostgroup_name pageable_servers
|
hostgroup_name pageable_servers
|
||||||
service_description remote disk - pageable
|
check_command check_ssh
|
||||||
|
contact_groups page_admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description SSH
|
||||||
|
hostgroup_name not_pageable_servers
|
||||||
|
check_command check_ssh
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description SSH for flaky machines
|
||||||
|
hostgroup_name flaky_servers
|
||||||
|
check_command check_ssh
|
||||||
|
normal_check_interval 2
|
||||||
|
retry_check_interval 2
|
||||||
|
max_check_attempts 5
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
### Disk usage
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Disk usage - pageable
|
||||||
|
hostgroup_name pageable_servers
|
||||||
check_command check_remote_disk!20%!10%
|
check_command check_remote_disk!20%!10%
|
||||||
contact_groups page_admins
|
contact_groups page_admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
|
service_description Disk usage
|
||||||
|
hostgroup_name not_pageable_servers
|
||||||
|
check_command check_remote_disk!20%!10%
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Disk usage for flaky machines
|
||||||
hostgroup_name flaky_servers
|
hostgroup_name flaky_servers
|
||||||
service_description remote disk for flaky machines
|
|
||||||
check_command check_remote_disk!20%!10%
|
check_command check_remote_disk!20%!10%
|
||||||
normal_check_interval 2
|
normal_check_interval 2
|
||||||
retry_check_interval 2
|
retry_check_interval 2
|
||||||
|
@ -52,35 +57,40 @@ define service {
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
### Swap
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
hostgroup_name not_pageable_servers
|
service_description swap
|
||||||
service_description remote disk
|
hostgroup_name all
|
||||||
check_command check_remote_disk!20%!10%
|
check_command check_remote_swap!22!80%!50%
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
### Load
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Load
|
||||||
|
hostgroup_name pageable_servers
|
||||||
|
check_command check_remote_load!22!14.0,12.0,10.0!18.0,16.0,14.0
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
|
service_description Load
|
||||||
hostgroup_name not_pageable_servers
|
hostgroup_name not_pageable_servers
|
||||||
service_description remote load
|
|
||||||
check_command check_remote_load!22!7.0,6.0,5.0!10.0,8.0,6.0
|
check_command check_remote_load!22!7.0,6.0,5.0!10.0,8.0,6.0
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
hostgroup_name pageable_servers
|
service_description Load for flaky machines
|
||||||
service_description remote load - pageable
|
|
||||||
check_command check_remote_load!22!14.0,12.0,10.0!18.0,16.0,14.0
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
hostgroup_name flaky_servers
|
hostgroup_name flaky_servers
|
||||||
service_description remote load for flaky machines
|
|
||||||
check_command check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
|
check_command check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
|
||||||
normal_check_interval 2
|
normal_check_interval 2
|
||||||
retry_check_interval 2
|
retry_check_interval 2
|
||||||
|
@ -88,11 +98,62 @@ define service {
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
### System updates
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Debian update availability
|
||||||
|
hostgroup_name all
|
||||||
|
check_command check_debian_packages!22
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
### NTP
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check AWS NTP time
|
||||||
|
hostgroup_name aws_host
|
||||||
|
check_command check_ntp_time!22!169.254.169.123
|
||||||
|
max_check_attempts 3
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check non-AWS NTP time
|
||||||
|
hostgroup_name non_aws_host
|
||||||
|
check_command check_ntp_time!22!pool.ntp.org
|
||||||
|
max_check_attempts 5
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Service groups
|
||||||
|
|
||||||
|
#### Web
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description HTTPS
|
||||||
|
hostgroup_name web
|
||||||
|
check_command check_https_status
|
||||||
|
contact_groups page_admins
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#### zmirror / zmirrorp
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description zephyr mirror forwarding
|
service_description zephyr mirror forwarding
|
||||||
check_command check_zephyr_mirror_forwarding!22
|
|
||||||
hostgroup_name zmirror_main
|
hostgroup_name zmirror_main
|
||||||
|
check_command check_zephyr_mirror_forwarding!22
|
||||||
normal_check_interval 2
|
normal_check_interval 2
|
||||||
# Note: the actual check is run via cron, so retry_check_interval
|
# Note: the actual check is run via cron, so retry_check_interval
|
||||||
# should always equal normal_check_interval.
|
# should always equal normal_check_interval.
|
||||||
|
@ -101,185 +162,11 @@ define service {
|
||||||
contact_groups page_admins
|
contact_groups page_admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description user zephyr mirror liveness
|
|
||||||
check_command check_user_zephyr_mirror_liveness!22
|
|
||||||
hostgroup_name prod_frontends
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
hostgroup_name zmirrorp
|
|
||||||
service_description Check personal zephyr mirrors
|
|
||||||
check_command check_personal_zephyr_mirrors!22
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Debian update availability
|
|
||||||
check_command check_debian_packages!22
|
|
||||||
hostgroup_name all
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check AWS NTP time
|
|
||||||
check_command check_ntp_time!22!169.254.169.123
|
|
||||||
max_check_attempts 3
|
|
||||||
hostgroup_name aws_host
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check non-AWS NTP time
|
|
||||||
check_command check_ntp_time!22!pool.ntp.org
|
|
||||||
max_check_attempts 5
|
|
||||||
hostgroup_name non_aws_host
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check send receive time
|
|
||||||
check_command check_send_receive_time!22
|
|
||||||
max_check_attempts 2
|
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups page_admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check analytics state
|
|
||||||
check_command check_analytics_state!22
|
|
||||||
max_check_attempts 2
|
|
||||||
hostgroup_name prod_frontends
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
## check_postgres services
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL autovac_freeze
|
|
||||||
check_command check_postgres_alert_args!zulip!nagios!autovac_freeze!101%!105%
|
|
||||||
hostgroup_name postgresql_primary
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL backends
|
|
||||||
check_command check_postgres!zulip!nagios!backends
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL connection
|
|
||||||
check_command check_postgres!zulip!nagios!connection
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups page_admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL disabled triggers
|
|
||||||
check_command check_postgres!zulip!nagios!disabled_triggers
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL hitratio
|
|
||||||
check_command check_postgres!zulip!nagios!hitratio
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL locks
|
|
||||||
check_command check_postgres_alert_args!zulip!nagios!locks!400!600
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL query_time
|
|
||||||
check_command check_postgres_alert_args!zulip!nagios!query_time!20 seconds!40 seconds
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL sequence
|
|
||||||
check_command check_postgres!zulip!nagios!sequence
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL timesync
|
|
||||||
check_command check_postgres!zulip!nagios!timesync
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
# define service {
|
|
||||||
# use generic-service
|
|
||||||
# service_description Check PostgreSQL txn_idle
|
|
||||||
# check_command check_postgres_alert_args!zulip!nagios!txn_idle!20 seconds!40 seconds
|
|
||||||
# hostgroup_name postgresql
|
|
||||||
# contact_groups admins
|
|
||||||
# }
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL txn_time
|
|
||||||
check_command check_postgres_alert_args!zulip!nagios!txn_time!20 seconds!40 seconds
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check FTS update log length
|
|
||||||
check_command check_fts_update_log
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service{
|
|
||||||
use generic-service
|
|
||||||
service_description Check PostgreSQL replication lag
|
|
||||||
check_command check_postgresql_replication_lag
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check last PostgreSQL backup time
|
|
||||||
check_command check_postgresql_backup
|
|
||||||
hostgroup_name postgresql
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description zmirror subscriptions syncing
|
service_description zmirror subscriptions syncing
|
||||||
check_command check_sync_public_streams!22
|
|
||||||
hostgroup_name zmirror_main
|
hostgroup_name zmirror_main
|
||||||
|
check_command check_sync_public_streams!22
|
||||||
normal_check_interval 2
|
normal_check_interval 2
|
||||||
retry_check_interval 2
|
retry_check_interval 2
|
||||||
max_check_attempts 5
|
max_check_attempts 5
|
||||||
|
@ -288,212 +175,351 @@ define service {
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check redis service
|
service_description Check personal zephyr mirrors
|
||||||
check_command check_redis_ssh!22
|
hostgroup_name zmirrorp
|
||||||
max_check_attempts 3
|
check_command check_personal_zephyr_mirrors!22
|
||||||
hostgroup_name frontends, redis
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#### Application frontends
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check send receive time
|
||||||
|
hostgroup_name frontends
|
||||||
|
check_command check_send_receive_time!22
|
||||||
|
max_check_attempts 2
|
||||||
contact_groups page_admins
|
contact_groups page_admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check RabbitMQ queue sizes
|
service_description user zephyr mirror liveness
|
||||||
check_command check_rabbitmq_queues!22
|
hostgroup_name prod_frontends
|
||||||
# Workaround weird checks 40s after first error causing alerts
|
check_command check_user_zephyr_mirror_liveness!22
|
||||||
# from a single failure because cron hasn't run again yet
|
|
||||||
max_check_attempts 3
|
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups page_admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check for queue worker errors.
|
|
||||||
check_command check_queue_worker_errors!22
|
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check analytics state
|
||||||
|
hostgroup_name prod_frontends
|
||||||
|
check_command check_analytics_state!22
|
||||||
|
max_check_attempts 2
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#### PostgreSQL
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL connection
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgres!zulip!nagios!connection
|
||||||
|
contact_groups page_admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL autovac_freeze
|
||||||
|
hostgroup_name postgresql_primary
|
||||||
|
check_command check_postgres_alert_args!zulip!nagios!autovac_freeze!101%!105%
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL backends
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgres!zulip!nagios!backends
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL disabled triggers
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgres!zulip!nagios!disabled_triggers
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL hitratio
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgres!zulip!nagios!hitratio
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL locks
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgres_alert_args!zulip!nagios!locks!400!600
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL query_time
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgres_alert_args!zulip!nagios!query_time!20 seconds!40 seconds
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL sequence
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgres!zulip!nagios!sequence
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL timesync
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgres!zulip!nagios!timesync
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
# define service {
|
||||||
|
# use generic-service
|
||||||
|
# service_description Check PostgreSQL txn_idle
|
||||||
|
# hostgroup_name postgresql
|
||||||
|
# check_command check_postgres_alert_args!zulip!nagios!txn_idle!20 seconds!40 seconds
|
||||||
|
# contact_groups admins
|
||||||
|
# }
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL txn_time
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgres_alert_args!zulip!nagios!txn_time!20 seconds!40 seconds
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check FTS update log length
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_fts_update_log
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check PostgreSQL replication lag
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgresql_replication_lag
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check last PostgreSQL backup time
|
||||||
|
hostgroup_name postgresql
|
||||||
|
check_command check_postgresql_backup
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#### Redis
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check redis service
|
||||||
|
# Both redis and frontends hostgroups, since frontends SSH proxy redis to themselves
|
||||||
|
hostgroup_name frontends, redis
|
||||||
|
check_command check_redis_ssh!22
|
||||||
|
max_check_attempts 3
|
||||||
|
contact_groups page_admins
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#### RabbitMQ / queue workers
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check RabbitMQ queue sizes
|
||||||
|
hostgroup_name frontends
|
||||||
|
check_command check_rabbitmq_queues!22
|
||||||
|
# Workaround weird checks 40s after first error causing alerts
|
||||||
|
# from a single failure because cron hasn't run again yet
|
||||||
|
max_check_attempts 3
|
||||||
|
contact_groups page_admins
|
||||||
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check RabbitMQ notify_tornado consumers
|
service_description Check RabbitMQ notify_tornado consumers
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_rabbitmq_consumers!notify_tornado
|
check_command check_rabbitmq_consumers!notify_tornado
|
||||||
# Workaround weird checks 40s after first error causing alerts
|
# Workaround weird checks 40s after first error causing alerts
|
||||||
# from a single failure because cron hasn't run again yet
|
# from a single failure because cron hasn't run again yet
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups page_admins
|
contact_groups page_admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check RabbitMQ user_activity_interval consumers
|
service_description Check RabbitMQ user_activity_interval consumers
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_rabbitmq_consumers!user_activity_interval
|
check_command check_rabbitmq_consumers!user_activity_interval
|
||||||
# Workaround weird checks 40s after first error causing alerts
|
# Workaround weird checks 40s after first error causing alerts
|
||||||
# from a single failure because cron hasn't run again yet
|
# from a single failure because cron hasn't run again yet
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check RabbitMQ user_presence consumers
|
service_description Check RabbitMQ user_presence consumers
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_rabbitmq_consumers!user_presence
|
check_command check_rabbitmq_consumers!user_presence
|
||||||
# Workaround weird checks 40s after first error causing alerts
|
# Workaround weird checks 40s after first error causing alerts
|
||||||
# from a single failure because cron hasn't run again yet
|
# from a single failure because cron hasn't run again yet
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check RabbitMQ invites consumers
|
service_description Check RabbitMQ invites consumers
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_rabbitmq_consumers!invites
|
check_command check_rabbitmq_consumers!invites
|
||||||
# Workaround weird checks 40s after first error causing alerts
|
# Workaround weird checks 40s after first error causing alerts
|
||||||
# from a single failure because cron hasn't run again yet
|
# from a single failure because cron hasn't run again yet
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check RabbitMQ digest email consumers
|
service_description Check RabbitMQ digest email consumers
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_rabbitmq_consumers!digest_emails
|
check_command check_rabbitmq_consumers!digest_emails
|
||||||
# Workaround weird checks 40s after first error causing alerts
|
# Workaround weird checks 40s after first error causing alerts
|
||||||
# from a single failure because cron hasn't run again yet
|
# from a single failure because cron hasn't run again yet
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check RabbitMQ email mirror consumers
|
service_description Check RabbitMQ email mirror consumers
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_rabbitmq_consumers!email_mirror
|
check_command check_rabbitmq_consumers!email_mirror
|
||||||
# Workaround weird checks 40s after first error causing alerts
|
# Workaround weird checks 40s after first error causing alerts
|
||||||
# from a single failure because cron hasn't run again yet
|
# from a single failure because cron hasn't run again yet
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check RabbitMQ missedmessage mobile notifications consumers
|
service_description Check RabbitMQ missedmessage mobile notifications consumers
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_rabbitmq_consumers!missedmessage_mobile_notifications
|
check_command check_rabbitmq_consumers!missedmessage_mobile_notifications
|
||||||
# Workaround weird checks 40s after first error causing alerts
|
# Workaround weird checks 40s after first error causing alerts
|
||||||
# from a single failure because cron hasn't run again yet
|
# from a single failure because cron hasn't run again yet
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check RabbitMQ missedmessage email consumers
|
service_description Check RabbitMQ missedmessage email consumers
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_rabbitmq_consumers!missedmessage_emails
|
check_command check_rabbitmq_consumers!missedmessage_emails
|
||||||
# Workaround weird checks 40s after first error causing alerts
|
# Workaround weird checks 40s after first error causing alerts
|
||||||
# from a single failure because cron hasn't run again yet
|
# from a single failure because cron hasn't run again yet
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check RabbitMQ user activity consumers
|
service_description Check RabbitMQ user activity consumers
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_rabbitmq_consumers!user_activity
|
check_command check_rabbitmq_consumers!user_activity
|
||||||
# Workaround weird checks 40s after first error causing alerts
|
# Workaround weird checks 40s after first error causing alerts
|
||||||
# from a single failure because cron hasn't run again yet
|
# from a single failure because cron hasn't run again yet
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check embedded_bots queue processor
|
service_description Check embedded_bots queue processor
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_remote_arg_string!manage.py process_queue --queue_name=embedded_bots!1:1!1:1
|
check_command check_remote_arg_string!manage.py process_queue --queue_name=embedded_bots!1:1!1:1
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check missedmessage_emails queue processor
|
service_description Check missedmessage_emails queue processor
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_remote_arg_string!manage.py process_queue --queue_name=missedmessage_emails!1:1!1:1
|
check_command check_remote_arg_string!manage.py process_queue --queue_name=missedmessage_emails!1:1!1:1
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check email_senders queue processor
|
service_description Check email_senders queue processor
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_remote_arg_string!manage.py process_queue --queue_name=email_senders!1:1!1:1
|
check_command check_remote_arg_string!manage.py process_queue --queue_name=email_senders!1:1!1:1
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check outgoing webhooks queue processor
|
service_description Check outgoing webhooks queue processor
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_remote_arg_string!manage.py process_queue --queue_name=outgoing_webhooks!1:1!1:1
|
check_command check_remote_arg_string!manage.py process_queue --queue_name=outgoing_webhooks!1:1!1:1
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check deferred_work queue processor
|
service_description Check deferred_work queue processor
|
||||||
|
hostgroup_name frontends
|
||||||
check_command check_remote_arg_string!manage.py process_queue --queue_name=deferred_work!1:1!1:1
|
check_command check_remote_arg_string!manage.py process_queue --queue_name=deferred_work!1:1!1:1
|
||||||
max_check_attempts 3
|
max_check_attempts 3
|
||||||
hostgroup_name frontends
|
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check worker memory usage
|
service_description Check worker memory usage
|
||||||
check_command check_worker_memory
|
|
||||||
hostgroup_name frontends
|
hostgroup_name frontends
|
||||||
|
check_command check_worker_memory
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
hostgroup_name all
|
service_description Check for queue worker errors.
|
||||||
service_description swap
|
hostgroup_name frontends
|
||||||
check_command check_remote_swap!22!80%!50%
|
check_command check_queue_worker_errors!22
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Smokescreen
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description Check Smokescreen proxy
|
service_description Check Smokescreen proxy
|
||||||
check_command check_proxy_status
|
|
||||||
hostgroup_name smokescreen
|
hostgroup_name smokescreen
|
||||||
|
check_command check_proxy_status
|
||||||
contact_groups page_admins
|
contact_groups page_admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check desktop APT repository
|
|
||||||
host_name nagios
|
|
||||||
check_command check_apt_repo_status!download.zulip.com!/desktop/apt
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check chat.fhir.org cert
|
|
||||||
host_name nagios
|
|
||||||
check_command check_ssl_certificate!chat.fhir.org
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
|
@ -4,39 +4,59 @@ define host{
|
||||||
alias nagios
|
alias nagios
|
||||||
address 127.0.0.1
|
address 127.0.0.1
|
||||||
hostgroups all
|
hostgroups all
|
||||||
}
|
}
|
||||||
|
|
||||||
define service{
|
define service{
|
||||||
use generic-service
|
use generic-service
|
||||||
host_name nagios
|
host_name nagios
|
||||||
service_description Current Users
|
service_description Current Users
|
||||||
check_command check_users!20!50
|
check_command check_users!20!50
|
||||||
}
|
}
|
||||||
|
|
||||||
define service{
|
define service{
|
||||||
use generic-service
|
use generic-service
|
||||||
host_name nagios
|
host_name nagios
|
||||||
service_description Total Processes
|
service_description Total Processes
|
||||||
check_command check_procs_nokthreads!500!650
|
check_command check_procs_nokthreads!500!650
|
||||||
}
|
}
|
||||||
|
|
||||||
define service{
|
define service{
|
||||||
use generic-service
|
use generic-service
|
||||||
host_name nagios
|
host_name nagios
|
||||||
service_description Disk Space
|
service_description Disk Space
|
||||||
check_command check_local_disk!20%!10%
|
check_command check_local_disk!20%!10%
|
||||||
}
|
}
|
||||||
|
|
||||||
define service{
|
define service{
|
||||||
use generic-service
|
use generic-service
|
||||||
host_name nagios
|
host_name nagios
|
||||||
service_description Current Load
|
service_description Current Load
|
||||||
check_command check_load!7.0!6.0!5.0!10.0!8.0!6.0
|
check_command check_load!7.0!6.0!5.0!10.0!8.0!6.0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### External resources, only run on localhost
|
||||||
|
|
||||||
define service{
|
define service{
|
||||||
use generic-service
|
use generic-service
|
||||||
host_name nagios
|
host_name nagios
|
||||||
service_description Check Camo is operational
|
service_description Check Camo is operational
|
||||||
check_command check_camo!<%= @nagios_camo_check_host %>!<%= @nagios_camo_check_path %>!6!12
|
check_command check_camo!<%= @nagios_camo_check_host %>!<%= @nagios_camo_check_path %>!6!12
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check desktop APT repository
|
||||||
|
host_name nagios
|
||||||
|
check_command check_apt_repo_status!download.zulip.com!/desktop/apt
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
|
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check chat.fhir.org cert
|
||||||
|
host_name nagios
|
||||||
|
check_command check_ssl_certificate!chat.fhir.org
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue