nagios: Set max_check_attempts to 3 for rabbitmq consumers.

This works around the fact that we seem to have a mysterious extra
checkup 40s after an error first occurs with these checks, which
always fails because the data is updated by a cron job that runs every
minute.

(imported from commit e7fe9c85e8399115443269287e695b140b4443ff)
This commit is contained in:
Tim Abbott 2013-09-03 18:03:51 -04:00
parent 5b8aa359dd
commit ddc9c53e1b
1 changed files with 15 additions and 0 deletions

View File

@ -256,6 +256,9 @@ define service {
use generic-service
service_description Check rabbitmq queue sizes
check_command check_rabbitmq_queues!22
# Work around weird checks 40s after the first error that cause alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups page_admins
}
@ -264,6 +267,9 @@ define service {
use generic-service
service_description Check rabbitmq notify_tornado consumers
check_command check_rabbitmq_tornado_consumers!22
# Work around weird checks 40s after the first error that cause alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups page_admins
}
@ -272,6 +278,9 @@ define service {
use generic-service
service_description Check rabbitmq useractivity consumers
check_command check_rabbitmq_useractivity_consumers!22
# Work around weird checks 40s after the first error that cause alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
@ -280,6 +289,9 @@ define service {
use generic-service
service_description Check rabbitmq invites consumers
check_command check_rabbitmq_invites_consumers!22
# Work around weird checks 40s after the first error that cause alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}
@ -288,6 +300,9 @@ define service {
use generic-service
service_description Check rabbitmq signups consumers
check_command check_rabbitmq_signups_consumers!22
# Work around weird checks 40s after the first error that cause alerts
# from a single failure because cron hasn't run again yet
max_check_attempts 3
hostgroup_name frontends
contact_groups admins
}