From 83c82c8e15e7e693404a91433272a430074a04ab Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Mon, 13 Jun 2022 16:44:57 -0700 Subject: [PATCH] nagios: Adjust load alerting by hostgroup. Even the `pageable_servers` group did not page for high load -- in part because what counts as "high" depends on the servers. Set slightly better limits based on server role. --- .../files/nagios4/conf.d/services.cfg | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/puppet/zulip_ops/files/nagios4/conf.d/services.cfg b/puppet/zulip_ops/files/nagios4/conf.d/services.cfg index c4f6723b05..ef0bd0c0d1 100644 --- a/puppet/zulip_ops/files/nagios4/conf.d/services.cfg +++ b/puppet/zulip_ops/files/nagios4/conf.d/services.cfg @@ -74,22 +74,30 @@ define service { define service { use generic-service service_description Load - hostgroup_name pageable_servers - check_command check_remote_load!22!14.0,12.0,10.0!18.0,16.0,14.0 - contact_groups admins -} - -define service { - use generic-service - service_description Load - hostgroup_name not_pageable_servers + hostgroup_name fullstack, redis, staging_frontends, smokescreen check_command check_remote_load!22!7.0,6.0,5.0!10.0,8.0,6.0 contact_groups admins } define service { use generic-service - service_description Load for flaky machines + service_description Load + hostgroup_name postgresql + check_command check_remote_load!22!9.0,8.0,7.0!11.0,10.0,9.0 + contact_groups admins +} + +define service { + use generic-service + service_description Load + hostgroup_name prod_frontends + check_command check_remote_load!22!15.0,14.0,12.0!18.0,16.0,14.0 + contact_groups admins +} + +define service { + use generic-service + service_description Load hostgroup_name flaky_servers check_command check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0 normal_check_interval 2