diff --git a/puppet/kandra/files/nagios4/commands.cfg b/puppet/kandra/files/nagios4/commands.cfg index f1accba482..761f019168 100644 --- a/puppet/kandra/files/nagios4/commands.cfg +++ b/puppet/kandra/files/nagios4/commands.cfg @@ -19,6 +19,17 @@ define command{ command_line /usr/bin/printf "%b" "Subject: $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$\n\n***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n$LONGSERVICEOUTPUT$\n" | /usr/bin/msmtp -C /var/lib/nagios/msmtprc -vt $CONTACTEMAIL$ } +# Zulip commands +define command { + command_name notify-host-by-zulip + command_line /usr/local/share/zulip/integrations/nagios/nagios-notify-zulip --stream="kandra ops" --type="$NOTIFICATIONTYPE$" --host="$HOSTADDRESS$" --state="$HOSTSTATE$" --output="$HOSTOUTPUT$" --long-output="$LONGHOSTOUTPUT$" +} + +define command { + command_name notify-service-by-zulip + command_line /usr/local/share/zulip/integrations/nagios/nagios-notify-zulip --stream="kandra ops" --type="$NOTIFICATIONTYPE$" --host="$HOSTADDRESS$" --service="$SERVICEDESC$" --state="$SERVICESTATE$" --output="$SERVICEOUTPUT$" --long-output="$LONGSERVICEOUTPUT$" +} + ################################################################################ # HOST CHECK COMMANDS ################################################################################ diff --git a/puppet/kandra/files/nagios4/conf.d/services.cfg b/puppet/kandra/files/nagios4/conf.d/services.cfg index bb2f81e34e..9cc3a6e52b 100644 --- a/puppet/kandra/files/nagios4/conf.d/services.cfg +++ b/puppet/kandra/files/nagios4/conf.d/services.cfg @@ -35,7 +35,7 @@ define service { service_description Disk usage - pageable hostgroup_name pageable_servers check_command check_remote_disk!20%!10% - contact_groups page_admins + contact_groups ops_message } define service { @@ 
-43,7 +43,7 @@ define service { service_description Disk usage hostgroup_name not_pageable_servers check_command check_remote_disk!20%!10% - contact_groups admins + contact_groups ops_message } define service { @@ -135,20 +135,37 @@ define service { define service { use generic-service service_description HTTPS - hostgroup_name frontends + hostgroup_name prod_frontends, fullstack check_command check_https_status contact_groups page_admins } +define service { + use generic-service + service_description HTTPS + hostgroup_name staging_frontends + check_command check_https_status + contact_groups ops_message +} + define service { use generic-service service_description Check send receive time - hostgroup_name frontends + hostgroup_name prod_frontends, fullstack check_command check_send_receive_time!22 max_check_attempts 2 contact_groups page_admins } +define service { + use generic-service + service_description Check send receive time + hostgroup_name staging_frontends + check_command check_send_receive_time!22 + max_check_attempts 2 + contact_groups ops_message +} + define service { use generic-service service_description user zephyr mirror liveness @@ -280,19 +297,27 @@ define service { use generic-service service_description Check redis service # Both redis and frontends hostgroups, since frontends SSH proxy redis to themselves - hostgroup_name frontends, redis + hostgroup_name prod_frontends, fullstack, redis check_command check_redis_ssh!22 max_check_attempts 3 contact_groups page_admins } +define service { + use generic-service + service_description Check redis service + hostgroup_name staging_frontends + check_command check_redis_ssh!22 + max_check_attempts 3 + contact_groups ops_message +} #### RabbitMQ / queue workers define service { use generic-service service_description Check RabbitMQ queue sizes - hostgroup_name frontends + hostgroup_name prod_frontends, fullstack check_command check_rabbitmq_queues!22 # Workaround weird checks 40s after first error 
causing alerts # from a single failure because cron hasn't run again yet @@ -300,6 +325,17 @@ define service { contact_groups page_admins } +define service { + use generic-service + service_description Check RabbitMQ queue sizes + hostgroup_name staging_frontends + check_command check_rabbitmq_queues!22 + # Workaround weird checks 40s after first error causing alerts + # from a single failure because cron hasn't run again yet + max_check_attempts 3 + contact_groups ops_message +} + define service { name rabbitmq-consumer-service use generic-service @@ -315,10 +351,18 @@ define service { define service { use rabbitmq-consumer-service service_description Check RabbitMQ notify_tornado consumers + hostgroup_name prod_frontends, fullstack check_command check_rabbitmq_consumers!notify_tornado contact_groups page_admins } +define service { + use rabbitmq-consumer-service + service_description Check RabbitMQ notify_tornado consumers + hostgroup_name staging_frontends + check_command check_rabbitmq_consumers!notify_tornado + contact_groups ops_message +} define service { use rabbitmq-consumer-service diff --git a/puppet/kandra/templates/nagios4/contacts.cfg.template.erb b/puppet/kandra/templates/nagios4/contacts.cfg.template.erb index 502648ecbb..75cbc02eed 100644 --- a/puppet/kandra/templates/nagios4/contacts.cfg.template.erb +++ b/puppet/kandra/templates/nagios4/contacts.cfg.template.erb @@ -48,6 +48,17 @@ define contact { host_notification_commands notify-host-by-email } +define contact { + contact_name kandra-ops-in-czo + alias Notify kandra-ops on chat.zulip.org + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,r + service_notification_commands notify-service-by-zulip + host_notification_commands notify-host-by-zulip +} + ############################################################################### ############################################################################### # @@ -68,6 
+79,12 @@ define contactgroup{ members monitoring,pager } +define contactgroup{ + contactgroup_name ops_message + alias Message admins on CZO + members monitoring,kandra-ops-in-czo +} + define contactgroup{ contactgroup_name test alias Nagios Test Administrators