nagios: Switch staging hosts to not page, but send a zulip.

This commit is contained in:
Alex Vandiver 2024-10-08 18:25:01 -04:00 committed by Tim Abbott
parent ba8b9a445b
commit f325e15439
3 changed files with 78 additions and 6 deletions

View File

@ -19,6 +19,17 @@ define command{
command_line /usr/bin/printf "%b" "Subject: $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$\n\n***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n$LONGSERVICEOUTPUT$\n" | /usr/bin/msmtp -C /var/lib/nagios/msmtprc -vt $CONTACTEMAIL$ command_line /usr/bin/printf "%b" "Subject: $NOTIFICATIONTYPE$ Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$\n\n***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n$LONGSERVICEOUTPUT$\n" | /usr/bin/msmtp -C /var/lib/nagios/msmtprc -vt $CONTACTEMAIL$
} }
# Zulip commands
define command {
        # Host notification command: runs the nagios-notify-zulip integration
        # script with --stream="kandra ops", passing the notification type,
        # host address, host state, and the short and long check output.
        command_line    /usr/local/share/zulip/integrations/nagios/nagios-notify-zulip --stream="kandra ops" --type="$NOTIFICATIONTYPE$" --host="$HOSTADDRESS$" --state="$HOSTSTATE$" --output="$HOSTOUTPUT$" --long-output="$LONGHOSTOUTPUT$"
        command_name    notify-host-by-zulip
}
define command {
        # Service notification command: same script and stream as
        # notify-host-by-zulip, but additionally passes --service and uses the
        # service-level state/output macros instead of the host-level ones.
        command_line    /usr/local/share/zulip/integrations/nagios/nagios-notify-zulip --stream="kandra ops" --type="$NOTIFICATIONTYPE$" --host="$HOSTADDRESS$" --service="$SERVICEDESC$" --state="$SERVICESTATE$" --output="$SERVICEOUTPUT$" --long-output="$LONGSERVICEOUTPUT$"
        command_name    notify-service-by-zulip
}
################################################################################ ################################################################################
# HOST CHECK COMMANDS # HOST CHECK COMMANDS
################################################################################ ################################################################################

View File

@ -35,7 +35,7 @@ define service {
service_description Disk usage - pageable service_description Disk usage - pageable
hostgroup_name pageable_servers hostgroup_name pageable_servers
check_command check_remote_disk!20%!10% check_command check_remote_disk!20%!10%
contact_groups page_admins contact_groups ops_message
} }
define service { define service {
@ -43,7 +43,7 @@ define service {
service_description Disk usage service_description Disk usage
hostgroup_name not_pageable_servers hostgroup_name not_pageable_servers
check_command check_remote_disk!20%!10% check_command check_remote_disk!20%!10%
contact_groups admins contact_groups ops_message
} }
define service { define service {
@ -135,20 +135,37 @@ define service {
define service { define service {
use generic-service use generic-service
service_description HTTPS service_description HTTPS
hostgroup_name frontends hostgroup_name prod_frontends, fullstack
check_command check_https_status check_command check_https_status
contact_groups page_admins contact_groups page_admins
} }
define service {
        # Staging counterpart of the HTTPS check: same check command, but
        # notifications go to ops_message rather than page_admins.
        hostgroup_name          staging_frontends
        service_description     HTTPS
        use                     generic-service
        check_command           check_https_status
        contact_groups          ops_message
}
define service { define service {
use generic-service use generic-service
service_description Check send receive time service_description Check send receive time
hostgroup_name frontends hostgroup_name prod_frontends, fullstack
check_command check_send_receive_time!22 check_command check_send_receive_time!22
max_check_attempts 2 max_check_attempts 2
contact_groups page_admins contact_groups page_admins
} }
define service {
        # Staging counterpart of the send/receive timing check; identical
        # check command and attempt count, notifying ops_message instead of
        # page_admins.
        hostgroup_name          staging_frontends
        service_description     Check send receive time
        use                     generic-service
        check_command           check_send_receive_time!22
        contact_groups          ops_message
        max_check_attempts      2
}
define service { define service {
use generic-service use generic-service
service_description user zephyr mirror liveness service_description user zephyr mirror liveness
@ -280,19 +297,27 @@ define service {
use generic-service use generic-service
service_description Check redis service service_description Check redis service
# Both redis and frontends hostgroups, since frontends SSH proxy redis to themselves # Both redis and frontends hostgroups, since frontends SSH proxy redis to themselves
hostgroup_name frontends, redis hostgroup_name prod_frontends, fullstack, redis
check_command check_redis_ssh!22 check_command check_redis_ssh!22
max_check_attempts 3 max_check_attempts 3
contact_groups page_admins contact_groups page_admins
} }
define service {
        # Staging counterpart of the redis-over-SSH check; notifies
        # ops_message instead of page_admins.
        hostgroup_name          staging_frontends
        service_description     Check redis service
        use                     generic-service
        check_command           check_redis_ssh!22
        contact_groups          ops_message
        max_check_attempts      3
}
#### RabbitMQ / queue workers #### RabbitMQ / queue workers
define service { define service {
use generic-service use generic-service
service_description Check RabbitMQ queue sizes service_description Check RabbitMQ queue sizes
hostgroup_name frontends hostgroup_name prod_frontends, fullstack
check_command check_rabbitmq_queues!22 check_command check_rabbitmq_queues!22
# Workaround weird checks 40s after first error causing alerts # Workaround weird checks 40s after first error causing alerts
# from a single failure because cron hasn't run again yet # from a single failure because cron hasn't run again yet
@ -300,6 +325,17 @@ define service {
contact_groups page_admins contact_groups page_admins
} }
define service {
        # Staging counterpart of the RabbitMQ queue-size check; notifies
        # ops_message instead of page_admins.
        hostgroup_name          staging_frontends
        service_description     Check RabbitMQ queue sizes
        use                     generic-service
        check_command           check_rabbitmq_queues!22
        contact_groups          ops_message
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts      3
}
define service { define service {
name rabbitmq-consumer-service name rabbitmq-consumer-service
use generic-service use generic-service
@ -315,10 +351,18 @@ define service {
define service { define service {
use rabbitmq-consumer-service use rabbitmq-consumer-service
service_description Check RabbitMQ notify_tornado consumers service_description Check RabbitMQ notify_tornado consumers
hostgroup_name prod_frontends, fullstack
check_command check_rabbitmq_consumers!notify_tornado check_command check_rabbitmq_consumers!notify_tornado
contact_groups page_admins contact_groups page_admins
} }
define service {
        # Staging counterpart of the notify_tornado consumer check.
        # Notifies ops_message, matching every other staging_frontends
        # service (HTTPS, send/receive time, redis, RabbitMQ queue sizes);
        # it previously used "admins", which left this one staging alert
        # out of the Zulip notification path.
        use                     rabbitmq-consumer-service
        service_description     Check RabbitMQ notify_tornado consumers
        hostgroup_name          staging_frontends
        check_command           check_rabbitmq_consumers!notify_tornado
        contact_groups          ops_message
}
define service { define service {
use rabbitmq-consumer-service use rabbitmq-consumer-service

View File

@ -48,6 +48,17 @@ define contact {
host_notification_commands notify-host-by-email host_notification_commands notify-host-by-email
} }
define contact {
        # Contact whose notifications are delivered by the
        # notify-*-by-zulip commands instead of email.
        contact_name                    kandra-ops-in-czo
        alias                           Notify kandra-ops on chat.zulip.org

        # Host notifications: 24x7, on down (d) and recovery (r).
        host_notification_period        24x7
        host_notification_options       d,r
        host_notification_commands      notify-host-by-zulip

        # Service notifications: 24x7, on warning (w), unknown (u),
        # critical (c) and recovery (r).
        service_notification_period     24x7
        service_notification_options    w,u,c,r
        service_notification_commands   notify-service-by-zulip
}
############################################################################### ###############################################################################
############################################################################### ###############################################################################
# #
@ -68,6 +79,12 @@ define contactgroup{
members monitoring,pager members monitoring,pager
} }
define contactgroup{
        # Group for alerts that should produce a chat message rather than a
        # page: the monitoring contact plus the kandra-ops-in-czo contact.
        contactgroup_name       ops_message
        members                 monitoring,kandra-ops-in-czo
        alias                   Message admins on CZO
}
define contactgroup{ define contactgroup{
contactgroup_name test contactgroup_name test
alias Nagios Test Administrators alias Nagios Test Administrators