# Nagios service definitions.
#
# Every service below inherits from the "generic-service" template and is
# attached to one or more host groups via hostgroup_name.  In check_command,
# "!" separates the command name from its positional arguments.
#
# contact_groups is either "admins" or "page_admins" throughout this file.

## Web

define service {
    use                     generic-service
    hostgroup_name          web
    service_description     HTTPS
    check_command           check_https_status
    contact_groups          page_admins
}

## SSH

define service {
    use                     generic-service
    hostgroup_name          pageable_servers
    service_description     SSH - pageable
    check_command           check_ssh
    contact_groups          page_admins
}

define service {
    use                     generic-service
    hostgroup_name          zmirror
    service_description     SSH for flaky machines
    check_command           check_ssh
    normal_check_interval   2
    retry_check_interval    2
    max_check_attempts      5
    contact_groups          admins
}

define service {
    use                     generic-service
    hostgroup_name          not_pageable_servers
    service_description     SSH
    check_command           check_ssh
    contact_groups          admins
}

## Remote disk

define service {
    use                     generic-service
    hostgroup_name          pageable_servers
    service_description     remote disk - pageable
    check_command           check_remote_disk!20%!10%
    contact_groups          page_admins
}

define service {
    use                     generic-service
    hostgroup_name          zmirror
    service_description     remote disk for flaky machines
    check_command           check_remote_disk!20%!10%
    normal_check_interval   2
    retry_check_interval    2
    max_check_attempts      5
    contact_groups          admins
}

define service {
    use                     generic-service
    hostgroup_name          not_pageable_servers
    service_description     remote disk
    check_command           check_remote_disk!20%!10%
    contact_groups          admins
}

## Remote load

define service {
    use                     generic-service
    hostgroup_name          not_pageable_servers
    service_description     remote load
    check_command           check_remote_load!22!7.0,6.0,5.0!10.0,8.0,6.0
    contact_groups          admins
}

define service {
    use                     generic-service
    hostgroup_name          pageable_servers
    service_description     remote load - pageable
    check_command           check_remote_load!22!7.0,6.0,5.0!10.0,8.0,6.0
    # NOTE(review): the other "- pageable" services in this file notify
    # page_admins; this one notifies admins.  Confirm that is intentional.
    contact_groups          admins
}

define service {
    use                     generic-service
    hostgroup_name          zmirror
    service_description     remote load for flaky machines
    check_command           check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
    normal_check_interval   2
    retry_check_interval    2
    max_check_attempts      5
    contact_groups          admins
}

## Zephyr mirroring

define service {
    use                     generic-service
    service_description     zephyr mirror forwarding
    check_command           check_zephyr_mirror_forwarding!22
    hostgroup_name          zmirror_main
    normal_check_interval   2
    # Note: the actual check is run via cron, so retry_check_interval
    # should always equal normal_check_interval.
    retry_check_interval    2
    max_check_attempts      5
    contact_groups          page_admins
}

define service {
    use                     generic-service
    service_description     user zephyr mirror liveness
    check_command           check_user_zephyr_mirror_liveness!22
    hostgroup_name          prod_frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    hostgroup_name          zmirrorp
    service_description     Check personal zephyr mirrors
    check_command           check_personal_zephyr_mirrors!22
    contact_groups          admins
}

## General host health

define service {
    use                     generic-service
    service_description     Debian update availability
    check_command           check_debian_packages!22
    hostgroup_name          all
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check NTP time
    check_command           check_ntp_time!22
    max_check_attempts      3
    hostgroup_name          all
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check send receive time
    check_command           check_send_receive_time!22
    max_check_attempts      2
    hostgroup_name          frontends
    contact_groups          page_admins
}

define service {
    use                     generic-service
    service_description     Check analytics state
    check_command           check_analytics_state!22
    max_check_attempts      2
    hostgroup_name          prod_frontends
    contact_groups          admins
}

## check_postgres services

define service {
    use                     generic-service
    service_description     Check Postgres autovac_freeze
    check_command           check_postgres!zulip!nagios!autovac_freeze
    hostgroup_name          postgresql_primary
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check Postgres backends
    check_command           check_postgres!zulip!nagios!backends
    hostgroup_name          postgresql
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check Postgres connection
    check_command           check_postgres!zulip!nagios!connection
    hostgroup_name          postgresql
    contact_groups          page_admins
}

define service {
    use                     generic-service
    service_description     Check Postgres disabled triggers
    check_command           check_postgres!zulip!nagios!disabled_triggers
    hostgroup_name          postgresql
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check Postgres hitratio
    check_command           check_postgres!zulip!nagios!hitratio
    hostgroup_name          postgresql
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check Postgres locks
    check_command           check_postgres_alert_args!zulip!nagios!locks!400!600
    hostgroup_name          postgresql
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check Postgres query_time
    check_command           check_postgres_alert_args!zulip!nagios!query_time!20 seconds!40 seconds
    hostgroup_name          postgresql
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check Postgres sequence
    check_command           check_postgres!zulip!nagios!sequence
    hostgroup_name          postgresql
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check Postgres timesync
    check_command           check_postgres!zulip!nagios!timesync
    hostgroup_name          postgresql
    contact_groups          admins
}

# Disabled: txn_idle check (kept for reference).
# define service {
#     use                     generic-service
#     service_description     Check Postgres txn_idle
#     check_command           check_postgres_alert_args!zulip!nagios!txn_idle!20 seconds!40 seconds
#     hostgroup_name          postgresql
#     contact_groups          admins
# }

define service {
    use                     generic-service
    service_description     Check Postgres txn_time
    check_command           check_postgres_alert_args!zulip!nagios!txn_time!20 seconds!40 seconds
    hostgroup_name          postgresql
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check FTS update log length
    check_command           check_fts_update_log
    hostgroup_name          postgresql
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check Postgres replication lag
    check_command           check_postgresql_replication_lag
    hostgroup_name          postgresql
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check last Postgres backup time
    check_command           check_postgresql_backup
    hostgroup_name          postgresql
    contact_groups          admins
}

## Zephyr mirror subscriptions

define service {
    use                     generic-service
    service_description     zmirror subscriptions syncing
    check_command           check_sync_public_streams!22
    hostgroup_name          zmirror_main
    normal_check_interval   2
    retry_check_interval    2
    max_check_attempts      5
    contact_groups          admins
}

## Redis

define service {
    use                     generic-service
    service_description     Check redis service
    check_command           check_redis_ssh!22
    max_check_attempts      3
    hostgroup_name          frontends, redis
    contact_groups          page_admins
}

## RabbitMQ queues and consumers

define service {
    use                     generic-service
    service_description     Check RabbitMQ queue sizes
    check_command           check_rabbitmq_queues!22
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          page_admins
}

define service {
    use                     generic-service
    service_description     Check for queue worker errors.
    check_command           check_queue_worker_errors!22
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check RabbitMQ notify_tornado consumers
    check_command           check_rabbitmq_consumers!notify_tornado
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          singletornado_frontends
    contact_groups          page_admins
}

define service {
    use                     generic-service
    service_description     Check RabbitMQ user_activity_interval consumers
    check_command           check_rabbitmq_consumers!user_activity_interval
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check RabbitMQ user_presence consumers
    check_command           check_rabbitmq_consumers!user_presence
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check RabbitMQ invites consumers
    check_command           check_rabbitmq_consumers!invites
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check RabbitMQ signups consumers
    check_command           check_rabbitmq_consumers!signups
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check RabbitMQ digest email consumers
    check_command           check_rabbitmq_consumers!digest_emails
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check RabbitMQ email mirror consumers
    check_command           check_rabbitmq_consumers!email_mirror
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check RabbitMQ missedmessage mobile notifications consumers
    check_command           check_rabbitmq_consumers!missedmessage_mobile_notifications
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check RabbitMQ missedmessage email consumers
    check_command           check_rabbitmq_consumers!missedmessage_email
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check RabbitMQ user activity consumers
    check_command           check_rabbitmq_consumers!user_activity
    # Workaround weird checks 40s after first error causing alerts
    # from a single failure because cron hasn't run again yet
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

## Queue processor processes
# Each check passes a "manage.py process_queue --queue_name=..." string and
# the range 1:1 twice as arguments to check_remote_arg_string.

define service {
    use                     generic-service
    service_description     Check embedded_bots queue processor
    check_command           check_remote_arg_string!manage.py process_queue --queue_name=embedded_bots!1:1!1:1
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check missedmessage_emails queue processor
    check_command           check_remote_arg_string!manage.py process_queue --queue_name=missedmessage_emails!1:1!1:1
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check email_senders queue processor
    check_command           check_remote_arg_string!manage.py process_queue --queue_name=email_senders!1:1!1:1
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check outgoing webhooks queue processor
    check_command           check_remote_arg_string!manage.py process_queue --queue_name=outgoing_webhooks!1:1!1:1
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    service_description     Check deferred_work queue processor
    check_command           check_remote_arg_string!manage.py process_queue --queue_name=deferred_work!1:1!1:1
    max_check_attempts      3
    hostgroup_name          frontends
    contact_groups          admins
}

## Memory and swap

define service {
    use                     generic-service
    service_description     Check worker memory usage
    check_command           check_worker_memory
    hostgroup_name          frontends
    contact_groups          admins
}

define service {
    use                     generic-service
    hostgroup_name          all
    service_description     swap
    check_command           check_remote_swap!22!80%!50%
    contact_groups          admins
}

## Email deliverer (staging frontends)

define service {
    use                     generic-service
    hostgroup_name          staging_frontends
    service_description     Check email deliverer process
    check_command           check_email_deliverer_process
    contact_groups          admins
}

define service {
    use                     generic-service
    hostgroup_name          staging_frontends
    service_description     Check email deliverer backlog
    check_command           check_email_deliverer_backlog
    contact_groups          admins
}