# Nagios service definitions.
#
# Every service below inherits from the "generic-service" template, binds one
# check_command to a host (host_name) or group of hosts (hostgroup_name), and
# names the contact group that is notified about it.
#
# Conventions used in this file:
#   * One directive per line; '#' comments occupy their own line.
#   * Targets are always spelled with the documented directive names
#     host_name / hostgroup_name (not the short aliases host / hostgroup).
#   * Several basic checks are defined twice, once for pageable_servers and
#     once for not_pageable_servers, with "- pageable" appended to the
#     service_description of the pageable variant.

## Basic reachability and resource checks

define service {
        use                             generic-service
        hostgroup_name                  web
        service_description             HTTP
        check_command                   check_http
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        hostgroup_name                  pageable_servers
        service_description             SSH - pageable
        check_command                   check_ssh
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        hostgroup_name                  not_pageable_servers
        service_description             SSH
        check_command                   check_ssh
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  pageable_servers
        service_description             remote disk - pageable
        check_command                   check_remote_disk!20%!10%
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        hostgroup_name                  not_pageable_servers
        service_description             remote disk
        check_command                   check_remote_disk!20%!10%
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  not_pageable_servers
        service_description             remote load
        check_command                   check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  pageable_servers
        service_description             remote load - pageable
        check_command                   check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
        # NOTE(review): the other "- pageable" services in this file notify
        # page_admins; this one notifies admins. Confirm that is intentional.
        contact_groups                  admins
}

## Zephyr mirror checks

define service {
        use                             generic-service
        service_description             zephyr mirror forwarding
        check_command                   check_zephyr_mirror_forwarding!22
        host_name                       zmirror
        normal_check_interval           2
        # Note: the actual check is run via cron, so retry_check_interval
        # should always equal normal_check_interval.
        retry_check_interval            2
        max_check_attempts              2
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             user zephyr mirror liveness
        check_command                   check_user_zephyr_mirror_liveness!22
        host_name                       prod0
        contact_groups                  admins
}

define service {
        use                             generic-service
        host_name                       zmirror2
        service_description             Check personal zephyr mirrors
        check_command                   check_personal_zephyr_mirrors!22
        contact_groups                  admins
}

## Checks applied to broad host groups

define service {
        use                             generic-service
        service_description             Debian update availability
        check_command                   check_debian_packages!22
        hostgroup_name                  all
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check NTP time
        check_command                   check_ntp_time!22
        hostgroup_name                  all
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check send receive time
        check_command                   check_send_receive_time!22
        hostgroup_name                  frontends
        contact_groups                  admins
}

## check_postgres.pl services

define service {
        use                             generic-service
        service_description             Check postgres autovac_freeze
        check_command                   check_postgres!zulip!zulip!autovac_freeze
        hostgroup_name                  postgres_appdb_primary
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres backends
        check_command                   check_postgres!zulip!zulip!backends
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres connection
        check_command                   check_postgres!zulip!zulip!connection
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres hitratio
        check_command                   check_postgres!zulip!zulip!hitratio
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres locks
        check_command                   check_postgres_alert_args!zulip!zulip!locks!100!200
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres query_time
        check_command                   check_postgres_alert_args!zulip!zulip!query_time!20 seconds!40 seconds
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres sequence
        check_command                   check_postgres!zulip!zulip!sequence
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres timesync
        check_command                   check_postgres!zulip!zulip!timesync
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

# The txn_idle and txn_time checks below are currently disabled.

# define service {
#         use                             generic-service
#         service_description             Check postgres txn_idle
#         check_command                   check_postgres_alert_args!zulip!zulip!txn_idle!20 seconds!40 seconds
#         hostgroup_name                  postgres
#         contact_groups                  admins
# }

# define service {
#         use                             generic-service
#         service_description             Check postgres txn_time
#         check_command                   check_postgres_alert_args!zulip!zulip!txn_time!20 seconds!40 seconds
#         hostgroup_name                  postgres
#         contact_groups                  admins
# }

## non-appdb check_postgres.pl checks

define service {
        use                             generic-service
        service_description             Check postgres autovac_freeze
        check_command                   check_postgres!wiki,trac!zulip!autovac_freeze
        host_name                       trac
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres connection
        check_command                   check_postgres!wiki,trac!zulip!connection
        host_name                       trac
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres backends
        check_command                   check_postgres!wiki,trac!zulip!backends
        host_name                       trac
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres hitratio
        check_command                   check_postgres!wiki,trac!zulip!hitratio
        host_name                       trac
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres locks
        check_command                   check_postgres!wiki,trac!zulip!locks
        host_name                       trac
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres timesync
        check_command                   check_postgres!wiki,trac!zulip!timesync
        host_name                       trac
        contact_groups                  admins
}

## Other database checks

define service {
        use                             generic-service
        service_description             Check FTS update log length
        check_command                   check_fts_update_log
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check last Postgres backup time
        check_command                   check_postgres_backup
        hostgroup_name                  postgres
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             zmirror subscriptons syncing
        check_command                   check_sync_public_streams!22
        hostgroup_name                  zmirror
        contact_groups                  page_admins
}

## RabbitMQ and queue worker checks (frontends)

define service {
        use                             generic-service
        service_description             Check rabbitmq queue sizes
        check_command                   check_rabbitmq_queues!22
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        service_description             Check for queue worker errors.
        check_command                   check_queue_worker_errors!22
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq notify_tornado consumers
        check_command                   check_rabbitmq_consumers!notify_tornado
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq user_activity consumers
        check_command                   check_rabbitmq_consumers!user_activity
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq user_activity_interval consumers
        check_command                   check_rabbitmq_consumers!user_activity_interval
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq user_presence consumers
        check_command                   check_rabbitmq_consumers!user_presence
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq invites consumers
        check_command                   check_rabbitmq_consumers!invites
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq signups consumers
        check_command                   check_rabbitmq_consumers!signups
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check missedmessage_emails queue processor
        check_command                   check_remote_arg_string!manage.py process_queue missedmessage_emails!1:1!1:1
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check slow_queries queue processor
        check_command                   check_remote_arg_string!manage.py process_queue slow_queries!1:1!1:1
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check worker memory usage
        check_command                   check_worker_memory
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  all
        service_description             swap
        check_command                   check_remote_swap!22!80%!50%
        contact_groups                  admins
}

## Email checks (host "staging")

define service {
        use                             generic-service
        host_name                       staging
        service_description             Check email mirror
        check_command                   check_email_mirror!22
        contact_groups                  admins
}

define service {
        use                             generic-service
        host_name                       staging
        service_description             Check email deliverer process which is only used on Zulip Enterprise
        check_command                   check_email_deliverer_process
        contact_groups                  admins
}

define service {
        use                             generic-service
        host_name                       staging
        service_description             Check email deliverer backlog which is only used on Zulip Enterprise
        check_command                   check_email_deliverer_backlog
        contact_groups                  admins
}