# Nagios service definitions.
#
# Every service inherits from the `generic-service` template and is
# attached to hosts via host groups.  Two contact groups are used:
# `admins` and `page_admins` (presumably the latter pages the on-call
# admin -- confirm against the contact group definitions).
#
# Check commands take `!`-separated arguments; the meaning of each
# argument is determined by the corresponding command definition, which
# is not part of this file.

## HTTPS / SSH reachability

define service {
        use                             generic-service
        hostgroup_name                  web
        service_description             HTTPS
        check_command                   check_https_status
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        hostgroup_name                  pageable_servers
        service_description             SSH - pageable
        check_command                   check_ssh
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        hostgroup_name                  zmirror
        service_description             SSH for flaky machines
        check_command                   check_ssh
        normal_check_interval           2
        retry_check_interval            2
        max_check_attempts              5
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  not_pageable_servers
        service_description             SSH
        check_command                   check_ssh
        contact_groups                  admins
}

## Remote disk usage

define service {
        use                             generic-service
        hostgroup_name                  pageable_servers
        service_description             remote disk - pageable
        check_command                   check_remote_disk!20%!10%
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        hostgroup_name                  zmirror
        service_description             remote disk for flaky machines
        check_command                   check_remote_disk!20%!10%
        normal_check_interval           2
        retry_check_interval            2
        max_check_attempts              5
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  not_pageable_servers
        service_description             remote disk
        check_command                   check_remote_disk!20%!10%
        contact_groups                  admins
}

## Remote load

define service {
        use                             generic-service
        hostgroup_name                  not_pageable_servers
        service_description             remote load
        check_command                   check_remote_load!22!7.0,6.0,5.0!10.0,8.0,6.0
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  pageable_servers
        service_description             remote load - pageable
        check_command                   check_remote_load!22!7.0,6.0,5.0!10.0,8.0,6.0
        # Pageable variants notify page_admins, matching the
        # "SSH - pageable" and "remote disk - pageable" services.
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        hostgroup_name                  zmirror
        service_description             remote load for flaky machines
        check_command                   check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
        normal_check_interval           2
        retry_check_interval            2
        max_check_attempts              5
        contact_groups                  admins
}

## Zephyr mirror checks

define service {
        use                             generic-service
        service_description             zephyr mirror forwarding
        check_command                   check_zephyr_mirror_forwarding!22
        hostgroup_name                  zmirror_main
        normal_check_interval           2
        # Note: the actual check is run via cron, so retry_check_interval
        # should always equal normal_check_interval.
        retry_check_interval            2
        max_check_attempts              5
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        service_description             user zephyr mirror liveness
        check_command                   check_user_zephyr_mirror_liveness!22
        hostgroup_name                  prod_frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  zmirrorp
        service_description             Check personal zephyr mirrors
        check_command                   check_personal_zephyr_mirrors!22
        contact_groups                  admins
}

## General host health and application-level checks

define service {
        use                             generic-service
        service_description             Debian update availability
        check_command                   check_debian_packages!22
        hostgroup_name                  all
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check NTP time
        check_command                   check_ntp_time!22
        max_check_attempts              3
        hostgroup_name                  all
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check send receive time
        check_command                   check_send_receive_time!22
        max_check_attempts              2
        hostgroup_name                  frontends
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        service_description             Check analytics state
        check_command                   check_analytics_state!22
        max_check_attempts              2
        hostgroup_name                  prod_frontends
        contact_groups                  admins
}

## check_postgres.pl services

define service {
        use                             generic-service
        service_description             Check postgres autovac_freeze
        check_command                   check_postgres!zulip!nagios!autovac_freeze
        hostgroup_name                  postgres_appdb_primary
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres backends
        check_command                   check_postgres!zulip!nagios!backends
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres connection
        check_command                   check_postgres!zulip!nagios!connection
        hostgroup_name                  postgres_appdb
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        service_description             Check postgres disabled triggers
        check_command                   check_postgres!zulip!nagios!disabled_triggers
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres hitratio
        check_command                   check_postgres!zulip!nagios!hitratio
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres locks
        check_command                   check_postgres_alert_args!zulip!nagios!locks!400!600
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres query_time
        check_command                   check_postgres_alert_args!zulip!nagios!query_time!20 seconds!40 seconds
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres sequence
        check_command                   check_postgres!zulip!nagios!sequence
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres timesync
        check_command                   check_postgres!zulip!nagios!timesync
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

# define service {
#         use                             generic-service
#         service_description             Check postgres txn_idle
#         check_command                   check_postgres_alert_args!zulip!nagios!txn_idle!20 seconds!40 seconds
#         hostgroup                       postgres_appdb
#         contact_groups                  admins
# }

define service {
        use                             generic-service
        service_description             Check postgres txn_time
        check_command                   check_postgres_alert_args!zulip!nagios!txn_time!20 seconds!40 seconds
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check FTS update log length
        check_command                   check_fts_update_log
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check postgres replication lag
        check_command                   check_postgres_replication_lag
        hostgroup_name                  postgres_appdb
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check last Postgres backup time
        check_command                   check_postgres_backup
        hostgroup_name                  postgres
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             zmirror subscriptions syncing
        check_command                   check_sync_public_streams!22
        hostgroup_name                  zmirror_main
        normal_check_interval           2
        retry_check_interval            2
        max_check_attempts              5
        contact_groups                  admins
}

## Redis, memcached and RabbitMQ

define service {
        use                             generic-service
        service_description             Check redis service
        check_command                   check_redis_ssh!22
        max_check_attempts              3
        hostgroup_name                  frontends, redis
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        service_description             Check memcached service
        check_command                   check_memcached_ssh!22
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq queue sizes
        check_command                   check_rabbitmq_queues!22
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        service_description             Check for queue worker errors.
        check_command                   check_queue_worker_errors!22
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq notify_tornado consumers
        check_command                   check_rabbitmq_consumers!notify_tornado
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  singletornado_frontends
        contact_groups                  page_admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq user_activity_interval consumers
        check_command                   check_rabbitmq_consumers!user_activity_interval
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq user_presence consumers
        check_command                   check_rabbitmq_consumers!user_presence
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq invites consumers
        check_command                   check_rabbitmq_consumers!invites
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq signups consumers
        check_command                   check_rabbitmq_consumers!signups
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq digest email consumers
        check_command                   check_rabbitmq_consumers!digest_emails
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq email mirror consumers
        check_command                   check_rabbitmq_consumers!email_mirror
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check rabbitmq missedmessage mobile notifications consumers
        check_command                   check_rabbitmq_consumers!missedmessage_mobile_notifications
        # Workaround weird checks 40s after first error causing alerts
        # from a single failure because cron hasn't run again yet
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

# The following queue workers batch-process events and thus can't be
# monitored by checking for running consumers:
#
# user_activity, missedmessage_emails

## Queue processor checks
## (each passes the "manage.py process_queue --queue_name=..." command
## line to check_remote_arg_string with the arguments 1:1 and 1:1)

define service {
        use                             generic-service
        service_description             Check embedded_bots queue processor
        check_command                   check_remote_arg_string!manage.py process_queue --queue_name=embedded_bots!1:1!1:1
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check missedmessage_emails queue processor
        check_command                   check_remote_arg_string!manage.py process_queue --queue_name=missedmessage_emails!1:1!1:1
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check email_senders queue processor
        check_command                   check_remote_arg_string!manage.py process_queue --queue_name=email_senders!1:1!1:1
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check outgoing webhooks queue processor
        check_command                   check_remote_arg_string!manage.py process_queue --queue_name=outgoing_webhooks!1:1!1:1
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        service_description             Check deferred_work queue processor
        check_command                   check_remote_arg_string!manage.py process_queue --queue_name=deferred_work!1:1!1:1
        max_check_attempts              3
        hostgroup_name                  frontends
        contact_groups                  admins
}

## Memory, swap and email delivery

define service {
        use                             generic-service
        service_description             Check worker memory usage
        check_command                   check_worker_memory
        hostgroup_name                  frontends
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  all
        service_description             swap
        check_command                   check_remote_swap!22!80%!50%
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  staging_frontends
        service_description             Check email deliverer process
        check_command                   check_email_deliverer_process
        contact_groups                  admins
}

define service {
        use                             generic-service
        hostgroup_name                  staging_frontends
        service_description             Check email deliverer backlog
        check_command                   check_email_deliverer_backlog
        contact_groups                  admins
}