# Nagios service definitions.
#
# Formatting rules followed throughout this file:
#   * one directive per line inside each "define service { ... }" block
#   * comments live on their own lines (a '#' only starts a comment at the
#     beginning of a line, so it must never trail a directive)
#   * the canonical directive names host_name / hostgroup_name are used
#     instead of the host / hostgroup aliases
#
# Every service inherits from the generic-service template, which is defined
# outside this file. Services notify either the admins or the page_admins
# contact group.
#
# NOTE(review): most check_command lines pass 22 as their first argument.
# This is presumably the SSH port used by the remote check wrappers. Confirm
# against the command definitions.

###############################################################################
# Basic reachability and host resources
###############################################################################

define service {
        use                     generic-service
        hostgroup_name          web
        service_description     HTTP
        check_command           check_http
        contact_groups          page_admins
}

define service {
        use                     generic-service
        hostgroup_name          pageable_servers
        service_description     SSH - pageable
        check_command           check_ssh
        contact_groups          page_admins
}

define service {
        use                     generic-service
        hostgroup_name          not_pageable_servers
        service_description     SSH
        check_command           check_ssh
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          pageable_servers
        service_description     remote disk - pageable
        check_command           check_remote_disk!22!20%!10%!/
        contact_groups          page_admins
}

define service {
        use                     generic-service
        hostgroup_name          not_pageable_servers
        service_description     remote disk
        check_command           check_remote_disk!22!20%!10%!/
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          not_pageable_servers
        service_description     remote load
        check_command           check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
        contact_groups          admins
}

# NOTE(review): unlike "SSH - pageable" and "remote disk - pageable", this
# service notifies admins rather than page_admins. Left unchanged because it
# may be deliberate. Confirm the intended paging policy.
define service {
        use                     generic-service
        hostgroup_name          pageable_servers
        service_description     remote load - pageable
        check_command           check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
        contact_groups          admins
}

###############################################################################
# Zephyr mirrors
###############################################################################

define service {
        use                     generic-service
        host_name               zmirror
        service_description     zephyr mirror forwarding
        check_command           check_zephyr_mirror_forwarding!22
        # The actual check is run via cron, so retry_check_interval should
        # always equal normal_check_interval.
        normal_check_interval   2
        retry_check_interval    2
        max_check_attempts      2
        contact_groups          page_admins
}

define service {
        use                     generic-service
        host_name               app
        service_description     user zephyr mirror liveness
        check_command           check_user_zephyr_mirror_liveness!22
        contact_groups          admins
}

define service {
        use                     generic-service
        host_name               zmirror2
        service_description     Check personal zephyr mirrors
        check_command           check_personal_zephyr_mirrors!22
        contact_groups          admins
}

###############################################################################
# Fleet-wide and frontend checks
###############################################################################

define service {
        use                     generic-service
        hostgroup_name          all
        service_description     Debian update availability
        check_command           check_debian_packages!22
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          all
        service_description     Check NTP time
        check_command           check_ntp_time!22
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check send receive time
        check_command           check_send_receive_time!22
        contact_groups          admins
}

###############################################################################
# check_postgres.pl services
###############################################################################

# This check targets the single host postgres0, while the other
# check_postgres services below target the postgres hostgroup.
define service {
        use                     generic-service
        host_name               postgres0
        service_description     Check postgres autovac_freeze
        check_command           check_postgres!22!autovac_freeze
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          postgres
        service_description     Check postgres backends
        check_command           check_postgres!22!backends
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          postgres
        service_description     Check postgres connection
        check_command           check_postgres!22!connection
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          postgres
        service_description     Check postgres hitratio
        check_command           check_postgres!22!hitratio
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          postgres
        service_description     Check postgres locks
        check_command           check_postgres_alert_args!22!locks!100!200
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          postgres
        service_description     Check postgres query_time
        check_command           check_postgres_alert_args!22!query_time!20 seconds!40 seconds
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          postgres
        service_description     Check postgres sequence
        check_command           check_postgres!22!sequence
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          postgres
        service_description     Check postgres timesync
        check_command           check_postgres!22!timesync
        contact_groups          admins
}

# Disabled checks, kept for reference:
#
# define service {
#         use                     generic-service
#         hostgroup_name          postgres
#         service_description     Check postgres txn_idle
#         check_command           check_postgres_alert_args!22!txn_idle!20 seconds!40 seconds
#         contact_groups          admins
# }
#
# define service {
#         use                     generic-service
#         hostgroup_name          postgres
#         service_description     Check postgres txn_time
#         check_command           check_postgres_alert_args!22!txn_time!20 seconds!40 seconds
#         contact_groups          admins
# }

define service {
        use                     generic-service
        hostgroup_name          postgres
        service_description     Check FTS update log length
        check_command           check_fts_update_log
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          postgres
        service_description     Check last Postgres backup time
        check_command           check_postgres_backup
        contact_groups          admins
}

###############################################################################
# Stream sync, RabbitMQ and queue workers
###############################################################################

# NOTE(review): "subscriptons" is a typo for "subscriptions". It is left
# unchanged here because service_description is the identity of the service,
# so renaming it would affect anything that refers to the service by name.
define service {
        use                     generic-service
        hostgroup_name          zmirror
        service_description     zmirror subscriptons syncing
        check_command           check_sync_public_streams!22
        contact_groups          page_admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check rabbitmq queue sizes
        check_command           check_rabbitmq_queues!22
        # Work around rechecks that fire about 40s after the first error and
        # would alert on a single failure, because cron has not run again yet.
        max_check_attempts      3
        contact_groups          page_admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check for queue worker errors.
        check_command           check_queue_worker_errors!22
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check rabbitmq notify_tornado consumers
        check_command           check_rabbitmq_consumers!notify_tornado
        # Work around rechecks that fire about 40s after the first error and
        # would alert on a single failure, because cron has not run again yet.
        max_check_attempts      3
        contact_groups          page_admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check rabbitmq user_activity consumers
        check_command           check_rabbitmq_consumers!user_activity
        # Work around rechecks that fire about 40s after the first error and
        # would alert on a single failure, because cron has not run again yet.
        max_check_attempts      3
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check rabbitmq user_activity_interval consumers
        check_command           check_rabbitmq_consumers!user_activity_interval
        # Work around rechecks that fire about 40s after the first error and
        # would alert on a single failure, because cron has not run again yet.
        max_check_attempts      3
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check rabbitmq user_presence consumers
        check_command           check_rabbitmq_consumers!user_presence
        # Work around rechecks that fire about 40s after the first error and
        # would alert on a single failure, because cron has not run again yet.
        max_check_attempts      3
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check rabbitmq invites consumers
        check_command           check_rabbitmq_consumers!invites
        # Work around rechecks that fire about 40s after the first error and
        # would alert on a single failure, because cron has not run again yet.
        max_check_attempts      3
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check rabbitmq signups consumers
        check_command           check_rabbitmq_consumers!signups
        # Work around rechecks that fire about 40s after the first error and
        # would alert on a single failure, because cron has not run again yet.
        max_check_attempts      3
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check missedmessage_emails queue processor
        check_command           check_remote_arg_string!manage.py process_queue missedmessage_emails!1:1!1:1
        contact_groups          admins
}

define service {
        use                     generic-service
        hostgroup_name          frontends
        service_description     Check slow_queries queue processor
        check_command           check_remote_arg_string!manage.py process_queue slow_queries!1:1!1:1
        contact_groups          admins
}

###############################################################################
# Miscellaneous
###############################################################################

define service {
        use                     generic-service
        hostgroup_name          all
        service_description     swap
        check_command           check_remote_swap!22!80%!50%
        contact_groups          admins
}

define service {
        use                     generic-service
        host_name               staging
        service_description     Check email mirror
        check_command           check_email_mirror!22
        contact_groups          admins
}