mirror of https://github.com/zulip/zulip.git
358 lines
14 KiB
INI
358 lines
14 KiB
INI
# HTTP availability check for every host in the "web" hostgroup.
# Failures notify the page_admins contact group.
define service {
    use                     generic-service
    service_description     HTTP
    hostgroup_name          web
    check_command           check_http
    contact_groups          page_admins
}
|
|
|
|
# SSH check for hosts in the pageable_servers hostgroup.
# Failures notify the page_admins contact group.
define service {
    use                     generic-service
    service_description     SSH - pageable
    hostgroup_name          pageable_servers
    check_command           check_ssh
    contact_groups          page_admins
}
|
|
|
|
# SSH check for hosts in the not_pageable_servers hostgroup.
# Failures notify the admins contact group.
define service {
    use                     generic-service
    service_description     SSH
    hostgroup_name          not_pageable_servers
    check_command           check_ssh
    contact_groups          admins
}
|
|
|
|
# Disk check for hosts in the pageable_servers hostgroup; notifies page_admins.
# NOTE(review): the arguments 22 / 20% / 10% / "/" are presumably SSH port,
# warning threshold, critical threshold and mount point - confirm against the
# check_remote_disk command definition.
define service {
    use                     generic-service
    service_description     remote disk - pageable
    hostgroup_name          pageable_servers
    check_command           check_remote_disk!22!20%!10%!/
    contact_groups          page_admins
}
|
|
|
|
# Disk check for hosts in the not_pageable_servers hostgroup; notifies admins.
# NOTE(review): the arguments 22 / 20% / 10% / "/" are presumably SSH port,
# warning threshold, critical threshold and mount point - confirm against the
# check_remote_disk command definition.
define service {
    use                     generic-service
    service_description     remote disk
    hostgroup_name          not_pageable_servers
    check_command           check_remote_disk!22!20%!10%!/
    contact_groups          admins
}
|
|
|
|
# Load check for hosts in the not_pageable_servers hostgroup; notifies admins.
# NOTE(review): the two comma-separated triples are presumably the warning and
# critical load-average thresholds - confirm against the check_remote_load
# command definition.
define service {
    use                     generic-service
    service_description     remote load
    hostgroup_name          not_pageable_servers
    check_command           check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
    contact_groups          admins
}
|
|
|
|
# Load check for hosts in the pageable_servers hostgroup.
# NOTE(review): unlike the other "- pageable" services in this file (SSH,
# remote disk), this one notifies admins rather than page_admins - confirm
# that not paging on load is intentional.
define service {
    use                     generic-service
    service_description     remote load - pageable
    hostgroup_name          pageable_servers
    check_command           check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
    contact_groups          admins
}
|
|
|
|
# Zephyr mirror forwarding check on host zmirror; notifies page_admins.
# Uses the documented host_name directive.
define service {
    use                     generic-service
    service_description     zephyr mirror forwarding
    host_name               zmirror
    check_command           check_zephyr_mirror_forwarding!22
    normal_check_interval   2
    # Note: the actual check is run via cron, so retry_check_interval
    # should always equal normal_check_interval.
    retry_check_interval    2
    max_check_attempts      2
    contact_groups          page_admins
}
|
|
|
|
# User zephyr mirror liveness check on host prod0; notifies admins.
# Uses the documented host_name directive.
define service {
    use                     generic-service
    service_description     user zephyr mirror liveness
    host_name               prod0
    check_command           check_user_zephyr_mirror_liveness!22
    contact_groups          admins
}
|
|
|
|
# Personal zephyr mirrors check on host zmirror2; notifies admins.
# Uses the documented host_name directive.
define service {
    use                     generic-service
    service_description     Check personal zephyr mirrors
    host_name               zmirror2
    check_command           check_personal_zephyr_mirrors!22
    contact_groups          admins
}
|
|
|
|
# Debian package update check for every host in the "all" hostgroup.
# Failures notify the admins contact group.
define service {
    use                     generic-service
    hostgroup_name          all
    service_description     Debian update availability
    check_command           check_debian_packages!22
    contact_groups          admins
}
|
|
|
|
# NTP time check for every host in the "all" hostgroup; notifies admins.
define service {
    use                     generic-service
    hostgroup_name          all
    service_description     Check NTP time
    check_command           check_ntp_time!22
    contact_groups          admins
}
|
|
|
|
# Send/receive time check for hosts in the frontends hostgroup; notifies admins.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check send receive time
    check_command           check_send_receive_time!22
    contact_groups          admins
}
|
|
|
|
## check_postgres.pl services
|
|
|
|
# check_postgres "autovac_freeze" action for the postgres_appdb_primary
# hostgroup; notifies admins. Uses the documented hostgroup_name directive.
define service {
    use                     generic-service
    service_description     Check postgres autovac_freeze
    hostgroup_name          postgres_appdb_primary
    check_command           check_postgres!22!autovac_freeze
    contact_groups          admins
}
|
|
|
|
# check_postgres "backends" action for the postgres hostgroup; notifies admins.
# Uses the documented hostgroup_name directive.
define service {
    use                     generic-service
    service_description     Check postgres backends
    hostgroup_name          postgres
    check_command           check_postgres!22!backends
    contact_groups          admins
}
|
|
|
|
# check_postgres "connection" action for the postgres hostgroup; notifies
# admins. Uses the documented hostgroup_name directive.
define service {
    use                     generic-service
    service_description     Check postgres connection
    hostgroup_name          postgres
    check_command           check_postgres!22!connection
    contact_groups          admins
}
|
|
|
|
# check_postgres "hitratio" action for the postgres hostgroup; notifies admins.
# Uses the documented hostgroup_name directive.
define service {
    use                     generic-service
    service_description     Check postgres hitratio
    hostgroup_name          postgres
    check_command           check_postgres!22!hitratio
    contact_groups          admins
}
|
|
|
|
# check_postgres "locks" action for the postgres hostgroup; notifies admins.
# NOTE(review): 100 and 200 are presumably the warning and critical
# thresholds - confirm against the check_postgres_alert_args command
# definition. Uses the documented hostgroup_name directive.
define service {
    use                     generic-service
    service_description     Check postgres locks
    hostgroup_name          postgres
    check_command           check_postgres_alert_args!22!locks!100!200
    contact_groups          admins
}
|
|
|
|
# check_postgres "query_time" action for the postgres hostgroup; notifies
# admins.
# NOTE(review): "20 seconds" and "40 seconds" are presumably the warning and
# critical thresholds - confirm against the check_postgres_alert_args command
# definition. Uses the documented hostgroup_name directive.
define service {
    use                     generic-service
    service_description     Check postgres query_time
    hostgroup_name          postgres
    check_command           check_postgres_alert_args!22!query_time!20 seconds!40 seconds
    contact_groups          admins
}
|
|
|
|
# check_postgres "sequence" action for the postgres hostgroup; notifies admins.
# Uses the documented hostgroup_name directive.
define service {
    use                     generic-service
    service_description     Check postgres sequence
    hostgroup_name          postgres
    check_command           check_postgres!22!sequence
    contact_groups          admins
}
|
|
|
|
# check_postgres "timesync" action for the postgres hostgroup; notifies admins.
# Uses the documented hostgroup_name directive.
define service {
    use                     generic-service
    service_description     Check postgres timesync
    hostgroup_name          postgres
    check_command           check_postgres!22!timesync
    contact_groups          admins
}
|
|
|
|
# define service {
|
|
# use generic-service
|
|
# service_description Check postgres txn_idle
|
|
# check_command check_postgres_alert_args!22!txn_idle!20 seconds!40 seconds
|
|
# hostgroup postgres
|
|
# contact_groups admins
|
|
# }
|
|
|
|
# define service {
|
|
# use generic-service
|
|
# service_description Check postgres txn_time
|
|
# check_command check_postgres_alert_args!22!txn_time!20 seconds!40 seconds
|
|
# hostgroup postgres
|
|
# contact_groups admins
|
|
# }
|
|
|
|
# FTS update log length check for the postgres hostgroup; notifies admins.
# Uses the documented hostgroup_name directive.
define service {
    use                     generic-service
    service_description     Check FTS update log length
    hostgroup_name          postgres
    check_command           check_fts_update_log
    contact_groups          admins
}
|
|
|
|
# Last Postgres backup time check for the postgres hostgroup; notifies admins.
# Uses the documented hostgroup_name directive.
define service {
    use                     generic-service
    service_description     Check last Postgres backup time
    hostgroup_name          postgres
    check_command           check_postgres_backup
    contact_groups          admins
}
|
|
|
|
# Public-stream sync check for the zmirror hostgroup; notifies page_admins.
# NOTE(review): "subscriptons" in service_description looks like a typo for
# "subscriptions"; it is left unchanged here because the description
# identifies the service.
# NOTE(review): zmirror is referenced as a host elsewhere in this file -
# confirm that a hostgroup with the same name exists.
define service {
    use                     generic-service
    hostgroup_name          zmirror
    service_description     zmirror subscriptons syncing
    check_command           check_sync_public_streams!22
    contact_groups          page_admins
}
|
|
|
|
# RabbitMQ queue size check for the frontends hostgroup; notifies page_admins.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check rabbitmq queue sizes
    check_command           check_rabbitmq_queues!22
    # Work around odd re-checks about 40s after the first error: cron has not
    # run again by then, so a single failure would otherwise raise an alert.
    max_check_attempts      3
    contact_groups          page_admins
}
|
|
|
|
# Queue worker error check for the frontends hostgroup; notifies admins.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check for queue worker errors.
    check_command           check_queue_worker_errors!22
    contact_groups          admins
}
|
|
|
|
# RabbitMQ consumer check for the notify_tornado queue on the frontends
# hostgroup; notifies page_admins.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check rabbitmq notify_tornado consumers
    check_command           check_rabbitmq_consumers!notify_tornado
    # Work around odd re-checks about 40s after the first error: cron has not
    # run again by then, so a single failure would otherwise raise an alert.
    max_check_attempts      3
    contact_groups          page_admins
}
|
|
|
|
# RabbitMQ consumer check for the user_activity queue on the frontends
# hostgroup; notifies admins.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check rabbitmq user_activity consumers
    check_command           check_rabbitmq_consumers!user_activity
    # Work around odd re-checks about 40s after the first error: cron has not
    # run again by then, so a single failure would otherwise raise an alert.
    max_check_attempts      3
    contact_groups          admins
}
|
|
|
|
# RabbitMQ consumer check for the user_activity_interval queue on the
# frontends hostgroup; notifies admins.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check rabbitmq user_activity_interval consumers
    check_command           check_rabbitmq_consumers!user_activity_interval
    # Work around odd re-checks about 40s after the first error: cron has not
    # run again by then, so a single failure would otherwise raise an alert.
    max_check_attempts      3
    contact_groups          admins
}
|
|
|
|
# RabbitMQ consumer check for the user_presence queue on the frontends
# hostgroup; notifies admins.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check rabbitmq user_presence consumers
    check_command           check_rabbitmq_consumers!user_presence
    # Work around odd re-checks about 40s after the first error: cron has not
    # run again by then, so a single failure would otherwise raise an alert.
    max_check_attempts      3
    contact_groups          admins
}
|
|
|
|
# RabbitMQ consumer check for the invites queue on the frontends hostgroup;
# notifies admins.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check rabbitmq invites consumers
    check_command           check_rabbitmq_consumers!invites
    # Work around odd re-checks about 40s after the first error: cron has not
    # run again by then, so a single failure would otherwise raise an alert.
    max_check_attempts      3
    contact_groups          admins
}
|
|
|
|
# RabbitMQ consumer check for the signups queue on the frontends hostgroup;
# notifies admins.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check rabbitmq signups consumers
    check_command           check_rabbitmq_consumers!signups
    # Work around odd re-checks about 40s after the first error: cron has not
    # run again by then, so a single failure would otherwise raise an alert.
    max_check_attempts      3
    contact_groups          admins
}
|
|
|
|
# Process check for the missedmessage_emails queue processor on the frontends
# hostgroup; notifies admins after up to 3 check attempts.
# NOTE(review): the two "1:1" arguments are presumably warning/critical
# process-count ranges (exactly one process) - confirm against the
# check_remote_arg_string command definition.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check missedmessage_emails queue processor
    max_check_attempts      3
    check_command           check_remote_arg_string!manage.py process_queue missedmessage_emails!1:1!1:1
    contact_groups          admins
}
|
|
|
|
# Process check for the slow_queries queue processor on the frontends
# hostgroup; notifies admins after up to 3 check attempts.
# NOTE(review): the two "1:1" arguments are presumably warning/critical
# process-count ranges (exactly one process) - confirm against the
# check_remote_arg_string command definition.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check slow_queries queue processor
    max_check_attempts      3
    check_command           check_remote_arg_string!manage.py process_queue slow_queries!1:1!1:1
    contact_groups          admins
}
|
|
|
|
# Worker memory usage check for the frontends hostgroup; notifies admins.
define service {
    use                     generic-service
    hostgroup_name          frontends
    service_description     Check worker memory usage
    check_command           check_worker_memory
    contact_groups          admins
}
|
|
|
|
# Swap check for every host in the "all" hostgroup; notifies admins.
# NOTE(review): 80% and 50% are presumably the warning and critical
# thresholds - confirm against the check_remote_swap command definition.
define service {
    use                     generic-service
    service_description     swap
    hostgroup_name          all
    check_command           check_remote_swap!22!80%!50%
    contact_groups          admins
}
|
|
|
|
# Email mirror check on host staging; notifies admins.
# Uses the documented host_name directive.
define service {
    use                     generic-service
    service_description     Check email mirror
    host_name               staging
    check_command           check_email_mirror!22
    contact_groups          admins
}
|
|
|
|
# Email deliverer process check on host staging; notifies admins.
# Uses the documented host_name directive.
define service {
    use                     generic-service
    service_description     Check email deliverer process which is only used on Zulip Enterprise
    host_name               staging
    check_command           check_email_deliverer_process
    contact_groups          admins
}
|
|
|
|
# Email deliverer backlog check on host staging; notifies admins.
# Uses the documented host_name directive.
define service {
    use                     generic-service
    service_description     Check email deliverer backlog which is only used on Zulip Enterprise
    host_name               staging
    check_command           check_email_deliverer_backlog
    contact_groups          admins
}
|