# zulip/puppet/zulip_ops/files/nagios3/conf.d/services.cfg
#
# 607 lines
# 25 KiB
# INI
# Raw Normal View History

# HTTPS status check for every host in the "web" hostgroup.
# Notifications go to the page_admins contact group.
define service {
    use                     generic-service
    service_description     HTTPS
    hostgroup_name          web
    check_command           check_https_status
    contact_groups          page_admins
}
# SSH check for hosts in "pageable_servers"; notifies page_admins.
define service {
    use                     generic-service
    service_description     SSH - pageable
    hostgroup_name          pageable_servers
    check_command           check_ssh
    contact_groups          page_admins
}
# SSH check for the "zmirror" hostgroup, with explicit check/retry
# intervals (2) and up to 5 attempts; notifies admins.
define service {
    use                     generic-service
    service_description     SSH for flaky machines
    hostgroup_name          zmirror
    check_command           check_ssh
    normal_check_interval   2
    retry_check_interval    2
    max_check_attempts      5
    contact_groups          admins
}
# SSH check for hosts in "not_pageable_servers"; notifies admins.
define service {
    use                     generic-service
    service_description     SSH
    hostgroup_name          not_pageable_servers
    check_command           check_ssh
    contact_groups          admins
}
# Remote disk check for "pageable_servers"; notifies page_admins.
# The 20% / 10% arguments are presumably warning / critical thresholds --
# confirm against the check_remote_disk command definition.
define service {
    use                     generic-service
    service_description     remote disk - pageable
    hostgroup_name          pageable_servers
    check_command           check_remote_disk!20%!10%
    contact_groups          page_admins
}
# Remote disk check for the "zmirror" hostgroup, with explicit check/retry
# intervals (2) and up to 5 attempts; notifies admins.
define service {
    use                     generic-service
    service_description     remote disk for flaky machines
    hostgroup_name          zmirror
    check_command           check_remote_disk!20%!10%
    normal_check_interval   2
    retry_check_interval    2
    max_check_attempts      5
    contact_groups          admins
}
# Remote disk check for "not_pageable_servers"; notifies admins.
define service {
    use                     generic-service
    service_description     remote disk
    hostgroup_name          not_pageable_servers
    check_command           check_remote_disk!20%!10%
    contact_groups          admins
}
# Remote load check for "not_pageable_servers"; notifies admins.
# The two comma-separated triples are presumably the warning and critical
# load-average thresholds -- confirm against the command definition.
define service {
    use                     generic-service
    service_description     remote load
    hostgroup_name          not_pageable_servers
    check_command           check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
    contact_groups          admins
}
# Remote load check for "pageable_servers".
# NOTE(review): unlike the other "- pageable" services in this file, this
# one notifies admins rather than page_admins -- confirm this is intended.
define service {
    use                     generic-service
    service_description     remote load - pageable
    hostgroup_name          pageable_servers
    check_command           check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
    contact_groups          admins
}
# Remote load check for the "zmirror" hostgroup, with explicit check/retry
# intervals (2) and up to 5 attempts; notifies admins.
define service {
    use                     generic-service
    service_description     remote load for flaky machines
    hostgroup_name          zmirror
    check_command           check_remote_load!22!5.0,4.0,3.0!10.0,6.0,4.0
    normal_check_interval   2
    retry_check_interval    2
    max_check_attempts      5
    contact_groups          admins
}
# Zephyr mirror forwarding check for the "zmirror_main" hostgroup;
# notifies page_admins.
define service {
    use                     generic-service
    service_description     zephyr mirror forwarding
    hostgroup_name          zmirror_main
    check_command           check_zephyr_mirror_forwarding!22
    # The real check runs from cron, so keep retry_check_interval equal
    # to normal_check_interval at all times.
    normal_check_interval   2
    retry_check_interval    2
    max_check_attempts      5
    contact_groups          page_admins
}
# User zephyr mirror liveness check for the "prod_frontends" hostgroup;
# notifies admins.
define service {
    use                     generic-service
    service_description     user zephyr mirror liveness
    hostgroup_name          prod_frontends
    check_command           check_user_zephyr_mirror_liveness!22
    contact_groups          admins
}
# Personal zephyr mirrors check for the "zmirrorp" hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check personal zephyr mirrors
    hostgroup_name          zmirrorp
    check_command           check_personal_zephyr_mirrors!22
    contact_groups          admins
}
# Debian package update check for the "all" hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Debian update availability
    hostgroup_name          all
    check_command           check_debian_packages!22
    contact_groups          admins
}
# NTP time check for the "all" hostgroup, up to 3 attempts; notifies admins.
define service {
    use                     generic-service
    service_description     Check NTP time
    hostgroup_name          all
    check_command           check_ntp_time!22
    max_check_attempts      3
    contact_groups          admins
}
# Send/receive time check for the "frontends" hostgroup, up to 2 attempts;
# notifies page_admins.
define service {
    use                     generic-service
    service_description     Check send receive time
    hostgroup_name          frontends
    check_command           check_send_receive_time!22
    max_check_attempts      2
    contact_groups          page_admins
}
# Websockets variant of the send/receive time check for the "frontends"
# hostgroup, up to 2 attempts; notifies page_admins.
define service {
    use                     generic-service
    service_description     Check send receive time_websockets
    hostgroup_name          frontends
    check_command           check_send_receive_time_websockets!22
    max_check_attempts      2
    contact_groups          page_admins
}
## check_postgres.pl services
# check_postgres "autovac_freeze" action for the "postgres_appdb_primary"
# hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres autovac_freeze
    hostgroup_name          postgres_appdb_primary
    check_command           check_postgres!zulip!nagios!autovac_freeze
    contact_groups          admins
}
# check_postgres "backends" action for the "postgres_appdb" hostgroup;
# notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres backends
    hostgroup_name          postgres_appdb
    check_command           check_postgres!zulip!nagios!backends
    contact_groups          admins
}
# check_postgres "connection" action for the "postgres_appdb" hostgroup;
# notifies page_admins.
define service {
    use                     generic-service
    service_description     Check postgres connection
    hostgroup_name          postgres_appdb
    check_command           check_postgres!zulip!nagios!connection
    contact_groups          page_admins
}
# check_postgres "disabled_triggers" action for the "postgres_appdb"
# hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres disabled triggers
    hostgroup_name          postgres_appdb
    check_command           check_postgres!zulip!nagios!disabled_triggers
    contact_groups          admins
}
# check_postgres "hitratio" action for the "postgres_appdb" hostgroup;
# notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres hitratio
    hostgroup_name          postgres_appdb
    check_command           check_postgres!zulip!nagios!hitratio
    contact_groups          admins
}
# check_postgres "locks" action for the "postgres_appdb" hostgroup, using
# the alert-args command variant with arguments 100 and 200 (presumably
# warning / critical thresholds -- confirm against the command definition).
# Notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres locks
    hostgroup_name          postgres_appdb
    check_command           check_postgres_alert_args!zulip!nagios!locks!100!200
    contact_groups          admins
}
# check_postgres "query_time" action for the "postgres_appdb" hostgroup,
# using the alert-args command variant with "20 seconds" / "40 seconds"
# (presumably warning / critical -- confirm against the command definition).
# Notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres query_time
    hostgroup_name          postgres_appdb
    check_command           check_postgres_alert_args!zulip!nagios!query_time!20 seconds!40 seconds
    contact_groups          admins
}
# check_postgres "sequence" action for the "postgres_appdb" hostgroup;
# notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres sequence
    hostgroup_name          postgres_appdb
    check_command           check_postgres!zulip!nagios!sequence
    contact_groups          admins
}
# check_postgres "timesync" action for the "postgres_appdb" hostgroup;
# notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres timesync
    hostgroup_name          postgres_appdb
    check_command           check_postgres!zulip!nagios!timesync
    contact_groups          admins
}
# define service {
# use generic-service
# service_description Check postgres txn_idle
# check_command check_postgres_alert_args!zulip!nagios!txn_idle!20 seconds!40 seconds
# hostgroup postgres_appdb
# contact_groups admins
# }
# check_postgres "txn_time" action for the "postgres_appdb" hostgroup,
# using the alert-args command variant with "20 seconds" / "40 seconds"
# (presumably warning / critical -- confirm against the command definition).
# Notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres txn_time
    hostgroup_name          postgres_appdb
    check_command           check_postgres_alert_args!zulip!nagios!txn_time!20 seconds!40 seconds
    contact_groups          admins
}
## non-appdb check_postgres.pl checks
# check_postgres "autovac_freeze" action for the "postgres_other" hostgroup
# (command arguments: wiki,trac / zulip); notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres autovac_freeze
    hostgroup_name          postgres_other
    check_command           check_postgres!wiki,trac!zulip!autovac_freeze
    contact_groups          admins
}
# check_postgres "connection" action for the "postgres_other" hostgroup
# (command arguments: wiki,trac / zulip); notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres connection
    hostgroup_name          postgres_other
    check_command           check_postgres!wiki,trac!zulip!connection
    contact_groups          admins
}
# check_postgres "backends" action for the "postgres_other" hostgroup
# (command arguments: wiki,trac / zulip); notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres backends
    hostgroup_name          postgres_other
    check_command           check_postgres!wiki,trac!zulip!backends
    contact_groups          admins
}
# check_postgres "disabled_triggers" action for the "postgres_other"
# hostgroup (command arguments: wiki,trac / zulip); notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres disabled triggers
    hostgroup_name          postgres_other
    check_command           check_postgres!wiki,trac!zulip!disabled_triggers
    contact_groups          admins
}
# check_postgres "hitratio" action for the "postgres_other" hostgroup
# (command arguments: wiki,trac / zulip); notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres hitratio
    hostgroup_name          postgres_other
    check_command           check_postgres!wiki,trac!zulip!hitratio
    contact_groups          admins
}
# check_postgres "locks" action for the "postgres_other" hostgroup
# (command arguments: wiki,trac / zulip); notifies admins.
# This uses plain check_postgres, with no explicit alert arguments.
define service {
    use                     generic-service
    service_description     Check postgres locks
    hostgroup_name          postgres_other
    check_command           check_postgres!wiki,trac!zulip!locks
    contact_groups          admins
}
# check_postgres "timesync" action for the "postgres_other" hostgroup
# (command arguments: wiki,trac / zulip); notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres timesync
    hostgroup_name          postgres_other
    check_command           check_postgres!wiki,trac!zulip!timesync
    contact_groups          admins
}
# check_postgres "txn_time" action for the "postgres_other" hostgroup,
# using the alert-args command variant with "20 seconds" / "40 seconds"
# (presumably warning / critical -- confirm against the command definition).
# Notifies admins.
define service {
    use                     generic-service
    service_description     Check postgres txn_time
    hostgroup_name          postgres_other
    check_command           check_postgres_alert_args!wiki,trac!zulip!txn_time!20 seconds!40 seconds
    contact_groups          admins
}
# FTS update log length check for the "postgres_appdb" hostgroup;
# notifies admins.
define service {
    use                     generic-service
    service_description     Check FTS update log length
    hostgroup_name          postgres_appdb
    check_command           check_fts_update_log
    contact_groups          admins
}
# Last-backup-time check for the "postgres" hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check last Postgres backup time
    hostgroup_name          postgres
    check_command           check_postgres_backup
    contact_groups          admins
}
# Public-stream subscription sync check for the "zmirror_main" hostgroup,
# with explicit check/retry intervals (2) and up to 5 attempts;
# notifies admins.
define service {
    use                     generic-service
    service_description     zmirror subscriptions syncing
    hostgroup_name          zmirror_main
    check_command           check_sync_public_streams!22
    normal_check_interval   2
    retry_check_interval    2
    max_check_attempts      5
    contact_groups          admins
}
# Redis check for the "frontends" and "redis" hostgroups, up to 3 attempts;
# notifies page_admins.
define service {
    use                     generic-service
    service_description     Check redis service
    hostgroup_name          frontends, redis
    check_command           check_redis_ssh!22
    max_check_attempts      3
    contact_groups          page_admins
}
# Memcached check for the "frontends" hostgroup, up to 3 attempts;
# notifies page_admins.
define service {
    use                     generic-service
    service_description     Check memcached service
    hostgroup_name          frontends
    check_command           check_memcached_ssh!22
    max_check_attempts      3
    contact_groups          page_admins
}
# RabbitMQ queue size check for the "frontends" hostgroup;
# notifies page_admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq queue sizes
    hostgroup_name          frontends
    check_command           check_rabbitmq_queues!22
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          page_admins
}
# Queue worker error check for the "frontends" hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check for queue worker errors.
    hostgroup_name          frontends
    check_command           check_queue_worker_errors!22
    contact_groups          admins
}
# Consumer check for the "notify_tornado" RabbitMQ queue on the "frontends"
# hostgroup; notifies page_admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq notify_tornado consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!notify_tornado
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          page_admins
}
# Consumer check for the "tornado_return" RabbitMQ queue on the "frontends"
# hostgroup; notifies page_admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq tornado_return consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!tornado_return
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          page_admins
}
# Consumer check for the "user_activity" RabbitMQ queue on the "frontends"
# hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq user_activity consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!user_activity
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          admins
}
# Consumer check for the "user_activity_interval" RabbitMQ queue on the
# "frontends" hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq user_activity_interval consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!user_activity_interval
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          admins
}
# Consumer check for the "user_presence" RabbitMQ queue on the "frontends"
# hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq user_presence consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!user_presence
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          admins
}
# Consumer check for the "invites" RabbitMQ queue on the "frontends"
# hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq invites consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!invites
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          admins
}
# Consumer check for the "signups" RabbitMQ queue on the "frontends"
# hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq signups consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!signups
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          admins
}
# Consumer check for the "digest_emails" RabbitMQ queue on the "frontends"
# hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq digest email consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!digest_emails
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          admins
}
# Consumer check for the "email_mirror" RabbitMQ queue on the "frontends"
# hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq email mirror consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!email_mirror
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          admins
}
# Consumer check for the "feedback_messages" RabbitMQ queue on the
# "frontends" hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq feedback messages consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!feedback_messages
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          admins
}
# Consumer check for the "message_sender" RabbitMQ queue on the "frontends"
# hostgroup; notifies page_admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq message sender consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!message_sender
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          page_admins
}
### The missedmessage_emails queue processor batches events, so don't monitor it this way
# define service {
# use generic-service
# service_description Check rabbitmq missedmessage emails consumers
# check_command check_rabbitmq_consumers!missedmessage_emails
# # Workaround weird checks 40s after first error causing alerts
# # from a single failure because cron hasn't run again yet
# max_check_attempts 3
# hostgroup_name frontends
# contact_groups admins
# }
# Consumer check for the "missedmessage_mobile_notifications" RabbitMQ queue
# on the "frontends" hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check rabbitmq missedmessage mobile notifications consumers
    hostgroup_name          frontends
    check_command           check_rabbitmq_consumers!missedmessage_mobile_notifications
    # Workaround: an odd recheck 40s after the first error would otherwise
    # alert on a single failure, because cron has not run again by then.
    max_check_attempts      3
    contact_groups          admins
}
### The slow_queries queue consumer batches events, so don't monitor it this way
# define service {
# use generic-service
# service_description Check rabbitmq slow queries consumers
# check_command check_rabbitmq_consumers!slow_queries
# # Workaround weird checks 40s after first error causing alerts
# # from a single failure because cron hasn't run again yet
# max_check_attempts 3
# hostgroup_name frontends
# contact_groups admins
# }
# Checks for the "embedded_bots" process_queue worker on the "frontends"
# hostgroup, up to 3 attempts; notifies admins.
# The trailing 1:1 / 1:1 arguments are presumably warning / critical
# ranges -- confirm against the check_remote_arg_string definition.
define service {
    use                     generic-service
    service_description     Check embedded_bots queue processor
    hostgroup_name          frontends
    check_command           check_remote_arg_string!manage.py process_queue --queue_name=embedded_bots!1:1!1:1
    max_check_attempts      3
    contact_groups          admins
}
# Checks for the "missedmessage_emails" process_queue worker on the
# "frontends" hostgroup, up to 3 attempts; notifies admins.
define service {
    use                     generic-service
    service_description     Check missedmessage_emails queue processor
    hostgroup_name          frontends
    check_command           check_remote_arg_string!manage.py process_queue --queue_name=missedmessage_emails!1:1!1:1
    max_check_attempts      3
    contact_groups          admins
}
# Checks for the "missedmessage_email_senders" process_queue worker on the
# "frontends" hostgroup, up to 3 attempts; notifies admins.
define service {
    use                     generic-service
    service_description     Check missedmessage_email_senders queue processor
    hostgroup_name          frontends
    check_command           check_remote_arg_string!manage.py process_queue --queue_name=missedmessage_email_senders!1:1!1:1
    max_check_attempts      3
    contact_groups          admins
}
# Checks for the "outgoing_webhooks" process_queue worker on the
# "frontends" hostgroup, up to 3 attempts; notifies admins.
define service {
    use                     generic-service
    service_description     Check outgoing webhooks queue processor
    hostgroup_name          frontends
    check_command           check_remote_arg_string!manage.py process_queue --queue_name=outgoing_webhooks!1:1!1:1
    max_check_attempts      3
    contact_groups          admins
}
# Checks for the "slow_queries" process_queue worker on the "frontends"
# hostgroup, up to 3 attempts; notifies admins.
define service {
    use                     generic-service
    service_description     Check slow_queries queue processor
    hostgroup_name          frontends
    check_command           check_remote_arg_string!manage.py process_queue --queue_name=slow_queries!1:1!1:1
    max_check_attempts      3
    contact_groups          admins
}
# Worker memory usage check for the "frontends" hostgroup; notifies admins.
define service {
    use                     generic-service
    service_description     Check worker memory usage
    hostgroup_name          frontends
    check_command           check_worker_memory
    contact_groups          admins
}
# Remote swap check for the "all" hostgroup; notifies admins.
# The 80% / 50% arguments are presumably warning / critical thresholds --
# confirm against the check_remote_swap command definition.
define service {
    use                     generic-service
    service_description     swap
    hostgroup_name          all
    check_command           check_remote_swap!22!80%!50%
    contact_groups          admins
}
# Email deliverer process check for the "staging_frontends" hostgroup;
# notifies admins.
define service {
    use                     generic-service
    service_description     Check email deliverer process which is only used on Zulip Voyager
    hostgroup_name          staging_frontends
    check_command           check_email_deliverer_process
    contact_groups          admins
}
# Email deliverer backlog check for the "staging_frontends" hostgroup;
# notifies admins.
define service {
    use                     generic-service
    service_description     Check email deliverer backlog which is only used on Zulip Voyager
    hostgroup_name          staging_frontends
    check_command           check_email_deliverer_backlog
    contact_groups          admins
}