[puppet] Add nagios script to look for queue error files.

You need to run puppet apply on the nagios box for this change to take effect.

(imported from commit 7cf3a11ede69ed6bd7ba2a4384d83c89cfcc65c0)
This commit is contained in:
Steve Howell 2013-10-29 16:46:31 -04:00
parent 884e602185
commit ede3252f8d
3 changed files with 36 additions and 0 deletions

View File

@ -86,6 +86,11 @@ define command{
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l zulip -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_send_receive_time --nagios --site=https://$HOSTADDRESS$'
}
# Run the check_queue_worker_errors plugin on the target host over SSH
# (via check_by_ssh) as user "zulip". $ARG1$ is passed to check_by_ssh's -p
# option (the SSH port); -t 30 sets the check_by_ssh timeout in seconds.
# The remote command after -C must be fully enclosed in single quotes.
define command{
command_name check_queue_worker_errors
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l zulip -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_queue_worker_errors'
}
define command{
command_name check_postgres
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l zulip -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_postgres.pl --dbname=zulip --dbuser=zulip --action $ARG2$'

View File

@ -224,6 +224,14 @@ define service {
contact_groups page_admins
}
# Service check for queue worker errors on every host in the "frontends"
# hostgroup. It invokes the check_queue_worker_errors command with "22" as
# $ARG1$, which that command passes to check_by_ssh as the SSH port (-p).
# Alerts are sent to the "admins" contact group.
define service {
use generic-service
service_description Check for queue worker errors.
check_command check_queue_worker_errors!22
hostgroup_name frontends
contact_groups admins
}
define service {
use generic-service
service_description Check rabbitmq notify_tornado consumers

View File

@ -0,0 +1,23 @@
#!/usr/bin/env python
"""
Nagios plugin that reports queue worker error files.

Looks in settings.QUEUE_ERROR_DIR for files matching '*.errors'.  Each match
is reported on its own line of standard output and the script exits with
status 1; when no such file exists it prints nothing and exits with status 0.
"""
import glob
import os
import sys

# Add the current working directory to the import path so that the zproject
# package can be found there.
sys.path.append('.')
from zproject import settings

# Every '*.errors' file directly inside the queue error directory.
error_files = glob.glob(os.path.join(settings.QUEUE_ERROR_DIR, '*.errors'))

for error_file in error_files:
    print('WARNING: Queue errors logged in %s' % (error_file,))

# Exit status 1 when at least one error file was found, 0 otherwise.
sys.exit(1 if error_files else 0)