Add a nagios check for a notify_tornado consumer

(imported from commit 050536bb4ac7384d5b98d5cf6cb7430b2b00dbd5)
This commit is contained in:
Leo Franchi 2013-04-16 14:07:53 -04:00
parent 7b1513b675
commit 350cf79ba0
7 changed files with 135 additions and 24 deletions

59
bots/check-rabbitmq-consumers Executable file
View File

@ -0,0 +1,59 @@
#!/usr/bin/env python
import sys
import subprocess
import time
import optparse
from collections import defaultdict
from os import path, environ
sys.path.append(path.join(path.dirname(__file__), '../tools'))
from humbug_tools import check_output
# Nagios numeric status codes and the state names printed alongside them.
states = {
    0: "OK",
    1: "WARNING",
    2: "CRITICAL",
    3: "UNKNOWN",
}

# Refuse to continue as any other user.  Exiting here (with the UNKNOWN code)
# keeps the error message from being followed by a result line: the reader of
# this script's output expects the first '|'-separated field to be a bare
# integer timestamp, which a leading message line would corrupt.
if 'USER' in environ and environ['USER'] not in ['root', 'rabbitmq']:
    print("This script must be run as the root or rabbitmq user")
    sys.exit(3)

usage = """Usage: check-rabbitmq-consumers --queue=[queue-name] --min-threshold=[min-threshold]"""

parser = optparse.OptionParser(usage=usage)
# Name of the queue whose consumers are counted.
parser.add_option('--queue',
                  dest='queue_name',
                  default="notify_tornado",
                  action='store')
# Minimum number of consumers the queue must have to be reported as OK.
parser.add_option('--min-threshold',
                  dest='min_count',
                  type="int",
                  default=1,
                  action='store')
(options, args) = parser.parse_args()

output = check_output(['/usr/sbin/rabbitmqctl', 'list_consumers'], shell=False)

# Count the output lines whose first tab-separated field is the queue name.
# str.split() with an explicit separator always yields at least one element,
# so parts[0] is safe even for empty lines.
consumers = defaultdict(int)
for line in output.split('\n'):
    parts = line.split('\t')
    if parts[0] == options.queue_name:
        consumers[parts[0]] += 1

now = int(time.time())

# Too few consumers is CRITICAL; anything else is OK.
if consumers[options.queue_name] < options.min_count:
    status = 2
else:
    status = 0

# Emit a single "timestamp|code|state|message" record.
print("%s|%s|%s|queue %s has %s consumers, needs %s" % (
    now, status, states[status], options.queue_name,
    consumers[options.queue_name], options.min_count))

34
bots/cron_file_helper.py Normal file
View File

@ -0,0 +1,34 @@
#!/usr/bin/env python
import time
def nagios_from_file(results_file, max_time_diff=60 * 2):
    """Return a ``(return_code, message)`` pair for Nagios from a results file.

    The file on disk should hold one record of four ``|``-separated fields::

        timestamp|return_code|state|message

    This file is created by various nagios checking cron jobs such as
    check-rabbitmq-queues and check-rabbitmq-consumers.

    Arguments:
        results_file: path of the file to parse.
        max_time_diff: maximum age of the record in seconds before it is
            reported as stale (defaults to two minutes).

    Returns:
        A tuple ``(ret, "STATE: message")``.  ``ret`` is the integer taken
        from the record, or 3 (UNKNOWN) when the file cannot be read, is
        malformed, or is stale.
    """
    malformed = (3, "UNKNOWN: Results file malformed")

    # An unreadable file must yield UNKNOWN instead of an uncaught traceback;
    # the context manager also guarantees the handle is closed.
    try:
        with open(results_file) as results:
            data = results.read().strip()
    except (IOError, OSError):
        return (3, "UNKNOWN: Results file could not be read")

    pieces = data.split('|')
    if len(pieces) != 4:
        return malformed

    try:
        timestamp = int(pieces[0])
    except ValueError:
        return malformed

    # Staleness is checked before the return code is parsed, so an old record
    # is always reported as stale regardless of its remaining fields.
    if time.time() - timestamp > max_time_diff:
        return (3, "UNKNOWN: Results file is stale")

    try:
        ret = int(pieces[1])
    except ValueError:
        return malformed

    return (ret, "%s: %s" % (pieces[2], pieces[3]))

View File

@ -0,0 +1 @@
# Every minute: run the consumer check, capturing stdout and stderr in a
# temporary file, then move it over the results file so that file is replaced
# in one step.  The redirection is spelled "> file 2>&1" because cron runs
# commands with /bin/sh unless SHELL is set, and "&>" is a bash extension.
* * * * * /home/humbug/humbug/bots/check-rabbitmq-consumers > /var/run/nagios/check-rabbitmq-consumers-tmp 2>&1; mv /var/run/nagios/check-rabbitmq-consumers-tmp /var/run/nagios/check-rabbitmq-consumers

View File

@ -121,6 +121,11 @@ define command{
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l humbug -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_rabbitmq_queues'
}
# Runs the check_rabbitmq_consumers plugin on the monitored host through
# check_by_ssh, logging in as "humbug" with a 30 second timeout.
# $ARG1$ is passed to check_by_ssh's -p option (the SSH port).
define command{
command_name check_rabbitmq_consumers
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l humbug -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_rabbitmq_consumers'
}
define command{
command_name check_remote_swap
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l nagios -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_swap -w $ARG2$ -c $ARG3$'

View File

@ -212,6 +212,14 @@ define service {
contact_groups page_admins
}
# Checks the notify_tornado consumer count on every host in the "frontends"
# hostgroup using the check_rabbitmq_consumers command; the "22" after "!" is
# that command's $ARG1$.
# NOTE(review): contact_groups is "test_leo", which looks like a personal test
# group -- confirm whether this should notify a production contact group.
define service {
use generic-service
service_description Check rabbitmq notify_tornado consumers
check_command check_rabbitmq_consumers!22
hostgroup_name frontends
contact_groups test_leo
}
define service {
use generic-service
hostgroup_name all

View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
"""
Nagios plugin to check that rabbitmq has the correct number of consumers.

This script just reports the contents of /var/run/nagios/check-rabbitmq-consumers,
which is generated by bots/check-rabbitmq-consumers.

That generator script is run by cron; the crontab entry can be found at
bots/rabbitmq-numconsumers-crontab.
"""
import sys
sys.path.append('/home/humbug/humbug/')
from bots.cron_file_helper import nagios_from_file

RESULTS_FILE = "/var/run/nagios/check-rabbitmq-consumers"

# nagios_from_file returns the exit code and the status line to print.
ret, result = nagios_from_file(RESULTS_FILE)
print(result)
# Use sys.exit rather than the site-provided exit() helper, which is meant
# for the interactive interpreter.
sys.exit(ret)

View File

@ -10,31 +10,13 @@ which is generated by bots/check-rabbitmq-queue.
It is run by cron and can be found at bots/rabbitmq-queuesize-crontab
"""
import os
import time
import sys
sys.path.append('/home/humbug/humbug/')
from bots.cron_file_helper import nagios_from_file
RESULTS_FILE = "/var/run/nagios/check-rabbitmq-results"
ret, result = nagios_from_file(RESULTS_FILE)
data = file(RESULTS_FILE).read().strip()
pieces = data.split('|')
if not len(pieces) == 4:
state = 'UNKNOWN'
ret = 3
data = "Results file malformed"
else:
timestamp = int(pieces[0])
time_diff = time.time() - timestamp
if time_diff > 60 * 2:
ret = 3
state = 'UNKNOWN'
data = "Results file is stale"
else:
ret = int(pieces[1])
state = pieces[2]
data = pieces[3]
print "%s: %s" % (state, data)
print result
exit(ret)