mirror of https://github.com/zulip/zulip.git
Add a nagios check for a notify_tornado consumer
(imported from commit 050536bb4ac7384d5b98d5cf6cb7430b2b00dbd5)
This commit is contained in:
parent
7b1513b675
commit
350cf79ba0
|
@ -0,0 +1,59 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys
|
||||
import subprocess
|
||||
import time
|
||||
import optparse
|
||||
from collections import defaultdict
|
||||
|
||||
from os import path, environ
|
||||
|
||||
sys.path.append(path.join(path.dirname(__file__), '../tools'))
|
||||
from humbug_tools import check_output
|
||||
|
||||
# Nagios state names, keyed by the numeric status code written to the output.
states = {
    0: "OK",
    1: "WARNING",
    2: "CRITICAL",
    3: "UNKNOWN",
}

# Refuse to continue under any other user.  The only well-formed output of
# this script is the single "timestamp|status|state|message" line printed at
# the bottom; carrying on after printing this message would put a stray line
# in front of it, so stop here and leave the message as the whole output.
if 'USER' in environ and environ['USER'] not in ['root', 'rabbitmq']:
    print("This script must be run as the root or rabbitmq user")
    sys.exit(1)

usage = """Usage: check-rabbitmq-consumers --queue=[queue-name] --min-threshold=[min-threshold]"""

parser = optparse.OptionParser(usage=usage)
# Name of the queue whose consumers are counted.
parser.add_option('--queue',
                  dest='queue_name',
                  default="notify_tornado",
                  action='store')
# Fewer consumers than this is reported as CRITICAL.
parser.add_option('--min-threshold',
                  dest='min_count',
                  type="int",
                  default=1,
                  action='store')

(options, args) = parser.parse_args()

output = check_output(['/usr/sbin/rabbitmqctl', 'list_consumers'], shell=False)

consumers = defaultdict(int)

# Each line is split on tabs and its first field is compared with the
# requested queue name.  str.split always returns at least one element,
# so parts[0] is safe even for empty lines.
for line in output.split('\n'):
    parts = line.split('\t')
    if parts[0] == options.queue_name:
        consumers[parts[0]] += 1

now = int(time.time())

if consumers[options.queue_name] < options.min_count:
    status = 2
else:
    status = 0

# Output format: timestamp|status code|state name|human-readable message
print("%s|%s|%s|queue %s has %s consumers, needs %s" % (
    now, status, states[status], options.queue_name,
    consumers[options.queue_name], options.min_count))
|
|
@ -0,0 +1,34 @@
|
|||
#!/usr/bin/env python
|
||||
import time
|
||||
|
||||
def nagios_from_file(results_file):
    """Parse a cron-generated results file into a nagios return code and string.

    The file on disk should contain a single record of the format

        %s|%s|%s|%s % (timestamp, return_code, state, message)

    where timestamp is integer seconds since the epoch and return_code is
    the integer nagios status.  Such files are created by various nagios
    checking cron jobs such as check-rabbitmq-queues and
    check-rabbitmq-consumers.

    Returns a tuple (ret, text) where text is "<state>: <message>".
    (3, "UNKNOWN: ...") is returned when the file cannot be read, does not
    have exactly four '|'-separated fields, has a non-integer timestamp or
    return code, or has a timestamp more than two minutes old.
    """
    try:
        # Use a context manager so the handle is closed promptly.
        with open(results_file) as results:
            data = results.read().strip()
    except (IOError, OSError):
        # A missing/unreadable file should surface as UNKNOWN rather than
        # as a traceback from the plugin.
        return (3, "UNKNOWN: Results file could not be read")

    pieces = data.split('|')
    if len(pieces) != 4:
        return (3, "UNKNOWN: Results file malformed")

    try:
        timestamp = int(pieces[0])
    except ValueError:
        return (3, "UNKNOWN: Results file malformed")

    # Results older than two minutes are considered stale.
    if time.time() - timestamp > 60 * 2:
        return (3, "UNKNOWN: Results file is stale")

    try:
        ret = int(pieces[1])
    except ValueError:
        return (3, "UNKNOWN: Results file malformed")

    return (ret, "%s: %s" % (pieces[2], pieces[3]))
|
||||
|
|
@ -0,0 +1 @@
|
|||
# Every minute: run the consumer check, capturing stdout and stderr in a
# temporary file, then move it over the results file in one step.
# The portable "> file 2>&1" form is used instead of the bash-only "&>":
# cron runs commands with /bin/sh by default, where "cmd &> file" would
# background cmd and merely truncate the file.
* * * * * /home/humbug/humbug/bots/check-rabbitmq-consumers > /var/run/nagios/check-rabbitmq-consumers-tmp 2>&1; mv /var/run/nagios/check-rabbitmq-consumers-tmp /var/run/nagios/check-rabbitmq-consumers
|
|
@ -121,6 +121,11 @@ define command{
|
|||
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l humbug -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_rabbitmq_queues'
|
||||
}
|
||||
|
||||
# Command check_rabbitmq_consumers: uses check_by_ssh to log in to
# $HOSTADDRESS$ as user "humbug" on port $ARG1$ (30 second timeout, key
# /var/lib/nagios/.ssh/id_rsa) and run the remote plugin
# /usr/lib/nagios/plugins/check_rabbitmq_consumers there.
define command{
|
||||
command_name check_rabbitmq_consumers
|
||||
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l humbug -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_rabbitmq_consumers'
|
||||
}
|
||||
|
||||
define command{
|
||||
command_name check_remote_swap
|
||||
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l nagios -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_swap -w $ARG2$ -c $ARG3$'
|
||||
|
|
|
@ -212,6 +212,14 @@ define service {
|
|||
contact_groups page_admins
|
||||
}
|
||||
|
||||
# Service applied to every host in the "frontends" hostgroup: runs the
# check_rabbitmq_consumers command with 22 as $ARG1$ (the port handed to
# check_by_ssh by that command).  Inherits from the generic-service
# template and uses the "test_leo" contact group.
define service {
|
||||
use generic-service
|
||||
service_description Check rabbitmq notify_tornado consumers
|
||||
check_command check_rabbitmq_consumers!22
|
||||
hostgroup_name frontends
|
||||
contact_groups test_leo
|
||||
}
|
||||
|
||||
define service {
|
||||
use generic-service
|
||||
hostgroup_name all
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
"""
|
||||
Nagios plugin to check that rabbitmq has the correct number of consumers
|
||||
|
||||
This script just checks the contents of /var/run/nagios/check-rabbitmq-consumers,
|
||||
which is generated by bots/check-rabbitmq-consumers.
|
||||
|
||||
That script is run by cron; its crontab entry can be found at bots/rabbitmq-numconsumers-crontab
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
sys.path.append('/home/humbug/humbug/')
|
||||
from bots.cron_file_helper import nagios_from_file
|
||||
|
||||
# Results file that this plugin reads.
RESULTS_FILE = "/var/run/nagios/check-rabbitmq-consumers"

# ret is the plugin exit status; result is the one-line status text.
ret, result = nagios_from_file(RESULTS_FILE)

# Call form of print works on both Python 2 and 3 for a single argument.
print(result)
# sys.exit rather than the interactive-only exit() helper from site.
sys.exit(ret)
|
|
@ -10,31 +10,13 @@ which is generated by bots/check-rabbitmq-queue.
|
|||
It is run by cron and can be found at bots/rabbitmq-queuesize-crontab
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import sys
|
||||
|
||||
sys.path.append('/home/humbug/humbug/')
|
||||
from bots.cron_file_helper import nagios_from_file
|
||||
|
||||
RESULTS_FILE = "/var/run/nagios/check-rabbitmq-results"
|
||||
ret, result = nagios_from_file(RESULTS_FILE)
|
||||
|
||||
data = file(RESULTS_FILE).read().strip()
|
||||
pieces = data.split('|')
|
||||
|
||||
if not len(pieces) == 4:
|
||||
state = 'UNKNOWN'
|
||||
ret = 3
|
||||
data = "Results file malformed"
|
||||
else:
|
||||
timestamp = int(pieces[0])
|
||||
|
||||
time_diff = time.time() - timestamp
|
||||
if time_diff > 60 * 2:
|
||||
ret = 3
|
||||
state = 'UNKNOWN'
|
||||
data = "Results file is stale"
|
||||
else:
|
||||
ret = int(pieces[1])
|
||||
state = pieces[2]
|
||||
data = pieces[3]
|
||||
|
||||
print "%s: %s" % (state, data)
|
||||
|
||||
print result
|
||||
exit(ret)
|
||||
|
|
Loading…
Reference in New Issue