Fix excessive CPU usage by rabbitmq-numconsumers Nagios checks.

The previous model for these Nagios checks was kinda crazy -- every
minute, we'd run a full `rabbitmqctl list_consumers` for each of the
dozen+ consumers that we have, and then do the exact same parsing
logic for each to determine whether the target queue has a running
consumer and write out a state file.

Because each `rabbitmqctl list_consumers` call takes a small but
non-zero amount of resources, on systems where CPU is very limited
(e.g. t2-style AWS instances), this minor CPU wastage could be problematic.

Now we just do that `rabbitmqctl list_consumers` once per minute, and
output all the state files from a single command.

Further TODO items on this front include removing the hardcoded list
of queues.
This commit is contained in:
Tim Abbott 2016-08-12 14:09:36 -07:00
parent 852af83d3c
commit 88a123d5e0
4 changed files with 34 additions and 48 deletions

View File

@ -2,15 +2,4 @@ SHELL=/bin/bash
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
USER=root
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file notify_tornado
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_activity
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_activity_interval
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_presence
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file invites
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file signups
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file message_sender
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file feedback_messages
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file error_reports
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file digest_emails
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file email_mirror
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file missedmessage_mobile_notifications
* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers

View File

@ -22,10 +22,6 @@ if 'USER' in os.environ and not os.environ['USER'] in ['root', 'rabbitmq']:
usage = """Usage: check-rabbitmq-consumers --queue=[queue-name] --min-threshold=[min-threshold]"""
parser = optparse.OptionParser(usage=usage)
parser.add_option('--queue',
dest='queue_name',
default="notify_tornado",
action='store')
parser.add_option('--min-threshold',
dest='min_count',
type="int",
@ -39,19 +35,41 @@ output = subprocess.check_output(['/usr/sbin/rabbitmqctl', 'list_consumers'],
consumers = defaultdict(int) # type: Dict[str, int]
worker_queues = {'error_reports',
'user_presence',
'digest_emails',
'slow_queries',
'missedmessage_mobile_notifications',
'feedback_messages',
'signups',
'notify_tornado',
'message_sender',
'missedmessage_emails',
'email_mirror',
'user_activity_interval',
'invites',
'user_activity'}
for queue_name in worker_queues:
consumers[queue_name] = 0
for line in output.split('\n'):
parts = line.split('\t')
if len(parts) and parts[0] == options.queue_name:
if len(parts) >= 2:
consumers[parts[0]] += 1
now = int(time.time())
if consumers[options.queue_name] < options.min_count:
for queue_name in consumers.keys():
state_file_path = "/var/lib/nagios_state/check-rabbitmq-consumers-" + queue_name
state_file_tmp = state_file_path + "-tmp"
if consumers[queue_name] < options.min_count:
status = 2
else:
status = 0
print("%s|%s|%s|queue %s has %s consumers, needs %s" % (
now, status, states[status], options.queue_name,
consumers[options.queue_name], options.min_count))
with open(state_file_tmp, "w") as f:
f.write("%s|%s|%s|queue %s has %s consumers, needs %s\n" % (
now, status, states[status], queue_name,
consumers[queue_name], options.min_count))
subprocess.check_call(["mv", state_file_tmp, state_file_path])

View File

@ -1,16 +0,0 @@
#!/usr/bin/env bash
# Run the check-rabbitmq-consumers script for a single queue and store its
# combined stdout/stderr output in a per-queue state file.
#
# Usage: write-rabbitmq-consumers-state-file <queue-name>
#
# Exits with status 2 (after printing the usage line) when no queue name
# is given.

# Abort on the first failing command. Note that this means a non-zero exit
# from the check below stops the script before the final mv runs.
set -e
# The queue to check is the first positional argument.
queue=$1
if [ -z "$queue" ]; then
echo "Usage: $0 <queue-name>"
exit 2
fi
ZULIP_DIR=/home/zulip/deployments/current
STATE_DIR=/var/lib/nagios_state
# One state file per queue, named after the queue.
STATE_FILE=$STATE_DIR/check-rabbitmq-consumers-$queue
# Capture both stdout and stderr of the check in a "-tmp" sibling file first...
"$ZULIP_DIR/scripts/nagios/check-rabbitmq-consumers" "--queue=$queue" &> "${STATE_FILE}-tmp";
# ...then move it over the real state file (presumably so that readers of the
# state file never observe partially written output).
mv "${STATE_FILE}-tmp" "$STATE_FILE"

View File

@ -81,14 +81,9 @@ if supervisorctl status | grep -vq RUNNING || supervisorctl status | sed 's/^.*u
fi
echo; echo "Now running RabbitMQ consumer Nagios tests"; echo
# First run the check that usually runs in cron and populates the state files
/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
for consumer in notify_tornado user_activity user_activity_interval user_presence invites signups message_sender feedback_messages error_reports digest_emails email_mirror missedmessage_mobile_notifications; do
if ! /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file "$consumer"; then
# Temporary section while we're debugging why this fails nondeterministically in CI
STATE_DIR=/var/lib/nagios_state
ls "$STATE_DIR"
cat "$STATE_DIR"/*
/home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file "$consumer"
fi
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
set +x
echo