mirror of https://github.com/zulip/zulip.git
Fix excessive CPU usage by rabbitmq-numconsumers Nagios checks.
The previous model for these Nagios checks was kinda crazy -- every minute, we'd run a full `rabbitmqctl list_consumers` for each of the dozen+ consumers that we have, and then do the exact same parsing logic for each one to determine whether the target queue has a running consumer, before writing out a state file. Because `rabbitmqctl list_consumers` takes a small but nonzero amount of resources, on systems where CPU is very limited (e.g. t2-style AWS instances), this minor CPU wastage could be problematic. Now we just run `rabbitmqctl list_consumers` once per minute, and output all the state files from that single command. Further TODO items on this front include removing the hardcoded list of queues.
This commit is contained in:
parent
852af83d3c
commit
88a123d5e0
|
@ -2,15 +2,4 @@ SHELL=/bin/bash
|
|||
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
|
||||
USER=root
|
||||
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file notify_tornado
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_activity
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_activity_interval
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_presence
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file invites
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file signups
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file message_sender
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file feedback_messages
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file error_reports
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file digest_emails
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file email_mirror
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file missedmessage_mobile_notifications
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
|
||||
|
|
|
@ -22,10 +22,6 @@ if 'USER' in os.environ and not os.environ['USER'] in ['root', 'rabbitmq']:
|
|||
usage = """Usage: check-rabbitmq-consumers --queue=[queue-name] --min-threshold=[min-threshold]"""
|
||||
|
||||
parser = optparse.OptionParser(usage=usage)
|
||||
parser.add_option('--queue',
|
||||
dest='queue_name',
|
||||
default="notify_tornado",
|
||||
action='store')
|
||||
parser.add_option('--min-threshold',
|
||||
dest='min_count',
|
||||
type="int",
|
||||
|
@ -39,19 +35,41 @@ output = subprocess.check_output(['/usr/sbin/rabbitmqctl', 'list_consumers'],
|
|||
|
||||
consumers = defaultdict(int) # type: Dict[str, int]
|
||||
|
||||
worker_queues = {'error_reports',
|
||||
'user_presence',
|
||||
'digest_emails',
|
||||
'slow_queries',
|
||||
'missedmessage_mobile_notifications',
|
||||
'feedback_messages',
|
||||
'signups',
|
||||
'notify_tornado',
|
||||
'message_sender',
|
||||
'missedmessage_emails',
|
||||
'email_mirror',
|
||||
'user_activity_interval',
|
||||
'invites',
|
||||
'user_activity'}
|
||||
|
||||
for queue_name in worker_queues:
|
||||
consumers[queue_name] = 0
|
||||
|
||||
for line in output.split('\n'):
|
||||
parts = line.split('\t')
|
||||
if len(parts) and parts[0] == options.queue_name:
|
||||
if len(parts) >= 2:
|
||||
consumers[parts[0]] += 1
|
||||
|
||||
|
||||
now = int(time.time())
|
||||
|
||||
if consumers[options.queue_name] < options.min_count:
|
||||
for queue_name in consumers.keys():
|
||||
state_file_path = "/var/lib/nagios_state/check-rabbitmq-consumers-" + queue_name
|
||||
state_file_tmp = state_file_path + "-tmp"
|
||||
|
||||
if consumers[queue_name] < options.min_count:
|
||||
status = 2
|
||||
else:
|
||||
status = 0
|
||||
|
||||
print("%s|%s|%s|queue %s has %s consumers, needs %s" % (
|
||||
now, status, states[status], options.queue_name,
|
||||
consumers[options.queue_name], options.min_count))
|
||||
with open(state_file_tmp, "w") as f:
|
||||
f.write("%s|%s|%s|queue %s has %s consumers, needs %s\n" % (
|
||||
now, status, states[status], queue_name,
|
||||
consumers[queue_name], options.min_count))
|
||||
subprocess.check_call(["mv", state_file_tmp, state_file_path])
|
||||
|
|
|
@ -1,16 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -e
|
||||
|
||||
queue=$1
|
||||
|
||||
if [ -z "$queue" ]; then
|
||||
echo "Usage: $0 <queue-name>"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
ZULIP_DIR=/home/zulip/deployments/current
|
||||
STATE_DIR=/var/lib/nagios_state
|
||||
STATE_FILE=$STATE_DIR/check-rabbitmq-consumers-$queue
|
||||
|
||||
"$ZULIP_DIR/scripts/nagios/check-rabbitmq-consumers" "--queue=$queue" &> "${STATE_FILE}-tmp";
|
||||
mv "${STATE_FILE}-tmp" "$STATE_FILE"
|
|
@ -81,14 +81,9 @@ if supervisorctl status | grep -vq RUNNING || supervisorctl status | sed 's/^.*u
|
|||
fi
|
||||
|
||||
echo; echo "Now running RabbitMQ consumer Nagios tests"; echo
|
||||
# First run the check that usually runs in cron and populates the state files
|
||||
/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
|
||||
for consumer in notify_tornado user_activity user_activity_interval user_presence invites signups message_sender feedback_messages error_reports digest_emails email_mirror missedmessage_mobile_notifications; do
|
||||
if ! /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file "$consumer"; then
|
||||
# Temporary section while we're debugging why this fails nondeterministically in CI
|
||||
STATE_DIR=/var/lib/nagios_state
|
||||
ls "$STATE_DIR"
|
||||
cat "$STATE_DIR"/*
|
||||
/home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file "$consumer"
|
||||
fi
|
||||
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
|
||||
set +x
|
||||
echo
|
||||
|
|
Loading…
Reference in New Issue