mirror of https://github.com/zulip/zulip.git
Fix excessive CPU usage by rabbitmq-numconsumers Nagios checks.
The previous model for these Nagios checks was kinda crazy -- every minute, we'd run a full `rabbitmqctl list_consumers` for each of the dozen+ consumers that we have, and then run the exact same parsing logic for each one to determine whether the target queue has a running consumer, so that we could write out its state file. Because `rabbitmqctl list_consumers` takes a small amount of resources, on systems where CPU is very limited (e.g. t2-style AWS instances), this minor CPU wastage could be problematic. Now we just run `rabbitmqctl list_consumers` once per minute, and output all the state files from a single command. Further TODO items on this front include removing the hardcoded list of queues.
This commit is contained in:
parent
852af83d3c
commit
88a123d5e0
|
@ -2,15 +2,4 @@ SHELL=/bin/bash
|
||||||
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
|
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
|
||||||
USER=root
|
USER=root
|
||||||
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file notify_tornado
|
* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_activity
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_activity_interval
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_presence
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file invites
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file signups
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file message_sender
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file feedback_messages
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file error_reports
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file digest_emails
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file email_mirror
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file missedmessage_mobile_notifications
|
|
||||||
|
|
|
@ -22,10 +22,6 @@ if 'USER' in os.environ and not os.environ['USER'] in ['root', 'rabbitmq']:
|
||||||
usage = """Usage: check-rabbitmq-consumers --queue=[queue-name] --min-threshold=[min-threshold]"""
|
usage = """Usage: check-rabbitmq-consumers --queue=[queue-name] --min-threshold=[min-threshold]"""
|
||||||
|
|
||||||
parser = optparse.OptionParser(usage=usage)
|
parser = optparse.OptionParser(usage=usage)
|
||||||
parser.add_option('--queue',
|
|
||||||
dest='queue_name',
|
|
||||||
default="notify_tornado",
|
|
||||||
action='store')
|
|
||||||
parser.add_option('--min-threshold',
|
parser.add_option('--min-threshold',
|
||||||
dest='min_count',
|
dest='min_count',
|
||||||
type="int",
|
type="int",
|
||||||
|
@ -39,19 +35,41 @@ output = subprocess.check_output(['/usr/sbin/rabbitmqctl', 'list_consumers'],
|
||||||
|
|
||||||
consumers = defaultdict(int) # type: Dict[str, int]
|
consumers = defaultdict(int) # type: Dict[str, int]
|
||||||
|
|
||||||
|
worker_queues = {'error_reports',
|
||||||
|
'user_presence',
|
||||||
|
'digest_emails',
|
||||||
|
'slow_queries',
|
||||||
|
'missedmessage_mobile_notifications',
|
||||||
|
'feedback_messages',
|
||||||
|
'signups',
|
||||||
|
'notify_tornado',
|
||||||
|
'message_sender',
|
||||||
|
'missedmessage_emails',
|
||||||
|
'email_mirror',
|
||||||
|
'user_activity_interval',
|
||||||
|
'invites',
|
||||||
|
'user_activity'}
|
||||||
|
|
||||||
|
for queue_name in worker_queues:
|
||||||
|
consumers[queue_name] = 0
|
||||||
|
|
||||||
for line in output.split('\n'):
|
for line in output.split('\n'):
|
||||||
parts = line.split('\t')
|
parts = line.split('\t')
|
||||||
if len(parts) and parts[0] == options.queue_name:
|
if len(parts) >= 2:
|
||||||
consumers[parts[0]] += 1
|
consumers[parts[0]] += 1
|
||||||
|
|
||||||
|
|
||||||
now = int(time.time())
|
now = int(time.time())
|
||||||
|
|
||||||
if consumers[options.queue_name] < options.min_count:
|
for queue_name in consumers.keys():
|
||||||
status = 2
|
state_file_path = "/var/lib/nagios_state/check-rabbitmq-consumers-" + queue_name
|
||||||
else:
|
state_file_tmp = state_file_path + "-tmp"
|
||||||
status = 0
|
|
||||||
|
|
||||||
print("%s|%s|%s|queue %s has %s consumers, needs %s" % (
|
if consumers[queue_name] < options.min_count:
|
||||||
now, status, states[status], options.queue_name,
|
status = 2
|
||||||
consumers[options.queue_name], options.min_count))
|
else:
|
||||||
|
status = 0
|
||||||
|
with open(state_file_tmp, "w") as f:
|
||||||
|
f.write("%s|%s|%s|queue %s has %s consumers, needs %s\n" % (
|
||||||
|
now, status, states[status], queue_name,
|
||||||
|
consumers[queue_name], options.min_count))
|
||||||
|
subprocess.check_call(["mv", state_file_tmp, state_file_path])
|
||||||
|
|
|
@ -1,16 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
queue=$1
|
|
||||||
|
|
||||||
if [ -z "$queue" ]; then
|
|
||||||
echo "Usage: $0 <queue-name>"
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
|
|
||||||
ZULIP_DIR=/home/zulip/deployments/current
|
|
||||||
STATE_DIR=/var/lib/nagios_state
|
|
||||||
STATE_FILE=$STATE_DIR/check-rabbitmq-consumers-$queue
|
|
||||||
|
|
||||||
"$ZULIP_DIR/scripts/nagios/check-rabbitmq-consumers" "--queue=$queue" &> "${STATE_FILE}-tmp";
|
|
||||||
mv "${STATE_FILE}-tmp" "$STATE_FILE"
|
|
|
@ -81,14 +81,9 @@ if supervisorctl status | grep -vq RUNNING || supervisorctl status | sed 's/^.*u
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo; echo "Now running RabbitMQ consumer Nagios tests"; echo
|
echo; echo "Now running RabbitMQ consumer Nagios tests"; echo
|
||||||
|
# First run the check that usually runs in cron and populates the state files
|
||||||
|
/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
|
||||||
for consumer in notify_tornado user_activity user_activity_interval user_presence invites signups message_sender feedback_messages error_reports digest_emails email_mirror missedmessage_mobile_notifications; do
|
for consumer in notify_tornado user_activity user_activity_interval user_presence invites signups message_sender feedback_messages error_reports digest_emails email_mirror missedmessage_mobile_notifications; do
|
||||||
if ! /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file "$consumer"; then
|
|
||||||
# Temporary section while we're debugging why this fails nondeterministically in CI
|
|
||||||
STATE_DIR=/var/lib/nagios_state
|
|
||||||
ls "$STATE_DIR"
|
|
||||||
cat "$STATE_DIR"/*
|
|
||||||
/home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file "$consumer"
|
|
||||||
fi
|
|
||||||
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
|
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
|
||||||
set +x
|
set +x
|
||||||
echo
|
echo
|
||||||
|
|
Loading…
Reference in New Issue