diff --git a/puppet/zulip/files/cron.d/rabbitmq-numconsumers b/puppet/zulip/files/cron.d/rabbitmq-numconsumers index 0bf9f81002..b9cb5e72ab 100644 --- a/puppet/zulip/files/cron.d/rabbitmq-numconsumers +++ b/puppet/zulip/files/cron.d/rabbitmq-numconsumers @@ -2,15 +2,4 @@ SHELL=/bin/bash PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin USER=root -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file notify_tornado -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_activity -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_activity_interval -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file user_presence -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file invites -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file signups -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file message_sender -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file feedback_messages -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file error_reports -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file digest_emails -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file email_mirror -* * * * * root /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file missedmessage_mobile_notifications +* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers diff --git a/scripts/nagios/check-rabbitmq-consumers b/scripts/nagios/check-rabbitmq-consumers index 72b1147416..24d2ff8294 100755 --- a/scripts/nagios/check-rabbitmq-consumers +++ b/scripts/nagios/check-rabbitmq-consumers @@ -22,10 +22,6 @@ if 'USER' in os.environ and not os.environ['USER'] in ['root', 'rabbitmq']: usage = """Usage: check-rabbitmq-consumers --queue=[queue-name] --min-threshold=[min-threshold]""" parser = optparse.OptionParser(usage=usage) -parser.add_option('--queue', - dest='queue_name', - default="notify_tornado", - action='store') parser.add_option('--min-threshold', dest='min_count', type="int", @@ -39,19 +35,41 @@ output = subprocess.check_output(['/usr/sbin/rabbitmqctl', 'list_consumers'], consumers = defaultdict(int) # type: Dict[str, int] +worker_queues = {'error_reports', + 'user_presence', + 'digest_emails', + 'slow_queries', + 'missedmessage_mobile_notifications', + 'feedback_messages', + 'signups', + 'notify_tornado', + 'message_sender', + 'missedmessage_emails', + 'email_mirror', + 'user_activity_interval', + 'invites', + 'user_activity'} + +for queue_name in worker_queues: + consumers[queue_name] = 0 + for line in output.split('\n'): parts = line.split('\t') - if len(parts) and parts[0] == options.queue_name: + if len(parts) >= 2: consumers[parts[0]] += 1 - now = int(time.time()) -if consumers[options.queue_name] < options.min_count: - status = 2 -else: - status = 0 +for queue_name in consumers.keys(): + state_file_path = "/var/lib/nagios_state/check-rabbitmq-consumers-" + queue_name + state_file_tmp = state_file_path + "-tmp" -print("%s|%s|%s|queue %s has %s consumers, needs %s" % ( - now, status, states[status], options.queue_name, - consumers[options.queue_name], options.min_count)) + if consumers[queue_name] < options.min_count: + status = 2 + else: + status = 0 + with open(state_file_tmp, "w") as f: + f.write("%s|%s|%s|queue %s has %s consumers, needs %s\n" % ( + now, status, states[status], queue_name, + consumers[queue_name], options.min_count)) + subprocess.check_call(["mv", state_file_tmp, state_file_path]) diff --git a/scripts/nagios/write-rabbitmq-consumers-state-file b/scripts/nagios/write-rabbitmq-consumers-state-file deleted file mode 100755 index 3009b9aba5..0000000000 --- a/scripts/nagios/write-rabbitmq-consumers-state-file +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash -set -e - -queue=$1 - -if [ -z "$queue" ]; then - echo "Usage: $0 " - exit 2 -fi - -ZULIP_DIR=/home/zulip/deployments/current -STATE_DIR=/var/lib/nagios_state -STATE_FILE=$STATE_DIR/check-rabbitmq-consumers-$queue - -"$ZULIP_DIR/scripts/nagios/check-rabbitmq-consumers" "--queue=$queue" &> "${STATE_FILE}-tmp"; -mv "${STATE_FILE}-tmp" "$STATE_FILE" diff --git a/tools/travis/production-helper b/tools/travis/production-helper index a56e602a42..8f9e498caf 100755 --- a/tools/travis/production-helper +++ b/tools/travis/production-helper @@ -81,14 +81,9 @@ if supervisorctl status | grep -vq RUNNING || supervisorctl status | sed 's/^.*u fi echo; echo "Now running RabbitMQ consumer Nagios tests"; echo +# First run the check that usually runs in cron and populates the state files +/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers for consumer in notify_tornado user_activity user_activity_interval user_presence invites signups message_sender feedback_messages error_reports digest_emails email_mirror missedmessage_mobile_notifications; do - if ! /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file "$consumer"; then - # Temporary section while we're debugging why this fails nondeterministically in CI - STATE_DIR=/var/lib/nagios_state - ls "$STATE_DIR" - cat "$STATE_DIR"/* - /home/zulip/deployments/current/scripts/nagios/write-rabbitmq-consumers-state-file "$consumer" - fi if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then set +x echo