diff --git a/puppet/zulip/files/nagios_plugins/zulip_app_frontend/check_rabbitmq_consumers b/puppet/zulip/files/nagios_plugins/zulip_app_frontend/check_rabbitmq_consumers deleted file mode 100755 index 5e4e1fff4b..0000000000 --- a/puppet/zulip/files/nagios_plugins/zulip_app_frontend/check_rabbitmq_consumers +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 - -""" -Nagios plugin to check that the RabbitMQ has the correct number of consumers. - -This script just checks the contents of /var/lib/nagios_state/check-rabbitmq-consumers, -which is generated by scripts/nagios/check-rabbitmq-consumers. - -It is run by cron and can be found at puppet/zulip/files/cron.d/rabbitmq-monitoring -""" -import sys - -sys.path.append("/home/zulip/deployments/current/scripts/nagios") -from cron_file_helper import nagios_from_file - -if len(sys.argv) < 2: - print("Please pass the name of the consumer file to check") - exit(1) - -RESULTS_FILE = f"/var/lib/nagios_state/check-rabbitmq-consumers-{sys.argv[1]}" - -ret, result = nagios_from_file(RESULTS_FILE) - -print(result) -exit(ret) diff --git a/puppet/zulip/files/nagios_plugins/zulip_app_frontend/check_rabbitmq_queues b/puppet/zulip/files/nagios_plugins/zulip_app_frontend/check_rabbitmq_queues deleted file mode 100755 index 8e85d851a2..0000000000 --- a/puppet/zulip/files/nagios_plugins/zulip_app_frontend/check_rabbitmq_queues +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python3 - -""" -Nagios plugin to check that the RabbitMQ queues are not overflowing as a result -of a stuck consumer. - -This script just checks the contents of /var/lib/nagios_state/check-rabbitmq-results, -which is generated by scripts/nagios/check-rabbitmq-queue. - -It is run by cron; the crontab is in puppet/zulip/files/cron.d/rabbitmq-monitoring -""" -import sys - -sys.path.append("/home/zulip/deployments/current/scripts/nagios") -from cron_file_helper import nagios_from_file - -RESULTS_FILE = "/var/lib/nagios_state/check-rabbitmq-results" -ret, result = nagios_from_file(RESULTS_FILE) - -print(result) -exit(ret) diff --git a/puppet/zulip_ops/files/nagios4/commands.cfg b/puppet/zulip_ops/files/nagios4/commands.cfg index c383fbdaf9..d41f013307 100644 --- a/puppet/zulip_ops/files/nagios4/commands.cfg +++ b/puppet/zulip_ops/files/nagios4/commands.cfg @@ -112,7 +112,7 @@ define command{ define command{ command_name check_rabbitmq_queues - command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_queues' + command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file /var/lib/nagios_state/check-rabbitmq-results' } define command{ @@ -122,7 +122,7 @@ define command{ define command{ command_name check_rabbitmq_consumers - command_line /usr/lib/nagios/plugins/check_by_ssh -p 22 -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers $ARG1$' + command_line /usr/lib/nagios/plugins/check_by_ssh -p 22 -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file /var/lib/nagios_state/check-rabbitmq-consumers-$ARG1$' } define command{ diff --git a/scripts/nagios/cron_file_helper.py b/scripts/nagios/cron_file_helper.py deleted file mode 100644 index 8f0f4caf0f..0000000000 --- a/scripts/nagios/cron_file_helper.py +++ /dev/null @@ -1,41 +0,0 @@ -import time -from typing import Tuple - - -def nagios_from_file(results_file: str) -> Tuple[int, str]: - """Returns a nagios-appropriate string and return code obtained by - parsing the desired file on disk. The file on disk should be of format - - %s|%s % (timestamp, nagios_string) - - This file is created by various nagios checking cron jobs such as - check-rabbitmq-queues and check-rabbitmq-consumers""" - - try: - with open(results_file) as f: - data = f.read().strip() - except FileNotFoundError: - state = "UNKNOWN" - ret = 3 - data = "Results file is missing" - else: - pieces = data.split("|") - - if not len(pieces) == 4: - state = "UNKNOWN" - ret = 3 - data = "Results file malformed" - else: - timestamp = int(pieces[0]) - - time_diff = time.time() - timestamp - if time_diff > 60 * 2: - ret = 3 - state = "UNKNOWN" - data = "Results file is stale" - else: - ret = int(pieces[1]) - state = pieces[2] - data = pieces[3] - - return (ret, f"{state}: {data}") diff --git a/tools/ci/production-verify b/tools/ci/production-verify index 389e8ed715..c2076575d7 100755 --- a/tools/ci/production-verify +++ b/tools/ci/production-verify @@ -129,7 +129,7 @@ echo # Then, compute the list of all Django queue workers to run Nagios checks against consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer) for consumer in $consumer_list; do - if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then + if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file "/var/lib/nagios_state/check-rabbitmq-consumers-$consumer"; then set +x echo echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"