nagios: Switch to generic check_cron_file for queues and consumers.

These share a common root; 91da4bd59b duplicated the code, but
didn't move the existing uses to the new utility.
This commit is contained in:
Alex Vandiver 2022-06-14 14:16:17 -07:00 committed by Alex Vandiver
parent b2d0bad9af
commit 41deef40cf
5 changed files with 3 additions and 90 deletions

View File

@ -1,25 +0,0 @@
#!/usr/bin/env python3
"""
Nagios plugin to check that RabbitMQ has the correct number of consumers.

This script just reports the contents of
/var/lib/nagios_state/check-rabbitmq-consumers-<name>, where <name> is the
first command-line argument.  That file is generated by
scripts/nagios/check-rabbitmq-consumers, which is run by cron; the crontab
can be found at puppet/zulip/files/cron.d/rabbitmq-monitoring
"""
import sys

# Make cron_file_helper importable from the deployed scripts/nagios directory.
sys.path.append("/home/zulip/deployments/current/scripts/nagios")
from cron_file_helper import nagios_from_file

if len(sys.argv) < 2:
    print("Please pass the name of the consumer file to check")
    # Use sys.exit rather than the site-provided exit() helper, which is
    # intended for the interactive interpreter only.
    sys.exit(1)

RESULTS_FILE = f"/var/lib/nagios_state/check-rabbitmq-consumers-{sys.argv[1]}"

# Print the status line and exit with the status code read from the file.
ret, result = nagios_from_file(RESULTS_FILE)
print(result)
sys.exit(ret)

View File

@ -1,21 +0,0 @@
#!/usr/bin/env python3
"""
Nagios plugin to check that the RabbitMQ queues are not overflowing as a result
of a stuck consumer.

This script just reports the contents of
/var/lib/nagios_state/check-rabbitmq-results, which is generated by
scripts/nagios/check-rabbitmq-queue.  That script is run by cron; the crontab
is in puppet/zulip/files/cron.d/rabbitmq-monitoring
"""
import sys

# Make cron_file_helper importable from the deployed scripts/nagios directory.
sys.path.append("/home/zulip/deployments/current/scripts/nagios")
from cron_file_helper import nagios_from_file

RESULTS_FILE = "/var/lib/nagios_state/check-rabbitmq-results"

# Print the status line and exit with the status code read from the file.
# sys.exit is used rather than the site-provided exit() helper, which is
# intended for the interactive interpreter only.
ret, result = nagios_from_file(RESULTS_FILE)
print(result)
sys.exit(ret)

View File

@ -112,7 +112,7 @@ define command{
define command{
command_name check_rabbitmq_queues
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_queues'
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file /var/lib/nagios_state/check-rabbitmq-results'
}
define command{
@ -122,7 +122,7 @@ define command{
define command{
command_name check_rabbitmq_consumers
command_line /usr/lib/nagios/plugins/check_by_ssh -p 22 -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers $ARG1$'
command_line /usr/lib/nagios/plugins/check_by_ssh -p 22 -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file /var/lib/nagios_state/check-rabbitmq-consumers-$ARG1$'
}
define command{

View File

@ -1,41 +0,0 @@
import time
from typing import Tuple
def nagios_from_file(results_file: str) -> Tuple[int, str]:
    """Return a Nagios return code and status line parsed from a results file.

    The file on disk is expected to hold four "|"-separated fields:

        timestamp|return_code|state|message

    where timestamp and return_code are integers.  Such files are created
    by various Nagios-checking cron jobs, such as check-rabbitmq-queues
    and check-rabbitmq-consumers.

    Returns a tuple (return_code, "STATE: message").  If the file is
    missing, malformed, or its timestamp is more than two minutes old,
    returns code 3 with an "UNKNOWN: ..." explanation instead.
    """
    try:
        with open(results_file) as f:
            data = f.read().strip()
    except FileNotFoundError:
        return (3, "UNKNOWN: Results file is missing")

    pieces = data.split("|")
    if len(pieces) != 4:
        return (3, "UNKNOWN: Results file malformed")

    # A non-numeric timestamp means the file is corrupt; report that as
    # UNKNOWN rather than crashing with a traceback.
    try:
        timestamp = int(pieces[0])
    except ValueError:
        return (3, "UNKNOWN: Results file malformed")

    # Results older than two minutes are not trusted.
    if time.time() - timestamp > 60 * 2:
        return (3, "UNKNOWN: Results file is stale")

    # Likewise for a non-numeric return code.
    try:
        ret = int(pieces[1])
    except ValueError:
        return (3, "UNKNOWN: Results file malformed")

    return (ret, f"{pieces[2]}: {pieces[3]}")

View File

@ -129,7 +129,7 @@ echo
# Then, compute the list of all Django queue workers to run Nagios checks against
consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer)
for consumer in $consumer_list; do
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file "/var/lib/nagios_state/check-rabbitmq-consumers-$consumer"; then
set +x
echo
echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"