mirror of https://github.com/zulip/zulip.git
nagios: Switch to generic check_cron_file for queues and consumers.
These share a common root; commit 91da4bd59b duplicated the code, but didn't move the existing uses to the new utility.
This commit is contained in:
parent
b2d0bad9af
commit
41deef40cf
|
@ -1,25 +0,0 @@
|
|||
#!/usr/bin/env python3
"""
Nagios plugin to check that a RabbitMQ queue has the correct number of consumers.

Usage: check_rabbitmq_consumers <queue>

This script just reports the contents of
/var/lib/nagios_state/check-rabbitmq-consumers-<queue>, where <queue> is the
first command-line argument.  That state file is generated by
scripts/nagios/check-rabbitmq-consumers, which is run by cron; the crontab
can be found at puppet/zulip/files/cron.d/rabbitmq-monitoring
"""
import sys

# The helper lives in the deployment tree rather than on the default module
# search path, so the path has to be extended before importing it.
sys.path.append("/home/zulip/deployments/current/scripts/nagios")
from cron_file_helper import nagios_from_file

if len(sys.argv) < 2:
    # Nagios displays the plugin's stdout, so the message stays on stdout.
    print("Please pass the name of the consumer file to check")
    # 3 is the Nagios UNKNOWN status, the same code nagios_from_file uses
    # whenever it cannot evaluate the check; 1 would be reported as WARNING.
    sys.exit(3)

RESULTS_FILE = f"/var/lib/nagios_state/check-rabbitmq-consumers-{sys.argv[1]}"

ret, result = nagios_from_file(RESULTS_FILE)

print(result)
# sys.exit rather than the bare exit() builtin: the latter is injected by the
# site module for interactive use and is not guaranteed to exist.
sys.exit(ret)
|
|
@ -1,21 +0,0 @@
|
|||
#!/usr/bin/env python3
"""
Nagios plugin to check that the RabbitMQ queues are not overflowing as a result
of a stuck consumer.

This script just reports the contents of
/var/lib/nagios_state/check-rabbitmq-results, which is generated by
scripts/nagios/check-rabbitmq-queue.

That generating script is run by cron; the crontab is in
puppet/zulip/files/cron.d/rabbitmq-monitoring
"""
import sys

# The helper lives in the deployment tree rather than on the default module
# search path, so the path has to be extended before importing it.
sys.path.append("/home/zulip/deployments/current/scripts/nagios")
from cron_file_helper import nagios_from_file

RESULTS_FILE = "/var/lib/nagios_state/check-rabbitmq-results"

ret, result = nagios_from_file(RESULTS_FILE)

print(result)
# sys.exit rather than the bare exit() builtin: the latter is injected by the
# site module for interactive use and is not guaranteed to exist.
sys.exit(ret)
|
|
@ -112,7 +112,7 @@ define command{
|
|||
|
||||
# Nagios command object "check_rabbitmq_queues": connects to the monitored host
# with check_by_ssh (SSH port taken from $ARG1$) and runs a plugin there.
# This diff hunk lists both the pre-change and post-change command_line.
define command{
|
||||
command_name check_rabbitmq_queues
|
||||
# Pre-change line: runs the dedicated check_rabbitmq_queues plugin.
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_queues'
|
||||
# Post-change line: runs the generic check_cron_file plugin on the queue results file.
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file /var/lib/nagios_state/check-rabbitmq-results'
|
||||
}
|
||||
|
||||
define command{
|
||||
|
@ -122,7 +122,7 @@ define command{
|
|||
|
||||
# Nagios command object "check_rabbitmq_consumers": connects to the monitored
# host with check_by_ssh on port 22 and runs a plugin there; $ARG1$ selects
# which consumer is checked.
# This diff hunk lists both the pre-change and post-change command_line.
define command{
|
||||
command_name check_rabbitmq_consumers
|
||||
# Pre-change line: runs the dedicated check_rabbitmq_consumers plugin with $ARG1$ as its argument.
command_line /usr/lib/nagios/plugins/check_by_ssh -p 22 -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers $ARG1$'
|
||||
# Post-change line: runs the generic check_cron_file plugin on the per-consumer state file.
command_line /usr/lib/nagios/plugins/check_by_ssh -p 22 -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file /var/lib/nagios_state/check-rabbitmq-consumers-$ARG1$'
|
||||
}
|
||||
|
||||
define command{
|
||||
|
|
|
@ -1,41 +0,0 @@
|
|||
import time
|
||||
from typing import Tuple
|
||||
|
||||
|
||||
def nagios_from_file(results_file: str, max_time_diff: int = 60 * 2) -> Tuple[int, str]:
    """Return a Nagios return code and status string parsed from a file on disk.

    The file must consist of exactly four "|"-separated fields:

        timestamp|return_code|state|message

    where timestamp is an integer number of seconds since the epoch and
    return_code is the integer Nagios exit status to report.

    Such files are created by various Nagios checking cron jobs such as
    check-rabbitmq-queues and check-rabbitmq-consumers.

    Args:
        results_file: Path of the state file to read.
        max_time_diff: Maximum age of the file's timestamp, in seconds,
            before the result is considered stale.  Defaults to two minutes.

    Returns:
        A (return_code, "STATE: message") tuple.  If the file is missing,
        malformed (wrong field count or a non-integer timestamp/return code)
        or stale, the result is (3, "UNKNOWN: <reason>").
    """
    try:
        with open(results_file) as f:
            data = f.read().strip()
    except FileNotFoundError:
        return (3, "UNKNOWN: Results file is missing")

    pieces = data.split("|")
    if len(pieces) != 4:
        return (3, "UNKNOWN: Results file malformed")

    # A corrupt or partially written file must surface as UNKNOWN rather than
    # crash the plugin with an uncaught ValueError.
    try:
        timestamp = int(pieces[0])
    except ValueError:
        return (3, "UNKNOWN: Results file malformed")

    # Staleness is checked before the return code is parsed, so an outdated
    # file is reported as stale regardless of the rest of its contents.
    if time.time() - timestamp > max_time_diff:
        return (3, "UNKNOWN: Results file is stale")

    try:
        ret = int(pieces[1])
    except ValueError:
        return (3, "UNKNOWN: Results file malformed")

    return (ret, f"{pieces[2]}: {pieces[3]}")
|
|
@ -129,7 +129,7 @@ echo
|
|||
# Then, compute the list of all Django queue workers to run Nagios checks against
|
||||
consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer)
|
||||
for consumer in $consumer_list; do
|
||||
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
|
||||
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file "/var/lib/nagios_state/check-rabbitmq-consumers-$consumer"; then
|
||||
set +x
|
||||
echo
|
||||
echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"
|
||||
|
|
Loading…
Reference in New Issue