mirror of https://github.com/zulip/zulip.git
nagios: Switch to generic check_cron_file for queues and consumers.
These share a common root; 91da4bd59b
duplicated the code, but
didn't move the existing uses to the new utility.
This commit is contained in:
parent
b2d0bad9af
commit
41deef40cf
|
@ -1,25 +0,0 @@
|
||||||
#!/usr/bin/env python3
"""
Nagios plugin to check that RabbitMQ has the correct number of consumers.

This script just checks the contents of
/var/lib/nagios_state/check-rabbitmq-consumers-<name>, where <name> is the
first command-line argument. That file is generated by
scripts/nagios/check-rabbitmq-consumers.

It is run by cron and can be found at puppet/zulip/files/cron.d/rabbitmq-monitoring
"""
import sys

# The shared helper lives in the deployment tree, not on the default import
# path, so the directory has to be added before the import below.
sys.path.append("/home/zulip/deployments/current/scripts/nagios")
from cron_file_helper import nagios_from_file

if len(sys.argv) < 2:
    print("Please pass the name of the consumer file to check")
    # sys.exit, not the bare exit() builtin: exit() is injected by the site
    # module for interactive use and is missing when site is not loaded.
    sys.exit(1)

RESULTS_FILE = f"/var/lib/nagios_state/check-rabbitmq-consumers-{sys.argv[1]}"

# nagios_from_file returns the plugin exit status and the status line to print.
ret, result = nagios_from_file(RESULTS_FILE)

print(result)
sys.exit(ret)
|
|
|
@ -1,21 +0,0 @@
|
||||||
#!/usr/bin/env python3

"""
Nagios plugin to check that the RabbitMQ queues are not overflowing as a result
of a stuck consumer.

This script just checks the contents of /var/lib/nagios_state/check-rabbitmq-results,
which is generated by scripts/nagios/check-rabbitmq-queue.

It is run by cron; the crontab is in puppet/zulip/files/cron.d/rabbitmq-monitoring
"""
import sys

# The shared helper lives in the deployment tree, not on the default import
# path, so the directory has to be added before the import below.
sys.path.append("/home/zulip/deployments/current/scripts/nagios")
from cron_file_helper import nagios_from_file

RESULTS_FILE = "/var/lib/nagios_state/check-rabbitmq-results"

# nagios_from_file returns the plugin exit status and the status line to print.
ret, result = nagios_from_file(RESULTS_FILE)

print(result)
# sys.exit, not the bare exit() builtin: exit() is injected by the site
# module for interactive use and is missing when site is not loaded.
sys.exit(ret)
|
|
|
@ -112,7 +112,7 @@ define command{
|
||||||
|
|
||||||
define command{
|
define command{
|
||||||
command_name check_rabbitmq_queues
|
command_name check_rabbitmq_queues
|
||||||
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_queues'
|
command_line /usr/lib/nagios/plugins/check_by_ssh -p $ARG1$ -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file /var/lib/nagios_state/check-rabbitmq-results'
|
||||||
}
|
}
|
||||||
|
|
||||||
define command{
|
define command{
|
||||||
|
@ -122,7 +122,7 @@ define command{
|
||||||
|
|
||||||
define command{
|
define command{
|
||||||
command_name check_rabbitmq_consumers
|
command_name check_rabbitmq_consumers
|
||||||
command_line /usr/lib/nagios/plugins/check_by_ssh -p 22 -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers $ARG1$'
|
command_line /usr/lib/nagios/plugins/check_by_ssh -p 22 -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file /var/lib/nagios_state/check-rabbitmq-consumers-$ARG1$'
|
||||||
}
|
}
|
||||||
|
|
||||||
define command{
|
define command{
|
||||||
|
|
|
@ -1,41 +0,0 @@
|
||||||
import time
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
|
|
||||||
def nagios_from_file(results_file: str) -> Tuple[int, str]:
    """Return a Nagios exit code and status line parsed from a results file.

    The file is expected to contain exactly four "|"-separated fields:

        timestamp|return_code|state|message

    where timestamp is integer seconds since the epoch and return_code is
    the integer exit status to report.

    This file is created by various nagios checking cron jobs such as
    check-rabbitmq-queues and check-rabbitmq-consumers.

    Returns:
        A tuple (ret, "STATE: message"). UNKNOWN (3) is reported, with an
        explanatory message, when the file is missing, does not have
        exactly four fields, holds a non-integer timestamp or return code,
        or carries a timestamp more than two minutes old.
    """
    missing = (3, "UNKNOWN: Results file is missing")
    malformed = (3, "UNKNOWN: Results file malformed")
    stale = (3, "UNKNOWN: Results file is stale")

    try:
        with open(results_file) as f:
            data = f.read().strip()
    except FileNotFoundError:
        return missing

    pieces = data.split("|")
    if len(pieces) != 4:
        return malformed

    # A non-numeric field is a malformed file, not a reason for the plugin
    # to die with a traceback.
    try:
        timestamp = int(pieces[0])
    except ValueError:
        return malformed

    # Results older than two minutes mean the generating cron job has
    # stopped updating the file, so its contents cannot be trusted.
    if time.time() - timestamp > 60 * 2:
        return stale

    try:
        ret = int(pieces[1])
    except ValueError:
        return malformed

    return (ret, f"{pieces[2]}: {pieces[3]}")
|
|
|
@ -129,7 +129,7 @@ echo
|
||||||
# Then, compute the list of all Django queue workers to run Nagios checks against
|
# Then, compute the list of all Django queue workers to run Nagios checks against
|
||||||
consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer)
|
consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer)
|
||||||
for consumer in $consumer_list; do
|
for consumer in $consumer_list; do
|
||||||
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
|
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_cron_file "/var/lib/nagios_state/check-rabbitmq-consumers-$consumer"; then
|
||||||
set +x
|
set +x
|
||||||
echo
|
echo
|
||||||
echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"
|
echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"
|
||||||
|
|
Loading…
Reference in New Issue