2020-03-21 13:10:22 +01:00
|
|
|
import os
|
|
|
|
import re
|
|
|
|
import time
|
|
|
|
import subprocess
|
|
|
|
import json
|
|
|
|
|
|
|
|
from collections import defaultdict
|
|
|
|
from typing import Any, DefaultDict, Dict, List
|
|
|
|
|
|
|
|
# Directory three levels above this file's absolute path; used below to
# locate scripts/get-django-setting.
ZULIP_PATH = os.path.abspath(
    os.path.join(os.path.abspath(__file__), os.pardir, os.pardir, os.pardir),
)
|
|
|
|
|
|
|
|
# Queues that get the stats-file based analysis: check_rabbitmq_queues()
# looks for a "<queue>.stats" file for each of these that has a consumer,
# and check_other_queues() skips them.
normal_queues = [
    'deferred_work',
    'digest_emails',
    'email_mirror',
    'embed_links',
    'embedded_bots',
    'error_reports',
    'invites',
    'email_senders',
    'missedmessage_emails',
    'missedmessage_mobile_notifications',
    'outgoing_webhooks',
    'signups',
    'user_activity',
    'user_activity_interval',
    'user_presence',
]
|
|
|
|
|
|
|
|
# Numeric status codes attached to every check result.  The overall status
# printed by check_rabbitmq_queues() is the numerically largest code among
# all results.
OK = 0
WARNING = 1
CRITICAL = 2
UNKNOWN = 3

# Label printed next to each numeric status code in the output line.
states = {
    OK: "OK",
    WARNING: "WARNING",
    CRITICAL: "CRITICAL",
    UNKNOWN: "UNKNOWN",
}
|
|
|
|
|
2020-04-22 01:09:50 +02:00
|
|
|
# Per-queue thresholds, in seconds, on the estimated time needed to clear a
# queue's backlog (queue size * recent average consume time).  Each mapping
# is a defaultdict: queues without an explicit entry get the lambda's value.
#
# analyze_queue_stats() uses the *_FOR_BURSTS pair while the queue was
# emptied recently and the *_NORMAL pair otherwise.  Exceeding the MAX_*
# value yields at least WARNING; additionally exceeding the CRITICAL_*
# value yields CRITICAL.
MAX_SECONDS_TO_CLEAR_FOR_BURSTS: DefaultDict[str, int] = defaultdict(
    lambda: 120,
    digest_emails=600,
)
MAX_SECONDS_TO_CLEAR_NORMAL: DefaultDict[str, int] = defaultdict(
    lambda: 30,
    digest_emails=1200,
    missedmessage_mobile_notifications=120,
)
CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS: DefaultDict[str, int] = defaultdict(
    lambda: 240,
    digest_emails=1200,
)
# NOTE(review): for digest_emails the critical threshold here (600) is lower
# than the warning threshold in MAX_SECONDS_TO_CLEAR_NORMAL (1200), so that
# queue can never report WARNING in the non-burst case and goes straight
# from OK to CRITICAL at 1200s.  Values are left unchanged -- confirm the
# intended numbers before adjusting.
CRITICAL_SECONDS_TO_CLEAR_NORMAL: DefaultDict[str, int] = defaultdict(
    lambda: 60,
    missedmessage_mobile_notifications=180,
    digest_emails=600,
)
|
2020-03-21 13:10:22 +01:00
|
|
|
|
|
|
|
def analyze_queue_stats(queue_name: str, stats: Dict[str, Any],
                        queue_count_rabbitmqctl: int) -> Dict[str, Any]:
    """Classify the health of one queue from its stats data.

    Args:
        queue_name: Name of the queue; also the key used to look up the
            per-queue thresholds.
        stats: Parsed stats data for the queue.  An empty dict means the
            data was missing or invalid.  Otherwise the keys read are
            'update_time', 'current_queue_size',
            'recent_average_consume_time' and
            'queue_last_emptied_timestamp'.
        queue_count_rabbitmqctl: Number of events in the queue as reported
            by rabbitmqctl.

    Returns:
        A dict with keys 'status' (OK, WARNING, CRITICAL or UNKNOWN),
        'name' (the queue name) and 'message' (empty when status is OK).
    """
    now = int(time.time())
    if not stats:
        return dict(status=UNKNOWN,
                    name=queue_name,
                    message='invalid or no stats data')

    last_update = stats['update_time']
    if now - last_update > 180 and queue_count_rabbitmqctl > 10:
        # Queue isn't updating the stats file and has some events in
        # the backlog, it's likely stuck.
        #
        # TODO: There's an unfortunate race where if the queue has
        # been empty for the last hour (because there haven't been 50
        # new events in the last hour), and then gets a burst, this
        # condition will be true for the first (event_handling_time *
        # 50).
        return dict(status=CRITICAL,
                    name=queue_name,
                    message=f'queue appears to be stuck, last update {last_update}, '
                            f'queue size {queue_count_rabbitmqctl}')

    current_size = stats['current_queue_size']
    average_consume_time = stats['recent_average_consume_time']
    if average_consume_time is None:
        # Queue just started; we can't effectively estimate anything.
        #
        # If the queue is stuck in this state and not processing
        # anything, eventually the `update_time` rule above will fire.
        return dict(status=OK,
                    name=queue_name,
                    message='')

    expected_time_to_clear_backlog = current_size * average_consume_time
    time_since_emptied = now - stats['queue_last_emptied_timestamp']

    # Pick the threshold pair (and the wording of the message) that applies.
    if time_since_emptied > max(300, CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS[queue_name]):
        # We need the max() expression in case the rules for the queue
        # permit longer processing times than 300s - to prevent
        # incorrectly throwing an error by changing the classification
        # of the backlog from "burst" to "not burst" after 300s,
        # while the worker is still processing it and staying below
        # the CRITICAL threshold.
        warn_after = MAX_SECONDS_TO_CLEAR_NORMAL[queue_name]
        critical_after = CRITICAL_SECONDS_TO_CLEAR_NORMAL[queue_name]
        backlog_kind = 'backlog'
    else:
        # We slept recently, so treat this as a burst.
        warn_after = MAX_SECONDS_TO_CLEAR_FOR_BURSTS[queue_name]
        critical_after = CRITICAL_SECONDS_TO_CLEAR_FOR_BURSTS[queue_name]
        backlog_kind = 'burst'

    if expected_time_to_clear_backlog <= warn_after:
        return dict(status=OK,
                    name=queue_name,
                    message='')

    status = CRITICAL if expected_time_to_clear_backlog > critical_after else WARNING
    return dict(status=status,
                name=queue_name,
                message=f'clearing the {backlog_kind} will take too long: '
                        f'{expected_time_to_clear_backlog}s, size: {current_size}')
|
|
|
|
|
|
|
|
# Queue-length thresholds used by check_other_queues(): a count above the
# first yields WARNING, a count above the second yields CRITICAL.
WARN_COUNT_THRESHOLD_DEFAULT = 10
CRITICAL_COUNT_THRESHOLD_DEFAULT = 50
|
|
|
|
def check_other_queues(queue_counts_dict: Dict[str, int]) -> List[Dict[str, Any]]:
    """Do a simple queue size check for queues whose workers don't publish stats files.

    Args:
        queue_counts_dict: Mapping of queue name to its current event count.

    Returns:
        One result dict (keys 'status', 'name', 'message') for every queue
        in the mapping that is not listed in normal_queues, in the
        mapping's iteration order.
    """
    results = []
    for queue, count in queue_counts_dict.items():
        if queue in normal_queues:
            # These queues are skipped here; only the remaining ones get
            # the plain size check.
            continue

        if count > CRITICAL_COUNT_THRESHOLD_DEFAULT:
            status, message = CRITICAL, f'count critical: {count}'
        elif count > WARN_COUNT_THRESHOLD_DEFAULT:
            status, message = WARNING, f'count warning: {count}'
        else:
            status, message = OK, ''
        results.append(dict(status=status, name=queue, message=message))

    return results
|
|
|
|
|
|
|
|
def check_rabbitmq_queues() -> None:
    """Check all RabbitMQ queues and print a one-line status summary.

    Queue sizes and consumers are read from ``rabbitmqctl``.  Queues in
    normal_queues that have a consumer are analyzed from their
    "<queue>.stats" file in the directory printed by
    ``scripts/get-django-setting QUEUE_STATS_DIR``; all queues not in
    normal_queues get a plain size check.

    The printed line has the form ``<unix time>|<status code>|<status
    label>|<message>``, where the status is the largest status code among
    all individual results.

    Raises:
        subprocess.CalledProcessError: if any of the invoked commands exits
            with a non-zero status.
    """
    pattern = re.compile(r'(\w+)\t(\d+)')
    if 'USER' in os.environ and os.environ['USER'] not in ['root', 'rabbitmq']:
        # Only a notice is printed; execution continues regardless.
        print("This script must be run as the root or rabbitmq user")

    list_queues_output = subprocess.check_output(['/usr/sbin/rabbitmqctl', 'list_queues'],
                                                 universal_newlines=True)
    list_consumers_output = subprocess.check_output(['/usr/sbin/rabbitmqctl', 'list_consumers'],
                                                    universal_newlines=True)

    # Lines of the form "<name>\t<count>" give the queue sizes; anything
    # else in the output is ignored.
    queue_counts_rabbitmqctl = {}
    for line in list_queues_output.split("\n"):
        m = pattern.match(line.strip())
        if m:
            queue_counts_rabbitmqctl[m.group(1)] = int(m.group(2))

    # The first tab-separated field of each consumer line is the queue name.
    queues_with_consumers = set()
    for line in list_consumers_output.split('\n'):
        parts = line.split('\t')
        if len(parts) >= 2 and not parts[0].startswith("notify_tornado"):
            queues_with_consumers.add(parts[0])

    queue_stats_dir = subprocess.check_output([os.path.join(ZULIP_PATH, 'scripts/get-django-setting'),
                                               'QUEUE_STATS_DIR'],
                                              universal_newlines=True).strip()
    queue_stats: Dict[str, Dict[str, Any]] = {}
    # Walk normal_queues (rather than a set intersection) so the results,
    # and therefore the printed message, come out in a stable order.
    for queue in normal_queues:
        if queue not in queues_with_consumers:
            continue

        file_path = os.path.join(queue_stats_dir, queue + ".stats")
        try:
            with open(file_path) as f:
                queue_stats[queue] = json.load(f)
        except (FileNotFoundError, json.decoder.JSONDecodeError):
            # A missing or unparseable stats file is reported as UNKNOWN by
            # analyze_queue_stats() via the empty dict.
            queue_stats[queue] = {}

    results = [
        analyze_queue_stats(queue_name, stats, queue_counts_rabbitmqctl[queue_name])
        for queue_name, stats in queue_stats.items()
    ]
    results.extend(check_other_queues(queue_counts_rabbitmqctl))

    status = max(result['status'] for result in results)

    now = int(time.time())

    if status > 0:
        queue_error_template = "queue {} problem: {}:{}"
        error_message = '; '.join(
            queue_error_template.format(result['name'], states[result['status']], result['message'])
            for result in results if result['status'] > 0
        )
        print(f"{now}|{status}|{states[status]}|{error_message}")
    else:
        print(f"{now}|{status}|{states[status]}|queues normal")
|