@dataclass
class NagiosResult:
    """Outcome of one monitoring check: a severity keyword plus explanatory text.

    Instances are what ``get_fill_state`` returns; ``handle`` forwards the
    two fields unchanged to ``atomic_nagios_write``.
    """

    # Severity keyword; limited to the four values listed in the Literal.
    status: Literal["ok", "warning", "critical", "unknown"]
    # Human-readable description of what the check found.
    message: str
@override
def handle(self, *args: Any, **options: Any) -> None:
    """Run the FillState check and record its outcome in the Nagios state file.

    The positional and keyword arguments supplied by the management-command
    framework are accepted but not used.
    """
    result = self.get_fill_state()
    atomic_nagios_write(
        "check-analytics-state",
        result.status,
        result.message,
    )
NagiosResult(status="ok", message="FillState looks fine.") if len(critical_unfilled_properties) == 0: - return { - "status": 1, - "message": "Missed filling {} once.".format( + return NagiosResult( + status="warning", + message="Missed filling {} once.".format( ", ".join(warning_unfilled_properties), ), - } - return { - "status": 2, - "message": "Missed filling {} once. Missed filling {} at least twice.".format( + ) + return NagiosResult( + status="critical", + message="Missed filling {} once. Missed filling {} at least twice.".format( ", ".join(warning_unfilled_properties), ", ".join(critical_unfilled_properties), ), - } + ) diff --git a/puppet/kandra/files/cron.d/rabbitmq-monitoring b/puppet/kandra/files/cron.d/rabbitmq-monitoring deleted file mode 100644 index 6de866fe2e..0000000000 --- a/puppet/kandra/files/cron.d/rabbitmq-monitoring +++ /dev/null @@ -1,26 +0,0 @@ -# Edit this file to introduce tasks to be run by cron. -# -# Each task to run has to be defined through a single line -# indicating with different fields when the task will be run -# and what command to run for the task -# -# To define the time you can provide concrete values for -# minute (m), hour (h), day of month (dom), month (mon), -# and day of week (dow) or use '*' in these fields (for 'any').# -# Notice that tasks will be started based on the cron's system -# daemon's notion of time and timezones. -# -# Output of the crontab jobs (including errors) is sent through -# email to the user the crontab file belongs to (unless redirected). 
-# -# For example, you can run a backup of all your user accounts -# at 5 a.m every week with: -# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/ -# -# For more information see the manual pages of crontab(5) and cron(8) -# -# m h dom mon dow command -SHELL=/bin/bash - -* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-queue &> /var/lib/nagios_state/check-rabbitmq-results-tmp; mv /var/lib/nagios_state/check-rabbitmq-results-tmp /var/lib/nagios_state/check-rabbitmq-results -* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers diff --git a/puppet/kandra/files/nagios_plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness b/puppet/kandra/files/nagios_plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness index 685b9fa144..adb91379ea 100755 --- a/puppet/kandra/files/nagios_plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness +++ b/puppet/kandra/files/nagios_plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness @@ -10,15 +10,14 @@ Django ORM. 
import os import sys from datetime import timedelta -from typing import NoReturn, Optional sys.path.append("/home/zulip/deployments/current") from scripts.lib.setup_path import setup_path +from scripts.lib.zulip_tools import atomic_nagios_write setup_path() import django -from django.db.models import QuerySet from django.utils.timezone import now as timezone_now os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings" @@ -27,51 +26,13 @@ sys.path.append("/home/zulip/deployments/current/zerver") django.setup() -from typing import Dict from zerver.models import UserActivity from zerver.models.clients import get_client -states: Dict[str, int] = { - "OK": 0, - "WARNING": 1, - "CRITICAL": 2, - "UNKNOWN": 3, -} - -state_file_path = "/var/lib/nagios_state/check_user_zephyr_mirror_liveness" now = timezone_now() -def report( - state: str, short_msg: str, all_users: Optional[QuerySet[UserActivity]] = None -) -> NoReturn: - too_old_data = "" - if all_users is not None: - recently_inactive_users = ( - all_users.filter(last_visit__lt=now - timedelta(minutes=10)) - .distinct("user_profile_id") - .difference( - all_users.filter(last_visit__lt=now - timedelta(minutes=60)).distinct( - "user_profile_id" - ) - ) - ) - too_old_data = "\nLast call to get_message for recently out of date mirrors:\n" + "\n".join( - "{:>16}: {}".format( - user.user_profile.email, - user.last_visit.strftime("%Y-%m-%d %H:%M %Z"), - ) - for user in recently_inactive_users - ) - - with open(state_file_path + ".tmp", "w") as f: - f.write(f"{int(now.timestamp())}|{states[state]}|{state}|{short_msg}{too_old_data}") - os.rename(state_file_path + ".tmp", state_file_path) - print(f"{state}: {short_msg}{too_old_data}") - sys.exit(states[state]) - - zephyr_client = get_client("zephyr_mirror") all_users = UserActivity.objects.filter( # We need to use the client_id so we can use the partial index we @@ -84,23 +45,51 @@ all_users = UserActivity.objects.filter( query__in=["get_events", "/api/v1/events"], 
# Number of distinct users whose last_visit is more than 10 minutes old.
new_inactive_user_count = (
    all_users.filter(last_visit__lt=now - timedelta(minutes=10))
    .values("user_profile_id")
    .distinct("user_profile_id")
    .count()
)

# Number of distinct users whose last_visit is more than 60 minutes old.
old_inactive_user_count = (
    all_users.filter(last_visit__lt=now - timedelta(minutes=60))
    .values("user_profile_id")
    .distinct("user_profile_id")
    .count()
)

# Users that went quiet somewhere between 10 and 60 minutes ago.
recently_inactive_user_count = new_inactive_user_count - old_inactive_user_count

# Alert when the recently-inactive group exceeds 25% of the long-inactive
# group.  The comparison is written as a multiplication rather than a
# division so that an empty long-inactive group (count of 0) cannot raise
# ZeroDivisionError and abort the check before any state is written; in that
# case any recently-inactive mirror at all is reported as critical.
if recently_inactive_user_count > 0.25 * old_inactive_user_count:
    recently_inactive_users = (
        all_users.filter(last_visit__lt=now - timedelta(minutes=10))
        .distinct("user_profile_id")
        .difference(
            all_users.filter(last_visit__lt=now - timedelta(minutes=60)).distinct("user_profile_id")
        )
    )
    # Summary line followed by one "email: last visit" row per affected user.
    too_old_data = (
        "Many mirrors recently became inactive\n"
        "Last call to get_message for recently out of date mirrors:\n"
        + "\n".join(
            "{:>16}: {}".format(
                user.user_profile.email,
                user.last_visit.strftime("%Y-%m-%d %H:%M %Z"),
            )
            for user in recently_inactive_users
        )
    )

    # atomic_nagios_write returns the numeric status, which doubles as the
    # process exit code.
    sys.exit(
        atomic_nagios_write(
            "check_user_zephyr_mirror_liveness", "critical", too_old_data, int(now.timestamp())
        )
    )
else:
    atomic_nagios_write(
        "check_user_zephyr_mirror_liveness",
        "ok",
        "Most mirrors that were recently active continue to be active",
        int(now.timestamp()),
    )
kandra::prometheus::uwsgi include kandra::prometheus::process kandra::firewall_allow { 'grok_exporter': port => '9144' } + file { '/etc/cron.d/rabbitmq-monitoring': - ensure => file, - require => Package[rabbitmq-server], - owner => 'root', - group => 'root', - mode => '0644', - source => 'puppet:///modules/kandra/cron.d/rabbitmq-monitoring', + ensure => absent, + } + zulip::cron { 'check-rabbitmq-queue': + minute => '*', + command => '/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-queue', + } + zulip::cron { 'check-rabbitmq-consumers': + minute => '*', + command => '/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers', } } diff --git a/puppet/kandra/manifests/prod_app_frontend_once.pp b/puppet/kandra/manifests/prod_app_frontend_once.pp index 989c2e680b..974dab6504 100644 --- a/puppet/kandra/manifests/prod_app_frontend_once.pp +++ b/puppet/kandra/manifests/prod_app_frontend_once.pp @@ -27,6 +27,6 @@ class kandra::prod_app_frontend_once { zulip::cron { 'check_user_zephyr_mirror_liveness': hour => '*', minute => '*', - command => '/usr/lib/nagios/plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness >/dev/null', + command => '/usr/lib/nagios/plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness', } } diff --git a/puppet/zulip/files/nagios_plugins/zulip_app_frontend/check_send_receive_time b/puppet/zulip/files/nagios_plugins/zulip_app_frontend/check_send_receive_time index e7c6a1cc4b..dd62dc98f4 100755 --- a/puppet/zulip/files/nagios_plugins/zulip_app_frontend/check_send_receive_time +++ b/puppet/zulip/files/nagios_plugins/zulip_app_frontend/check_send_receive_time @@ -13,11 +13,12 @@ import random import sys import time import traceback -from typing import Any, Dict, List, NoReturn, Optional +from typing import Any, Dict, List, Literal, NoReturn, Optional sys.path.append(".") sys.path.append("/home/zulip/deployments/current") from scripts.lib.setup_path import setup_path +from scripts.lib.zulip_tools import 
def report(
    state: Literal["ok", "warning", "critical", "unknown"],
    timestamp: Optional[float] = None,
    msg: Optional[str] = None,
) -> NoReturn:
    """Write the check result to the Nagios state file and exit the process.

    When no explicit ``msg`` is given, the message reports ``timestamp`` as
    the measured send time.  The exit code is the integer returned by
    ``atomic_nagios_write``.
    """
    text = f"send time was {timestamp}" if msg is None else msg
    exit_code = atomic_nagios_write("check_send_receive_state", state, text)
    sys.exit(exit_code)


def send_zulip(sender: zulip.Client, message: Dict[str, Any]) -> None:
    """Send ``message`` through ``sender``; report critical unless the API says success."""
    response = sender.send_message(message)
    if response["result"] == "success":
        return
    report("critical", msg=f"Error sending Zulip, args were: {message}, {response}")


def get_zulips() -> List[Dict[str, Any]]:
    """Fetch pending events from the recipient's queue and return their messages.

    Advances the module-level ``last_event_id`` past every event received.
    Reports critical (and therefore exits) on an error response or when a
    heartbeat event shows up.
    """
    global last_event_id
    response = zulip_recipient.get_events(queue_id=queue_id, last_event_id=last_event_id)
    if "error" in response.get("result", {}):
        report("critical", msg="Error receiving Zulips, error was: {}".format(response["msg"]))

    events = response["events"]
    for ev in events:
        last_event_id = max(last_event_id, int(ev["id"]))

    # If we get a heartbeat event, that means we've been hanging for
    # 40s, and we should bail.
    if any(ev["type"] == "heartbeat" for ev in events):
        report("critical", msg="Got heartbeat waiting for Zulip, which means get_events is hanging")

    return [ev["message"] for ev in events]
def atomic_nagios_write(
    name: str,
    status: Literal["ok", "warning", "critical", "unknown"],
    message: Optional[str] = None,
    event_time: Optional[int] = None,
    *,
    state_dir: str = "/var/lib/nagios_state",
) -> int:
    """Atomically record a check result in a Nagios state file.

    The file ``<state_dir>/<name>`` ends up containing the single record
    ``<event_time>|<status code>|<status>|<message>`` followed by a newline.
    The record is first written to ``<name>.tmp`` in the same directory and
    then renamed into place, so a reader never observes a partial write.

    Args:
        name: File name of the state file inside ``state_dir``.
        status: Severity keyword; mapped to the numeric codes 0 (ok),
            1 (warning), 2 (critical) and 3 (unknown).
        message: Text for the record; defaults to the status keyword itself.
        event_time: Integer timestamp for the record; defaults to the
            current time in whole seconds.
        state_dir: Directory holding the state files.

    Returns:
        The numeric status code, suitable for passing to ``sys.exit``.

    Raises:
        ValueError: If ``status`` is not one of the four known keywords.
    """
    status_codes = {"ok": 0, "warning": 1, "critical": 2, "unknown": 3}
    # Validate before touching the filesystem: an unrecognized status must
    # not leave a stray .tmp file behind or surface as an UnboundLocalError.
    try:
        status_int = status_codes[status]
    except KeyError:
        raise ValueError(f"Unknown Nagios status: {status!r}") from None

    if message is None:
        message = status
    if event_time is None:
        event_time = int(time.time())

    path = os.path.join(state_dir, name)
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as fh:
        fh.write("|".join([str(event_time), str(status_int), status, message]) + "\n")
    # Renaming within one directory swaps the finished file into place.
    os.rename(tmp_path, path)
    return status_int
# Executed once per queue, as the last statement in the body of the
# `for queue_name in consumers:` loop: writes one state file per queue,
# critical when the queue has fewer consumers than target_count and ok
# otherwise.
#
# The message must be an f-string.  Without the `f` prefix the braces are
# written to the state file literally and the actual queue name and counts
# never appear in the alert text.
atomic_nagios_write(
    "check-rabbitmq-consumers-" + queue_name,
    "critical" if consumers[queue_name] < target_count else "ok",
    f"queue {queue_name} has {consumers[queue_name]} consumers, needs {target_count}",
)
+575,7 @@ GOOGLE_ANALYTICS_ID: Optional[str] = None # This is overridden by dev_settings.py for droplets. IS_DEV_DROPLET = False -# Used by puppet/kandra/files/cron.d/check_send_receive_time. +# Used by the `check_send_receive_time` monitoring tool. NAGIOS_BOT_HOST = SYSTEM_BOT_REALM + "." + EXTERNAL_HOST # Use half of the available CPUs for data import purposes.