from dataclasses import dataclass from datetime import timedelta from typing import Any, Literal from django.utils.timezone import now as timezone_now from typing_extensions import override from analytics.lib.counts import ALL_COUNT_STATS, CountStat from analytics.models import installation_epoch from scripts.lib.zulip_tools import atomic_nagios_write from zerver.lib.management import ZulipBaseCommand from zerver.lib.timestamp import TimeZoneNotUTCError, floor_to_day, floor_to_hour, verify_UTC from zerver.models import Realm states = { 0: "OK", 1: "WARNING", 2: "CRITICAL", 3: "UNKNOWN", } @dataclass class NagiosResult: status: Literal["ok", "warning", "critical", "unknown"] message: str class Command(ZulipBaseCommand): help = """Checks FillState table. Run as a cron job that runs every hour.""" @override def handle(self, *args: Any, **options: Any) -> None: fill_state = self.get_fill_state() atomic_nagios_write("check-analytics-state", fill_state.status, fill_state.message) def get_fill_state(self) -> NagiosResult: if not Realm.objects.exists(): return NagiosResult(status="ok", message="No realms exist, so not checking FillState.") warning_unfilled_properties = [] critical_unfilled_properties = [] for property, stat in ALL_COUNT_STATS.items(): last_fill = stat.last_successful_fill() if last_fill is None: last_fill = installation_epoch() try: verify_UTC(last_fill) except TimeZoneNotUTCError: return NagiosResult( status="critical", message=f"FillState not in UTC for {property}" ) if stat.frequency == CountStat.DAY: floor_function = floor_to_day warning_threshold = timedelta(hours=26) critical_threshold = timedelta(hours=50) else: # CountStat.HOUR floor_function = floor_to_hour warning_threshold = timedelta(minutes=90) critical_threshold = timedelta(minutes=150) if floor_function(last_fill) != last_fill: return NagiosResult( status="critical", message=f"FillState not on {stat.frequency} boundary for {property}", ) time_to_last_fill = timezone_now() - last_fill if time_to_last_fill > critical_threshold: critical_unfilled_properties.append(property) elif time_to_last_fill > warning_threshold: warning_unfilled_properties.append(property) if len(critical_unfilled_properties) == 0 and len(warning_unfilled_properties) == 0: return NagiosResult(status="ok", message="FillState looks fine.") if len(critical_unfilled_properties) == 0: return NagiosResult( status="warning", message="Missed filling {} once.".format( ", ".join(warning_unfilled_properties), ), ) return NagiosResult( status="critical", message="Missed filling {} once. Missed filling {} at least twice.".format( ", ".join(warning_unfilled_properties), ", ".join(critical_unfilled_properties), ), )