zulip/analytics/management/commands/check_analytics_state.py

93 lines
3.3 KiB
Python

from dataclasses import dataclass
from datetime import timedelta
from typing import Any, Literal
from django.utils.timezone import now as timezone_now
from typing_extensions import override
from analytics.lib.counts import ALL_COUNT_STATS, CountStat
from analytics.models import installation_epoch
from scripts.lib.zulip_tools import atomic_nagios_write
from zerver.lib.management import ZulipBaseCommand
from zerver.lib.timestamp import TimeZoneNotUTCError, floor_to_day, floor_to_hour, verify_UTC
from zerver.models import Realm
states = {
0: "OK",
1: "WARNING",
2: "CRITICAL",
3: "UNKNOWN",
}
@dataclass
class NagiosResult:
status: Literal["ok", "warning", "critical", "unknown"]
message: str
class Command(ZulipBaseCommand):
help = """Checks FillState table.
Run as a cron job that runs every hour."""
@override
def handle(self, *args: Any, **options: Any) -> None:
fill_state = self.get_fill_state()
atomic_nagios_write("check-analytics-state", fill_state.status, fill_state.message)
def get_fill_state(self) -> NagiosResult:
if not Realm.objects.exists():
return NagiosResult(status="ok", message="No realms exist, so not checking FillState.")
warning_unfilled_properties = []
critical_unfilled_properties = []
for property, stat in ALL_COUNT_STATS.items():
last_fill = stat.last_successful_fill()
if last_fill is None:
last_fill = installation_epoch()
try:
verify_UTC(last_fill)
except TimeZoneNotUTCError:
return NagiosResult(
status="critical", message=f"FillState not in UTC for {property}"
)
if stat.frequency == CountStat.DAY:
floor_function = floor_to_day
warning_threshold = timedelta(hours=26)
critical_threshold = timedelta(hours=50)
else: # CountStat.HOUR
floor_function = floor_to_hour
warning_threshold = timedelta(minutes=90)
critical_threshold = timedelta(minutes=150)
if floor_function(last_fill) != last_fill:
return NagiosResult(
status="critical",
message=f"FillState not on {stat.frequency} boundary for {property}",
)
time_to_last_fill = timezone_now() - last_fill
if time_to_last_fill > critical_threshold:
critical_unfilled_properties.append(property)
elif time_to_last_fill > warning_threshold:
warning_unfilled_properties.append(property)
if len(critical_unfilled_properties) == 0 and len(warning_unfilled_properties) == 0:
return NagiosResult(status="ok", message="FillState looks fine.")
if len(critical_unfilled_properties) == 0:
return NagiosResult(
status="warning",
message="Missed filling {} once.".format(
", ".join(warning_unfilled_properties),
),
)
return NagiosResult(
status="critical",
message="Missed filling {} once. Missed filling {} at least twice.".format(
", ".join(warning_unfilled_properties),
", ".join(critical_unfilled_properties),
),
)