puppet: Factor out pattern of writing a nagios state file atomically.

This commit is contained in:
Alex Vandiver 2024-05-22 04:22:22 +00:00 committed by Tim Abbott
parent 230040caa9
commit f246b82f67
10 changed files with 138 additions and 159 deletions

View File

@ -1,13 +1,13 @@
import os from dataclasses import dataclass
import time
from datetime import timedelta from datetime import timedelta
from typing import Any, Dict from typing import Any, Literal
from django.utils.timezone import now as timezone_now from django.utils.timezone import now as timezone_now
from typing_extensions import override from typing_extensions import override
from analytics.lib.counts import ALL_COUNT_STATS, CountStat from analytics.lib.counts import ALL_COUNT_STATS, CountStat
from analytics.models import installation_epoch from analytics.models import installation_epoch
from scripts.lib.zulip_tools import atomic_nagios_write
from zerver.lib.management import ZulipBaseCommand from zerver.lib.management import ZulipBaseCommand
from zerver.lib.timestamp import TimeZoneNotUTCError, floor_to_day, floor_to_hour, verify_UTC from zerver.lib.timestamp import TimeZoneNotUTCError, floor_to_day, floor_to_hour, verify_UTC
from zerver.models import Realm from zerver.models import Realm
@ -20,6 +20,12 @@ states = {
} }
@dataclass
class NagiosResult:
    """Result of the FillState check: a Nagios status name and its message.

    Returned by get_fill_state(); handle() passes both fields on to
    atomic_nagios_write().
    """

    # Status name; the annotation restricts it to these four values.
    status: Literal["ok", "warning", "critical", "unknown"]
    # Human-readable explanation written alongside the status.
    message: str
class Command(ZulipBaseCommand): class Command(ZulipBaseCommand):
help = """Checks FillState table. help = """Checks FillState table.
@ -28,19 +34,11 @@ class Command(ZulipBaseCommand):
@override @override
def handle(self, *args: Any, **options: Any) -> None: def handle(self, *args: Any, **options: Any) -> None:
fill_state = self.get_fill_state() fill_state = self.get_fill_state()
status = fill_state["status"] atomic_nagios_write("check-analytics-state", fill_state.status, fill_state.message)
message = fill_state["message"]
state_file_path = "/var/lib/nagios_state/check-analytics-state" def get_fill_state(self) -> NagiosResult:
state_file_tmp = state_file_path + "-tmp"
with open(state_file_tmp, "w") as f:
f.write(f"{int(time.time())}|{status}|{states[status]}|{message}\n")
os.rename(state_file_tmp, state_file_path)
def get_fill_state(self) -> Dict[str, Any]:
if not Realm.objects.exists(): if not Realm.objects.exists():
return {"status": 0, "message": "No realms exist, so not checking FillState."} return NagiosResult(status="ok", message="No realms exist, so not checking FillState.")
warning_unfilled_properties = [] warning_unfilled_properties = []
critical_unfilled_properties = [] critical_unfilled_properties = []
@ -51,7 +49,9 @@ class Command(ZulipBaseCommand):
try: try:
verify_UTC(last_fill) verify_UTC(last_fill)
except TimeZoneNotUTCError: except TimeZoneNotUTCError:
return {"status": 2, "message": f"FillState not in UTC for {property}"} return NagiosResult(
status="critical", message=f"FillState not in UTC for {property}"
)
if stat.frequency == CountStat.DAY: if stat.frequency == CountStat.DAY:
floor_function = floor_to_day floor_function = floor_to_day
@ -63,10 +63,10 @@ class Command(ZulipBaseCommand):
critical_threshold = timedelta(minutes=150) critical_threshold = timedelta(minutes=150)
if floor_function(last_fill) != last_fill: if floor_function(last_fill) != last_fill:
return { return NagiosResult(
"status": 2, status="critical",
"message": f"FillState not on {stat.frequency} boundary for {property}", message=f"FillState not on {stat.frequency} boundary for {property}",
} )
time_to_last_fill = timezone_now() - last_fill time_to_last_fill = timezone_now() - last_fill
if time_to_last_fill > critical_threshold: if time_to_last_fill > critical_threshold:
@ -75,18 +75,18 @@ class Command(ZulipBaseCommand):
warning_unfilled_properties.append(property) warning_unfilled_properties.append(property)
if len(critical_unfilled_properties) == 0 and len(warning_unfilled_properties) == 0: if len(critical_unfilled_properties) == 0 and len(warning_unfilled_properties) == 0:
return {"status": 0, "message": "FillState looks fine."} return NagiosResult(status="ok", message="FillState looks fine.")
if len(critical_unfilled_properties) == 0: if len(critical_unfilled_properties) == 0:
return { return NagiosResult(
"status": 1, status="warning",
"message": "Missed filling {} once.".format( message="Missed filling {} once.".format(
", ".join(warning_unfilled_properties), ", ".join(warning_unfilled_properties),
), ),
} )
return { return NagiosResult(
"status": 2, status="critical",
"message": "Missed filling {} once. Missed filling {} at least twice.".format( message="Missed filling {} once. Missed filling {} at least twice.".format(
", ".join(warning_unfilled_properties), ", ".join(warning_unfilled_properties),
", ".join(critical_unfilled_properties), ", ".join(critical_unfilled_properties),
), ),
} )

View File

@ -1,26 +0,0 @@
# Edit this file to introduce tasks to be run by cron.
#
# Each task to run has to be defined through a single line
# indicating with different fields when the task will be run
# and what command to run for the task
#
# To define the time you can provide concrete values for
# minute (m), hour (h), day of month (dom), month (mon),
# and day of week (dow) or use '*' in these fields (for 'any').#
# Notice that tasks will be started based on the cron's system
# daemon's notion of time and timezones.
#
# Output of the crontab jobs (including errors) is sent through
# email to the user the crontab file belongs to (unless redirected).
#
# For example, you can run a backup of all your user accounts
# at 5 a.m every week with:
# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/
#
# For more information see the manual pages of crontab(5) and cron(8)
#
# m h dom mon dow command
SHELL=/bin/bash
* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-queue &> /var/lib/nagios_state/check-rabbitmq-results-tmp; mv /var/lib/nagios_state/check-rabbitmq-results-tmp /var/lib/nagios_state/check-rabbitmq-results
* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers

View File

@ -10,15 +10,14 @@ Django ORM.
import os import os
import sys import sys
from datetime import timedelta from datetime import timedelta
from typing import NoReturn, Optional
sys.path.append("/home/zulip/deployments/current") sys.path.append("/home/zulip/deployments/current")
from scripts.lib.setup_path import setup_path from scripts.lib.setup_path import setup_path
from scripts.lib.zulip_tools import atomic_nagios_write
setup_path() setup_path()
import django import django
from django.db.models import QuerySet
from django.utils.timezone import now as timezone_now from django.utils.timezone import now as timezone_now
os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings" os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"
@ -27,51 +26,13 @@ sys.path.append("/home/zulip/deployments/current/zerver")
django.setup() django.setup()
from typing import Dict
from zerver.models import UserActivity from zerver.models import UserActivity
from zerver.models.clients import get_client from zerver.models.clients import get_client
states: Dict[str, int] = {
"OK": 0,
"WARNING": 1,
"CRITICAL": 2,
"UNKNOWN": 3,
}
state_file_path = "/var/lib/nagios_state/check_user_zephyr_mirror_liveness"
now = timezone_now() now = timezone_now()
def report(
state: str, short_msg: str, all_users: Optional[QuerySet[UserActivity]] = None
) -> NoReturn:
too_old_data = ""
if all_users is not None:
recently_inactive_users = (
all_users.filter(last_visit__lt=now - timedelta(minutes=10))
.distinct("user_profile_id")
.difference(
all_users.filter(last_visit__lt=now - timedelta(minutes=60)).distinct(
"user_profile_id"
)
)
)
too_old_data = "\nLast call to get_message for recently out of date mirrors:\n" + "\n".join(
"{:>16}: {}".format(
user.user_profile.email,
user.last_visit.strftime("%Y-%m-%d %H:%M %Z"),
)
for user in recently_inactive_users
)
with open(state_file_path + ".tmp", "w") as f:
f.write(f"{int(now.timestamp())}|{states[state]}|{state}|{short_msg}{too_old_data}")
os.rename(state_file_path + ".tmp", state_file_path)
print(f"{state}: {short_msg}{too_old_data}")
sys.exit(states[state])
zephyr_client = get_client("zephyr_mirror") zephyr_client = get_client("zephyr_mirror")
all_users = UserActivity.objects.filter( all_users = UserActivity.objects.filter(
# We need to use the client_id so we can use the partial index we # We need to use the client_id so we can use the partial index we
@ -84,23 +45,51 @@ all_users = UserActivity.objects.filter(
query__in=["get_events", "/api/v1/events"], query__in=["get_events", "/api/v1/events"],
client_id=zephyr_client.id, client_id=zephyr_client.id,
) )
new_inactive_users = ( new_inactive_user_count = (
all_users.filter(last_visit__lt=now - timedelta(minutes=10)) all_users.filter(last_visit__lt=now - timedelta(minutes=10))
.values("user_profile_id") .values("user_profile_id")
.distinct("user_profile_id") .distinct("user_profile_id")
.count() .count()
) )
old_inactive_users = ( old_inactive_user_count = (
all_users.filter(last_visit__lt=now - timedelta(minutes=60)) all_users.filter(last_visit__lt=now - timedelta(minutes=60))
.values("user_profile_id") .values("user_profile_id")
.distinct("user_profile_id") .distinct("user_profile_id")
.count() .count()
) )
# Mirrors that went quiet between 10 and 60 minutes ago: everything inactive
# for more than 10 minutes, minus everything inactive for more than 60.
recently_inactive_user_count = new_inactive_user_count - old_inactive_user_count

# Critical when the recently-inactive mirrors exceed 25% of the long-inactive
# ones.  This is written as a multiplication rather than a division so that
# old_inactive_user_count == 0 cannot raise ZeroDivisionError; in that case
# any recently-inactive mirror at all is reported as critical.
if recently_inactive_user_count > 0.25 * old_inactive_user_count:
    # Only fetch the per-user rows when we actually need to list them.
    recently_inactive_users = (
        all_users.filter(last_visit__lt=now - timedelta(minutes=10))
        .distinct("user_profile_id")
        .difference(
            all_users.filter(last_visit__lt=now - timedelta(minutes=60)).distinct("user_profile_id")
        )
    )
    too_old_data = (
        "Many mirrors recently became inactive\n"
        "Last call to get_message for recently out of date mirrors:\n"
        + "\n".join(
            "{:>16}: {}".format(
                user.user_profile.email,
                user.last_visit.strftime("%Y-%m-%d %H:%M %Z"),
            )
            for user in recently_inactive_users
        )
    )
    # atomic_nagios_write returns the numeric Nagios status, which doubles as
    # the process exit code.
    sys.exit(
        atomic_nagios_write(
            "check_user_zephyr_mirror_liveness", "critical", too_old_data, int(now.timestamp())
        )
    )
else:
    atomic_nagios_write(
        "check_user_zephyr_mirror_liveness",
        "ok",
        "Most mirrors that were recently active continue to be active",
        int(now.timestamp()),
    )

View File

@ -6,12 +6,16 @@ class kandra::app_frontend_monitoring {
include kandra::prometheus::uwsgi include kandra::prometheus::uwsgi
include kandra::prometheus::process include kandra::prometheus::process
kandra::firewall_allow { 'grok_exporter': port => '9144' } kandra::firewall_allow { 'grok_exporter': port => '9144' }
file { '/etc/cron.d/rabbitmq-monitoring': file { '/etc/cron.d/rabbitmq-monitoring':
ensure => file, ensure => absent,
require => Package[rabbitmq-server], }
owner => 'root', zulip::cron { 'check-rabbitmq-queue':
group => 'root', minute => '*',
mode => '0644', command => '/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-queue',
source => 'puppet:///modules/kandra/cron.d/rabbitmq-monitoring', }
zulip::cron { 'check-rabbitmq-consumers':
minute => '*',
command => '/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers',
} }
} }

View File

@ -27,6 +27,6 @@ class kandra::prod_app_frontend_once {
zulip::cron { 'check_user_zephyr_mirror_liveness': zulip::cron { 'check_user_zephyr_mirror_liveness':
hour => '*', hour => '*',
minute => '*', minute => '*',
command => '/usr/lib/nagios/plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness >/dev/null', command => '/usr/lib/nagios/plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness',
} }
} }

View File

@ -13,11 +13,12 @@ import random
import sys import sys
import time import time
import traceback import traceback
from typing import Any, Dict, List, NoReturn, Optional from typing import Any, Dict, List, Literal, NoReturn, Optional
sys.path.append(".") sys.path.append(".")
sys.path.append("/home/zulip/deployments/current") sys.path.append("/home/zulip/deployments/current")
from scripts.lib.setup_path import setup_path from scripts.lib.setup_path import setup_path
from scripts.lib.zulip_tools import atomic_nagios_write
setup_path() setup_path()
@ -47,43 +48,33 @@ parser.add_argument("--insecure", action="store_true")
options = parser.parse_args() options = parser.parse_args()
def report(state: str, timestamp: Any = None, msg: Optional[str] = None) -> NoReturn: def report(
states = { state: Literal["ok", "warning", "critical", "unknown"],
"OK": 0, timestamp: Optional[float] = None,
"WARNING": 1, msg: Optional[str] = None,
"CRITICAL": 2, ) -> NoReturn:
"UNKNOWN": 3,
}
now = int(time.time())
if msg is None: if msg is None:
msg = f"send time was {timestamp}" msg = f"send time was {timestamp}"
state_file_path = "/var/lib/nagios_state/check_send_receive_state" sys.exit(atomic_nagios_write("check_send_receive_state", state, msg))
with open(state_file_path + ".tmp", "w") as f:
f.write(f"{now}|{states[state]}|{state}|{msg}\n")
os.rename(state_file_path + ".tmp", state_file_path)
if states[state] > 0:
print(f"{state}: {msg}")
sys.exit(states[state])
def send_zulip(sender: zulip.Client, message: Dict[str, Any]) -> None: def send_zulip(sender: zulip.Client, message: Dict[str, Any]) -> None:
result = sender.send_message(message) result = sender.send_message(message)
if result["result"] != "success": if result["result"] != "success":
report("CRITICAL", msg=f"Error sending Zulip, args were: {message}, {result}") report("critical", msg=f"Error sending Zulip, args were: {message}, {result}")
def get_zulips() -> List[Dict[str, Any]]: def get_zulips() -> List[Dict[str, Any]]:
global last_event_id global last_event_id
res = zulip_recipient.get_events(queue_id=queue_id, last_event_id=last_event_id) res = zulip_recipient.get_events(queue_id=queue_id, last_event_id=last_event_id)
if "error" in res.get("result", {}): if "error" in res.get("result", {}):
report("CRITICAL", msg="Error receiving Zulips, error was: {}".format(res["msg"])) report("critical", msg="Error receiving Zulips, error was: {}".format(res["msg"]))
for event in res["events"]: for event in res["events"]:
last_event_id = max(last_event_id, int(event["id"])) last_event_id = max(last_event_id, int(event["id"]))
# If we get a heartbeat event, that means we've been hanging for # If we get a heartbeat event, that means we've been hanging for
# 40s, and we should bail. # 40s, and we should bail.
if "heartbeat" in (event["type"] for event in res["events"]): if "heartbeat" in (event["type"] for event in res["events"]):
report("CRITICAL", msg="Got heartbeat waiting for Zulip, which means get_events is hanging") report("critical", msg="Got heartbeat waiting for Zulip, which means get_events is hanging")
return [event["message"] for event in res["events"]] return [event["message"] for event in res["events"]]
@ -120,10 +111,10 @@ zulip_recipient = zulip.Client(
try: try:
res = zulip_recipient.register(event_types=["message"]) res = zulip_recipient.register(event_types=["message"])
if "error" in res.get("result", {}): if "error" in res.get("result", {}):
report("CRITICAL", msg="Error subscribing to Zulips: {}".format(res["msg"])) report("critical", msg="Error subscribing to Zulips: {}".format(res["msg"]))
queue_id, last_event_id = (res["queue_id"], res["last_event_id"]) queue_id, last_event_id = (res["queue_id"], res["last_event_id"])
except Exception: except Exception:
report("CRITICAL", msg=f"Error subscribing to Zulips:\n{traceback.format_exc()}") report("critical", msg=f"Error subscribing to Zulips:\n{traceback.format_exc()}")
msg_to_send = str(random.getrandbits(64)) msg_to_send = str(random.getrandbits(64))
time_start = time.perf_counter() time_start = time.perf_counter()
@ -148,8 +139,8 @@ while msg_to_send not in msg_content:
zulip_recipient.deregister(queue_id) zulip_recipient.deregister(queue_id)
if seconds_diff > 12: if seconds_diff > 12:
report("CRITICAL", timestamp=seconds_diff) report("critical", timestamp=seconds_diff)
if seconds_diff > 3: if seconds_diff > 3:
report("WARNING", timestamp=seconds_diff) report("warning", timestamp=seconds_diff)
else: else:
report("OK", timestamp=seconds_diff) report("ok", timestamp=seconds_diff)

View File

@ -10,7 +10,7 @@ from typing import Any, DefaultDict, Dict, List
ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(ZULIP_PATH) sys.path.append(ZULIP_PATH)
from scripts.lib.zulip_tools import get_config, get_config_file from scripts.lib.zulip_tools import atomic_nagios_write, get_config, get_config_file
normal_queues = [ normal_queues = [
"deferred_work", "deferred_work",
@ -190,8 +190,6 @@ def check_rabbitmq_queues() -> None:
status = max(result["status"] for result in results) status = max(result["status"] for result in results)
now = int(time.time())
if status > 0: if status > 0:
queue_error_template = "queue {} problem: {}:{}" queue_error_template = "queue {} problem: {}:{}"
error_message = "; ".join( error_message = "; ".join(
@ -199,6 +197,12 @@ def check_rabbitmq_queues() -> None:
for result in results for result in results
if result["status"] > 0 if result["status"] > 0
) )
print(f"{now}|{status}|{states[status]}|{error_message}") sys.exit(
atomic_nagios_write(
"check-rabbitmq-results",
"critical" if status == CRITICAL else "warning",
error_message,
)
)
else: else:
print(f"{now}|{status}|{states[status]}|queues normal") atomic_nagios_write("check-rabbitmq-results", "ok", "queues normal")

View File

@ -16,7 +16,7 @@ import sys
import time import time
import uuid import uuid
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import IO, Any, Dict, List, Optional, Sequence, Set, Union, overload from typing import IO, Any, Dict, List, Literal, Optional, Sequence, Set, Union, overload
from urllib.parse import SplitResult from urllib.parse import SplitResult
import zoneinfo import zoneinfo
@ -723,6 +723,32 @@ def listening_publicly(port: int) -> List[str]:
return [line.split()[4] for line in lines] return [line.split()[4] for line in lines]
def atomic_nagios_write(
    name: str,
    status: Literal["ok", "warning", "critical", "unknown"],
    message: Optional[str] = None,
    event_time: Optional[int] = None,
) -> int:
    """Atomically write a Nagios state file and return the numeric status.

    The file /var/lib/nagios_state/<name> receives a single line of the form
    ``<event_time>|<status_int>|<status>|<message>``.  The line is first
    written to ``<name>.tmp`` in the same directory and then renamed into
    place, so a reader never observes a partially written file.

    Args:
        name: File name of the state file inside /var/lib/nagios_state/.
        status: One of "ok", "warning", "critical" or "unknown".
        message: Text for the last field; defaults to the status name.
        event_time: Integer timestamp for the first field; defaults to the
            current time.

    Returns:
        The integer written for the status (0 ok, 1 warning, 2 critical,
        3 unknown), which callers may pass directly to sys.exit().

    Raises:
        ValueError: If status is not one of the four recognised names.  This
            is checked before any file is touched, so a bad status never
            leaves a stray .tmp file behind.
    """
    status_codes = {"ok": 0, "warning": 1, "critical": 2, "unknown": 3}
    if status not in status_codes:
        raise ValueError(f"Invalid nagios status: {status!r}")
    status_int = status_codes[status]

    if message is None:
        message = status
    if event_time is None:
        event_time = int(time.time())

    path = "/var/lib/nagios_state/" + name
    with open(path + ".tmp", "w") as fh:
        fh.write("|".join([str(event_time), str(status_int), status, message]) + "\n")
    # Renaming within the same directory replaces the old state file in one step.
    os.rename(path + ".tmp", path)
    return status_int
if __name__ == "__main__": if __name__ == "__main__":
cmd = sys.argv[1] cmd = sys.argv[1]
if cmd == "make_deploy_path": if cmd == "make_deploy_path":

View File

@ -11,14 +11,12 @@ from typing import Dict
ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(ZULIP_PATH) sys.path.append(ZULIP_PATH)
from scripts.lib.check_rabbitmq_queue import normal_queues from scripts.lib.check_rabbitmq_queue import normal_queues
from scripts.lib.zulip_tools import get_config, get_config_file, get_tornado_ports from scripts.lib.zulip_tools import (
atomic_nagios_write,
states = { get_config,
0: "OK", get_config_file,
1: "WARNING", get_tornado_ports,
2: "CRITICAL", )
3: "UNKNOWN",
}
if "USER" in os.environ and os.environ["USER"] not in ["root", "rabbitmq"]: if "USER" in os.environ and os.environ["USER"] not in ["root", "rabbitmq"]:
print("This script must be run as the root or rabbitmq user") print("This script must be run as the root or rabbitmq user")
@ -56,9 +54,6 @@ for line in output.split("\n"):
now = int(time.time()) now = int(time.time())
for queue_name in consumers: for queue_name in consumers:
state_file_path = "/var/lib/nagios_state/check-rabbitmq-consumers-" + queue_name
state_file_tmp = state_file_path + "-tmp"
target_count = 1 target_count = 1
if queue_name == "notify_tornado": if queue_name == "notify_tornado":
target_count = TORNADO_PROCESSES target_count = TORNADO_PROCESSES
@ -67,12 +62,8 @@ for queue_name in consumers:
get_config(config_file, "application_server", "mobile_notification_shards", "1") get_config(config_file, "application_server", "mobile_notification_shards", "1")
) )
# One state file per queue: critical when the queue has fewer consumers than
# target_count, ok otherwise.  The message must be an f-string so the queue
# name and counts are interpolated rather than written as literal braces.
atomic_nagios_write(
    "check-rabbitmq-consumers-" + queue_name,
    "critical" if consumers[queue_name] < target_count else "ok",
    f"queue {queue_name} has {consumers[queue_name]} consumers, needs {target_count}",
)

View File

@ -575,7 +575,7 @@ GOOGLE_ANALYTICS_ID: Optional[str] = None
# This is overridden by dev_settings.py for droplets. # This is overridden by dev_settings.py for droplets.
IS_DEV_DROPLET = False IS_DEV_DROPLET = False
# Used by puppet/kandra/files/cron.d/check_send_receive_time. # Used by the `check_send_receive_time` monitoring tool.
NAGIOS_BOT_HOST = SYSTEM_BOT_REALM + "." + EXTERNAL_HOST NAGIOS_BOT_HOST = SYSTEM_BOT_REALM + "." + EXTERNAL_HOST
# Use half of the available CPUs for data import purposes. # Use half of the available CPUs for data import purposes.