puppet: Factor out pattern of writing a nagios state file atomically.
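
Several Nagios checks each hand-rolled the same dance to publish their
state: write a "time|status|state|message" line to a temporary file,
then os.rename() it over the real path, so that anything reading the
state file never sees a partial write. Pull that pattern into a shared
atomic_nagios_write helper in scripts/lib/zulip_tools.py, which also
maps the textual status to its Nagios exit code and returns it, so
callers can finish in one line; for example (mirroring the
check_send_receive_time change below):

    sys.exit(atomic_nagios_write("check_send_receive_state", state, msg))

This also lets us replace the hand-maintained rabbitmq-monitoring
crontab, which implemented the same atomicity in shell with a
redirect-then-mv, with ordinary zulip::cron resources.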

Alex Vandiver 2024-05-22 04:22:22 +00:00 committed by Tim Abbott
parent 230040caa9
commit f246b82f67
10 changed files with 138 additions and 159 deletions

View File

@@ -1,13 +1,13 @@
-import os
-import time
+from dataclasses import dataclass
 from datetime import timedelta
-from typing import Any, Dict
+from typing import Any, Literal
 from django.utils.timezone import now as timezone_now
 from typing_extensions import override
 from analytics.lib.counts import ALL_COUNT_STATS, CountStat
 from analytics.models import installation_epoch
+from scripts.lib.zulip_tools import atomic_nagios_write
 from zerver.lib.management import ZulipBaseCommand
 from zerver.lib.timestamp import TimeZoneNotUTCError, floor_to_day, floor_to_hour, verify_UTC
 from zerver.models import Realm
@@ -20,6 +20,12 @@ states = {
 }
+@dataclass
+class NagiosResult:
+    status: Literal["ok", "warning", "critical", "unknown"]
+    message: str
 class Command(ZulipBaseCommand):
     help = """Checks FillState table.
@@ -28,19 +34,11 @@ class Command(ZulipBaseCommand):
     @override
     def handle(self, *args: Any, **options: Any) -> None:
         fill_state = self.get_fill_state()
-        status = fill_state["status"]
-        message = fill_state["message"]
+        atomic_nagios_write("check-analytics-state", fill_state.status, fill_state.message)
-        state_file_path = "/var/lib/nagios_state/check-analytics-state"
-        state_file_tmp = state_file_path + "-tmp"
-        with open(state_file_tmp, "w") as f:
-            f.write(f"{int(time.time())}|{status}|{states[status]}|{message}\n")
-        os.rename(state_file_tmp, state_file_path)
-    def get_fill_state(self) -> Dict[str, Any]:
+    def get_fill_state(self) -> NagiosResult:
         if not Realm.objects.exists():
-            return {"status": 0, "message": "No realms exist, so not checking FillState."}
+            return NagiosResult(status="ok", message="No realms exist, so not checking FillState.")
         warning_unfilled_properties = []
         critical_unfilled_properties = []
@@ -51,7 +49,9 @@
             try:
                 verify_UTC(last_fill)
             except TimeZoneNotUTCError:
-                return {"status": 2, "message": f"FillState not in UTC for {property}"}
+                return NagiosResult(
+                    status="critical", message=f"FillState not in UTC for {property}"
+                )
             if stat.frequency == CountStat.DAY:
                 floor_function = floor_to_day
@@ -63,10 +63,10 @@
                 critical_threshold = timedelta(minutes=150)
             if floor_function(last_fill) != last_fill:
-                return {
-                    "status": 2,
-                    "message": f"FillState not on {stat.frequency} boundary for {property}",
-                }
+                return NagiosResult(
+                    status="critical",
+                    message=f"FillState not on {stat.frequency} boundary for {property}",
+                )
             time_to_last_fill = timezone_now() - last_fill
             if time_to_last_fill > critical_threshold:
@@ -75,18 +75,18 @@
                 warning_unfilled_properties.append(property)
         if len(critical_unfilled_properties) == 0 and len(warning_unfilled_properties) == 0:
-            return {"status": 0, "message": "FillState looks fine."}
+            return NagiosResult(status="ok", message="FillState looks fine.")
         if len(critical_unfilled_properties) == 0:
-            return {
-                "status": 1,
-                "message": "Missed filling {} once.".format(
+            return NagiosResult(
+                status="warning",
+                message="Missed filling {} once.".format(
                     ", ".join(warning_unfilled_properties),
                 ),
-            }
-        return {
-            "status": 2,
-            "message": "Missed filling {} once. Missed filling {} at least twice.".format(
+            )
+        return NagiosResult(
+            status="critical",
+            message="Missed filling {} once. Missed filling {} at least twice.".format(
                 ", ".join(warning_unfilled_properties),
                 ", ".join(critical_unfilled_properties),
             ),
-        }
+        )

View File

@@ -1,26 +0,0 @@
-# Edit this file to introduce tasks to be run by cron.
-#
-# Each task to run has to be defined through a single line
-# indicating with different fields when the task will be run
-# and what command to run for the task
-#
-# To define the time you can provide concrete values for
-# minute (m), hour (h), day of month (dom), month (mon),
-# and day of week (dow) or use '*' in these fields (for 'any').#
-# Notice that tasks will be started based on the cron's system
-# daemon's notion of time and timezones.
-#
-# Output of the crontab jobs (including errors) is sent through
-# email to the user the crontab file belongs to (unless redirected).
-#
-# For example, you can run a backup of all your user accounts
-# at 5 a.m every week with:
-# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/
-#
-# For more information see the manual pages of crontab(5) and cron(8)
-#
-# m h dom mon dow command
-SHELL=/bin/bash
-* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-queue &> /var/lib/nagios_state/check-rabbitmq-results-tmp; mv /var/lib/nagios_state/check-rabbitmq-results-tmp /var/lib/nagios_state/check-rabbitmq-results
-* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers

View File

@@ -10,15 +10,14 @@ Django ORM.
 import os
 import sys
 from datetime import timedelta
-from typing import NoReturn, Optional
 sys.path.append("/home/zulip/deployments/current")
 from scripts.lib.setup_path import setup_path
+from scripts.lib.zulip_tools import atomic_nagios_write
 setup_path()
 import django
-from django.db.models import QuerySet
 from django.utils.timezone import now as timezone_now
 os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"
@@ -27,51 +26,13 @@ sys.path.append("/home/zulip/deployments/current/zerver")
 django.setup()
-from typing import Dict
 from zerver.models import UserActivity
 from zerver.models.clients import get_client
-states: Dict[str, int] = {
-    "OK": 0,
-    "WARNING": 1,
-    "CRITICAL": 2,
-    "UNKNOWN": 3,
-}
-state_file_path = "/var/lib/nagios_state/check_user_zephyr_mirror_liveness"
 now = timezone_now()
-def report(
-    state: str, short_msg: str, all_users: Optional[QuerySet[UserActivity]] = None
-) -> NoReturn:
-    too_old_data = ""
-    if all_users is not None:
-        recently_inactive_users = (
-            all_users.filter(last_visit__lt=now - timedelta(minutes=10))
-            .distinct("user_profile_id")
-            .difference(
-                all_users.filter(last_visit__lt=now - timedelta(minutes=60)).distinct(
-                    "user_profile_id"
-                )
-            )
-        )
-        too_old_data = "\nLast call to get_message for recently out of date mirrors:\n" + "\n".join(
-            "{:>16}: {}".format(
-                user.user_profile.email,
-                user.last_visit.strftime("%Y-%m-%d %H:%M %Z"),
-            )
-            for user in recently_inactive_users
-        )
-    with open(state_file_path + ".tmp", "w") as f:
-        f.write(f"{int(now.timestamp())}|{states[state]}|{state}|{short_msg}{too_old_data}")
-    os.rename(state_file_path + ".tmp", state_file_path)
-    print(f"{state}: {short_msg}{too_old_data}")
-    sys.exit(states[state])
 zephyr_client = get_client("zephyr_mirror")
 all_users = UserActivity.objects.filter(
     # We need to use the client_id so we can use the partial index we
@@ -84,23 +45,51 @@ all_users = UserActivity.objects.filter(
     query__in=["get_events", "/api/v1/events"],
     client_id=zephyr_client.id,
 )
-new_inactive_users = (
+new_inactive_user_count = (
     all_users.filter(last_visit__lt=now - timedelta(minutes=10))
     .values("user_profile_id")
     .distinct("user_profile_id")
     .count()
 )
-old_inactive_users = (
+old_inactive_user_count = (
     all_users.filter(last_visit__lt=now - timedelta(minutes=60))
     .values("user_profile_id")
     .distinct("user_profile_id")
     .count()
 )
-recently_inactive_users = new_inactive_users - old_inactive_users
+recently_inactive_user_count = new_inactive_user_count - old_inactive_user_count
-if recently_inactive_users / float(old_inactive_users) > 0.25:
-    report("CRITICAL", "Many mirrors recently became inactive", all_users)
+if recently_inactive_user_count / float(old_inactive_user_count) > 0.25:
+    recently_inactive_users = (
+        all_users.filter(last_visit__lt=now - timedelta(minutes=10))
+        .distinct("user_profile_id")
+        .difference(
+            all_users.filter(last_visit__lt=now - timedelta(minutes=60)).distinct("user_profile_id")
+        )
+    )
+    too_old_data = (
+        "Many mirrors recently became inactive\n"
+        "Last call to get_message for recently out of date mirrors:\n"
+        + "\n".join(
+            "{:>16}: {}".format(
+                user.user_profile.email,
+                user.last_visit.strftime("%Y-%m-%d %H:%M %Z"),
+            )
+            for user in recently_inactive_users
+        )
+    )
+    sys.exit(
+        atomic_nagios_write(
+            "check_user_zephyr_mirror_liveness", "critical", too_old_data, int(now.timestamp())
+        )
+    )
 else:
-    report("OK", "Most mirrors that were recently active continue to be active")
+    atomic_nagios_write(
+        "check_user_zephyr_mirror_liveness",
+        "ok",
+        "Most mirrors that were recently active continue to be active",
+        int(now.timestamp()),
+    )

View File

@@ -6,12 +6,16 @@ class kandra::app_frontend_monitoring {
   include kandra::prometheus::uwsgi
   include kandra::prometheus::process
   kandra::firewall_allow { 'grok_exporter': port => '9144' }
   file { '/etc/cron.d/rabbitmq-monitoring':
-    ensure  => file,
-    require => Package[rabbitmq-server],
-    owner   => 'root',
-    group   => 'root',
-    mode    => '0644',
-    source  => 'puppet:///modules/kandra/cron.d/rabbitmq-monitoring',
+    ensure => absent,
   }
+  zulip::cron { 'check-rabbitmq-queue':
+    minute  => '*',
+    command => '/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-queue',
+  }
+  zulip::cron { 'check-rabbitmq-consumers':
+    minute  => '*',
+    command => '/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers',
+  }
 }

View File

@@ -27,6 +27,6 @@ class kandra::prod_app_frontend_once {
   zulip::cron { 'check_user_zephyr_mirror_liveness':
     hour    => '*',
     minute  => '*',
-    command => '/usr/lib/nagios/plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness >/dev/null',
+    command => '/usr/lib/nagios/plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness',
   }
 }

View File

@@ -13,11 +13,12 @@ import random
 import sys
 import time
 import traceback
-from typing import Any, Dict, List, NoReturn, Optional
+from typing import Any, Dict, List, Literal, NoReturn, Optional
 sys.path.append(".")
 sys.path.append("/home/zulip/deployments/current")
 from scripts.lib.setup_path import setup_path
+from scripts.lib.zulip_tools import atomic_nagios_write
 setup_path()
@@ -47,43 +48,33 @@ parser.add_argument("--insecure", action="store_true")
 options = parser.parse_args()
-def report(state: str, timestamp: Any = None, msg: Optional[str] = None) -> NoReturn:
-    states = {
-        "OK": 0,
-        "WARNING": 1,
-        "CRITICAL": 2,
-        "UNKNOWN": 3,
-    }
-    now = int(time.time())
+def report(
+    state: Literal["ok", "warning", "critical", "unknown"],
+    timestamp: Optional[float] = None,
+    msg: Optional[str] = None,
+) -> NoReturn:
     if msg is None:
         msg = f"send time was {timestamp}"
-    state_file_path = "/var/lib/nagios_state/check_send_receive_state"
-    with open(state_file_path + ".tmp", "w") as f:
-        f.write(f"{now}|{states[state]}|{state}|{msg}\n")
-    os.rename(state_file_path + ".tmp", state_file_path)
-    if states[state] > 0:
-        print(f"{state}: {msg}")
-    sys.exit(states[state])
+    sys.exit(atomic_nagios_write("check_send_receive_state", state, msg))
 def send_zulip(sender: zulip.Client, message: Dict[str, Any]) -> None:
     result = sender.send_message(message)
     if result["result"] != "success":
-        report("CRITICAL", msg=f"Error sending Zulip, args were: {message}, {result}")
+        report("critical", msg=f"Error sending Zulip, args were: {message}, {result}")
 def get_zulips() -> List[Dict[str, Any]]:
     global last_event_id
     res = zulip_recipient.get_events(queue_id=queue_id, last_event_id=last_event_id)
     if "error" in res.get("result", {}):
-        report("CRITICAL", msg="Error receiving Zulips, error was: {}".format(res["msg"]))
+        report("critical", msg="Error receiving Zulips, error was: {}".format(res["msg"]))
     for event in res["events"]:
         last_event_id = max(last_event_id, int(event["id"]))
     # If we get a heartbeat event, that means we've been hanging for
     # 40s, and we should bail.
     if "heartbeat" in (event["type"] for event in res["events"]):
-        report("CRITICAL", msg="Got heartbeat waiting for Zulip, which means get_events is hanging")
+        report("critical", msg="Got heartbeat waiting for Zulip, which means get_events is hanging")
     return [event["message"] for event in res["events"]]
@@ -120,10 +111,10 @@ zulip_recipient = zulip.Client(
 try:
     res = zulip_recipient.register(event_types=["message"])
     if "error" in res.get("result", {}):
-        report("CRITICAL", msg="Error subscribing to Zulips: {}".format(res["msg"]))
+        report("critical", msg="Error subscribing to Zulips: {}".format(res["msg"]))
     queue_id, last_event_id = (res["queue_id"], res["last_event_id"])
 except Exception:
-    report("CRITICAL", msg=f"Error subscribing to Zulips:\n{traceback.format_exc()}")
+    report("critical", msg=f"Error subscribing to Zulips:\n{traceback.format_exc()}")
 msg_to_send = str(random.getrandbits(64))
 time_start = time.perf_counter()
@@ -148,8 +139,8 @@ while msg_to_send not in msg_content:
 zulip_recipient.deregister(queue_id)
 if seconds_diff > 12:
-    report("CRITICAL", timestamp=seconds_diff)
+    report("critical", timestamp=seconds_diff)
 if seconds_diff > 3:
-    report("WARNING", timestamp=seconds_diff)
+    report("warning", timestamp=seconds_diff)
 else:
-    report("OK", timestamp=seconds_diff)
+    report("ok", timestamp=seconds_diff)

View File

@@ -10,7 +10,7 @@ from typing import Any, DefaultDict, Dict, List
 ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 sys.path.append(ZULIP_PATH)
-from scripts.lib.zulip_tools import get_config, get_config_file
+from scripts.lib.zulip_tools import atomic_nagios_write, get_config, get_config_file
 normal_queues = [
     "deferred_work",
@@ -190,8 +190,6 @@ def check_rabbitmq_queues() -> None:
     status = max(result["status"] for result in results)
-    now = int(time.time())
     if status > 0:
         queue_error_template = "queue {} problem: {}:{}"
         error_message = "; ".join(
@@ -199,6 +197,12 @@
             for result in results
             if result["status"] > 0
         )
-        print(f"{now}|{status}|{states[status]}|{error_message}")
+        sys.exit(
+            atomic_nagios_write(
+                "check-rabbitmq-results",
+                "critical" if status == CRITICAL else "warning",
+                error_message,
+            )
+        )
     else:
-        print(f"{now}|{status}|{states[status]}|queues normal")
+        atomic_nagios_write("check-rabbitmq-results", "ok", "queues normal")

View File

@@ -16,7 +16,7 @@ import sys
 import time
 import uuid
 from datetime import datetime, timedelta
-from typing import IO, Any, Dict, List, Optional, Sequence, Set, Union, overload
+from typing import IO, Any, Dict, List, Literal, Optional, Sequence, Set, Union, overload
 from urllib.parse import SplitResult
 import zoneinfo
@@ -723,6 +723,32 @@ def listening_publicly(port: int) -> List[str]:
     return [line.split()[4] for line in lines]
+def atomic_nagios_write(
+    name: str,
+    status: Literal["ok", "warning", "critical", "unknown"],
+    message: Optional[str] = None,
+    event_time: Optional[int] = None,
+) -> int:
+    if message is None:
+        message = status
+    if event_time is None:
+        event_time = int(time.time())
+    if status == "ok":
+        status_int = 0
+    elif status == "warning":
+        status_int = 1
+    elif status == "critical":
+        status_int = 2
+    elif status == "unknown":
+        status_int = 3
+    path = "/var/lib/nagios_state/" + name
+    with open(path + ".tmp", "w") as fh:
+        fh.write("|".join([str(event_time), str(status_int), status, message]) + "\n")
+    os.rename(path + ".tmp", path)
+    return status_int
 if __name__ == "__main__":
     cmd = sys.argv[1]
     if cmd == "make_deploy_path":

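A quick sketch of the new helper's contract (not part of the commit: the
check name "check-demo" is made up, and this assumes /var/lib/nagios_state
exists and is writable):

    from scripts.lib.zulip_tools import atomic_nagios_write

    # Stages the line "<epoch>|1|warning|queue is backlogged" in
    # /var/lib/nagios_state/check-demo.tmp, then os.rename()s it over
    # /var/lib/nagios_state/check-demo, so readers never see a torn write.
    exit_code = atomic_nagios_write("check-demo", "warning", "queue is backlogged")
    assert exit_code == 1  # Nagios convention: 0=ok, 1=warning, 2=critical, 3=unknown
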
View File

@@ -11,14 +11,12 @@ from typing import Dict
 ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 sys.path.append(ZULIP_PATH)
 from scripts.lib.check_rabbitmq_queue import normal_queues
-from scripts.lib.zulip_tools import get_config, get_config_file, get_tornado_ports
-states = {
-    0: "OK",
-    1: "WARNING",
-    2: "CRITICAL",
-    3: "UNKNOWN",
-}
+from scripts.lib.zulip_tools import (
+    atomic_nagios_write,
+    get_config,
+    get_config_file,
+    get_tornado_ports,
+)
 if "USER" in os.environ and os.environ["USER"] not in ["root", "rabbitmq"]:
     print("This script must be run as the root or rabbitmq user")
@@ -56,9 +54,6 @@ for line in output.split("\n"):
 now = int(time.time())
 for queue_name in consumers:
-    state_file_path = "/var/lib/nagios_state/check-rabbitmq-consumers-" + queue_name
-    state_file_tmp = state_file_path + "-tmp"
     target_count = 1
     if queue_name == "notify_tornado":
         target_count = TORNADO_PROCESSES
@@ -67,12 +62,8 @@ for queue_name in consumers:
             get_config(config_file, "application_server", "mobile_notification_shards", "1")
         )
-    if consumers[queue_name] < target_count:
-        status = 2
-    else:
-        status = 0
-    with open(state_file_tmp, "w") as f:
-        f.write(
-            f"{now}|{status}|{states[status]}|queue {queue_name} has {consumers[queue_name]} consumers, needs {target_count}\n"
-        )
-    os.rename(state_file_tmp, state_file_path)
+    atomic_nagios_write(
+        "check-rabbitmq-consumers-" + queue_name,
+        "critical" if consumers[queue_name] < target_count else "ok",
+        f"queue {queue_name} has {consumers[queue_name]} consumers, needs {target_count}",
+    )

View File

@@ -575,7 +575,7 @@ GOOGLE_ANALYTICS_ID: Optional[str] = None
 # This is overridden by dev_settings.py for droplets.
 IS_DEV_DROPLET = False
-# Used by puppet/kandra/files/cron.d/check_send_receive_time.
+# Used by the `check_send_receive_time` monitoring tool.
 NAGIOS_BOT_HOST = SYSTEM_BOT_REALM + "." + EXTERNAL_HOST
 # Use half of the available CPUs for data import purposes.