mirror of https://github.com/zulip/zulip.git
puppet: Factor out pattern of writing a nagios state file atomically.
This commit is contained in:
parent
230040caa9
commit
f246b82f67
|
@ -1,13 +1,13 @@
|
|||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from datetime import timedelta
|
||||
from typing import Any, Dict
|
||||
from typing import Any, Literal
|
||||
|
||||
from django.utils.timezone import now as timezone_now
|
||||
from typing_extensions import override
|
||||
|
||||
from analytics.lib.counts import ALL_COUNT_STATS, CountStat
|
||||
from analytics.models import installation_epoch
|
||||
from scripts.lib.zulip_tools import atomic_nagios_write
|
||||
from zerver.lib.management import ZulipBaseCommand
|
||||
from zerver.lib.timestamp import TimeZoneNotUTCError, floor_to_day, floor_to_hour, verify_UTC
|
||||
from zerver.models import Realm
|
||||
|
@ -20,6 +20,12 @@ states = {
|
|||
}
|
||||
|
||||
|
||||
@dataclass
class NagiosResult:
    """Outcome of a check: a Nagios status name and its accompanying message."""

    # Status name; restricted to the four names listed in the Literal.
    status: Literal["ok", "warning", "critical", "unknown"]
    # Free-form text describing the result of the check.
    message: str
|
||||
|
||||
|
||||
class Command(ZulipBaseCommand):
|
||||
help = """Checks FillState table.
|
||||
|
||||
|
@ -28,19 +34,11 @@ class Command(ZulipBaseCommand):
|
|||
@override
|
||||
def handle(self, *args: Any, **options: Any) -> None:
|
||||
fill_state = self.get_fill_state()
|
||||
status = fill_state["status"]
|
||||
message = fill_state["message"]
|
||||
atomic_nagios_write("check-analytics-state", fill_state.status, fill_state.message)
|
||||
|
||||
state_file_path = "/var/lib/nagios_state/check-analytics-state"
|
||||
state_file_tmp = state_file_path + "-tmp"
|
||||
|
||||
with open(state_file_tmp, "w") as f:
|
||||
f.write(f"{int(time.time())}|{status}|{states[status]}|{message}\n")
|
||||
os.rename(state_file_tmp, state_file_path)
|
||||
|
||||
def get_fill_state(self) -> Dict[str, Any]:
|
||||
def get_fill_state(self) -> NagiosResult:
|
||||
if not Realm.objects.exists():
|
||||
return {"status": 0, "message": "No realms exist, so not checking FillState."}
|
||||
return NagiosResult(status="ok", message="No realms exist, so not checking FillState.")
|
||||
|
||||
warning_unfilled_properties = []
|
||||
critical_unfilled_properties = []
|
||||
|
@ -51,7 +49,9 @@ class Command(ZulipBaseCommand):
|
|||
try:
|
||||
verify_UTC(last_fill)
|
||||
except TimeZoneNotUTCError:
|
||||
return {"status": 2, "message": f"FillState not in UTC for {property}"}
|
||||
return NagiosResult(
|
||||
status="critical", message=f"FillState not in UTC for {property}"
|
||||
)
|
||||
|
||||
if stat.frequency == CountStat.DAY:
|
||||
floor_function = floor_to_day
|
||||
|
@ -63,10 +63,10 @@ class Command(ZulipBaseCommand):
|
|||
critical_threshold = timedelta(minutes=150)
|
||||
|
||||
if floor_function(last_fill) != last_fill:
|
||||
return {
|
||||
"status": 2,
|
||||
"message": f"FillState not on {stat.frequency} boundary for {property}",
|
||||
}
|
||||
return NagiosResult(
|
||||
status="critical",
|
||||
message=f"FillState not on {stat.frequency} boundary for {property}",
|
||||
)
|
||||
|
||||
time_to_last_fill = timezone_now() - last_fill
|
||||
if time_to_last_fill > critical_threshold:
|
||||
|
@ -75,18 +75,18 @@ class Command(ZulipBaseCommand):
|
|||
warning_unfilled_properties.append(property)
|
||||
|
||||
if len(critical_unfilled_properties) == 0 and len(warning_unfilled_properties) == 0:
|
||||
return {"status": 0, "message": "FillState looks fine."}
|
||||
return NagiosResult(status="ok", message="FillState looks fine.")
|
||||
if len(critical_unfilled_properties) == 0:
|
||||
return {
|
||||
"status": 1,
|
||||
"message": "Missed filling {} once.".format(
|
||||
return NagiosResult(
|
||||
status="warning",
|
||||
message="Missed filling {} once.".format(
|
||||
", ".join(warning_unfilled_properties),
|
||||
),
|
||||
}
|
||||
return {
|
||||
"status": 2,
|
||||
"message": "Missed filling {} once. Missed filling {} at least twice.".format(
|
||||
)
|
||||
return NagiosResult(
|
||||
status="critical",
|
||||
message="Missed filling {} once. Missed filling {} at least twice.".format(
|
||||
", ".join(warning_unfilled_properties),
|
||||
", ".join(critical_unfilled_properties),
|
||||
),
|
||||
}
|
||||
)
|
||||
|
|
|
@ -1,26 +0,0 @@
|
|||
# Edit this file to introduce tasks to be run by cron.
|
||||
#
|
||||
# Each task to run has to be defined through a single line
|
||||
# indicating with different fields when the task will be run
|
||||
# and what command to run for the task
|
||||
#
|
||||
# To define the time you can provide concrete values for
|
||||
# minute (m), hour (h), day of month (dom), month (mon),
|
||||
# and day of week (dow) or use '*' in these fields (for 'any').#
|
||||
# Notice that tasks will be started based on the cron's system
|
||||
# daemon's notion of time and timezones.
|
||||
#
|
||||
# Output of the crontab jobs (including errors) is sent through
|
||||
# email to the user the crontab file belongs to (unless redirected).
|
||||
#
|
||||
# For example, you can run a backup of all your user accounts
|
||||
# at 5 a.m every week with:
|
||||
# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/
|
||||
#
|
||||
# For more information see the manual pages of crontab(5) and cron(8)
|
||||
#
|
||||
# m h dom mon dow command
|
||||
SHELL=/bin/bash
|
||||
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-queue &> /var/lib/nagios_state/check-rabbitmq-results-tmp; mv /var/lib/nagios_state/check-rabbitmq-results-tmp /var/lib/nagios_state/check-rabbitmq-results
|
||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
|
|
@ -10,15 +10,14 @@ Django ORM.
|
|||
import os
|
||||
import sys
|
||||
from datetime import timedelta
|
||||
from typing import NoReturn, Optional
|
||||
|
||||
sys.path.append("/home/zulip/deployments/current")
|
||||
from scripts.lib.setup_path import setup_path
|
||||
from scripts.lib.zulip_tools import atomic_nagios_write
|
||||
|
||||
setup_path()
|
||||
|
||||
import django
|
||||
from django.db.models import QuerySet
|
||||
from django.utils.timezone import now as timezone_now
|
||||
|
||||
os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"
|
||||
|
@ -27,51 +26,13 @@ sys.path.append("/home/zulip/deployments/current/zerver")
|
|||
|
||||
django.setup()
|
||||
|
||||
from typing import Dict
|
||||
|
||||
from zerver.models import UserActivity
|
||||
from zerver.models.clients import get_client
|
||||
|
||||
states: Dict[str, int] = {
|
||||
"OK": 0,
|
||||
"WARNING": 1,
|
||||
"CRITICAL": 2,
|
||||
"UNKNOWN": 3,
|
||||
}
|
||||
|
||||
state_file_path = "/var/lib/nagios_state/check_user_zephyr_mirror_liveness"
|
||||
now = timezone_now()
|
||||
|
||||
|
||||
def report(
|
||||
state: str, short_msg: str, all_users: Optional[QuerySet[UserActivity]] = None
|
||||
) -> NoReturn:
|
||||
too_old_data = ""
|
||||
if all_users is not None:
|
||||
recently_inactive_users = (
|
||||
all_users.filter(last_visit__lt=now - timedelta(minutes=10))
|
||||
.distinct("user_profile_id")
|
||||
.difference(
|
||||
all_users.filter(last_visit__lt=now - timedelta(minutes=60)).distinct(
|
||||
"user_profile_id"
|
||||
)
|
||||
)
|
||||
)
|
||||
too_old_data = "\nLast call to get_message for recently out of date mirrors:\n" + "\n".join(
|
||||
"{:>16}: {}".format(
|
||||
user.user_profile.email,
|
||||
user.last_visit.strftime("%Y-%m-%d %H:%M %Z"),
|
||||
)
|
||||
for user in recently_inactive_users
|
||||
)
|
||||
|
||||
with open(state_file_path + ".tmp", "w") as f:
|
||||
f.write(f"{int(now.timestamp())}|{states[state]}|{state}|{short_msg}{too_old_data}")
|
||||
os.rename(state_file_path + ".tmp", state_file_path)
|
||||
print(f"{state}: {short_msg}{too_old_data}")
|
||||
sys.exit(states[state])
|
||||
|
||||
|
||||
zephyr_client = get_client("zephyr_mirror")
|
||||
all_users = UserActivity.objects.filter(
|
||||
# We need to use the client_id so we can use the partial index we
|
||||
|
@ -84,23 +45,51 @@ all_users = UserActivity.objects.filter(
|
|||
query__in=["get_events", "/api/v1/events"],
|
||||
client_id=zephyr_client.id,
|
||||
)
|
||||
new_inactive_users = (
|
||||
new_inactive_user_count = (
|
||||
all_users.filter(last_visit__lt=now - timedelta(minutes=10))
|
||||
.values("user_profile_id")
|
||||
.distinct("user_profile_id")
|
||||
.count()
|
||||
)
|
||||
|
||||
old_inactive_users = (
|
||||
old_inactive_user_count = (
|
||||
all_users.filter(last_visit__lt=now - timedelta(minutes=60))
|
||||
.values("user_profile_id")
|
||||
.distinct("user_profile_id")
|
||||
.count()
|
||||
)
|
||||
|
||||
recently_inactive_users = new_inactive_users - old_inactive_users
|
||||
recently_inactive_user_count = new_inactive_user_count - old_inactive_user_count
|
||||
|
||||
if recently_inactive_users / float(old_inactive_users) > 0.25:
|
||||
report("CRITICAL", "Many mirrors recently became inactive", all_users)
|
||||
if recently_inactive_user_count / float(old_inactive_user_count) > 0.25:
|
||||
recently_inactive_users = (
|
||||
all_users.filter(last_visit__lt=now - timedelta(minutes=10))
|
||||
.distinct("user_profile_id")
|
||||
.difference(
|
||||
all_users.filter(last_visit__lt=now - timedelta(minutes=60)).distinct("user_profile_id")
|
||||
)
|
||||
)
|
||||
too_old_data = (
|
||||
"Many mirrors recently became inactive\n"
|
||||
"Last call to get_message for recently out of date mirrors:\n"
|
||||
+ "\n".join(
|
||||
"{:>16}: {}".format(
|
||||
user.user_profile.email,
|
||||
user.last_visit.strftime("%Y-%m-%d %H:%M %Z"),
|
||||
)
|
||||
for user in recently_inactive_users
|
||||
)
|
||||
)
|
||||
|
||||
sys.exit(
|
||||
atomic_nagios_write(
|
||||
"check_user_zephyr_mirror_liveness", "critical", too_old_data, int(now.timestamp())
|
||||
)
|
||||
)
|
||||
else:
|
||||
report("OK", "Most mirrors that were recently active continue to be active")
|
||||
atomic_nagios_write(
|
||||
"check_user_zephyr_mirror_liveness",
|
||||
"ok",
|
||||
"Most mirrors that were recently active continue to be active",
|
||||
int(now.timestamp()),
|
||||
)
|
||||
|
|
|
@ -6,12 +6,16 @@ class kandra::app_frontend_monitoring {
|
|||
include kandra::prometheus::uwsgi
|
||||
include kandra::prometheus::process
|
||||
kandra::firewall_allow { 'grok_exporter': port => '9144' }
|
||||
|
||||
file { '/etc/cron.d/rabbitmq-monitoring':
|
||||
ensure => file,
|
||||
require => Package[rabbitmq-server],
|
||||
owner => 'root',
|
||||
group => 'root',
|
||||
mode => '0644',
|
||||
source => 'puppet:///modules/kandra/cron.d/rabbitmq-monitoring',
|
||||
ensure => absent,
|
||||
}
|
||||
zulip::cron { 'check-rabbitmq-queue':
|
||||
minute => '*',
|
||||
command => '/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-queue',
|
||||
}
|
||||
zulip::cron { 'check-rabbitmq-consumers':
|
||||
minute => '*',
|
||||
command => '/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers',
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,6 +27,6 @@ class kandra::prod_app_frontend_once {
|
|||
zulip::cron { 'check_user_zephyr_mirror_liveness':
|
||||
hour => '*',
|
||||
minute => '*',
|
||||
command => '/usr/lib/nagios/plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness >/dev/null',
|
||||
command => '/usr/lib/nagios/plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness',
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,11 +13,12 @@ import random
|
|||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from typing import Any, Dict, List, NoReturn, Optional
|
||||
from typing import Any, Dict, List, Literal, NoReturn, Optional
|
||||
|
||||
sys.path.append(".")
|
||||
sys.path.append("/home/zulip/deployments/current")
|
||||
from scripts.lib.setup_path import setup_path
|
||||
from scripts.lib.zulip_tools import atomic_nagios_write
|
||||
|
||||
setup_path()
|
||||
|
||||
|
@ -47,43 +48,33 @@ parser.add_argument("--insecure", action="store_true")
|
|||
options = parser.parse_args()
|
||||
|
||||
|
||||
def report(state: str, timestamp: Any = None, msg: Optional[str] = None) -> NoReturn:
|
||||
states = {
|
||||
"OK": 0,
|
||||
"WARNING": 1,
|
||||
"CRITICAL": 2,
|
||||
"UNKNOWN": 3,
|
||||
}
|
||||
|
||||
now = int(time.time())
|
||||
def report(
|
||||
state: Literal["ok", "warning", "critical", "unknown"],
|
||||
timestamp: Optional[float] = None,
|
||||
msg: Optional[str] = None,
|
||||
) -> NoReturn:
|
||||
if msg is None:
|
||||
msg = f"send time was {timestamp}"
|
||||
state_file_path = "/var/lib/nagios_state/check_send_receive_state"
|
||||
with open(state_file_path + ".tmp", "w") as f:
|
||||
f.write(f"{now}|{states[state]}|{state}|{msg}\n")
|
||||
os.rename(state_file_path + ".tmp", state_file_path)
|
||||
if states[state] > 0:
|
||||
print(f"{state}: {msg}")
|
||||
sys.exit(states[state])
|
||||
sys.exit(atomic_nagios_write("check_send_receive_state", state, msg))
|
||||
|
||||
|
||||
def send_zulip(sender: zulip.Client, message: Dict[str, Any]) -> None:
|
||||
result = sender.send_message(message)
|
||||
if result["result"] != "success":
|
||||
report("CRITICAL", msg=f"Error sending Zulip, args were: {message}, {result}")
|
||||
report("critical", msg=f"Error sending Zulip, args were: {message}, {result}")
|
||||
|
||||
|
||||
def get_zulips() -> List[Dict[str, Any]]:
|
||||
global last_event_id
|
||||
res = zulip_recipient.get_events(queue_id=queue_id, last_event_id=last_event_id)
|
||||
if "error" in res.get("result", {}):
|
||||
report("CRITICAL", msg="Error receiving Zulips, error was: {}".format(res["msg"]))
|
||||
report("critical", msg="Error receiving Zulips, error was: {}".format(res["msg"]))
|
||||
for event in res["events"]:
|
||||
last_event_id = max(last_event_id, int(event["id"]))
|
||||
# If we get a heartbeat event, that means we've been hanging for
|
||||
# 40s, and we should bail.
|
||||
if "heartbeat" in (event["type"] for event in res["events"]):
|
||||
report("CRITICAL", msg="Got heartbeat waiting for Zulip, which means get_events is hanging")
|
||||
report("critical", msg="Got heartbeat waiting for Zulip, which means get_events is hanging")
|
||||
return [event["message"] for event in res["events"]]
|
||||
|
||||
|
||||
|
@ -120,10 +111,10 @@ zulip_recipient = zulip.Client(
|
|||
try:
|
||||
res = zulip_recipient.register(event_types=["message"])
|
||||
if "error" in res.get("result", {}):
|
||||
report("CRITICAL", msg="Error subscribing to Zulips: {}".format(res["msg"]))
|
||||
report("critical", msg="Error subscribing to Zulips: {}".format(res["msg"]))
|
||||
queue_id, last_event_id = (res["queue_id"], res["last_event_id"])
|
||||
except Exception:
|
||||
report("CRITICAL", msg=f"Error subscribing to Zulips:\n{traceback.format_exc()}")
|
||||
report("critical", msg=f"Error subscribing to Zulips:\n{traceback.format_exc()}")
|
||||
msg_to_send = str(random.getrandbits(64))
|
||||
time_start = time.perf_counter()
|
||||
|
||||
|
@ -148,8 +139,8 @@ while msg_to_send not in msg_content:
|
|||
zulip_recipient.deregister(queue_id)
|
||||
|
||||
if seconds_diff > 12:
|
||||
report("CRITICAL", timestamp=seconds_diff)
|
||||
report("critical", timestamp=seconds_diff)
|
||||
if seconds_diff > 3:
|
||||
report("WARNING", timestamp=seconds_diff)
|
||||
report("warning", timestamp=seconds_diff)
|
||||
else:
|
||||
report("OK", timestamp=seconds_diff)
|
||||
report("ok", timestamp=seconds_diff)
|
||||
|
|
|
@ -10,7 +10,7 @@ from typing import Any, DefaultDict, Dict, List
|
|||
ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
sys.path.append(ZULIP_PATH)
|
||||
from scripts.lib.zulip_tools import get_config, get_config_file
|
||||
from scripts.lib.zulip_tools import atomic_nagios_write, get_config, get_config_file
|
||||
|
||||
normal_queues = [
|
||||
"deferred_work",
|
||||
|
@ -190,8 +190,6 @@ def check_rabbitmq_queues() -> None:
|
|||
|
||||
status = max(result["status"] for result in results)
|
||||
|
||||
now = int(time.time())
|
||||
|
||||
if status > 0:
|
||||
queue_error_template = "queue {} problem: {}:{}"
|
||||
error_message = "; ".join(
|
||||
|
@ -199,6 +197,12 @@ def check_rabbitmq_queues() -> None:
|
|||
for result in results
|
||||
if result["status"] > 0
|
||||
)
|
||||
print(f"{now}|{status}|{states[status]}|{error_message}")
|
||||
sys.exit(
|
||||
atomic_nagios_write(
|
||||
"check-rabbitmq-results",
|
||||
"critical" if status == CRITICAL else "warning",
|
||||
error_message,
|
||||
)
|
||||
)
|
||||
else:
|
||||
print(f"{now}|{status}|{states[status]}|queues normal")
|
||||
atomic_nagios_write("check-rabbitmq-results", "ok", "queues normal")
|
||||
|
|
|
@ -16,7 +16,7 @@ import sys
|
|||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from typing import IO, Any, Dict, List, Optional, Sequence, Set, Union, overload
|
||||
from typing import IO, Any, Dict, List, Literal, Optional, Sequence, Set, Union, overload
|
||||
from urllib.parse import SplitResult
|
||||
|
||||
import zoneinfo
|
||||
|
@ -723,6 +723,32 @@ def listening_publicly(port: int) -> List[str]:
|
|||
return [line.split()[4] for line in lines]
|
||||
|
||||
|
||||
def atomic_nagios_write(
    name: str,
    status: Literal["ok", "warning", "critical", "unknown"],
    message: Optional[str] = None,
    event_time: Optional[int] = None,
) -> int:
    """Write a Nagios state file and return the numeric status code.

    The file ``/var/lib/nagios_state/<name>`` receives a single record of
    the form ``<event_time>|<status_int>|<status>|<message>`` followed by a
    newline.  The record is first written to ``<name>.tmp`` in the same
    directory and then renamed over the final path, so a reader never
    observes a partially written file.

    Args:
        name: File name (not a path) of the state file to write.
        status: One of "ok", "warning", "critical" or "unknown".
        message: Text for the last field; defaults to ``status`` itself.
        event_time: Integer timestamp for the first field; defaults to the
            current time in whole seconds.

    Returns:
        The numeric code for ``status`` (ok=0, warning=1, critical=2,
        unknown=3), suitable for passing straight to ``sys.exit``.

    Raises:
        ValueError: If ``status`` is not one of the four known names.
        OSError: If the state file cannot be written or renamed.
    """
    status_codes = {"ok": 0, "warning": 1, "critical": 2, "unknown": 3}
    # Validate before touching the filesystem: an unknown status must not
    # leave a truncated .tmp file behind or fail later with an unbound local.
    if status not in status_codes:
        raise ValueError(
            f"Unknown nagios status {status!r}; expected one of {sorted(status_codes)}"
        )
    status_int = status_codes[status]

    if message is None:
        message = status
    if event_time is None:
        event_time = int(time.time())

    path = "/var/lib/nagios_state/" + name
    with open(path + ".tmp", "w") as fh:
        fh.write("|".join([str(event_time), str(status_int), status, message]) + "\n")
    # Renaming within one directory replaces the old state file in one step.
    os.rename(path + ".tmp", path)
    return status_int
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cmd = sys.argv[1]
|
||||
if cmd == "make_deploy_path":
|
||||
|
|
|
@ -11,14 +11,12 @@ from typing import Dict
|
|||
ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
sys.path.append(ZULIP_PATH)
|
||||
from scripts.lib.check_rabbitmq_queue import normal_queues
|
||||
from scripts.lib.zulip_tools import get_config, get_config_file, get_tornado_ports
|
||||
|
||||
states = {
|
||||
0: "OK",
|
||||
1: "WARNING",
|
||||
2: "CRITICAL",
|
||||
3: "UNKNOWN",
|
||||
}
|
||||
from scripts.lib.zulip_tools import (
|
||||
atomic_nagios_write,
|
||||
get_config,
|
||||
get_config_file,
|
||||
get_tornado_ports,
|
||||
)
|
||||
|
||||
if "USER" in os.environ and os.environ["USER"] not in ["root", "rabbitmq"]:
|
||||
print("This script must be run as the root or rabbitmq user")
|
||||
|
@ -56,9 +54,6 @@ for line in output.split("\n"):
|
|||
now = int(time.time())
|
||||
|
||||
for queue_name in consumers:
|
||||
state_file_path = "/var/lib/nagios_state/check-rabbitmq-consumers-" + queue_name
|
||||
state_file_tmp = state_file_path + "-tmp"
|
||||
|
||||
target_count = 1
|
||||
if queue_name == "notify_tornado":
|
||||
target_count = TORNADO_PROCESSES
|
||||
|
@ -67,12 +62,8 @@ for queue_name in consumers:
|
|||
get_config(config_file, "application_server", "mobile_notification_shards", "1")
|
||||
)
|
||||
|
||||
if consumers[queue_name] < target_count:
|
||||
status = 2
|
||||
else:
|
||||
status = 0
|
||||
with open(state_file_tmp, "w") as f:
|
||||
f.write(
|
||||
f"{now}|{status}|{states[status]}|queue {queue_name} has {consumers[queue_name]} consumers, needs {target_count}\n"
|
||||
# Record this queue's consumer check in its own Nagios state file:
# "critical" when fewer consumers are attached than required, "ok" otherwise.
atomic_nagios_write(
    "check-rabbitmq-consumers-" + queue_name,
    "critical" if consumers[queue_name] < target_count else "ok",
    # Must be an f-string; without the prefix the state file would contain
    # the literal placeholder text instead of the queue name and counts.
    f"queue {queue_name} has {consumers[queue_name]} consumers, needs {target_count}",
)
|
||||
os.rename(state_file_tmp, state_file_path)
|
||||
|
|
|
@ -575,7 +575,7 @@ GOOGLE_ANALYTICS_ID: Optional[str] = None
|
|||
# This is overridden by dev_settings.py for droplets.
|
||||
IS_DEV_DROPLET = False
|
||||
|
||||
# Used by puppet/kandra/files/cron.d/check_send_receive_time.
|
||||
# Used by the `check_send_receive_time` monitoring tool.
|
||||
NAGIOS_BOT_HOST = SYSTEM_BOT_REALM + "." + EXTERNAL_HOST
|
||||
|
||||
# Use half of the available CPUs for data import purposes.
|
||||
|
|
Loading…
Reference in New Issue