mirror of https://github.com/zulip/zulip.git
puppet: Factor out pattern of writing a nagios state file atomically.
This commit is contained in:
parent
230040caa9
commit
f246b82f67
|
@ -1,13 +1,13 @@
|
||||||
import os
|
from dataclasses import dataclass
|
||||||
import time
|
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from typing import Any, Dict
|
from typing import Any, Literal
|
||||||
|
|
||||||
from django.utils.timezone import now as timezone_now
|
from django.utils.timezone import now as timezone_now
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
from analytics.lib.counts import ALL_COUNT_STATS, CountStat
|
from analytics.lib.counts import ALL_COUNT_STATS, CountStat
|
||||||
from analytics.models import installation_epoch
|
from analytics.models import installation_epoch
|
||||||
|
from scripts.lib.zulip_tools import atomic_nagios_write
|
||||||
from zerver.lib.management import ZulipBaseCommand
|
from zerver.lib.management import ZulipBaseCommand
|
||||||
from zerver.lib.timestamp import TimeZoneNotUTCError, floor_to_day, floor_to_hour, verify_UTC
|
from zerver.lib.timestamp import TimeZoneNotUTCError, floor_to_day, floor_to_hour, verify_UTC
|
||||||
from zerver.models import Realm
|
from zerver.models import Realm
|
||||||
|
@ -20,6 +20,12 @@ states = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class NagiosResult:
    """Result of a monitoring check: a Nagios status keyword plus a detail message."""

    # Lowercase Nagios state name; restricted to these four keywords.
    status: Literal["ok", "warning", "critical", "unknown"]
    # Human-readable text describing the result.
    message: str
|
||||||
|
|
||||||
|
|
||||||
class Command(ZulipBaseCommand):
|
class Command(ZulipBaseCommand):
|
||||||
help = """Checks FillState table.
|
help = """Checks FillState table.
|
||||||
|
|
||||||
|
@ -28,19 +34,11 @@ class Command(ZulipBaseCommand):
|
||||||
@override
|
@override
|
||||||
def handle(self, *args: Any, **options: Any) -> None:
|
def handle(self, *args: Any, **options: Any) -> None:
|
||||||
fill_state = self.get_fill_state()
|
fill_state = self.get_fill_state()
|
||||||
status = fill_state["status"]
|
atomic_nagios_write("check-analytics-state", fill_state.status, fill_state.message)
|
||||||
message = fill_state["message"]
|
|
||||||
|
|
||||||
state_file_path = "/var/lib/nagios_state/check-analytics-state"
|
def get_fill_state(self) -> NagiosResult:
|
||||||
state_file_tmp = state_file_path + "-tmp"
|
|
||||||
|
|
||||||
with open(state_file_tmp, "w") as f:
|
|
||||||
f.write(f"{int(time.time())}|{status}|{states[status]}|{message}\n")
|
|
||||||
os.rename(state_file_tmp, state_file_path)
|
|
||||||
|
|
||||||
def get_fill_state(self) -> Dict[str, Any]:
|
|
||||||
if not Realm.objects.exists():
|
if not Realm.objects.exists():
|
||||||
return {"status": 0, "message": "No realms exist, so not checking FillState."}
|
return NagiosResult(status="ok", message="No realms exist, so not checking FillState.")
|
||||||
|
|
||||||
warning_unfilled_properties = []
|
warning_unfilled_properties = []
|
||||||
critical_unfilled_properties = []
|
critical_unfilled_properties = []
|
||||||
|
@ -51,7 +49,9 @@ class Command(ZulipBaseCommand):
|
||||||
try:
|
try:
|
||||||
verify_UTC(last_fill)
|
verify_UTC(last_fill)
|
||||||
except TimeZoneNotUTCError:
|
except TimeZoneNotUTCError:
|
||||||
return {"status": 2, "message": f"FillState not in UTC for {property}"}
|
return NagiosResult(
|
||||||
|
status="critical", message=f"FillState not in UTC for {property}"
|
||||||
|
)
|
||||||
|
|
||||||
if stat.frequency == CountStat.DAY:
|
if stat.frequency == CountStat.DAY:
|
||||||
floor_function = floor_to_day
|
floor_function = floor_to_day
|
||||||
|
@ -63,10 +63,10 @@ class Command(ZulipBaseCommand):
|
||||||
critical_threshold = timedelta(minutes=150)
|
critical_threshold = timedelta(minutes=150)
|
||||||
|
|
||||||
if floor_function(last_fill) != last_fill:
|
if floor_function(last_fill) != last_fill:
|
||||||
return {
|
return NagiosResult(
|
||||||
"status": 2,
|
status="critical",
|
||||||
"message": f"FillState not on {stat.frequency} boundary for {property}",
|
message=f"FillState not on {stat.frequency} boundary for {property}",
|
||||||
}
|
)
|
||||||
|
|
||||||
time_to_last_fill = timezone_now() - last_fill
|
time_to_last_fill = timezone_now() - last_fill
|
||||||
if time_to_last_fill > critical_threshold:
|
if time_to_last_fill > critical_threshold:
|
||||||
|
@ -75,18 +75,18 @@ class Command(ZulipBaseCommand):
|
||||||
warning_unfilled_properties.append(property)
|
warning_unfilled_properties.append(property)
|
||||||
|
|
||||||
if len(critical_unfilled_properties) == 0 and len(warning_unfilled_properties) == 0:
|
if len(critical_unfilled_properties) == 0 and len(warning_unfilled_properties) == 0:
|
||||||
return {"status": 0, "message": "FillState looks fine."}
|
return NagiosResult(status="ok", message="FillState looks fine.")
|
||||||
if len(critical_unfilled_properties) == 0:
|
if len(critical_unfilled_properties) == 0:
|
||||||
return {
|
return NagiosResult(
|
||||||
"status": 1,
|
status="warning",
|
||||||
"message": "Missed filling {} once.".format(
|
message="Missed filling {} once.".format(
|
||||||
", ".join(warning_unfilled_properties),
|
", ".join(warning_unfilled_properties),
|
||||||
),
|
),
|
||||||
}
|
)
|
||||||
return {
|
return NagiosResult(
|
||||||
"status": 2,
|
status="critical",
|
||||||
"message": "Missed filling {} once. Missed filling {} at least twice.".format(
|
message="Missed filling {} once. Missed filling {} at least twice.".format(
|
||||||
", ".join(warning_unfilled_properties),
|
", ".join(warning_unfilled_properties),
|
||||||
", ".join(critical_unfilled_properties),
|
", ".join(critical_unfilled_properties),
|
||||||
),
|
),
|
||||||
}
|
)
|
||||||
|
|
|
@ -1,26 +0,0 @@
|
||||||
# Edit this file to introduce tasks to be run by cron.
|
|
||||||
#
|
|
||||||
# Each task to run has to be defined through a single line
|
|
||||||
# indicating with different fields when the task will be run
|
|
||||||
# and what command to run for the task
|
|
||||||
#
|
|
||||||
# To define the time you can provide concrete values for
|
|
||||||
# minute (m), hour (h), day of month (dom), month (mon),
|
|
||||||
# and day of week (dow) or use '*' in these fields (for 'any').#
|
|
||||||
# Notice that tasks will be started based on the cron's system
|
|
||||||
# daemon's notion of time and timezones.
|
|
||||||
#
|
|
||||||
# Output of the crontab jobs (including errors) is sent through
|
|
||||||
# email to the user the crontab file belongs to (unless redirected).
|
|
||||||
#
|
|
||||||
# For example, you can run a backup of all your user accounts
|
|
||||||
# at 5 a.m every week with:
|
|
||||||
# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/
|
|
||||||
#
|
|
||||||
# For more information see the manual pages of crontab(5) and cron(8)
|
|
||||||
#
|
|
||||||
# m h dom mon dow command
|
|
||||||
SHELL=/bin/bash
|
|
||||||
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-queue &> /var/lib/nagios_state/check-rabbitmq-results-tmp; mv /var/lib/nagios_state/check-rabbitmq-results-tmp /var/lib/nagios_state/check-rabbitmq-results
|
|
||||||
* * * * * root /home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
|
|
|
@ -10,15 +10,14 @@ Django ORM.
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from typing import NoReturn, Optional
|
|
||||||
|
|
||||||
sys.path.append("/home/zulip/deployments/current")
|
sys.path.append("/home/zulip/deployments/current")
|
||||||
from scripts.lib.setup_path import setup_path
|
from scripts.lib.setup_path import setup_path
|
||||||
|
from scripts.lib.zulip_tools import atomic_nagios_write
|
||||||
|
|
||||||
setup_path()
|
setup_path()
|
||||||
|
|
||||||
import django
|
import django
|
||||||
from django.db.models import QuerySet
|
|
||||||
from django.utils.timezone import now as timezone_now
|
from django.utils.timezone import now as timezone_now
|
||||||
|
|
||||||
os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"
|
os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"
|
||||||
|
@ -27,51 +26,13 @@ sys.path.append("/home/zulip/deployments/current/zerver")
|
||||||
|
|
||||||
django.setup()
|
django.setup()
|
||||||
|
|
||||||
from typing import Dict
|
|
||||||
|
|
||||||
from zerver.models import UserActivity
|
from zerver.models import UserActivity
|
||||||
from zerver.models.clients import get_client
|
from zerver.models.clients import get_client
|
||||||
|
|
||||||
states: Dict[str, int] = {
|
|
||||||
"OK": 0,
|
|
||||||
"WARNING": 1,
|
|
||||||
"CRITICAL": 2,
|
|
||||||
"UNKNOWN": 3,
|
|
||||||
}
|
|
||||||
|
|
||||||
state_file_path = "/var/lib/nagios_state/check_user_zephyr_mirror_liveness"
|
|
||||||
now = timezone_now()
|
now = timezone_now()
|
||||||
|
|
||||||
|
|
||||||
def report(
|
|
||||||
state: str, short_msg: str, all_users: Optional[QuerySet[UserActivity]] = None
|
|
||||||
) -> NoReturn:
|
|
||||||
too_old_data = ""
|
|
||||||
if all_users is not None:
|
|
||||||
recently_inactive_users = (
|
|
||||||
all_users.filter(last_visit__lt=now - timedelta(minutes=10))
|
|
||||||
.distinct("user_profile_id")
|
|
||||||
.difference(
|
|
||||||
all_users.filter(last_visit__lt=now - timedelta(minutes=60)).distinct(
|
|
||||||
"user_profile_id"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
too_old_data = "\nLast call to get_message for recently out of date mirrors:\n" + "\n".join(
|
|
||||||
"{:>16}: {}".format(
|
|
||||||
user.user_profile.email,
|
|
||||||
user.last_visit.strftime("%Y-%m-%d %H:%M %Z"),
|
|
||||||
)
|
|
||||||
for user in recently_inactive_users
|
|
||||||
)
|
|
||||||
|
|
||||||
with open(state_file_path + ".tmp", "w") as f:
|
|
||||||
f.write(f"{int(now.timestamp())}|{states[state]}|{state}|{short_msg}{too_old_data}")
|
|
||||||
os.rename(state_file_path + ".tmp", state_file_path)
|
|
||||||
print(f"{state}: {short_msg}{too_old_data}")
|
|
||||||
sys.exit(states[state])
|
|
||||||
|
|
||||||
|
|
||||||
zephyr_client = get_client("zephyr_mirror")
|
zephyr_client = get_client("zephyr_mirror")
|
||||||
all_users = UserActivity.objects.filter(
|
all_users = UserActivity.objects.filter(
|
||||||
# We need to use the client_id so we can use the partial index we
|
# We need to use the client_id so we can use the partial index we
|
||||||
|
@ -84,23 +45,51 @@ all_users = UserActivity.objects.filter(
|
||||||
query__in=["get_events", "/api/v1/events"],
|
query__in=["get_events", "/api/v1/events"],
|
||||||
client_id=zephyr_client.id,
|
client_id=zephyr_client.id,
|
||||||
)
|
)
|
||||||
new_inactive_users = (
|
new_inactive_user_count = (
|
||||||
all_users.filter(last_visit__lt=now - timedelta(minutes=10))
|
all_users.filter(last_visit__lt=now - timedelta(minutes=10))
|
||||||
.values("user_profile_id")
|
.values("user_profile_id")
|
||||||
.distinct("user_profile_id")
|
.distinct("user_profile_id")
|
||||||
.count()
|
.count()
|
||||||
)
|
)
|
||||||
|
|
||||||
old_inactive_users = (
|
old_inactive_user_count = (
|
||||||
all_users.filter(last_visit__lt=now - timedelta(minutes=60))
|
all_users.filter(last_visit__lt=now - timedelta(minutes=60))
|
||||||
.values("user_profile_id")
|
.values("user_profile_id")
|
||||||
.distinct("user_profile_id")
|
.distinct("user_profile_id")
|
||||||
.count()
|
.count()
|
||||||
)
|
)
|
||||||
|
|
||||||
recently_inactive_users = new_inactive_users - old_inactive_users
|
recently_inactive_user_count = new_inactive_user_count - old_inactive_user_count
|
||||||
|
|
||||||
if recently_inactive_users / float(old_inactive_users) > 0.25:
|
if recently_inactive_user_count / float(old_inactive_user_count) > 0.25:
|
||||||
report("CRITICAL", "Many mirrors recently became inactive", all_users)
|
recently_inactive_users = (
|
||||||
|
all_users.filter(last_visit__lt=now - timedelta(minutes=10))
|
||||||
|
.distinct("user_profile_id")
|
||||||
|
.difference(
|
||||||
|
all_users.filter(last_visit__lt=now - timedelta(minutes=60)).distinct("user_profile_id")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
too_old_data = (
|
||||||
|
"Many mirrors recently became inactive\n"
|
||||||
|
"Last call to get_message for recently out of date mirrors:\n"
|
||||||
|
+ "\n".join(
|
||||||
|
"{:>16}: {}".format(
|
||||||
|
user.user_profile.email,
|
||||||
|
user.last_visit.strftime("%Y-%m-%d %H:%M %Z"),
|
||||||
|
)
|
||||||
|
for user in recently_inactive_users
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
sys.exit(
|
||||||
|
atomic_nagios_write(
|
||||||
|
"check_user_zephyr_mirror_liveness", "critical", too_old_data, int(now.timestamp())
|
||||||
|
)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
report("OK", "Most mirrors that were recently active continue to be active")
|
atomic_nagios_write(
|
||||||
|
"check_user_zephyr_mirror_liveness",
|
||||||
|
"ok",
|
||||||
|
"Most mirrors that were recently active continue to be active",
|
||||||
|
int(now.timestamp()),
|
||||||
|
)
|
||||||
|
|
|
@ -6,12 +6,16 @@ class kandra::app_frontend_monitoring {
|
||||||
include kandra::prometheus::uwsgi
|
include kandra::prometheus::uwsgi
|
||||||
include kandra::prometheus::process
|
include kandra::prometheus::process
|
||||||
kandra::firewall_allow { 'grok_exporter': port => '9144' }
|
kandra::firewall_allow { 'grok_exporter': port => '9144' }
|
||||||
|
|
||||||
file { '/etc/cron.d/rabbitmq-monitoring':
|
file { '/etc/cron.d/rabbitmq-monitoring':
|
||||||
ensure => file,
|
ensure => absent,
|
||||||
require => Package[rabbitmq-server],
|
}
|
||||||
owner => 'root',
|
zulip::cron { 'check-rabbitmq-queue':
|
||||||
group => 'root',
|
minute => '*',
|
||||||
mode => '0644',
|
command => '/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-queue',
|
||||||
source => 'puppet:///modules/kandra/cron.d/rabbitmq-monitoring',
|
}
|
||||||
|
zulip::cron { 'check-rabbitmq-consumers':
|
||||||
|
minute => '*',
|
||||||
|
command => '/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers',
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,6 +27,6 @@ class kandra::prod_app_frontend_once {
|
||||||
zulip::cron { 'check_user_zephyr_mirror_liveness':
|
zulip::cron { 'check_user_zephyr_mirror_liveness':
|
||||||
hour => '*',
|
hour => '*',
|
||||||
minute => '*',
|
minute => '*',
|
||||||
command => '/usr/lib/nagios/plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness >/dev/null',
|
command => '/usr/lib/nagios/plugins/zulip_zephyr_mirror/check_user_zephyr_mirror_liveness',
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,11 +13,12 @@ import random
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from typing import Any, Dict, List, NoReturn, Optional
|
from typing import Any, Dict, List, Literal, NoReturn, Optional
|
||||||
|
|
||||||
sys.path.append(".")
|
sys.path.append(".")
|
||||||
sys.path.append("/home/zulip/deployments/current")
|
sys.path.append("/home/zulip/deployments/current")
|
||||||
from scripts.lib.setup_path import setup_path
|
from scripts.lib.setup_path import setup_path
|
||||||
|
from scripts.lib.zulip_tools import atomic_nagios_write
|
||||||
|
|
||||||
setup_path()
|
setup_path()
|
||||||
|
|
||||||
|
@ -47,43 +48,33 @@ parser.add_argument("--insecure", action="store_true")
|
||||||
options = parser.parse_args()
|
options = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def report(state: str, timestamp: Any = None, msg: Optional[str] = None) -> NoReturn:
|
def report(
|
||||||
states = {
|
state: Literal["ok", "warning", "critical", "unknown"],
|
||||||
"OK": 0,
|
timestamp: Optional[float] = None,
|
||||||
"WARNING": 1,
|
msg: Optional[str] = None,
|
||||||
"CRITICAL": 2,
|
) -> NoReturn:
|
||||||
"UNKNOWN": 3,
|
|
||||||
}
|
|
||||||
|
|
||||||
now = int(time.time())
|
|
||||||
if msg is None:
|
if msg is None:
|
||||||
msg = f"send time was {timestamp}"
|
msg = f"send time was {timestamp}"
|
||||||
state_file_path = "/var/lib/nagios_state/check_send_receive_state"
|
sys.exit(atomic_nagios_write("check_send_receive_state", state, msg))
|
||||||
with open(state_file_path + ".tmp", "w") as f:
|
|
||||||
f.write(f"{now}|{states[state]}|{state}|{msg}\n")
|
|
||||||
os.rename(state_file_path + ".tmp", state_file_path)
|
|
||||||
if states[state] > 0:
|
|
||||||
print(f"{state}: {msg}")
|
|
||||||
sys.exit(states[state])
|
|
||||||
|
|
||||||
|
|
||||||
def send_zulip(sender: zulip.Client, message: Dict[str, Any]) -> None:
|
def send_zulip(sender: zulip.Client, message: Dict[str, Any]) -> None:
|
||||||
result = sender.send_message(message)
|
result = sender.send_message(message)
|
||||||
if result["result"] != "success":
|
if result["result"] != "success":
|
||||||
report("CRITICAL", msg=f"Error sending Zulip, args were: {message}, {result}")
|
report("critical", msg=f"Error sending Zulip, args were: {message}, {result}")
|
||||||
|
|
||||||
|
|
||||||
def get_zulips() -> List[Dict[str, Any]]:
|
def get_zulips() -> List[Dict[str, Any]]:
|
||||||
global last_event_id
|
global last_event_id
|
||||||
res = zulip_recipient.get_events(queue_id=queue_id, last_event_id=last_event_id)
|
res = zulip_recipient.get_events(queue_id=queue_id, last_event_id=last_event_id)
|
||||||
if "error" in res.get("result", {}):
|
if "error" in res.get("result", {}):
|
||||||
report("CRITICAL", msg="Error receiving Zulips, error was: {}".format(res["msg"]))
|
report("critical", msg="Error receiving Zulips, error was: {}".format(res["msg"]))
|
||||||
for event in res["events"]:
|
for event in res["events"]:
|
||||||
last_event_id = max(last_event_id, int(event["id"]))
|
last_event_id = max(last_event_id, int(event["id"]))
|
||||||
# If we get a heartbeat event, that means we've been hanging for
|
# If we get a heartbeat event, that means we've been hanging for
|
||||||
# 40s, and we should bail.
|
# 40s, and we should bail.
|
||||||
if "heartbeat" in (event["type"] for event in res["events"]):
|
if "heartbeat" in (event["type"] for event in res["events"]):
|
||||||
report("CRITICAL", msg="Got heartbeat waiting for Zulip, which means get_events is hanging")
|
report("critical", msg="Got heartbeat waiting for Zulip, which means get_events is hanging")
|
||||||
return [event["message"] for event in res["events"]]
|
return [event["message"] for event in res["events"]]
|
||||||
|
|
||||||
|
|
||||||
|
@ -120,10 +111,10 @@ zulip_recipient = zulip.Client(
|
||||||
try:
|
try:
|
||||||
res = zulip_recipient.register(event_types=["message"])
|
res = zulip_recipient.register(event_types=["message"])
|
||||||
if "error" in res.get("result", {}):
|
if "error" in res.get("result", {}):
|
||||||
report("CRITICAL", msg="Error subscribing to Zulips: {}".format(res["msg"]))
|
report("critical", msg="Error subscribing to Zulips: {}".format(res["msg"]))
|
||||||
queue_id, last_event_id = (res["queue_id"], res["last_event_id"])
|
queue_id, last_event_id = (res["queue_id"], res["last_event_id"])
|
||||||
except Exception:
|
except Exception:
|
||||||
report("CRITICAL", msg=f"Error subscribing to Zulips:\n{traceback.format_exc()}")
|
report("critical", msg=f"Error subscribing to Zulips:\n{traceback.format_exc()}")
|
||||||
msg_to_send = str(random.getrandbits(64))
|
msg_to_send = str(random.getrandbits(64))
|
||||||
time_start = time.perf_counter()
|
time_start = time.perf_counter()
|
||||||
|
|
||||||
|
@ -148,8 +139,8 @@ while msg_to_send not in msg_content:
|
||||||
zulip_recipient.deregister(queue_id)
|
zulip_recipient.deregister(queue_id)
|
||||||
|
|
||||||
if seconds_diff > 12:
|
if seconds_diff > 12:
|
||||||
report("CRITICAL", timestamp=seconds_diff)
|
report("critical", timestamp=seconds_diff)
|
||||||
if seconds_diff > 3:
|
if seconds_diff > 3:
|
||||||
report("WARNING", timestamp=seconds_diff)
|
report("warning", timestamp=seconds_diff)
|
||||||
else:
|
else:
|
||||||
report("OK", timestamp=seconds_diff)
|
report("ok", timestamp=seconds_diff)
|
||||||
|
|
|
@ -10,7 +10,7 @@ from typing import Any, DefaultDict, Dict, List
|
||||||
ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
sys.path.append(ZULIP_PATH)
|
sys.path.append(ZULIP_PATH)
|
||||||
from scripts.lib.zulip_tools import get_config, get_config_file
|
from scripts.lib.zulip_tools import atomic_nagios_write, get_config, get_config_file
|
||||||
|
|
||||||
normal_queues = [
|
normal_queues = [
|
||||||
"deferred_work",
|
"deferred_work",
|
||||||
|
@ -190,8 +190,6 @@ def check_rabbitmq_queues() -> None:
|
||||||
|
|
||||||
status = max(result["status"] for result in results)
|
status = max(result["status"] for result in results)
|
||||||
|
|
||||||
now = int(time.time())
|
|
||||||
|
|
||||||
if status > 0:
|
if status > 0:
|
||||||
queue_error_template = "queue {} problem: {}:{}"
|
queue_error_template = "queue {} problem: {}:{}"
|
||||||
error_message = "; ".join(
|
error_message = "; ".join(
|
||||||
|
@ -199,6 +197,12 @@ def check_rabbitmq_queues() -> None:
|
||||||
for result in results
|
for result in results
|
||||||
if result["status"] > 0
|
if result["status"] > 0
|
||||||
)
|
)
|
||||||
print(f"{now}|{status}|{states[status]}|{error_message}")
|
sys.exit(
|
||||||
|
atomic_nagios_write(
|
||||||
|
"check-rabbitmq-results",
|
||||||
|
"critical" if status == CRITICAL else "warning",
|
||||||
|
error_message,
|
||||||
|
)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
print(f"{now}|{status}|{states[status]}|queues normal")
|
atomic_nagios_write("check-rabbitmq-results", "ok", "queues normal")
|
||||||
|
|
|
@ -16,7 +16,7 @@ import sys
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from typing import IO, Any, Dict, List, Optional, Sequence, Set, Union, overload
|
from typing import IO, Any, Dict, List, Literal, Optional, Sequence, Set, Union, overload
|
||||||
from urllib.parse import SplitResult
|
from urllib.parse import SplitResult
|
||||||
|
|
||||||
import zoneinfo
|
import zoneinfo
|
||||||
|
@ -723,6 +723,32 @@ def listening_publicly(port: int) -> List[str]:
|
||||||
return [line.split()[4] for line in lines]
|
return [line.split()[4] for line in lines]
|
||||||
|
|
||||||
|
|
||||||
|
def atomic_nagios_write(
    name: str,
    status: Literal["ok", "warning", "critical", "unknown"],
    message: Optional[str] = None,
    event_time: Optional[int] = None,
    *,
    state_dir: str = "/var/lib/nagios_state",
) -> int:
    """Record a check result in a Nagios state file via write-then-rename.

    A single line of the form
    ``<event_time>|<status code>|<status>|<message>`` is written to
    ``<state_dir>/<name>.tmp``, which is then renamed over
    ``<state_dir>/<name>`` so the target is replaced in one step.

    Args:
        name: File name of the state file inside ``state_dir``.
        status: Nagios state keyword.
        message: Detail text; defaults to ``status`` when omitted.
        event_time: Unix timestamp to record; defaults to the current time.
        state_dir: Directory that holds the state files.

    Returns:
        The numeric code for ``status`` (0 ok, 1 warning, 2 critical,
        3 unknown), suitable for passing to ``sys.exit``.

    Raises:
        ValueError: If ``status`` is not one of the four known keywords.
    """
    status_codes = {"ok": 0, "warning": 1, "critical": 2, "unknown": 3}
    # Validate before touching the filesystem, so a bad status can neither
    # leave a truncated .tmp file behind nor surface as an UnboundLocalError.
    try:
        status_int = status_codes[status]
    except KeyError:
        raise ValueError(f"Unknown Nagios status: {status!r}") from None

    if message is None:
        message = status
    if event_time is None:
        event_time = int(time.time())

    path = os.path.join(state_dir, name)
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as fh:
        fh.write(f"{event_time}|{status_int}|{status}|{message}\n")
    # Rename only after the file is fully written and closed.
    os.rename(tmp_path, path)
    return status_int
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
cmd = sys.argv[1]
|
cmd = sys.argv[1]
|
||||||
if cmd == "make_deploy_path":
|
if cmd == "make_deploy_path":
|
||||||
|
|
|
@ -11,14 +11,12 @@ from typing import Dict
|
||||||
ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
ZULIP_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
sys.path.append(ZULIP_PATH)
|
sys.path.append(ZULIP_PATH)
|
||||||
from scripts.lib.check_rabbitmq_queue import normal_queues
|
from scripts.lib.check_rabbitmq_queue import normal_queues
|
||||||
from scripts.lib.zulip_tools import get_config, get_config_file, get_tornado_ports
|
from scripts.lib.zulip_tools import (
|
||||||
|
atomic_nagios_write,
|
||||||
states = {
|
get_config,
|
||||||
0: "OK",
|
get_config_file,
|
||||||
1: "WARNING",
|
get_tornado_ports,
|
||||||
2: "CRITICAL",
|
)
|
||||||
3: "UNKNOWN",
|
|
||||||
}
|
|
||||||
|
|
||||||
if "USER" in os.environ and os.environ["USER"] not in ["root", "rabbitmq"]:
|
if "USER" in os.environ and os.environ["USER"] not in ["root", "rabbitmq"]:
|
||||||
print("This script must be run as the root or rabbitmq user")
|
print("This script must be run as the root or rabbitmq user")
|
||||||
|
@ -56,9 +54,6 @@ for line in output.split("\n"):
|
||||||
now = int(time.time())
|
now = int(time.time())
|
||||||
|
|
||||||
for queue_name in consumers:
|
for queue_name in consumers:
|
||||||
state_file_path = "/var/lib/nagios_state/check-rabbitmq-consumers-" + queue_name
|
|
||||||
state_file_tmp = state_file_path + "-tmp"
|
|
||||||
|
|
||||||
target_count = 1
|
target_count = 1
|
||||||
if queue_name == "notify_tornado":
|
if queue_name == "notify_tornado":
|
||||||
target_count = TORNADO_PROCESSES
|
target_count = TORNADO_PROCESSES
|
||||||
|
@ -67,12 +62,8 @@ for queue_name in consumers:
|
||||||
get_config(config_file, "application_server", "mobile_notification_shards", "1")
|
get_config(config_file, "application_server", "mobile_notification_shards", "1")
|
||||||
)
|
)
|
||||||
|
|
||||||
if consumers[queue_name] < target_count:
|
atomic_nagios_write(
|
||||||
status = 2
|
"check-rabbitmq-consumers-" + queue_name,
|
||||||
else:
|
"critical" if consumers[queue_name] < target_count else "ok",
|
||||||
status = 0
|
# Must be an f-string so the queue name and counts are interpolated into the state message.
f"queue {queue_name} has {consumers[queue_name]} consumers, needs {target_count}",
|
||||||
with open(state_file_tmp, "w") as f:
|
|
||||||
f.write(
|
|
||||||
f"{now}|{status}|{states[status]}|queue {queue_name} has {consumers[queue_name]} consumers, needs {target_count}\n"
|
|
||||||
)
|
)
|
||||||
os.rename(state_file_tmp, state_file_path)
|
|
||||||
|
|
|
@ -575,7 +575,7 @@ GOOGLE_ANALYTICS_ID: Optional[str] = None
|
||||||
# This is overridden by dev_settings.py for droplets.
|
# This is overridden by dev_settings.py for droplets.
|
||||||
IS_DEV_DROPLET = False
|
IS_DEV_DROPLET = False
|
||||||
|
|
||||||
# Used by puppet/kandra/files/cron.d/check_send_receive_time.
|
# Used by the `check_send_receive_time` monitoring tool.
|
||||||
NAGIOS_BOT_HOST = SYSTEM_BOT_REALM + "." + EXTERNAL_HOST
|
NAGIOS_BOT_HOST = SYSTEM_BOT_REALM + "." + EXTERNAL_HOST
|
||||||
|
|
||||||
# Use half of the available CPUs for data import purposes.
|
# Use half of the available CPUs for data import purposes.
|
||||||
|
|
Loading…
Reference in New Issue