From 5ee4b642adf1eeed09fd6da376e77299a6ff79e7 Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Tue, 12 Sep 2023 20:34:54 +0000 Subject: [PATCH] views: Add a /health healthcheck endpoint. This endpoint verifies that the services that Zulip needs to function are running, and Django can talk to them. It is designed to be used as a readiness probe[^1] for Zulip, either by Kubernetes, or some other reverse-proxy load-balancer in front of Zulip. Because of this, it limits access to only localhost and the IP addresses of configured reverse proxies. Tests are limited because we cannot stop running services (which would impact other concurrent tests) and there would be extremely limited utility in mocking the very specific methods we're calling to raise the exceptions that we're looking for. [^1]: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ --- puppet/zulip/manifests/app_frontend_base.pp | 8 +++ .../nginx/healthcheck.conf.template.erb | 12 ++++ zerver/lib/exceptions.py | 6 ++ zerver/middleware.py | 2 +- zerver/tests/test_health.py | 24 +++++++ zerver/views/health.py | 65 +++++++++++++++++++ zproject/urls.py | 4 ++ 7 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 puppet/zulip/templates/nginx/healthcheck.conf.template.erb create mode 100644 zerver/tests/test_health.py create mode 100644 zerver/views/health.py diff --git a/puppet/zulip/manifests/app_frontend_base.pp b/puppet/zulip/manifests/app_frontend_base.pp index 001100db6b..122dc8572c 100644 --- a/puppet/zulip/manifests/app_frontend_base.pp +++ b/puppet/zulip/manifests/app_frontend_base.pp @@ -66,6 +66,14 @@ class zulip::app_frontend_base { notify => Service['nginx'], } } + file { '/etc/nginx/zulip-include/app.d/healthcheck.conf': + require => File['/etc/nginx/zulip-include/app.d'], + owner => 'root', + group => 'root', + mode => '0644', + content => template('zulip/nginx/healthcheck.conf.template.erb'), + notify => Service['nginx'], + } 
file { '/etc/nginx/zulip-include/upstreams': require => Package[$zulip::common::nginx], diff --git a/puppet/zulip/templates/nginx/healthcheck.conf.template.erb b/puppet/zulip/templates/nginx/healthcheck.conf.template.erb new file mode 100644 index 0000000000..bad4a7d98d --- /dev/null +++ b/puppet/zulip/templates/nginx/healthcheck.conf.template.erb @@ -0,0 +1,12 @@ +location /health { + allow 127.0.0.1; + allow ::1; + +<% @loadbalancers.each do |host| -%> + allow <%= host %>; +<% end -%> + + deny all; + + include uwsgi_params; +} diff --git a/zerver/lib/exceptions.py b/zerver/lib/exceptions.py index 47e23c168d..bbfa71dfbd 100644 --- a/zerver/lib/exceptions.py +++ b/zerver/lib/exceptions.py @@ -42,6 +42,7 @@ class ErrorCode(Enum): MOVE_MESSAGES_TIME_LIMIT_EXCEEDED = auto() REACTION_ALREADY_EXISTS = auto() REACTION_DOES_NOT_EXIST = auto() + SERVER_NOT_READY = auto() class JsonableError(Exception): @@ -533,3 +534,8 @@ class ApiParamValidationError(JsonableError): def __init__(self, msg: str, error_type: str) -> None: super().__init__(msg) self.error_type = error_type + + +class ServerNotReadyError(JsonableError): + code = ErrorCode.SERVER_NOT_READY + http_status_code = 500 diff --git a/zerver/middleware.py b/zerver/middleware.py index 80167f5a42..a021cc0c9b 100644 --- a/zerver/middleware.py +++ b/zerver/middleware.py @@ -539,7 +539,7 @@ class HostDomainMiddleware(MiddlewareMixin): # # API authentication will end up checking for an invalid # realm, and throw a JSON-format error if appropriate. 
- if request.path.startswith(("/static/", "/api/", "/json/")): + if request.path.startswith(("/static/", "/api/", "/json/")) or request.path == "/health": return None subdomain = get_subdomain(request) diff --git a/zerver/tests/test_health.py b/zerver/tests/test_health.py new file mode 100644 index 0000000000..7b284a51f1 --- /dev/null +++ b/zerver/tests/test_health.py @@ -0,0 +1,24 @@ +from unittest import mock + +from zerver.lib.exceptions import ServerNotReadyError +from zerver.lib.test_classes import ZulipTestCase + + +class HealthTest(ZulipTestCase): + def test_healthy(self) -> None: + # We do not actually use rabbitmq in tests, so this fails + # unless it's mocked out. + with mock.patch("zerver.views.health.check_rabbitmq"): + result = self.client_get("/health") + self.assert_json_success(result) + + def test_database_failure(self) -> None: + with mock.patch( + "zerver.views.health.check_database", + side_effect=ServerNotReadyError("Cannot query postgresql"), + ), self.assertLogs(level="ERROR") as logs: + result = self.client_get("/health") + self.assert_json_error(result, "Cannot query postgresql", status_code=500) + self.assertIn( + "zerver.lib.exceptions.ServerNotReadyError: Cannot query postgresql", logs.output[0] + ) diff --git a/zerver/views/health.py b/zerver/views/health.py new file mode 100644 index 0000000000..9d2ce56ffe --- /dev/null +++ b/zerver/views/health.py @@ -0,0 +1,65 @@ +from django.db.migrations.recorder import MigrationRecorder +from django.http import HttpRequest, HttpResponse +from django.utils.crypto import get_random_string +from django.utils.translation import gettext as _ +from pika import BlockingConnection + +from zerver.lib.cache import cache_delete, cache_get, cache_set +from zerver.lib.exceptions import ServerNotReadyError +from zerver.lib.queue import get_queue_client +from zerver.lib.redis_utils import get_redis_client +from zerver.lib.response import json_success + + +def check_database() -> None: + try: + if not 
MigrationRecorder.Migration.objects.exists(): + raise ServerNotReadyError(_("Database is empty")) # nocoverage + except ServerNotReadyError: # nocoverage + raise + except Exception: # nocoverage + raise ServerNotReadyError(_("Cannot query postgresql")) + + +def check_rabbitmq() -> None: # nocoverage + try: + conn = get_queue_client().connection + if conn is None: + raise ServerNotReadyError(_("Cannot connect to rabbitmq")) + assert isinstance(conn, BlockingConnection) + conn.process_data_events() + except ServerNotReadyError: + raise + except Exception: + raise ServerNotReadyError(_("Cannot query rabbitmq")) + + +def check_redis() -> None: + try: + get_redis_client().ping() + except Exception: # nocoverage + raise ServerNotReadyError(_("Cannot query redis")) + + +def check_memcached() -> None: + try: + roundtrip_key = "health_check_" + get_random_string(32) + roundtrip_value = get_random_string(32) + cache_set(roundtrip_key, roundtrip_value) + got_value = cache_get(roundtrip_key)[0] + if got_value != roundtrip_value: + raise ServerNotReadyError(_("Cannot write to memcached")) # nocoverage + cache_delete(roundtrip_key) + except ServerNotReadyError: # nocoverage + raise + except Exception: # nocoverage + raise ServerNotReadyError(_("Cannot query memcached")) + + +def health(request: HttpRequest) -> HttpResponse: + check_database() + check_rabbitmq() + check_redis() + check_memcached() + + return json_success(request) diff --git a/zproject/urls.py b/zproject/urls.py index 06708eb818..c8adc6235f 100644 --- a/zproject/urls.py +++ b/zproject/urls.py @@ -54,6 +54,7 @@ from zerver.views.documentation import IntegrationView, MarkdownDirectoryView, i from zerver.views.drafts import create_drafts, delete_draft, edit_draft, fetch_drafts from zerver.views.email_mirror import email_mirror_message from zerver.views.events_register import events_register_backend +from zerver.views.health import health from zerver.views.home import accounts_accept_terms, desktop_home, home from 
zerver.views.hotspots import mark_hotspot_as_read from zerver.views.invite import ( @@ -836,6 +837,9 @@ urls += [ path("api/v1/", include(v1_api_mobile_patterns)), ] +# Healthcheck URL +urls += [path("health", health)] + # The sequence is important; if i18n URLs don't come first then # reverse URL mapping points to i18n URLs which causes the frontend # tests to fail