tornado: Perform rolling client restarts after servers are restarted.

Decouple the sending of client reload events from the restarting of
the servers.  Restarts use the new Tornado `web_reload_clients` endpoint
to inject "reload" events into the queues of clients which were loaded
from the previous Tornado process.  The rate is controlled by the
`application_server.client_reload_rate` setting, in clients per second,
or by a `--rate` flag to `reload-clients` which overrides it.  Note that
a web client also spreads its own reload over 5 minutes, so
artificially slow client reloads are generally unnecessary.
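
For a rough sense of the pacing this implies, here is a back-of-the-envelope sketch (not part of the change itself); the rate and batch size are the defaults from `scripts/reload-clients` below, and the connected-client count is purely illustrative:

reload_rate = 50            # application_server.client_reload_rate default, clients/second
seconds_per_batch = 5       # SECONDS_PER_BATCH in scripts/reload-clients
connected_clients = 10_000  # hypothetical number of open web clients

clients_per_request = reload_rate * seconds_per_batch  # 250 clients asked to reload per POST
total_seconds = connected_clients / reload_rate        # ~200s to enqueue every reload event
print(f"{clients_per_request} clients per request, ~{total_seconds:.0f}s total")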

Client reloads are deferred until after the post-deploy hooks are run,
so that the pre- and post-deploy hooks bracket only the actual server
restarts, even if pushing reload events to clients takes significant
time.
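
The resulting ordering during a deploy, condensed into a schematic sketch (the helpers here are illustrative stubs, not the real upgrade-zulip-stage-2 code):

import subprocess

def run_hooks(kind: str) -> None:
    print(f"running {kind} hooks")  # stand-in for the real hook runner

def deploy(skip_client_reloads: bool = False) -> None:
    run_hooks("pre-deploy")
    # restart-server is told to skip client reloads so that the hooks
    # bracket only the server restart itself.
    subprocess.check_call(["./scripts/restart-server", "--skip-checks", "--skip-client-reloads"])
    run_hooks("post-deploy")
    if not skip_client_reloads:
        # Only now are reload events slowly pushed to connected web clients.
        subprocess.check_call(["./scripts/reload-clients"])
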
Alex Vandiver 2024-02-08 21:04:07 +00:00 committed by Tim Abbott
parent 27d53ecbe1
commit ec6f64f7b0
5 changed files with 91 additions and 3 deletions


@@ -26,7 +26,7 @@ directory=/home/zulip/deployments/current/
 <% if @tornado_ports.length > 1 -%>
 [program:zulip-tornado]
-command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:98%(process_num)02d
+command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:98%(process_num)02d --no-immediate-reloads
 process_name=zulip-tornado-port-98%(process_num)02d
 environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>"
 priority=200 ; the relative start priority (default 999)
@@ -43,7 +43,7 @@ directory=/home/zulip/deployments/current/
 numprocs=<%= @tornado_ports.length %>
 <% else -%>
 [program:zulip-tornado]
-command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:9800
+command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:9800 --no-immediate-reloads
 environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>"
 priority=200 ; the relative start priority (default 999)
 autostart=true ; start at supervisord start (default: true)
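
Per the commit description, `--no-immediate-reloads` keeps a freshly started Tornado process from immediately pushing reload events to the clients whose queues it re-loaded from the previous process, leaving that to `scripts/reload-clients`. As an illustration only (this is not the actual runtornado code), such a switch might be declared on a management command like so:

from argparse import ArgumentParser

def add_arguments(parser: ArgumentParser) -> None:
    # Hypothetical declaration; the real runtornado wiring may differ.
    parser.add_argument(
        "--no-immediate-reloads",
        action="store_false",
        dest="immediate_reloads",
        help="Do not immediately send reload events to clients restored "
        "from the previous Tornado process",
    )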


@@ -495,7 +495,7 @@ else:
     )
     logging.info("Restarting Zulip...")
-    start_args = ["--skip-checks"]
+    start_args = ["--skip-checks", "--skip-client-reloads"]
     if not HAS_FILLED_CACHES:
         start_args.append("--fill-cache")
     if IS_SERVER_UP:
@@ -509,6 +509,9 @@ else:
     run_hooks("post-deploy")
+if not args.skip_client_reloads:
+    subprocess.check_call(["./scripts/reload-clients"], preexec_fn=su_to_zulip)
 if args.audit_fts_indexes:
     logging.info("Correcting full-text search indexes for updated dictionary files")
     logging.info("This may take a while but the server should work while it runs.")


@@ -682,6 +682,11 @@ def start_arg_parser(action: str, add_help: bool = False) -> argparse.ArgumentParser:
     parser.add_argument(
         "--skip-checks", action="store_true", help="Skip syntax and database checks"
     )
+    parser.add_argument(
+        "--skip-client-reloads",
+        action="store_true",
+        help="Do not send reload events to web clients",
+    )
     if action == "restart":
         parser.add_argument(
             "--less-graceful",

scripts/reload-clients Executable file

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
import argparse
import configparser
import logging
import os
import sys
import time

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from scripts.lib.setup_path import setup_path

setup_path()

import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from scripts.lib.zulip_tools import get_config, get_config_file, get_tornado_ports

config_file = get_config_file()
reload_rate = int(
    get_config(
        config_file,
        "application_server",
        "client_reload_rate",
        "50",
    )
)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--rate", type=int, help="Number of clients to reload per second", default=reload_rate
)
args = parser.parse_args()
reload_rate = args.rate

secret_config_file = configparser.RawConfigParser()
secret_config_file.read("/etc/zulip/zulip-secrets.conf")
shared_secret = get_config(secret_config_file, "secrets", "shared_secret")
assert shared_secret

# Perform relatively slow retries (2s, 4s, 8s) with backoff, including
# on POST requests.  Failure to send this request successfully means
# that clients may fail to reload, so we want to be somewhat resilient
# to failures.  Since we are on localhost, we do not expect network
# failures, only Tornado restarts, to cause failures here.
retry = Retry(total=3, backoff_factor=1, allowed_methods=Retry.DEFAULT_ALLOWED_METHODS | {"POST"})
c = requests.Session()
c.mount("http://", HTTPAdapter(max_retries=retry))

logging.Formatter.converter = time.gmtime
logging.basicConfig(format="%(asctime)s reload-clients: %(message)s", level=logging.INFO)

for port in get_tornado_ports(config_file):
    logging.info("Starting to send client reload events to Tornado port %d", port)
    try:
        complete = False

        # Rather than make a sustained one request per second, we batch
        # into 5-second chunks of 5 times the client_reload_rate
        SECONDS_PER_BATCH = 5
        while not complete:
            logging.info("Sending reload events to %d clients", reload_rate * SECONDS_PER_BATCH)
            resp = c.post(
                f"http://127.0.0.1:{port}/api/internal/web_reload_clients",
                data={"client_count": reload_rate * SECONDS_PER_BATCH, "secret": shared_secret},
                timeout=5,
            )
            resp.raise_for_status()
            complete = resp.json()["complete"]

            time.sleep(SECONDS_PER_BATCH)
    except requests.exceptions.HTTPError:
        # Failures in one shard likely won't affect other shards --
        # give up on this shard, and try the next one,
        logging.exception("Failed to send web_reload_clients request to Tornado port %d", port)


@@ -221,6 +221,11 @@ if action == "start" or args.less_graceful:
     logging.info("Starting workers")
     subprocess.check_call(["supervisorctl", "start", *workers])
+if has_application_server() and not args.skip_client_reloads:
+    # All of the servers have been (re)started; now enqueue events in
+    # the Tornado servers to tell clients to reload.
+    subprocess.check_call(["./scripts/reload-clients"])
 logging.info("Done!")
 print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)