tornado: Perform rolling client restarts after servers are restarted.

Decouple the sending of client reload events from the restarting of
the servers.  Restarts use the new Tornado web_reload_clients endpoint
to inject "reload" events into queues of clients which were loaded from
the previous Tornado process.  The rate is controlled by the
`application_server.client_reload_rate`, in clients per second, or a
flag to `reload-clients` which overrides it.  Note that a web client
will also spread its reload over 5 minutes, so artificially-slow
client reloads are generally not very necessary.

Restarts of clients are deferred until after post-deploy hooks are
run, such that the pre- and post- deploy hooks are around the actual
server restarts, even if pushing restart events to clients takes
significant time.
This commit is contained in:
Alex Vandiver 2024-02-08 21:04:07 +00:00 committed by Tim Abbott
parent 27d53ecbe1
commit ec6f64f7b0
5 changed files with 91 additions and 3 deletions

View File

@ -26,7 +26,7 @@ directory=/home/zulip/deployments/current/
<% if @tornado_ports.length > 1 -%> <% if @tornado_ports.length > 1 -%>
[program:zulip-tornado] [program:zulip-tornado]
command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:98%(process_num)02d command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:98%(process_num)02d --no-immediate-reloads
process_name=zulip-tornado-port-98%(process_num)02d process_name=zulip-tornado-port-98%(process_num)02d
environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>" environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>"
priority=200 ; the relative start priority (default 999) priority=200 ; the relative start priority (default 999)
@ -43,7 +43,7 @@ directory=/home/zulip/deployments/current/
numprocs=<%= @tornado_ports.length %> numprocs=<%= @tornado_ports.length %>
<% else -%> <% else -%>
[program:zulip-tornado] [program:zulip-tornado]
command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:9800 command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:9800 --no-immediate-reloads
environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>" environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>"
priority=200 ; the relative start priority (default 999) priority=200 ; the relative start priority (default 999)
autostart=true ; start at supervisord start (default: true) autostart=true ; start at supervisord start (default: true)

View File

@ -495,7 +495,7 @@ else:
) )
logging.info("Restarting Zulip...") logging.info("Restarting Zulip...")
start_args = ["--skip-checks"] start_args = ["--skip-checks", "--skip-client-reloads"]
if not HAS_FILLED_CACHES: if not HAS_FILLED_CACHES:
start_args.append("--fill-cache") start_args.append("--fill-cache")
if IS_SERVER_UP: if IS_SERVER_UP:
@ -509,6 +509,9 @@ else:
run_hooks("post-deploy") run_hooks("post-deploy")
if not args.skip_client_reloads:
subprocess.check_call(["./scripts/reload-clients"], preexec_fn=su_to_zulip)
if args.audit_fts_indexes: if args.audit_fts_indexes:
logging.info("Correcting full-text search indexes for updated dictionary files") logging.info("Correcting full-text search indexes for updated dictionary files")
logging.info("This may take a while but the server should work while it runs.") logging.info("This may take a while but the server should work while it runs.")

View File

@ -682,6 +682,11 @@ def start_arg_parser(action: str, add_help: bool = False) -> argparse.ArgumentPa
parser.add_argument( parser.add_argument(
"--skip-checks", action="store_true", help="Skip syntax and database checks" "--skip-checks", action="store_true", help="Skip syntax and database checks"
) )
parser.add_argument(
"--skip-client-reloads",
action="store_true",
help="Do not send reload events to web clients",
)
if action == "restart": if action == "restart":
parser.add_argument( parser.add_argument(
"--less-graceful", "--less-graceful",

75
scripts/reload-clients Executable file
View File

@ -0,0 +1,75 @@
#!/usr/bin/env python3
import argparse
import configparser
import logging
import os
import sys
import time
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from scripts.lib.setup_path import setup_path
setup_path()
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
from scripts.lib.zulip_tools import get_config, get_config_file, get_tornado_ports
config_file = get_config_file()

# Default reload rate (clients per second) comes from the
# application_server.client_reload_rate option; "50" is handed to
# get_config as the fallback value (presumably used when the option is unset).
configured_rate = get_config(config_file, "application_server", "client_reload_rate", "50")
reload_rate = int(configured_rate)

# A --rate flag on the command line overrides the configured value.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
    "--rate",
    type=int,
    default=reload_rate,
    help="Number of clients to reload per second",
)
args = arg_parser.parse_args()
reload_rate = args.rate

# The shared secret is sent along with each request to the internal
# Tornado endpoint.
secrets_parser = configparser.RawConfigParser()
secrets_parser.read("/etc/zulip/zulip-secrets.conf")
shared_secret = get_config(secrets_parser, "secrets", "shared_secret")
assert shared_secret

# Retry up to three times with exponential backoff, POST requests
# included.  If this request never gets through, clients may not reload,
# so some resilience is worthwhile.  Everything here talks to localhost,
# so the expected cause of failure is a Tornado restart rather than the
# network.
retried_methods = Retry.DEFAULT_ALLOWED_METHODS | {"POST"}
retry_policy = Retry(total=3, backoff_factor=1, allowed_methods=retried_methods)
c = requests.Session()
c.mount("http://", HTTPAdapter(max_retries=retry_policy))

# Emit log timestamps in UTC.
logging.Formatter.converter = time.gmtime
logging.basicConfig(
    format="%(asctime)s reload-clients: %(message)s",
    level=logging.INFO,
)
# Rather than make a sustained one request per second, we batch into
# 5-second chunks of 5 times the client_reload_rate.
SECONDS_PER_BATCH = 5
batch_size = reload_rate * SECONDS_PER_BATCH

# Walk every Tornado port in turn, asking each process to enqueue reload
# events for up to batch_size clients per request until it reports that
# it is complete.
for port in get_tornado_ports(config_file):
    logging.info("Starting to send client reload events to Tornado port %d", port)
    try:
        complete = False
        while not complete:
            logging.info("Sending reload events to %d clients", batch_size)
            resp = c.post(
                f"http://127.0.0.1:{port}/api/internal/web_reload_clients",
                data={"client_count": batch_size, "secret": shared_secret},
                timeout=5,
            )
            resp.raise_for_status()
            complete = resp.json()["complete"]
            # Sleep even after the final batch for this port, so that the
            # first batch sent to the next port is still spaced out.
            time.sleep(SECONDS_PER_BATCH)
    except requests.exceptions.RequestException:
        # Catch the RequestException base class rather than only
        # HTTPError: a refused connection, a timeout, or exhausted retries
        # (what a restarting Tornado produces) are raised as sibling
        # exception types and would otherwise abort the whole script.
        #
        # Failures in one shard likely won't affect other shards --
        # give up on this shard, and try the next one.
        logging.exception("Failed to send web_reload_clients request to Tornado port %d", port)

View File

@ -221,6 +221,11 @@ if action == "start" or args.less_graceful:
logging.info("Starting workers") logging.info("Starting workers")
subprocess.check_call(["supervisorctl", "start", *workers]) subprocess.check_call(["supervisorctl", "start", *workers])
if has_application_server() and not args.skip_client_reloads:
# All of the servers have been (re)started; now enqueue events in
# the Tornado servers to tell clients to reload.
subprocess.check_call(["./scripts/reload-clients"])
logging.info("Done!") logging.info("Done!")
print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC) print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)