diff --git a/puppet/zulip/templates/supervisor/zulip.conf.template.erb b/puppet/zulip/templates/supervisor/zulip.conf.template.erb index 1b5a00103b..683bd17cd5 100644 --- a/puppet/zulip/templates/supervisor/zulip.conf.template.erb +++ b/puppet/zulip/templates/supervisor/zulip.conf.template.erb @@ -26,7 +26,7 @@ directory=/home/zulip/deployments/current/ <% if @tornado_ports.length > 1 -%> [program:zulip-tornado] -command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:98%(process_num)02d +command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:98%(process_num)02d --no-immediate-reloads process_name=zulip-tornado-port-98%(process_num)02d environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>" priority=200 ; the relative start priority (default 999) @@ -43,7 +43,7 @@ directory=/home/zulip/deployments/current/ numprocs=<%= @tornado_ports.length %> <% else -%> [program:zulip-tornado] -command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:9800 +command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:9800 --no-immediate-reloads environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>" priority=200 ; the relative start priority (default 999) autostart=true ; start at supervisord start (default: true) diff --git a/scripts/lib/upgrade-zulip-stage-2 b/scripts/lib/upgrade-zulip-stage-2 index b7105e452f..fee5d30a60 100755 --- a/scripts/lib/upgrade-zulip-stage-2 +++ b/scripts/lib/upgrade-zulip-stage-2 @@ -495,7 +495,7 @@ else: ) logging.info("Restarting Zulip...") - start_args = ["--skip-checks"] + start_args = ["--skip-checks", "--skip-client-reloads"] if not HAS_FILLED_CACHES: start_args.append("--fill-cache") if IS_SERVER_UP: @@ -509,6 +509,9 @@ else: run_hooks("post-deploy") + if not args.skip_client_reloads: + subprocess.check_call(["./scripts/reload-clients"], preexec_fn=su_to_zulip) + if args.audit_fts_indexes: logging.info("Correcting full-text search 
indexes for updated dictionary files") logging.info("This may take a while but the server should work while it runs.") diff --git a/scripts/lib/zulip_tools.py b/scripts/lib/zulip_tools.py index 3f392beb49..8708428d44 100755 --- a/scripts/lib/zulip_tools.py +++ b/scripts/lib/zulip_tools.py @@ -682,6 +682,11 @@ def start_arg_parser(action: str, add_help: bool = False) -> argparse.ArgumentPa parser.add_argument( "--skip-checks", action="store_true", help="Skip syntax and database checks" ) + parser.add_argument( + "--skip-client-reloads", + action="store_true", + help="Do not send reload events to web clients", + ) if action == "restart": parser.add_argument( "--less-graceful", diff --git a/scripts/reload-clients b/scripts/reload-clients new file mode 100755 index 0000000000..6982e22cc4 --- /dev/null +++ b/scripts/reload-clients @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +import argparse +import configparser +import logging +import os +import sys +import time + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +from scripts.lib.setup_path import setup_path + +setup_path() + +import requests +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + +from scripts.lib.zulip_tools import get_config, get_config_file, get_tornado_ports + +config_file = get_config_file() +reload_rate = int( + get_config( + config_file, + "application_server", + "client_reload_rate", + "50", + ) +) + +parser = argparse.ArgumentParser() +parser.add_argument( + "--rate", type=int, help="Number of clients to reload per second", default=reload_rate +) + +args = parser.parse_args() +reload_rate = args.rate + +secret_config_file = configparser.RawConfigParser() +secret_config_file.read("/etc/zulip/zulip-secrets.conf") +shared_secret = get_config(secret_config_file, "secrets", "shared_secret") +assert shared_secret + +# Perform relatively slow retries (2s, 4s, 8s) with backoff, including +# on POST requests. 
Failure to send this request successfully means +# that clients may fail to reload, so we want to be somewhat resilient +# to failures. Since we are on localhost, we do not expect network +# failures, only Tornado restarts, to cause failures here. +retry = Retry(total=3, backoff_factor=1, allowed_methods=Retry.DEFAULT_ALLOWED_METHODS | {"POST"}) +c = requests.Session() +c.mount("http://", HTTPAdapter(max_retries=retry)) + +logging.Formatter.converter = time.gmtime +logging.basicConfig(format="%(asctime)s reload-clients: %(message)s", level=logging.INFO) + +for port in get_tornado_ports(config_file): + logging.info("Starting to send client reload events to Tornado port %d", port) + try: + complete = False + # Rather than make a sustained one request per second, we batch + # into 5-second chunks of 5 times the client_reload_rate + SECONDS_PER_BATCH = 5 + while not complete: + logging.info("Sending reload events to %d clients", reload_rate * SECONDS_PER_BATCH) + resp = c.post( + f"http://127.0.0.1:{port}/api/internal/web_reload_clients", + data={"client_count": reload_rate * SECONDS_PER_BATCH, "secret": shared_secret}, + timeout=5, + ) + resp.raise_for_status() + complete = resp.json()["complete"] + time.sleep(SECONDS_PER_BATCH) + except requests.exceptions.RequestException: + # A failure (HTTP error, connection error, or timeout) in one shard + # likely won't affect other shards -- give up on it, and try the next. + logging.exception("Failed to send web_reload_clients request to Tornado port %d", port) diff --git a/scripts/restart-server b/scripts/restart-server index 2c1bfe7d46..c25bfbfa98 100755 --- a/scripts/restart-server +++ b/scripts/restart-server @@ -221,6 +221,11 @@ if action == "start" or args.less_graceful: logging.info("Starting workers") subprocess.check_call(["supervisorctl", "start", *workers]) +if has_application_server() and not args.skip_client_reloads: + # All of the servers have been (re)started; now enqueue events in + # the Tornado servers to tell clients to reload. 
+ subprocess.check_call(["./scripts/reload-clients"]) + logging.info("Done!") print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)