mirror of https://github.com/zulip/zulip.git
tornado: Perform rolling client restarts after servers are restarted.
Decouple the sending of client reload events from the restarting of the servers. Reloads use the new Tornado `web_reload_clients` endpoint to inject "reload" events into queues of clients which were loaded from the previous Tornado process. The rate is controlled by `application_server.client_reload_rate`, in clients per second, or by the `--rate` flag to `reload-clients`, which overrides it. Note that a web client will also spread its reload over 5 minutes, so artificially-slow client reloads are generally not very necessary. Client reloads are deferred until after post-deploy hooks are run, such that the pre- and post-deploy hooks are around the actual server restarts, even if pushing reload events to clients takes significant time.
This commit is contained in:
parent
27d53ecbe1
commit
ec6f64f7b0
|
@ -26,7 +26,7 @@ directory=/home/zulip/deployments/current/
|
|||
|
||||
<% if @tornado_ports.length > 1 -%>
|
||||
[program:zulip-tornado]
|
||||
command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:98%(process_num)02d
|
||||
command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:98%(process_num)02d --no-immediate-reloads
|
||||
process_name=zulip-tornado-port-98%(process_num)02d
|
||||
environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>"
|
||||
priority=200 ; the relative start priority (default 999)
|
||||
|
@ -43,7 +43,7 @@ directory=/home/zulip/deployments/current/
|
|||
numprocs=<%= @tornado_ports.length %>
|
||||
<% else -%>
|
||||
[program:zulip-tornado]
|
||||
command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:9800
|
||||
command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:9800 --no-immediate-reloads
|
||||
environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>"
|
||||
priority=200 ; the relative start priority (default 999)
|
||||
autostart=true ; start at supervisord start (default: true)
|
||||
|
|
|
@ -495,7 +495,7 @@ else:
|
|||
)
|
||||
|
||||
logging.info("Restarting Zulip...")
|
||||
start_args = ["--skip-checks"]
|
||||
start_args = ["--skip-checks", "--skip-client-reloads"]
|
||||
if not HAS_FILLED_CACHES:
|
||||
start_args.append("--fill-cache")
|
||||
if IS_SERVER_UP:
|
||||
|
@ -509,6 +509,9 @@ else:
|
|||
|
||||
run_hooks("post-deploy")
|
||||
|
||||
if not args.skip_client_reloads:
|
||||
subprocess.check_call(["./scripts/reload-clients"], preexec_fn=su_to_zulip)
|
||||
|
||||
if args.audit_fts_indexes:
|
||||
logging.info("Correcting full-text search indexes for updated dictionary files")
|
||||
logging.info("This may take a while but the server should work while it runs.")
|
||||
|
|
|
@ -682,6 +682,11 @@ def start_arg_parser(action: str, add_help: bool = False) -> argparse.ArgumentPa
|
|||
parser.add_argument(
|
||||
"--skip-checks", action="store_true", help="Skip syntax and database checks"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-client-reloads",
|
||||
action="store_true",
|
||||
help="Do not send reload events to web clients",
|
||||
)
|
||||
if action == "restart":
|
||||
parser.add_argument(
|
||||
"--less-graceful",
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
#!/usr/bin/env python3
"""Ask each Tornado shard to send reload events to its web clients.

For every port returned by get_tornado_ports(), this script repeatedly
POSTs to that shard's /api/internal/web_reload_clients endpoint,
requesting one batch of clients per call, until the JSON response
reports "complete".  The per-second rate comes from the
application_server.client_reload_rate setting (default 50) and can be
overridden with --rate.
"""
import argparse
import configparser
import logging
import os
import sys
import time

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from scripts.lib.setup_path import setup_path

setup_path()

import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from scripts.lib.zulip_tools import get_config, get_config_file, get_tornado_ports

# Rather than make a sustained one request per second, we batch
# into 5-second chunks of 5 times the client_reload_rate.
SECONDS_PER_BATCH = 5

config_file = get_config_file()
reload_rate = int(
    get_config(
        config_file,
        "application_server",
        "client_reload_rate",
        "50",
    )
)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--rate", type=int, help="Number of clients to reload per second", default=reload_rate
)
args = parser.parse_args()
reload_rate = args.rate

secret_config_file = configparser.RawConfigParser()
secret_config_file.read("/etc/zulip/zulip-secrets.conf")
shared_secret = get_config(secret_config_file, "secrets", "shared_secret")
if not shared_secret:
    # An explicit check rather than `assert`, which is stripped under -O.
    sys.exit("reload-clients: shared_secret is not set in /etc/zulip/zulip-secrets.conf")

# Perform relatively slow retries (2s, 4s, 8s) with backoff, including
# on POST requests.  Failure to send this request successfully means
# that clients may fail to reload, so we want to be somewhat resilient
# to failures.  Since we are on localhost, we do not expect network
# failures, only Tornado restarts, to cause failures here.
retry = Retry(total=3, backoff_factor=1, allowed_methods=Retry.DEFAULT_ALLOWED_METHODS | {"POST"})
session = requests.Session()
session.mount("http://", HTTPAdapter(max_retries=retry))

logging.Formatter.converter = time.gmtime
logging.basicConfig(format="%(asctime)s reload-clients: %(message)s", level=logging.INFO)

# The batch size does not change between requests, so compute it once.
batch_size = reload_rate * SECONDS_PER_BATCH

for port in get_tornado_ports(config_file):
    logging.info("Starting to send client reload events to Tornado port %d", port)
    try:
        complete = False
        while not complete:
            logging.info("Sending reload events to %d clients", batch_size)
            resp = session.post(
                f"http://127.0.0.1:{port}/api/internal/web_reload_clients",
                data={"client_count": batch_size, "secret": shared_secret},
                timeout=5,
            )
            resp.raise_for_status()
            complete = resp.json()["complete"]
            if not complete:
                # Only pace ourselves when another batch will follow; there
                # is nothing to wait for once this shard reports completion.
                time.sleep(SECONDS_PER_BATCH)
    except requests.exceptions.RequestException:
        # Failures in one shard likely won't affect other shards --
        # give up on this shard, and try the next one.  We catch the
        # RequestException base class (not just HTTPError) so that
        # connection errors and timeouts which outlast the retries
        # also move on to the next shard instead of aborting the script.
        logging.exception("Failed to send web_reload_clients request to Tornado port %d", port)
|
|
@ -221,6 +221,11 @@ if action == "start" or args.less_graceful:
|
|||
logging.info("Starting workers")
|
||||
subprocess.check_call(["supervisorctl", "start", *workers])
|
||||
|
||||
if has_application_server() and not args.skip_client_reloads:
|
||||
# All of the servers have been (re)started; now enqueue events in
|
||||
# the Tornado servers to tell clients to reload.
|
||||
subprocess.check_call(["./scripts/reload-clients"])
|
||||
|
||||
logging.info("Done!")
|
||||
print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)
|
||||
|
||||
|
|
Loading…
Reference in New Issue