mirror of https://github.com/zulip/zulip.git
tornado: Perform rolling client restarts after servers are restarted.
Decouple the sending of client reload events from the restarting of the servers. Reloads use the new Tornado `web_reload_clients` endpoint to inject "reload" events into the queues of clients which were loaded from the previous Tornado process. The rate is controlled by the `application_server.client_reload_rate` setting, in clients per second, or by the `--rate` flag to `reload-clients`, which overrides it. Note that a web client will also spread its reload over 5 minutes, so artificially slow client reloads are generally not very necessary. Reloads of clients are deferred until after post-deploy hooks are run, such that the pre- and post-deploy hooks are around the actual server restarts, even if pushing reload events to clients takes significant time.
This commit is contained in:
parent
27d53ecbe1
commit
ec6f64f7b0
|
@ -26,7 +26,7 @@ directory=/home/zulip/deployments/current/
|
||||||
|
|
||||||
<% if @tornado_ports.length > 1 -%>
|
<% if @tornado_ports.length > 1 -%>
|
||||||
[program:zulip-tornado]
|
[program:zulip-tornado]
|
||||||
command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:98%(process_num)02d
|
command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:98%(process_num)02d --no-immediate-reloads
|
||||||
process_name=zulip-tornado-port-98%(process_num)02d
|
process_name=zulip-tornado-port-98%(process_num)02d
|
||||||
environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>"
|
environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>"
|
||||||
priority=200 ; the relative start priority (default 999)
|
priority=200 ; the relative start priority (default 999)
|
||||||
|
@ -43,7 +43,7 @@ directory=/home/zulip/deployments/current/
|
||||||
numprocs=<%= @tornado_ports.length %>
|
numprocs=<%= @tornado_ports.length %>
|
||||||
<% else -%>
|
<% else -%>
|
||||||
[program:zulip-tornado]
|
[program:zulip-tornado]
|
||||||
command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:9800
|
command=/home/zulip/deployments/current/manage.py runtornado 127.0.0.1:9800 --no-immediate-reloads
|
||||||
environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>"
|
environment=PYTHONUNBUFFERED=1,HTTP_proxy="<%= @proxy %>",HTTPS_proxy="<%= @proxy %>"
|
||||||
priority=200 ; the relative start priority (default 999)
|
priority=200 ; the relative start priority (default 999)
|
||||||
autostart=true ; start at supervisord start (default: true)
|
autostart=true ; start at supervisord start (default: true)
|
||||||
|
|
|
@ -495,7 +495,7 @@ else:
|
||||||
)
|
)
|
||||||
|
|
||||||
logging.info("Restarting Zulip...")
|
logging.info("Restarting Zulip...")
|
||||||
start_args = ["--skip-checks"]
|
start_args = ["--skip-checks", "--skip-client-reloads"]
|
||||||
if not HAS_FILLED_CACHES:
|
if not HAS_FILLED_CACHES:
|
||||||
start_args.append("--fill-cache")
|
start_args.append("--fill-cache")
|
||||||
if IS_SERVER_UP:
|
if IS_SERVER_UP:
|
||||||
|
@ -509,6 +509,9 @@ else:
|
||||||
|
|
||||||
run_hooks("post-deploy")
|
run_hooks("post-deploy")
|
||||||
|
|
||||||
|
if not args.skip_client_reloads:
|
||||||
|
subprocess.check_call(["./scripts/reload-clients"], preexec_fn=su_to_zulip)
|
||||||
|
|
||||||
if args.audit_fts_indexes:
|
if args.audit_fts_indexes:
|
||||||
logging.info("Correcting full-text search indexes for updated dictionary files")
|
logging.info("Correcting full-text search indexes for updated dictionary files")
|
||||||
logging.info("This may take a while but the server should work while it runs.")
|
logging.info("This may take a while but the server should work while it runs.")
|
||||||
|
|
|
@ -682,6 +682,11 @@ def start_arg_parser(action: str, add_help: bool = False) -> argparse.ArgumentPa
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--skip-checks", action="store_true", help="Skip syntax and database checks"
|
"--skip-checks", action="store_true", help="Skip syntax and database checks"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--skip-client-reloads",
|
||||||
|
action="store_true",
|
||||||
|
help="Do not send reload events to web clients",
|
||||||
|
)
|
||||||
if action == "restart":
|
if action == "restart":
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--less-graceful",
|
"--less-graceful",
|
||||||
|
|
|
@ -0,0 +1,75 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import configparser
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
|
from scripts.lib.setup_path import setup_path
|
||||||
|
|
||||||
|
setup_path()
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
from urllib3.util import Retry
|
||||||
|
|
||||||
|
from scripts.lib.zulip_tools import get_config, get_config_file, get_tornado_ports
|
||||||
|
|
||||||
|
# Default reload rate comes from the `application_server.client_reload_rate`
# setting (falling back to 50); `--rate` on the command line overrides it.
config_file = get_config_file()
reload_rate = int(
    get_config(
        config_file,
        "application_server",
        "client_reload_rate",
        "50",
    )
)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--rate", type=int, help="Number of clients to reload per second", default=reload_rate
)

args = parser.parse_args()
reload_rate = args.rate

# The internal Tornado endpoint is authenticated with the shared secret.
secret_config_file = configparser.RawConfigParser()
secret_config_file.read("/etc/zulip/zulip-secrets.conf")
shared_secret = get_config(secret_config_file, "secrets", "shared_secret")
assert shared_secret

# Perform relatively slow retries (2s, 4s, 8s) with backoff, including
# on POST requests.  Failure to send this request successfully means
# that clients may fail to reload, so we want to be somewhat resilient
# to failures.  Since we are on localhost, we do not expect network
# failures, only Tornado restarts, to cause failures here.
retry = Retry(total=3, backoff_factor=1, allowed_methods=Retry.DEFAULT_ALLOWED_METHODS | {"POST"})
c = requests.Session()
c.mount("http://", HTTPAdapter(max_retries=retry))

logging.Formatter.converter = time.gmtime
logging.basicConfig(format="%(asctime)s reload-clients: %(message)s", level=logging.INFO)

# Rather than make a sustained one request per second, we batch
# into 5-second chunks of 5 times the client_reload_rate
SECONDS_PER_BATCH = 5
clients_per_batch = reload_rate * SECONDS_PER_BATCH

for port in get_tornado_ports(config_file):
    logging.info("Starting to send client reload events to Tornado port %d", port)
    try:
        complete = False
        while not complete:
            logging.info("Sending reload events to %d clients", clients_per_batch)
            resp = c.post(
                f"http://127.0.0.1:{port}/api/internal/web_reload_clients",
                data={"client_count": clients_per_batch, "secret": shared_secret},
                timeout=5,
            )
            resp.raise_for_status()
            complete = resp.json()["complete"]
            # Pace the batches so the effective rate stays at reload_rate
            # clients per second.
            time.sleep(SECONDS_PER_BATCH)
    except requests.exceptions.RequestException:
        # Catch the whole requests exception family, not just HTTPError:
        # once the retries above are exhausted (or the timeout fires),
        # requests raises ConnectionError / Timeout, which are siblings
        # of HTTPError and would otherwise abort the script before the
        # remaining shards are processed.
        #
        # Failures in one shard likely won't affect other shards --
        # give up on this shard, and try the next one.
        logging.exception("Failed to send web_reload_clients request to Tornado port %d", port)
|
|
@ -221,6 +221,11 @@ if action == "start" or args.less_graceful:
|
||||||
logging.info("Starting workers")
|
logging.info("Starting workers")
|
||||||
subprocess.check_call(["supervisorctl", "start", *workers])
|
subprocess.check_call(["supervisorctl", "start", *workers])
|
||||||
|
|
||||||
|
if has_application_server() and not args.skip_client_reloads:
|
||||||
|
# All of the servers have been (re)started; now enqueue events in
|
||||||
|
# the Tornado servers to tell clients to reload.
|
||||||
|
subprocess.check_call(["./scripts/reload-clients"])
|
||||||
|
|
||||||
logging.info("Done!")
|
logging.info("Done!")
|
||||||
print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)
|
print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue