mirror of https://github.com/zulip/zulip.git
restart-server: Reorder supervisorctl calls for less downtime.
Instead of taking the "onion" approach, where all services are stopped, and then started back up again, default to a rolling restart across all processes. This draws out how long the overall "restart" takes, but minimizes the time that any of the services are down. This minimizes user-visible impact and queue buildup. In cases where speed is more important than minimal impact (for example, there is already a current outage), a --less-graceful flag is provided, which brings the services down more suddenly, and back up in a still-correct order.
This commit is contained in:
parent
3116f72894
commit
daabc52a78
|
@ -29,6 +29,12 @@ logging.basicConfig(format=f"%(asctime)s {action}-server: %(message)s", level=lo
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--fill-cache", action="store_true", help="Fill the memcached caches")
|
parser.add_argument("--fill-cache", action="store_true", help="Fill the memcached caches")
|
||||||
|
if action == "restart":
|
||||||
|
parser.add_argument(
|
||||||
|
"--less-graceful",
|
||||||
|
action="store_true",
|
||||||
|
help="Restart with more concern for expediency than minimizing availability interruption",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
deploy_path = os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))
|
deploy_path = os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
|
@ -47,10 +53,6 @@ if args.fill_cache:
|
||||||
logging.info("Filling memcached caches")
|
logging.info("Filling memcached caches")
|
||||||
subprocess.check_call(["./manage.py", "fill_memcached_caches"])
|
subprocess.check_call(["./manage.py", "fill_memcached_caches"])
|
||||||
|
|
||||||
core_server_services = ["zulip-django"]
|
|
||||||
if os.path.exists("/etc/supervisor/conf.d/zulip/thumbor.conf"):
|
|
||||||
core_server_services.append("zulip-thumbor")
|
|
||||||
|
|
||||||
current_symlink = os.path.join(DEPLOYMENTS_DIR, "current")
|
current_symlink = os.path.join(DEPLOYMENTS_DIR, "current")
|
||||||
last_symlink = os.path.join(DEPLOYMENTS_DIR, "last")
|
last_symlink = os.path.join(DEPLOYMENTS_DIR, "last")
|
||||||
change_symlink = os.readlink(current_symlink) != deploy_path
|
change_symlink = os.readlink(current_symlink) != deploy_path
|
||||||
|
@ -61,11 +63,57 @@ if change_symlink:
|
||||||
config_file = get_config_file()
|
config_file = get_config_file()
|
||||||
tornado_ports = get_tornado_ports(config_file)
|
tornado_ports = get_tornado_ports(config_file)
|
||||||
|
|
||||||
# We restart just the zulip-tornado service early, in order to
|
# Start by restarting the workers and similar processes, one at a
|
||||||
|
# time. Workers can always support processing events with old event
|
||||||
|
# contents, but cannot necessarily understand events enqueued by a
|
||||||
|
# newer Django process. Restarting them one at a time, rather than
|
||||||
|
# all-at-once, minimizes the downtime of each, and reduces startup
|
||||||
|
# contention.
|
||||||
|
#
|
||||||
|
# For "start" or less-graceful circumstances, we don't need to
|
||||||
|
# iterate; we'll stop all of them at once, and start them all later.
|
||||||
|
# In those cases, using the glob form is faster -- but if we do need
|
||||||
|
# to iterate, we need to expand the glob.
|
||||||
|
if action == "start" or args.less_graceful:
|
||||||
|
workers = ["zulip-workers:*"]
|
||||||
|
else:
|
||||||
|
worker_status = subprocess.run(
|
||||||
|
["supervisorctl", "status", "zulip-workers:*"],
|
||||||
|
universal_newlines=True,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
)
|
||||||
|
# `supervisorctl status` returns 3 if any are stopped, which is fine here.
|
||||||
|
if worker_status.returncode not in (0, 3):
|
||||||
|
worker_status.check_returncode()
|
||||||
|
workers = [status_line.split()[0] for status_line in worker_status.stdout.splitlines()]
|
||||||
|
|
||||||
|
if os.path.exists("/etc/supervisor/conf.d/zulip/thumbor.conf"):
|
||||||
|
workers.append("zulip-thumbor")
|
||||||
|
if os.path.exists("/etc/supervisor/conf.d/zulip/zulip_db.conf"):
|
||||||
|
workers.append("process-fts-updates")
|
||||||
|
|
||||||
|
if action == "restart":
|
||||||
|
if args.less_graceful:
|
||||||
|
# The less graceful form stops every worker now; we start them
|
||||||
|
# back up at the end.
|
||||||
|
logging.info("Stopping workers")
|
||||||
|
subprocess.check_call(["supervisorctl", "stop", *workers])
|
||||||
|
else:
|
||||||
|
# We cannot pass all of these to one `supervisorctl restart`
|
||||||
|
# because that takes them all down at once, waits until they are
|
||||||
|
# all down, and then brings them back up; doing them sequentially
|
||||||
|
# requires multiple `supervisorctl restart` calls.
|
||||||
|
for worker in workers:
|
||||||
|
logging.info("Restarting %s", worker)
|
||||||
|
subprocess.check_call(["supervisorctl", "restart", worker])
|
||||||
|
|
||||||
|
# Next, we restart the Tornado processes sequentially, in order to
|
||||||
# minimize downtime of the tornado service caused by too many Python
|
# minimize downtime of the tornado service caused by too many Python
|
||||||
# processes restarting at the same time resulting in it receiving
|
# processes restarting at the same time, resulting in each receiving
|
||||||
# insufficient priority. This is important, because Tornado is the
|
# insufficient priority. This is important, because Tornado is the
|
||||||
# main source of user-visible downtime when we restart a Zulip server.
|
# main source of user-visible downtime when we restart a Zulip server.
|
||||||
|
# We do this before restarting Django, in case there are new event
|
||||||
|
# types which it will need to know how to deal with.
|
||||||
if len(tornado_ports) > 1:
|
if len(tornado_ports) > 1:
|
||||||
for p in tornado_ports:
|
for p in tornado_ports:
|
||||||
# Restart Tornado processes individually for a better rate of
|
# Restart Tornado processes individually for a better rate of
|
||||||
|
@ -78,26 +126,22 @@ else:
|
||||||
logging.info("%s Tornado process", verbing)
|
logging.info("%s Tornado process", verbing)
|
||||||
subprocess.check_call(["supervisorctl", action, "zulip-tornado", "zulip-tornado:*"])
|
subprocess.check_call(["supervisorctl", action, "zulip-tornado", "zulip-tornado:*"])
|
||||||
|
|
||||||
# Restart the uWSGI and related processes via supervisorctl.
|
# Finally, restart the Django uWSGI processes.
|
||||||
if action == "restart":
|
logging.info("%s django server", verbing)
|
||||||
logging.info("Stopping workers")
|
subprocess.check_call(["supervisorctl", action, "zulip-django"])
|
||||||
subprocess.check_call(["supervisorctl", "stop", "zulip-workers:*"])
|
|
||||||
logging.info("Stopping server core")
|
|
||||||
subprocess.check_call(["supervisorctl", "stop", *core_server_services])
|
|
||||||
|
|
||||||
logging.info("Starting server core")
|
# If we were doing this non-gracefully, or starting as opposed to
|
||||||
subprocess.check_call(["supervisorctl", "start", *reversed(core_server_services)])
|
# restarting, we need to turn the workers (back) on. There's no
|
||||||
logging.info("Starting workers")
|
# advantage to doing this not-all-at-once.
|
||||||
subprocess.check_call(["supervisorctl", "start", "zulip-workers:*"])
|
if action == "start" or args.less_graceful:
|
||||||
|
logging.info("Starting workers")
|
||||||
|
subprocess.check_call(["supervisorctl", "start", *workers])
|
||||||
|
|
||||||
using_sso = subprocess.check_output(["./scripts/get-django-setting", "USING_APACHE_SSO"])
|
using_sso = subprocess.check_output(["./scripts/get-django-setting", "USING_APACHE_SSO"])
|
||||||
if using_sso.strip() == b"True":
|
if using_sso.strip() == b"True":
|
||||||
logging.info("Restarting Apache WSGI process...")
|
logging.info("Restarting Apache WSGI process...")
|
||||||
subprocess.check_call(["pkill", "-f", "apache2", "-u", "zulip"])
|
subprocess.check_call(["pkill", "-f", "apache2", "-u", "zulip"])
|
||||||
|
|
||||||
if os.path.exists("/etc/supervisor/conf.d/zulip/zulip_db.conf"):
|
|
||||||
subprocess.check_call(["supervisorctl", action, "process-fts-updates"])
|
|
||||||
|
|
||||||
logging.info("Done!")
|
logging.info("Done!")
|
||||||
print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)
|
print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue