diff --git a/scripts/restart-server b/scripts/restart-server index fe685bd968..d35eeefcf1 100755 --- a/scripts/restart-server +++ b/scripts/restart-server @@ -29,6 +29,12 @@ logging.basicConfig(format=f"%(asctime)s {action}-server: %(message)s", level=lo parser = argparse.ArgumentParser() parser.add_argument("--fill-cache", action="store_true", help="Fill the memcached caches") +if action == "restart": + parser.add_argument( + "--less-graceful", + action="store_true", + help="Restart with more concern for expediency than minimizing availability interruption", + ) args = parser.parse_args() deploy_path = os.path.realpath(os.path.join(os.path.dirname(__file__), "..")) @@ -47,10 +53,6 @@ if args.fill_cache: logging.info("Filling memcached caches") subprocess.check_call(["./manage.py", "fill_memcached_caches"]) -core_server_services = ["zulip-django"] -if os.path.exists("/etc/supervisor/conf.d/zulip/thumbor.conf"): - core_server_services.append("zulip-thumbor") - current_symlink = os.path.join(DEPLOYMENTS_DIR, "current") last_symlink = os.path.join(DEPLOYMENTS_DIR, "last") change_symlink = os.readlink(current_symlink) != deploy_path @@ -61,11 +63,57 @@ if change_symlink: config_file = get_config_file() tornado_ports = get_tornado_ports(config_file) -# We restart just the zulip-tornado service early, in order to +# Start by restarting the workers and similar processes, one at a +# time. Workers can always support processing events with old event +# contents, but cannot necessarily understand events enqueued by a +# newer Django process. Restarting them one at a time, rather than +# all-at-once, minimizes the downtime of each, and reduces startup +# contention. +# +# For "start" or less-graceful circumstances, we don't need to +# iterate; we'll stop all of them at once, and start them all later. +# In those cases, using the glob form is faster -- but if we do need +# to iterate, we need to expand the glob. 
+if action == "start" or args.less_graceful: + workers = ["zulip-workers:*"] +else: + worker_status = subprocess.run( + ["supervisorctl", "status", "zulip-workers:*"], + universal_newlines=True, + stdout=subprocess.PIPE, + ) + # `supervisorctl status` returns 3 if any are stopped, which is fine here. + if worker_status.returncode not in (0, 3): + worker_status.check_returncode() + workers = [status_line.split()[0] for status_line in worker_status.stdout.splitlines()] + +if os.path.exists("/etc/supervisor/conf.d/zulip/thumbor.conf"): + workers.append("zulip-thumbor") +if os.path.exists("/etc/supervisor/conf.d/zulip/zulip_db.conf"): + workers.append("process-fts-updates") + +if action == "restart": + if args.less_graceful: + # The less graceful form stops every worker now; we start them + # back up at the end. + logging.info("Stopping workers") + subprocess.check_call(["supervisorctl", "stop", *workers]) + else: + # We cannot pass all of these to one `supervisorctl restart` + # because that takes them all down at once, waits until they are + # all down, and then brings them back up; doing them sequentially + # requires multiple `supervisorctl restart` calls. + for worker in workers: + logging.info("Restarting %s", worker) + subprocess.check_call(["supervisorctl", "restart", worker]) + +# Next, we restart the Tornado processes sequentially, in order to # minimize downtime of the tornado service caused by too many Python -# processes restarting at the same time resulting in it receiving +# processes restarting at the same time, resulting in each receiving # insufficient priority. This is important, because Tornado is the # main source of user-visible downtime when we restart a Zulip server. +# We do this before restarting Django, in case there are new event +# types which it will need to know how to deal with. 
if len(tornado_ports) > 1: for p in tornado_ports: # Restart Tornado processes individually for a better rate of @@ -78,26 +126,22 @@ else: logging.info("%s Tornado process", verbing) subprocess.check_call(["supervisorctl", action, "zulip-tornado", "zulip-tornado:*"]) -# Restart the uWSGI and related processes via supervisorctl. -if action == "restart": - logging.info("Stopping workers") - subprocess.check_call(["supervisorctl", "stop", "zulip-workers:*"]) - logging.info("Stopping server core") - subprocess.check_call(["supervisorctl", "stop", *core_server_services]) +# Finally, restart the Django uWSGI processes. +logging.info("%s django server", verbing) +subprocess.check_call(["supervisorctl", action, "zulip-django"]) -logging.info("Starting server core") -subprocess.check_call(["supervisorctl", "start", *reversed(core_server_services)]) -logging.info("Starting workers") -subprocess.check_call(["supervisorctl", "start", "zulip-workers:*"]) +# If we were doing this non-gracefully, or starting as opposed to +# restarting, we need to turn the workers (back) on. There's no +# advantage to doing this not-all-at-once. +if action == "start" or args.less_graceful: + logging.info("Starting workers") + subprocess.check_call(["supervisorctl", "start", *workers]) using_sso = subprocess.check_output(["./scripts/get-django-setting", "USING_APACHE_SSO"]) if using_sso.strip() == b"True": logging.info("Restarting Apache WSGI process...") subprocess.check_call(["pkill", "-f", "apache2", "-u", "zulip"]) -if os.path.exists("/etc/supervisor/conf.d/zulip/zulip_db.conf"): - subprocess.check_call(["supervisorctl", action, "process-fts-updates"]) - logging.info("Done!") print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)