From 7c4293a7d3d738e2785ca39c01aee8d42051c9ba Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Wed, 9 Mar 2022 12:24:21 -0800 Subject: [PATCH] restart-server: Check if service is running before restart, vs start. In some instances (e.g. during upgrades) we run `restart-server` and not `start-server`, even though we expect the server to most likely already be stopped. `supervisorctl restart servicename` if the service is stopped produces the perhaps-alarming message: ``` restart-server: Restarting servicename servicename: ERROR (not running) servicename: started ``` This may cause operators to worry that something is broken, when it is not. Check if the service is already running, and switch from "restart" to "start" in cases where it is not. The race condition here is safe -- if the service transitions from stopped to started between the check and the `start` call, it will merely output: ``` servicename: ERROR (already started) ``` ...and continue, as that has exit status 0. If the service transitions from started to stopped between the check and the `restart` call, we are merely back in the current case, where it outputs: ``` servicename: ERROR (not running) servicename: started ``` In none of these cases does a call to "restart" fail to result in the service being stopped and then started. --- scripts/restart-server | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/scripts/restart-server b/scripts/restart-server index 0f9c05cc00..00f5c61f6e 100755 --- a/scripts/restart-server +++ b/scripts/restart-server @@ -98,6 +98,14 @@ aux_services = list_supervisor_processes(["go-camo", "smokescreen"], only_runnin if aux_services: subprocess.check_call(["supervisorctl", "start", *aux_services]) + +def restart_or_start(service: str) -> None: + our_verb = action + if our_verb == "restart" and len(list_supervisor_processes([service], only_running=True)) == 0: + our_verb = "start" + subprocess.check_call(["supervisorctl", our_verb, service]) + + if action == "restart" and len(workers) > 0: if args.less_graceful: # The less graceful form stops every worker now; we start them @@ -111,7 +119,7 @@ if action == "restart" and len(workers) > 0: # requires multiple `supervisorctl restart` calls. for worker in workers: logging.info("Restarting %s", worker) - subprocess.check_call(["supervisorctl", "restart", worker]) + restart_or_start(worker) if has_application_server(): # Next, we restart the Tornado processes sequentially, in order to @@ -130,12 +138,10 @@ if has_application_server(): # supervisord group where if any individual process is slow to # stop, the whole bundle stays stopped for an extended time. logging.info("%s Tornado process on port %s", verbing, p) - subprocess.check_call( - ["supervisorctl", action, f"zulip-tornado:zulip-tornado-port-{p}"] - ) + restart_or_start(f"zulip-tornado:zulip-tornado-port-{p}") else: logging.info("%s Tornado process", verbing) - subprocess.check_call(["supervisorctl", action, "zulip-tornado:*"]) + restart_or_start("zulip-tornado:*") # Finally, restart the Django uWSGI processes. if ( @@ -160,7 +166,7 @@ if has_application_server(): subprocess.check_call(["supervisorctl", "start", "zulip-django"]) else: logging.info("%s django server", verbing) - subprocess.check_call(["supervisorctl", action, "zulip-django"]) + restart_or_start("zulip-django") using_sso = subprocess.check_output(["./scripts/get-django-setting", "USING_APACHE_SSO"]) if using_sso.strip() == b"True":