scripts: Add {start,stop,restart}-server support for postgresql role.

During the upgrade process of a postgresql-only Zulip installation
(`puppet_classes = zulip::profile::postgresql` in
`/etc/zulip/zulip.conf`), `scripts/start-server` and
`scripts/stop-server` fail because they try to handle supervisor
services that are not available on such a host (e.g. Tornado): only
`/etc/supervisor/conf.d/zulip/zulip_db.conf` is present, not
`/etc/supervisor/conf.d/zulip/zulip.conf`.

While this wasn't previously supported, it's a pretty reasonable thing
to do, and can be readily supported by just adding a few conditionals.
Authored by Robert Imschweiler on 2021-04-27 20:48:19 +02:00; committed by Tim Abbott.
parent 772500d1c6
commit 534d78232c
3 changed files with 71 additions and 60 deletions
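
In short, both scripts now key their supervisor calls off which config
files puppet has installed under `/etc/supervisor/conf.d/zulip/`. A
condensed sketch of the pattern, using the helper added below
(simplified for illustration; the real scripts also handle restart
ordering and graceful/less-graceful modes):

    import os
    import subprocess

    def has_application_server() -> bool:
        # Only hosts running the application server get the main supervisor config.
        return os.path.exists("/etc/supervisor/conf.d/zulip/zulip.conf")

    services = []
    if os.path.exists("/etc/supervisor/conf.d/zulip/zulip_db.conf"):
        services.append("process-fts-updates")
    if has_application_server():
        services.extend(["zulip-django", "zulip-tornado:*", "zulip-workers:*"])

    if services:
        # On a postgresql-only host this touches only process-fts-updates.
        subprocess.check_call(["supervisorctl", "stop", *services])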

scripts/lib/zulip_tools.py

@@ -599,6 +599,10 @@ def is_vagrant_env_host(path: str) -> bool:
     return ".vagrant" in os.listdir(path)
 
 
+def has_application_server() -> bool:
+    return os.path.exists("/etc/supervisor/conf.d/zulip/zulip.conf")
+
+
 def deport(netloc: str) -> str:
     """Remove the port from a hostname:port string.  Brackets on a literal
     IPv6 address are included."""

scripts/restart-server

@@ -16,6 +16,7 @@ from scripts.lib.zulip_tools import (
     WARNING,
     get_config_file,
     get_tornado_ports,
+    has_application_server,
     overwrite_symlink,
 )
@@ -62,35 +63,37 @@ if change_symlink:
 config_file = get_config_file()
 tornado_ports = get_tornado_ports(config_file)
 
-# Start by restarting the workers and similar processes, one at a
-# time.  Workers can always support processing events with old event
-# contents, but cannot necessarily understand events enqueued by a
-# newer Django process.  Restarting them one at a time, rather than
-# all-at-once, minimizes the downtime of each, and reduces startup
-# contention.
-#
-# For "start" or less-graceful circumstances, we don't need to
-# iterate; we'll stop all of them at once, and start them all later.
-# In those cases, using the glob form is faster -- but if we do need
-# to iterate, we need to expand the glob.
-if action == "start" or args.less_graceful:
-    workers = ["zulip-workers:*"]
-else:
-    worker_status = subprocess.run(
-        ["supervisorctl", "status", "zulip-workers:*"],
-        universal_newlines=True,
-        stdout=subprocess.PIPE,
-    )
-    # `supervisorctl status` returns 3 if any are stopped, which is fine here.
-    if worker_status.returncode not in (0, 3):
-        worker_status.check_returncode()
-    workers = [status_line.split()[0] for status_line in worker_status.stdout.splitlines()]
+workers = []
+if has_application_server():
+    # Start by restarting the workers and similar processes, one at a
+    # time.  Workers can always support processing events with old event
+    # contents, but cannot necessarily understand events enqueued by a
+    # newer Django process.  Restarting them one at a time, rather than
+    # all-at-once, minimizes the downtime of each, and reduces startup
+    # contention.
+    #
+    # For "start" or less-graceful circumstances, we don't need to
+    # iterate; we'll stop all of them at once, and start them all later.
+    # In those cases, using the glob form is faster -- but if we do need
+    # to iterate, we need to expand the glob.
+    if action == "start" or args.less_graceful:
+        workers.append("zulip-workers:*")
+    else:
+        worker_status = subprocess.run(
+            ["supervisorctl", "status", "zulip-workers:*"],
+            universal_newlines=True,
+            stdout=subprocess.PIPE,
+        )
+        # `supervisorctl status` returns 3 if any are stopped, which is fine here.
+        if worker_status.returncode not in (0, 3):
+            worker_status.check_returncode()
+        workers.extend(status_line.split()[0] for status_line in worker_status.stdout.splitlines())
 
 if os.path.exists("/etc/supervisor/conf.d/zulip/zulip_db.conf"):
     workers.append("process-fts-updates")
 
-if action == "restart":
+if action == "restart" and len(workers) > 0:
     if args.less_graceful:
         # The less graceful form stops every worker now; we start them
         # back up at the end.
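
For reference, the glob expansion above works because `supervisorctl
status` prints one process per line with its name in the first column;
a small illustration of the parsing (the worker names shown are
illustrative, not an exact list of Zulip's queue workers):

    status_output = """\
    zulip-workers:zulip_events_user_activity       RUNNING   pid 1234, uptime 0:05:17
    zulip-workers:zulip_deliver_scheduled_emails   STOPPED   Apr 27 08:00 PM
    """
    # Keep just the first column, i.e. the individual "zulip-workers:<name>" entries.
    workers = [line.split()[0] for line in status_output.splitlines() if line.strip()]
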
@@ -105,41 +108,44 @@ if action == "restart":
             logging.info("Restarting %s", worker)
             subprocess.check_call(["supervisorctl", "restart", worker])
 
-# Next, we restart the Tornado processes sequentially, in order to
-# minimize downtime of the tornado service caused by too many Python
-# processes restarting at the same time, resulting in each receiving
-# insufficient priority.  This is important, because Tornado is the
-# main source of user-visible downtime when we restart a Zulip server.
-# We do this before restarting Django, in case there are new event
-# types which it will need to know how to deal with.
-if len(tornado_ports) > 1:
-    for p in tornado_ports:
-        # Restart Tornado processes individually for a better rate of
-        # restarts.  This also avoids behavior with restarting a whole
-        # supervisord group where if any individual process is slow to
-        # stop, the whole bundle stays stopped for an extended time.
-        logging.info("%s Tornado process on port %s", verbing, p)
-        subprocess.check_call(["supervisorctl", action, f"zulip-tornado:zulip-tornado-port-{p}"])
-else:
-    logging.info("%s Tornado process", verbing)
-    subprocess.check_call(["supervisorctl", action, "zulip-tornado", "zulip-tornado:*"])
-
-# Finally, restart the Django uWSGI processes.
-logging.info("%s django server", verbing)
-subprocess.check_call(["supervisorctl", action, "zulip-django"])
-
-using_sso = subprocess.check_output(["./scripts/get-django-setting", "USING_APACHE_SSO"])
-if using_sso.strip() == b"True":
-    logging.info("Restarting Apache WSGI process...")
-    subprocess.check_call(["pkill", "-x", "apache2", "-u", "zulip"])
+if has_application_server():
+    # Next, we restart the Tornado processes sequentially, in order to
+    # minimize downtime of the tornado service caused by too many Python
+    # processes restarting at the same time, resulting in each receiving
+    # insufficient priority.  This is important, because Tornado is the
+    # main source of user-visible downtime when we restart a Zulip server.
+    # We do this before restarting Django, in case there are new event
+    # types which it will need to know how to deal with.
+    if len(tornado_ports) > 1:
+        for p in tornado_ports:
+            # Restart Tornado processes individually for a better rate of
+            # restarts.  This also avoids behavior with restarting a whole
+            # supervisord group where if any individual process is slow to
+            # stop, the whole bundle stays stopped for an extended time.
+            logging.info("%s Tornado process on port %s", verbing, p)
+            subprocess.check_call(
+                ["supervisorctl", action, f"zulip-tornado:zulip-tornado-port-{p}"]
+            )
+    else:
+        logging.info("%s Tornado process", verbing)
+        subprocess.check_call(["supervisorctl", action, "zulip-tornado", "zulip-tornado:*"])
+
+    # Finally, restart the Django uWSGI processes.
+    logging.info("%s django server", verbing)
+    subprocess.check_call(["supervisorctl", action, "zulip-django"])
 
 # If we were doing this non-gracefully, or starting as opposed to
 # restarting, we need to turn the workers (back) on.  There's no
 # advantage to doing this not-all-at-once.
-if action == "start" or args.less_graceful:
+if (action == "start" or args.less_graceful) and len(workers) > 0:
     logging.info("Starting workers")
     subprocess.check_call(["supervisorctl", "start", *workers])
 
+using_sso = subprocess.check_output(["./scripts/get-django-setting", "USING_APACHE_SSO"])
+if using_sso.strip() == b"True":
+    logging.info("Restarting Apache WSGI process...")
+    subprocess.check_call(["pkill", "-x", "apache2", "-u", "zulip"])
+
 logging.info("Done!")
 print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)
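
On a postgresql-only host, where `has_application_server()` is False and
only `zulip_db.conf` is installed, the restart path above reduces to
roughly the following (hypothetical trace of a plain `restart`):

    # The worker list is built without any zulip-workers entries:
    workers = ["process-fts-updates"]
    # so the script issues a single supervisor call and skips the
    # Tornado and Django sections entirely:
    subprocess.check_call(["supervisorctl", "restart", "process-fts-updates"])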

scripts/stop-server

@@ -7,7 +7,7 @@ import sys
 import time
 
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from scripts.lib.zulip_tools import ENDC, OKGREEN, WARNING
+from scripts.lib.zulip_tools import ENDC, OKGREEN, WARNING, has_application_server
 
 deploy_path = os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))
 os.chdir(deploy_path)
@@ -25,14 +25,15 @@ services = []
 if os.path.exists("/etc/supervisor/conf.d/zulip/zulip_db.conf"):
     services.append("process-fts-updates")
 
-# Contrary to the order in (re)start-server, we stop django before the
-# workers, to increase the chance that we finish processing any work
-# that may have been enqueued by the Django, leaving the final state
-# closer to "empty."  We stop Django before Tornado so it doesn't try
-# to make requests to make queues with a down'd Tornado.
-services.append("zulip-django")
-services.extend(["zulip-tornado", "zulip-tornado:*"])
-services.append("zulip-workers:*")
+if has_application_server():
+    # Contrary to the order in (re)start-server, we stop django before the
+    # workers, to increase the chance that we finish processing any work
+    # that may have been enqueued by the Django, leaving the final state
+    # closer to "empty."  We stop Django before Tornado so it doesn't try
+    # to make requests to make queues with a down'd Tornado.
+    services.append("zulip-django")
+    services.extend(["zulip-tornado", "zulip-tornado:*"])
+    services.append("zulip-workers:*")
 
 subprocess.check_call(["supervisorctl", "stop", *services])
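
Correspondingly, with only `zulip_db.conf` present, stopping a
postgresql-only host now comes down to a single call:

    subprocess.check_call(["supervisorctl", "stop", "process-fts-updates"])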