#!/usr/bin/env python3
"""Start or restart the supervisord-managed Zulip processes.

The mode is derived from the name the script is invoked under: a name
ending in ``restart-server`` selects "restart"; any other name selects
"start".  In order, the script:

1. changes into the deployment directory and insists on running as the
   ``zulip`` user;
2. sends a ``events.server_restart`` stats event and, with
   ``--fill-cache``, fills the memcached caches;
3. repoints the ``last``/``current`` deployment symlinks if ``current``
   does not already point at this deployment;
4. starts/restarts the workers, the Tornado processes, and Django via
   ``supervisorctl``;
5. signals Apache when ``USING_APACHE_SSO`` is enabled;
6. warns on stderr if the invoking shell's ``$PWD`` runs through one of
   the symlinks that were just changed.
"""
import argparse
import logging
import os
import pwd
import shlex
import subprocess
import sys
import time

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from scripts.lib.zulip_tools import (
    DEPLOYMENTS_DIR,
    ENDC,
    OKGREEN,
    WARNING,
    get_config_file,
    get_tornado_ports,
    overwrite_symlink,
)

# The same file serves both purposes; which one depends on the name it
# was invoked as.
action = "restart"
if not sys.argv[0].endswith("restart-server"):
    action = "start"
verbing = action.title() + "ing"

# Log timestamps in UTC.
logging.Formatter.converter = time.gmtime
logging.basicConfig(format=f"%(asctime)s {action}-server: %(message)s", level=logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument("--fill-cache", action="store_true", help="Fill the memcached caches")
if action == "restart":
    # Only meaningful for a restart, so `args.less_graceful` exists only
    # in that mode; every use below is guarded by a check on `action`.
    parser.add_argument(
        "--less-graceful",
        action="store_true",
        help="Restart with more concern for expediency than minimizing availability interruption",
    )
args = parser.parse_args()

deploy_path = os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))
os.chdir(deploy_path)

if pwd.getpwuid(os.getuid()).pw_name != "zulip":
    logging.error("Must be run as user 'zulip'.")
    sys.exit(1)

# Send a statsd event on restarting the server
subprocess.check_call(
    ["./manage.py", "send_stats", "incr", "events.server_restart", str(int(time.time()))]
)

if args.fill_cache:
    logging.info("Filling memcached caches")
    subprocess.check_call(["./manage.py", "fill_memcached_caches"])

current_symlink = os.path.join(DEPLOYMENTS_DIR, "current")
last_symlink = os.path.join(DEPLOYMENTS_DIR, "last")
change_symlink = os.readlink(current_symlink) != deploy_path
if change_symlink:
    # Remember the previous deployment as "last" before moving "current".
    overwrite_symlink(os.readlink(current_symlink), last_symlink)
    overwrite_symlink(deploy_path, current_symlink)

config_file = get_config_file()
tornado_ports = get_tornado_ports(config_file)

# Start by restarting the workers and similar processes, one at a
# time.  Workers can always support processing events with old event
# contents, but cannot necessarily understand events enqueued by a
# newer Django process.  Restarting them one at a time, rather than
# all-at-once, minimizes the downtime of each, and reduces startup
# contention.
#
# For "start" or less-graceful circumstances, we don't need to
# iterate; we'll stop all of them at once, and start them all later.
# In those cases, using the glob form is faster -- but if we do need
# to iterate, we need to expand the glob.
if action == "start" or args.less_graceful:
    workers = ["zulip-workers:*"]
else:
    worker_status = subprocess.run(
        ["supervisorctl", "status", "zulip-workers:*"],
        universal_newlines=True,
        stdout=subprocess.PIPE,
    )
    # `supervisorctl status` returns 3 if any are stopped, which is fine here.
    if worker_status.returncode not in (0, 3):
        worker_status.check_returncode()
    # The process name is the first whitespace-separated field of each
    # status line; blank lines are skipped so that `split()[0]` cannot
    # raise IndexError.
    workers = [
        status_line.split()[0]
        for status_line in worker_status.stdout.splitlines()
        if status_line.strip()
    ]

if os.path.exists("/etc/supervisor/conf.d/zulip/zulip_db.conf"):
    workers.append("process-fts-updates")

if action == "restart":
    if args.less_graceful:
        # The less graceful form stops every worker now; we start them
        # back up at the end.
        logging.info("Stopping workers")
        subprocess.check_call(["supervisorctl", "stop", *workers])
    else:
        # We cannot pass all of these to one `supervisorctl restart`
        # because that takes them all down at once, waits until they are
        # all down, and then brings them back up; doing them sequentially
        # requires multiple `supervisorctl restart` calls.
        for worker in workers:
            logging.info("Restarting %s", worker)
            subprocess.check_call(["supervisorctl", "restart", worker])

# Next, we restart the Tornado processes sequentially, in order to
# minimize downtime of the tornado service caused by too many Python
# processes restarting at the same time, resulting in each receiving
# insufficient priority.  This is important, because Tornado is the
# main source of user-visible downtime when we restart a Zulip server.
# We do this before restarting Django, in case there are new event
# types which it will need to know how to deal with.
if len(tornado_ports) > 1:
    for p in tornado_ports:
        # Restart Tornado processes individually for a better rate of
        # restarts.  This also avoids behavior with restarting a whole
        # supervisord group where if any individual process is slow to
        # stop, the whole bundle stays stopped for an extended time.
        logging.info("%s Tornado process on port %s", verbing, p)
        subprocess.check_call(["supervisorctl", action, f"zulip-tornado:zulip-tornado-port-{p}"])
else:
    logging.info("%s Tornado process", verbing)
    subprocess.check_call(["supervisorctl", action, "zulip-tornado", "zulip-tornado:*"])

# Finally, restart the Django uWSGI processes.
logging.info("%s django server", verbing)
subprocess.check_call(["supervisorctl", action, "zulip-django"])

# If we were doing this non-gracefully, or starting as opposed to
# restarting, we need to turn the workers (back) on.  There's no
# advantage to doing this not-all-at-once.
if action == "start" or args.less_graceful:
    logging.info("Starting workers")
    subprocess.check_call(["supervisorctl", "start", *workers])

using_sso = subprocess.check_output(["./scripts/get-django-setting", "USING_APACHE_SSO"])
if using_sso.strip() == b"True":
    logging.info("Restarting Apache WSGI process...")
    # Signals the `apache2` processes owned by user zulip.
    # NOTE(review): presumably Apache respawns them -- confirm.
    subprocess.check_call(["pkill", "-x", "apache2", "-u", "zulip"])

logging.info("Done!")
print(OKGREEN + f"Zulip {action}ed successfully!" + ENDC)

if change_symlink and "PWD" in os.environ:
    shell_pwd = os.environ["PWD"]
    for symlink in [last_symlink, current_symlink]:
        # Compare on path-component boundaries.  A plain character-wise
        # prefix test (os.path.commonprefix) would also match sibling
        # directories such as "<deployments>/current-old" and print a
        # spurious warning.
        if shell_pwd == symlink or shell_pwd.startswith(symlink + os.sep):
            print(
                """
{}Your shell entered its current directory through a symlink:
  {}
which has now changed. Your shell will not see this change until you run:
  cd {}
to traverse the symlink again.{}
""".format(
                    WARNING, symlink, shlex.quote(shell_pwd), ENDC
                ),
                file=sys.stderr,
            )