mirror of https://github.com/zulip/zulip.git
265 lines
10 KiB
Python
Executable File
265 lines
10 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import contextlib
|
|
import logging
|
|
import os
|
|
import pwd
|
|
import shlex
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
|
|
# Make the directory above this script importable, so that the
# `scripts.` package resolves in the imports that follow.
_repo_root = os.path.join(os.path.dirname(__file__), "..")
sys.path.append(_repo_root)

from scripts.lib.setup_path import setup_path

# Runs before the remaining project imports below.
# NOTE(review): presumably this finishes configuring sys.path for the
# deployment's Python environment -- confirm in scripts/lib/setup_path.
setup_path()
|
|
|
|
from scripts.lib.supervisor import list_supervisor_processes
|
|
from scripts.lib.zulip_tools import (
|
|
DEPLOYMENTS_DIR,
|
|
ENDC,
|
|
OKGREEN,
|
|
WARNING,
|
|
get_config,
|
|
get_config_file,
|
|
get_tornado_ports,
|
|
has_application_server,
|
|
has_process_fts_updates,
|
|
overwrite_symlink,
|
|
start_arg_parser,
|
|
su_to_zulip,
|
|
)
|
|
|
|
# The mode is chosen from the name the script was invoked under: a path
# ending in "restart-server" restarts, anything else starts.
action = "restart" if sys.argv[0].endswith("restart-server") else "start"
# Capitalized "-ing" form of the action, used in progress log lines.
verbing = f"{action.title()}ing"

# Render log timestamps with time.gmtime (UTC) rather than local time.
logging.Formatter.converter = time.gmtime
logging.basicConfig(format=f"%(asctime)s {action}-server: %(message)s", level=logging.INFO)

# The set of accepted options comes from the shared parser factory.
parser = start_arg_parser(action=action, add_help=True)
args = parser.parse_args()
|
|
|
|
# Resolve (following symlinks) the deployment directory that contains this
# script, and work from there: later commands use relative paths such as
# ./manage.py and ./scripts/...
deploy_path = os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))
os.chdir(deploy_path)

# Only root and zulip may run this script; anyone else is rejected.
username = pwd.getpwuid(os.getuid()).pw_name
if username not in ("root", "zulip"):
    logging.error("Must be run as user 'zulip'.")
    sys.exit(1)
if username == "root":
    # NOTE(review): presumably switches the process to the zulip user --
    # confirm in scripts/lib/zulip_tools.
    su_to_zulip()
|
|
|
|
|
|
# Optional pre-flight steps, run in order.  Each entry is
# (enabled, log message, command); a failing command raises
# CalledProcessError and aborts the script before any service is touched.
preflight_steps = [
    (
        not args.skip_checks,
        "Running syntax and database checks",
        ["./manage.py", "check", "--database", "default"],
    ),
    (
        args.fill_cache,
        "Filling memcached caches",
        ["./manage.py", "fill_memcached_caches", "--automated", "--skip-checks"],
    ),
]
for step_enabled, step_message, step_command in preflight_steps:
    if step_enabled:
        logging.info(step_message)
        subprocess.check_call(step_command)
|
|
|
|
# Keep the "current" and "last" deployment symlinks up to date: if
# "current" does not already point at this deployment, remember its old
# target as "last" and then repoint "current" here.
current_symlink = os.path.join(DEPLOYMENTS_DIR, "current")
last_symlink = os.path.join(DEPLOYMENTS_DIR, "last")
previous_target = os.readlink(current_symlink)
# Also consulted at the very end of the script, to decide whether to warn
# the user that their shell's working directory went through a moved link.
change_symlink = previous_target != deploy_path
if change_symlink:
    overwrite_symlink(previous_target, last_symlink)
    overwrite_symlink(deploy_path, current_symlink)
|
|
|
|
config_file = get_config_file()
tornado_ports = get_tornado_ports(config_file)

# Supervisor process names (or globs) that this run will stop/restart/start.
workers = []

if has_application_server():
    # Workers and similar processes are handled first.  Workers can always
    # process events with old event contents, but cannot necessarily
    # understand events enqueued by a newer Django process.
    #
    # On a graceful restart they are cycled one at a time, which minimizes
    # the downtime of each and reduces startup contention; that needs the
    # glob expanded into individual process names.  For "start", or a
    # less-graceful restart, everything is stopped at once and started
    # again later, so the (faster) glob form is enough.
    if action != "start" and not args.less_graceful:
        workers += list_supervisor_processes(["zulip-workers:*"])
    else:
        workers.append("zulip-workers:*")

    if has_application_server(once=True):
        # These used to be part of "zulip-workers:*".  We may be restarting
        # an older version of Zulip which has not yet applied puppet to
        # reload the new process list, so only include them if
        # `supervisorctl` currently knows about them.
        workers += list_supervisor_processes(
            [
                "zulip_deliver_scheduled_emails",
                "zulip_deliver_scheduled_messages",
            ]
        )

    # Optional service; it may or may not exist.
    workers += list_supervisor_processes(["zulip-katex"])

    # This one contains Python code which reads settings.py, so it must be
    # restarted on every deploy.  A brief outage while it restarts is fine,
    # as clients transparently retry.
    workers.append("zulip-tus")

if has_process_fts_updates():
    workers.append("process-fts-updates")
|
|
|
|
# Before touching the main services, bring up any optional auxiliary
# services that we expect to be running but currently are not.  We never
# stop or restart these here: they pick up new versions through updated
# supervisor files, and puppet restarts them -- this script only starts
# them.
aux_services = list_supervisor_processes(["go-camo", "smokescreen"], only_running=False)
if aux_services:
    start_aux_command = ["supervisorctl", "start", *aux_services]
    subprocess.check_call(start_aux_command)
|
|
|
|
# Everything this script manages: the workers plus both application servers.
managed_services = (*workers, "zulip-django", "zulip-tornado:*")

if action == "restart" and not list_supervisor_processes(
    list(managed_services), only_running=True
):
    # Neither the workers nor the application servers are running, so this
    # is really a "start", not a restart -- which means the workers get
    # deferred until the end.
    action = "start"
    verbing = "Starting"
elif action == "start":
    existing_services = list_supervisor_processes(list(managed_services))
    running_services = list_supervisor_processes(list(managed_services), only_running=True)
    if existing_services == running_services:
        # Every known service is already up; there is nothing left to start.
        logging.info("Zulip is already started; nothing to do!")
        sys.exit(0)
|
|
|
|
|
|
def restart_or_start(service: str) -> None:
    """Apply the script-wide ``action`` to one supervisor service or glob.

    The module-level ``action`` is adjusted to the service's actual state:

    * "restart" on a service with no running processes becomes "start";
    * "start" on a service whose processes are all already running is
      logged and skipped.

    Raises ``subprocess.CalledProcessError`` if ``supervisorctl`` fails.
    """
    known = list_supervisor_processes([service])
    running = list_supervisor_processes([service], only_running=True)

    if action == "start" and known == running:
        logging.info("%s already started!", service)
        return

    # Nothing to restart if nothing is running; just start it instead.
    verb = "start" if action == "restart" and not running else action
    subprocess.check_call(["supervisorctl", verb, service])
|
|
|
|
|
|
if action == "restart" and workers:
    if not args.less_graceful:
        # Handing all of these to a single `supervisorctl restart` would
        # take them all down at once, wait until they are all down, and
        # only then bring them back up.  Doing them sequentially requires
        # one `supervisorctl restart` call per worker.
        for worker in workers:
            logging.info("Restarting %s", worker)
            restart_or_start(worker)
    else:
        # The less graceful form stops every worker now; they are started
        # back up at the end of the script.
        logging.info("Stopping workers")
        subprocess.check_call(["supervisorctl", "stop", *workers])
|
|
|
|
if has_application_server():
    # Tornado goes next, one process at a time.  Too many Python processes
    # restarting simultaneously leaves each with insufficient priority, and
    # Tornado being unavailable for too long is the main source of
    # user-visible downtime when a Zulip server restarts.  This happens
    # before Django, in case there are new event types which Tornado will
    # need to know how to deal with.
    if len(tornado_ports) > 1:
        for port in tornado_ports:
            # Restarting processes individually gives a better rate of
            # restarts, and avoids the supervisord group behavior where
            # one process that is slow to stop keeps the whole bundle
            # stopped for an extended time.
            logging.info("%s Tornado process on port %s", verbing, port)
            restart_or_start(f"zulip-tornado:zulip-tornado-port-{port}")
    else:
        logging.info("%s Tornado process", verbing)
        restart_or_start("zulip-tornado:*")

    # Finally, the Django uWSGI processes.  A rolling (chain) restart is
    # used only for a graceful restart, when enabled in the config file and
    # when the uWSGI control FIFO exists.
    use_rolling_restart = (
        action == "restart"
        and not args.less_graceful
        and get_config(config_file, "application_server", "rolling_restart", False)
        and os.path.exists("/home/zulip/deployments/uwsgi-control")
    )
    if not use_rolling_restart:
        logging.info("%s django server", verbing)
        restart_or_start("zulip-django")
    else:
        # A zero exit status from `supervisorctl status` is taken to mean
        # that Django is currently running.
        uwsgi_status = subprocess.run(
            ["supervisorctl", "status", "zulip-django"],
            stdout=subprocess.DEVNULL,
            check=False,
        )
        if uwsgi_status.returncode != 0:
            logging.info("Starting django server")
            subprocess.check_call(["supervisorctl", "start", "zulip-django"])
        else:
            logging.info("Starting rolling restart of django server")
            # Remove any stale readiness marker, so that its reappearance
            # signals that this particular reload has completed.
            with contextlib.suppress(FileNotFoundError):
                os.unlink("/var/lib/zulip/django-workers.ready")
            with open("/home/zulip/deployments/uwsgi-control", "w") as control_socket:
                # "c" is chain-reloading:
                # https://uwsgi-docs.readthedocs.io/en/latest/MasterFIFO.html#available-commands
                control_socket.write("c")
            # Poll once per second until the marker file exists, logging a
            # heartbeat every fifth second.  There is no timeout here.
            seconds_waited = 0
            while not os.path.exists("/var/lib/zulip/django-workers.ready"):
                time.sleep(1)
                seconds_waited += 1
                if seconds_waited % 5 == 0:
                    logging.info("...")
            logging.info("Chain reloading complete")

    # When the USING_APACHE_SSO setting is enabled, Apache's WSGI processes
    # are killed as well.
    using_sso = subprocess.check_output(["./scripts/get-django-setting", "USING_APACHE_SSO"])
    if using_sso.strip() == b"True":
        logging.info("Restarting Apache WSGI process...")
        subprocess.check_call(["pkill", "-x", "apache2", "-u", "zulip"])
|
|
|
|
# When starting (as opposed to restarting), or when the restart was done
# non-gracefully, the workers still need to be turned (back) on.  There is
# no advantage to doing this one at a time, so a single supervisorctl call
# handles all of them.
workers_are_down = action == "start" or args.less_graceful
if workers_are_down:
    # NOTE(review): only_running=False presumably narrows this to processes
    # that are not currently running -- confirm in scripts/lib/supervisor.
    workers = list_supervisor_processes(workers, only_running=False)
    if workers:
        logging.info("Starting workers")
        subprocess.check_call(["supervisorctl", "start", *workers])
|
|
|
|
if has_application_server() and not args.skip_client_reloads:
    # Every server has been (re)started by now; enqueue events in the
    # Tornado servers which tell clients to reload.
    subprocess.check_call(["./scripts/reload-clients"])

logging.info("Done!")
# "start" / "restart" + "ed" -> "started" / "restarted".
print(f"{OKGREEN}Zulip {action}ed successfully!{ENDC}")
|
|
|
|
# If the deployment symlinks were repointed above and the invoking shell's
# working directory was reached through one of them, the shell is still
# sitting in the old target.  Tell the user how to re-resolve the link.
if change_symlink and "PWD" in os.environ:
    shell_pwd = os.environ["PWD"]
    for symlink in [last_symlink, current_symlink]:
        # Compare whole path components.  os.path.commonprefix() works
        # character by character, so it would also match sibling
        # directories that merely share a name prefix with the symlink
        # (e.g. ".../current-old" for ".../current").
        if shell_pwd == symlink or shell_pwd.startswith(symlink + os.sep):
            print(
                """
{}Your shell entered its current directory through a symlink:
{}
which has now changed. Your shell will not see this change until you run:
cd {}
to traverse the symlink again.{}
""".format(WARNING, symlink, shlex.quote(shell_pwd), ENDC),
                file=sys.stderr,
            )
|