zulip/scripts/restart-server

113 lines
4.1 KiB
Plaintext
Raw Normal View History

#!/usr/bin/env python3
import argparse
import logging
import os
import pwd
import shlex
import subprocess
import sys
import time
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from scripts.lib.zulip_tools import (
DEPLOYMENTS_DIR,
ENDC,
OKGREEN,
WARNING,
get_config_file,
get_tornado_ports,
overwrite_symlink,
)
logging.Formatter.converter = time.gmtime
logging.basicConfig(format="%(asctime)s restart-server: %(message)s", level=logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument('--fill-cache', action='store_true', help='Fill the memcached caches')
args = parser.parse_args()
deploy_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))
os.chdir(deploy_path)
if pwd.getpwuid(os.getuid()).pw_name != "zulip":
logging.error("Must be run as user 'zulip'.")
sys.exit(1)
# Send a statsd event on restarting the server
subprocess.check_call(
["./manage.py", "send_stats", "incr", "events.server_restart", str(int(time.time()))]
)
if args.fill_cache:
logging.info("Filling memcached caches")
subprocess.check_call(["./manage.py", "fill_memcached_caches"])
dependencies: Remove WebSockets system for sending messages. Zulip has had a small use of WebSockets (specifically, for the code path of sending messages, via the webapp only) since ~2013. We originally added this use of WebSockets in the hope that the latency benefits of doing so would allow us to avoid implementing a markdown local echo; they were not. Further, HTTP/2 may have eliminated the latency difference we hoped to exploit by using WebSockets in any case. While we’d originally imagined using WebSockets for other endpoints, there was never a good justification for moving more components to the WebSockets system. This WebSockets code path had a lot of downsides/complexity, including: * The messy hack involving constructing an emulated request object to hook into doing Django requests. * The `message_senders` queue processor system, which increases RAM needs and must be provisioned independently from the rest of the server). * A duplicate check_send_receive_time Nagios test specific to WebSockets. * The requirement for users to have their firewalls/NATs allow WebSocket connections, and a setting to disable them for networks where WebSockets don’t work. * Dependencies on the SockJS family of libraries, which has at times been poorly maintained, and periodically throws random JavaScript exceptions in our production environments without a deep enough traceback to effectively investigate. * A total of about 1600 lines of our code related to the feature. * Increased load on the Tornado system, especially around a Zulip server restart, and especially for large installations like zulipchat.com, resulting in extra delay before messages can be sent again. As detailed in https://github.com/zulip/zulip/pull/12862#issuecomment-536152397, it appears that removing WebSockets moderately increases the time it takes for the `send_message` API query to return from the server, but does not significantly change the time between when a message is sent and when it is received by clients. We don’t understand the reason for that change (suggesting the possibility of a measurement error), and even if it is a real change, we consider that potential small latency regression to be acceptable. If we later want WebSockets, we’ll likely want to just use Django Channels. Signed-off-by: Anders Kaseorg <anders@zulipchat.com>
2019-07-23 01:43:40 +02:00
core_server_services = ["zulip-django"]
if os.path.exists("/etc/supervisor/conf.d/thumbor.conf"):
core_server_services.append("zulip-thumbor")
current_symlink = os.path.join(DEPLOYMENTS_DIR, "current")
last_symlink = os.path.join(DEPLOYMENTS_DIR, "last")
change_symlink = os.readlink(current_symlink) != deploy_path
if change_symlink:
overwrite_symlink(os.readlink(current_symlink), last_symlink)
overwrite_symlink(deploy_path, current_symlink)
config_file = get_config_file()
tornado_ports = get_tornado_ports(config_file)
# We restart just the zulip-tornado service early, in order to
# minimize downtime of the tornado service caused by too many Python
# processes restarting at the same time resulting in it receiving
# insufficient priority. This is important, because Tornado is the
# main source of user-visible downtime when we restart a Zulip server.
if len(tornado_ports) > 1:
for p in tornado_ports:
# Restart Tornado processes individually for a better rate of
# restarts. This also avoids behavior with restarting a whole
# supervisord group where if any individual process is slow to
# stop, the whole bundle stays stopped for an extended time.
logging.info("Restarting Tornado process on port %s", p)
subprocess.check_call(["supervisorctl", "restart", f"zulip-tornado:zulip-tornado-port-{p}"])
else:
logging.info("Restarting Tornado process")
subprocess.check_call(["supervisorctl", "restart", "zulip-tornado", "zulip-tornado:*"])
# Restart the uWSGI and related processes via supervisorctl.
logging.info("Stopping workers")
subprocess.check_call(["supervisorctl", "stop", "zulip-workers:*"])
logging.info("Stopping server core")
subprocess.check_call(["supervisorctl", "stop", *core_server_services])
logging.info("Starting server core")
subprocess.check_call(["supervisorctl", "start", *reversed(core_server_services)])
logging.info("Starting workers")
subprocess.check_call(["supervisorctl", "start", "zulip-workers:*"])
using_sso = subprocess.check_output(['./scripts/get-django-setting', 'USING_APACHE_SSO'])
if using_sso.strip() == b'True':
logging.info("Restarting Apache WSGI process...")
subprocess.check_call(["pkill", "-f", "apache2", "-u", "zulip"])
if os.path.exists("/etc/supervisor/conf.d/zulip_db.conf"):
subprocess.check_call(["supervisorctl", "restart", "process-fts-updates"])
logging.info("Done!")
print(OKGREEN + "Application restarted successfully!" + ENDC)
if change_symlink and "PWD" in os.environ:
for symlink in [last_symlink, current_symlink]:
if os.path.commonprefix([os.environ["PWD"], symlink]) == symlink:
print(
"""
{}Your shell entered its current directory through a symlink:
{}
which has now changed. Your shell will not see this change until you run:
cd {}
to traverse the symlink again.{}
""".format(
WARNING, symlink, shlex.quote(os.environ["PWD"]), ENDC
),
file=sys.stderr,
)