py3: Switch almost all shebang lines to use `python3`.
This causes `upgrade-zulip-from-git`, as well as a no-option run of
`tools/build-release-tarball`, to produce a Zulip install running
Python 3, rather than Python 2. In particular this means that the
virtualenv we create, in which all application code runs, is Python 3.
One shebang line, on `zulip-ec2-configure-interfaces`, explicitly
keeps Python 2, and at least one external ops script, `wal-e`, also
still runs on Python 2. See discussion on the respective previous
commits that made those explicit. There may also be some other
third-party scripts we use, outside of this source tree and running
outside our virtualenv, that still run on Python 2.
2017-08-02 23:15:16 +02:00
|
|
|
#!/usr/bin/env python3
|
2019-01-14 17:30:53 +01:00
|
|
|
import argparse
|
2018-11-28 02:09:00 +01:00
|
|
|
import configparser
|
2020-06-11 00:54:34 +02:00
|
|
|
import logging
|
2013-01-31 16:49:09 +01:00
|
|
|
import os
|
2013-06-19 21:16:39 +02:00
|
|
|
import pwd
|
2020-06-11 00:54:34 +02:00
|
|
|
import shlex
|
2013-01-31 16:49:09 +01:00
|
|
|
import subprocess
|
2020-06-11 00:54:34 +02:00
|
|
|
import sys
|
2013-04-18 22:58:32 +02:00
|
|
|
import time
|
2013-10-25 23:20:40 +02:00
|
|
|
|
2013-10-25 23:46:02 +02:00
|
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
|
2020-06-11 00:54:34 +02:00
|
|
|
from scripts.lib.zulip_tools import DEPLOYMENTS_DIR, ENDC, OKGREEN, WARNING, overwrite_symlink
|
2013-03-13 19:26:51 +01:00
|
|
|
|
2018-08-12 01:56:58 +02:00
|
|
|
# Render log timestamps in UTC (time.gmtime) and prefix every record with
# the script name.
logging.Formatter.converter = time.gmtime
logging.basicConfig(
    format="%(asctime)s restart-server: %(message)s",
    level=logging.INFO,
)
|
2013-01-31 16:49:09 +01:00
|
|
|
|
2019-01-14 17:30:53 +01:00
|
|
|
# Command-line interface: the only option is an opt-in flag that makes the
# script fill the memcached caches before restarting services.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--fill-cache',
    action='store_true',
    dest='fill_cache',
    default=False,
    help='Fill the memcached caches',
)
args = parser.parse_args()
|
|
|
|
|
2013-06-03 19:29:52 +02:00
|
|
|
# The deployment root is the parent of this script's directory, with
# symlinks resolved.  Make it the working directory so the relative
# "./manage.py" and "./scripts/..." invocations in this script resolve.
deploy_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '..'))
os.chdir(deploy_path)
|
2013-01-31 16:49:09 +01:00
|
|
|
|
2016-07-23 20:33:58 +02:00
|
|
|
# Refuse to run under any account other than 'zulip'.
if pwd.getpwuid(os.getuid()).pw_name != "zulip":
    logging.error("Must be run as user 'zulip'.")
    sys.exit(1)
|
2013-06-19 17:25:42 +02:00
|
|
|
|
2013-04-18 22:58:32 +02:00
|
|
|
# Send a statsd event on restarting the server
subprocess.check_call(["./manage.py", "send_stats", "incr", "events.server_restart", str(int(time.time()))])

# Only fill the memcached caches when --fill-cache was passed.
if args.fill_cache:
    logging.info("Filling memcached caches")
    subprocess.check_call(["./manage.py", "fill_memcached_caches"])
|
2013-05-30 21:05:34 +02:00
|
|
|
|
dependencies: Remove WebSockets system for sending messages.
Zulip has had a small use of WebSockets (specifically, for the code
path of sending messages, via the webapp only) since ~2013. We
originally added this use of WebSockets in the hope that the latency
benefits of doing so would allow us to avoid implementing a markdown
local echo; they did not. Further, HTTP/2 may have eliminated the
latency difference we hoped to exploit by using WebSockets in any
case.
While we’d originally imagined using WebSockets for other endpoints,
there was never a good justification for moving more components to the
WebSockets system.
This WebSockets code path had a lot of downsides/complexity,
including:
* The messy hack involving constructing an emulated request object to
hook into doing Django requests.
* The `message_senders` queue processor system, which increases RAM
needs and must be provisioned independently from the rest of the
server.
* A duplicate check_send_receive_time Nagios test specific to
WebSockets.
* The requirement for users to have their firewalls/NATs allow
WebSocket connections, and a setting to disable them for networks
where WebSockets don’t work.
* Dependencies on the SockJS family of libraries, which has at times
been poorly maintained, and periodically throws random JavaScript
exceptions in our production environments without a deep enough
traceback to effectively investigate.
* A total of about 1600 lines of our code related to the feature.
* Increased load on the Tornado system, especially around a Zulip
server restart, and especially for large installations like
zulipchat.com, resulting in extra delay before messages can be sent
again.
As detailed in
https://github.com/zulip/zulip/pull/12862#issuecomment-536152397, it
appears that removing WebSockets moderately increases the time it
takes for the `send_message` API query to return from the server, but
does not significantly change the time between when a message is sent
and when it is received by clients. We don’t understand the reason
for that change (suggesting the possibility of a measurement error),
and even if it is a real change, we consider that potential small
latency regression to be acceptable.
If we later want WebSockets, we’ll likely want to just use Django
Channels.
Signed-off-by: Anders Kaseorg <anders@zulipchat.com>
2019-07-23 01:43:40 +02:00
|
|
|
# supervisord services that are stopped and started together as the
# "server core".  thumbor is included only when its supervisor config file
# is present on this host.
core_server_services = ["zulip-django"]
if os.path.exists("/etc/supervisor/conf.d/thumbor.conf"):
    core_server_services.append("zulip-thumbor")
|
|
|
|
|
2018-08-11 01:28:06 +02:00
|
|
|
current_symlink = os.path.join(DEPLOYMENTS_DIR, "current")
last_symlink = os.path.join(DEPLOYMENTS_DIR, "last")

# Read the "current" link target exactly once, so the value compared against
# deploy_path is the same value that gets recorded as "last".
previous_deploy_path = os.readlink(current_symlink)
change_symlink = previous_deploy_path != deploy_path
if change_symlink:
    # Point "last" at the outgoing deployment before repointing "current"
    # at the deployment this script lives in.
    overwrite_symlink(previous_deploy_path, last_symlink)
    overwrite_symlink(deploy_path, current_symlink)
|
2018-08-11 01:28:06 +02:00
|
|
|
|
2018-11-28 02:09:00 +01:00
|
|
|
# Number of Tornado processes configured for this server.  A missing file
# (RawConfigParser.read() silently skips unreadable paths), a missing
# [application_server] section, or a missing tornado_processes option all
# fall back to a single process; a non-integer value still raises ValueError.
config_file = configparser.RawConfigParser()
config_file.read("/etc/zulip/zulip.conf")
tornado_processes = config_file.getint('application_server', 'tornado_processes', fallback=1)
|
|
|
|
|
2018-08-11 01:28:06 +02:00
|
|
|
# We restart just the zulip-tornado service early, in order to
# minimize downtime of the tornado service caused by too many Python
# processes restarting at the same time resulting in it receiving
# insufficient priority. This is important, because Tornado is the
# main source of user-visible downtime when we restart a Zulip server.
if tornado_processes > 1:
    # Sharded setup: one supervisord program per port, starting at 9800.
    for p in range(9800, 9800+tornado_processes):
        # Restart Tornado processes individually for a better rate of
        # restarts. This also avoids behavior with restarting a whole
        # supervisord group where if any individual process is slow to
        # stop, the whole bundle stays stopped for an extended time.
        logging.info("Restarting Tornado process on port %s", p)
        subprocess.check_call(["supervisorctl", "restart", f"zulip-tornado:port-{p}"])
else:
    logging.info("Restarting Tornado process")
    subprocess.check_call(["supervisorctl", "restart", "zulip-tornado", "zulip-tornado:*"])
|
2018-08-11 01:28:06 +02:00
|
|
|
|
2016-11-23 13:36:09 +01:00
|
|
|
# Restart the uWSGI and related processes via supervisorctl.
|
[manual] restart-server: Minimize downtime for message sender worker.
The manual step here is that we need to do the `puppet apply` before
pushing this commit, or `restart-server` will crash.
Previously we shut down everything in one group, which performed
poorly with supervisor's bad performance on restarting many daemons at
once. Now we shut down the unimportant stuff, then the important
stuff, bring back the important stuff, and then bring back the
unimportant stuff.
This new model has a little over 5s of downtime for the core
user-facing daemons -- which is still far more than would be ideal,
but a lot less than the 13s or so that we had before.
Here are some logs with the current setup for the tornado/django downtime:
2013-12-19 20:16:51,995 restart-server: Stopping daemons
2013-12-19 20:16:53,461 restart-server: Starting daemons
2013-12-19 20:16:57,146 restart-server: Starting workers
Compare with the behavior on master today:
2013-12-19 20:21:45,281 restart-server: Stopping daemons
2013-12-19 20:21:49,225 restart-server: Starting daemons
2013-12-19 20:21:58,463 restart-server: Done!
(imported from commit b2c1ba77f3dc989551d0939779208465a8410435)
2013-12-19 21:07:02 +01:00
|
|
|
# Stop the queue workers first, then the core server processes.
logging.info("Stopping workers")
subprocess.check_call(["supervisorctl", "stop", "zulip-workers:*"])
logging.info("Stopping server core")
subprocess.check_call(["supervisorctl", "stop"] + core_server_services)
|
2016-08-05 01:58:57 +02:00
|
|
|
|
[manual] restart-server: Minimize downtime for message sender worker.
The manual step here is that we need to do the `puppet apply` before
pushing this commit, or `restart-server` will crash.
Previously we shut down everything in one group, which performed
poorly with supervisor's bad performance on restarting many daemons at
once. Now we shut down the unimportant stuff, then the important
stuff, bring back the important stuff, and then bring back the
unimportant stuff.
This new model has a little over 5s of downtime for the core
user-facing daemons -- which is still far more than would be ideal,
but a lot less than the 13s or so that we had before.
Here are some logs with the current setup for the tornado/django downtime:
2013-12-19 20:16:51,995 restart-server: Stopping daemons
2013-12-19 20:16:53,461 restart-server: Starting daemons
2013-12-19 20:16:57,146 restart-server: Starting workers
Compare with the behavior on master today:
2013-12-19 20:21:45,281 restart-server: Stopping daemons
2013-12-19 20:21:49,225 restart-server: Starting daemons
2013-12-19 20:21:58,463 restart-server: Done!
(imported from commit b2c1ba77f3dc989551d0939779208465a8410435)
2013-12-19 21:07:02 +01:00
|
|
|
# Bring the core server processes back before the queue workers.
logging.info("Starting server core")
subprocess.check_call(["supervisorctl", "start"] + core_server_services)
|
[manual] restart-server: Minimize downtime for message sender worker.
The manual step here is that we need to do the `puppet apply` before
pushing this commit, or `restart-server` will crash.
Previously we shut down everything in one group, which performed
poorly with supervisor's bad performance on restarting many daemons at
once. Now we shut down the unimportant stuff, then the important
stuff, bring back the important stuff, and then bring back the
unimportant stuff.
This new model has a little over 5s of downtime for the core
user-facing daemons -- which is still far more than would be ideal,
but a lot less than the 13s or so that we had before.
Here are some logs with the current setup for the tornado/django downtime:
2013-12-19 20:16:51,995 restart-server: Stopping daemons
2013-12-19 20:16:53,461 restart-server: Starting daemons
2013-12-19 20:16:57,146 restart-server: Starting workers
Compare with the behavior on master today:
2013-12-19 20:21:45,281 restart-server: Stopping daemons
2013-12-19 20:21:49,225 restart-server: Starting daemons
2013-12-19 20:21:58,463 restart-server: Done!
(imported from commit b2c1ba77f3dc989551d0939779208465a8410435)
2013-12-19 21:07:02 +01:00
|
|
|
# Start every program in the zulip-workers supervisord group.
logging.info("Starting workers")
subprocess.check_call(["supervisorctl", "start", "zulip-workers:*"])
|
2013-01-31 16:49:09 +01:00
|
|
|
|
2016-05-08 04:02:32 +02:00
|
|
|
# check_output() returns bytes, hence the comparison against b'True'.
using_sso = subprocess.check_output(['./scripts/get-django-setting', 'USING_APACHE_SSO'])
if using_sso.strip() == b'True':
    logging.info("Restarting Apache WSGI process...")
    # Signal the apache2 processes owned by the zulip user.
    subprocess.check_call(["pkill", "-f", "apache2", "-u", "zulip"])
|
|
|
|
|
2018-07-31 01:27:53 +02:00
|
|
|
# Restart process-fts-updates only when the zulip_db supervisor config is
# present on this host.
if os.path.exists("/etc/supervisor/conf.d/zulip_db.conf"):
    subprocess.check_call(["supervisorctl", "restart", "process-fts-updates"])
|
|
|
|
|
2013-03-13 19:26:51 +01:00
|
|
|
# Record completion in the log and print a green success line on stdout.
logging.info("Done!")
print(OKGREEN + "Application restarted successfully!" + ENDC)
|
2019-09-20 02:23:23 +02:00
|
|
|
|
|
|
|
# If the symlinks were repointed and the invoking shell's logical working
# directory ($PWD) is one of them (or lies underneath one), tell the user
# to `cd` again so the shell re-traverses the symlink.
if change_symlink and "PWD" in os.environ:
    shell_cwd = os.environ["PWD"]
    for symlink in [last_symlink, current_symlink]:
        # Compare whole path components.  os.path.commonprefix() works
        # character by character, so it would also match sibling paths that
        # merely start with the same text (e.g. ".../current-old").
        if shell_cwd == symlink or shell_cwd.startswith(symlink + os.sep):
            print(
                """
%sYour shell entered its current directory through a symlink:
%s
which has now changed. Your shell will not see this change until you run:
cd %s
to traverse the symlink again.%s
"""
                % (WARNING, symlink, shlex.quote(shell_cwd), ENDC),
                file=sys.stderr,
            )
|