[manual] restart-server: Minimize downtime for message sender worker.

The manual step here is that we need to do the `puppet apply` before
pushing this commit, or `restart-server` will crash.

Previously we shut down everything in one group, which was slow
because supervisor performs badly when restarting many daemons at
once.  Now we shut down the unimportant stuff, then the important
stuff, bring back the important stuff, and then bring back the
unimportant stuff.

This new model has a little over 5s of downtime for the core
user-facing daemons -- which is still far more than would be ideal,
but a lot less than the 13s or so that we had before.

Here are some logs showing the tornado/django downtime with the new setup:
2013-12-19 20:16:51,995 restart-server: Stopping daemons
2013-12-19 20:16:53,461 restart-server: Starting daemons
2013-12-19 20:16:57,146 restart-server: Starting workers

Compare with the behavior on master today:
2013-12-19 20:21:45,281 restart-server: Stopping daemons
2013-12-19 20:21:49,225 restart-server: Starting daemons
2013-12-19 20:21:58,463 restart-server: Done!

(imported from commit b2c1ba77f3dc989551d0939779208465a8410435)
This commit is contained in:
Tim Abbott 2013-12-19 15:07:02 -05:00
parent 66e72d4705
commit b2d01e2da0
2 changed files with 12 additions and 4 deletions

View File

@ -219,7 +219,10 @@ directory=/home/zulip/deployments/current/
[group:zulip-workers] [group:zulip-workers]
; each refers to 'x' in [program:x] definitions ; each refers to 'x' in [program:x] definitions
programs=zulip-events-user-activity,zulip-events-user-activity-interval,zulip-events-user-presence,zulip-events-signups,zulip-events-confirmation-emails,zulip-events-missedmessage_reminders,zulip-events-slowqueries,zulip-events-message_sender,zulip-events-feedback_messages,zulip-events-digest_emails,zulip-events-error_reports,zulip-deliver-enqueued-emails,zulip-events-missedmessage_mobile_notifications programs=zulip-events-user-activity,zulip-events-user-activity-interval,zulip-events-user-presence,zulip-events-signups,zulip-events-confirmation-emails,zulip-events-missedmessage_reminders,zulip-events-slowqueries,zulip-events-feedback_messages,zulip-events-digest_emails,zulip-events-error_reports,zulip-deliver-enqueued-emails,zulip-events-missedmessage_mobile_notifications
[group:zulip-senders]
programs=zulip-events-message_sender
; The [include] section can just contain the "files" setting. This ; The [include] section can just contain the "files" setting. This
; setting can list multiple files (separated by whitespace or ; setting can list multiple files (separated by whitespace or

View File

@ -26,10 +26,15 @@ logging.info("Filling memcached caches")
subprocess.check_call(["python", "./manage.py", "fill_memcached_caches"]) subprocess.check_call(["python", "./manage.py", "fill_memcached_caches"])
# Restart the FastCGI and related processes via supervisorctl. # Restart the FastCGI and related processes via supervisorctl.
logging.info("Killing daemons") logging.info("Stopping workers")
subprocess.check_call(["supervisorctl", "stop", "zulip-workers:* zulip-django zulip-tornado"]) subprocess.check_call(["supervisorctl", "stop", "zulip-workers:*"])
logging.info("Stopping server core")
subprocess.check_call(["supervisorctl", "stop", "zulip-senders:* zulip-django zulip-tornado"])
subprocess.check_call(["ln", '-nsf', deploy_path, os.path.join(DEPLOYMENTS_DIR, "current")]) subprocess.check_call(["ln", '-nsf', deploy_path, os.path.join(DEPLOYMENTS_DIR, "current")])
subprocess.check_call(["supervisorctl", "start", "zulip-tornado zulip-django zulip-workers:*"]) logging.info("Starting server core")
subprocess.check_call(["supervisorctl", "start", "zulip-tornado zulip-django zulip-senders:*"])
logging.info("Starting workers")
subprocess.check_call(["supervisorctl", "start", "zulip-workers:*"])
using_sso = subprocess.check_output(['./bin/get-django-setting', 'USING_SSO']) using_sso = subprocess.check_output(['./bin/get-django-setting', 'USING_SSO'])
if using_sso.strip() == 'True': if using_sso.strip() == 'True':