From 54c2c0201134c9f6ffab0b0aacb114b4f8f01437 Mon Sep 17 00:00:00 2001
From: Tim Abbott
Date: Sun, 21 Jul 2024 18:26:12 -0700
Subject: [PATCH] thumbnail: Add support for multiple queue workers.

There's no need for sharding, but this allows one to spend a bit of
extra memory to reduce image-processing latency when bursts of images
are uploaded at once.
---
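
Deployment note: a minimal sketch of enabling this, assuming the standard
/etc/zulip/zulip.conf layout that zulipconf() reads; the value 4 below is
purely illustrative:

    [application_server]
    thumbnail_workers = 4

After editing zulip.conf, re-running puppet (e.g. scripts/zulip-puppet-apply)
should regenerate the supervisor configuration with the new worker count.
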
 docs/production/system-configuration.md                   | 8 ++++++++
 puppet/zulip/manifests/app_frontend_base.pp               | 6 +++++-
 puppet/zulip/templates/supervisor/zulip.conf.template.erb | 6 ++++++
 scripts/lib/check_rabbitmq_queue.py                       | 5 +++++
 scripts/nagios/check-rabbitmq-consumers                   | 4 ++++
 5 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/docs/production/system-configuration.md b/docs/production/system-configuration.md
index 8750d5f2ab..2ead049795 100644
--- a/docs/production/system-configuration.md
+++ b/docs/production/system-configuration.md
@@ -167,6 +167,14 @@ immutable, this serves only as a potential additional limit on the size of
 the contents on disk; `s3_disk_cache_size` is expected to be the primary
 control for cache sizing.
 
+#### `thumbnail_workers`
+
+How many image-thumbnailing workers to run. Defaults to 1; adding more
+workers can prevent the image-thumbnailing queue from backlogging when
+large numbers of very large image files are uploaded at once. (When
+backlogged, image previews for images that have not yet been
+thumbnailed will appear as loading spinners.)
+
 #### `nameserver`
 
 When the [S3 storage backend][s3-backend] is in use, downloads from S3 are
diff --git a/puppet/zulip/manifests/app_frontend_base.pp b/puppet/zulip/manifests/app_frontend_base.pp
index db2237298e..4e4a06b449 100644
--- a/puppet/zulip/manifests/app_frontend_base.pp
+++ b/puppet/zulip/manifests/app_frontend_base.pp
@@ -158,7 +158,11 @@ class zulip::app_frontend_base {
   } else {
     $uwsgi_default_processes = 3
   }
-  $mobile_notification_shards = Integer(zulipconf('application_server','mobile_notification_shards', 1))
+
+  # Note the different naming scheme for sharded workers, where each gets its own queue,
+  # vs. when multiple workers service the same queue.
+  $thumbnail_workers = Integer(zulipconf('application_server', 'thumbnail_workers', 1))
+  $mobile_notification_shards = Integer(zulipconf('application_server', 'mobile_notification_shards', 1))
   $tornado_ports = $zulip::tornado_sharding::tornado_ports
 
   $proxy_host = zulipconf('http_proxy', 'host', 'localhost')
diff --git a/puppet/zulip/templates/supervisor/zulip.conf.template.erb b/puppet/zulip/templates/supervisor/zulip.conf.template.erb
index c3b9c89a65..eda9c8468c 100644
--- a/puppet/zulip/templates/supervisor/zulip.conf.template.erb
+++ b/puppet/zulip/templates/supervisor/zulip.conf.template.erb
@@ -67,6 +67,12 @@ command=nice -n10 /home/zulip/deployments/current/manage.py process_queue --queu
 stdout_logfile=/var/log/zulip/events_<%= queue %>_shard%(process_num)s.log ; stdout log path, NONE for none; default AUTO
 numprocs=<%= @mobile_notification_shards %>
 numprocs_start=1
+<% elsif queue == "thumbnail" and @thumbnail_workers > 1 -%>
+process_name=zulip_events_<%= queue %>_worker%(process_num)s
+command=nice -n10 /home/zulip/deployments/current/manage.py process_queue --queue_name=<%= queue %> --skip-checks --worker_num %(process_num)s
+stdout_logfile=/var/log/zulip/events_<%= queue %>_worker%(process_num)s.log ; stdout log path, NONE for none; default AUTO
+numprocs=<%= @thumbnail_workers %>
+numprocs_start=1
 <% else -%>
 command=nice -n10 /home/zulip/deployments/current/manage.py process_queue --queue_name=<%= queue %> --skip-checks
 stdout_logfile=/var/log/zulip/events_<%= queue %>.log ; stdout log path, NONE for none; default AUTO
diff --git a/scripts/lib/check_rabbitmq_queue.py b/scripts/lib/check_rabbitmq_queue.py
index 190feac0f9..4307409d0c 100644
--- a/scripts/lib/check_rabbitmq_queue.py
+++ b/scripts/lib/check_rabbitmq_queue.py
@@ -163,12 +163,17 @@ def check_rabbitmq_queues() -> None:
         text=True,
     ).strip()
 
     queue_stats: dict[str, dict[str, Any]] = {}
+    check_queues = normal_queues
     if mobile_notification_shards > 1:
+        # For sharded queue workers, where there's a separate queue
+        # for each shard, we need to make sure none of those are
+        # backlogged.
         check_queues += [
             f"missedmessage_mobile_notifications_shard{d}"
             for d in range(1, mobile_notification_shards + 1)
         ]
+
     queues_to_check = set(check_queues).intersection(set(queues_with_consumers))
     for queue in queues_to_check:
         fn = queue + ".stats"
diff --git a/scripts/nagios/check-rabbitmq-consumers b/scripts/nagios/check-rabbitmq-consumers
index 34c72dea2a..0a0ca7ec05 100755
--- a/scripts/nagios/check-rabbitmq-consumers
+++ b/scripts/nagios/check-rabbitmq-consumers
@@ -47,6 +47,8 @@ for line in output.split("\n"):
     queue_name = parts[0]
     if queue_name.startswith("notify_tornado_"):
         queue_name = "notify_tornado"
+
+    # Collapse sharded queues into a single queue for this check.
     queue_name = re.sub(r"_shard(\d+)", "", queue_name)
     consumers[queue_name] += 1
 
@@ -60,6 +62,8 @@ for queue_name in consumers:
         target_count = int(
             get_config(config_file, "application_server", "mobile_notification_shards", "1")
         )
+    elif queue_name == "thumbnail":
+        target_count = int(get_config(config_file, "application_server", "thumbnail_workers", "1"))
 
     atomic_nagios_write(
         "check-rabbitmq-consumers-" + queue_name,
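
For reference, with thumbnail_workers = 2 the template branch above should
render a supervisor entry roughly like the following; the enclosing
[program:...] stanza name is not shown in this hunk and is assumed here:

    process_name=zulip_events_thumbnail_worker%(process_num)s
    command=nice -n10 /home/zulip/deployments/current/manage.py process_queue --queue_name=thumbnail --skip-checks --worker_num %(process_num)s
    stdout_logfile=/var/log/zulip/events_thumbnail_worker%(process_num)s.log ; stdout log path, NONE for none; default AUTO
    numprocs=2
    numprocs_start=1

supervisord expands %(process_num)s per process, so this starts workers 1
and 2, both consuming the shared "thumbnail" queue; the Nagios check above
then expects exactly thumbnail_workers consumers on that queue.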