From 54c2c0201134c9f6ffab0b0aacb114b4f8f01437 Mon Sep 17 00:00:00 2001
From: Tim Abbott
Date: Sun, 21 Jul 2024 18:26:12 -0700
Subject: [PATCH] thumbnail: Add support for multiple queue workers.

There's no need for sharding, but this allows one to spend a bit of
extra memory to reduce image-processing latency when bursts of images
are uploaded at once.
---
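
Deployment note: a minimal sketch of enabling this, assuming the standard
/etc/zulip/zulip.conf layout that zulipconf() reads; the value 4 below is
purely illustrative:

    [application_server]
    thumbnail_workers = 4

After editing zulip.conf, re-running puppet (e.g. scripts/zulip-puppet-apply)
should regenerate the supervisor configuration with the new worker count.
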
 docs/production/system-configuration.md                   | 8 ++++++++
 puppet/zulip/manifests/app_frontend_base.pp               | 6 +++++-
 puppet/zulip/templates/supervisor/zulip.conf.template.erb | 6 ++++++
 scripts/lib/check_rabbitmq_queue.py                       | 5 +++++
 scripts/nagios/check-rabbitmq-consumers                   | 4 ++++
 5 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/docs/production/system-configuration.md b/docs/production/system-configuration.md
index 8750d5f2ab..2ead049795 100644
--- a/docs/production/system-configuration.md
+++ b/docs/production/system-configuration.md
@@ -167,6 +167,14 @@ immutable, this serves only as a potential additional limit on the size of
 the contents on disk; `s3_disk_cache_size` is expected to be the primary
 control for cache sizing.
 
+#### `thumbnail_workers`
+
+How many image-thumbnailing workers to run. Defaults to 1; adding more
+workers can prevent the image-thumbnailing queue from backlogging when
+large numbers of very large image files are uploaded at once. (When
+backlogged, image previews for images that have not yet been
+thumbnailed will appear as loading spinners.)
+
 #### `nameserver`
 
 When the [S3 storage backend][s3-backend] is in use, downloads from S3 are
diff --git a/puppet/zulip/manifests/app_frontend_base.pp b/puppet/zulip/manifests/app_frontend_base.pp
index db2237298e..4e4a06b449 100644
--- a/puppet/zulip/manifests/app_frontend_base.pp
+++ b/puppet/zulip/manifests/app_frontend_base.pp
@@ -158,7 +158,11 @@ class zulip::app_frontend_base {
   } else {
     $uwsgi_default_processes = 3
   }
-  $mobile_notification_shards = Integer(zulipconf('application_server','mobile_notification_shards', 1))
+
+  # Note the different naming scheme for sharded workers, where each gets its own queue,
+  # vs. when multiple workers service the same queue.
+  $thumbnail_workers = Integer(zulipconf('application_server', 'thumbnail_workers', 1))
+  $mobile_notification_shards = Integer(zulipconf('application_server', 'mobile_notification_shards', 1))
   $tornado_ports = $zulip::tornado_sharding::tornado_ports
 
   $proxy_host = zulipconf('http_proxy', 'host', 'localhost')
diff --git a/puppet/zulip/templates/supervisor/zulip.conf.template.erb b/puppet/zulip/templates/supervisor/zulip.conf.template.erb
index c3b9c89a65..eda9c8468c 100644
--- a/puppet/zulip/templates/supervisor/zulip.conf.template.erb
+++ b/puppet/zulip/templates/supervisor/zulip.conf.template.erb
@@ -67,6 +67,12 @@ command=nice -n10 /home/zulip/deployments/current/manage.py process_queue --queu
 stdout_logfile=/var/log/zulip/events_<%= queue %>_shard%(process_num)s.log ; stdout log path, NONE for none; default AUTO
 numprocs=<%= @mobile_notification_shards %>
 numprocs_start=1
+<% elsif queue == "thumbnail" and @thumbnail_workers > 1 -%>
+process_name=zulip_events_<%= queue %>_worker%(process_num)s
+command=nice -n10 /home/zulip/deployments/current/manage.py process_queue --queue_name=<%= queue %> --skip-checks --worker_num %(process_num)s
+stdout_logfile=/var/log/zulip/events_<%= queue %>_worker%(process_num)s.log ; stdout log path, NONE for none; default AUTO
+numprocs=<%= @thumbnail_workers %>
+numprocs_start=1
 <% else -%>
 command=nice -n10 /home/zulip/deployments/current/manage.py process_queue --queue_name=<%= queue %> --skip-checks
 stdout_logfile=/var/log/zulip/events_<%= queue %>.log ; stdout log path, NONE for none; default AUTO
diff --git a/scripts/lib/check_rabbitmq_queue.py b/scripts/lib/check_rabbitmq_queue.py
index 190feac0f9..4307409d0c 100644
--- a/scripts/lib/check_rabbitmq_queue.py
+++ b/scripts/lib/check_rabbitmq_queue.py
@@ -163,12 +163,17 @@ def check_rabbitmq_queues() -> None:
         text=True,
     ).strip()
 
     queue_stats: dict[str, dict[str, Any]] = {}
+    check_queues = normal_queues
     if mobile_notification_shards > 1:
+        # For sharded queue workers, where there's a separate queue
+        # for each shard, we need to make sure none of those are
+        # backlogged.
         check_queues += [
             f"missedmessage_mobile_notifications_shard{d}"
             for d in range(1, mobile_notification_shards + 1)
         ]
+
     queues_to_check = set(check_queues).intersection(set(queues_with_consumers))
     for queue in queues_to_check:
         fn = queue + ".stats"
diff --git a/scripts/nagios/check-rabbitmq-consumers b/scripts/nagios/check-rabbitmq-consumers
index 34c72dea2a..0a0ca7ec05 100755
--- a/scripts/nagios/check-rabbitmq-consumers
+++ b/scripts/nagios/check-rabbitmq-consumers
@@ -47,6 +47,8 @@ for line in output.split("\n"):
     queue_name = parts[0]
     if queue_name.startswith("notify_tornado_"):
         queue_name = "notify_tornado"
+
+    # Collapse sharded queues into a single queue for this check.
     queue_name = re.sub(r"_shard(\d+)", "", queue_name)
     consumers[queue_name] += 1
 
@@ -60,6 +62,8 @@ for queue_name in consumers:
         target_count = int(
             get_config(config_file, "application_server", "mobile_notification_shards", "1")
         )
+    elif queue_name == "thumbnail":
+        target_count = int(get_config(config_file, "application_server", "thumbnail_workers", "1"))
 
     atomic_nagios_write(
         "check-rabbitmq-consumers-" + queue_name,
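
For reference, with thumbnail_workers = 2 the template branch above should
render a supervisor entry roughly like the following; the enclosing
[program:...] stanza name is not shown in this hunk and is assumed here:

    process_name=zulip_events_thumbnail_worker%(process_num)s
    command=nice -n10 /home/zulip/deployments/current/manage.py process_queue --queue_name=thumbnail --skip-checks --worker_num %(process_num)s
    stdout_logfile=/var/log/zulip/events_thumbnail_worker%(process_num)s.log ; stdout log path, NONE for none; default AUTO
    numprocs=2
    numprocs_start=1

supervisord expands %(process_num)s per process, so this starts workers 1
and 2, both consuming the shared "thumbnail" queue; the Nagios check above
then expects exactly thumbnail_workers consumers on that queue.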