thumbnail: Make thumbnailing work with data import.

We didn't have thumbnailing for images coming from data import and this commit adds the functionality. There are a few fundamental issues that the implementation needs to solve. 1. The images come from an untrusted source and therefore we don't want to just pass them through to thumbnailing without checking. For that reason, we cannot just import ImageAttachment rows from the export data, even for zulip=>zulip imports. The right way to process images is to pass them to maybe_thumbail(), which runs libvips_check_image() on them to verify we're okay with thumbnailing, creates ImageAttachment rows for them and sends them to the thumbnailing queue worker. This approach lets us handle both zulip=>zulip and 3rd party=>zulip imports in the same way, 2. There is a somewhat circular dependency between the Message, Attachment and ImageAttachment import process: - ImageAttachments would ideally be created after importing Attachments, but they need to already exist at the time of Message import. Otherwise, the markdown processor doesn't know it has to add HTML for image previews to messages that reference images. This would mean that messages imported from 3rd party tools don't get image previews. - Attachments only get created after Message import however, due to the many-to-many relationship between Message and Attachment. This is solved by fixing up some data of Attachments pre-emptively, such as the path_ids. This gives us the necessary information for creating ImageAttachments before importing Messages. While we generate ImageAttachment rows synchronously, the actual thumbnailing job is sent to the queue worker. Theoretically, the worker could be very backlogged and not process the thumbnails anytime soon. This is fine - if the app is loaded and tries to display a message with such a not-yet-generated thumbnail, the code in `serve_file` will generate the thumbnails synchronously on the fly and the user will see the image preview displayed normally. See: 1b47134d0d/zerver/views/upload.py (L333-L342)
2024-10-17 21:20:49 +02:00 · 2024-10-17 21:20:49 +02:00 · da4443f392
parent a6b0385229
commit da4443f392
5 changed files with 155 additions and 18 deletions
--- a/zerver/lib/export.py
+++ b/zerver/lib/export.py
@ -260,6 +260,9 @@ NON_EXPORTED_TABLES = {
    "zerver_submessage",
    # Drafts don't need to be exported as they are supposed to be more ephemeral.
    "zerver_draft",
+    # The importer cannot trust ImageAttachment objects anyway and needs to check
+    # and process images for thumbnailing on its own.
+    "zerver_imageattachment",
    # For any tables listed below here, it's a bug that they are not present in the export.
 }

--- a/zerver/lib/import_realm.py
+++ b/zerver/lib/import_realm.py
@ -8,6 +8,7 @@ from typing import Any

 import bmemcached
 import orjson
+import pyvips
 from bs4 import BeautifulSoup
 from django.conf import settings
 from django.core.cache import cache
@ -34,7 +35,7 @@ from zerver.lib.push_notifications import sends_notifications_directly
 from zerver.lib.remote_server import maybe_enqueue_audit_log_upload
 from zerver.lib.server_initialization import create_internal_realm, server_initialized
 from zerver.lib.streams import render_stream_description
-from zerver.lib.thumbnail import BadImageError
+from zerver.lib.thumbnail import THUMBNAIL_ACCEPT_IMAGE_TYPES, BadImageError, maybe_thumbnail
 from zerver.lib.timestamp import datetime_to_timestamp
 from zerver.lib.upload import ensure_avatar_image, sanitize_name, upload_backend, upload_emoji_image
 from zerver.lib.upload.s3 import get_bucket
@ -161,7 +162,16 @@ id_map_to_list: dict[str, dict[int, list[int]]] = {
 }

 path_maps: dict[str, dict[str, str]] = {
-    "attachment_path": {},
+    # Maps original attachment path pre-import to the final, post-import
+    # attachment path.
+    "old_attachment_path_to_new_path": {},
+    # Inverse of old_attachment_path_to_new_path.
+    "new_attachment_path_to_old_path": {},
+    # Maps the new (post-import) attachment path to the absolute path to the file
+    # in the on-disk export data that we're importing.
+    # Allows code running after this is filled to access file contents
+    # without needing to go through S3 to get it.
+    "new_attachment_path_to_local_data_path": {},
 }

 message_id_to_attachments: dict[str, dict[int, list[str]]] = {
@ -212,13 +222,12 @@ def fix_upload_links(data: TableData, message_table: TableName) -> None:
    for message in data[message_table]:
        if message["has_attachment"] is True:
            for attachment_path in message_id_to_attachments[message_table][message["id"]]:
-                message["content"] = message["content"].replace(
-                    attachment_path, path_maps["attachment_path"][attachment_path]
-                )
+                old_path = path_maps["new_attachment_path_to_old_path"][attachment_path]
+                message["content"] = message["content"].replace(old_path, attachment_path)

                if message["rendered_content"]:
                    message["rendered_content"] = message["rendered_content"].replace(
-                        attachment_path, path_maps["attachment_path"][attachment_path]
+                        old_path, attachment_path
                    )


@ -976,7 +985,11 @@ def import_uploads(
            relative_path = upload_backend.generate_message_upload_path(
                str(record["realm_id"]), sanitize_name(os.path.basename(record["path"]))
            )
-            path_maps["attachment_path"][record["s3_path"]] = relative_path
+            path_maps["old_attachment_path_to_new_path"][record["s3_path"]] = relative_path
+            path_maps["new_attachment_path_to_old_path"][relative_path] = record["s3_path"]
+            path_maps["new_attachment_path_to_local_data_path"][relative_path] = os.path.join(
+                import_dir, record["path"]
+            )

        if s3_uploads:
            key = bucket.Object(relative_path)
@ -1612,6 +1625,22 @@ def do_import_realm(import_dir: Path, subdomain: str, processes: int = 1) -> Rea
    with open(attachments_file, "rb") as f:
        attachment_data = orjson.loads(f.read())

+    # We need to import ImageAttachments before messages, as the message rendering logic
+    # checks for existence of ImageAttachment records to determine if HTML content for image
+    # preview needs to be added to a message.
+    # In order for ImageAttachments to be correctly created, we need to know the new path_ids
+    # and content_types of the attachments.
+    #
+    # Begin by fixing up the Attachment data.
+    fix_attachments_data(attachment_data)
+    # Now we're ready create ImageAttachment rows and enqueue thumbnailing
+    # for the images.
+    # This order ensures that during message import, rendered_content will be generated
+    # correctly with image previews.
+    # The important detail here is that we **only** care about having ImageAttachment
+    # rows ready at the time of message import. Thumbnailing happens in a queue worker
+    # in a different process, and we don't care about when it'll complete.
+    create_image_attachments_and_maybe_enqueue_thumbnailing(realm, attachment_data)
    map_messages_to_attachments(attachment_data)

    # Import zerver_message and zerver_usermessage
@ -1930,10 +1959,6 @@ def import_attachments(data: TableData) -> None:
        "scheduledmessage_id",
    )

-    # Update 'path_id' for the attachments
-    for attachment in data[parent_db_table_name]:
-        attachment["path_id"] = path_maps["attachment_path"][attachment["path_id"]]
-
    # Next, load the parent rows.
    bulk_import_model(data, parent_model)

@ -1960,6 +1985,36 @@ def import_attachments(data: TableData) -> None:
            logging.info("Successfully imported M2M table %s", m2m_table_name)


+def fix_attachments_data(attachment_data: TableData) -> None:
+    for attachment in attachment_data["zerver_attachment"]:
+        attachment["path_id"] = path_maps["old_attachment_path_to_new_path"][attachment["path_id"]]
+
+        # In the case of images, content_type needs to be set for thumbnailing.
+        # Zulip exports set this, but third-party exports may not.
+        if attachment.get("content_type") is None:
+            guessed_content_type = guess_type(attachment["path_id"])[0]
+            if guessed_content_type in THUMBNAIL_ACCEPT_IMAGE_TYPES:
+                attachment["content_type"] = guessed_content_type
+
+
+def create_image_attachments_and_maybe_enqueue_thumbnailing(
+    realm: Realm, attachment_data: TableData
+) -> None:
+    for attachment in attachment_data["zerver_attachment"]:
+        if attachment["content_type"] not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
+            continue
+
+        path_id = attachment["path_id"]
+        content_type = attachment["content_type"]
+
+        # We don't have to go to S3 to obtain the file. We still have the export
+        # data on disk and stored the absolute path to it.
+        local_filename = path_maps["new_attachment_path_to_local_data_path"][path_id]
+        pyvips_source = pyvips.Source.new_from_file(local_filename)
+        maybe_thumbnail(pyvips_source, content_type, path_id, realm.id)
+        continue
+
+
 def import_analytics_data(realm: Realm, import_dir: Path, crossrealm_user_ids: set[int]) -> None:
    analytics_filename = os.path.join(import_dir, "analytics.json")
    if not os.path.exists(analytics_filename):
--- a/zerver/lib/thumbnail.py
+++ b/zerver/lib/thumbnail.py
@ -14,7 +14,7 @@ from typing_extensions import override

 from zerver.lib.exceptions import ErrorCode, JsonableError
 from zerver.lib.queue import queue_event_on_commit
-from zerver.models import AbstractAttachment, ImageAttachment
+from zerver.models import ImageAttachment

 DEFAULT_AVATAR_SIZE = 100
 MEDIUM_AVATAR_SIZE = 500
@ -279,9 +279,9 @@ def missing_thumbnails(image_attachment: ImageAttachment) -> list[ThumbnailForma


 def maybe_thumbnail(
-    attachment: AbstractAttachment, content: bytes | pyvips.Source
+    content: bytes | pyvips.Source, content_type: str | None, path_id: str, realm_id: int
 ) -> ImageAttachment | None:
-    if attachment.content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
+    if content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
        # If it doesn't self-report as an image file that we might want
        # to thumbnail, don't parse the bytes at all.
        return None
@ -301,8 +301,8 @@ def maybe_thumbnail(
                (width, height) = (image.width, image.height)

            image_row = ImageAttachment.objects.create(
-                realm_id=attachment.realm_id,
-                path_id=attachment.path_id,
+                realm_id=realm_id,
+                path_id=path_id,
                original_width_px=width,
                original_height_px=height,
                frames=image.get_n_pages(),
--- a/zerver/lib/upload/init.py
+++ b/zerver/lib/upload/init.py
@ -70,7 +70,7 @@ def create_attachment(
        size=file_size,
        content_type=content_type,
    )
-    maybe_thumbnail(attachment, file_real_data)
+    maybe_thumbnail(file_real_data, content_type, path_id, realm.id)
    from zerver.actions.uploads import notify_attachment_update

    notify_attachment_update(user_profile, "add", attachment.to_dict())
--- a/zerver/tests/test_import_export.py
+++ b/zerver/tests/test_import_export.py
@ -93,6 +93,7 @@ from zerver.models import (
 )
 from zerver.models.clients import get_client
 from zerver.models.groups import SystemGroups
+from zerver.models.messages import ImageAttachment
 from zerver.models.presence import PresenceSequence
 from zerver.models.realm_audit_logs import AuditLogEventType
 from zerver.models.realms import get_realm
@ -335,6 +336,9 @@ class ExportFile(ZulipTestCase):


 class RealmImportExportTest(ExportFile):
+    def create_user_and_login(self, email: str, realm: Realm) -> None:
+        self.register(email, "test", subdomain=realm.subdomain)
+
    def export_realm(
        self,
        realm: Realm,
@ -811,6 +815,32 @@ class RealmImportExportTest(ExportFile):
            )
            self.assertEqual(realm_emoji.name, "hawaii")

+        # We want to set up some image data to verify image attachment thumbnailing works correctly
+        # in the import.
+        # We'll create a new user to use as the sender of the messages with such images,
+        # so that we can easily find them after importing - by fetching messages sent
+        # by the thumbnailing_test_user_email account.
+        thumbnailing_test_user_email = "thumbnailing_test@zulip.com"
+        self.create_user_and_login(thumbnailing_test_user_email, original_realm)
+        thumbnailing_test_user = get_user_by_delivery_email(
+            thumbnailing_test_user_email, original_realm
+        )
+
+        # Send a message with the image. After the import, we'll verify that this message
+        # and the associated ImageAttachment have been created correctly.
+        image_path_id = self.upload_and_thumbnail_image("img.png")
+        self.send_stream_message(
+            sender=thumbnailing_test_user,
+            stream_name="Verona",
+            content=f"An [image](/user_uploads/{image_path_id})",
+        )
+        image_attachment = ImageAttachment.objects.get(path_id=image_path_id)
+        # Malform some ImageAttachment info. These shouldn't get exported (and certainly not imported!)
+        # anyway, so we can test that this misinformation doesn't make its way into the imported realm.
+        image_attachment.original_width_px = 9999
+        image_attachment.original_height_px = 9999
+        image_attachment.save()
+
        # Deactivate a user to ensure such a case is covered.
        do_deactivate_user(self.example_user("aaron"), acting_user=None)

@ -1003,7 +1033,14 @@ class RealmImportExportTest(ExportFile):

        self.export_realm(original_realm, export_type=RealmExport.EXPORT_FULL_WITHOUT_CONSENT)

-        with self.settings(BILLING_ENABLED=False), self.assertLogs(level="INFO"):
+        with (
+            self.settings(BILLING_ENABLED=False),
+            self.assertLogs(level="INFO"),
+            # With captureOnCommitCallbacks we ensure that tasks delegated to the queue workers
+            # are executed immediately. We use this to make thumbnailing runs in the import
+            # process in this test.
+            self.captureOnCommitCallbacks(execute=True),
+        ):
            do_import_realm(get_output_dir(), "test-zulip")

        # Make sure our export/import didn't somehow leak info into the
@ -1164,6 +1201,48 @@ class RealmImportExportTest(ExportFile):
            Message.objects.filter(realm=imported_realm).count(),
        )

+        # Verify thumbnailing.
+        imported_thumbnailing_test_user = get_user_by_delivery_email(
+            thumbnailing_test_user_email, imported_realm
+        )
+        imported_messages_with_thumbnail = Message.objects.filter(
+            sender=imported_thumbnailing_test_user, realm=imported_realm
+        )
+        imported_message_with_thumbnail = imported_messages_with_thumbnail.latest("id")
+        attachment_with_thumbnail = Attachment.objects.get(
+            owner=imported_thumbnailing_test_user, messages=imported_message_with_thumbnail
+        )
+
+        path_id = attachment_with_thumbnail.path_id
+        # An ImageAttachment has been created in the import process.
+        imported_image_attachment = ImageAttachment.objects.get(
+            path_id=path_id, realm=imported_realm
+        )
+
+        # It figured out the dimensions correctly and didn't inherit the bad data in the
+        # original ImageAttachment.
+        self.assertEqual(imported_image_attachment.original_width_px, 128)
+        self.assertEqual(imported_image_attachment.original_height_px, 128)
+        # ImageAttachment.thumbnail_metadata contains information about thumbnails that actually
+        # got generated. By asserting it's not empty, we make sure thumbnailing ran for the image
+        # and that we didn't merely create the ImageAttachment row in the database.
+        self.assertNotEqual(len(imported_image_attachment.thumbnail_metadata), 0)
+        self.assertTrue(imported_image_attachment.thumbnail_metadata[0])
+
+        # Content and rendered_content got updated correctly, to point to the correct, new path_id
+        # and include the HTML for image preview using the thumbnail.
+        self.assertEqual(
+            imported_message_with_thumbnail.content, f"An [image](/user_uploads/{path_id})"
+        )
+        expected_rendered_preview = (
+            f'<p>An <a href="/user_uploads/{path_id}">image</a></p>\n'
+            f'<div class="message_inline_image"><a href="/user_uploads/{path_id}" title="image">'
+            f'<img data-original-dimensions="128x128" src="/user_uploads/thumbnail/{path_id}/840x560.webp"></a></div>'
+        )
+        self.assertEqual(
+            imported_message_with_thumbnail.rendered_content, expected_rendered_preview
+        )
+
    def test_import_message_edit_history(self) -> None:
        realm = get_realm("zulip")
        iago = self.example_user("iago")