mirror of https://github.com/zulip/zulip.git
thumbnail: Make thumbnailing work with data import.
We didn't have thumbnailing for images coming from data import and this
commit adds the functionality.
There are a few fundamental issues that the implementation needs to
solve.
1. The images come from an untrusted source and therefore we don't want
to just pass them through to thumbnailing without checking. For that
reason, we cannot just import ImageAttachment rows from the export
data, even for zulip=>zulip imports.
The right way to process images is to pass them to maybe_thumbail(),
which runs libvips_check_image() on them to verify we're okay with
thumbnailing, creates ImageAttachment rows for them and sends them
to the thumbnailing queue worker. This approach lets us handle both
zulip=>zulip and 3rd party=>zulip imports in the same way,
2. There is a somewhat circular dependency between the Message,
Attachment and ImageAttachment import process:
- ImageAttachments would ideally be created after importing
Attachments, but they need to already exist at the time of Message
import. Otherwise, the markdown processor doesn't know it has to add
HTML for image previews to messages that reference images. This would
mean that messages imported from 3rd party tools don't get image
previews.
- Attachments only get created after Message import however, due to the
many-to-many relationship between Message and Attachment.
This is solved by fixing up some data of Attachments pre-emptively, such
as the path_ids. This gives us the necessary information for creating
ImageAttachments before importing Messages.
While we generate ImageAttachment rows synchronously, the actual
thumbnailing job is sent to the queue worker. Theoretically, the worker
could be very backlogged and not process the thumbnails anytime soon.
This is fine - if the app is loaded and tries to display a message with
such a not-yet-generated thumbnail, the code in `serve_file` will
generate the thumbnails synchronously on the fly and the user will see
the image preview displayed normally. See:
1b47134d0d/zerver/views/upload.py (L333-L342)
This commit is contained in:
parent
a6b0385229
commit
da4443f392
|
@ -260,6 +260,9 @@ NON_EXPORTED_TABLES = {
|
||||||
"zerver_submessage",
|
"zerver_submessage",
|
||||||
# Drafts don't need to be exported as they are supposed to be more ephemeral.
|
# Drafts don't need to be exported as they are supposed to be more ephemeral.
|
||||||
"zerver_draft",
|
"zerver_draft",
|
||||||
|
# The importer cannot trust ImageAttachment objects anyway and needs to check
|
||||||
|
# and process images for thumbnailing on its own.
|
||||||
|
"zerver_imageattachment",
|
||||||
# For any tables listed below here, it's a bug that they are not present in the export.
|
# For any tables listed below here, it's a bug that they are not present in the export.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -8,6 +8,7 @@ from typing import Any
|
||||||
|
|
||||||
import bmemcached
|
import bmemcached
|
||||||
import orjson
|
import orjson
|
||||||
|
import pyvips
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.cache import cache
|
from django.core.cache import cache
|
||||||
|
@ -34,7 +35,7 @@ from zerver.lib.push_notifications import sends_notifications_directly
|
||||||
from zerver.lib.remote_server import maybe_enqueue_audit_log_upload
|
from zerver.lib.remote_server import maybe_enqueue_audit_log_upload
|
||||||
from zerver.lib.server_initialization import create_internal_realm, server_initialized
|
from zerver.lib.server_initialization import create_internal_realm, server_initialized
|
||||||
from zerver.lib.streams import render_stream_description
|
from zerver.lib.streams import render_stream_description
|
||||||
from zerver.lib.thumbnail import BadImageError
|
from zerver.lib.thumbnail import THUMBNAIL_ACCEPT_IMAGE_TYPES, BadImageError, maybe_thumbnail
|
||||||
from zerver.lib.timestamp import datetime_to_timestamp
|
from zerver.lib.timestamp import datetime_to_timestamp
|
||||||
from zerver.lib.upload import ensure_avatar_image, sanitize_name, upload_backend, upload_emoji_image
|
from zerver.lib.upload import ensure_avatar_image, sanitize_name, upload_backend, upload_emoji_image
|
||||||
from zerver.lib.upload.s3 import get_bucket
|
from zerver.lib.upload.s3 import get_bucket
|
||||||
|
@ -161,7 +162,16 @@ id_map_to_list: dict[str, dict[int, list[int]]] = {
|
||||||
}
|
}
|
||||||
|
|
||||||
path_maps: dict[str, dict[str, str]] = {
|
path_maps: dict[str, dict[str, str]] = {
|
||||||
"attachment_path": {},
|
# Maps original attachment path pre-import to the final, post-import
|
||||||
|
# attachment path.
|
||||||
|
"old_attachment_path_to_new_path": {},
|
||||||
|
# Inverse of old_attachment_path_to_new_path.
|
||||||
|
"new_attachment_path_to_old_path": {},
|
||||||
|
# Maps the new (post-import) attachment path to the absolute path to the file
|
||||||
|
# in the on-disk export data that we're importing.
|
||||||
|
# Allows code running after this is filled to access file contents
|
||||||
|
# without needing to go through S3 to get it.
|
||||||
|
"new_attachment_path_to_local_data_path": {},
|
||||||
}
|
}
|
||||||
|
|
||||||
message_id_to_attachments: dict[str, dict[int, list[str]]] = {
|
message_id_to_attachments: dict[str, dict[int, list[str]]] = {
|
||||||
|
@ -212,13 +222,12 @@ def fix_upload_links(data: TableData, message_table: TableName) -> None:
|
||||||
for message in data[message_table]:
|
for message in data[message_table]:
|
||||||
if message["has_attachment"] is True:
|
if message["has_attachment"] is True:
|
||||||
for attachment_path in message_id_to_attachments[message_table][message["id"]]:
|
for attachment_path in message_id_to_attachments[message_table][message["id"]]:
|
||||||
message["content"] = message["content"].replace(
|
old_path = path_maps["new_attachment_path_to_old_path"][attachment_path]
|
||||||
attachment_path, path_maps["attachment_path"][attachment_path]
|
message["content"] = message["content"].replace(old_path, attachment_path)
|
||||||
)
|
|
||||||
|
|
||||||
if message["rendered_content"]:
|
if message["rendered_content"]:
|
||||||
message["rendered_content"] = message["rendered_content"].replace(
|
message["rendered_content"] = message["rendered_content"].replace(
|
||||||
attachment_path, path_maps["attachment_path"][attachment_path]
|
old_path, attachment_path
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -976,7 +985,11 @@ def import_uploads(
|
||||||
relative_path = upload_backend.generate_message_upload_path(
|
relative_path = upload_backend.generate_message_upload_path(
|
||||||
str(record["realm_id"]), sanitize_name(os.path.basename(record["path"]))
|
str(record["realm_id"]), sanitize_name(os.path.basename(record["path"]))
|
||||||
)
|
)
|
||||||
path_maps["attachment_path"][record["s3_path"]] = relative_path
|
path_maps["old_attachment_path_to_new_path"][record["s3_path"]] = relative_path
|
||||||
|
path_maps["new_attachment_path_to_old_path"][relative_path] = record["s3_path"]
|
||||||
|
path_maps["new_attachment_path_to_local_data_path"][relative_path] = os.path.join(
|
||||||
|
import_dir, record["path"]
|
||||||
|
)
|
||||||
|
|
||||||
if s3_uploads:
|
if s3_uploads:
|
||||||
key = bucket.Object(relative_path)
|
key = bucket.Object(relative_path)
|
||||||
|
@ -1612,6 +1625,22 @@ def do_import_realm(import_dir: Path, subdomain: str, processes: int = 1) -> Rea
|
||||||
with open(attachments_file, "rb") as f:
|
with open(attachments_file, "rb") as f:
|
||||||
attachment_data = orjson.loads(f.read())
|
attachment_data = orjson.loads(f.read())
|
||||||
|
|
||||||
|
# We need to import ImageAttachments before messages, as the message rendering logic
|
||||||
|
# checks for existence of ImageAttachment records to determine if HTML content for image
|
||||||
|
# preview needs to be added to a message.
|
||||||
|
# In order for ImageAttachments to be correctly created, we need to know the new path_ids
|
||||||
|
# and content_types of the attachments.
|
||||||
|
#
|
||||||
|
# Begin by fixing up the Attachment data.
|
||||||
|
fix_attachments_data(attachment_data)
|
||||||
|
# Now we're ready create ImageAttachment rows and enqueue thumbnailing
|
||||||
|
# for the images.
|
||||||
|
# This order ensures that during message import, rendered_content will be generated
|
||||||
|
# correctly with image previews.
|
||||||
|
# The important detail here is that we **only** care about having ImageAttachment
|
||||||
|
# rows ready at the time of message import. Thumbnailing happens in a queue worker
|
||||||
|
# in a different process, and we don't care about when it'll complete.
|
||||||
|
create_image_attachments_and_maybe_enqueue_thumbnailing(realm, attachment_data)
|
||||||
map_messages_to_attachments(attachment_data)
|
map_messages_to_attachments(attachment_data)
|
||||||
|
|
||||||
# Import zerver_message and zerver_usermessage
|
# Import zerver_message and zerver_usermessage
|
||||||
|
@ -1930,10 +1959,6 @@ def import_attachments(data: TableData) -> None:
|
||||||
"scheduledmessage_id",
|
"scheduledmessage_id",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update 'path_id' for the attachments
|
|
||||||
for attachment in data[parent_db_table_name]:
|
|
||||||
attachment["path_id"] = path_maps["attachment_path"][attachment["path_id"]]
|
|
||||||
|
|
||||||
# Next, load the parent rows.
|
# Next, load the parent rows.
|
||||||
bulk_import_model(data, parent_model)
|
bulk_import_model(data, parent_model)
|
||||||
|
|
||||||
|
@ -1960,6 +1985,36 @@ def import_attachments(data: TableData) -> None:
|
||||||
logging.info("Successfully imported M2M table %s", m2m_table_name)
|
logging.info("Successfully imported M2M table %s", m2m_table_name)
|
||||||
|
|
||||||
|
|
||||||
|
def fix_attachments_data(attachment_data: TableData) -> None:
|
||||||
|
for attachment in attachment_data["zerver_attachment"]:
|
||||||
|
attachment["path_id"] = path_maps["old_attachment_path_to_new_path"][attachment["path_id"]]
|
||||||
|
|
||||||
|
# In the case of images, content_type needs to be set for thumbnailing.
|
||||||
|
# Zulip exports set this, but third-party exports may not.
|
||||||
|
if attachment.get("content_type") is None:
|
||||||
|
guessed_content_type = guess_type(attachment["path_id"])[0]
|
||||||
|
if guessed_content_type in THUMBNAIL_ACCEPT_IMAGE_TYPES:
|
||||||
|
attachment["content_type"] = guessed_content_type
|
||||||
|
|
||||||
|
|
||||||
|
def create_image_attachments_and_maybe_enqueue_thumbnailing(
|
||||||
|
realm: Realm, attachment_data: TableData
|
||||||
|
) -> None:
|
||||||
|
for attachment in attachment_data["zerver_attachment"]:
|
||||||
|
if attachment["content_type"] not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
|
||||||
|
continue
|
||||||
|
|
||||||
|
path_id = attachment["path_id"]
|
||||||
|
content_type = attachment["content_type"]
|
||||||
|
|
||||||
|
# We don't have to go to S3 to obtain the file. We still have the export
|
||||||
|
# data on disk and stored the absolute path to it.
|
||||||
|
local_filename = path_maps["new_attachment_path_to_local_data_path"][path_id]
|
||||||
|
pyvips_source = pyvips.Source.new_from_file(local_filename)
|
||||||
|
maybe_thumbnail(pyvips_source, content_type, path_id, realm.id)
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
def import_analytics_data(realm: Realm, import_dir: Path, crossrealm_user_ids: set[int]) -> None:
|
def import_analytics_data(realm: Realm, import_dir: Path, crossrealm_user_ids: set[int]) -> None:
|
||||||
analytics_filename = os.path.join(import_dir, "analytics.json")
|
analytics_filename = os.path.join(import_dir, "analytics.json")
|
||||||
if not os.path.exists(analytics_filename):
|
if not os.path.exists(analytics_filename):
|
||||||
|
|
|
@ -14,7 +14,7 @@ from typing_extensions import override
|
||||||
|
|
||||||
from zerver.lib.exceptions import ErrorCode, JsonableError
|
from zerver.lib.exceptions import ErrorCode, JsonableError
|
||||||
from zerver.lib.queue import queue_event_on_commit
|
from zerver.lib.queue import queue_event_on_commit
|
||||||
from zerver.models import AbstractAttachment, ImageAttachment
|
from zerver.models import ImageAttachment
|
||||||
|
|
||||||
DEFAULT_AVATAR_SIZE = 100
|
DEFAULT_AVATAR_SIZE = 100
|
||||||
MEDIUM_AVATAR_SIZE = 500
|
MEDIUM_AVATAR_SIZE = 500
|
||||||
|
@ -279,9 +279,9 @@ def missing_thumbnails(image_attachment: ImageAttachment) -> list[ThumbnailForma
|
||||||
|
|
||||||
|
|
||||||
def maybe_thumbnail(
|
def maybe_thumbnail(
|
||||||
attachment: AbstractAttachment, content: bytes | pyvips.Source
|
content: bytes | pyvips.Source, content_type: str | None, path_id: str, realm_id: int
|
||||||
) -> ImageAttachment | None:
|
) -> ImageAttachment | None:
|
||||||
if attachment.content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
|
if content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES:
|
||||||
# If it doesn't self-report as an image file that we might want
|
# If it doesn't self-report as an image file that we might want
|
||||||
# to thumbnail, don't parse the bytes at all.
|
# to thumbnail, don't parse the bytes at all.
|
||||||
return None
|
return None
|
||||||
|
@ -301,8 +301,8 @@ def maybe_thumbnail(
|
||||||
(width, height) = (image.width, image.height)
|
(width, height) = (image.width, image.height)
|
||||||
|
|
||||||
image_row = ImageAttachment.objects.create(
|
image_row = ImageAttachment.objects.create(
|
||||||
realm_id=attachment.realm_id,
|
realm_id=realm_id,
|
||||||
path_id=attachment.path_id,
|
path_id=path_id,
|
||||||
original_width_px=width,
|
original_width_px=width,
|
||||||
original_height_px=height,
|
original_height_px=height,
|
||||||
frames=image.get_n_pages(),
|
frames=image.get_n_pages(),
|
||||||
|
|
|
@ -70,7 +70,7 @@ def create_attachment(
|
||||||
size=file_size,
|
size=file_size,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
)
|
)
|
||||||
maybe_thumbnail(attachment, file_real_data)
|
maybe_thumbnail(file_real_data, content_type, path_id, realm.id)
|
||||||
from zerver.actions.uploads import notify_attachment_update
|
from zerver.actions.uploads import notify_attachment_update
|
||||||
|
|
||||||
notify_attachment_update(user_profile, "add", attachment.to_dict())
|
notify_attachment_update(user_profile, "add", attachment.to_dict())
|
||||||
|
|
|
@ -93,6 +93,7 @@ from zerver.models import (
|
||||||
)
|
)
|
||||||
from zerver.models.clients import get_client
|
from zerver.models.clients import get_client
|
||||||
from zerver.models.groups import SystemGroups
|
from zerver.models.groups import SystemGroups
|
||||||
|
from zerver.models.messages import ImageAttachment
|
||||||
from zerver.models.presence import PresenceSequence
|
from zerver.models.presence import PresenceSequence
|
||||||
from zerver.models.realm_audit_logs import AuditLogEventType
|
from zerver.models.realm_audit_logs import AuditLogEventType
|
||||||
from zerver.models.realms import get_realm
|
from zerver.models.realms import get_realm
|
||||||
|
@ -335,6 +336,9 @@ class ExportFile(ZulipTestCase):
|
||||||
|
|
||||||
|
|
||||||
class RealmImportExportTest(ExportFile):
|
class RealmImportExportTest(ExportFile):
|
||||||
|
def create_user_and_login(self, email: str, realm: Realm) -> None:
|
||||||
|
self.register(email, "test", subdomain=realm.subdomain)
|
||||||
|
|
||||||
def export_realm(
|
def export_realm(
|
||||||
self,
|
self,
|
||||||
realm: Realm,
|
realm: Realm,
|
||||||
|
@ -811,6 +815,32 @@ class RealmImportExportTest(ExportFile):
|
||||||
)
|
)
|
||||||
self.assertEqual(realm_emoji.name, "hawaii")
|
self.assertEqual(realm_emoji.name, "hawaii")
|
||||||
|
|
||||||
|
# We want to set up some image data to verify image attachment thumbnailing works correctly
|
||||||
|
# in the import.
|
||||||
|
# We'll create a new user to use as the sender of the messages with such images,
|
||||||
|
# so that we can easily find them after importing - by fetching messages sent
|
||||||
|
# by the thumbnailing_test_user_email account.
|
||||||
|
thumbnailing_test_user_email = "thumbnailing_test@zulip.com"
|
||||||
|
self.create_user_and_login(thumbnailing_test_user_email, original_realm)
|
||||||
|
thumbnailing_test_user = get_user_by_delivery_email(
|
||||||
|
thumbnailing_test_user_email, original_realm
|
||||||
|
)
|
||||||
|
|
||||||
|
# Send a message with the image. After the import, we'll verify that this message
|
||||||
|
# and the associated ImageAttachment have been created correctly.
|
||||||
|
image_path_id = self.upload_and_thumbnail_image("img.png")
|
||||||
|
self.send_stream_message(
|
||||||
|
sender=thumbnailing_test_user,
|
||||||
|
stream_name="Verona",
|
||||||
|
content=f"An [image](/user_uploads/{image_path_id})",
|
||||||
|
)
|
||||||
|
image_attachment = ImageAttachment.objects.get(path_id=image_path_id)
|
||||||
|
# Malform some ImageAttachment info. These shouldn't get exported (and certainly not imported!)
|
||||||
|
# anyway, so we can test that this misinformation doesn't make its way into the imported realm.
|
||||||
|
image_attachment.original_width_px = 9999
|
||||||
|
image_attachment.original_height_px = 9999
|
||||||
|
image_attachment.save()
|
||||||
|
|
||||||
# Deactivate a user to ensure such a case is covered.
|
# Deactivate a user to ensure such a case is covered.
|
||||||
do_deactivate_user(self.example_user("aaron"), acting_user=None)
|
do_deactivate_user(self.example_user("aaron"), acting_user=None)
|
||||||
|
|
||||||
|
@ -1003,7 +1033,14 @@ class RealmImportExportTest(ExportFile):
|
||||||
|
|
||||||
self.export_realm(original_realm, export_type=RealmExport.EXPORT_FULL_WITHOUT_CONSENT)
|
self.export_realm(original_realm, export_type=RealmExport.EXPORT_FULL_WITHOUT_CONSENT)
|
||||||
|
|
||||||
with self.settings(BILLING_ENABLED=False), self.assertLogs(level="INFO"):
|
with (
|
||||||
|
self.settings(BILLING_ENABLED=False),
|
||||||
|
self.assertLogs(level="INFO"),
|
||||||
|
# With captureOnCommitCallbacks we ensure that tasks delegated to the queue workers
|
||||||
|
# are executed immediately. We use this to make thumbnailing runs in the import
|
||||||
|
# process in this test.
|
||||||
|
self.captureOnCommitCallbacks(execute=True),
|
||||||
|
):
|
||||||
do_import_realm(get_output_dir(), "test-zulip")
|
do_import_realm(get_output_dir(), "test-zulip")
|
||||||
|
|
||||||
# Make sure our export/import didn't somehow leak info into the
|
# Make sure our export/import didn't somehow leak info into the
|
||||||
|
@ -1164,6 +1201,48 @@ class RealmImportExportTest(ExportFile):
|
||||||
Message.objects.filter(realm=imported_realm).count(),
|
Message.objects.filter(realm=imported_realm).count(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Verify thumbnailing.
|
||||||
|
imported_thumbnailing_test_user = get_user_by_delivery_email(
|
||||||
|
thumbnailing_test_user_email, imported_realm
|
||||||
|
)
|
||||||
|
imported_messages_with_thumbnail = Message.objects.filter(
|
||||||
|
sender=imported_thumbnailing_test_user, realm=imported_realm
|
||||||
|
)
|
||||||
|
imported_message_with_thumbnail = imported_messages_with_thumbnail.latest("id")
|
||||||
|
attachment_with_thumbnail = Attachment.objects.get(
|
||||||
|
owner=imported_thumbnailing_test_user, messages=imported_message_with_thumbnail
|
||||||
|
)
|
||||||
|
|
||||||
|
path_id = attachment_with_thumbnail.path_id
|
||||||
|
# An ImageAttachment has been created in the import process.
|
||||||
|
imported_image_attachment = ImageAttachment.objects.get(
|
||||||
|
path_id=path_id, realm=imported_realm
|
||||||
|
)
|
||||||
|
|
||||||
|
# It figured out the dimensions correctly and didn't inherit the bad data in the
|
||||||
|
# original ImageAttachment.
|
||||||
|
self.assertEqual(imported_image_attachment.original_width_px, 128)
|
||||||
|
self.assertEqual(imported_image_attachment.original_height_px, 128)
|
||||||
|
# ImageAttachment.thumbnail_metadata contains information about thumbnails that actually
|
||||||
|
# got generated. By asserting it's not empty, we make sure thumbnailing ran for the image
|
||||||
|
# and that we didn't merely create the ImageAttachment row in the database.
|
||||||
|
self.assertNotEqual(len(imported_image_attachment.thumbnail_metadata), 0)
|
||||||
|
self.assertTrue(imported_image_attachment.thumbnail_metadata[0])
|
||||||
|
|
||||||
|
# Content and rendered_content got updated correctly, to point to the correct, new path_id
|
||||||
|
# and include the HTML for image preview using the thumbnail.
|
||||||
|
self.assertEqual(
|
||||||
|
imported_message_with_thumbnail.content, f"An [image](/user_uploads/{path_id})"
|
||||||
|
)
|
||||||
|
expected_rendered_preview = (
|
||||||
|
f'<p>An <a href="/user_uploads/{path_id}">image</a></p>\n'
|
||||||
|
f'<div class="message_inline_image"><a href="/user_uploads/{path_id}" title="image">'
|
||||||
|
f'<img data-original-dimensions="128x128" src="/user_uploads/thumbnail/{path_id}/840x560.webp"></a></div>'
|
||||||
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
imported_message_with_thumbnail.rendered_content, expected_rendered_preview
|
||||||
|
)
|
||||||
|
|
||||||
def test_import_message_edit_history(self) -> None:
|
def test_import_message_edit_history(self) -> None:
|
||||||
realm = get_realm("zulip")
|
realm = get_realm("zulip")
|
||||||
iago = self.example_user("iago")
|
iago = self.example_user("iago")
|
||||||
|
|
Loading…
Reference in New Issue