From 9cad9644e7a93c3a00f7423124fda9a4bc554735 Mon Sep 17 00:00:00 2001 From: Mateusz Mandera Date: Tue, 6 Aug 2024 23:15:24 +0200 Subject: [PATCH] import_util: Fix file extensions of emoji files downloaded from Slack. When returning emoji records, the Slack API returns the record for its thumbsup_all emoji with a URL ending in .png, even though the file is a GIF. For that reason, the import code has to correct file extensions based on the response Content-Type. Emojis are the smallest set of images to download, so for simplicity of implementation, we remove the parallelization of the downloads in favor of just processing them serially. --- zerver/data_import/import_util.py | 41 +++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/zerver/data_import/import_util.py b/zerver/data_import/import_util.py index b0cdeada91..4ece5d4199 100644 --- a/zerver/data_import/import_util.py +++ b/zerver/data_import/import_util.py @@ -15,8 +15,11 @@ from django.utils.timezone import now as timezone_now from zerver.data_import.sequencer import NEXT_ID from zerver.lib.avatar_hash import user_avatar_base_path_from_ids +from zerver.lib.mime_types import guess_extension from zerver.lib.partial import partial from zerver.lib.stream_color import STREAM_ASSIGNMENT_COLORS as STREAM_COLORS +from zerver.lib.thumbnail import THUMBNAIL_ACCEPT_IMAGE_TYPES, BadImageError +from zerver.lib.upload.base import INLINE_MIME_TYPES from zerver.models import ( Attachment, DirectMessageGroup, @@ -685,9 +688,7 @@ def build_realm_emoji(realm_id: int, name: str, id: int, file_name: str) -> Zerv ) -def get_emojis(emoji_dir: str, upload: list[str]) -> None: - emoji_url = upload[0] - emoji_path = upload[1] +def get_emojis(emoji_dir: str, emoji_url: str, emoji_path: str) -> str | None: upload_emoji_path = os.path.join(emoji_dir, emoji_path) response = requests.get(emoji_url, stream=True) @@ -695,6 +696,8 @@ def get_emojis(emoji_dir: str, upload: list[str]) -> 
None: with open(upload_emoji_path, "wb") as emoji_file: shutil.copyfileobj(response.raw, emoji_file) + return response.headers.get("Content-Type") + def process_emojis( zerver_realmemoji: list[ZerverFieldsT], @@ -711,7 +714,6 @@ def process_emojis( 3. emoji_url_map: Maps emoji name to its url """ emoji_records = [] - upload_emoji_list = [] logging.info("######### GETTING EMOJIS #########\n") logging.info("DOWNLOADING EMOJIS .......\n") for emoji in zerver_realmemoji: @@ -720,8 +722,6 @@ def process_emojis( realm_id=emoji["realm"], emoji_file_name=emoji["name"] ) - upload_emoji_list.append([emoji_url, emoji_path]) - emoji_record = dict(emoji) emoji_record["path"] = emoji_path emoji_record["s3_path"] = emoji_path @@ -730,8 +730,33 @@ def process_emojis( emoji_records.append(emoji_record) - # Run downloads in parallel - run_parallel_wrapper(partial(get_emojis, emoji_dir), upload_emoji_list, threads=threads) + # Directly download the emoji and patch the file_name with the correct extension + # based on the content-type returned by the server. This is needed because Slack + # sometimes returns an emoji url with .png extension despite the file being a gif. + content_type = get_emojis(emoji_dir, emoji_url, emoji_path) + if content_type is None: + logging.warning( + "Emoji %s has an unspecified content type. Using the original file extension.", + emoji["name"], + ) + continue + + if ( + content_type not in THUMBNAIL_ACCEPT_IMAGE_TYPES + or content_type not in INLINE_MIME_TYPES + ): + raise BadImageError( + f"Emoji {emoji['name']} is not an image file. Content type: {content_type}" + ) + + file_extension = guess_extension(content_type, strict=False) + assert file_extension is not None + + old_file_name = emoji_record["file_name"] + new_file_name = f"{old_file_name.rsplit('.', 1)[0]}{file_extension}" + + emoji_record["file_name"] = new_file_name + emoji["file_name"] = new_file_name logging.info("######### GETTING EMOJIS FINISHED #########\n") return emoji_records