From ab6b6639e60b508abf5d1962d7d85bcb3253ff4c Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Wed, 17 Jul 2024 16:14:17 +0000 Subject: [PATCH] migrations: Derive emoji content-type from the bytes. --- tools/semgrep-py.yml | 2 +- zerver/migrations/0553_copy_emoji_images.py | 63 ++++++++++++++------- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/tools/semgrep-py.yml b/tools/semgrep-py.yml index 4f812a658c..c0a5156063 100644 --- a/tools/semgrep-py.yml +++ b/tools/semgrep-py.yml @@ -54,7 +54,7 @@ rules: - id: dont-import-models-in-migrations patterns: - - pattern-not: from zerver.lib.mime_types import guess_type + - pattern-not: from zerver.lib.mime_types import $X - pattern-not: from zerver.lib.redis_utils import get_redis_client - pattern-not: from zerver.lib.utils import generate_api_key - pattern-not: from zerver.models.linkifiers import filter_pattern_validator diff --git a/zerver/migrations/0553_copy_emoji_images.py b/zerver/migrations/0553_copy_emoji_images.py index e4b63c6d95..3c27af5b1a 100644 --- a/zerver/migrations/0553_copy_emoji_images.py +++ b/zerver/migrations/0553_copy_emoji_images.py @@ -7,6 +7,7 @@ from typing import Any import boto3 import botocore +import magic import pyvips from botocore.client import Config from django.conf import settings @@ -14,7 +15,7 @@ from django.db import migrations from django.db.backends.base.schema import BaseDatabaseSchemaEditor from django.db.migrations.state import StateApps -from zerver.lib.mime_types import guess_extension, guess_type +from zerver.lib.mime_types import guess_extension # From zerver.lib.thumbnail DEFAULT_EMOJI_SIZE = 64 @@ -142,31 +143,50 @@ def thumbnail_local_emoji(apps: StateApps) -> None: if total_processed % 100 == 0: print(f"Processed {total_processed} custom emoji") + old_file_name = emoji.file_name try: - old_file_name = emoji.file_name - content_type = guess_type(old_file_name)[0] + base_path = os.path.join( + settings.LOCAL_AVATARS_DIR, str(emoji.realm_id), "emoji/images" + ) + copy_from_path = f"{base_path}/{old_file_name}.original" + if not os.path.exists(copy_from_path) and os.path.exists( + f"{base_path}/{old_file_name}" + ): + # Imports currently don't write ".original" files, so check without that + copy_from_path = f"{base_path}/{old_file_name}" + if not os.path.exists(copy_from_path): + raise SkipImageError("Failed to read .original file: Does not exist") + + with open(copy_from_path, "rb") as fh: + original_bytes = fh.read() + + # We used to accept any bytes which pillow could + # thumbnail, with any filename, and would use the + # guessed-from-filename content-type when serving the + # emoji. Examine the bytes of the image to verify that it + # is an image of reasonable type, and then derive the real + # filename extension (which we will still use for deriving + # content-type at serving time) from that. This ensures + # that the contents are a valid image, and that we put the + # right content-type on it when served -- the filename + # used for the initial upload becomes completely + # irrelevant. + content_type = magic.from_buffer(original_bytes[:1024], mime=True) + if content_type not in VALID_EMOJI_CONTENT_TYPE: raise SkipImageError(f"Invalid content-type: {content_type}") new_file_name = get_emoji_file_name(content_type, emoji.id) if old_file_name == new_file_name: continue - base_path = os.path.join( - settings.LOCAL_AVATARS_DIR, str(emoji.realm_id), "emoji/images" - ) + print(f"{base_path}/{old_file_name} -> {base_path}/{new_file_name}") try: if os.path.exists(f"{base_path}/{new_file_name}.original"): os.unlink(f"{base_path}/{new_file_name}.original") - from_file = f"{base_path}/{old_file_name}.original" - if not os.path.exists(from_file) and os.path.exists(f"{base_path}/{old_file_name}"): - # Imports currently don't write ".original" files, so check without that - from_file = f"{base_path}/{old_file_name}" - os.link(from_file, f"{base_path}/{new_file_name}.original") - with open(f"{base_path}/{new_file_name}.original", "rb") as fh: - original_bytes = fh.read() + os.link(copy_from_path, f"{base_path}/{new_file_name}.original") except OSError as e: - raise SkipImageError(f"Failed to read original file: {e}") + raise SkipImageError(f"Failed to update .original file: {e}") animated, still = resize_emoji(original_bytes, new_file_name) try: @@ -230,7 +250,6 @@ def thumbnail_s3(apps: StateApps) -> None: try: old_data = avatar_bucket.Object(copy_from_path).get() original_bytes = old_data["Body"].read() - content_type = old_data["ContentType"] except botocore.exceptions.ClientError: # Imports currently don't write ".original" files, so check without that try: @@ -239,10 +258,16 @@ def thumbnail_s3(apps: StateApps) -> None: except botocore.exceptions.ClientError as e: raise SkipImageError(f"Failed to read .original file: {e}") original_bytes = old_data["Body"].read() - # They also may have uploaded as "application/octet-stream", so guess the - # content-type from the filename. If we can't guess, then we'll hit the - # SkipImageError case right below this. - content_type = guess_type(old_file_name)[0] or "application/octet-stream" + + # We used to accept any bytes which pillow could + # thumbnail, with any filename, and would store the + # guessed-from-filename content-type in S3, to be used + # when serving the emoji. Examine the bytes of the image + # to verify that it is an image of reasonable type, and + # then both store that content-type in S3 (for later + # serving), as well as using it to derive the right + # filename extension (for clarity). + content_type = magic.from_buffer(original_bytes[:1024], mime=True) if content_type not in VALID_EMOJI_CONTENT_TYPE: raise SkipImageError(f"Invalid content-type: {content_type}")