zulip/zerver/lib/upload/s3.py

import logging
import os
import secrets
from collections.abc import Callable, Iterator
from datetime import datetime
from typing import IO, TYPE_CHECKING, Any, Literal
from urllib.parse import urljoin, urlsplit, urlunsplit

import botocore
import pyvips
from botocore.client import Config
from botocore.response import StreamingBody
from django.conf import settings
from django.utils.http import content_disposition_header
from typing_extensions import override

from zerver.lib.partial import partial
from zerver.lib.thumbnail import resize_logo, resize_realm_icon
from zerver.lib.upload.base import INLINE_MIME_TYPES, StreamingSourceWithSize, ZulipUploadBackend
from zerver.models import Realm, RealmEmoji, UserProfile

if TYPE_CHECKING:
    from mypy_boto3_s3.client import S3Client
    from mypy_boto3_s3.service_resource import Bucket, Object

# Duration, in seconds, for which the signed upload URLs that we
# redirect to when accessing uploaded files remain valid for clients
# to fetch before they expire.
SIGNED_UPLOAD_URL_DURATION = 60

# Performance note:
#
# For writing files to S3, the file could either be stored in RAM
# (if it is less than 2.5MiB or so) or an actual temporary file on disk.
#
# Because we set FILE_UPLOAD_MAX_MEMORY_SIZE to 0, only the latter case
# should occur in practice.
#
# This is great, because passing the pseudofile object that Django gives
# you to boto would be a pain.

# To come up with an S3 key, we randomly generate a "directory". The
# "file name" is the original filename provided by the user, run
# through a sanitization function.


# https://github.com/boto/botocore/issues/2644 means that the IMDS
# request _always_ pulls from the environment.  Monkey-patch the
# `should_bypass_proxies` function if we need to skip them, based
# on S3_SKIP_PROXY.
if settings.S3_SKIP_PROXY is True:  # nocoverage
    botocore.utils.should_bypass_proxies = lambda url: True


def get_bucket(bucket_name: str, authed: bool = True) -> "Bucket":
    """Build a boto3 ``Bucket`` resource for ``bucket_name``.

    With ``authed`` set, the configured S3 key pair is supplied and no
    explicit signature version is requested.  Without it, no credentials
    are passed and request signing is disabled via ``botocore.UNSIGNED``.
    Region, endpoint and addressing style always come from settings.
    """
    import boto3

    if authed:
        access_key = settings.S3_KEY
        secret_key = settings.S3_SECRET_KEY
        signature_version = None
    else:
        access_key = None
        secret_key = None
        signature_version = botocore.UNSIGNED

    client_config = Config(
        signature_version=signature_version,
        s3={"addressing_style": settings.S3_ADDRESSING_STYLE},
    )
    s3_resource = boto3.resource(
        "s3",
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        region_name=settings.S3_REGION,
        endpoint_url=settings.S3_ENDPOINT_URL,
        config=client_config,
    )
    return s3_resource.Bucket(bucket_name)


def upload_content_to_s3(
    bucket: "Bucket",
    path: str,
    content_type: str | None,
    user_profile: UserProfile | None,
    contents: bytes,
    *,
    storage_class: Literal[
        "GLACIER_IR",
        "INTELLIGENT_TIERING",
        "ONEZONE_IA",
        "REDUCED_REDUNDANCY",
        "STANDARD",
        "STANDARD_IA",
    ] = "STANDARD",
    cache_control: str | None = None,
    extra_metadata: dict[str, str] | None = None,
    filename: str | None = None,
) -> None:
    """Write ``contents`` to ``path`` in ``bucket`` with Zulip's object properties.

    Args:
        bucket: Bucket resource that receives the object.
        path: Key of the object inside the bucket.
        content_type: MIME type to store; ``None`` is stored as an empty string.
        user_profile: When given, its id and realm id are recorded in the
            object's metadata.
        contents: Raw bytes to store.
        storage_class: S3 storage class for the object.
        cache_control: Optional ``Cache-Control`` value to store on the object.
        extra_metadata: Additional metadata entries; these override the
            user-derived entries on key collisions.
        filename: Optional filename to embed in the ``Content-Disposition``
            value.
    """
    # Note that these steps are also replicated in
    # handle_upload_pre_finish_hook in zerver.views.tus, to update
    # properties for files uploaded via TUS.

    key = bucket.Object(path)
    metadata: dict[str, str] = {}
    if user_profile:
        metadata["user_profile_id"] = str(user_profile.id)
        metadata["realm_id"] = str(user_profile.realm_id)
    if extra_metadata is not None:
        metadata.update(extra_metadata)

    if content_type is None:  # nocoverage
        content_type = ""
    # Anything that is not explicitly allowed to render inline is
    # served as a download.
    is_attachment = content_type not in INLINE_MIME_TYPES

    content_disposition: str | None = None
    if filename is not None:
        # content_disposition_header returns None for an inline file
        # with an empty filename; passing None through to put() is not
        # a valid ContentDisposition, so such a value is omitted below.
        content_disposition = content_disposition_header(is_attachment, filename)
    elif is_attachment:
        content_disposition = "attachment"

    extras: dict[str, str] = {}
    if content_disposition is not None:
        extras["ContentDisposition"] = content_disposition
    if cache_control is not None:
        extras["CacheControl"] = cache_control

    key.put(
        Body=contents,
        Metadata=metadata,
        ContentType=content_type,
        StorageClass=storage_class,
        **extras,  # type: ignore[arg-type] # The dynamic kwargs here confuse mypy.
    )


# Module-level cache for the S3 client; starts out as None and is
# filled in by get_boto_client() the first time it is called.
BOTO_CLIENT: "S3Client | None" = None


def get_boto_client() -> "S3Client":
    """
    Return the shared S3 client, building it on first use.

    Creating the client takes a long time, so the instance is stored in
    the module-level BOTO_CLIENT and handed back on every later call.
    """
    global BOTO_CLIENT
    if BOTO_CLIENT is not None:
        return BOTO_CLIENT

    uploads_bucket = get_bucket(settings.S3_AUTH_UPLOADS_BUCKET)
    BOTO_CLIENT = uploads_bucket.meta.client
    return BOTO_CLIENT


def get_signed_upload_url(path: str, filename: str, force_download: bool = False) -> str:
    """Return a presigned GET URL for ``path`` in the authenticated uploads bucket.

    The URL is valid for SIGNED_UPLOAD_URL_DURATION.  With
    ``force_download`` set, the response's Content-Disposition is
    overridden to an attachment disposition built from ``filename``.
    """
    request_params = {"Bucket": settings.S3_AUTH_UPLOADS_BUCKET, "Key": path}
    if force_download:
        disposition = content_disposition_header(True, filename)
        request_params["ResponseContentDisposition"] = disposition or "attachment"

    s3_client = get_boto_client()
    return s3_client.generate_presigned_url(
        ClientMethod="get_object",
        Params=request_params,
        ExpiresIn=SIGNED_UPLOAD_URL_DURATION,
        HttpMethod="GET",
    )


class S3UploadBackend(ZulipUploadBackend):
    def __init__(self) -> None:
        """Set up the bucket handles and the cached public URL base.

        The export bucket is optional; it stays None unless
        settings.S3_EXPORT_BUCKET is set.
        """
        from mypy_boto3_s3.service_resource import Bucket

        self.avatar_bucket = get_bucket(settings.S3_AVATAR_BUCKET)
        self.uploads_bucket = get_bucket(settings.S3_AUTH_UPLOADS_BUCKET)
        self.export_bucket: Bucket | None = (
            get_bucket(settings.S3_EXPORT_BUCKET) if settings.S3_EXPORT_BUCKET else None
        )

        # Computed once here; see construct_public_upload_url_base for why.
        self.public_upload_url_base = self.construct_public_upload_url_base()

    def delete_file_from_s3(self, path_id: str, bucket: "Bucket") -> bool:
        key = bucket.Object(path_id)

        try:
            key.load()
        except botocore.exceptions.ClientError:
            file_name = path_id.split("/")[-1]
            logging.warning(
                "%s does not exist. Its entry in the database will be removed.", file_name
            )
            return False
        key.delete()
        return True

    def construct_public_upload_url_base(self) -> str:
        """Return the URL prefix under which avatar-bucket keys are publicly reachable."""
        # For Amazon S3 itself, the public URL for a key looks like:
        #     f"https://{self.avatar_bucket.name}.{network_location}/{key}"
        #
        # Other S3-style backends that Zulip supports can use a
        # different URL format, so the pattern is not hard-coded.
        # A client configured with no signature and no access key makes
        # `generate_presigned_url` return the plain public URL for a key.
        #
        # Calling generate_presigned_url costs about 2ms per query, and a
        # single `GET /messages` request may need hundreds of avatar
        # URLs, so the URL pattern is back-computed once here instead.

        # S3_AVATAR_PUBLIC_URL_PREFIX overrides the computed prefix, for
        # instance when a CloudFront distribution is used.
        configured_prefix = settings.S3_AVATAR_PUBLIC_URL_PREFIX
        if configured_prefix is not None:
            if configured_prefix.endswith("/"):
                return configured_prefix
            return configured_prefix + "/"

        placeholder_key = "dummy_key_ignored"
        bucket_name = self.avatar_bucket.name

        # self.avatar_bucket.meta.client is auth'd; we want only the
        # direct unauthed endpoint here, so a separate client is built.
        unauthed_client = get_bucket(bucket_name, authed=False).meta.client
        placeholder_url = unauthed_client.generate_presigned_url(
            ClientMethod="get_object",
            Params={"Bucket": bucket_name, "Key": placeholder_key},
            ExpiresIn=0,
        )

        parsed = urlsplit(placeholder_url)
        assert parsed.path.endswith(f"/{placeholder_key}")

        # Keep scheme, host and the path up to the key; drop query and fragment.
        base_path = parsed.path.removesuffix(placeholder_key)
        return urlunsplit((parsed.scheme, parsed.netloc, base_path, "", ""))

    @override
    def get_public_upload_root_url(self) -> str:
        return self.public_upload_url_base

    def get_public_upload_url(
        self,
        key: str,
    ) -> str:
        assert not key.startswith("/")
        return urljoin(self.public_upload_url_base, key)

    @override
    def generate_message_upload_path(self, realm_id: str, sanitized_file_name: str) -> str:
        return "/".join(
            [
                realm_id,
                secrets.token_urlsafe(18),
                sanitized_file_name,
            ]
        )

    @override
    def upload_message_attachment(
        self,
        path_id: str,
        filename: str,
        content_type: str,
        file_data: bytes,
        user_profile: UserProfile | None,
    ) -> None:
        """Store a message attachment in the uploads bucket under ``path_id``.

        The object uses the storage class from
        settings.S3_UPLOADS_STORAGE_CLASS, and ``filename`` is passed
        along for the Content-Disposition value.
        """
        upload_content_to_s3(
            bucket=self.uploads_bucket,
            path=path_id,
            content_type=content_type,
            user_profile=user_profile,
            contents=file_data,
            filename=filename,
            storage_class=settings.S3_UPLOADS_STORAGE_CLASS,
        )

    @override
    def save_attachment_contents(self, path_id: str, filehandle: IO[bytes]) -> None:
        for chunk in self.uploads_bucket.Object(path_id).get()["Body"]:
            filehandle.write(chunk)

    @override
    def attachment_vips_source(self, path_id: str) -> StreamingSourceWithSize:
        """Wrap the attachment at ``path_id`` as a custom pyvips source.

        Reads requested on the source are served from the S3 response
        body; the reported size is the response's ContentLength.
        """
        s3_response = self.uploads_bucket.Object(path_id).get()

        def read_chunk(body: StreamingBody, size: int) -> bytes:
            return body.read(amt=size)

        vips_source: pyvips.Source = pyvips.SourceCustom()
        # Bind the response body so the read handler only takes the size.
        vips_source.on_read(partial(read_chunk, s3_response["Body"]))
        return StreamingSourceWithSize(size=s3_response["ContentLength"], source=vips_source)

    @override
    def delete_message_attachment(self, path_id: str) -> bool:
        return self.delete_file_from_s3(path_id, self.uploads_bucket)

    @override
    def delete_message_attachments(self, path_ids: list[str]) -> None:
        self.uploads_bucket.delete_objects(
            Delete={"Objects": [{"Key": path_id} for path_id in path_ids]}
        )

    @override
    def all_message_attachments(
        self,
        include_thumbnails: bool = False,
        prefix: str = "",
    ) -> Iterator[tuple[str, datetime]]:
        client = self.uploads_bucket.meta.client
        paginator = client.get_paginator("list_objects_v2")
        page_iterator = paginator.paginate(Bucket=self.uploads_bucket.name, Prefix=prefix)

        for page in page_iterator:
            if page["KeyCount"] > 0:
                for item in page["Contents"]:
                    if not include_thumbnails and item["Key"].startswith("thumbnail/"):
                        continue
                    yield (
                        item["Key"],
                        item["LastModified"],
                    )

    @override
    def get_avatar_url(self, hash_key: str, medium: bool = False) -> str:
        return self.get_public_upload_url(self.get_avatar_path(hash_key, medium))

    @override
    def get_avatar_contents(self, file_path: str) -> tuple[bytes, str]:
        key = self.avatar_bucket.Object(file_path + ".original")
        image_data = key.get()["Body"].read()
        content_type = key.content_type
        return image_data, content_type

    @override
    def upload_single_avatar_image(
        self,
        file_path: str,
        *,
        user_profile: UserProfile,
        image_data: bytes,
        content_type: str | None,
        future: bool = True,
    ) -> None:
        """Upload one avatar image to the avatar bucket at ``file_path``.

        The object's metadata records an ``avatar_version``: the user's
        current version, plus one when ``future`` is set.
        """
        recorded_version = user_profile.avatar_version
        if future:
            recorded_version += 1

        upload_content_to_s3(
            self.avatar_bucket,
            file_path,
            content_type,
            user_profile,
            image_data,
            extra_metadata={"avatar_version": str(recorded_version)},
            cache_control="public, max-age=31536000, immutable",
        )

    @override
    def delete_avatar_image(self, path_id: str) -> None:
        self.delete_file_from_s3(path_id + ".original", self.avatar_bucket)
        self.delete_file_from_s3(self.get_avatar_path(path_id, True), self.avatar_bucket)
        self.delete_file_from_s3(self.get_avatar_path(path_id, False), self.avatar_bucket)

    @override
    def get_realm_icon_url(self, realm_id: int, version: int) -> str:
        public_url = self.get_public_upload_url(f"{realm_id}/realm/icon.png")
        return public_url + f"?version={version}"

    @override
    def upload_realm_icon_image(
        self, icon_file: IO[bytes], user_profile: UserProfile, content_type: str
    ) -> None:
        """Store a realm icon in the avatar bucket.

        Two objects are written under the realm's avatar-and-logo path:
        ``icon.original`` with the uploaded bytes and content type, and
        ``icon.png`` with the output of resize_realm_icon.
        """
        realm_directory = self.realm_avatar_and_logo_path(user_profile.realm)
        icon_key_base = os.path.join(realm_directory, "icon")
        original_bytes = icon_file.read()

        # The unmodified upload is written first.
        upload_content_to_s3(
            bucket=self.avatar_bucket,
            path=icon_key_base + ".original",
            content_type=content_type,
            user_profile=user_profile,
            contents=original_bytes,
        )

        # Then the resized copy, stored as PNG.
        upload_content_to_s3(
            bucket=self.avatar_bucket,
            path=icon_key_base + ".png",
            content_type="image/png",
            user_profile=user_profile,
            contents=resize_realm_icon(original_bytes),
        )
        # See avatar_url in avatar.py for URL.  (That code also handles the case
        # that users use gravatar.)

    @override
    def get_realm_logo_url(self, realm_id: int, version: int, night: bool) -> str:
        if not night:
            file_name = "logo.png"
        else:
            file_name = "night_logo.png"
        public_url = self.get_public_upload_url(f"{realm_id}/realm/{file_name}")
        return public_url + f"?version={version}"

    @override
    def upload_realm_logo_image(
        self, logo_file: IO[bytes], user_profile: UserProfile, night: bool, content_type: str
    ) -> None:
        """Store a realm logo (day or night variant) in the avatar bucket.

        Two objects are written under the realm's avatar-and-logo path,
        named ``logo`` or ``night_logo`` depending on ``night``: a
        ``.original`` with the uploaded bytes and content type, and a
        ``.png`` with the output of resize_logo.
        """
        basename = "night_logo" if night else "logo"
        realm_directory = self.realm_avatar_and_logo_path(user_profile.realm)
        logo_key_base = os.path.join(realm_directory, basename)
        original_bytes = logo_file.read()

        # The unmodified upload is written first.
        upload_content_to_s3(
            bucket=self.avatar_bucket,
            path=logo_key_base + ".original",
            content_type=content_type,
            user_profile=user_profile,
            contents=original_bytes,
        )

        # Then the resized copy, stored as PNG.
        upload_content_to_s3(
            bucket=self.avatar_bucket,
            path=logo_key_base + ".png",
            content_type="image/png",
            user_profile=user_profile,
            contents=resize_logo(original_bytes),
        )
        # See avatar_url in avatar.py for URL.  (That code also handles the case
        # that users use gravatar.)

    @override
    def get_emoji_url(self, emoji_file_name: str, realm_id: int, still: bool = False) -> str:
        """Return the public URL of a realm emoji, or of its still version when ``still`` is set."""
        if not still:
            emoji_path = RealmEmoji.PATH_ID_TEMPLATE.format(
                realm_id=realm_id, emoji_file_name=emoji_file_name
            )
        else:
            # The still path is keyed on the file name without its extension.
            name_without_extension, _extension = os.path.splitext(emoji_file_name)
            emoji_path = RealmEmoji.STILL_PATH_ID_TEMPLATE.format(
                realm_id=realm_id,
                emoji_filename_without_extension=name_without_extension,
            )
        return self.get_public_upload_url(emoji_path)

    @override
    def upload_single_emoji_image(
        self, path: str, content_type: str | None, user_profile: UserProfile, image_data: bytes
    ) -> None:
        """Upload one emoji image to the avatar bucket at ``path`` with a long-lived cache header."""
        upload_content_to_s3(
            bucket=self.avatar_bucket,
            path=path,
            content_type=content_type,
            user_profile=user_profile,
            contents=image_data,
            cache_control="public, max-age=31536000, immutable",
        )

    @override
    def get_export_tarball_url(self, realm: Realm, export_path: str) -> str:
        export_path = export_path.removeprefix("/")
        if self.export_bucket:
            # Fix old data if the row was created when an export bucket was not in use.
            export_path = export_path.removeprefix("exports/")
            client = self.export_bucket.meta.client
            return client.generate_presigned_url(
                ClientMethod="get_object",
                Params={
                    "Bucket": self.export_bucket.name,
                    "Key": export_path,
                },
                # Expires in one week, the longest allowed by AWS
                ExpiresIn=60 * 60 * 24 * 7,
            )
        else:
            if not export_path.startswith("exports/"):
                export_path = "exports/" + export_path
            client = self.avatar_bucket.meta.client
            signed_url = client.generate_presigned_url(
                ClientMethod="get_object",
                Params={
                    "Bucket": self.avatar_bucket.name,
                    "Key": export_path,
                },
                ExpiresIn=0,
            )
            # Strip off the signing query parameters, since this URL is public
            return urlsplit(signed_url)._replace(query="").geturl()

    def export_object(self, tarball_path: str) -> "Object":
        if self.export_bucket:
            return self.export_bucket.Object(
                os.path.join(secrets.token_hex(16), os.path.basename(tarball_path))
            )
        else:
            # We fall back to the avatar bucket, because it's world-readable.
            return self.avatar_bucket.Object(
                os.path.join("exports", secrets.token_hex(16), os.path.basename(tarball_path))
            )

    @override
    def upload_export_tarball(
        self,
        realm: Realm,
        tarball_path: str,
        percent_callback: Callable[[Any], None] | None = None,
    ) -> str:
        key = self.export_object(tarball_path)

        if percent_callback is None:
            key.upload_file(Filename=tarball_path)
        else:
            key.upload_file(Filename=tarball_path, Callback=percent_callback)

        return self.get_export_tarball_url(realm, key.key)

    @override
    def delete_export_tarball(self, export_path: str) -> str | None:
        assert export_path.startswith("/")
        path_id = export_path.removeprefix("/")
        bucket = self.export_bucket or self.avatar_bucket
        if self.delete_file_from_s3(path_id, bucket):
            return export_path
        return None