diff --git a/docs/production/upload-backends.md b/docs/production/upload-backends.md
index 367dee7e68..d6c3af72e1 100644
--- a/docs/production/upload-backends.md
+++ b/docs/production/upload-backends.md
@@ -241,3 +241,64 @@ lifecycle transition cost][s3-pricing].
 [s3-storage-class]: https://aws.amazon.com/s3/storage-classes/
 [s3-storage-class-constant]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutObject.html#AmazonS3-PutObject-request-header-StorageClass
 [s3-pricing]: https://aws.amazon.com/s3/pricing/
+
+## Data export bucket
+
+The [data export process](export-and-import.md#data-export), when
+[triggered from the
+UI](https://zulip.com/help/export-your-organization), uploads the
+completed export so it is available to download from the server; this
+is also available [from the command
+line](export-and-import.md#export-your-zulip-data) by passing
+`--upload`. When the S3 backend is used, these uploads are done to S3.
+
+By default, they are uploaded to the bucket with user avatars
+(`S3_AVATAR_BUCKET`), because that bucket is world-readable, allowing
+easy generation of links to download the export.
+
+If you would like to store exports in a dedicated bucket, you can set
+`S3_EXPORT_BUCKET` in your `/etc/zulip/settings.py`. This bucket
+should also be configured like the uploads bucket, only allowing write
+access to the Zulip account, as Zulip will generate links which are
+valid for 1 week at a time:
+
+```json
+{
+    "Version": "2012-10-17",
+    "Id": "Policy1468991802322",
+    "Statement": [
+        {
+            "Sid": "Stmt1468991795390",
+            "Effect": "Allow",
+            "Principal": {
+                "AWS": "ARN_PRINCIPAL_HERE"
+            },
+            "Action": [
+                "s3:GetObject",
+                "s3:DeleteObject",
+                "s3:PutObject"
+            ],
+            "Resource": "arn:aws:s3:::BUCKET_NAME_HERE/*"
+        },
+        {
+            "Sid": "Stmt1468991795391",
+            "Effect": "Allow",
+            "Principal": {
+                "AWS": "ARN_PRINCIPAL_HERE"
+            },
+            "Action": "s3:ListBucket",
+            "Resource": "arn:aws:s3:::BUCKET_NAME_HERE"
+        }
+    ]
+}
+```
+
+You should copy existing exports to the new bucket. For instance,
+using the [AWS CLI](https://aws.amazon.com/cli/)'s [`aws s3
+sync`](https://docs.aws.amazon.com/cli/latest/reference/s3/sync.html),
+if the old bucket was named `example-zulip-avatars` and the new export
+bucket is named `example-zulip-exports`:
+
+```
+aws s3 sync s3://example-zulip-avatars/exports/ s3://example-zulip-exports/
+```
diff --git a/zerver/lib/export.py b/zerver/lib/export.py
index 2dc462ffd8..1c968d0f8f 100644
--- a/zerver/lib/export.py
+++ b/zerver/lib/export.py
@@ -2470,9 +2470,6 @@ def export_realm_wrapper(
     if not upload:
         return None
 
-    # We upload to the `avatars` bucket because that's world-readable
-    # without additional configuration. We'll likely want to change
-    # that in the future.
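+    # With the S3 backend, the destination bucket is chosen by
+    # S3UploadBackend.export_object (zerver/lib/upload/s3.py): S3_EXPORT_BUCKET
+    # if configured, otherwise the world-readable avatars bucket.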
print("Uploading export tarball...") public_url = zerver.lib.upload.upload_backend.upload_export_tarball( realm, tarball_path, percent_callback=percent_callback diff --git a/zerver/lib/upload/s3.py b/zerver/lib/upload/s3.py index 3eba2a7c1f..6350de186a 100644 --- a/zerver/lib/upload/s3.py +++ b/zerver/lib/upload/s3.py @@ -14,7 +14,7 @@ from botocore.response import StreamingBody from django.conf import settings from django.utils.http import content_disposition_header from mypy_boto3_s3.client import S3Client -from mypy_boto3_s3.service_resource import Bucket +from mypy_boto3_s3.service_resource import Bucket, Object from typing_extensions import override from zerver.lib.partial import partial @@ -145,6 +145,10 @@ class S3UploadBackend(ZulipUploadBackend): def __init__(self) -> None: self.avatar_bucket = get_bucket(settings.S3_AVATAR_BUCKET) self.uploads_bucket = get_bucket(settings.S3_AUTH_UPLOADS_BUCKET) + self.export_bucket: Bucket | None = None + if settings.S3_EXPORT_BUCKET: + self.export_bucket = get_bucket(settings.S3_EXPORT_BUCKET) + self.public_upload_url_base = self.construct_public_upload_url_base() def delete_file_from_s3(self, path_id: str, bucket: Bucket) -> bool: @@ -428,18 +432,45 @@ class S3UploadBackend(ZulipUploadBackend): @override def get_export_tarball_url(self, realm: Realm, export_path: str) -> str: - client = self.avatar_bucket.meta.client - signed_url = client.generate_presigned_url( - ClientMethod="get_object", - Params={ - "Bucket": self.avatar_bucket.name, - # export_path has a leading / - "Key": export_path.removeprefix("/"), - }, - ExpiresIn=0, - ) - # Strip off the signing query parameters, since this URL is public - return urlsplit(signed_url)._replace(query="").geturl() + export_path = export_path.removeprefix("/") + if self.export_bucket: + # Fix old data if the row was created when an export bucket was not in use. + export_path = export_path.removeprefix("exports/") + client = self.export_bucket.meta.client + return client.generate_presigned_url( + ClientMethod="get_object", + Params={ + "Bucket": self.export_bucket.name, + "Key": export_path, + }, + # Expires in one week, the longest allowed by AWS + ExpiresIn=60 * 60 * 24 * 7, + ) + else: + if not export_path.startswith("exports/"): + export_path = "exports/" + export_path + client = self.avatar_bucket.meta.client + signed_url = client.generate_presigned_url( + ClientMethod="get_object", + Params={ + "Bucket": self.avatar_bucket.name, + "Key": export_path, + }, + ExpiresIn=0, + ) + # Strip off the signing query parameters, since this URL is public + return urlsplit(signed_url)._replace(query="").geturl() + + def export_object(self, tarball_path: str) -> Object: + if self.export_bucket: + return self.export_bucket.Object( + os.path.join(secrets.token_hex(16), os.path.basename(tarball_path)) + ) + else: + # We fall back to the avatar bucket, because it's world-readable. + return self.avatar_bucket.Object( + os.path.join("exports", secrets.token_hex(16), os.path.basename(tarball_path)) + ) @override def upload_export_tarball( @@ -448,10 +479,7 @@ class S3UploadBackend(ZulipUploadBackend): tarball_path: str, percent_callback: Callable[[Any], None] | None = None, ) -> str: - # We use the avatar bucket, because it's world-readable. 
-        key = self.avatar_bucket.Object(
-            os.path.join("exports", secrets.token_hex(16), os.path.basename(tarball_path))
-        )
+        key = self.export_object(tarball_path)
 
         if percent_callback is None:
             key.upload_file(Filename=tarball_path)
@@ -464,6 +492,7 @@ class S3UploadBackend(ZulipUploadBackend):
     def delete_export_tarball(self, export_path: str) -> str | None:
         assert export_path.startswith("/")
         path_id = export_path.removeprefix("/")
-        if self.delete_file_from_s3(path_id, self.avatar_bucket):
+        bucket = self.export_bucket or self.avatar_bucket
+        if self.delete_file_from_s3(path_id, bucket):
             return export_path
         return None
diff --git a/zerver/tests/test_realm_export.py b/zerver/tests/test_realm_export.py
index 7a6afb376c..2e7c6e2cd3 100644
--- a/zerver/tests/test_realm_export.py
+++ b/zerver/tests/test_realm_export.py
@@ -1,5 +1,6 @@
 import os
 from unittest.mock import patch
+from urllib.parse import urlsplit
 
 import botocore.exceptions
 from django.conf import settings
@@ -42,7 +43,7 @@ class RealmExportTest(ZulipTestCase):
     def test_endpoint_s3(self) -> None:
         admin = self.example_user("iago")
         self.login_user(admin)
-        bucket = create_s3_buckets(settings.S3_AVATAR_BUCKET)[0]
+        bucket = create_s3_buckets(settings.S3_EXPORT_BUCKET)[0]
         tarball_path = create_dummy_file("test-export.tar.gz")
 
         # Test the export logic.
@@ -82,9 +83,10 @@ class RealmExportTest(ZulipTestCase):
         # Test that the export we have is the export we created.
         export_dict = response_dict["exports"]
         self.assertEqual(export_dict[0]["id"], audit_log_entry.id)
+        parsed_url = urlsplit(export_dict[0]["export_url"])
         self.assertEqual(
-            export_dict[0]["export_url"],
-            "https://test-avatar-bucket.s3.amazonaws.com" + export_path,
+            parsed_url._replace(query="").geturl(),
+            "https://test-export-bucket.s3.amazonaws.com" + export_path,
         )
         self.assertEqual(export_dict[0]["acting_user_id"], admin.id)
         self.assert_length(
diff --git a/zerver/tests/test_upload_s3.py b/zerver/tests/test_upload_s3.py
index 3a33763b4f..58bd5a7732 100644
--- a/zerver/tests/test_upload_s3.py
+++ b/zerver/tests/test_upload_s3.py
@@ -2,11 +2,15 @@ import os
 import re
 from io import BytesIO, StringIO
 from unittest.mock import patch
-from urllib.parse import urlsplit
+from urllib.parse import parse_qs, urlsplit
 
+import boto3
 import botocore.exceptions
 import pyvips
 from django.conf import settings
+from django.test import override_settings
+from moto.core.decorator import mock_aws
+from mypy_boto3_s3.type_defs import CopySourceTypeDef
 
 import zerver.lib.upload
 from zerver.actions.create_user import do_create_user
@@ -569,7 +573,7 @@ class S3Test(ZulipTestCase):
 
     @use_s3_backend
     def test_tarball_upload_and_deletion(self) -> None:
-        bucket = create_s3_buckets(settings.S3_AVATAR_BUCKET)[0]
+        bucket = create_s3_buckets(settings.S3_EXPORT_BUCKET)[0]
 
         user_profile = self.example_user("iago")
         self.assertTrue(user_profile.is_realm_admin)
@@ -590,11 +594,18 @@ class S3Test(ZulipTestCase):
         # Verify the percent_callback API works
         self.assertEqual(total_bytes_transferred, 5)
 
-        result = re.search(re.compile(r"([0-9a-fA-F]{32})"), url)
+        parsed_url = urlsplit(url)
+        result = re.search(re.compile(r"/([0-9a-fA-F]{32})/"), url)
         if result is not None:
             hex_value = result.group(1)
-            expected_url = f"https://{bucket.name}.s3.amazonaws.com/exports/{hex_value}/{os.path.basename(tarball_path)}"
-            self.assertEqual(url, expected_url)
+            expected_url = (
+                f"https://{bucket.name}.s3.amazonaws.com/{hex_value}/{os.path.basename(tarball_path)}"
+            )
+            self.assertEqual(parsed_url._replace(query="").geturl(), expected_url)
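+            # The query string carries the signature: generate_presigned_url adds
+            # AWSAccessKeyId, Signature, and Expires parameters, which the
+            # assertions below check.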
+            params = parse_qs(parsed_url.query)
+            self.assertEqual(params["AWSAccessKeyId"], ["test-key"])
+            self.assertIn("Signature", params)
+            self.assertIn("Expires", params)
 
         # Delete the tarball.
         with self.assertLogs(level="WARNING") as warn_log:
@@ -603,5 +614,132 @@ class S3Test(ZulipTestCase):
             warn_log.output,
             ["WARNING:root:not_a_file does not exist. Its entry in the database will be removed."],
         )
+        self.assertEqual(delete_export_tarball(parsed_url.path), parsed_url.path)
+
+    @override_settings(S3_EXPORT_BUCKET="")
+    @use_s3_backend
+    def test_tarball_upload_and_deletion_no_export_bucket(self) -> None:
+        bucket = create_s3_buckets(settings.S3_AVATAR_BUCKET)[0]
+
+        user_profile = self.example_user("iago")
+        self.assertTrue(user_profile.is_realm_admin)
+
+        tarball_path = os.path.join(settings.TEST_WORKER_DIR, "tarball.tar.gz")
+        with open(tarball_path, "w") as f:
+            f.write("dummy")
+
+        url = upload_export_tarball(user_profile.realm, tarball_path)
+        result = re.search(re.compile(r"/([0-9a-fA-F]{32})/"), url)
+        if result is not None:
+            hex_value = result.group(1)
+            expected_url = f"https://{bucket.name}.s3.amazonaws.com/exports/{hex_value}/{os.path.basename(tarball_path)}"
+            self.assertEqual(url, expected_url)
+
+        # Delete the tarball.
+        path_id = urlsplit(url).path
+        self.assertEqual(delete_export_tarball(path_id), path_id)
+
+    @mock_aws
+    def test_tarball_upload_avatar_bucket_download_export_bucket(self) -> None:
+        """Test to verify that tarballs uploaded to avatar bucket can be later
+        accessed via export bucket when server is configured to use export bucket.
+        """
+        user_profile = self.example_user("iago")
+        self.assertTrue(user_profile.is_realm_admin)
+
+        tarball_path = os.path.join(settings.TEST_WORKER_DIR, "tarball.tar.gz")
+        with open(tarball_path, "w") as f:
+            f.write("dummy")
+
+        # Upload export tarball to the avatar bucket.
+        with override_settings(S3_EXPORT_BUCKET=""):
+            backend = S3UploadBackend()
+            avatar_bucket = create_s3_buckets(settings.S3_AVATAR_BUCKET)[0]
+            with patch("zerver.lib.upload.upload_backend", backend):
+                public_url = upload_export_tarball(user_profile.realm, tarball_path)
+            avatar_object_key = urlsplit(public_url).path.removeprefix("/")
+
+        # Verify that old tarballs (uploaded to avatar bucket) can be accessed
+        # from export bucket, once server is configured to use a separate export bucket.
+        with override_settings(S3_EXPORT_BUCKET=settings.S3_EXPORT_BUCKET):
+            backend = S3UploadBackend()
+            export_bucket = create_s3_buckets(settings.S3_EXPORT_BUCKET)[0]
+
+            # Copy existing exports to the new bucket.
+            # This operation is performed as a part of configuring export bucket.
+            session = boto3.session.Session(settings.S3_KEY, settings.S3_SECRET_KEY)
+            s3 = session.resource("s3")
+            copy_source: CopySourceTypeDef = {
+                "Bucket": avatar_bucket.name,
+                "Key": avatar_object_key,
+            }
+            export_object_key = avatar_object_key.removeprefix("exports/")
+            s3.meta.client.copy(copy_source, export_bucket.name, export_object_key)
+
+            # Verify copy operation.
+            object = s3.Object(export_bucket.name, export_object_key)
+            content = object.get()["Body"].read()
+            self.assertEqual(content, b"dummy")
+
+            # Verify that tarball can be accessed using old 'avatar_object_key'.
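+            # (get_export_tarball_url strips the legacy "exports/" prefix before
+            # signing against the export bucket, so the old key still resolves.)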
+            url = backend.get_export_tarball_url(user_profile.realm, avatar_object_key)
+            parsed_url = urlsplit(url)
+            result = re.search(re.compile(r"/([0-9a-fA-F]{32})/"), url)
+            if result is not None:
+                hex_value = result.group(1)
+                expected_url = f"https://{export_bucket.name}.s3.amazonaws.com/{hex_value}/{os.path.basename(tarball_path)}"
+                self.assertEqual(parsed_url._replace(query="").geturl(), expected_url)
+                params = parse_qs(parsed_url.query)
+                self.assertEqual(params["AWSAccessKeyId"], ["test-key"])
+                self.assertIn("Signature", params)
+                self.assertIn("Expires", params)
+
+    @mock_aws
+    def test_tarball_upload_export_bucket_download_avatar_bucket(self) -> None:
+        """Test to verify that tarballs uploaded to export bucket can be later
+        accessed via avatar bucket when server is configured to use ONLY avatar bucket.
+        """
+        user_profile = self.example_user("iago")
+        self.assertTrue(user_profile.is_realm_admin)
+
+        tarball_path = os.path.join(settings.TEST_WORKER_DIR, "tarball.tar.gz")
+        with open(tarball_path, "w") as f:
+            f.write("dummy")
+
+        # Upload export tarball to the export bucket.
+        with override_settings(S3_EXPORT_BUCKET=settings.S3_EXPORT_BUCKET):
+            backend = S3UploadBackend()
+            export_bucket = create_s3_buckets(settings.S3_EXPORT_BUCKET)[0]
+            with patch("zerver.lib.upload.upload_backend", backend):
+                public_url = upload_export_tarball(user_profile.realm, tarball_path)
+            export_object_key = urlsplit(public_url).path.removeprefix("/")
+
+        # Verify that old tarballs (uploaded to export bucket) can be accessed
+        # from avatar bucket, once server is configured to use ONLY avatar bucket.
+        with override_settings(S3_EXPORT_BUCKET=""):
+            backend = S3UploadBackend()
+            avatar_bucket = create_s3_buckets(settings.S3_AVATAR_BUCKET)[0]
+
+            # Copy existing exports to the avatar bucket.
+            # This operation is performed as a part of the changes to use ONLY avatar bucket.
+            session = boto3.session.Session(settings.S3_KEY, settings.S3_SECRET_KEY)
+            s3 = session.resource("s3")
+            copy_source: CopySourceTypeDef = {
+                "Bucket": export_bucket.name,
+                "Key": export_object_key,
+            }
+            avatar_object_key = "exports/" + export_object_key
+            s3.meta.client.copy(copy_source, avatar_bucket.name, avatar_object_key)
+
+            # Verify copy operation.
+            object = s3.Object(avatar_bucket.name, avatar_object_key)
+            content = object.get()["Body"].read()
+            self.assertEqual(content, b"dummy")
+
+            # Verify that tarball can still be accessed using old 'export_object_key'.
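+            # (With no export bucket configured, get_export_tarball_url prepends
+            # "exports/" to keys that lack it, so the old key still resolves.)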
+            url = backend.get_export_tarball_url(user_profile.realm, export_object_key)
+            result = re.search(re.compile(r"/([0-9a-fA-F]{32})/"), url)
+            if result is not None:
+                hex_value = result.group(1)
+                expected_url = f"https://{avatar_bucket.name}.s3.amazonaws.com/exports/{hex_value}/{os.path.basename(tarball_path)}"
+                self.assertEqual(url, expected_url)
diff --git a/zproject/default_settings.py b/zproject/default_settings.py
index 099d88d202..b4c7f6b691 100644
--- a/zproject/default_settings.py
+++ b/zproject/default_settings.py
@@ -151,6 +151,7 @@ DEFAULT_AVATAR_URI: str | None = None
 DEFAULT_LOGO_URI: str | None = None
 S3_AVATAR_BUCKET = ""
 S3_AUTH_UPLOADS_BUCKET = ""
+S3_EXPORT_BUCKET = ""
 S3_REGION: str | None = None
 S3_ENDPOINT_URL: str | None = None
 S3_ADDRESSING_STYLE: Literal["auto", "virtual", "path"] = "auto"
diff --git a/zproject/prod_settings_template.py b/zproject/prod_settings_template.py
index 9a28017bc3..2cd819e086 100644
--- a/zproject/prod_settings_template.py
+++ b/zproject/prod_settings_template.py
@@ -793,6 +793,7 @@ ENABLE_FILE_LINKS = False
 LOCAL_UPLOADS_DIR = "/home/zulip/uploads"
 # S3_AUTH_UPLOADS_BUCKET = ""
 # S3_AVATAR_BUCKET = ""
+# S3_EXPORT_BUCKET = ""
 # S3_REGION = None
 # S3_ENDPOINT_URL = None
 # S3_AVATAR_PUBLIC_URL_PREFIX = None
diff --git a/zproject/test_extra_settings.py b/zproject/test_extra_settings.py
index 71a5705ed4..a2c600b09d 100644
--- a/zproject/test_extra_settings.py
+++ b/zproject/test_extra_settings.py
@@ -145,6 +145,7 @@ S3_KEY = "test-key"
 S3_SECRET_KEY = "test-secret-key"
 S3_AUTH_UPLOADS_BUCKET = "test-authed-bucket"
 S3_AVATAR_BUCKET = "test-avatar-bucket"
+S3_EXPORT_BUCKET = "test-export-bucket"
 INLINE_URL_EMBED_PREVIEW = False
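As a quick reference, here is a minimal sketch of how a production deployment would opt into the new bucket in `/etc/zulip/settings.py`. The export and avatar bucket names are the examples used in the documentation hunk above; the uploads bucket name is a placeholder for whatever the deployment already uses:

```python
# /etc/zulip/settings.py (sketch; adjust bucket names for your deployment)
S3_AUTH_UPLOADS_BUCKET = "example-zulip-uploads"  # placeholder: existing private uploads bucket
S3_AVATAR_BUCKET = "example-zulip-avatars"        # existing world-readable avatars bucket
# New: store realm exports in a dedicated, non-public bucket instead of the avatars bucket.
S3_EXPORT_BUCKET = "example-zulip-exports"
```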