retention: Prevent deletion of partially-archived messages.

Previously, this code:
```python3
old_archived_attachments = ArchivedAttachment.objects.annotate(
    has_other_messages=Exists(
        Attachment.objects.filter(id=OuterRef("id"))
        .exclude(messages=None)
        .exclude(scheduled_messages=None)
    )
).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False)
```

...protected from removal any ArchivedAttachment objects where there
was an Attachment which had _both_ a message _and_ a scheduled
message, instead of _either_ a message _or_ a scheduled message.
Since files are removed from disk when the ArchivedAttachment rows are
deleted, this meant that if an upload was referenced in two messages,
and one was deleted, the file was permanently deleted when the
ArchivedMessage and ArchivedAttachment were cleaned up, despite being
still referenced in live Messages and Attachments.

Switch from `.exclude(messages=None).exclude(scheduled_messages=None)`
to `.exclude(messages=None, scheduled_messages=None)` which "OR"s
those conditions appropriately.

Pull the relevant test into its own file, and expand it significantly
to cover this, and other, corner cases.
This commit is contained in:
Alex Vandiver 2023-07-28 18:53:07 +00:00 committed by Tim Abbott
parent 0f918d9071
commit b67108c8c6
4 changed files with 356 additions and 133 deletions

View File

@ -74,11 +74,17 @@ def do_delete_old_unclaimed_attachments(weeks_ago: int) -> None:
weeks_ago weeks_ago
) )
# An attachment may be removed from Attachments and
# ArchiveAttachments in the same run; prevent warnings from the
# backing store by only removing it from there once.
already_removed = set()
for attachment in old_unclaimed_attachments: for attachment in old_unclaimed_attachments:
delete_message_attachment(attachment.path_id) delete_message_attachment(attachment.path_id)
already_removed.add(attachment.path_id)
attachment.delete() attachment.delete()
for archived_attachment in old_unclaimed_archived_attachments: for archived_attachment in old_unclaimed_archived_attachments:
delete_message_attachment(archived_attachment.path_id) if archived_attachment.path_id not in already_removed:
delete_message_attachment(archived_attachment.path_id)
archived_attachment.delete() archived_attachment.delete()

View File

@ -3739,9 +3739,9 @@ def get_old_unclaimed_attachments(
) )
old_archived_attachments = ArchivedAttachment.objects.annotate( old_archived_attachments = ArchivedAttachment.objects.annotate(
has_other_messages=Exists( has_other_messages=Exists(
Attachment.objects.filter(id=OuterRef("id")) Attachment.objects.filter(id=OuterRef("id")).exclude(
.exclude(messages=None) messages=None, scheduled_messages=None
.exclude(scheduled_messages=None) )
) )
).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False) ).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False)

View File

@ -0,0 +1,345 @@
import os
import re
from datetime import datetime, timedelta
from io import StringIO
from typing import Optional
import time_machine
from django.conf import settings
from django.utils.timezone import now as timezone_now
from zerver.actions.message_delete import do_delete_messages
from zerver.actions.scheduled_messages import check_schedule_message, delete_scheduled_message
from zerver.actions.uploads import do_delete_old_unclaimed_attachments
from zerver.lib.retention import clean_archived_data
from zerver.lib.test_classes import UploadSerializeMixin, ZulipTestCase
from zerver.models import (
ArchivedAttachment,
Attachment,
Message,
UserProfile,
get_client,
)
class UnclaimedAttachmentTest(UploadSerializeMixin, ZulipTestCase):
def make_attachment(
self, filename: str, when: Optional[datetime] = None, uploader: Optional[UserProfile] = None
) -> Attachment:
if when is None:
when = timezone_now() - timedelta(weeks=2)
if uploader is None:
uploader = self.example_user("hamlet")
self.login_user(uploader)
with time_machine.travel(when):
file_obj = StringIO("zulip!")
file_obj.name = filename
response = self.assert_json_success(
self.client_post("/json/user_uploads", {"file": file_obj})
)
path_id = re.sub("/user_uploads/", "", response["uri"])
return Attachment.objects.get(path_id=path_id)
def assert_exists(
self,
attachment: Attachment,
*,
has_file: bool,
has_attachment: bool,
has_archived_attachment: bool,
) -> None:
assert settings.LOCAL_FILES_DIR
self.assertEqual( # File existence on disk
os.path.isfile(os.path.join(settings.LOCAL_FILES_DIR, attachment.path_id)), has_file
)
self.assertEqual( # Attachment row
Attachment.objects.filter(id=attachment.id).exists(), has_attachment
)
self.assertEqual( # ArchivedAttachment row
ArchivedAttachment.objects.filter(id=attachment.id).exists(), has_archived_attachment
)
def test_delete_unused_upload(self) -> None:
unused_attachment = self.make_attachment("unused.txt")
self.assert_exists(
unused_attachment, has_file=True, has_attachment=True, has_archived_attachment=False
)
# If we have 3 weeks of grace, nothing happens
do_delete_old_unclaimed_attachments(3)
self.assert_exists(
unused_attachment, has_file=True, has_attachment=True, has_archived_attachment=False
)
# If we have 1 weeks of grace, the Attachment is deleted, and so is the file on disk
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
unused_attachment, has_file=False, has_attachment=False, has_archived_attachment=False
)
def test_delete_used_upload(self) -> None:
hamlet = self.example_user("hamlet")
attachment = self.make_attachment("used.txt")
# Send message referencing that message
self.subscribe(hamlet, "Denmark")
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
self.send_stream_message(hamlet, "Denmark", body, "test")
# Because the message is claimed, it is not removed
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
)
def test_delete_upload_archived_message(self) -> None:
hamlet = self.example_user("hamlet")
attachment = self.make_attachment("used.txt")
# Send message referencing that message
self.subscribe(hamlet, "Denmark")
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
# Delete that message; this moves it to ArchivedAttachment but leaves the file on disk
do_delete_messages(hamlet.realm, [Message.objects.get(id=message_id)])
self.assert_exists(
attachment, has_file=True, has_attachment=False, has_archived_attachment=True
)
# Removing unclaimed attachments leaves the file, since it is
# attached to an existing ArchivedAttachment
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=True, has_attachment=False, has_archived_attachment=True
)
# Now purge the ArchivedMessage
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
clean_archived_data()
# The attachment still exists as an unclaimed ArchivedAttachment
self.assert_exists(
attachment, has_file=True, has_attachment=False, has_archived_attachment=True
)
# Removing unclaimed attachments now cleans it out
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
)
def test_delete_one_message(self) -> None:
hamlet = self.example_user("hamlet")
attachment = self.make_attachment("used.txt")
# Send message referencing that message
self.subscribe(hamlet, "Denmark")
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
first_message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
second_message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
# Delete the second message; this leaves an Attachment and an
# ArchivedAttachment, both associated with a message
do_delete_messages(hamlet.realm, [Message.objects.get(id=first_message_id)])
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Removing unclaimed attachments leaves the file, since it is
# attached to an existing Attachment and ArchivedAttachment
# which have Messages and ArchivedMessages, respectively
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Purging the ArchivedMessage does not affect the Attachment
# or ArchivedAttachment
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
clean_archived_data()
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Removing unclaimed attachments still does nothing, because
# the ArchivedAttachment is protected by the existing
# Attachment.
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Deleting the other message now leaves just an ArchivedAttachment
do_delete_messages(hamlet.realm, [Message.objects.get(id=second_message_id)])
self.assert_exists(
attachment, has_file=True, has_attachment=False, has_archived_attachment=True
)
# Cleaning out the archived message and purging unclaimed
# attachments now finally removes it.
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
clean_archived_data()
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
)
def test_delete_with_scheduled_messages(self) -> None:
hamlet = self.example_user("hamlet")
attachment = self.make_attachment("used.txt")
# Schedule a future send with the attachment
self.subscribe(hamlet, "Denmark")
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
scheduled_message_id = check_schedule_message(
hamlet,
get_client("website"),
"stream",
[self.get_stream_id("Denmark")],
"Test topic",
body,
timezone_now() + timedelta(days=365),
hamlet.realm,
)
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
)
# The ScheduledMessage protects the attachment from being removed
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
)
# Deleting the ScheduledMessage leaves the attachment dangling
delete_scheduled_message(hamlet, scheduled_message_id)
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
)
# Having no referents, it is now a target for removal
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
)
def test_delete_with_scheduled_message_and_archive(self) -> None:
hamlet = self.example_user("hamlet")
attachment = self.make_attachment("used.txt")
# Schedule a message, and also send one now
self.subscribe(hamlet, "Denmark")
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
scheduled_message_id = check_schedule_message(
hamlet,
get_client("website"),
"stream",
[self.get_stream_id("Denmark")],
"Test topic",
body,
timezone_now() + timedelta(days=365),
hamlet.realm,
)
sent_message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
)
# Deleting the sent message leaves us with an Attachment
# attached to the scheduled message, and an archived
# attachment with an archived message
do_delete_messages(hamlet.realm, [Message.objects.get(id=sent_message_id)])
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Expiring the archived message leaves a dangling
# ArchivedAttachment and a protected Attachment
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
clean_archived_data()
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Removing unclaimed attachments deletes nothing, since the
# the ArchivedAttachment is protected by the Attachment which
# is still protected by the scheduled message
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Deleting the ScheduledMessage leaves the attachment fully dangling
delete_scheduled_message(hamlet, scheduled_message_id)
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Having no referents, it is now a target for removal
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
)
def test_delete_with_unscheduled_message_and_archive(self) -> None:
# This is subtly different from the test above -- we delete
# the scheduled message first, which is the only way to get an
# Attachment with not referents as well as an
# ArchivedAttachment which does have references. Normally,
# the process of archiving prunes Attachments which have no
# references.
hamlet = self.example_user("hamlet")
attachment = self.make_attachment("used.txt")
# Schedule a message, and also send one now
self.subscribe(hamlet, "Denmark")
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
scheduled_message_id = check_schedule_message(
hamlet,
get_client("website"),
"stream",
[self.get_stream_id("Denmark")],
"Test topic",
body,
timezone_now() + timedelta(days=365),
hamlet.realm,
)
sent_message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
)
# Delete the message and then unschedule the scheduled message
# before expiring the ArchivedMessages.
do_delete_messages(hamlet.realm, [Message.objects.get(id=sent_message_id)])
delete_scheduled_message(hamlet, scheduled_message_id)
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Attempting to expire unclaimed attachments leaves the
# unreferenced Attachment which is protected by the
# ArchivedAttachment which has archived messages referencing
# it.
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Expiring archived messages leaves us with a dangling
# Attachment and ArchivedAttachment, with neither having
# referents.
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
clean_archived_data()
self.assert_exists(
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
)
# Having no referents in either place, it is now a target for
# removal
do_delete_old_unclaimed_attachments(1)
self.assert_exists(
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
)

View File

@ -1,4 +1,3 @@
import datetime
import io import io
import os import os
import re import re
@ -10,20 +9,16 @@ from unittest.mock import patch
import orjson import orjson
from django.conf import settings from django.conf import settings
from django.utils.timezone import now as timezone_now
from PIL import Image from PIL import Image
from urllib3 import encode_multipart_formdata from urllib3 import encode_multipart_formdata
from urllib3.fields import RequestField from urllib3.fields import RequestField
import zerver.lib.upload import zerver.lib.upload
from zerver.actions.create_realm import do_create_realm from zerver.actions.create_realm import do_create_realm
from zerver.actions.message_delete import do_delete_messages
from zerver.actions.message_send import internal_send_private_message from zerver.actions.message_send import internal_send_private_message
from zerver.actions.realm_icon import do_change_icon_source from zerver.actions.realm_icon import do_change_icon_source
from zerver.actions.realm_logo import do_change_logo_source from zerver.actions.realm_logo import do_change_logo_source
from zerver.actions.realm_settings import do_change_realm_plan_type, do_set_realm_property from zerver.actions.realm_settings import do_change_realm_plan_type, do_set_realm_property
from zerver.actions.scheduled_messages import check_schedule_message, delete_scheduled_message
from zerver.actions.uploads import do_delete_old_unclaimed_attachments
from zerver.actions.user_settings import do_delete_avatar_image from zerver.actions.user_settings import do_delete_avatar_image
from zerver.lib.avatar import avatar_url, get_avatar_field from zerver.lib.avatar import avatar_url, get_avatar_field
from zerver.lib.cache import cache_get, get_realm_used_upload_space_cache_key from zerver.lib.cache import cache_get, get_realm_used_upload_space_cache_key
@ -31,7 +26,6 @@ from zerver.lib.create_user import copy_default_settings
from zerver.lib.initial_password import initial_password from zerver.lib.initial_password import initial_password
from zerver.lib.realm_icon import realm_icon_url from zerver.lib.realm_icon import realm_icon_url
from zerver.lib.realm_logo import get_realm_logo_url from zerver.lib.realm_logo import get_realm_logo_url
from zerver.lib.retention import clean_archived_data
from zerver.lib.test_classes import UploadSerializeMixin, ZulipTestCase from zerver.lib.test_classes import UploadSerializeMixin, ZulipTestCase
from zerver.lib.test_helpers import ( from zerver.lib.test_helpers import (
avatar_disk_path, avatar_disk_path,
@ -39,19 +33,17 @@ from zerver.lib.test_helpers import (
ratelimit_rule, ratelimit_rule,
read_test_image_file, read_test_image_file,
) )
from zerver.lib.upload import delete_message_attachment, upload_message_attachment from zerver.lib.upload import upload_message_attachment
from zerver.lib.upload.base import BadImageError, ZulipUploadBackend, resize_emoji, sanitize_name from zerver.lib.upload.base import BadImageError, ZulipUploadBackend, resize_emoji, sanitize_name
from zerver.lib.upload.local import LocalUploadBackend from zerver.lib.upload.local import LocalUploadBackend
from zerver.lib.upload.s3 import S3UploadBackend from zerver.lib.upload.s3 import S3UploadBackend
from zerver.lib.users import get_api_key from zerver.lib.users import get_api_key
from zerver.models import ( from zerver.models import (
ArchivedAttachment,
Attachment, Attachment,
Message, Message,
Realm, Realm,
RealmDomain, RealmDomain,
UserProfile, UserProfile,
get_client,
get_realm, get_realm,
get_system_bot, get_system_bot,
get_user_by_delivery_email, get_user_by_delivery_email,
@ -361,126 +353,6 @@ class FileUploadTest(UploadSerializeMixin, ZulipTestCase):
self.assertEqual(response.status_code, 404) self.assertEqual(response.status_code, 404)
self.assert_in_response("This file does not exist or has been deleted.", response) self.assert_in_response("This file does not exist or has been deleted.", response)
def test_delete_old_unclaimed_attachments(self) -> None:
# Upload some files and make them older than a week
hamlet = self.example_user("hamlet")
self.login("hamlet")
d1 = StringIO("zulip!")
d1.name = "dummy_1.txt"
result = self.client_post("/json/user_uploads", {"file": d1})
response_dict = self.assert_json_success(result)
d1_path_id = re.sub("/user_uploads/", "", response_dict["uri"])
d2 = StringIO("zulip!")
d2.name = "dummy_2.txt"
result = self.client_post("/json/user_uploads", {"file": d2})
response_dict = self.assert_json_success(result)
d2_path_id = re.sub("/user_uploads/", "", response_dict["uri"])
d3 = StringIO("zulip!")
d3.name = "dummy_3.txt"
result = self.client_post("/json/user_uploads", {"file": d3})
d3_path_id = re.sub("/user_uploads/", "", result.json()["uri"])
d4 = StringIO("zulip!")
d4.name = "dummy_4.txt"
result = self.client_post("/json/user_uploads", {"file": d4})
d4_path_id = re.sub("/user_uploads/", "", result.json()["uri"])
two_week_ago = timezone_now() - datetime.timedelta(weeks=2)
# This Attachment will have a message linking to it:
d1_attachment = Attachment.objects.get(path_id=d1_path_id)
d1_attachment.create_time = two_week_ago
d1_attachment.save()
self.assertEqual(repr(d1_attachment), "<Attachment: dummy_1.txt>")
# This Attachment won't have any messages.
d2_attachment = Attachment.objects.get(path_id=d2_path_id)
d2_attachment.create_time = two_week_ago
d2_attachment.save()
# This Attachment will have a message that gets archived. The file
# should not be deleted until the message is deleted from the archive.
d3_attachment = Attachment.objects.get(path_id=d3_path_id)
d3_attachment.create_time = two_week_ago
d3_attachment.save()
# This Attachment will just have a ScheduledMessage referencing it. It should not be deleted
# until the ScheduledMessage is deleted.
d4_attachment = Attachment.objects.get(path_id=d4_path_id)
d4_attachment.create_time = two_week_ago
d4_attachment.save()
# Send message referencing only dummy_1
self.subscribe(hamlet, "Denmark")
body = (
f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/"
+ d1_path_id
+ ")"
)
self.send_stream_message(hamlet, "Denmark", body, "test")
# Send message referencing dummy_3 - it will be archived next.
body = (
f"Some more files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/"
+ d3_path_id
+ ")"
)
message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
assert settings.LOCAL_FILES_DIR
d3_local_path = os.path.join(settings.LOCAL_FILES_DIR, d3_path_id)
self.assertTrue(os.path.exists(d3_local_path))
body = (
f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/"
+ d4_path_id
+ ")"
)
scheduled_message_d4_id = check_schedule_message(
hamlet,
get_client("website"),
"stream",
[self.get_stream_id("Verona")],
"Test topic",
body,
timezone_now() + datetime.timedelta(days=365),
hamlet.realm,
)
self.assertEqual(
list(d4_attachment.scheduled_messages.all().values_list("id", flat=True)),
[scheduled_message_d4_id],
)
d4_local_path = os.path.join(settings.LOCAL_FILES_DIR, d4_path_id)
self.assertTrue(os.path.exists(d4_local_path))
do_delete_messages(hamlet.realm, [Message.objects.get(id=message_id)])
# dummy_2 should not exist in database or the uploads folder
do_delete_old_unclaimed_attachments(2)
self.assertTrue(not Attachment.objects.filter(path_id=d2_path_id).exists())
with self.assertLogs(level="WARNING") as warn_log:
self.assertTrue(not delete_message_attachment(d2_path_id))
self.assertEqual(
warn_log.output,
["WARNING:root:dummy_2.txt does not exist. Its entry in the database will be removed."],
)
# dummy_3 file should still exist, because the attachment has been moved to the archive.
self.assertTrue(os.path.exists(d3_local_path))
# After the archive gets emptied, do_delete_old_unclaimed_attachments should result
# in deleting the file, since it is now truly no longer referenced.
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
clean_archived_data()
do_delete_old_unclaimed_attachments(2)
self.assertFalse(os.path.exists(d3_local_path))
self.assertTrue(not Attachment.objects.filter(path_id=d3_path_id).exists())
self.assertTrue(not ArchivedAttachment.objects.filter(path_id=d3_path_id).exists())
# dummy_4 only get deleted after the scheduled message is deleted.
self.assertTrue(os.path.exists(d4_local_path))
delete_scheduled_message(hamlet, scheduled_message_d4_id)
do_delete_old_unclaimed_attachments(2)
self.assertFalse(os.path.exists(d4_local_path))
self.assertTrue(not Attachment.objects.filter(path_id=d4_path_id).exists())
def test_attachment_url_without_upload(self) -> None: def test_attachment_url_without_upload(self) -> None:
hamlet = self.example_user("hamlet") hamlet = self.example_user("hamlet")
self.login_user(hamlet) self.login_user(hamlet)