retention: Prevent deletion of partially-archived messages.
Previously, this code:
```python3
old_archived_attachments = ArchivedAttachment.objects.annotate(
has_other_messages=Exists(
Attachment.objects.filter(id=OuterRef("id"))
.exclude(messages=None)
.exclude(scheduled_messages=None)
)
).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False)
```
...protected from removal any ArchivedAttachment objects where there
was an Attachment which had _both_ a message _and_ a scheduled
message, instead of _either_ a message _or_ a scheduled message.
Since files are removed from disk when the ArchivedAttachment rows are
deleted, this meant that if an upload was referenced in two messages,
and one was deleted, the file was permanently deleted when the
ArchivedMessage and ArchivedAttachment were cleaned up, despite being
still referenced in live Messages and Attachments.
Switch from `.exclude(messages=None).exclude(scheduled_messages=None)`
to `.exclude(messages=None, scheduled_messages=None)` which "OR"s
those conditions appropriately.
Pull the relevant test into its own file, and expand it significantly
to cover this, and other, corner cases.
2023-07-28 20:53:07 +02:00
|
|
|
import os
|
|
|
|
import re
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
from io import StringIO
|
|
|
|
from typing import Optional
|
|
|
|
|
|
|
|
import time_machine
|
|
|
|
from django.conf import settings
|
|
|
|
from django.utils.timezone import now as timezone_now
|
|
|
|
|
|
|
|
from zerver.actions.message_delete import do_delete_messages
|
|
|
|
from zerver.actions.scheduled_messages import check_schedule_message, delete_scheduled_message
|
|
|
|
from zerver.actions.uploads import do_delete_old_unclaimed_attachments
|
|
|
|
from zerver.lib.retention import clean_archived_data
|
|
|
|
from zerver.lib.test_classes import UploadSerializeMixin, ZulipTestCase
|
2023-12-15 04:33:19 +01:00
|
|
|
from zerver.models import ArchivedAttachment, Attachment, Message, UserProfile
|
|
|
|
from zerver.models.clients import get_client
|
retention: Prevent deletion of partially-archived messages.
Previously, this code:
```python3
old_archived_attachments = ArchivedAttachment.objects.annotate(
has_other_messages=Exists(
Attachment.objects.filter(id=OuterRef("id"))
.exclude(messages=None)
.exclude(scheduled_messages=None)
)
).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False)
```
...protected from removal any ArchivedAttachment objects where there
was an Attachment which had _both_ a message _and_ a scheduled
message, instead of _either_ a message _or_ a scheduled message.
Since files are removed from disk when the ArchivedAttachment rows are
deleted, this meant that if an upload was referenced in two messages,
and one was deleted, the file was permanently deleted when the
ArchivedMessage and ArchivedAttachment were cleaned up, despite being
still referenced in live Messages and Attachments.
Switch from `.exclude(messages=None).exclude(scheduled_messages=None)`
to `.exclude(messages=None, scheduled_messages=None)` which "OR"s
those conditions appropriately.
Pull the relevant test into its own file, and expand it significantly
to cover this, and other, corner cases.
2023-07-28 20:53:07 +02:00
|
|
|
|
|
|
|
|
|
|
|
class UnclaimedAttachmentTest(UploadSerializeMixin, ZulipTestCase):
|
|
|
|
def make_attachment(
|
|
|
|
self, filename: str, when: Optional[datetime] = None, uploader: Optional[UserProfile] = None
|
|
|
|
) -> Attachment:
|
|
|
|
if when is None:
|
|
|
|
when = timezone_now() - timedelta(weeks=2)
|
|
|
|
if uploader is None:
|
|
|
|
uploader = self.example_user("hamlet")
|
|
|
|
self.login_user(uploader)
|
|
|
|
|
2023-09-29 17:00:28 +02:00
|
|
|
with time_machine.travel(when, tick=False):
|
retention: Prevent deletion of partially-archived messages.
Previously, this code:
```python3
old_archived_attachments = ArchivedAttachment.objects.annotate(
has_other_messages=Exists(
Attachment.objects.filter(id=OuterRef("id"))
.exclude(messages=None)
.exclude(scheduled_messages=None)
)
).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False)
```
...protected from removal any ArchivedAttachment objects where there
was an Attachment which had _both_ a message _and_ a scheduled
message, instead of _either_ a message _or_ a scheduled message.
Since files are removed from disk when the ArchivedAttachment rows are
deleted, this meant that if an upload was referenced in two messages,
and one was deleted, the file was permanently deleted when the
ArchivedMessage and ArchivedAttachment were cleaned up, despite being
still referenced in live Messages and Attachments.
Switch from `.exclude(messages=None).exclude(scheduled_messages=None)`
to `.exclude(messages=None, scheduled_messages=None)` which "OR"s
those conditions appropriately.
Pull the relevant test into its own file, and expand it significantly
to cover this, and other, corner cases.
2023-07-28 20:53:07 +02:00
|
|
|
file_obj = StringIO("zulip!")
|
|
|
|
file_obj.name = filename
|
|
|
|
response = self.assert_json_success(
|
|
|
|
self.client_post("/json/user_uploads", {"file": file_obj})
|
|
|
|
)
|
2024-04-26 20:30:22 +02:00
|
|
|
path_id = re.sub(r"/user_uploads/", "", response["uri"])
|
retention: Prevent deletion of partially-archived messages.
Previously, this code:
```python3
old_archived_attachments = ArchivedAttachment.objects.annotate(
has_other_messages=Exists(
Attachment.objects.filter(id=OuterRef("id"))
.exclude(messages=None)
.exclude(scheduled_messages=None)
)
).filter(messages=None, create_time__lt=delta_weeks_ago, has_other_messages=False)
```
...protected from removal any ArchivedAttachment objects where there
was an Attachment which had _both_ a message _and_ a scheduled
message, instead of _either_ a message _or_ a scheduled message.
Since files are removed from disk when the ArchivedAttachment rows are
deleted, this meant that if an upload was referenced in two messages,
and one was deleted, the file was permanently deleted when the
ArchivedMessage and ArchivedAttachment were cleaned up, despite being
still referenced in live Messages and Attachments.
Switch from `.exclude(messages=None).exclude(scheduled_messages=None)`
to `.exclude(messages=None, scheduled_messages=None)` which "OR"s
those conditions appropriately.
Pull the relevant test into its own file, and expand it significantly
to cover this, and other, corner cases.
2023-07-28 20:53:07 +02:00
|
|
|
return Attachment.objects.get(path_id=path_id)
|
|
|
|
|
|
|
|
def assert_exists(
|
|
|
|
self,
|
|
|
|
attachment: Attachment,
|
|
|
|
*,
|
|
|
|
has_file: bool,
|
|
|
|
has_attachment: bool,
|
|
|
|
has_archived_attachment: bool,
|
|
|
|
) -> None:
|
|
|
|
assert settings.LOCAL_FILES_DIR
|
|
|
|
self.assertEqual( # File existence on disk
|
|
|
|
os.path.isfile(os.path.join(settings.LOCAL_FILES_DIR, attachment.path_id)), has_file
|
|
|
|
)
|
|
|
|
self.assertEqual( # Attachment row
|
|
|
|
Attachment.objects.filter(id=attachment.id).exists(), has_attachment
|
|
|
|
)
|
|
|
|
self.assertEqual( # ArchivedAttachment row
|
|
|
|
ArchivedAttachment.objects.filter(id=attachment.id).exists(), has_archived_attachment
|
|
|
|
)
|
|
|
|
|
|
|
|
def test_delete_unused_upload(self) -> None:
|
|
|
|
unused_attachment = self.make_attachment("unused.txt")
|
|
|
|
self.assert_exists(
|
|
|
|
unused_attachment, has_file=True, has_attachment=True, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
# If we have 3 weeks of grace, nothing happens
|
|
|
|
do_delete_old_unclaimed_attachments(3)
|
|
|
|
self.assert_exists(
|
|
|
|
unused_attachment, has_file=True, has_attachment=True, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
# If we have 1 weeks of grace, the Attachment is deleted, and so is the file on disk
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
unused_attachment, has_file=False, has_attachment=False, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
def test_delete_used_upload(self) -> None:
|
|
|
|
hamlet = self.example_user("hamlet")
|
|
|
|
attachment = self.make_attachment("used.txt")
|
|
|
|
|
|
|
|
# Send message referencing that message
|
|
|
|
self.subscribe(hamlet, "Denmark")
|
|
|
|
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
|
|
|
|
self.send_stream_message(hamlet, "Denmark", body, "test")
|
|
|
|
|
|
|
|
# Because the message is claimed, it is not removed
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
def test_delete_upload_archived_message(self) -> None:
|
|
|
|
hamlet = self.example_user("hamlet")
|
|
|
|
attachment = self.make_attachment("used.txt")
|
|
|
|
|
|
|
|
# Send message referencing that message
|
|
|
|
self.subscribe(hamlet, "Denmark")
|
|
|
|
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
|
|
|
|
message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
|
|
|
|
|
|
|
|
# Delete that message; this moves it to ArchivedAttachment but leaves the file on disk
|
|
|
|
do_delete_messages(hamlet.realm, [Message.objects.get(id=message_id)])
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=False, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Removing unclaimed attachments leaves the file, since it is
|
|
|
|
# attached to an existing ArchivedAttachment
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=False, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Now purge the ArchivedMessage
|
|
|
|
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
|
|
|
|
clean_archived_data()
|
|
|
|
|
|
|
|
# The attachment still exists as an unclaimed ArchivedAttachment
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=False, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Removing unclaimed attachments now cleans it out
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
def test_delete_one_message(self) -> None:
|
|
|
|
hamlet = self.example_user("hamlet")
|
|
|
|
attachment = self.make_attachment("used.txt")
|
|
|
|
|
|
|
|
# Send message referencing that message
|
|
|
|
self.subscribe(hamlet, "Denmark")
|
|
|
|
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
|
|
|
|
first_message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
|
|
|
|
second_message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
|
|
|
|
|
|
|
|
# Delete the second message; this leaves an Attachment and an
|
|
|
|
# ArchivedAttachment, both associated with a message
|
|
|
|
do_delete_messages(hamlet.realm, [Message.objects.get(id=first_message_id)])
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Removing unclaimed attachments leaves the file, since it is
|
|
|
|
# attached to an existing Attachment and ArchivedAttachment
|
|
|
|
# which have Messages and ArchivedMessages, respectively
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Purging the ArchivedMessage does not affect the Attachment
|
|
|
|
# or ArchivedAttachment
|
|
|
|
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
|
|
|
|
clean_archived_data()
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Removing unclaimed attachments still does nothing, because
|
|
|
|
# the ArchivedAttachment is protected by the existing
|
|
|
|
# Attachment.
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Deleting the other message now leaves just an ArchivedAttachment
|
|
|
|
do_delete_messages(hamlet.realm, [Message.objects.get(id=second_message_id)])
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=False, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Cleaning out the archived message and purging unclaimed
|
|
|
|
# attachments now finally removes it.
|
|
|
|
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
|
|
|
|
clean_archived_data()
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
def test_delete_with_scheduled_messages(self) -> None:
|
|
|
|
hamlet = self.example_user("hamlet")
|
|
|
|
attachment = self.make_attachment("used.txt")
|
|
|
|
|
|
|
|
# Schedule a future send with the attachment
|
|
|
|
self.subscribe(hamlet, "Denmark")
|
|
|
|
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
|
|
|
|
scheduled_message_id = check_schedule_message(
|
|
|
|
hamlet,
|
|
|
|
get_client("website"),
|
|
|
|
"stream",
|
|
|
|
[self.get_stream_id("Denmark")],
|
|
|
|
"Test topic",
|
|
|
|
body,
|
|
|
|
timezone_now() + timedelta(days=365),
|
|
|
|
hamlet.realm,
|
|
|
|
)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
# The ScheduledMessage protects the attachment from being removed
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
# Deleting the ScheduledMessage leaves the attachment dangling
|
|
|
|
delete_scheduled_message(hamlet, scheduled_message_id)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
# Having no referents, it is now a target for removal
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
def test_delete_with_scheduled_message_and_archive(self) -> None:
|
|
|
|
hamlet = self.example_user("hamlet")
|
|
|
|
attachment = self.make_attachment("used.txt")
|
|
|
|
|
|
|
|
# Schedule a message, and also send one now
|
|
|
|
self.subscribe(hamlet, "Denmark")
|
|
|
|
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
|
|
|
|
scheduled_message_id = check_schedule_message(
|
|
|
|
hamlet,
|
|
|
|
get_client("website"),
|
|
|
|
"stream",
|
|
|
|
[self.get_stream_id("Denmark")],
|
|
|
|
"Test topic",
|
|
|
|
body,
|
|
|
|
timezone_now() + timedelta(days=365),
|
|
|
|
hamlet.realm,
|
|
|
|
)
|
|
|
|
sent_message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
# Deleting the sent message leaves us with an Attachment
|
|
|
|
# attached to the scheduled message, and an archived
|
|
|
|
# attachment with an archived message
|
|
|
|
do_delete_messages(hamlet.realm, [Message.objects.get(id=sent_message_id)])
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Expiring the archived message leaves a dangling
|
|
|
|
# ArchivedAttachment and a protected Attachment
|
|
|
|
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
|
|
|
|
clean_archived_data()
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Removing unclaimed attachments deletes nothing, since the
|
|
|
|
# the ArchivedAttachment is protected by the Attachment which
|
|
|
|
# is still protected by the scheduled message
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Deleting the ScheduledMessage leaves the attachment fully dangling
|
|
|
|
delete_scheduled_message(hamlet, scheduled_message_id)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Having no referents, it is now a target for removal
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
def test_delete_with_unscheduled_message_and_archive(self) -> None:
|
|
|
|
# This is subtly different from the test above -- we delete
|
|
|
|
# the scheduled message first, which is the only way to get an
|
|
|
|
# Attachment with not referents as well as an
|
|
|
|
# ArchivedAttachment which does have references. Normally,
|
|
|
|
# the process of archiving prunes Attachments which have no
|
|
|
|
# references.
|
|
|
|
hamlet = self.example_user("hamlet")
|
|
|
|
attachment = self.make_attachment("used.txt")
|
|
|
|
|
|
|
|
# Schedule a message, and also send one now
|
|
|
|
self.subscribe(hamlet, "Denmark")
|
|
|
|
body = f"Some files here ...[zulip.txt](http://{hamlet.realm.host}/user_uploads/{attachment.path_id})"
|
|
|
|
scheduled_message_id = check_schedule_message(
|
|
|
|
hamlet,
|
|
|
|
get_client("website"),
|
|
|
|
"stream",
|
|
|
|
[self.get_stream_id("Denmark")],
|
|
|
|
"Test topic",
|
|
|
|
body,
|
|
|
|
timezone_now() + timedelta(days=365),
|
|
|
|
hamlet.realm,
|
|
|
|
)
|
|
|
|
sent_message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=False
|
|
|
|
)
|
|
|
|
|
|
|
|
# Delete the message and then unschedule the scheduled message
|
|
|
|
# before expiring the ArchivedMessages.
|
|
|
|
do_delete_messages(hamlet.realm, [Message.objects.get(id=sent_message_id)])
|
|
|
|
delete_scheduled_message(hamlet, scheduled_message_id)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Attempting to expire unclaimed attachments leaves the
|
|
|
|
# unreferenced Attachment which is protected by the
|
|
|
|
# ArchivedAttachment which has archived messages referencing
|
|
|
|
# it.
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Expiring archived messages leaves us with a dangling
|
|
|
|
# Attachment and ArchivedAttachment, with neither having
|
|
|
|
# referents.
|
|
|
|
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
|
|
|
|
clean_archived_data()
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=True, has_attachment=True, has_archived_attachment=True
|
|
|
|
)
|
|
|
|
|
|
|
|
# Having no referents in either place, it is now a target for
|
|
|
|
# removal
|
|
|
|
do_delete_old_unclaimed_attachments(1)
|
|
|
|
self.assert_exists(
|
|
|
|
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
|
|
|
|
)
|