mirror of https://github.com/zulip/zulip.git
upload: Batch deleting old attachments.
delete_message_attachments() is faster than calling delete_message_attachment() one-by-one.
This commit is contained in:
parent
502b9a76a9
commit
5f25fae0fa
|
@ -3,7 +3,7 @@ from typing import Any
|
|||
|
||||
from zerver.lib.attachments import get_old_unclaimed_attachments, validate_attachment_request
|
||||
from zerver.lib.markdown import MessageRenderingResult
|
||||
from zerver.lib.upload import claim_attachment, delete_message_attachment
|
||||
from zerver.lib.upload import claim_attachment, delete_message_attachments
|
||||
from zerver.models import Attachment, Message, ScheduledMessage, Stream, UserProfile
|
||||
from zerver.tornado.django_api import send_event_on_commit
|
||||
|
||||
|
@ -62,6 +62,9 @@ def do_claim_attachments(
|
|||
return claimed
|
||||
|
||||
|
||||
DELETE_BATCH_SIZE = 1000
|
||||
|
||||
|
||||
def do_delete_old_unclaimed_attachments(weeks_ago: int) -> None:
|
||||
old_unclaimed_attachments, old_unclaimed_archived_attachments = get_old_unclaimed_attachments(
|
||||
weeks_ago
|
||||
|
@ -71,14 +74,23 @@ def do_delete_old_unclaimed_attachments(weeks_ago: int) -> None:
|
|||
# ArchiveAttachments in the same run; prevent warnings from the
|
||||
# backing store by only removing it from there once.
|
||||
already_removed = set()
|
||||
storage_paths = []
|
||||
for attachment in old_unclaimed_attachments:
|
||||
delete_message_attachment(attachment.path_id)
|
||||
storage_paths.append(attachment.path_id)
|
||||
already_removed.add(attachment.path_id)
|
||||
attachment.delete()
|
||||
if len(storage_paths) >= DELETE_BATCH_SIZE:
|
||||
delete_message_attachments(storage_paths)
|
||||
storage_paths = []
|
||||
for archived_attachment in old_unclaimed_archived_attachments:
|
||||
if archived_attachment.path_id not in already_removed:
|
||||
delete_message_attachment(archived_attachment.path_id)
|
||||
storage_paths.append(archived_attachment.path_id)
|
||||
archived_attachment.delete()
|
||||
if len(storage_paths) >= DELETE_BATCH_SIZE:
|
||||
delete_message_attachments(storage_paths)
|
||||
storage_paths = []
|
||||
if storage_paths:
|
||||
delete_message_attachments(storage_paths)
|
||||
|
||||
|
||||
def check_attachment_reference_change(
|
||||
|
|
|
@ -2,6 +2,7 @@ import os
|
|||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from io import StringIO
|
||||
from unittest.mock import patch
|
||||
|
||||
import time_machine
|
||||
from django.conf import settings
|
||||
|
@ -337,3 +338,62 @@ class UnclaimedAttachmentTest(UploadSerializeMixin, ZulipTestCase):
|
|||
self.assert_exists(
|
||||
attachment, has_file=False, has_attachment=False, has_archived_attachment=False
|
||||
)
|
||||
|
||||
def test_delete_batch_size(self) -> None:
|
||||
attachments = [self.make_attachment("unused.txt") for _ in range(10)]
|
||||
|
||||
with (
|
||||
patch("zerver.actions.uploads.DELETE_BATCH_SIZE", 6),
|
||||
patch("zerver.actions.uploads.delete_message_attachments") as delete_mock,
|
||||
):
|
||||
do_delete_old_unclaimed_attachments(1)
|
||||
|
||||
# We expect all of the 10 attachments to be deleted,
|
||||
# across two different calls of 6- and 4-element lists
|
||||
self.assertEqual(delete_mock.call_count, 2)
|
||||
self.assert_length(delete_mock.call_args_list[0][0][0], 6)
|
||||
self.assert_length(delete_mock.call_args_list[1][0][0], 4)
|
||||
|
||||
self.assertEqual(
|
||||
set(delete_mock.call_args_list[0][0][0] + delete_mock.call_args_list[1][0][0]),
|
||||
{attachment.path_id for attachment in attachments},
|
||||
)
|
||||
|
||||
def test_delete_batch_size_archived(self) -> None:
|
||||
hamlet = self.example_user("hamlet")
|
||||
attachments = [self.make_attachment("unused.txt") for _ in range(20)]
|
||||
|
||||
# Send message referencing 10/20 of those attachments
|
||||
self.subscribe(hamlet, "Denmark")
|
||||
body = "Some files here\n" + "\n".join(
|
||||
f"[a](http://{hamlet.realm.host}/user_uploads/{attachment.path_id}"
|
||||
for attachment in attachments[:10]
|
||||
)
|
||||
message_id = self.send_stream_message(hamlet, "Denmark", body, "test")
|
||||
|
||||
# Delete and purge the message, leaving both the ArchivedAttachments dangling
|
||||
do_delete_messages(hamlet.realm, [Message.objects.get(id=message_id)])
|
||||
with self.settings(ARCHIVED_DATA_VACUUMING_DELAY_DAYS=0):
|
||||
clean_archived_data()
|
||||
|
||||
# Removing unclaimed attachments now cleans them all out
|
||||
with (
|
||||
patch("zerver.actions.uploads.DELETE_BATCH_SIZE", 6),
|
||||
patch("zerver.actions.uploads.delete_message_attachments") as delete_mock,
|
||||
):
|
||||
do_delete_old_unclaimed_attachments(1)
|
||||
|
||||
# We expect all of the 20 attachments (10 of which are
|
||||
# ArchivedAttachments) to be deleted, across four different
|
||||
# calls: 6, 6, 6, 2
|
||||
self.assertEqual(delete_mock.call_count, 4)
|
||||
self.assert_length(delete_mock.call_args_list[0][0][0], 6)
|
||||
self.assert_length(delete_mock.call_args_list[1][0][0], 6)
|
||||
self.assert_length(delete_mock.call_args_list[2][0][0], 6)
|
||||
self.assert_length(delete_mock.call_args_list[3][0][0], 2)
|
||||
|
||||
deleted_path_ids = {elem for call in delete_mock.call_args_list for elem in call[0][0]}
|
||||
self.assertEqual(
|
||||
deleted_path_ids,
|
||||
{attachment.path_id for attachment in attachments},
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue