2016-10-25 19:51:31 +02:00
|
|
|
|
|
|
|
from datetime import timedelta
|
2017-05-14 21:14:26 +02:00
|
|
|
|
2019-06-05 20:22:08 +02:00
|
|
|
from django.conf import settings
|
2017-05-14 21:14:26 +02:00
|
|
|
from django.db import connection, transaction
|
2019-06-06 15:10:39 +02:00
|
|
|
from django.db.models import Q
|
2017-04-15 04:03:56 +02:00
|
|
|
from django.utils.timezone import now as timezone_now
|
2019-06-10 19:20:09 +02:00
|
|
|
|
|
|
|
from zerver.lib.logging_util import log_to_file
|
2016-11-01 11:26:38 +01:00
|
|
|
from zerver.models import (Message, UserMessage, ArchivedMessage, ArchivedUserMessage, Realm,
|
2019-05-30 15:24:18 +02:00
|
|
|
Attachment, ArchivedAttachment, Reaction, ArchivedReaction,
|
2019-06-05 20:22:08 +02:00
|
|
|
SubMessage, ArchivedSubMessage, Recipient, Stream,
|
|
|
|
get_stream_recipients, get_user_including_cross_realm)
|
2016-10-25 19:51:31 +02:00
|
|
|
|
2019-06-13 15:25:52 +02:00
|
|
|
from typing import Any, Dict, Iterator, List
|
2016-10-25 19:51:31 +02:00
|
|
|
|
2019-06-10 19:20:09 +02:00
|
|
|
import logging
|
|
|
|
|
|
|
|
logger = logging.getLogger('zulip.retention')
|
|
|
|
log_to_file(logger, settings.RETENTION_LOG_PATH)
|
|
|
|
|
2019-06-10 18:09:50 +02:00
|
|
|
MESSAGE_BATCH_SIZE = 1000
|
|
|
|
|
2019-05-30 15:06:45 +02:00
|
|
|
models_with_message_key = [
|
|
|
|
{
|
|
|
|
'class': Reaction,
|
|
|
|
'archive_class': ArchivedReaction,
|
|
|
|
'table_name': 'zerver_reaction',
|
|
|
|
'archive_table_name': 'zerver_archivedreaction'
|
|
|
|
},
|
2019-05-30 15:24:18 +02:00
|
|
|
{
|
|
|
|
'class': SubMessage,
|
|
|
|
'archive_class': ArchivedSubMessage,
|
|
|
|
'table_name': 'zerver_submessage',
|
|
|
|
'archive_table_name': 'zerver_archivedsubmessage'
|
|
|
|
},
|
2019-06-05 20:47:04 +02:00
|
|
|
{
|
|
|
|
'class': UserMessage,
|
|
|
|
'archive_class': ArchivedUserMessage,
|
|
|
|
'table_name': 'zerver_usermessage',
|
|
|
|
'archive_table_name': 'zerver_archivedusermessage'
|
|
|
|
},
|
2019-05-30 15:06:45 +02:00
|
|
|
] # type: List[Dict[str, Any]]
|
2016-10-25 19:51:31 +02:00
|
|
|
|
2016-11-01 11:26:38 +01:00
|
|
|
@transaction.atomic
|
2019-06-06 13:25:56 +02:00
|
|
|
def move_rows(src_model: Any, raw_query: str, returning_id: bool=False,
|
|
|
|
**kwargs: Any) -> List[int]:
|
2016-11-01 11:26:38 +01:00
|
|
|
src_db_table = src_model._meta.db_table
|
|
|
|
src_fields = ["{}.{}".format(src_db_table, field.column) for field in src_model._meta.fields]
|
|
|
|
dst_fields = [field.column for field in src_model._meta.fields]
|
|
|
|
sql_args = {
|
|
|
|
'src_fields': ','.join(src_fields),
|
|
|
|
'dst_fields': ','.join(dst_fields),
|
|
|
|
'archive_timestamp': timezone_now()
|
|
|
|
}
|
|
|
|
sql_args.update(kwargs)
|
|
|
|
with connection.cursor() as cursor:
|
|
|
|
cursor.execute(
|
|
|
|
raw_query.format(**sql_args)
|
|
|
|
)
|
2019-06-02 17:29:25 +02:00
|
|
|
if returning_id:
|
|
|
|
return [row[0] for row in cursor.fetchall()] # return list of row ids
|
|
|
|
else:
|
|
|
|
return []
|
2016-10-25 19:51:31 +02:00
|
|
|
|
2019-06-02 17:29:25 +02:00
|
|
|
def ids_list_to_sql_query_format(ids: List[int]) -> str:
|
|
|
|
assert len(ids) > 0
|
2016-10-25 19:51:31 +02:00
|
|
|
|
2019-06-02 17:29:25 +02:00
|
|
|
ids_tuple = tuple(ids)
|
|
|
|
if len(ids_tuple) > 1:
|
|
|
|
ids_string = str(ids_tuple)
|
|
|
|
elif len(ids_tuple) == 1:
|
|
|
|
ids_string = '({})'.format(ids_tuple[0])
|
|
|
|
|
|
|
|
return ids_string
|
|
|
|
|
2019-06-10 18:09:50 +02:00
|
|
|
def run_message_batch_query(query: str, chunk_size: int=MESSAGE_BATCH_SIZE,
|
2019-06-13 15:25:52 +02:00
|
|
|
**kwargs: Any) -> Iterator[List[int]]:
|
2019-06-10 18:09:50 +02:00
|
|
|
while True:
|
|
|
|
new_chunk = move_rows(Message, query, chunk_size=chunk_size, **kwargs)
|
|
|
|
if new_chunk:
|
2019-06-13 15:25:52 +02:00
|
|
|
yield new_chunk
|
2019-06-10 18:09:50 +02:00
|
|
|
|
|
|
|
# We run the loop, until the query returns fewer results than chunk_size, which means we are done:
|
|
|
|
if len(new_chunk) < chunk_size:
|
2019-06-13 15:25:52 +02:00
|
|
|
break
|
2019-06-10 18:09:50 +02:00
|
|
|
|
|
|
|
# Note about batching these Message archiving queries:
|
|
|
|
# We can simply use LIMIT without worrying about OFFSETs and ordering
|
|
|
|
# while executing batches, because any Message already archived (in the previous batch)
|
|
|
|
# will not show up in the "SELECT ... FROM zerver_message ..." query for the next batches.
|
|
|
|
|
2019-06-05 20:22:08 +02:00
|
|
|
def move_expired_messages_to_archive_by_recipient(recipient: Recipient,
|
2019-06-10 18:09:50 +02:00
|
|
|
message_retention_days: int,
|
2019-06-13 15:25:52 +02:00
|
|
|
chunk_size: int=MESSAGE_BATCH_SIZE) -> Iterator[List[int]]:
|
|
|
|
# Important: This function is a generator, and you need to iterate
|
|
|
|
# through the Iterator it returns to execute the queries.
|
2016-11-01 11:26:38 +01:00
|
|
|
query = """
|
|
|
|
INSERT INTO zerver_archivedmessage ({dst_fields}, archive_timestamp)
|
2019-06-10 18:09:50 +02:00
|
|
|
SELECT {src_fields}, '{archive_timestamp}'
|
|
|
|
FROM zerver_message
|
|
|
|
LEFT JOIN zerver_archivedmessage ON zerver_archivedmessage.id = zerver_message.id
|
|
|
|
WHERE zerver_message.recipient_id = {recipient_id}
|
|
|
|
AND zerver_message.pub_date < '{check_date}'
|
|
|
|
AND zerver_archivedmessage.id is NULL
|
|
|
|
LIMIT {chunk_size}
|
2019-06-05 20:22:08 +02:00
|
|
|
RETURNING id
|
|
|
|
"""
|
|
|
|
check_date = timezone_now() - timedelta(days=message_retention_days)
|
|
|
|
|
2019-06-13 15:25:52 +02:00
|
|
|
yield from run_message_batch_query(query, returning_id=True,
|
|
|
|
recipient_id=recipient.id, check_date=check_date.isoformat(),
|
|
|
|
chunk_size=chunk_size)
|
2019-06-05 20:22:08 +02:00
|
|
|
|
2019-06-10 18:09:50 +02:00
|
|
|
def move_expired_personal_and_huddle_messages_to_archive(realm: Realm,
|
|
|
|
chunk_size: int=MESSAGE_BATCH_SIZE
|
2019-06-13 15:25:52 +02:00
|
|
|
) -> Iterator[List[int]]:
|
|
|
|
# Important: This function is a generator, and you need to iterate
|
|
|
|
# through the Iterator it returns to execute the queries.
|
2019-06-05 20:22:08 +02:00
|
|
|
cross_realm_bot_ids_list = [get_user_including_cross_realm(email).id
|
|
|
|
for email in settings.CROSS_REALM_BOT_EMAILS]
|
|
|
|
cross_realm_bot_ids = str(tuple(cross_realm_bot_ids_list))
|
|
|
|
recipient_types = (Recipient.PERSONAL, Recipient.HUDDLE)
|
|
|
|
|
|
|
|
# Archive expired personal and huddle Messages in the realm, except cross-realm messages:
|
|
|
|
# TODO: Remove the "zerver_userprofile.id NOT IN {cross_realm_bot_ids}" clause
|
|
|
|
# once https://github.com/zulip/zulip/issues/11015 is solved.
|
|
|
|
query = """
|
|
|
|
INSERT INTO zerver_archivedmessage ({dst_fields}, archive_timestamp)
|
2019-06-10 18:09:50 +02:00
|
|
|
SELECT {src_fields}, '{archive_timestamp}'
|
|
|
|
FROM zerver_message
|
|
|
|
INNER JOIN zerver_recipient ON zerver_recipient.id = zerver_message.recipient_id
|
|
|
|
INNER JOIN zerver_userprofile ON zerver_userprofile.id = zerver_message.sender_id
|
|
|
|
LEFT JOIN zerver_archivedmessage ON zerver_archivedmessage.id = zerver_message.id
|
|
|
|
WHERE zerver_userprofile.id NOT IN {cross_realm_bot_ids}
|
|
|
|
AND zerver_userprofile.realm_id = {realm_id}
|
|
|
|
AND zerver_recipient.type in {recipient_types}
|
|
|
|
AND zerver_message.pub_date < '{check_date}'
|
|
|
|
AND zerver_archivedmessage.id is NULL
|
|
|
|
LIMIT {chunk_size}
|
2019-06-02 17:29:25 +02:00
|
|
|
RETURNING id
|
2016-11-01 11:26:38 +01:00
|
|
|
"""
|
|
|
|
assert realm.message_retention_days is not None
|
|
|
|
check_date = timezone_now() - timedelta(days=realm.message_retention_days)
|
|
|
|
|
2019-06-13 15:25:52 +02:00
|
|
|
yield from run_message_batch_query(query, returning_id=True, cross_realm_bot_ids=cross_realm_bot_ids,
|
|
|
|
realm_id=realm.id, recipient_types=recipient_types,
|
|
|
|
check_date=check_date.isoformat(), chunk_size=chunk_size)
|
2019-06-02 17:29:25 +02:00
|
|
|
|
2019-06-10 19:38:17 +02:00
|
|
|
def move_to_archive_and_delete_models_with_message_key(msg_ids: List[int]) -> None:
|
2019-06-10 18:09:50 +02:00
|
|
|
assert len(msg_ids) > 0
|
2019-06-02 17:29:25 +02:00
|
|
|
|
2019-05-30 15:06:45 +02:00
|
|
|
for model in models_with_message_key:
|
|
|
|
query = """
|
2019-06-10 19:38:17 +02:00
|
|
|
WITH archived_data AS (
|
|
|
|
INSERT INTO {archive_table_name} ({dst_fields}, archive_timestamp)
|
|
|
|
SELECT {src_fields}, '{archive_timestamp}'
|
|
|
|
FROM {table_name}
|
|
|
|
LEFT JOIN {archive_table_name} ON {archive_table_name}.id = {table_name}.id
|
|
|
|
WHERE {table_name}.message_id IN {message_ids}
|
|
|
|
AND {archive_table_name}.id IS NULL
|
|
|
|
RETURNING id
|
|
|
|
)
|
|
|
|
DELETE FROM {table_name}
|
|
|
|
WHERE id IN (SELECT id FROM archived_data)
|
2019-05-30 15:06:45 +02:00
|
|
|
"""
|
2019-06-06 13:25:56 +02:00
|
|
|
move_rows(model['class'], query, table_name=model['table_name'],
|
|
|
|
archive_table_name=model['archive_table_name'],
|
|
|
|
message_ids=ids_list_to_sql_query_format(msg_ids))
|
2019-06-02 17:29:25 +02:00
|
|
|
|
2019-06-05 20:22:08 +02:00
|
|
|
def move_attachments_to_archive(msg_ids: List[int]) -> None:
|
2019-06-10 18:09:50 +02:00
|
|
|
assert len(msg_ids) > 0
|
2019-05-30 15:06:45 +02:00
|
|
|
|
2016-11-01 11:26:38 +01:00
|
|
|
query = """
|
|
|
|
INSERT INTO zerver_archivedattachment ({dst_fields}, archive_timestamp)
|
|
|
|
SELECT {src_fields}, '{archive_timestamp}'
|
|
|
|
FROM zerver_attachment
|
|
|
|
INNER JOIN zerver_attachment_messages
|
|
|
|
ON zerver_attachment_messages.attachment_id = zerver_attachment.id
|
|
|
|
LEFT JOIN zerver_archivedattachment ON zerver_archivedattachment.id = zerver_attachment.id
|
2019-06-02 17:29:25 +02:00
|
|
|
WHERE zerver_attachment_messages.message_id IN {message_ids}
|
2016-11-01 11:26:38 +01:00
|
|
|
AND zerver_archivedattachment.id IS NULL
|
|
|
|
GROUP BY zerver_attachment.id
|
|
|
|
"""
|
2019-06-06 13:25:56 +02:00
|
|
|
move_rows(Attachment, query, message_ids=ids_list_to_sql_query_format(msg_ids))
|
2016-11-01 11:26:38 +01:00
|
|
|
|
|
|
|
|
2019-06-12 17:05:58 +02:00
|
|
|
def move_attachment_messages_to_archive(msg_ids: List[int]) -> None:
|
2019-06-10 18:09:50 +02:00
|
|
|
assert len(msg_ids) > 0
|
2019-06-02 17:29:25 +02:00
|
|
|
|
2016-11-01 11:26:38 +01:00
|
|
|
query = """
|
2019-06-10 19:38:17 +02:00
|
|
|
WITH archived_data AS (
|
|
|
|
INSERT INTO zerver_archivedattachment_messages (id, archivedattachment_id, archivedmessage_id)
|
|
|
|
SELECT zerver_attachment_messages.id, zerver_attachment_messages.attachment_id,
|
|
|
|
zerver_attachment_messages.message_id
|
|
|
|
FROM zerver_attachment_messages
|
|
|
|
LEFT JOIN zerver_archivedattachment_messages
|
|
|
|
ON zerver_archivedattachment_messages.id = zerver_attachment_messages.id
|
|
|
|
WHERE zerver_attachment_messages.message_id IN {message_ids}
|
|
|
|
AND zerver_archivedattachment_messages.id IS NULL
|
|
|
|
RETURNING id
|
|
|
|
)
|
|
|
|
DELETE FROM zerver_attachment_messages
|
|
|
|
WHERE id IN (SELECT id FROM archived_data)
|
2016-11-01 11:26:38 +01:00
|
|
|
"""
|
|
|
|
with connection.cursor() as cursor:
|
2019-06-03 12:19:58 +02:00
|
|
|
cursor.execute(query.format(message_ids=ids_list_to_sql_query_format(msg_ids)))
|
2016-11-01 11:26:38 +01:00
|
|
|
|
2019-06-05 20:22:08 +02:00
|
|
|
def delete_messages(msg_ids: List[int]) -> None:
|
2019-06-06 20:41:07 +02:00
|
|
|
# Important note: This also deletes related objects with a foreign
|
|
|
|
# key to Message (due to `on_delete=CASCADE` in our models
|
|
|
|
# configuration), so we need to be sure we've taken care of
|
|
|
|
# archiving the messages before doing this step.
|
2019-06-05 20:47:04 +02:00
|
|
|
Message.objects.filter(id__in=msg_ids).delete()
|
2016-11-01 11:26:38 +01:00
|
|
|
|
2019-06-12 15:39:39 +02:00
|
|
|
def delete_expired_attachments(realm: Realm) -> None:
|
2019-06-10 19:20:09 +02:00
|
|
|
logger.info("Cleaning up attachments for realm " + realm.string_id)
|
2019-06-05 20:22:08 +02:00
|
|
|
Attachment.objects.filter(
|
2019-06-12 15:39:39 +02:00
|
|
|
messages__isnull=True,
|
|
|
|
realm_id=realm.id,
|
|
|
|
id__in=ArchivedAttachment.objects.filter(realm_id=realm.id),
|
2019-06-05 20:22:08 +02:00
|
|
|
).delete()
|
|
|
|
|
2019-06-05 20:47:04 +02:00
|
|
|
def move_related_objects_to_archive(msg_ids: List[int]) -> None:
|
2019-06-10 19:38:17 +02:00
|
|
|
move_to_archive_and_delete_models_with_message_key(msg_ids)
|
2019-06-05 20:22:08 +02:00
|
|
|
move_attachments_to_archive(msg_ids)
|
2019-06-12 17:05:58 +02:00
|
|
|
move_attachment_messages_to_archive(msg_ids)
|
2019-06-05 20:22:08 +02:00
|
|
|
|
2019-06-13 17:09:35 +02:00
|
|
|
def run_archiving_in_chunks(message_id_chunks: Iterator[List[int]]) -> int:
|
|
|
|
# This function is carefully designed to achieve our
|
|
|
|
# transactionality goals: A batch of messages is either fully
|
|
|
|
# archived-and-deleted or not transactionally.
|
|
|
|
#
|
|
|
|
# We implement this design by calling `next()` explicitly inside
|
|
|
|
# the `transaction.atomic()` block, ensuring that the queries that
|
|
|
|
# populate message_id_chunks run inside the same transaction block
|
|
|
|
# as the code that handles archiving related objects like
|
|
|
|
# UserMessage, Reaction, and Attachment.
|
|
|
|
message_count = 0
|
|
|
|
while True:
|
|
|
|
with transaction.atomic():
|
|
|
|
try:
|
|
|
|
chunk = next(message_id_chunks)
|
|
|
|
except StopIteration:
|
|
|
|
break
|
|
|
|
|
|
|
|
move_related_objects_to_archive(chunk)
|
|
|
|
delete_messages(chunk)
|
|
|
|
message_count += len(chunk)
|
|
|
|
|
|
|
|
return message_count
|
|
|
|
|
2019-06-10 18:09:50 +02:00
|
|
|
def archive_messages_by_recipient(recipient: Recipient, message_retention_days: int,
|
2019-06-10 19:20:09 +02:00
|
|
|
chunk_size: int=MESSAGE_BATCH_SIZE) -> int:
|
2019-06-10 18:09:50 +02:00
|
|
|
message_id_chunks = move_expired_messages_to_archive_by_recipient(recipient, message_retention_days,
|
|
|
|
chunk_size)
|
2019-06-13 17:09:35 +02:00
|
|
|
return run_archiving_in_chunks(message_id_chunks)
|
2019-06-05 20:22:08 +02:00
|
|
|
|
2019-06-12 15:39:39 +02:00
|
|
|
def archive_personal_and_huddle_messages(realm: Realm, chunk_size: int=MESSAGE_BATCH_SIZE) -> None:
|
2019-06-10 19:20:09 +02:00
|
|
|
logger.info("Archiving personal and huddle messages for realm " + realm.string_id)
|
|
|
|
|
2019-06-12 15:39:39 +02:00
|
|
|
message_id_chunks = move_expired_personal_and_huddle_messages_to_archive(realm, chunk_size)
|
2019-06-13 17:09:35 +02:00
|
|
|
message_count = run_archiving_in_chunks(message_id_chunks)
|
2019-06-10 19:20:09 +02:00
|
|
|
|
|
|
|
logger.info("Done. Archived {} messages".format(message_count))
|
2019-06-05 20:22:08 +02:00
|
|
|
|
2019-06-12 15:39:39 +02:00
|
|
|
def archive_stream_messages(realm: Realm, chunk_size: int=MESSAGE_BATCH_SIZE) -> None:
|
2019-06-10 19:20:09 +02:00
|
|
|
logger.info("Archiving stream messages for realm " + realm.string_id)
|
2019-06-12 15:39:39 +02:00
|
|
|
# We don't archive, if the stream has message_retention_days set to -1,
|
|
|
|
# or if neither the stream nor the realm have a retention policy.
|
|
|
|
streams = Stream.objects.exclude(message_retention_days=-1).filter(
|
|
|
|
Q(message_retention_days__isnull=False) | Q(realm__message_retention_days__isnull=False),
|
|
|
|
realm_id=realm.id
|
|
|
|
)
|
|
|
|
retention_policy_dict = {} # type: Dict[int, int]
|
|
|
|
for stream in streams:
|
|
|
|
# if stream.message_retention_days is null, use the realm's policy
|
|
|
|
if stream.message_retention_days:
|
|
|
|
retention_policy_dict[stream.id] = stream.message_retention_days
|
|
|
|
else:
|
|
|
|
retention_policy_dict[stream.id] = stream.realm.message_retention_days
|
|
|
|
|
|
|
|
recipients = get_stream_recipients([stream.id for stream in streams])
|
2019-06-10 19:20:09 +02:00
|
|
|
message_count = 0
|
2019-06-12 15:39:39 +02:00
|
|
|
for recipient in recipients:
|
2019-06-10 19:20:09 +02:00
|
|
|
message_count += archive_messages_by_recipient(
|
|
|
|
recipient, retention_policy_dict[recipient.type_id], chunk_size
|
|
|
|
)
|
|
|
|
|
|
|
|
logger.info("Done. Archived {} messages.".format(message_count))
|
2017-05-14 21:14:26 +02:00
|
|
|
|
2019-06-10 18:09:50 +02:00
|
|
|
def archive_messages(chunk_size: int=MESSAGE_BATCH_SIZE) -> None:
|
2019-06-10 19:20:09 +02:00
|
|
|
logger.info("Starting the archiving process with chunk_size {}".format(chunk_size))
|
|
|
|
|
2019-06-12 15:39:39 +02:00
|
|
|
for realm in Realm.objects.all():
|
|
|
|
archive_stream_messages(realm, chunk_size)
|
|
|
|
if realm.message_retention_days:
|
|
|
|
archive_personal_and_huddle_messages(realm, chunk_size)
|
2019-06-05 20:22:08 +02:00
|
|
|
|
2019-06-12 15:39:39 +02:00
|
|
|
# Messages have been archived for the realm, now we can clean up attachments:
|
|
|
|
delete_expired_attachments(realm)
|
2017-05-14 21:14:26 +02:00
|
|
|
|
|
|
|
@transaction.atomic
|
2018-09-13 16:29:12 +02:00
|
|
|
def move_messages_to_archive(message_ids: List[int]) -> None:
|
|
|
|
messages = list(Message.objects.filter(id__in=message_ids).values())
|
|
|
|
if not messages:
|
2017-05-14 21:14:26 +02:00
|
|
|
raise Message.DoesNotExist
|
2019-05-28 11:38:53 +02:00
|
|
|
|
|
|
|
ArchivedMessage.objects.bulk_create([ArchivedMessage(**message) for message in messages])
|
|
|
|
|
2019-06-12 17:24:15 +02:00
|
|
|
move_related_objects_to_archive(message_ids)
|
2017-05-14 21:14:26 +02:00
|
|
|
# Remove data from main tables
|
2019-06-12 17:24:15 +02:00
|
|
|
delete_messages(message_ids)
|
2019-05-28 11:38:53 +02:00
|
|
|
|
2018-09-13 16:29:12 +02:00
|
|
|
archived_attachments = ArchivedAttachment.objects.filter(messages__id__in=message_ids).distinct()
|
2017-05-14 21:14:26 +02:00
|
|
|
Attachment.objects.filter(messages__isnull=True, id__in=archived_attachments).delete()
|