realm_export: Handle hard head-of-queue failures.

Realm exports may OOM on deployments with low memory. To ensure
forward progress, record the start time in the export's RealmAuditLog
entry; if the queue worker later sees an export which already has a
start time recorded, it marks that export as failed instead of
re-attempting an export which was already tried once.
This commit is contained in:
Alex Vandiver 2023-05-16 16:19:06 +00:00 committed by Tim Abbott
parent 4a43856ba7
commit 7811e99548
2 changed files with 40 additions and 0 deletions

View File

@ -9,6 +9,7 @@ from django.utils.timezone import now as timezone_now
from analytics.models import RealmCount from analytics.models import RealmCount
from zerver.lib.exceptions import JsonableError from zerver.lib.exceptions import JsonableError
from zerver.lib.queue import queue_json_publish
from zerver.lib.test_classes import ZulipTestCase from zerver.lib.test_classes import ZulipTestCase
from zerver.lib.test_helpers import ( from zerver.lib.test_helpers import (
HostRequestMock, HostRequestMock,
@ -243,6 +244,31 @@ class RealmExportTest(ZulipTestCase):
result = self.client_delete(f"/json/export/realm/{export_id}") result = self.client_delete(f"/json/export/realm/{export_id}")
self.assert_json_error(result, "Export failed, nothing to delete") self.assert_json_error(result, "Export failed, nothing to delete")
# If the queue worker sees the same export-id again, it aborts
# instead of retrying
with patch("zerver.lib.export.do_export_realm") as mock_export:
with self.assertLogs(level="INFO") as info_logs:
queue_json_publish(
"deferred_work",
{
"type": "realm_export",
"time": 42,
"realm_id": admin.realm.id,
"user_profile_id": admin.id,
"id": export_id,
},
)
mock_export.assert_not_called()
self.assertEqual(
info_logs.output,
[
(
"ERROR:zerver.worker.queue_processors:Marking export for realm zulip "
"as failed due to retry -- possible OOM during export?"
)
],
)
def test_realm_export_rate_limited(self) -> None: def test_realm_export_rate_limited(self) -> None:
admin = self.example_user("iago") admin = self.example_user("iago")
self.login_user(admin) self.login_user(admin)

View File

@ -1066,6 +1066,20 @@ class DeferredWorker(QueueProcessingWorker):
extra_data = {} extra_data = {}
if export_event.extra_data is not None: if export_event.extra_data is not None:
extra_data = orjson.loads(export_event.extra_data) extra_data = orjson.loads(export_event.extra_data)
if extra_data.get("started_timestamp") is not None:
logger.error(
"Marking export for realm %s as failed due to retry -- possible OOM during export?",
realm.string_id,
)
extra_data["failed_timestamp"] = timezone_now().timestamp()
export_event.extra_data = orjson.dumps(extra_data).decode()
export_event.save(update_fields=["extra_data"])
notify_realm_export(user_profile)
return
extra_data["started_timestamp"] = timezone_now().timestamp()
export_event.extra_data = orjson.dumps(extra_data).decode()
export_event.save(update_fields=["extra_data"])
logger.info( logger.info(
"Starting realm export for realm %s into %s, initiated by user_profile_id %s", "Starting realm export for realm %s into %s, initiated by user_profile_id %s",