migrations: Backfill extra_data_json for audit log entries.

This migration is reasonably complex because of various anomalies in existing
data.

Note that there are cases where extra_data contains data that is not
proper JSON — possibly using single quotes. Thus we need to use
"ast.literal_eval" to cover that.

There is also a special case for "event_type == USER_FULL_NAME_CHANGED",
where extra_data is a plain str. This event_type is only used for
RealmAuditLog, so the zilencer migration script does not need to handle
it.

The migration does not handle "event_type == REALM_DISCOUNT_CHANGED"
because ast.literal_eval only allows Python literals. We expect the admin
to populate the jsonified extra_data for extra_data_json manually
beforehand.

This chunks the backfilling migration to reduce potential block time.

The migration for zilencer is mostly similar to the one for zerver; except that
the backfill helper is added in a wrapper and unrelated events are
removed.

**Logging and error recovery**

We print out a warning when the extra_data_json field of an entry
would have been overwritten by a value inconsistent with what we derived
from extra_data. Usually this only happens when the extra_data was
corrupted before this migration. This prevents data loss by backing up
possibly corrupted data in extra_data_json with the keys
"inconsistent_old_extra_data" and "inconsistent_old_extra_data_json".
More roundtrips to the database are needed for inconsistent data, which are
expected to be infrequent.

This also outputs messages when there are audit log entries with decimals,
indicating that such entries are not backfilled. Do note that audit log
entries with decimals are not populated with "inconsistent_old_extra_data_*"
in the JSONField, because they are not overwritten.

For audit log entries whose "extra_data_json" is already marked as
inconsistent, we skip them in the migration: once we have discovered anomalies
in a previous run, there is no need to overwrite them again, which would nest
the extra keys we previously added.

**Testing**

We create a migration test case utilizing the property of bulk_create
that it doesn't call our modified save method.

We extend ZulipTestCase to support verifying console output at the test
case level. The implementation is crude but the use case should be rare
enough that we don't need it to be too elaborate.

Signed-off-by: Zixuan James Li <p359101898@gmail.com>
This commit is contained in:
Zixuan Li 2023-07-15 12:43:23 -04:00 committed by GitHub
parent b29ec4d62e
commit a0cf624eaa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 570 additions and 78 deletions

View File

@ -141,6 +141,10 @@ class UploadSerializeMixin(SerializeMixin):
class ZulipTestCaseMixin(SimpleTestCase):
# Ensure that the test system just shows us diffs
maxDiff: Optional[int] = None
# This bypasses BAN_CONSOLE_OUTPUT for the test case when set.
# Override this to verify if the given extra console output matches the
# expectation.
expected_console_output: Optional[str] = None
def setUp(self) -> None:
super().setUp()
@ -170,7 +174,7 @@ class ZulipTestCaseMixin(SimpleTestCase):
self.mock_initialize.stop()
def run(self, result: Optional[TestResult] = None) -> Optional[TestResult]: # nocoverage
if not settings.BAN_CONSOLE_OUTPUT:
if not settings.BAN_CONSOLE_OUTPUT and self.expected_console_output is None:
return super().run(result)
extra_output_finder = ExtraConsoleOutputFinder()
with tee_stderr_and_find_extra_console_output(
@ -180,6 +184,11 @@ class ZulipTestCaseMixin(SimpleTestCase):
if extra_output_finder.full_extra_output and (
test_result is None or test_result.wasSuccessful()
):
extra_output = extra_output_finder.full_extra_output.decode(errors="replace")
if self.expected_console_output is not None:
self.assertEqual(extra_output, self.expected_console_output)
return test_result
exception_message = f"""
---- UNEXPECTED CONSOLE OUTPUT DETECTED ----
@ -196,7 +205,7 @@ You should be able to quickly reproduce this failure with:
./tools/test-backend --ban-console-output {self.id()}
Output:
{extra_output_finder.full_extra_output.decode(errors="replace")}
{extra_output}
--------------------------------------------
"""
raise ExtraConsoleOutputInTestError(exception_message)

View File

@ -0,0 +1,181 @@
# Generated by Django 4.0.7 on 2022-09-30 20:30
import ast
from typing import List, Tuple, Type
import orjson
from django.db import migrations, transaction
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import F, JSONField, Model
from django.db.models.functions import Cast, JSONObject
# This migration is mostly the same as
# backfill_remote_realmauditlog_extradata_to_json_field in zilencer.
# Keys used inside extra_data_json to record the old/new values of a
# changed field (matches RealmAuditLog's OLD_VALUE/NEW_VALUE convention).
OLD_VALUE = "1"
NEW_VALUE = "2"
# RealmAuditLog event type codes, inlined here because a migration cannot
# import zerver.models.
USER_FULL_NAME_CHANGED = 124
REALM_DISCOUNT_CHANGED = 209
# Number of audit log ids covered per transaction, to bound lock time.
BATCH_SIZE = 5000
# Printed for REALM_DISCOUNT_CHANGED entries, which this migration skips;
# the admin is expected to reconcile these manually (see module docstring).
DISCOUNT_DATA_TEMPLATE = """Audit log entry {id} with event type REALM_DISCOUNT_CHANGED is skipped.
The data consistency needs to be manually checked.
Discount data to remove after the upcoming JSONField migration:
{data_to_remove}
Discount data to keep after the upcoming JSONField migration:
{data_to_keep}
"""
# Printed when a pre-existing extra_data_json value disagrees with the
# value derived from extra_data, before the old value is backed up.
OVERWRITE_TEMPLATE = """Audit log entry with id {id} has extra_data_json been inconsistently overwritten.
The old value is:
{old_value}
The new value is:
{new_value}
"""
@transaction.atomic
def do_bulk_backfill_extra_data(
    audit_log_model: Type[Model], id_lower_bound: int, id_upper_bound: int
) -> None:
    """Backfill extra_data_json from extra_data for rows whose id lies in
    [id_lower_bound, id_upper_bound] (inclusive).

    Three shapes of extra_data are handled:
    - USER_FULL_NAME_CHANGED rows, where extra_data is a plain str;
    - JSON-encoded extra_data (double-quoted), cast in the database;
    - str(dict(...)) extra_data (starts with "{'"), decoded in Python with
      ast.literal_eval.

    REALM_DISCOUNT_CHANGED rows are skipped with a printed notice. Rows
    whose pre-existing extra_data_json disagrees with the derived value are
    preserved under the "inconsistent_old_extra_data(_json)" keys and a
    warning is printed. The whole batch runs in a single transaction.
    """
    # First handle the special case for audit logs with the
    # USER_FULL_NAME_CHANGED event, which stores the full name not as
    # str(dict()) but a plain str. Note that we only update the entries where
    # extra_data_json has the default value, because we do not want to override
    # existing audit log entries with a NEW_VALUE of None for extra_data_json.
    # We do not need to skip existing entries for other parts of backfilling
    # because we have double-write implemented so that the backfilled value
    # will still be consistent.
    audit_log_model.objects.filter(
        event_type=USER_FULL_NAME_CHANGED,
        id__range=(id_lower_bound, id_upper_bound),
        extra_data_json={},
        # extra_data used to keeps track of the old name. As a result, we know
        # nothing about what NEW_VALUE would be especially if the name has been
        # changed multiple times. extra_data_json is a JSONObject whose
        # OLD_VALUE and NEW_VALUE is mapped from the value of the extra_data
        # field (which is just a old full name string) and None, respectively.
        # Documentation for JSONObject:
        # https://docs.djangoproject.com/en/4.2/ref/models/database-functions/#jsonobject
    ).update(extra_data_json=JSONObject(**{OLD_VALUE: "extra_data", NEW_VALUE: None}))
    # Tuples of (id, extra_data, old extra_data_json, derived extra_data_json)
    # for rows whose existing extra_data_json would be changed by the backfill.
    inconsistent_extra_data_json: List[Tuple[int, str, object, object]] = []
    # A dict converted with str() will start with a open bracket followed by a
    # single quote, as opposed to a JSON-encoded value, which will use a
    # _double_ quote. We use this to filter out those entries with malformed
    # extra_data to be handled later. This should only update rows with
    # extra_data populated with orjson.dumps.
    # The first query below checks for entries that would have extra_data_json
    # being overwritten by the migration with a value inconsistent with its
    # previous value.
    inconsistent_extra_data_json.extend(
        audit_log_model.objects.filter(
            extra_data__isnull=False, id__range=(id_lower_bound, id_upper_bound)
        )
        .annotate(new_extra_data_json=Cast("extra_data", output_field=JSONField()))
        .exclude(extra_data__startswith="{'")
        .exclude(event_type=USER_FULL_NAME_CHANGED)
        .exclude(extra_data_json={})
        .exclude(extra_data_json=F("new_extra_data_json"))
        .values_list("id", "extra_data", "extra_data_json", "new_extra_data_json")
    )
    # Bulk-cast valid JSON extra_data into extra_data_json entirely in the
    # database. Rows already flagged with "inconsistent_old_extra_data" are
    # left alone so a re-run does not nest the backup keys again.
    (
        audit_log_model.objects.filter(
            extra_data__isnull=False,
            id__range=(id_lower_bound, id_upper_bound),
            extra_data_json__inconsistent_old_extra_data__isnull=True,
        )
        .exclude(extra_data__startswith="{'")
        .exclude(event_type=USER_FULL_NAME_CHANGED)
        .update(extra_data_json=Cast("extra_data", output_field=JSONField()))
    )
    # Remaining rows store str(dict(...)) and must be decoded in Python.
    python_valued_audit_log_entries = audit_log_model.objects.filter(
        extra_data__startswith="{'",
        id__range=(id_lower_bound, id_upper_bound),
        extra_data_json__inconsistent_old_extra_data__isnull=True,
    )
    for audit_log_entry in python_valued_audit_log_entries:
        # extra_data for entries that store dict stringified with builtins.str()
        # are converted back with ast.literal_eval for safety and efficiency.
        # str()'d extra_data with the REALM_DISCOUNT_CHANGED event type is not
        # handled by this migration. We expect that all such entries are
        # manually converted beforehand or an error will occur during the
        # migration, because ast.literal_eval does not allow the evaluation of
        # Decimal.
        old_value = audit_log_entry.extra_data_json  # type: ignore[attr-defined] # The migration cannot depend on zerver.models, which contains the real type of the RealmAuditLog model, so it cannot be properly typed.
        if audit_log_entry.event_type == REALM_DISCOUNT_CHANGED:  # type: ignore[attr-defined] # Explained above.
            print(
                DISCOUNT_DATA_TEMPLATE.format(
                    id=audit_log_entry.id,  # type: ignore[attr-defined] # Explained above.
                    data_to_remove=audit_log_entry.extra_data,  # type: ignore[attr-defined] # Explained above.
                    data_to_keep=old_value,
                )
            )
            continue
        new_value = ast.literal_eval(audit_log_entry.extra_data)  # type: ignore[attr-defined] # Explained above.
        if old_value != {} and old_value != new_value:
            inconsistent_extra_data_json.append((audit_log_entry.id, audit_log_entry.extra_data, old_value, new_value))  # type: ignore[attr-defined] # Explained above.
        audit_log_entry.extra_data_json = new_value  # type: ignore[attr-defined] # Explained above.
    audit_log_model.objects.bulk_update(python_valued_audit_log_entries, fields=["extra_data_json"])
    if inconsistent_extra_data_json:
        # Extra per-row roundtrips; expected to be infrequent (corrupted data).
        audit_log_entries = []
        for (
            audit_log_entry_id,
            old_extra_data,
            old_extra_data_json,
            new_extra_data_json,
        ) in inconsistent_extra_data_json:
            audit_log_entry = audit_log_model.objects.get(id=audit_log_entry_id)
            assert isinstance(old_extra_data_json, dict)
            if "inconsistent_old_extra_data" in old_extra_data_json:
                # Skip entries that have been backfilled and detected as
                # anomalies before.
                continue
            assert isinstance(new_extra_data_json, dict)
            # Keep the derived value but back up both old representations so
            # no data is lost by the overwrite.
            audit_log_entry.extra_data_json = {  # type: ignore[attr-defined] # Explained above.
                **new_extra_data_json,
                "inconsistent_old_extra_data": old_extra_data,
                "inconsistent_old_extra_data_json": old_extra_data_json,
            }
            audit_log_entries.append(audit_log_entry)
            print(
                OVERWRITE_TEMPLATE.format(
                    id=audit_log_entry_id,
                    old_value=orjson.dumps(old_extra_data_json).decode(),
                    new_value=orjson.dumps(new_extra_data_json).decode(),
                )
            )
        audit_log_model.objects.bulk_update(audit_log_entries, fields=["extra_data_json"])
def backfill_extra_data(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
    """Chunked driver for the backfill: walk the id range of RealmAuditLog
    rows that have extra_data set, backfilling BATCH_SIZE + 1 ids per
    transaction to keep lock times short."""
    audit_log_model = apps.get_model("zerver", "RealmAuditLog")
    entries_with_extra_data = audit_log_model.objects.filter(extra_data__isnull=False)
    if not entries_with_extra_data.exists():
        # Nothing to backfill.
        return
    batch_start = entries_with_extra_data.earliest("id").id
    max_id = entries_with_extra_data.latest("id").id
    while batch_start <= max_id:
        # Each chunk covers the inclusive id range [batch_start, batch_end].
        batch_end = min(batch_start + BATCH_SIZE, max_id)
        do_bulk_backfill_extra_data(audit_log_model, batch_start, batch_end)
        batch_start = batch_end + 1
class Migration(migrations.Migration):
    # Not atomic as a whole: each chunk commits in its own transaction (see
    # @transaction.atomic on do_bulk_backfill_extra_data), so a long backfill
    # does not hold locks for the entire run.
    atomic = False
    dependencies = [
        ("zerver", "0459_remove_invalid_characters_from_user_group_name"),
    ]
    operations = [
        # elidable=True: this data backfill can be dropped when migrations
        # are squashed.
        migrations.RunPython(
            backfill_extra_data, reverse_code=migrations.RunPython.noop, elidable=True
        ),
    ]

View File

@ -4,8 +4,12 @@
# You can also read
# https://www.caktusgroup.com/blog/2016/02/02/writing-unit-tests-django-migrations/
# to get a tutorial on the framework that inspired this feature.
from decimal import Decimal
from typing import Optional
import orjson
from django.db.migrations.state import StateApps
from django.utils.timezone import now as timezone_now
from zerver.lib.test_classes import MigrationsTestCase
from zerver.lib.test_helpers import use_db_models
@ -25,89 +29,239 @@ from zerver.lib.test_helpers import use_db_models
# As a result, we generally mark these tests as skipped once they have
# been tested for a migration being merged.
# RealmAuditLog event type codes and extra_data_json keys, inlined here
# because migration tests run against historical model state and cannot
# import zerver.models.
USER_ACTIVATED = 102
USER_FULL_NAME_CHANGED = 124
REALM_DISCOUNT_CHANGED = 209
OLD_VALUE = "1"
NEW_VALUE = "2"
class LinkifierURLFormatString(MigrationsTestCase):
migrate_from = "0440_realmfilter_url_template"
migrate_to = "0441_backfill_realmfilter_url_template"
class RealmAuditLogExtraData(MigrationsTestCase):
migrate_from = "0459_remove_invalid_characters_from_user_group_name"
migrate_to = "0460_backfill_realmauditlog_extradata_to_json_field"
full_name_change_log_id: Optional[int] = None
valid_json_log_id: Optional[int] = None
str_json_log_id: Optional[int] = None
# The BATCH_SIZE is defined as 5000 in
# backfill_realmauditlog_extradata_to_json_field, this later is used to test
# if batching works properly.
DATA_SIZE = 10005
expected_console_output = """Audit log entry 50003 with event type REALM_DISCOUNT_CHANGED is skipped.
The data consistency needs to be manually checked.
Discount data to remove after the upcoming JSONField migration:
{'old_discount': Decimal('25.0000'), 'new_discount': Decimal('50')}
Discount data to keep after the upcoming JSONField migration:
{}
Audit log entry 50004 with event type REALM_DISCOUNT_CHANGED is skipped.
The data consistency needs to be manually checked.
Discount data to remove after the upcoming JSONField migration:
{'old_discount': Decimal('25.0000'), 'new_discount': Decimal('50')}
Discount data to keep after the upcoming JSONField migration:
{'new_discount': '50', 'old_discount': '25.0000'}
Audit log entry with id 50001 has extra_data_json been inconsistently overwritten.
The old value is:
{"corrupted":"foo"}
The new value is:
{"key":"value"}
Audit log entry with id 50002 has extra_data_json been inconsistently overwritten.
The old value is:
{"corrupted":"bar"}
The new value is:
{"key":"value"}
"""
@use_db_models
def setUpBeforeMigration(self, apps: StateApps) -> None:
RealmFilter = apps.get_model("zerver", "RealmFilter")
Realm = apps.get_model("zerver", "Realm")
RealmAuditLog = apps.get_model("zerver", "RealmAuditLog")
event_time = timezone_now()
realm = Realm.objects.get(string_id="zulip")
iago = self.example_user("iago")
full_name_change_log = RealmAuditLog(
realm=realm,
event_type=USER_FULL_NAME_CHANGED,
event_time=event_time,
extra_data="foo",
)
urls = [
"http://example.com/",
"https://example.com/",
"https://user:password@example.com/",
"https://example.com/@user/thing",
"https://example.com/!path",
"https://example.com/foo.bar",
"https://example.com/foo[bar]",
"https://example.com/{foo}",
"https://example.com/{foo}{bars}",
"https://example.com/{foo}/and/{bar}",
"https://example.com/?foo={foo}",
"https://example.com/%ab",
"https://example.com/%ba",
"https://example.com/%21",
"https://example.com/words%20with%20spaces",
"https://example.com/back%20to%20{back}",
"https://example.com/encoded%2fwith%2fletters",
"https://example.com/encoded%2Fwith%2Fupper%2Fcase%2Fletters",
"https://example.com/%%",
"https://example.com/%%(",
"https://example.com/%%()",
"https://example.com/%%(foo",
"https://example.com/%%(foo)",
"https://example.com/%%(foo)s",
"https://example.com/%(foo)s",
"https://example.com/%(foo)s%(bar)s",
]
self.linkifier_ids = []
new_full_name_change_log = RealmAuditLog(
realm=realm,
event_type=USER_FULL_NAME_CHANGED,
event_time=event_time,
extra_data="foo",
extra_data_json={OLD_VALUE: "foo", NEW_VALUE: "bar"},
)
for index, url in enumerate(urls):
self.linkifier_ids.append(
RealmFilter.objects.create(
realm=iago.realm,
pattern=f"dummy{index}",
url_format_string=url,
).id
valid_json_log = RealmAuditLog(
realm=realm,
event_type=USER_ACTIVATED,
event_time=event_time,
extra_data=orjson.dumps({"key": "value"}).decode(),
)
str_json_log = RealmAuditLog(
realm=realm,
event_type=USER_ACTIVATED,
event_time=event_time,
extra_data=str({"key": "value"}),
)
self.backfilled_inconsistent_log_id = RealmAuditLog.objects.create(
realm=realm,
event_type=USER_ACTIVATED,
event_time=event_time,
extra_data=orjson.dumps({"key": "baz"}).decode(),
extra_data_json={
"key": "baz",
"inconsistent_old_extra_data": orjson.dumps({"key": "baz"}).decode(),
"inconsistent_old_extra_data_json": {"key": "value corrupted"},
},
).id
# The following audit log entries have preset ids because we use
# them to assert the generated log output that is defined before
# the test case is run.
inconsistent_json_log = RealmAuditLog(
id=50001,
realm=realm,
event_type=USER_ACTIVATED,
event_time=event_time,
extra_data=orjson.dumps({"key": "value"}).decode(),
extra_data_json={"corrupted": "foo"},
)
inconsistent_str_json_log = RealmAuditLog(
id=50002,
realm=realm,
event_type=USER_ACTIVATED,
event_time=event_time,
extra_data=str({"key": "value"}),
extra_data_json={"corrupted": "bar"},
)
self.old_decimal_log_id = RealmAuditLog.objects.create(
id=50003,
realm=realm,
event_type=REALM_DISCOUNT_CHANGED,
event_time=event_time,
extra_data=str({"old_discount": Decimal("25.0000"), "new_discount": Decimal("50")}),
).id
self.new_decimal_log_id = RealmAuditLog.objects.create(
id=50004,
realm=realm,
event_type=REALM_DISCOUNT_CHANGED,
event_time=event_time,
extra_data=str({"old_discount": Decimal("25.0000"), "new_discount": Decimal("50")}),
extra_data_json={"old_discount": Decimal("25.0000"), "new_discount": Decimal("50")},
).id
RealmAuditLog.objects.bulk_create(
[
full_name_change_log,
new_full_name_change_log,
valid_json_log,
str_json_log,
inconsistent_json_log,
inconsistent_str_json_log,
]
)
self.full_name_change_log_id = full_name_change_log.id
self.new_full_name_change_log_id = new_full_name_change_log.id
self.valid_json_log_id = valid_json_log.id
self.str_json_log_id = str_json_log.id
other_logs = []
for i in range(self.DATA_SIZE):
other_logs.append(
RealmAuditLog(
realm=realm,
event_type=USER_ACTIVATED,
event_time=event_time,
extra_data=orjson.dumps({"data": i}).decode(),
)
)
def test_converted_url_templates(self) -> None:
RealmFilter = self.apps.get_model("zerver", "RealmFilter")
expected_urls = [
"http://example.com/",
"https://example.com/",
"https://user:password@example.com/",
"https://example.com/@user/thing",
"https://example.com/!path",
"https://example.com/foo.bar",
"https://example.com/foo[bar]",
"https://example.com/%7Bfoo%7D",
"https://example.com/%7Bfoo%7D%7Bbars%7D",
"https://example.com/%7Bfoo%7D/and/%7Bbar%7D",
"https://example.com/?foo=%7Bfoo%7D",
"https://example.com/%ab",
"https://example.com/%ba",
"https://example.com/%21",
"https://example.com/words%20with%20spaces",
"https://example.com/back%20to%20%7Bback%7D",
"https://example.com/encoded%2fwith%2fletters",
"https://example.com/encoded%2Fwith%2Fupper%2Fcase%2Fletters",
"https://example.com/%",
"https://example.com/%(",
"https://example.com/%()",
"https://example.com/%(foo",
"https://example.com/%(foo)",
"https://example.com/%(foo)s",
"https://example.com/{foo}",
"https://example.com/{foo}{bar}",
self.other_logs_id = [
audit_log.id for audit_log in RealmAuditLog.objects.bulk_create(other_logs)
]
for linkifier_id, expected in zip(self.linkifier_ids, expected_urls):
linkifier = RealmFilter.objects.filter(id=linkifier_id).first()
self.assertIsNotNone(linkifier)
self.assertEqual(linkifier.url_template, expected)
# No new audit log entry should have extra_data_json populated as of
# now except for the entries created with non-default values.
self.assert_length(
RealmAuditLog.objects.filter(
event_time__gte=event_time,
).exclude(
extra_data_json={},
),
5,
)
def test_realmaudit_log_extra_data_to_json(self) -> None:
RealmAuditLog = self.apps.get_model("zerver", "RealmAuditLog")
self.assertIsNotNone(self.full_name_change_log_id)
self.assertIsNotNone(self.valid_json_log_id)
self.assertIsNotNone(self.str_json_log_id)
full_name_change_log = RealmAuditLog.objects.filter(id=self.full_name_change_log_id).first()
new_full_name_change_log = RealmAuditLog.objects.filter(
id=self.new_full_name_change_log_id
).first()
valid_json_log = RealmAuditLog.objects.filter(id=self.valid_json_log_id).first()
str_json_log = RealmAuditLog.objects.filter(id=self.str_json_log_id).first()
self.assertIsNotNone(full_name_change_log)
self.assertEqual(full_name_change_log.extra_data_json, {"1": "foo", "2": None})
self.assertIsNotNone(new_full_name_change_log)
self.assertEqual(new_full_name_change_log.extra_data_json, {"1": "foo", "2": "bar"})
self.assertIsNotNone(valid_json_log)
self.assertEqual(valid_json_log.extra_data_json, {"key": "value"})
self.assertIsNotNone(str_json_log)
self.assertEqual(str_json_log.extra_data_json, {"key": "value"})
other_logs = RealmAuditLog.objects.filter(id__in=self.other_logs_id).order_by("id")
self.assertIsNotNone(other_logs)
self.assert_length(other_logs, self.DATA_SIZE)
for index, audit_log in enumerate(other_logs):
self.assertEqual(audit_log.extra_data_json, {"data": index})
inconsistent_json_log = RealmAuditLog.objects.get(
extra_data_json__inconsistent_old_extra_data=orjson.dumps({"key": "value"}).decode()
)
self.assertIsNotNone(inconsistent_json_log)
self.assertEqual(inconsistent_json_log.id, 50001)
self.assertEqual(
inconsistent_json_log.extra_data_json["inconsistent_old_extra_data_json"],
{"corrupted": "foo"},
)
inconsistent_str_json_log = RealmAuditLog.objects.get(
extra_data_json__inconsistent_old_extra_data=str({"key": "value"})
)
self.assertIsNotNone(inconsistent_str_json_log)
self.assertEqual(inconsistent_str_json_log.id, 50002)
self.assertEqual(
inconsistent_str_json_log.extra_data_json["inconsistent_old_extra_data_json"],
{"corrupted": "bar"},
)
backfilled_inconsistent_log = RealmAuditLog.objects.get(
id=self.backfilled_inconsistent_log_id
)
self.assertIsNotNone(backfilled_inconsistent_log)
self.assertEqual(
backfilled_inconsistent_log.extra_data_json,
{
"key": "baz",
"inconsistent_old_extra_data": orjson.dumps({"key": "baz"}).decode(),
"inconsistent_old_extra_data_json": {"key": "value corrupted"},
},
)

View File

@ -0,0 +1,148 @@
# Generated by Django 4.0.7 on 2022-09-30 20:30
import ast
from typing import Callable, List, Tuple, Type
import orjson
from django.db import migrations, transaction
from django.db.backends.base.schema import BaseDatabaseSchemaEditor
from django.db.migrations.state import StateApps
from django.db.models import F, JSONField, Model
from django.db.models.functions import Cast
# This migration is mostly the same as
# backfill_realmauditlog_extradata_to_json_field in zerver.
# Keys used inside extra_data_json to record old/new values of a changed
# field.
OLD_VALUE = "1"
NEW_VALUE = "2"
# Audit log event type codes, inlined because a migration cannot import the
# models. NOTE(review): neither constant appears to be referenced in this
# file — presumably kept for parity with the zerver migration; confirm
# against the full file before removing.
USER_FULL_NAME_CHANGED = 124
REALM_DISCOUNT_CHANGED = 209
# Number of audit log ids covered per transaction, to bound lock time.
BATCH_SIZE = 5000
# Printed when a pre-existing extra_data_json value disagrees with the
# value derived from extra_data, before the old value is backed up.
OVERWRITE_TEMPLATE = """Audit log entry with id {id} has extra_data_json been inconsistently overwritten.
The old value is:
{old_value}
The new value is:
{new_value}
"""
@transaction.atomic
def do_bulk_backfill_extra_data(
    audit_log_model: Type[Model], id_lower_bound: int, id_upper_bound: int
) -> None:
    """Backfill extra_data_json from extra_data for rows whose id lies in
    [id_lower_bound, id_upper_bound] (inclusive).

    JSON-encoded extra_data (double-quoted) is cast in the database;
    str(dict(...)) extra_data (starts with "{'") is decoded in Python with
    ast.literal_eval. Unlike the zerver variant, there is no special
    handling for USER_FULL_NAME_CHANGED or REALM_DISCOUNT_CHANGED rows,
    which do not occur for the zilencer audit log models. Rows whose
    pre-existing extra_data_json disagrees with the derived value are
    preserved under the "inconsistent_old_extra_data(_json)" keys and a
    warning is printed. The whole batch runs in a single transaction.
    """
    # Tuples of (id, extra_data, old extra_data_json, derived extra_data_json)
    # for rows whose existing extra_data_json would be changed by the backfill.
    inconsistent_extra_data_json: List[Tuple[int, str, object, object]] = []
    # A dict converted with str() will start with a open bracket followed by a
    # single quote, as opposed to a JSON-encoded value, which will use a
    # _double_ quote. We use this to filter out those entries with malformed
    # extra_data to be handled later. This should only update rows with
    # extra_data populated with orjson.dumps.
    # The first query below checks for entries that would have extra_data_json
    # being overwritten by the migration with a value inconsistent with its
    # previous value.
    inconsistent_extra_data_json.extend(
        audit_log_model.objects.filter(
            extra_data__isnull=False, id__range=(id_lower_bound, id_upper_bound)
        )
        .annotate(new_extra_data_json=Cast("extra_data", output_field=JSONField()))
        .exclude(extra_data__startswith="{'")
        .exclude(extra_data_json={})
        .exclude(extra_data_json=F("new_extra_data_json"))
        .values_list("id", "extra_data", "extra_data_json", "new_extra_data_json")
    )
    # Bulk-cast valid JSON extra_data into extra_data_json entirely in the
    # database. Rows already flagged with "inconsistent_old_extra_data" are
    # left alone so a re-run does not nest the backup keys again.
    (
        audit_log_model.objects.filter(
            extra_data__isnull=False,
            id__range=(id_lower_bound, id_upper_bound),
            extra_data_json__inconsistent_old_extra_data__isnull=True,
        )
        .exclude(extra_data__startswith="{'")
        .update(extra_data_json=Cast("extra_data", output_field=JSONField()))
    )
    # Remaining rows store str(dict(...)) and must be decoded in Python.
    python_valued_audit_log_entries = audit_log_model.objects.filter(
        extra_data__startswith="{'",
        id__range=(id_lower_bound, id_upper_bound),
        extra_data_json__inconsistent_old_extra_data__isnull=True,
    )
    for audit_log_entry in python_valued_audit_log_entries:
        # extra_data for entries that store dict stringified with builtins.str()
        # are converted back with ast.literal_eval for safety and efficiency.
        old_value = audit_log_entry.extra_data_json  # type: ignore[attr-defined] # The migration cannot depend on zerver.models, which contains the real type of the RealmAuditLog model, so it cannot be properly typed.
        new_value = ast.literal_eval(audit_log_entry.extra_data)  # type: ignore[attr-defined] # Explained above.
        if old_value != {} and old_value != new_value:
            inconsistent_extra_data_json.append((audit_log_entry.id, audit_log_entry.extra_data, old_value, new_value))  # type: ignore[attr-defined] # Explained above.
        audit_log_entry.extra_data_json = new_value  # type: ignore[attr-defined] # Explained above.
    audit_log_model.objects.bulk_update(python_valued_audit_log_entries, fields=["extra_data_json"])
    if inconsistent_extra_data_json:
        # Extra per-row roundtrips; expected to be infrequent (corrupted data).
        audit_log_entries = []
        for (
            audit_log_entry_id,
            old_extra_data,
            old_extra_data_json,
            new_extra_data_json,
        ) in inconsistent_extra_data_json:
            audit_log_entry = audit_log_model.objects.get(id=audit_log_entry_id)
            assert isinstance(old_extra_data_json, dict)
            if "inconsistent_old_extra_data" in old_extra_data_json:
                # Skip entries that have been backfilled and detected as
                # anomalies before.
                continue
            assert isinstance(new_extra_data_json, dict)
            # Keep the derived value but back up both old representations so
            # no data is lost by the overwrite.
            audit_log_entry.extra_data_json = {  # type: ignore[attr-defined] # Explained above.
                **new_extra_data_json,
                "inconsistent_old_extra_data": old_extra_data,
                "inconsistent_old_extra_data_json": old_extra_data_json,
            }
            audit_log_entries.append(audit_log_entry)
            print(
                OVERWRITE_TEMPLATE.format(
                    id=audit_log_entry_id,
                    old_value=orjson.dumps(old_extra_data_json).decode(),
                    new_value=orjson.dumps(new_extra_data_json).decode(),
                )
            )
        audit_log_model.objects.bulk_update(audit_log_entries, fields=["extra_data_json"])
def backfill_extra_data(model_name: str) -> Callable[[StateApps, BaseDatabaseSchemaEditor], None]:
    """Return a RunPython callable that backfills extra_data_json for the
    given zilencer audit log model (e.g. "RemoteRealmAuditLog"), chunked by
    BATCH_SIZE + 1 ids per transaction to keep lock times short."""

    def inner(apps: StateApps, schema_editor: BaseDatabaseSchemaEditor) -> None:
        audit_log_model = apps.get_model("zilencer", model_name)
        if not audit_log_model.objects.filter(extra_data__isnull=False).exists():
            # Nothing to backfill.
            return
        audit_log_entries = audit_log_model.objects.filter(extra_data__isnull=False)
        id_lower_bound = audit_log_entries.earliest("id").id
        id_upper_bound = audit_log_entries.latest("id").id
        # The loop alone covers the full inclusive [id_lower_bound,
        # id_upper_bound] range, matching the zerver migration. The extra
        # do_bulk_backfill_extra_data call that previously followed the loop
        # has been removed: at that point id_lower_bound > id_upper_bound
        # (the loop's exit condition), so its id__range matched no rows and
        # it only issued wasted queries.
        while id_lower_bound <= id_upper_bound:
            do_bulk_backfill_extra_data(
                audit_log_model, id_lower_bound, min(id_lower_bound + BATCH_SIZE, id_upper_bound)
            )
            id_lower_bound += BATCH_SIZE + 1

    return inner
class Migration(migrations.Migration):
    # Not atomic as a whole: each chunk commits in its own transaction (see
    # @transaction.atomic on do_bulk_backfill_extra_data), so a long backfill
    # does not hold locks for the entire run.
    atomic = False
    dependencies = [
        ("zilencer", "0026_auditlog_models_extra_data_json"),
    ]
    operations = [
        # Backfill both zilencer audit log models; elidable=True means these
        # data operations can be dropped when migrations are squashed.
        migrations.RunPython(
            backfill_extra_data("RemoteRealmAuditLog"),
            reverse_code=migrations.RunPython.noop,
            elidable=True,
        ),
        migrations.RunPython(
            backfill_extra_data("RemoteZulipServerAuditLog"),
            reverse_code=migrations.RunPython.noop,
            elidable=True,
        ),
    ]