migration: Add `extra_data_json` for audit log models.

Note that we use the DjangoJSONEncoder so that we have built-in support
for serializing Decimal and datetime.

During this intermediate state, the migration that creates
extra_data_json field has been run. We prepare for running the backfilling
migration that populates extra_data_json from extra_data.

This change implements double-write, which is important to keep the
state of extra data consistent. For most extra_data usage, this is
handled by the overridden `save` method on `AbstractRealmAuditLog`, where
we generate extra_data_json using either orjson.loads or
ast.literal_eval.

While backfilling ensures that old realm audit log entries have
extra_data_json populated, double-write ensures that any new entries
generated will also have extra_data_json set, so that we can then safely
rename extra_data_json to extra_data while preserving the non-nullable
invariant.

For completeness, we additionally set RealmAuditLog.NEW_VALUE for
the USER_FULL_NAME_CHANGED event. This cannot be handled with the
overridden `save`.

This addresses: https://github.com/zulip/zulip/pull/23116#discussion_r1040277795

Note that extra_data_json at this point is not used yet. So the test
cases do not need to switch to testing extra_data_json. This is later
done after we rename extra_data_json to extra_data.

Double-write for the remote server audit logs is special, because we only
get the dumped bytes from an external source. Luckily, none of the
payload carries extra_data that is not generated using orjson.dumps for
audit logs of event types in SYNC_BILLING_EVENTS. This can be verified
by looking at:

`git grep -A 6 -E "event_type=.*(USER_CREATED|USER_ACTIVATED|USER_DEACTIVATED|USER_REACTIVATED|USER_ROLE_CHANGED|REALM_DEACTIVATED|REALM_REACTIVATED)"`

Therefore, we just need to populate extra_data_json by doing an
orjson.loads call after a None-check.

Co-authored-by: Zixuan James Li <p359101898@gmail.com>
This commit is contained in:
Zixuan Li 2023-06-07 15:14:43 -04:00 committed by GitHub
parent 437b2747b1
commit e39e04c3ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 128 additions and 9 deletions

View File

@ -654,6 +654,7 @@ def do_change_remote_server_plan_type(remote_server: RemoteZulipServer, plan_typ
server=remote_server, server=remote_server,
event_time=timezone_now(), event_time=timezone_now(),
extra_data=str({"old_value": old_value, "new_value": plan_type}), extra_data=str({"old_value": old_value, "new_value": plan_type}),
extra_data_json={"old_value": old_value, "new_value": plan_type},
) )
@ -738,6 +739,8 @@ def process_initial_upgrade(
event_time=billing_cycle_anchor, event_time=billing_cycle_anchor,
event_type=RealmAuditLog.CUSTOMER_PLAN_CREATED, event_type=RealmAuditLog.CUSTOMER_PLAN_CREATED,
extra_data=orjson.dumps(plan_params, default=decimal_to_float).decode(), extra_data=orjson.dumps(plan_params, default=decimal_to_float).decode(),
# Note that DjangoJSONEncoder has builtin support for parsing Decimal
extra_data_json=plan_params,
) )
if not free_trial: if not free_trial:
@ -974,6 +977,7 @@ def attach_discount_to_realm(
event_type=RealmAuditLog.REALM_DISCOUNT_CHANGED, event_type=RealmAuditLog.REALM_DISCOUNT_CHANGED,
event_time=timezone_now(), event_time=timezone_now(),
extra_data=str({"old_discount": old_discount, "new_discount": discount}), extra_data=str({"old_discount": old_discount, "new_discount": discount}),
extra_data_json={"old_discount": old_discount, "new_discount": discount},
) )
@ -989,6 +993,7 @@ def update_sponsorship_status(
event_type=RealmAuditLog.REALM_SPONSORSHIP_PENDING_STATUS_CHANGED, event_type=RealmAuditLog.REALM_SPONSORSHIP_PENDING_STATUS_CHANGED,
event_time=timezone_now(), event_time=timezone_now(),
extra_data=str({"sponsorship_pending": sponsorship_pending}), extra_data=str({"sponsorship_pending": sponsorship_pending}),
extra_data_json={"sponsorship_pending": sponsorship_pending},
) )
@ -1243,4 +1248,5 @@ def update_billing_method_of_current_plan(
event_type=RealmAuditLog.REALM_BILLING_METHOD_CHANGED, event_type=RealmAuditLog.REALM_BILLING_METHOD_CHANGED,
event_time=timezone_now(), event_time=timezone_now(),
extra_data=str({"charge_automatically": charge_automatically}), extra_data=str({"charge_automatically": charge_automatically}),
extra_data_json={"charge_automatically": charge_automatically},
) )

View File

@ -48,4 +48,4 @@ API_FEATURE_LEVEL = 185
# historical commits sharing the same major version, in which case a # historical commits sharing the same major version, in which case a
# minor version bump suffices. # minor version bump suffices.
PROVISION_VERSION = (243, 0) PROVISION_VERSION = (243, 1)

View File

@ -204,6 +204,7 @@ def do_change_full_name(
event_type=RealmAuditLog.USER_FULL_NAME_CHANGED, event_type=RealmAuditLog.USER_FULL_NAME_CHANGED,
event_time=event_time, event_time=event_time,
extra_data=old_name, extra_data=old_name,
extra_data_json={RealmAuditLog.OLD_VALUE: old_name, RealmAuditLog.NEW_VALUE: full_name},
) )
payload = dict(user_id=user_profile.id, full_name=user_profile.full_name) payload = dict(user_id=user_profile.id, full_name=user_profile.full_name)
send_event( send_event(

View File

@ -125,6 +125,9 @@ REALMAUDITLOG_PUSHED_FIELDS = [
"realm", "realm",
"event_time", "event_time",
"backfilled", "backfilled",
# Note that we don't need to add extra_data_json here because
# the view remote_server_post_analytics populates extra_data_json
# from the provided extra_data.
"extra_data", "extra_data",
"event_type", "event_type",
] ]

View File

@ -0,0 +1,20 @@
# Generated by Django 4.0.7 on 2022-09-30 20:25
import django
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("zerver", "0451_add_userprofile_api_key_index"),
]
operations = [
migrations.AddField(
model_name="realmauditlog",
name="extra_data_json",
field=models.JSONField(
default=dict, encoder=django.core.serializers.json.DjangoJSONEncoder
),
),
]

View File

@ -1,3 +1,4 @@
import ast
import datetime import datetime
import secrets import secrets
import time import time
@ -10,6 +11,7 @@ from typing import (
Any, Any,
Callable, Callable,
Dict, Dict,
Iterable,
List, List,
Optional, Optional,
Pattern, Pattern,
@ -39,6 +41,7 @@ from django.contrib.contenttypes.fields import GenericRelation
from django.contrib.postgres.indexes import GinIndex from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVectorField from django.contrib.postgres.search import SearchVectorField
from django.core.exceptions import ValidationError from django.core.exceptions import ValidationError
from django.core.serializers.json import DjangoJSONEncoder
from django.core.validators import MinLengthValidator, RegexValidator, URLValidator, validate_email from django.core.validators import MinLengthValidator, RegexValidator, URLValidator, validate_email
from django.db import models, transaction from django.db import models, transaction
from django.db.backends.base.base import BaseDatabaseWrapper from django.db.backends.base.base import BaseDatabaseWrapper
@ -4450,6 +4453,7 @@ class AbstractRealmAuditLog(models.Model):
ROLE_COUNT_BOTS = "12" ROLE_COUNT_BOTS = "12"
extra_data = models.TextField(null=True) extra_data = models.TextField(null=True)
extra_data_json = models.JSONField(default=dict, encoder=DjangoJSONEncoder)
# Event types # Event types
USER_CREATED = 101 USER_CREATED = 101
@ -4554,6 +4558,29 @@ class AbstractRealmAuditLog(models.Model):
class Meta: class Meta:
abstract = True abstract = True
def save(
self,
force_insert: bool = False,
force_update: bool = False,
using: Optional[str] = None,
update_fields: Optional[Iterable[str]] = None,
) -> None:
# Unless extra_data is set and extra_data_json is unset (defaults to {})
# we do not attempt to auto convert extra_data to extra_data_json
if self.extra_data is not None and not self.extra_data_json:
try:
if self.extra_data.startswith("{'"):
self.extra_data_json = ast.literal_eval(self.extra_data)
else:
self.extra_data_json = orjson.loads(self.extra_data)
except (
Exception
): # nocoverage # This is not intended to happen at all for correctly written code
raise Exception(
"extra_data_json must be explicitly set if extra_data is not str()'d from a dict or JSON-encoded."
)
super().save(force_insert, force_update, using, update_fields)
class RealmAuditLog(AbstractRealmAuditLog): class RealmAuditLog(AbstractRealmAuditLog):
""" """

View File

@ -900,7 +900,11 @@ class AnalyticsBouncerTest(BouncerTestCase):
self.add_mock_response() self.add_mock_response()
def verify_request_with_overridden_extra_data( def verify_request_with_overridden_extra_data(
request_extra_data: object, expected_extra_data: object request_extra_data: object,
*,
expected_extra_data: object = None,
expected_extra_data_json: object = None,
skip_audit_log_check: bool = False,
) -> None: ) -> None:
user = self.example_user("hamlet") user = self.example_user("hamlet")
log_entry = RealmAuditLog.objects.create( log_entry = RealmAuditLog.objects.create(
@ -935,27 +939,44 @@ class AnalyticsBouncerTest(BouncerTestCase):
): ):
send_analytics_to_remote_server() send_analytics_to_remote_server()
if skip_audit_log_check:
return
remote_log_entry = RemoteRealmAuditLog.objects.order_by("id").last() remote_log_entry = RemoteRealmAuditLog.objects.order_by("id").last()
assert remote_log_entry is not None assert remote_log_entry is not None
self.assertEqual(str(remote_log_entry.server.uuid), self.server_uuid) self.assertEqual(str(remote_log_entry.server.uuid), self.server_uuid)
self.assertEqual(remote_log_entry.remote_id, log_entry.id) self.assertEqual(remote_log_entry.remote_id, log_entry.id)
self.assertEqual(remote_log_entry.event_time, self.TIME_ZERO) self.assertEqual(remote_log_entry.event_time, self.TIME_ZERO)
self.assertEqual(remote_log_entry.extra_data, expected_extra_data) self.assertEqual(remote_log_entry.extra_data, expected_extra_data)
self.assertEqual(remote_log_entry.extra_data_json, expected_extra_data_json)
# Pre-migration extra_data # Pre-migration extra_data
verify_request_with_overridden_extra_data( verify_request_with_overridden_extra_data(
request_extra_data=orjson.dumps({"fake_data": 42}).decode(), request_extra_data=orjson.dumps({"fake_data": 42}).decode(),
expected_extra_data=orjson.dumps({"fake_data": 42}).decode(), expected_extra_data=orjson.dumps({"fake_data": 42}).decode(),
expected_extra_data_json={"fake_data": 42},
)
verify_request_with_overridden_extra_data(
request_extra_data=None, expected_extra_data=None, expected_extra_data_json={}
) )
verify_request_with_overridden_extra_data(request_extra_data=None, expected_extra_data=None)
# Post-migration extra_data # Post-migration extra_data
verify_request_with_overridden_extra_data( verify_request_with_overridden_extra_data(
request_extra_data={"fake_data": 42}, request_extra_data={"fake_data": 42},
expected_extra_data=orjson.dumps({"fake_data": 42}).decode(), expected_extra_data=orjson.dumps({"fake_data": 42}).decode(),
expected_extra_data_json={"fake_data": 42},
) )
verify_request_with_overridden_extra_data( verify_request_with_overridden_extra_data(
request_extra_data={}, expected_extra_data=orjson.dumps({}).decode() request_extra_data={},
expected_extra_data=orjson.dumps({}).decode(),
expected_extra_data_json={},
) )
# Invalid extra_data
with self.assertLogs(level="WARNING") as m:
verify_request_with_overridden_extra_data(
request_extra_data="{malformedjson:",
skip_audit_log_check=True,
)
self.assertIn("Malformed audit log data", m.output[0])
class PushNotificationTest(BouncerTestCase): class PushNotificationTest(BouncerTestCase):

View File

@ -0,0 +1,27 @@
# Generated by Django 4.0.7 on 2022-09-30 20:25
import django
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("zilencer", "0025_alter_remotepushdevicetoken_user_id_drop_index"),
]
operations = [
migrations.AddField(
model_name="remoterealmauditlog",
name="extra_data_json",
field=models.JSONField(
default=dict, encoder=django.core.serializers.json.DjangoJSONEncoder
),
),
migrations.AddField(
model_name="remotezulipserverauditlog",
name="extra_data_json",
field=models.JSONField(
default=dict, encoder=django.core.serializers.json.DjangoJSONEncoder
),
),
]

View File

@ -481,14 +481,27 @@ def remote_server_post_analytics(
if realmauditlog_rows is not None: if realmauditlog_rows is not None:
remote_realm_audit_logs = [] remote_realm_audit_logs = []
for row in realmauditlog_rows: for row in realmauditlog_rows:
extra_data = {}
extra_data_str = None
# Remote servers that do support JSONField will pass extra_data # Remote servers that do support JSONField will pass extra_data
# as a dict. Otherwise, extra_data will be either a string or None. # as a dict. Otherwise, extra_data will be either a string or None.
if isinstance(row["extra_data"], dict): if isinstance(row["extra_data"], str):
# A valid "extra_data" as a str, if present, should always be generated from
# orjson.dumps because the POSTed analytics data for RealmAuditLog is restricted
# to event types in SYNC_BILLING_EVENTS.
# For these event types, we don't create extra_data that requires special
# handling to fit into the JSONField.
try:
extra_data = orjson.loads(row["extra_data"])
except orjson.JSONDecodeError:
raise JsonableError(_("Malformed audit log data"))
extra_data_str = row["extra_data"]
elif row["extra_data"] is not None:
assert isinstance(row["extra_data"], dict)
extra_data = row["extra_data"]
# This is guaranteed to succeed because row["extra_data"] would be parsed # This is guaranteed to succeed because row["extra_data"] would be parsed
# from JSON with our json validator if it is a dict. # from JSON with our json validator if it is a dict.
extra_data = orjson.dumps(row["extra_data"]).decode() extra_data_str = orjson.dumps(row["extra_data"]).decode()
else:
extra_data = row["extra_data"]
remote_realm_audit_logs.append( remote_realm_audit_logs.append(
RemoteRealmAuditLog( RemoteRealmAuditLog(
realm_id=row["realm"], realm_id=row["realm"],
@ -498,7 +511,8 @@ def remote_server_post_analytics(
row["event_time"], tz=datetime.timezone.utc row["event_time"], tz=datetime.timezone.utc
), ),
backfilled=row["backfilled"], backfilled=row["backfilled"],
extra_data=extra_data, extra_data=extra_data_str,
extra_data_json=extra_data,
event_type=row["event_type"], event_type=row["event_type"],
) )
) )