From e39e04c3ce9a2044258185d645a6e721077a8e10 Mon Sep 17 00:00:00 2001 From: Zixuan Li <39874143+PIG208@users.noreply.github.com> Date: Wed, 7 Jun 2023 15:14:43 -0400 Subject: [PATCH] migration: Add `extra_data_json` for audit log models. Note that we use the DjangoJSONEncoder so that we have builtin support for parsing Decimal and datetime. During this intermediate state, the migration that creates extra_data_json field has been run. We prepare for running the backfilling migration that populates extra_data_json from extra_data. This change implements double-write, which is important to keep the state of extra data consistent. For most extra_data usage, this is handled by the overriden `save` method on `AbstractRealmAuditLog`, where we either generates extra_data_json using orjson.loads or ast.literal_eval. While backfilling ensures that old realm audit log entries have extra_data_json populated, double-write ensures that any new entries generated will also have extra_data_json set. So that we can then safely rename extra_data_json to extra_data while ensuring the non-nullable invariant. For completeness, we additionally set RealmAuditLog.NEW_VALUE for the USER_FULL_NAME_CHANGED event. This cannot be handled with the overridden `save`. This addresses: https://github.com/zulip/zulip/pull/23116#discussion_r1040277795 Note that extra_data_json at this point is not used yet. So the test cases do not need to switch to testing extra_data_json. This is later done after we rename extra_data_json to extra_data. Double-write for the remote server audit logs is special, because we only get the dumped bytes from an external source. Luckily, none of the payload carries extra_data that is not generated using orjson.dumps for audit logs of event types in SYNC_BILLING_EVENTS. This can be verified by looking at: `git grep -A 6 -E "event_type=.*(USER_CREATED|USER_ACTIVATED|USER_DEACTIVATED|USER_REACTIVATED|USER_ROLE_CHANGED|REALM_DEACTIVATED|REALM_REACTIVATED)"` Therefore, we just need to populate extra_data_json doing an orjson.loads call after a None-check. Co-authored-by: Zixuan James Li --- corporate/lib/stripe.py | 6 +++++ version.py | 2 +- zerver/actions/user_settings.py | 1 + zerver/lib/remote_server.py | 3 +++ .../0452_realmauditlog_extra_data_json.py | 20 ++++++++++++++ zerver/models.py | 27 +++++++++++++++++++ zerver/tests/test_push_notifications.py | 27 ++++++++++++++++--- .../0026_auditlog_models_extra_data_json.py | 27 +++++++++++++++++++ zilencer/views.py | 24 +++++++++++++---- 9 files changed, 128 insertions(+), 9 deletions(-) create mode 100644 zerver/migrations/0452_realmauditlog_extra_data_json.py create mode 100644 zilencer/migrations/0026_auditlog_models_extra_data_json.py diff --git a/corporate/lib/stripe.py b/corporate/lib/stripe.py index e652124c00..c3112d54db 100644 --- a/corporate/lib/stripe.py +++ b/corporate/lib/stripe.py @@ -654,6 +654,7 @@ def do_change_remote_server_plan_type(remote_server: RemoteZulipServer, plan_typ server=remote_server, event_time=timezone_now(), extra_data=str({"old_value": old_value, "new_value": plan_type}), + extra_data_json={"old_value": old_value, "new_value": plan_type}, ) @@ -738,6 +739,8 @@ def process_initial_upgrade( event_time=billing_cycle_anchor, event_type=RealmAuditLog.CUSTOMER_PLAN_CREATED, extra_data=orjson.dumps(plan_params, default=decimal_to_float).decode(), + # Note that DjangoJSONEncoder has builtin support for parsing Decimal + extra_data_json=plan_params, ) if not free_trial: @@ -974,6 +977,7 @@ def attach_discount_to_realm( event_type=RealmAuditLog.REALM_DISCOUNT_CHANGED, event_time=timezone_now(), extra_data=str({"old_discount": old_discount, "new_discount": discount}), + extra_data_json={"old_discount": old_discount, "new_discount": discount}, ) @@ -989,6 +993,7 @@ def update_sponsorship_status( event_type=RealmAuditLog.REALM_SPONSORSHIP_PENDING_STATUS_CHANGED, event_time=timezone_now(), extra_data=str({"sponsorship_pending": sponsorship_pending}), + extra_data_json={"sponsorship_pending": sponsorship_pending}, ) @@ -1243,4 +1248,5 @@ def update_billing_method_of_current_plan( event_type=RealmAuditLog.REALM_BILLING_METHOD_CHANGED, event_time=timezone_now(), extra_data=str({"charge_automatically": charge_automatically}), + extra_data_json={"charge_automatically": charge_automatically}, ) diff --git a/version.py b/version.py index 5801d88975..f4ff587bbc 100644 --- a/version.py +++ b/version.py @@ -48,4 +48,4 @@ API_FEATURE_LEVEL = 185 # historical commits sharing the same major version, in which case a # minor version bump suffices. -PROVISION_VERSION = (243, 0) +PROVISION_VERSION = (243, 1) diff --git a/zerver/actions/user_settings.py b/zerver/actions/user_settings.py index b4be9b6bd3..82cd9fd3b7 100644 --- a/zerver/actions/user_settings.py +++ b/zerver/actions/user_settings.py @@ -204,6 +204,7 @@ def do_change_full_name( event_type=RealmAuditLog.USER_FULL_NAME_CHANGED, event_time=event_time, extra_data=old_name, + extra_data_json={RealmAuditLog.OLD_VALUE: old_name, RealmAuditLog.NEW_VALUE: full_name}, ) payload = dict(user_id=user_profile.id, full_name=user_profile.full_name) send_event( diff --git a/zerver/lib/remote_server.py b/zerver/lib/remote_server.py index 3672c8dc15..a12f1f79dd 100644 --- a/zerver/lib/remote_server.py +++ b/zerver/lib/remote_server.py @@ -125,6 +125,9 @@ REALMAUDITLOG_PUSHED_FIELDS = [ "realm", "event_time", "backfilled", + # Note that we don't need to add extra_data_json here because + # the view remote_server_post_analytics populates extra_data_json + # from the provided extra_data. "extra_data", "event_type", ] diff --git a/zerver/migrations/0452_realmauditlog_extra_data_json.py b/zerver/migrations/0452_realmauditlog_extra_data_json.py new file mode 100644 index 0000000000..e4ea752e4e --- /dev/null +++ b/zerver/migrations/0452_realmauditlog_extra_data_json.py @@ -0,0 +1,20 @@ +# Generated by Django 4.0.7 on 2022-09-30 20:25 + +import django +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("zerver", "0451_add_userprofile_api_key_index"), + ] + + operations = [ + migrations.AddField( + model_name="realmauditlog", + name="extra_data_json", + field=models.JSONField( + default=dict, encoder=django.core.serializers.json.DjangoJSONEncoder + ), + ), + ] diff --git a/zerver/models.py b/zerver/models.py index 4093a95381..aaefe73326 100644 --- a/zerver/models.py +++ b/zerver/models.py @@ -1,3 +1,4 @@ +import ast import datetime import secrets import time @@ -10,6 +11,7 @@ from typing import ( Any, Callable, Dict, + Iterable, List, Optional, Pattern, @@ -39,6 +41,7 @@ from django.contrib.contenttypes.fields import GenericRelation from django.contrib.postgres.indexes import GinIndex from django.contrib.postgres.search import SearchVectorField from django.core.exceptions import ValidationError +from django.core.serializers.json import DjangoJSONEncoder from django.core.validators import MinLengthValidator, RegexValidator, URLValidator, validate_email from django.db import models, transaction from django.db.backends.base.base import BaseDatabaseWrapper @@ -4450,6 +4453,7 @@ class AbstractRealmAuditLog(models.Model): ROLE_COUNT_BOTS = "12" extra_data = models.TextField(null=True) + extra_data_json = models.JSONField(default=dict, encoder=DjangoJSONEncoder) # Event types USER_CREATED = 101 @@ -4554,6 +4558,29 @@ class AbstractRealmAuditLog(models.Model): class Meta: abstract = True + def save( + self, + force_insert: bool = False, + force_update: bool = False, + using: Optional[str] = None, + update_fields: Optional[Iterable[str]] = None, + ) -> None: + # Unless extra_data is set and extra_data_json is unset (defaults to {}) + # we do not attempt to auto convert extra_data to extra_data_json + if self.extra_data is not None and not self.extra_data_json: + try: + if self.extra_data.startswith("{'"): + self.extra_data_json = ast.literal_eval(self.extra_data) + else: + self.extra_data_json = orjson.loads(self.extra_data) + except ( + Exception + ): # nocoverage # This is not intended to happen at all for correctly written code + raise Exception( + "extra_data_json must be explicitly set if extra_data is not str()'d from a dict or JSON-encoded." + ) + super().save(force_insert, force_update, using, update_fields) + class RealmAuditLog(AbstractRealmAuditLog): """ diff --git a/zerver/tests/test_push_notifications.py b/zerver/tests/test_push_notifications.py index 5dcf168b13..1e6cd8fd0e 100644 --- a/zerver/tests/test_push_notifications.py +++ b/zerver/tests/test_push_notifications.py @@ -900,7 +900,11 @@ class AnalyticsBouncerTest(BouncerTestCase): self.add_mock_response() def verify_request_with_overridden_extra_data( - request_extra_data: object, expected_extra_data: object + request_extra_data: object, + *, + expected_extra_data: object = None, + expected_extra_data_json: object = None, + skip_audit_log_check: bool = False, ) -> None: user = self.example_user("hamlet") log_entry = RealmAuditLog.objects.create( @@ -935,27 +939,44 @@ class AnalyticsBouncerTest(BouncerTestCase): ): send_analytics_to_remote_server() + if skip_audit_log_check: + return + remote_log_entry = RemoteRealmAuditLog.objects.order_by("id").last() assert remote_log_entry is not None self.assertEqual(str(remote_log_entry.server.uuid), self.server_uuid) self.assertEqual(remote_log_entry.remote_id, log_entry.id) self.assertEqual(remote_log_entry.event_time, self.TIME_ZERO) self.assertEqual(remote_log_entry.extra_data, expected_extra_data) + self.assertEqual(remote_log_entry.extra_data_json, expected_extra_data_json) # Pre-migration extra_data verify_request_with_overridden_extra_data( request_extra_data=orjson.dumps({"fake_data": 42}).decode(), expected_extra_data=orjson.dumps({"fake_data": 42}).decode(), + expected_extra_data_json={"fake_data": 42}, + ) + verify_request_with_overridden_extra_data( + request_extra_data=None, expected_extra_data=None, expected_extra_data_json={} ) - verify_request_with_overridden_extra_data(request_extra_data=None, expected_extra_data=None) # Post-migration extra_data verify_request_with_overridden_extra_data( request_extra_data={"fake_data": 42}, expected_extra_data=orjson.dumps({"fake_data": 42}).decode(), + expected_extra_data_json={"fake_data": 42}, ) verify_request_with_overridden_extra_data( - request_extra_data={}, expected_extra_data=orjson.dumps({}).decode() + request_extra_data={}, + expected_extra_data=orjson.dumps({}).decode(), + expected_extra_data_json={}, ) + # Invalid extra_data + with self.assertLogs(level="WARNING") as m: + verify_request_with_overridden_extra_data( + request_extra_data="{malformedjson:", + skip_audit_log_check=True, + ) + self.assertIn("Malformed audit log data", m.output[0]) class PushNotificationTest(BouncerTestCase): diff --git a/zilencer/migrations/0026_auditlog_models_extra_data_json.py b/zilencer/migrations/0026_auditlog_models_extra_data_json.py new file mode 100644 index 0000000000..cb0df6ff60 --- /dev/null +++ b/zilencer/migrations/0026_auditlog_models_extra_data_json.py @@ -0,0 +1,27 @@ +# Generated by Django 4.0.7 on 2022-09-30 20:25 + +import django +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("zilencer", "0025_alter_remotepushdevicetoken_user_id_drop_index"), + ] + + operations = [ + migrations.AddField( + model_name="remoterealmauditlog", + name="extra_data_json", + field=models.JSONField( + default=dict, encoder=django.core.serializers.json.DjangoJSONEncoder + ), + ), + migrations.AddField( + model_name="remotezulipserverauditlog", + name="extra_data_json", + field=models.JSONField( + default=dict, encoder=django.core.serializers.json.DjangoJSONEncoder + ), + ), + ] diff --git a/zilencer/views.py b/zilencer/views.py index 43fb529a60..f4859f6a45 100644 --- a/zilencer/views.py +++ b/zilencer/views.py @@ -481,14 +481,27 @@ def remote_server_post_analytics( if realmauditlog_rows is not None: remote_realm_audit_logs = [] for row in realmauditlog_rows: + extra_data = {} + extra_data_str = None # Remote servers that do support JSONField will pass extra_data # as a dict. Otherwise, extra_data will be either a string or None. - if isinstance(row["extra_data"], dict): + if isinstance(row["extra_data"], str): + # A valid "extra_data" as a str, if present, should always be generated from + # orjson.dumps because the POSTed analytics data for RealmAuditLog is restricted + # to event types in SYNC_BILLING_EVENTS. + # For these event types, we don't create extra_data that requires special + # handling to fit into the JSONField. + try: + extra_data = orjson.loads(row["extra_data"]) + except orjson.JSONDecodeError: + raise JsonableError(_("Malformed audit log data")) + extra_data_str = row["extra_data"] + elif row["extra_data"] is not None: + assert isinstance(row["extra_data"], dict) + extra_data = row["extra_data"] # This is guaranteed to succeed because row["extra_data"] would be parsed # from JSON with our json validator if it is a dict. - extra_data = orjson.dumps(row["extra_data"]).decode() - else: - extra_data = row["extra_data"] + extra_data_str = orjson.dumps(row["extra_data"]).decode() remote_realm_audit_logs.append( RemoteRealmAuditLog( realm_id=row["realm"], @@ -498,7 +511,8 @@ def remote_server_post_analytics( row["event_time"], tz=datetime.timezone.utc ), backfilled=row["backfilled"], - extra_data=extra_data, + extra_data=extra_data_str, + extra_data_json=extra_data, event_type=row["event_type"], ) )