remote_server: Serialize analytics requests with Pydantic.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
Anders Kaseorg 2023-12-08 15:09:01 -08:00 committed by Tim Abbott
parent abdfdeffe4
commit 0400614a48
3 changed files with 135 additions and 104 deletions

View File

@ -5,14 +5,13 @@ from urllib.parse import urljoin
import orjson import orjson
import requests import requests
from django.conf import settings from django.conf import settings
from django.forms.models import model_to_dict from django.db.models import QuerySet
from django.utils.translation import gettext as _ from django.utils.translation import gettext as _
from pydantic import UUID4, BaseModel, ConfigDict, Field, field_validator from pydantic import UUID4, BaseModel, ConfigDict, Field, Json, field_validator
from analytics.models import InstallationCount, RealmCount from analytics.models import InstallationCount, RealmCount
from version import ZULIP_VERSION from version import ZULIP_VERSION
from zerver.lib.exceptions import JsonableError, MissingRemoteRealmError from zerver.lib.exceptions import JsonableError, MissingRemoteRealmError
from zerver.lib.export import floatify_datetime_fields
from zerver.lib.outgoing_http import OutgoingSession from zerver.lib.outgoing_http import OutgoingSession
from zerver.lib.queue import queue_json_publish from zerver.lib.queue import queue_json_publish
from zerver.lib.types import RemoteRealmDictValue from zerver.lib.types import RemoteRealmDictValue
@ -36,6 +35,32 @@ class PushNotificationBouncerServerError(PushNotificationBouncerRetryLaterError)
http_status_code = 502 http_status_code = 502
class RealmCountDataForAnalytics(BaseModel):
property: str
realm: int
id: int
end_time: float
subgroup: Optional[str]
value: int
class InstallationCountDataForAnalytics(BaseModel):
property: str
id: int
end_time: float
subgroup: Optional[str]
value: int
class RealmAuditLogDataForAnalytics(BaseModel):
id: int
realm: int
event_time: float
backfilled: bool
extra_data: Optional[Union[str, Dict[str, Any]]]
event_type: int
class RealmDataForAnalytics(BaseModel): class RealmDataForAnalytics(BaseModel):
model_config = ConfigDict(extra="forbid") model_config = ConfigDict(extra="forbid")
@ -61,6 +86,14 @@ class RealmDataForAnalytics(BaseModel):
return value return value
class AnalyticsRequest(BaseModel):
realm_counts: Json[List[RealmCountDataForAnalytics]]
installation_counts: Json[List[InstallationCountDataForAnalytics]]
realmauditlog_rows: Optional[Json[List[RealmAuditLogDataForAnalytics]]] = None
realms: Json[List[RealmDataForAnalytics]]
version: Optional[Json[str]]
class UserDataForRemoteBilling(BaseModel): class UserDataForRemoteBilling(BaseModel):
uuid: UUID4 uuid: UUID4
email: str email: str
@ -189,45 +222,54 @@ def send_json_to_push_bouncer(
) )
REALMAUDITLOG_PUSHED_FIELDS = [ def build_analytics_data(
"id", realm_count_query: QuerySet[RealmCount],
"realm", installation_count_query: QuerySet[InstallationCount],
"event_time", realmauditlog_query: QuerySet[RealmAuditLog],
"backfilled", ) -> Tuple[
List[RealmCountDataForAnalytics],
List[InstallationCountDataForAnalytics],
List[RealmAuditLogDataForAnalytics],
]:
# We limit the batch size on the client side to avoid OOM kills timeouts, etc.
MAX_CLIENT_BATCH_SIZE = 10000
realm_count_data = [
RealmCountDataForAnalytics(
property=row.property,
realm=row.realm.id,
id=row.id,
end_time=row.end_time.timestamp(),
subgroup=row.subgroup,
value=row.value,
)
for row in realm_count_query.order_by("id")[0:MAX_CLIENT_BATCH_SIZE]
]
installation_count_data = [
InstallationCountDataForAnalytics(
property=row.property,
id=row.id,
end_time=row.end_time.timestamp(),
subgroup=row.subgroup,
value=row.value,
)
for row in installation_count_query.order_by("id")[0:MAX_CLIENT_BATCH_SIZE]
]
zerver_realmauditlog = [
RealmAuditLogDataForAnalytics(
id=row.id,
realm=row.realm.id,
event_time=row.event_time.timestamp(),
backfilled=row.backfilled,
# Note that we don't need to add extra_data_json here because # Note that we don't need to add extra_data_json here because
# the view remote_server_post_analytics populates extra_data_json # the view remote_server_post_analytics populates extra_data_json
# from the provided extra_data. # from the provided extra_data.
"extra_data", extra_data=row.extra_data,
"event_type", event_type=row.event_type,
] )
def build_analytics_data(
realm_count_query: Any, installation_count_query: Any, realmauditlog_query: Any
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
# We limit the batch size on the client side to avoid OOM kills timeouts, etc.
MAX_CLIENT_BATCH_SIZE = 10000
data = {}
data["analytics_realmcount"] = [
model_to_dict(row) for row in realm_count_query.order_by("id")[0:MAX_CLIENT_BATCH_SIZE]
]
data["analytics_installationcount"] = [
model_to_dict(row)
for row in installation_count_query.order_by("id")[0:MAX_CLIENT_BATCH_SIZE]
]
data["zerver_realmauditlog"] = [
model_to_dict(row, fields=REALMAUDITLOG_PUSHED_FIELDS)
for row in realmauditlog_query.order_by("id")[0:MAX_CLIENT_BATCH_SIZE] for row in realmauditlog_query.order_by("id")[0:MAX_CLIENT_BATCH_SIZE]
] ]
floatify_datetime_fields(data, "analytics_realmcount") return realm_count_data, installation_count_data, zerver_realmauditlog
floatify_datetime_fields(data, "analytics_installationcount")
floatify_datetime_fields(data, "zerver_realmauditlog")
return (
data["analytics_realmcount"],
data["analytics_installationcount"],
data["zerver_realmauditlog"],
)
def get_realms_info_for_push_bouncer(realm_id: Optional[int] = None) -> List[RealmDataForAnalytics]: def get_realms_info_for_push_bouncer(realm_id: Optional[int] = None) -> List[RealmDataForAnalytics]:
@ -287,36 +329,36 @@ def send_analytics_to_push_bouncer() -> None:
) )
record_count = len(realm_count_data) + len(installation_count_data) + len(realmauditlog_data) record_count = len(realm_count_data) + len(installation_count_data) + len(realmauditlog_data)
request = { request = AnalyticsRequest.model_construct(
"realm_counts": orjson.dumps(realm_count_data).decode(), realm_counts=realm_count_data,
"installation_counts": orjson.dumps(installation_count_data).decode(), installation_counts=installation_count_data,
"realmauditlog_rows": orjson.dumps(realmauditlog_data).decode(), realmauditlog_rows=realmauditlog_data,
"realms": orjson.dumps( realms=get_realms_info_for_push_bouncer(),
[dict(realm_data) for realm_data in get_realms_info_for_push_bouncer()] version=ZULIP_VERSION,
).decode(), )
"version": orjson.dumps(ZULIP_VERSION).decode(),
}
try: try:
send_to_push_bouncer("POST", "server/analytics", request) send_to_push_bouncer("POST", "server/analytics", request.model_dump(round_trip=True))
except JsonableError as e: except JsonableError as e:
logger.warning(e.msg) logger.warning(e.msg)
logger.info("Reported %d records", record_count) logger.info("Reported %d records", record_count)
def send_realms_only_to_push_bouncer() -> Dict[str, RemoteRealmDictValue]: def send_realms_only_to_push_bouncer() -> Dict[str, RemoteRealmDictValue]:
request = { request = AnalyticsRequest.model_construct(
"realm_counts": "[]", realm_counts=[],
"installation_counts": "[]", installation_counts=[],
"realms": orjson.dumps( realms=get_realms_info_for_push_bouncer(),
[dict(realm_data) for realm_data in get_realms_info_for_push_bouncer()] version=ZULIP_VERSION,
).decode(), )
"version": orjson.dumps(ZULIP_VERSION).decode(),
}
# We don't catch JsonableError here, because we want it to propagate further # We don't catch JsonableError here, because we want it to propagate further
# to either explicitly, loudly fail or be error-handled by the caller. # to either explicitly, loudly fail or be error-handled by the caller.
response = send_to_push_bouncer("POST", "server/analytics", request) response = send_to_push_bouncer(
"POST",
"server/analytics",
request.model_dump(round_trip=True, exclude={"realmauditlog_rows"}),
)
assert isinstance(response["realms"], dict) # for mypy assert isinstance(response["realms"], dict) # for mypy
return response["realms"] return response["realms"]

View File

@ -60,6 +60,7 @@ from zerver.lib.push_notifications import (
send_notifications_to_bouncer, send_notifications_to_bouncer,
) )
from zerver.lib.remote_server import ( from zerver.lib.remote_server import (
AnalyticsRequest,
PushNotificationBouncerError, PushNotificationBouncerError,
PushNotificationBouncerRetryLaterError, PushNotificationBouncerRetryLaterError,
PushNotificationBouncerServerError, PushNotificationBouncerServerError,
@ -1351,14 +1352,17 @@ class AnalyticsBouncerTest(BouncerTestCase):
(realm_count_data, installation_count_data, realmauditlog_data) = build_analytics_data( (realm_count_data, installation_count_data, realmauditlog_data) = build_analytics_data(
RealmCount.objects.all(), InstallationCount.objects.all(), RealmAuditLog.objects.all() RealmCount.objects.all(), InstallationCount.objects.all(), RealmAuditLog.objects.all()
) )
request = AnalyticsRequest.model_construct(
realm_counts=realm_count_data,
installation_counts=installation_count_data,
realmauditlog_rows=realmauditlog_data,
realms=[],
version=None,
)
result = self.uuid_post( result = self.uuid_post(
self.server_uuid, self.server_uuid,
"/api/v1/remotes/server/analytics", "/api/v1/remotes/server/analytics",
{ request.model_dump(round_trip=True, exclude={"realms", "version"}),
"realm_counts": orjson.dumps(realm_count_data).decode(),
"installation_counts": orjson.dumps(installation_count_data).decode(),
"realmauditlog_rows": orjson.dumps(realmauditlog_data).decode(),
},
subdomain="", subdomain="",
) )
self.assert_json_error(result, "Data is out of order.") self.assert_json_error(result, "Data is out of order.")
@ -1382,19 +1386,21 @@ class AnalyticsBouncerTest(BouncerTestCase):
check_counts(11, 11, 3, 2, 8) check_counts(11, 11, 3, 2, 8)
# Test that only valid org_type values are accepted - integers defined in OrgTypeEnum. # Test that only valid org_type values are accepted - integers defined in OrgTypeEnum.
realms_data = [dict(realm) for realm in get_realms_info_for_push_bouncer()] realms_data = get_realms_info_for_push_bouncer()
# Not a valid org_type value: # Not a valid org_type value:
realms_data[0]["org_type"] = 11 realms_data[0].org_type = 11
request = AnalyticsRequest.model_construct(
realm_counts=[],
installation_counts=[],
realmauditlog_rows=[],
realms=realms_data,
version=None,
)
result = self.uuid_post( result = self.uuid_post(
self.server_uuid, self.server_uuid,
"/api/v1/remotes/server/analytics", "/api/v1/remotes/server/analytics",
{ request.model_dump(round_trip=True, exclude={"version", "api_feature_level"}),
"realm_counts": orjson.dumps([]).decode(),
"installation_counts": orjson.dumps([]).decode(),
"realmauditlog_rows": orjson.dumps([]).decode(),
"realms": orjson.dumps(realms_data).decode(),
},
subdomain="", subdomain="",
) )
self.assert_json_error( self.assert_json_error(
@ -1435,15 +1441,17 @@ class AnalyticsBouncerTest(BouncerTestCase):
) )
# This first post should fail because of excessive audit log event types. # This first post should fail because of excessive audit log event types.
request = AnalyticsRequest.model_construct(
realm_counts=realm_count_data,
installation_counts=installation_count_data,
realmauditlog_rows=realmauditlog_data,
realms=[],
version=None,
)
result = self.uuid_post( result = self.uuid_post(
self.server_uuid, self.server_uuid,
"/api/v1/remotes/server/analytics", "/api/v1/remotes/server/analytics",
{ request.model_dump(round_trip=True, exclude={"version"}),
"realm_counts": orjson.dumps(realm_count_data).decode(),
"installation_counts": orjson.dumps(installation_count_data).decode(),
"realmauditlog_rows": orjson.dumps(realmauditlog_data).decode(),
"realms": orjson.dumps([]).decode(),
},
subdomain="", subdomain="",
) )
self.assert_json_error(result, "Invalid event type.") self.assert_json_error(result, "Invalid event type.")
@ -1458,15 +1466,17 @@ class AnalyticsBouncerTest(BouncerTestCase):
# Send the data to the bouncer without any realms data. This should lead # Send the data to the bouncer without any realms data. This should lead
# to successful saving of the data, but with the remote_realm foreign key # to successful saving of the data, but with the remote_realm foreign key
# set to NULL. # set to NULL.
request = AnalyticsRequest.model_construct(
realm_counts=realm_count_data,
installation_counts=installation_count_data,
realmauditlog_rows=realmauditlog_data,
realms=[],
version=None,
)
result = self.uuid_post( result = self.uuid_post(
self.server_uuid, self.server_uuid,
"/api/v1/remotes/server/analytics", "/api/v1/remotes/server/analytics",
{ request.model_dump(round_trip=True, exclude={"version"}),
"realm_counts": orjson.dumps(realm_count_data).decode(),
"installation_counts": orjson.dumps(installation_count_data).decode(),
"realmauditlog_rows": orjson.dumps(realmauditlog_data).decode(),
"realms": orjson.dumps([]).decode(),
},
subdomain="", subdomain="",
) )
self.assert_json_success(result) self.assert_json_success(result)

View File

@ -34,7 +34,12 @@ from zerver.lib.push_notifications import (
send_apple_push_notification, send_apple_push_notification,
send_test_push_notification_directly_to_devices, send_test_push_notification_directly_to_devices,
) )
from zerver.lib.remote_server import RealmDataForAnalytics from zerver.lib.remote_server import (
InstallationCountDataForAnalytics,
RealmAuditLogDataForAnalytics,
RealmCountDataForAnalytics,
RealmDataForAnalytics,
)
from zerver.lib.request import REQ, has_request_variables from zerver.lib.request import REQ, has_request_variables
from zerver.lib.response import json_success from zerver.lib.response import json_success
from zerver.lib.timestamp import datetime_to_timestamp, timestamp_to_datetime from zerver.lib.timestamp import datetime_to_timestamp, timestamp_to_datetime
@ -659,32 +664,6 @@ def update_remote_realm_data_for_server(
RemoteRealmAuditLog.objects.bulk_create(remote_realm_audit_logs) RemoteRealmAuditLog.objects.bulk_create(remote_realm_audit_logs)
class RealmAuditLogDataForAnalytics(BaseModel):
id: int
realm: int
event_time: float
backfilled: bool
extra_data: Optional[Union[str, Dict[str, Any]]]
event_type: int
class RealmCountDataForAnalytics(BaseModel):
property: str
realm: int
id: int
end_time: float
subgroup: Optional[str]
value: int
class InstallationCountDataForAnalytics(BaseModel):
property: str
id: int
end_time: float
subgroup: Optional[str]
value: int
@typed_endpoint @typed_endpoint
@transaction.atomic @transaction.atomic
def remote_server_post_analytics( def remote_server_post_analytics(