analytics: Store realm disk space used as a CountStat.

Fixes #29632.

The issue description explains this well:

We currently recalculate `currently_used_upload_space_bytes` on every file
upload, by dint of calling `flush_used_upload_space_cache` on save/delete,
and then immediately calling
`user_profile.realm.currently_used_upload_space_bytes()` in
`notify_attachment_update`.  Since this walks the Attachment table,
recalculating this can take seconds in large realms.

Switch this to using a CountStat, so we don't need to walk significant
chunks of the Attachment table when we upload an attachment.  This will
also give us a historical daily graph of usage.
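
To illustrate the historical daily graph this enables, here is a minimal sketch (not part of the commit) of reading the per-day rows back; it assumes only the RealmCount fields and the "upload_quota_used_bytes::day" property used elsewhere in this diff, and the helper name is hypothetical:

from analytics.models import RealmCount


def upload_usage_history(realm):
    # One RealmCount row is written per daily fill of the stat, so the
    # (end_time, value) pairs form the realm's upload-usage time series.
    rows = RealmCount.objects.filter(
        realm=realm, property="upload_quota_used_bytes::day"
    ).order_by("end_time")
    return [(row.end_time, row.value) for row in rows]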
Author: Mateusz Mandera, 2024-04-19 03:06:53 +02:00 (committed by Tim Abbott)
Parent: 4c4a443002
Commit: 9406bfbc0a
4 changed files with 124 additions and 5 deletions

View File

@@ -485,6 +485,40 @@ def sql_data_collector(
    return DataCollector(output_table, pull_function)


def count_upload_space_used_by_realm_query(realm: Optional[Realm]) -> QueryFn:
    if realm is None:
        realm_clause: Composable = SQL("")
    else:
        realm_clause = SQL("zerver_attachment.realm_id = {} AND").format(Literal(realm.id))

    # Note: This query currently has to go through the entire table,
    # summing all the sizes of attachments for every realm. This can be improved
    # by having a query which looks at the latest CountStat for each realm,
    # and sums it with only the new attachments.
    # There'd be additional complexity added by the fact that attachments can
    # also be deleted. Partially this can be accounted for by subtracting
    # ArchivedAttachment sizes, but there's still the issue of attachments
    # which can be directly deleted via the API.
    return lambda kwargs: SQL(
        """
    INSERT INTO analytics_realmcount (realm_id, property, end_time, value)
    SELECT
        zerver_attachment.realm_id,
        %(property)s,
        %(time_end)s,
        COALESCE(SUM(zerver_attachment.size), 0)
    FROM
        zerver_attachment
    WHERE
        {realm_clause}
        zerver_attachment.create_time < %(time_end)s
    GROUP BY
        zerver_attachment.realm_id
"""
    ).format(**kwargs, realm_clause=realm_clause)


def do_pull_minutes_active(
    property: str, start_time: datetime, end_time: datetime, realm: Optional[Realm] = None
) -> int:
@@ -863,6 +897,11 @@ def get_count_stats(realm: Optional[Realm] = None) -> Dict[str, CountStat]:
            CountStat.DAY,
            interval=TIMEDELTA_MAX,
        ),
        CountStat(
            "upload_quota_used_bytes::day",
            sql_data_collector(RealmCount, count_upload_space_used_by_realm_query(realm), None),
            CountStat.DAY,
        ),
        # Messages read stats. messages_read::hour is the total
        # number of messages read, whereas
        # messages_read_interactions::hour tries to count the total
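
For reference, a minimal sketch of how this stat gets filled for one day, using the same entry points the new test below exercises (COUNT_STATS and do_fill_count_stat_at_hour; the import path and helper name here are assumptions, not part of the commit):

from datetime import datetime

from analytics.lib.counts import COUNT_STATS, do_fill_count_stat_at_hour


def fill_upload_usage(end_time: datetime) -> None:
    # Writes one analytics_realmcount row per realm, summing the sizes of
    # attachments created before end_time, per the INSERT ... SELECT above.
    stat = COUNT_STATS["upload_quota_used_bytes::day"]
    do_fill_count_stat_at_hour(stat, end_time)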

View File

@@ -76,6 +76,7 @@ from zerver.models import (
)
from zerver.models.clients import get_client
from zerver.models.groups import SystemGroups
from zerver.models.messages import Attachment
from zerver.models.scheduled_jobs import NotificationTriggers
from zerver.models.users import get_user, is_cross_realm_bot_email
from zilencer.models import (
@@ -190,6 +191,18 @@ class AnalyticsTestCase(ZulipTestCase):
            kwargs[key] = kwargs.get(key, value)
        return Message.objects.create(**kwargs)

    def create_attachment(
        self, user_profile: UserProfile, filename: str, size: int, create_time: datetime
    ) -> Attachment:
        return Attachment.objects.create(
            file_name=filename,
            path_id=f"foo/bar/{filename}",
            owner=user_profile,
            realm=user_profile.realm,
            size=size,
            create_time=create_time,
        )

    # kwargs should only ever be a UserProfile or Stream.
    def assert_table_count(
        self,
@@ -546,6 +559,41 @@ class TestCountStats(AnalyticsTestCase):
        self.assertTableState(UserCount, [], [])
        self.assertTableState(StreamCount, [], [])

    def test_upload_quota_used_bytes(self) -> None:
        stat = COUNT_STATS["upload_quota_used_bytes::day"]
        self.current_property = stat.property

        user1 = self.create_user()
        user2 = self.create_user()
        user_second_realm = self.create_user(realm=self.second_realm)

        self.create_attachment(user1, "file1", 100, self.TIME_LAST_HOUR)
        attachment2 = self.create_attachment(user2, "file2", 200, self.TIME_LAST_HOUR)
        self.create_attachment(user_second_realm, "file3", 10, self.TIME_LAST_HOUR)

        do_fill_count_stat_at_hour(stat, self.TIME_ZERO)

        self.assertTableState(
            RealmCount,
            ["value", "subgroup", "realm"],
            [[300, None, self.default_realm], [10, None, self.second_realm]],
        )

        # Delete an attachment and run the CountStat job again the next day.
        attachment2.delete()
        do_fill_count_stat_at_hour(stat, self.TIME_ZERO + self.DAY)

        self.assertTableState(
            RealmCount,
            ["value", "subgroup", "realm", "end_time"],
            [
                [300, None, self.default_realm, self.TIME_ZERO],
                [10, None, self.second_realm, self.TIME_ZERO],
                [100, None, self.default_realm, self.TIME_ZERO + self.DAY],
                [10, None, self.second_realm, self.TIME_ZERO + self.DAY],
            ],
        )

    def test_active_users_by_is_bot_for_realm_constraint(self) -> None:
        # For single Realm

View File

@@ -870,12 +870,26 @@ class Realm(models.Model):  # type: ignore[django-manager-missing] # django-stub
        lambda realm: get_realm_used_upload_space_cache_key(realm.id), timeout=3600 * 24 * 7
    )
    def currently_used_upload_space_bytes(realm) -> int:  # noqa: N805
        from analytics.models import RealmCount, installation_epoch
        from zerver.models import Attachment

        used_space = Attachment.objects.filter(realm=realm).aggregate(Sum("size"))["size__sum"]
        if used_space is None:
            return 0
        return used_space
        try:
            latest_count_stat = RealmCount.objects.filter(
                realm=realm, property="upload_quota_used_bytes::day"
            ).latest("end_time")
            last_recorded_used_space = latest_count_stat.value
            last_recorded_date = latest_count_stat.end_time
        except RealmCount.DoesNotExist:
            last_recorded_used_space = 0
            last_recorded_date = installation_epoch()

        newly_used_space = Attachment.objects.filter(
            realm=realm, create_time__gte=last_recorded_date
        ).aggregate(Sum("size"))["size__sum"]
        if newly_used_space is None:
            return last_recorded_used_space
        return last_recorded_used_space + newly_used_space

    def ensure_not_on_limited_plan(self) -> None:
        if self.plan_type == Realm.PLAN_TYPE_LIMITED:
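
To make the new read path concrete, a small worked example with made-up numbers (not from the commit), following the logic of currently_used_upload_space_bytes() above:

# Suppose the latest "upload_quota_used_bytes::day" RealmCount row has
# value=300 at end_time=T, and one 50-byte attachment was uploaded after T.
last_recorded_used_space = 300
newly_used_space = 50  # Sum of Attachment.size with create_time >= T
assert last_recorded_used_space + newly_used_space == 350
# An attachment deleted after T still counts toward the total until the next
# daily fill rewrites the baseline (see the deletion case in the test above).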

View File

@@ -9,12 +9,14 @@ from urllib.parse import quote
import orjson
from django.conf import settings
from django.utils.timezone import now as timezone_now
from PIL import Image
from typing_extensions import override
from urllib3 import encode_multipart_formdata
from urllib3.fields import RequestField
import zerver.lib.upload
from analytics.models import RealmCount
from zerver.actions.create_realm import do_create_realm
from zerver.actions.message_send import internal_send_private_message
from zerver.actions.realm_icon import do_change_icon_source
@@ -23,7 +25,7 @@ from zerver.actions.realm_settings import do_change_realm_plan_type, do_set_real
from zerver.actions.user_settings import do_delete_avatar_image
from zerver.lib.attachments import validate_attachment_request
from zerver.lib.avatar import avatar_url, get_avatar_field
from zerver.lib.cache import cache_get, get_realm_used_upload_space_cache_key
from zerver.lib.cache import cache_delete, cache_get, get_realm_used_upload_space_cache_key
from zerver.lib.create_user import copy_default_settings
from zerver.lib.initial_password import initial_password
from zerver.lib.realm_icon import realm_icon_url
@@ -1844,6 +1846,22 @@ class UploadSpaceTests(UploadSerializeMixin, ZulipTestCase):
        self.assertEqual(None, cache_get(get_realm_used_upload_space_cache_key(self.realm.id)))
        self.assert_length(data2, self.realm.currently_used_upload_space_bytes())

        now = timezone_now()
        RealmCount.objects.create(
            realm=self.realm,
            property="upload_quota_used_bytes::day",
            end_time=now,
            value=len(data2),
        )
        # Purge the cache since we want to actually execute the function.
        cache_delete(get_realm_used_upload_space_cache_key(self.realm.id))
        self.assert_length(data2, self.realm.currently_used_upload_space_bytes())

        data3 = b"even-more-data!"
        upload_message_attachment("dummy3.txt", len(data3), "text/plain", data3, self.user_profile)
        self.assertEqual(len(data2) + len(data3), self.realm.currently_used_upload_space_bytes())


class DecompressionBombTests(ZulipTestCase):
    @override
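
For context on the cache_delete() call in the UploadSpaceTests hunk above: currently_used_upload_space_bytes() is wrapped in cache_with_key(), so the memcached entry has to be dropped before a call will exercise the new RealmCount-based computation rather than return the cached value. A minimal sketch of that pattern, using only imports already present in this diff (the helper name is hypothetical):

from zerver.lib.cache import cache_delete, get_realm_used_upload_space_cache_key


def force_recompute_used_space(realm):
    # Drop the cached total, then recompute it from the latest RealmCount row
    # plus attachments created since that row's end_time.
    cache_delete(get_realm_used_upload_space_cache_key(realm.id))
    return realm.currently_used_upload_space_bytes()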