diff --git a/analytics/lib/counts.py b/analytics/lib/counts.py
index 5f189cc0c6..4f9261f636 100644
--- a/analytics/lib/counts.py
+++ b/analytics/lib/counts.py
@@ -485,6 +485,40 @@ def sql_data_collector(
     return DataCollector(output_table, pull_function)
 
 
+def count_upload_space_used_by_realm_query(realm: Optional[Realm]) -> QueryFn:
+    if realm is None:
+        realm_clause: Composable = SQL("")
+    else:
+        realm_clause = SQL("zerver_attachment.realm_id = {} AND").format(Literal(realm.id))
+
+    # Note: This query currently has to go through the entire table,
+    # summing all the sizes of attachments for every realm. This can be improved
+    # by having a query which looks at the latest CountStat for each realm,
+    # and sums it with only the new attachments.
+    # There'd be additional complexity added by the fact that attachments can
+    # also be deleted. Partially this can be accounted for by subtracting
+    # ArchivedAttachment sizes, but there's still the issue of attachments
+    # which can be directly deleted via the API.
+
+    return lambda kwargs: SQL(
+        """
+        INSERT INTO analytics_realmcount (realm_id, property, end_time, value)
+        SELECT
+            zerver_attachment.realm_id,
+            %(property)s,
+            %(time_end)s,
+            COALESCE(SUM(zerver_attachment.size), 0)
+        FROM
+            zerver_attachment
+        WHERE
+            {realm_clause}
+            zerver_attachment.create_time < %(time_end)s
+        GROUP BY
+            zerver_attachment.realm_id
+        """
+    ).format(**kwargs, realm_clause=realm_clause)
+
+
 def do_pull_minutes_active(
     property: str, start_time: datetime, end_time: datetime, realm: Optional[Realm] = None
 ) -> int:
@@ -863,6 +897,11 @@ def get_count_stats(realm: Optional[Realm] = None) -> Dict[str, CountStat]:
             CountStat.DAY,
             interval=TIMEDELTA_MAX,
         ),
+        CountStat(
+            "upload_quota_used_bytes::day",
+            sql_data_collector(RealmCount, count_upload_space_used_by_realm_query(realm), None),
+            CountStat.DAY,
+        ),
         # Messages read stats. messages_read::hour is the total
         # number of messages read, whereas
         # messages_read_interactions::hour tries to count the total
diff --git a/analytics/tests/test_counts.py b/analytics/tests/test_counts.py
index 65de27ed3f..db35dcd9b0 100644
--- a/analytics/tests/test_counts.py
+++ b/analytics/tests/test_counts.py
@@ -76,6 +76,7 @@ from zerver.models import (
 )
 from zerver.models.clients import get_client
 from zerver.models.groups import SystemGroups
+from zerver.models.messages import Attachment
 from zerver.models.scheduled_jobs import NotificationTriggers
 from zerver.models.users import get_user, is_cross_realm_bot_email
 from zilencer.models import (
@@ -190,6 +191,18 @@ class AnalyticsTestCase(ZulipTestCase):
             kwargs[key] = kwargs.get(key, value)
         return Message.objects.create(**kwargs)
 
+    def create_attachment(
+        self, user_profile: UserProfile, filename: str, size: int, create_time: datetime
+    ) -> Attachment:
+        return Attachment.objects.create(
+            file_name=filename,
+            path_id=f"foo/bar/{filename}",
+            owner=user_profile,
+            realm=user_profile.realm,
+            size=size,
+            create_time=create_time,
+        )
+
     # kwargs should only ever be a UserProfile or Stream.
     def assert_table_count(
         self,
@@ -546,6 +559,41 @@ class TestCountStats(AnalyticsTestCase):
         self.assertTableState(UserCount, [], [])
         self.assertTableState(StreamCount, [], [])
 
+    def test_upload_quota_used_bytes(self) -> None:
+        stat = COUNT_STATS["upload_quota_used_bytes::day"]
+        self.current_property = stat.property
+
+        user1 = self.create_user()
+        user2 = self.create_user()
+        user_second_realm = self.create_user(realm=self.second_realm)
+
+        self.create_attachment(user1, "file1", 100, self.TIME_LAST_HOUR)
+        attachment2 = self.create_attachment(user2, "file2", 200, self.TIME_LAST_HOUR)
+        self.create_attachment(user_second_realm, "file3", 10, self.TIME_LAST_HOUR)
+
+        do_fill_count_stat_at_hour(stat, self.TIME_ZERO)
+
+        self.assertTableState(
+            RealmCount,
+            ["value", "subgroup", "realm"],
+            [[300, None, self.default_realm], [10, None, self.second_realm]],
+        )
+
+        # Delete an attachment and run the CountStat job again the next day.
+        attachment2.delete()
+        do_fill_count_stat_at_hour(stat, self.TIME_ZERO + self.DAY)
+
+        self.assertTableState(
+            RealmCount,
+            ["value", "subgroup", "realm", "end_time"],
+            [
+                [300, None, self.default_realm, self.TIME_ZERO],
+                [10, None, self.second_realm, self.TIME_ZERO],
+                [100, None, self.default_realm, self.TIME_ZERO + self.DAY],
+                [10, None, self.second_realm, self.TIME_ZERO + self.DAY],
+            ],
+        )
+
     def test_active_users_by_is_bot_for_realm_constraint(self) -> None:
         # For single Realm
diff --git a/zerver/models/realms.py b/zerver/models/realms.py
index 56880dc6e1..f87bb6e8a4 100644
--- a/zerver/models/realms.py
+++ b/zerver/models/realms.py
@@ -870,12 +870,26 @@ class Realm(models.Model):  # type: ignore[django-manager-missing] # django-stub
         lambda realm: get_realm_used_upload_space_cache_key(realm.id), timeout=3600 * 24 * 7
     )
     def currently_used_upload_space_bytes(realm) -> int:  # noqa: N805
+        from analytics.models import RealmCount, installation_epoch
         from zerver.models import Attachment
 
-        used_space = Attachment.objects.filter(realm=realm).aggregate(Sum("size"))["size__sum"]
-        if used_space is None:
-            return 0
-        return used_space
+        try:
+            latest_count_stat = RealmCount.objects.filter(
+                realm=realm, property="upload_quota_used_bytes::day"
+            ).latest("end_time")
+            last_recorded_used_space = latest_count_stat.value
+            last_recorded_date = latest_count_stat.end_time
+        except RealmCount.DoesNotExist:
+            last_recorded_used_space = 0
+            last_recorded_date = installation_epoch()
+
+        newly_used_space = Attachment.objects.filter(
+            realm=realm, create_time__gte=last_recorded_date
+        ).aggregate(Sum("size"))["size__sum"]
+
+        if newly_used_space is None:
+            return last_recorded_used_space
+        return last_recorded_used_space + newly_used_space
 
     def ensure_not_on_limited_plan(self) -> None:
         if self.plan_type == Realm.PLAN_TYPE_LIMITED:
diff --git a/zerver/tests/test_upload.py b/zerver/tests/test_upload.py
index fb7b49ad6c..df874d26f6 100644
--- a/zerver/tests/test_upload.py
+++ b/zerver/tests/test_upload.py
@@ -9,12 +9,14 @@ from urllib.parse import quote
 
 import orjson
 from django.conf import settings
+from django.utils.timezone import now as timezone_now
 from PIL import Image
 from typing_extensions import override
 from urllib3 import encode_multipart_formdata
 from urllib3.fields import RequestField
 
 import zerver.lib.upload
+from analytics.models import RealmCount
 from zerver.actions.create_realm import do_create_realm
 from zerver.actions.message_send import internal_send_private_message
 from zerver.actions.realm_icon import do_change_icon_source
@@ -23,7 +25,7 @@ from zerver.actions.realm_settings import do_change_realm_plan_type, do_set_real
 from zerver.actions.user_settings import do_delete_avatar_image
 from zerver.lib.attachments import validate_attachment_request
 from zerver.lib.avatar import avatar_url, get_avatar_field
-from zerver.lib.cache import cache_get, get_realm_used_upload_space_cache_key
+from zerver.lib.cache import cache_delete, cache_get, get_realm_used_upload_space_cache_key
 from zerver.lib.create_user import copy_default_settings
 from zerver.lib.initial_password import initial_password
 from zerver.lib.realm_icon import realm_icon_url
@@ -1844,6 +1846,22 @@ class UploadSpaceTests(UploadSerializeMixin, ZulipTestCase):
         self.assertEqual(None, cache_get(get_realm_used_upload_space_cache_key(self.realm.id)))
         self.assert_length(data2, self.realm.currently_used_upload_space_bytes())
 
+        now = timezone_now()
+        RealmCount.objects.create(
+            realm=self.realm,
+            property="upload_quota_used_bytes::day",
+            end_time=now,
+            value=len(data2),
+        )
+        # Purge the cache since we want to actually execute the function.
+        cache_delete(get_realm_used_upload_space_cache_key(self.realm.id))
+
+        self.assert_length(data2, self.realm.currently_used_upload_space_bytes())
+
+        data3 = b"even-more-data!"
+        upload_message_attachment("dummy3.txt", len(data3), "text/plain", data3, self.user_profile)
+        self.assertEqual(len(data2) + len(data3), self.realm.currently_used_upload_space_bytes())
+
 
 class DecompressionBombTests(ZulipTestCase):
     @override
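
Reviewer note on the counts.py hunk above: count_upload_space_used_by_realm_query returns a QueryFn, i.e. a callable that the CountStat runner invokes with a dict of SQL fragments to splice into the template, while the %(property)s and %(time_end)s placeholders are left behind as query parameters. The following minimal, self-contained sketch (not part of the patch) shows how the per-realm realm_clause composition behaves; it assumes psycopg2's sql module, and the function name attachment_size_query and the simplified SELECT are illustrative only.

# Sketch only; not Zulip code. Table and column names are taken from the diff.
from typing import Optional

from psycopg2.sql import SQL, Composable, Literal


def attachment_size_query(realm_id: Optional[int]) -> Composable:
    # With a realm, the clause pins the scan to that realm's attachments;
    # without one (the installation-wide case), it is empty and the GROUP BY
    # yields one row per realm.
    if realm_id is None:
        realm_clause: Composable = SQL("")
    else:
        realm_clause = SQL("zerver_attachment.realm_id = {} AND").format(Literal(realm_id))
    return SQL(
        """
        SELECT zerver_attachment.realm_id, COALESCE(SUM(zerver_attachment.size), 0)
        FROM zerver_attachment
        WHERE {realm_clause} zerver_attachment.create_time < %(time_end)s
        GROUP BY zerver_attachment.realm_id
        """
    ).format(realm_clause=realm_clause)


# .format() only substitutes the {realm_clause} piece; %(time_end)s is filled
# in as a query parameter when the composed statement is executed.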
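
The zerver/models/realms.py hunk swaps the full-table Sum over Attachment for an incremental computation: start from the most recent upload_quota_used_bytes::day RealmCount snapshot and add only attachments created at or after its end_time. Below is a rough plain-Python sketch of that accounting (the dataclasses, names, and numbers are made up for illustration; the real code uses the Django ORM as shown in the diff).

# Sketch only; mirrors the logic of currently_used_upload_space_bytes.
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List, Optional


@dataclass
class Snapshot:
    # Stands in for the latest "upload_quota_used_bytes::day" RealmCount row.
    end_time: datetime
    value: int  # total attachment bytes recorded by the daily CountStat


@dataclass
class FakeAttachment:
    size: int
    create_time: datetime


def used_upload_space(snapshot: Optional[Snapshot], attachments: List[FakeAttachment]) -> int:
    if snapshot is None:
        # No CountStat row yet: count everything, as the installation_epoch()
        # fallback in the patch does.
        baseline, since = 0, datetime.min.replace(tzinfo=timezone.utc)
    else:
        baseline, since = snapshot.value, snapshot.end_time
    newly_used = sum(a.size for a in attachments if a.create_time >= since)
    return baseline + newly_used

One consequence, also visible in the new tests: an attachment deleted after the latest snapshot keeps counting toward the reported total until the next daily fill produces a fresh RealmCount row, which matches the caveat in the counts.py comment about deletions.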