analytics: Rewrite realm_active_humans::day query.

This makes it no longer dependent on active_users_audit:is_bot:day,
which subsequent commits will make a RealmCount, not UserCount, query.
This folds the same behaviour of `active_users_audit` directly into
the query; however, only running over active users, using the index
from the earlier commit, and using the new `DISTINCT ON` formulation
make this a fast query compared to `active_users_audit:is_bot:day` +
the old `realm_active_humans::day`.
This commit is contained in:
Alex Vandiver 2024-06-02 15:16:44 +00:00 committed by Tim Abbott
parent e638ae44a8
commit 195defb031
2 changed files with 52 additions and 55 deletions

View File

@ -783,29 +783,45 @@ def count_realm_active_humans_query(realm: Optional[Realm]) -> QueryFn:
INSERT INTO analytics_realmcount INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time) (realm_id, value, property, subgroup, end_time)
SELECT SELECT
usercount1.realm_id, count(*), %(property)s, NULL, %(time_end)s active_usercount.realm_id, count(*), %(property)s, NULL, %(time_end)s
FROM ( FROM (
SELECT realm_id, user_id SELECT
FROM analytics_usercount realm_id,
WHERE user_id
property = 'active_users_audit:is_bot:day' AND FROM
subgroup = 'false' AND analytics_usercount
{realm_clause} WHERE
end_time = %(time_end)s property = '15day_actives::day'
) usercount1 {realm_clause}
AND end_time = %(time_end)s
) active_usercount
JOIN zerver_userprofile ON active_usercount.user_id = zerver_userprofile.id
JOIN ( JOIN (
SELECT realm_id, user_id SELECT DISTINCT ON (modified_user_id)
FROM analytics_usercount modified_user_id, event_type
WHERE FROM
property = '15day_actives::day' AND zerver_realmauditlog
{realm_clause} WHERE
end_time = %(time_end)s event_type IN ({user_created}, {user_activated}, {user_deactivated}, {user_reactivated})
) usercount2 AND event_time < %(time_end)s
ON ORDER BY
usercount1.user_id = usercount2.user_id modified_user_id,
GROUP BY usercount1.realm_id event_time DESC
) last_user_event ON last_user_event.modified_user_id = active_usercount.user_id
WHERE
NOT zerver_userprofile.is_bot
AND event_type IN ({user_created}, {user_activated}, {user_reactivated})
GROUP BY
active_usercount.realm_id
""" """
).format(**kwargs, realm_clause=realm_clause) ).format(
**kwargs,
user_created=Literal(RealmAuditLog.USER_CREATED),
user_activated=Literal(RealmAuditLog.USER_ACTIVATED),
user_deactivated=Literal(RealmAuditLog.USER_DEACTIVATED),
user_reactivated=Literal(RealmAuditLog.USER_REACTIVATED),
realm_clause=realm_clause,
)
# Currently unused and untested # Currently unused and untested
@ -951,7 +967,7 @@ def get_count_stats(realm: Optional[Realm] = None) -> Dict[str, CountStat]:
"realm_active_humans::day", "realm_active_humans::day",
sql_data_collector(RealmCount, count_realm_active_humans_query(realm), None), sql_data_collector(RealmCount, count_realm_active_humans_query(realm), None),
CountStat.DAY, CountStat.DAY,
dependencies=["active_users_audit:is_bot:day", "15day_actives::day"], dependencies=["15day_actives::day"],
), ),
] ]

View File

@ -3,7 +3,6 @@ from datetime import datetime, timedelta, timezone
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type from typing import Any, Dict, Iterator, List, Optional, Tuple, Type
from unittest import mock from unittest import mock
import orjson
import time_machine import time_machine
from django.apps import apps from django.apps import apps
from django.db import models from django.db import models
@ -2081,18 +2080,6 @@ class TestRealmActiveHumans(AnalyticsTestCase):
self.stat = COUNT_STATS["realm_active_humans::day"] self.stat = COUNT_STATS["realm_active_humans::day"]
self.current_property = self.stat.property self.current_property = self.stat.property
def mark_audit_active(self, user: UserProfile, end_time: Optional[datetime] = None) -> None:
if end_time is None:
end_time = self.TIME_ZERO
UserCount.objects.create(
user=user,
realm=user.realm,
property="active_users_audit:is_bot:day",
subgroup=orjson.dumps(user.is_bot).decode(),
end_time=end_time,
value=1,
)
def mark_15day_active(self, user: UserProfile, end_time: Optional[datetime] = None) -> None: def mark_15day_active(self, user: UserProfile, end_time: Optional[datetime] = None) -> None:
if end_time is None: if end_time is None:
end_time = self.TIME_ZERO end_time = self.TIME_ZERO
@ -2100,38 +2087,35 @@ class TestRealmActiveHumans(AnalyticsTestCase):
user=user, realm=user.realm, property="15day_actives::day", end_time=end_time, value=1 user=user, realm=user.realm, property="15day_actives::day", end_time=end_time, value=1
) )
def test_basic_boolean_logic(self) -> None: def test_basic_logic(self) -> None:
user = self.create_user() user = self.create_user()
self.mark_audit_active(user, end_time=self.TIME_ZERO - self.DAY)
self.mark_15day_active(user, end_time=self.TIME_ZERO) self.mark_15day_active(user, end_time=self.TIME_ZERO)
self.mark_audit_active(user, end_time=self.TIME_ZERO + self.DAY)
self.mark_15day_active(user, end_time=self.TIME_ZERO + self.DAY) self.mark_15day_active(user, end_time=self.TIME_ZERO + self.DAY)
for i in [-1, 0, 1]: for i in [-1, 0, 1]:
do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i * self.DAY) do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i * self.DAY)
self.assertTableState(RealmCount, ["value", "end_time"], [[1, self.TIME_ZERO + self.DAY]]) self.assertTableState(
RealmCount, ["value", "end_time"], [[1, self.TIME_ZERO], [1, self.TIME_ZERO + self.DAY]]
)
def test_bots_not_counted(self) -> None: def test_bots_not_counted(self) -> None:
bot = self.create_user(is_bot=True) bot = self.create_user(is_bot=True)
self.mark_audit_active(bot)
self.mark_15day_active(bot) self.mark_15day_active(bot)
do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO) do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO)
self.assertTableState(RealmCount, [], []) self.assertTableState(RealmCount, [], [])
def test_multiple_users_realms_and_times(self) -> None: def test_multiple_users_realms_and_times(self) -> None:
user1 = self.create_user() user1 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY)
user2 = self.create_user() user2 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY)
second_realm = do_create_realm(string_id="second", name="second") second_realm = do_create_realm(string_id="second", name="second")
user3 = self.create_user(realm=second_realm) user3 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY, realm=second_realm)
user4 = self.create_user(realm=second_realm) user4 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY, realm=second_realm)
user5 = self.create_user(realm=second_realm) user5 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY, realm=second_realm)
for user in [user1, user2, user3, user4, user5]:
self.mark_audit_active(user)
self.mark_15day_active(user)
for user in [user1, user3, user4]: for user in [user1, user3, user4]:
self.mark_audit_active(user, end_time=self.TIME_ZERO - self.DAY)
self.mark_15day_active(user, end_time=self.TIME_ZERO - self.DAY) self.mark_15day_active(user, end_time=self.TIME_ZERO - self.DAY)
for user in [user1, user2, user3, user4, user5]:
self.mark_15day_active(user)
for i in [-1, 0, 1]: for i in [-1, 0, 1]:
do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i * self.DAY) do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i * self.DAY)
@ -2139,17 +2123,14 @@ class TestRealmActiveHumans(AnalyticsTestCase):
RealmCount, RealmCount,
["value", "realm", "end_time"], ["value", "realm", "end_time"],
[ [
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
[1, self.default_realm, self.TIME_ZERO - self.DAY], [1, self.default_realm, self.TIME_ZERO - self.DAY],
[2, second_realm, self.TIME_ZERO - self.DAY], [2, second_realm, self.TIME_ZERO - self.DAY],
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
], ],
) )
# Check that adding spurious entries doesn't make a difference # Check that adding spurious entries doesn't make a difference
self.mark_audit_active(user1, end_time=self.TIME_ZERO + self.DAY)
self.mark_15day_active(user2, end_time=self.TIME_ZERO + self.DAY)
self.mark_15day_active(user2, end_time=self.TIME_ZERO - self.DAY)
self.create_user() self.create_user()
third_realm = do_create_realm(string_id="third", name="third") third_realm = do_create_realm(string_id="third", name="third")
self.create_user(realm=third_realm) self.create_user(realm=third_realm)
@ -2162,10 +2143,10 @@ class TestRealmActiveHumans(AnalyticsTestCase):
RealmCount, RealmCount,
["value", "realm", "end_time"], ["value", "realm", "end_time"],
[ [
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
[1, self.default_realm, self.TIME_ZERO - self.DAY], [1, self.default_realm, self.TIME_ZERO - self.DAY],
[2, second_realm, self.TIME_ZERO - self.DAY], [2, second_realm, self.TIME_ZERO - self.DAY],
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
], ],
) )