analytics: Rewrite realm_active_humans::day query.

This makes it no longer dependent on active_users_audit:is_bot:day,
which subsequent commits will make a RealmCount, not UserCount, query.
This folds the same behaviour of `active_users_audit` directly into
the query; however, only running over active users, using the index
from the earlier commit, and using the new `DISTINCT ON` formulation
make this a fast query compared to `active_users_audit:is_bot:day` +
the old `realm_active_humans::day`.
This commit is contained in:
Alex Vandiver 2024-06-02 15:16:44 +00:00 committed by Tim Abbott
parent e638ae44a8
commit 195defb031
2 changed files with 52 additions and 55 deletions

View File

@ -783,29 +783,45 @@ def count_realm_active_humans_query(realm: Optional[Realm]) -> QueryFn:
INSERT INTO analytics_realmcount INSERT INTO analytics_realmcount
(realm_id, value, property, subgroup, end_time) (realm_id, value, property, subgroup, end_time)
SELECT SELECT
usercount1.realm_id, count(*), %(property)s, NULL, %(time_end)s active_usercount.realm_id, count(*), %(property)s, NULL, %(time_end)s
FROM ( FROM (
SELECT realm_id, user_id SELECT
FROM analytics_usercount realm_id,
user_id
FROM
analytics_usercount
WHERE WHERE
property = 'active_users_audit:is_bot:day' AND property = '15day_actives::day'
subgroup = 'false' AND
{realm_clause} {realm_clause}
end_time = %(time_end)s AND end_time = %(time_end)s
) usercount1 ) active_usercount
JOIN zerver_userprofile ON active_usercount.user_id = zerver_userprofile.id
JOIN ( JOIN (
SELECT realm_id, user_id SELECT DISTINCT ON (modified_user_id)
FROM analytics_usercount modified_user_id, event_type
FROM
zerver_realmauditlog
WHERE WHERE
property = '15day_actives::day' AND event_type IN ({user_created}, {user_activated}, {user_deactivated}, {user_reactivated})
{realm_clause} AND event_time < %(time_end)s
end_time = %(time_end)s ORDER BY
) usercount2 modified_user_id,
ON event_time DESC
usercount1.user_id = usercount2.user_id ) last_user_event ON last_user_event.modified_user_id = active_usercount.user_id
GROUP BY usercount1.realm_id WHERE
NOT zerver_userprofile.is_bot
AND event_type IN ({user_created}, {user_activated}, {user_reactivated})
GROUP BY
active_usercount.realm_id
""" """
).format(**kwargs, realm_clause=realm_clause) ).format(
**kwargs,
user_created=Literal(RealmAuditLog.USER_CREATED),
user_activated=Literal(RealmAuditLog.USER_ACTIVATED),
user_deactivated=Literal(RealmAuditLog.USER_DEACTIVATED),
user_reactivated=Literal(RealmAuditLog.USER_REACTIVATED),
realm_clause=realm_clause,
)
# Currently unused and untested # Currently unused and untested
@ -951,7 +967,7 @@ def get_count_stats(realm: Optional[Realm] = None) -> Dict[str, CountStat]:
"realm_active_humans::day", "realm_active_humans::day",
sql_data_collector(RealmCount, count_realm_active_humans_query(realm), None), sql_data_collector(RealmCount, count_realm_active_humans_query(realm), None),
CountStat.DAY, CountStat.DAY,
dependencies=["active_users_audit:is_bot:day", "15day_actives::day"], dependencies=["15day_actives::day"],
), ),
] ]

View File

@ -3,7 +3,6 @@ from datetime import datetime, timedelta, timezone
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type from typing import Any, Dict, Iterator, List, Optional, Tuple, Type
from unittest import mock from unittest import mock
import orjson
import time_machine import time_machine
from django.apps import apps from django.apps import apps
from django.db import models from django.db import models
@ -2081,18 +2080,6 @@ class TestRealmActiveHumans(AnalyticsTestCase):
self.stat = COUNT_STATS["realm_active_humans::day"] self.stat = COUNT_STATS["realm_active_humans::day"]
self.current_property = self.stat.property self.current_property = self.stat.property
def mark_audit_active(self, user: UserProfile, end_time: Optional[datetime] = None) -> None:
if end_time is None:
end_time = self.TIME_ZERO
UserCount.objects.create(
user=user,
realm=user.realm,
property="active_users_audit:is_bot:day",
subgroup=orjson.dumps(user.is_bot).decode(),
end_time=end_time,
value=1,
)
def mark_15day_active(self, user: UserProfile, end_time: Optional[datetime] = None) -> None: def mark_15day_active(self, user: UserProfile, end_time: Optional[datetime] = None) -> None:
if end_time is None: if end_time is None:
end_time = self.TIME_ZERO end_time = self.TIME_ZERO
@ -2100,38 +2087,35 @@ class TestRealmActiveHumans(AnalyticsTestCase):
user=user, realm=user.realm, property="15day_actives::day", end_time=end_time, value=1 user=user, realm=user.realm, property="15day_actives::day", end_time=end_time, value=1
) )
def test_basic_boolean_logic(self) -> None: def test_basic_logic(self) -> None:
user = self.create_user() user = self.create_user()
self.mark_audit_active(user, end_time=self.TIME_ZERO - self.DAY)
self.mark_15day_active(user, end_time=self.TIME_ZERO) self.mark_15day_active(user, end_time=self.TIME_ZERO)
self.mark_audit_active(user, end_time=self.TIME_ZERO + self.DAY)
self.mark_15day_active(user, end_time=self.TIME_ZERO + self.DAY) self.mark_15day_active(user, end_time=self.TIME_ZERO + self.DAY)
for i in [-1, 0, 1]: for i in [-1, 0, 1]:
do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i * self.DAY) do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i * self.DAY)
self.assertTableState(RealmCount, ["value", "end_time"], [[1, self.TIME_ZERO + self.DAY]]) self.assertTableState(
RealmCount, ["value", "end_time"], [[1, self.TIME_ZERO], [1, self.TIME_ZERO + self.DAY]]
)
def test_bots_not_counted(self) -> None: def test_bots_not_counted(self) -> None:
bot = self.create_user(is_bot=True) bot = self.create_user(is_bot=True)
self.mark_audit_active(bot)
self.mark_15day_active(bot) self.mark_15day_active(bot)
do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO) do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO)
self.assertTableState(RealmCount, [], []) self.assertTableState(RealmCount, [], [])
def test_multiple_users_realms_and_times(self) -> None: def test_multiple_users_realms_and_times(self) -> None:
user1 = self.create_user() user1 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY)
user2 = self.create_user() user2 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY)
second_realm = do_create_realm(string_id="second", name="second") second_realm = do_create_realm(string_id="second", name="second")
user3 = self.create_user(realm=second_realm) user3 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY, realm=second_realm)
user4 = self.create_user(realm=second_realm) user4 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY, realm=second_realm)
user5 = self.create_user(realm=second_realm) user5 = self.create_user(date_joined=self.TIME_ZERO - 2 * self.DAY, realm=second_realm)
for user in [user1, user2, user3, user4, user5]:
self.mark_audit_active(user)
self.mark_15day_active(user)
for user in [user1, user3, user4]: for user in [user1, user3, user4]:
self.mark_audit_active(user, end_time=self.TIME_ZERO - self.DAY)
self.mark_15day_active(user, end_time=self.TIME_ZERO - self.DAY) self.mark_15day_active(user, end_time=self.TIME_ZERO - self.DAY)
for user in [user1, user2, user3, user4, user5]:
self.mark_15day_active(user)
for i in [-1, 0, 1]: for i in [-1, 0, 1]:
do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i * self.DAY) do_fill_count_stat_at_hour(self.stat, self.TIME_ZERO + i * self.DAY)
@ -2139,17 +2123,14 @@ class TestRealmActiveHumans(AnalyticsTestCase):
RealmCount, RealmCount,
["value", "realm", "end_time"], ["value", "realm", "end_time"],
[ [
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
[1, self.default_realm, self.TIME_ZERO - self.DAY], [1, self.default_realm, self.TIME_ZERO - self.DAY],
[2, second_realm, self.TIME_ZERO - self.DAY], [2, second_realm, self.TIME_ZERO - self.DAY],
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
], ],
) )
# Check that adding spurious entries doesn't make a difference # Check that adding spurious entries doesn't make a difference
self.mark_audit_active(user1, end_time=self.TIME_ZERO + self.DAY)
self.mark_15day_active(user2, end_time=self.TIME_ZERO + self.DAY)
self.mark_15day_active(user2, end_time=self.TIME_ZERO - self.DAY)
self.create_user() self.create_user()
third_realm = do_create_realm(string_id="third", name="third") third_realm = do_create_realm(string_id="third", name="third")
self.create_user(realm=third_realm) self.create_user(realm=third_realm)
@ -2162,10 +2143,10 @@ class TestRealmActiveHumans(AnalyticsTestCase):
RealmCount, RealmCount,
["value", "realm", "end_time"], ["value", "realm", "end_time"],
[ [
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
[1, self.default_realm, self.TIME_ZERO - self.DAY], [1, self.default_realm, self.TIME_ZERO - self.DAY],
[2, second_realm, self.TIME_ZERO - self.DAY], [2, second_realm, self.TIME_ZERO - self.DAY],
[2, self.default_realm, self.TIME_ZERO],
[3, second_realm, self.TIME_ZERO],
], ],
) )