2017-11-16 00:55:49 +01:00
|
|
|
import time
|
|
|
|
from collections import OrderedDict, defaultdict
|
|
|
|
from datetime import datetime, timedelta
|
2017-12-13 01:45:57 +01:00
|
|
|
import logging
|
2017-11-16 00:55:49 +01:00
|
|
|
from typing import Any, Callable, Dict, List, \
|
|
|
|
Optional, Text, Tuple, Type, Union
|
|
|
|
|
2017-02-19 01:59:45 +01:00
|
|
|
from django.conf import settings
|
2016-07-29 21:52:45 +02:00
|
|
|
from django.db import connection, models
|
2017-02-15 17:26:22 +01:00
|
|
|
from django.db.models import F
|
2016-07-29 21:52:45 +02:00
|
|
|
|
2017-11-16 00:55:49 +01:00
|
|
|
from analytics.models import Anomaly, BaseCount, \
|
|
|
|
FillState, InstallationCount, RealmCount, StreamCount, \
|
|
|
|
UserCount, installation_epoch, last_successful_fill
|
2017-12-13 01:45:57 +01:00
|
|
|
from zerver.lib.logging_util import log_to_file
|
2017-11-16 00:55:49 +01:00
|
|
|
from zerver.lib.timestamp import ceiling_to_day, \
|
|
|
|
ceiling_to_hour, floor_to_hour, verify_UTC
|
|
|
|
from zerver.models import Message, Realm, RealmAuditLog, \
|
|
|
|
Stream, UserActivityInterval, UserProfile, models
|
2016-10-13 22:52:39 +02:00
|
|
|
|
|
|
|
## Logging setup ##
|
2017-04-02 07:34:17 +02:00
|
|
|
|
2017-12-13 01:45:57 +01:00
|
|
|
# Analytics code in this module logs through the 'zulip.management' logger.
# NOTE(review): log_to_file presumably attaches a file handler writing to
# settings.ANALYTICS_LOG_PATH -- confirm against zerver.lib.logging_util.
logger = logging.getLogger('zulip.management')
log_to_file(logger, settings.ANALYTICS_LOG_PATH)
|
2016-10-13 22:52:39 +02:00
|
|
|
|
2017-03-16 05:08:36 +01:00
|
|
|
# timedelta.max cannot be subtracted from a datetime, so stats that want an
# "effectively unbounded" interval use this 1000 * 365 day stand-in instead.
TIMEDELTA_MAX = timedelta(days=1000 * 365)
|
analytics: Simplify frequency and measurement interval options.
Change the CountStat object to take an is_gauge variable instead of a
smallest_interval variable. Previously, (smallest_interval, frequency)
could be any of (hour, hour), (hour, day), (hour, gauge), (day, hour),
(day, day), or (day, gauge).
The current change is equivalent to excluding (hour, day) and (day, hour)
from the list above.
This change, along with other recent changes, allows us to simplify how we
handle time intervals. This commit also removes the TimeInterval object.
2016-10-14 00:15:46 +02:00
|
|
|
|
2017-04-02 07:34:17 +02:00
|
|
|
## Class definitions ##
|
|
|
|
|
2017-11-05 11:30:44 +01:00
|
|
|
class CountStat:
    """Configuration for a single analytics statistic.

    Attributes set by the constructor:
      property       -- name identifying the stat (used as the `property`
                        column value in the count tables and in FillState).
      data_collector -- the DataCollector holding the output table and the
                        optional pull function for this stat.
      frequency      -- CountStat.HOUR or CountStat.DAY.
      interval       -- timedelta subtracted from an end_time to obtain the
                        start of the window handed to the pull function.
    """
    HOUR = 'hour'
    DAY = 'day'
    FREQUENCIES = frozenset([HOUR, DAY])

    def __init__(self, property: str, data_collector: 'DataCollector', frequency: str,
                 interval: Optional[timedelta]=None) -> None:
        """Store the stat's settings.

        Raises AssertionError if `frequency` is not one of FREQUENCIES.
        When `interval` is omitted it defaults to one hour for hourly stats
        and one day for daily stats.
        """
        self.property = property
        self.data_collector = data_collector
        # might have to do something different for bitfields
        if frequency not in self.FREQUENCIES:
            raise AssertionError("Unknown frequency: %s" % (frequency,))
        self.frequency = frequency

        if interval is None:
            # Only HOUR and DAY can reach this point, so anything that is
            # not hourly is daily.
            if frequency == CountStat.HOUR:
                interval = timedelta(hours=1)
            else:
                interval = timedelta(days=1)
        self.interval = interval

    def __str__(self) -> Text:
        """Return a short human-readable tag containing the property name."""
        return "<CountStat: %s>" % (self.property,)
|
2016-12-15 02:04:28 +01:00
|
|
|
|
2017-02-15 17:26:22 +01:00
|
|
|
class LoggingCountStat(CountStat):
    """A CountStat whose DataCollector has no pull function.

    Rows for these stats are written by do_increment_logging_stat; the fill
    code skips the pull step for instances of this class and only aggregates.
    """
    def __init__(self, property: str, output_table: Type[BaseCount], frequency: str) -> None:
        """Build the stat with a pull-function-less collector for `output_table`."""
        collector = DataCollector(output_table, None)
        super().__init__(property, collector, frequency)
|
2017-02-15 17:26:22 +01:00
|
|
|
|
2017-04-05 07:51:55 +02:00
|
|
|
class DependentCountStat(CountStat):
    """A CountStat that is computed from other stats.

    `dependencies` lists the property names of the stats this one reads;
    process_count_stat never fills this stat past the last successful fill
    of any of them.
    """
    def __init__(self, property: str, data_collector: 'DataCollector', frequency: str,
                 interval: Optional[timedelta]=None,
                 dependencies: Optional[List[str]]=None) -> None:
        """Store the base CountStat settings plus the dependency list.

        `dependencies` defaults to a fresh empty list per instance. (A
        literal `[]` default would be a single list object shared by every
        instance created without the argument.)
        """
        CountStat.__init__(self, property, data_collector, frequency, interval=interval)
        self.dependencies = [] if dependencies is None else dependencies
|
|
|
|
|
2017-11-05 11:30:44 +01:00
|
|
|
class DataCollector:
    """Pairs the table a stat writes to with the function that fills it.

    pull_function is called as pull_function(property, start_time, end_time)
    and returns an int (logged by the fill code as a row count); it is None
    for stats that have no pull step.
    """
    def __init__(self, output_table: Type[BaseCount],
                 pull_function: Optional[Callable[[str, datetime, datetime], int]]) -> None:
        """Record the output table and the (possibly absent) pull function."""
        self.pull_function = pull_function
        self.output_table = output_table
|
2016-07-29 21:52:45 +02:00
|
|
|
|
2017-04-02 07:34:17 +02:00
|
|
|
## CountStat-level operations ##
|
2017-01-07 09:19:37 +01:00
|
|
|
|
2017-11-22 07:55:37 +01:00
|
|
|
def process_count_stat(stat: CountStat, fill_to_time: datetime) -> None:
    """Fill `stat` for every end_time up to and including `fill_to_time`.

    The stat's FillState row records the last end_time processed and whether
    that run finished. A missing row is created at installation_epoch(); a
    row left in STARTED state has its counts for that end_time deleted and is
    rewound by one increment before filling resumes.

    For a DependentCountStat, `fill_to_time` is capped at the earliest
    last-successful-fill time of its dependencies; if any dependency has
    never been filled, a warning is logged and nothing is filled.

    Raises AssertionError for an unknown frequency or FillState.state, and
    ValueError if `fill_to_time` is not on an hour boundary.
    """
    step_by_frequency = {
        CountStat.HOUR: timedelta(hours=1),
        CountStat.DAY: timedelta(days=1),
    }
    if stat.frequency not in step_by_frequency:
        raise AssertionError("Unknown frequency: %s" % (stat.frequency,))
    time_increment = step_by_frequency[stat.frequency]

    verify_UTC(fill_to_time)
    if floor_to_hour(fill_to_time) != fill_to_time:
        raise ValueError("fill_to_time must be on an hour boundary: %s" % (fill_to_time,))

    fill_state = FillState.objects.filter(property=stat.property).first()
    if fill_state is None:
        # First time this stat is processed: start from the installation epoch.
        currently_filled = installation_epoch()
        fill_state = FillState.objects.create(
            property=stat.property,
            end_time=currently_filled,
            state=FillState.DONE,
        )
        logger.info("INITIALIZED %s %s" % (stat.property, currently_filled))
    elif fill_state.state == FillState.STARTED:
        # The previous run did not finish this end_time: discard whatever it
        # wrote and step back so the loop below redoes it.
        interrupted_end_time = fill_state.end_time
        logger.info("UNDO START %s %s" % (stat.property, interrupted_end_time))
        do_delete_counts_at_hour(stat, interrupted_end_time)
        currently_filled = interrupted_end_time - time_increment
        do_update_fill_state(fill_state, currently_filled, FillState.DONE)
        logger.info("UNDO DONE %s" % (stat.property,))
    elif fill_state.state == FillState.DONE:
        currently_filled = fill_state.end_time
    else:
        raise AssertionError("Unknown value for FillState.state: %s." % (fill_state.state,))

    if isinstance(stat, DependentCountStat):
        for dependency in stat.dependencies:
            dependency_fill_time = last_successful_fill(dependency)
            if dependency_fill_time is None:
                logger.warning("DependentCountStat %s run before dependency %s." %
                               (stat.property, dependency))
                return
            fill_to_time = min(fill_to_time, dependency_fill_time)

    next_end_time = currently_filled + time_increment
    while next_end_time <= fill_to_time:
        logger.info("START %s %s" % (stat.property, next_end_time))
        started_at = time.time()
        # Mark STARTED before writing any counts so an interrupted run can be
        # detected and undone on the next invocation.
        do_update_fill_state(fill_state, next_end_time, FillState.STARTED)
        do_fill_count_stat_at_hour(stat, next_end_time)
        do_update_fill_state(fill_state, next_end_time, FillState.DONE)
        finished_at = time.time()
        next_end_time = next_end_time + time_increment
        logger.info("DONE %s (%dms)" % (stat.property, (finished_at - started_at) * 1000))
|
2016-10-12 23:40:48 +02:00
|
|
|
|
2017-11-22 07:55:37 +01:00
|
|
|
def do_update_fill_state(fill_state: FillState, end_time: datetime, state: int) -> None:
    """Set `end_time` and `state` on the FillState row and save it."""
    fill_state.end_time, fill_state.state = end_time, state
    fill_state.save()
|
|
|
|
|
2017-04-15 09:23:39 +02:00
|
|
|
# The caller is responsible for passing a valid end_time: timezone aware and
# on the day or hour boundary appropriate for the stat. Nothing here checks it.
def do_fill_count_stat_at_hour(stat: CountStat, end_time: datetime) -> None:
    """Compute `stat` for the window ending at `end_time`.

    Unless `stat` is a LoggingCountStat, its pull function is run over
    [end_time - stat.interval, end_time] and the timing is logged. In every
    case the result is then aggregated into the summary tables.
    """
    start_time = end_time - stat.interval
    is_logging_stat = isinstance(stat, LoggingCountStat)
    if not is_logging_stat:
        pull_started = time.time()
        pull_function = stat.data_collector.pull_function
        assert pull_function is not None
        rows_added = pull_function(stat.property, start_time, end_time)
        elapsed_ms = (time.time() - pull_started) * 1000
        logger.info("%s run pull_function (%dms/%sr)" %
                    (stat.property, elapsed_ms, rows_added))
    do_aggregate_to_summary_table(stat, end_time)
|
2016-07-29 21:52:45 +02:00
|
|
|
|
2017-11-22 07:55:37 +01:00
|
|
|
def do_delete_counts_at_hour(stat: CountStat, end_time: datetime) -> None:
    """Delete the rows the fill code writes for `stat` at `end_time`.

    For a LoggingCountStat only the aggregated tables are cleared
    (InstallationCount, plus RealmCount when the stat's own output table is
    UserCount or StreamCount); the stat's output table itself is left alone.
    For every other stat all four count tables are cleared.
    """
    if isinstance(stat, LoggingCountStat):
        tables_to_clear = [InstallationCount]
        if stat.data_collector.output_table in [UserCount, StreamCount]:
            tables_to_clear.append(RealmCount)
    else:
        tables_to_clear = [UserCount, StreamCount, RealmCount, InstallationCount]

    for table in tables_to_clear:
        table.objects.filter(property=stat.property, end_time=end_time).delete()
|
2016-10-12 23:40:48 +02:00
|
|
|
|
2017-11-22 07:55:37 +01:00
|
|
|
def do_aggregate_to_summary_table(stat: CountStat, end_time: datetime) -> None:
    """Roll `stat`'s rows for `end_time` up into the summary tables.

    If the stat's output table is UserCount or StreamCount, its rows are
    first summed per (realm, subgroup) into analytics_realmcount. Then the
    analytics_realmcount rows for this property and end_time are summed per
    subgroup into analytics_installationcount. Each step logs its duration
    and the cursor's rowcount.

    The table name and property are interpolated into the SQL text with `%`
    (they come from the stat definition); `end_time` is passed as a query
    parameter via the doubled `%%(end_time)s` placeholder.
    """
    output_table = stat.data_collector.output_table

    # Using the cursor as a context manager guarantees it is closed even if
    # one of the execute() calls raises.
    with connection.cursor() as cursor:
        # Aggregate into RealmCount
        if output_table in (UserCount, StreamCount):
            realmcount_query = """
                INSERT INTO analytics_realmcount
                    (realm_id, value, property, subgroup, end_time)
                SELECT
                    zerver_realm.id, COALESCE(sum(%(output_table)s.value), 0), '%(property)s',
                    %(output_table)s.subgroup, %%(end_time)s
                FROM zerver_realm
                JOIN %(output_table)s
                ON
                    zerver_realm.id = %(output_table)s.realm_id
                WHERE
                    %(output_table)s.property = '%(property)s' AND
                    %(output_table)s.end_time = %%(end_time)s
                GROUP BY zerver_realm.id, %(output_table)s.subgroup
            """ % {'output_table': output_table._meta.db_table,
                   'property': stat.property}
            start = time.time()
            cursor.execute(realmcount_query, {'end_time': end_time})
            end = time.time()
            logger.info("%s RealmCount aggregation (%dms/%sr)" % (
                stat.property, (end - start) * 1000, cursor.rowcount))

        # Aggregate into InstallationCount
        installationcount_query = """
            INSERT INTO analytics_installationcount
                (value, property, subgroup, end_time)
            SELECT
                sum(value), '%(property)s', analytics_realmcount.subgroup, %%(end_time)s
            FROM analytics_realmcount
            WHERE
                property = '%(property)s' AND
                end_time = %%(end_time)s
            GROUP BY analytics_realmcount.subgroup
        """ % {'property': stat.property}
        start = time.time()
        cursor.execute(installationcount_query, {'end_time': end_time})
        end = time.time()
        logger.info("%s InstallationCount aggregation (%dms/%sr)" % (
            stat.property, (end - start) * 1000, cursor.rowcount))
|
|
|
|
|
2017-04-02 07:34:17 +02:00
|
|
|
## Utility functions called from outside counts.py ##
|
|
|
|
|
|
|
|
# called from zerver/lib/actions.py; should not throw any errors
|
2017-11-22 07:55:37 +01:00
|
|
|
def do_increment_logging_stat(zerver_object: Union[Realm, UserProfile, Stream], stat: CountStat,
                              subgroup: Optional[Union[str, int, bool]], event_time: datetime,
                              increment: int=1) -> None:
    """Add `increment` to the row of `stat` that covers `event_time`.

    The row lives in the stat's output table and is keyed by property,
    subgroup, end_time (event_time rounded up to the stat's day or hour
    boundary) and the realm/user/stream identified by `zerver_object`. It is
    created with value `increment` if missing; otherwise its value is
    incremented with an F() expression, i.e. in the database.
    """
    table = stat.data_collector.output_table
    if table == RealmCount:
        id_args = {'realm': zerver_object}
    else:
        # Any table other than UserCount is treated as StreamCount.
        owner_field = 'user' if table == UserCount else 'stream'
        id_args = {'realm': zerver_object.realm, owner_field: zerver_object}

    # Any frequency other than DAY is treated as HOUR.
    round_up = ceiling_to_day if stat.frequency == CountStat.DAY else ceiling_to_hour
    end_time = round_up(event_time)

    row, created = table.objects.get_or_create(
        property=stat.property, subgroup=subgroup, end_time=end_time,
        defaults={'value': increment}, **id_args)
    if created:
        return
    row.value = F('value') + increment
    row.save(update_fields=['value'])
|
|
|
|
|
2017-11-22 07:55:37 +01:00
|
|
|
def do_drop_all_analytics_tables() -> None:
    """Delete every row from all analytics tables (the tables themselves remain)."""
    for table in (UserCount, StreamCount, RealmCount, InstallationCount,
                  FillState, Anomaly):
        table.objects.all().delete()
|
|
|
|
|
2017-11-22 07:55:37 +01:00
|
|
|
def do_drop_single_stat(property: str) -> None:
    """Delete all count rows and the FillState row for the stat named `property`."""
    for table in (UserCount, StreamCount, RealmCount, InstallationCount, FillState):
        table.objects.filter(property=property).delete()
|
|
|
|
|
2017-04-02 07:34:17 +02:00
|
|
|
## DataCollector-level operations ##
|
|
|
|
|
2017-11-22 07:55:37 +01:00
|
|
|
def do_pull_by_sql_query(property: str, start_time: datetime, end_time: datetime, query: str,
                         group_by: Optional[Tuple[Type[models.Model], str]]) -> int:
    """Run one of this module's INSERT ... SELECT query templates.

    `query` is a template containing `%(property)s`, `%(subgroup)s` and
    `%(group_by_clause)s` placeholders plus doubled `%%(time_start)s` /
    `%%(time_end)s` placeholders. `group_by`, when given, is a
    (model class, column name) pair; the subgroup expression becomes
    "<model db_table>.<column>" and is also appended to the GROUP BY.
    With `group_by=None` the subgroup is the SQL literal NULL.

    Returns the cursor's rowcount after executing the query.
    """
    if group_by is None:
        subgroup = 'NULL'
        group_by_clause = ''
    else:
        subgroup = '%s.%s' % (group_by[0]._meta.db_table, group_by[1])
        group_by_clause = ', ' + subgroup

    # We do string replacement here because cursor.execute will reject a
    # group_by_clause given as a param.
    # We pass in the datetimes as params to cursor.execute so that we don't have to
    # think about how to convert python datetimes to SQL datetimes.
    query_ = query % {'property': property, 'subgroup': subgroup,
                      'group_by_clause': group_by_clause}

    # The context manager closes the cursor even when execute() raises.
    with connection.cursor() as cursor:
        cursor.execute(query_, {'time_start': start_time, 'time_end': end_time})
        return cursor.rowcount
|
2016-07-29 21:52:45 +02:00
|
|
|
|
2017-11-22 07:55:37 +01:00
|
|
|
def sql_data_collector(output_table: Type[BaseCount], query: str,
                       group_by: Optional[Tuple[Type[models.Model], str]]) -> DataCollector:
    """Build a DataCollector whose pull function runs `query`.

    `query` and `group_by` are captured by the returned collector's pull
    function and forwarded to do_pull_by_sql_query on each call. `group_by`
    is either None or a (model class, column name) pair, as in
    (UserProfile, 'is_bot').
    """
    def pull_function(property: str, start_time: datetime, end_time: datetime) -> int:
        return do_pull_by_sql_query(property, start_time, end_time, query, group_by)
    return DataCollector(output_table, pull_function)
|
|
|
|
|
2017-11-22 07:55:37 +01:00
|
|
|
def do_pull_minutes_active(property: str, start_time: datetime, end_time: datetime) -> int:
    """Write per-user active-minute counts for the given window into UserCount.

    Every UserActivityInterval overlapping (start_time, end_time) is clipped
    to the window and its length in seconds is summed per (user, realm).
    Users with at least 60 seconds get one UserCount row whose value is the
    whole number of minutes. Returns the number of rows created.
    """
    overlapping = UserActivityInterval.objects.filter(
        end__gt=start_time, start__lt=end_time)
    interval_rows = overlapping.select_related('user_profile').values_list(
        'user_profile_id', 'user_profile__realm_id', 'start', 'end')

    seconds_active = defaultdict(float)  # type: Dict[Tuple[int, int], float]
    for user_id, realm_id, interval_start, interval_end in interval_rows:
        # Only count the part of the interval that falls inside the window.
        clipped_start = max(start_time, interval_start)
        clipped_end = min(end_time, interval_end)
        seconds_active[(user_id, realm_id)] += (clipped_end - clipped_start).total_seconds()

    rows = []
    for (user_id, realm_id), seconds in seconds_active.items():
        if seconds >= 60:
            rows.append(UserCount(user_id=user_id, realm_id=realm_id, property=property,
                                  end_time=end_time, value=int(seconds // 60)))
    UserCount.objects.bulk_create(rows)
    return len(rows)
|
2016-07-29 21:52:45 +02:00
|
|
|
|
|
|
|
count_message_by_user_query = """
|
|
|
|
INSERT INTO analytics_usercount
|
2017-01-16 22:05:51 +01:00
|
|
|
(user_id, realm_id, value, property, subgroup, end_time)
|
2016-07-29 21:52:45 +02:00
|
|
|
SELECT
|
2017-11-04 12:38:25 +01:00
|
|
|
zerver_userprofile.id, zerver_userprofile.realm_id, count(*),
|
|
|
|
'%(property)s', %(subgroup)s, %%(time_end)s
|
2016-07-29 21:52:45 +02:00
|
|
|
FROM zerver_userprofile
|
|
|
|
JOIN zerver_message
|
|
|
|
ON
|
2017-02-18 00:15:38 +01:00
|
|
|
zerver_userprofile.id = zerver_message.sender_id
|
|
|
|
WHERE
|
|
|
|
zerver_userprofile.date_joined < %%(time_end)s AND
|
2016-07-29 21:52:45 +02:00
|
|
|
zerver_message.pub_date >= %%(time_start)s AND
|
|
|
|
zerver_message.pub_date < %%(time_end)s
|
2016-10-26 00:41:57 +02:00
|
|
|
GROUP BY zerver_userprofile.id %(group_by_clause)s
|
2016-07-29 21:52:45 +02:00
|
|
|
"""
|
|
|
|
|
2017-04-02 07:49:50 +02:00
|
|
|
# Note: ignores the group_by / group_by_clause.
# Query template for do_pull_by_sql_query: inserts into analytics_usercount
# one row per (sender, message_type) with the number of messages sent in
# [time_start, time_end). message_type is computed by the CASE expression:
# recipient type 1 -> 'private_message', type 3 -> 'huddle_message',
# otherwise 'private_stream' when the joined stream is invite_only and
# 'public_stream' in all remaining cases. The subgroup column is always
# message_type; only %(property)s is substituted by string formatting.
count_message_type_by_user_query = """
    INSERT INTO analytics_usercount
            (realm_id, user_id, value, property, subgroup, end_time)
    SELECT realm_id, id, SUM(count) AS value, '%(property)s', message_type, %%(time_end)s
    FROM
        (
        SELECT zerver_userprofile.realm_id, zerver_userprofile.id, count(*),
        CASE WHEN
                  zerver_recipient.type = 1 THEN 'private_message'
             WHEN
                  zerver_recipient.type = 3 THEN 'huddle_message'
             WHEN
                  zerver_stream.invite_only = TRUE THEN 'private_stream'
             ELSE 'public_stream'
        END
        message_type

        FROM zerver_userprofile
        JOIN zerver_message
        ON
            zerver_userprofile.id = zerver_message.sender_id AND
            zerver_message.pub_date >= %%(time_start)s AND
            zerver_message.pub_date < %%(time_end)s
        JOIN zerver_recipient
        ON
            zerver_message.recipient_id = zerver_recipient.id
        LEFT JOIN zerver_stream
        ON
            zerver_recipient.type_id = zerver_stream.id
        GROUP BY
            zerver_userprofile.realm_id, zerver_userprofile.id,
            zerver_recipient.type, zerver_stream.invite_only
        ) AS subquery
    GROUP BY realm_id, id, message_type
"""
|
|
|
|
|
2017-04-02 07:49:50 +02:00
|
|
|
# This query joins to the UserProfile table since all current queries that
# use this also subgroup on UserProfile.is_bot. If in the future there is a
# stat that counts messages by stream and doesn't need the UserProfile
# table, consider writing a new query for efficiency.
#
# Query template for do_pull_by_sql_query: inserts into analytics_streamcount
# one row per stream (and per subgroup value) with the number of messages
# whose recipient is that stream (zerver_recipient.type = 2) and whose
# pub_date lies in [time_start, time_end).
count_message_by_stream_query = """
    INSERT INTO analytics_streamcount
        (stream_id, realm_id, value, property, subgroup, end_time)
    SELECT
        zerver_stream.id, zerver_stream.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
    FROM zerver_stream
    JOIN zerver_recipient
    ON
        zerver_stream.id = zerver_recipient.type_id
    JOIN zerver_message
    ON
        zerver_recipient.id = zerver_message.recipient_id
    JOIN zerver_userprofile
    ON
        zerver_message.sender_id = zerver_userprofile.id
    WHERE
        zerver_stream.date_created < %%(time_end)s AND
        zerver_recipient.type = 2 AND
        zerver_message.pub_date >= %%(time_start)s AND
        zerver_message.pub_date < %%(time_end)s
    GROUP BY zerver_stream.id %(group_by_clause)s
"""
|
|
|
|
|
2017-04-02 07:34:17 +02:00
|
|
|
# Hardcodes the query needed by active_users:is_bot:day, since that is
# currently the only stat that uses this.
#
# Query template for do_pull_by_sql_query: inserts into analytics_realmcount
# one row per realm (and per subgroup value) counting users with
# is_active = TRUE whose date_joined lies in [time_start, time_end).
count_user_by_realm_query = """
    INSERT INTO analytics_realmcount
        (realm_id, value, property, subgroup, end_time)
    SELECT
        zerver_realm.id, count(*),'%(property)s', %(subgroup)s, %%(time_end)s
    FROM zerver_realm
    JOIN zerver_userprofile
    ON
        zerver_realm.id = zerver_userprofile.realm_id
    WHERE
        zerver_realm.date_created < %%(time_end)s AND
        zerver_userprofile.date_joined >= %%(time_start)s AND
        zerver_userprofile.date_joined < %%(time_end)s AND
        zerver_userprofile.is_active = TRUE
    GROUP BY zerver_realm.id %(group_by_clause)s
"""
|
|
|
|
|
2017-04-01 03:26:35 +02:00
|
|
|
# Currently hardcodes the query needed for active_users_audit:is_bot:day.
# Assumes that a user cannot have two RealmAuditLog entries with the same event_time and
# event_type in ['user_created', 'user_deactivated', etc].
# In particular, it's important to ensure that migrations don't cause that to happen.
#
# Query template for do_pull_by_sql_query: for each user, finds the latest
# activation-related audit log entry before time_end (subquery ral2) and
# inserts a value-1 row into analytics_usercount when that latest entry is
# 'user_created', 'user_activated' or 'user_reactivated'. There is no
# GROUP BY, so %(group_by_clause)s does not appear; time_start is unused.
check_realmauditlog_by_user_query = """
    INSERT INTO analytics_usercount
        (user_id, realm_id, value, property, subgroup, end_time)
    SELECT
        ral1.modified_user_id, ral1.realm_id, 1, '%(property)s', %(subgroup)s, %%(time_end)s
    FROM zerver_realmauditlog ral1
    JOIN (
        SELECT modified_user_id, max(event_time) AS max_event_time
        FROM zerver_realmauditlog
        WHERE
            event_type in ('user_created', 'user_deactivated', 'user_activated', 'user_reactivated') AND
            event_time < %%(time_end)s
        GROUP BY modified_user_id
    ) ral2
    ON
        ral1.event_time = max_event_time AND
        ral1.modified_user_id = ral2.modified_user_id
    JOIN zerver_userprofile
    ON
        ral1.modified_user_id = zerver_userprofile.id
    WHERE
        ral1.event_type in ('user_created', 'user_activated', 'user_reactivated')
"""
|
|
|
|
|
2017-04-02 07:34:17 +02:00
|
|
|
check_useractivityinterval_by_user_query = """
|
|
|
|
INSERT INTO analytics_usercount
|
|
|
|
(user_id, realm_id, value, property, subgroup, end_time)
|
|
|
|
SELECT
|
|
|
|
zerver_userprofile.id, zerver_userprofile.realm_id, 1, '%(property)s', %(subgroup)s, %%(time_end)s
|
|
|
|
FROM zerver_userprofile
|
|
|
|
JOIN zerver_useractivityinterval
|
|
|
|
ON
|
|
|
|
zerver_userprofile.id = zerver_useractivityinterval.user_profile_id
|
|
|
|
WHERE
|
|
|
|
zerver_useractivityinterval.end >= %%(time_start)s AND
|
|
|
|
zerver_useractivityinterval.start < %%(time_end)s
|
|
|
|
GROUP BY zerver_userprofile.id %(group_by_clause)s
|
|
|
|
"""
|
2017-03-16 09:23:44 +01:00
|
|
|
|
2017-04-05 07:51:55 +02:00
|
|
|
count_realm_active_humans_query = """
|
|
|
|
INSERT INTO analytics_realmcount
|
|
|
|
(realm_id, value, property, subgroup, end_time)
|
|
|
|
SELECT
|
|
|
|
usercount1.realm_id, count(*), '%(property)s', NULL, %%(time_end)s
|
|
|
|
FROM (
|
|
|
|
SELECT realm_id, user_id
|
|
|
|
FROM analytics_usercount
|
|
|
|
WHERE
|
|
|
|
property = 'active_users_audit:is_bot:day' AND
|
|
|
|
subgroup = 'false' AND
|
|
|
|
end_time = %%(time_end)s
|
|
|
|
) usercount1
|
|
|
|
JOIN (
|
|
|
|
SELECT realm_id, user_id
|
|
|
|
FROM analytics_usercount
|
|
|
|
WHERE
|
|
|
|
property = '15day_actives::day' AND
|
|
|
|
end_time = %%(time_end)s
|
|
|
|
) usercount2
|
|
|
|
ON
|
|
|
|
usercount1.user_id = usercount2.user_id
|
|
|
|
GROUP BY usercount1.realm_id
|
|
|
|
"""
|
|
|
|
|
2017-04-02 07:34:17 +02:00
|
|
|
# Currently unused and untested
#
# Query template for do_pull_by_sql_query: inserts into analytics_realmcount
# one row per realm (and per subgroup value) counting streams whose
# date_created lies in [time_start, time_end).
# The ON clause must end at the join condition: a dangling "AND" directly
# before WHERE is a SQL syntax error.
count_stream_by_realm_query = """
    INSERT INTO analytics_realmcount
        (realm_id, value, property, subgroup, end_time)
    SELECT
        zerver_realm.id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
    FROM zerver_realm
    JOIN zerver_stream
    ON
        zerver_realm.id = zerver_stream.realm_id
    WHERE
        zerver_realm.date_created < %%(time_end)s AND
        zerver_stream.date_created >= %%(time_start)s AND
        zerver_stream.date_created < %%(time_end)s
    GROUP BY zerver_realm.id %(group_by_clause)s
"""
|
2017-03-16 09:23:44 +01:00
|
|
|
|
2017-04-02 07:34:17 +02:00
|
|
|
## CountStat declarations ##
|
2017-03-16 09:23:44 +01:00
|
|
|
|
2017-02-15 04:17:00 +01:00
|
|
|
# Declarations of every stat this module defines, in processing order.
# Each entry pairs a property name with the collector that produces its rows
# and the frequency at which it is computed.
count_stats_ = [
    # Messages Sent stats
    # Stats that count the number of messages sent in various ways.
    # These are also the set of stats that read from the Message table.

    CountStat('messages_sent:is_bot:hour',
              sql_data_collector(UserCount, count_message_by_user_query, (UserProfile, 'is_bot')),
              CountStat.HOUR),
    CountStat('messages_sent:message_type:day',
              sql_data_collector(UserCount, count_message_type_by_user_query, None), CountStat.DAY),
    CountStat('messages_sent:client:day',
              sql_data_collector(UserCount, count_message_by_user_query, (Message, 'sending_client_id')),
              CountStat.DAY),
    CountStat('messages_in_stream:is_bot:day',
              sql_data_collector(StreamCount, count_message_by_stream_query, (UserProfile, 'is_bot')),
              CountStat.DAY),

    # Number of Users stats
    # Stats that count the number of active users in the UserProfile.is_active sense.

    # 'active_users_audit:is_bot:day' is the canonical record of which users were
    # active on which days (in the UserProfile.is_active sense).
    # Important that this stay a daily stat, so that 'realm_active_humans::day' works as expected.
    CountStat('active_users_audit:is_bot:day',
              sql_data_collector(UserCount, check_realmauditlog_by_user_query, (UserProfile, 'is_bot')),
              CountStat.DAY),
    # Sanity check on 'active_users_audit:is_bot:day', and an archetype for future LoggingCountStats.
    # In RealmCount, 'active_users_audit:is_bot:day' should be the partial
    # sum sequence of 'active_users_log:is_bot:day', for any realm that
    # started after the latter stat was introduced.
    LoggingCountStat('active_users_log:is_bot:day', RealmCount, CountStat.DAY),
    # Another sanity check on 'active_users_audit:is_bot:day'. It is only an
    # approximation, e.g. if a user is deactivated between the end of the
    # day and when this stat is run, they won't be counted. However, it is the
    # simplest of the three to inspect by hand.
    CountStat('active_users:is_bot:day',
              sql_data_collector(RealmCount, count_user_by_realm_query, (UserProfile, 'is_bot')),
              CountStat.DAY, interval=TIMEDELTA_MAX),

    # User Activity stats
    # Stats that measure user activity in the UserActivityInterval sense.

    CountStat('15day_actives::day',
              sql_data_collector(UserCount, check_useractivityinterval_by_user_query, None),
              CountStat.DAY, interval=timedelta(days=15)-UserActivityInterval.MIN_INTERVAL_LENGTH),
    CountStat('minutes_active::day', DataCollector(UserCount, do_pull_minutes_active), CountStat.DAY),

    # Rate limiting stats

    # Used to limit the number of invitation emails sent by a realm
    LoggingCountStat('invites_sent::day', RealmCount, CountStat.DAY),

    # Dependent stats
    # Must come after their dependencies.

    # Canonical account of the number of active humans in a realm on each day.
    DependentCountStat('realm_active_humans::day',
                       sql_data_collector(RealmCount, count_realm_active_humans_query, None),
                       CountStat.DAY,
                       dependencies=['active_users_audit:is_bot:day', '15day_actives::day'])
]
|
|
|
|
|
2017-04-05 07:51:55 +02:00
|
|
|
# Lookup from property name to its stat, preserving the declaration order of
# count_stats_ (dependent stats are declared after their dependencies).
COUNT_STATS = OrderedDict((stat.property, stat) for stat in count_stats_)
|