2017-02-19 01:59:45 +01:00
|
|
|
from django.conf import settings
|
2016-07-29 21:52:45 +02:00
|
|
|
from django.db import connection, models
|
2017-02-15 17:26:22 +01:00
|
|
|
from django.db.models import F
|
analytics: Simplify frequency and measurement interval options.
Change the CountStat object to take an is_gauge variable instead of a
smallest_interval variable. Previously, (smallest_interval, frequency)
could be any of (hour, hour), (hour, day), (hour, gauge), (day, hour),
(day, day), or (day, gauge).
The current change is equivalent to excluding (hour, day) and (day, hour)
from the list above.
This change, along with other recent changes, allows us to simplify how we
handle time intervals. This commit also removes the TimeInterval object.
2016-10-14 00:15:46 +02:00
|
|
|
from django.utils import timezone
|
2016-07-29 21:52:45 +02:00
|
|
|
|
|
|
|
from analytics.models import InstallationCount, RealmCount, \
|
2017-03-12 08:55:55 +01:00
|
|
|
UserCount, StreamCount, BaseCount, FillState, Anomaly, installation_epoch
|
2017-03-16 09:23:44 +01:00
|
|
|
from zerver.models import Realm, UserProfile, Message, Stream, \
|
2017-04-01 03:26:35 +02:00
|
|
|
UserActivityInterval, RealmAuditLog, models
|
2017-02-15 17:26:22 +01:00
|
|
|
from zerver.lib.timestamp import floor_to_day, floor_to_hour, ceiling_to_day, \
|
|
|
|
ceiling_to_hour
|
2016-07-29 21:52:45 +02:00
|
|
|
|
2017-03-16 09:23:44 +01:00
|
|
|
from typing import Any, Callable, Dict, Optional, Text, Tuple, Type, Union
|
2016-07-29 21:52:45 +02:00
|
|
|
|
2017-03-16 09:23:44 +01:00
|
|
|
from collections import defaultdict
|
2017-02-19 01:59:45 +01:00
|
|
|
from datetime import timedelta, datetime
|
2016-10-13 22:52:39 +02:00
|
|
|
import logging
|
|
|
|
import time
|
|
|
|
|
|
|
|
## Logging setup ##
# Message layout shared by logging.basicConfig and the file handler below.
log_format = '%(asctime)s %(levelname)-8s %(message)s'
logging.basicConfig(format=log_format)

# Handler that writes records to the file named by settings.ANALYTICS_LOG_PATH.
formatter = logging.Formatter(log_format)
file_handler = logging.FileHandler(settings.ANALYTICS_LOG_PATH)
file_handler.setFormatter(formatter)

# Logger used by the functions in this module; records at INFO and above are
# passed to file_handler.
logger = logging.getLogger("zulip.management")
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)

# You can't subtract timedelta.max from a datetime, so use this instead
# (1000 * 365 days). It is passed as a CountStat interval when the
# measurement window should reach back as far as practical.
TIMEDELTA_MAX = timedelta(days=365*1000)
|
analytics: Simplify frequency and measurement interval options.
Change the CountStat object to take an is_gauge variable instead of a
smallest_interval variable. Previously, (smallest_interval, frequency)
could be any of (hour, hour), (hour, day), (hour, gauge), (day, hour),
(day, day), or (day, gauge).
The current change is equivalent to excluding (hour, day) and (day, hour)
from the list above.
This change, along with other recent changes, allows us to simplify how we
handle time intervals. This commit also removes the TimeInterval object.
2016-10-14 00:15:46 +02:00
|
|
|
|
2016-07-29 21:52:45 +02:00
|
|
|
class CountStat(object):
    """Description of one analytics statistic.

    Attributes:
        property: name of the stat; used as the `property` column value and
            as the key in COUNT_STATS.
        data_collector: DataCollector giving the output table and the
            function used to pull data for this stat.
        frequency: CountStat.HOUR or CountStat.DAY.
        interval: timedelta subtracted from an end_time to get the start of
            the measurement window. Defaults to one hour or one day,
            matching `frequency`.
        is_logging: False here; LoggingCountStat sets it to True.
        custom_pull_function: None here; CustomPullCountStat sets it.

    Raises:
        AssertionError: if `frequency` is not in FREQUENCIES.
    """
    HOUR = 'hour'
    DAY = 'day'
    FREQUENCIES = frozenset([HOUR, DAY])

    def __init__(self, property, data_collector, frequency, interval=None):
        # type: (str, DataCollector, str, Optional[timedelta]) -> None
        # Validate before touching any state so a rejected frequency never
        # leaves a half-initialized object behind.
        if frequency not in self.FREQUENCIES:
            raise AssertionError("Unknown frequency: %s" % (frequency,))
        self.property = property
        self.data_collector = data_collector
        # might have to do something different for bitfields
        self.frequency = frequency
        if interval is not None:
            self.interval = interval
        elif frequency == CountStat.HOUR:
            self.interval = timedelta(hours=1)
        else:  # frequency == CountStat.DAY
            self.interval = timedelta(days=1)
        self.is_logging = False
        self.custom_pull_function = None  # type: Optional[Callable[[CountStat, datetime, datetime], None]]

    def __unicode__(self):
        # type: () -> Text
        return u"<CountStat: %s>" % (self.property,)

    def __repr__(self):
        # type: () -> str
        # __unicode__ is only consulted on Python 2; defining __repr__ gives
        # a readable form for repr()/str() and in tracebacks everywhere.
        return "<%s: %s>" % (type(self).__name__, self.property)
|
|
|
|
|
2017-02-15 17:26:22 +01:00
|
|
|
class LoggingCountStat(CountStat):
    """CountStat flagged with is_logging=True.

    Its DataCollector has no pull function; do_fill_count_stat_at_hour skips
    the pull step for stats whose is_logging is True.
    """

    def __init__(self, property, output_table, frequency):
        # type: (str, Type[BaseCount], str) -> None
        collector = DataCollector(output_table, None)
        super(LoggingCountStat, self).__init__(property, collector, frequency)
        self.is_logging = True
|
|
|
|
|
2017-03-16 09:23:44 +01:00
|
|
|
class CustomPullCountStat(CountStat):
    """CountStat whose data is gathered by a caller-supplied function.

    The function is stored as custom_pull_function and is invoked as
    custom_pull_function(stat, start_time, end_time) by
    do_fill_count_stat_at_hour; the DataCollector carries no pull function.
    """

    def __init__(self, property, output_table, frequency, custom_pull_function):
        # type: (str, Type[BaseCount], str, Callable[[CountStat, datetime, datetime], None]) -> None
        collector = DataCollector(output_table, None)
        super(CustomPullCountStat, self).__init__(property, collector, frequency)
        self.custom_pull_function = custom_pull_function
|
|
|
|
|
2017-04-02 00:49:53 +02:00
|
|
|
class DataCollector(object):
    """Pairs an output table with the function that fills it.

    Attributes:
        output_table: the count table class rows are written to.
        pull_function: callable invoked as
            pull_function(stat, start_time, end_time), or None.
    """

    def __init__(self, output_table, pull_function):
        # type: (Type[BaseCount], Optional[Callable[[CountStat, datetime, datetime], None]]) -> None
        self.pull_function = pull_function
        self.output_table = output_table
|
2016-07-29 21:52:45 +02:00
|
|
|
|
2017-01-07 09:19:37 +01:00
|
|
|
def do_update_fill_state(fill_state, end_time, state):
    # type: (FillState, datetime, int) -> None
    """Record a new (end_time, state) pair on fill_state and save it."""
    fill_state.state = state
    fill_state.end_time = end_time
    fill_state.save()
|
|
|
|
|
2016-10-12 23:40:48 +02:00
|
|
|
def process_count_stat(stat, fill_to_time):
    # type: (CountStat, datetime) -> None
    """Fill `stat` hour by hour up to and including fill_to_time.

    Progress is tracked in the FillState row for stat.property:
      * no row yet: one is created, starting from installation_epoch();
      * state STARTED: the counts at that end_time are deleted and the
        fill state is rolled back by one hour;
      * state DONE: filling resumes after the recorded end_time.

    Raises AssertionError for any other FillState.state value.
    """
    one_hour = timedelta(hours=1)
    prop = stat.property
    fill_state = FillState.objects.filter(property=prop).first()

    if fill_state is None:
        last_filled = installation_epoch()
        fill_state = FillState.objects.create(
            property=prop, end_time=last_filled, state=FillState.DONE)
        logger.info("INITIALIZED %s %s" % (prop, last_filled))
    elif fill_state.state == FillState.STARTED:
        # A previous run stopped partway through this hour: undo it.
        logger.info("UNDO START %s %s" % (prop, fill_state.end_time))
        do_delete_counts_at_hour(stat, fill_state.end_time)
        last_filled = fill_state.end_time - one_hour
        do_update_fill_state(fill_state, last_filled, FillState.DONE)
        logger.info("UNDO DONE %s" % (prop,))
    elif fill_state.state == FillState.DONE:
        last_filled = fill_state.end_time
    else:
        raise AssertionError("Unknown value for FillState.state: %s." % (fill_state.state,))

    next_end = last_filled + one_hour
    while next_end <= fill_to_time:
        logger.info("START %s %s" % (prop, next_end))
        started_at = time.time()
        # Mark the hour as STARTED before writing, so that an interrupted
        # run is detected and undone on the next call.
        do_update_fill_state(fill_state, next_end, FillState.STARTED)
        do_fill_count_stat_at_hour(stat, next_end)
        do_update_fill_state(fill_state, next_end, FillState.DONE)
        finished_at = time.time()
        next_end = next_end + one_hour
        logger.info("DONE %s (%dms)" % (prop, (finished_at-started_at)*1000))
|
2016-10-12 23:40:48 +02:00
|
|
|
|
analytics: Simplify frequency and measurement interval options.
Change the CountStat object to take an is_gauge variable instead of a
smallest_interval variable. Previously, (smallest_interval, frequency)
could be any of (hour, hour), (hour, day), (hour, gauge), (day, hour),
(day, day), or (day, gauge).
The current change is equivalent to excluding (hour, day) and (day, hour)
from the list above.
This change, along with other recent changes, allows us to simplify how we
handle time intervals. This commit also removes the TimeInterval object.
2016-10-14 00:15:46 +02:00
|
|
|
# We assume end_time is on an hour boundary, and is timezone aware.
|
|
|
|
# It is the caller's responsibility to enforce this!
|
2016-10-12 23:40:48 +02:00
|
|
|
def do_fill_count_stat_at_hour(stat, end_time):
    # type: (CountStat, datetime) -> None
    """Pull (when applicable) and aggregate `stat` for the window ending at end_time.

    Daily stats are only processed when end_time is exactly a day boundary.
    The window starts at end_time - stat.interval.
    """
    is_daily = stat.frequency == CountStat.DAY
    if is_daily and floor_to_day(end_time) != end_time:
        return

    window_start = end_time - stat.interval
    custom_pull = stat.custom_pull_function
    if custom_pull is not None:
        custom_pull(stat, window_start, end_time)
    elif not stat.is_logging:
        # Logging stats have no pull step; everything else uses its
        # DataCollector's pull function.
        stat.data_collector.pull_function(stat, window_start, end_time)
    do_aggregate_to_summary_table(stat, end_time)
|
2016-07-29 21:52:45 +02:00
|
|
|
|
2017-02-15 04:10:03 +01:00
|
|
|
def do_delete_counts_at_hour(stat, end_time):
    # type: (CountStat, datetime) -> None
    """Delete the rows for stat.property at end_time.

    For logging stats only InstallationCount is cleared, plus RealmCount
    when the stat's output table is UserCount or StreamCount. For all other
    stats all four count tables are cleared.
    """
    if stat.is_logging:
        tables = [InstallationCount]
        if stat.data_collector.output_table in [UserCount, StreamCount]:
            tables.append(RealmCount)
    else:
        tables = [UserCount, StreamCount, RealmCount, InstallationCount]

    for table in tables:
        table.objects.filter(property=stat.property, end_time=end_time).delete()
|
2016-10-12 23:40:48 +02:00
|
|
|
|
2017-01-07 03:23:44 +01:00
|
|
|
def do_drop_all_analytics_tables():
    # type: () -> None
    """Delete every row from the count tables, FillState and Anomaly."""
    for table in (UserCount, StreamCount, RealmCount, InstallationCount,
                  FillState, Anomaly):
        table.objects.all().delete()
|
2017-01-07 03:23:44 +01:00
|
|
|
|
2017-01-16 22:05:51 +01:00
|
|
|
def do_aggregate_to_summary_table(stat, end_time):
    # type: (CountStat, datetime) -> None
    """Roll the rows for `stat` at end_time up into the summary tables.

    If the stat's output table is UserCount or StreamCount, its rows are
    first summed per (realm, subgroup) into analytics_realmcount. Then the
    analytics_realmcount rows are summed per subgroup into
    analytics_installationcount.

    The table name and property are interpolated into the SQL text;
    end_time is passed as a query parameter (hence the doubled %% in the
    templates). The cursor is closed even if a query raises.
    """
    cursor = connection.cursor()
    try:
        # Aggregate into RealmCount
        output_table = stat.data_collector.output_table
        if output_table in (UserCount, StreamCount):
            realmcount_query = """
                INSERT INTO analytics_realmcount
                    (realm_id, value, property, subgroup, end_time)
                SELECT
                    zerver_realm.id, COALESCE(sum(%(output_table)s.value), 0), '%(property)s',
                    %(output_table)s.subgroup, %%(end_time)s
                FROM zerver_realm
                JOIN %(output_table)s
                ON
                    zerver_realm.id = %(output_table)s.realm_id
                WHERE
                    %(output_table)s.property = '%(property)s' AND
                    %(output_table)s.end_time = %%(end_time)s
                GROUP BY zerver_realm.id, %(output_table)s.subgroup
            """ % {'output_table': output_table._meta.db_table,
                   'property': stat.property}
            start = time.time()
            cursor.execute(realmcount_query, {'end_time': end_time})
            end = time.time()
            logger.info("%s RealmCount aggregation (%dms/%sr)" % (stat.property, (end-start)*1000, cursor.rowcount))

        # Aggregate into InstallationCount
        installationcount_query = """
            INSERT INTO analytics_installationcount
                (value, property, subgroup, end_time)
            SELECT
                sum(value), '%(property)s', analytics_realmcount.subgroup, %%(end_time)s
            FROM analytics_realmcount
            WHERE
                property = '%(property)s' AND
                end_time = %%(end_time)s
            GROUP BY analytics_realmcount.subgroup
        """ % {'property': stat.property}
        start = time.time()
        cursor.execute(installationcount_query, {'end_time': end_time})
        end = time.time()
        logger.info("%s InstallationCount aggregation (%dms/%sr)" % (stat.property, (end-start)*1000, cursor.rowcount))
    finally:
        # Release the cursor on both the success and the error path.
        cursor.close()
|
|
|
|
|
2016-10-25 21:01:21 +02:00
|
|
|
# This is the only method that hits the prod databases directly.
|
2017-04-02 02:43:17 +02:00
|
|
|
def do_pull_from_zerver(stat, start_time, end_time, query, group_by):
    # type: (CountStat, datetime, datetime, str, Optional[Tuple[models.Model, str]]) -> None
    """Run one of the count_*/check_* query templates for `stat`.

    Args:
        stat: supplies the `property` value written into the output rows.
        start_time, end_time: passed to the query as the time_start and
            time_end parameters.
        query: SQL template using %(property)s, %(subgroup)s and
            %(group_by_clause)s, with %%(time_start)s / %%(time_end)s for
            the driver-level parameters.
        group_by: None, or a (model, column name) pair; the subgroup is
            then "<model's db_table>.<column>" and is also appended to the
            GROUP BY clause.

    The cursor is closed even if the query raises.
    """
    if group_by is None:
        subgroup = 'NULL'
        group_by_clause = ''
    else:
        subgroup = '%s.%s' % (group_by[0]._meta.db_table, group_by[1])
        group_by_clause = ', ' + subgroup

    # We do string replacement here because passing group_by_clause as a param
    # may result in problems when running cursor.execute; we do
    # the string formatting prior so that cursor.execute runs it as sql
    query_ = query % {'property': stat.property, 'subgroup': subgroup,
                      'group_by_clause': group_by_clause}
    cursor = connection.cursor()
    try:
        start = time.time()
        cursor.execute(query_, {'time_start': start_time, 'time_end': end_time})
        end = time.time()
        logger.info("%s do_pull_from_zerver (%dms/%sr)" % (stat.property, (end-start)*1000, cursor.rowcount))
    finally:
        # Release the cursor on both the success and the error path.
        cursor.close()
|
|
|
|
|
2017-04-02 02:43:17 +02:00
|
|
|
def zerver_data_collector(output_table, query, group_by):
    # type: (Type[BaseCount], str, Optional[Tuple[models.Model, str]]) -> DataCollector
    """Build a DataCollector whose pull function runs `query` via do_pull_from_zerver.

    `query` and `group_by` are captured by the closure and forwarded
    unchanged on every call.
    """
    def _pull(stat, start_time, end_time):
        # type: (CountStat, datetime, datetime) -> None
        do_pull_from_zerver(stat, start_time, end_time, query, group_by)

    collector = DataCollector(output_table, _pull)
    return collector
|
|
|
|
|
2017-02-15 17:26:22 +01:00
|
|
|
# called from zerver/lib/actions.py; should not throw any errors
|
|
|
|
def do_increment_logging_stat(zerver_object, stat, subgroup, event_time, increment=1):
    # type: (Union[Realm, UserProfile, Stream], CountStat, Optional[Union[str, int, bool]], datetime, int) -> None
    """Add `increment` to the logging-stat row that covers event_time.

    The row lives in stat.data_collector.output_table and is keyed by the
    realm (plus the user or stream, depending on the table), the stat's
    property, the subgroup, and event_time rounded up to the end of the
    stat's day or hour. The row is created with value=increment if missing;
    otherwise its value is incremented with an F() expression.
    """
    table = stat.data_collector.output_table
    if table == RealmCount:
        id_args = {'realm': zerver_object}
    else:
        # Any table other than RealmCount/UserCount is treated as StreamCount.
        owner_field = 'user' if table == UserCount else 'stream'
        id_args = {'realm': zerver_object.realm, owner_field: zerver_object}

    # Anything that is not a DAY stat is treated as CountStat.HOUR.
    round_up = ceiling_to_day if stat.frequency == CountStat.DAY else ceiling_to_hour
    end_time = round_up(event_time)

    row, created = table.objects.get_or_create(
        property=stat.property, subgroup=subgroup, end_time=end_time,
        defaults={'value': increment}, **id_args)
    if created:
        return
    row.value = F('value') + increment
    row.save(update_fields=['value'])
|
|
|
|
|
2017-04-01 08:06:36 +02:00
|
|
|
# Hardcodes the query needed by active_users:is_bot:day, since that is
# currently the only stat that uses this.
#
# Inserts one analytics_realmcount row per (realm, subgroup) counting the
# active user profiles that joined within [time_start, time_end), for realms
# created before time_end.
#
# Template for do_pull_from_zerver: %(property)s, %(subgroup)s and
# %(group_by_clause)s are substituted with Python %-formatting first; the
# doubled %%(time_start)s / %%(time_end)s survive that step and are bound as
# parameters by cursor.execute.
count_user_by_realm_query = """
    INSERT INTO analytics_realmcount
        (realm_id, value, property, subgroup, end_time)
    SELECT
        zerver_realm.id, count(*),'%(property)s', %(subgroup)s, %%(time_end)s
    FROM zerver_realm
    JOIN zerver_userprofile
    ON
        zerver_realm.id = zerver_userprofile.realm_id
    WHERE
        zerver_realm.date_created < %%(time_end)s AND
        zerver_userprofile.date_joined >= %%(time_start)s AND
        zerver_userprofile.date_joined < %%(time_end)s AND
        zerver_userprofile.is_active = TRUE
    GROUP BY zerver_realm.id %(group_by_clause)s
"""
|
|
|
|
|
|
|
|
# currently .sender_id is only Message specific thing
#
# Inserts one analytics_usercount row per (sender, subgroup) counting the
# messages with pub_date in [time_start, time_end), for users who joined
# before time_end.
#
# Template for do_pull_from_zerver: %(property)s, %(subgroup)s and
# %(group_by_clause)s are substituted with Python %-formatting first; the
# doubled %%(time_start)s / %%(time_end)s are bound by cursor.execute.
count_message_by_user_query = """
    INSERT INTO analytics_usercount
        (user_id, realm_id, value, property, subgroup, end_time)
    SELECT
        zerver_userprofile.id, zerver_userprofile.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
    FROM zerver_userprofile
    JOIN zerver_message
    ON
        zerver_userprofile.id = zerver_message.sender_id
    WHERE
        zerver_userprofile.date_joined < %%(time_end)s AND
        zerver_message.pub_date >= %%(time_start)s AND
        zerver_message.pub_date < %%(time_end)s
    GROUP BY zerver_userprofile.id %(group_by_clause)s
"""
|
|
|
|
|
2016-12-17 03:26:39 +01:00
|
|
|
# Currently unused and untested
#
# Inserts one analytics_realmcount row per (realm, subgroup) counting the
# streams created within [time_start, time_end), for realms created before
# time_end.
#
# Template for do_pull_from_zerver: %(property)s, %(subgroup)s and
# %(group_by_clause)s are substituted with Python %-formatting first; the
# doubled %%(time_start)s / %%(time_end)s are bound by cursor.execute.
#
# The ON condition must not end in AND: a trailing AND directly before WHERE
# is a SQL syntax error.
count_stream_by_realm_query = """
    INSERT INTO analytics_realmcount
        (realm_id, value, property, subgroup, end_time)
    SELECT
        zerver_realm.id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
    FROM zerver_realm
    JOIN zerver_stream
    ON
        zerver_realm.id = zerver_stream.realm_id
    WHERE
        zerver_realm.date_created < %%(time_end)s AND
        zerver_stream.date_created >= %%(time_start)s AND
        zerver_stream.date_created < %%(time_end)s
    GROUP BY zerver_realm.id %(group_by_clause)s
"""
|
|
|
|
|
2016-11-01 00:17:29 +01:00
|
|
|
# This query violates the count_X_by_Y_query conventions in several ways. One,
# the X table is not specified by the query name; MessageType is not a zerver
# table. Two, it ignores the subgroup column in the CountStat object; instead,
# it uses 'message_type' from the subquery to fill in the subgroup column.
#
# The subquery counts each sender's messages with pub_date in
# [time_start, time_end) per (recipient type, stream invite_only) and labels
# each group: recipient type 1 -> 'private_message', type 3 ->
# 'huddle_message', otherwise 'private_stream' when the joined stream is
# invite-only and 'public_stream' in every remaining case. The outer query
# sums those counts per (realm, user, message_type).
#
# Only %(property)s is substituted by Python %-formatting here; the doubled
# %%(time_start)s / %%(time_end)s are bound by cursor.execute.
count_message_type_by_user_query = """
    INSERT INTO analytics_usercount
            (realm_id, user_id, value, property, subgroup, end_time)
    SELECT realm_id, id, SUM(count) AS value, '%(property)s', message_type, %%(time_end)s
    FROM
        (
        SELECT zerver_userprofile.realm_id, zerver_userprofile.id, count(*),
        CASE WHEN
                  zerver_recipient.type = 1 THEN 'private_message'
             WHEN
                  zerver_recipient.type = 3 THEN 'huddle_message'
             WHEN
                  zerver_stream.invite_only = TRUE THEN 'private_stream'
             ELSE 'public_stream'
        END
        message_type

        FROM zerver_userprofile
        JOIN zerver_message
        ON
            zerver_userprofile.id = zerver_message.sender_id AND
            zerver_message.pub_date >= %%(time_start)s AND
            zerver_message.pub_date < %%(time_end)s
        JOIN zerver_recipient
        ON
            zerver_message.recipient_id = zerver_recipient.id
        LEFT JOIN zerver_stream
        ON
            zerver_recipient.type_id = zerver_stream.id
        GROUP BY zerver_userprofile.realm_id, zerver_userprofile.id, zerver_recipient.type, zerver_stream.invite_only
        ) AS subquery
    GROUP BY realm_id, id, message_type
"""
|
|
|
|
|
2016-12-18 19:10:58 +01:00
|
|
|
# Note that this query also joins to the UserProfile table, since all
# current queries that use this also subgroup on UserProfile.is_bot. If in
# the future there is a query that counts messages by stream and doesn't need
# the UserProfile table, consider writing a new query for efficiency.
#
# Inserts one analytics_streamcount row per (stream, subgroup) counting the
# messages with pub_date in [time_start, time_end) whose recipient has
# type = 2 and type_id equal to the stream id, for streams created before
# time_end.
#
# Template for do_pull_from_zerver: %(property)s, %(subgroup)s and
# %(group_by_clause)s are substituted with Python %-formatting first; the
# doubled %%(time_start)s / %%(time_end)s are bound by cursor.execute.
count_message_by_stream_query = """
    INSERT INTO analytics_streamcount
        (stream_id, realm_id, value, property, subgroup, end_time)
    SELECT
        zerver_stream.id, zerver_stream.realm_id, count(*), '%(property)s', %(subgroup)s, %%(time_end)s
    FROM zerver_stream
    JOIN zerver_recipient
    ON
        zerver_stream.id = zerver_recipient.type_id
    JOIN zerver_message
    ON
        zerver_recipient.id = zerver_message.recipient_id
    JOIN zerver_userprofile
    ON
        zerver_message.sender_id = zerver_userprofile.id
    WHERE
        zerver_stream.date_created < %%(time_end)s AND
        zerver_recipient.type = 2 AND
        zerver_message.pub_date >= %%(time_start)s AND
        zerver_message.pub_date < %%(time_end)s
    GROUP BY zerver_stream.id %(group_by_clause)s
"""
|
|
|
|
|
2017-03-16 07:58:23 +01:00
|
|
|
check_useractivityinterval_by_user_query = """
|
|
|
|
INSERT INTO analytics_usercount
|
|
|
|
(user_id, realm_id, value, property, subgroup, end_time)
|
|
|
|
SELECT
|
|
|
|
zerver_userprofile.id, zerver_userprofile.realm_id, 1, '%(property)s', %(subgroup)s, %%(time_end)s
|
|
|
|
FROM zerver_userprofile
|
|
|
|
JOIN zerver_useractivityinterval
|
|
|
|
ON
|
|
|
|
zerver_userprofile.id = zerver_useractivityinterval.user_profile_id
|
|
|
|
WHERE
|
|
|
|
zerver_useractivityinterval.end >= %%(time_start)s AND
|
|
|
|
zerver_useractivityinterval.start < %%(time_end)s
|
|
|
|
GROUP BY zerver_userprofile.id %(group_by_clause)s
|
|
|
|
"""
|
|
|
|
|
2017-04-01 03:26:35 +02:00
|
|
|
# Currently hardcodes the query needed for active_users_audit:is_bot:day.
# Assumes that a user cannot have two RealmAuditLog entries with the same event_time and
# event_type in ['user_created', 'user_deactivated', etc].
# In particular, it's important to ensure that migrations don't cause that to happen.
#
# The ral2 subquery finds, per modified_user_id, the latest event_time before
# time_end among the four listed event types. Joining it back to ral1 selects
# that latest audit-log row, and a row with value 1 is inserted only when
# that row's event_type is 'user_created', 'user_activated' or
# 'user_reactivated'.
#
# Template for do_pull_from_zerver: %(property)s and %(subgroup)s are
# substituted with Python %-formatting first; the doubled %%(time_end)s is
# bound by cursor.execute. This query has no GROUP BY and does not use
# %(group_by_clause)s or time_start.
check_realmauditlog_by_user_query = """
    INSERT INTO analytics_usercount
        (user_id, realm_id, value, property, subgroup, end_time)
    SELECT
        ral1.modified_user_id, ral1.realm_id, 1, '%(property)s', %(subgroup)s, %%(time_end)s
    FROM zerver_realmauditlog ral1
    JOIN (
        SELECT modified_user_id, max(event_time) AS max_event_time
        FROM zerver_realmauditlog
        WHERE
            event_type in ('user_created', 'user_deactivated', 'user_activated', 'user_reactivated') AND
            event_time < %%(time_end)s
        GROUP BY modified_user_id
    ) ral2
    ON
        ral1.event_time = max_event_time AND
        ral1.modified_user_id = ral2.modified_user_id
    JOIN zerver_userprofile
    ON
        ral1.modified_user_id = zerver_userprofile.id
    WHERE
        ral1.event_type in ('user_created', 'user_activated', 'user_reactivated')
"""
|
|
|
|
|
2017-03-16 09:23:44 +01:00
|
|
|
def do_pull_minutes_active(stat, start_time, end_time):
    # type: (CountStat, datetime, datetime) -> None
    """Write UserCount rows with each user's whole minutes of activity.

    Every UserActivityInterval overlapping (start_time, end_time) is clipped
    to that window and its length in seconds is summed per
    (user_id, realm_id). Users with fewer than 60 seconds in total get no
    row; for everyone else the value is the total rounded down to minutes.
    """
    timer_start = time.time()
    intervals = UserActivityInterval.objects.filter(
        end__gt=start_time, start__lt=end_time
    ).select_related(
        'user_profile'
    ).values_list(
        'user_profile_id', 'user_profile__realm_id', 'start', 'end')

    totals = defaultdict(float)  # type: Dict[Tuple[int, int], float]
    for user_id, realm_id, interval_start, interval_end in intervals:
        # Count only the part of the interval inside the window.
        clipped_start = max(start_time, interval_start)
        clipped_end = min(end_time, interval_end)
        totals[(user_id, realm_id)] += (clipped_end - clipped_start).total_seconds()

    rows = []
    for (user_id, realm_id), seconds in totals.items():
        if seconds >= 60:
            rows.append(UserCount(user_id=user_id, realm_id=realm_id,
                                  property=stat.property, end_time=end_time,
                                  value=int(seconds // 60)))
    UserCount.objects.bulk_create(rows)

    elapsed_ms = (time.time()-timer_start)*1000
    logger.info("%s do_pull_minutes_active (%dms/%sr)" %
                (stat.property, elapsed_ms, len(rows)))
|
|
|
|
|
2017-02-15 04:17:00 +01:00
|
|
|
# Registry of every stat this module knows how to fill. Each entry names the
# stat's property, where its rows are written and how they are gathered, and
# its frequency.
count_stats_ = [
    CountStat('messages_sent:is_bot:hour',
              zerver_data_collector(UserCount, count_message_by_user_query, (UserProfile, 'is_bot')),
              CountStat.HOUR),
    CountStat('messages_sent:message_type:day',
              zerver_data_collector(UserCount, count_message_type_by_user_query, None), CountStat.DAY),
    CountStat('messages_sent:client:day',
              zerver_data_collector(UserCount, count_message_by_user_query, (Message, 'sending_client_id')),
              CountStat.DAY),
    CountStat('messages_in_stream:is_bot:day',
              zerver_data_collector(StreamCount, count_message_by_stream_query, (UserProfile, 'is_bot')),
              CountStat.DAY),

    # Sanity check on the bottom two stats. Is only an approximation,
    # e.g. if a user is deactivated between the end of the day and when this
    # stat is run, they won't be counted.
    CountStat('active_users:is_bot:day',
              zerver_data_collector(RealmCount, count_user_by_realm_query, (UserProfile, 'is_bot')),
              CountStat.DAY, interval=TIMEDELTA_MAX),
    # In RealmCount, 'active_humans_audit::day' should be the partial sum sequence
    # of 'active_users_log:is_bot:day', for any realm that started after the
    # latter stat was introduced.
    # NOTE(review): no stat named 'active_humans_audit::day' is registered in
    # this list; presumably 'active_users_audit:is_bot:day' is meant -- confirm.
    # 'active_users_audit:is_bot:day' is the canonical record of which users were
    # active on which days (in the UserProfile.is_active sense).
    CountStat('active_users_audit:is_bot:day',
              zerver_data_collector(UserCount, check_realmauditlog_by_user_query, (UserProfile, 'is_bot')),
              CountStat.DAY),
    LoggingCountStat('active_users_log:is_bot:day', RealmCount, CountStat.DAY),

    # The minutes=15 part is due to the 15 minutes added in
    # zerver.lib.actions.do_update_user_activity_interval.
    CountStat('15day_actives::day',
              zerver_data_collector(UserCount, check_useractivityinterval_by_user_query, None),
              CountStat.DAY, interval=timedelta(days=15)-timedelta(minutes=15)),
    CustomPullCountStat('minutes_active::day', UserCount, CountStat.DAY, do_pull_minutes_active)
]

# Lookup from property name to its stat object.
COUNT_STATS = {stat.property: stat for stat in count_stats_}
|