mirror of https://github.com/zulip/zulip.git
analytics: Only update daily stats on day boundaries.
Previously, we would update FillState for daily stats on hourly boundaries as well. This created two extra queries on the FillState table every hour (for each daily CountStat), which added roughly 50ms of extra processing for each such CountStat each day, as well as two extra lines each hour in the analytics log. This can be a minor annoyance when backfilling stats.
This commit is contained in:
parent
2535f6c8f2
commit
5e49da9285
|
@ -80,6 +80,13 @@ class DataCollector(object):
|
||||||
|
|
||||||
def process_count_stat(stat, fill_to_time):
|
def process_count_stat(stat, fill_to_time):
|
||||||
# type: (CountStat, datetime) -> None
|
# type: (CountStat, datetime) -> None
|
||||||
|
if stat.frequency == CountStat.HOUR:
|
||||||
|
time_increment = timedelta(hours=1)
|
||||||
|
elif stat.frequency == CountStat.DAY:
|
||||||
|
time_increment = timedelta(days=1)
|
||||||
|
else:
|
||||||
|
raise AssertionError("Unknown frequency: %s" % (stat.frequency,))
|
||||||
|
|
||||||
fill_state = FillState.objects.filter(property=stat.property).first()
|
fill_state = FillState.objects.filter(property=stat.property).first()
|
||||||
if fill_state is None:
|
if fill_state is None:
|
||||||
currently_filled = installation_epoch()
|
currently_filled = installation_epoch()
|
||||||
|
@ -90,7 +97,7 @@ def process_count_stat(stat, fill_to_time):
|
||||||
elif fill_state.state == FillState.STARTED:
|
elif fill_state.state == FillState.STARTED:
|
||||||
logger.info("UNDO START %s %s" % (stat.property, fill_state.end_time))
|
logger.info("UNDO START %s %s" % (stat.property, fill_state.end_time))
|
||||||
do_delete_counts_at_hour(stat, fill_state.end_time)
|
do_delete_counts_at_hour(stat, fill_state.end_time)
|
||||||
currently_filled = fill_state.end_time - timedelta(hours = 1)
|
currently_filled = fill_state.end_time - time_increment
|
||||||
do_update_fill_state(fill_state, currently_filled, FillState.DONE)
|
do_update_fill_state(fill_state, currently_filled, FillState.DONE)
|
||||||
logger.info("UNDO DONE %s" % (stat.property,))
|
logger.info("UNDO DONE %s" % (stat.property,))
|
||||||
elif fill_state.state == FillState.DONE:
|
elif fill_state.state == FillState.DONE:
|
||||||
|
@ -107,7 +114,7 @@ def process_count_stat(stat, fill_to_time):
|
||||||
return
|
return
|
||||||
fill_to_time = min(fill_to_time, dependency_fill_time)
|
fill_to_time = min(fill_to_time, dependency_fill_time)
|
||||||
|
|
||||||
currently_filled = currently_filled + timedelta(hours = 1)
|
currently_filled = currently_filled + time_increment
|
||||||
while currently_filled <= fill_to_time:
|
while currently_filled <= fill_to_time:
|
||||||
logger.info("START %s %s" % (stat.property, currently_filled))
|
logger.info("START %s %s" % (stat.property, currently_filled))
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
@ -115,7 +122,7 @@ def process_count_stat(stat, fill_to_time):
|
||||||
do_fill_count_stat_at_hour(stat, currently_filled)
|
do_fill_count_stat_at_hour(stat, currently_filled)
|
||||||
do_update_fill_state(fill_state, currently_filled, FillState.DONE)
|
do_update_fill_state(fill_state, currently_filled, FillState.DONE)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
currently_filled = currently_filled + timedelta(hours = 1)
|
currently_filled = currently_filled + time_increment
|
||||||
logger.info("DONE %s (%dms)" % (stat.property, (end-start)*1000))
|
logger.info("DONE %s (%dms)" % (stat.property, (end-start)*1000))
|
||||||
|
|
||||||
def do_update_fill_state(fill_state, end_time, state):
|
def do_update_fill_state(fill_state, end_time, state):
|
||||||
|
@ -124,13 +131,10 @@ def do_update_fill_state(fill_state, end_time, state):
|
||||||
fill_state.state = state
|
fill_state.state = state
|
||||||
fill_state.save()
|
fill_state.save()
|
||||||
|
|
||||||
# We assume end_time is on an hour boundary, and is timezone aware.
|
# We assume end_time is valid (e.g. is on a day or hour boundary as appropriate)
|
||||||
# It is the caller's responsibility to enforce this!
|
# and is timezone aware. It is the caller's responsibility to enforce this!
|
||||||
def do_fill_count_stat_at_hour(stat, end_time):
|
def do_fill_count_stat_at_hour(stat, end_time):
|
||||||
# type: (CountStat, datetime) -> None
|
# type: (CountStat, datetime) -> None
|
||||||
if stat.frequency == CountStat.DAY and (end_time != floor_to_day(end_time)):
|
|
||||||
return
|
|
||||||
|
|
||||||
start_time = end_time - stat.interval
|
start_time = end_time - stat.interval
|
||||||
if not isinstance(stat, LoggingCountStat):
|
if not isinstance(stat, LoggingCountStat):
|
||||||
timer = time.time()
|
timer = time.time()
|
||||||
|
|
|
@ -195,6 +195,13 @@ class TestProcessCountStat(AnalyticsTestCase):
|
||||||
self.assertFillStateEquals(stat, current_time)
|
self.assertFillStateEquals(stat, current_time)
|
||||||
self.assertEqual(InstallationCount.objects.filter(property=stat.property).count(), 2)
|
self.assertEqual(InstallationCount.objects.filter(property=stat.property).count(), 2)
|
||||||
|
|
||||||
|
def test_off_boundary_fill_to_time(self):
|
||||||
|
# type: () -> None
|
||||||
|
stat = self.make_dummy_count_stat('test stat')
|
||||||
|
process_count_stat(stat, installation_epoch() + 65*self.MINUTE)
|
||||||
|
self.assertFillStateEquals(stat, installation_epoch() + self.HOUR)
|
||||||
|
self.assertEqual(InstallationCount.objects.filter(property=stat.property).count(), 1)
|
||||||
|
|
||||||
# This tests the LoggingCountStat branch of the code in do_delete_counts_at_hour.
|
# This tests the LoggingCountStat branch of the code in do_delete_counts_at_hour.
|
||||||
# It is important that do_delete_counts_at_hour not delete any of the collected
|
# It is important that do_delete_counts_at_hour not delete any of the collected
|
||||||
# logging data!
|
# logging data!
|
||||||
|
@ -277,6 +284,19 @@ class TestProcessCountStat(AnalyticsTestCase):
|
||||||
['stat3', hour[1]], ['stat3', hour[2]]])
|
['stat3', hour[1]], ['stat3', hour[2]]])
|
||||||
self.assertFillStateEquals(stat3, hour[2])
|
self.assertFillStateEquals(stat3, hour[2])
|
||||||
|
|
||||||
|
# test daily dependent stat with hourly dependencies
|
||||||
|
query = """INSERT INTO analytics_realmcount (realm_id, value, property, end_time)
|
||||||
|
VALUES (%s, 1, '%s', %%%%(time_end)s)""" % (self.default_realm.id, 'stat4')
|
||||||
|
stat4 = DependentCountStat('stat4', sql_data_collector(RealmCount, query, None), CountStat.DAY,
|
||||||
|
dependencies=['stat1', 'stat2'])
|
||||||
|
hour24 = installation_epoch() + 24*self.HOUR
|
||||||
|
hour25 = installation_epoch() + 25*self.HOUR
|
||||||
|
process_count_stat(stat1, hour25)
|
||||||
|
process_count_stat(stat2, hour25)
|
||||||
|
process_count_stat(stat4, hour25)
|
||||||
|
self.assertEqual(InstallationCount.objects.filter(property='stat4').count(), 1)
|
||||||
|
self.assertFillStateEquals(stat4, hour24)
|
||||||
|
|
||||||
class TestCountStats(AnalyticsTestCase):
|
class TestCountStats(AnalyticsTestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
# type: () -> None
|
# type: () -> None
|
||||||
|
|
Loading…
Reference in New Issue