analytics: Only update daily stats on day boundaries.

Previously we would also update FillState for daily stats on hourly
boundaries. This created two extra queries on the FillState table every hour
(for each CountStat), which added roughly 50ms of extra processing for each
CountStat each day, as well as two extra lines each hour in the analytics
log. This could be a minor annoyance when backfilling stats.
This commit is contained in:
Rishi Gupta 2017-04-15 00:23:39 -07:00 committed by Tim Abbott
parent 2535f6c8f2
commit 5e49da9285
2 changed files with 32 additions and 8 deletions

View File

@ -80,6 +80,13 @@ class DataCollector(object):
def process_count_stat(stat, fill_to_time):
# type: (CountStat, datetime) -> None
if stat.frequency == CountStat.HOUR:
time_increment = timedelta(hours=1)
elif stat.frequency == CountStat.DAY:
time_increment = timedelta(days=1)
else:
raise AssertionError("Unknown frequency: %s" % (stat.frequency,))
fill_state = FillState.objects.filter(property=stat.property).first()
if fill_state is None:
currently_filled = installation_epoch()
@ -90,7 +97,7 @@ def process_count_stat(stat, fill_to_time):
elif fill_state.state == FillState.STARTED:
logger.info("UNDO START %s %s" % (stat.property, fill_state.end_time))
do_delete_counts_at_hour(stat, fill_state.end_time)
currently_filled = fill_state.end_time - timedelta(hours = 1)
currently_filled = fill_state.end_time - time_increment
do_update_fill_state(fill_state, currently_filled, FillState.DONE)
logger.info("UNDO DONE %s" % (stat.property,))
elif fill_state.state == FillState.DONE:
@ -107,7 +114,7 @@ def process_count_stat(stat, fill_to_time):
return
fill_to_time = min(fill_to_time, dependency_fill_time)
currently_filled = currently_filled + timedelta(hours = 1)
currently_filled = currently_filled + time_increment
while currently_filled <= fill_to_time:
logger.info("START %s %s" % (stat.property, currently_filled))
start = time.time()
@ -115,7 +122,7 @@ def process_count_stat(stat, fill_to_time):
do_fill_count_stat_at_hour(stat, currently_filled)
do_update_fill_state(fill_state, currently_filled, FillState.DONE)
end = time.time()
currently_filled = currently_filled + timedelta(hours = 1)
currently_filled = currently_filled + time_increment
logger.info("DONE %s (%dms)" % (stat.property, (end-start)*1000))
def do_update_fill_state(fill_state, end_time, state):
@ -124,13 +131,10 @@ def do_update_fill_state(fill_state, end_time, state):
fill_state.state = state
fill_state.save()
# We assume end_time is on an hour boundary, and is timezone aware.
# It is the caller's responsibility to enforce this!
# We assume end_time is valid (e.g. is on a day or hour boundary as appropriate)
# and is timezone aware. It is the caller's responsibility to enforce this!
def do_fill_count_stat_at_hour(stat, end_time):
# type: (CountStat, datetime) -> None
if stat.frequency == CountStat.DAY and (end_time != floor_to_day(end_time)):
return
start_time = end_time - stat.interval
if not isinstance(stat, LoggingCountStat):
timer = time.time()

View File

@ -195,6 +195,13 @@ class TestProcessCountStat(AnalyticsTestCase):
self.assertFillStateEquals(stat, current_time)
self.assertEqual(InstallationCount.objects.filter(property=stat.property).count(), 2)
def test_off_boundary_fill_to_time(self):
    # type: () -> None
    """Process a stat with a fill_to_time 65 minutes past the installation epoch.

    The assertions expect the fill state to end at the epoch plus one hour,
    and exactly one InstallationCount row to exist for the stat's property.
    """
    stat = self.make_dummy_count_stat('test stat')
    # Deliberately not on an hour boundary: one hour and five minutes in.
    off_boundary_time = installation_epoch() + 65*self.MINUTE
    process_count_stat(stat, off_boundary_time)

    expected_fill_time = installation_epoch() + self.HOUR
    self.assertFillStateEquals(stat, expected_fill_time)
    row_count = InstallationCount.objects.filter(property=stat.property).count()
    self.assertEqual(row_count, 1)
# This tests the LoggingCountStat branch of the code in do_delete_counts_at_hour.
# It is important that do_delete_counts_at_hour not delete any of the collected
# logging data!
@ -277,6 +284,19 @@ class TestProcessCountStat(AnalyticsTestCase):
['stat3', hour[1]], ['stat3', hour[2]]])
self.assertFillStateEquals(stat3, hour[2])
# test daily dependent stat with hourly dependencies
query = """INSERT INTO analytics_realmcount (realm_id, value, property, end_time)
VALUES (%s, 1, '%s', %%%%(time_end)s)""" % (self.default_realm.id, 'stat4')
stat4 = DependentCountStat('stat4', sql_data_collector(RealmCount, query, None), CountStat.DAY,
dependencies=['stat1', 'stat2'])
hour24 = installation_epoch() + 24*self.HOUR
hour25 = installation_epoch() + 25*self.HOUR
process_count_stat(stat1, hour25)
process_count_stat(stat2, hour25)
process_count_stat(stat4, hour25)
self.assertEqual(InstallationCount.objects.filter(property='stat4').count(), 1)
self.assertFillStateEquals(stat4, hour24)
class TestCountStats(AnalyticsTestCase):
def setUp(self):
# type: () -> None