From 680e7f75e1405b9689b5a14aee11c70df302cd6a Mon Sep 17 00:00:00 2001 From: Rishi Gupta Date: Mon, 16 Jan 2017 11:05:21 -0800 Subject: [PATCH] analytics: Change generate_time_series_data argument from length to days. Previously, this function seemed ambivalent about whether it was generating a series of abstract data points or a series of data points that would correspond to times. Switch firmly to the latter, so e.g. if the frequency changes, so will the length of the output sequence. --- analytics/lib/fixtures.py | 27 ++++++++++++------- .../commands/populate_analytics_db.py | 6 +++-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/analytics/lib/fixtures.py b/analytics/lib/fixtures.py index 2fe562c250..c7dc4909dd 100644 --- a/analytics/lib/fixtures.py +++ b/analytics/lib/fixtures.py @@ -11,14 +11,15 @@ from random import gauss, random, seed from six.moves import range, zip -def generate_time_series_data(length, business_hours_base, non_business_hours_base, +def generate_time_series_data(days=100, business_hours_base=10, non_business_hours_base=10, growth=1, autocorrelation=0, spikiness=1, holiday_rate=0, - frequency=CountStat.HOUR, is_gauge=False): + frequency=CountStat.DAY, is_gauge=False): # type: (int, float, float, float, float, float, float, str, bool) -> List[int] """ Generate semi-realistic looking time series data for testing analytics graphs. - length -- Number of data points returned. + days -- Number of days of data. Is the number of data points generated if + frequency is CountStat.DAY. business_hours_base -- Average value during a business hour (or day) at beginning of time series, if frequency is CountStat.HOUR (CountStat.DAY, respectively). non_business_hours_base -- The above, for non-business hours/days. @@ -28,21 +29,29 @@ def generate_time_series_data(length, business_hours_base, non_business_hours_ba function of the previous point. spikiness -- 0 means no randomness (other than holiday_rate), higher values increase the variance. - holiday_rate -- Fraction of points randomly set to 0. + holiday_rate -- Fraction of days randomly set to 0, largely for testing how we handle 0s. frequency -- Should be CountStat.HOUR or CountStat.DAY. is_gauge -- If True, return partial sum of the series. """ - if length < 2: - raise ValueError("length must be at least 2") if frequency == CountStat.HOUR: + length = days*24 seasonality = [non_business_hours_base] * 24 * 7 for day in range(5): for hour in range(8): seasonality[24*day + hour] = business_hours_base + holidays = [] + for i in range(days): + holidays.extend([random() < holiday_rate] * 24) elif frequency == CountStat.DAY: - seasonality = [business_hours_base]*5 + [non_business_hours_base]*2 + length = days + seasonality = [8*business_hours_base + 16*non_business_hours_base] * 5 + \ + [24*non_business_hours_base] * 2 + holidays = [random() < holiday_rate for i in range(days)] else: raise ValueError("Unknown frequency: %s" % (frequency,)) + if length < 2: + raise ValueError("Must be generating at least 2 data points. " + "Currently generating %s" % (length,)) growth_base = growth ** (1. / (length-1)) values_no_noise = [seasonality[i % len(seasonality)] * (growth_base**i) for i in range(length)] @@ -51,8 +60,8 @@ def generate_time_series_data(length, business_hours_base, non_business_hours_ba for i in range(1, length): noise_scalars.append(noise_scalars[-1]*autocorrelation + gauss(0, 1)*(1-autocorrelation)) - values = [0 if random() < holiday_rate else int(v + sqrt(v)*noise_scalar*spikiness) - for v, noise_scalar in zip(values_no_noise, noise_scalars)] + values = [0 if holiday else int(v + sqrt(v)*noise_scalar*spikiness) + for v, noise_scalar, holiday in zip(values_no_noise, noise_scalars, holidays)] if is_gauge: for i in range(1, length): values[i] = values[i-1] + values[i] diff --git a/analytics/management/commands/populate_analytics_db.py b/analytics/management/commands/populate_analytics_db.py index d5958dfc6a..74c755bcad 100644 --- a/analytics/management/commands/populate_analytics_db.py +++ b/analytics/management/commands/populate_analytics_db.py @@ -48,9 +48,11 @@ class Command(BaseCommand): stat = COUNT_STATS['active_users:is_bot'] if not RealmCount.objects.filter(property=stat.property).exists(): last_end_time = floor_to_day(timezone.now()) - human_data = generate_time_series_data(100, 30, 10, growth=5, autocorrelation=.5, + human_data = generate_time_series_data(days=100, business_hours_base=30, + non_business_hours_base=10, growth=5, autocorrelation=.5, spikiness=3, frequency=CountStat.DAY) - bot_data = generate_time_series_data(100, 20, 20, growth=3, frequency=CountStat.DAY) + bot_data = generate_time_series_data(days=100, business_hours_base=20, + non_business_hours_base=20, growth=3, frequency=CountStat.DAY) bulk_create_realmcount(stat.property, 'false', last_end_time, stat.frequency, stat.interval, human_data, realm) bulk_create_realmcount(stat.property, 'true', last_end_time,