analytics: Move last_successful_fill to CountStat.

This is a prep commit. Currenty we only pass CountStat.property to last_successful_fill function. But it needs access to CountStat.time_increment as well. We can pass the entire CountStat object to the function as a workaround. But making last_successful_fill a property of CountStat seems to be much more cleaner.
2020-12-22 22:39:34 +05:30 · 2020-12-22 22:39:34 +05:30 · 235a347639
parent 189e9a2759
commit 235a347639
6 changed files with 70 additions and 63 deletions
--- a/analytics/lib/counts.py
+++ b/analytics/lib/counts.py
@ -17,7 +17,6 @@ from analytics.models import (
    StreamCount,
    UserCount,
    installation_epoch,
-    last_successful_fill,
 )
 from zerver.lib.logging_util import log_to_file
 from zerver.lib.timestamp import ceiling_to_day, ceiling_to_hour, floor_to_hour, verify_UTC
@ -68,6 +67,14 @@ class CountStat:
    def __str__(self) -> str:
        return f"<CountStat: {self.property}>"

+    def last_successful_fill(self) -> Optional[datetime]:
+        fillstate = FillState.objects.filter(property=self.property).first()
+        if fillstate is None:
+            return None
+        if fillstate.state == FillState.DONE:
+            return fillstate.end_time
+        return fillstate.end_time - timedelta(hours=1)
+
 class LoggingCountStat(CountStat):
    def __init__(self, property: str, output_table: Type[BaseCount], frequency: str) -> None:
        CountStat.__init__(self, property, DataCollector(output_table, None), frequency)
@ -121,7 +128,7 @@ def process_count_stat(stat: CountStat, fill_to_time: datetime,

    if isinstance(stat, DependentCountStat):
        for dependency in stat.dependencies:
-            dependency_fill_time = last_successful_fill(dependency)
+            dependency_fill_time = COUNT_STATS[dependency].last_successful_fill()
            if dependency_fill_time is None:
                logger.warning("DependentCountStat %s run before dependency %s.",
                               stat.property, dependency)
--- a/analytics/management/commands/check_analytics_state.py
+++ b/analytics/management/commands/check_analytics_state.py
@ -7,7 +7,7 @@ from django.core.management.base import BaseCommand
 from django.utils.timezone import now as timezone_now

 from analytics.lib.counts import COUNT_STATS, CountStat
-from analytics.models import installation_epoch, last_successful_fill
+from analytics.models import installation_epoch
 from zerver.lib.timestamp import TimezoneNotUTCException, floor_to_day, floor_to_hour, verify_UTC
 from zerver.models import Realm

@ -42,7 +42,7 @@ class Command(BaseCommand):
        warning_unfilled_properties = []
        critical_unfilled_properties = []
        for property, stat in COUNT_STATS.items():
-            last_fill = last_successful_fill(property)
+            last_fill = stat.last_successful_fill()
            if last_fill is None:
                last_fill = installation_epoch()
            try:
--- a/analytics/models.py
+++ b/analytics/models.py
@ -26,14 +26,6 @@ def installation_epoch() -> datetime.datetime:
    earliest_realm_creation = Realm.objects.aggregate(models.Min('date_created'))['date_created__min']
    return floor_to_day(earliest_realm_creation)

-def last_successful_fill(property: str) -> Optional[datetime.datetime]:
-    fillstate = FillState.objects.filter(property=property).first()
-    if fillstate is None:
-        return None
-    if fillstate.state == FillState.DONE:
-        return fillstate.end_time
-    return fillstate.end_time - datetime.timedelta(hours=1)
-
 class BaseCount(models.Model):
    # Note: When inheriting from BaseCount, you may want to rearrange
    # the order of the columns in the migration to make sure they
--- a/analytics/tests/test_counts.py
+++ b/analytics/tests/test_counts.py
@ -322,35 +322,7 @@ class TestProcessCountStat(AnalyticsTestCase):
        stat3 = DependentCountStat('stat3', sql_data_collector(RealmCount, query, None),
                                   CountStat.HOUR,
                                   dependencies=['stat1', 'stat2'])
-        hour = [installation_epoch() + i*self.HOUR for i in range(5)]

-        # test when one dependency has been run, and the other hasn't
-        process_count_stat(stat1, hour[2])
-        process_count_stat(stat3, hour[1])
-        self.assertTableState(InstallationCount, ['property', 'end_time'],
-                              [['stat1', hour[1]], ['stat1', hour[2]]])
-        self.assertFillStateEquals(stat3, hour[0])
-
-        # test that we don't fill past the fill_to_time argument, even if
-        # dependencies have later last_successful_fill
-        process_count_stat(stat2, hour[3])
-        process_count_stat(stat3, hour[1])
-        self.assertTableState(InstallationCount, ['property', 'end_time'],
-                              [['stat1', hour[1]], ['stat1', hour[2]],
-                               ['stat2', hour[1]], ['stat2', hour[2]], ['stat2', hour[3]],
-                               ['stat3', hour[1]]])
-        self.assertFillStateEquals(stat3, hour[1])
-
-        # test that we don't fill past the dependency last_successful_fill times,
-        # even if fill_to_time is later
-        process_count_stat(stat3, hour[4])
-        self.assertTableState(InstallationCount, ['property', 'end_time'],
-                              [['stat1', hour[1]], ['stat1', hour[2]],
-                               ['stat2', hour[1]], ['stat2', hour[2]], ['stat2', hour[3]],
-                               ['stat3', hour[1]], ['stat3', hour[2]]])
-        self.assertFillStateEquals(stat3, hour[2])
-
-        # test daily dependent stat with hourly dependencies
        query = lambda kwargs: SQL("""
            INSERT INTO analytics_realmcount (realm_id, value, property, end_time)
            VALUES ({default_realm_id}, 1, {property}, %(time_end)s)
@ -361,13 +333,51 @@ class TestProcessCountStat(AnalyticsTestCase):
        stat4 = DependentCountStat('stat4', sql_data_collector(RealmCount, query, None),
                                   CountStat.DAY,
                                   dependencies=['stat1', 'stat2'])
-        hour24 = installation_epoch() + 24*self.HOUR
-        hour25 = installation_epoch() + 25*self.HOUR
-        process_count_stat(stat1, hour25)
-        process_count_stat(stat2, hour25)
-        process_count_stat(stat4, hour25)
-        self.assertEqual(InstallationCount.objects.filter(property='stat4').count(), 1)
-        self.assertFillStateEquals(stat4, hour24)
+
+        dummy_count_stats = {
+            "stat1": stat1,
+            "stat2": stat2,
+            "stat3": stat3,
+            "stat4": stat4,
+
+        }
+        with mock.patch("analytics.lib.counts.COUNT_STATS", dummy_count_stats):
+            hour = [installation_epoch() + i*self.HOUR for i in range(5)]
+
+            # test when one dependency has been run, and the other hasn't
+            process_count_stat(stat1, hour[2])
+            process_count_stat(stat3, hour[1])
+            self.assertTableState(InstallationCount, ['property', 'end_time'],
+                                  [['stat1', hour[1]], ['stat1', hour[2]]])
+            self.assertFillStateEquals(stat3, hour[0])
+
+            # test that we don't fill past the fill_to_time argument, even if
+            # dependencies have later last_successful_fill
+            process_count_stat(stat2, hour[3])
+            process_count_stat(stat3, hour[1])
+            self.assertTableState(InstallationCount, ['property', 'end_time'],
+                                  [['stat1', hour[1]], ['stat1', hour[2]],
+                                  ['stat2', hour[1]], ['stat2', hour[2]], ['stat2', hour[3]],
+                                  ['stat3', hour[1]]])
+            self.assertFillStateEquals(stat3, hour[1])
+
+            # test that we don't fill past the dependency last_successful_fill times,
+            # even if fill_to_time is later
+            process_count_stat(stat3, hour[4])
+            self.assertTableState(InstallationCount, ['property', 'end_time'],
+                                  [['stat1', hour[1]], ['stat1', hour[2]],
+                                  ['stat2', hour[1]], ['stat2', hour[2]], ['stat2', hour[3]],
+                                  ['stat3', hour[1]], ['stat3', hour[2]]])
+            self.assertFillStateEquals(stat3, hour[2])
+
+            # test daily dependent stat with hourly dependencies
+            hour24 = installation_epoch() + 24*self.HOUR
+            hour25 = installation_epoch() + 25*self.HOUR
+            process_count_stat(stat1, hour25)
+            process_count_stat(stat2, hour25)
+            process_count_stat(stat4, hour25)
+            self.assertEqual(InstallationCount.objects.filter(property='stat4').count(), 1)
+            self.assertFillStateEquals(stat4, hour24)

 class TestCountStats(AnalyticsTestCase):
    def setUp(self) -> None:
@ -995,6 +1005,18 @@ class TestCountStats(AnalyticsTestCase):
        self.assertTableState(InstallationCount, ['value'], [])
        self.assertTableState(StreamCount, [], [])

+    def test_last_successful_fill(self) -> None:
+        self.assertIsNone(COUNT_STATS["messages_sent:is_bot:hour"].last_successful_fill())
+
+        a_time = datetime(2016, 3, 14, 19, tzinfo=timezone.utc)
+        one_hour_before = datetime(2016, 3, 14, 18, tzinfo=timezone.utc)
+        fillstate = FillState.objects.create(property=COUNT_STATS["messages_sent:is_bot:hour"].property,
+                                             end_time=a_time, state=FillState.DONE)
+        self.assertEqual(COUNT_STATS["messages_sent:is_bot:hour"].last_successful_fill(), a_time)
+        fillstate.state = FillState.STARTED
+        fillstate.save()
+        self.assertEqual(COUNT_STATS["messages_sent:is_bot:hour"].last_successful_fill(), one_hour_before)
+
 class TestDoAggregateToSummaryTable(AnalyticsTestCase):
    # do_aggregate_to_summary_table is mostly tested by the end to end
    # nature of the tests in TestCountStats. But want to highlight one
--- a/analytics/tests/test_views.py
+++ b/analytics/tests/test_views.py
@ -8,7 +8,7 @@ from django.utils.timezone import now as timezone_now

 from analytics.lib.counts import COUNT_STATS, CountStat
 from analytics.lib.time_utils import time_range
-from analytics.models import FillState, RealmCount, UserCount, last_successful_fill
+from analytics.models import FillState, RealmCount, UserCount
 from analytics.views import rewrite_client_arrays, sort_by_totals, sort_client_labels
 from corporate.lib.stripe import add_months, update_sponsorship_status
 from corporate.models import Customer, CustomerPlan, LicenseLedger, get_customer_by_realm
@ -805,19 +805,6 @@ class TestSupportEndpoint(ZulipTestCase):
            m.assert_not_called()

 class TestGetChartDataHelpers(ZulipTestCase):
-    # last_successful_fill is in analytics/models.py, but get_chart_data is
-    # the only function that uses it at the moment
-    def test_last_successful_fill(self) -> None:
-        self.assertIsNone(last_successful_fill('non-existant'))
-        a_time = datetime(2016, 3, 14, 19, tzinfo=timezone.utc)
-        one_hour_before = datetime(2016, 3, 14, 18, tzinfo=timezone.utc)
-        fillstate = FillState.objects.create(property='property', end_time=a_time,
-                                             state=FillState.DONE)
-        self.assertEqual(last_successful_fill('property'), a_time)
-        fillstate.state = FillState.STARTED
-        fillstate.save()
-        self.assertEqual(last_successful_fill('property'), one_hour_before)
-
    def test_sort_by_totals(self) -> None:
        empty: List[int] = []
        value_arrays = {'c': [0, 1], 'a': [9], 'b': [1, 1, 1], 'd': empty}
--- a/analytics/views.py
+++ b/analytics/views.py
@ -35,7 +35,6 @@ from analytics.models import (
    StreamCount,
    UserCount,
    installation_epoch,
-    last_successful_fill,
 )
 from confirmation.models import Confirmation, _properties, confirmation_url
 from confirmation.settings import STATUS_ACTIVE
@ -308,7 +307,7 @@ def get_chart_data(request: HttpRequest, user_profile: UserProfile, chart_name:
            else:
                start = realm.date_created
        if end is None:
-            end = max(last_successful_fill(stat.property) or
+            end = max(stat.last_successful_fill() or
                      datetime.min.replace(tzinfo=timezone.utc) for stat in stats)

        if start > end and (timezone_now() - start > MAX_TIME_FOR_FULL_ANALYTICS_GENERATION):