zulip/tools/generate-activity-metrics.py

#!/usr/bin/env python
#
# Generates % delta activity metrics from graphite/statsd data
#
import os, sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

import optparse
from datetime import timedelta, datetime
from zerver.lib.timestamp import datetime_to_timestamp
from zerver.lib.utils import statsd_key
import requests

# python-requests 1.0 turned Response.json from a property into a method.
# Detect once, at import time, which flavour is installed so that the helper
# below works with either.
requests_json_is_function = callable(requests.Response.json)
def extract_json_response(resp):
    """Return the decoded JSON payload of ``resp``.

    ``resp.json`` is called when the installed python-requests exposes it
    as a method, and read as a plain attribute otherwise.
    """
    return resp.json() if requests_json_is_function else resp.json

def get_data_url(buckets, realm):
    """Build the Graphite render URL for the active-user gauges.

    One ``target`` query parameter is appended for every entry of
    ``buckets``.  For a specific realm the target is that realm's gauge;
    for the special realm ``'all'`` the target sums the gauges of every
    realm while excluding the ``all`` series, which would otherwise be
    counted twice.

    Returns the complete URL as a string.
    """
    realm_key = statsd_key(realm, True)
    every_realm = (realm == 'all')

    # JSON API form of https://graphiti.zulip.net/graphs/945c7aafc2d.
    # The "from" parameter asks for the last 1000 days of data.
    pieces = ["https://stats1.zulip.net:444/render/?from=-1000d&format=json"]
    for bucket in buckets:
        if every_realm:
            pieces.append("&target=sum(exclude(stats.gauges.staging.users.active.*.%s, 'all'))" % (bucket,))
        else:
            statsd_target = "stats.gauges.staging.users.active.%s.%s" % (realm_key, bucket)
            pieces.append("&target=%s" % (statsd_target,))
    return "".join(pieces)

def get_data(url, username, pw):
    """Fetch ``url`` with HTTP digest authentication and decode its JSON.

    Returns the decoded JSON body on an HTTP 200 response.  On any other
    status a diagnostic line is printed and an empty list is returned, so
    callers that iterate over the result simply see no metrics.
    """
    from requests.auth import HTTPDigestAuth

    # NOTE(review): verify=False turns off TLS certificate verification, so
    # the digest credentials are sent to an unauthenticated server.  Left
    # as-is because the reason for it is not visible here -- confirm.
    res = requests.get(url, auth=HTTPDigestAuth(username, pw), verify=False)

    if res.status_code != 200:
        # Response.error is not available on python-requests >= 1.0, so
        # reading it unconditionally raised AttributeError on this path.
        # Use it when present and fall back to the HTTP reason phrase.
        detail = getattr(res, 'error', None) or getattr(res, 'reason', None)
        print("Failed to fetch data url: %s (HTTP %s)" % (detail, res.status_code))
        return []

    return extract_json_response(res)

def noon_of(day=None):
    """Return a datetime for 12:00:00 on the calendar date of ``day``.

    ``day`` is any object exposing ``year``, ``month`` and ``day``
    attributes.  When it is omitted the current local date is used; it is
    looked up on every call rather than once at definition time, so a
    long-lived process does not keep returning the date it started on.
    """
    if day is None:
        day = datetime.now()
    return datetime(year=day.year, month=day.month, day=day.day, hour=12)

def points_during_day(data, noon):
    """Return the points of ``data`` within 12 hours either side of ``noon``.

    ``data`` is an iterable of points whose second element (``pt[1]``) is
    a timestamp comparable with the values ``datetime_to_timestamp``
    returns.  The window is half-open, ``noon - 12h <= timestamp <
    noon + 12h``: a point stamped exactly on the boundary between two days
    belongs to the later day, instead of being dropped from both as it was
    when both ends were exclusive.

    The matching points are returned as a list in their original order;
    the input does not need to be sorted.
    """
    window_start = datetime_to_timestamp(noon - timedelta(hours=12))
    window_end = datetime_to_timestamp(noon + timedelta(hours=12))

    return [pt for pt in data if window_start <= pt[1] < window_end]

def best_during_day(data, day):
    """Return the largest value recorded in the window around ``day``.

    The points selected by ``points_during_day(data, day)`` are ranked by
    their first element (``pt[0]``) in descending order and the first
    element of the top-ranked point is returned.  ``None`` is returned
    when the window contains no points.
    """
    ranked = list(points_during_day(data, day))
    ranked.sort(key=lambda point: point[0], reverse=True)
    if not ranked:
        return None
    return ranked[0][0]

def percent_diff(prev, cur):
    """Format the relative change from ``prev`` to ``cur`` as a percentage.

    Returns:
      * ``None`` when either value is ``None`` (no data to compare);
      * ``""`` when both values are zero;
      * ``"NaN"`` when only ``prev`` is zero, since the change is undefined;
      * otherwise a string such as ``"25.00%"`` or ``"-50.00%"``.
    """
    if prev is None or cur is None:
        return None
    if prev == 0:
        return "" if cur == 0 else "NaN"
    # Force float division: with two ints Python 2's "/" floors the
    # quotient, which turned e.g. a 50% increase into "0.00%".
    return "%.02f%%" % ((float(cur - prev) / prev) * 100,)

def parse_data(data, today):
    """Print day-over-day and week-over-week activity tables.

    ``data`` is an iterable of metric dicts, each with a ``'datapoints'``
    list of points whose first element is the value and whose second
    element is the timestamp.  ``today`` is the datetime (noon of the
    reference day) that all offsets are counted back from.

    For every metric two tables are written to stdout:

      1. each Monday-Friday in the previous 999 days, newest first, with
         the change from that day's best value to today's best value;
      2. each Wednesday in the same range, oldest first, with the change
         from the previously printed Wednesday.

    Days without any data are skipped.  The datapoints list of each
    metric is sorted in place by timestamp.
    """
    # datetime.weekday() numbers Monday as 0 and Sunday as 6.
    weekdays = (0, 1, 2, 3, 4)
    wednesday_only = (2,)

    def print_results(all_days, days, compare_with_last=False):
        # Reads ``metric`` and ``best_today`` from the enclosing loop below,
        # so it must only be called from inside that loop.
        first_data_point = True
        best_last_time = 0
        for i in all_days:
            day = today - timedelta(days=i)
            # Only report the requested days of the week
            if day.weekday() not in days:
                continue

            best = best_during_day(metric['datapoints'], day)
            if best is None:
                continue

            if not compare_with_last:
                percent = percent_diff(best, best_today)
            elif first_data_point:
                # Nothing earlier to compare the first row against
                percent = ""
                first_data_point = False
            else:
                percent = percent_diff(best_last_time, best)

            if percent is None:
                # percent_diff had nothing to compare against (today has
                # no data); show that instead of the literal "None"
                percent = "n/a"

            print("Last %s, %s %s ago:\t%.01f\t\t%s"
                  % (day.strftime("%A"), i, "days", best, percent))
            best_last_time = best

    for metric in data:
        # print "Got %s with data points %s" % (metric['target'], len(metric['datapoints']))
        metric['datapoints'].sort(key=lambda p: p[1])

        best_today = best_during_day(metric['datapoints'], today)
        print("Date\t\t\t\tUsers\t\tChange from then to today")
        if best_today is None:
            # Formatting None with %.01f raised TypeError and aborted the
            # whole report whenever the reference day had no datapoints.
            print("Today, 0 days ago:\t\tno data")
        else:
            print("Today, 0 days ago:\t\t%.01f" % (best_today,))
        print_results(xrange(1, 1000), weekdays)

        print("\n\nWeekly Wednesday results")
        print("Date\t\t\t\tUsers\t\tDelta from previous week")
        print_results(reversed(xrange(1, 1000)), wednesday_only, True)


# Command-line interface.  --user and --password are declared without
# defaults; the script's entry point rejects a run that omits either one.
parser = optparse.OptionParser(r"""

%prog --user username --password pw [--start-from unixtimestamp] [--realm realm] [--bucket bucket]

    Generates activity statistics with detailed week-over-week percentage change
""")

parser.add_option('--user',
                  help='Graphite username',
                  metavar='USER')
parser.add_option('--password',
                  help='Graphite password',
                  metavar='PASSWORD')
# The literal string 'today' is the sentinel for "no timestamp given".
parser.add_option('--start-from',
                  help='What day to consider as \'today\' when calculating stats as a Unix timestamp',
                  metavar='STARTDATE',
                  default='today')
# 'all' is the special realm name that aggregates over every realm.
parser.add_option('--realm',
                  help='Which realm to query',
                  default='all')
parser.add_option('--bucket',
                  help='Which bucket to query',
                  default='12hr')

if __name__ == '__main__':
    (options, args) = parser.parse_args()

    if not options.user or not options.password:
        parser.error("You must enter a username and password to log into graphite with")

    # The reference day defaults to the current local date; --start-from
    # overrides it with the date of the given Unix timestamp.
    if options.start_from == 'today':
        startfrom = noon_of(day=datetime.now())
    else:
        try:
            start_timestamp = int(options.start_from)
        except ValueError:
            # parser.error prints the usage text and exits, so a mistyped
            # value is reported cleanly instead of as a traceback.
            parser.error("--start-from must be an integer Unix timestamp, got %r"
                         % (options.start_from,))
        startfrom = noon_of(day=datetime.fromtimestamp(start_timestamp))
        print("Using baseline of today as %s" % (startfrom,))

    buckets = [options.bucket]

    # Build the Graphite query, fetch it and print the report.  get_data
    # returns an empty list on failure, in which case nothing more is printed.
    DATA_URL = get_data_url(buckets, options.realm)
    data = get_data(DATA_URL, options.user, options.password)

    parse_data(data, startfrom)
Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00			`#!/usr/bin/env python`
			`#`
			`# Generates % delta activity metrics from graphite/statsd data`
			`#`
			`import os, sys`

generate-activity-metrics: Fix import issues. (imported from commit a8dd3690200ea42616301adc29d136e7554f45df) 2013-05-13 19:11:59 +02:00			`sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))`
Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00
			`import optparse`
			`from datetime import timedelta, datetime`
[manual] Rename Django app from zephyr to zerver. This needs to be deployed to both staging and prod at the same off-peak time (and the schema migration run). At the time it is deployed, we need to make a few changes directly in the database: (1) UPDATE django_content_type set app_label='zerver' where app_label='zephyr'; (2) UPDATE south_migrationhistory set app_name='zerver' where app_name='zephyr'; (imported from commit eb3fd719571740189514ef0b884738cb30df1320) 2013-07-29 23:03:31 +02:00			`from zerver.lib.timestamp import datetime_to_timestamp`
			`from zerver.lib.utils import statsd_key`
generate-activity-metrics: Fix import issues. (imported from commit a8dd3690200ea42616301adc29d136e7554f45df) 2013-05-13 19:11:59 +02:00			`import requests`
Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00
generate-activity-metrics: Fix import issues. (imported from commit a8dd3690200ea42616301adc29d136e7554f45df) 2013-05-13 19:11:59 +02:00			`# Workaround to support the Python-requests 1.0 transition of .json`
			`# from a property to a function`
			`requests_json_is_function = callable(requests.Response.json)`
			`def extract_json_response(resp):`
			`if requests_json_is_function:`
			`return resp.json()`
			`else:`
			`return resp.json`

Refactor to use wildcard/exclusion for all (imported from commit 2874361aef2cbc14a3047f964eb584f27a29323c) 2013-05-30 17:43:27 +02:00			`def get_data_url(buckets, realm):`
			`realm_key = statsd_key(realm, True)`

Update humbughq.com => zulip.net in tools. This shouldn't be pushed until we get graphite.zulip.net working. (imported from commit c0dd208222385a9ee92ad1ca8fbf141894f7c7b6) 2013-07-24 23:24:13 +02:00			`# This is the slightly-cleaned up JSON api version of https://graphiti.zulip.net/graphs/945c7aafc2d`
Refactor to use wildcard/exclusion for all (imported from commit 2874361aef2cbc14a3047f964eb584f27a29323c) 2013-05-30 17:43:27 +02:00			`#`
			`# Fetches 1 month worth of data`
Update generate-activity-metrics to use new graphite url (imported from commit 4664e9f389d05edb014ddb3b7f271cfc32245f3f) 2013-09-06 21:14:42 +02:00			`DATA_URL="https://stats1.zulip.net:444/render/?from=-1000d&format=json"`
Refactor to use wildcard/exclusion for all (imported from commit 2874361aef2cbc14a3047f964eb584f27a29323c) 2013-05-30 17:43:27 +02:00			`for bucket in buckets:`
			`if realm != 'all':`
			`statsd_target = "stats.gauges.staging.users.active.%s.%s" % (realm_key, bucket)`
			`DATA_URL += "&target=%s" % (statsd_target,)`
			`else:`
			`# all means adding up all realms, but exclude the .all. metrics since that would double things`
			`DATA_URL += "&target=sum(exclude(stats.gauges.staging.users.active.*.%s, 'all'))" % (bucket,)`
			`return DATA_URL`

Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00			`def get_data(url, username, pw):`
			`from requests.auth import HTTPDigestAuth`

			`res = requests.get(url, auth=HTTPDigestAuth(username, pw), verify=False)`

			`if res.status_code != 200:`
			`print "Failed to fetch data url: %s" % (res.error,)`
			`return []`

generate-activity-metrics: Fix import issues. (imported from commit a8dd3690200ea42616301adc29d136e7554f45df) 2013-05-13 19:11:59 +02:00			`return extract_json_response(res)`
Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00
			`def noon_of(day=datetime.now()):`
			`return datetime(year=day.year, month=day.month, day=day.day, hour=12)`

			`def points_during_day(data, noon):`
			`"""Returns all the points in the dataset that occur in the 12 hours around`
			`the datetime object that is passed in. data must be sorted."""`
			`before =datetime_to_timestamp(noon - timedelta(hours=12))`
			`after = datetime_to_timestamp(noon + timedelta(hours=12))`

			`between = filter(lambda pt: pt[1] > before and pt[1] < after, data)`
			`return between`

			`def best_during_day(data, day):`
			`valid = sorted(points_during_day(data, day), key=lambda pt: pt[0], reverse=True)`
			`if len(valid):`
			`return valid[0][0]`
			`else:`
			`return None`

			`def percent_diff(prev, cur):`
			`if prev is None or cur is None:`
			`return None`
generate-activity-metrics: Display by-week usage numbers changes. (imported from commit 22bc1592c35c5f86ab89f7f4fc14ba7dbd2375db) 2013-05-30 23:16:43 +02:00			`if cur == 0 and prev == 0:`
			`return ""`
			`if prev == 0:`
			`return "NaN"`
			`return "%.02f%%" % (((cur - prev) / prev) * 100,)`
Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00
			`def parse_data(data, today):`
generate-activity-metrics: Display by-week usage numbers changes. (imported from commit 22bc1592c35c5f86ab89f7f4fc14ba7dbd2375db) 2013-05-30 23:16:43 +02:00			`def print_results(all_days, days, compare_with_last=False):`
			`first_data_point = True`
			`best_last_time = 0`
			`for i in all_days:`
			`day = today - timedelta(days=i)`
			`# Ignore weekends`
			`if day.weekday() in days:`
			`best = best_during_day(metric['datapoints'], day)`
			`if best is None:`
			`continue`

			`if not compare_with_last:`
			`percent = percent_diff(best, best_today)`
			`else:`
			`if first_data_point:`
			`percent = ""`
			`first_data_point = False`
			`else:`
			`percent = percent_diff(best_last_time, best)`

			`if best is not None:`
			`print "Last %s, %s %s ago:\t%.01f\t\t%s" \`
			`% (day.strftime("%A"), i, "days", best, percent)`
			`best_last_time = best`

Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00			`for metric in data:`
			`# print "Got %s with data points %s" % (metric['target'], len(metric['datapoints']))`
Refactor to use wildcard/exclusion for all (imported from commit 2874361aef2cbc14a3047f964eb584f27a29323c) 2013-05-30 17:43:27 +02:00			`# Calculate % between peak 2hr and 10min across each day and week`
			`metric['datapoints'].sort(key=lambda p: p[1])`
Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00
Refactor to use wildcard/exclusion for all (imported from commit 2874361aef2cbc14a3047f964eb584f27a29323c) 2013-05-30 17:43:27 +02:00			`best_today = best_during_day(metric['datapoints'], today)`
generate-activity-metrics: Display by-week usage numbers changes. (imported from commit 22bc1592c35c5f86ab89f7f4fc14ba7dbd2375db) 2013-05-30 23:16:43 +02:00			`print "Date\t\t\t\tUsers\t\tChange from then to today"`
			`print "Today, 0 days ago:\t\t%.01f" % (best_today,)`
			`print_results(xrange(1, 1000), [0, 1, 2, 3, 4, 7])`
Change display format for metrics (imported from commit eb87f1958fd52098d8bdcf205efc95cdaad56787) 2013-05-29 23:41:52 +02:00
generate-activity-metrics: Display by-week usage numbers changes. (imported from commit 22bc1592c35c5f86ab89f7f4fc14ba7dbd2375db) 2013-05-30 23:16:43 +02:00			`print "\n\nWeekly Wednesday results"`
			`print "Date\t\t\t\tUsers\t\tDelta from previous week"`
			`print_results(reversed(xrange(1, 1000)), [2], True)`
Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00


			`parser = optparse.OptionParser(r"""`

			`%prog --user username --password pw [--start-from unixtimestamp]`

			`Generates activity statistics with detailed week-over-week percentage change`
			`""")`

			`parser.add_option('--user',`
			`help='Graphite usernarme',`
			`metavar='USER')`
			`parser.add_option('--password',`
			`help='Graphite password',`
			`metavar='PASSWORD')`
			`parser.add_option('--start-from',`
			`help='What day to consider as \'today\' when calculating stats as a Unix timestamp',`
			`metavar='STARTDATE',`
			`default='today')`
generate-activity-metrics: Add support for specifying realm. (imported from commit 09e2adb6ae8d9fb0a140fbbe9a0f1272e2aa81a7) 2013-05-29 22:54:13 +02:00			`parser.add_option('--realm',`
			`help='Which realm to query',`
			`default='all')`
Do only one bucket, as specified on command line. (imported from commit 4ca4f9609a01a6cfedc55efdbe2423673bbe4904) 2013-05-30 23:15:22 +02:00			`parser.add_option('--bucket',`
			`help='Which bucket to query',`
			`default='12hr')`
Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00
			`if __name__ == '__main__':`
			`(options, args) = parser.parse_args()`

			`if not options.user or not options.password:`
			`parser.error("You must enter a username and password to log into graphite with")`

			`startfrom = noon_of(day=datetime.now())`
			`if options.start_from != 'today':`
			`startfrom = noon_of(day=datetime.fromtimestamp(int(options.start_from)))`
			`print "Using baseline of today as %s" % (startfrom,)`

generate-activity-metrics: Add support for specifying realm. (imported from commit 09e2adb6ae8d9fb0a140fbbe9a0f1272e2aa81a7) 2013-05-29 22:54:13 +02:00			`realm_key = statsd_key(options.realm, True)`
Do only one bucket, as specified on command line. (imported from commit 4ca4f9609a01a6cfedc55efdbe2423673bbe4904) 2013-05-30 23:15:22 +02:00			`buckets = [options.bucket]`
generate-activity-metrics: Add support for specifying realm. (imported from commit 09e2adb6ae8d9fb0a140fbbe9a0f1272e2aa81a7) 2013-05-29 22:54:13 +02:00
Update humbughq.com => zulip.net in tools. This shouldn't be pushed until we get graphite.zulip.net working. (imported from commit c0dd208222385a9ee92ad1ca8fbf141894f7c7b6) 2013-07-24 23:24:13 +02:00			`# This is the slightly-cleaned up JSON api version of https://graphiti.zulip.net/graphs/945c7aafc2d`
generate-activity-metrics: Add support for specifying realm. (imported from commit 09e2adb6ae8d9fb0a140fbbe9a0f1272e2aa81a7) 2013-05-29 22:54:13 +02:00			`#`
			`# Fetches 1 month worth of data`
Refactor to use wildcard/exclusion for all (imported from commit 2874361aef2cbc14a3047f964eb584f27a29323c) 2013-05-30 17:43:27 +02:00			`DATA_URL = get_data_url(buckets, options.realm)`
Add a rough tool to get day over day and week over week activity metrics (imported from commit fd6e6771fd29a31d553620c0f29ed663e2d680aa) 2013-05-09 16:41:46 +02:00			`data = get_data(DATA_URL, options.user, options.password)`


			`parse_data(data, startfrom)`