zulip/tools/generate-activity-metrics.py

#!/usr/bin/env python
#
# Generates % delta activity metrics from graphite/statsd data
#
import os, sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

import optparse
from itertools import dropwhile, takewhile
from datetime import timedelta, datetime
from zephyr.lib.timestamp import datetime_to_timestamp
from zephyr.lib.utils import statsd_key
import requests

# python-requests 1.0 turned Response.json from a property into a method.
# Detect once, at import time, which flavour is installed so that callers
# do not need to care about the requests version.
requests_json_is_function = callable(requests.Response.json)


def extract_json_response(resp):
    """Return the decoded JSON body of resp (a requests Response).

    Works whether Response.json is a method or a plain property, as
    recorded in requests_json_is_function.
    """
    payload = resp.json
    if not requests_json_is_function:
        # Property flavour: the attribute access already produced the data.
        return payload
    return payload()

def get_data_url(buckets, realm):
    """Build the graphite render URL for the active-user gauges.

    buckets -- iterable of bucket names; one "&target=" parameter is
               appended per bucket, in order.
    realm   -- realm to query; the special value 'all' sums the gauges of
               every realm instead of selecting a single one.

    Returns the URL as a string.
    """
    realm_key = statsd_key(realm, True)

    # This is the slightly-cleaned up JSON api version of
    # https://graphiti.humbughq.com/graphs/945c7aafc2d
    #
    # The "from=-28d" parameter asks for the last 28 days of data.
    base_url = "https://graphite.humbughq.com/render/?from=-28d&format=json"

    if realm != 'all':
        targets = [
            "&target=%s" % ("stats.gauges.staging.users.active.%s.%s" % (realm_key, bucket),)
            for bucket in buckets
        ]
    else:
        # 'all' means adding up every realm, but the aggregate .all. series
        # must be excluded, otherwise everything would be counted twice.
        targets = [
            "&target=sum(exclude(stats.gauges.staging.users.active.*.%s, 'all'))" % (bucket,)
            for bucket in buckets
        ]

    return base_url + "".join(targets)

def get_data(url, username, pw):
    """Fetch url using HTTP digest authentication and return its JSON body.

    url      -- full graphite render URL to request.
    username -- user name for digest authentication.
    pw       -- password for digest authentication.

    Returns the decoded JSON on HTTP 200. For any other status a
    diagnostic is printed and an empty list is returned, so callers can
    iterate over the result unconditionally.
    """
    from requests.auth import HTTPDigestAuth

    # NOTE(review): verify=False disables TLS certificate verification, so
    # the digest credentials are exposed to a man-in-the-middle. Kept to
    # preserve existing behaviour; enable verification once the server's
    # certificate chain can be validated.
    res = requests.get(url, auth=HTTPDigestAuth(username, pw), verify=False)

    if res.status_code != 200:
        # Newer requests releases do not provide Response.error, so reading
        # it unconditionally would raise AttributeError and hide the real
        # failure. Fall back to the status line when it is unavailable.
        detail = getattr(res, 'error', None)
        if detail is None:
            detail = "HTTP %s %s" % (res.status_code, getattr(res, 'reason', ''))
        print("Failed to fetch data url: %s" % (detail,))
        return []

    return extract_json_response(res)

def noon_of(day=None):
    """Return a datetime for 12:00:00 on the calendar day of day.

    day -- any object with year, month and day attributes (datetime or
           date). When omitted, the current local date is used.

    The default is resolved at call time: a default of datetime.now() in
    the signature would be evaluated only once, when the function is
    defined, and would go stale in a long-running process.
    """
    if day is None:
        day = datetime.now()
    return datetime(year=day.year, month=day.month, day=day.day, hour=12)

def points_during_day(data, noon):
    """Select the points of data recorded within 12 hours either side of noon.

    data -- sequence of points whose second element (index 1) is compared
            against the window bounds produced by datetime_to_timestamp.
    noon -- datetime marking the centre of the window.

    Both window bounds are exclusive. The selected points are returned in
    the order in which they appear in data.
    """
    half_day = timedelta(hours=12)
    window_start = datetime_to_timestamp(noon - half_day)
    window_end = datetime_to_timestamp(noon + half_day)

    def in_window(point):
        stamp = point[1]
        return stamp > window_start and stamp < window_end

    return filter(in_window, data)

def best_during_day(data, day):
    """Return the highest value recorded in the 24-hour window centred on day.

    data -- sequence of points; element 0 of each point is the value and
            element 1 is what points_during_day uses to select the window.
    day  -- datetime at the centre of the window (normally noon).

    Returns None when the window contains no usable value.
    """
    # Missing samples show up as None values. Drop them explicitly rather
    # than relying on how None happens to order against numbers, and take
    # the maximum directly instead of sorting the whole window.
    values = [pt[0] for pt in points_during_day(data, day) if pt[0] is not None]
    if not values:
        return None
    return max(values)

def percent_diff(prev, cur):
    """Return the percentage change going from prev to cur.

    prev -- baseline value, or None when unknown.
    cur  -- new value, or None when unknown.

    Returns a float, e.g. 50.0 for 100 -> 150. Returns None when either
    value is missing, or when prev is zero (the relative change is then
    undefined and would otherwise raise ZeroDivisionError).
    """
    if prev is None or cur is None:
        return None
    if prev == 0:
        return None
    # Force float division: with integer inputs Python 2's "/" floors the
    # quotient, which would turn e.g. 3 -> 4 into 0% instead of 33.3%.
    return ((cur - prev) / float(prev)) * 100

def parse_data(data, today):
    """Print day-over-day and week-over-week changes for every metric.

    data  -- list of dicts, each holding a 'datapoints' list; the list is
             sorted in place by element 1 of each point.
    today -- datetime used as the baseline day (normally noon of that day).

    For each metric the peak value of the baseline day is printed, followed
    by the peak of each earlier weekday and the percentage change between
    that peak and the baseline, and the percentage change against the same
    weekday i weeks earlier. Days without data are skipped.
    """
    for metric in data:
        metric['datapoints'].sort(key=lambda p: p[1])

        best_today = best_during_day(metric['datapoints'], today)
        if best_today is None:
            # Without a baseline there is nothing to compare against, and
            # formatting None with %.01f would raise TypeError.
            print("Today, 0 days ago:\t\t(no data)")
            continue
        print("Today, 0 days ago:\t\t(%.01f users)" % (best_today,))

        for i in range(1, 100):
            day = today - timedelta(days=i)
            # i weeks back, matching the "%s weeks ago" label below
            # (weeks=i*7 would step back 7*i weeks).
            week = today - timedelta(weeks=i)

            # Ignore weekends (weekday() is 5 for Saturday, 6 for Sunday)
            if day.weekday() not in [5, 6]:
                best = best_during_day(metric['datapoints'], day)
                # percent_diff returns None when a value is missing.
                change = percent_diff(best, best_today)
                if change is not None:
                    print("Last %s, %s days ago:\t(%.01f users)\t\t%.02f%%"
                          % (day.strftime("%A"), i, best, change))

            best = best_during_day(metric['datapoints'], week)
            change = percent_diff(best, best_today)
            if change is not None:
                print("Weekly %% change from %s weeks ago today:\t\t%.02f"
                      % (i, change))


# Command-line interface. Built at module level so the __main__ block
# below can call parser.parse_args() and parser.error().
parser = optparse.OptionParser(r"""

%prog --user username --password pw [--start-from unixtimestamp] [--realm realm]

    Generates activity statistics with detailed week-over-week percentage change
""")

parser.add_option('--user',
                  help='Graphite username',
                  metavar='USER')
parser.add_option('--password',
                  help='Graphite password',
                  metavar='PASSWORD')
# The default is the literal string 'today'; any other value is parsed as
# a Unix timestamp by the __main__ block.
parser.add_option('--start-from',
                  help='What day to consider as \'today\' when calculating stats as a Unix timestamp',
                  metavar='STARTDATE',
                  default='today')
# 'all' (the default) aggregates every realm.
parser.add_option('--realm',
                  help='Which realm to query',
                  default='all')

if __name__ == '__main__':
    # Script entry point: parse options, fetch the gauges from graphite and
    # print the percentage-change report.
    (options, args) = parser.parse_args()

    if not options.user or not options.password:
        parser.error("You must enter a username and password to log into graphite with")

    if options.start_from == 'today':
        startfrom = noon_of(day=datetime.now())
    else:
        # Report a malformed timestamp as a usage error instead of letting
        # the conversion failure escape as a traceback.
        try:
            baseline = datetime.fromtimestamp(int(options.start_from))
        except (ValueError, OverflowError, OSError):
            parser.error("--start-from must be a Unix timestamp (integer seconds), got %r"
                         % (options.start_from,))
        startfrom = noon_of(day=baseline)
        print("Using baseline of today as %s" % (startfrom,))

    buckets = ["12hr", "2hr", "0_16hr"]

    # get_data_url() derives the per-realm statsd key itself, so only the
    # raw realm name is passed along.
    DATA_URL = get_data_url(buckets, options.realm)
    data = get_data(DATA_URL, options.user, options.password)

    parse_data(data, startfrom)