zulip/zerver/lib/utils.py

# -*- coding: utf-8 -*-

from typing import Any, Callable, List, Optional, Sequence, TypeVar, Iterable, Set, Tuple, Text
import base64
import errno
import hashlib
import heapq
import itertools
import os
import sys
from time import sleep
from itertools import zip_longest

from django.conf import settings

# Type variable used in run_in_batches' type comment to tie the element
# type of `all_list` to the element type of the batches given to `callback`.
T = TypeVar('T')

def statsd_key(val, clean_periods=False):
    # type: (Any, bool) -> str
    """Turn `val` into a string suitable for use as a statsd key.

    Non-string values are converted with str().  Everything from the
    first ':' onwards is dropped and every '-' becomes '_'.  When
    `clean_periods` is true, '.' characters are replaced by '_' as well.
    """
    key = val if isinstance(val, str) else str(val)

    # split() hands back the whole string as its first piece when there
    # is no ':', so no separate membership check is required.
    key = key.split(':')[0].replace('-', '_')

    return key.replace('.', '_') if clean_periods else key

class StatsDWrapper:
    """Transparently either submit metrics to statsd
    or do nothing without erroring out.

    Only the attributes 'timer', 'timing', 'incr', 'decr' and 'gauge'
    are served; any other attribute lookup raises AttributeError.
    """

    # Backported support for gauge deltas
    # as our statsd server supports them but supporting
    # pystatsd is not released yet
    def _our_gauge(self, stat, value, rate=1, delta=False):
        # type: (str, float, float, bool) -> None
        """Set a gauge value.

        With `delta` true the value is formatted with an explicit sign
        ('%+g'); otherwise it is formatted without one ('%g').  The
        formatted string is handed to the statsd client's `_send`.
        """
        from django_statsd.clients import statsd
        if delta:
            value_str = '%+g|g' % (value,)
        else:
            value_str = '%g|g' % (value,)
        statsd._send(stat, value_str, rate)

    def __getattr__(self, name):
        # type: (str) -> Any
        """Resolve the supported metric methods lazily.

        Returns the real statsd client's method (or `_our_gauge` for
        'gauge') when settings.STATSD_HOST is non-empty, and a callable
        that accepts anything and returns None otherwise.

        Raises AttributeError, naming the attribute, for any other name.
        """
        # Hand off to statsd if we have it enabled
        # otherwise do nothing
        if name in ('timer', 'timing', 'incr', 'decr', 'gauge'):
            if settings.STATSD_HOST != '':
                # Imported here so the client is only needed when a
                # statsd host is actually configured.
                from django_statsd.clients import statsd
                if name == 'gauge':
                    return self._our_gauge
                return getattr(statsd, name)
            return lambda *args, **kwargs: None

        # Mirror the message Python itself produces so failures are debuggable.
        raise AttributeError(
            "%r object has no attribute %r" % (type(self).__name__, name))

# Shared module-level wrapper instance; log_statsd_event() in this file
# sends its counter through it.
statsd = StatsDWrapper()

# Runs the callback with slices of all_list of a given batch_size
def run_in_batches(all_list, batch_size, callback, sleep_time=0, logger=None):
    # type: (Sequence[T], int, Callable[[Sequence[T]], None], int, Optional[Callable[[str], None]]) ->  None
    """Call `callback` on consecutive slices of `all_list`.

    Each slice holds at most `batch_size` items; only the last one may
    be shorter.  Nothing happens when `all_list` is empty.

    `sleep_time` is passed to time.sleep() between batches (never after
    the final one).  If `logger` is given it is called with a progress
    message before each batch is processed.
    """
    total = len(all_list)
    if total == 0:
        return

    # Ceiling division.  The previous "total // batch_size + 1" produced
    # one batch too many whenever total was an exact multiple of
    # batch_size, so the callback received a trailing empty slice.
    limit = -(-total // batch_size)
    for i in range(limit):
        start = i * batch_size
        end = min(start + batch_size, total)
        batch = all_list[start:end]

        if logger:
            logger("Executing %s in batch %s of %s" % (end-start, i+1, limit))

        callback(batch)

        # Only pause when there is more work to do.
        if i != limit - 1:
            sleep(sleep_time)

def make_safe_digest(string, hash_func=hashlib.sha1):
    # type: (Text, Callable[[bytes], Any]) -> Text
    """
    Hash `string` with `hash_func` (hashlib.sha1 unless the caller
    supplies another constructor) and return the hexadecimal digest.
    """
    # The hashlib constructors only accept bytes, so the text is
    # UTF-8 encoded first; this keeps non-ASCII input working.
    encoded = string.encode('utf-8')
    hasher = hash_func(encoded)
    return hasher.hexdigest()


def log_statsd_event(name):
    # type: (str) -> None
    """
    Record a one-off event by incrementing the statsd counter
    "events.<name>".

    Useful for marking moments such as a prod deploy or a bankruptcy
    request so they can be shown as vertical lines in generated graphs.

    To render the event as a vertical line in graphite, use the
    drawAsInfinite() command.
    """
    statsd.incr("events.%s" % (name,))

def generate_random_token(length):
    # type: (int) -> str
    """Return a random lowercase hexadecimal string.

    `length // 2` bytes are read from os.urandom() and each byte is
    rendered as two hex digits, so an even `length` yields exactly
    `length` characters and an odd one yields `length - 1`.
    """
    random_bytes = os.urandom(length // 2)
    upper_hex = base64.b16encode(random_bytes).decode('utf-8')
    return str(upper_hex.lower())

def query_chunker(queries, id_collector=None, chunk_size=1000, db_chunk_size=None):
    # type: (List[Any], Optional[Set[int]], int, Optional[int]) -> Iterable[Any]
    '''
    This merges one or more Django ascending-id queries into
    a generator that returns chunks of chunk_size row objects
    during each yield, preserving id order across all results.

    Queries should satisfy these conditions:
        - They should be Django filters.
        - They should return Django objects with "id" attributes.
        - They should be disjoint.

    The generator also populates id_collector, which we use
    internally to enforce unique ids, but which the caller
    can pass in to us if they want the side effect of collecting
    all ids.  A caller-supplied id_collector must start out empty.

    db_chunk_size is the number of rows fetched from each query
    per database round trip; it defaults to chunk_size divided
    by the number of queries.  An empty list of queries yields
    nothing.

    Because this is a generator, the argument checks below only
    run once iteration starts.
    '''
    if db_chunk_size is None:
        # max() keeps an empty `queries` list from dividing by zero; with
        # no queries there is nothing to merge and nothing is yielded.
        db_chunk_size = chunk_size // max(len(queries), 1)

    assert db_chunk_size >= 2
    assert chunk_size >= 2

    if id_collector is not None:
        assert(len(id_collector) == 0)
    else:
        id_collector = set()

    def chunkify(q, i):
        # type: (Any, int) -> Iterable[Tuple[int, int, Any]]
        # Walk one query in id order, db_chunk_size rows at a time,
        # resuming each fetch after the last id already seen.
        q = q.order_by('id')
        min_id = -1
        while True:
            assert db_chunk_size is not None  # Hint for mypy, but also workaround for mypy bug #3442.
            rows = list(q.filter(id__gt=min_id)[0:db_chunk_size])
            if len(rows) == 0:
                break
            for row in rows:
                # (id, query index, row): the id leads so heapq.merge
                # orders by it; the query index follows as a tie-breaker
                # ahead of the row object itself.
                yield (row.id, i, row)
            min_id = rows[-1].id

    iterators = [chunkify(q, i) for i, q in enumerate(queries)]
    merged_query = heapq.merge(*iterators)

    while True:
        tup_chunk = list(itertools.islice(merged_query, 0, chunk_size))
        if len(tup_chunk) == 0:
            break

        # Do duplicate-id management here.
        tup_ids = {tup[0] for tup in tup_chunk}
        assert len(tup_ids) == len(tup_chunk)
        assert len(tup_ids.intersection(id_collector)) == 0
        id_collector.update(tup_ids)

        yield [row for row_id, i, row in tup_chunk]

def split_by(array, group_size, filler):
    # type: (List[Any], int, Any) -> List[List[Any]]
    """
    Break `array` into consecutive lists of `group_size` elements,
    padding the final list with `filler` when it would otherwise come
    up short. Based on the "grouper" recipe from
    https://docs.python.org/3/library/itertools.html
    """
    # Every position in a group pulls from the same iterator, so
    # zip_longest consumes `group_size` successive items per tuple.
    shared_iter = iter(array)
    columns = [shared_iter] * group_size
    return [list(group) for group in zip_longest(*columns, fillvalue=filler)]

def is_remote_server(identifier):
    # type: (Text) -> bool
    """
    Identify the source of an API auth request. There are two kinds
    of sources, Remote Zulip Servers and UserProfiles; this returns
    True exactly when `identifier` contains no "@" character.
    """
    contains_at_sign = "@" in identifier
    return not contains_at_sign
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`# -*- coding: utf-8 -*-`
Enable absolute imports. See PEP 328[1] for details. This feature was introduced in Python 2.5 and will become mandatory in Python 3. [1]: http://www.python.org/dev/peps/pep-0328 (imported from commit 7444eeba8a08d5f91b94c7921848f2274979bd76) 2013-04-23 18:51:17 +02:00
mypy: Added Dict, List and Set imports. Fixed mypy errors associated with the upgrade. 2017-03-03 19:01:52 +01:00			`from typing import Any, Callable, List, Optional, Sequence, TypeVar, Iterable, Set, Tuple, Text`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00			`import base64`
utils: Add mkdir_p implementation. 2016-08-08 23:30:46 +02:00			`import errno`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00			`import hashlib`
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`import heapq`
			`import itertools`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00			`import os`
Add /authors page. Contributor visualization showing the avatar, user name and number of commits for each contributors. The JSON data would be updated upon deployment, triggered by the `update-prod-static` script. 2017-01-06 18:56:36 +01:00			`import sys`
Remove unused imports (imported from commit 9e3050c72a2d1137b9096c6cfa1c3945341b9a56) 2013-06-27 20:03:51 +02:00			`from time import sleep`
refactor: Remove six.moves.zip_longest import. 2017-11-05 06:39:22 +01:00			`from itertools import zip_longest`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`from django.conf import settings`

mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`T = TypeVar('T')`

Add a management command for active user stats (imported from commit a4227858b422c48e272700880e0c21889c7ce566) 2013-04-30 23:58:59 +02:00			`def statsd_key(val, clean_periods=False):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (Any, bool) -> str`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`if not isinstance(val, str):`
			`val = str(val)`

			`if ':' in val:`
			`val = val.split(':')[0]`
			`val = val.replace('-', "_")`
Add a management command for active user stats (imported from commit a4227858b422c48e272700880e0c21889c7ce566) 2013-04-30 23:58:59 +02:00			`if clean_periods:`
			`val = val.replace('.', '_')`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00
			`return val`

zerver/lib: Remove inheritance from object. 2017-11-05 11:37:41 +01:00			`class StatsDWrapper:`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`"""Transparently either submit metrics to statsd`
			`or do nothing without erroring out"""`

			`# Backported support for gauge deltas`
			`# as our statsd server supports them but supporting`
			`# pystatsd is not released yet`
			`def _our_gauge(self, stat, value, rate=1, delta=False):`
Fix several new errors caught by mypy 0.501. Clear out a bunch of easy to review errors, so we can focus on the more complicated ones. 2017-03-03 20:30:49 +01:00			`# type: (str, float, float, bool) -> None`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`"""Set a gauge value."""`
			`from django_statsd.clients import statsd`
			`if delta:`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`value_str = '%+g|g' % (value,)`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`else:`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`value_str = '%g|g' % (value,)`
			`statsd._send(stat, value_str, rate)`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00
			`def __getattr__(self, name):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (str) -> Any`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`# Hand off to statsd if we have it enabled`
			`# otherwise do nothing`
			`if name in ['timer', 'timing', 'incr', 'decr', 'gauge']:`
Move zulip.com-related statsd configuration out of main settings.py. This also removes the convenient way to run statsd in the Dev VM, because we don't anticipate anyone doing that. It's just 2 lines of config to configure it anyway: STATSD_HOST = 'localhost' STATSD_PREFIX = 'user' (imported from commit 5b09422ee0e956bc7f336dd1e575634380b8bfa2) 2015-08-22 22:18:55 +02:00			`if settings.STATSD_HOST != '':`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`from django_statsd.clients import statsd`
			`if name == 'gauge':`
			`return self._our_gauge`
			`else:`
			`return getattr(statsd, name)`
			`else:`
			`return lambda *args, **kwargs: None`

			`raise AttributeError`

			`statsd = StatsDWrapper()`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00
			`# Runs the callback with slices of all_list of a given batch_size`
			`def run_in_batches(all_list, batch_size, callback, sleep_time = 0, logger = None):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (Sequence[T], int, Callable[[Sequence[T]], None], int, Optional[Callable[[str], None]]) -> None`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`if len(all_list) == 0:`
			`return`

lint: Fix E703 pep8 violations. 2016-11-09 13:44:29 +01:00			`limit = (len(all_list) // batch_size) + 1`
Apply Python 3 futurize transform libmodernize.fixes.fix_xrange_six. 2015-11-01 17:15:05 +01:00			`for i in range(limit):`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`start = i*batch_size`
			`end = (i+1) * batch_size`
			`if end >= len(all_list):`
			`end = len(all_list)`
			`batch = all_list[start:end]`

			`if logger:`
			`logger("Executing %s in batch %s of %s" % (end-start, i+1, limit))`

			`callback(batch)`
Only sleep if there is more work to be done (imported from commit f8a1380e0045c9470909c088a9d262f8a714c86e) 2013-03-18 18:09:16 +01:00
			`if i != limit - 1:`
			`sleep(sleep_time)`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00
			`def make_safe_digest(string, hash_func=hashlib.sha1):`
Remove usage of six.moves.binary_type. 2017-11-09 09:03:33 +01:00			`# type: (Text, Callable[[bytes], Any]) -> Text`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00			`"""`
			return a hex digest of `string`.
			`"""`
			`# hashlib.sha1, md5, etc. expect bytes, so non-ASCII strings must`
			`# be encoded.`
Avoid unneeded force_text() in make_safe_digest(). 2017-11-04 17:30:42 +01:00			`return hash_func(string.encode('utf-8')).hexdigest()`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00

			`def log_statsd_event(name):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (str) -> None`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`"""`
			`Sends a single event to statsd with the desired name and the current timestamp`

			`This can be used to provide vertical lines in generated graphs,`
			`for example when doing a prod deploy, bankruptcy request, or`
			`other one-off events`

			`Note that to draw this event as a vertical line in graphite`
			`you can use the drawAsInfinite() command`
			`"""`
			`event_name = "events.%s" % (name,)`
Fix bankruptcy event sent to graphite (imported from commit b016ab41ddc2636b76a49c9eb33b7becd387557d) 2013-06-07 23:53:20 +02:00			`statsd.incr(event_name)`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00
			`def generate_random_token(length):`
utils: Cast generate_random_token to str. Having this be Text is forcing various URLs, emails, etc to be type annotated as Text. 2017-07-08 02:46:51 +02:00			`# type: (int) -> str`
			`return str(base64.b16encode(os.urandom(length // 2)).decode('utf-8').lower())`
utils: Add mkdir_p implementation. 2016-08-08 23:30:46 +02:00
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`def query_chunker(queries, id_collector=None, chunk_size=1000, db_chunk_size=None):`
			`# type: (List[Any], Set[int], int, int) -> Iterable[Any]`
			`'''`
			`This merges one or more Django ascending-id queries into`
			`a generator that returns chunks of chunk_size row objects`
			`during each yield, preserving id order across all results..`

			`Queries should satisfy these conditions:`
			`- They should be Django filters.`
			`- They should return Django objects with "id" attributes.`
			`- They should be disjoint.`

			`The generator also populates id_collector, which we use`
			`internally to enforce unique ids, but which the caller`
			`can pass in to us if they want the side effect of collecting`
			`all ids.`
			`'''`
			`if db_chunk_size is None:`
			`db_chunk_size = chunk_size // len(queries)`

			`assert db_chunk_size >= 2`
			`assert chunk_size >= 2`

			`if id_collector is not None:`
			`assert(len(id_collector) == 0)`
			`else:`
			`id_collector = set()`

			`def chunkify(q, i):`
			`# type: (Any, int) -> Iterable[Tuple[int, int, Any]]`
			`q = q.order_by('id')`
			`min_id = -1`
			`while True:`
mypy: strict optional fixes. 2017-05-24 21:28:26 +02:00			`assert db_chunk_size is not None # Hint for mypy, but also workaround for mypy bug #3442.`
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`rows = list(q.filter(id__gt=min_id)[0:db_chunk_size])`
			`if len(rows) == 0:`
			`break`
			`for row in rows:`
			`yield (row.id, i, row)`
			`min_id = rows[-1].id`

			`iterators = [chunkify(q, i) for i, q in enumerate(queries)]`
			`merged_query = heapq.merge(*iterators)`

			`while True:`
			`tup_chunk = list(itertools.islice(merged_query, 0, chunk_size))`
			`if len(tup_chunk) == 0:`
			`break`

			`# Do duplicate-id management here.`
			`tup_ids = set([tup[0] for tup in tup_chunk])`
			`assert len(tup_ids) == len(tup_chunk)`
			`assert len(tup_ids.intersection(id_collector)) == 0`
			`id_collector.update(tup_ids)`

			`yield [row for row_id, i, row in tup_chunk]`
Add option for hosting each realm on its own subdomain. This adds support for running a Zulip production server with each realm on its own unique subdomain, e.g. https://realm_name.example.com. This patch includes a ton of important features: * Configuring the Zulip session middleware to issue cookies correctly for the subdomains case. * Throwing an error if the user tries to visit an invalid subdomain. * Runs a portion of the Casper tests with REALMS_HAVE_SUBDOMAINS enabled to test the subdomain signup process. * Updating our integrations documentation to refer to the current subdomain. * Enforces that users can only login to the subdomain of their realm (but does not restrict the API; that will be tightened in a future commit). Note that toggling settings.REALMS_HAVE_SUBDOMAINS on a live server is not supported without manual intervention (the main problem will be adding "subdomain" values for all the existing realms). [substantially modified by tabbott as part of merging] 2016-07-19 14:35:08 +02:00
Add /authors page. Contributor visualization showing the avatar, user name and number of commits for each contributors. The JSON data would be updated upon deployment, triggered by the `update-prod-static` script. 2017-01-06 18:56:36 +01:00			`def split_by(array, group_size, filler):`
			`# type: (List[Any], int, Any) -> List[List[Any]]`
			`"""`
			Group elements into list of size `group_size` and fill empty cells with
			`filler`. Recipe from https://docs.python.org/3/library/itertools.html
			`"""`
			`args = [iter(array)] * group_size`
			`return list(map(list, zip_longest(*args, fillvalue=filler)))`
api: Add is_remote_server(). This function abstracts the logic to ascertain the source of API auth request. 2017-04-28 06:55:22 +02:00
			`def is_remote_server(identifier):`
			`# type: (Text) -> bool`
			`"""`
			`This function can be used to identify the source of API auth`
			`request. We can have two types of sources, Remote Zulip Servers`
			`and UserProfiles.`
			`"""`
			`return "@" not in identifier`