zulip/zerver/lib/utils.py

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division

from typing import Any, Callable, Iterable, List, Optional, Sequence, Set, Tuple, TypeVar
from six import text_type, binary_type
import base64
import errno
import hashlib
import heapq
import itertools
import os
from time import sleep

from django.conf import settings
from six.moves import range
from zerver.lib.str_utils import force_text

# Generic element type: the type comment of run_in_batches uses it to tie
# the element type of the input sequence to the batches handed to its callback.
T = TypeVar('T')

def statsd_key(val, clean_periods=False):
    # type: (Any, bool) -> str
    """Turn an arbitrary value into a string usable as a statsd key.

    Non-string values are converted with str().  Everything from the
    first ':' onwards is dropped and every '-' becomes '_'.  When
    clean_periods is true, every '.' becomes '_' as well.
    """
    key = val if isinstance(val, str) else str(val)

    # str.split always returns at least one piece, so taking the first one
    # also covers keys that contain no ':' at all.
    key = key.split(':')[0].replace('-', '_')

    return key.replace('.', '_') if clean_periods else key

class StatsDWrapper(object):
    """Transparently either submit metrics to statsd
    or do nothing without erroring out.

    Only the names 'timer', 'timing', 'incr', 'decr' and 'gauge' are
    resolved dynamically; looking up any other missing attribute raises
    AttributeError naming that attribute.
    """

    # Backported support for gauge deltas
    # as our statsd server supports them but supporting
    # pystatsd is not released yet
    def _our_gauge(self, stat, value, rate=1, delta=False):
        # type: (str, float, float, bool) -> None
        """Set a gauge value.

        With delta=True the value is formatted with an explicit sign
        ('%+g|g'); otherwise it is formatted as '%g|g'.  The formatted
        string is passed to the statsd client's _send() with the given
        stat name and rate.
        """
        # Imported at call time rather than at module import.
        from django_statsd.clients import statsd
        if delta:
            value_str = '%+g|g' % (value,)
        else:
            value_str = '%g|g' % (value,)
        statsd._send(stat, value_str, rate)

    def __getattr__(self, name):
        # type: (str) -> Any
        """Resolve one of the supported metric methods.

        Python only calls this for attributes that normal lookup did not
        find.  When settings.STATSD_HOST is the empty string, a callable
        that accepts any arguments and returns None is handed back instead
        of a real statsd method.
        """
        # Hand off to statsd if we have it enabled
        # otherwise do nothing
        if name in ['timer', 'timing', 'incr', 'decr', 'gauge']:
            if settings.STATSD_HOST != '':
                from django_statsd.clients import statsd
                if name == 'gauge':
                    return self._our_gauge
                else:
                    return getattr(statsd, name)
            else:
                return lambda *args, **kwargs: None

        # Include the attribute name so a failed lookup can be diagnosed.
        raise AttributeError(
            "%r object has no attribute %r" % (type(self).__name__, name))

# Module-level StatsDWrapper instance; log_statsd_event() in this module
# sends its counters through it.
statsd = StatsDWrapper()

# Runs the callback with slices of all_list of a given batch_size
def run_in_batches(all_list, batch_size, callback, sleep_time = 0, logger = None):
    # type: (Sequence[T], int, Callable[[Sequence[T]], None], int, Optional[Callable[[str], None]]) ->  None
    """Call `callback` on consecutive slices of `all_list`.

    Each slice holds at most `batch_size` items; only the last one may be
    shorter.  Nothing is called for an empty `all_list`.  `batch_size` is
    expected to be a positive integer (zero raises ZeroDivisionError).

    Between batches, but not after the last one, the function sleeps for
    `sleep_time` seconds.  If `logger` is given, it is called with a
    progress message before each batch is processed.
    """
    total = len(all_list)
    if total == 0:
        return

    # Round up so that a trailing partial batch is counted, without adding
    # a spurious empty batch when total is an exact multiple of batch_size.
    limit = (total + batch_size - 1) // batch_size
    for i in range(limit):
        start = i * batch_size
        end = min(start + batch_size, total)
        batch = all_list[start:end]

        if logger:
            logger("Executing %s in batch %s of %s" % (end - start, i + 1, limit))

        callback(batch)

        # Only sleep if there is more work to be done.
        if i != limit - 1:
            sleep(sleep_time)

def make_safe_digest(string, hash_func=hashlib.sha1):
    # type: (text_type, Callable[[binary_type], Any]) -> text_type
    """
    Hash `string` with `hash_func` (SHA-1 unless told otherwise) and
    return the hex digest as text.
    """
    # hashlib.sha1, md5, etc. only accept bytes, so the text is encoded
    # as UTF-8 first; that keeps non-ASCII input working.
    encoded = string.encode('utf-8')
    hex_digest = hash_func(encoded).hexdigest()
    return force_text(hex_digest)


def log_statsd_event(name):
    # type: (str) -> None
    """
    Record a one-off event in statsd by incrementing the counter
    "events.<name>".

    Such events are handy as vertical markers in generated graphs,
    e.g. for a prod deploy, a bankruptcy request, or similar one-off
    occurrences.

    In graphite, the drawAsInfinite() command renders the event as a
    vertical line.
    """
    statsd.incr("events.%s" % (name,))

def generate_random_token(length):
    # type: (int) -> text_type
    """Return a random lowercase hexadecimal string of `length` characters.

    The randomness comes from os.urandom.  Odd lengths are supported: one
    extra byte is generated and the surplus hex character is dropped.

    Raises ValueError if `length` is negative.
    """
    if length < 0:
        raise ValueError("negative argument not allowed")

    # Every random byte encodes to two hex characters, so round the byte
    # count up and trim the result to exactly the requested length.
    num_bytes = (length + 1) // 2
    token = base64.b16encode(os.urandom(num_bytes)).decode('utf-8').lower()
    return token[:length]

def mkdir_p(path):
    # type: (str) -> None
    """Create `path` together with any missing parent directories.

    A directory that already exists at `path` is not an error; every
    other OSError raised by os.makedirs is propagated to the caller.
    """
    # Python doesn't have an analog to `mkdir -p` < Python 3.2.
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_dir = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_dir:
            raise

def query_chunker(queries, id_collector=None, chunk_size=1000, db_chunk_size=None):
    # type: (List[Any], Optional[Set[int]], int, Optional[int]) -> Iterable[Any]

    '''
    This merges one or more Django ascending-id queries into
    a generator that returns chunks of chunk_size row objects
    during each yield, preserving id order across all results.

    Queries should satisfy these conditions:
        - They should be Django filters.
        - They should return Django objects with "id" attributes.
        - They should be disjoint.

    The generator also populates id_collector, which we use
    internally to enforce unique ids, but which the caller
    can pass in to us if they want the side effect of collecting
    all ids.  A collector that is passed in must be empty.

    db_chunk_size is the number of rows fetched per database
    round trip for each query; it defaults to chunk_size divided
    evenly among the queries.  Both sizes must be at least 2.

    An empty list of queries yields no chunks.  Violated
    preconditions (sizes, non-empty collector, duplicate ids)
    raise AssertionError once the generator is iterated.
    '''
    if db_chunk_size is None:
        # max(..., 1) keeps an empty `queries` list from dividing by zero,
        # so it yields nothing just as it does when db_chunk_size is given.
        db_chunk_size = chunk_size // max(len(queries), 1)

    assert db_chunk_size >= 2
    assert chunk_size >= 2

    if id_collector is not None:
        assert(len(id_collector) == 0)
    else:
        id_collector = set()

    def chunkify(q, i):
        # type: (Any, int) -> Iterable[Tuple[int, int, Any]]
        # Walk one query in id order, fetching db_chunk_size rows at a
        # time and resuming after the largest id seen so far.
        q = q.order_by('id')
        min_id = -1
        while True:
            rows = list(q.filter(id__gt=min_id)[0:db_chunk_size])
            if len(rows) == 0:
                break
            for row in rows:
                # The id sorts first for heapq.merge; the query index i
                # breaks ties so row objects themselves are not compared.
                yield (row.id, i, row)
            min_id = rows[-1].id

    iterators = [chunkify(q, i) for i, q in enumerate(queries)]
    merged_query = heapq.merge(*iterators)

    while True:
        tup_chunk = list(itertools.islice(merged_query, 0, chunk_size))
        if len(tup_chunk) == 0:
            break

        # Do duplicate-id management here.
        tup_ids = {tup[0] for tup in tup_chunk}
        assert len(tup_ids) == len(tup_chunk)
        assert len(tup_ids.intersection(id_collector)) == 0
        id_collector.update(tup_ids)

        yield [tup[2] for tup in tup_chunk]
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`# -- coding: utf-8 --`
Enable absolute imports. See PEP 328[1] for details. This feature was introduced in Python 2.5 and will become mandatory in Python 3. [1]: http://www.python.org/dev/peps/pep-0328 (imported from commit 7444eeba8a08d5f91b94c7921848f2274979bd76) 2013-04-23 18:51:17 +02:00			`from __future__ import absolute_import`
Switch to using Python 3 style division everywhere. Also add testing for this to our Python 3 compatibility test suite. 2016-01-24 03:56:05 +01:00			`from __future__ import division`
Enable absolute imports. See PEP 328[1] for details. This feature was introduced in Python 2.5 and will become mandatory in Python 3. [1]: http://www.python.org/dev/peps/pep-0328 (imported from commit 7444eeba8a08d5f91b94c7921848f2274979bd76) 2013-04-23 18:51:17 +02:00
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`from typing import Any, Callable, Optional, Sequence, TypeVar, Iterable, Tuple`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`from six import text_type, binary_type`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00			`import base64`
utils: Add mkdir_p implementation. 2016-08-08 23:30:46 +02:00			`import errno`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00			`import hashlib`
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`import heapq`
			`import itertools`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00			`import os`
Remove unused imports (imported from commit 9e3050c72a2d1137b9096c6cfa1c3945341b9a56) 2013-06-27 20:03:51 +02:00			`from time import sleep`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`from django.conf import settings`
Apply Python 3 futurize transform libmodernize.fixes.fix_xrange_six. 2015-11-01 17:15:05 +01:00			`from six.moves import range`
Fix annotations related to make_safe_digest and hashes. 2016-06-12 14:22:20 +02:00			`from zerver.lib.str_utils import force_text`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`T = TypeVar('T')`

Add a management command for active user stats (imported from commit a4227858b422c48e272700880e0c21889c7ce566) 2013-04-30 23:58:59 +02:00			`def statsd_key(val, clean_periods=False):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (Any, bool) -> str`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`if not isinstance(val, str):`
			`val = str(val)`

			`if ':' in val:`
			`val = val.split(':')[0]`
			`val = val.replace('-', "_")`
Add a management command for active user stats (imported from commit a4227858b422c48e272700880e0c21889c7ce566) 2013-04-30 23:58:59 +02:00			`if clean_periods:`
			`val = val.replace('.', '_')`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00
			`return val`

			`class StatsDWrapper(object):`
			`"""Transparently either submit metrics to statsd`
			`or do nothing without erroring out"""`

			`# Backported support for gauge deltas`
			`# as our statsd server supports them but supporting`
			`# pystatsd is not released yet`
			`def _our_gauge(self, stat, value, rate=1, delta=False):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (str, float, float, bool) -> str`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`"""Set a gauge value."""`
			`from django_statsd.clients import statsd`
			`if delta:`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`value_str = '%+g\|g' % (value,)`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`else:`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`value_str = '%g\|g' % (value,)`
			`statsd._send(stat, value_str, rate)`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00
			`def __getattr__(self, name):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (str) -> Any`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`# Hand off to statsd if we have it enabled`
			`# otherwise do nothing`
			`if name in ['timer', 'timing', 'incr', 'decr', 'gauge']:`
Move zulip.com-related statsd configuration out of main settings.py. This also removes the convenient way to run statsd in the Dev VM, because we don't anticipate anyone doing that. It's just 2 lines of config to configure it anyway: STATSD_HOST = 'localhost' STATSD_PREFIX = 'user' (imported from commit 5b09422ee0e956bc7f336dd1e575634380b8bfa2) 2015-08-22 22:18:55 +02:00			`if settings.STATSD_HOST != '':`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`from django_statsd.clients import statsd`
			`if name == 'gauge':`
			`return self._our_gauge`
			`else:`
			`return getattr(statsd, name)`
			`else:`
			`return lambda args, *kwargs: None`

			`raise AttributeError`

			`statsd = StatsDWrapper()`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00
			`# Runs the callback with slices of all_list of a given batch_size`
			`def run_in_batches(all_list, batch_size, callback, sleep_time = 0, logger = None):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (Sequence[T], int, Callable[[Sequence[T]], None], int, Optional[Callable[[str], None]]) -> None`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`if len(all_list) == 0:`
			`return`

Switch to using Python 3 style division everywhere. Also add testing for this to our Python 3 compatibility test suite. 2016-01-24 03:56:05 +01:00			`limit = (len(all_list) // batch_size) + 1;`
Apply Python 3 futurize transform libmodernize.fixes.fix_xrange_six. 2015-11-01 17:15:05 +01:00			`for i in range(limit):`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`start = i*batch_size`
			`end = (i+1) * batch_size`
			`if end >= len(all_list):`
			`end = len(all_list)`
			`batch = all_list[start:end]`

			`if logger:`
			`logger("Executing %s in batch %s of %s" % (end-start, i+1, limit))`

			`callback(batch)`
Only sleep if there is more work to be done (imported from commit f8a1380e0045c9470909c088a9d262f8a714c86e) 2013-03-18 18:09:16 +01:00
			`if i != limit - 1:`
			`sleep(sleep_time)`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00
			`def make_safe_digest(string, hash_func=hashlib.sha1):`
Fix annotations related to make_safe_digest and hashes. 2016-06-12 14:22:20 +02:00			`# type: (text_type, Callable[[binary_type], Any]) -> text_type`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00			`"""`
			return a hex digest of `string`.
			`"""`
			`# hashlib.sha1, md5, etc. expect bytes, so non-ASCII strings must`
			`# be encoded.`
Fix annotations related to make_safe_digest and hashes. 2016-06-12 14:22:20 +02:00			`return force_text(hash_func(string.encode('utf-8')).hexdigest())`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00

			`def log_statsd_event(name):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (str) -> None`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`"""`
			`Sends a single event to statsd with the desired name and the current timestamp`

			`This can be used to provide vertical lines in generated graphs,`
			`for example when doing a prod deploy, bankruptcy request, or`
			`other one-off events`

			`Note that to draw this event as a vertical line in graphite`
			`you can use the drawAsInfinite() command`
			`"""`
			`event_name = "events.%s" % (name,)`
Fix bankruptcy event sent to graphite (imported from commit b016ab41ddc2636b76a49c9eb33b7becd387557d) 2013-06-07 23:53:20 +02:00			`statsd.incr(event_name)`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00
			`def generate_random_token(length):`
Fix zerver.lib.utils.generate_random_token. generate_random_token used to return a value of type six.binary_type and its return type was annotated as `str`. This commit fixes that by making it return a value of type `six.text_type` and updating the annotation accordingly. Also fix clashing annnotations. 2016-06-12 12:24:27 +02:00			`# type: (int) -> text_type`
			`return base64.b16encode(os.urandom(length // 2)).decode('utf-8').lower()`
utils: Add mkdir_p implementation. 2016-08-08 23:30:46 +02:00
			`def mkdir_p(path):`
			`# type: (str) -> None`
			# Python doesn't have an analog to `mkdir -p` < Python 3.2.
			`try:`
			`os.makedirs(path)`
			`except OSError as e:`
			`if e.errno == errno.EEXIST and os.path.isdir(path):`
			`pass`
			`else:`
			`raise`
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00
			`def query_chunker(queries, id_collector=None, chunk_size=1000, db_chunk_size=None):`
			`# type: (List[Any], Set[int], int, int) -> Iterable[Any]`

			`'''`
			`This merges one or more Django ascending-id queries into`
			`a generator that returns chunks of chunk_size row objects`
			`during each yield, preserving id order across all results..`

			`Queries should satisfy these conditions:`
			`- They should be Django filters.`
			`- They should return Django objects with "id" attributes.`
			`- They should be disjoint.`

			`The generator also populates id_collector, which we use`
			`internally to enforce unique ids, but which the caller`
			`can pass in to us if they want the side effect of collecting`
			`all ids.`
			`'''`
			`if db_chunk_size is None:`
			`db_chunk_size = chunk_size // len(queries)`

			`assert db_chunk_size >= 2`
			`assert chunk_size >= 2`

			`if id_collector is not None:`
			`assert(len(id_collector) == 0)`
			`else:`
			`id_collector = set()`

			`def chunkify(q, i):`
			`# type: (Any, int) -> Iterable[Tuple[int, int, Any]]`
			`q = q.order_by('id')`
			`min_id = -1`
			`while True:`
			`rows = list(q.filter(id__gt=min_id)[0:db_chunk_size])`
			`if len(rows) == 0:`
			`break`
			`for row in rows:`
			`yield (row.id, i, row)`
			`min_id = rows[-1].id`

			`iterators = [chunkify(q, i) for i, q in enumerate(queries)]`
			`merged_query = heapq.merge(*iterators)`

			`while True:`
			`tup_chunk = list(itertools.islice(merged_query, 0, chunk_size))`
			`if len(tup_chunk) == 0:`
			`break`

			`# Do duplicate-id management here.`
			`tup_ids = set([tup[0] for tup in tup_chunk])`
			`assert len(tup_ids) == len(tup_chunk)`
			`assert len(tup_ids.intersection(id_collector)) == 0`
			`id_collector.update(tup_ids)`

			`yield [row for row_id, i, row in tup_chunk]`