zulip/zerver/lib/utils.py

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division

from typing import Any, Callable, Iterable, List, Optional, Sequence, Set, Tuple, TypeVar
from six import text_type, binary_type
import base64
import errno
import hashlib
import heapq
import itertools
import os
from time import sleep

from django.conf import settings
from django.http import HttpRequest
from six.moves import range
from zerver.lib.str_utils import force_text

# Generic element type: in run_in_batches' type comment it ties the element
# type of the input sequence to the element type of each batch passed to the
# callback.
T = TypeVar('T')

def statsd_key(val, clean_periods=False):
    # type: (Any, bool) -> str
    """Turn an arbitrary value into a string usable as a statsd key.

    The value is stringified, everything from the first ':' onwards is
    dropped, and dashes become underscores.  When `clean_periods` is
    true, periods are replaced with underscores as well.
    """
    key = val if isinstance(val, str) else str(val)

    # Keep only the text in front of the first colon (if there is one).
    key = key.partition(':')[0]
    key = key.replace('-', "_")

    return key.replace('.', '_') if clean_periods else key

class StatsDWrapper(object):
    """Transparently either submit metrics to statsd
    or do nothing without erroring out.

    Only the metric methods 'timer', 'timing', 'incr', 'decr' and 'gauge'
    are exposed.  When settings.STATSD_HOST is the empty string each of
    them resolves to a callable that accepts anything and does nothing;
    otherwise they resolve to the django_statsd client (with 'gauge'
    replaced by the delta-capable backport below).  Any other attribute
    raises AttributeError.
    """

    # Backported support for gauge deltas
    # as our statsd server supports them but supporting
    # pystatsd is not released yet
    def _our_gauge(self, stat, value, rate=1, delta=False):
        # type: (str, float, float, bool) -> None
        """Set a gauge value.

        With `delta` true the value is formatted with an explicit sign
        ('%+g') so that it is sent as a relative change rather than an
        absolute value.
        """
        from django_statsd.clients import statsd
        if delta:
            value_str = '%+g|g' % (value,)
        else:
            value_str = '%g|g' % (value,)
        statsd._send(stat, value_str, rate)

    def __getattr__(self, name):
        # type: (str) -> Any
        # Hand off to statsd if we have it enabled
        # otherwise do nothing
        if name in ('timer', 'timing', 'incr', 'decr', 'gauge'):
            if settings.STATSD_HOST != '':
                from django_statsd.clients import statsd
                if name == 'gauge':
                    return self._our_gauge
                return getattr(statsd, name)
            # statsd is not configured: swallow the call silently.
            return lambda *args, **kwargs: None

        # Name the missing attribute so failures are debuggable.
        raise AttributeError(
            "%r object has no attribute %r" % (type(self).__name__, name))

# Module-level wrapper instance; log_statsd_event in this module sends its
# counter through it.  Note that StatsDWrapper's methods locally import the
# django_statsd client under this same name.
statsd = StatsDWrapper()

# Runs the callback with slices of all_list of a given batch_size
def run_in_batches(all_list, batch_size, callback, sleep_time=0, logger=None):
    # type: (Sequence[T], int, Callable[[Sequence[T]], None], int, Optional[Callable[[str], None]]) ->  None
    """Call `callback` on consecutive slices of `all_list`.

    all_list:   the sequence to process; nothing happens if it is empty.
    batch_size: maximum number of items per slice; must be at least 1.
    callback:   invoked once per slice, in order.
    sleep_time: seconds to sleep between slices (never after the last one).
    logger:     optional callable that receives one progress message
                per slice, just before the callback runs.

    Raises ValueError if `all_list` is non-empty and `batch_size` < 1.
    """
    total = len(all_list)
    if total == 0:
        return

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

    # Ceiling division: the previous `total // batch_size + 1` produced a
    # trailing empty batch (and an extra sleep) whenever total was an
    # exact multiple of batch_size.
    limit = (total + batch_size - 1) // batch_size
    for i in range(limit):
        start = i * batch_size
        end = min(start + batch_size, total)
        batch = all_list[start:end]

        if logger:
            logger("Executing %s in batch %s of %s" % (end-start, i+1, limit))

        callback(batch)

        # Only pause when there is more work to do.
        if i != limit - 1:
            sleep(sleep_time)

def make_safe_digest(string, hash_func=hashlib.sha1):
    # type: (text_type, Callable[[binary_type], Any]) -> text_type
    """
    Hash `string` with `hash_func` (SHA-1 unless told otherwise) and
    return the hexadecimal digest as text.
    """
    # The hashlib constructors only accept bytes, so the text is
    # UTF-8 encoded first; this keeps non-ASCII input working.
    encoded = string.encode('utf-8')
    hex_digest = hash_func(encoded).hexdigest()
    return force_text(hex_digest)


def log_statsd_event(name):
    # type: (str) -> None
    """
    Record a one-off event in statsd by incrementing the counter
    "events.<name>".

    Useful for marking moments such as a prod deploy or a bankruptcy
    request so that they can be shown as vertical lines on generated
    graphs; in graphite, drawAsInfinite() renders the event that way.
    """
    statsd.incr("events.%s" % (name,))

def generate_random_token(length):
    # type: (int) -> text_type
    """Return a random lowercase hexadecimal token of exactly `length` characters.

    The randomness comes from os.urandom.  Raises ValueError if `length`
    is negative.
    """
    if length < 0:
        raise ValueError("length must be non-negative, got %r" % (length,))

    # Each random byte becomes two hex characters.  Round the byte count
    # up and trim, so that odd lengths are honoured instead of silently
    # coming back one character short.
    num_bytes = (length + 1) // 2
    token = base64.b16encode(os.urandom(num_bytes)).decode('utf-8').lower()
    return token[:length]

def mkdir_p(path):
    # type: (str) -> None
    """Create `path` and any missing parents, like `mkdir -p`.

    An already existing directory is not an error; every other OSError
    (including `path` existing as a non-directory) is re-raised.
    """
    # Python doesn't have an analog to `mkdir -p` < Python 3.2.
    try:
        os.makedirs(path)
    except OSError as err:
        directory_already_there = err.errno == errno.EEXIST and os.path.isdir(path)
        if not directory_already_there:
            raise

def query_chunker(queries, id_collector=None, chunk_size=1000, db_chunk_size=None):
    # type: (List[Any], Optional[Set[int]], int, Optional[int]) -> Iterable[Any]
    '''
    This merges one or more Django ascending-id queries into
    a generator that returns chunks of chunk_size row objects
    during each yield, preserving id order across all results.

    Queries should satisfy these conditions:
        - They should be Django filters.
        - They should return Django objects with "id" attributes.
        - They should be disjoint.

    The generator also populates id_collector, which we use
    internally to enforce unique ids, but which the caller
    can pass in to us if they want the side effect of collecting
    all ids.  If passed in, it must be empty.

    db_chunk_size is the number of rows fetched per database round
    trip for each query; it defaults to chunk_size divided evenly
    among the queries.  Both sizes must be at least 2.

    An empty list of queries yields nothing.
    '''
    if db_chunk_size is None:
        # max() keeps an empty `queries` list from dividing by zero;
        # with no queries there is simply nothing to yield.
        db_chunk_size = chunk_size // max(len(queries), 1)

    assert db_chunk_size >= 2
    assert chunk_size >= 2

    if id_collector is not None:
        assert(len(id_collector) == 0)
    else:
        id_collector = set()

    def chunkify(q, i):
        # type: (Any, int) -> Iterable[Tuple[int, int, Any]]
        # Page through one query in id order, db_chunk_size rows at a
        # time, resuming after the largest id seen so far.
        q = q.order_by('id')
        min_id = -1
        while True:
            rows = list(q.filter(id__gt=min_id)[0:db_chunk_size])
            if len(rows) == 0:
                break
            for row in rows:
                # (id, query index, row): heapq.merge orders on id first,
                # with the query index as the tie-breaker.
                yield (row.id, i, row)
            min_id = rows[-1].id

    iterators = [chunkify(q, i) for i, q in enumerate(queries)]
    merged_query = heapq.merge(*iterators)

    while True:
        tup_chunk = list(itertools.islice(merged_query, 0, chunk_size))
        if len(tup_chunk) == 0:
            break

        # Do duplicate-id management here.
        tup_ids = {tup[0] for tup in tup_chunk}
        assert len(tup_ids) == len(tup_chunk)
        assert len(tup_ids.intersection(id_collector)) == 0
        id_collector.update(tup_ids)

        yield [row for row_id, i, row in tup_chunk]

def get_subdomain(request):
    # type: (HttpRequest) -> text_type
    """Return the realm subdomain the request was addressed to.

    The lowercased host of the request must end in
    "." + settings.EXTERNAL_HOST, optionally followed by ":<port>";
    the text in front of that suffix is the subdomain.  The empty
    string is returned when the host does not have that form, or when
    the subdomain is listed in settings.ROOT_SUBDOMAIN_ALIASES.
    """
    domain = request.get_host().lower()
    suffix = "." + settings.EXTERNAL_HOST

    # Look for the external host at the END of the host value.  A plain
    # substring search would also accept hosts that merely contain it,
    # e.g. "foo.example.community" or "foo.example.com.evil.org" for an
    # external host of "example.com".
    index = domain.rfind(suffix)
    if index == -1:
        return ""

    # Anything after the external host must be absent or a ":port".
    trailer = domain[index + len(suffix):]
    if trailer and not (trailer.startswith(":") and trailer[1:].isdigit()):
        return ""

    subdomain = domain[0:index]
    if subdomain in settings.ROOT_SUBDOMAIN_ALIASES:
        return ""
    return subdomain

def check_subdomain(realm_subdomain, user_subdomain):
    # type: (Optional[text_type], Optional[text_type]) -> bool
    """Report whether `user_subdomain` is acceptable for a realm whose
    subdomain is `realm_subdomain`.

    Everything is accepted when settings.REALMS_HAVE_SUBDOMAINS is off
    or the realm has no subdomain (None).  Otherwise the two values must
    be equal, except that a user subdomain of None is accepted for a
    realm whose subdomain is the empty string.
    """
    # Nothing to enforce in these configurations.
    if not settings.REALMS_HAVE_SUBDOMAINS or realm_subdomain is None:
        return True

    # A missing user subdomain is fine for the empty realm subdomain.
    if realm_subdomain == "" and user_subdomain is None:
        return True

    return realm_subdomain == user_subdomain
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`# -*- coding: utf-8 -*-`
Enable absolute imports. See PEP 328[1] for details. This feature was introduced in Python 2.5 and will become mandatory in Python 3. [1]: http://www.python.org/dev/peps/pep-0328 (imported from commit 7444eeba8a08d5f91b94c7921848f2274979bd76) 2013-04-23 18:51:17 +02:00			`from __future__ import absolute_import`
Switch to using Python 3 style division everywhere. Also add testing for this to our Python 3 compatibility test suite. 2016-01-24 03:56:05 +01:00			`from __future__ import division`
Enable absolute imports. See PEP 328[1] for details. This feature was introduced in Python 2.5 and will become mandatory in Python 3. [1]: http://www.python.org/dev/peps/pep-0328 (imported from commit 7444eeba8a08d5f91b94c7921848f2274979bd76) 2013-04-23 18:51:17 +02:00
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`from typing import Any, Callable, Optional, Sequence, TypeVar, Iterable, Tuple`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`from six import text_type, binary_type`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00			`import base64`
utils: Add mkdir_p implementation. 2016-08-08 23:30:46 +02:00			`import errno`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00			`import hashlib`
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`import heapq`
			`import itertools`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00			`import os`
Remove unused imports (imported from commit 9e3050c72a2d1137b9096c6cfa1c3945341b9a56) 2013-06-27 20:03:51 +02:00			`from time import sleep`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`from django.conf import settings`
Add option for hosting each realm on its own subdomain. This adds support for running a Zulip production server with each realm on its own unique subdomain, e.g. https://realm_name.example.com. This patch includes a ton of important features: * Configuring the Zulip session middleware to issue cookies correctly for the subdomains case. * Throwing an error if the user tries to visit an invalid subdomain. * Runs a portion of the Casper tests with REALMS_HAVE_SUBDOMAINS enabled to test the subdomain signup process. * Updating our integrations documentation to refer to the current subdomain. * Enforces that users can only login to the subdomain of their realm (but does not restrict the API; that will be tightened in a future commit). Note that toggling settings.REALMS_HAVE_SUBDOMAINS on a live server is not supported without manual intervention (the main problem will be adding "subdomain" values for all the existing realms). [substantially modified by tabbott as part of merging] 2016-07-19 14:35:08 +02:00			`from django.http import HttpRequest`
Apply Python 3 futurize transform libmodernize.fixes.fix_xrange_six. 2015-11-01 17:15:05 +01:00			`from six.moves import range`
Fix annotations related to make_safe_digest and hashes. 2016-06-12 14:22:20 +02:00			`from zerver.lib.str_utils import force_text`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`T = TypeVar('T')`

Add a management command for active user stats (imported from commit a4227858b422c48e272700880e0c21889c7ce566) 2013-04-30 23:58:59 +02:00			`def statsd_key(val, clean_periods=False):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (Any, bool) -> str`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`if not isinstance(val, str):`
			`val = str(val)`

			`if ':' in val:`
			`val = val.split(':')[0]`
			`val = val.replace('-', "_")`
Add a management command for active user stats (imported from commit a4227858b422c48e272700880e0c21889c7ce566) 2013-04-30 23:58:59 +02:00			`if clean_periods:`
			`val = val.replace('.', '_')`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00
			`return val`

			`class StatsDWrapper(object):`
			`"""Transparently either submit metrics to statsd`
			`or do nothing without erroring out"""`

			`# Backported support for gauge deltas`
			`# as our statsd server supports them but supporting`
			`# pystatsd is not released yet`
			`def _our_gauge(self, stat, value, rate=1, delta=False):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (str, float, float, bool) -> str`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`"""Set a gauge value."""`
			`from django_statsd.clients import statsd`
			`if delta:`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`value_str = '%+g|g' % (value,)`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`else:`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`value_str = '%g|g' % (value,)`
			`statsd._send(stat, value_str, rate)`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00
			`def __getattr__(self, name):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (str) -> Any`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`# Hand off to statsd if we have it enabled`
			`# otherwise do nothing`
			`if name in ['timer', 'timing', 'incr', 'decr', 'gauge']:`
Move zulip.com-related statsd configuration out of main settings.py. This also removes the convenient way to run statsd in the Dev VM, because we don't anticipate anyone doing that. It's just 2 lines of config to configure it anyway: STATSD_HOST = 'localhost' STATSD_PREFIX = 'user' (imported from commit 5b09422ee0e956bc7f336dd1e575634380b8bfa2) 2015-08-22 22:18:55 +02:00			`if settings.STATSD_HOST != '':`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`from django_statsd.clients import statsd`
			`if name == 'gauge':`
			`return self._our_gauge`
			`else:`
			`return getattr(statsd, name)`
			`else:`
			`return lambda *args, **kwargs: None`

			`raise AttributeError`

			`statsd = StatsDWrapper()`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00
			`# Runs the callback with slices of all_list of a given batch_size`
			`def run_in_batches(all_list, batch_size, callback, sleep_time = 0, logger = None):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (Sequence[T], int, Callable[[Sequence[T]], None], int, Optional[Callable[[str], None]]) -> None`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`if len(all_list) == 0:`
			`return`

lint: Fix E703 pep8 violations. 2016-11-09 13:44:29 +01:00			`limit = (len(all_list) // batch_size) + 1`
Apply Python 3 futurize transform libmodernize.fixes.fix_xrange_six. 2015-11-01 17:15:05 +01:00			`for i in range(limit):`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`start = i*batch_size`
			`end = (i+1) * batch_size`
			`if end >= len(all_list):`
			`end = len(all_list)`
			`batch = all_list[start:end]`

			`if logger:`
			`logger("Executing %s in batch %s of %s" % (end-start, i+1, limit))`

			`callback(batch)`
Only sleep if there is more work to be done (imported from commit f8a1380e0045c9470909c088a9d262f8a714c86e) 2013-03-18 18:09:16 +01:00
			`if i != limit - 1:`
			`sleep(sleep_time)`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00
			`def make_safe_digest(string, hash_func=hashlib.sha1):`
Fix annotations related to make_safe_digest and hashes. 2016-06-12 14:22:20 +02:00			`# type: (text_type, Callable[[binary_type], Any]) -> text_type`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00			`"""`
			return a hex digest of `string`.
			`"""`
			`# hashlib.sha1, md5, etc. expect bytes, so non-ASCII strings must`
			`# be encoded.`
Fix annotations related to make_safe_digest and hashes. 2016-06-12 14:22:20 +02:00			`return force_text(hash_func(string.encode('utf-8')).hexdigest())`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00

			`def log_statsd_event(name):`
mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`# type: (str) -> None`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`"""`
			`Sends a single event to statsd with the desired name and the current timestamp`

			`This can be used to provide vertical lines in generated graphs,`
			`for example when doing a prod deploy, bankruptcy request, or`
			`other one-off events`

			`Note that to draw this event as a vertical line in graphite`
			`you can use the drawAsInfinite() command`
			`"""`
			`event_name = "events.%s" % (name,)`
Fix bankruptcy event sent to graphite (imported from commit b016ab41ddc2636b76a49c9eb33b7becd387557d) 2013-06-07 23:53:20 +02:00			`statsd.incr(event_name)`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00
			`def generate_random_token(length):`
Fix zerver.lib.utils.generate_random_token. generate_random_token used to return a value of type six.binary_type and its return type was annotated as `str`. This commit fixes that by making it return a value of type `six.text_type` and updating the annotation accordingly. Also fix clashing annotations. 2016-06-12 12:24:27 +02:00			`# type: (int) -> text_type`
			`return base64.b16encode(os.urandom(length // 2)).decode('utf-8').lower()`
utils: Add mkdir_p implementation. 2016-08-08 23:30:46 +02:00
			`def mkdir_p(path):`
			`# type: (str) -> None`
			# Python doesn't have an analog to `mkdir -p` < Python 3.2.
			`try:`
			`os.makedirs(path)`
			`except OSError as e:`
			`if e.errno == errno.EEXIST and os.path.isdir(path):`
			`pass`
			`else:`
			`raise`
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00
			`def query_chunker(queries, id_collector=None, chunk_size=1000, db_chunk_size=None):`
			`# type: (List[Any], Set[int], int, int) -> Iterable[Any]`
			`'''`
			`This merges one or more Django ascending-id queries into`
			`a generator that returns chunks of chunk_size row objects`
			`during each yield, preserving id order across all results..`

			`Queries should satisfy these conditions:`
			`- They should be Django filters.`
			`- They should return Django objects with "id" attributes.`
			`- They should be disjoint.`

			`The generator also populates id_collector, which we use`
			`internally to enforce unique ids, but which the caller`
			`can pass in to us if they want the side effect of collecting`
			`all ids.`
			`'''`
			`if db_chunk_size is None:`
			`db_chunk_size = chunk_size // len(queries)`

			`assert db_chunk_size >= 2`
			`assert chunk_size >= 2`

			`if id_collector is not None:`
			`assert(len(id_collector) == 0)`
			`else:`
			`id_collector = set()`

			`def chunkify(q, i):`
			`# type: (Any, int) -> Iterable[Tuple[int, int, Any]]`
			`q = q.order_by('id')`
			`min_id = -1`
			`while True:`
			`rows = list(q.filter(id__gt=min_id)[0:db_chunk_size])`
			`if len(rows) == 0:`
			`break`
			`for row in rows:`
			`yield (row.id, i, row)`
			`min_id = rows[-1].id`

			`iterators = [chunkify(q, i) for i, q in enumerate(queries)]`
			`merged_query = heapq.merge(*iterators)`

			`while True:`
			`tup_chunk = list(itertools.islice(merged_query, 0, chunk_size))`
			`if len(tup_chunk) == 0:`
			`break`

			`# Do duplicate-id management here.`
			`tup_ids = set([tup[0] for tup in tup_chunk])`
			`assert len(tup_ids) == len(tup_chunk)`
			`assert len(tup_ids.intersection(id_collector)) == 0`
			`id_collector.update(tup_ids)`

			`yield [row for row_id, i, row in tup_chunk]`
Add option for hosting each realm on its own subdomain. This adds support for running a Zulip production server with each realm on its own unique subdomain, e.g. https://realm_name.example.com. This patch includes a ton of important features: * Configuring the Zulip session middleware to issue cookies correctly for the subdomains case. * Throwing an error if the user tries to visit an invalid subdomain. * Runs a portion of the Casper tests with REALMS_HAVE_SUBDOMAINS enabled to test the subdomain signup process. * Updating our integrations documentation to refer to the current subdomain. * Enforces that users can only login to the subdomain of their realm (but does not restrict the API; that will be tightened in a future commit). Note that toggling settings.REALMS_HAVE_SUBDOMAINS on a live server is not supported without manual intervention (the main problem will be adding "subdomain" values for all the existing realms). [substantially modified by tabbott as part of merging] 2016-07-19 14:35:08 +02:00
			`def get_subdomain(request):`
			`# type: (HttpRequest) -> text_type`
			`domain = request.get_host().lower()`
			`index = domain.find("." + settings.EXTERNAL_HOST)`
			`if index == -1:`
			`return ""`
			`subdomain = domain[0:index]`
subdomains: Add support for aliases of the root subdomain. We default to counting "www" as such an alias. 2016-08-19 05:30:16 +02:00			`if subdomain in settings.ROOT_SUBDOMAIN_ALIASES:`
			`return ""`
Add option for hosting each realm on its own subdomain. This adds support for running a Zulip production server with each realm on its own unique subdomain, e.g. https://realm_name.example.com. This patch includes a ton of important features: * Configuring the Zulip session middleware to issue cookies correctly for the subdomains case. * Throwing an error if the user tries to visit an invalid subdomain. * Runs a portion of the Casper tests with REALMS_HAVE_SUBDOMAINS enabled to test the subdomain signup process. * Updating our integrations documentation to refer to the current subdomain. * Enforces that users can only login to the subdomain of their realm (but does not restrict the API; that will be tightened in a future commit). Note that toggling settings.REALMS_HAVE_SUBDOMAINS on a live server is not supported without manual intervention (the main problem will be adding "subdomain" values for all the existing realms). [substantially modified by tabbott as part of merging] 2016-07-19 14:35:08 +02:00			`return subdomain`

			`def check_subdomain(realm_subdomain, user_subdomain):`
			`# type: (text_type, text_type) -> bool`
			`if settings.REALMS_HAVE_SUBDOMAINS and realm_subdomain is not None:`
			`if (realm_subdomain == "" and user_subdomain is None):`
			`return True`
			`if realm_subdomain != user_subdomain:`
			`return False`
			`return True`