zulip/zerver/lib/utils.py

# -*- coding: utf-8 -*-

from typing import Any, Callable, List, Optional, Sequence, TypeVar, Iterable, Set, Tuple
import base64
import hashlib
import heapq
import itertools
import os
import re
import string
from time import sleep
from itertools import zip_longest

from django.conf import settings

T = TypeVar('T')

def statsd_key(val: Any, clean_periods: bool=False) -> str:
    """Normalize an arbitrary value into a string usable as a statsd key.

    The value is stringified if it is not already a `str`, anything from
    the first ':' onwards is dropped, and every '-' becomes '_'.  When
    `clean_periods` is true, every '.' is replaced with '_' as well.
    """
    key = val if isinstance(val, str) else str(val)

    # Keep only the part before the first colon (the whole string when
    # there is no colon), then swap dashes for underscores.
    key = key.split(':', 1)[0].replace('-', "_")

    return key.replace('.', '_') if clean_periods else key

class StatsDWrapper:
    """Transparently either submit metrics to statsd
    or do nothing without erroring out.

    Only the attributes 'timer', 'timing', 'incr', 'decr' and 'gauge'
    are supported.  When settings.STATSD_HOST is non-empty they resolve
    to the django_statsd client (with 'gauge' routed through
    `_our_gauge`); otherwise each resolves to a callable that accepts
    any arguments and does nothing.  Any other attribute raises
    AttributeError.
    """

    # Backported support for gauge deltas
    # as our statsd server supports them but supporting
    # pystatsd is not released yet
    def _our_gauge(self, stat: str, value: float, rate: float=1, delta: bool=False) -> None:
        """Set a gauge value.

        With `delta` true the value is formatted with an explicit sign
        ('%+g'); otherwise it is formatted as a plain number ('%g').
        The formatted payload is handed to the client's `_send`.
        """
        from django_statsd.clients import statsd
        if delta:
            value_str = '%+g|g' % (value,)
        else:
            value_str = '%g|g' % (value,)
        statsd._send(stat, value_str, rate)

    def __getattr__(self, name: str) -> Any:
        # Only called for attributes that normal lookup did not find, so
        # real methods such as _our_gauge never pass through here.
        #
        # Hand off to statsd if we have it enabled
        # otherwise do nothing
        if name in ('timer', 'timing', 'incr', 'decr', 'gauge'):
            if settings.STATSD_HOST != '':
                from django_statsd.clients import statsd
                if name == 'gauge':
                    return self._our_gauge
                return getattr(statsd, name)
            return lambda *args, **kwargs: None

        # Name the missing attribute so the failure is debuggable.
        raise AttributeError(
            "%r object has no attribute %r" % (type(self).__name__, name))

# Module-level wrapper instance; call e.g. statsd.incr(...) on it.
statsd = StatsDWrapper()

def run_in_batches(all_list: Sequence[T],
                   batch_size: int,
                   callback: Callable[[Sequence[T]], None],
                   sleep_time: int=0,
                   logger: Optional[Callable[[str], None]]=None) -> None:
    """Run the callback with slices of all_list of a given batch_size.

    The batches are consecutive, non-overlapping slices of `all_list`;
    every batch holds `batch_size` items except possibly the last one,
    and the callback is never invoked with an empty batch.  Nothing
    happens when `all_list` is empty.

    If `logger` is given, it is called with a progress message before
    each batch.  Between batches (but not after the final one) the
    function sleeps for `sleep_time` seconds.
    """
    total = len(all_list)
    if total == 0:
        return

    # Ceiling division: an exact multiple of batch_size must not produce
    # an extra, empty trailing batch.
    limit = (total + batch_size - 1) // batch_size
    for i in range(limit):
        start = i * batch_size
        end = min(start + batch_size, total)
        batch = all_list[start:end]

        if logger:
            logger("Executing %s in batch %s of %s" % (end-start, i+1, limit))

        callback(batch)

        # Only sleep if there is more work to be done.
        if i != limit - 1:
            sleep(sleep_time)

def make_safe_digest(string: str,
                     hash_func: Callable[[bytes], Any]=hashlib.sha1) -> str:
    """
    Hash `string` with `hash_func` (SHA-1 unless told otherwise) and
    return the digest as a hex string.
    """
    # The hashlib constructors only accept bytes, so the text is always
    # UTF-8 encoded first; this keeps non-ASCII input from blowing up.
    encoded = string.encode('utf-8')
    hasher = hash_func(encoded)
    return hasher.hexdigest()


def log_statsd_event(name: str) -> None:
    """
    Record a one-off event by incrementing the statsd counter
    "events.<name>".

    Useful for marking moments such as a prod deploy or a bankruptcy
    request so they can be overlaid on generated graphs.

    In graphite, wrapping the series in drawAsInfinite() renders each
    event as a vertical line.
    """
    statsd.incr("events.%s" % (name,))

def generate_random_token(length: int) -> str:
    """Return a random lowercase hexadecimal token.

    The token is built from `length // 2` bytes of os.urandom, so it has
    `length` characters for even lengths and `length - 1` for odd ones.
    """
    random_bytes = os.urandom(length // 2)
    return random_bytes.hex()

def generate_api_key() -> str:
    """Return a random 32-character alphanumeric API key.

    24 bytes from os.urandom are base64-encoded, which yields exactly 32
    characters with no padding.  The two non-alphanumeric characters of
    the base64 alphabet are replaced by two randomly picked letters or
    digits so the result contains only [A-Za-z0-9].
    """
    alphabet = string.ascii_letters + string.digits

    # Pick the two stand-ins for base64's '+' and '/'.
    substitutes = []
    for _ in range(2):
        random_byte = os.urandom(1)[0]
        substitutes.append(alphabet[random_byte % 62])
    altchars = ''.join(substitutes).encode("utf-8")

    encoded_key = base64.b64encode(os.urandom(24), altchars=altchars)
    return encoded_key.decode("utf-8")

def has_api_key_format(key: str) -> bool:
    """Return True when `key` is exactly 32 ASCII letters and/or digits."""
    matched = re.fullmatch(r"([A-Za-z0-9]){32}", key)
    return matched is not None

def query_chunker(queries: List[Any],
                  id_collector: Optional[Set[int]]=None,
                  chunk_size: int=1000,
                  db_chunk_size: Optional[int]=None) -> Iterable[Any]:
    '''
    Lazily merge one or more Django queries into a single stream of
    rows sorted by ascending id, yielded as lists of up to chunk_size
    row objects.

    What the queries must provide:
        - They are Django filters (this code calls order_by(),
          filter(id__gt=...) and slicing on them).
        - Their rows are objects with an "id" attribute.
        - No id appears in more than one query.

    Each query is read from the database db_chunk_size rows at a
    time; by default that is chunk_size divided evenly among the
    queries.

    Every id that is yielded is also added to id_collector.  We use
    it ourselves to assert that no id shows up twice, and a caller
    may supply its own (empty) set to get the full collection of ids
    as a side effect.
    '''
    if db_chunk_size is None:
        db_chunk_size = chunk_size // len(queries)

    assert db_chunk_size >= 2
    assert chunk_size >= 2

    if id_collector is None:
        id_collector = set()
    else:
        assert(len(id_collector) == 0)

    # Bind the resolved size to a fresh name so the nested generator
    # closes over a plain int rather than an Optional[int].
    rows_per_fetch = db_chunk_size

    def chunkify(q: Any, i: int) -> Iterable[Tuple[int, int, Any]]:
        # Page through one query by id, tagging every row with its id
        # and the index of the query it came from so that heapq.merge
        # can order the tuples without ever comparing row objects.
        ordered = q.order_by('id')
        last_id = -1
        page = list(ordered.filter(id__gt=last_id)[0:rows_per_fetch])
        while page:
            for row in page:
                yield (row.id, i, row)
            last_id = page[-1].id
            page = list(ordered.filter(id__gt=last_id)[0:rows_per_fetch])

    merged_query = heapq.merge(
        *[chunkify(q, i) for i, q in enumerate(queries)])

    tup_chunk = list(itertools.islice(merged_query, chunk_size))
    while tup_chunk:
        # Duplicate-id bookkeeping: ids must be unique inside this
        # chunk and must not have been seen in any earlier chunk.
        chunk_ids = {row_id for row_id, _, _ in tup_chunk}
        assert len(chunk_ids) == len(tup_chunk)
        assert chunk_ids.isdisjoint(id_collector)
        id_collector.update(chunk_ids)

        yield [row for _, _, row in tup_chunk]

        tup_chunk = list(itertools.islice(merged_query, chunk_size))

def process_list_in_batches(lst: List[Any],
                            chunk_size: int,
                            process_batch: Callable[[List[Any]], None]) -> None:
    """Call `process_batch` on consecutive `chunk_size`-sized slices of `lst`.

    Slices are taken in order starting at index 0, and processing stops
    at the first empty slice, so `process_batch` never sees an empty
    list.
    """
    start = 0
    batch = lst[start:start+chunk_size]

    while batch:
        process_batch(batch)
        start += chunk_size
        batch = lst[start:start+chunk_size]

def split_by(array: List[Any], group_size: int, filler: Any) -> List[List[Any]]:
    """
    Chop `array` into consecutive lists of `group_size` elements each,
    padding the final list with `filler` when the elements run out.
    Based on the grouper recipe from
    https://docs.python.org/3/library/itertools.html
    """
    # The same iterator object repeated group_size times: zip_longest
    # pulls from it round-robin, which deals out consecutive elements.
    shared_iter = iter(array)
    columns = [shared_iter] * group_size
    return [list(group) for group in zip_longest(*columns, fillvalue=filler)]

def is_remote_server(identifier: str) -> bool:
    """
    Tell which kind of source an API auth request comes from.  There
    are two kinds, Remote Zulip Servers and UserProfiles; an identifier
    containing "@" is not treated as a remote server, and any other
    identifier is.
    """
    if "@" in identifier:
        return False
    return True
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`# -*- coding: utf-8 -*-`
Enable absolute imports. See PEP 328[1] for details. This feature was introduced in Python 2.5 and will become mandatory in Python 3. [1]: http://www.python.org/dev/peps/pep-0328 (imported from commit 7444eeba8a08d5f91b94c7921848f2274979bd76) 2013-04-23 18:51:17 +02:00
zerver/lib: Change use of typing.Text to str. 2018-05-11 01:40:23 +02:00			`from typing import Any, Callable, List, Optional, Sequence, TypeVar, Iterable, Set, Tuple`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00			`import base64`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00			`import hashlib`
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`import heapq`
			`import itertools`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00			`import os`
utils: Add a function to check if string can be an API key. 2019-12-16 06:27:34 +01:00			`import re`
utils: Move random API key generator as generate_api_key. random_api_key, the function we use to generate random tokens for API keys, has been moved to zerver/lib/utils.py because it's used in more parts of the codebase (apart from user creation), and having it in zerver/lib/create_user.py was prone to cyclic dependencies. The function has also been renamed to generate_api_key to have an imperative name, that makes clearer what it does. 2018-08-01 11:18:37 +02:00			`import string`
Remove unused imports (imported from commit 9e3050c72a2d1137b9096c6cfa1c3945341b9a56) 2013-06-27 20:03:51 +02:00			`from time import sleep`
refactor: Remove six.moves.zip_longest import. 2017-11-05 06:39:22 +01:00			`from itertools import zip_longest`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`from django.conf import settings`

mypy type annotations for zerver/lib/utils 2016-06-03 18:39:57 +02:00			`T = TypeVar('T')`

zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`def statsd_key(val: Any, clean_periods: bool=False) -> str:`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`if not isinstance(val, str):`
			`val = str(val)`

			`if ':' in val:`
			`val = val.split(':')[0]`
			`val = val.replace('-', "_")`
Add a management command for active user stats (imported from commit a4227858b422c48e272700880e0c21889c7ce566) 2013-04-30 23:58:59 +02:00			`if clean_periods:`
			`val = val.replace('.', '_')`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00
			`return val`

zerver/lib: Remove inheritance from object. 2017-11-05 11:37:41 +01:00			`class StatsDWrapper:`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`"""Transparently either submit metrics to statsd`
			`or do nothing without erroring out"""`

			`# Backported support for gauge deltas`
			`# as our statsd server supports them but supporting`
			`# pystatsd is not released yet`
zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`def _our_gauge(self, stat: str, value: float, rate: float=1, delta: bool=False) -> None:`
tools: Upgrade Pycodestyle and fix new linter errors. Here, we are upgrading pycodestyle version from 2.4.0 to 2.5.0. Fixes: #11396. 2019-01-31 14:32:37 +01:00			`"""Set a gauge value."""`
			`from django_statsd.clients import statsd`
			`if delta:`
			`value_str = '%+g|g' % (value,)`
			`else:`
			`value_str = '%g|g' % (value,)`
			`statsd._send(stat, value_str, rate)`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00
zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`def __getattr__(self, name: str) -> Any:`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`# Hand off to statsd if we have it enabled`
			`# otherwise do nothing`
			`if name in ['timer', 'timing', 'incr', 'decr', 'gauge']:`
Move zulip.com-related statsd configuration out of main settings.py. This also removes the convenient way to run statsd in the Dev VM, because we don't anticipate anyone doing that. It's just 2 lines of config to configure it anyway: STATSD_HOST = 'localhost' STATSD_PREFIX = 'user' (imported from commit 5b09422ee0e956bc7f336dd1e575634380b8bfa2) 2015-08-22 22:18:55 +02:00			`if settings.STATSD_HOST != '':`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`from django_statsd.clients import statsd`
			`if name == 'gauge':`
			`return self._our_gauge`
			`else:`
			`return getattr(statsd, name)`
			`else:`
			`return lambda *args, **kwargs: None`

			`raise AttributeError`

			`statsd = StatsDWrapper()`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00
			`# Runs the callback with slices of all_list of a given batch_size`
zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`def run_in_batches(all_list: Sequence[T],`
			`batch_size: int,`
			`callback: Callable[[Sequence[T]], None],`
			`sleep_time: int=0,`
			`logger: Optional[Callable[[str], None]]=None) -> None:`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`if len(all_list) == 0:`
			`return`

lint: Fix E703 pep8 violations. 2016-11-09 13:44:29 +01:00			`limit = (len(all_list) // batch_size) + 1`
Apply Python 3 futurize transform libmodernize.fixes.fix_xrange_six. 2015-11-01 17:15:05 +01:00			`for i in range(limit):`
[schema][manual] Add South migration for flags on UserMessage (imported from commit bdf6cf2d5427709e52ef051e3c4a19c5fbb4851c) 2013-03-12 17:51:35 +01:00			`start = i*batch_size`
			`end = (i+1) * batch_size`
			`if end >= len(all_list):`
			`end = len(all_list)`
			`batch = all_list[start:end]`

			`if logger:`
			`logger("Executing %s in batch %s of %s" % (end-start, i+1, limit))`

			`callback(batch)`
Only sleep if there is more work to be done (imported from commit f8a1380e0045c9470909c088a9d262f8a714c86e) 2013-03-18 18:09:16 +01:00
			`if i != limit - 1:`
			`sleep(sleep_time)`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00
zerver/lib: Change use of typing.Text to str. 2018-05-11 01:40:23 +02:00			`def make_safe_digest(string: str,`
			`hash_func: Callable[[bytes], Any]=hashlib.sha1) -> str:`
Always give hashlib.sha1 and friends bytes. This fixes an experienced bug where you couldn't subscribe to a stream with non-ASCII characters (failing with a UnicodeEncodeError), as well as many other potential bugs. (imported from commit f084a4b4b597b85935655097a7b5a163811c4d71) 2013-03-20 15:31:27 +01:00			`"""`
			return a hex digest of `string`.
			`"""`
			`# hashlib.sha1, md5, etc. expect bytes, so non-ASCII strings must`
			`# be encoded.`
Avoid unneeded force_text() in make_safe_digest(). 2017-11-04 17:30:42 +01:00			`return hash_func(string.encode('utf-8')).hexdigest()`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00

zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`def log_statsd_event(name: str) -> None:`
Add statsd helpers and wrappers (imported from commit 9d5b805ae416a65ac49dda8e8e11d9831308116c) 2013-04-16 22:57:50 +02:00			`"""`
			`Sends a single event to statsd with the desired name and the current timestamp`

			`This can be used to provide vertical lines in generated graphs,`
			`for example when doing a prod deploy, bankruptcy request, or`
			`other one-off events`

			`Note that to draw this event as a vertical line in graphite`
			`you can use the drawAsInfinite() command`
			`"""`
			`event_name = "events.%s" % (name,)`
Fix bankruptcy event sent to graphite (imported from commit b016ab41ddc2636b76a49c9eb33b7becd387557d) 2013-06-07 23:53:20 +02:00			`statsd.incr(event_name)`
Move to a common random token generation function instead of several one-offs. (imported from commit 3217de5384088deff68fbffc6bd481c045a76817) 2013-08-08 16:50:58 +02:00
zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`def generate_random_token(length: int) -> str:`
utils: Cast generate_random_token to str. Having this be Text is forcing various URLs, emails, etc to be type annotated as Text. 2017-07-08 02:46:51 +02:00			`return str(base64.b16encode(os.urandom(length // 2)).decode('utf-8').lower())`
utils: Add mkdir_p implementation. 2016-08-08 23:30:46 +02:00
utils: Move random API key generator as generate_api_key. random_api_key, the function we use to generate random tokens for API keys, has been moved to zerver/lib/utils.py because it's used in more parts of the codebase (apart from user creation), and having it in zerver/lib/create_user.py was prone to cyclic dependencies. The function has also been renamed to generate_api_key to have an imperative name, that makes clearer what it does. 2018-08-01 11:18:37 +02:00			`def generate_api_key() -> str:`
			`choices = string.ascii_letters + string.digits`
			`altchars = ''.join([choices[ord(os.urandom(1)) % 62] for _ in range(2)]).encode("utf-8")`
			`api_key = base64.b64encode(os.urandom(24), altchars=altchars).decode("utf-8")`
			`return api_key`

utils: Add a function to check if string can be an API key. 2019-12-16 06:27:34 +01:00			`def has_api_key_format(key: str) -> bool:`
			`return bool(re.fullmatch(r"([A-Za-z0-9]){32}", key))`

zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`def query_chunker(queries: List[Any],`
mypy: Add explicit Optional for default=None parameters in various files. 2018-03-23 23:42:54 +01:00			`id_collector: Optional[Set[int]]=None,`
zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`chunk_size: int=1000,`
mypy: Add explicit Optional for default=None parameters in various files. 2018-03-23 23:42:54 +01:00			`db_chunk_size: Optional[int]=None) -> Iterable[Any]:`
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`'''`
			`This merges one or more Django ascending-id queries into`
			`a generator that returns chunks of chunk_size row objects`
			`during each yield, preserving id order across all results..`

			`Queries should satisfy these conditions:`
			`- They should be Django filters.`
			`- They should return Django objects with "id" attributes.`
			`- They should be disjoint.`

			`The generator also populates id_collector, which we use`
			`internally to enforce unique ids, but which the caller`
			`can pass in to us if they want the side effect of collecting`
			`all ids.`
			`'''`
			`if db_chunk_size is None:`
			`db_chunk_size = chunk_size // len(queries)`

			`assert db_chunk_size >= 2`
			`assert chunk_size >= 2`

			`if id_collector is not None:`
			`assert(len(id_collector) == 0)`
			`else:`
			`id_collector = set()`

zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`def chunkify(q: Any, i: int) -> Iterable[Tuple[int, int, Any]]:`
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`q = q.order_by('id')`
			`min_id = -1`
			`while True:`
mypy: strict optional fixes. 2017-05-24 21:28:26 +02:00			`assert db_chunk_size is not None # Hint for mypy, but also workaround for mypy bug #3442.`
export: Add lib.utils.query_chunker(). 2016-08-14 18:33:29 +02:00			`rows = list(q.filter(id__gt=min_id)[0:db_chunk_size])`
			`if len(rows) == 0:`
			`break`
			`for row in rows:`
			`yield (row.id, i, row)`
			`min_id = rows[-1].id`

			`iterators = [chunkify(q, i) for i, q in enumerate(queries)]`
			`merged_query = heapq.merge(*iterators)`

			`while True:`
			`tup_chunk = list(itertools.islice(merged_query, 0, chunk_size))`
			`if len(tup_chunk) == 0:`
			`break`

			`# Do duplicate-id management here.`
			`tup_ids = set([tup[0] for tup in tup_chunk])`
			`assert len(tup_ids) == len(tup_chunk)`
			`assert len(tup_ids.intersection(id_collector)) == 0`
			`id_collector.update(tup_ids)`

			`yield [row for row_id, i, row in tup_chunk]`
Add option for hosting each realm on its own subdomain. This adds support for running a Zulip production server with each realm on its own unique subdomain, e.g. https://realm_name.example.com. This patch includes a ton of important features: * Configuring the Zulip session middleware to issue cookies correctly for the subdomains case. * Throwing an error if the user tries to visit an invalid subdomain. * Runs a portion of the Casper tests with REALMS_HAVE_SUBDOMAINS enabled to test the subdomain signup process. * Updating our integrations documentation to refer to the current subdomain. * Enforces that users can only login to the subdomain of their realm (but does not restrict the API; that will be tightened in a future commit). Note that toggling settings.REALMS_HAVE_SUBDOMAINS on a live server is not supported without manual intervention (the main problem will be adding "subdomain" values for all the existing realms). [substantially modified by tabbott as part of merging] 2016-07-19 14:35:08 +02:00
utils: Add process_list_in_batches(). 2018-10-15 14:24:13 +02:00			`def process_list_in_batches(lst: List[Any],`
			`chunk_size: int,`
			`process_batch: Callable[[List[Any]], None]) -> None:`
			`offset = 0`

			`while True:`
			`items = lst[offset:offset+chunk_size]`
			`if not items:`
			`break`
			`process_batch(items)`
			`offset += chunk_size`

zerver/lib: Use python 3 syntax for typing. Extracted from a larger commit by tabbott because these changes will not create significant merge conflicts. 2017-11-05 11:15:10 +01:00			`def split_by(array: List[Any], group_size: int, filler: Any) -> List[List[Any]]:`
Add /authors page. Contributor visualization showing the avatar, user name and number of commits for each contributors. The JSON data would be updated upon deployment, triggered by the `update-prod-static` script. 2017-01-06 18:56:36 +01:00			`"""`
			Group elements into list of size `group_size` and fill empty cells with
			`filler`. Recipe from https://docs.python.org/3/library/itertools.html
			`"""`
			`args = [iter(array)] * group_size`
			`return list(map(list, zip_longest(*args, fillvalue=filler)))`
api: Add is_remote_server(). This function abstracts the logic to ascertain the source of API auth request. 2017-04-28 06:55:22 +02:00
zerver/lib: Change use of typing.Text to str. 2018-05-11 01:40:23 +02:00			`def is_remote_server(identifier: str) -> bool:`
api: Add is_remote_server(). This function abstracts the logic to ascertain the source of API auth request. 2017-04-28 06:55:22 +02:00			`"""`
			`This function can be used to identify the source of API auth`
			`request. We can have two types of sources, Remote Zulip Servers`
			`and UserProfiles.`
			`"""`
			`return "@" not in identifier`