zulip/tools/lib/capitalization.py

from __future__ import absolute_import

from typing import List, Tuple, Set, Pattern, Match
import re

from bs4 import BeautifulSoup

# Phrases that the capitalization checker should leave alone.
#
# Every entry is compiled as a regular expression (after any HTML
# markup in it has been rendered to plain text), so regex
# metacharacters such as `.`, `^` and `$` keep their regex meaning
# here; e.g. `^left$` only matches when "left" is the entire string.
#
# The list is sorted by regex length before it is used, so the longest
# phrase is tried first; this removes the chance of smaller phrases
# changing the text before longer phrases are tried.
#
# The errors shown by `tools/check-capitalization` can be added to
# this list without any modification.
#
# NOTE(review): the dots in entries like ".zuliprc", "e.g." and "etc."
# are unescaped, so they match any character, not just a literal dot.
# Escaping them would change the length-based ordering -- confirm
# before tightening.
IGNORED_PHRASES = [
    # Proper nouns and acronyms
    r"Android",
    r"API",
    r"APNS",
    r"Cookie Bot",
    r"Dropbox",
    r"GitHub",
    r"Google",
    r"HTTP",
    r"ID",
    r"IDs",
    r"JIRA",
    r"JSON",
    r"Kerberos",
    r"Mac",
    r"MacOS",
    r"MiB",
    r"OTP",
    r"Pivotal",
    r'REMOTE_USER',
    r'Slack',
    r"SSO",
    r'Terms of Service',
    r"URL",
    r"Ubuntu",
    r"V5",
    r"Webathena",
    r"Windows",
    r"WordPress",
    r"XML",
    r"Zephyr",
    r"Zulip",
    r"iPhone",
    r"iOS",
    r"Emoji One",
    # Code things
    r".zuliprc",
    r"__\w+\.\w+__",
    # Things using "I"
    r"I say",
    r"I want",
    r"I'm",
    # Specific short words
    r"and",
    r"bot",
    r"e.g.",
    r"etc.",
    r"images",

    # Fragments of larger strings.  Entries containing HTML are reduced
    # to their rendered text before being compiled.
    (r'Change notification settings for individual streams on your '
     '<a href="/#streams">Streams page</a>.'),
    (r'Looking for our '
     '<a href="/integrations" target="_blank">Integrations</a> or '
     '<a href="{{ server_uri }}/api" target="_blank">API</a> '
     'documentation?'),
    r'Most stream administration is done on the <a href="/#streams">Streams page</a>.',
    r"one or more people...",
    r"confirmation email",
    r"invites remaining",
    # Anchored: ignored only when it is the whole string.
    r"^left$",
    r"was too large; the maximum file size is 25MiB.",
    # Anchored: ignored only when it is the whole string.
    r"^right$",
    r"selected message",

    # SPECIAL CASES
    # Enter is usually capitalized
    r"Press Enter to send",
    # Because topics usually are lower-case, this would look weird if it were capitalized
    r"more topics",
    # For consistency with "more topics"
    r"more conversations",
    # We should probably just delete this string from translations
    r'activation key',
    # This is used as a topic (anchored: whole string only).
    r'^hello$',

    # TO CLEAN UP
    # Just want to avoid churning login.html right now
    r"or Choose a user",
    # This is a parsing bug in the tool
    r"argument ",
    # I can't find this one
    r"text",
]

# Order the regexes longest-first (in place), so that a long phrase is
# always substituted before any shorter phrase gets a chance to touch
# the text.
IGNORED_PHRASES.sort(key=len, reverse=True)

# Pre-compile every regex once at import time. Each source string is
# first rendered through BeautifulSoup and has its whitespace collapsed
# to single spaces, which is what lets IGNORED_PHRASES entries contain
# raw HTML.
COMPILED_IGNORED_PHRASES = [
    re.compile(' '.join(BeautifulSoup(regex, 'lxml').text.split()))
    for regex in IGNORED_PHRASES
]

# Characters that terminate a sentence; used to split text into sentences.
SPLIT_BOUNDARY = '?.!'
# Character class matching any single sentence terminator.
SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY))

# A sentence that matches any of these regexes is considered to be
# wrongly capitalized.
DISALLOWED_REGEXES = [
    # The sentence starts with a lower case character.
    re.compile(r'^[a-z]'),
    # The sentence starts with an upper case letter followed by lower
    # case letters, and an upper case letter shows up later after only
    # whitespace, lower case letters or digits.
    re.compile(r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]'),
]

def get_safe_phrase(phrase):
    # type: (str) -> str
    """
    Return `phrase` in its "safe" form: every sentence-boundary
    character (any character of SPLIT_BOUNDARY) is swapped for an
    underscore and the result is lower-cased, so the phrase can no
    longer split a sentence or trip a capitalization rule.
    """
    return SPLIT_BOUNDARY_REGEX.sub('_', phrase).lower()

def replace_with_safe_phrase(matchobj):
    # type: (Match[str]) -> str
    """
    Substitution callback for the ignored-phrase regexes.

    The matched phrase is replaced by its safe form (see
    `get_safe_phrase()`). When the match sits at the very start of the
    text, or follows a split boundary, the safe form is returned with
    its first letter in upper case so the sentence still begins with a
    capital.
    """
    position = matchobj.start()
    # A split boundary is expected to be followed by exactly one space,
    # so the boundary character sits two positions before the match.
    # The index is clamped to 0 for matches within the first two
    # characters of the text.
    lookbehind = matchobj.string[max(position - 2, 0)]
    starts_sentence = position == 0 or lookbehind in SPLIT_BOUNDARY

    safe_string = get_safe_phrase(matchobj.group(0))
    return safe_string.capitalize() if starts_sentence else safe_string

def get_safe_text(text):
    # type: (str) -> str
    """
    Render `text` through BeautifulSoup, collapse its whitespace to
    single spaces and rewrite every IGNORED_PHRASES match into its safe
    form, yielding text that can be split into sentences easily.
    """
    rendered = BeautifulSoup(text, 'lxml').text
    safe_text = ' '.join(rendered.split())  # Collapse whitespace runs.
    # Patterns are applied in list order, i.e. longest regex first.
    for pattern in COMPILED_IGNORED_PHRASES:
        safe_text = pattern.sub(replace_with_safe_phrase, safe_text)
    return safe_text

def is_capitalized(safe_text):
    # type: (str) -> bool
    """
    Return True when every sentence of `safe_text` passes all of the
    DISALLOWED_REGEXES checks.

    The text is split on the split-boundary characters; blank pieces
    are discarded. Text with no non-blank sentence at all yields False.
    """
    pieces = (piece.strip() for piece in SPLIT_BOUNDARY_REGEX.split(safe_text))
    sentences = [piece for piece in pieces if piece]
    if not sentences:
        return False

    has_violation = any(
        regex.search(sentence)
        for sentence in sentences
        for regex in DISALLOWED_REGEXES
    )
    return not has_violation

def check_capitalization(strings):
    # type: (List[str]) -> Tuple[List[str], List[str]]
    """
    Check the capitalization of each string in `strings`.

    Returns a pair of sorted lists `(errors, ignored)`:
    * `errors` holds the whitespace-normalized strings that fail
      `is_capitalized()`;
    * `ignored` holds the whitespace-normalized strings that pass, but
      whose safe text differs from the normalized string.
    """
    errors = []
    ignored = []
    for raw in strings:
        normalized = ' '.join(raw.split())  # Collapse whitespace runs.
        safe_text = get_safe_text(normalized)
        if not is_capitalized(safe_text):
            errors.append(normalized)
        elif normalized != safe_text:
            ignored.append(normalized)

    return sorted(errors), sorted(ignored)
Add capitalization checker tool. Initial rules significantly modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`from __future__ import absolute_import`

			`from typing import List, Tuple, Set, Pattern, Match`
			`import re`

			`from bs4 import BeautifulSoup`

capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`# The phrases in this list will be ignored. The longest phrase is`
			`# tried first; this removes the chance of smaller phrases changing`
			`# the text before longer phrases are tried.`
			# The errors shown by `tools/check-capitalization` can be added to
			`# this list without any modification.`
			`IGNORED_PHRASES = [`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`# Proper nouns and acronyms`
lint: Mark alt argument for translation. 2017-05-26 07:38:24 +02:00			`r"Android",`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`r"API",`
capitalization: Add APNS to list of valid acronyms. 2017-07-07 19:43:02 +02:00			`r"APNS",`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`r"Cookie Bot",`
			`r"Dropbox",`
			`r"GitHub",`
			`r"Google",`
			`r"HTTP",`
			`r"ID",`
			`r"IDs",`
			`r"JIRA",`
			`r"JSON",`
			`r"Kerberos",`
			`r"Mac",`
lint: Mark alt argument for translation. 2017-05-26 07:38:24 +02:00			`r"MacOS",`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`r"MiB",`
Add mobile auth redirect to custom URI scheme (zulip://). This makes it possible for the Zulip mobile apps to use the normal web authentication/Oauth flows, so that they can support GitHub, Google, and other authentication methods we support on the backend, without needing to write significant custom mobile-app-side code for each authentication backend. This PR only provides support for Google auth; a bit more refactoring would be needed to support this for the GitHub/Social backends. Modified by tabbott to use the mobile_auth_otp library to protect the API key. 2017-03-19 20:01:01 +01:00			`r"OTP",`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`r"Pivotal",`
			`r'REMOTE_USER',`
Add Slack webhook. Adds a new webhook integration for Slack to receive messages from one's Slack team's public channels. Contains negative tests for broken, missing or invalid data. Allows two different options for integration: 1. Receive notification on a single stream with different topics for each of Slack's public channels. 2. Receive notification on different streams for each of Slack's public channels. Steps to choose between the two options are described in the documentation. Fixes #3569. 2017-01-30 21:18:41 +01:00			`r'Slack',`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`r"SSO",`
			`r'Terms of Service',`
			`r"URL",`
			`r"Ubuntu",`
			`r"V5",`
			`r"Webathena",`
			`r"Windows",`
			`r"WordPress",`
			`r"XML",`
			`r"Zephyr",`
			`r"Zulip",`
			`r"iPhone",`
lint: Add iOS to list of valid weird capitalizations. 2017-05-27 05:59:32 +02:00			`r"iOS",`
capitalization: Flag Emoji One as allowed. 2017-04-25 08:15:39 +02:00			`r"Emoji One",`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`# Code things`
			`r".zuliprc",`
			`r"__\w+\.\w+__",`
			`# Things using "I"`
			`r"I say",`
			`r"I want",`
			`r"I'm",`
			`# Specific short words`
			`r"and",`
			`r"bot",`
			`r"e.g.",`
			`r"etc.",`
			`r"images",`

			`# Fragments of larger strings`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`(r'Change notification settings for individual streams on your '`
			`'<a href="/#streams">Streams page</a>.'),`
i18n: Automatically strip Handlebars strings. Some Handlebars strings contained whitespace characters at their ends. With this, such characters are removed, as well as multiple spaces (like the ones produced by code indentation). This also includes a couple of fixes that remove spaces that were intentionally placed before/after the string to translate. 2017-03-27 23:25:43 +02:00			`(r'Looking for our '`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`'<a href="/integrations" target="_blank">Integrations</a> or '`
			`'<a href="{{ server_uri }}/api" target="_blank">API</a> '`
i18n: Automatically strip Handlebars strings. Some Handlebars strings contained whitespaces characters at their ends. With this, such characters are removed, as well as multiple spaces (like the ones produced by code indentation). This also includes a couple of fixes that removes spaces that were intentionally placed before/after the string to translate. 2017-03-27 23:25:43 +02:00			`'documentation?'),`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`r'Most stream administration is done on the <a href="/#streams">Streams page</a>.',`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`r"one or more people...",`
			`r"confirmation email",`
			`r"invites remaining",`
			`r"^left$",`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`r"was too large; the maximum file size is 25MiB.",`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`r"^right$",`
capitalization: Add exception for selected message. This fixes a test that's been failing overnight. 2017-03-20 15:52:46 +01:00			`r"selected message",`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00
			`# SPECIAL CASES`
			`# Enter is usually capitalized`
			`r"Press Enter to send",`
			`# Because topics usually are lower-case, this would look weird if it were capitalized`
			`r"more topics",`
			`# For consistency with "more topics"`
			`r"more conversations",`
			`# We should probably just delete this string from translations`
			`r'activation key',`
Send welcome messages for new streams. 2017-04-27 00:03:21 +02:00			`# this is used as a topic`
			`r'^hello$',`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00
			`# TO CLEAN UP`
			`# Just want to avoid churning login.html right now`
			`r"or Choose a user",`
			`# This is a parsing bug in the tool`
			`r"argument ",`
			`# I can't find this one`
			`r"text",`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`]`

			`# Sort regexes in descending order of their lengths. As a result, the`
			`# longer phrases will be ignored first.`
			`IGNORED_PHRASES.sort(key=lambda regex: len(regex), reverse=True)`

			`# Compile regexes to improve performance. This also extracts the`
			`# text using BeautifulSoup and then removes extra whitespaces from`
			`# it. This step enables us to add HTML in our regexes directly.`
			`COMPILED_IGNORED_PHRASES = [`
			`re.compile(' '.join(BeautifulSoup(regex, 'lxml').text.split()))`
			`for regex in IGNORED_PHRASES`
			`]`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00
			`SPLIT_BOUNDARY = '?.!' # Used to split string into sentences.`
			`SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY))`

			`# Regexes which check capitalization in sentences.`
			`DISALLOWED_REGEXES = [re.compile(regex) for regex in [`
			`r'^[a-z]', # Checks if the sentence starts with a lower case character.`
			`r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]', # Checks if an upper case character exists`
			`# after a lower case character when the first character is in upper case.`
			`]]`

			`def get_safe_phrase(phrase):`
			`# type: (str) -> str`
			`"""`
			`Safe phrase is in lower case and doesn't contain characters which can`
			`conflict with split boundaries. All conflicting characters are replaced`
			`with low dash (_).`
			`"""`
			`phrase = SPLIT_BOUNDARY_REGEX.sub('_', phrase)`
			`return phrase.lower()`

			`def replace_with_safe_phrase(matchobj):`
			`# type: (Match[str]) -> str`
			`"""`
			`The idea is to convert IGNORED_PHRASES into safe phrases, see`
			`get_safe_phrase()` function. The only exception is when the
			`IGNORED_PHRASE is at the start of the text or after a split`
			`boundary; in this case, we change the first letter of the phrase`
			`to upper case.`
			`"""`
			`ignored_phrase = matchobj.group(0)`
			`safe_string = get_safe_phrase(ignored_phrase)`

			`start_index = matchobj.start()`
			`complete_string = matchobj.string`

			`is_string_start = start_index == 0`
			`# We expect that there will be one space between split boundary`
			`# and the next word.`
			`punctuation = complete_string[max(start_index - 2, 0)]`
			`is_after_split_boundary = punctuation in SPLIT_BOUNDARY`
			`if is_string_start or is_after_split_boundary:`
			`return safe_string.capitalize()`

			`return safe_string`

			`def get_safe_text(text):`
			`# type: (str) -> str`
			`"""`
			`This returns text which is rendered by BeautifulSoup and is in the`
			`form that can be split easily and has all IGNORED_PHRASES processed.`
			`"""`
			`soup = BeautifulSoup(text, 'lxml')`
			`text = ' '.join(soup.text.split()) # Remove extra whitespaces.`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`for phrase_regex in COMPILED_IGNORED_PHRASES:`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`text = phrase_regex.sub(replace_with_safe_phrase, text)`

			`return text`

			`def is_capitalized(safe_text):`
			`# type: (str) -> bool`
			`sentences = SPLIT_BOUNDARY_REGEX.split(safe_text)`
			`sentences = [sentence.strip()`
			`for sentence in sentences if sentence.strip()]`

			`if not sentences:`
			`return False`

			`for sentence in sentences:`
			`for regex in DISALLOWED_REGEXES:`
			`if regex.search(sentence):`
			`return False`

			`return True`

			`def check_capitalization(strings):`
			`# type: (List[str]) -> Tuple[List[str], List[str]]`
			`errors = []`
			`ignored = []`
			`for text in strings:`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`text = ' '.join(text.split()) # Remove extra whitespaces.`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`safe_text = get_safe_text(text)`
			`has_ignored_phrase = text != safe_text`
			`capitalized = is_capitalized(safe_text)`
			`if not capitalized:`
			`errors.append(text)`
			`elif capitalized and has_ignored_phrase:`
			`ignored.append(text)`

			`return sorted(errors), sorted(ignored)`