zulip/tools/lib/capitalization.py

from __future__ import absolute_import

from typing import List, Tuple, Set, Pattern, Match
import re

from bs4 import BeautifulSoup

# The phrases in this list will be ignored. The longest phrase is
# tried first; this removes the chance of smaller phrases changing
# the text before longer phrases are tried.
# The errors shown by `tools/check-capitalization` can be added to
# this list without any modification.
IGNORED_PHRASES = [
    # Proper nouns and acronyms
    r"API",
    r"Cookie Bot",
    r"Dropbox",
    r"GitHub",
    r"Google",
    r"HTTP",
    r"ID",
    r"IDs",
    r"JIRA",
    r"JSON",
    r"Kerberos",
    r"Mac",
    r"MiB",
    r"Pivotal",
    r'REMOTE_USER',
    r"SSO",
    r'Terms of Service',
    r"URL",
    r"Ubuntu",
    r"V5",
    r"Webathena",
    r"Windows",
    r"WordPress",
    r"XML",
    r"Zephyr",
    r"Zulip",
    r"iPhone",
    # Code things
    r".zuliprc",
    r"__\w+\.\w+__",
    # Things using "I"
    r"I say",
    r"I want",
    r"I'm",
    # Specific short words
    r"and",
    r"bot",
    r"e.g.",
    r"etc.",
    r"images",

    # Fragments of larger strings
    (r'Change notification settings for individual streams on your '
     '<a href="/#streams">Streams page</a>.'),
    (r'<p class="bot-settings-note padded-container"> Looking for our '
     '<a href="/integrations" target="_blank">Integrations</a> or '
     '<a href="{{ server_uri }}/api" target="_blank">API</a> '
     'documentation? </p>'),
    r'Most stream administration is done on the <a href="/#streams">Streams page</a>.',
    r"one or more people...",
    r"confirmation email",
    r"invites remaining",
    r"^left$",
    r"was too large; the maximum file size is 25MiB.",
    r"^right$",

    # SPECIAL CASES
    # Enter is usually capitalized
    r"Press Enter to send",
    # Because topics usually are lower-case, this would look weird if it were capitalized
    r"more topics",
    # For consistency with "more topics"
    r"more conversations",
    # We should probably just delete this string from translations
    r'activation key',

    # TO CLEAN UP
    # Just want to avoid churning login.html right now
    r"or Choose a user",
    # This is a parsing bug in the tool
    r"argument ",
    # I can't find this one
    r"text",
]

# Sort regexes in descending order of their lengths. As a result, the
# longer phrases will be ignored first.
IGNORED_PHRASES.sort(key=lambda regex: len(regex), reverse=True)

# Compile regexes to improve performance. This also extracts the
# text using BeautifulSoup and then removes extra whitespaces from
# it. This step enables us to add HTML in our regexes directly.
COMPILED_IGNORED_PHRASES = [
    re.compile(' '.join(BeautifulSoup(regex, 'lxml').text.split()))
    for regex in IGNORED_PHRASES
]

SPLIT_BOUNDARY = '?.!'  # Used to split string into sentences.
SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY))

# Regexes which check capitalization in sentences.
DISALLOWED_REGEXES = [re.compile(regex) for regex in [
    r'^[a-z]',  # Checks if the sentence starts with a lower case character.
    r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]',  # Checks if an upper case character exists
    # after a lower case character when the first character is in upper case.
]]

def get_safe_phrase(phrase):
    # type: (str) -> str
    """
    Safe phrase is in lower case and doesn't contain characters which can
    conflict with split boundaries. All conflicting characters are replaced
    with low dash (_).
    """
    phrase = SPLIT_BOUNDARY_REGEX.sub('_', phrase)
    return phrase.lower()

def replace_with_safe_phrase(matchobj):
    # type: (Match[str]) -> str
    """
    The idea is to convert IGNORED_PHRASES into safe phrases, see
    `get_safe_phrase()` function. The only exception is when the
    IGNORED_PHRASE is at the start of the text or after a split
    boundary; in this case, we change the first letter of the phrase
    to upper case.
    """
    ignored_phrase = matchobj.group(0)
    safe_string = get_safe_phrase(ignored_phrase)

    start_index = matchobj.start()
    complete_string = matchobj.string

    is_string_start = start_index == 0
    # We expect that there will be one space between split boundary
    # and the next word.
    punctuation = complete_string[max(start_index - 2, 0)]
    is_after_split_boundary = punctuation in SPLIT_BOUNDARY
    if is_string_start or is_after_split_boundary:
        return safe_string.capitalize()

    return safe_string

def get_safe_text(text):
    # type: (str) -> str
    """
    This returns text which is rendered by BeautifulSoup and is in the
    form that can be split easily and has all IGNORED_PHRASES processed.
    """
    soup = BeautifulSoup(text, 'lxml')
    text = ' '.join(soup.text.split())  # Remove extra whitespaces.
    for phrase_regex in COMPILED_IGNORED_PHRASES:
        text = phrase_regex.sub(replace_with_safe_phrase, text)

    return text

def is_capitalized(safe_text):
    # type: (str) -> bool
    sentences = SPLIT_BOUNDARY_REGEX.split(safe_text)
    sentences = [sentence.strip()
                 for sentence in sentences if sentence.strip()]

    if not sentences:
        return False

    for sentence in sentences:
        for regex in DISALLOWED_REGEXES:
            if regex.search(sentence):
                return False

    return True

def check_capitalization(strings):
    # type: (List[str]) -> Tuple[List[str], List[str]]
    errors = []
    ignored = []
    for text in strings:
        text = ' '.join(text.split())  # Remove extra whitespaces.
        safe_text = get_safe_text(text)
        has_ignored_phrase = text != safe_text
        capitalized = is_capitalized(safe_text)
        if not capitalized:
            errors.append(text)
        elif capitalized and has_ignored_phrase:
            ignored.append(text)

    return sorted(errors), sorted(ignored)
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`from __future__ import absolute_import`

			`from typing import List, Tuple, Set, Pattern, Match`
			`import re`

			`from bs4 import BeautifulSoup`

capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`# The phrases in this list will be ignored. The longest phrase is`
			`# tried first; this removes the chance of smaller phrases changing`
			`# the text before longer phrases are tried.`
			# The errors shown by `tools/check-capitalization` can be added to
			`# this list without any modification.`
			`IGNORED_PHRASES = [`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`# Proper nouns and acronyms`
			`r"API",`
			`r"Cookie Bot",`
			`r"Dropbox",`
			`r"GitHub",`
			`r"Google",`
			`r"HTTP",`
			`r"ID",`
			`r"IDs",`
			`r"JIRA",`
			`r"JSON",`
			`r"Kerberos",`
			`r"Mac",`
			`r"MiB",`
			`r"Pivotal",`
			`r'REMOTE_USER',`
			`r"SSO",`
			`r'Terms of Service',`
			`r"URL",`
			`r"Ubuntu",`
			`r"V5",`
			`r"Webathena",`
			`r"Windows",`
			`r"WordPress",`
			`r"XML",`
			`r"Zephyr",`
			`r"Zulip",`
			`r"iPhone",`
			`# Code things`
			`r".zuliprc",`
			`r"__\w+\.\w+__",`
			`# Things using "I"`
			`r"I say",`
			`r"I want",`
			`r"I'm",`
			`# Specific short words`
			`r"and",`
			`r"bot",`
			`r"e.g.",`
			`r"etc.",`
			`r"images",`

			`# Fragments of larger strings`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`(r'Change notification settings for individual streams on your '`
			`'<a href="/#streams">Streams page</a>.'),`
			`(r'<p class="bot-settings-note padded-container"> Looking for our '`
			`'<a href="/integrations" target="_blank">Integrations</a> or '`
			`'<a href="{{ server_uri }}/api" target="_blank">API</a> '`
			`'documentation? </p>'),`
			`r'Most stream administration is done on the <a href="/#streams">Streams page</a>.',`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`r"one or more people...",`
			`r"confirmation email",`
			`r"invites remaining",`
			`r"^left$",`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`r"was too large; the maximum file size is 25MiB.",`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`r"^right$",`

			`# SPECIAL CASES`
			`# Enter is usually capitalized`
			`r"Press Enter to send",`
			`# Because topics usually are lower-case, this would look weird if it were capitalized`
			`r"more topics",`
			`# For consistency with "more topics"`
			`r"more conversations",`
			`# We should probably just delete this string from translations`
			`r'activation key',`

			`# TO CLEAN UP`
			`# Just want to avoid churning login.html right now`
			`r"or Choose a user",`
			`# This is a parsing bug in the tool`
			`r"argument ",`
			`# I can't find this one`
			`r"text",`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`]`

			`# Sort regexes in descending order of their lengths. As a result, the`
			`# longer phrases will be ignored first.`
			`IGNORED_PHRASES.sort(key=lambda regex: len(regex), reverse=True)`

			`# Compile regexes to improve performance. This also extracts the`
			`# text using BeautifulSoup and then removes extra whitespaces from`
			`# it. This step enables us to add HTML in our regexes directly.`
			`COMPILED_IGNORED_PHRASES = [`
			`re.compile(' '.join(BeautifulSoup(regex, 'lxml').text.split()))`
			`for regex in IGNORED_PHRASES`
			`]`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00
			`SPLIT_BOUNDARY = '?.!' # Used to split string into sentences.`
			`SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY))`

			`# Regexes which check capitalization in sentences.`
			`DISALLOWED_REGEXES = [re.compile(regex) for regex in [`
			`r'^[a-z]', # Checks if the sentence starts with a lower case character.`
			`r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]', # Checks if an upper case character exists`
			`# after a lower case character when the first character is in upper case.`
			`]]`

			`def get_safe_phrase(phrase):`
			`# type: (str) -> str`
			`"""`
			`Safe phrase is in lower case and doesn't contain characters which can`
			`conflict with split boundaries. All conflicting characters are replaced`
			`with low dash (_).`
			`"""`
			`phrase = SPLIT_BOUNDARY_REGEX.sub('_', phrase)`
			`return phrase.lower()`

			`def replace_with_safe_phrase(matchobj):`
			`# type: (Match[str]) -> str`
			`"""`
			`The idea is to convert IGNORED_PHRASES into safe phrases, see`
			`get_safe_phrase()` function. The only exception is when the
			`IGNORED_PHRASE is at the start of the text or after a split`
			`boundary; in this case, we change the first letter of the phrase`
			`to upper case.`
			`"""`
			`ignored_phrase = matchobj.group(0)`
			`safe_string = get_safe_phrase(ignored_phrase)`

			`start_index = matchobj.start()`
			`complete_string = matchobj.string`

			`is_string_start = start_index == 0`
			`# We expect that there will be one space between split boundary`
			`# and the next word.`
			`punctuation = complete_string[max(start_index - 2, 0)]`
			`is_after_split_boundary = punctuation in SPLIT_BOUNDARY`
			`if is_string_start or is_after_split_boundary:`
			`return safe_string.capitalize()`

			`return safe_string`

			`def get_safe_text(text):`
			`# type: (str) -> str`
			`"""`
			`This returns text which is rendered by BeautifulSoup and is in the`
			`form that can be split easily and has all IGNORED_PHRASES processed.`
			`"""`
			`soup = BeautifulSoup(text, 'lxml')`
			`text = ' '.join(soup.text.split()) # Remove extra whitespaces.`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`for phrase_regex in COMPILED_IGNORED_PHRASES:`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`text = phrase_regex.sub(replace_with_safe_phrase, text)`

			`return text`

			`def is_capitalized(safe_text):`
			`# type: (str) -> bool`
			`sentences = SPLIT_BOUNDARY_REGEX.split(safe_text)`
			`sentences = [sentence.strip()`
			`for sentence in sentences if sentence.strip()]`

			`if not sentences:`
			`return False`

			`for sentence in sentences:`
			`for regex in DISALLOWED_REGEXES:`
			`if regex.search(sentence):`
			`return False`

			`return True`

			`def check_capitalization(strings):`
			`# type: (List[str]) -> Tuple[List[str], List[str]]`
			`errors = []`
			`ignored = []`
			`for text in strings:`
capitalization: Make it easier to ignore phrases. This commit allows us to add the errors shown by the tools/check-capitalization in the IGNORED_PHRASES list without any modification. 2017-03-10 11:47:06 +01:00			`text = ' '.join(text.split()) # Remove extra whitespaces.`
Add capitalization checker tool. Initial rules significantly by modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. 2017-03-03 12:42:07 +01:00			`safe_text = get_safe_text(text)`
			`has_ignored_phrase = text != safe_text`
			`capitalized = is_capitalized(safe_text)`
			`if not capitalized:`
			`errors.append(text)`
			`elif capitalized and has_ignored_phrase:`
			`ignored.append(text)`

			`return sorted(errors), sorted(ignored)`