from typing import List, Tuple, Set, Pattern, Match

import re

from bs4 import BeautifulSoup

# The phrases in this list will be ignored. The longest phrase is
# tried first; this removes the chance of smaller phrases changing
# the text before longer phrases are tried.
# The errors shown by `tools/check-capitalization` can be added to
# this list without any modification.
IGNORED_PHRASES = [
    # Proper nouns and acronyms
    r"Android",
    r"API",
    r"APNS",
    r"App Store",
    r"Botserver",
    r"Cookie Bot",
    r"Dropbox",
    r"GitHub",
    r"Google",
    r"HTTP",
    r"ID",
    r"IDs",
    r"JIRA",
    r"JSON",
    r"Kerberos",
    r"LDAP",
    r"Mac",
    r"macOS",
    r"MiB",
    r"OTP",
    r"Pivotal",
    r"Play Store",
    r'REMOTE_USER',
    r'Slack',
    r"SSO",
    r'Terms of Service',
    r"URL",
    r"Ubuntu",
    r"Updown",
    r"V5",
    r"Webathena",
    r"Windows",
    r"WordPress",
    r"XML",
    r"Zephyr",
    r"Zulip",
    r"iPhone",
    r"iOS",
    r"Emoji One",

    # Code things
    r".zuliprc",
    r"__\w+\.\w+__",

    # Things using "I"
    r"I say",
    r"I want",
    r"I'm",

    # Specific short words
    r"and",
    r"bot",
    r"e.g.",
    r"etc.",
    r"images",
    r"enabled",
    r"disabled",

    # Fragments of larger strings
    (r'Change notification settings for individual streams on your '
     'Streams page.'),
    (r'Looking for our '
     'Integrations or '
     'API documentation?'),
    r'Most stream administration is done on the Streams page.',
    r"one or more people...",
    r"confirmation email",
    r"invites remaining",
    r"^left$",
    r"was too large; the maximum file size is 25MiB.",
    r"^right$",
    r"selected message",
    r"a-z",

    # SPECIAL CASES
    # Enter is usually capitalized
    r"Press Enter to send",
    # Because topics usually are lower-case, this would look weird if it were capitalized
    r"more topics",
    # For consistency with "more topics"
    r"more conversations",
    # We should probably just delete this string from translations
    r'activation key',
    # this is used as a topic
    r'^hello$',

    # TO CLEAN UP
    # Just want to avoid churning login.html right now
    r"or Choose a user",
    # This is a parsing bug in the tool
    r"argument ",
    # I can't find this one
    r"text",
]

# Sort regexes in descending order of their lengths. As a result, the
# longer phrases will be ignored first.
IGNORED_PHRASES.sort(key=lambda regex: len(regex), reverse=True)

# Compile regexes to improve performance. This also extracts the
# text using BeautifulSoup and then removes extra whitespaces from
# it. This step enables us to add HTML in our regexes directly.
COMPILED_IGNORED_PHRASES = [
    re.compile(' '.join(BeautifulSoup(regex, 'lxml').text.split()))
    for regex in IGNORED_PHRASES
]

SPLIT_BOUNDARY = '?.!'  # Used to split string into sentences.
SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY))

# Regexes which check capitalization in sentences.
DISALLOWED_REGEXES = [re.compile(regex) for regex in [
    r'^[a-z]',  # Checks if the sentence starts with a lower case character.
    r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]',  # Checks if an upper case character exists
    # after a lower case character when the first character is in upper case.
]]


def get_safe_phrase(phrase):
    # type: (str) -> str
    """
    Safe phrase is in lower case and doesn't contain characters which can
    conflict with split boundaries. All conflicting characters are replaced
    with low dash (_).
    """
    phrase = SPLIT_BOUNDARY_REGEX.sub('_', phrase)
    return phrase.lower()


def replace_with_safe_phrase(matchobj):
    # type: (Match[str]) -> str
    """
    The idea is to convert IGNORED_PHRASES into safe phrases, see
    `get_safe_phrase()` function. The only exception is when the
    IGNORED_PHRASE is at the start of the text or after a split
    boundary; in this case, we change the first letter of the phrase
    to upper case.
    """
    ignored_phrase = matchobj.group(0)
    safe_string = get_safe_phrase(ignored_phrase)

    start_index = matchobj.start()
    complete_string = matchobj.string

    is_string_start = start_index == 0
    # We expect that there will be one space between split boundary
    # and the next word.
    punctuation = complete_string[max(start_index - 2, 0)]
    is_after_split_boundary = punctuation in SPLIT_BOUNDARY
    if is_string_start or is_after_split_boundary:
        return safe_string.capitalize()

    return safe_string


def get_safe_text(text):
    # type: (str) -> str
    """
    This returns text which is rendered by BeautifulSoup and is in the
    form that can be split easily and has all IGNORED_PHRASES processed.
    """
    soup = BeautifulSoup(text, 'lxml')
    text = ' '.join(soup.text.split())  # Remove extra whitespaces.
    for phrase_regex in COMPILED_IGNORED_PHRASES:
        text = phrase_regex.sub(replace_with_safe_phrase, text)

    return text


def is_capitalized(safe_text):
    # type: (str) -> bool
    sentences = SPLIT_BOUNDARY_REGEX.split(safe_text)
    sentences = [sentence.strip()
                 for sentence in sentences if sentence.strip()]
    if not sentences:
        return False

    for sentence in sentences:
        for regex in DISALLOWED_REGEXES:
            if regex.search(sentence):
                return False

    return True


def check_capitalization(strings):
    # type: (List[str]) -> Tuple[List[str], List[str]]
    errors = []
    ignored = []
    for text in strings:
        text = ' '.join(text.split())  # Remove extra whitespaces.
        safe_text = get_safe_text(text)
        has_ignored_phrase = text != safe_text
        capitalized = is_capitalized(safe_text)
        if not capitalized:
            errors.append(text)
        elif capitalized and has_ignored_phrase:
            ignored.append(text)

    return sorted(errors), sorted(ignored)