capitalization: Make it easier to ignore phrases.

This commit allows us to add the errors shown by
tools/check-capitalization to the IGNORED_PHRASES list
without any modification.
This commit is contained in:
Umair Khan 2017-03-10 15:47:06 +05:00 committed by Tim Abbott
parent 43b19a6997
commit 90ee06bd89
2 changed files with 45 additions and 21 deletions

View File

@ -5,10 +5,12 @@ import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
# The phrases in this list will be ignored. # The phrases in this list will be ignored. The longest phrase is
# # tried first; this removes the chance of smaller phrases changing
# Keep the sublists lexicographically sorted. # the text before longer phrases are tried.
IGNORED_PHRASES = [re.compile(regex) for regex in [ # The errors shown by `tools/check-capitalization` can be added to
# this list without any modification.
IGNORED_PHRASES = [
# Proper nouns and acronyms # Proper nouns and acronyms
r"API", r"API",
r"Cookie Bot", r"Cookie Bot",
@ -52,10 +54,18 @@ IGNORED_PHRASES = [re.compile(regex) for regex in [
r"images", r"images",
# Fragments of larger strings # Fragments of larger strings
(r'Change notification settings for individual streams on your '
'<a href="/#streams">Streams page</a>.'),
(r'<p class="bot-settings-note padded-container"> Looking for our '
'<a href="/integrations" target="_blank">Integrations</a> or '
'<a href="{{ server_uri }}/api" target="_blank">API</a> '
'documentation? </p>'),
r'Most stream administration is done on the <a href="/#streams">Streams page</a>.',
r"one or more people...", r"one or more people...",
r"confirmation email", r"confirmation email",
r"invites remaining", r"invites remaining",
r"^left$", r"^left$",
r"was too large; the maximum file size is 25MiB.",
r"^right$", r"^right$",
# SPECIAL CASES # SPECIAL CASES
@ -75,7 +85,19 @@ IGNORED_PHRASES = [re.compile(regex) for regex in [
r"argument ", r"argument ",
# I can't find this one # I can't find this one
r"text", r"text",
]] ]
# Order the phrases from longest to shortest so that a longer phrase is
# substituted before any shorter phrase gets a chance to alter the text.
# NOTE: the length compared here is that of the raw entry (any HTML
# markup included), not of the text extracted from it below.
IGNORED_PHRASES.sort(key=len, reverse=True)
# Compile every phrase once, up front, instead of on each use. Before
# compiling, each entry is reduced to its text content with BeautifulSoup
# and runs of whitespace are collapsed to single spaces, which is what
# lets entries in IGNORED_PHRASES contain HTML directly.
COMPILED_IGNORED_PHRASES = [
    re.compile(' '.join(BeautifulSoup(regex, 'lxml').text.split()))
    for regex in IGNORED_PHRASES
]
SPLIT_BOUNDARY = '?.!' # Used to split string into sentences. SPLIT_BOUNDARY = '?.!' # Used to split string into sentences.
SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY)) SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY))
@ -130,7 +152,7 @@ def get_safe_text(text):
""" """
soup = BeautifulSoup(text, 'lxml') soup = BeautifulSoup(text, 'lxml')
text = ' '.join(soup.text.split()) # Remove extra whitespaces. text = ' '.join(soup.text.split()) # Remove extra whitespaces.
for phrase_regex in IGNORED_PHRASES: for phrase_regex in COMPILED_IGNORED_PHRASES:
text = phrase_regex.sub(replace_with_safe_phrase, text) text = phrase_regex.sub(replace_with_safe_phrase, text)
return text return text
@ -156,15 +178,7 @@ def check_capitalization(strings):
errors = [] errors = []
ignored = [] ignored = []
for text in strings: for text in strings:
# Hand-skip a few that break the tool text = ' '.join(text.split()) # Remove extra whitespaces.
if 'Change notification settings for individual streams' in text:
continue
if 'was too large; the maximum file size is 25MiB.' in text:
continue
if 'Most stream administration is done on the' in text:
continue
if 'bot-settings-note padded-container' in text:
continue
safe_text = get_safe_text(text) safe_text = get_safe_text(text)
has_ignored_phrase = text != safe_text has_ignored_phrase = text != safe_text
capitalized = is_capitalized(safe_text) capitalized = is_capitalized(safe_text)

View File

@ -132,12 +132,22 @@ class CheckCapitalizationTestCase(TestCase):
"Some number 25MiB", "Some number 25MiB",
"Not Ignored Phrase", "Not Ignored Phrase",
"Not ignored phrase", "Not ignored phrase",
('<p class="bot-settings-note padded-container"> Looking for our '
'<a href="/integrations" target="_blank">Integrations</a> or '
'<a href="{{ server_uri }}/api" target="_blank">API</a> '
'documentation? </p>'),
] ]
errored, ignored = check_capitalization(strings) errored, ignored = check_capitalization(strings)
self.assertEqual(errored, ['Not Ignored Phrase']) self.assertEqual(errored, ['Not Ignored Phrase'])
self.assertEqual(ignored, sorted(["Zulip Zulip. Zulip some text!", self.assertEqual(
"Zulip Zulip? Zulip some text!", ignored,
"Zulip Zulip! Zulip some text!", sorted(["Zulip Zulip. Zulip some text!",
"Zulip Zulip, Zulip some text!", "Zulip Zulip? Zulip some text!",
"Some number 25MiB", "Zulip Zulip! Zulip some text!",
])) "Zulip Zulip, Zulip some text!",
"Some number 25MiB",
('<p class="bot-settings-note padded-container"> Looking '
'for our <a href="/integrations" target="_blank">'
'Integrations</a> or <a href="{{ server_uri }}/api" '
'target="_blank">API</a> documentation? </p>'),
]))