capitalization: Make it easier to ignore phrases.

This commit allows the errors reported by
tools/check-capitalization to be added to the IGNORED_PHRASES list
without any modification.
This commit is contained in:
Umair Khan 2017-03-10 15:47:06 +05:00 committed by Tim Abbott
parent 43b19a6997
commit 90ee06bd89
2 changed files with 45 additions and 21 deletions

View File

@ -5,10 +5,12 @@ import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
# The phrases in this list will be ignored. # The phrases in this list will be ignored. The longest phrase is
# # tried first; this removes the chance of smaller phrases changing
# Keep the sublists lexicographically sorted. # the text before longer phrases are tried.
IGNORED_PHRASES = [re.compile(regex) for regex in [ # The errors shown by `tools/check-capitalization` can be added to
# this list without any modification.
IGNORED_PHRASES = [
# Proper nouns and acronyms # Proper nouns and acronyms
r"API", r"API",
r"Cookie Bot", r"Cookie Bot",
@ -52,10 +54,18 @@ IGNORED_PHRASES = [re.compile(regex) for regex in [
r"images", r"images",
# Fragments of larger strings # Fragments of larger strings
(r'Change notification settings for individual streams on your '
'<a href="/#streams">Streams page</a>.'),
(r'<p class="bot-settings-note padded-container"> Looking for our '
'<a href="/integrations" target="_blank">Integrations</a> or '
'<a href="{{ server_uri }}/api" target="_blank">API</a> '
'documentation? </p>'),
r'Most stream administration is done on the <a href="/#streams">Streams page</a>.',
r"one or more people...", r"one or more people...",
r"confirmation email", r"confirmation email",
r"invites remaining", r"invites remaining",
r"^left$", r"^left$",
r"was too large; the maximum file size is 25MiB.",
r"^right$", r"^right$",
# SPECIAL CASES # SPECIAL CASES
@ -75,7 +85,19 @@ IGNORED_PHRASES = [re.compile(regex) for regex in [
r"argument ", r"argument ",
# I can't find this one # I can't find this one
r"text", r"text",
]] ]
# Sort the raw phrases in descending order of their lengths so that the
# list itself is ordered longest-first.
IGNORED_PHRASES.sort(key=len, reverse=True)
# A phrase may contain HTML, so extract its text with BeautifulSoup and
# collapse runs of whitespace; get_safe_text() normalizes its input the
# same way, which is what lets HTML be pasted into IGNORED_PHRASES as is.
# Markup inflates the raw length of a phrase, so the extracted patterns
# are ordered again by their own length: a short phrase wrapped in a lot
# of markup must not be tried before a longer plain phrase containing it.
# sorted() is stable, so patterns of equal length keep the order above.
_IGNORED_PHRASE_PATTERNS = sorted(
    (' '.join(BeautifulSoup(regex, 'lxml').text.split())
     for regex in IGNORED_PHRASES),
    key=len,
    reverse=True,
)
# Compile the patterns once at import time to improve performance.
COMPILED_IGNORED_PHRASES = [
    re.compile(pattern) for pattern in _IGNORED_PHRASE_PATTERNS
]
SPLIT_BOUNDARY = '?.!' # Used to split string into sentences. SPLIT_BOUNDARY = '?.!' # Used to split string into sentences.
SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY)) SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY))
@ -130,7 +152,7 @@ def get_safe_text(text):
""" """
soup = BeautifulSoup(text, 'lxml') soup = BeautifulSoup(text, 'lxml')
text = ' '.join(soup.text.split()) # Remove extra whitespaces. text = ' '.join(soup.text.split()) # Remove extra whitespaces.
for phrase_regex in IGNORED_PHRASES: for phrase_regex in COMPILED_IGNORED_PHRASES:
text = phrase_regex.sub(replace_with_safe_phrase, text) text = phrase_regex.sub(replace_with_safe_phrase, text)
return text return text
@ -156,15 +178,7 @@ def check_capitalization(strings):
errors = [] errors = []
ignored = [] ignored = []
for text in strings: for text in strings:
# Hand-skip a few that break the tool text = ' '.join(text.split()) # Remove extra whitespaces.
if 'Change notification settings for individual streams' in text:
continue
if 'was too large; the maximum file size is 25MiB.' in text:
continue
if 'Most stream administration is done on the' in text:
continue
if 'bot-settings-note padded-container' in text:
continue
safe_text = get_safe_text(text) safe_text = get_safe_text(text)
has_ignored_phrase = text != safe_text has_ignored_phrase = text != safe_text
capitalized = is_capitalized(safe_text) capitalized = is_capitalized(safe_text)

View File

@ -132,12 +132,22 @@ class CheckCapitalizationTestCase(TestCase):
"Some number 25MiB", "Some number 25MiB",
"Not Ignored Phrase", "Not Ignored Phrase",
"Not ignored phrase", "Not ignored phrase",
('<p class="bot-settings-note padded-container"> Looking for our '
'<a href="/integrations" target="_blank">Integrations</a> or '
'<a href="{{ server_uri }}/api" target="_blank">API</a> '
'documentation? </p>'),
] ]
errored, ignored = check_capitalization(strings) errored, ignored = check_capitalization(strings)
self.assertEqual(errored, ['Not Ignored Phrase']) self.assertEqual(errored, ['Not Ignored Phrase'])
self.assertEqual(ignored, sorted(["Zulip Zulip. Zulip some text!", self.assertEqual(
ignored,
sorted(["Zulip Zulip. Zulip some text!",
"Zulip Zulip? Zulip some text!", "Zulip Zulip? Zulip some text!",
"Zulip Zulip! Zulip some text!", "Zulip Zulip! Zulip some text!",
"Zulip Zulip, Zulip some text!", "Zulip Zulip, Zulip some text!",
"Some number 25MiB", "Some number 25MiB",
('<p class="bot-settings-note padded-container"> Looking '
'for our <a href="/integrations" target="_blank">'
'Integrations</a> or <a href="{{ server_uri }}/api" '
'target="_blank">API</a> documentation? </p>'),
])) ]))