Add capitalization checker tool.

Initial rules were significantly modified by tabbott, who also added the
hacky list of excludes that the tool can't handle correctly yet.

Fixes: #3899.
This commit is contained in:
Umair Khan 2017-03-03 16:42:07 +05:00 committed by Tim Abbott
parent 84d4f62abf
commit 83dd901ecf
5 changed files with 413 additions and 2 deletions

View File

@ -33,9 +33,39 @@ languages (e.g. what word to translate words like "home" to):
* [Russian](russian.html)
* [Spanish](spanish.html)
A great first step when getting started translating Zulip into a new
language is to write a style guide, since it greatly increases the
ability of future translators to translate in a way that's consistent
with your work.
### Capitalization
We expect that all the English translatable strings in Zulip are
properly capitalized in a way consistent with how Zulip does
capitalization in general. This means that:
* The first letter of a sentence or phrase should be capitalized.
- Correct: "Manage streams"
- Incorrect: "Manage Streams"
* All proper nouns should be capitalized.
- Correct: "This is Zulip"
- Incorrect: "This is zulip"
* All common words like URL, HTTP, etc. should be written in their
standard forms.
- Correct: "URL"
- Incorrect: "Url"
We have a tool to check for the correct capitalization of the
translatable strings; this tool will not allow the Travis builds to
pass in case of errors. You can use our capitalization checker to
validate your code by running `./tools/check-capitalization`. If you
think that you have a case where our capitalization checker tool
wrongly categorizes a string as not capitalized, you can add an
exception in the `tools.lib.capitalization.IGNORED_PHRASES` list to
make the tool pass.
Please, stick to these while translating, and feel free to point out
anything that should be improved or fixed. New style guides for other
languages are welcome, too.
any strings that should be improved or fixed.
## Translation process

61
tools/check-capitalization Executable file
View File

@ -0,0 +1,61 @@
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import print_function
# check for the venv
from lib import sanity_check
sanity_check.check_venv(__file__)
import argparse
import json
import os
import re
import subprocess
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from scripts.lib.zulip_tools import WARNING, FAIL, ENDC
from tools.lib.capitalization import check_capitalization
# Captures the text between the quotes of every `msgid "..."` line of a
# .po file.
DJANGO_PO_REGEX = re.compile('msgid "(.*?)"')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--show-ignored',
                        action='store_true', dest='show_ignored', default=False,
                        help='Show strings that passed the check because they '
                             'contained ignored phrases.')
    args = parser.parse_args()

    # Regenerate the translation files before checking them.  check_call()
    # (rather than call()) raises if makemessages exits with a non-zero
    # status, so a failed regeneration aborts the run instead of letting
    # the checks below silently validate whatever files were left on disk.
    subprocess.check_call(['./manage.py', 'makemessages'], stderr=subprocess.STDOUT)

    # Frontend strings: the keys of the JSON translation file.
    with open('static/locale/en/translations.json') as f:
        data = json.load(f)
    frontend_errors, frontend_ignored = check_capitalization(list(data))

    # Backend strings: the msgid entries of the .po file.  Empty captures
    # (e.g. a `msgid ""` line) are dropped.
    with open('static/locale/en/LC_MESSAGES/django.po') as f:
        rows = [r for r in DJANGO_PO_REGEX.findall(f.read()) if r]
    backend_errors, backend_ignored = check_capitalization(rows)

    if frontend_errors:
        print(WARNING + "Strings not capitalized in frontend:" + ENDC)
        print('\n'.join(frontend_errors))

    if backend_errors:
        print(WARNING + "Strings not capitalized in backend:" + ENDC)
        print('\n'.join(backend_errors))

    if args.show_ignored:
        print(WARNING + "Strings which were ignored: " + ENDC)
        print('\n'.join(frontend_ignored + backend_ignored))

    if frontend_errors or backend_errors:
        # Point the user to the documentation on what the policy is.
        print(WARNING + "See https://zulip.readthedocs.io/en/latest/translating.html#capitalization" + ENDC)
        print(FAIL + "Failed!" + ENDC)
        sys.exit(1)
    else:
        sys.exit(0)

176
tools/lib/capitalization.py Normal file
View File

@ -0,0 +1,176 @@
from __future__ import absolute_import
from typing import List, Tuple, Set, Pattern, Match
import re
from bs4 import BeautifulSoup
# The phrases in this list will be ignored.
#
# Every entry is a regular expression, not a plain string, so characters
# with a special meaning in regexes must be escaped to match literally.
# In particular a literal dot is written as `\.`; an unescaped `.` matches
# any character, which would let e.g. "etc." also match the "etch" inside
# "fetch".
#
# Keep the sublists lexicographically sorted.
IGNORED_PHRASES = [re.compile(regex) for regex in [
    # Proper nouns and acronyms
    r"API",
    r"Cookie Bot",
    r"Dropbox",
    r"GitHub",
    r"Google",
    r"HTTP",
    r"ID",
    r"IDs",
    r"JIRA",
    r"JSON",
    r"Kerberos",
    r"Mac",
    r"MiB",
    r"Pivotal",
    r'REMOTE_USER',
    r"SSO",
    r'Terms of Service',
    r"URL",
    r"Ubuntu",
    r"V5",
    r"Webathena",
    r"Windows",
    r"WordPress",
    r"XML",
    r"Zephyr",
    r"Zulip",
    r"iPhone",
    # Code things
    r"\.zuliprc",
    r"__\w+\.\w+__",
    # Things using "I"
    r"I say",
    r"I want",
    r"I'm",
    # Specific short words
    r"and",
    r"bot",
    r"e\.g\.",
    r"etc\.",
    r"images",
    # Fragments of larger strings
    r"one or more people\.\.\.",
    r"confirmation email",
    r"invites remaining",
    r"^left$",
    r"^right$",

    # SPECIAL CASES
    # Enter is usually capitalized
    r"Press Enter to send",
    # Because topics usually are lower-case, this would look weird if it were capitalized
    r"more topics",
    # For consistency with "more topics"
    r"more conversations",
    # We should probably just delete this string from translations
    r'activation key',

    # TO CLEAN UP
    # Just want to avoid churning login.html right now
    r"or Choose a user",
    # This is a parsing bug in the tool
    r"argument ",
    # I can't find this one
    r"text",
]]
# Characters that end a sentence; text is split into sentences on any of them.
SPLIT_BOUNDARY = '?.!'
SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY))

# A sentence matched by any of these patterns counts as wrongly capitalized.
DISALLOWED_REGEXES = list(map(re.compile, [
    # The sentence starts with a lower case character.
    r'^[a-z]',
    # The sentence starts with a capitalized word and an upper case
    # character shows up again after a run of whitespace, lower case
    # characters or digits.
    r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]',
]))
def get_safe_phrase(phrase):
    # type: (str) -> str
    """
    Return the "safe" form of `phrase`: every split-boundary character
    is replaced with an underscore (_) so it can no longer be mistaken
    for the end of a sentence, and the result is lower-cased.
    """
    return SPLIT_BOUNDARY_REGEX.sub('_', phrase).lower()
def replace_with_safe_phrase(matchobj):
    # type: (Match[str]) -> str
    """
    Substitution callback that turns a matched IGNORED_PHRASES entry
    into its safe phrase (see `get_safe_phrase()`).

    The safe phrase is returned as-is, except when the match sits at the
    very start of the text or right after a split boundary; there its
    first letter is upper-cased.
    """
    whole_text = matchobj.string
    match_start = matchobj.start()
    safe_phrase = get_safe_phrase(matchobj.group(0))

    # A split boundary is expected to be followed by exactly one space,
    # so the boundary character, if any, sits two positions before the
    # match (clamped to the first character of the text).
    lookbehind_char = whole_text[max(match_start - 2, 0)]
    opens_sentence = match_start == 0 or lookbehind_char in SPLIT_BOUNDARY

    return safe_phrase.capitalize() if opens_sentence else safe_phrase
def get_safe_text(text):
    # type: (str) -> str
    """
    Parse `text` with BeautifulSoup, keep only its text content with
    whitespace runs collapsed to single spaces, and then rewrite every
    IGNORED_PHRASES match into its safe phrase so the result can be
    split into sentences easily.
    """
    rendered = BeautifulSoup(text, 'lxml').text
    safe_text = ' '.join(rendered.split())  # Collapse extra whitespace.
    for ignored_regex in IGNORED_PHRASES:
        safe_text = ignored_regex.sub(replace_with_safe_phrase, safe_text)
    return safe_text
def is_capitalized(safe_text):
    # type: (str) -> bool
    """
    Return True when every sentence of `safe_text` passes all
    DISALLOWED_REGEXES checks.

    The text is split on the split-boundary characters and blank
    fragments are discarded; a text with no sentences at all is
    reported as not capitalized.
    """
    fragments = (piece.strip() for piece in SPLIT_BOUNDARY_REGEX.split(safe_text))
    sentences = [fragment for fragment in fragments if fragment]
    if not sentences:
        return False

    return not any(
        regex.search(sentence)
        for sentence in sentences
        for regex in DISALLOWED_REGEXES
    )
def check_capitalization(strings):
    # type: (List[str]) -> Tuple[List[str], List[str]]
    """
    Check the capitalization of each string in `strings`.

    Returns a pair of sorted lists: the strings that failed the check,
    and the strings that passed it but whose safe text differs from the
    original text.  Strings containing one of the hand-skipped fragments
    are left out of both lists.
    """
    # Hand-skip a few that break the tool
    unsupported_fragments = (
        'Change notification settings for individual streams',
        'was too large; the maximum file size is 25MiB.',
        'Most stream administration is done on the',
        'bot-settings-note padded-container',
    )

    failing = []
    passed_after_rewrite = []
    for text in strings:
        if any(fragment in text for fragment in unsupported_fragments):
            continue

        safe_text = get_safe_text(text)
        if not is_capitalized(safe_text):
            failing.append(text)
        elif safe_text != text:
            passed_after_rewrite.append(text)

    return sorted(failing), sorted(passed_after_rewrite)

View File

@ -0,0 +1,143 @@
from bs4 import BeautifulSoup
from django.test import TestCase
from tools.lib.capitalization import check_capitalization, is_capitalized, \
get_safe_text
class GetSafeTextTestCase(TestCase):
    """Checks the text produced by get_safe_text() for sample inputs."""

    def test_get_safe_text(self):
        # type: () -> None
        # (input, expected safe text) pairs, checked in order.
        plain_cases = [
            ('Messages in __page_params.product_name__ go to a '
             'stream and have a topic.',
             'Messages in __page_params_product_name__ '
             'go to a stream and have a topic.'),
            ("Zulip Zulip. Zulip some text!", 'Zulip zulip. Zulip some text!'),
            ("Zulip Zulip? Zulip some text!", 'Zulip zulip? Zulip some text!'),
            ("Zulip Zulip! Zulip some text!", 'Zulip zulip! Zulip some text!'),
            ("Zulip Zulip, Zulip some text!", 'Zulip zulip, zulip some text!'),
            ("Some text 25MiB", 'Some text 25mib'),
            ("Not Ignored Phrase", 'Not Ignored Phrase'),
            ("Not ignored phrase", 'Not ignored phrase'),
            ("", ''),
        ]
        for raw, expected in plain_cases:
            self.assertEqual(get_safe_text(raw), expected)

        # For HTML input the safe text must already be free of markup and
        # extra whitespace: rendering it a second time changes nothing.
        markup = """
<p>Please re-enter your password to confirm your identity.
(<a href="/accounts/password/reset/" target="_blank">Forgotten it?</a>)</p>
"""
        safe_markup = get_safe_text(markup)
        rendered_text = ' '.join(BeautifulSoup(safe_markup, 'lxml').text.split())
        self.assertEqual(safe_markup, rendered_text)

        remaining_cases = [
            ("Edited (__last_edit_timestr__)", "Edited (__last_edit_timestr__)"),
            ("iPhone application", 'Iphone application'),
            ("One two etc. three", 'One two etc_ three'),
            ("One two etc. three. four", 'One two etc_ three. four'),
        ]
        for raw, expected in remaining_cases:
            self.assertEqual(get_safe_text(raw), expected)
class IsCapitalizedTestCase(TestCase):
    """Checks the verdict of is_capitalized() for sample safe texts."""

    def test_process_text(self):
        # type: () -> None
        # (safe text, whether it should count as capitalized), checked in order.
        cases = [
            ("Zulip zulip. Zulip some text!", True),
            ("Zulip zulip? Zulip some text!", True),
            ("Zulip zulip! Zulip some text!", True),
            ("Zulip zulip, Zulip some text!", True),
            ("Some number 25mib", True),
            ("Not Ignored Phrase", False),
            ("Not ignored phrase", True),
            ("", False),
            ("Please re-enter your password to confirm your identity."
             " (Forgotten it?)", True),
            ("Edited (__last_edit_timestr__)", True),
            ("Iphone application", True),
            ("One two etc_ three", True),
        ]
        for safe_text, should_pass in cases:
            verify = self.assertTrue if should_pass else self.assertFalse
            verify(is_capitalized(safe_text))
class CheckCapitalizationTestCase(TestCase):
    """Checks how check_capitalization() sorts strings into its two lists."""

    def test_check_capitalization(self):
        # type: () -> None
        # These pass the check, but only after being rewritten into safe text.
        passing_after_rewrite = [
            "Zulip Zulip. Zulip some text!",
            "Zulip Zulip? Zulip some text!",
            "Zulip Zulip! Zulip some text!",
            "Zulip Zulip, Zulip some text!",
            "Some number 25MiB",
        ]
        # "Not Ignored Phrase" must fail; "Not ignored phrase" passes
        # unchanged, so it belongs to neither returned list.
        strings = passing_after_rewrite + [
            "Not Ignored Phrase",
            "Not ignored phrase",
        ]

        errored, ignored = check_capitalization(strings)

        self.assertEqual(errored, ['Not Ignored Phrase'])
        self.assertEqual(ignored, sorted(passing_after_rewrite))

View File

@ -12,5 +12,6 @@ set -x
./tools/test-documentation
./tools/test-help-documentation.py
./tools/test-api
python -W ignore tools/check-capitalization
# Some test suites disabled in CI for being flaky
#./tools/test-queue-worker-reload