mirror of https://github.com/zulip/zulip.git
Add capitalization checker tool.
Initial rules significantly modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899.
This commit is contained in:
parent
84d4f62abf
commit
83dd901ecf
|
@ -33,9 +33,39 @@ languages (e.g. what word to translate words like "home" to):
|
|||
* [Russian](russian.html)
|
||||
* [Spanish](spanish.html)
|
||||
|
||||
A great first step when getting started translating Zulip into a new
|
||||
language is to write a style guide, since it greatly increases the
|
||||
ability of future translators to translate in a way that's consistent
|
||||
with your work.
|
||||
|
||||
### Capitalization
|
||||
|
||||
We expect that all the English translatable strings in Zulip are
|
||||
properly capitalized in a way consistent with how Zulip does
|
||||
capitalization in general. This means that:
|
||||
|
||||
* The first letter of a sentence or phrase should be capitalized.
|
||||
- Correct: "Manage streams"
|
||||
- Incorrect: "Manage Streams"
|
||||
* All proper nouns should be capitalized.
|
||||
- Correct: "This is Zulip"
|
||||
- Incorrect: "This is zulip"
|
||||
* All common words like URL, HTTP, etc. should be written in their
|
||||
standard forms.
|
||||
- Correct: "URL"
|
||||
- Incorrect: "Url"
|
||||
|
||||
We have a tool to check for the correct capitalization of the
|
||||
translatable strings; this tool will not allow the Travis builds to
|
||||
pass in case of errors. You can use our capitalization checker to
|
||||
validate your code by running `./tools/check-capitalization`. If you
|
||||
think that you have a case where our capitalization checker tool
|
||||
wrongly categorizes a string as not capitalized, you can add an
|
||||
exception in the `tools.lib.capitalization.IGNORED_PHRASES` list to
|
||||
make the tool pass.
|
||||
|
||||
Please, stick to these while translating, and feel free to point out
|
||||
anything that should be improved or fixed. New style guides for other
|
||||
languages are welcome, too.
|
||||
any strings that should be improved or fixed.
|
||||
|
||||
## Translation process
|
||||
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import absolute_import
|
||||
from __future__ import print_function
|
||||
|
||||
# check for the venv
|
||||
from lib import sanity_check
|
||||
sanity_check.check_venv(__file__)
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
|
||||
from scripts.lib.zulip_tools import WARNING, FAIL, ENDC
|
||||
|
||||
from tools.lib.capitalization import check_capitalization
|
||||
|
||||
DJANGO_PO_REGEX = re.compile('msgid "(.*?)"')
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: regenerate the translation catalogs, run
    # the capitalization checker over the frontend (translations.json keys)
    # and backend (django.po msgids) strings, print a report, and exit
    # with status 1 if any string failed the check (0 otherwise).
    parser = argparse.ArgumentParser()
    parser.add_argument('--show-ignored',
                        action='store_true', dest='show_ignored', default=False,
                        help='Show strings that passed the check because they '
                             'contained ignored phrases.')
    args = parser.parse_args()

    # check_call raises CalledProcessError when makemessages exits
    # non-zero, so a failed regeneration aborts the run instead of letting
    # us silently validate stale catalog files.
    subprocess.check_call(['./manage.py', 'makemessages'], stderr=subprocess.STDOUT)

    with open('static/locale/en/translations.json') as f:
        data = json.load(f)
    frontend = check_capitalization(list(data.keys()))
    frontend_errors, frontend_ignored = frontend

    with open('static/locale/en/LC_MESSAGES/django.po') as f:
        # Empty captures (e.g. the `msgid ""` header line) are dropped.
        rows = [r for r in DJANGO_PO_REGEX.findall(f.read()) if r]
    backend = check_capitalization(rows)
    backend_errors, backend_ignored = backend

    if frontend_errors:
        print(WARNING + "Strings not capitalized in frontend:" + ENDC)
        print('\n'.join(frontend_errors))

    if backend_errors:
        print(WARNING + "Strings not capitalized in backend:" + ENDC)
        print('\n'.join(backend_errors))

    if args.show_ignored:
        print(WARNING + "Strings which were ignored: " + ENDC)
        print('\n'.join(frontend_ignored + backend_ignored))

    if frontend_errors or backend_errors:
        # Point the user to the documentation on what the policy is.
        print(WARNING + "See https://zulip.readthedocs.io/en/latest/translating.html#capitalization" + ENDC)
        print(FAIL + "Failed!" + ENDC)
        sys.exit(1)
    else:
        sys.exit(0)
|
|
@ -0,0 +1,176 @@
|
|||
from __future__ import absolute_import
|
||||
|
||||
from typing import List, Tuple, Set, Pattern, Match
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# The phrases in this list will be ignored.
#
# Every entry is compiled as a regular expression, so literal regex
# metacharacters (most commonly ".") must be backslash-escaped; otherwise
# an entry such as "e.g." would also match unrelated text like "edge".
#
# Keep the sublists lexicographically sorted.
IGNORED_PHRASES = [re.compile(regex) for regex in [
    # Proper nouns and acronyms
    r"API",
    r"Cookie Bot",
    r"Dropbox",
    r"GitHub",
    r"Google",
    r"HTTP",
    r"ID",
    r"IDs",
    r"JIRA",
    r"JSON",
    r"Kerberos",
    r"Mac",
    r"MiB",
    r"Pivotal",
    r'REMOTE_USER',
    r"SSO",
    r'Terms of Service',
    r"URL",
    r"Ubuntu",
    r"V5",
    r"Webathena",
    r"Windows",
    r"WordPress",
    r"XML",
    r"Zephyr",
    r"Zulip",
    r"iPhone",
    # Code things
    r"\.zuliprc",
    r"__\w+\.\w+__",
    # Things using "I"
    r"I say",
    r"I want",
    r"I'm",
    # Specific short words
    r"and",
    r"bot",
    r"e\.g\.",
    r"etc\.",
    r"images",

    # Fragments of larger strings
    r"one or more people\.\.\.",
    r"confirmation email",
    r"invites remaining",
    r"^left$",
    r"^right$",

    # SPECIAL CASES
    # Enter is usually capitalized
    r"Press Enter to send",
    # Because topics usually are lower-case, this would look weird if it were capitalized
    r"more topics",
    # For consistency with "more topics"
    r"more conversations",
    # We should probably just delete this string from translations
    r'activation key',

    # TO CLEAN UP
    # Just want to avoid churning login.html right now
    r"or Choose a user",
    # This is a parsing bug in the tool
    r"argument ",
    # I can't find this one
    r"text",
]]
|
||||
|
||||
# Characters that terminate a sentence; text is split on any one of them.
SPLIT_BOUNDARY = '?.!'
# Character class built from SPLIT_BOUNDARY, i.e. the pattern "[?.!]".
SPLIT_BOUNDARY_REGEX = re.compile('[' + SPLIT_BOUNDARY + ']')

# Patterns that flag a sentence as wrongly capitalized when they match.
DISALLOWED_REGEXES = list(map(re.compile, (
    # The sentence starts with a lower case character.
    r'^[a-z]',
    # The sentence starts with a capitalized word and an upper case
    # character shows up later, after only lower case letters, digits
    # and whitespace.
    r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]',
)))
|
||||
|
||||
def get_safe_phrase(phrase):
    # type: (str) -> str
    """
    Return the "safe" form of `phrase`: each character matched by
    SPLIT_BOUNDARY_REGEX is swapped for an underscore (_) and the result
    is lower-cased, so the phrase cannot be mistaken for a sentence
    boundary when the text is split later.
    """
    return SPLIT_BOUNDARY_REGEX.sub('_', phrase).lower()
|
||||
|
||||
def replace_with_safe_phrase(matchobj):
    # type: (Match[str]) -> str
    """
    Substitution callback for the IGNORED_PHRASES regexes.

    The matched phrase is turned into its safe form (see
    `get_safe_phrase()`). If the match opens the text, or sits right
    after a split boundary, the safe form is returned with its first
    letter in upper case; otherwise it is returned unchanged.
    """
    safe = get_safe_phrase(matchobj.group(0))
    position = matchobj.start()

    if position == 0:
        return safe.capitalize()

    # A split boundary is expected to be followed by exactly one space,
    # so the boundary character would be two positions before the match
    # (clamped to the first character of the text).
    preceding = matchobj.string[max(position - 2, 0)]
    if preceding in SPLIT_BOUNDARY:
        return safe.capitalize()

    return safe
|
||||
|
||||
def get_safe_text(text):
    # type: (str) -> str
    """
    Render `text` through BeautifulSoup, collapse every run of whitespace
    into a single space, and rewrite each IGNORED_PHRASES match via
    `replace_with_safe_phrase()`. The returned text is ready to be split
    into sentences.
    """
    rendered = BeautifulSoup(text, 'lxml').text
    safe_text = ' '.join(rendered.split())  # Remove extra whitespaces.
    for ignored in IGNORED_PHRASES:
        safe_text = ignored.sub(replace_with_safe_phrase, safe_text)
    return safe_text
|
||||
|
||||
def is_capitalized(safe_text):
    # type: (str) -> bool
    """
    Return True when every non-empty sentence of `safe_text` (split on
    SPLIT_BOUNDARY_REGEX and stripped) matches none of DISALLOWED_REGEXES.
    Text that yields no sentences at all is reported as not capitalized.
    """
    stripped = (piece.strip() for piece in SPLIT_BOUNDARY_REGEX.split(safe_text))
    sentences = [piece for piece in stripped if piece]

    if not sentences:
        return False

    return not any(regex.search(sentence)
                   for sentence in sentences
                   for regex in DISALLOWED_REGEXES)
|
||||
|
||||
def check_capitalization(strings):
    # type: (List[str]) -> Tuple[List[str], List[str]]
    """
    Check every string in `strings` and return a pair of sorted lists
    `(errors, ignored)`: `errors` holds the strings that are not properly
    capitalized, `ignored` holds the strings that pass but whose safe
    text differs from the original.
    """
    # Hand-skip a few that break the tool
    skipped_fragments = (
        'Change notification settings for individual streams',
        'was too large; the maximum file size is 25MiB.',
        'Most stream administration is done on the',
        'bot-settings-note padded-container',
    )

    errors = []  # type: List[str]
    ignored = []  # type: List[str]
    for text in strings:
        if any(fragment in text for fragment in skipped_fragments):
            continue

        safe_text = get_safe_text(text)
        if not is_capitalized(safe_text):
            errors.append(text)
        elif text != safe_text:
            ignored.append(text)

    errors.sort()
    ignored.sort()
    return errors, ignored
|
|
@ -0,0 +1,143 @@
|
|||
from bs4 import BeautifulSoup
|
||||
from django.test import TestCase
|
||||
|
||||
from tools.lib.capitalization import check_capitalization, is_capitalized, \
|
||||
get_safe_text
|
||||
|
||||
class GetSafeTextTestCase(TestCase):
    """Checks for `get_safe_text()`."""

    def test_get_safe_text(self):
        # type: () -> None
        # (input, expected safe text) pairs.
        expectations = [
            ('Messages in __page_params.product_name__ go to a '
             'stream and have a topic.',
             'Messages in __page_params_product_name__ '
             'go to a stream and have a topic.'),
            ("Zulip Zulip. Zulip some text!", 'Zulip zulip. Zulip some text!'),
            ("Zulip Zulip? Zulip some text!", 'Zulip zulip? Zulip some text!'),
            ("Zulip Zulip! Zulip some text!", 'Zulip zulip! Zulip some text!'),
            ("Zulip Zulip, Zulip some text!", 'Zulip zulip, zulip some text!'),
            ("Some text 25MiB", 'Some text 25mib'),
            ("Not Ignored Phrase", 'Not Ignored Phrase'),
            ("Not ignored phrase", 'Not ignored phrase'),
            ("", ''),
            ("Edited (__last_edit_timestr__)", "Edited (__last_edit_timestr__)"),
            ("iPhone application", 'Iphone application'),
            ("One two etc. three", 'One two etc_ three'),
            ("One two etc. three. four", 'One two etc_ three. four'),
        ]
        for string, expected in expectations:
            self.assertEqual(get_safe_text(string), expected)

        # HTML input: the safe text must already be plain text, so
        # rendering it again must not change it.
        markup = """
<p>Please re-enter your password to confirm your identity.
(<a href="/accounts/password/reset/" target="_blank">Forgotten it?</a>)</p>
"""
        safe_text = get_safe_text(markup)
        rendered_text = ' '.join(BeautifulSoup(safe_text, 'lxml').text.split())
        self.assertEqual(safe_text, rendered_text)
|
||||
|
||||
class IsCapitalizedTestCase(TestCase):
    """Checks for `is_capitalized()` on already-safe text."""

    def test_process_text(self):
        # type: () -> None
        accepted = [
            "Zulip zulip. Zulip some text!",
            "Zulip zulip? Zulip some text!",
            "Zulip zulip! Zulip some text!",
            "Zulip zulip, Zulip some text!",
            "Some number 25mib",
            "Not ignored phrase",
            ("Please re-enter your password to confirm your identity."
             " (Forgotten it?)"),
            "Edited (__last_edit_timestr__)",
            "Iphone application",
            "One two etc_ three",
        ]
        rejected = [
            "Not Ignored Phrase",
            "",
        ]

        for string in accepted:
            self.assertTrue(is_capitalized(string))
        for string in rejected:
            self.assertFalse(is_capitalized(string))
|
||||
|
||||
class CheckCapitalizationTestCase(TestCase):
    """End-to-end check of `check_capitalization()`."""

    def test_check_capitalization(self):
        # type: () -> None
        # Strings that pass only after their ignored phrases are made safe.
        passing_with_ignored_phrases = [
            "Zulip Zulip. Zulip some text!",
            "Zulip Zulip? Zulip some text!",
            "Zulip Zulip! Zulip some text!",
            "Zulip Zulip, Zulip some text!",
            "Some number 25MiB",
        ]
        miscapitalized = ["Not Ignored Phrase"]
        clean = ["Not ignored phrase"]

        errored, ignored = check_capitalization(
            passing_with_ignored_phrases + miscapitalized + clean)

        self.assertEqual(errored, ['Not Ignored Phrase'])
        self.assertEqual(ignored, sorted(passing_with_ignored_phrases))
|
|
@ -12,5 +12,6 @@ set -x
|
|||
./tools/test-documentation
|
||||
./tools/test-help-documentation.py
|
||||
./tools/test-api
|
||||
python -W ignore tools/check-capitalization
|
||||
# Some test suites disabled in CI for being flaky
|
||||
#./tools/test-queue-worker-reload
|
||||
|
|
Loading…
Reference in New Issue