From 83dd901ecf601cb79981ccee8222fd1a2407a422 Mon Sep 17 00:00:00 2001 From: Umair Khan Date: Fri, 3 Mar 2017 16:42:07 +0500 Subject: [PATCH] Add capitalization checker tool. Initial rules significantly modified by tabbott, who also added the hacky list of excludes that the tool can't handle correctly yet. Fixes: #3899. --- docs/translating.md | 34 +++- tools/check-capitalization | 61 +++++++ tools/lib/capitalization.py | 176 +++++++++++++++++++++ tools/tests/test_capitalization_checker.py | 143 +++++++++++++++++ tools/travis/backend | 1 + 5 files changed, 413 insertions(+), 2 deletions(-) create mode 100755 tools/check-capitalization create mode 100644 tools/lib/capitalization.py create mode 100644 tools/tests/test_capitalization_checker.py diff --git a/docs/translating.md b/docs/translating.md index ed5599459a..d6d4fd5e54 100644 --- a/docs/translating.md +++ b/docs/translating.md @@ -33,9 +33,39 @@ languages (e.g. what word to translate words like "home" to): * [Russian](russian.html) * [Spanish](spanish.html) +A great first step when getting started translating Zulip into a new +language is to write a style guide, since it greatly increases the +ability of future translators to translate in a way that's consistent +with your work. + +### Capitalization + +We expect that all the English translatable strings in Zulip are +properly capitalized in a way consistent with how Zulip does +capitalization in general. This means that: + +* The first letter of a sentence or phrase should be capitalized. + - Correct: "Manage streams" + - Incorrect: "Manage Streams" +* All proper nouns should be capitalized. + - Correct: "This is Zulip" + - Incorrect: "This is zulip" +* All common words like URL, HTTP, etc. should be written in their + standard forms. + - Correct: "URL" + - Incorrect: "Url" + +We have a tool to check for the correct capitalization of the +translatable strings; this tool will not allow the Travis builds to +pass in case of errors. 
You can use our capitalization checker to +validate your code by running `./tools/check-capitalization`. If you +think that you have a case where our capitalization checker tool +wrongly categorizes a string as not capitalized, you can add an +exception in the `tools.lib.capitalization.IGNORED_PHRASES` list to +make the tool pass. + Please, stick to these while translating, and feel free to point out -anything that should be improved or fixed. New style guides for other -languages are welcome, too. +any strings that should be improved or fixed. ## Translation process diff --git a/tools/check-capitalization b/tools/check-capitalization new file mode 100755 index 0000000000..c9869b3539 --- /dev/null +++ b/tools/check-capitalization @@ -0,0 +1,61 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function + +# check for the venv +from lib import sanity_check +sanity_check.check_venv(__file__) + +import argparse +import json +import os +import re +import subprocess +import sys + +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from scripts.lib.zulip_tools import WARNING, FAIL, ENDC + +from tools.lib.capitalization import check_capitalization + +DJANGO_PO_REGEX = re.compile('msgid "(.*?)"') + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--show-ignored', + action='store_true', dest='show_ignored', default=False, + help='Show strings that passed the check because they ' + 'contained ignored phrases.') + args = parser.parse_args() + + subprocess.call(['./manage.py', 'makemessages'], stderr=subprocess.STDOUT) + + with open('static/locale/en/translations.json') as f: + data = json.load(f) + frontend = check_capitalization(list(data.keys())) + frontend_errors, frontend_ignored = frontend + + with open('static/locale/en/LC_MESSAGES/django.po') as f: + rows = [r for r in DJANGO_PO_REGEX.findall(f.read()) if r] + backend = check_capitalization(rows) + backend_errors, 
backend_ignored = backend + + if frontend_errors: + print(WARNING + "Strings not capitalized in frontend:" + ENDC) + print('\n'.join(frontend_errors)) + + if backend_errors: + print(WARNING + "Strings not capitalized in backend:" + ENDC) + print('\n'.join(backend_errors)) + + if args.show_ignored: + print(WARNING + "Strings which were ignored: " + ENDC) + print('\n'.join(frontend_ignored + backend_ignored)) + + if frontend_errors or backend_errors: + # Point the user to the documentation on what the policy is. + print(WARNING + "See https://zulip.readthedocs.io/en/latest/translating.html#capitalization" + ENDC) + print(FAIL + "Failed!" + ENDC) + sys.exit(1) + else: + sys.exit(0) diff --git a/tools/lib/capitalization.py b/tools/lib/capitalization.py new file mode 100644 index 0000000000..6b2483f1fa --- /dev/null +++ b/tools/lib/capitalization.py @@ -0,0 +1,176 @@ +from __future__ import absolute_import + +from typing import List, Tuple, Set, Pattern, Match +import re + +from bs4 import BeautifulSoup + +# The phrases in this list will be ignored. +# +# Keep the sublists lexicographically sorted. 
+IGNORED_PHRASES = [re.compile(regex) for regex in [ + # Proper nouns and acronyms + r"API", + r"Cookie Bot", + r"Dropbox", + r"GitHub", + r"Google", + r"HTTP", + r"ID", + r"IDs", + r"JIRA", + r"JSON", + r"Kerberos", + r"Mac", + r"MiB", + r"Pivotal", + r'REMOTE_USER', + r"SSO", + r'Terms of Service', + r"URL", + r"Ubuntu", + r"V5", + r"Webathena", + r"Windows", + r"WordPress", + r"XML", + r"Zephyr", + r"Zulip", + r"iPhone", + # Code things + r".zuliprc", + r"__\w+\.\w+__", + # Things using "I" + r"I say", + r"I want", + r"I'm", + # Specific short words + r"and", + r"bot", + r"e.g.", + r"etc.", + r"images", + + # Fragments of larger strings + r"one or more people...", + r"confirmation email", + r"invites remaining", + r"^left$", + r"^right$", + + # SPECIAL CASES + # Enter is usually capitalized + r"Press Enter to send", + # Because topics usually are lower-case, this would look weird if it were capitalized + r"more topics", + # For consistency with "more topics" + r"more conversations", + # We should probably just delete this string from translations + r'activation key', + + # TO CLEAN UP + # Just want to avoid churning login.html right now + r"or Choose a user", + # This is a parsing bug in the tool + r"argument ", + # I can't find this one + r"text", +]] + +SPLIT_BOUNDARY = '?.!' # Used to split string into sentences. +SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY)) + +# Regexes which check capitalization in sentences. +DISALLOWED_REGEXES = [re.compile(regex) for regex in [ + r'^[a-z]', # Checks if the sentence starts with a lower case character. + r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]', # Checks if an upper case character exists + # after a lower case character when the first character is in upper case. +]] + +def get_safe_phrase(phrase): + # type: (str) -> str + """ + Safe phrase is in lower case and doesn't contain characters which can + conflict with split boundaries. All conflicting characters are replaced + with low dash (_). 
+ """ + phrase = SPLIT_BOUNDARY_REGEX.sub('_', phrase) + return phrase.lower() + +def replace_with_safe_phrase(matchobj): + # type: (Match[str]) -> str + """ + The idea is to convert IGNORED_PHRASES into safe phrases, see + `get_safe_phrase()` function. The only exception is when the + IGNORED_PHRASE is at the start of the text or after a split + boundary; in this case, we change the first letter of the phrase + to upper case. + """ + ignored_phrase = matchobj.group(0) + safe_string = get_safe_phrase(ignored_phrase) + + start_index = matchobj.start() + complete_string = matchobj.string + + is_string_start = start_index == 0 + # We expect that there will be one space between split boundary + # and the next word. + punctuation = complete_string[max(start_index - 2, 0)] + is_after_split_boundary = punctuation in SPLIT_BOUNDARY + if is_string_start or is_after_split_boundary: + return safe_string.capitalize() + + return safe_string + +def get_safe_text(text): + # type: (str) -> str + """ + This returns text which is rendered by BeautifulSoup and is in the + form that can be split easily and has all IGNORED_PHRASES processed. + """ + soup = BeautifulSoup(text, 'lxml') + text = ' '.join(soup.text.split()) # Remove extra whitespaces. 
+ for phrase_regex in IGNORED_PHRASES: + text = phrase_regex.sub(replace_with_safe_phrase, text) + + return text + +def is_capitalized(safe_text): + # type: (str) -> bool + sentences = SPLIT_BOUNDARY_REGEX.split(safe_text) + sentences = [sentence.strip() + for sentence in sentences if sentence.strip()] + + if not sentences: + return False + + for sentence in sentences: + for regex in DISALLOWED_REGEXES: + if regex.search(sentence): + return False + + return True + +def check_capitalization(strings): + # type: (List[str]) -> Tuple[List[str], List[str]] + errors = [] + ignored = [] + for text in strings: + # Hand-skip a few that break the tool + if 'Change notification settings for individual streams' in text: + continue + if 'was too large; the maximum file size is 25MiB.' in text: + continue + if 'Most stream administration is done on the' in text: + continue + if 'bot-settings-note padded-container' in text: + continue + safe_text = get_safe_text(text) + has_ignored_phrase = text != safe_text + capitalized = is_capitalized(safe_text) + if not capitalized: + errors.append(text) + elif capitalized and has_ignored_phrase: + ignored.append(text) + + return sorted(errors), sorted(ignored) diff --git a/tools/tests/test_capitalization_checker.py b/tools/tests/test_capitalization_checker.py new file mode 100644 index 0000000000..845175fb74 --- /dev/null +++ b/tools/tests/test_capitalization_checker.py @@ -0,0 +1,143 @@ +from bs4 import BeautifulSoup +from django.test import TestCase + +from tools.lib.capitalization import check_capitalization, is_capitalized, \ + get_safe_text + +class GetSafeTextTestCase(TestCase): + def test_get_safe_text(self): + # type: () -> None + string = ('Messages in __page_params.product_name__ go to a ' + 'stream and have a topic.') + safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'Messages in __page_params_product_name__ ' + 'go to a stream and have a topic.') + + string = "Zulip Zulip. Zulip some text!" 
+ safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'Zulip zulip. Zulip some text!') + + string = "Zulip Zulip? Zulip some text!" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'Zulip zulip? Zulip some text!') + + string = "Zulip Zulip! Zulip some text!" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'Zulip zulip! Zulip some text!') + + string = "Zulip Zulip, Zulip some text!" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'Zulip zulip, zulip some text!') + + string = "Some text 25MiB" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'Some text 25mib') + + string = "Not Ignored Phrase" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'Not Ignored Phrase') + + string = "Not ignored phrase" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'Not ignored phrase') + + string = "" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, '') + + string = """ +

Please re-enter your password to confirm your identity. + (Forgotten it?)

+ """ + safe_text = get_safe_text(string) + soup = BeautifulSoup(safe_text, 'lxml') + rendered_text = ' '.join(soup.text.split()) + self.assertEqual(safe_text, rendered_text) + + string = "Edited (__last_edit_timestr__)" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, string) + + string = "iPhone application" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'Iphone application') + + string = "One two etc. three" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'One two etc_ three') + + string = "One two etc. three. four" + safe_text = get_safe_text(string) + self.assertEqual(safe_text, 'One two etc_ three. four') + +class IsCapitalizedTestCase(TestCase): + def test_process_text(self): + # type: () -> None + string = "Zulip zulip. Zulip some text!" + capitalized = is_capitalized(string) + self.assertTrue(capitalized) + + string = "Zulip zulip? Zulip some text!" + capitalized = is_capitalized(string) + self.assertTrue(capitalized) + + string = "Zulip zulip! Zulip some text!" + capitalized = is_capitalized(string) + self.assertTrue(capitalized) + + string = "Zulip zulip, Zulip some text!" + capitalized = is_capitalized(string) + self.assertTrue(capitalized) + + string = "Some number 25mib" + capitalized = is_capitalized(string) + self.assertTrue(capitalized) + + string = "Not Ignored Phrase" + capitalized = is_capitalized(string) + self.assertFalse(capitalized) + + string = "Not ignored phrase" + capitalized = is_capitalized(string) + self.assertTrue(capitalized) + + string = "" + capitalized = is_capitalized(string) + self.assertFalse(capitalized) + + string = ("Please re-enter your password to confirm your identity." 
+ " (Forgotten it?)") + capitalized = is_capitalized(string) + self.assertTrue(capitalized) + + string = "Edited (__last_edit_timestr__)" + capitalized = is_capitalized(string) + self.assertTrue(capitalized) + + string = "Iphone application" + capitalized = is_capitalized(string) + self.assertTrue(capitalized) + + string = "One two etc_ three" + capitalized = is_capitalized(string) + self.assertTrue(capitalized) + +class CheckCapitalizationTestCase(TestCase): + def test_check_capitalization(self): + # type: () -> None + strings = ["Zulip Zulip. Zulip some text!", + "Zulip Zulip? Zulip some text!", + "Zulip Zulip! Zulip some text!", + "Zulip Zulip, Zulip some text!", + "Some number 25MiB", + "Not Ignored Phrase", + "Not ignored phrase", + ] + errored, ignored = check_capitalization(strings) + self.assertEqual(errored, ['Not Ignored Phrase']) + self.assertEqual(ignored, sorted(["Zulip Zulip. Zulip some text!", + "Zulip Zulip? Zulip some text!", + "Zulip Zulip! Zulip some text!", + "Zulip Zulip, Zulip some text!", + "Some number 25MiB", + ])) diff --git a/tools/travis/backend b/tools/travis/backend index cf5305ad5d..0a5c328dab 100755 --- a/tools/travis/backend +++ b/tools/travis/backend @@ -12,5 +12,6 @@ set -x ./tools/test-documentation ./tools/test-help-documentation.py ./tools/test-api +python -W ignore tools/check-capitalization # Some test suites disabled in CI for being flaky #./tools/test-queue-worker-reload