From 83dd901ecf601cb79981ccee8222fd1a2407a422 Mon Sep 17 00:00:00 2001
From: Umair Khan <umair.waheed@gmail.com>
Date: Fri, 3 Mar 2017 16:42:07 +0500
Subject: [PATCH] Add capitalization checker tool.

Initial rules significantly modified by tabbott, who also added the
hacky list of excludes that the tool can't handle correctly yet.

Fixes: #3899.
---
 docs/translating.md                        |  34 +++-
 tools/check-capitalization                 |  61 +++++++
 tools/lib/capitalization.py                | 176 +++++++++++++++++++++
 tools/tests/test_capitalization_checker.py | 143 +++++++++++++++++
 tools/travis/backend                       |   1 +
 5 files changed, 413 insertions(+), 2 deletions(-)
 create mode 100755 tools/check-capitalization
 create mode 100644 tools/lib/capitalization.py
 create mode 100644 tools/tests/test_capitalization_checker.py

diff --git a/docs/translating.md b/docs/translating.md
index ed5599459a..d6d4fd5e54 100644
--- a/docs/translating.md
+++ b/docs/translating.md
@@ -33,9 +33,39 @@ languages (e.g. what word to translate words like "home" to):
 * [Russian](russian.html)
 * [Spanish](spanish.html)
 
+A great first step when getting started translating Zulip into a new
+language is to write a style guide, since it greatly increases the
+ability of future translators to translate in a way that's consistent
+with your work.
+
+### Capitalization
+
+We expect that all the English translatable strings in Zulip are
+properly capitalized in a way consistent with how Zulip does
+capitalization in general.  This means that:
+
+* The first letter of a sentence or phrase should be capitalized.
+    - Correct: "Manage streams"
+    - Incorrect: "Manage Streams"
+* All proper nouns should be capitalized.
+    - Correct: "This is Zulip"
+    - Incorrect: "This is zulip"
+* All common words like URL, HTTP, etc. should be written in their
+  standard forms.
+    - Correct: "URL"
+    - Incorrect: "Url"
+
+We have a tool to check for the correct capitalization of the
+translatable strings; this tool will not allow the Travis builds to
+pass in case of errors. You can use our capitalization checker to
+validate your code by running `./tools/check-capitalization`. If you
+think that you have a case where our capitalization checker tool
+wrongly categorizes a string as not capitalized, you can add an
+exception in the `tools.lib.capitalization.IGNORED_PHRASES` list to
+make the tool pass.
+
 Please, stick to these while translating, and feel free to point out
-anything that should be improved or fixed. New style guides for other
-languages are welcome, too.
+any strings that should be improved or fixed.
 
 ## Translation process
 
diff --git a/tools/check-capitalization b/tools/check-capitalization
new file mode 100755
index 0000000000..c9869b3539
--- /dev/null
+++ b/tools/check-capitalization
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+from __future__ import absolute_import
+from __future__ import print_function
+
+# check for the venv
+from lib import sanity_check
+sanity_check.check_venv(__file__)
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from scripts.lib.zulip_tools import WARNING, FAIL, ENDC
+
+from tools.lib.capitalization import check_capitalization
+
+DJANGO_PO_REGEX = re.compile(r'msgid "(.*?)"')  # Captures the text of each msgid entry.
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--show-ignored',
+                        action='store_true', dest='show_ignored', default=False,
+                        help='Show strings that passed the check because they '
+                             'contained ignored phrases.')
+    args = parser.parse_args()
+
+    # Run makemessages first; the files read below are expected to be its
+    # output.  check_call raises CalledProcessError instead of carrying on.
+    subprocess.check_call(['./manage.py', 'makemessages'], stderr=subprocess.STDOUT)
+
+    # Frontend strings are the keys of the English translations JSON file.
+    with open('static/locale/en/translations.json') as f:
+        frontend_errors, frontend_ignored = check_capitalization(list(json.load(f)))
+
+    # Backend strings are the non-empty msgid values in the Django .po file.
+    with open('static/locale/en/LC_MESSAGES/django.po') as f:
+        rows = [r for r in DJANGO_PO_REGEX.findall(f.read()) if r]
+    backend_errors, backend_ignored = check_capitalization(rows)
+
+    if frontend_errors:
+        print(WARNING + "Strings not capitalized in frontend:" + ENDC)
+        print('\n'.join(frontend_errors))
+
+    if backend_errors:
+        print(WARNING + "Strings not capitalized in backend:" + ENDC)
+        print('\n'.join(backend_errors))
+
+    if args.show_ignored:
+        print(WARNING + "Strings which were ignored: " + ENDC)
+        print('\n'.join(frontend_ignored + backend_ignored))
+
+    if frontend_errors or backend_errors:
+        # Point the user to the documentation on what the policy is.
+        print(WARNING + "See https://zulip.readthedocs.io/en/latest/translating.html#capitalization" + ENDC)
+        print(FAIL + "Failed!" + ENDC)
+        sys.exit(1)
+    sys.exit(0)
diff --git a/tools/lib/capitalization.py b/tools/lib/capitalization.py
new file mode 100644
index 0000000000..6b2483f1fa
--- /dev/null
+++ b/tools/lib/capitalization.py
@@ -0,0 +1,176 @@
+from __future__ import absolute_import
+
+from typing import List, Tuple, Set, Pattern, Match
+import re
+
+from bs4 import BeautifulSoup
+
+# Phrases the checker ignores.  Every entry is compiled as a regular
+# expression, so regex metacharacters (e.g. ".") must be escaped.
+# Keep the sublists lexicographically sorted.
+IGNORED_PHRASES = [re.compile(regex) for regex in [
+    # Proper nouns and acronyms
+    r"API",
+    r"Cookie Bot",
+    r"Dropbox",
+    r"GitHub",
+    r"Google",
+    r"HTTP",
+    r"ID",
+    r"IDs",
+    r"JIRA",
+    r"JSON",
+    r"Kerberos",
+    r"Mac",
+    r"MiB",
+    r"Pivotal",
+    r'REMOTE_USER',
+    r"SSO",
+    r'Terms of Service',
+    r"URL",
+    r"Ubuntu",
+    r"V5",
+    r"Webathena",
+    r"Windows",
+    r"WordPress",
+    r"XML",
+    r"Zephyr",
+    r"Zulip",
+    r"iPhone",
+    # Code things
+    r"\.zuliprc",
+    r"__\w+\.\w+__",
+    # Things using "I"
+    r"I say",
+    r"I want",
+    r"I'm",
+    # Specific short words
+    r"and",
+    r"bot",
+    r"e\.g\.",
+    r"etc\.",
+    r"images",
+
+    # Fragments of larger strings
+    r"one or more people\.\.\.",
+    r"confirmation email",
+    r"invites remaining",
+    r"^left$",
+    r"^right$",
+
+    # SPECIAL CASES
+    # Enter is usually capitalized
+    r"Press Enter to send",
+    # Because topics usually are lower-case, this would look weird if it were capitalized
+    r"more topics",
+    # For consistency with "more topics"
+    r"more conversations",
+    # We should probably just delete this string from translations
+    r'activation key',
+
+    # TO CLEAN UP
+    # Just want to avoid churning login.html right now
+    r"or Choose a user",
+    # This is a parsing bug in the tool
+    r"argument ",
+    # I can't find this one
+    r"text",
+]]
+
+SPLIT_BOUNDARY = '?.!'  # Characters used to split a string into sentences.
+SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY))  # Matches any one of them.
+
+# Regexes which check capitalization in sentences; a match means a violation.
+DISALLOWED_REGEXES = [re.compile(regex) for regex in [
+    r'^[a-z]',  # Checks if the sentence starts with a lower case character.
+    r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]',  # Checks if an upper case character exists
+    # later in a sentence whose first word is capitalized, e.g. "Manage Streams".
+]]
+
+def get_safe_phrase(phrase):
+    # type: (str) -> str
+    """
+    Return `phrase` lower-cased, with every split-boundary character
+    (see SPLIT_BOUNDARY) replaced by a low dash (_) so that it cannot
+    be mistaken for the end of a sentence.
+    """
+    without_boundaries = SPLIT_BOUNDARY_REGEX.sub('_', phrase)
+    return without_boundaries.lower()
+
+def replace_with_safe_phrase(matchobj):
+    # type: (Match[str]) -> str
+    """
+    Substitution callback for the IGNORED_PHRASES regexes.
+
+    The matched phrase is normally replaced by its safe form (see
+    `get_safe_phrase()`).  When the match sits at the very start of
+    the text, or follows a split boundary, the safe form is returned
+    with its first letter in upper case instead, so that the sentence
+    it opens still looks capitalized.
+    """
+    safe_phrase = get_safe_phrase(matchobj.group(0))
+    match_start = matchobj.start()
+
+    if match_start == 0:
+        return safe_phrase.capitalize()
+
+    # We expect that there will be one space between split boundary
+    # and the next word, so the boundary would be two characters back.
+    preceding_char = matchobj.string[max(match_start - 2, 0)]
+    if preceding_char in SPLIT_BOUNDARY:
+        return safe_phrase.capitalize()
+
+    return safe_phrase
+
+def get_safe_text(text):
+    # type: (str) -> str
+    """
+    Render `text` with BeautifulSoup, collapse whitespace runs into single
+    spaces, and rewrite every IGNORED_PHRASES match into its safe form.
+    """
+    rendered = BeautifulSoup(text, 'lxml').text
+    safe_text = ' '.join(rendered.split())  # Remove extra whitespaces.
+    for ignored_regex in IGNORED_PHRASES:
+        safe_text = ignored_regex.sub(replace_with_safe_phrase, safe_text)
+
+    return safe_text
+
+def is_capitalized(safe_text):
+    # type: (str) -> bool
+    """
+    Return True when `safe_text` has at least one non-blank sentence and
+    no sentence matches any of the DISALLOWED_REGEXES.
+    """
+    pieces = (piece.strip() for piece in SPLIT_BOUNDARY_REGEX.split(safe_text))
+    sentences = [piece for piece in pieces if piece]
+    if not sentences:
+        # Empty or punctuation-only text is reported as not capitalized.
+        return False
+
+    return not any(regex.search(sentence)
+                   for sentence in sentences
+                   for regex in DISALLOWED_REGEXES)
+
+def check_capitalization(strings):
+    # type: (List[str]) -> Tuple[List[str], List[str]]
+    """Split `strings` into (errors, ignored), both sorted: strings that fail
+    the check, and strings that pass but differ from their safe text."""
+    # Hand-skip a few that break the tool
+    skipped_fragments = (
+        'Change notification settings for individual streams',
+        'was too large; the maximum file size is 25MiB.',
+        'Most stream administration is done on the',
+        'bot-settings-note padded-container',
+    )
+    errors = []
+    ignored = []
+    for text in strings:
+        if any(fragment in text for fragment in skipped_fragments):
+            continue
+        safe_text = get_safe_text(text)
+        if not is_capitalized(safe_text):
+            errors.append(text)
+        elif text != safe_text:
+            ignored.append(text)
+
+    return sorted(errors), sorted(ignored)
diff --git a/tools/tests/test_capitalization_checker.py b/tools/tests/test_capitalization_checker.py
new file mode 100644
index 0000000000..845175fb74
--- /dev/null
+++ b/tools/tests/test_capitalization_checker.py
@@ -0,0 +1,143 @@
+from bs4 import BeautifulSoup
+from django.test import TestCase
+
+from tools.lib.capitalization import check_capitalization, is_capitalized, \
+    get_safe_text
+
+class GetSafeTextTestCase(TestCase):
+    def test_get_safe_text(self):
+        # type: () -> None
+        string = ('Messages in __page_params.product_name__ go to a '
+                  'stream and have a topic.')
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'Messages in __page_params_product_name__ '
+                                    'go to a stream and have a topic.')
+        # An ignored phrase keeps its capital only at the start or after ". ".
+        string = "Zulip Zulip. Zulip some text!"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'Zulip zulip. Zulip some text!')
+        # Same after "? ".
+        string = "Zulip Zulip? Zulip some text!"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'Zulip zulip? Zulip some text!')
+        # Same after "! ".
+        string = "Zulip Zulip! Zulip some text!"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'Zulip zulip! Zulip some text!')
+        # A comma is not a split boundary, so the phrase after it is lower-cased.
+        string = "Zulip Zulip, Zulip some text!"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'Zulip zulip, zulip some text!')
+        # Ignored phrases are also matched inside a larger token.
+        string = "Some text 25MiB"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'Some text 25mib')
+        # Text without ignored phrases comes back unchanged, even if Title Case.
+        string = "Not Ignored Phrase"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'Not Ignored Phrase')
+        # Same for sentence case.
+        string = "Not ignored phrase"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'Not ignored phrase')
+        # The empty string stays empty.
+        string = ""
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, '')
+        # HTML input: the safe text must re-render to itself (no markup left).
+        string = """
+        <p>Please re-enter your password to confirm your identity.
+                (<a href="/accounts/password/reset/" target="_blank">Forgotten it?</a>)</p>
+                """
+        safe_text = get_safe_text(string)
+        soup = BeautifulSoup(safe_text, 'lxml')
+        rendered_text = ' '.join(soup.text.split())
+        self.assertEqual(safe_text, rendered_text)
+        # A placeholder without a "." is left untouched.
+        string = "Edited (__last_edit_timestr__)"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, string)
+        # An ignored phrase at the start of the text gets capitalized.
+        string = "iPhone application"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'Iphone application')
+        # The "." inside an ignored phrase is replaced by "_".
+        string = "One two etc. three"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'One two etc_ three')
+        # Extra whitespace is collapsed; a "." outside ignored phrases is kept.
+        string = "One two etc. three.      four"
+        safe_text = get_safe_text(string)
+        self.assertEqual(safe_text, 'One two etc_ three. four')
+
+class IsCapitalizedTestCase(TestCase):
+    def test_process_text(self):
+        # type: () -> None
+        string = "Zulip zulip. Zulip some text!"
+        capitalized = is_capitalized(string)
+        self.assertTrue(capitalized)
+        # Two sentences separated by "?".
+        string = "Zulip zulip? Zulip some text!"
+        capitalized = is_capitalized(string)
+        self.assertTrue(capitalized)
+        # Two sentences separated by "!".
+        string = "Zulip zulip! Zulip some text!"
+        capitalized = is_capitalized(string)
+        self.assertTrue(capitalized)
+        # A capital letter after a comma is accepted.
+        string = "Zulip zulip, Zulip some text!"
+        capitalized = is_capitalized(string)
+        self.assertTrue(capitalized)
+        # Digits and lower-case letters after the first word are accepted.
+        string = "Some number 25mib"
+        capitalized = is_capitalized(string)
+        self.assertTrue(capitalized)
+        # Title Case is rejected.
+        string = "Not Ignored Phrase"
+        capitalized = is_capitalized(string)
+        self.assertFalse(capitalized)
+        # Sentence case is accepted.
+        string = "Not ignored phrase"
+        capitalized = is_capitalized(string)
+        self.assertTrue(capitalized)
+        # The empty string is not considered capitalized.
+        string = ""
+        capitalized = is_capitalized(string)
+        self.assertFalse(capitalized)
+        # A second sentence wrapped in parentheses is accepted.
+        string = ("Please re-enter your password to confirm your identity."
+                  " (Forgotten it?)")
+        capitalized = is_capitalized(string)
+        self.assertTrue(capitalized)
+        # A placeholder in parentheses is accepted.
+        string = "Edited (__last_edit_timestr__)"
+        capitalized = is_capitalized(string)
+        self.assertTrue(capitalized)
+        # Safe form of an ignored phrase at the start of the text.
+        string = "Iphone application"
+        capitalized = is_capitalized(string)
+        self.assertTrue(capitalized)
+        # Safe form of "etc." in the middle of a sentence.
+        string = "One two etc_ three"
+        capitalized = is_capitalized(string)
+        self.assertTrue(capitalized)
+
+class CheckCapitalizationTestCase(TestCase):
+    def test_check_capitalization(self):
+        # type: () -> None
+        strings = ["Zulip Zulip. Zulip some text!",
+                   "Zulip Zulip? Zulip some text!",
+                   "Zulip Zulip! Zulip some text!",
+                   "Zulip Zulip, Zulip some text!",
+                   "Some number 25MiB",
+                   "Not Ignored Phrase",
+                   "Not ignored phrase",
+                   ]
+        errored, ignored = check_capitalization(strings)
+        self.assertEqual(errored, ['Not Ignored Phrase'])  # Only the Title Case string fails.
+        self.assertEqual(ignored, sorted(["Zulip Zulip. Zulip some text!",  # Passed, but rewritten.
+                                          "Zulip Zulip? Zulip some text!",
+                                          "Zulip Zulip! Zulip some text!",
+                                          "Zulip Zulip, Zulip some text!",
+                                          "Some number 25MiB",
+                                          ]))
diff --git a/tools/travis/backend b/tools/travis/backend
index cf5305ad5d..0a5c328dab 100755
--- a/tools/travis/backend
+++ b/tools/travis/backend
@@ -12,5 +12,6 @@ set -x
 ./tools/test-documentation
 ./tools/test-help-documentation.py
 ./tools/test-api
+python -W ignore tools/check-capitalization
 # Some test suites disabled in CI for being flaky
 #./tools/test-queue-worker-reload