From 70b30e7792e6b9197aefb8f1985c89079a9a921b Mon Sep 17 00:00:00 2001
From: Josh Klar <josh@zulip.com>
Date: Wed, 4 Jan 2023 19:12:15 -0800
Subject: [PATCH] i18n: Unescape Unicode sequences in JSON.

This greatly improves the readability of diffs and in-codebase
translation strings, compared with using ASCII escape sequences for
Unicode characters in the JSON.

We've previously noticed [^1] some JSON translation files ending up with
escaped Unicode sequences on disk, which Transifex indicates is expected
behavior [^2], though it is sometimes fixed by `manage.py
compilemessages` [^3]. Further, as noted in #23932 [^4], some JSON
translation files include HTML-escaped entities like quotation marks.

This script will ingest valid JSON files and output them as proper UTF-8
files with appropriately unescaped (unless otherwise necessary, like
double quotes being backslash-escaped) sequences, except when the key
itself contains HTML escape sequences (as it's presumed the value of
such entries must be pre-escaped before being passed to consumers).

[^1]: https://chat.zulip.org/#narrow/stream/58-translation/topic/Transifex.20client/near/1479205

[^2]: https://chat.zulip.org/#narrow/stream/58-translation/topic/an.20email.20for.20Transifex.20support/near/1481287

[^3]: https://chat.zulip.org/#narrow/stream/58-translation/topic/an.20email.20for.20Transifex.20support/near/1481908

[^4]: This commit does not yet fix that issue end-to-end; doing so
will require a new release of Zulip Server.

gitlint-ignore: B1, title-trailing-punctuation, body-min-length, body-is-missing
---
 tools/i18n/sync-translations                  |  6 ++
 tools/i18n/unescape-contents                  | 60 +++++++++++++++++++
 tools/i18n/unescape-html-in-json-translations |  8 +++
 3 files changed, 74 insertions(+)
 create mode 100755 tools/i18n/unescape-contents
 create mode 100755 tools/i18n/unescape-html-in-json-translations

diff --git a/tools/i18n/sync-translations b/tools/i18n/sync-translations
index 3bea2a7741..911262055c 100755
--- a/tools/i18n/sync-translations
+++ b/tools/i18n/sync-translations
@@ -8,5 +8,11 @@ set -x
 
 ./manage.py makemessages --all
 tx pull -a -f --mode=translator --minimum-perc=5 "$@"
+# For readability, we prefer real UTF-8 over ASCII escapes in these JSON files.
+find ./locale \
+    -regextype sed \
+    -regex '^\./locale/.*/\(mobile\|translations\)\.json$' \
+    -exec ./tools/i18n/unescape-contents {} \;
+
 ./manage.py compilemessages
 ./tools/i18n/process-mobile-i18n
diff --git a/tools/i18n/unescape-contents b/tools/i18n/unescape-contents
new file mode 100755
index 0000000000..b388fffe8d
--- /dev/null
+++ b/tools/i18n/unescape-contents
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+
+import argparse
+import html
+import json
+import sys
+from typing import Dict, NamedTuple
+
+
+class CLIArgs(NamedTuple):  # Parsed command-line options for this script.
+    unescape_html: bool  # True when --unescape-html was passed.
+    filename: str  # Path of the JSON file; it is read and then overwritten in place.
+    indent_level: int  # Passed to json.dump() as indent; --indent-level, default 2.
+
+
+def parse_args() -> CLIArgs:
+    """Parse the process command line and return the options as a CLIArgs tuple."""
+    cli = argparse.ArgumentParser(
+        prog="unescape-contents",
+        description="Unescape Unicode and, optionally, HTML entities in a JSON file. Input file must be a JSON dictionary. Output will always be unescaped UTF-8.",
+    )
+    cli.add_argument("filename", type=str)
+    cli.add_argument(
+        "--unescape-html",
+        action="store_true",
+        dest="unescape_html",
+        help="If the key of a dictionary field does not contain HTML escapes, unescape any HTML escapes found in the value",
+    )
+    cli.add_argument("--indent-level", dest="indent_level", type=int, default=2, required=False)
+    parsed = cli.parse_args()
+
+    return CLIArgs(parsed.unescape_html, parsed.filename, parsed.indent_level)
+
+
+if __name__ == "__main__":
+    # Rewrite the given JSON dictionary file in place as unescaped UTF-8.
+    args = parse_args()
+    print(f"unescaping file {args.filename}", file=sys.stderr)
+
+    # Pin the encoding: JSON text is UTF-8, and the locale default may not be.
+    with open(args.filename, encoding="utf-8") as source:
+        json_data: Dict[str, str] = json.load(source)
+
+    if args.unescape_html:
+        for key, value in json_data.items():
+            # A key that itself carries HTML escapes presumably means consumers
+            # expect a pre-escaped value, so leave that entry untouched.
+            if key == html.unescape(key):
+                json_data[key] = html.unescape(value)
+            else:
+                print(
+                    f'{args.filename}: key "{key}" contains HTML, not unescaping HTML in value',
+                    file=sys.stderr,
+                )
+
+    with open(args.filename, mode="w", encoding="utf-8") as dest:
+        # ensure_ascii=False writes real characters rather than \uXXXX escapes
+        # (an implicit JSON unescape); the explicit UTF-8 encoding above makes
+        # the bytes on disk independent of the OS or locale settings.
+        json.dump(json_data, dest, ensure_ascii=False, indent=args.indent_level)
diff --git a/tools/i18n/unescape-html-in-json-translations b/tools/i18n/unescape-html-in-json-translations
new file mode 100755
index 0000000000..047cfcb79e
--- /dev/null
+++ b/tools/i18n/unescape-html-in-json-translations
@@ -0,0 +1,8 @@
+#!/usr/bin/env sh
+
+set -ex
+# Run unescape-contents with --unescape-html on each locale's mobile/translations JSON file.
+find ./locale \
+    -regextype sed \
+    -regex '^\./locale/.*/\(mobile\|translations\)\.json$' \
+    -exec ./tools/i18n/unescape-contents --unescape-html {} \;