From 70b30e7792e6b9197aefb8f1985c89079a9a921b Mon Sep 17 00:00:00 2001
From: Josh Klar
Date: Wed, 4 Jan 2023 19:12:15 -0800
Subject: [PATCH] i18n: Unescape Unicode sequences in JSON.

This greatly improves the readability of the diffs and in-codebase
translation strings over using ASCII escape sequences for Unicode in
the JSON.

We've previously noticed [^1] some JSON translation files ending up
with escaped Unicode sequences on disk, which Transifex indicates is
expected behavior [^2], though it is sometimes fixed by
`manage.py compilemessages` [^3].

Further, as noted in #23932 [^4], some JSON translation files include
HTML-escaped entities like quotation marks.

This script will ingest valid JSON files and output them as proper
UTF-8 files with Unicode sequences unescaped, except where JSON itself
requires an escape (for example, double quotes remain
backslash-escaped). It can optionally unescape HTML entities in values
as well, except when the key itself contains HTML escape sequences (as
it's presumed the value of such entries must be pre-escaped before
being passed to consumers).

[^1]: https://chat.zulip.org/#narrow/stream/58-translation/topic/Transifex.20client/near/1479205
[^2]: https://chat.zulip.org/#narrow/stream/58-translation/topic/an.20email.20for.20Transifex.20support/near/1481287
[^3]: https://chat.zulip.org/#narrow/stream/58-translation/topic/an.20email.20for.20Transifex.20support/near/1481908
[^4]: This commit does not yet fix that issue end to end; that will
      require a new release of Zulip Server.
gitlint-ignore: B1, title-trailing-punctuation, body-min-length, body-is-missing
---
 tools/i18n/sync-translations                  |  6 ++
 tools/i18n/unescape-contents                  | 72 +++++++++++++++++++
 tools/i18n/unescape-html-in-json-translations |  8 +++
 3 files changed, 86 insertions(+)
 create mode 100755 tools/i18n/unescape-contents
 create mode 100755 tools/i18n/unescape-html-in-json-translations

diff --git a/tools/i18n/sync-translations b/tools/i18n/sync-translations
index 3bea2a7741..911262055c 100755
--- a/tools/i18n/sync-translations
+++ b/tools/i18n/sync-translations
@@ -8,5 +8,11 @@ set -x
 
 ./manage.py makemessages --all
 tx pull -a -f --mode=translator --minimum-perc=5 "$@"
+# For readability, we prefer UTF-8, not ascii, in these JSON files.
+find ./locale \
+    -regextype sed \
+    -regex '^\./locale/.*/\(mobile\|translations\)\.json$' \
+    -exec ./tools/i18n/unescape-contents {} \;
+
 ./manage.py compilemessages
 ./tools/i18n/process-mobile-i18n
diff --git a/tools/i18n/unescape-contents b/tools/i18n/unescape-contents
new file mode 100755
index 0000000000..b388fffe8d
--- /dev/null
+++ b/tools/i18n/unescape-contents
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+"""Rewrite a JSON translation file in place as unescaped UTF-8.
+
+The input file must contain a JSON dictionary, expected to map strings to
+strings. With --unescape-html, HTML entities in each value are unescaped
+as well, unless the entry's key itself contains HTML escapes.
+"""
+
+import argparse
+import html
+import json
+import sys
+from typing import Dict, NamedTuple
+
+
+class CLIArgs(NamedTuple):
+    """Parsed command-line options for this script."""
+
+    unescape_html: bool
+    filename: str
+    indent_level: int
+
+
+def parse_args() -> CLIArgs:
+    """Parse the command line into a CLIArgs tuple."""
+    parser = argparse.ArgumentParser(
+        prog="unescape-contents",
+        description="Unescape Unicode and, optionally, HTML entities in a JSON file. Input file must be a JSON dictionary. Output will always be unescaped UTF-8.",
+    )
+
+    parser.add_argument("filename", type=str)
+    parser.add_argument(
+        "--unescape-html",
+        action="store_true",
+        dest="unescape_html",
+        help="If the key of a dictionary field does not contain HTML escapes, unescape any HTML escapes found in the value",
+    )
+    parser.add_argument("--indent-level", dest="indent_level", type=int, default=2, required=False)
+    args = parser.parse_args()
+
+    return CLIArgs(args.unescape_html, args.filename, args.indent_level)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    print(f"unescaping file {args.filename}", file=sys.stderr)
+
+    json_data: Dict[str, str] = {}
+
+    # Read explicitly as UTF-8 so the result does not depend on the locale.
+    with open(args.filename, encoding="utf-8") as source:
+        json_data = json.load(source)
+
+    if args.unescape_html:
+        # Only existing keys are reassigned, so mutating the dictionary while
+        # iterating over it is safe here.
+        for key, value in json_data.items():
+            if key == html.unescape(key):
+                json_data[key] = html.unescape(value)
+            else:
+                print(
+                    f'{args.filename}: key "{key}" contains HTML, not unescaping HTML in value',
+                    file=sys.stderr,
+                )
+
+    with open(args.filename, mode="w", encoding="utf-8") as dest:
+        # ensure_ascii=False makes json.dump emit real codepoints for human
+        # readability, rather than \u0000 style escape sequences, providing
+        # us a somewhat-implicit JSON unescape. The explicit encoding="utf-8"
+        # guarantees they land on disk as UTF-8 regardless of the OS or
+        # locale default encoding.
+        json.dump(json_data, dest, ensure_ascii=False, indent=args.indent_level)
diff --git a/tools/i18n/unescape-html-in-json-translations b/tools/i18n/unescape-html-in-json-translations
new file mode 100755
index 0000000000..047cfcb79e
--- /dev/null
+++ b/tools/i18n/unescape-html-in-json-translations
@@ -0,0 +1,8 @@
+#!/usr/bin/env sh
+
+set -ex
+
+find ./locale \
+    -regextype sed \
+    -regex '^\./locale/.*/\(mobile\|translations\)\.json$' \
+    -exec ./tools/i18n/unescape-contents --unescape-html {} \;