mirror of https://github.com/zulip/zulip.git
i18n: Unescape Unicode sequences in JSON.
This greatly improves the readability of the diffs and in-codebase translation strings over using ASCII sequences for unicode in the JSON. We've previously noticed [^1] some JSON translation files ending up with escaped Unicode sequences on disk, which Transifex indicates is expected behavior [^2], though it is sometimes fixed by `manage.py compilemessages` [^3]. Further, as noted in #23932 [^4], some JSON translation files include HTML-escaped entities like quotation marks. This script will ingest valid JSON files and output them as proper UTF-8 files with appropriately unescaped (unless otherwise necessary, like double quotes being backslash-escaped) sequences, except when the key itself contains HTML escape sequences (as it's presumed the value of such entries must be pre-escaped before being passed to consumers). [^1]: https://chat.zulip.org/#narrow/stream/58-translation/topic/Transifex.20client/near/1479205 [^2]: https://chat.zulip.org/#narrow/stream/58-translation/topic/an.20email.20for.20Transifex.20support/near/1481287 [^3]: https://chat.zulip.org/#narrow/stream/58-translation/topic/an.20email.20for.20Transifex.20support/near/1481908 [^4]: Which is not end-to-end fixed yet by this commit: that will require a new release of Zulip Server. gitlint-ignore: B1, title-trailing-punctuation, body-min-length, body-is-missing
This commit is contained in:
parent
0718043283
commit
70b30e7792
|
@ -8,5 +8,11 @@ set -x
|
||||||
|
|
||||||
./manage.py makemessages --all
|
./manage.py makemessages --all
|
||||||
tx pull -a -f --mode=translator --minimum-perc=5 "$@"
|
tx pull -a -f --mode=translator --minimum-perc=5 "$@"
|
||||||
|
# For readability, we prefer UTF-8, not ascii, in these JSON files.
|
||||||
|
find ./locale \
|
||||||
|
-regextype sed \
|
||||||
|
-regex '^\./locale/.*/\(mobile\|translations\).json$' \
|
||||||
|
-exec ./tools/i18n/unescape-contents {} \;
|
||||||
|
|
||||||
./manage.py compilemessages
|
./manage.py compilemessages
|
||||||
./tools/i18n/process-mobile-i18n
|
./tools/i18n/process-mobile-i18n
|
||||||
|
|
|
@ -0,0 +1,60 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import html
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from typing import Dict, NamedTuple
|
||||||
|
|
||||||
|
|
||||||
|
class CLIArgs(NamedTuple):
    """Parsed command-line options for the unescape-contents script."""

    # True when --unescape-html was passed on the command line.
    unescape_html: bool
    # Path of the JSON file to read and rewrite in place.
    filename: str
    # Number of spaces used to indent the rewritten JSON.
    indent_level: int
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> CLIArgs:
    """Parse the process's command-line arguments into a CLIArgs tuple.

    Accepts one positional ``filename`` plus the optional ``--unescape-html``
    flag and ``--indent-level`` integer (default 2).
    """
    cli = argparse.ArgumentParser(
        prog="unescape-contents",
        description="Unescape Unicode and, optionally, HTML entities in a JSON file. Input file must be a JSON dictionary. Output will always be unescaped UTF-8.",
    )

    cli.add_argument("filename", type=str)
    cli.add_argument(
        "--unescape-html",
        action="store_true",
        dest="unescape_html",
        help="If the key of a dictionary field does not contain HTML escapes, unescape any HTML escapes found in the value",
    )
    cli.add_argument("--indent-level", dest="indent_level", type=int, default=2, required=False)

    parsed = cli.parse_args()
    return CLIArgs(
        unescape_html=parsed.unescape_html,
        filename=parsed.filename,
        indent_level=parsed.indent_level,
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Rewrite the given JSON dictionary file in place as readable UTF-8,
    # optionally unescaping HTML entities in the values.
    args = parse_args()
    print(f"unescaping file {args.filename}", file=sys.stderr)

    json_data: Dict[str, str] = {}

    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's preferred encoding (LANG/LC_ALL) of whoever runs the script.
    with open(args.filename, encoding="utf-8") as source:
        json_data = json.load(source)

    if args.unescape_html:
        # Only existing keys are reassigned, so the dict does not change size
        # while it is being iterated.
        for key, value in json_data.items():
            if key == html.unescape(key):
                json_data[key] = html.unescape(value)
            else:
                # The key itself contains HTML escapes, so the value is left
                # untouched.
                print(
                    f'{args.filename}: key "{key}" contains HTML, not escaping HTML in value',
                    file=sys.stderr,
                )

    # ensure_ascii=False emits real Unicode characters instead of \uXXXX
    # escape sequences, which is the implicit "JSON unescape" step.
    # Serialize before reopening the file so a serialization error cannot
    # leave a truncated translation file behind.
    serialized = json.dumps(json_data, ensure_ascii=False, indent=args.indent_level)

    # Encode explicitly as UTF-8: with a non-UTF-8 locale the default
    # encoding could fail on non-ASCII characters after truncating the file.
    with open(args.filename, mode="w", encoding="utf-8") as dest:
        dest.write(serialized)
|
|
@ -0,0 +1,8 @@
|
||||||
|
#!/usr/bin/env sh

# Run unescape-contents with --unescape-html over every mobile.json and
# translations.json found under ./locale.
# -e aborts on the first failing command; -x echoes each command.
set -ex

# -regextype sed is a GNU find extension. The dot before "json" is escaped
# so that it matches only a literal ".", not any character.
find ./locale \
    -regextype sed \
    -regex '^\./locale/.*/\(mobile\|translations\)\.json$' \
    -exec ./tools/i18n/unescape-contents --unescape-html {} \;
|
Loading…
Reference in New Issue