makemessages: Write with orjson.

orjson’s use of Unicode is more consistent with what we get from Transifex. (We could alternatively use json’s ensure_ascii=False flag.)

Signed-off-by: Anders Kaseorg <anders@zulip.com>
2024-11-13 16:30:36 -08:00 · 2024-11-13 16:30:36 -08:00 · 2de648df02
parent 06a9600aa7
commit 2de648df02
5 changed files with 56 additions and 33 deletions
--- a/tools/i18n/process-mobile-i18n
+++ b/tools/i18n/process-mobile-i18n
@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 import json
 import os
 import re
 from subprocess import check_output
 import orjson
 def get_json_filename(locale: str) -> str:
    return f"locale/{locale}/mobile.json"
@ -23,8 +24,8 @@ def get_locales() -> list[str]:
 def get_translation_stats(resource_path: str) -> dict[str, int]:
-    with open(resource_path) as raw_resource_file:
+    with open(resource_path, "rb") as raw_resource_file:
-        raw_info = json.load(raw_resource_file)
+        raw_info = orjson.loads(raw_resource_file.read())
    total = len(raw_info)
    not_translated = len([i for i in raw_info.items() if i[1] == ""])
@ -41,8 +42,12 @@ for locale in get_locales():
        locale_paths.append(path)
 stats_path = os.path.join("locale", "mobile_info.json")
-with open(stats_path, "w") as f:
+with open(stats_path, "wb") as f:
-    json.dump(translation_stats, f, indent=2, sort_keys=True)
+    f.write(
-    f.write("\n")
+        orjson.dumps(
            translation_stats,
            option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
        )
    )
 print("Mobile stats file created at: " + stats_path)
--- a/tools/i18n/unescape-contents
+++ b/tools/i18n/unescape-contents
@ -2,10 +2,11 @@
 import argparse
 import html
 import json
 import sys
 from typing import NamedTuple
 import orjson
 class CLIArgs(NamedTuple):
    unescape_html: bool
@ -38,8 +39,8 @@ if __name__ == "__main__":
    json_data: dict[str, str] = {}
-    with open(args.filename) as source:
+    with open(args.filename, "rb") as source:
-        json_data = json.load(source)
+        json_data = orjson.loads(source.read())
        if args.unescape_html:
            for key, value in json_data.items():
@ -51,11 +52,13 @@ if __name__ == "__main__":
                        file=sys.stderr,
                    )
-    with open(args.filename, mode="w") as dest:
+    with open(args.filename, mode="wb") as dest:
-        # At least on Linux systems with LANG=en_US.UTF-8, ensure_ascii=False
+        # orjson ensures our output uses real UTF-8 codepoints for
-        # ensures our output uses real UTF-8 codepoints for human readability,
+        # human readability, rather than \u0000 style escape
-        # rather than \u0000 style escape sequences, providing us a
+        # sequences, providing us a somewhat-implicit JSON unescape.
-        # somewhat-implicit JSON unescape. This may behave in unexpected ways
+        dest.write(
-        # on other OSes or system encodings.
+            orjson.dumps(
-        json.dump(json_data, dest, ensure_ascii=False, indent=args.indent_level)
+                json_data,
-        dest.write("\n")
+                option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
            )
        )
--- a/tools/i18n/update-for-legacy-translations
+++ b/tools/i18n/update-for-legacy-translations
@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 import json
 import os
 import re
 from subprocess import check_output
 import orjson
 LEGACY_STRINGS_MAP = {
    "<p>You are searching for messages that belong to more than one channel, which is not possible.</p>": "<p>You are searching for messages that belong to more than one stream, which is not possible.</p>",
    "<strong>{name}</strong> <i>(guest)</i> is not subscribed to this channel. They will not be notified if you mention them.": "<strong>{name}</strong> <i>(guest)</i> is not subscribed to this stream. They will not be notified if you mention them.",
@ -200,8 +201,8 @@ def get_locales() -> list[str]:
 def get_translations(path: str) -> dict[str, str]:
-    with open(path) as raw_resource_file:
+    with open(path, "rb") as raw_resource_file:
-        translations = json.load(raw_resource_file)
+        translations = orjson.loads(raw_resource_file.read())
    return translations
@ -226,9 +227,13 @@ def update_for_legacy_stream_translations(
    # Only replace file content if we've made any updates for legacy
    # translated strings.
    if number_of_updates > 0:
-        with open(path, "w") as f:
+        with open(path, "wb") as f:
-            json.dump(updated_translations, f, ensure_ascii=False, indent=2, sort_keys=True)
+            f.write(
-            f.write("\n")
+                orjson.dumps(
                    updated_translations,
                    option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
                )
            )
        print(f"Updated {number_of_updates} strings in: {path}")
--- a/zerver/management/commands/compilemessages.py
+++ b/zerver/management/commands/compilemessages.py
@ -1,4 +1,3 @@
 import json
 import os
 import re
 import unicodedata
@ -137,9 +136,13 @@ class Command(compilemessages.Command):
            info["percent_translated"] = percentage
            data["languages"].append(info)
-        with open(output_path, "w") as writer:
+        with open(output_path, "wb") as writer:
-            json.dump(data, writer, indent=2, sort_keys=True)
+            writer.write(
-            writer.write("\n")
+                orjson.dumps(
                    data,
                    option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
                )
            )
    def get_translation_percentage(self, locale_path: str, locale: str) -> int:
        # backend stats
--- a/zerver/management/commands/makemessages.py
+++ b/zerver/management/commands/makemessages.py
@ -34,13 +34,13 @@ https://stackoverflow.com/questions/2090717
 import glob
 import itertools
 import json
 import os
 import re
 import subprocess
 from collections.abc import Collection, Iterator, Mapping
 from typing import Any
 import orjson
 from django.core.management.base import CommandParser
 from django.core.management.commands import makemessages
 from django.template.base import BLOCK_TAG_END, BLOCK_TAG_START
@ -215,7 +215,7 @@ class Command(makemessages.Command):
                "web/src/**/*.ts",
            ]
        )
-        translation_strings.extend(json.loads(extracted).values())
+        translation_strings.extend(orjson.loads(extracted).values())
        return list(set(translation_strings))
@ -276,11 +276,18 @@ class Command(makemessages.Command):
        for locale, output_path in zip(self.get_locales(), self.get_output_paths(), strict=False):
            self.stdout.write(f"[frontend] processing locale {locale}")
            try:
-                with open(output_path) as reader:
+                with open(output_path, "rb") as reader:
-                    old_strings = json.load(reader)
+                    old_strings = orjson.loads(reader.read())
            except (OSError, ValueError):
                old_strings = {}
            new_strings = self.get_new_strings(old_strings, translation_strings, locale)
-            with open(output_path, "w") as writer:
+            with open(output_path, "wb") as writer:
-                json.dump(new_strings, writer, indent=2, sort_keys=True)
+                writer.write(
                    orjson.dumps(
                        new_strings,
                        option=orjson.OPT_APPEND_NEWLINE
                        | orjson.OPT_INDENT_2
                        | orjson.OPT_SORT_KEYS,
                    )
                )