makemessages: Write with orjson.

orjson’s use of Unicode is more consistent with what we get from Transifex. (We could alternatively use json’s ensure_ascii=False flag.)

Signed-off-by: Anders Kaseorg <anders@zulip.com>
2024-11-13 16:30:36 -08:00 · 2024-11-13 16:30:36 -08:00 · 2de648df02
parent 06a9600aa7
commit 2de648df02
5 changed files with 56 additions and 33 deletions
--- a/tools/i18n/process-mobile-i18n
+++ b/tools/i18n/process-mobile-i18n
@ -1,9 +1,10 @@
 #!/usr/bin/env python3
-import json
 import os
 import re
 from subprocess import check_output

+import orjson
+

 def get_json_filename(locale: str) -> str:
    return f"locale/{locale}/mobile.json"
@ -23,8 +24,8 @@ def get_locales() -> list[str]:


 def get_translation_stats(resource_path: str) -> dict[str, int]:
-    with open(resource_path) as raw_resource_file:
-        raw_info = json.load(raw_resource_file)
+    with open(resource_path, "rb") as raw_resource_file:
+        raw_info = orjson.loads(raw_resource_file.read())

    total = len(raw_info)
    not_translated = len([i for i in raw_info.items() if i[1] == ""])
@ -41,8 +42,12 @@ for locale in get_locales():
        locale_paths.append(path)

 stats_path = os.path.join("locale", "mobile_info.json")
-with open(stats_path, "w") as f:
-    json.dump(translation_stats, f, indent=2, sort_keys=True)
-    f.write("\n")
+with open(stats_path, "wb") as f:
+    f.write(
+        orjson.dumps(
+            translation_stats,
+            option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
+        )
+    )

 print("Mobile stats file created at: " + stats_path)
--- a/tools/i18n/unescape-contents
+++ b/tools/i18n/unescape-contents
@ -2,10 +2,11 @@

 import argparse
 import html
-import json
 import sys
 from typing import NamedTuple

+import orjson
+

 class CLIArgs(NamedTuple):
    unescape_html: bool
@ -38,8 +39,8 @@ if __name__ == "__main__":

    json_data: dict[str, str] = {}

-    with open(args.filename) as source:
-        json_data = json.load(source)
+    with open(args.filename, "rb") as source:
+        json_data = orjson.loads(source.read())

        if args.unescape_html:
            for key, value in json_data.items():
@ -51,11 +52,13 @@ if __name__ == "__main__":
                        file=sys.stderr,
                    )

-    with open(args.filename, mode="w") as dest:
-        # At least on Linux systems with LANG=en_US.UTF-8, ensure_ascii=False
-        # ensures our output uses real UTF-8 codepoints for human readability,
-        # rather than \u0000 style escape sequences, providing us a
-        # somewhat-implicit JSON unescape. This may behave in unexpected ways
-        # on other OSes or system encodings.
-        json.dump(json_data, dest, ensure_ascii=False, indent=args.indent_level)
-        dest.write("\n")
+    with open(args.filename, mode="wb") as dest:
+        # orjson ensures our output uses real UTF-8 codepoints for
+        # human readability, rather than \u0000 style escape
+        # sequences, providing us a somewhat-implicit JSON unescape.
+        dest.write(
+            orjson.dumps(
+                json_data,
+                option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
+            )
+        )
--- a/tools/i18n/update-for-legacy-translations
+++ b/tools/i18n/update-for-legacy-translations
@ -1,9 +1,10 @@
 #!/usr/bin/env python3
-import json
 import os
 import re
 from subprocess import check_output

+import orjson
+
 LEGACY_STRINGS_MAP = {
    "<p>You are searching for messages that belong to more than one channel, which is not possible.</p>": "<p>You are searching for messages that belong to more than one stream, which is not possible.</p>",
    "<strong>{name}</strong> <i>(guest)</i> is not subscribed to this channel. They will not be notified if you mention them.": "<strong>{name}</strong> <i>(guest)</i> is not subscribed to this stream. They will not be notified if you mention them.",
@ -200,8 +201,8 @@ def get_locales() -> list[str]:


 def get_translations(path: str) -> dict[str, str]:
-    with open(path) as raw_resource_file:
-        translations = json.load(raw_resource_file)
+    with open(path, "rb") as raw_resource_file:
+        translations = orjson.loads(raw_resource_file.read())

    return translations

@ -226,9 +227,13 @@ def update_for_legacy_stream_translations(
    # Only replace file content if we've made any updates for legacy
    # translated strings.
    if number_of_updates > 0:
-        with open(path, "w") as f:
-            json.dump(updated_translations, f, ensure_ascii=False, indent=2, sort_keys=True)
-            f.write("\n")
+        with open(path, "wb") as f:
+            f.write(
+                orjson.dumps(
+                    updated_translations,
+                    option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
+                )
+            )
        print(f"Updated {number_of_updates} strings in: {path}")


--- a/zerver/management/commands/compilemessages.py
+++ b/zerver/management/commands/compilemessages.py
@ -1,4 +1,3 @@
-import json
 import os
 import re
 import unicodedata
@ -137,9 +136,13 @@ class Command(compilemessages.Command):
            info["percent_translated"] = percentage
            data["languages"].append(info)

-        with open(output_path, "w") as writer:
-            json.dump(data, writer, indent=2, sort_keys=True)
-            writer.write("\n")
+        with open(output_path, "wb") as writer:
+            writer.write(
+                orjson.dumps(
+                    data,
+                    option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
+                )
+            )

    def get_translation_percentage(self, locale_path: str, locale: str) -> int:
        # backend stats
--- a/zerver/management/commands/makemessages.py
+++ b/zerver/management/commands/makemessages.py
@ -34,13 +34,13 @@ https://stackoverflow.com/questions/2090717

 import glob
 import itertools
-import json
 import os
 import re
 import subprocess
 from collections.abc import Collection, Iterator, Mapping
 from typing import Any

+import orjson
 from django.core.management.base import CommandParser
 from django.core.management.commands import makemessages
 from django.template.base import BLOCK_TAG_END, BLOCK_TAG_START
@ -215,7 +215,7 @@ class Command(makemessages.Command):
                "web/src/**/*.ts",
            ]
        )
-        translation_strings.extend(json.loads(extracted).values())
+        translation_strings.extend(orjson.loads(extracted).values())

        return list(set(translation_strings))

@ -276,11 +276,18 @@ class Command(makemessages.Command):
        for locale, output_path in zip(self.get_locales(), self.get_output_paths(), strict=False):
            self.stdout.write(f"[frontend] processing locale {locale}")
            try:
-                with open(output_path) as reader:
-                    old_strings = json.load(reader)
+                with open(output_path, "rb") as reader:
+                    old_strings = orjson.loads(reader.read())
            except (OSError, ValueError):
                old_strings = {}

            new_strings = self.get_new_strings(old_strings, translation_strings, locale)
-            with open(output_path, "w") as writer:
-                json.dump(new_strings, writer, indent=2, sort_keys=True)
+            with open(output_path, "wb") as writer:
+                writer.write(
+                    orjson.dumps(
+                        new_strings,
+                        option=orjson.OPT_APPEND_NEWLINE
+                        | orjson.OPT_INDENT_2
+                        | orjson.OPT_SORT_KEYS,
+                    )
+                )