#!/usr/bin/env python3
"""Unescape Unicode and, optionally, HTML entities in a JSON file, in place.

The file named on the command line is parsed, optionally has the HTML
entities in its top-level string values unescaped, and is then rewritten
as pretty-printed, key-sorted UTF-8 JSON.
"""

import argparse
import html
import json
import sys
from typing import Any, NamedTuple

import orjson


class CLIArgs(NamedTuple):
    """Command-line options after parsing."""

    # True when HTML entities in values should be unescaped.
    unescape_html: bool
    # Path of the JSON file that is read and then overwritten.
    filename: str
    # Number of spaces per indentation level in the written file.
    indent_level: int


def parse_args() -> CLIArgs:
    """Parse ``sys.argv`` and return the options as a ``CLIArgs`` tuple."""
    parser = argparse.ArgumentParser(
        prog="unescape-contents",
        description="Unescape Unicode and, optionally, HTML entities in a JSON file. Input file must be a JSON dictionary. Output will always be unescaped UTF-8.",
    )
    parser.add_argument("filename", type=str)
    parser.add_argument(
        "--unescape-html",
        action="store_true",
        dest="unescape_html",
        help="If the key of a dictionary field does not contain HTML escapes, unescape any HTML escapes found in the value",
    )
    parser.add_argument("--indent-level", dest="indent_level", type=int, default=2, required=False)

    args = parser.parse_args()
    return CLIArgs(args.unescape_html, args.filename, args.indent_level)


def _unescape_html_values(json_data: dict[str, Any], filename: str) -> None:
    """Unescape HTML entities in the string values of *json_data*, in place.

    A value is left untouched (and a note is printed to stderr) when its key
    itself contains HTML escapes. Non-string values are skipped, because
    ``html.unescape`` only accepts strings.
    """
    # Only existing keys are reassigned, so the dict never changes size
    # while it is being iterated.
    for key, value in json_data.items():
        if key != html.unescape(key):
            print(
                f'{filename}: key "{key}" contains HTML, not unescaping HTML in value',
                file=sys.stderr,
            )
            continue
        if isinstance(value, str):
            json_data[key] = html.unescape(value)


def _serialize(json_data: Any, indent_level: int) -> bytes:
    """Return *json_data* as key-sorted, newline-terminated UTF-8 JSON bytes.

    The default indent of 2 goes through orjson, exactly as before. orjson
    has no option for other indent widths, so any other ``indent_level`` is
    rendered with the standard-library ``json`` module instead.
    """
    if indent_level == 2:
        # orjson ensures our output uses real UTF-8 codepoints for
        # human readability, rather than \u0000 style escape
        # sequences, providing us a somewhat-implicit JSON unescape.
        return orjson.dumps(
            json_data,
            option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
        )

    # ensure_ascii=False keeps non-ASCII characters literal, matching the
    # orjson branch above.
    text = json.dumps(json_data, ensure_ascii=False, indent=indent_level, sort_keys=True)
    return (text + "\n").encode("utf-8")


def main() -> None:
    """Read the JSON file, unescape its contents, and overwrite it."""
    args = parse_args()
    print(f"unescaping file {args.filename}", file=sys.stderr)

    with open(args.filename, "rb") as source:
        json_data = orjson.loads(source.read())

    if args.unescape_html:
        if not isinstance(json_data, dict):
            # sys.exit with a string prints it to stderr and exits with status 1.
            sys.exit(f"{args.filename}: top-level JSON value must be a dictionary")
        _unescape_html_values(json_data, args.filename)

    # Serialize before opening the destination: opening with "wb" truncates
    # the file, so a serialization failure must not happen after that point.
    output = _serialize(json_data, args.indent_level)
    with open(args.filename, mode="wb") as dest:
        dest.write(output)


if __name__ == "__main__":
    main()