tools: Add `export_emoji_names_to_csv`.

This tool exports the `emoji_names.py` data set to a CSV file.
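Example invocation (assuming the script lives under tools/setup/emoji/,
next to emoji_names.py, as its argument defaults suggest):

    ./tools/setup/emoji/export_emoji_names_to_csv \
        --output-file=/tmp/emoji_names.csv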
Harshit Bansal 2017-11-15 19:31:13 +00:00 committed by Tim Abbott
parent 05f85eb94d
commit 628e868d1e
1 changed file with 127 additions and 0 deletions


@@ -0,0 +1,127 @@
#!/usr/bin/env python3
#
# This exports the emoji_names.py data set to a CSV file in the same
# format used as input for import_emoji_names_from_csv. We use this
# as part of a test for the correctness of the import process (one can
# compare the exported CSV file to the original CSV file, and if the
# data round-tripped with no changes, we know everything is working
# correctly).
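#
# For reference, an entry in emoji_names.py has roughly the following shape
# (an illustrative example, not copied from the real data set):
#
#     # Comment lines like this one, appearing before an entry, end up in
#     # that entry's "explanation" column.
#     '1f604': {'canonical_name': 'smile', 'aliases': ['happy']},
#
# and each such entry becomes one row of the exported CSV file.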
import argparse
import csv
import os
import re
import ujson
from typing import Any, Dict, List


TOOLS_DIR_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
ZULIP_PATH = os.path.dirname(TOOLS_DIR_PATH)
# The `emoji.json` file is the same in all four emoji-datasource packages.
EMOJI_DATA_PATH = os.path.join(ZULIP_PATH, 'node_modules', 'emoji-datasource-google', 'emoji.json')
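# Maps each emoji codepoint to the category and zero-padded sort order used to
# build the "New sorting info" column; populated by prepare_sorting_info().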
sorting_info = {} # type: Dict[str, Any]
column_names = [
'Codepoint',
'New sorting info',
'zulip (main)',
'zulip (alternates)',
'explanation',
]
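# Numeric prefix for each emoji category, so that "New sorting info" values
# sort by category first and by sort order within a category.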
category_index = {
'People': '1',
'Nature': '2',
'Foods': '3',
'Activity': '4',
'Places': '5',
'Objects': '6',
'Symbols': '7',
'Flags': '8',
'Skin Tones': '9',
}
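# Matches a single emoji_names.py entry of the form
# `'<codepoint>': {'canonical_name': '<name>', 'aliases': ['<alias>', ...]},`
# capturing the codepoint, canonical name, and comma-separated aliases.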
name_entry_regex = re.compile(r"'(?P<emoji_code>[a-z0-9-]+)': "
r"{'canonical_name': '(?P<canonical_name>[+-]?[a-z0-9_X-]+)',[\n ]+"
r"'aliases': \[(?P<aliases>('([+-]?[a-z0-9_, X-]+)'[, ]{0,2})*)\]},")
explanation_regex = re.compile(r" # (?P<explanation_line>[^\r\n\t]+)")


def prepare_sorting_info() -> None:
emoji_data = [] # type: List[Dict[str, Any]]
with open(EMOJI_DATA_PATH) as fp:
emoji_data = ujson.load(fp)
for emoji_dict in emoji_data:
emoji_code = emoji_dict['unified'].lower()
sort_order = str(emoji_dict['sort_order']).strip()
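        # Zero-pad the sort order below (e.g. '54' -> '054') so that the
        # values also compare correctly as strings.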
sorting_info[emoji_code] = {
'category': emoji_dict['category'],
'sort_order': sort_order.rjust(3, '0'),
        }


def get_sorting_info(emoji_code: str) -> str:
category = sorting_info[emoji_code]['category']
category = category_index[category] + '-' + category
sort_order = sorting_info[emoji_code]['sort_order']
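    # Produces strings like "1-People 054": the numbered category followed by
    # the zero-padded sort order.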
    return ' '.join([category, sort_order])


def prepare_explanation(explanation_lines: List[str]) -> str:
    return ' '.join(explanation_lines)


def prepare_aliases(captured_aliases: str) -> str:
aliases = []
for alias in captured_aliases.split(', '):
aliases.append(alias.strip("'"))
    return ', '.join(aliases)


def main() -> None:
    description = ("This script exports `emoji_names.py` to a comma-separated (CSV) file. It "
                   "takes the path of the output CSV file and the path to `emoji_names.py` as arguments.")
parser = argparse.ArgumentParser(description=description)
parser.add_argument(
"--input-file", dest="input_file_path", type=str, metavar="<path>",
default=os.path.join(TOOLS_DIR_PATH, "setup", "emoji", "emoji_names.py"),
help="Path to the file from which data is to be read.")
parser.add_argument(
"--output-file", dest="output_file_path", type=str, metavar="<path>",
default=os.path.join(TOOLS_DIR_PATH, "setup", "emoji", "emoji_names.csv"),
help="Path to the output csv file.")
args = parser.parse_args()
prepare_sorting_info()
output_data = [column_names, ]
explanation_lines = [] # type: List[str]
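    # Scan emoji_names.py line by line with the regexes defined above (rather
    # than importing it), accumulating explanation comments until the entry
    # they belong to is reached.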
with open(args.input_file_path) as fp:
for line in fp.readlines():
match = name_entry_regex.search(line)
if match is not None:
emoji_code = match.group('emoji_code')
sort_info = get_sorting_info(emoji_code)
canonical_name = match.group('canonical_name')
aliases = prepare_aliases(match.group('aliases'))
explanation = prepare_explanation(explanation_lines)
output_data.append([
emoji_code,
sort_info,
canonical_name,
aliases,
explanation,
])
explanation_lines = []
continue
match = explanation_regex.search(line)
if match is not None:
explanation_line = match.group('explanation_line').strip()
explanation_lines.append(explanation_line)
fp = open(args.output_file_path, 'w')
writer = csv.writer(fp, dialect='excel')
writer.writerows(output_data)
    # The CSV file exported by Google Sheets doesn't have a newline
    # character at the end, so we strip the trailing newline here as well,
    # so that the round-trip conversion test passes.
line_sep_len = len(os.linesep)
fp.truncate(fp.tell() - line_sep_len)
    fp.close()


if __name__ == "__main__":
main()