diff --git a/tools/setup/emoji/export_emoji_names_to_csv b/tools/setup/emoji/export_emoji_names_to_csv
new file mode 100755
index 0000000000..ddb36cc8ee
--- /dev/null
+++ b/tools/setup/emoji/export_emoji_names_to_csv
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+#
+# This exports the emoji_names.py data set to a CSV file in the same
+# format used as input for import_emoji_names_from_csv. We use this
+# as part of a test for the correctness of the import process (one can
+# compare the exported CSV file to the original CSV file, and if the
+# data round-tripped with no changes, we know everything is working
+# correctly).
+import argparse
+import csv
+import os
+import re
+import ujson
+
+from typing import Any, Dict, List
+
+TOOLS_DIR_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+ZULIP_PATH = os.path.dirname(TOOLS_DIR_PATH)
+# The `emoji.json` file is the same in all four emoji-datasource packages.
+EMOJI_DATA_PATH = os.path.join(ZULIP_PATH, 'node_modules', 'emoji-datasource-google', 'emoji.json')
+
+sorting_info = {}  # type: Dict[str, Any]
+column_names = [
+    'Codepoint',
+    'New sorting info',
+    'zulip (main)',
+    'zulip (alternates)',
+    'explanation',
+]
+category_index = {
+    'People': '1',
+    'Nature': '2',
+    'Foods': '3',
+    'Activity': '4',
+    'Places': '5',
+    'Objects': '6',
+    'Symbols': '7',
+    'Flags': '8',
+    'Skin Tones': '9',
+}
+
+name_entry_regex = re.compile(r"'(?P<emoji_code>[a-z0-9-]+)': "
+                              r"{'canonical_name': '(?P<canonical_name>[+-]?[a-z0-9_X-]+)',[\n ]+"
+                              r"'aliases': \[(?P<aliases>('([+-]?[a-z0-9_, X-]+)'[, ]{0,2})*)\]},")
+explanation_regex = re.compile(r" # (?P<explanation_line>[^\r\n\t]+)")
+
+def prepare_sorting_info() -> None:
+    emoji_data = []  # type: List[Dict[str, Any]]
+    with open(EMOJI_DATA_PATH) as fp:
+        emoji_data = ujson.load(fp)
+
+    for emoji_dict in emoji_data:
+        emoji_code = emoji_dict['unified'].lower()
+        sort_order = str(emoji_dict['sort_order']).strip()
+        sorting_info[emoji_code] = {
+            'category': emoji_dict['category'],
+            'sort_order': sort_order.rjust(3, '0'),
+        }
+
+def get_sorting_info(emoji_code: str) -> str:
+    category = sorting_info[emoji_code]['category']
+    category = category_index[category] + '-' + category
+    sort_order = sorting_info[emoji_code]['sort_order']
+    return ' '.join([category, sort_order])
+
+def prepare_explanation(explanation_lines: List[str]) -> str:
+    return ' '.join(explanation_lines)
+
+def prepare_aliases(captured_aliases: str) -> str:
+    aliases = []
+    for alias in captured_aliases.split(', '):
+        aliases.append(alias.strip("'"))
+    return ', '.join(aliases)
+
+def main() -> None:
+    description = ("This script is used for exporting `emoji_names.py` to a comma-separated file. It "
+                   "takes the path of the output CSV file and the path to `emoji_names.py` as arguments.")
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument(
+        "--input-file", dest="input_file_path", type=str, metavar="",
+        default=os.path.join(TOOLS_DIR_PATH, "setup", "emoji", "emoji_names.py"),
+        help="Path to the file from which data is to be read.")
+    parser.add_argument(
+        "--output-file", dest="output_file_path", type=str, metavar="",
+        default=os.path.join(TOOLS_DIR_PATH, "setup", "emoji", "emoji_names.csv"),
+        help="Path to the output CSV file.")
+
+    args = parser.parse_args()
+    prepare_sorting_info()
+    output_data = [column_names, ]
+    explanation_lines = []  # type: List[str]
+    with open(args.input_file_path) as fp:
+        for line in fp.readlines():
+            match = name_entry_regex.search(line)
+            if match is not None:
+                emoji_code = match.group('emoji_code')
+                sort_info = get_sorting_info(emoji_code)
+                canonical_name = match.group('canonical_name')
+                aliases = prepare_aliases(match.group('aliases'))
+                explanation = prepare_explanation(explanation_lines)
+                output_data.append([
+                    emoji_code,
+                    sort_info,
+                    canonical_name,
+                    aliases,
+                    explanation,
+                ])
+                explanation_lines = []
+                continue
+
+            match = explanation_regex.search(line)
+            if match is not None:
+                explanation_line = match.group('explanation_line').strip()
+                explanation_lines.append(explanation_line)
+
+    fp = open(args.output_file_path, 'w')
+    writer = csv.writer(fp, dialect='excel')
+    writer.writerows(output_data)
+    # The CSV file exported by Google Sheets doesn't have a newline
+    # character at the end, so we also strip the last newline character
+    # so that the round-trip conversion test passes.
+    line_sep_len = len(os.linesep)
+    fp.truncate(fp.tell() - line_sep_len)
+    fp.close()
+
+if __name__ == "__main__":
+    main()