tools: Add `export_emoji_names_to_csv`.

This tool exports the `emoji_names.py` data set to a CSV file.
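Example invocation (assuming the script lives under tools/setup/emoji/,
next to emoji_names.py, as its argument defaults suggest):

    ./tools/setup/emoji/export_emoji_names_to_csv \
        --output-file=/tmp/emoji_names.csv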
Harshit Bansal 2017-11-15 19:31:13 +00:00 committed by Tim Abbott
parent 05f85eb94d
commit 628e868d1e
1 changed file with 127 additions and 0 deletions


@@ -0,0 +1,127 @@
#!/usr/bin/env python3
#
# This exports the emoji_names.py data set to a CSV file in the same
# format used as input for import_emoji_names_from_csv. We use this
# as part of a test for the correctness of the import process (one can
# compare the exported CSV file to the original CSV file, and if the
# data round-tripped with no changes, we know everything is working
# correctly).
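#
# For reference, an entry in emoji_names.py has roughly the following shape
# (an illustrative example, not copied from the real data set):
#
#     # Comment lines like this one, appearing before an entry, end up in
#     # that entry's "explanation" column.
#     '1f604': {'canonical_name': 'smile', 'aliases': ['happy']},
#
# and each such entry becomes one row of the exported CSV file.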
import argparse
import csv
import os
import re
import ujson
from typing import Any, Dict, List


TOOLS_DIR_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
ZULIP_PATH = os.path.dirname(TOOLS_DIR_PATH)
# The `emoji.json` file is the same in all four emoji-datasource packages.
EMOJI_DATA_PATH = os.path.join(ZULIP_PATH, 'node_modules', 'emoji-datasource-google', 'emoji.json')
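# Maps each emoji codepoint to the category and zero-padded sort order used to
# build the "New sorting info" column; populated by prepare_sorting_info().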
sorting_info = {} # type: Dict[str, Any]
column_names = [
'Codepoint',
'New sorting info',
'zulip (main)',
'zulip (alternates)',
'explanation',
]
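# Numeric prefix for each emoji category, so that "New sorting info" values
# sort by category first and by sort order within a category.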
category_index = {
'People': '1',
'Nature': '2',
'Foods': '3',
'Activity': '4',
'Places': '5',
'Objects': '6',
'Symbols': '7',
'Flags': '8',
'Skin Tones': '9',
}
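# Matches a single emoji_names.py entry of the form
# `'<codepoint>': {'canonical_name': '<name>', 'aliases': ['<alias>', ...]},`
# capturing the codepoint, canonical name, and comma-separated aliases.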
name_entry_regex = re.compile(r"'(?P<emoji_code>[a-z0-9-]+)': "
r"{'canonical_name': '(?P<canonical_name>[+-]?[a-z0-9_X-]+)',[\n ]+"
r"'aliases': \[(?P<aliases>('([+-]?[a-z0-9_, X-]+)'[, ]{0,2})*)\]},")
explanation_regex = re.compile(r" # (?P<explanation_line>[^\r\n\t]+)")


def prepare_sorting_info() -> None:
emoji_data = [] # type: List[Dict[str, Any]]
with open(EMOJI_DATA_PATH) as fp:
emoji_data = ujson.load(fp)
for emoji_dict in emoji_data:
emoji_code = emoji_dict['unified'].lower()
sort_order = str(emoji_dict['sort_order']).strip()
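        # Zero-pad the sort order below (e.g. '54' -> '054') so that the
        # values also compare correctly as strings.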
sorting_info[emoji_code] = {
'category': emoji_dict['category'],
'sort_order': sort_order.rjust(3, '0'),
        }


def get_sorting_info(emoji_code: str) -> str:
category = sorting_info[emoji_code]['category']
category = category_index[category] + '-' + category
sort_order = sorting_info[emoji_code]['sort_order']
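    # Produces strings like "1-People 054": the numbered category followed by
    # the zero-padded sort order.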
    return ' '.join([category, sort_order])


def prepare_explanation(explanation_lines: List[str]) -> str:
    return ' '.join(explanation_lines)


def prepare_aliases(captured_aliases: str) -> str:
aliases = []
for alias in captured_aliases.split(', '):
aliases.append(alias.strip("'"))
    return ', '.join(aliases)


def main() -> None:
    description = ("This script exports `emoji_names.py` to a comma-separated (CSV) file. It "
                   "takes the path of the output CSV file and the path to `emoji_names.py` as arguments.")
parser = argparse.ArgumentParser(description=description)
parser.add_argument(
"--input-file", dest="input_file_path", type=str, metavar="<path>",
default=os.path.join(TOOLS_DIR_PATH, "setup", "emoji", "emoji_names.py"),
help="Path to the file from which data is to be read.")
parser.add_argument(
"--output-file", dest="output_file_path", type=str, metavar="<path>",
default=os.path.join(TOOLS_DIR_PATH, "setup", "emoji", "emoji_names.csv"),
help="Path to the output csv file.")
args = parser.parse_args()
prepare_sorting_info()
output_data = [column_names, ]
explanation_lines = [] # type: List[str]
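    # Scan emoji_names.py line by line with the regexes defined above (rather
    # than importing it), accumulating explanation comments until the entry
    # they belong to is reached.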
with open(args.input_file_path) as fp:
for line in fp.readlines():
match = name_entry_regex.search(line)
if match is not None:
emoji_code = match.group('emoji_code')
sort_info = get_sorting_info(emoji_code)
canonical_name = match.group('canonical_name')
aliases = prepare_aliases(match.group('aliases'))
explanation = prepare_explanation(explanation_lines)
output_data.append([
emoji_code,
sort_info,
canonical_name,
aliases,
explanation,
])
explanation_lines = []
continue
match = explanation_regex.search(line)
if match is not None:
explanation_line = match.group('explanation_line').strip()
explanation_lines.append(explanation_line)
fp = open(args.output_file_path, 'w')
writer = csv.writer(fp, dialect='excel')
writer.writerows(output_data)
    # The CSV file exported by Google Sheets doesn't have a newline
    # character at the end, so we strip the trailing newline here as well,
    # so that the round-trip conversion test passes.
line_sep_len = len(os.linesep)
fp.truncate(fp.tell() - line_sep_len)
    fp.close()


if __name__ == "__main__":
main()