emoji: Remove duplicates from autocomplete and emoji picker.

Previously, if you searched for ':offi..' you would see both 🏢 and
:office_building: as possible completions, both of which are shortcodes for
the same unicode codepoint (and hence which have the same image). Also, we
sort the emoji in our emoji pickers alphabetically by shortcode, and so the
images for 🏢 and :office_building: show up next to each other, which
looks like a bug. This removes :office_building: as a shortcode, along with
several hundred other duplicates. It leaves some duplicates in that won't
give autocomplete or alphabetical ordering a problem, like (🚗,
:automobile:).
This commit is contained in:
Rishi Gupta 2017-01-25 23:35:23 -08:00 committed by Tim Abbott
parent d8c648ac05
commit a2890f7d7a
3 changed files with 187 additions and 7 deletions

View File

@ -13,6 +13,8 @@ from typing import Union, Text
from os.path import dirname from os.path import dirname
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from emoji_setup_utils import emoji_names_for_picker
ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../../') ZULIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../../')
sys.path.append(ZULIP_PATH) sys.path.append(ZULIP_PATH)
@ -152,7 +154,7 @@ def get_success_stamp():
sha = hashlib.sha1() sha = hashlib.sha1()
filenames = ['NotoColorEmoji.ttf', 'emoji_map.json', 'AndroidEmoji.ttf', filenames = ['NotoColorEmoji.ttf', 'emoji_map.json', 'AndroidEmoji.ttf',
'build_emoji'] 'build_emoji', 'emoji_setup_utils.py']
for filename in filenames: for filename in filenames:
with open(filename, 'rb') as reader: with open(filename, 'rb') as reader:
@ -165,9 +167,6 @@ def dump_emojis(cache_path):
subprocess.call('ttx -v -z extfile -d {} NotoColorEmoji.ttf'.format(EMOJI_DUMP_DIR_PATH), shell=True) subprocess.call('ttx -v -z extfile -d {} NotoColorEmoji.ttf'.format(EMOJI_DUMP_DIR_PATH), shell=True)
emoji_map = json.load(open('emoji_map.json')) emoji_map = json.load(open('emoji_map.json'))
# Fix data problem with red/blue cars being inaccurate.
emoji_map['blue_car'] = emoji_map['red_car']
emoji_map['red_car'] = emoji_map['oncoming_automobile']
code_point_to_fname_map = code_point_to_file_name_map(EMOJI_DUMP_PATH("NotoColorEmoji.ttx")) code_point_to_fname_map = code_point_to_file_name_map(EMOJI_DUMP_PATH("NotoColorEmoji.ttx"))
os.chdir(EMOJI_DUMP_DIR_PATH) os.chdir(EMOJI_DUMP_DIR_PATH)
@ -227,7 +226,7 @@ def dump_emojis(cache_path):
EMOJI_CODES_PATH = os.path.join(cache_path, 'emoji_codes.js') EMOJI_CODES_PATH = os.path.join(cache_path, 'emoji_codes.js')
emoji_codes_file = open(EMOJI_CODES_PATH, 'w') emoji_codes_file = open(EMOJI_CODES_PATH, 'w')
emoji_codes_file.write(EMOJI_CODES_FILE_TEMPLATE % { emoji_codes_file.write(EMOJI_CODES_FILE_TEMPLATE % {
'names': [str(name) for name in sorted(emoji_map.keys())], 'names': emoji_names_for_picker(emoji_map),
'codepoints': sorted([str(code_point) for code_point in set(emoji_map.values())]) 'codepoints': sorted([str(code_point) for code_point in set(emoji_map.values())])
}) })
emoji_codes_file.close() emoji_codes_file.close()

View File

@ -0,0 +1,181 @@
from __future__ import absolute_import
from collections import defaultdict
from itertools import permutations, chain
import ujson
from six.moves import range, zip
from typing import Text
# the corresponding code point will be set to exactly these names as a final pass,
# overriding any other rules
whitelisted_names = [
['date', 'calendar'], ['shirt', 'tshirt'], ['cupid', 'heart_with_arrow'],
['tada', 'party_popper'], ['parking', 'p_button'], ['car', 'automobile'],
['mortar_board', 'graduation_cap'], ['cd', 'optical_disc'], ['tv', 'television'],
['sound', 'speaker_on'], ['mute', 'speaker_off'], ['antenna_bars', 'signal_strength'],
['mag_right', 'right_pointing_magnifying_glass'], ['mag', 'left_pointing_magnifying_glass'],
['loud_sound', 'speaker_loud'], ['rice_scene', 'moon_ceremony'],
['fast_up_button', 'arrow_double_up'], ['fast_down_button', 'arrow_double_down'],
['rewind', 'fast_reverse_button'], ['100', 'hundred_points'], ['muscle', 'flexed_biceps'],
['walking', 'pedestrian'], ['email', 'envelope'], ['dart', 'direct_hit'],
['wc', 'water_closet'], ['zap', 'high_voltage'], ['underage', 'no_one_under_eighteen'],
['vhs', 'videocassette'], ['bangbang', 'double_exclamation_mark'],
['gun', 'pistol'], ['hocho', 'kitchen_knife'], ['8ball', 'billiards'],
['pray', 'folded_hands'], ['cop', 'police_officer'], ['phone', 'telephone'],
['bee', 'honeybee'], ['lips', 'mouth'], ['boat', 'sailboat'], ['feet', 'paw_prints'],
['uk', 'gb'], ['alien_monster', 'space_invader'], ['reverse_button', 'arrow_backward'],
# both github and slack remove play_button, though I think this is better
['play_button', 'arrow_forward'],
# github/slack both get rid of shuffle_tracks_button, which seems wrong
['shuffle_tracks_button', 'twisted_rightwards_arrows'],
['iphone', 'mobile_phone'], # disagrees with github/slack/emojione
# both github and slack remove {growing,beating}_heart, not sure what I think
['heartpulse', 'growing_heart'], ['heartbeat', 'beating_heart'],
# did remove cityscape_at_dusk from (city_sunset, cityscape_at_dusk)
['sunset', 'city_sunrise'],
['punch', 'oncoming_fist'], # doesn't include facepunch
['+1', 'thumbs_up'], # doesn't include thumbsup
['-1', 'thumbs_down'], # doesn't include thumbsdown
# shit, hankey. slack allows poop, shit, hankey. github calls it hankey,
# and autocompletes for poop and shit. emojione calls it poop, and
# autocompletes for pile_of_poo and shit.
['poop', 'pile_of_poo'],
# github/slack remove cooking, but their emoji for this is an uncooked egg
['egg', 'cooking'],
# ['ocean', 'water_wave'], wave is so common that we want it to point only to :wave:
]
blacklisted_names = frozenset([
# would be chosen by words_supersets or superstrings
'football', # american_football
'post_office', # european_post_office (there's also a japanese_post_office)
'castle', # european_castle (there's also a japanese_castle)
'chart', # chart_increasing_with_yen (should rename chart_increasing to chart)
'loop', # double_curly_loop (should rename curly_loop to loop)
'massage', # face_massage
'bulb', # light_bulb
'barber', # barber_pole
'mens', # mens_room
'womens', # womens_room
'knife', # kitchen_knife (hocho also maps here)
'notes', # musical_notes
'beetle', # lady_beetle
'ab', # ab_button (due to keeping a_button, due to the one_lettered() rule)
'headphone', # headphones
'mega', # megaphone
'ski', # skis
'high_heel', # high_heeled_shoe (so that it shows up when searching for shoe)
# less confident about the following
'dolls', # japanese_dolls
'moon', # waxing_gibbous_moon (should rename crescent_moon to moon)
'clapper', # clapper_board
'traffic_light', # horizontal_traffic_light (there's also a vertical_traffic_light)
'lantern',
'red_paper_lantern', # izakaya_lantern (in the future we should make sure
# red_paper_lantern finds this)
# would be chosen by longer
'down_button', # arrow_down_small, I think to match the other arrow_*
# names. Matching what github and slack do.
'running_shoe', # athletic_shoe, both github and slack agree here.
'running', # runner. slack has both, github has running_man and running_woman, but not runner
'o2', # o_button
'star2', # glowing_star
'bright', # high_brightness, to match low_brightness, what github/slack do
'dim_button', # low_brightness, copying github/slack
'stars', # shooting_star. disagrees with github, slack, and emojione, but this seems better
'nail_care', # nail_polish. Also disagrees github/slack/emojione, is nail_polish mostly an
# american thing?
'busstop', # bus_stop
'tophat', # top_hat
'old_woman', # older_woman, following github/slack/emojione on these
'old_man', # older_man
'blue_car', # recreational_vehicle
'litter_in_bin_sign', # put_litter_in_its_place
'moai', # moyai based on github/slack
'fuelpump', # fuel_pump
# names not otherwise excluded by our heuristics
'left_arrow', # arrow_left, to match other arrow_* shortnames
'right_arrow', # arrow_right
'up_arrow', # arrow_up
'down_arrow', # arrow_down
'chequered_flag', # checkered_flag
'e_mail', # e-mail
'non_potable_water', # non-potable_water
'flipper', # dolphin
])
## functions that take in a list of names at a codepoint and return a subset to exclude
def blacklisted(names):
# type: (List[str]) -> List[str]
return [name for name in names if name in blacklisted_names]
# 1 letter names don't currently show up in our autocomplete. Maybe should
# change our autocomplete so that a whitelist of letters do, like j (for joy), x, etc
# github uses a, ab, etc. instead of a_button, slack doesn't have any of the [letter]_buttons
def one_lettered(names):
# type: (List[str]) -> List[str]
if len(names) == 1:
return []
return [name for name in names if len(name) == 1]
# If it is an ideograph (or katakana, but we'll probably deal with that
# differently after 1.5), remove any names that don't have
# ideograph/katakana in them
def ideographless(names):
# type: (List[str]) -> List[str]
has_ideographs = ['ideograph' in name.split('_') or
'katakana' in name.split('_') for name in names]
if not any(has_ideographs):
return []
return [name for name, has_ideograph in zip(names, has_ideographs) if not has_ideograph]
# subsumed by longer, but still useful for breaking up a hand review of the
# blacklist decisions
def word_superset(names):
# type: (List[str]) -> List[str]
bags_of_words = [frozenset(name.split('_')) for name in names]
bad_names = set()
for i, j in permutations(list(range(len(names))), 2):
if bags_of_words[i] < bags_of_words[j]:
bad_names.add(names[j])
return list(bad_names)
# subsumed by longer, but still useful for breaking up a hand review of the
# blacklist decisions
def superstring(names):
# type: (List[str]) -> List[str]
bad_names = set()
for name1, name2 in permutations(names, 2):
if name2[:len(name1)] == name1:
bad_names.add(name2)
return list(bad_names)
def longer(names):
# type: (List[str]) -> List[str]
lengths = [len(name) for name in names]
min_length = min(lengths)
return [name for name, length in zip(names, lengths) if length > min_length]
def emoji_names_for_picker(emoji_map):
# type: (Dict[Text, Text]) -> List[str]
codepoint_to_names = defaultdict(list) # type: Dict[Text, List[str]]
for name, codepoint in emoji_map.items():
codepoint_to_names[codepoint].append(str(name))
# blacklisted must come first, followed by {one_lettered, ideographless}
# Each function here returns a list of names to be removed from a list of names
for func in [blacklisted, one_lettered, ideographless, word_superset, superstring, longer]:
for codepoint, names in codepoint_to_names.items():
codepoint_to_names[codepoint] = [name for name in names if name not in func(names)]
for names in whitelisted_names:
codepoint = emoji_map[names[0]]
for name in names:
assert (emoji_map[name] == codepoint)
codepoint_to_names[codepoint] = names
return sorted(list(chain.from_iterable(codepoint_to_names.values())))

View File

@ -206,8 +206,8 @@
}, },
{ {
"name": "random_emoji_1", "name": "random_emoji_1",
"input": ":hankey:", "input": ":airplane:",
"expected_output": "<p><img alt=\":hankey:\" class=\"emoji\" src=\"/static/generated/emoji/images/emoji/hankey.png\" title=\":hankey:\"></p>", "expected_output": "<p><img alt=\":airplane:\" class=\"emoji\" src=\"/static/generated/emoji/images/emoji/airplane.png\" title=\":airplane:\"></p>",
"bugdown_matches_marked": true "bugdown_matches_marked": true
}, },
{ {