diff --git a/generate_emoji_name_data.py b/generate_emoji_name_data.py new file mode 100755 index 000000000..2063ed3be --- /dev/null +++ b/generate_emoji_name_data.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-# +# +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generate name data for emoji resources. Currently in json format.""" + +import argparse +import collections +import glob +import json +import os +from os import path +import re +import sys + +import generate_emoji_html + +from nototools import tool_utils +from nototools import unicode_data + +def _create_custom_gendered_seq_names(): + """The names have detail that is adequately represented by the image.""" + + BOY = 0x1f466 + GIRL = 0x1f467 + MAN = 0x1f468 + WOMAN = 0x1f469 + HEART = 0x2764 # Heavy Black Heart + KISS_MARK = 0x1f48b + return { + (MAN, HEART, KISS_MARK, MAN): 'Kiss', + (WOMAN, HEART, KISS_MARK, WOMAN): 'Kiss', + (WOMAN, HEART, KISS_MARK, MAN): 'Kiss', + (WOMAN, HEART, MAN): 'Couple with Heart', + (MAN, HEART, MAN): 'Couple with Heart', + (WOMAN, HEART, WOMAN): 'Couple with Heart', + (MAN, GIRL): 'Family', + (MAN, GIRL, GIRL): 'Family', + (MAN, GIRL, BOY): 'Family', + (MAN, BOY): 'Family', + (MAN, BOY, BOY): 'Family', + (MAN, WOMAN, GIRL): 'Family', + (MAN, WOMAN, GIRL, GIRL): 'Family', + (MAN, WOMAN, GIRL, BOY): 'Family', + (MAN, WOMAN, BOY): 'Family', + (MAN, WOMAN, BOY, BOY): 'Family', + (MAN, MAN, GIRL): 'Family', + (MAN, MAN, GIRL, GIRL): 'Family', + (MAN, MAN, GIRL, BOY): 'Family', + (MAN, MAN, BOY): 'Family', + (MAN, MAN, BOY, BOY): 'Family', + (WOMAN, GIRL): 'Family', + (WOMAN, GIRL, GIRL): 'Family', + (WOMAN, GIRL, BOY): 'Family', + (WOMAN, BOY): 'Family', + (WOMAN, BOY, BOY): 'Family', + (WOMAN, WOMAN, GIRL): 'Family', + (WOMAN, WOMAN, GIRL, GIRL): 'Family', + (WOMAN, WOMAN, GIRL, BOY): 'Family', + (WOMAN, WOMAN, BOY): 'Family', + (WOMAN, WOMAN, BOY, BOY): 'Family' } + +def _create_custom_seq_names(): + """These have names that often are of the form 'Person xyz-ing' or 'Man Xyz.' + We opt to simplify the former to an activity name or action, and the latter to + drop the gender. This also generally makes the names shorter.""" + + EYE = 0x1f441 + SPEECH = 0x1f5e8 + WHITE_FLAG = 0x1f3f3 + RAINBOW = 0x1f308 + return { + (EYE, SPEECH): 'I Witness', + (WHITE_FLAG, RAINBOW): 'Rainbow Flag', + (0x2695,): 'Health Worker', + (0x2696,): 'Judge', + (0x26f7,): 'Skiing', + (0x26f9,): 'Bouncing a Ball', + (0x2708,): 'Pilot', + (0x1f33e,): 'Farmer', + (0x1f373,): 'Cook', + (0x1f393,): 'Student', + (0x1f3a4,): 'Singer', + (0x1f3a8,): 'Artist', + (0x1f3c2,): 'Snowboarding', + (0x1f3c3,): 'Running', + (0x1f3c4,): 'Surfing', + (0x1f3ca,): 'Swimming', + (0x1f3cb,): 'Weight Lifting', + (0x1f3cc,): 'Golfing', + (0x1f3eb,): 'Teacher', + (0x1f3ed,): 'Factory Worker', + (0x1f46e,): 'Police Officer', + (0x1f46f,): 'Partying', + (0x1f471,): 'Person with Blond Hair', + (0x1f473,): 'Person Wearing Turban', + (0x1f477,): 'Construction Worker', + (0x1f481,): 'Tipping Hand', + (0x1f482,): 'Guard', + (0x1f486,): 'Face Massage', + (0x1f487,): 'Haircut', + (0x1f4bb,): 'Technologist', + (0x1f4bc,): 'Office Worker', + (0x1f527,): 'Mechanic', + (0x1f52c,): 'Scientist', + (0x1f575,): 'Detective', + (0x1f645,): 'No Good Gesture', + (0x1f646,): 'OK Gesture', + (0x1f647,): 'Bowing Deeply', + (0x1f64b,): 'Raising Hand', + (0x1f64d,): 'Frowning', + (0x1f64e,): 'Pouting', + (0x1f680,): 'Astronaut', + (0x1f692,): 'Firefighter', + (0x1f6a3,): 'Rowing', + (0x1f6b4,): 'Bicycling', + (0x1f6b5,): 'Mountain Biking', + (0x1f6b6,): 'Walking', + (0x1f926,): 'Face Palm', + (0x1f937,): 'Shrug', + (0x1f938,): 'Doing a Cartwheel', + (0x1f939,): 'Juggling', + (0x1f93c,): 'Wrestling', + (0x1f93d,): 'Water Polo', + (0x1f93e,): 'Playing Handball', + (0x1f9d6,): 'Person in Steamy Room', + (0x1f9d7,): 'Climbing', + (0x1f9d8,): 'Person in Lotus Position', + (0x1f9d9,): 'Mage', + (0x1f9da,): 'Fairy', + (0x1f9db,): 'Vampire', + (0x1f9dd,): 'Elf', + (0x1f9de,): 'Genie', + (0x1f9df,): 'Zombie', + } + +_CUSTOM_GENDERED_SEQ_NAMES = _create_custom_gendered_seq_names() +_CUSTOM_SEQ_NAMES = _create_custom_seq_names() + +# Fixes for unusual capitalization or cases we don't care to handle in code. +# Also prevents titlecasing 'S' after apostrophe in posessives. Note we _do_ +# want titlecasing after apostrophe in some cases, e.g. O'Clock. +_CUSTOM_CAPS_NAMES = { + (0x26d1,): 'Rescue Worker’s Helmet', + (0x1f170,): 'A Button (blood type)', # a Button (Blood Type) + (0x1f171,): 'B Button (blood type)', # B Button (Blood Type) + (0x1f17e,): 'O Button (blood type)', # O Button (Blood Type) + (0x1f18e,): 'AB Button (blood type)', # Ab Button (Blood Type) + (0x1f191,): 'CL Button', # Cl Button + (0x1f192,): 'COOL Button', # Cool Button + (0x1f193,): 'FREE Button', # Free Button + (0x1f194,): 'ID Button', # Id Button + (0x1f195,): 'NEW Button', # New Button + (0x1f196,): 'NG Button', # Ng Button + (0x1f197,): 'OK Button', # Ok Button + (0x1f198,): 'SOS Button', # Sos Button + (0x1f199,): 'UP! Button', # Up! Button + (0x1f19a,): 'VS Button', # Vs Button + (0x1f3e7,): 'ATM Sign', # Atm Sign + (0x1f44C,): 'OK Hand', # Ok Hand + (0x1f452,): 'Woman’s Hat', + (0x1f45a,): 'Woman’s Clothes', + (0x1f45e,): 'Man’s Shoe', + (0x1f461,): 'Woman’s Sandal', + (0x1f462,): 'Woman’s Boot', + (0x1f519,): 'BACK Arrow', # Back Arrow + (0x1f51a,): 'END Arrow', # End Arrow + (0x1f51b,): 'ON! Arrow', # On! Arrow + (0x1f51c,): 'SOON Arrow', # Soon Arrow + (0x1f51d,): 'TOP Arrow', # Top Arrow + (0x1f6b9,): 'Men’s Room', + (0x1f6ba,): 'Women’s Room', +} + +# For the custom sequences we ignore ZWJ, the emoji variation selector +# and skin tone modifiers. We can't always ignore gender because +# the gendered sequences match against them, but we ignore gender in other +# cases so we define a separate set of gendered emoji to remove. + +_NON_GENDER_CPS_TO_STRIP = frozenset( + [0xfe0f, 0x200d] + + range(unicode_data._FITZ_START, unicode_data._FITZ_END + 1)) + +_GENDER_CPS_TO_STRIP = frozenset([0x2640, 0x2642, 0x1f468, 0x1f469]) + +def _custom_name(seq): + """Apply three kinds of custom names, based on the sequence.""" + + seq = tuple([cp for cp in seq if cp not in _NON_GENDER_CPS_TO_STRIP]) + name = _CUSTOM_CAPS_NAMES.get(seq) + if name: + return name + + # Single characters that participate in sequences (e.g. fire truck in the + # firefighter sequences) should not get converted. Single characters + # are in the custom caps names set but not the other sets. + if len(seq) == 1: + return None + + name = _CUSTOM_GENDERED_SEQ_NAMES.get(seq) + if name: + return name + + seq = tuple([cp for cp in seq if cp not in _GENDER_CPS_TO_STRIP]) + name = _CUSTOM_SEQ_NAMES.get(seq) + + return name + + +def _standard_name(seq): + """Use the standard emoji name, with some algorithmic modifications. + + We want to ignore skin-tone modifiers (but of course if the sequence _is_ + the skin-tone modifier itself we keep that). So we strip these so we can + start with the generic name ignoring skin tone. + + Non-emoji that are turned into emoji using the emoji VS have '(emoji) ' + prepended to them, so strip that. + + Regional indicator symbol names are a bit long, so shorten them. + + Regional sequences are assumed to be ok as-is in terms of capitalization and + punctuation, so no modifications are applied to them. + + After title-casing we make some English articles/prepositions lower-case + again. We also replace '&' with 'and'; Unicode seems rather fond of + ampersand.""" + + if not unicode_data.is_skintone_modifier(seq[0]): + seq = tuple([cp for cp in seq if not unicode_data.is_skintone_modifier(cp)]) + name = unicode_data.get_emoji_sequence_name(seq) + + if name.startswith('(emoji) '): + name = name[8:] + + if len(seq) == 1 and unicode_data.is_regional_indicator(seq[0]): + return 'Regional Symbol ' + unicode_data.regional_indicator_to_ascii(seq[0]) + + if (unicode_data.is_regional_indicator_seq(seq) or + unicode_data.is_regional_tag_seq(seq)): + return name + + name = name.title() + # Require space delimiting just in case... + name = re.sub(r'\s&\s', ' and ', name) + name = re.sub( + # not \b at start because we retain capital at start of phrase + r'(\s(:?A|And|From|In|Of|With|For))\b', lambda s: s.group(1).lower(), + name) + + return name + + +def _name_data(seq, seq_file): + name = _custom_name(seq) or _standard_name(seq) + # we don't need canonical sequences + sequence = ''.join('&#x%x;' % cp for cp in seq if cp != 0xfe0f) + fname = path.basename(seq_file) + return fname, sequence, name + + +def generate_names( + src_dir, dst_dir, pretty_print=False, verbose=False): + srcdir = tool_utils.resolve_path(src_dir) + if not path.isdir(srcdir): + print >> sys.stderr, '%s is not a directory' % src_dir + return + + # make sure the destination exists + dstdir = tool_utils.ensure_dir_exists( + tool_utils.resolve_path(dst_dir)) + + # _get_image_data returns canonical cp sequences + print 'src dir:', srcdir + seq_to_file = generate_emoji_html._get_image_data(srcdir, 'png', 'emoji_u') + print 'seq to file has %d sequences' % len(seq_to_file) + + # Aliases add non-gendered versions using gendered images for the most part. + # But when we display the images, we don't distinguish genders in the + # naming, we rely on the images-- so these look redundant. So we + # intentionally don't generate images for these. + # However, the alias file also includes the flag aliases, which we do want, + # and it also fails to exclude the unknown flag pua (since it doesn't + # map to anything), so we need to adjust for this. + canonical_aliases = generate_emoji_html._get_canonical_aliases() + + aliases = set([ + cps for cps in canonical_aliases.keys() + if not unicode_data.is_regional_indicator_seq(cps)]) + aliases.add((0xfe82b,)) # unknown flag PUA + excluded = aliases | generate_emoji_html._get_canonical_excluded() + + # The flag aliases have distinct names, so we _do_ want to show them + # multiple times. + to_add = {} + for seq in canonical_aliases: + if unicode_data.is_regional_indicator_seq(seq): + replace_seq = canonical_aliases[seq] + if seq in seq_to_file: + print 'warning, alias %s has file %s' % ( + unicode_data.regional_indicator_seq_to_string(seq), + seq_to_file[seq]) + continue + replace_file = seq_to_file.get(replace_seq) + if replace_file: + to_add[seq] = replace_file + seq_to_file.update(to_add) + + data = [] + last_skipped_group = None + skipcount = 0 + skip_limit = 20 + for group in unicode_data.get_emoji_groups(): + name_data = [] + for seq in unicode_data.get_emoji_in_group(group): + if seq in excluded: + continue + seq_file = seq_to_file.get(seq, None) + if seq_file is None: + skipcount += 1 + if verbose: + if group != last_skipped_group: + print 'group %s' % group + last_skipped_group = group + print ' %s (%s)' % ( + unicode_data.seq_to_string(seq), + ', '.join(unicode_data.name(cp, 'x') for cp in seq)) + if skipcount > skip_limit: + raise Exception('skipped too many items') + else: + name_data.append(_name_data(seq, seq_file)) + data.append({'category': group, 'emojis': name_data}) + + outfile = path.join(dstdir, 'data.json') + with open(outfile, 'w') as f: + indent = 2 if pretty_print else None + separators = None if pretty_print else (',', ':') + json.dump(data, f, indent=indent, separators=separators) + print 'wrote %s' % outfile + + +def main(): + DEFAULT_DSTDIR = '[emoji]/emoji' + DEFAULT_IMAGEDIR = '[emoji]/build/compressed_pngs' + + parser = argparse.ArgumentParser() + parser.add_argument( + '-s', '--srcdir', help='directory containing images (default %s)' % + DEFAULT_IMAGEDIR, metavar='dir', default=DEFAULT_IMAGEDIR) + parser.add_argument( + '-d', '--dstdir', help='name of destination directory (default %s)' % + DEFAULT_DSTDIR, metavar='fname', default=DEFAULT_DSTDIR) + parser.add_argument( + '-p', '--pretty_print', help='pretty-print json file', + action='store_true') + parser.add_argument( + '-v', '--verbose', help='print progress information to stdout', + action='store_true') + args = parser.parse_args() + generate_names( + args.srcdir, args.dstdir, pretty_print=args.pretty_print, + verbose=args.verbose) + + +if __name__ == "__main__": + main()