Merge pull request #124 from dougfelt/emoji_names
Move emoji name tool from nototools to here, and change a bit.pull/127/head
commit
3fb9bf6158
|
@ -0,0 +1,379 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-#
|
||||
#
|
||||
# Copyright 2015 Google Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Generate name data for emoji resources. Currently in json format."""
|
||||
|
||||
import argparse
|
||||
import collections
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
from os import path
|
||||
import re
|
||||
import sys
|
||||
|
||||
import generate_emoji_html
|
||||
|
||||
from nototools import tool_utils
|
||||
from nototools import unicode_data
|
||||
|
||||
def _create_custom_gendered_seq_names():
|
||||
"""The names have detail that is adequately represented by the image."""
|
||||
|
||||
BOY = 0x1f466
|
||||
GIRL = 0x1f467
|
||||
MAN = 0x1f468
|
||||
WOMAN = 0x1f469
|
||||
HEART = 0x2764 # Heavy Black Heart
|
||||
KISS_MARK = 0x1f48b
|
||||
return {
|
||||
(MAN, HEART, KISS_MARK, MAN): 'Kiss',
|
||||
(WOMAN, HEART, KISS_MARK, WOMAN): 'Kiss',
|
||||
(WOMAN, HEART, KISS_MARK, MAN): 'Kiss',
|
||||
(WOMAN, HEART, MAN): 'Couple with Heart',
|
||||
(MAN, HEART, MAN): 'Couple with Heart',
|
||||
(WOMAN, HEART, WOMAN): 'Couple with Heart',
|
||||
(MAN, GIRL): 'Family',
|
||||
(MAN, GIRL, GIRL): 'Family',
|
||||
(MAN, GIRL, BOY): 'Family',
|
||||
(MAN, BOY): 'Family',
|
||||
(MAN, BOY, BOY): 'Family',
|
||||
(MAN, WOMAN, GIRL): 'Family',
|
||||
(MAN, WOMAN, GIRL, GIRL): 'Family',
|
||||
(MAN, WOMAN, GIRL, BOY): 'Family',
|
||||
(MAN, WOMAN, BOY): 'Family',
|
||||
(MAN, WOMAN, BOY, BOY): 'Family',
|
||||
(MAN, MAN, GIRL): 'Family',
|
||||
(MAN, MAN, GIRL, GIRL): 'Family',
|
||||
(MAN, MAN, GIRL, BOY): 'Family',
|
||||
(MAN, MAN, BOY): 'Family',
|
||||
(MAN, MAN, BOY, BOY): 'Family',
|
||||
(WOMAN, GIRL): 'Family',
|
||||
(WOMAN, GIRL, GIRL): 'Family',
|
||||
(WOMAN, GIRL, BOY): 'Family',
|
||||
(WOMAN, BOY): 'Family',
|
||||
(WOMAN, BOY, BOY): 'Family',
|
||||
(WOMAN, WOMAN, GIRL): 'Family',
|
||||
(WOMAN, WOMAN, GIRL, GIRL): 'Family',
|
||||
(WOMAN, WOMAN, GIRL, BOY): 'Family',
|
||||
(WOMAN, WOMAN, BOY): 'Family',
|
||||
(WOMAN, WOMAN, BOY, BOY): 'Family' }
|
||||
|
||||
def _create_custom_seq_names():
|
||||
"""These have names that often are of the form 'Person xyz-ing' or 'Man Xyz.'
|
||||
We opt to simplify the former to an activity name or action, and the latter to
|
||||
drop the gender. This also generally makes the names shorter."""
|
||||
|
||||
EYE = 0x1f441
|
||||
SPEECH = 0x1f5e8
|
||||
WHITE_FLAG = 0x1f3f3
|
||||
RAINBOW = 0x1f308
|
||||
return {
|
||||
(EYE, SPEECH): 'I Witness',
|
||||
(WHITE_FLAG, RAINBOW): 'Rainbow Flag',
|
||||
(0x2695,): 'Health Worker',
|
||||
(0x2696,): 'Judge',
|
||||
(0x26f7,): 'Skiing',
|
||||
(0x26f9,): 'Bouncing a Ball',
|
||||
(0x2708,): 'Pilot',
|
||||
(0x1f33e,): 'Farmer',
|
||||
(0x1f373,): 'Cook',
|
||||
(0x1f393,): 'Student',
|
||||
(0x1f3a4,): 'Singer',
|
||||
(0x1f3a8,): 'Artist',
|
||||
(0x1f3c2,): 'Snowboarding',
|
||||
(0x1f3c3,): 'Running',
|
||||
(0x1f3c4,): 'Surfing',
|
||||
(0x1f3ca,): 'Swimming',
|
||||
(0x1f3cb,): 'Weight Lifting',
|
||||
(0x1f3cc,): 'Golfing',
|
||||
(0x1f3eb,): 'Teacher',
|
||||
(0x1f3ed,): 'Factory Worker',
|
||||
(0x1f46e,): 'Police Officer',
|
||||
(0x1f46f,): 'Partying',
|
||||
(0x1f471,): 'Person with Blond Hair',
|
||||
(0x1f473,): 'Person Wearing Turban',
|
||||
(0x1f477,): 'Construction Worker',
|
||||
(0x1f481,): 'Tipping Hand',
|
||||
(0x1f482,): 'Guard',
|
||||
(0x1f486,): 'Face Massage',
|
||||
(0x1f487,): 'Haircut',
|
||||
(0x1f4bb,): 'Technologist',
|
||||
(0x1f4bc,): 'Office Worker',
|
||||
(0x1f527,): 'Mechanic',
|
||||
(0x1f52c,): 'Scientist',
|
||||
(0x1f575,): 'Detective',
|
||||
(0x1f645,): 'No Good Gesture',
|
||||
(0x1f646,): 'OK Gesture',
|
||||
(0x1f647,): 'Bowing Deeply',
|
||||
(0x1f64b,): 'Raising Hand',
|
||||
(0x1f64d,): 'Frowning',
|
||||
(0x1f64e,): 'Pouting',
|
||||
(0x1f680,): 'Astronaut',
|
||||
(0x1f692,): 'Firefighter',
|
||||
(0x1f6a3,): 'Rowing',
|
||||
(0x1f6b4,): 'Bicycling',
|
||||
(0x1f6b5,): 'Mountain Biking',
|
||||
(0x1f6b6,): 'Walking',
|
||||
(0x1f926,): 'Face Palm',
|
||||
(0x1f937,): 'Shrug',
|
||||
(0x1f938,): 'Doing a Cartwheel',
|
||||
(0x1f939,): 'Juggling',
|
||||
(0x1f93c,): 'Wrestling',
|
||||
(0x1f93d,): 'Water Polo',
|
||||
(0x1f93e,): 'Playing Handball',
|
||||
(0x1f9d6,): 'Person in Steamy Room',
|
||||
(0x1f9d7,): 'Climbing',
|
||||
(0x1f9d8,): 'Person in Lotus Position',
|
||||
(0x1f9d9,): 'Mage',
|
||||
(0x1f9da,): 'Fairy',
|
||||
(0x1f9db,): 'Vampire',
|
||||
(0x1f9dd,): 'Elf',
|
||||
(0x1f9de,): 'Genie',
|
||||
(0x1f9df,): 'Zombie',
|
||||
}
|
||||
|
||||
_CUSTOM_GENDERED_SEQ_NAMES = _create_custom_gendered_seq_names()
|
||||
_CUSTOM_SEQ_NAMES = _create_custom_seq_names()
|
||||
|
||||
# Fixes for unusual capitalization or cases we don't care to handle in code.
|
||||
# Also prevents titlecasing 'S' after apostrophe in posessives. Note we _do_
|
||||
# want titlecasing after apostrophe in some cases, e.g. O'Clock.
|
||||
_CUSTOM_CAPS_NAMES = {
|
||||
(0x26d1,): 'Rescue Worker’s Helmet',
|
||||
(0x1f170,): 'A Button (blood type)', # a Button (Blood Type)
|
||||
(0x1f171,): 'B Button (blood type)', # B Button (Blood Type)
|
||||
(0x1f17e,): 'O Button (blood type)', # O Button (Blood Type)
|
||||
(0x1f18e,): 'AB Button (blood type)', # Ab Button (Blood Type)
|
||||
(0x1f191,): 'CL Button', # Cl Button
|
||||
(0x1f192,): 'COOL Button', # Cool Button
|
||||
(0x1f193,): 'FREE Button', # Free Button
|
||||
(0x1f194,): 'ID Button', # Id Button
|
||||
(0x1f195,): 'NEW Button', # New Button
|
||||
(0x1f196,): 'NG Button', # Ng Button
|
||||
(0x1f197,): 'OK Button', # Ok Button
|
||||
(0x1f198,): 'SOS Button', # Sos Button
|
||||
(0x1f199,): 'UP! Button', # Up! Button
|
||||
(0x1f19a,): 'VS Button', # Vs Button
|
||||
(0x1f3e7,): 'ATM Sign', # Atm Sign
|
||||
(0x1f44C,): 'OK Hand', # Ok Hand
|
||||
(0x1f452,): 'Woman’s Hat',
|
||||
(0x1f45a,): 'Woman’s Clothes',
|
||||
(0x1f45e,): 'Man’s Shoe',
|
||||
(0x1f461,): 'Woman’s Sandal',
|
||||
(0x1f462,): 'Woman’s Boot',
|
||||
(0x1f519,): 'BACK Arrow', # Back Arrow
|
||||
(0x1f51a,): 'END Arrow', # End Arrow
|
||||
(0x1f51b,): 'ON! Arrow', # On! Arrow
|
||||
(0x1f51c,): 'SOON Arrow', # Soon Arrow
|
||||
(0x1f51d,): 'TOP Arrow', # Top Arrow
|
||||
(0x1f6b9,): 'Men’s Room',
|
||||
(0x1f6ba,): 'Women’s Room',
|
||||
}
|
||||
|
||||
# For the custom sequences we ignore ZWJ, the emoji variation selector
|
||||
# and skin tone modifiers. We can't always ignore gender because
|
||||
# the gendered sequences match against them, but we ignore gender in other
|
||||
# cases so we define a separate set of gendered emoji to remove.
|
||||
|
||||
_NON_GENDER_CPS_TO_STRIP = frozenset(
|
||||
[0xfe0f, 0x200d] +
|
||||
range(unicode_data._FITZ_START, unicode_data._FITZ_END + 1))
|
||||
|
||||
_GENDER_CPS_TO_STRIP = frozenset([0x2640, 0x2642, 0x1f468, 0x1f469])
|
||||
|
||||
def _custom_name(seq):
|
||||
"""Apply three kinds of custom names, based on the sequence."""
|
||||
|
||||
seq = tuple([cp for cp in seq if cp not in _NON_GENDER_CPS_TO_STRIP])
|
||||
name = _CUSTOM_CAPS_NAMES.get(seq)
|
||||
if name:
|
||||
return name
|
||||
|
||||
# Single characters that participate in sequences (e.g. fire truck in the
|
||||
# firefighter sequences) should not get converted. Single characters
|
||||
# are in the custom caps names set but not the other sets.
|
||||
if len(seq) == 1:
|
||||
return None
|
||||
|
||||
name = _CUSTOM_GENDERED_SEQ_NAMES.get(seq)
|
||||
if name:
|
||||
return name
|
||||
|
||||
seq = tuple([cp for cp in seq if cp not in _GENDER_CPS_TO_STRIP])
|
||||
name = _CUSTOM_SEQ_NAMES.get(seq)
|
||||
|
||||
return name
|
||||
|
||||
|
||||
def _standard_name(seq):
|
||||
"""Use the standard emoji name, with some algorithmic modifications.
|
||||
|
||||
We want to ignore skin-tone modifiers (but of course if the sequence _is_
|
||||
the skin-tone modifier itself we keep that). So we strip these so we can
|
||||
start with the generic name ignoring skin tone.
|
||||
|
||||
Non-emoji that are turned into emoji using the emoji VS have '(emoji) '
|
||||
prepended to them, so strip that.
|
||||
|
||||
Regional indicator symbol names are a bit long, so shorten them.
|
||||
|
||||
Regional sequences are assumed to be ok as-is in terms of capitalization and
|
||||
punctuation, so no modifications are applied to them.
|
||||
|
||||
After title-casing we make some English articles/prepositions lower-case
|
||||
again. We also replace '&' with 'and'; Unicode seems rather fond of
|
||||
ampersand."""
|
||||
|
||||
if not unicode_data.is_skintone_modifier(seq[0]):
|
||||
seq = tuple([cp for cp in seq if not unicode_data.is_skintone_modifier(cp)])
|
||||
name = unicode_data.get_emoji_sequence_name(seq)
|
||||
|
||||
if name.startswith('(emoji) '):
|
||||
name = name[8:]
|
||||
|
||||
if len(seq) == 1 and unicode_data.is_regional_indicator(seq[0]):
|
||||
return 'Regional Symbol ' + unicode_data.regional_indicator_to_ascii(seq[0])
|
||||
|
||||
if (unicode_data.is_regional_indicator_seq(seq) or
|
||||
unicode_data.is_regional_tag_seq(seq)):
|
||||
return name
|
||||
|
||||
name = name.title()
|
||||
# Require space delimiting just in case...
|
||||
name = re.sub(r'\s&\s', ' and ', name)
|
||||
name = re.sub(
|
||||
# not \b at start because we retain capital at start of phrase
|
||||
r'(\s(:?A|And|From|In|Of|With|For))\b', lambda s: s.group(1).lower(),
|
||||
name)
|
||||
|
||||
return name
|
||||
|
||||
|
||||
def _name_data(seq, seq_file):
|
||||
name = _custom_name(seq) or _standard_name(seq)
|
||||
# we don't need canonical sequences
|
||||
sequence = ''.join('&#x%x;' % cp for cp in seq if cp != 0xfe0f)
|
||||
fname = path.basename(seq_file)
|
||||
return fname, sequence, name
|
||||
|
||||
|
||||
def generate_names(
|
||||
src_dir, dst_dir, pretty_print=False, verbose=False):
|
||||
srcdir = tool_utils.resolve_path(src_dir)
|
||||
if not path.isdir(srcdir):
|
||||
print >> sys.stderr, '%s is not a directory' % src_dir
|
||||
return
|
||||
|
||||
# make sure the destination exists
|
||||
dstdir = tool_utils.ensure_dir_exists(
|
||||
tool_utils.resolve_path(dst_dir))
|
||||
|
||||
# _get_image_data returns canonical cp sequences
|
||||
print 'src dir:', srcdir
|
||||
seq_to_file = generate_emoji_html._get_image_data(srcdir, 'png', 'emoji_u')
|
||||
print 'seq to file has %d sequences' % len(seq_to_file)
|
||||
|
||||
# Aliases add non-gendered versions using gendered images for the most part.
|
||||
# But when we display the images, we don't distinguish genders in the
|
||||
# naming, we rely on the images-- so these look redundant. So we
|
||||
# intentionally don't generate images for these.
|
||||
# However, the alias file also includes the flag aliases, which we do want,
|
||||
# and it also fails to exclude the unknown flag pua (since it doesn't
|
||||
# map to anything), so we need to adjust for this.
|
||||
canonical_aliases = generate_emoji_html._get_canonical_aliases()
|
||||
|
||||
aliases = set([
|
||||
cps for cps in canonical_aliases.keys()
|
||||
if not unicode_data.is_regional_indicator_seq(cps)])
|
||||
aliases.add((0xfe82b,)) # unknown flag PUA
|
||||
excluded = aliases | generate_emoji_html._get_canonical_excluded()
|
||||
|
||||
# The flag aliases have distinct names, so we _do_ want to show them
|
||||
# multiple times.
|
||||
to_add = {}
|
||||
for seq in canonical_aliases:
|
||||
if unicode_data.is_regional_indicator_seq(seq):
|
||||
replace_seq = canonical_aliases[seq]
|
||||
if seq in seq_to_file:
|
||||
print 'warning, alias %s has file %s' % (
|
||||
unicode_data.regional_indicator_seq_to_string(seq),
|
||||
seq_to_file[seq])
|
||||
continue
|
||||
replace_file = seq_to_file.get(replace_seq)
|
||||
if replace_file:
|
||||
to_add[seq] = replace_file
|
||||
seq_to_file.update(to_add)
|
||||
|
||||
data = []
|
||||
last_skipped_group = None
|
||||
skipcount = 0
|
||||
skip_limit = 20
|
||||
for group in unicode_data.get_emoji_groups():
|
||||
name_data = []
|
||||
for seq in unicode_data.get_emoji_in_group(group):
|
||||
if seq in excluded:
|
||||
continue
|
||||
seq_file = seq_to_file.get(seq, None)
|
||||
if seq_file is None:
|
||||
skipcount += 1
|
||||
if verbose:
|
||||
if group != last_skipped_group:
|
||||
print 'group %s' % group
|
||||
last_skipped_group = group
|
||||
print ' %s (%s)' % (
|
||||
unicode_data.seq_to_string(seq),
|
||||
', '.join(unicode_data.name(cp, 'x') for cp in seq))
|
||||
if skipcount > skip_limit:
|
||||
raise Exception('skipped too many items')
|
||||
else:
|
||||
name_data.append(_name_data(seq, seq_file))
|
||||
data.append({'category': group, 'emojis': name_data})
|
||||
|
||||
outfile = path.join(dstdir, 'data.json')
|
||||
with open(outfile, 'w') as f:
|
||||
indent = 2 if pretty_print else None
|
||||
separators = None if pretty_print else (',', ':')
|
||||
json.dump(data, f, indent=indent, separators=separators)
|
||||
print 'wrote %s' % outfile
|
||||
|
||||
|
||||
def main():
|
||||
DEFAULT_DSTDIR = '[emoji]/emoji'
|
||||
DEFAULT_IMAGEDIR = '[emoji]/build/compressed_pngs'
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'-s', '--srcdir', help='directory containing images (default %s)' %
|
||||
DEFAULT_IMAGEDIR, metavar='dir', default=DEFAULT_IMAGEDIR)
|
||||
parser.add_argument(
|
||||
'-d', '--dstdir', help='name of destination directory (default %s)' %
|
||||
DEFAULT_DSTDIR, metavar='fname', default=DEFAULT_DSTDIR)
|
||||
parser.add_argument(
|
||||
'-p', '--pretty_print', help='pretty-print json file',
|
||||
action='store_true')
|
||||
parser.add_argument(
|
||||
'-v', '--verbose', help='print progress information to stdout',
|
||||
action='store_true')
|
||||
args = parser.parse_args()
|
||||
generate_names(
|
||||
args.srcdir, args.dstdir, pretty_print=args.pretty_print,
|
||||
verbose=args.verbose)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Reference in New Issue