marseymoji/generate_emoji_name_data.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-#
#
# Copyright 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generate name data for emoji resources. Currently in json format."""
from __future__ import print_function

import argparse
import collections
import glob
import json
import os
from os import path
import re
import sys

import generate_emoji_html

from nototools import tool_utils
from nototools import unicode_data

def _create_custom_gendered_seq_names():
  """The names have detail that is adequately represented by the image."""

  BOY = 0x1f466
  GIRL = 0x1f467
  MAN = 0x1f468
  WOMAN = 0x1f469
  HEART = 0x2764  # Heavy Black Heart
  KISS_MARK = 0x1f48b
  return {
      (MAN, HEART, KISS_MARK, MAN): 'Kiss',
      (WOMAN, HEART, KISS_MARK, WOMAN): 'Kiss',
      (WOMAN, HEART, KISS_MARK, MAN): 'Kiss',
      (WOMAN, HEART, MAN): 'Couple with Heart',
      (MAN, HEART, MAN): 'Couple with Heart',
      (WOMAN, HEART, WOMAN): 'Couple with Heart',
      (MAN, GIRL): 'Family',
      (MAN, GIRL, GIRL): 'Family',
      (MAN, GIRL, BOY): 'Family',
      (MAN, BOY): 'Family',
      (MAN, BOY, BOY): 'Family',
      (MAN, WOMAN, GIRL): 'Family',
      (MAN, WOMAN, GIRL, GIRL): 'Family',
      (MAN, WOMAN, GIRL, BOY): 'Family',
      (MAN, WOMAN, BOY): 'Family',
      (MAN, WOMAN, BOY, BOY): 'Family',
      (MAN, MAN, GIRL): 'Family',
      (MAN, MAN, GIRL, GIRL): 'Family',
      (MAN, MAN, GIRL, BOY): 'Family',
      (MAN, MAN, BOY): 'Family',
      (MAN, MAN, BOY, BOY): 'Family',
      (WOMAN, GIRL): 'Family',
      (WOMAN, GIRL, GIRL): 'Family',
      (WOMAN, GIRL, BOY): 'Family',
      (WOMAN, BOY): 'Family',
      (WOMAN, BOY, BOY): 'Family',
      (WOMAN, WOMAN, GIRL): 'Family',
      (WOMAN, WOMAN, GIRL, GIRL): 'Family',
      (WOMAN, WOMAN, GIRL, BOY): 'Family',
      (WOMAN, WOMAN, BOY): 'Family',
      (WOMAN, WOMAN, BOY, BOY): 'Family' }

def _create_custom_seq_names():
  """These have names that often are of the form 'Person xyz-ing' or 'Man Xyz.'
  We opt to simplify the former to an activity name or action, and the latter to
  drop the gender.  This also generally makes the names shorter."""

  EYE = 0x1f441
  SPEECH = 0x1f5e8
  WHITE_FLAG = 0x1f3f3
  RAINBOW = 0x1f308
  return {
      (EYE, SPEECH): 'I Witness',
      (WHITE_FLAG, RAINBOW): 'Rainbow Flag',
      (0x2695,): 'Health Worker',
      (0x2696,): 'Judge',
      (0x26f7,): 'Skiing',
      (0x26f9,): 'Bouncing a Ball',
      (0x2708,): 'Pilot',
      (0x1f33e,): 'Farmer',
      (0x1f373,): 'Cook',
      (0x1f393,): 'Student',
      (0x1f3a4,): 'Singer',
      (0x1f3a8,): 'Artist',
      (0x1f3c2,): 'Snowboarding',
      (0x1f3c3,): 'Running',
      (0x1f3c4,): 'Surfing',
      (0x1f3ca,): 'Swimming',
      (0x1f3cb,): 'Weight Lifting',
      (0x1f3cc,): 'Golfing',
      (0x1f3eb,): 'Teacher',
      (0x1f3ed,): 'Factory Worker',
      (0x1f46e,): 'Police Officer',
      (0x1f46f,): 'Partying',
      (0x1f471,): 'Person with Blond Hair',
      (0x1f473,): 'Person Wearing Turban',
      (0x1f477,): 'Construction Worker',
      (0x1f481,): 'Tipping Hand',
      (0x1f482,): 'Guard',
      (0x1f486,): 'Face Massage',
      (0x1f487,): 'Haircut',
      (0x1f4bb,): 'Technologist',
      (0x1f4bc,): 'Office Worker',
      (0x1f527,): 'Mechanic',
      (0x1f52c,): 'Scientist',
      (0x1f575,): 'Detective',
      (0x1f645,): 'No Good Gesture',
      (0x1f646,): 'OK Gesture',
      (0x1f647,): 'Bowing Deeply',
      (0x1f64b,): 'Raising Hand',
      (0x1f64d,): 'Frowning',
      (0x1f64e,): 'Pouting',
      (0x1f680,): 'Astronaut',
      (0x1f692,): 'Firefighter',
      (0x1f6a3,): 'Rowing',
      (0x1f6b4,): 'Bicycling',
      (0x1f6b5,): 'Mountain Biking',
      (0x1f6b6,): 'Walking',
      (0x1f926,): 'Face Palm',
      (0x1f937,): 'Shrug',
      (0x1f938,): 'Doing a Cartwheel',
      (0x1f939,): 'Juggling',
      (0x1f93c,): 'Wrestling',
      (0x1f93d,): 'Water Polo',
      (0x1f93e,): 'Playing Handball',
      (0x1f9d6,): 'Person in Steamy Room',
      (0x1f9d7,): 'Climbing',
      (0x1f9d8,): 'Person in Lotus Position',
      (0x1f9d9,): 'Mage',
      (0x1f9da,): 'Fairy',
      (0x1f9db,): 'Vampire',
      (0x1f9dd,): 'Elf',
      (0x1f9de,): 'Genie',
      (0x1f9df,): 'Zombie',
  }

# Lookup tables built once when the module is loaded; _custom_name consults
# them for every sequence.
_CUSTOM_GENDERED_SEQ_NAMES = _create_custom_gendered_seq_names()
_CUSTOM_SEQ_NAMES = _create_custom_seq_names()

# Fixes for unusual capitalization or cases we don't care to handle in code.
# Also prevents titlecasing 'S' after apostrophe in posessives.  Note we _do_
# want titlecasing after apostrophe in some cases, e.g. O'Clock.
_CUSTOM_CAPS_NAMES = {
    (0x26d1,): 'Rescue Worker’s Helmet',
    (0x1f170,): 'A Button (blood type)',  # a Button (Blood Type)
    (0x1f171,): 'B Button (blood type)',  # B Button (Blood Type)
    (0x1f17e,): 'O Button (blood type)',  # O Button (Blood Type)
    (0x1f18e,): 'AB Button (blood type)',  # Ab Button (Blood Type)
    (0x1f191,): 'CL Button',  # Cl Button
    (0x1f192,): 'COOL Button',  # Cool Button
    (0x1f193,): 'FREE Button',  # Free Button
    (0x1f194,): 'ID Button',  # Id Button
    (0x1f195,): 'NEW Button',  # New Button
    (0x1f196,): 'NG Button',  # Ng Button
    (0x1f197,): 'OK Button',  # Ok Button
    (0x1f198,): 'SOS Button',  # Sos Button
    (0x1f199,): 'UP! Button',  # Up! Button
    (0x1f19a,): 'VS Button',  # Vs Button
    (0x1f3e7,): 'ATM Sign',  # Atm Sign
    (0x1f44C,): 'OK Hand',  # Ok Hand
    (0x1f452,): 'Woman’s Hat',
    (0x1f45a,): 'Woman’s Clothes',
    (0x1f45e,): 'Man’s Shoe',
    (0x1f461,): 'Woman’s Sandal',
    (0x1f462,): 'Woman’s Boot',
    (0x1f519,): 'BACK Arrow',  # Back Arrow
    (0x1f51a,): 'END Arrow',  # End Arrow
    (0x1f51b,): 'ON! Arrow',  # On! Arrow
    (0x1f51c,): 'SOON Arrow',  # Soon Arrow
    (0x1f51d,): 'TOP Arrow',  # Top Arrow
    (0x1f6b9,): 'Men’s Room',
    (0x1f6ba,): 'Women’s Room',
}

# For the custom sequences we ignore ZWJ, the emoji variation selector
# and skin tone modifiers.  We can't always ignore gender because
# the gendered sequences match against them, but we ignore gender in other
# cases so we define a separate set of gendered emoji to remove.

# Emoji variation selector (0xfe0f), ZWJ (0x200d) and every code point from
# _FITZ_START through _FITZ_END inclusive.  range() is wrapped in list()
# because on Python 3 a range object cannot be concatenated to a list.
_NON_GENDER_CPS_TO_STRIP = frozenset(
    [0xfe0f, 0x200d] +
    list(range(unicode_data._FITZ_START, unicode_data._FITZ_END + 1)))

# Female sign, male sign, man and woman.
_GENDER_CPS_TO_STRIP = frozenset([0x2640, 0x2642, 0x1f468, 0x1f469])

def _custom_name(seq):
  """Return the custom display name for seq, or None when it has none.

  Code points in _NON_GENDER_CPS_TO_STRIP are dropped first.  The reduced
  sequence is then tried, in order, against the custom caps names, the
  gendered sequence names and, once the gender code points are removed as
  well, the simplified sequence names.
  """

  core = tuple(cp for cp in seq if cp not in _NON_GENDER_CPS_TO_STRIP)

  caps_name = _CUSTOM_CAPS_NAMES.get(core)
  if caps_name:
    return caps_name

  # A lone character that also takes part in sequences (e.g. the fire truck
  # in the firefighter sequences) must keep its own name.  Single characters
  # appear in the custom caps table but in neither of the other two.
  if len(core) == 1:
    return None

  ungendered = tuple(cp for cp in core if cp not in _GENDER_CPS_TO_STRIP)
  return (
      _CUSTOM_GENDERED_SEQ_NAMES.get(core) or
      _CUSTOM_SEQ_NAMES.get(ungendered))


def _standard_name(seq):
  """Use the standard emoji name, with some algorithmic modifications.

  We want to ignore skin-tone modifiers (but of course if the sequence _is_
  the skin-tone modifier itself we keep that).  So we strip these so we can
  start with the generic name ignoring skin tone.

  Non-emoji that are turned into emoji using the emoji VS have '(emoji) '
  prepended to them, so strip that.

  Regional indicator symbol names are a bit long, so shorten them.

  Regional sequences are assumed to be ok as-is in terms of capitalization and
  punctuation, so no modifications are applied to them.

  After title-casing we make some English articles/prepositions lower-case
  again.  We also replace '&' with 'and'; Unicode seems rather fond of
  ampersand.

  Args:
    seq: a non-empty sequence of code points (seq[0] is inspected).

  Returns:
    The display name derived from unicode_data.get_emoji_sequence_name.
  """

  # Only strip modifiers when the sequence does not itself start with one.
  if not unicode_data.is_skintone_modifier(seq[0]):
    seq = tuple([cp for cp in seq if not unicode_data.is_skintone_modifier(cp)])
  name = unicode_data.get_emoji_sequence_name(seq)

  emoji_prefix = '(emoji) '
  if name.startswith(emoji_prefix):
    name = name[len(emoji_prefix):]

  if len(seq) == 1 and unicode_data.is_regional_indicator(seq[0]):
    return 'Regional Symbol ' + unicode_data.regional_indicator_to_ascii(seq[0])

  if (unicode_data.is_regional_indicator_seq(seq) or
      unicode_data.is_regional_tag_seq(seq)):
    return name

  name = name.title()
  # Require space delimiting just in case...
  name = re.sub(r'\s&\s', ' and ', name)
  name = re.sub(
      # not \b at start because we retain capital at start of phrase.
      # The inner group is non-capturing '(?:...)'; it used to be spelled
      # '(:?...)', which instead allowed a stray literal colon before 'A'.
      r'(\s(?:A|And|From|In|Of|With|For))\b', lambda s: s.group(1).lower(),
      name)

  return name


def _name_data(seq, seq_file):
  """Return the (file name, html sequence, display name) entry for seq.

  The display name is the custom name when there is one, otherwise the
  standard name.  The html sequence is the code points written as hex
  character references; the file name is the base name of seq_file.
  """
  display_name = _custom_name(seq)
  if not display_name:
    display_name = _standard_name(seq)
  # The canonical form is not needed, so the emoji variation selector is
  # left out of the emitted sequence.
  entities = ['&#x%x;' % cp for cp in seq if cp != 0xfe0f]
  html_sequence = ''.join(entities)
  return path.basename(seq_file), html_sequence, display_name


def generate_names(
    src_dir, dst_dir, skip_limit=20, omit_groups=None, pretty_print=False,
    verbose=False):
  """Write data.json, listing the named emoji images of each emoji group.

  Args:
    src_dir: directory holding the emoji images; passed through
      tool_utils.resolve_path.  If it is not a directory an error is printed
      to stderr and nothing is written.
    dst_dir: directory that receives data.json; passed through
      tool_utils.resolve_path and tool_utils.ensure_dir_exists.
    skip_limit: number of emoji without an image that are tolerated.  A
      negative value disables the limit.
    omit_groups: names of emoji groups to leave out.  None or an empty list
      keeps all groups.  An unknown name is reported on stderr and nothing is
      written.
    pretty_print: if true the json is indented by 2, otherwise it is written
      with compact separators.
    verbose: if true every emoji that lacks an image is printed.

  Raises:
    Exception: when more than skip_limit emoji have no image.
  """
  image_dir = tool_utils.resolve_path(src_dir)
  if not path.isdir(image_dir):
    print('%s is not a directory' % src_dir, file=sys.stderr)
    return

  if not omit_groups:
    # Covers None as well as an empty list.
    print('keeping all groups')
    omit_groups = []
  else:
    unrecognized = set(omit_groups) - set(unicode_data.get_emoji_groups())
    if unrecognized:
      # List the bad names in the order the caller gave them.
      bad_names = ', '.join(
          '"%s"' % g for g in omit_groups if g in unrecognized)
      plural = '' if len(unrecognized) == 1 else 's'
      print(
          'did not recognize %d group%s: %s' % (
              len(unrecognized), plural, bad_names),
          file=sys.stderr)
      valid_names = '\n  '.join(g for g in unicode_data.get_emoji_groups())
      print('valid groups are:\n  %s' % (valid_names), file=sys.stderr)
      return
    plural = '' if len(omit_groups) == 1 else 's'
    omitted_names = ', '.join('"%s"' % g for g in omit_groups)
    print('omitting %d group%s: %s' % (
        len(omit_groups), plural, omitted_names))

  # make sure the destination exists
  out_dir = tool_utils.ensure_dir_exists(tool_utils.resolve_path(dst_dir))

  # _get_image_data returns canonical cp sequences
  print('src dir:', image_dir)
  seq_to_file = generate_emoji_html._get_image_data(
      image_dir, 'png', 'emoji_u')
  print('seq to file has %d sequences' % len(seq_to_file))

  # Aliases mostly add non-gendered versions that reuse gendered images.
  # Since names do not distinguish gender (the images do), showing those
  # would look redundant, so they are excluded on purpose.
  # The alias file also holds the flag aliases, which we do want, and it
  # does not exclude the unknown flag pua (it maps to nothing), so both are
  # adjusted for here.
  canonical_aliases = generate_emoji_html._get_canonical_aliases()

  aliases = {
      cps for cps in canonical_aliases
      if not unicode_data.is_regional_indicator_seq(cps)}
  aliases.add((0xfe82b,))  # unknown flag PUA
  excluded = aliases | generate_emoji_html._get_canonical_excluded()

  # Flag aliases have distinct names, so they _are_ shown again, reusing the
  # image of the flag they point to.
  flag_alias_files = {}
  for alias_seq, target_seq in canonical_aliases.items():
    if not unicode_data.is_regional_indicator_seq(alias_seq):
      continue
    if alias_seq in seq_to_file:
      print('warning, alias %s has file %s' % (
          unicode_data.regional_indicator_seq_to_string(alias_seq),
          seq_to_file[alias_seq]))
      continue
    target_file = seq_to_file.get(target_seq)
    if target_file:
      flag_alias_files[alias_seq] = target_file
  seq_to_file.update(flag_alias_files)

  data = []
  reported_group = None
  missing = 0
  for group in unicode_data.get_emoji_groups():
    if group in omit_groups:
      continue
    entries = []
    for seq in unicode_data.get_emoji_in_group(group):
      if seq in excluded:
        continue
      seq_file = seq_to_file.get(seq)
      if seq_file is not None:
        entries.append(_name_data(seq, seq_file))
        continue

      # No image for this sequence: count it and optionally report it.
      missing += 1
      if verbose:
        if group != reported_group:
          # Print the group heading only once per run of skipped items.
          print('group %s' % group)
          reported_group = group
        print('  %s (%s)' % (
            unicode_data.seq_to_string(seq),
            ', '.join(unicode_data.name(cp, 'x') for cp in seq)))
      if skip_limit >= 0 and missing > skip_limit:
        raise Exception('skipped too many items')
    data.append({'category': group, 'emojis': entries})

  if pretty_print:
    dump_options = {'indent': 2, 'separators': None}
  else:
    dump_options = {'indent': None, 'separators': (',', ':')}
  outfile = path.join(out_dir, 'data.json')
  with open(outfile, 'w') as f:
    json.dump(data, f, **dump_options)
  print('wrote %s' % outfile)


def main():
  """Parse the command line and run generate_names with the given options."""
  DEFAULT_DSTDIR = '[emoji]/emoji'
  DEFAULT_IMAGEDIR = '[emoji]/build/compressed_pngs'

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-s', '--srcdir', help='directory containing images (default %s)' %
      DEFAULT_IMAGEDIR,  metavar='dir', default=DEFAULT_IMAGEDIR)
  parser.add_argument(
      '-d', '--dstdir', help='name of destination directory (default %s)' %
      DEFAULT_DSTDIR, metavar='fname', default=DEFAULT_DSTDIR)
  parser.add_argument(
      '-p', '--pretty_print', help='pretty-print json file',
      action='store_true')
  # type=int is required: argparse passes command-line values as strings
  # unless told otherwise, and generate_names compares this limit with
  # integers.
  parser.add_argument(
      '-m', '--missing_limit', help='number of missing images before failure '
      '(default 20), use -1 for no limit', metavar='n', default=20, type=int)
  parser.add_argument(
      '--omit_groups', help='names of groups to omit (default "Misc, Flags")',
      metavar='name', default=['Misc', 'Flags'], nargs='*')
  parser.add_argument(
      '-v', '--verbose', help='print progress information to stdout',
      action='store_true')
  args = parser.parse_args()
  generate_names(
      args.srcdir, args.dstdir, args.missing_limit, args.omit_groups,
      pretty_print=args.pretty_print, verbose=args.verbose)


# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
  main()