marseymoji/check_emoji_sequences.py

#!/usr/bin/env python
#
# Copyright 2016 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Compare emoji image file namings against unicode property data."""

import argparse
import collections
import glob
import os
from os import path
import re
import sys

from nototools import unicode_data

# Directory containing this script; emoji_aliases.txt is read from here.
DATA_ROOT = path.dirname(path.abspath(__file__))

# Code points that get special treatment when sequences are compared.
ZWJ = 0x200d  # zero width joiner
EMOJI_VS = 0xfe0f  # variation selector (emoji presentation)

def _is_regional_indicator(cp):
  """Return True if cp lies in the range U+1F1E6..U+1F1FF (inclusive)."""
  return cp >= 0x1f1e6 and cp <= 0x1f1ff


def _is_skintone_modifier(cp):
  """Return True if cp lies in the range U+1F3FB..U+1F3FF (inclusive)."""
  return cp >= 0x1f3fb and cp <= 0x1f3ff


def _seq_string(seq):
  """Format a code point sequence as lowercase hex values joined by '_'.

  Each value is zero-padded to at least four digits, e.g. (0x23, 0x20e3)
  becomes '0023_20e3'.
  """
  hex_parts = ['%04x' % codepoint for codepoint in seq]
  return '_'.join(hex_parts)

def strip_vs(seq):
  """Return seq as a tuple with every EMOJI_VS code point removed."""
  kept = []
  for cp in seq:
    if cp != EMOJI_VS:
      kept.append(cp)
  return tuple(kept)

# Lazily built list of {sequence-without-VS: name} maps, filled in on the
# first call to seq_name().
_namedata = None

def seq_name(seq):
  """Return the name of an emoji sequence, or None if no name is known.

  A sequence of length one is looked up with unicode_data.name.  Longer
  sequences are looked up in the combining, flag, modifier and zwj sequence
  tables from unicode_data (tried in that order).  Emoji variation selectors
  are ignored on both sides of the comparison.

  Args:
    seq: sequence of integer code points.
  """
  global _namedata

  if _namedata is None:
    def strip_vs_map(seq_map):
      # items() rather than iteritems() so this also runs on Python 3.
      return {strip_vs(k): v for k, v in seq_map.items()}
    _namedata = [
        strip_vs_map(unicode_data.get_emoji_combining_sequences()),
        strip_vs_map(unicode_data.get_emoji_flag_sequences()),
        strip_vs_map(unicode_data.get_emoji_modifier_sequences()),
        strip_vs_map(unicode_data.get_emoji_zwj_sequences()),
        ]

  if len(seq) == 1:
    return unicode_data.name(seq[0], None)

  # Every table key has already had its variation selectors stripped, so only
  # the stripped form of seq can ever match; one pass over the tables is
  # enough.
  key = strip_vs(seq)
  for data in _namedata:
    if key in data:
      return data[key]

  return None


def _check_valid_emoji(sorted_seq_to_filepath):
  """Ensure all emoji are either valid emoji or specific chars.

  A code point is accepted if it is in unicode_data.get_emoji() or
  unicode_data.proposed_emoji_cps(), or is one of the four extra values
  added below.  Every other code point is reported to stderr together with
  the files whose sequences contain it.

  Args:
    sorted_seq_to_filepath: mapping from code point tuple to file path.
  """

  valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps())
  valid_cps.add(0x200d)  # ZWJ
  valid_cps.add(0x20e3)  # combining enclosing keycap
  valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
  valid_cps.add(0xfe82b)  # PUA value for unknown flag

  # Maps each offending code point to the list of files it appears in.
  not_emoji = {}
  # items() and sys.stderr.write() behave the same on Python 2 and 3, unlike
  # iteritems() and the "print >>" statement.
  for seq, fp in sorted_seq_to_filepath.items():
    for cp in seq:
      if cp not in valid_cps:
        not_emoji.setdefault(cp, []).append(fp)

  if not_emoji:
    sys.stderr.write('%d non-emoji found:\n' % len(not_emoji))
    for cp in sorted(not_emoji):
      sys.stderr.write('%04x (in %s)\n' % (cp, ', '.join(not_emoji[cp])))


def _check_zwj(sorted_seq_to_filepath):
  """Ensure zwj is only between two appropriate emoji.

  For every sequence that contains a ZWJ, reports to stderr when:
    - the ZWJ is the first code point,
    - the ZWJ is the last code point (sequences longer than one only),
    - the code point before a ZWJ is neither the emoji presentation
      selector nor an emoji according to unicode_data.is_emoji,
    - the code point after a ZWJ is not an emoji according to
      unicode_data.is_emoji.

  Args:
    sorted_seq_to_filepath: mapping from code point tuple to file path; the
      path is only used in the messages.
  """
  ZWJ = 0x200D
  EMOJI_PRESENTATION_VS = 0xFE0F

  # items() and sys.stderr.write() behave the same on Python 2 and 3, unlike
  # iteritems() and the "print >>" statement.
  for seq, fp in sorted_seq_to_filepath.items():
    if ZWJ not in seq:
      continue
    if seq[0] == ZWJ:
      sys.stderr.write('zwj at head of sequence in %s\n' % fp)
    if len(seq) == 1:
      # A lone ZWJ has already been reported as "at head"; nothing else to
      # check.
      continue
    if seq[-1] == ZWJ:
      sys.stderr.write('zwj at end of sequence in %s\n' % fp)
    last = len(seq) - 1
    for i, cp in enumerate(seq):
      if cp != ZWJ:
        continue
      if i > 0:
        pcp = seq[i-1]
        if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp):
          sys.stderr.write(
              'non-emoji %04x preceeds ZWJ in %s\n' % (pcp, fp))
      if i < last:
        fcp = seq[i+1]
        if not unicode_data.is_emoji(fcp):
          sys.stderr.write(
              'non-emoji %04x follows ZWJ in %s\n' % (fcp, fp))


def _check_flags(sorted_seq_to_filepath):
  """Ensure regional indicators are only in sequences of one or two, and
  never mixed.

  Reports to stderr, once per file, when a sequence mixes regional indicator
  code points with other code points, and when a sequence that starts with a
  regional indicator is longer than two code points.

  Args:
    sorted_seq_to_filepath: mapping from code point tuple to file path.
  """
  # items() and sys.stderr.write() behave the same on Python 2 and 3, unlike
  # iteritems() and the "print >>" statement.
  for seq, fp in sorted_seq_to_filepath.items():
    if not seq:
      continue
    is_reg_flags = [_is_regional_indicator(cp) for cp in seq]
    have_reg = is_reg_flags[0]
    # One message per file is enough; do not repeat it for every code point
    # that differs from the first.
    if any(is_reg != have_reg for is_reg in is_reg_flags):
      sys.stderr.write('mix of regional and non-regional in %s\n' % fp)
    if have_reg and len(seq) > 2:
      # We provide dummy glyphs for regional indicators, so there are sequences
      # with single regional indicator symbols.
      sys.stderr.write(
          'regional indicator sequence length != 2 in %s\n' % fp)


def _check_skintone(sorted_seq_to_filepath):
  """Ensure skin tone modifiers are not applied to emoji that are not defined
  to take them.  May appear standalone, though.  Also check that emoji that take
  skin tone modifiers have a complete set.

  Reports to stderr when a modifier leads a multi-code-point sequence, when a
  modifier follows a code point that is not a modifier base (per
  unicode_data.is_emoji_modifier_base), and when a modifier base does not
  have exactly five distinct modifiers across all sequences.

  Args:
    sorted_seq_to_filepath: mapping from code point tuple to file path.
  """
  base_to_modifiers = collections.defaultdict(set)
  # First file in which each modifier base was seen, so the completeness
  # report can name a file that actually contains the base rather than
  # whichever file happened to be iterated last.
  base_to_filepath = {}
  # items() and sys.stderr.write() behave the same on Python 2 and 3, unlike
  # iteritems() and the "print >>" statement.
  for seq, fp in sorted_seq_to_filepath.items():
    for i, cp in enumerate(seq):
      if _is_skintone_modifier(cp):
        if i == 0:
          if len(seq) > 1:
            sys.stderr.write(
                'skin color selector first in sequence %s\n' % fp)
          # standalone are ok
          continue
        pcp = seq[i-1]
        if not unicode_data.is_emoji_modifier_base(pcp):
          sys.stderr.write(
              'emoji skintone modifier applied to non-base at %d: %s\n' % (
                  i, fp))
      elif unicode_data.is_emoji_modifier_base(cp):
        base_to_filepath.setdefault(cp, fp)
        if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]):
          base_to_modifiers[cp].add(seq[i+1])
        else:
          # Record the base even when no modifier follows, so a base with no
          # modifiers at all is still reported below.
          base_to_modifiers.setdefault(cp, set())
  for base, modifiers in sorted(base_to_modifiers.items()):
    if len(modifiers) != 5:
      sys.stderr.write(
          'emoji base %04x has %d modifiers defined (%s) in %s\n' % (
              base, len(modifiers),
              ', '.join('%04x' % m for m in sorted(modifiers)),
              base_to_filepath[base]))


def _check_zwj_sequences(seq_to_filepath):
  """Verify that zwj sequences are valid.

  A sequence containing ZWJ is accepted if it is a key of
  unicode_data.get_emoji_zwj_sequences(), or if it equals one of those keys
  with the emoji variation selectors removed.  Anything else is reported to
  stderr as not defined.

  Args:
    seq_to_filepath: mapping from code point tuple to file path.  Sequences
      are checked in the mapping's iteration order.
  """
  zwj_sequence_to_name = unicode_data.get_emoji_zwj_sequences()
  # Canonical sequences that contain a variation selector, with the selector
  # stripped, so files named without it are still recognized.
  zwj_sequences_without_vs = {
      strip_vs(canonical) for canonical in zwj_sequence_to_name
      if EMOJI_VS in canonical}

  # items() and sys.stderr.write() behave the same on Python 2 and 3, unlike
  # iteritems() and the "print >>" statement.
  for seq, fp in seq_to_filepath.items():
    if ZWJ not in seq:
      continue
    if seq in zwj_sequence_to_name or seq in zwj_sequences_without_vs:
      continue
    sys.stderr.write('zwj sequence not defined: %s\n' % fp)

def read_emoji_aliases():
  """Read emoji_aliases.txt from DATA_ROOT and return its alias mapping.

  Each non-blank line, after removing anything from '#' onward, is expected
  to have the form 'alias;target', where both sides are '_'-separated hex
  code points.  A line whose target cannot be parsed is reported on stdout
  and skipped.

  Returns:
    A dict mapping alias code point tuples to target code point tuples.

  Raises:
    ValueError: if a line does not split into exactly two ';'-separated
      fields, or if the alias side is not valid hex.
  """
  result = {}

  with open(path.join(DATA_ROOT, 'emoji_aliases.txt'), 'r') as f:
    for line in f:
      # Drop the trailing comment, if any, then surrounding whitespace.
      line = line.partition('#')[0].strip()
      if not line:
        continue
      als, trg = (s.strip() for s in line.split(';'))
      als_seq = tuple(int(x, 16) for x in als.split('_'))
      try:
        trg_seq = tuple(int(x, 16) for x in trg.split('_'))
      except ValueError:
        # int() raises ValueError for non-hex text; catching only that keeps
        # unrelated errors (and KeyboardInterrupt) from being swallowed.
        print('cannot process alias %s -> %s' % (als, trg))
        continue
      result[als_seq] = trg_seq
  return result


def _check_coverage(seq_to_filepath):
  """Report emoji and emoji sequences that have no image, on stdout.

  First the aliases from read_emoji_aliases() are resolved: each alias whose
  target is present (with or without variation selectors) is added to
  seq_to_filepath with the value 'alias:' + target path.  Then the single
  emoji, a few special characters, and the combining, flag, modifier and zwj
  sequences reported by unicode_data for the fixed age below are each looked
  up, and the missing ones are printed.  Finally the 'unknown flag' PUA code
  point is checked.

  Args:
    seq_to_filepath: mapping from code point tuple to file path.  It is
      modified in place: alias entries are added to it.
  """
  age = 9.0

  # Maps the variation-selector-free form of each key that contains a
  # variation selector back to that key.
  non_vs_to_canonical = {}
  for k in seq_to_filepath:
    if EMOJI_VS in k:
      non_vs_to_canonical[strip_vs(k)] = k

  aliases = read_emoji_aliases()
  for k, v in sorted(aliases.items()):
    if v not in seq_to_filepath and v not in non_vs_to_canonical:
      print('alias %s missing target %s' % (_seq_string(k), _seq_string(v)))
      continue
    if k in seq_to_filepath or k in non_vs_to_canonical:
      print('alias %s already exists as %s (%s)' % (
          _seq_string(k), _seq_string(v), seq_name(v)))
      continue
    filename = seq_to_filepath.get(v) or seq_to_filepath[non_vs_to_canonical[v]]
    seq_to_filepath[k] = 'alias:' + filename

  # check single emoji, this includes most of the special chars
  emoji = sorted(unicode_data.get_emoji(age=age))
  for cp in emoji:
    if (cp,) not in seq_to_filepath:
      print('missing single %04x (%s)' % (
          cp, unicode_data.name(cp, '<no name>')))

  # special characters
  # all but combining enclosing keycap are currently marked as emoji
  # list(range(...)) because a Python 3 range cannot be added to a list.
  special_cps = [ord('*'), ord('#'), ord(u'\u20e3')] + list(range(0x30, 0x3a))
  for cp in special_cps:
    if cp not in emoji and (cp,) not in seq_to_filepath:
      print('missing special %04x (%s)' % (cp, unicode_data.name(cp)))

  # combining sequences
  comb_seq_to_name = sorted(
      unicode_data.get_emoji_combining_sequences(age=age).items())
  for seq, name in comb_seq_to_name:
    if seq not in seq_to_filepath:
      # strip vs and try again
      non_vs_seq = strip_vs(seq)
      if non_vs_seq not in seq_to_filepath:
        print('missing combining sequence %s (%s)' % (_seq_string(seq), name))

  # flag sequences
  flag_seq_to_name = sorted(
      unicode_data.get_emoji_flag_sequences(age=age).items())
  for seq, name in flag_seq_to_name:
    if seq not in seq_to_filepath:
      print('missing flag sequence %s (%s)' % (_seq_string(seq), name))

  # skin tone modifier sequences
  mod_seq_to_name = sorted(
      unicode_data.get_emoji_modifier_sequences(age=age).items())
  for seq, name in mod_seq_to_name:
    if seq not in seq_to_filepath:
      print('missing modifier sequence %s (%s)' % (
          _seq_string(seq), name))

  # zwj sequences
  # some of ours include the emoji presentation variation selector and some
  # don't, and the same is true for the canonical sequences.  normalize all
  # of them to omit it to test coverage, but report the canonical sequence.
  zwj_seq_without_vs = set()
  for seq in seq_to_filepath:
    if ZWJ in seq:
      zwj_seq_without_vs.add(strip_vs(seq))

  for seq, name in sorted(
      unicode_data.get_emoji_zwj_sequences(age=age).items()):
    if strip_vs(seq) not in zwj_seq_without_vs:
      print('missing (canonical) zwj sequence %s (%s)' % (
          _seq_string(seq), name))

  # check for 'unknown flag'
  # this is either emoji_ufe82b or 'unknown_flag', we filter out things that
  # don't start with our prefix so 'unknown_flag' would be excluded by default.
  if (0xfe82b,) not in seq_to_filepath:
    print('missing unknown flag PUA fe82b')


def check_sequence_to_filepath(seq_to_filepath):
  """Run every check, in a fixed order, over a key-sorted copy of
  seq_to_filepath.

  The checks receive an OrderedDict built here, so entries that
  _check_coverage adds go into that copy and not into the caller's mapping.
  """
  ordered = collections.OrderedDict(sorted(seq_to_filepath.items()))
  checks = (
      _check_valid_emoji,
      _check_zwj,
      _check_flags,
      _check_skintone,
      _check_zwj_sequences,
      _check_coverage,
  )
  for check in checks:
    check(ordered)

def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
  """Check names, and convert name to sequences for names that are ok,
  returning a sequence to file path mapping.  Reports bad names and bad
  segments of a name on stdout.

  The part of each name between prefix and suffix is split on '_'; every
  segment must be 4 to 6 lowercase hex digits and at most 10ffff.  Names that
  lack the prefix or contain a bad segment are reported and left out.

  Args:
    name_to_dirpath: mapping from file name to the directory containing it.
    prefix: required leading text of every file name.
    suffix: trailing text to drop from every file name; may be empty.

  Returns:
    A dict mapping code point tuples to path.join(dirname, name).
  """
  segment_re = re.compile(r'^[0-9a-f]{4,6}$')
  result = {}
  # items() and print(single_string) behave the same on Python 2 and 3,
  # unlike iteritems() and the bare print statement.
  for name, dirname in name_to_dirpath.items():
    if not name.startswith(prefix):
      print('expected prefix "%s" for "%s"' % (prefix, name))
      continue

    # Use an explicit end index: with an empty suffix, name[start:-0] would
    # be name[start:0], i.e. always the empty string.
    stem = name[len(prefix):len(name) - len(suffix)]
    segfail = False
    seq = []
    for s in stem.split('_'):
      if not segment_re.match(s):
        print('bad codepoint name "%s" in %s/%s' % (s, dirname, name))
        segfail = True
        continue
      n = int(s, 16)
      if n > 0x10ffff:
        print('codepoint "%s" out of range in %s/%s' % (s, dirname, name))
        segfail = True
        continue
      seq.append(n)
    if not segfail:
      result[tuple(seq)] = path.join(dirname, name)
  return result


def collect_name_to_dirpath(directory, prefix, suffix):
  """Return a mapping from filename to path rooted at directory, ignoring files
  that don't match suffix.  Report when a filename appears in more than one
  subdir; the first path found is kept.

  Args:
    directory: root of the tree to walk.
    prefix: accepted for signature compatibility; not used here.
    suffix: only file names ending with this text are collected.

  Returns:
    A dict mapping each collected file name to the directory that contains
    it, as yielded by os.walk (so it already starts with directory).
  """
  result = {}
  for dirname, _, files in os.walk(directory):
    # os.walk yields directory paths that already include `directory`, so
    # dirname is used as is.  Joining it onto `directory` again would double
    # the prefix for relative roots ('imgs' -> 'imgs/imgs').
    for f in files:
      if not f.endswith(suffix):
        continue
      if f in result:
        sys.stderr.write('duplicate file "%s" in %s and %s \n' % (
            f, dirname, result[f]))
        continue
      result[f] = dirname
  return result


def collect_name_to_dirpath_with_override(dirs, prefix, suffix):
  """Merge the results of collect_name_to_dirpath over several roots.

  Each directory in dirs is scanned in turn and its filename -> directory
  entries are written into one mapping.  A file name found under more than
  one root is not reported here; the entry from the later root simply
  replaces the earlier one.
  """
  merged = {}
  for root in dirs:
    per_root = collect_name_to_dirpath(root, prefix, suffix)
    for name, dirpath in per_root.items():
      merged[name] = dirpath
  return merged


def run_check(dirs, prefix, suffix):
  """Collect the image files under dirs, convert their names to code point
  sequences, and run all checks on them.

  Progress lines are printed on stdout.

  Args:
    dirs: root directories to scan; later ones override earlier ones.
    prefix: required leading text of the image file names.
    suffix: required trailing text of the image file names.
  """
  # print(single_string) is valid, and prints the same text, on both
  # Python 2 and Python 3; the bare print statement is a syntax error on 3.
  print('Checking files with prefix "%s" and suffix "%s" in:\n  %s' % (
      prefix, suffix, '\n  '.join(dirs)))
  name_to_dirpath = collect_name_to_dirpath_with_override(
      dirs, prefix=prefix, suffix=suffix)
  print('checking %d names' % len(name_to_dirpath))
  seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix)
  print('checking %d sequences' % len(seq_to_filepath))
  check_sequence_to_filepath(seq_to_filepath)
  print('done.')


def main():
  """Parse the command line and run the checks on the given directories."""
  parser = argparse.ArgumentParser()
  add_option = parser.add_argument
  add_option(
      '-d', '--dirs', nargs='+', required=True, metavar='dir',
      help='directories containing emoji images')
  add_option(
      '-p', '--prefix', default='emoji_u', metavar='pfx',
      help='prefix to match, default "emoji_u"')
  add_option(
      '-s', '--suffix', default='.png', metavar='sfx',
      help='suffix to match, default ".png"')
  options = parser.parse_args()
  run_check(options.dirs, options.prefix, options.suffix)


if __name__ == '__main__':
  main()