From 4ec1a5fed8f2f1b953c44e3c52ee6dbceca3d7a2 Mon Sep 17 00:00:00 2001
From: Doug Felt <dougfelt@google.com>
Date: Wed, 18 Jan 2017 14:48:52 -0800
Subject: [PATCH] Update emoji sequence check to be more flexible.

- supports checking files with extensions other than .png
- checks all files under a root directory and not just the
  files directly in a directory
- checks for duplicate files in multiple directories under a root
- reports the directory containing a file when there are problems
---
 check_emoji_sequences.py | 164 +++++++++++++++++++++++++++------------
 generate_emoji_html.py   |   4 +-
 2 files changed, 115 insertions(+), 53 deletions(-)

diff --git a/check_emoji_sequences.py b/check_emoji_sequences.py
index c2649ad25..5bf1626fe 100755
--- a/check_emoji_sequences.py
+++ b/check_emoji_sequences.py
@@ -19,6 +19,7 @@
 import argparse
 import collections
 import glob
+import os
 from os import path
 import re
 import sys
@@ -37,7 +38,7 @@ def _seq_string(seq):
   return '_'.join('%04x' % cp for cp in seq)
 
 
-def _check_valid_emoji(sorted_seqs):
+def _check_valid_emoji(sorted_seq_to_filepath):
   """Ensure all emoji are either valid emoji or specific chars."""
 
   valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps())
@@ -46,79 +47,80 @@ def _check_valid_emoji(sorted_seqs):
   valid_cps.add(0xfe0f)  # variation selector (emoji presentation)
   valid_cps.add(0xfe82b)  # PUA value for unknown flag
 
-  not_emoji = set()
-  for seq in sorted_seqs:
+  not_emoji = {}
+  for seq, fp in sorted_seq_to_filepath.iteritems():
     for cp in seq:
       if cp not in valid_cps:
-        not_emoji.add(cp)
+        if cp not in not_emoji:
+          not_emoji[cp] = []
+        not_emoji[cp].append(fp)
 
   if len(not_emoji):
     print >> sys.stderr, '%d non-emoji found:' % len(not_emoji)
     for cp in sorted(not_emoji):
-      print >> sys.stderr, '%04X' % cp
+      print >> sys.stderr, '%04x (in %s)' % (cp, ', '.join(not_emoji[cp]))
 
 
-def _check_zwj(sorted_seqs):
+def _check_zwj(sorted_seq_to_filepath):
   """Ensure zwj is only between two appropriate emoji."""
   ZWJ = 0x200D
   EMOJI_PRESENTATION_VS = 0xFE0F
 
-  for seq in sorted_seqs:
+  for seq, fp in sorted_seq_to_filepath.iteritems():
     if ZWJ not in seq:
       continue
     if seq[0] == 0x200d:
-      print >> sys.stderr, 'zwj at head of sequence'
+      print >> sys.stderr, 'zwj at head of sequence in %s' % fp
     if len(seq) == 1:
       continue
     if seq[-1] == 0x200d:
-      print >> sys.stderr, 'zwj at end of sequence'
+      print >> sys.stderr, 'zwj at end of sequence in %s' % fp
     for i, cp in enumerate(seq):
       if cp == ZWJ:
-        pcp = seq[i-1]
-        if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp):
-          print >> sys.stderr, 'non-emoji %04X preceeds ZWJ' % pcp
-        fcp = seq[i+1]
-        if not unicode_data.is_emoji(fcp):
-          print >> sys.stderr, 'non-emoji %04X follows ZWJ' % fcp
+        if i > 0:
+          pcp = seq[i-1]
+          if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp):
+            print >> sys.stderr, 'non-emoji %04x preceeds ZWJ in %s' % (pcp, fp)
+        if i < len(seq) - 1:
+          fcp = seq[i+1]
+          if not unicode_data.is_emoji(fcp):
+            print >> sys.stderr, 'non-emoji %04x follows ZWJ in %s' % (fcp, fp)
 
 
-def _check_flags(sorted_seqs):
+def _check_flags(sorted_seq_to_filepath):
   """Ensure regional indicators are only in sequences of one or two, and
   never mixed."""
-  for seq in sorted_seqs:
+  for seq, fp in sorted_seq_to_filepath.iteritems():
     have_reg = None
     for cp in seq:
       is_reg = _is_regional_indicator(cp)
       if have_reg == None:
         have_reg = is_reg
       elif have_reg != is_reg:
-        print >> sys.stderr, ('mix of regional and non-regional in %s' %
-            _seq_string(seq))
+        print >> sys.stderr, 'mix of regional and non-regional in %s' % fp
     if have_reg and len(seq) > 2:
       # We provide dummy glyphs for regional indicators, so there are sequences
       # with single regional indicator symbols.
-      print >> sys.stderr, ('regional indicator sequence length != 2: %s' %
-            _seq_string(seq))
+      print >> sys.stderr, 'regional indicator sequence length != 2 in %s' % fp
 
 
-def _check_skintone(sorted_seqs):
+def _check_skintone(sorted_seq_to_filepath):
   """Ensure skin tone modifiers are not applied to emoji that are not defined
   to take them.  May appear standalone, though.  Also check that emoji that take
   skin tone modifiers have a complete set."""
   base_to_modifiers = collections.defaultdict(set)
-  for seq in sorted_seqs:
+  for seq, fp in sorted_seq_to_filepath.iteritems():
     for i, cp in enumerate(seq):
       if _is_skintone_modifier(cp):
         if i == 0:
           if len(seq) > 1:
-            print >> sys.stderr, 'skin color selector first in sequence %s'
+            print >> sys.stderr, 'skin color selector first in sequence %s' % fp
           # standalone are ok
           continue
         pcp = seq[i-1]
         if not unicode_data.is_emoji_modifier_base(pcp):
           print >> sys.stderr, (
-              'emoji skintone modifier applied to non-base at %d: %s' % (
-                  i, _seq_string(seq)))
+              'emoji skintone modifier applied to non-base at %d: %s' % (i, fp))
       elif unicode_data.is_emoji_modifier_base(cp):
         if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]):
           base_to_modifiers[cp].add(seq[i+1])
@@ -126,36 +128,90 @@ def _check_skintone(sorted_seqs):
           base_to_modifiers[cp] = set()
   for cp, modifiers in sorted(base_to_modifiers.iteritems()):
     if len(modifiers) != 5:
-      print 'emoji base %04X has %d modifiers defined (%s)' % (
+      print 'emoji base %04x has %d modifiers defined (%s) in %s' % (
           cp, len(modifiers),
-          ', '.join('%04x' % cp for cp in sorted(modifiers)))
+          ', '.join('%04x' % cp for cp in sorted(modifiers)), fp)
 
 
-def check_sequences(seqs):
-  sorted_seqs = sorted(seqs)
-  print 'checking %d sequences' % len(seqs)
-  _check_valid_emoji(sorted_seqs)
-  _check_zwj(sorted_seqs)
-  _check_flags(sorted_seqs)
-  _check_skintone(sorted_seqs)
-  print 'done.'
+def check_sequence_to_filepath(seq_to_filepath):
+  sorted_seq_to_filepath = collections.OrderedDict(
+      sorted(seq_to_filepath.items()))
+  _check_valid_emoji(sorted_seq_to_filepath)
+  _check_zwj(sorted_seq_to_filepath)
+  _check_flags(sorted_seq_to_filepath)
+  _check_skintone(sorted_seq_to_filepath)
 
 
-def _collect_sequences(dirs, prefix='emoji_u'):
-  seqs = set()
-  path_re = re.compile('%s([a-zA-Z0-9_]+)\.png' % prefix)
+def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
+  """Check names, and convert name to sequences for names that are ok,
+  returning a sequence to file path mapping.  Reports bad segments
+  of a name to stderr."""
+  segment_re = re.compile(r'^[0-9a-f]{4,6}$')
+  result = {}
+  for name, dirname in name_to_dirpath.iteritems():
+    if not name.startswith(prefix):
+      print 'expected prefix "%s" for "%s"' % (prefix, name)
+      continue
+
+    segments = name[len(prefix): -len(suffix)].split('_')
+    segfail = False
+    seq = []
+    for s in segments:
+      if not segment_re.match(s):
+        print 'bad codepoint name "%s" in %s/%s' % (s, dirname, name)
+        segfail = True
+        continue
+      n = int(s, 16)
+      if n > 0x10ffff:
+        print 'codepoint "%s" out of range in %s/%s' % (s, dirname, name)
+        segfail = True
+        continue
+      seq.append(n)
+    if not segfail:
+      result[tuple(seq)] = path.join(dirname, name)
+  return result
+
+
+def collect_name_to_dirpath(directory, prefix, suffix):
+  """Return a mapping from filename to path rooted at directory, ignoring files
+  that don't match suffix.  Report when a filename appears in more than one
+  subdir; the first path found is kept."""
+  result = {}
+  for dirname, _, files in os.walk(directory):
+    if directory != '.':
+      dirname = path.join(directory, dirname)
+    for f in files:
+      if not f.endswith(suffix):
+        continue
+      if f in result:
+        print >> sys.stderr, 'duplicate file "%s" in %s and %s ' % (
+            f, dirname, result[f])
+        continue
+      result[f] = dirname
+  return result
+
+
+def collect_name_to_dirpath_with_override(dirs, prefix, suffix):
+  """Return a mapping from filename to a directory path rooted at a directory
+  in dirs, using collect_name_to_dirpath.  The last directory is retained. This
+  does not report an error if a file appears under more than one root directory,
+  so lets later root directories override earlier ones."""
+  result = {}
   for d in dirs:
-    for f in glob.glob(path.join(d, '%s*.png' % prefix)):
-      m = path_re.match(path.basename(f))
-      if not m:
-        print >> sys.stderr, 'could not match file "%s"' % f
-        continue
-      seq = tuple(int(s, 16) for s in m.group(1).split('_'))
-      if seq in seqs:
-        print >> sys.stderr, 'duplicate sequence for "%s"' % f
-        continue
-      seqs.add(seq)
-  return seqs
+    result.update(collect_name_to_dirpath(d, prefix, suffix))
+  return result
+
+
+def run_check(dirs, prefix, suffix):
+  print 'Checking files with prefix "%s" and suffix "%s" in:\n  %s' % (
+      prefix, suffix, '\n  '.join(dirs))
+  name_to_dirpath = collect_name_to_dirpath_with_override(
+      dirs, prefix=prefix, suffix=suffix)
+  print 'checking %d names' % len(name_to_dirpath)
+  seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix)
+  print 'checking %d sequences' % len(seq_to_filepath)
+  check_sequence_to_filepath(seq_to_filepath)
+  print 'done.'
 
 
 def main():
@@ -163,8 +219,14 @@ def main():
   parser.add_argument(
       '-d', '--dirs', help='directories containing emoji images',
       metavar='dir', nargs='+', required=True)
+  parser.add_argument(
+      '-p', '--prefix', help='prefix to match, default "emoji_u"',
+      metavar='pfx', default='emoji_u')
+  parser.add_argument(
+      '-s', '--suffix', help='suffix to match, default ".png"', metavar='sfx',
+      default='.png')
   args = parser.parse_args()
-  check_sequences(_collect_sequences(args.dirs))
+  run_check(args.dirs, args.prefix, args.suffix)
 
 
 if __name__ == '__main__':
diff --git a/generate_emoji_html.py b/generate_emoji_html.py
index f62113a64..3df68881c 100755
--- a/generate_emoji_html.py
+++ b/generate_emoji_html.py
@@ -204,9 +204,9 @@ def _generate_content(basedir, font, dir_infos, limit, annotate, standalone):
       if abs_srcdir == basedir:
         dirspec = ''
       elif abs_srcdir.startswith(basedir):
-        dirspec = abs_filedir[len(abs_basedir) + 1:]
+        dirspec = abs_srcdir[len(basedir) + 1:]
       else:
-        dirspec = abs_filedir
+        dirspec = abs_srcdir
       basepaths.append(dirspec)
 
   lines = ['<table>']