From 4ec1a5fed8f2f1b953c44e3c52ee6dbceca3d7a2 Mon Sep 17 00:00:00 2001 From: Doug Felt Date: Wed, 18 Jan 2017 14:48:52 -0800 Subject: [PATCH] Update emoji sequence check to be more flexible. - supports checking files with extensions other than .png - checks all files under a root directory and not just the files directly in a directory - checks for duplicate files in multiple directories under a root - reports the directory containing a file when there are problems --- check_emoji_sequences.py | 164 +++++++++++++++++++++++++++------------ generate_emoji_html.py | 4 +- 2 files changed, 115 insertions(+), 53 deletions(-) diff --git a/check_emoji_sequences.py b/check_emoji_sequences.py index c2649ad25..5bf1626fe 100755 --- a/check_emoji_sequences.py +++ b/check_emoji_sequences.py @@ -19,6 +19,7 @@ import argparse import collections import glob +import os from os import path import re import sys @@ -37,7 +38,7 @@ def _seq_string(seq): return '_'.join('%04x' % cp for cp in seq) -def _check_valid_emoji(sorted_seqs): +def _check_valid_emoji(sorted_seq_to_filepath): """Ensure all emoji are either valid emoji or specific chars.""" valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps()) @@ -46,79 +47,80 @@ def _check_valid_emoji(sorted_seqs): valid_cps.add(0xfe0f) # variation selector (emoji presentation) valid_cps.add(0xfe82b) # PUA value for unknown flag - not_emoji = set() - for seq in sorted_seqs: + not_emoji = {} + for seq, fp in sorted_seq_to_filepath.iteritems(): for cp in seq: if cp not in valid_cps: - not_emoji.add(cp) + if cp not in not_emoji: + not_emoji[cp] = [] + not_emoji[cp].append(fp) if len(not_emoji): print >> sys.stderr, '%d non-emoji found:' % len(not_emoji) for cp in sorted(not_emoji): - print >> sys.stderr, '%04X' % cp + print >> sys.stderr, '%04x (in %s)' % (cp, ', '.join(not_emoji[cp])) -def _check_zwj(sorted_seqs): +def _check_zwj(sorted_seq_to_filepath): """Ensure zwj is only between two appropriate emoji.""" ZWJ = 0x200D 
EMOJI_PRESENTATION_VS = 0xFE0F - for seq in sorted_seqs: + for seq, fp in sorted_seq_to_filepath.iteritems(): if ZWJ not in seq: continue if seq[0] == 0x200d: - print >> sys.stderr, 'zwj at head of sequence' + print >> sys.stderr, 'zwj at head of sequence in %s' % fp if len(seq) == 1: continue if seq[-1] == 0x200d: - print >> sys.stderr, 'zwj at end of sequence' + print >> sys.stderr, 'zwj at end of sequence in %s' % fp for i, cp in enumerate(seq): if cp == ZWJ: - pcp = seq[i-1] - if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp): - print >> sys.stderr, 'non-emoji %04X preceeds ZWJ' % pcp - fcp = seq[i+1] - if not unicode_data.is_emoji(fcp): - print >> sys.stderr, 'non-emoji %04X follows ZWJ' % fcp + if i > 0: + pcp = seq[i-1] + if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp): + print >> sys.stderr, 'non-emoji %04x preceeds ZWJ in %s' % (pcp, fp) + if i < len(seq) - 1: + fcp = seq[i+1] + if not unicode_data.is_emoji(fcp): + print >> sys.stderr, 'non-emoji %04x follows ZWJ in %s' % (fcp, fp) -def _check_flags(sorted_seqs): +def _check_flags(sorted_seq_to_filepath): """Ensure regional indicators are only in sequences of one or two, and never mixed.""" - for seq in sorted_seqs: + for seq, fp in sorted_seq_to_filepath.iteritems(): have_reg = None for cp in seq: is_reg = _is_regional_indicator(cp) if have_reg == None: have_reg = is_reg elif have_reg != is_reg: - print >> sys.stderr, ('mix of regional and non-regional in %s' % - _seq_string(seq)) + print >> sys.stderr, 'mix of regional and non-regional in %s' % fp if have_reg and len(seq) > 2: # We provide dummy glyphs for regional indicators, so there are sequences # with single regional indicator symbols. 
- print >> sys.stderr, ('regional indicator sequence length != 2: %s' % - _seq_string(seq)) + print >> sys.stderr, 'regional indicator sequence length != 2 in %s' % fp -def _check_skintone(sorted_seqs): +def _check_skintone(sorted_seq_to_filepath): """Ensure skin tone modifiers are not applied to emoji that are not defined to take them. May appear standalone, though. Also check that emoji that take skin tone modifiers have a complete set.""" base_to_modifiers = collections.defaultdict(set) - for seq in sorted_seqs: + for seq, fp in sorted_seq_to_filepath.iteritems(): for i, cp in enumerate(seq): if _is_skintone_modifier(cp): if i == 0: if len(seq) > 1: - print >> sys.stderr, 'skin color selector first in sequence %s' + print >> sys.stderr, 'skin color selector first in sequence %s' % fp # standalone are ok continue pcp = seq[i-1] if not unicode_data.is_emoji_modifier_base(pcp): print >> sys.stderr, ( - 'emoji skintone modifier applied to non-base at %d: %s' % ( - i, _seq_string(seq))) + 'emoji skintone modifier applied to non-base at %d: %s' % (i, fp)) elif unicode_data.is_emoji_modifier_base(cp): if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]): base_to_modifiers[cp].add(seq[i+1]) @@ -126,36 +128,90 @@ def _check_skintone(sorted_seqs): base_to_modifiers[cp] = set() for cp, modifiers in sorted(base_to_modifiers.iteritems()): if len(modifiers) != 5: - print 'emoji base %04X has %d modifiers defined (%s)' % ( + print 'emoji base %04x has %d modifiers defined (%s) in %s' % ( cp, len(modifiers), - ', '.join('%04x' % cp for cp in sorted(modifiers))) + ', '.join('%04x' % cp for cp in sorted(modifiers)), fp) -def check_sequences(seqs): - sorted_seqs = sorted(seqs) - print 'checking %d sequences' % len(seqs) - _check_valid_emoji(sorted_seqs) - _check_zwj(sorted_seqs) - _check_flags(sorted_seqs) - _check_skintone(sorted_seqs) - print 'done.' 
+def check_sequence_to_filepath(seq_to_filepath): + sorted_seq_to_filepath = collections.OrderedDict( + sorted(seq_to_filepath.items())) + _check_valid_emoji(sorted_seq_to_filepath) + _check_zwj(sorted_seq_to_filepath) + _check_flags(sorted_seq_to_filepath) + _check_skintone(sorted_seq_to_filepath) -def _collect_sequences(dirs, prefix='emoji_u'): - seqs = set() - path_re = re.compile('%s([a-zA-Z0-9_]+)\.png' % prefix) +def create_sequence_to_filepath(name_to_dirpath, prefix, suffix): + """Check names, and convert name to sequences for names that are ok, + returning a sequence to file path mapping. Reports bad segments + of a name to stderr.""" + segment_re = re.compile(r'^[0-9a-f]{4,6}$') + result = {} + for name, dirname in name_to_dirpath.iteritems(): + if not name.startswith(prefix): + print 'expected prefix "%s" for "%s"' % (prefix, name) + continue + + segments = name[len(prefix): -len(suffix)].split('_') + segfail = False + seq = [] + for s in segments: + if not segment_re.match(s): + print 'bad codepoint name "%s" in %s/%s' % (s, dirname, name) + segfail = True + continue + n = int(s, 16) + if n > 0x10ffff: + print 'codepoint "%s" out of range in %s/%s' % (s, dirname, name) + segfail = True + continue + seq.append(n) + if not segfail: + result[tuple(seq)] = path.join(dirname, name) + return result + + +def collect_name_to_dirpath(directory, prefix, suffix): + """Return a mapping from filename to path rooted at directory, ignoring files + that don't match suffix. 
Report when a filename appears in more than one + subdir; the first path found is kept.""" + result = {} + for dirname, _, files in os.walk(directory): + if directory != '.': + dirname = path.join(directory, dirname) + for f in files: + if not f.endswith(suffix): + continue + if f in result: + print >> sys.stderr, 'duplicate file "%s" in %s and %s ' % ( + f, dirname, result[f]) + continue + result[f] = dirname + return result + + +def collect_name_to_dirpath_with_override(dirs, prefix, suffix): + """Return a mapping from filename to a directory path rooted at a directory + in dirs, using collect_name_to_dirpath. The last directory is retained. This + does not report an error if a file appears under more than one root directory, + so lets later root directories override earlier ones.""" + result = {} for d in dirs: - for f in glob.glob(path.join(d, '%s*.png' % prefix)): - m = path_re.match(path.basename(f)) - if not m: - print >> sys.stderr, 'could not match file "%s"' % f - continue - seq = tuple(int(s, 16) for s in m.group(1).split('_')) - if seq in seqs: - print >> sys.stderr, 'duplicate sequence for "%s"' % f - continue - seqs.add(seq) - return seqs + result.update(collect_name_to_dirpath(d, prefix, suffix)) + return result + + +def run_check(dirs, prefix, suffix): + print 'Checking files with prefix "%s" and suffix "%s" in:\n %s' % ( + prefix, suffix, '\n '.join(dirs)) + name_to_dirpath = collect_name_to_dirpath_with_override( + dirs, prefix=prefix, suffix=suffix) + print 'checking %d names' % len(name_to_dirpath) + seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix) + print 'checking %d sequences' % len(seq_to_filepath) + check_sequence_to_filepath(seq_to_filepath) + print 'done.' 
def main(): @@ -163,8 +219,14 @@ def main(): parser.add_argument( '-d', '--dirs', help='directories containing emoji images', metavar='dir', nargs='+', required=True) + parser.add_argument( + '-p', '--prefix', help='prefix to match, default "emoji_u"', + metavar='pfx', default='emoji_u') + parser.add_argument( + '-s', '--suffix', help='suffix to match, default ".png"', metavar='sfx', + default='.png') args = parser.parse_args() - check_sequences(_collect_sequences(args.dirs)) + run_check(args.dirs, args.prefix, args.suffix) if __name__ == '__main__': diff --git a/generate_emoji_html.py b/generate_emoji_html.py index f62113a64..3df68881c 100755 --- a/generate_emoji_html.py +++ b/generate_emoji_html.py @@ -204,9 +204,9 @@ def _generate_content(basedir, font, dir_infos, limit, annotate, standalone): if abs_srcdir == basedir: dirspec = '' elif abs_srcdir.startswith(basedir): - dirspec = abs_filedir[len(abs_basedir) + 1:] + dirspec = abs_srcdir[len(basedir) + 1:] else: - dirspec = abs_filedir + dirspec = abs_srcdir basepaths.append(dirspec) lines = ['']