From 5bec5cce2c0dd7318a1adf9dd69aa3cab88680b9 Mon Sep 17 00:00:00 2001 From: Doug Felt Date: Mon, 16 Apr 2018 18:25:58 -0700 Subject: [PATCH] Update check_emoji_sequences. - use existing utilities in nototools/unicode_data, add_aliases - add check that file names do not use presentation selectors - include tags in valid cps that can appear in a sequence - add check for valid tag sequences (for subregion flags) - separate out check that no source for an alias is present (we expect to alias this so should not have an image with that name) - filter data by age (somewhat), provide command line flag, remove hard-coded unicode 9.0 value - separate coverage check (for when data is partial), provide command line flag and don't run by default - provide command line flag to exclude subdirs by name when collecting images - refactor output so each error has a consistent initial text indicating the check where the error was found, make output a tad less verbose --- check_emoji_sequences.py | 307 +++++++++++++++++++++++---------------- 1 file changed, 182 insertions(+), 125 deletions(-) diff --git a/check_emoji_sequences.py b/check_emoji_sequences.py index 083e0d5a8..f29bbe90b 100755 --- a/check_emoji_sequences.py +++ b/check_emoji_sequences.py @@ -26,25 +26,21 @@ import re import sys from nototools import unicode_data - -DATA_ROOT = path.dirname(path.abspath(__file__)) +import add_aliases ZWJ = 0x200d EMOJI_VS = 0xfe0f -def _is_regional_indicator(cp): - return 0x1f1e6 <= cp <= 0x1f1ff +END_TAG = 0xe007f +def _make_tag_set(): + tag_set = set() + tag_set |= set(range(0xe0030, 0xe003a)) # 0-9 + tag_set |= set(range(0xe0061, 0xe007b)) # a-z + tag_set.add(END_TAG) + return tag_set -def _is_skintone_modifier(cp): - return 0x1f3fb <= cp <= 0x1f3ff - - -def _seq_string(seq): - return '_'.join('%04x' % cp for cp in seq) - -def strip_vs(seq): - return tuple(cp for cp in seq if cp != EMOJI_VS) +TAG_SET = _make_tag_set() _namedata = None @@ -54,7 +50,7 @@ def seq_name(seq): if not _namedata: def strip_vs_map(seq_map): return { - strip_vs(k): v + unicode_data.strip_emoji_vs(k): v for k, v in seq_map.iteritems()} _namedata = [ strip_vs_map(unicode_data.get_emoji_combining_sequences()), @@ -70,7 +66,7 @@ def seq_name(seq): if seq in data: return data[seq] if EMOJI_VS in seq: - non_vs_seq = strip_vs(seq) + non_vs_seq = unicode_data.strip_emoji_vs(seq) for data in _namedata: if non_vs_seq in data: return data[non_vs_seq] @@ -78,14 +74,29 @@ def seq_name(seq): return None -def _check_valid_emoji(sorted_seq_to_filepath): - """Ensure all emoji are either valid emoji or specific chars.""" +def _check_no_vs(sorted_seq_to_filepath): + """Our image data does not use emoji presentation variation selectors.""" + for seq, fp in sorted_seq_to_filepath.iteritems(): + if EMOJI_VS in seq: + print('check no VS: FE0F in path: %s' % fp) - valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps()) + +def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version): + """Ensure all cps in these sequences are valid emoji cps or specific cps + used in forming emoji sequences. This is a 'pre-check' that reports + this specific problem.""" + + valid_cps = set(unicode_data.get_emoji()) + if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE: + valid_cps |= unicode_data.proposed_emoji_cps() + else: + valid_cps = set( + cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version) valid_cps.add(0x200d) # ZWJ valid_cps.add(0x20e3) # combining enclosing keycap valid_cps.add(0xfe0f) # variation selector (emoji presentation) valid_cps.add(0xfe82b) # PUA value for unknown flag + valid_cps |= TAG_SET # used in subregion tag sequences not_emoji = {} for seq, fp in sorted_seq_to_filepath.iteritems(): @@ -96,35 +107,43 @@ def _check_valid_emoji(sorted_seq_to_filepath): not_emoji[cp].append(fp) if len(not_emoji): - print('%d non-emoji found:' % len(not_emoji), file=sys.stderr) + print( + 'check valid emoji cps: %d non-emoji cp found' % len(not_emoji), + file=sys.stderr) for cp in sorted(not_emoji): - print('%04x (in %s)' % (cp, ', '.join(not_emoji[cp])), file=sys.stderr) + fps = not_emoji[cp] + print( + 'check valid emoji cps: %04x (in %d sequences)' % (cp, len(fps)), + file=sys.stderr) def _check_zwj(sorted_seq_to_filepath): - """Ensure zwj is only between two appropriate emoji.""" - ZWJ = 0x200D - EMOJI_PRESENTATION_VS = 0xFE0F + """Ensure zwj is only between two appropriate emoji. This is a 'pre-check' + that reports this specific problem.""" for seq, fp in sorted_seq_to_filepath.iteritems(): if ZWJ not in seq: continue - if seq[0] == 0x200d: - print('zwj at head of sequence in %s' % fp, file=sys.stderr) + if seq[0] == ZWJ: + print('check zwj: zwj at head of sequence in %s' % fp, file=sys.stderr) if len(seq) == 1: continue - if seq[-1] == 0x200d: - print('zwj at end of sequence in %s' % fp, file=sys.stderr) + if seq[-1] == ZWJ: + print('check zwj: zwj at end of sequence in %s' % fp, file=sys.stderr) for i, cp in enumerate(seq): if cp == ZWJ: if i > 0: pcp = seq[i-1] - if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp): - print('non-emoji %04x preceeds ZWJ in %s' % (pcp, fp), file=sys.stderr) + if pcp != EMOJI_VS and not unicode_data.is_emoji(pcp): + print( + 'check zwj: non-emoji %04x preceeds ZWJ in %s' % (pcp, fp), + file=sys.stderr) if i < len(seq) - 1: fcp = seq[i+1] if not unicode_data.is_emoji(fcp): - print('non-emoji %04x follows ZWJ in %s' % (fcp, fp), file=sys.stderr) + print( + 'check zwj: non-emoji %04x follows ZWJ in %s' % (fcp, fp), + file=sys.stderr) def _check_flags(sorted_seq_to_filepath): @@ -133,15 +152,40 @@ def _check_flags(sorted_seq_to_filepath): for seq, fp in sorted_seq_to_filepath.iteritems(): have_reg = None for cp in seq: - is_reg = _is_regional_indicator(cp) + is_reg = unicode_data.is_regional_indicator(cp) if have_reg == None: have_reg = is_reg elif have_reg != is_reg: - print('mix of regional and non-regional in %s' % fp, file=sys.stderr) + print( + 'check flags: mix of regional and non-regional in %s' % fp, + file=sys.stderr) if have_reg and len(seq) > 2: # We provide dummy glyphs for regional indicators, so there are sequences - # with single regional indicator symbols. - print('regional indicator sequence length != 2 in %s' % fp, file=sys.stderr) + # with single regional indicator symbols, the len check handles this. + print( + 'check flags: regional indicator sequence length != 2 in %s' % fp, + file=sys.stderr) + +def _check_tags(sorted_seq_to_filepath): + """Ensure tag sequences (for subregion flags) conform to the spec. We don't + validate against CLDR, just that there's a sequence of 2 or more tags starting + and ending with the appropriate codepoints.""" + + BLACK_FLAG = 0x1f3f4 + BLACK_FLAG_SET = set([BLACK_FLAG]) + for seq, fp in sorted_seq_to_filepath.iteritems(): + seq_set = set(cp for cp in seq) + overlap_set = seq_set & TAG_SET + if not overlap_set: + continue + if seq[0] != BLACK_FLAG: + print('check tags: bad start tag in %s' % fp) + elif seq[-1] != END_TAG: + print('check tags: bad end tag in %s' % fp) + elif len(seq) < 4: + print('check tags: sequence too short in %s' % fp) + elif seq_set - TAG_SET != BLACK_FLAG_SET: + print('check tags: non-tag items in %s' % fp) def _check_skintone(sorted_seq_to_filepath): @@ -151,90 +195,76 @@ def _check_skintone(sorted_seq_to_filepath): base_to_modifiers = collections.defaultdict(set) for seq, fp in sorted_seq_to_filepath.iteritems(): for i, cp in enumerate(seq): - if _is_skintone_modifier(cp): + if unicode_data.is_skintone_modifier(cp): if i == 0: if len(seq) > 1: - print('skin color selector first in sequence %s' % fp, file=sys.stderr) + print( + 'check skintone: skin color selector first in sequence %s' % fp, + file=sys.stderr) # standalone are ok continue pcp = seq[i-1] if not unicode_data.is_emoji_modifier_base(pcp): - print(( - 'emoji skintone modifier applied to non-base at %d: %s' % (i, fp)), file=sys.stderr) - elif unicode_data.is_emoji_modifier_base(cp): - if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]): - base_to_modifiers[cp].add(seq[i+1]) - elif cp not in base_to_modifiers: - base_to_modifiers[cp] = set() + print( + 'check skintone: emoji skintone modifier applied to non-base ' + + 'at %d: %s' % (i, fp), file=sys.stderr) + else: + if pcp not in base_to_modifiers: + base_to_modifiers[pcp] = set() + base_to_modifiers[pcp].add(cp) + for cp, modifiers in sorted(base_to_modifiers.iteritems()): if len(modifiers) != 5: - print('emoji base %04x has %d modifiers defined (%s) in %s' % ( - cp, len(modifiers), - ', '.join('%04x' % cp for cp in sorted(modifiers)), fp), file=sys.stderr) + print( + 'check skintone: base %04x has %d modifiers defined (%s) in %s' % ( + cp, len(modifiers), + ', '.join('%04x' % cp for cp in sorted(modifiers)), fp), + file=sys.stderr) -def _check_zwj_sequences(seq_to_filepath): - """Verify that zwj sequences are valid.""" - zwj_sequence_to_name = unicode_data.get_emoji_zwj_sequences() - # strip emoji variant selectors and add extra mappings - zwj_sequence_without_vs_to_name_canonical = {} - for seq, seq_name in zwj_sequence_to_name.iteritems(): - if EMOJI_VS in seq: - stripped_seq = strip_vs(seq) - zwj_sequence_without_vs_to_name_canonical[stripped_seq] = (seq_name, seq) - - zwj_seq_to_filepath = { - seq: fp for seq, fp in seq_to_filepath.iteritems() - if ZWJ in seq} - - for seq, fp in zwj_seq_to_filepath.iteritems(): - if seq not in zwj_sequence_to_name: - if seq not in zwj_sequence_without_vs_to_name_canonical: - print('zwj sequence not defined: %s' % fp, file=sys.stderr) - else: - _, can = zwj_sequence_without_vs_to_name_canonical[seq] - # print >> sys.stderr, 'canonical sequence %s contains vs: %s' % ( - # _seq_string(can), fp) - -def read_emoji_aliases(): - result = {} - - with open(path.join(DATA_ROOT, 'emoji_aliases.txt'), 'r') as f: - for line in f: - ix = line.find('#') - if (ix > -1): - line = line[:ix] - line = line.strip() - if not line: - continue - als, trg = (s.strip() for s in line.split(';')) - als_seq = tuple([int(x, 16) for x in als.split('_')]) - try: - trg_seq = tuple([int(x, 16) for x in trg.split('_')]) - except: - print('cannot process alias %s -> %s' % (als, trg)) - continue - result[als_seq] = trg_seq - return result +def _check_zwj_sequences(sorted_seq_to_filepath, unicode_version): + """Verify that zwj sequences are valid for the given unicode version.""" + for seq, fp in sorted_seq_to_filepath.iteritems(): + if ZWJ not in seq: + continue + age = unicode_data.get_emoji_sequence_age(seq) + if age is None or unicode_version is not None and age > unicode_version: + print('check zwj sequences: undefined sequence %s' % fp) -def _check_coverage(seq_to_filepath): - age = 9.0 +def _check_no_alias_sources(sorted_seq_to_filepath): + """Check that we don't have sequences that we expect to be aliased to + some other sequence.""" + aliases = add_aliases.read_default_emoji_aliases() + for seq, fp in sorted_seq_to_filepath.iteritems(): + if seq in aliases: + print('check no alias sources: aliased sequence %s' % fp) + + +def _check_coverage(seq_to_filepath, unicode_version): + """Ensure we have all and only the cps and sequences that we need for the + font as of this version.""" + + age = unicode_version non_vs_to_canonical = {} for k in seq_to_filepath: if EMOJI_VS in k: - non_vs = strip_vs(k) + non_vs = unicode_data.strip_emoji_vs(k) non_vs_to_canonical[non_vs] = k - aliases = read_emoji_aliases() + aliases = add_aliases.read_default_emoji_aliases() for k, v in sorted(aliases.items()): if v not in seq_to_filepath and v not in non_vs_to_canonical: - print('alias %s missing target %s' % (_seq_string(k), _seq_string(v))) + alias_str = unicode_data.seq_to_string(k) + target_str = unicode_data.seq_to_string(v) + print('coverage: alias %s missing target %s' % (alias_str, target_str)) continue if k in seq_to_filepath or k in non_vs_to_canonical: - print('alias %s already exists as %s (%s)' % ( - _seq_string(k), _seq_string(v), seq_name(v))) + alias_str = unicode_data.seq_to_string(k) + target_str = unicode_data.seq_to_string(v) + print('coverage: alias %s already exists as %s (%s)' % ( + alias_str, target_str, seq_name(v))) continue filename = seq_to_filepath.get(v) or seq_to_filepath[non_vs_to_canonical[v]] seq_to_filepath[k] = 'alias:' + filename @@ -243,13 +273,15 @@ def _check_coverage(seq_to_filepath): emoji = sorted(unicode_data.get_emoji(age=age)) for cp in emoji: if tuple([cp]) not in seq_to_filepath: - print('missing single %04x (%s)' % (cp, unicode_data.name(cp, ''))) + print( + 'coverage: missing single %04x (%s)' % ( + cp, unicode_data.name(cp, ''))) # special characters # all but combining enclosing keycap are currently marked as emoji for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + range(0x30, 0x3a): if cp not in emoji and tuple([cp]) not in seq_to_filepath: - print('missing special %04x (%s)' % (cp, unicode_data.name(cp))) + print('coverage: missing special %04x (%s)' % (cp, unicode_data.name(cp))) # combining sequences comb_seq_to_name = sorted( @@ -257,24 +289,26 @@ def _check_coverage(seq_to_filepath): for seq, name in comb_seq_to_name: if seq not in seq_to_filepath: # strip vs and try again - non_vs_seq = strip_vs(seq) + non_vs_seq = unicode_data.strip_emoji_vs(seq) if non_vs_seq not in seq_to_filepath: - print('missing combining sequence %s (%s)' % (_seq_string(seq), name)) + print('coverage: missing combining sequence %s (%s)' % + (unicode_data.seq_to_string(seq), name)) # flag sequences flag_seq_to_name = sorted( unicode_data.get_emoji_flag_sequences(age=age).iteritems()) for seq, name in flag_seq_to_name: if seq not in seq_to_filepath: - print('missing flag sequence %s (%s)' % (_seq_string(seq), name)) + print('coverage: missing flag sequence %s (%s)' % + (unicode_data.seq_to_string(seq), name)) # skin tone modifier sequences mod_seq_to_name = sorted( unicode_data.get_emoji_modifier_sequences(age=age).iteritems()) for seq, name in mod_seq_to_name: if seq not in seq_to_filepath: - print('missing modifier sequence %s (%s)' % ( - _seq_string(seq), name)) + print('coverage: missing modifier sequence %s (%s)' % ( + unicode_data.seq_to_string(seq), name)) # zwj sequences # some of ours include the emoji presentation variation selector and some @@ -295,25 +329,30 @@ def _check_coverage(seq_to_filepath): else: test_seq = seq if test_seq not in zwj_seq_without_vs: - print('missing (canonical) zwj sequence %s (%s)' % ( - _seq_string(seq), name)) + print('coverage: missing (canonical) zwj sequence %s (%s)' % ( + unicode_data.seq_to_string(seq), name)) # check for 'unknown flag' - # this is either emoji_ufe82b or 'unknown_flag', we filter out things that + # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that # don't start with our prefix so 'unknown_flag' would be excluded by default. if tuple([0xfe82b]) not in seq_to_filepath: - print('missing unknown flag PUA fe82b') + print('coverage: missing unknown flag PUA fe82b') -def check_sequence_to_filepath(seq_to_filepath): +def check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage): sorted_seq_to_filepath = collections.OrderedDict( sorted(seq_to_filepath.items())) - _check_valid_emoji(sorted_seq_to_filepath) + _check_no_vs(sorted_seq_to_filepath) + _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version) _check_zwj(sorted_seq_to_filepath) _check_flags(sorted_seq_to_filepath) + _check_tags(sorted_seq_to_filepath) _check_skintone(sorted_seq_to_filepath) - _check_zwj_sequences(sorted_seq_to_filepath) - _check_coverage(sorted_seq_to_filepath) + _check_zwj_sequences(sorted_seq_to_filepath, unicode_version) + _check_no_alias_sources(sorted_seq_to_filepath) + if coverage: + _check_coverage(sorted_seq_to_filepath, unicode_version) + def create_sequence_to_filepath(name_to_dirpath, prefix, suffix): """Check names, and convert name to sequences for names that are ok, @@ -345,12 +384,15 @@ def create_sequence_to_filepath(name_to_dirpath, prefix, suffix): return result -def collect_name_to_dirpath(directory, prefix, suffix): +def collect_name_to_dirpath(directory, prefix, suffix, exclude=None): """Return a mapping from filename to path rooted at directory, ignoring files - that don't match suffix. Report when a filename appears in more than one - subdir; the first path found is kept.""" + that don't match suffix, and subtrees with names in exclude. Report when a + filename appears in more than one subdir; the first path found is kept.""" result = {} - for dirname, _, files in os.walk(directory): + for dirname, dirs, files in os.walk(directory, topdown=True): + if exclude: + dirs[:] = [d for d in dirs if d not in exclude] + if directory != '.': dirname = path.join(directory, dirname) for f in files: @@ -364,42 +406,57 @@ def collect_name_to_dirpath(directory, prefix, suffix): return result -def collect_name_to_dirpath_with_override(dirs, prefix, suffix): +def collect_name_to_dirpath_with_override(dirs, prefix, suffix, exclude=None): """Return a mapping from filename to a directory path rooted at a directory in dirs, using collect_name_to_filepath. The last directory is retained. This does not report an error if a file appears under more than one root directory, - so lets later root directories override earlier ones.""" + so lets later root directories override earlier ones. Use 'exclude' to + name subdirectories (of any root) whose subtree you wish to skip.""" result = {} for d in dirs: - result.update(collect_name_to_dirpath(d, prefix, suffix)) + result.update(collect_name_to_dirpath(d, prefix, suffix, exclude)) return result -def run_check(dirs, prefix, suffix): - print('Checking files with prefix "%s" and suffix "%s" in:\n %s' % ( - prefix, suffix, '\n '.join(dirs))) +def run_check(dirs, prefix, suffix, exclude, unicode_version, coverage): + msg = '' + if unicode_version: + msg = ' (%3.1f)' % unicode_version + print('Checking files with prefix "%s" and suffix "%s"%s in:\n %s' % ( + prefix, suffix, msg, '\n '.join(dirs))) name_to_dirpath = collect_name_to_dirpath_with_override( - dirs, prefix=prefix, suffix=suffix) + dirs, prefix=prefix, suffix=suffix, exclude=exclude) print('checking %d names' % len(name_to_dirpath)) seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix) print('checking %d sequences' % len(seq_to_filepath)) - check_sequence_to_filepath(seq_to_filepath) + check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage) print('done.') def main(): parser = argparse.ArgumentParser() parser.add_argument( - '-d', '--dirs', help='directories containing emoji images', + '-d', '--dirs', help='directory roots containing emoji images', metavar='dir', nargs='+', required=True) + parser.add_argument( + '-e', '--exclude', help='names of source subdirs to exclude', + metavar='dir', nargs='+') + parser.add_argument( + '-c', '--coverage', help='test for complete coverage', + action='store_true') parser.add_argument( '-p', '--prefix', help='prefix to match, default "emoji_u"', metavar='pfx', default='emoji_u') parser.add_argument( '-s', '--suffix', help='suffix to match, default ".png"', metavar='sfx', default='.png') + parser.add_argument( + '-u', '--unicode_version', help='limit to this unicode version or before', + metavar='version', type=float) args = parser.parse_args() - run_check(args.dirs, args.prefix, args.suffix) + run_check( + args.dirs, args.prefix, args.suffix, args.exclude, args.unicode_version, + args.coverage) if __name__ == '__main__':