Update emoji sequence check to be more flexible.

- supports checking files with extensions other than .png
- checks all files under a root directory, not just the
  files directly in that directory
- checks for duplicate file names across multiple directories under a root
- reports the directory containing a file when there are problems
pull/83/head
Doug Felt 2017-01-18 14:48:52 -08:00
parent f09b63d1ec
commit 4ec1a5fed8
2 changed files with 115 additions and 53 deletions

View File

@ -19,6 +19,7 @@
import argparse
import collections
import glob
import os
from os import path
import re
import sys
@ -37,7 +38,7 @@ def _seq_string(seq):
return '_'.join('%04x' % cp for cp in seq)
def _check_valid_emoji(sorted_seqs):
def _check_valid_emoji(sorted_seq_to_filepath):
"""Ensure all emoji are either valid emoji or specific chars."""
valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps())
@ -46,79 +47,80 @@ def _check_valid_emoji(sorted_seqs):
valid_cps.add(0xfe0f) # variation selector (emoji presentation)
valid_cps.add(0xfe82b) # PUA value for unknown flag
not_emoji = set()
for seq in sorted_seqs:
not_emoji = {}
for seq, fp in sorted_seq_to_filepath.iteritems():
for cp in seq:
if cp not in valid_cps:
not_emoji.add(cp)
if cp not in not_emoji:
not_emoji[cp] = []
not_emoji[cp].append(fp)
if len(not_emoji):
print >> sys.stderr, '%d non-emoji found:' % len(not_emoji)
for cp in sorted(not_emoji):
print >> sys.stderr, '%04X' % cp
print >> sys.stderr, '%04x (in %s)' % (cp, ', '.join(not_emoji[cp]))
def _check_zwj(sorted_seqs):
def _check_zwj(sorted_seq_to_filepath):
"""Ensure zwj is only between two appropriate emoji."""
ZWJ = 0x200D
EMOJI_PRESENTATION_VS = 0xFE0F
for seq in sorted_seqs:
for seq, fp in sorted_seq_to_filepath.iteritems():
if ZWJ not in seq:
continue
if seq[0] == 0x200d:
print >> sys.stderr, 'zwj at head of sequence'
print >> sys.stderr, 'zwj at head of sequence in %s' % fp
if len(seq) == 1:
continue
if seq[-1] == 0x200d:
print >> sys.stderr, 'zwj at end of sequence'
print >> sys.stderr, 'zwj at end of sequence in %s' % fp
for i, cp in enumerate(seq):
if cp == ZWJ:
if i > 0:
pcp = seq[i-1]
if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp):
print >> sys.stderr, 'non-emoji %04X preceeds ZWJ' % pcp
print >> sys.stderr, 'non-emoji %04x preceeds ZWJ in %s' % (pcp, fp)
if i < len(seq) - 1:
fcp = seq[i+1]
if not unicode_data.is_emoji(fcp):
print >> sys.stderr, 'non-emoji %04X follows ZWJ' % fcp
print >> sys.stderr, 'non-emoji %04x follows ZWJ in %s' % (fcp, fp)
def _check_flags(sorted_seqs):
def _check_flags(sorted_seq_to_filepath):
"""Ensure regional indicators are only in sequences of one or two, and
never mixed."""
for seq in sorted_seqs:
for seq, fp in sorted_seq_to_filepath.iteritems():
have_reg = None
for cp in seq:
is_reg = _is_regional_indicator(cp)
if have_reg == None:
have_reg = is_reg
elif have_reg != is_reg:
print >> sys.stderr, ('mix of regional and non-regional in %s' %
_seq_string(seq))
print >> sys.stderr, 'mix of regional and non-regional in %s' % fp
if have_reg and len(seq) > 2:
# We provide dummy glyphs for regional indicators, so there are sequences
# with single regional indicator symbols.
print >> sys.stderr, ('regional indicator sequence length != 2: %s' %
_seq_string(seq))
print >> sys.stderr, 'regional indicator sequence length != 2 in %s' % fp
def _check_skintone(sorted_seqs):
def _check_skintone(sorted_seq_to_filepath):
"""Ensure skin tone modifiers are not applied to emoji that are not defined
to take them. May appear standalone, though. Also check that emoji that take
skin tone modifiers have a complete set."""
base_to_modifiers = collections.defaultdict(set)
for seq in sorted_seqs:
for seq, fp in sorted_seq_to_filepath.iteritems():
for i, cp in enumerate(seq):
if _is_skintone_modifier(cp):
if i == 0:
if len(seq) > 1:
print >> sys.stderr, 'skin color selector first in sequence %s'
print >> sys.stderr, 'skin color selector first in sequence %s' % fp
# standalone are ok
continue
pcp = seq[i-1]
if not unicode_data.is_emoji_modifier_base(pcp):
print >> sys.stderr, (
'emoji skintone modifier applied to non-base at %d: %s' % (
i, _seq_string(seq)))
'emoji skintone modifier applied to non-base at %d: %s' % (i, fp))
elif unicode_data.is_emoji_modifier_base(cp):
if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]):
base_to_modifiers[cp].add(seq[i+1])
@ -126,36 +128,90 @@ def _check_skintone(sorted_seqs):
base_to_modifiers[cp] = set()
for cp, modifiers in sorted(base_to_modifiers.iteritems()):
if len(modifiers) != 5:
print 'emoji base %04X has %d modifiers defined (%s)' % (
print 'emoji base %04x has %d modifiers defined (%s) in %s' % (
cp, len(modifiers),
', '.join('%04x' % cp for cp in sorted(modifiers)))
', '.join('%04x' % cp for cp in sorted(modifiers)), fp)
def check_sequences(seqs):
    """Run every sequence check over seqs, visiting them in sorted order.

    The number of sequences is printed before the checks start and
    'done.' is printed once they have all run.
    """
    ordered = sorted(seqs)
    print('checking %d sequences' % len(seqs))
    # Each checker receives the same sorted list; the order of the checks
    # is valid-emoji, zwj, flags, skintone.
    for check in (_check_valid_emoji, _check_zwj, _check_flags,
                  _check_skintone):
        check(ordered)
    print('done.')
def check_sequence_to_filepath(seq_to_filepath):
    """Run every sequence check on a sequence-to-file-path mapping.

    The mapping is copied into an OrderedDict whose items are sorted, and
    that ordered copy is handed to each check in turn.
    """
    sorted_items = sorted(seq_to_filepath.items())
    ordered = collections.OrderedDict(sorted_items)
    # Same order of checks as always: valid-emoji, zwj, flags, skintone.
    for check in (_check_valid_emoji, _check_zwj, _check_flags,
                  _check_skintone):
        check(ordered)
def _collect_sequences(dirs, prefix='emoji_u'):
seqs = set()
path_re = re.compile('%s([a-zA-Z0-9_]+)\.png' % prefix)
def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
    """Check names and convert the well-formed ones to code point sequences.

    A name is expected to be prefix, followed by '_'-separated segments of
    4 to 6 lowercase hex digits, followed by suffix.

    Args:
      name_to_dirpath: mapping from file name to the directory holding it.
      prefix: text every name must start with.
      suffix: text stripped from the end of every name; may be empty.

    Returns:
      A dict mapping each sequence (a tuple of ints) to
      path.join(dirname, name).  A name that lacks the prefix, or has a
      malformed or out-of-range segment, is reported to stderr and left
      out of the result.
    """
    segment_re = re.compile(r'^[0-9a-f]{4,6}$')
    result = {}
    for name, dirname in name_to_dirpath.items():
        if not name.startswith(prefix):
            sys.stderr.write(
                'expected prefix "%s" for "%s"\n' % (prefix, name))
            continue

        # Strip the suffix only when there is one: a slice ending at
        # -len('') is a slice ending at 0, which would discard the whole
        # name.
        stem = name[len(prefix):]
        if suffix:
            stem = stem[:-len(suffix)]

        seq = []
        segfail = False
        # Keep scanning after a bad segment so every problem in the name
        # is reported, not just the first one.
        for s in stem.split('_'):
            if not segment_re.match(s):
                sys.stderr.write('bad codepoint name "%s" in %s/%s\n' % (
                    s, dirname, name))
                segfail = True
                continue
            n = int(s, 16)
            if n > 0x10ffff:
                sys.stderr.write('codepoint "%s" out of range in %s/%s\n' % (
                    s, dirname, name))
                segfail = True
                continue
            seq.append(n)
        if not segfail:
            result[tuple(seq)] = path.join(dirname, name)
    return result
def collect_name_to_dirpath(directory, prefix, suffix):
    """Return a mapping from file name to the directory that contains it.

    Every directory under (and including) directory is visited.  Files
    whose names do not end with suffix are ignored.  When a file name
    appears in more than one directory the duplicate is reported to stderr
    and the first path found is kept.

    Args:
      directory: root of the tree to walk.
      prefix: not used by this function.
      suffix: only file names ending with this text are recorded.

    Returns:
      A dict from file name to directory path.  Paths are the ones os.walk
      yields, so they begin with directory.
    """
    result = {}
    # os.walk yields directory paths that already start with the root it
    # was given, so they are used unchanged; joining the root on again
    # would double it for a relative root ('imgs/imgs/sub').
    for dirname, _, files in os.walk(directory):
        for f in files:
            if not f.endswith(suffix):
                continue
            if f in result:
                sys.stderr.write('duplicate file "%s" in %s and %s \n' % (
                    f, dirname, result[f]))
                continue
            result[f] = dirname
    return result
def collect_name_to_dirpath_with_override(dirs, prefix, suffix):
"""Return a mapping from filename to a directory path rooted at a directory
in dirs, using collect_name_to_filepath. The last directory is retained. This
does not report an error if a file appears under more than one root directory,
so lets later root directories override earlier ones."""
result = {}
for d in dirs:
for f in glob.glob(path.join(d, '%s*.png' % prefix)):
m = path_re.match(path.basename(f))
if not m:
print >> sys.stderr, 'could not match file "%s"' % f
continue
seq = tuple(int(s, 16) for s in m.group(1).split('_'))
if seq in seqs:
print >> sys.stderr, 'duplicate sequence for "%s"' % f
continue
seqs.add(seq)
return seqs
result.update(collect_name_to_dirpath(d, prefix, suffix))
return result
def run_check(dirs, prefix, suffix):
    """Collect matching file names under dirs and run the sequence checks.

    Progress is printed along the way: the prefix, suffix and directories
    being checked, the number of names collected, the number of sequences
    built from them, and 'done.' after the checks have run.
    """
    dir_listing = '\n '.join(dirs)
    print('Checking files with prefix "%s" and suffix "%s" in:\n %s' % (
        prefix, suffix, dir_listing))

    names = collect_name_to_dirpath_with_override(
        dirs, prefix=prefix, suffix=suffix)
    print('checking %d names' % len(names))

    sequences = create_sequence_to_filepath(names, prefix, suffix)
    print('checking %d sequences' % len(sequences))

    check_sequence_to_filepath(sequences)
    print('done.')
def main():
@ -163,8 +219,14 @@ def main():
parser.add_argument(
'-d', '--dirs', help='directories containing emoji images',
metavar='dir', nargs='+', required=True)
parser.add_argument(
'-p', '--prefix', help='prefix to match, default "emoji_u"',
metavar='pfx', default='emoji_u')
parser.add_argument(
'-s', '--suffix', help='suffix to match, default ".png"', metavar='sfx',
default='.png')
args = parser.parse_args()
check_sequences(_collect_sequences(args.dirs))
run_check(args.dirs, args.prefix, args.suffix)
if __name__ == '__main__':

View File

@ -204,9 +204,9 @@ def _generate_content(basedir, font, dir_infos, limit, annotate, standalone):
if abs_srcdir == basedir:
dirspec = ''
elif abs_srcdir.startswith(basedir):
dirspec = abs_filedir[len(abs_basedir) + 1:]
dirspec = abs_srcdir[len(basedir) + 1:]
else:
dirspec = abs_filedir
dirspec = abs_srcdir
basepaths.append(dirspec)
lines = ['<table>']