Merge pull request #210 from dougfelt/emoji_check_update
Update check_emoji_sequences.pull/205/merge
commit
1a9fb3cd76
|
@ -26,25 +26,21 @@ import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from nototools import unicode_data
|
from nototools import unicode_data
|
||||||
|
import add_aliases
|
||||||
DATA_ROOT = path.dirname(path.abspath(__file__))
|
|
||||||
|
|
||||||
ZWJ = 0x200d
|
ZWJ = 0x200d
|
||||||
EMOJI_VS = 0xfe0f
|
EMOJI_VS = 0xfe0f
|
||||||
|
|
||||||
def _is_regional_indicator(cp):
|
END_TAG = 0xe007f
|
||||||
return 0x1f1e6 <= cp <= 0x1f1ff
|
|
||||||
|
|
||||||
|
def _make_tag_set():
|
||||||
|
tag_set = set()
|
||||||
|
tag_set |= set(range(0xe0030, 0xe003a)) # 0-9
|
||||||
|
tag_set |= set(range(0xe0061, 0xe007b)) # a-z
|
||||||
|
tag_set.add(END_TAG)
|
||||||
|
return tag_set
|
||||||
|
|
||||||
def _is_skintone_modifier(cp):
|
TAG_SET = _make_tag_set()
|
||||||
return 0x1f3fb <= cp <= 0x1f3ff
|
|
||||||
|
|
||||||
|
|
||||||
def _seq_string(seq):
|
|
||||||
return '_'.join('%04x' % cp for cp in seq)
|
|
||||||
|
|
||||||
def strip_vs(seq):
|
|
||||||
return tuple(cp for cp in seq if cp != EMOJI_VS)
|
|
||||||
|
|
||||||
_namedata = None
|
_namedata = None
|
||||||
|
|
||||||
|
@ -54,7 +50,7 @@ def seq_name(seq):
|
||||||
if not _namedata:
|
if not _namedata:
|
||||||
def strip_vs_map(seq_map):
|
def strip_vs_map(seq_map):
|
||||||
return {
|
return {
|
||||||
strip_vs(k): v
|
unicode_data.strip_emoji_vs(k): v
|
||||||
for k, v in seq_map.iteritems()}
|
for k, v in seq_map.iteritems()}
|
||||||
_namedata = [
|
_namedata = [
|
||||||
strip_vs_map(unicode_data.get_emoji_combining_sequences()),
|
strip_vs_map(unicode_data.get_emoji_combining_sequences()),
|
||||||
|
@ -70,7 +66,7 @@ def seq_name(seq):
|
||||||
if seq in data:
|
if seq in data:
|
||||||
return data[seq]
|
return data[seq]
|
||||||
if EMOJI_VS in seq:
|
if EMOJI_VS in seq:
|
||||||
non_vs_seq = strip_vs(seq)
|
non_vs_seq = unicode_data.strip_emoji_vs(seq)
|
||||||
for data in _namedata:
|
for data in _namedata:
|
||||||
if non_vs_seq in data:
|
if non_vs_seq in data:
|
||||||
return data[non_vs_seq]
|
return data[non_vs_seq]
|
||||||
|
@ -78,14 +74,29 @@ def seq_name(seq):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _check_valid_emoji(sorted_seq_to_filepath):
|
def _check_no_vs(sorted_seq_to_filepath):
|
||||||
"""Ensure all emoji are either valid emoji or specific chars."""
|
"""Our image data does not use emoji presentation variation selectors."""
|
||||||
|
for seq, fp in sorted_seq_to_filepath.iteritems():
|
||||||
|
if EMOJI_VS in seq:
|
||||||
|
print('check no VS: FE0F in path: %s' % fp)
|
||||||
|
|
||||||
valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps())
|
|
||||||
|
def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version):
|
||||||
|
"""Ensure all cps in these sequences are valid emoji cps or specific cps
|
||||||
|
used in forming emoji sequences. This is a 'pre-check' that reports
|
||||||
|
this specific problem."""
|
||||||
|
|
||||||
|
valid_cps = set(unicode_data.get_emoji())
|
||||||
|
if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE:
|
||||||
|
valid_cps |= unicode_data.proposed_emoji_cps()
|
||||||
|
else:
|
||||||
|
valid_cps = set(
|
||||||
|
cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version)
|
||||||
valid_cps.add(0x200d) # ZWJ
|
valid_cps.add(0x200d) # ZWJ
|
||||||
valid_cps.add(0x20e3) # combining enclosing keycap
|
valid_cps.add(0x20e3) # combining enclosing keycap
|
||||||
valid_cps.add(0xfe0f) # variation selector (emoji presentation)
|
valid_cps.add(0xfe0f) # variation selector (emoji presentation)
|
||||||
valid_cps.add(0xfe82b) # PUA value for unknown flag
|
valid_cps.add(0xfe82b) # PUA value for unknown flag
|
||||||
|
valid_cps |= TAG_SET # used in subregion tag sequences
|
||||||
|
|
||||||
not_emoji = {}
|
not_emoji = {}
|
||||||
for seq, fp in sorted_seq_to_filepath.iteritems():
|
for seq, fp in sorted_seq_to_filepath.iteritems():
|
||||||
|
@ -96,35 +107,43 @@ def _check_valid_emoji(sorted_seq_to_filepath):
|
||||||
not_emoji[cp].append(fp)
|
not_emoji[cp].append(fp)
|
||||||
|
|
||||||
if len(not_emoji):
|
if len(not_emoji):
|
||||||
print('%d non-emoji found:' % len(not_emoji), file=sys.stderr)
|
print(
|
||||||
|
'check valid emoji cps: %d non-emoji cp found' % len(not_emoji),
|
||||||
|
file=sys.stderr)
|
||||||
for cp in sorted(not_emoji):
|
for cp in sorted(not_emoji):
|
||||||
print('%04x (in %s)' % (cp, ', '.join(not_emoji[cp])), file=sys.stderr)
|
fps = not_emoji[cp]
|
||||||
|
print(
|
||||||
|
'check valid emoji cps: %04x (in %d sequences)' % (cp, len(fps)),
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
def _check_zwj(sorted_seq_to_filepath):
|
def _check_zwj(sorted_seq_to_filepath):
|
||||||
"""Ensure zwj is only between two appropriate emoji."""
|
"""Ensure zwj is only between two appropriate emoji. This is a 'pre-check'
|
||||||
ZWJ = 0x200D
|
that reports this specific problem."""
|
||||||
EMOJI_PRESENTATION_VS = 0xFE0F
|
|
||||||
|
|
||||||
for seq, fp in sorted_seq_to_filepath.iteritems():
|
for seq, fp in sorted_seq_to_filepath.iteritems():
|
||||||
if ZWJ not in seq:
|
if ZWJ not in seq:
|
||||||
continue
|
continue
|
||||||
if seq[0] == 0x200d:
|
if seq[0] == ZWJ:
|
||||||
print('zwj at head of sequence in %s' % fp, file=sys.stderr)
|
print('check zwj: zwj at head of sequence in %s' % fp, file=sys.stderr)
|
||||||
if len(seq) == 1:
|
if len(seq) == 1:
|
||||||
continue
|
continue
|
||||||
if seq[-1] == 0x200d:
|
if seq[-1] == ZWJ:
|
||||||
print('zwj at end of sequence in %s' % fp, file=sys.stderr)
|
print('check zwj: zwj at end of sequence in %s' % fp, file=sys.stderr)
|
||||||
for i, cp in enumerate(seq):
|
for i, cp in enumerate(seq):
|
||||||
if cp == ZWJ:
|
if cp == ZWJ:
|
||||||
if i > 0:
|
if i > 0:
|
||||||
pcp = seq[i-1]
|
pcp = seq[i-1]
|
||||||
if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp):
|
if pcp != EMOJI_VS and not unicode_data.is_emoji(pcp):
|
||||||
print('non-emoji %04x preceeds ZWJ in %s' % (pcp, fp), file=sys.stderr)
|
print(
|
||||||
|
'check zwj: non-emoji %04x preceeds ZWJ in %s' % (pcp, fp),
|
||||||
|
file=sys.stderr)
|
||||||
if i < len(seq) - 1:
|
if i < len(seq) - 1:
|
||||||
fcp = seq[i+1]
|
fcp = seq[i+1]
|
||||||
if not unicode_data.is_emoji(fcp):
|
if not unicode_data.is_emoji(fcp):
|
||||||
print('non-emoji %04x follows ZWJ in %s' % (fcp, fp), file=sys.stderr)
|
print(
|
||||||
|
'check zwj: non-emoji %04x follows ZWJ in %s' % (fcp, fp),
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
def _check_flags(sorted_seq_to_filepath):
|
def _check_flags(sorted_seq_to_filepath):
|
||||||
|
@ -133,15 +152,40 @@ def _check_flags(sorted_seq_to_filepath):
|
||||||
for seq, fp in sorted_seq_to_filepath.iteritems():
|
for seq, fp in sorted_seq_to_filepath.iteritems():
|
||||||
have_reg = None
|
have_reg = None
|
||||||
for cp in seq:
|
for cp in seq:
|
||||||
is_reg = _is_regional_indicator(cp)
|
is_reg = unicode_data.is_regional_indicator(cp)
|
||||||
if have_reg == None:
|
if have_reg == None:
|
||||||
have_reg = is_reg
|
have_reg = is_reg
|
||||||
elif have_reg != is_reg:
|
elif have_reg != is_reg:
|
||||||
print('mix of regional and non-regional in %s' % fp, file=sys.stderr)
|
print(
|
||||||
|
'check flags: mix of regional and non-regional in %s' % fp,
|
||||||
|
file=sys.stderr)
|
||||||
if have_reg and len(seq) > 2:
|
if have_reg and len(seq) > 2:
|
||||||
# We provide dummy glyphs for regional indicators, so there are sequences
|
# We provide dummy glyphs for regional indicators, so there are sequences
|
||||||
# with single regional indicator symbols.
|
# with single regional indicator symbols, the len check handles this.
|
||||||
print('regional indicator sequence length != 2 in %s' % fp, file=sys.stderr)
|
print(
|
||||||
|
'check flags: regional indicator sequence length != 2 in %s' % fp,
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
def _check_tags(sorted_seq_to_filepath):
|
||||||
|
"""Ensure tag sequences (for subregion flags) conform to the spec. We don't
|
||||||
|
validate against CLDR, just that there's a sequence of 2 or more tags starting
|
||||||
|
and ending with the appropriate codepoints."""
|
||||||
|
|
||||||
|
BLACK_FLAG = 0x1f3f4
|
||||||
|
BLACK_FLAG_SET = set([BLACK_FLAG])
|
||||||
|
for seq, fp in sorted_seq_to_filepath.iteritems():
|
||||||
|
seq_set = set(cp for cp in seq)
|
||||||
|
overlap_set = seq_set & TAG_SET
|
||||||
|
if not overlap_set:
|
||||||
|
continue
|
||||||
|
if seq[0] != BLACK_FLAG:
|
||||||
|
print('check tags: bad start tag in %s' % fp)
|
||||||
|
elif seq[-1] != END_TAG:
|
||||||
|
print('check tags: bad end tag in %s' % fp)
|
||||||
|
elif len(seq) < 4:
|
||||||
|
print('check tags: sequence too short in %s' % fp)
|
||||||
|
elif seq_set - TAG_SET != BLACK_FLAG_SET:
|
||||||
|
print('check tags: non-tag items in %s' % fp)
|
||||||
|
|
||||||
|
|
||||||
def _check_skintone(sorted_seq_to_filepath):
|
def _check_skintone(sorted_seq_to_filepath):
|
||||||
|
@ -151,90 +195,76 @@ def _check_skintone(sorted_seq_to_filepath):
|
||||||
base_to_modifiers = collections.defaultdict(set)
|
base_to_modifiers = collections.defaultdict(set)
|
||||||
for seq, fp in sorted_seq_to_filepath.iteritems():
|
for seq, fp in sorted_seq_to_filepath.iteritems():
|
||||||
for i, cp in enumerate(seq):
|
for i, cp in enumerate(seq):
|
||||||
if _is_skintone_modifier(cp):
|
if unicode_data.is_skintone_modifier(cp):
|
||||||
if i == 0:
|
if i == 0:
|
||||||
if len(seq) > 1:
|
if len(seq) > 1:
|
||||||
print('skin color selector first in sequence %s' % fp, file=sys.stderr)
|
print(
|
||||||
|
'check skintone: skin color selector first in sequence %s' % fp,
|
||||||
|
file=sys.stderr)
|
||||||
# standalone are ok
|
# standalone are ok
|
||||||
continue
|
continue
|
||||||
pcp = seq[i-1]
|
pcp = seq[i-1]
|
||||||
if not unicode_data.is_emoji_modifier_base(pcp):
|
if not unicode_data.is_emoji_modifier_base(pcp):
|
||||||
print((
|
print(
|
||||||
'emoji skintone modifier applied to non-base at %d: %s' % (i, fp)), file=sys.stderr)
|
'check skintone: emoji skintone modifier applied to non-base ' +
|
||||||
elif unicode_data.is_emoji_modifier_base(cp):
|
'at %d: %s' % (i, fp), file=sys.stderr)
|
||||||
if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]):
|
else:
|
||||||
base_to_modifiers[cp].add(seq[i+1])
|
if pcp not in base_to_modifiers:
|
||||||
elif cp not in base_to_modifiers:
|
base_to_modifiers[pcp] = set()
|
||||||
base_to_modifiers[cp] = set()
|
base_to_modifiers[pcp].add(cp)
|
||||||
|
|
||||||
for cp, modifiers in sorted(base_to_modifiers.iteritems()):
|
for cp, modifiers in sorted(base_to_modifiers.iteritems()):
|
||||||
if len(modifiers) != 5:
|
if len(modifiers) != 5:
|
||||||
print('emoji base %04x has %d modifiers defined (%s) in %s' % (
|
print(
|
||||||
|
'check skintone: base %04x has %d modifiers defined (%s) in %s' % (
|
||||||
cp, len(modifiers),
|
cp, len(modifiers),
|
||||||
', '.join('%04x' % cp for cp in sorted(modifiers)), fp), file=sys.stderr)
|
', '.join('%04x' % cp for cp in sorted(modifiers)), fp),
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
def _check_zwj_sequences(seq_to_filepath):
|
def _check_zwj_sequences(sorted_seq_to_filepath, unicode_version):
|
||||||
"""Verify that zwj sequences are valid."""
|
"""Verify that zwj sequences are valid for the given unicode version."""
|
||||||
zwj_sequence_to_name = unicode_data.get_emoji_zwj_sequences()
|
for seq, fp in sorted_seq_to_filepath.iteritems():
|
||||||
# strip emoji variant selectors and add extra mappings
|
if ZWJ not in seq:
|
||||||
zwj_sequence_without_vs_to_name_canonical = {}
|
|
||||||
for seq, seq_name in zwj_sequence_to_name.iteritems():
|
|
||||||
if EMOJI_VS in seq:
|
|
||||||
stripped_seq = strip_vs(seq)
|
|
||||||
zwj_sequence_without_vs_to_name_canonical[stripped_seq] = (seq_name, seq)
|
|
||||||
|
|
||||||
zwj_seq_to_filepath = {
|
|
||||||
seq: fp for seq, fp in seq_to_filepath.iteritems()
|
|
||||||
if ZWJ in seq}
|
|
||||||
|
|
||||||
for seq, fp in zwj_seq_to_filepath.iteritems():
|
|
||||||
if seq not in zwj_sequence_to_name:
|
|
||||||
if seq not in zwj_sequence_without_vs_to_name_canonical:
|
|
||||||
print('zwj sequence not defined: %s' % fp, file=sys.stderr)
|
|
||||||
else:
|
|
||||||
_, can = zwj_sequence_without_vs_to_name_canonical[seq]
|
|
||||||
# print >> sys.stderr, 'canonical sequence %s contains vs: %s' % (
|
|
||||||
# _seq_string(can), fp)
|
|
||||||
|
|
||||||
def read_emoji_aliases():
|
|
||||||
result = {}
|
|
||||||
|
|
||||||
with open(path.join(DATA_ROOT, 'emoji_aliases.txt'), 'r') as f:
|
|
||||||
for line in f:
|
|
||||||
ix = line.find('#')
|
|
||||||
if (ix > -1):
|
|
||||||
line = line[:ix]
|
|
||||||
line = line.strip()
|
|
||||||
if not line:
|
|
||||||
continue
|
continue
|
||||||
als, trg = (s.strip() for s in line.split(';'))
|
age = unicode_data.get_emoji_sequence_age(seq)
|
||||||
als_seq = tuple([int(x, 16) for x in als.split('_')])
|
if age is None or unicode_version is not None and age > unicode_version:
|
||||||
try:
|
print('check zwj sequences: undefined sequence %s' % fp)
|
||||||
trg_seq = tuple([int(x, 16) for x in trg.split('_')])
|
|
||||||
except:
|
|
||||||
print('cannot process alias %s -> %s' % (als, trg))
|
|
||||||
continue
|
|
||||||
result[als_seq] = trg_seq
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def _check_coverage(seq_to_filepath):
|
def _check_no_alias_sources(sorted_seq_to_filepath):
|
||||||
age = 9.0
|
"""Check that we don't have sequences that we expect to be aliased to
|
||||||
|
some other sequence."""
|
||||||
|
aliases = add_aliases.read_default_emoji_aliases()
|
||||||
|
for seq, fp in sorted_seq_to_filepath.iteritems():
|
||||||
|
if seq in aliases:
|
||||||
|
print('check no alias sources: aliased sequence %s' % fp)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_coverage(seq_to_filepath, unicode_version):
|
||||||
|
"""Ensure we have all and only the cps and sequences that we need for the
|
||||||
|
font as of this version."""
|
||||||
|
|
||||||
|
age = unicode_version
|
||||||
|
|
||||||
non_vs_to_canonical = {}
|
non_vs_to_canonical = {}
|
||||||
for k in seq_to_filepath:
|
for k in seq_to_filepath:
|
||||||
if EMOJI_VS in k:
|
if EMOJI_VS in k:
|
||||||
non_vs = strip_vs(k)
|
non_vs = unicode_data.strip_emoji_vs(k)
|
||||||
non_vs_to_canonical[non_vs] = k
|
non_vs_to_canonical[non_vs] = k
|
||||||
|
|
||||||
aliases = read_emoji_aliases()
|
aliases = add_aliases.read_default_emoji_aliases()
|
||||||
for k, v in sorted(aliases.items()):
|
for k, v in sorted(aliases.items()):
|
||||||
if v not in seq_to_filepath and v not in non_vs_to_canonical:
|
if v not in seq_to_filepath and v not in non_vs_to_canonical:
|
||||||
print('alias %s missing target %s' % (_seq_string(k), _seq_string(v)))
|
alias_str = unicode_data.seq_to_string(k)
|
||||||
|
target_str = unicode_data.seq_to_string(v)
|
||||||
|
print('coverage: alias %s missing target %s' % (alias_str, target_str))
|
||||||
continue
|
continue
|
||||||
if k in seq_to_filepath or k in non_vs_to_canonical:
|
if k in seq_to_filepath or k in non_vs_to_canonical:
|
||||||
print('alias %s already exists as %s (%s)' % (
|
alias_str = unicode_data.seq_to_string(k)
|
||||||
_seq_string(k), _seq_string(v), seq_name(v)))
|
target_str = unicode_data.seq_to_string(v)
|
||||||
|
print('coverage: alias %s already exists as %s (%s)' % (
|
||||||
|
alias_str, target_str, seq_name(v)))
|
||||||
continue
|
continue
|
||||||
filename = seq_to_filepath.get(v) or seq_to_filepath[non_vs_to_canonical[v]]
|
filename = seq_to_filepath.get(v) or seq_to_filepath[non_vs_to_canonical[v]]
|
||||||
seq_to_filepath[k] = 'alias:' + filename
|
seq_to_filepath[k] = 'alias:' + filename
|
||||||
|
@ -243,13 +273,15 @@ def _check_coverage(seq_to_filepath):
|
||||||
emoji = sorted(unicode_data.get_emoji(age=age))
|
emoji = sorted(unicode_data.get_emoji(age=age))
|
||||||
for cp in emoji:
|
for cp in emoji:
|
||||||
if tuple([cp]) not in seq_to_filepath:
|
if tuple([cp]) not in seq_to_filepath:
|
||||||
print('missing single %04x (%s)' % (cp, unicode_data.name(cp, '<no name>')))
|
print(
|
||||||
|
'coverage: missing single %04x (%s)' % (
|
||||||
|
cp, unicode_data.name(cp, '<no name>')))
|
||||||
|
|
||||||
# special characters
|
# special characters
|
||||||
# all but combining enclosing keycap are currently marked as emoji
|
# all but combining enclosing keycap are currently marked as emoji
|
||||||
for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + range(0x30, 0x3a):
|
for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + range(0x30, 0x3a):
|
||||||
if cp not in emoji and tuple([cp]) not in seq_to_filepath:
|
if cp not in emoji and tuple([cp]) not in seq_to_filepath:
|
||||||
print('missing special %04x (%s)' % (cp, unicode_data.name(cp)))
|
print('coverage: missing special %04x (%s)' % (cp, unicode_data.name(cp)))
|
||||||
|
|
||||||
# combining sequences
|
# combining sequences
|
||||||
comb_seq_to_name = sorted(
|
comb_seq_to_name = sorted(
|
||||||
|
@ -257,24 +289,26 @@ def _check_coverage(seq_to_filepath):
|
||||||
for seq, name in comb_seq_to_name:
|
for seq, name in comb_seq_to_name:
|
||||||
if seq not in seq_to_filepath:
|
if seq not in seq_to_filepath:
|
||||||
# strip vs and try again
|
# strip vs and try again
|
||||||
non_vs_seq = strip_vs(seq)
|
non_vs_seq = unicode_data.strip_emoji_vs(seq)
|
||||||
if non_vs_seq not in seq_to_filepath:
|
if non_vs_seq not in seq_to_filepath:
|
||||||
print('missing combining sequence %s (%s)' % (_seq_string(seq), name))
|
print('coverage: missing combining sequence %s (%s)' %
|
||||||
|
(unicode_data.seq_to_string(seq), name))
|
||||||
|
|
||||||
# flag sequences
|
# flag sequences
|
||||||
flag_seq_to_name = sorted(
|
flag_seq_to_name = sorted(
|
||||||
unicode_data.get_emoji_flag_sequences(age=age).iteritems())
|
unicode_data.get_emoji_flag_sequences(age=age).iteritems())
|
||||||
for seq, name in flag_seq_to_name:
|
for seq, name in flag_seq_to_name:
|
||||||
if seq not in seq_to_filepath:
|
if seq not in seq_to_filepath:
|
||||||
print('missing flag sequence %s (%s)' % (_seq_string(seq), name))
|
print('coverage: missing flag sequence %s (%s)' %
|
||||||
|
(unicode_data.seq_to_string(seq), name))
|
||||||
|
|
||||||
# skin tone modifier sequences
|
# skin tone modifier sequences
|
||||||
mod_seq_to_name = sorted(
|
mod_seq_to_name = sorted(
|
||||||
unicode_data.get_emoji_modifier_sequences(age=age).iteritems())
|
unicode_data.get_emoji_modifier_sequences(age=age).iteritems())
|
||||||
for seq, name in mod_seq_to_name:
|
for seq, name in mod_seq_to_name:
|
||||||
if seq not in seq_to_filepath:
|
if seq not in seq_to_filepath:
|
||||||
print('missing modifier sequence %s (%s)' % (
|
print('coverage: missing modifier sequence %s (%s)' % (
|
||||||
_seq_string(seq), name))
|
unicode_data.seq_to_string(seq), name))
|
||||||
|
|
||||||
# zwj sequences
|
# zwj sequences
|
||||||
# some of ours include the emoji presentation variation selector and some
|
# some of ours include the emoji presentation variation selector and some
|
||||||
|
@ -295,25 +329,30 @@ def _check_coverage(seq_to_filepath):
|
||||||
else:
|
else:
|
||||||
test_seq = seq
|
test_seq = seq
|
||||||
if test_seq not in zwj_seq_without_vs:
|
if test_seq not in zwj_seq_without_vs:
|
||||||
print('missing (canonical) zwj sequence %s (%s)' % (
|
print('coverage: missing (canonical) zwj sequence %s (%s)' % (
|
||||||
_seq_string(seq), name))
|
unicode_data.seq_to_string(seq), name))
|
||||||
|
|
||||||
# check for 'unknown flag'
|
# check for 'unknown flag'
|
||||||
# this is either emoji_ufe82b or 'unknown_flag', we filter out things that
|
# this is either emoji_ufe82b or 'unknown_flag', but we filter out things that
|
||||||
# don't start with our prefix so 'unknown_flag' would be excluded by default.
|
# don't start with our prefix so 'unknown_flag' would be excluded by default.
|
||||||
if tuple([0xfe82b]) not in seq_to_filepath:
|
if tuple([0xfe82b]) not in seq_to_filepath:
|
||||||
print('missing unknown flag PUA fe82b')
|
print('coverage: missing unknown flag PUA fe82b')
|
||||||
|
|
||||||
|
|
||||||
def check_sequence_to_filepath(seq_to_filepath):
|
def check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage):
|
||||||
sorted_seq_to_filepath = collections.OrderedDict(
|
sorted_seq_to_filepath = collections.OrderedDict(
|
||||||
sorted(seq_to_filepath.items()))
|
sorted(seq_to_filepath.items()))
|
||||||
_check_valid_emoji(sorted_seq_to_filepath)
|
_check_no_vs(sorted_seq_to_filepath)
|
||||||
|
_check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version)
|
||||||
_check_zwj(sorted_seq_to_filepath)
|
_check_zwj(sorted_seq_to_filepath)
|
||||||
_check_flags(sorted_seq_to_filepath)
|
_check_flags(sorted_seq_to_filepath)
|
||||||
|
_check_tags(sorted_seq_to_filepath)
|
||||||
_check_skintone(sorted_seq_to_filepath)
|
_check_skintone(sorted_seq_to_filepath)
|
||||||
_check_zwj_sequences(sorted_seq_to_filepath)
|
_check_zwj_sequences(sorted_seq_to_filepath, unicode_version)
|
||||||
_check_coverage(sorted_seq_to_filepath)
|
_check_no_alias_sources(sorted_seq_to_filepath)
|
||||||
|
if coverage:
|
||||||
|
_check_coverage(sorted_seq_to_filepath, unicode_version)
|
||||||
|
|
||||||
|
|
||||||
def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
|
def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
|
||||||
"""Check names, and convert name to sequences for names that are ok,
|
"""Check names, and convert name to sequences for names that are ok,
|
||||||
|
@ -345,12 +384,15 @@ def create_sequence_to_filepath(name_to_dirpath, prefix, suffix):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def collect_name_to_dirpath(directory, prefix, suffix):
|
def collect_name_to_dirpath(directory, prefix, suffix, exclude=None):
|
||||||
"""Return a mapping from filename to path rooted at directory, ignoring files
|
"""Return a mapping from filename to path rooted at directory, ignoring files
|
||||||
that don't match suffix. Report when a filename appears in more than one
|
that don't match suffix, and subtrees with names in exclude. Report when a
|
||||||
subdir; the first path found is kept."""
|
filename appears in more than one subdir; the first path found is kept."""
|
||||||
result = {}
|
result = {}
|
||||||
for dirname, _, files in os.walk(directory):
|
for dirname, dirs, files in os.walk(directory, topdown=True):
|
||||||
|
if exclude:
|
||||||
|
dirs[:] = [d for d in dirs if d not in exclude]
|
||||||
|
|
||||||
if directory != '.':
|
if directory != '.':
|
||||||
dirname = path.join(directory, dirname)
|
dirname = path.join(directory, dirname)
|
||||||
for f in files:
|
for f in files:
|
||||||
|
@ -364,42 +406,57 @@ def collect_name_to_dirpath(directory, prefix, suffix):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def collect_name_to_dirpath_with_override(dirs, prefix, suffix):
|
def collect_name_to_dirpath_with_override(dirs, prefix, suffix, exclude=None):
|
||||||
"""Return a mapping from filename to a directory path rooted at a directory
|
"""Return a mapping from filename to a directory path rooted at a directory
|
||||||
in dirs, using collect_name_to_filepath. The last directory is retained. This
|
in dirs, using collect_name_to_filepath. The last directory is retained. This
|
||||||
does not report an error if a file appears under more than one root directory,
|
does not report an error if a file appears under more than one root directory,
|
||||||
so lets later root directories override earlier ones."""
|
so lets later root directories override earlier ones. Use 'exclude' to
|
||||||
|
name subdirectories (of any root) whose subtree you wish to skip."""
|
||||||
result = {}
|
result = {}
|
||||||
for d in dirs:
|
for d in dirs:
|
||||||
result.update(collect_name_to_dirpath(d, prefix, suffix))
|
result.update(collect_name_to_dirpath(d, prefix, suffix, exclude))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def run_check(dirs, prefix, suffix):
|
def run_check(dirs, prefix, suffix, exclude, unicode_version, coverage):
|
||||||
print('Checking files with prefix "%s" and suffix "%s" in:\n %s' % (
|
msg = ''
|
||||||
prefix, suffix, '\n '.join(dirs)))
|
if unicode_version:
|
||||||
|
msg = ' (%3.1f)' % unicode_version
|
||||||
|
print('Checking files with prefix "%s" and suffix "%s"%s in:\n %s' % (
|
||||||
|
prefix, suffix, msg, '\n '.join(dirs)))
|
||||||
name_to_dirpath = collect_name_to_dirpath_with_override(
|
name_to_dirpath = collect_name_to_dirpath_with_override(
|
||||||
dirs, prefix=prefix, suffix=suffix)
|
dirs, prefix=prefix, suffix=suffix, exclude=exclude)
|
||||||
print('checking %d names' % len(name_to_dirpath))
|
print('checking %d names' % len(name_to_dirpath))
|
||||||
seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix)
|
seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix)
|
||||||
print('checking %d sequences' % len(seq_to_filepath))
|
print('checking %d sequences' % len(seq_to_filepath))
|
||||||
check_sequence_to_filepath(seq_to_filepath)
|
check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage)
|
||||||
print('done.')
|
print('done.')
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-d', '--dirs', help='directories containing emoji images',
|
'-d', '--dirs', help='directory roots containing emoji images',
|
||||||
metavar='dir', nargs='+', required=True)
|
metavar='dir', nargs='+', required=True)
|
||||||
|
parser.add_argument(
|
||||||
|
'-e', '--exclude', help='names of source subdirs to exclude',
|
||||||
|
metavar='dir', nargs='+')
|
||||||
|
parser.add_argument(
|
||||||
|
'-c', '--coverage', help='test for complete coverage',
|
||||||
|
action='store_true')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-p', '--prefix', help='prefix to match, default "emoji_u"',
|
'-p', '--prefix', help='prefix to match, default "emoji_u"',
|
||||||
metavar='pfx', default='emoji_u')
|
metavar='pfx', default='emoji_u')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-s', '--suffix', help='suffix to match, default ".png"', metavar='sfx',
|
'-s', '--suffix', help='suffix to match, default ".png"', metavar='sfx',
|
||||||
default='.png')
|
default='.png')
|
||||||
|
parser.add_argument(
|
||||||
|
'-u', '--unicode_version', help='limit to this unicode version or before',
|
||||||
|
metavar='version', type=float)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
run_check(args.dirs, args.prefix, args.suffix)
|
run_check(
|
||||||
|
args.dirs, args.prefix, args.suffix, args.exclude, args.unicode_version,
|
||||||
|
args.coverage)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Reference in New Issue