marseymoji/third_party/color_emoji/add_glyphs.py

#!/usr/bin/env python

import collections, glob, os, re, sys
from fontTools import ttx
from fontTools.ttLib.tables import otTables
from png import PNG

# Private Use Area (PUA) character for the unknown flag.  This avoids the
# legacy emoji PUA values, but is in the same area.
UNKNOWN_FLAG_GLYPH_NAME = "uFE82B"

sys.path.append(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
import add_emoji_gsub


def is_vs(cp):
    """Return True when codepoint `cp` lies in the range U+FE00..U+FE0F
    (the variation selectors)."""
    return 0xfe00 <= cp <= 0xfe0f

def is_fitzpatrick(gname):
    """Return True when glyph name `gname` names a Fitzpatrick modifier.

    `gname` is one prefix character followed by a hex codepoint (for
    example 'u1F3FB'); the codepoint must fall in U+1F3FB..U+1F3FF.
    """
    hex_digits = gname[1:]
    codepoint = int(hex_digits, 16)
    return codepoint >= 0x1f3fb and codepoint <= 0x1f3ff

def codes_to_string(codes):
    """Convert a filename-style codepoint list into a unicode string.

    Args:
      codes: one or more hexadecimal codepoints separated by underscores,
        e.g. '1F4A9' or '1F1EF_1F1F5'.

    Returns:
      The unicode string made of those codepoints, in order.

    Raises:
      ValueError: if any piece is not valid hex or cannot be converted to
        a character (for example a non-BMP value on a narrow Python 2
        build, or a value beyond U+10FFFF).
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3 has no unichr; chr covers all of Unicode.

    # A single code is just the one-piece case of the underscore-separated
    # form, so both go through the same conversion and error reporting.
    try:
        return u"".join([to_char(int(code, 16)) for code in codes.split("_")])
    except (ValueError, OverflowError):
        raise ValueError("uh-oh, no unichr for '%s'" % codes)


def glyph_sequence(string):
    """Return the list of per-character glyph names that form a ligature.

    Each character of `string` becomes 'u' plus its codepoint in uppercase
    hex (at least four digits).  Variation selectors are stripped.
    """
    names = []
    for char in string:
        cp = ord(char)
        if is_vs(cp):
            continue
        names.append("u%04X" % cp)
    return names


def glyph_name(string):
    """Return the glyph name for `string`: one 'uXXXX' piece per character,
    joined by underscores.

    Unlike glyph_sequence, variation selectors are kept when present.
    """
    pieces = []
    for char in string:
        pieces.append("u%04X" % ord(char))
    return "_".join(pieces)


def add_ligature(font, seq, name):
    """Append one ligature (glyph sequence -> glyph) to the font's GSUB.

    If the font has no 'GSUB' table, one is created holding a single
    ligature-substitution lookup (type 4, flag 0).  Otherwise the first
    existing lookup is used and is asserted to be of that same kind.

    Args:
      font: the font being modified.
      seq: glyph names of the components; seq[0] keys the ligature set and
        the remaining names become the ligature's components.
      name: name of the glyph the sequence ligates to.

    Ligatures sharing a first glyph are appended in call order.
    """
    if 'GSUB' in font:
        lookup = font['GSUB'].table.LookupList.Lookup[0]
        assert lookup.LookupType == 4
        assert lookup.LookupFlag == 0
    else:
        ligature_subst = otTables.LigatureSubst()
        ligature_subst.ligatures = {}

        lookup = otTables.Lookup()
        lookup.LookupType = 4
        lookup.LookupFlag = 0
        lookup.SubTableCount = 1
        lookup.SubTable = [ligature_subst]

        font['GSUB'] = add_emoji_gsub.create_simple_gsub([lookup])

    ligatures = lookup.SubTable[0].ligatures

    lig = otTables.Ligature()
    lig.CompCount = len(seq)
    lig.Component = seq[1:]
    lig.LigGlyph = name

    # Start a new list for this first glyph if none exists yet.
    ligatures.setdefault(seq[0], []).append(lig)


# Ligating sequences for emoji that already have a defined codepoint,
# to match the sequences for the related emoji with no codepoint.
# The key is the name of the glyph with the codepoint, the value is the
# name of the sequence in filename form (hex codepoints joined by
# underscores, the format codes_to_string parses).  An entry is only used
# when a glyph with the key's name was actually added.
EXTRA_SEQUENCES = {
    'u1F46A': '1F468_200D_1F469_200D_1F466', # MWB: man, woman, boy
    'u1F491': '1F469_200D_2764_FE0F_200D_1F468', # WHM: woman, heart, man
    'u1F48F': '1F469_200D_2764_FE0F_200D_1F48B_200D_1F468', # WHKM: woman, heart, kiss, man
}

# Flag aliases - from: to.  Both sides are two-letter flag names.  The
# regional indicator sequence for the 'from' name is mapped to the glyph
# named after the 'to' flag's sequence.
FLAG_ALIASES = {
    'BV': 'NO',
    'CP': 'FR',
    'HM': 'AU',
    'SJ': 'NO',
    'UM': 'US',
}

# Need at least: input font, output font, and one strike prefix.  Otherwise
# print the usage text to stderr and exit with a failure status.
if len(sys.argv) < 4:
    sys.stderr.write("""
Usage:

add_glyphs.py font.ttx out-font.ttx strike-prefix...

This will search for files that have strike-prefix followed by one or more
hex numbers (separated by underscore if more than one), and end in ".png".
For example, if strike-prefix is "icons/u", then files with names like
"icons/u1F4A9.png" or "icons/u1F1EF_1F1F5.png" will be loaded.  The script
then adds cmap, hmtx, and potentially GSUB entries for the Unicode
characters found.  The advance width will be chosen based on image aspect
ratio.  If Unicode values outside the BMP are desired, the existing cmap
table should be of the appropriate (format 12) type.  Only the first cmap
table and the first GSUB lookup (if existing) are modified.
""" + "\n")
    sys.exit(1)

# Capture the command-line arguments: input TTX, output TTX, and one or more
# image filename prefixes.
in_file = sys.argv[1]
out_file = sys.argv[2]
img_prefixen = sys.argv[3:]
# NOTE(review): sys.argv is removed once consumed; the reason is not evident
# from this file.
del sys.argv

font = ttx.TTFont()
font.importXML(in_file)

# Map each character string (decoded from the hex codes that follow the
# prefix in the filename) to its image file.  If several files decode to the
# same string, the one seen last wins and a message is printed.
img_files = {}
for img_prefix in img_prefixen:
    glb = "%s*.png" % img_prefix
    print("Looking for images matching '%s'." % glb)
    for img_file in glob.glob(glb):
        # Strip the prefix and the ".png" suffix to get the hex codes.
        codes = img_file[len(img_prefix):-4]
        u = codes_to_string(codes)
        if u in img_files:
            print('overwriting %s with %s' % (img_files[u], img_file))
        img_files[u] = img_file
if not img_files:
    # Report every pattern that was searched, not just the last one.
    raise Exception("No image files found in '%s'." % "', '".join(
        ["%s*.png" % prefix for prefix in img_prefixen]))

# Handles on the tables that get edited in place: glyph order, the first
# cmap subtable, and the horizontal metrics.
g = font['GlyphOrder'].glyphOrder
c = font['cmap'].tables[0].cmap
h = font['hmtx'].metrics

# Vertical extent from 'hhea'; the stored descent is negated here.
ascent = font['hhea'].ascent
descent = -font['hhea'].descent

# Order the (string, filename) pairs by string length and then by the
# string itself.  This keeps the order stable, and handling single
# characters before sequences avoids adding empty glyphs for the pieces of
# a multi-character glyph when a piece also has its own image.
img_pairs = sorted(img_files.items(), key=lambda pair: (len(pair[0]), pair[0]))

# Names of all glyphs added from images, and the ligature table being
# built: tuple of component glyph names -> ligature glyph name.
glyph_names = set()
ligatures = {}

def add_lig_sequence(ligatures, seq, n):
    """Record that glyph sequence `seq` ligates to glyph `n`.

    Args:
      ligatures: dict mapping a tuple of component glyph names to the name
        of the ligature glyph; updated in place.
      seq: sequence (list or tuple) of component glyph names.
      n: name of the glyph the sequence ligates to.

    If `seq` is already present, a message is printed and the entry is
    replaced.  Sequences containing ZWJ ('u200D') also get a second entry
    for the visually reversed order; that entry replaces any existing one
    without a message.
    """
    # We have emoji sequences using regional indicator symbols,
    # ZWJ, fitzpatrick modifiers, and combinations of ZWJ and fitzpatrick
    # modifiers.  Currently, Harfbuzz special-cases the fitzpatrick
    # modifiers to treat them as combining marks instead of as Other
    # Neutral, which unicode says they are, and processes them
    # in visual order (at least in some circumstances).  So to handle
    # emoji sequences in an RTL context we need GSUB sequences that match
    # this order.
    # Regional indicator symbols are LTR, and emoji+fitzpatrick are
    # effectively LTR, so we only reorder sequences with ZWJ.  If however
    # the ZWJ sequence has fitzpatrick modifiers, those need to still follow
    # the emoji they logically follow, so simply reversing the sequence
    # doesn't work.  This code assumes the lig sequence is valid.
    tseq = tuple(seq)
    if tseq in ligatures:
        print('lig sequence %s, replace %s with %s' % (
            tseq, ligatures[tseq], n))
    ligatures[tseq] = n
    if 'u200D' not in tseq:
        return

    rev_seq = list(reversed(tseq))
    # After reversal each modifier sits just before the emoji it belongs
    # to; swap each such pair back.  Step past the modifier once it has been
    # moved, otherwise it would be seen again on the next iteration and
    # keep bubbling toward the end of the sequence.
    i = 1
    while i < len(rev_seq):
        if is_fitzpatrick(rev_seq[i - 1]):
            rev_seq[i - 1], rev_seq[i] = rev_seq[i], rev_seq[i - 1]
            i += 2
        else:
            i += 1

    ligatures[tuple(rev_seq)] = n


# Add one glyph per image: glyph order entry, cmap entries for its
# characters, horizontal metrics, and a ligature for multi-character
# strings.
for u, filename in img_pairs:
    n = glyph_name(u)
    glyph_names.add(n)
    g.append(n)

    for char in u:
        cp = ord(char)
        # Skip characters already mapped, and variation selectors.
        if cp in c or is_vs(cp):
            continue
        name = glyph_name(char)
        c[cp] = name
        if len(u) > 1:
            # A piece of a sequence with no glyph of its own yet gets
            # zeroed metrics.
            h[name] = [0, 0]

    # Scale the advance so the image keeps its aspect ratio at the full
    # ascent + descent height.
    (img_width, img_height) = PNG(filename).get_size()
    advance = int(round(float(ascent + descent) * img_width / img_height))
    h[n] = [advance, 0]

    if len(u) > 1:
        seq = glyph_sequence(u)
        add_lig_sequence(ligatures, seq, n)

# Map each extra sequence to its single-codepoint glyph, provided that glyph
# was added above; otherwise just report that it is missing.
for n in EXTRA_SEQUENCES:
    if n not in glyph_names:
        print('extras: no glyph for %s' % n)
        continue
    seq = glyph_sequence(codes_to_string(EXTRA_SEQUENCES[n]))
    add_lig_sequence(ligatures, seq, n)

# Add missing regional indicator sequences and flag aliases
# if we support any.

# Glyph names for every codepoint in U+1F1E6..U+1F1FF.
regional_names = frozenset('u%X' % cp for cp in range(0x1F1E6, 0x1F200))

def _is_flag_sequence(t):
    """Return True if `t` is a pair of regional indicator glyph names."""
    if len(t) != 2:
        return False
    return t[0] in regional_names and t[1] in regional_names

# True as soon as any ligature sequence is a flag.
for k in ligatures:
    if _is_flag_sequence(k):
        have_flags = True
        break
else:
    have_flags = False

# Flags require the unknown-flag glyph, since every unused flag sequence is
# mapped to it later.
if have_flags and UNKNOWN_FLAG_GLYPH_NAME not in glyph_names:
    raise ValueError(
        'Have flags but no unknown flag glyph "%s"' % UNKNOWN_FLAG_GLYPH_NAME)

# sigh, too many separate files with the same code.
# copied from add_emoji_gsub.
def _reg_indicator(letter):
  assert 'A' <= letter <= 'Z'
  return 0x1F1E6 + ord(letter) - ord('A')

def _reg_lig_sequence(flag_name):
    """Returns a tuple of strings naming the codepoints that form the ligature.

    `flag_name` is a two-letter flag name; each letter becomes 'u' plus the
    hex of its regional indicator codepoint.
    """
    assert len(flag_name) == 2
    names = ['u%X' % _reg_indicator(letter) for letter in flag_name]
    return tuple(names)

def _reg_lig_name(flag_name):
    """Returns a glyph name for the flag name: the names of its ligature
    sequence joined by underscores."""
    parts = _reg_lig_sequence(flag_name)
    return '_'.join(parts)

if have_flags:
    print('Adding flag aliases.')
    # Map each alias's regional indicator sequence to the glyph of the flag
    # it aliases.
    for flag_from, flag_to in FLAG_ALIASES.items():
        seq = _reg_lig_sequence(flag_from)
        name = _reg_lig_name(flag_to)
        add_lig_sequence(ligatures, seq, name)

    print('Adding unused flag sequences')
    # Every pair of regional indicators that has no ligature yet gets the
    # missing flag glyph.
    for first in regional_names:
        for second in regional_names:
            seq = (first, second)
            if seq in ligatures:
                continue
            add_lig_sequence(ligatures, seq, UNKNOWN_FLAG_GLYPH_NAME)


# Group the (sequence, glyph name) pairs by the first glyph of the sequence.
keyed_ligatures = collections.defaultdict(list)
for k, v in ligatures.items():
    keyed_ligatures[k[0]].append((k, v))

for base in sorted(keyed_ligatures):
    # Longest sequences first, so that a longer sequence sharing a prefix
    # with a shorter one is handled before it.  It would be better to have
    # multiple lookups, most likely.
    pairs = sorted(
        keyed_ligatures[base],
        key=lambda pair: (len(pair[0]), pair[0]),
        reverse=True)
    for seq, name in pairs:
        add_ligature(font, seq, name)

font.saveXML(out_file)