From a69b45779db4ccb36a451e68c5773dc94d999770 Mon Sep 17 00:00:00 2001 From: Doug Felt Date: Wed, 11 Apr 2018 11:16:26 -0700 Subject: [PATCH 1/3] Add aliases for superhero/villan genderless sequences. Maps them to the female gendered ones. --- emoji_aliases.txt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/emoji_aliases.txt b/emoji_aliases.txt index d4a2d34fd..b1a9e7554 100644 --- a/emoji_aliases.txt +++ b/emoji_aliases.txt @@ -254,6 +254,20 @@ fe82b;unknown_flag # no name -> no name 1f9de;1f9de_200d_2640 # GENIE -> female genie 1f9df;1f9df_200d_2640 # ZOMBIE -> female zombie +#unicode 11 +1f9b8;1f9b8_200d_2640 # SUPERHERO -> female superhero +1f9b8_1f3fb;1f9b8_1f3fb_200d_2640 # light skin tone +1f9b8_1f3fc;1f9b8_1f3fc_200d_2640 # medium-light skin tone +1f9b8_1f3fd;1f9b8_1f3fd_200d_2640 # medium skin tone +1f9b8_1f3fe;1f9b8_1f3fe_200d_2640 # medium-dark skin tone +1f9b8_1f3ff;1f9b8_1f3ff_200d_2640 # dark skin tone +1f9b9;1f9b9_200d_2640 # SUPERVILLAN -> female supervillan +1f9b9_1f3fb;1f9b9_1f3fb_200d_2640 # light skin tone +1f9b9_1f3fc;1f9b9_1f3fc_200d_2640 # medium-light skin tone +1f9b9_1f3fd;1f9b9_1f3fd_200d_2640 # medium skin tone +1f9b9_1f3fe;1f9b9_1f3fe_200d_2640 # medium-dark skin tone +1f9b9_1f3ff;1f9b9_1f3ff_200d_2640 # dark skin tone + # legacy android sequences # wrestlers -> men wrestling 1f93c_1f3fb;1f93c_1f3fb_200d_2642 # light skin tone From eee736963fa0b686ce4925d9ebc5ea33213e9130 Mon Sep 17 00:00:00 2001 From: Doug Felt Date: Mon, 16 Apr 2018 17:52:28 -0700 Subject: [PATCH 2/3] Remove duplicate images for 'people with bunny ears partying'. The non-gender-specific sequence is aliased to a gender-specific one, and the image is a duplicate. Aliasing handles the non-gender-specific case. --- png/128/emoji_u1f46f.png | Bin 6947 -> 0 bytes svg/emoji_u1f46f.svg | 574 --------------------------------------- 2 files changed, 574 deletions(-) delete mode 100644 png/128/emoji_u1f46f.png delete mode 100644 svg/emoji_u1f46f.svg diff --git a/png/128/emoji_u1f46f.png b/png/128/emoji_u1f46f.png deleted file mode 100644 index 682bac7ca2486fd691a72d394ff52d656f6044ad..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6947 zcmV+;8{FiHP)4sC!iEjXnV4uwgY2|F!Lr^`%%A?Y&A%sF$C%;~IU8q(92KRPs~P^Qy?hT#NS z4zw|(6q>Y+*wT;~Z?WUu)-FA*bMN=`WJ#7~Cwfo9&;HK&EXi;E{oeiVz2E)5?>>O# zvRszSa#=3RWw|WHqLT7tpa1;loAHCH>&HL-v0Lt&$LqD>hXb$GRSL3{h)*&BxcA@dhVJfe z7#kaNM5EC?NK)@T_uR8rste$`+OX0?IO6yF+s~do3qwOgfY)g+fdWt@KFI_i9*=Jb zg+dUE#b9b`3VM2aU}9pzfxq|Mb=O@dur?QSems5`5^(|_Li@nL03#+42moC(kx0|1?2YIZ{LOI>^OJs z93!00e~YFOu2a`$U6ZPuMXC5?5rDgOtKln#HSmJ%M)-zdJyV-R5=wJ3l5}vP=iISl zM;o4R|LEu_xZQ5Jn6t5S?lWBh-!WfRHUUr)J}FHAsMTuWW|K2FdykOGbBZ;IO z_uIC8`}Xz)lD>!R;N;{aT%oRn$F0{wJ!i^#DB`uL%4`A@iBC!m@Ty2q?#-^zIN-00 zA7M^SFc_qL?wjv?+qP|^*0hbtA(i`G+n<3|rJdC(wOTFveD>LAyJXe?io_?S25=dT zM)v)8{OIhUhUpRpggj^hzJT*>qgLhhdSR=+c@{axB4dI*Hk;YCP`TU9#vn5GJRJ|FGtxb@as+vhpoR*%QSJmKs04YRht92#KT=z91m zdn8b)_#_j6=bwL`qGQihR#t*arGkg2UZ1tkCS48d)z>Dk>*yQ7S%8Vl}xN?WDW9x*A>z^uqyC)S%<$Y!R*AUhK(8Iv}_8mTaxUU3}Ktn>^ZBY@FRmReeFsLut1xkHvRvT*fQ=O z{UrocaadKi3OJ5)z4+pbd!a-{iTF6FPlz}C9DeLvv0?=vQB0NHGWwm^xC((tL|Ix)7C4#Gu_4)u>o&8^f}n3R)a z)6~=iR`)P=b2RB25j$62l^(pPoG7@Gc^s0$$U!yC$iauJC&-cDyVci!NU8% zoZx|vM8yy(k@OalbgB#wUorSpGLS<gk#A@wsf89b(iqrD+(Vc!#Vde$wE@>78b^YIH!hy!Opzn z^C5={IayJS9zr?`h#`2p)go>tPkUF16@a4fsU&V4)a_yO=B#ET7<4)mF*O8jH4x%? z7#tjA9x~ZS8swnw`3l)aszfG*R`q&4)YsR8E{tk5>|>;Rd;y5YVkOlAO2ns;_> zOwDniM{-D@d9xE@XwwLAYildpNgLxqorr-t|~sYN>EC8!3_nCJS&; z9ymPSWYi-`F|n1<;7ex-TEF7(Ni6_u6pU_OEMyzi=>6KvCeQ^vpo@l3eG1A)m8FAb zdde5ohu~xDV(`B&X(6Ff1FyG$Hr1LU*y0RCvPQ$YRIzxR{=KzC0-!j28j01TLvW5} zhBo#7X^0|0dPWR7OGHHy7)i-2JCtOVg4E2GL`F>}1kZQPMREAIuZPp*Q2EAR zMxd!a!H82O;-KLJtY0k_x|ZlH;R9-xTLzSfPf7u3Vk$+wu+V}?R+TdWe|>iZ_B_Bt zb3^vLH?uJbdmoJ8|MO3W%DI-@0-#8IQu+Yw236d`Iu=C6XReDu`<3w&WBK-1bMLJa(7GxN_umWKA@L$i=uxdpE z+80QcO#qaFPc{K)sgDT~kENkH47(q1K(NBR`W>6yaR1E{1??mq^QG@MLiex`b?ewN z1^0b&Dt!z+0M}_%*#tl-_+&EyOp;S7Z)BQFlSAd1Hym@SN3ptk^l5Ua{PKh~dmkuY z>sJy!8J)mnx`kNuv8*k%j%a$@T*YE2d`uwBL762$5%{FM%B%Dz8Wn)WW3jX}6G0De zJBOg-tUkPLXxTL9H!(Oe8FHlcKSi zAW7G$-jWJ{Qt-*DXZTt#>)2_(0j7O=a9YFg%{z`~Z3_h> z;18~)h*)DGY*HM@Wd;t82P2{0h|xMx(eT-!6%yr5X)mP zw7}Q4ccbm&!DiL6=~M=76z;wLUFaQg!0Yd=gp>VMY+t7(%!s>Y?I8Ouc8mB(90GwT zBiz+vPoF!(9`~1811JTb)B-?`QdF&t<0jXzbSRpWg2S!PF2QTU{eH|D)jk)#tJ*um?MK`<0UtrssCDT`qr*yD@B zB%07pREt#SYK5YQUTQ~B3HT)KZ!}jqTOc)i)}YVs%SPiI{Qis+CVe`na|YRZS(+_N zg%(dx|2Q5`LL{0%?>Y%#J^}9!THw3SuV!|3RDOXT6AsX7Ete$55Js1^oB^)N7Qg!hLX@ZNw0 z0>NZvD}1$7PhW}nB<}`VDym!+4UO&g6?O2r_5TNJs)oR*S7rH}k$951!DJ%yMNz3s z&EcIHj2zZDRM>*hb}>z`oP;TF0-kcU!*AZM0?+6$L_)!?5^4a&;gd`N9(w2@N}RAC zGS;z$V|s`skO8eBYhSaC7CeN&9Y{h&j{ie`Pk;xguh7xjsH|;qW*5T{)YL=$rPWy1 z(!cN9a3k~`IB-DLV31PrNzw~=@WBW7;>UkOYV{1|z@-!@=ceLtbz>iBjYi;fXb{y} zRymCpj3y0?PezzMWXr5m#7qYOtadG|zSx%DU#tON2*$hyc>Yuy=yYi>oRTi?y6UQ{ zRL71TlclvN8DG(}4j*{n0m_fxA*ePXS+bb_hRWZ6>J9iv?KC**YC*+ihaGr64kJT; z2>6BJQVO6rD-BS+V)mT)u+IlUw+H_9SDWF~XeA|41e?vqDmQapBIvW<``-5?^O}{8 zPgNv2q!L*Q6H1jnF)=ag#RevA6#n6cH=(K054QT1pw{Wn`#QmiF$jA-@WVGYz|o$S z*^w+HhmtH%Ru0i??&;}~B*&i8@fBqPzWUX#Qoj42QpAeLq0CoeF8$0JiF5EqUp=%| zO@lQtf*vt;pA1G&sWr6HvS8)?KJbs81CJ*P2Yuy}LX=0JBs5bsW|jc2hHt%eC0x6D2)1wNhHCEzpw=5e zXST4<3>614dNETZ3=upRg;R-?XWwdt3BM8MnrKJIr^sQySKIq2lgI92h*3JeqD{c$ z)3A5gU$y7)ceg<(BFGWF^k^=UGRsFBN8vKGn@u!qTAMvsGLA&_Pg&vQh!cK4umUCm zg{NE*Ih1HobegVg8i1`$FVfeYm3LowS{0?^D@p*69O?=kF;%UAfTjT+du1&QPMXpr zQRb^s6e6Xzww9?!DmSe=4co3f2f@j42)n1qMs7ipT+8?*6aXZLx%TTAnu^rc*g3XtVrMG*|_CcHvyHX^%m+{S^VJ_g9 zLul6=<`s1_p0F+kU%%yTXmGOErGV9HWu9y9YtN*rWDluj(Bup1lYegWyIJH=vEsv1 zacVJmvy4v)0l?O=pDt>tshid3)y2K=rS;E2gKe5gqs3xjB;|Uw#a432D64kLv2uU* z;v=rL#@}IU$X-%S!I;!T?7;i(SjJbR0N_Q)^W0^su0hW>yRXapJm4J~06iD#-1nVl zohZefB*{dQ_}Vk6s?+HdztcN>wp`QHy9K>z7hOM>B=W)y4)zw0{doUvOBr8L0+6zc z2lX~P=&kly*MFPP(s1z(1G0Y+p+Fq|g9cNj2u$#faA<4o>6^M#$wM zFW_a$6d`k`?k~$;YK^Qf-@e4~6>kEPd8V|Gy2O^l0w(!vPdQph$`LxjX|)bxMOEI1 zwiNLdD*#XMC~diRhgyuSVHG(bbW^=^d7*NV(pEdroX3klg7?#GZ)};>qG`k}&WFWT zy_E13CjhB<^FFdWfwAGdB%$hYq*_XcaI@S&xz*#xn@*$G%XNr$oE$k+i>@|KB|+&~GD#|w7Jg{N(M@q??`|Gn-}_`?TUcze)V zba?cd)o$3du9|(ny8a|o7`;S6AO2}3m2Gfjzk`>$On$sCY#H;$&6SQ%lCgl48a#xy zu~|@$vjZl%&Iw5~siqqqxuPg?HZ=O-kvmS$8eXNMcT_MZD9edMFPU031yL3nFIPIg zqUQsUgds^A@uG*kLj&y;HOmZ~(D&RxN@!{VLP$_DSzOw)WGbnGxvB;<#=>b-!|rLc ziKCG6;CC;S9HrxvWH2Z*C_xfP&PTyfEwi8+FJI{`$|hdkDD-=|jJI{n_U02zRj0^d z{rB@FN9p*a7!Aw}O6(-(!#6Y_tdGxAA^{)x!)iD^VkpWxKKi~ubd1lRM>i3RvUok( zY0(}&S`0Bt$5*rfWO&dUqoJTXUz|F#_(H0pT8OVe;zYWxXCP~Qyh{pll#Z`x0TAsb zezefd5lLj9NLs}b3kBgszp3c{@$j?;Mm*XqCya<88%*(WRN2pg1T$oo8a|CgB#~O$ zIQevZKia1btTT}_P6sCj92XR&$}Mm7R70&}6sEefUZ+dlq$ltm_JT}B$@pYRM@`+S z$BupSh+fM%2Bvh-Gopgw;oMF|b2FPLLBIKM7`C*W1SF@e91mY< z_*62slD%3r?CJMZ!lPYnFy<5bw)C3Rg|-%I6;#FY=j(gmu8*DugC+`|0Q*-9lGIlY zAy5iFIdhWt(PGzib(3)5FJ6ON*Yz+7Ur1{aTSaAMCHq}%3Bo_$@>}@g=J&v?ivp$O zO9jh1pnOFU_~g_8?KFVmw5mXS(dIknu(ka9av1d))2)S;TxHI)C*m?&#i!N`!I!Q% z31)qE2JIl_*@-|Owvd)GX#hpwQ^`zH(q`tcRb*f0>Eo^NlVhuALYrv!#Ocu5R3yx- zkHPNi--XRBqxt*BaR85>jpJ`DmoK0Md>Z-Hq^eoNWv@ZK`>I~J`l1Qg|H@_1GhqdT zAv?jsrlv9Y=8Y#ANpr1V71SgJQIhmZ_p%6p67b2b0gf09ns$?E?)dgAf2xDC;~E&4 zHg_HCuY2{&*Srm%xM*;X!(m|Q)eDtiFp8}!R!%dp=<&%d0DHAs^&YF;P;ft=Cye$o zzE2Ex;BhDLxVHI@qm(Khcc^S;KneKdelt-Y0!J}Z7p}9*ZaOX&OBOs%jrNoivUJ~9 zE)%fW@yRX#^oJ$PBs)1*RjG0VEnWE0Md{NE9*0$LV9Tbrl}i8?J3d(jKpeGVVe$f* zGE&p8nsN@`7mdUqLfMrH9IH36E?4t~E{$61__&g($pTWY7P>~wdC2Vy`ho?Ir3n;j zA>CSe&A=kXR|)}eu~(lI%$GB#FS?I%?(iXgo>pQirx{q(_(~xFZ!v-j*}6GyBNYel z4Eg+&Z#&}{Y!MptptVMpq){gj)SQ|-U+S``@s&gaL}?~XK~K40Z*+Ge7I%j!0k62f z&Lm{#)CiR`+3_c{m3uB^h3rzsR}vpUaFi0l2nS`w)aZmyChY}Pq&OMl03}jn^KR)_ zQjKaPNmqrx?d&|}8kSLm9HoP0=?N@Se6kCG7{Fj^QmscoR86U>7u20<-rIOInMk;a zq;xAGVzf@7a(Z-@bYQg$wlcMnrc^zFMU9UuotjMbdDS)qRVe^d8)uT~7bcln$-dAu z&Bvqox%le#nOC?&Jc^A+14c88uI(>zw{X$oE0qBB(TW#Pp?cGpv08=7kzJeDMv5fS zG-wx_r%!~$rU4@OJvxES56O>)7|}$KR>!(chf3%VELwb}5`edaCD`J7bIIBzQZ>Dz zW+7m_Oj(na)6RFqvNHUqeOcr%#|x`@MLk$8YPf&%Z|RVOatXm=#aChh z;N@t2okS8JK#E33vFD_kiKC;}Oe1lW0c@4i2e+;JqntvpDDjn20Q%#xs9Yw7NZR7@ z#lSl}m|kly7E1I$YyCfcNe^tQ@0Lpl7A3w?3P6|CDl*p&Kl0jj&@-$9cVEw}mAovu zgUt?ix4px%eN!+6Norf__(~}NeaumkK~0L~tjFQ{rzWwuOo*`S=X>m5d500*dU%cZ4G5s5bSn2!G(>56K=-8^A zl&ZXMDdW?WSOEI+sYuFodmiD^`AFJ{Qco{j==i}=fZv=x3%6~Wz%4O2F;E2ukJhC3 zrT70hozCW^jIY!J&@Z!!%p|EFN!s)Nv>yKEC$@t7uU)_1@!mV{xR)Zn4}}Ski6pXp zJ89iLt=84%$-X7#e{ermKL>;IQlkHZR!@3XtU{m18E5tqe=ucQKy8nQe@=~L*vdtw?DQdyKO13%e# zcuJ}lU0ICyR3#*bxswv60D`=F}aieDdHWCE}#@s&~l4j}M0tP6>TJ+TNi pfYR|Tm*uitmdkQcR{4Jc1^|9km%f2U7D503002ovPDHLkV1i+`Sn>b> diff --git a/svg/emoji_u1f46f.svg b/svg/emoji_u1f46f.svg deleted file mode 100644 index 506d95715..000000000 --- a/svg/emoji_u1f46f.svg +++ /dev/null @@ -1,574 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file From 5bec5cce2c0dd7318a1adf9dd69aa3cab88680b9 Mon Sep 17 00:00:00 2001 From: Doug Felt Date: Mon, 16 Apr 2018 18:25:58 -0700 Subject: [PATCH 3/3] Update check_emoji_sequences. - use existing utilities in nototools/unicode_data, add_aliases - add check that file names do not use presentation selectors - include tags in valid cps that can appear in a sequence - add check for valid tag sequences (for subregion flags) - separate out check that no source for an alias is present (we expect to alias this so should not have an image with that name) - filter data by age (somewhat), provide command line flag, remove hard-coded unicode 9.0 value - separate coverage check (for when data is partial), provide command line flag and don't run by default - provide command line flag to exclude subdirs by name when collecting images - refactor output so each error has a consistent initial text indicating the check where the error was found, make output a tad less verbose --- check_emoji_sequences.py | 307 +++++++++++++++++++++++---------------- 1 file changed, 182 insertions(+), 125 deletions(-) diff --git a/check_emoji_sequences.py b/check_emoji_sequences.py index 083e0d5a8..f29bbe90b 100755 --- a/check_emoji_sequences.py +++ b/check_emoji_sequences.py @@ -26,25 +26,21 @@ import re import sys from nototools import unicode_data - -DATA_ROOT = path.dirname(path.abspath(__file__)) +import add_aliases ZWJ = 0x200d EMOJI_VS = 0xfe0f -def _is_regional_indicator(cp): - return 0x1f1e6 <= cp <= 0x1f1ff +END_TAG = 0xe007f +def _make_tag_set(): + tag_set = set() + tag_set |= set(range(0xe0030, 0xe003a)) # 0-9 + tag_set |= set(range(0xe0061, 0xe007b)) # a-z + tag_set.add(END_TAG) + return tag_set -def _is_skintone_modifier(cp): - return 0x1f3fb <= cp <= 0x1f3ff - - -def _seq_string(seq): - return '_'.join('%04x' % cp for cp in seq) - -def strip_vs(seq): - return tuple(cp for cp in seq if cp != EMOJI_VS) +TAG_SET = _make_tag_set() _namedata = None @@ -54,7 +50,7 @@ def seq_name(seq): if not _namedata: def strip_vs_map(seq_map): return { - strip_vs(k): v + unicode_data.strip_emoji_vs(k): v for k, v in seq_map.iteritems()} _namedata = [ strip_vs_map(unicode_data.get_emoji_combining_sequences()), @@ -70,7 +66,7 @@ def seq_name(seq): if seq in data: return data[seq] if EMOJI_VS in seq: - non_vs_seq = strip_vs(seq) + non_vs_seq = unicode_data.strip_emoji_vs(seq) for data in _namedata: if non_vs_seq in data: return data[non_vs_seq] @@ -78,14 +74,29 @@ def seq_name(seq): return None -def _check_valid_emoji(sorted_seq_to_filepath): - """Ensure all emoji are either valid emoji or specific chars.""" +def _check_no_vs(sorted_seq_to_filepath): + """Our image data does not use emoji presentation variation selectors.""" + for seq, fp in sorted_seq_to_filepath.iteritems(): + if EMOJI_VS in seq: + print('check no VS: FE0F in path: %s' % fp) - valid_cps = set(unicode_data.get_emoji() | unicode_data.proposed_emoji_cps()) + +def _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version): + """Ensure all cps in these sequences are valid emoji cps or specific cps + used in forming emoji sequences. This is a 'pre-check' that reports + this specific problem.""" + + valid_cps = set(unicode_data.get_emoji()) + if unicode_version is None or unicode_version >= unicode_data.PROPOSED_EMOJI_AGE: + valid_cps |= unicode_data.proposed_emoji_cps() + else: + valid_cps = set( + cp for cp in valid_cps if unicode_data.age(cp) <= unicode_version) valid_cps.add(0x200d) # ZWJ valid_cps.add(0x20e3) # combining enclosing keycap valid_cps.add(0xfe0f) # variation selector (emoji presentation) valid_cps.add(0xfe82b) # PUA value for unknown flag + valid_cps |= TAG_SET # used in subregion tag sequences not_emoji = {} for seq, fp in sorted_seq_to_filepath.iteritems(): @@ -96,35 +107,43 @@ def _check_valid_emoji(sorted_seq_to_filepath): not_emoji[cp].append(fp) if len(not_emoji): - print('%d non-emoji found:' % len(not_emoji), file=sys.stderr) + print( + 'check valid emoji cps: %d non-emoji cp found' % len(not_emoji), + file=sys.stderr) for cp in sorted(not_emoji): - print('%04x (in %s)' % (cp, ', '.join(not_emoji[cp])), file=sys.stderr) + fps = not_emoji[cp] + print( + 'check valid emoji cps: %04x (in %d sequences)' % (cp, len(fps)), + file=sys.stderr) def _check_zwj(sorted_seq_to_filepath): - """Ensure zwj is only between two appropriate emoji.""" - ZWJ = 0x200D - EMOJI_PRESENTATION_VS = 0xFE0F + """Ensure zwj is only between two appropriate emoji. This is a 'pre-check' + that reports this specific problem.""" for seq, fp in sorted_seq_to_filepath.iteritems(): if ZWJ not in seq: continue - if seq[0] == 0x200d: - print('zwj at head of sequence in %s' % fp, file=sys.stderr) + if seq[0] == ZWJ: + print('check zwj: zwj at head of sequence in %s' % fp, file=sys.stderr) if len(seq) == 1: continue - if seq[-1] == 0x200d: - print('zwj at end of sequence in %s' % fp, file=sys.stderr) + if seq[-1] == ZWJ: + print('check zwj: zwj at end of sequence in %s' % fp, file=sys.stderr) for i, cp in enumerate(seq): if cp == ZWJ: if i > 0: pcp = seq[i-1] - if pcp != EMOJI_PRESENTATION_VS and not unicode_data.is_emoji(pcp): - print('non-emoji %04x preceeds ZWJ in %s' % (pcp, fp), file=sys.stderr) + if pcp != EMOJI_VS and not unicode_data.is_emoji(pcp): + print( + 'check zwj: non-emoji %04x preceeds ZWJ in %s' % (pcp, fp), + file=sys.stderr) if i < len(seq) - 1: fcp = seq[i+1] if not unicode_data.is_emoji(fcp): - print('non-emoji %04x follows ZWJ in %s' % (fcp, fp), file=sys.stderr) + print( + 'check zwj: non-emoji %04x follows ZWJ in %s' % (fcp, fp), + file=sys.stderr) def _check_flags(sorted_seq_to_filepath): @@ -133,15 +152,40 @@ def _check_flags(sorted_seq_to_filepath): for seq, fp in sorted_seq_to_filepath.iteritems(): have_reg = None for cp in seq: - is_reg = _is_regional_indicator(cp) + is_reg = unicode_data.is_regional_indicator(cp) if have_reg == None: have_reg = is_reg elif have_reg != is_reg: - print('mix of regional and non-regional in %s' % fp, file=sys.stderr) + print( + 'check flags: mix of regional and non-regional in %s' % fp, + file=sys.stderr) if have_reg and len(seq) > 2: # We provide dummy glyphs for regional indicators, so there are sequences - # with single regional indicator symbols. - print('regional indicator sequence length != 2 in %s' % fp, file=sys.stderr) + # with single regional indicator symbols, the len check handles this. + print( + 'check flags: regional indicator sequence length != 2 in %s' % fp, + file=sys.stderr) + +def _check_tags(sorted_seq_to_filepath): + """Ensure tag sequences (for subregion flags) conform to the spec. We don't + validate against CLDR, just that there's a sequence of 2 or more tags starting + and ending with the appropriate codepoints.""" + + BLACK_FLAG = 0x1f3f4 + BLACK_FLAG_SET = set([BLACK_FLAG]) + for seq, fp in sorted_seq_to_filepath.iteritems(): + seq_set = set(cp for cp in seq) + overlap_set = seq_set & TAG_SET + if not overlap_set: + continue + if seq[0] != BLACK_FLAG: + print('check tags: bad start tag in %s' % fp) + elif seq[-1] != END_TAG: + print('check tags: bad end tag in %s' % fp) + elif len(seq) < 4: + print('check tags: sequence too short in %s' % fp) + elif seq_set - TAG_SET != BLACK_FLAG_SET: + print('check tags: non-tag items in %s' % fp) def _check_skintone(sorted_seq_to_filepath): @@ -151,90 +195,76 @@ def _check_skintone(sorted_seq_to_filepath): base_to_modifiers = collections.defaultdict(set) for seq, fp in sorted_seq_to_filepath.iteritems(): for i, cp in enumerate(seq): - if _is_skintone_modifier(cp): + if unicode_data.is_skintone_modifier(cp): if i == 0: if len(seq) > 1: - print('skin color selector first in sequence %s' % fp, file=sys.stderr) + print( + 'check skintone: skin color selector first in sequence %s' % fp, + file=sys.stderr) # standalone are ok continue pcp = seq[i-1] if not unicode_data.is_emoji_modifier_base(pcp): - print(( - 'emoji skintone modifier applied to non-base at %d: %s' % (i, fp)), file=sys.stderr) - elif unicode_data.is_emoji_modifier_base(cp): - if i < len(seq) - 1 and _is_skintone_modifier(seq[i+1]): - base_to_modifiers[cp].add(seq[i+1]) - elif cp not in base_to_modifiers: - base_to_modifiers[cp] = set() + print( + 'check skintone: emoji skintone modifier applied to non-base ' + + 'at %d: %s' % (i, fp), file=sys.stderr) + else: + if pcp not in base_to_modifiers: + base_to_modifiers[pcp] = set() + base_to_modifiers[pcp].add(cp) + for cp, modifiers in sorted(base_to_modifiers.iteritems()): if len(modifiers) != 5: - print('emoji base %04x has %d modifiers defined (%s) in %s' % ( - cp, len(modifiers), - ', '.join('%04x' % cp for cp in sorted(modifiers)), fp), file=sys.stderr) + print( + 'check skintone: base %04x has %d modifiers defined (%s) in %s' % ( + cp, len(modifiers), + ', '.join('%04x' % cp for cp in sorted(modifiers)), fp), + file=sys.stderr) -def _check_zwj_sequences(seq_to_filepath): - """Verify that zwj sequences are valid.""" - zwj_sequence_to_name = unicode_data.get_emoji_zwj_sequences() - # strip emoji variant selectors and add extra mappings - zwj_sequence_without_vs_to_name_canonical = {} - for seq, seq_name in zwj_sequence_to_name.iteritems(): - if EMOJI_VS in seq: - stripped_seq = strip_vs(seq) - zwj_sequence_without_vs_to_name_canonical[stripped_seq] = (seq_name, seq) - - zwj_seq_to_filepath = { - seq: fp for seq, fp in seq_to_filepath.iteritems() - if ZWJ in seq} - - for seq, fp in zwj_seq_to_filepath.iteritems(): - if seq not in zwj_sequence_to_name: - if seq not in zwj_sequence_without_vs_to_name_canonical: - print('zwj sequence not defined: %s' % fp, file=sys.stderr) - else: - _, can = zwj_sequence_without_vs_to_name_canonical[seq] - # print >> sys.stderr, 'canonical sequence %s contains vs: %s' % ( - # _seq_string(can), fp) - -def read_emoji_aliases(): - result = {} - - with open(path.join(DATA_ROOT, 'emoji_aliases.txt'), 'r') as f: - for line in f: - ix = line.find('#') - if (ix > -1): - line = line[:ix] - line = line.strip() - if not line: - continue - als, trg = (s.strip() for s in line.split(';')) - als_seq = tuple([int(x, 16) for x in als.split('_')]) - try: - trg_seq = tuple([int(x, 16) for x in trg.split('_')]) - except: - print('cannot process alias %s -> %s' % (als, trg)) - continue - result[als_seq] = trg_seq - return result +def _check_zwj_sequences(sorted_seq_to_filepath, unicode_version): + """Verify that zwj sequences are valid for the given unicode version.""" + for seq, fp in sorted_seq_to_filepath.iteritems(): + if ZWJ not in seq: + continue + age = unicode_data.get_emoji_sequence_age(seq) + if age is None or unicode_version is not None and age > unicode_version: + print('check zwj sequences: undefined sequence %s' % fp) -def _check_coverage(seq_to_filepath): - age = 9.0 +def _check_no_alias_sources(sorted_seq_to_filepath): + """Check that we don't have sequences that we expect to be aliased to + some other sequence.""" + aliases = add_aliases.read_default_emoji_aliases() + for seq, fp in sorted_seq_to_filepath.iteritems(): + if seq in aliases: + print('check no alias sources: aliased sequence %s' % fp) + + +def _check_coverage(seq_to_filepath, unicode_version): + """Ensure we have all and only the cps and sequences that we need for the + font as of this version.""" + + age = unicode_version non_vs_to_canonical = {} for k in seq_to_filepath: if EMOJI_VS in k: - non_vs = strip_vs(k) + non_vs = unicode_data.strip_emoji_vs(k) non_vs_to_canonical[non_vs] = k - aliases = read_emoji_aliases() + aliases = add_aliases.read_default_emoji_aliases() for k, v in sorted(aliases.items()): if v not in seq_to_filepath and v not in non_vs_to_canonical: - print('alias %s missing target %s' % (_seq_string(k), _seq_string(v))) + alias_str = unicode_data.seq_to_string(k) + target_str = unicode_data.seq_to_string(v) + print('coverage: alias %s missing target %s' % (alias_str, target_str)) continue if k in seq_to_filepath or k in non_vs_to_canonical: - print('alias %s already exists as %s (%s)' % ( - _seq_string(k), _seq_string(v), seq_name(v))) + alias_str = unicode_data.seq_to_string(k) + target_str = unicode_data.seq_to_string(v) + print('coverage: alias %s already exists as %s (%s)' % ( + alias_str, target_str, seq_name(v))) continue filename = seq_to_filepath.get(v) or seq_to_filepath[non_vs_to_canonical[v]] seq_to_filepath[k] = 'alias:' + filename @@ -243,13 +273,15 @@ def _check_coverage(seq_to_filepath): emoji = sorted(unicode_data.get_emoji(age=age)) for cp in emoji: if tuple([cp]) not in seq_to_filepath: - print('missing single %04x (%s)' % (cp, unicode_data.name(cp, ''))) + print( + 'coverage: missing single %04x (%s)' % ( + cp, unicode_data.name(cp, ''))) # special characters # all but combining enclosing keycap are currently marked as emoji for cp in [ord('*'), ord('#'), ord(u'\u20e3')] + range(0x30, 0x3a): if cp not in emoji and tuple([cp]) not in seq_to_filepath: - print('missing special %04x (%s)' % (cp, unicode_data.name(cp))) + print('coverage: missing special %04x (%s)' % (cp, unicode_data.name(cp))) # combining sequences comb_seq_to_name = sorted( @@ -257,24 +289,26 @@ def _check_coverage(seq_to_filepath): for seq, name in comb_seq_to_name: if seq not in seq_to_filepath: # strip vs and try again - non_vs_seq = strip_vs(seq) + non_vs_seq = unicode_data.strip_emoji_vs(seq) if non_vs_seq not in seq_to_filepath: - print('missing combining sequence %s (%s)' % (_seq_string(seq), name)) + print('coverage: missing combining sequence %s (%s)' % + (unicode_data.seq_to_string(seq), name)) # flag sequences flag_seq_to_name = sorted( unicode_data.get_emoji_flag_sequences(age=age).iteritems()) for seq, name in flag_seq_to_name: if seq not in seq_to_filepath: - print('missing flag sequence %s (%s)' % (_seq_string(seq), name)) + print('coverage: missing flag sequence %s (%s)' % + (unicode_data.seq_to_string(seq), name)) # skin tone modifier sequences mod_seq_to_name = sorted( unicode_data.get_emoji_modifier_sequences(age=age).iteritems()) for seq, name in mod_seq_to_name: if seq not in seq_to_filepath: - print('missing modifier sequence %s (%s)' % ( - _seq_string(seq), name)) + print('coverage: missing modifier sequence %s (%s)' % ( + unicode_data.seq_to_string(seq), name)) # zwj sequences # some of ours include the emoji presentation variation selector and some @@ -295,25 +329,30 @@ def _check_coverage(seq_to_filepath): else: test_seq = seq if test_seq not in zwj_seq_without_vs: - print('missing (canonical) zwj sequence %s (%s)' % ( - _seq_string(seq), name)) + print('coverage: missing (canonical) zwj sequence %s (%s)' % ( + unicode_data.seq_to_string(seq), name)) # check for 'unknown flag' - # this is either emoji_ufe82b or 'unknown_flag', we filter out things that + # this is either emoji_ufe82b or 'unknown_flag', but we filter out things that # don't start with our prefix so 'unknown_flag' would be excluded by default. if tuple([0xfe82b]) not in seq_to_filepath: - print('missing unknown flag PUA fe82b') + print('coverage: missing unknown flag PUA fe82b') -def check_sequence_to_filepath(seq_to_filepath): +def check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage): sorted_seq_to_filepath = collections.OrderedDict( sorted(seq_to_filepath.items())) - _check_valid_emoji(sorted_seq_to_filepath) + _check_no_vs(sorted_seq_to_filepath) + _check_valid_emoji_cps(sorted_seq_to_filepath, unicode_version) _check_zwj(sorted_seq_to_filepath) _check_flags(sorted_seq_to_filepath) + _check_tags(sorted_seq_to_filepath) _check_skintone(sorted_seq_to_filepath) - _check_zwj_sequences(sorted_seq_to_filepath) - _check_coverage(sorted_seq_to_filepath) + _check_zwj_sequences(sorted_seq_to_filepath, unicode_version) + _check_no_alias_sources(sorted_seq_to_filepath) + if coverage: + _check_coverage(sorted_seq_to_filepath, unicode_version) + def create_sequence_to_filepath(name_to_dirpath, prefix, suffix): """Check names, and convert name to sequences for names that are ok, @@ -345,12 +384,15 @@ def create_sequence_to_filepath(name_to_dirpath, prefix, suffix): return result -def collect_name_to_dirpath(directory, prefix, suffix): +def collect_name_to_dirpath(directory, prefix, suffix, exclude=None): """Return a mapping from filename to path rooted at directory, ignoring files - that don't match suffix. Report when a filename appears in more than one - subdir; the first path found is kept.""" + that don't match suffix, and subtrees with names in exclude. Report when a + filename appears in more than one subdir; the first path found is kept.""" result = {} - for dirname, _, files in os.walk(directory): + for dirname, dirs, files in os.walk(directory, topdown=True): + if exclude: + dirs[:] = [d for d in dirs if d not in exclude] + if directory != '.': dirname = path.join(directory, dirname) for f in files: @@ -364,42 +406,57 @@ def collect_name_to_dirpath(directory, prefix, suffix): return result -def collect_name_to_dirpath_with_override(dirs, prefix, suffix): +def collect_name_to_dirpath_with_override(dirs, prefix, suffix, exclude=None): """Return a mapping from filename to a directory path rooted at a directory in dirs, using collect_name_to_filepath. The last directory is retained. This does not report an error if a file appears under more than one root directory, - so lets later root directories override earlier ones.""" + so lets later root directories override earlier ones. Use 'exclude' to + name subdirectories (of any root) whose subtree you wish to skip.""" result = {} for d in dirs: - result.update(collect_name_to_dirpath(d, prefix, suffix)) + result.update(collect_name_to_dirpath(d, prefix, suffix, exclude)) return result -def run_check(dirs, prefix, suffix): - print('Checking files with prefix "%s" and suffix "%s" in:\n %s' % ( - prefix, suffix, '\n '.join(dirs))) +def run_check(dirs, prefix, suffix, exclude, unicode_version, coverage): + msg = '' + if unicode_version: + msg = ' (%3.1f)' % unicode_version + print('Checking files with prefix "%s" and suffix "%s"%s in:\n %s' % ( + prefix, suffix, msg, '\n '.join(dirs))) name_to_dirpath = collect_name_to_dirpath_with_override( - dirs, prefix=prefix, suffix=suffix) + dirs, prefix=prefix, suffix=suffix, exclude=exclude) print('checking %d names' % len(name_to_dirpath)) seq_to_filepath = create_sequence_to_filepath(name_to_dirpath, prefix, suffix) print('checking %d sequences' % len(seq_to_filepath)) - check_sequence_to_filepath(seq_to_filepath) + check_sequence_to_filepath(seq_to_filepath, unicode_version, coverage) print('done.') def main(): parser = argparse.ArgumentParser() parser.add_argument( - '-d', '--dirs', help='directories containing emoji images', + '-d', '--dirs', help='directory roots containing emoji images', metavar='dir', nargs='+', required=True) + parser.add_argument( + '-e', '--exclude', help='names of source subdirs to exclude', + metavar='dir', nargs='+') + parser.add_argument( + '-c', '--coverage', help='test for complete coverage', + action='store_true') parser.add_argument( '-p', '--prefix', help='prefix to match, default "emoji_u"', metavar='pfx', default='emoji_u') parser.add_argument( '-s', '--suffix', help='suffix to match, default ".png"', metavar='sfx', default='.png') + parser.add_argument( + '-u', '--unicode_version', help='limit to this unicode version or before', + metavar='version', type=float) args = parser.parse_args() - run_check(args.dirs, args.prefix, args.suffix) + run_check( + args.dirs, args.prefix, args.suffix, args.exclude, args.unicode_version, + args.coverage) if __name__ == '__main__':