Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright © 2012  Google, Inc.
      3  *
      4  *  This is part of HarfBuzz, a text shaping library.
      5  *
      6  * Permission is hereby granted, without written agreement and without
      7  * license or royalty fees, to use, copy, modify, and distribute this
      8  * software and its documentation for any purpose, provided that the
      9  * above copyright notice and the following two paragraphs appear in
     10  * all copies of this software.
     11  *
     12  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
     13  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
     14  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
     15  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
     16  * DAMAGE.
     17  *
     18  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
     19  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
     20  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
     21  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
     22  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
     23  *
     24  * Google Author(s): Behdad Esfahbod
     25  */
     26 
     27 #ifndef HB_OT_SHAPE_COMPLEX_INDIC_HH
     28 #define HB_OT_SHAPE_COMPLEX_INDIC_HH
     29 
     30 #include "hb.hh"
     31 
     32 #include "hb-ot-shape-complex.hh"
     33 
     34 
/* buffer var allocations */
/* Per-glyph scratch slots used by the Indic shaper: set_indic_properties()
 * stores the glyph's category and position through these accessors. */
#define indic_category() complex_var_u8_0() /* indic_category_t */
#define indic_position() complex_var_u8_1() /* indic_position_t */


/* Element type returned by hb_indic_get_categories().  A value packs the
 * category in its low byte and the position in the next byte (see
 * INDIC_COMBINE_CATEGORIES). */
#define INDIC_TABLE_ELEMENT_TYPE uint16_t
     41 
/* Categories used in the OpenType spec:
 * https://docs.microsoft.com/en-us/typography/script-development/devanagari
 */
/* Note: This enum is duplicated in the -machine.rl source file.
 * Not sure how to avoid duplication. */
/* The numeric values are passed to FLAG () / FLAG_UNSAFE () elsewhere in this
 * header to build category masks, so keep them explicit and in sync with the
 * duplicate mentioned above. */
enum indic_category_t {
  OT_X = 0,
  OT_C = 1,
  OT_V = 2,
  OT_N = 3,
  OT_H = 4,
  OT_ZWNJ = 5,
  OT_ZWJ = 6,
  OT_M = 7,
  OT_SM = 8,
  /* OT_VD = 9, UNUSED; we use OT_A instead. */
  OT_A = 10,
  OT_PLACEHOLDER = 11,
  OT_DOTTEDCIRCLE = 12,
  OT_RS = 13, /* Register Shifter, used in Khmer OT spec. */
  OT_Coeng = 14, /* Khmer-style Virama. */
  OT_Repha = 15, /* Atomically-encoded logical or visual repha. */
  OT_Ra = 16,
  OT_CM = 17,  /* Consonant-Medial; Unused by Indic shaper. */
  OT_Symbol = 18, /* Avagraha, etc that take marks (SM,A,VD). */
  OT_CS = 19
};
     69 
/* Note:
 *
 * We treat Vowels and placeholders as if they were consonants.  This is safe because Vowels
 * cannot happen in a consonant syllable.  The plus side however is, we can call the
 * consonant syllable logic from the vowel syllable function and get it all right! */
/* Mask of every category that is_consonant () accepts. */
#define CONSONANT_FLAGS (FLAG (OT_C) | FLAG (OT_CS) | FLAG (OT_Ra) | FLAG (OT_V) | FLAG (OT_PLACEHOLDER) | FLAG (OT_DOTTEDCIRCLE))
/* Mask of the two joiner categories (ZWJ and ZWNJ) that is_joiner () accepts. */
#define JOINER_FLAGS (FLAG (OT_ZWJ) | FLAG (OT_ZWNJ))
     77 
     78 
/* Visual positions in a syllable from left to right. */
/* The values are stored per glyph via indic_position () and are also reused
 * as the values of indic_matra_category_t. */
enum indic_position_t {
  POS_START = 0,

  POS_RA_TO_BECOME_REPH = 1,
  POS_PRE_M = 2,
  POS_PRE_C = 3,

  POS_BASE_C = 4,
  POS_AFTER_MAIN = 5,

  POS_ABOVE_C = 6,

  POS_BEFORE_SUB = 7,
  POS_BELOW_C = 8,
  POS_AFTER_SUB = 9,

  POS_BEFORE_POST = 10,
  POS_POST_C = 11,
  POS_AFTER_POST = 12,

  POS_FINAL_C = 13,
  POS_SMVD = 14,

  POS_END = 15
};
    105 
/* Categories used in IndicSyllabicCategory.txt from UCD. */
/* Each UCD category is aliased directly to the indic_category_t value the
 * shaper should use for it, so several UCD categories share one value. */
enum indic_syllabic_category_t {
  INDIC_SYLLABIC_CATEGORY_OTHER				= OT_X,

  INDIC_SYLLABIC_CATEGORY_AVAGRAHA			= OT_Symbol,
  INDIC_SYLLABIC_CATEGORY_BINDU				= OT_SM,
  INDIC_SYLLABIC_CATEGORY_BRAHMI_JOINING_NUMBER		= OT_PLACEHOLDER, /* Don't care. */
  INDIC_SYLLABIC_CATEGORY_CANTILLATION_MARK		= OT_A,
  INDIC_SYLLABIC_CATEGORY_CONSONANT			= OT_C,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_DEAD		= OT_C,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_FINAL		= OT_CM,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_HEAD_LETTER		= OT_C,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_KILLER		= OT_M, /* U+17CD only. */
  INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL		= OT_CM,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_PLACEHOLDER		= OT_PLACEHOLDER,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_PRECEDING_REPHA	= OT_Repha,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_PREFIXED		= OT_X, /* Don't care. */
  INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED		= OT_CM,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA	= OT_CM,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_WITH_STACKER	= OT_CS,
  INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK		= OT_SM, /* https://github.com/harfbuzz/harfbuzz/issues/552 */
  INDIC_SYLLABIC_CATEGORY_INVISIBLE_STACKER		= OT_Coeng,
  INDIC_SYLLABIC_CATEGORY_JOINER			= OT_ZWJ,
  INDIC_SYLLABIC_CATEGORY_MODIFYING_LETTER		= OT_X,
  INDIC_SYLLABIC_CATEGORY_NON_JOINER			= OT_ZWNJ,
  INDIC_SYLLABIC_CATEGORY_NUKTA				= OT_N,
  INDIC_SYLLABIC_CATEGORY_NUMBER			= OT_PLACEHOLDER,
  INDIC_SYLLABIC_CATEGORY_NUMBER_JOINER			= OT_PLACEHOLDER, /* Don't care. */
  INDIC_SYLLABIC_CATEGORY_PURE_KILLER			= OT_M, /* Is like a vowel matra. */
  INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER		= OT_RS,
  INDIC_SYLLABIC_CATEGORY_SYLLABLE_MODIFIER		= OT_SM,
  INDIC_SYLLABIC_CATEGORY_TONE_LETTER			= OT_X,
  INDIC_SYLLABIC_CATEGORY_TONE_MARK			= OT_N,
  INDIC_SYLLABIC_CATEGORY_VIRAMA			= OT_H,
  INDIC_SYLLABIC_CATEGORY_VISARGA			= OT_SM,
  INDIC_SYLLABIC_CATEGORY_VOWEL				= OT_V,
  INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT		= OT_M,
  INDIC_SYLLABIC_CATEGORY_VOWEL_INDEPENDENT		= OT_V
};
    145 
/* Categories used in IndicMatraCategory.txt from UCD */
/* Each matra category is aliased to an indic_position_t value.  The single-
 * side values (LEFT/TOP/BOTTOM/RIGHT) are the ones matra_position_indic ()
 * later refines per script. */
enum indic_matra_category_t {
  INDIC_MATRA_CATEGORY_NOT_APPLICABLE			= POS_END,

  INDIC_MATRA_CATEGORY_LEFT				= POS_PRE_C,
  INDIC_MATRA_CATEGORY_TOP				= POS_ABOVE_C,
  INDIC_MATRA_CATEGORY_BOTTOM				= POS_BELOW_C,
  INDIC_MATRA_CATEGORY_RIGHT				= POS_POST_C,

  /* These should resolve to the position of the last part of the split sequence. */
  INDIC_MATRA_CATEGORY_BOTTOM_AND_RIGHT			= INDIC_MATRA_CATEGORY_RIGHT,
  INDIC_MATRA_CATEGORY_LEFT_AND_RIGHT			= INDIC_MATRA_CATEGORY_RIGHT,
  INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM			= INDIC_MATRA_CATEGORY_BOTTOM,
  INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM_AND_RIGHT		= INDIC_MATRA_CATEGORY_RIGHT,
  INDIC_MATRA_CATEGORY_TOP_AND_LEFT			= INDIC_MATRA_CATEGORY_TOP,
  INDIC_MATRA_CATEGORY_TOP_AND_LEFT_AND_RIGHT		= INDIC_MATRA_CATEGORY_RIGHT,
  INDIC_MATRA_CATEGORY_TOP_AND_RIGHT			= INDIC_MATRA_CATEGORY_RIGHT,

  INDIC_MATRA_CATEGORY_OVERSTRUCK			= POS_AFTER_MAIN,
  INDIC_MATRA_CATEGORY_VISUAL_ORDER_LEFT		= POS_PRE_M
};
    167 
    168 #define INDIC_COMBINE_CATEGORIES(S,M) \
    169   ( \
    170     ASSERT_STATIC_EXPR_ZERO (S < 255 && M < 255) + \
    171     ( S | \
    172      ( \
    173       ( \
    174        S == INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL || \
    175        S == INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK || \
    176        S == INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER || \
    177        S == INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA || \
    178        S == INDIC_SYLLABIC_CATEGORY_VIRAMA || \
    179        S == INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT || \
    180        false \
    181        ? M : INDIC_MATRA_CATEGORY_NOT_APPLICABLE \
    182       ) << 8 \
    183      ) \
    184     ) \
    185    )
    186 
/* Looks up the packed Indic properties for codepoint u.
 * set_indic_properties () decodes the result as: low 7 bits = category
 * (indic_category_t), value >> 8 = position (indic_position_t).
 * Defined outside this header. */
HB_INTERNAL INDIC_TABLE_ELEMENT_TYPE
hb_indic_get_categories (hb_codepoint_t u);
    189 
    190 
    191 static inline bool
    192 is_one_of (const hb_glyph_info_t &info, unsigned int flags)
    193 {
    194   /* If it ligated, all bets are off. */
    195   if (_hb_glyph_info_ligated (&info)) return false;
    196   return !!(FLAG_UNSAFE (info.indic_category()) & flags);
    197 }
    198 
    199 static inline bool
    200 is_joiner (const hb_glyph_info_t &info)
    201 {
    202   return is_one_of (info, JOINER_FLAGS);
    203 }
    204 
    205 static inline bool
    206 is_consonant (const hb_glyph_info_t &info)
    207 {
    208   return is_one_of (info, CONSONANT_FLAGS);
    209 }
    210 
    211 static inline bool
    212 is_halant (const hb_glyph_info_t &info)
    213 {
    214   return is_one_of (info, FLAG (OT_H));
    215 }
    216 
/* True when u lies in the 128-codepoint range starting at Base: the low
 * seven bits of u are masked off and the remainder compared with Base. */
#define IN_HALF_BLOCK(u, Base) (((u) & ~0x7Fu) == (Base))

/* Per-script range tests; each covers the 128 codepoints starting at the
 * given base. */
#define IS_DEVA(u) (IN_HALF_BLOCK (u, 0x0900u))
#define IS_BENG(u) (IN_HALF_BLOCK (u, 0x0980u))
#define IS_GURU(u) (IN_HALF_BLOCK (u, 0x0A00u))
#define IS_GUJR(u) (IN_HALF_BLOCK (u, 0x0A80u))
#define IS_ORYA(u) (IN_HALF_BLOCK (u, 0x0B00u))
#define IS_TAML(u) (IN_HALF_BLOCK (u, 0x0B80u))
#define IS_TELU(u) (IN_HALF_BLOCK (u, 0x0C00u))
#define IS_KNDA(u) (IN_HALF_BLOCK (u, 0x0C80u))
#define IS_MLYM(u) (IN_HALF_BLOCK (u, 0x0D00u))
#define IS_SINH(u) (IN_HALF_BLOCK (u, 0x0D80u))
    229 
    230 
    231 #define MATRA_POS_LEFT(u)	POS_PRE_M
    232 #define MATRA_POS_RIGHT(u)	( \
    233 				  IS_DEVA(u) ? POS_AFTER_SUB  : \
    234 				  IS_BENG(u) ? POS_AFTER_POST : \
    235 				  IS_GURU(u) ? POS_AFTER_POST : \
    236 				  IS_GUJR(u) ? POS_AFTER_POST : \
    237 				  IS_ORYA(u) ? POS_AFTER_POST : \
    238 				  IS_TAML(u) ? POS_AFTER_POST : \
    239 				  IS_TELU(u) ? (u <= 0x0C42u ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
    240 				  IS_KNDA(u) ? (u < 0x0CC3u || u > 0xCD6u ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
    241 				  IS_MLYM(u) ? POS_AFTER_POST : \
    242 				  IS_SINH(u) ? POS_AFTER_SUB  : \
    243 				  /*default*/  POS_AFTER_SUB    \
    244 				)
    245 #define MATRA_POS_TOP(u)	( /* BENG and MLYM don't have top matras. */ \
    246 				  IS_DEVA(u) ? POS_AFTER_SUB  : \
    247 				  IS_GURU(u) ? POS_AFTER_POST : /* Deviate from spec */ \
    248 				  IS_GUJR(u) ? POS_AFTER_SUB  : \
    249 				  IS_ORYA(u) ? POS_AFTER_MAIN : \
    250 				  IS_TAML(u) ? POS_AFTER_SUB  : \
    251 				  IS_TELU(u) ? POS_BEFORE_SUB : \
    252 				  IS_KNDA(u) ? POS_BEFORE_SUB : \
    253 				  IS_SINH(u) ? POS_AFTER_SUB  : \
    254 				  /*default*/  POS_AFTER_SUB    \
    255 				)
    256 #define MATRA_POS_BOTTOM(u)	( \
    257 				  IS_DEVA(u) ? POS_AFTER_SUB  : \
    258 				  IS_BENG(u) ? POS_AFTER_SUB  : \
    259 				  IS_GURU(u) ? POS_AFTER_POST : \
    260 				  IS_GUJR(u) ? POS_AFTER_POST : \
    261 				  IS_ORYA(u) ? POS_AFTER_SUB  : \
    262 				  IS_TAML(u) ? POS_AFTER_POST : \
    263 				  IS_TELU(u) ? POS_BEFORE_SUB : \
    264 				  IS_KNDA(u) ? POS_BEFORE_SUB : \
    265 				  IS_MLYM(u) ? POS_AFTER_POST : \
    266 				  IS_SINH(u) ? POS_AFTER_SUB  : \
    267 				  /*default*/  POS_AFTER_SUB    \
    268 				)
    269 
    270 static inline indic_position_t
    271 matra_position_indic (hb_codepoint_t u, indic_position_t side)
    272 {
    273   switch ((int) side)
    274   {
    275     case POS_PRE_C:	return MATRA_POS_LEFT (u);
    276     case POS_POST_C:	return MATRA_POS_RIGHT (u);
    277     case POS_ABOVE_C:	return MATRA_POS_TOP (u);
    278     case POS_BELOW_C:	return MATRA_POS_BOTTOM (u);
    279   };
    280   return side;
    281 }
    282 
/* XXX
 * This is a hack for now.  We should move this data into the main Indic table.
 * Or completely remove it and just check in the tables.
 */
/* Codepoints that is_ra () recognizes; set_indic_properties () re-categorizes
 * a consonant found here as OT_Ra.  The trailing notes record each script's
 * reph behaviour. */
static const hb_codepoint_t ra_chars[] = {
  0x0930u, /* Devanagari */
  0x09B0u, /* Bengali */
  0x09F0u, /* Bengali */
  0x0A30u, /* Gurmukhi */	/* No Reph */
  0x0AB0u, /* Gujarati */
  0x0B30u, /* Oriya */
  0x0BB0u, /* Tamil */		/* No Reph */
  0x0C30u, /* Telugu */		/* Reph formed only with ZWJ */
  0x0CB0u, /* Kannada */
  0x0D30u, /* Malayalam */	/* No Reph, Logical Repha */

  0x0DBBu, /* Sinhala */	/* Reph formed only with ZWJ */

  0x179Au, /* Khmer */
};
    303 
    304 static inline bool
    305 is_ra (hb_codepoint_t u)
    306 {
    307   for (unsigned int i = 0; i < ARRAY_LENGTH (ra_chars); i++)
    308     if (u == ra_chars[i])
    309       return true;
    310   return false;
    311 }
    312 
/* Computes the Indic category and position of a glyph from its codepoint and
 * stores them in the glyph's indic_category () / indic_position () slots.
 *
 * The table value from hb_indic_get_categories () is the starting point; a
 * list of per-codepoint overrides then adjusts the category, and finally the
 * position is re-derived from the (possibly overridden) category. */
static inline void
set_indic_properties (hb_glyph_info_t &info)
{
  hb_codepoint_t u = info.codepoint;
  unsigned int type = hb_indic_get_categories (u);
  /* Unpack: category from the low 7 bits, position from bit 8 upwards. */
  indic_category_t cat = (indic_category_t) (type & 0x7Fu);
  indic_position_t pos = (indic_position_t) (type >> 8);


  /*
   * Re-assign category
   */

  /* The overrides below form one else-if chain, so at most one applies. */

  /* The following act more like the Bindus. */
  if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x0953u, 0x0954u)))
    cat = OT_SM;
  /* The following act like consonants. */
  else if (unlikely (hb_in_ranges<hb_codepoint_t> (u, 0x0A72u, 0x0A73u,
				      0x1CF5u, 0x1CF6u)))
    cat = OT_C;
  /* TODO: The following should only be allowed after a Visarga.
   * For now, just treat them like regular tone marks. */
  else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x1CE2u, 0x1CE8u)))
    cat = OT_A;
  /* TODO: The following should only be allowed after some of
   * the nasalization marks, maybe only for U+1CE9..U+1CF1.
   * For now, just treat them like tone marks. */
  else if (unlikely (u == 0x1CEDu))
    cat = OT_A;
  /* The following take marks in standalone clusters, similar to Avagraha. */
  else if (unlikely (hb_in_ranges<hb_codepoint_t> (u, 0xA8F2u, 0xA8F7u,
				      0x1CE9u, 0x1CECu,
				      0x1CEEu, 0x1CF1u)))
  {
    cat = OT_Symbol;
    /* Compile-time check that Avagraha maps to the same category. */
    static_assert (((int) INDIC_SYLLABIC_CATEGORY_AVAGRAHA == OT_Symbol), "");
  }
  else if (unlikely (u == 0x0A51u))
  {
    /* https://github.com/harfbuzz/harfbuzz/issues/524 */
    /* Forced to a below-base matra; the position set here is still passed
     * through matra_position_indic () further down. */
    cat = OT_M;
    pos = POS_BELOW_C;
  }

  /* According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil,
   * so the Indic shaper needs to know their categories. */
  else if (unlikely (u == 0x11301u || u == 0x11303u)) cat = OT_SM;
  else if (unlikely (u == 0x1133cu)) cat = OT_N;

  else if (unlikely (u == 0x0AFBu)) cat = OT_N; /* https://github.com/harfbuzz/harfbuzz/issues/552 */

  else if (unlikely (u == 0x0980u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/issues/538 */
  else if (unlikely (u == 0x0C80u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/pull/623 */
  else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x2010u, 0x2011u)))
				    cat = OT_PLACEHOLDER;
  else if (unlikely (u == 0x25CCu)) cat = OT_DOTTEDCIRCLE;


  /*
   * Re-assign position.
   */

  /* Consonant-like categories always sit at the base position; those listed
   * in ra_chars are additionally re-categorized as OT_Ra. */
  if ((FLAG_UNSAFE (cat) & CONSONANT_FLAGS))
  {
    pos = POS_BASE_C;
    if (is_ra (u))
      cat = OT_Ra;
  }
  /* Matras: refine the generic side into a script-specific position. */
  else if (cat == OT_M)
  {
    pos = matra_position_indic (u, pos);
  }
  /* Syllable modifiers, OT_A marks and symbols all go to POS_SMVD. */
  else if ((FLAG_UNSAFE (cat) & (FLAG (OT_SM) /* | FLAG (OT_VD) */ | FLAG (OT_A) | FLAG (OT_Symbol))))
  {
    pos = POS_SMVD;
  }

  /* Applied last, so it overrides whatever the chain above chose. */
  if (unlikely (u == 0x0B01u)) pos = POS_BEFORE_SUB; /* Oriya Bindu is BeforeSub in the spec. */



  info.indic_category() = cat;
  info.indic_position() = pos;
}
    397 
    398 
    399 #endif /* HB_OT_SHAPE_COMPLEX_INDIC_HH */
    400