Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright  2011,2012,2014  Google, Inc.
      3  *
      4  *  This is part of HarfBuzz, a text shaping library.
      5  *
      6  * Permission is hereby granted, without written agreement and without
      7  * license or royalty fees, to use, copy, modify, and distribute this
      8  * software and its documentation for any purpose, provided that the
      9  * above copyright notice and the following two paragraphs appear in
     10  * all copies of this software.
     11  *
     12  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
     13  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
     14  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
     15  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
     16  * DAMAGE.
     17  *
     18  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
     19  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
     20  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
     21  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
     22  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
     23  *
     24  * Google Author(s): Behdad Esfahbod
     25  */
     26 
     27 #ifndef HB_UTF_HH
     28 #define HB_UTF_HH
     29 
     30 #include "hb.hh"
     31 
     32 #include "hb-open-type.hh"
     33 
     34 
     35 struct hb_utf8_t
     36 {
     37   typedef uint8_t codepoint_t;
     38 
     39   static const codepoint_t *
     40   next (const codepoint_t *text,
     41 	const codepoint_t *end,
     42 	hb_codepoint_t *unicode,
     43 	hb_codepoint_t replacement)
     44   {
     45     /* Written to only accept well-formed sequences.
     46      * Based on ideas from ICU's U8_NEXT.
     47      * Generates one "replacement" for each ill-formed byte. */
     48 
     49     hb_codepoint_t c = *text++;
     50 
     51     if (c > 0x7Fu)
     52     {
     53       if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */
     54       {
     55 	unsigned int t1;
     56 	if (likely (text < end &&
     57 		    (t1 = text[0] - 0x80u) <= 0x3Fu))
     58 	{
     59 	  c = ((c&0x1Fu)<<6) | t1;
     60 	  text++;
     61 	}
     62 	else
     63 	  goto error;
     64       }
     65       else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */
     66       {
     67 	unsigned int t1, t2;
     68 	if (likely (1 < end - text &&
     69 		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
     70 		    (t2 = text[1] - 0x80u) <= 0x3Fu))
     71 	{
     72 	  c = ((c&0xFu)<<12) | (t1<<6) | t2;
     73 	  if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
     74 	    goto error;
     75 	  text += 2;
     76 	}
     77 	else
     78 	  goto error;
     79       }
     80       else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */
     81       {
     82 	unsigned int t1, t2, t3;
     83 	if (likely (2 < end - text &&
     84 		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
     85 		    (t2 = text[1] - 0x80u) <= 0x3Fu &&
     86 		    (t3 = text[2] - 0x80u) <= 0x3Fu))
     87 	{
     88 	  c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
     89 	  if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu)))
     90 	    goto error;
     91 	  text += 3;
     92 	}
     93 	else
     94 	  goto error;
     95       }
     96       else
     97 	goto error;
     98     }
     99 
    100     *unicode = c;
    101     return text;
    102 
    103   error:
    104     *unicode = replacement;
    105     return text;
    106   }
    107 
    108   static const codepoint_t *
    109   prev (const codepoint_t *text,
    110 	const codepoint_t *start,
    111 	hb_codepoint_t *unicode,
    112 	hb_codepoint_t replacement)
    113   {
    114     const codepoint_t *end = text--;
    115     while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
    116       text--;
    117 
    118     if (likely (next (text, end, unicode, replacement) == end))
    119       return text;
    120 
    121     *unicode = replacement;
    122     return end - 1;
    123   }
    124 
    125   static unsigned int
    126   strlen (const codepoint_t *text)
    127   { return ::strlen ((const char *) text); }
    128 
    129   static unsigned int
    130   encode_len (hb_codepoint_t unicode)
    131   {
    132     if (unicode <   0x0080u) return 1;
    133     if (unicode <   0x0800u) return 2;
    134     if (unicode <  0x10000u) return 3;
    135     if (unicode < 0x110000u) return 4;
    136     return 3;
    137   }
    138 
    139   static codepoint_t *
    140   encode (codepoint_t *text,
    141 	  const codepoint_t *end,
    142 	  hb_codepoint_t unicode)
    143   {
    144     if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu)))
    145       unicode = 0xFFFDu;
    146     if (unicode < 0x0080u)
    147      *text++ = unicode;
    148     else if (unicode < 0x0800u)
    149     {
    150       if (end - text >= 2)
    151       {
    152 	*text++ =  0xC0u + (0x1Fu & (unicode >>  6));
    153 	*text++ =  0x80u + (0x3Fu & (unicode      ));
    154       }
    155     }
    156     else if (unicode < 0x10000u)
    157     {
    158       if (end - text >= 3)
    159       {
    160 	*text++ =  0xE0u + (0x0Fu & (unicode >> 12));
    161 	*text++ =  0x80u + (0x3Fu & (unicode >>  6));
    162 	*text++ =  0x80u + (0x3Fu & (unicode      ));
    163       }
    164     }
    165     else
    166     {
    167       if (end - text >= 4)
    168       {
    169 	*text++ =  0xF0u + (0x07u & (unicode >> 18));
    170 	*text++ =  0x80u + (0x3Fu & (unicode >> 12));
    171 	*text++ =  0x80u + (0x3Fu & (unicode >>  6));
    172 	*text++ =  0x80u + (0x3Fu & (unicode      ));
    173       }
    174     }
    175     return text;
    176   }
    177 };
    178 
    179 
    180 template <typename TCodepoint>
    181 struct hb_utf16_xe_t
    182 {
    183   static_assert (sizeof (TCodepoint) == 2, "");
    184   typedef TCodepoint codepoint_t;
    185 
    186   static const codepoint_t *
    187   next (const codepoint_t *text,
    188 	const codepoint_t *end,
    189 	hb_codepoint_t *unicode,
    190 	hb_codepoint_t replacement)
    191   {
    192     hb_codepoint_t c = *text++;
    193 
    194     if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
    195     {
    196       *unicode = c;
    197       return text;
    198     }
    199 
    200     if (likely (c <= 0xDBFFu && text < end))
    201     {
    202       /* High-surrogate in c */
    203       hb_codepoint_t l = *text;
    204       if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu)))
    205       {
    206 	/* Low-surrogate in l */
    207 	*unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
    208 	 text++;
    209 	 return text;
    210       }
    211     }
    212 
    213     /* Lonely / out-of-order surrogate. */
    214     *unicode = replacement;
    215     return text;
    216   }
    217 
    218   static const codepoint_t *
    219   prev (const codepoint_t *text,
    220 	const codepoint_t *start,
    221 	hb_codepoint_t *unicode,
    222 	hb_codepoint_t replacement)
    223   {
    224     hb_codepoint_t c = *--text;
    225 
    226     if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
    227     {
    228       *unicode = c;
    229       return text;
    230     }
    231 
    232     if (likely (c >= 0xDC00u && start < text))
    233     {
    234       /* Low-surrogate in c */
    235       hb_codepoint_t h = text[-1];
    236       if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu)))
    237       {
    238         /* High-surrogate in h */
    239         *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u);
    240         text--;
    241         return text;
    242       }
    243     }
    244 
    245     /* Lonely / out-of-order surrogate. */
    246     *unicode = replacement;
    247     return text;
    248   }
    249 
    250 
    251   static unsigned int
    252   strlen (const codepoint_t *text)
    253   {
    254     unsigned int l = 0;
    255     while (*text++) l++;
    256     return l;
    257   }
    258 
    259   static unsigned int
    260   encode_len (hb_codepoint_t unicode)
    261   {
    262     return unicode < 0x10000 ? 1 : 2;
    263   }
    264 
    265   static codepoint_t *
    266   encode (codepoint_t *text,
    267 	  const codepoint_t *end,
    268 	  hb_codepoint_t unicode)
    269   {
    270     if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu)))
    271       unicode = 0xFFFDu;
    272     if (unicode < 0x10000u)
    273      *text++ = unicode;
    274     else if (end - text >= 2)
    275     {
    276       unicode -= 0x10000u;
    277       *text++ =  0xD800u + (unicode >> 10);
    278       *text++ =  0xDC00u + (unicode & 0x03FFu);
    279     }
    280     return text;
    281   }
    282 };
    283 
    284 typedef hb_utf16_xe_t<uint16_t> hb_utf16_t;
    285 typedef hb_utf16_xe_t<OT::HBUINT16> hb_utf16_be_t;
    286 
    287 
    288 template <typename TCodepoint, bool validate=true>
    289 struct hb_utf32_xe_t
    290 {
    291   static_assert (sizeof (TCodepoint) == 4, "");
    292   typedef TCodepoint codepoint_t;
    293 
    294   static const TCodepoint *
    295   next (const TCodepoint *text,
    296 	const TCodepoint *end HB_UNUSED,
    297 	hb_codepoint_t *unicode,
    298 	hb_codepoint_t replacement)
    299   {
    300     hb_codepoint_t c = *unicode = *text++;
    301     if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
    302       *unicode = replacement;
    303     return text;
    304   }
    305 
    306   static const TCodepoint *
    307   prev (const TCodepoint *text,
    308 	const TCodepoint *start HB_UNUSED,
    309 	hb_codepoint_t *unicode,
    310 	hb_codepoint_t replacement)
    311   {
    312     hb_codepoint_t c = *unicode = *--text;
    313     if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
    314       *unicode = replacement;
    315     return text;
    316   }
    317 
    318   static unsigned int
    319   strlen (const TCodepoint *text)
    320   {
    321     unsigned int l = 0;
    322     while (*text++) l++;
    323     return l;
    324   }
    325 
    326   static unsigned int
    327   encode_len (hb_codepoint_t unicode HB_UNUSED)
    328   {
    329     return 1;
    330   }
    331 
    332   static codepoint_t *
    333   encode (codepoint_t *text,
    334 	  const codepoint_t *end HB_UNUSED,
    335 	  hb_codepoint_t unicode)
    336   {
    337     if (validate && unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu)))
    338       unicode = 0xFFFDu;
    339     *text++ = unicode;
    340     return text;
    341   }
    342 };
    343 
    344 typedef hb_utf32_xe_t<uint32_t> hb_utf32_t;
    345 typedef hb_utf32_xe_t<uint32_t, false> hb_utf32_novalidate_t;
    346 
    347 
    348 struct hb_latin1_t
    349 {
    350   typedef uint8_t codepoint_t;
    351 
    352   static const codepoint_t *
    353   next (const codepoint_t *text,
    354 	const codepoint_t *end HB_UNUSED,
    355 	hb_codepoint_t *unicode,
    356 	hb_codepoint_t replacement HB_UNUSED)
    357   {
    358     *unicode = *text++;
    359     return text;
    360   }
    361 
    362   static const codepoint_t *
    363   prev (const codepoint_t *text,
    364 	const codepoint_t *start HB_UNUSED,
    365 	hb_codepoint_t *unicode,
    366 	hb_codepoint_t replacement HB_UNUSED)
    367   {
    368     *unicode = *--text;
    369     return text;
    370   }
    371 
    372   static unsigned int
    373   strlen (const codepoint_t *text)
    374   {
    375     unsigned int l = 0;
    376     while (*text++) l++;
    377     return l;
    378   }
    379 
    380   static unsigned int
    381   encode_len (hb_codepoint_t unicode HB_UNUSED)
    382   {
    383     return 1;
    384   }
    385 
    386   static codepoint_t *
    387   encode (codepoint_t *text,
    388 	  const codepoint_t *end HB_UNUSED,
    389 	  hb_codepoint_t unicode)
    390   {
    391     if (unlikely (unicode >= 0x0100u))
    392       unicode = '?';
    393     *text++ = unicode;
    394     return text;
    395   }
    396 };
    397 
    398 
    399 struct hb_ascii_t
    400 {
    401   typedef uint8_t codepoint_t;
    402 
    403   static const codepoint_t *
    404   next (const codepoint_t *text,
    405 	const codepoint_t *end HB_UNUSED,
    406 	hb_codepoint_t *unicode,
    407 	hb_codepoint_t replacement HB_UNUSED)
    408   {
    409     *unicode = *text++;
    410     if (*unicode >= 0x0080u)
    411       *unicode = replacement;
    412     return text;
    413   }
    414 
    415   static const codepoint_t *
    416   prev (const codepoint_t *text,
    417 	const codepoint_t *start HB_UNUSED,
    418 	hb_codepoint_t *unicode,
    419 	hb_codepoint_t replacement)
    420   {
    421     *unicode = *--text;
    422     if (*unicode >= 0x0080u)
    423       *unicode = replacement;
    424     return text;
    425   }
    426 
    427   static unsigned int
    428   strlen (const codepoint_t *text)
    429   {
    430     unsigned int l = 0;
    431     while (*text++) l++;
    432     return l;
    433   }
    434 
    435   static unsigned int
    436   encode_len (hb_codepoint_t unicode HB_UNUSED)
    437   {
    438     return 1;
    439   }
    440 
    441   static codepoint_t *
    442   encode (codepoint_t *text,
    443 	  const codepoint_t *end HB_UNUSED,
    444 	  hb_codepoint_t unicode)
    445   {
    446     if (unlikely (unicode >= 0x0080u))
    447       unicode = '?';
    448     *text++ = unicode;
    449     return text;
    450   }
    451 };
    452 
    453 #endif /* HB_UTF_HH */
    454