Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright  2011,2012,2014  Google, Inc.
      3  *
      4  *  This is part of HarfBuzz, a text shaping library.
      5  *
      6  * Permission is hereby granted, without written agreement and without
      7  * license or royalty fees, to use, copy, modify, and distribute this
      8  * software and its documentation for any purpose, provided that the
      9  * above copyright notice and the following two paragraphs appear in
     10  * all copies of this software.
     11  *
     12  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
     13  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
     14  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
     15  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
     16  * DAMAGE.
     17  *
     18  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
     19  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
     20  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
     21  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
     22  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
     23  *
     24  * Google Author(s): Behdad Esfahbod
     25  */
     26 
     27 #ifndef HB_UTF_PRIVATE_HH
     28 #define HB_UTF_PRIVATE_HH
     29 
     30 #include "hb-private.hh"
     31 
     32 template <typename T, bool validate=true> struct hb_utf_t;
     33 
     34 
     35 /* UTF-8 */
     36 
     37 template <>
     38 struct hb_utf_t<uint8_t, true>
     39 {
     40   static inline const uint8_t *
     41   next (const uint8_t *text,
     42 	const uint8_t *end,
     43 	hb_codepoint_t *unicode,
     44 	hb_codepoint_t replacement)
     45   {
     46     /* Written to only accept well-formed sequences.
     47      * Based on ideas from ICU's U8_NEXT.
     48      * Generates one "replacement" for each ill-formed byte. */
     49 
     50     hb_codepoint_t c = *text++;
     51 
     52     if (c > 0x7Fu)
     53     {
     54       if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
     55       {
     56 	unsigned int t1;
     57 	if (likely (text < end &&
     58 		    (t1 = text[0] - 0x80u) <= 0x3Fu))
     59 	{
     60 	  c = ((c&0x1Fu)<<6) | t1;
     61 	  text++;
     62 	}
     63 	else
     64 	  goto error;
     65       }
     66       else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
     67       {
     68 	unsigned int t1, t2;
     69 	if (likely (1 < end - text &&
     70 		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
     71 		    (t2 = text[1] - 0x80u) <= 0x3Fu))
     72 	{
     73 	  c = ((c&0xFu)<<12) | (t1<<6) | t2;
     74 	  if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu)))
     75 	    goto error;
     76 	  text += 2;
     77 	}
     78 	else
     79 	  goto error;
     80       }
     81       else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
     82       {
     83 	unsigned int t1, t2, t3;
     84 	if (likely (2 < end - text &&
     85 		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
     86 		    (t2 = text[1] - 0x80u) <= 0x3Fu &&
     87 		    (t3 = text[2] - 0x80u) <= 0x3Fu))
     88 	{
     89 	  c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
     90 	  if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu)))
     91 	    goto error;
     92 	  text += 3;
     93 	}
     94 	else
     95 	  goto error;
     96       }
     97       else
     98 	goto error;
     99     }
    100 
    101     *unicode = c;
    102     return text;
    103 
    104   error:
    105     *unicode = replacement;
    106     return text;
    107   }
    108 
    109   static inline const uint8_t *
    110   prev (const uint8_t *text,
    111 	const uint8_t *start,
    112 	hb_codepoint_t *unicode,
    113 	hb_codepoint_t replacement)
    114   {
    115     const uint8_t *end = text--;
    116     while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
    117       text--;
    118 
    119     if (likely (next (text, end, unicode, replacement) == end))
    120       return text;
    121 
    122     *unicode = replacement;
    123     return end - 1;
    124   }
    125 
    126   static inline unsigned int
    127   strlen (const uint8_t *text)
    128   {
    129     return ::strlen ((const char *) text);
    130   }
    131 };
    132 
    133 
    134 /* UTF-16 */
    135 
    136 template <>
    137 struct hb_utf_t<uint16_t, true>
    138 {
    139   static inline const uint16_t *
    140   next (const uint16_t *text,
    141 	const uint16_t *end,
    142 	hb_codepoint_t *unicode,
    143 	hb_codepoint_t replacement)
    144   {
    145     hb_codepoint_t c = *text++;
    146 
    147     if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
    148     {
    149       *unicode = c;
    150       return text;
    151     }
    152 
    153     if (likely (hb_in_range (c, 0xD800u, 0xDBFFu)))
    154     {
    155       /* High-surrogate in c */
    156       hb_codepoint_t l;
    157       if (text < end && ((l = *text), likely (hb_in_range (l, 0xDC00u, 0xDFFFu))))
    158       {
    159 	/* Low-surrogate in l */
    160 	*unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
    161 	 text++;
    162 	 return text;
    163       }
    164     }
    165 
    166     /* Lonely / out-of-order surrogate. */
    167     *unicode = replacement;
    168     return text;
    169   }
    170 
    171   static inline const uint16_t *
    172   prev (const uint16_t *text,
    173 	const uint16_t *start,
    174 	hb_codepoint_t *unicode,
    175 	hb_codepoint_t replacement)
    176   {
    177     const uint16_t *end = text--;
    178     hb_codepoint_t c = *text;
    179 
    180     if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
    181     {
    182       *unicode = c;
    183       return text;
    184     }
    185 
    186     if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu)))
    187       text--;
    188 
    189     if (likely (next (text, end, unicode, replacement) == end))
    190       return text;
    191 
    192     *unicode = replacement;
    193     return end - 1;
    194   }
    195 
    196 
    197   static inline unsigned int
    198   strlen (const uint16_t *text)
    199   {
    200     unsigned int l = 0;
    201     while (*text++) l++;
    202     return l;
    203   }
    204 };
    205 
    206 
    207 /* UTF-32 */
    208 
    209 template <bool validate>
    210 struct hb_utf_t<uint32_t, validate>
    211 {
    212   static inline const uint32_t *
    213   next (const uint32_t *text,
    214 	const uint32_t *end HB_UNUSED,
    215 	hb_codepoint_t *unicode,
    216 	hb_codepoint_t replacement)
    217   {
    218     hb_codepoint_t c = *text++;
    219     if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu)))
    220       goto error;
    221     *unicode = c;
    222     return text;
    223 
    224   error:
    225     *unicode = replacement;
    226     return text;
    227   }
    228 
    229   static inline const uint32_t *
    230   prev (const uint32_t *text,
    231 	const uint32_t *start HB_UNUSED,
    232 	hb_codepoint_t *unicode,
    233 	hb_codepoint_t replacement)
    234   {
    235     next (text - 1, text, unicode, replacement);
    236     return text - 1;
    237   }
    238 
    239   static inline unsigned int
    240   strlen (const uint32_t *text)
    241   {
    242     unsigned int l = 0;
    243     while (*text++) l++;
    244     return l;
    245   }
    246 };
    247 
    248 
    249 #endif /* HB_UTF_PRIVATE_HH */
    250