Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright  2011,2012,2014  Google, Inc.
      3  *
      4  *  This is part of HarfBuzz, a text shaping library.
      5  *
      6  * Permission is hereby granted, without written agreement and without
      7  * license or royalty fees, to use, copy, modify, and distribute this
      8  * software and its documentation for any purpose, provided that the
      9  * above copyright notice and the following two paragraphs appear in
     10  * all copies of this software.
     11  *
     12  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
     13  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
     14  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
     15  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
     16  * DAMAGE.
     17  *
     18  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
     19  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
     20  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
     21  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
     22  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
     23  *
     24  * Google Author(s): Behdad Esfahbod
     25  */
     26 
     27 #ifndef HB_UTF_PRIVATE_HH
     28 #define HB_UTF_PRIVATE_HH
     29 
     30 #include "hb-private.hh"
     31 
     32 
     33 struct hb_utf8_t
     34 {
     35   typedef uint8_t codepoint_t;
     36 
     37   static inline const uint8_t *
     38   next (const uint8_t *text,
     39 	const uint8_t *end,
     40 	hb_codepoint_t *unicode,
     41 	hb_codepoint_t replacement)
     42   {
     43     /* Written to only accept well-formed sequences.
     44      * Based on ideas from ICU's U8_NEXT.
     45      * Generates one "replacement" for each ill-formed byte. */
     46 
     47     hb_codepoint_t c = *text++;
     48 
     49     if (c > 0x7Fu)
     50     {
     51       if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
     52       {
     53 	unsigned int t1;
     54 	if (likely (text < end &&
     55 		    (t1 = text[0] - 0x80u) <= 0x3Fu))
     56 	{
     57 	  c = ((c&0x1Fu)<<6) | t1;
     58 	  text++;
     59 	}
     60 	else
     61 	  goto error;
     62       }
     63       else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
     64       {
     65 	unsigned int t1, t2;
     66 	if (likely (1 < end - text &&
     67 		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
     68 		    (t2 = text[1] - 0x80u) <= 0x3Fu))
     69 	{
     70 	  c = ((c&0xFu)<<12) | (t1<<6) | t2;
     71 	  if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu)))
     72 	    goto error;
     73 	  text += 2;
     74 	}
     75 	else
     76 	  goto error;
     77       }
     78       else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
     79       {
     80 	unsigned int t1, t2, t3;
     81 	if (likely (2 < end - text &&
     82 		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
     83 		    (t2 = text[1] - 0x80u) <= 0x3Fu &&
     84 		    (t3 = text[2] - 0x80u) <= 0x3Fu))
     85 	{
     86 	  c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
     87 	  if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu)))
     88 	    goto error;
     89 	  text += 3;
     90 	}
     91 	else
     92 	  goto error;
     93       }
     94       else
     95 	goto error;
     96     }
     97 
     98     *unicode = c;
     99     return text;
    100 
    101   error:
    102     *unicode = replacement;
    103     return text;
    104   }
    105 
    106   static inline const uint8_t *
    107   prev (const uint8_t *text,
    108 	const uint8_t *start,
    109 	hb_codepoint_t *unicode,
    110 	hb_codepoint_t replacement)
    111   {
    112     const uint8_t *end = text--;
    113     while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
    114       text--;
    115 
    116     if (likely (next (text, end, unicode, replacement) == end))
    117       return text;
    118 
    119     *unicode = replacement;
    120     return end - 1;
    121   }
    122 
    123   static inline unsigned int
    124   strlen (const uint8_t *text)
    125   {
    126     return ::strlen ((const char *) text);
    127   }
    128 };
    129 
    130 
    131 struct hb_utf16_t
    132 {
    133   typedef uint16_t codepoint_t;
    134 
    135   static inline const uint16_t *
    136   next (const uint16_t *text,
    137 	const uint16_t *end,
    138 	hb_codepoint_t *unicode,
    139 	hb_codepoint_t replacement)
    140   {
    141     hb_codepoint_t c = *text++;
    142 
    143     if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
    144     {
    145       *unicode = c;
    146       return text;
    147     }
    148 
    149     if (likely (c <= 0xDBFFu && text < end))
    150     {
    151       /* High-surrogate in c */
    152       hb_codepoint_t l = *text;
    153       if (likely (hb_in_range (l, 0xDC00u, 0xDFFFu)))
    154       {
    155 	/* Low-surrogate in l */
    156 	*unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
    157 	 text++;
    158 	 return text;
    159       }
    160     }
    161 
    162     /* Lonely / out-of-order surrogate. */
    163     *unicode = replacement;
    164     return text;
    165   }
    166 
    167   static inline const uint16_t *
    168   prev (const uint16_t *text,
    169 	const uint16_t *start,
    170 	hb_codepoint_t *unicode,
    171 	hb_codepoint_t replacement)
    172   {
    173     hb_codepoint_t c = *--text;
    174 
    175     if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
    176     {
    177       *unicode = c;
    178       return text;
    179     }
    180 
    181     if (likely (c >= 0xDC00u && start < text))
    182     {
    183       /* Low-surrogate in c */
    184       hb_codepoint_t h = text[-1];
    185       if (likely (hb_in_range (h, 0xD800u, 0xDBFFu)))
    186       {
    187         /* High-surrogate in h */
    188         *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u);
    189         text--;
    190         return text;
    191       }
    192     }
    193 
    194     /* Lonely / out-of-order surrogate. */
    195     *unicode = replacement;
    196     return text;
    197   }
    198 
    199 
    200   static inline unsigned int
    201   strlen (const uint16_t *text)
    202   {
    203     unsigned int l = 0;
    204     while (*text++) l++;
    205     return l;
    206   }
    207 };
    208 
    209 
    210 template <bool validate=true>
    211 struct hb_utf32_t
    212 {
    213   typedef uint32_t codepoint_t;
    214 
    215   static inline const uint32_t *
    216   next (const uint32_t *text,
    217 	const uint32_t *end HB_UNUSED,
    218 	hb_codepoint_t *unicode,
    219 	hb_codepoint_t replacement)
    220   {
    221     hb_codepoint_t c = *unicode = *text++;
    222     if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
    223       *unicode = replacement;
    224     return text;
    225   }
    226 
    227   static inline const uint32_t *
    228   prev (const uint32_t *text,
    229 	const uint32_t *start HB_UNUSED,
    230 	hb_codepoint_t *unicode,
    231 	hb_codepoint_t replacement)
    232   {
    233     hb_codepoint_t c = *unicode = *--text;
    234     if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
    235       *unicode = replacement;
    236     return text;
    237   }
    238 
    239   static inline unsigned int
    240   strlen (const uint32_t *text)
    241   {
    242     unsigned int l = 0;
    243     while (*text++) l++;
    244     return l;
    245   }
    246 };
    247 
    248 
    249 struct hb_latin1_t
    250 {
    251   typedef uint8_t codepoint_t;
    252 
    253   static inline const uint8_t *
    254   next (const uint8_t *text,
    255 	const uint8_t *end HB_UNUSED,
    256 	hb_codepoint_t *unicode,
    257 	hb_codepoint_t replacement HB_UNUSED)
    258   {
    259     *unicode = *text++;
    260     return text;
    261   }
    262 
    263   static inline const uint8_t *
    264   prev (const uint8_t *text,
    265 	const uint8_t *start HB_UNUSED,
    266 	hb_codepoint_t *unicode,
    267 	hb_codepoint_t replacement)
    268   {
    269     *unicode = *--text;
    270     return text;
    271   }
    272 
    273   static inline unsigned int
    274   strlen (const uint8_t *text)
    275   {
    276     unsigned int l = 0;
    277     while (*text++) l++;
    278     return l;
    279   }
    280 };
    281 
    282 #endif /* HB_UTF_PRIVATE_HH */
    283