Home | History | Annotate | Download | only in unicode
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  utf16.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999sep09
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 /**
     18  * \file
     19  * \brief C API: 16-bit Unicode handling macros
     20  *
     21  * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
     22  * utf16.h is included by utf.h after unicode/umachine.h
     23  * and some common definitions.
     24  *
     25  * For more information see utf.h and the ICU User Guide Strings chapter
     26  * (http://icu-project.org/userguide/strings.html).
     27  *
     28  * <em>Usage:</em>
     29  * ICU coding guidelines for if() statements should be followed when using these macros.
     30  * Compound statements (curly braces {}) must be used  for if-else-while...
     31  * bodies and all macro statements should be terminated with semicolon.
     32  */
     33 
     34 #ifndef __UTF16_H__
     35 #define __UTF16_H__
     36 
     37 /* utf.h must be included first. */
     38 #ifndef __UTF_H__
     39 #   include "unicode/utf.h"
     40 #endif
     41 
     42 /* single-code point definitions -------------------------------------------- */
     43 
     44 /**
     45  * Does this code unit alone encode a code point (BMP, not a surrogate)?
     46  * @param c 16-bit code unit
     47  * @return TRUE or FALSE
     48  * @stable ICU 2.4
     49  */
     50 #define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
     51 
     52 /**
     53  * Is this code unit a lead surrogate (U+d800..U+dbff)?
     54  * @param c 16-bit code unit
     55  * @return TRUE or FALSE
     56  * @stable ICU 2.4
     57  */
     58 #define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
     59 
     60 /**
     61  * Is this code unit a trail surrogate (U+dc00..U+dfff)?
     62  * @param c 16-bit code unit
     63  * @return TRUE or FALSE
     64  * @stable ICU 2.4
     65  */
     66 #define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
     67 
     68 /**
     69  * Is this code unit a surrogate (U+d800..U+dfff)?
     70  * @param c 16-bit code unit
     71  * @return TRUE or FALSE
     72  * @stable ICU 2.4
     73  */
     74 #define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
     75 
     76 /**
     77  * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
     78  * is it a lead surrogate?
     79  * @param c 16-bit code unit
     80  * @return TRUE or FALSE
     81  * @stable ICU 2.4
     82  */
     83 #define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
     84 
     85 /**
     86  * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
     87  * is it a trail surrogate?
     88  * @param c 16-bit code unit
     89  * @return TRUE or FALSE
     90  * @stable ICU 4.2
     91  */
     92 #define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0)
     93 
     94 /**
     95  * Helper constant for U16_GET_SUPPLEMENTARY.
     96  * @internal
     97  */
     98 #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
     99 
    100 /**
    101  * Get a supplementary code point value (U+10000..U+10ffff)
    102  * from its lead and trail surrogates.
    103  * The result is undefined if the input values are not
    104  * lead and trail surrogates.
    105  *
    106  * @param lead lead surrogate (U+d800..U+dbff)
    107  * @param trail trail surrogate (U+dc00..U+dfff)
    108  * @return supplementary code point (U+10000..U+10ffff)
    109  * @stable ICU 2.4
    110  */
    111 #define U16_GET_SUPPLEMENTARY(lead, trail) \
    112     (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
    113 
    114 
    115 /**
    116  * Get the lead surrogate (0xd800..0xdbff) for a
    117  * supplementary code point (0x10000..0x10ffff).
    118  * @param supplementary 32-bit code point (U+10000..U+10ffff)
    119  * @return lead surrogate (U+d800..U+dbff) for supplementary
    120  * @stable ICU 2.4
    121  */
    122 #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
    123 
    124 /**
    125  * Get the trail surrogate (0xdc00..0xdfff) for a
    126  * supplementary code point (0x10000..0x10ffff).
    127  * @param supplementary 32-bit code point (U+10000..U+10ffff)
    128  * @return trail surrogate (U+dc00..U+dfff) for supplementary
    129  * @stable ICU 2.4
    130  */
    131 #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
    132 
    133 /**
    134  * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
    135  * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
    136  * @param c 32-bit code point
    137  * @return 1 or 2
    138  * @stable ICU 2.4
    139  */
    140 #define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
    141 
    142 /**
    143  * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
    144  * @return 2
    145  * @stable ICU 2.4
    146  */
    147 #define U16_MAX_LENGTH 2
    148 
    149 /**
    150  * Get a code point from a string at a random-access offset,
    151  * without changing the offset.
    152  * "Unsafe" macro, assumes well-formed UTF-16.
    153  *
    154  * The offset may point to either the lead or trail surrogate unit
    155  * for a supplementary code point, in which case the macro will read
    156  * the adjacent matching surrogate as well.
    157  * The result is undefined if the offset points to a single, unpaired surrogate.
    158  * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
    159  *
    160  * @param s const UChar * string
    161  * @param i string offset
    162  * @param c output UChar32 variable
    163  * @see U16_GET
    164  * @stable ICU 2.4
    165  */
    166 #define U16_GET_UNSAFE(s, i, c) { \
    167     (c)=(s)[i]; \
    168     if(U16_IS_SURROGATE(c)) { \
    169         if(U16_IS_SURROGATE_LEAD(c)) { \
    170             (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
    171         } else { \
    172             (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
    173         } \
    174     } \
    175 }
    176 
    177 /**
    178  * Get a code point from a string at a random-access offset,
    179  * without changing the offset.
    180  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    181  *
    182  * The offset may point to either the lead or trail surrogate unit
    183  * for a supplementary code point, in which case the macro will read
    184  * the adjacent matching surrogate as well.
    185  * If the offset points to a single, unpaired surrogate, then that itself
    186  * will be returned as the code point.
    187  * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
    188  *
    189  * @param s const UChar * string
    190  * @param start starting string offset (usually 0)
    191  * @param i string offset, must be start<=i<length
    192  * @param length string length
    193  * @param c output UChar32 variable
    194  * @see U16_GET_UNSAFE
    195  * @stable ICU 2.4
    196  */
    197 #define U16_GET(s, start, i, length, c) { \
    198     (c)=(s)[i]; \
    199     if(U16_IS_SURROGATE(c)) { \
    200         uint16_t __c2; \
    201         if(U16_IS_SURROGATE_LEAD(c)) { \
    202             if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
    203                 (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
    204             } \
    205         } else { \
    206             if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
    207                 (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
    208             } \
    209         } \
    210     } \
    211 }
    212 
    213 /* definitions with forward iteration --------------------------------------- */
    214 
    215 /**
    216  * Get a code point from a string at a code point boundary offset,
    217  * and advance the offset to the next code point boundary.
    218  * (Post-incrementing forward iteration.)
    219  * "Unsafe" macro, assumes well-formed UTF-16.
    220  *
    221  * The offset may point to the lead surrogate unit
    222  * for a supplementary code point, in which case the macro will read
    223  * the following trail surrogate as well.
    224  * If the offset points to a trail surrogate, then that itself
    225  * will be returned as the code point.
    226  * The result is undefined if the offset points to a single, unpaired lead surrogate.
    227  *
    228  * @param s const UChar * string
    229  * @param i string offset
    230  * @param c output UChar32 variable
    231  * @see U16_NEXT
    232  * @stable ICU 2.4
    233  */
    234 #define U16_NEXT_UNSAFE(s, i, c) { \
    235     (c)=(s)[(i)++]; \
    236     if(U16_IS_LEAD(c)) { \
    237         (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    238     } \
    239 }
    240 
    241 /**
    242  * Get a code point from a string at a code point boundary offset,
    243  * and advance the offset to the next code point boundary.
    244  * (Post-incrementing forward iteration.)
    245  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    246  *
    247  * The offset may point to the lead surrogate unit
    248  * for a supplementary code point, in which case the macro will read
    249  * the following trail surrogate as well.
    250  * If the offset points to a trail surrogate or
    251  * to a single, unpaired lead surrogate, then that itself
    252  * will be returned as the code point.
    253  *
    254  * @param s const UChar * string
    255  * @param i string offset, must be i<length
    256  * @param length string length
    257  * @param c output UChar32 variable
    258  * @see U16_NEXT_UNSAFE
    259  * @stable ICU 2.4
    260  */
    261 #define U16_NEXT(s, i, length, c) { \
    262     (c)=(s)[(i)++]; \
    263     if(U16_IS_LEAD(c)) { \
    264         uint16_t __c2; \
    265         if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
    266             ++(i); \
    267             (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
    268         } \
    269     } \
    270 }
    271 
    272 /**
    273  * Append a code point to a string, overwriting 1 or 2 code units.
    274  * The offset points to the current end of the string contents
    275  * and is advanced (post-increment).
    276  * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
    277  * Otherwise, the result is undefined.
    278  *
    279  * @param s const UChar * string buffer
    280  * @param i string offset
    281  * @param c code point to append
    282  * @see U16_APPEND
    283  * @stable ICU 2.4
    284  */
    285 #define U16_APPEND_UNSAFE(s, i, c) { \
    286     if((uint32_t)(c)<=0xffff) { \
    287         (s)[(i)++]=(uint16_t)(c); \
    288     } else { \
    289         (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
    290         (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    291     } \
    292 }
    293 
    294 /**
    295  * Append a code point to a string, overwriting 1 or 2 code units.
    296  * The offset points to the current end of the string contents
    297  * and is advanced (post-increment).
    298  * "Safe" macro, checks for a valid code point.
    299  * If a surrogate pair is written, checks for sufficient space in the string.
    300  * If the code point is not valid or a trail surrogate does not fit,
    301  * then isError is set to TRUE.
    302  *
    303  * @param s const UChar * string buffer
    304  * @param i string offset, must be i<capacity
    305  * @param capacity size of the string buffer
    306  * @param c code point to append
    307  * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
    308  * @see U16_APPEND_UNSAFE
    309  * @stable ICU 2.4
    310  */
    311 #define U16_APPEND(s, i, capacity, c, isError) { \
    312     if((uint32_t)(c)<=0xffff) { \
    313         (s)[(i)++]=(uint16_t)(c); \
    314     } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
    315         (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
    316         (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    317     } else /* c>0x10ffff or not enough space */ { \
    318         (isError)=TRUE; \
    319     } \
    320 }
    321 
    322 /**
    323  * Advance the string offset from one code point boundary to the next.
    324  * (Post-incrementing iteration.)
    325  * "Unsafe" macro, assumes well-formed UTF-16.
    326  *
    327  * @param s const UChar * string
    328  * @param i string offset
    329  * @see U16_FWD_1
    330  * @stable ICU 2.4
    331  */
    332 #define U16_FWD_1_UNSAFE(s, i) { \
    333     if(U16_IS_LEAD((s)[(i)++])) { \
    334         ++(i); \
    335     } \
    336 }
    337 
    338 /**
    339  * Advance the string offset from one code point boundary to the next.
    340  * (Post-incrementing iteration.)
    341  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    342  *
    343  * @param s const UChar * string
    344  * @param i string offset, must be i<length
    345  * @param length string length
    346  * @see U16_FWD_1_UNSAFE
    347  * @stable ICU 2.4
    348  */
    349 #define U16_FWD_1(s, i, length) { \
    350     if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { \
    351         ++(i); \
    352     } \
    353 }
    354 
    355 /**
    356  * Advance the string offset from one code point boundary to the n-th next one,
    357  * i.e., move forward by n code points.
    358  * (Post-incrementing iteration.)
    359  * "Unsafe" macro, assumes well-formed UTF-16.
    360  *
    361  * @param s const UChar * string
    362  * @param i string offset
    363  * @param n number of code points to skip
    364  * @see U16_FWD_N
    365  * @stable ICU 2.4
    366  */
    367 #define U16_FWD_N_UNSAFE(s, i, n) { \
    368     int32_t __N=(n); \
    369     while(__N>0) { \
    370         U16_FWD_1_UNSAFE(s, i); \
    371         --__N; \
    372     } \
    373 }
    374 
    375 /**
    376  * Advance the string offset from one code point boundary to the n-th next one,
    377  * i.e., move forward by n code points.
    378  * (Post-incrementing iteration.)
    379  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    380  *
    381  * @param s const UChar * string
    382  * @param i string offset, must be i<length
    383  * @param length string length
    384  * @param n number of code points to skip
    385  * @see U16_FWD_N_UNSAFE
    386  * @stable ICU 2.4
    387  */
    388 #define U16_FWD_N(s, i, length, n) { \
    389     int32_t __N=(n); \
    390     while(__N>0 && (i)<(length)) { \
    391         U16_FWD_1(s, i, length); \
    392         --__N; \
    393     } \
    394 }
    395 
    396 /**
    397  * Adjust a random-access offset to a code point boundary
    398  * at the start of a code point.
    399  * If the offset points to the trail surrogate of a surrogate pair,
    400  * then the offset is decremented.
    401  * Otherwise, it is not modified.
    402  * "Unsafe" macro, assumes well-formed UTF-16.
    403  *
    404  * @param s const UChar * string
    405  * @param i string offset
    406  * @see U16_SET_CP_START
    407  * @stable ICU 2.4
    408  */
    409 #define U16_SET_CP_START_UNSAFE(s, i) { \
    410     if(U16_IS_TRAIL((s)[i])) { \
    411         --(i); \
    412     } \
    413 }
    414 
    415 /**
    416  * Adjust a random-access offset to a code point boundary
    417  * at the start of a code point.
    418  * If the offset points to the trail surrogate of a surrogate pair,
    419  * then the offset is decremented.
    420  * Otherwise, it is not modified.
    421  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    422  *
    423  * @param s const UChar * string
    424  * @param start starting string offset (usually 0)
    425  * @param i string offset, must be start<=i
    426  * @see U16_SET_CP_START_UNSAFE
    427  * @stable ICU 2.4
    428  */
    429 #define U16_SET_CP_START(s, start, i) { \
    430     if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
    431         --(i); \
    432     } \
    433 }
    434 
    435 /* definitions with backward iteration -------------------------------------- */
    436 
    437 /**
    438  * Move the string offset from one code point boundary to the previous one
    439  * and get the code point between them.
    440  * (Pre-decrementing backward iteration.)
    441  * "Unsafe" macro, assumes well-formed UTF-16.
    442  *
    443  * The input offset may be the same as the string length.
    444  * If the offset is behind a trail surrogate unit
    445  * for a supplementary code point, then the macro will read
    446  * the preceding lead surrogate as well.
    447  * If the offset is behind a lead surrogate, then that itself
    448  * will be returned as the code point.
    449  * The result is undefined if the offset is behind a single, unpaired trail surrogate.
    450  *
    451  * @param s const UChar * string
    452  * @param i string offset
    453  * @param c output UChar32 variable
    454  * @see U16_PREV
    455  * @stable ICU 2.4
    456  */
    457 #define U16_PREV_UNSAFE(s, i, c) { \
    458     (c)=(s)[--(i)]; \
    459     if(U16_IS_TRAIL(c)) { \
    460         (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    461     } \
    462 }
    463 
    464 /**
    465  * Move the string offset from one code point boundary to the previous one
    466  * and get the code point between them.
    467  * (Pre-decrementing backward iteration.)
    468  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    469  *
    470  * The input offset may be the same as the string length.
    471  * If the offset is behind a trail surrogate unit
    472  * for a supplementary code point, then the macro will read
    473  * the preceding lead surrogate as well.
    474  * If the offset is behind a lead surrogate or behind a single, unpaired
    475  * trail surrogate, then that itself
    476  * will be returned as the code point.
    477  *
    478  * @param s const UChar * string
    479  * @param start starting string offset (usually 0)
    480  * @param i string offset, must be start<i
    481  * @param c output UChar32 variable
    482  * @see U16_PREV_UNSAFE
    483  * @stable ICU 2.4
    484  */
    485 #define U16_PREV(s, start, i, c) { \
    486     (c)=(s)[--(i)]; \
    487     if(U16_IS_TRAIL(c)) { \
    488         uint16_t __c2; \
    489         if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
    490             --(i); \
    491             (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
    492         } \
    493     } \
    494 }
    495 
    496 /**
    497  * Move the string offset from one code point boundary to the previous one.
    498  * (Pre-decrementing backward iteration.)
    499  * The input offset may be the same as the string length.
    500  * "Unsafe" macro, assumes well-formed UTF-16.
    501  *
    502  * @param s const UChar * string
    503  * @param i string offset
    504  * @see U16_BACK_1
    505  * @stable ICU 2.4
    506  */
    507 #define U16_BACK_1_UNSAFE(s, i) { \
    508     if(U16_IS_TRAIL((s)[--(i)])) { \
    509         --(i); \
    510     } \
    511 }
    512 
    513 /**
    514  * Move the string offset from one code point boundary to the previous one.
    515  * (Pre-decrementing backward iteration.)
    516  * The input offset may be the same as the string length.
    517  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    518  *
    519  * @param s const UChar * string
    520  * @param start starting string offset (usually 0)
    521  * @param i string offset, must be start<i
    522  * @see U16_BACK_1_UNSAFE
    523  * @stable ICU 2.4
    524  */
    525 #define U16_BACK_1(s, start, i) { \
    526     if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
    527         --(i); \
    528     } \
    529 }
    530 
    531 /**
    532  * Move the string offset from one code point boundary to the n-th one before it,
    533  * i.e., move backward by n code points.
    534  * (Pre-decrementing backward iteration.)
    535  * The input offset may be the same as the string length.
    536  * "Unsafe" macro, assumes well-formed UTF-16.
    537  *
    538  * @param s const UChar * string
    539  * @param i string offset
    540  * @param n number of code points to skip
    541  * @see U16_BACK_N
    542  * @stable ICU 2.4
    543  */
    544 #define U16_BACK_N_UNSAFE(s, i, n) { \
    545     int32_t __N=(n); \
    546     while(__N>0) { \
    547         U16_BACK_1_UNSAFE(s, i); \
    548         --__N; \
    549     } \
    550 }
    551 
    552 /**
    553  * Move the string offset from one code point boundary to the n-th one before it,
    554  * i.e., move backward by n code points.
    555  * (Pre-decrementing backward iteration.)
    556  * The input offset may be the same as the string length.
    557  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    558  *
    559  * @param s const UChar * string
    560  * @param start start of string
    561  * @param i string offset, must be start<i
    562  * @param n number of code points to skip
    563  * @see U16_BACK_N_UNSAFE
    564  * @stable ICU 2.4
    565  */
    566 #define U16_BACK_N(s, start, i, n) { \
    567     int32_t __N=(n); \
    568     while(__N>0 && (i)>(start)) { \
    569         U16_BACK_1(s, start, i); \
    570         --__N; \
    571     } \
    572 }
    573 
    574 /**
    575  * Adjust a random-access offset to a code point boundary after a code point.
    576  * If the offset is behind the lead surrogate of a surrogate pair,
    577  * then the offset is incremented.
    578  * Otherwise, it is not modified.
    579  * The input offset may be the same as the string length.
    580  * "Unsafe" macro, assumes well-formed UTF-16.
    581  *
    582  * @param s const UChar * string
    583  * @param i string offset
    584  * @see U16_SET_CP_LIMIT
    585  * @stable ICU 2.4
    586  */
    587 #define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
    588     if(U16_IS_LEAD((s)[(i)-1])) { \
    589         ++(i); \
    590     } \
    591 }
    592 
    593 /**
    594  * Adjust a random-access offset to a code point boundary after a code point.
    595  * If the offset is behind the lead surrogate of a surrogate pair,
    596  * then the offset is incremented.
    597  * Otherwise, it is not modified.
    598  * The input offset may be the same as the string length.
    599  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    600  *
    601  * @param s const UChar * string
    602  * @param start starting string offset (usually 0)
    603  * @param i string offset, start<=i<=length
    604  * @param length string length
    605  * @see U16_SET_CP_LIMIT_UNSAFE
    606  * @stable ICU 2.4
    607  */
    608 #define U16_SET_CP_LIMIT(s, start, i, length) { \
    609     if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
    610         ++(i); \
    611     } \
    612 }
    613 
    614 #endif
    615