Home | History | Annotate | Download | only in unicode
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  utf16.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999sep09
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 /**
     18  * \file
     19  * \brief C API: 16-bit Unicode handling macros
     20  *
     21  * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
     22  *
     23  * For more information see utf.h and the ICU User Guide Strings chapter
     24  * (http://userguide.icu-project.org/strings).
     25  *
     26  * <em>Usage:</em>
     27  * ICU coding guidelines for if() statements should be followed when using these macros.
     28  * Compound statements (curly braces {}) must be used  for if-else-while...
     29  * bodies and all macro statements should be terminated with semicolon.
     30  */
     31 
     32 #ifndef __UTF16_H__
     33 #define __UTF16_H__
     34 
     35 #include "unicode/umachine.h"
     36 #ifndef __UTF_H__
     37 #   include "unicode/utf.h"
     38 #endif
     39 
     40 /* single-code point definitions -------------------------------------------- */
     41 
     42 /**
     43  * Does this code unit alone encode a code point (BMP, not a surrogate)?
     44  * @param c 16-bit code unit
     45  * @return TRUE or FALSE
     46  * @stable ICU 2.4
     47  */
     48 #define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
     49 
     50 /**
     51  * Is this code unit a lead surrogate (U+d800..U+dbff)?
     52  * @param c 16-bit code unit
     53  * @return TRUE or FALSE
     54  * @stable ICU 2.4
     55  */
     56 #define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
     57 
     58 /**
     59  * Is this code unit a trail surrogate (U+dc00..U+dfff)?
     60  * @param c 16-bit code unit
     61  * @return TRUE or FALSE
     62  * @stable ICU 2.4
     63  */
     64 #define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
     65 
     66 /**
     67  * Is this code unit a surrogate (U+d800..U+dfff)?
     68  * @param c 16-bit code unit
     69  * @return TRUE or FALSE
     70  * @stable ICU 2.4
     71  */
     72 #define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
     73 
     74 /**
     75  * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
     76  * is it a lead surrogate?
     77  * @param c 16-bit code unit
     78  * @return TRUE or FALSE
     79  * @stable ICU 2.4
     80  */
     81 #define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
     82 
     83 /**
     84  * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
     85  * is it a trail surrogate?
     86  * @param c 16-bit code unit
     87  * @return TRUE or FALSE
     88  * @stable ICU 4.2
     89  */
     90 #define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0)
     91 
     92 /**
     93  * Helper constant for U16_GET_SUPPLEMENTARY.
     94  * @internal
     95  */
     96 #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
     97 
     98 /**
     99  * Get a supplementary code point value (U+10000..U+10ffff)
    100  * from its lead and trail surrogates.
    101  * The result is undefined if the input values are not
    102  * lead and trail surrogates.
    103  *
    104  * @param lead lead surrogate (U+d800..U+dbff)
    105  * @param trail trail surrogate (U+dc00..U+dfff)
    106  * @return supplementary code point (U+10000..U+10ffff)
    107  * @stable ICU 2.4
    108  */
    109 #define U16_GET_SUPPLEMENTARY(lead, trail) \
    110     (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
    111 
    112 
    113 /**
    114  * Get the lead surrogate (0xd800..0xdbff) for a
    115  * supplementary code point (0x10000..0x10ffff).
    116  * @param supplementary 32-bit code point (U+10000..U+10ffff)
    117  * @return lead surrogate (U+d800..U+dbff) for supplementary
    118  * @stable ICU 2.4
    119  */
    120 #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
    121 
    122 /**
    123  * Get the trail surrogate (0xdc00..0xdfff) for a
    124  * supplementary code point (0x10000..0x10ffff).
    125  * @param supplementary 32-bit code point (U+10000..U+10ffff)
    126  * @return trail surrogate (U+dc00..U+dfff) for supplementary
    127  * @stable ICU 2.4
    128  */
    129 #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
    130 
    131 /**
    132  * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
    133  * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
    134  * @param c 32-bit code point
    135  * @return 1 or 2
    136  * @stable ICU 2.4
    137  */
    138 #define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
    139 
    140 /**
    141  * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
    142  * @return 2
    143  * @stable ICU 2.4
    144  */
    145 #define U16_MAX_LENGTH 2
    146 
    147 /**
    148  * Get a code point from a string at a random-access offset,
    149  * without changing the offset.
    150  * "Unsafe" macro, assumes well-formed UTF-16.
    151  *
    152  * The offset may point to either the lead or trail surrogate unit
    153  * for a supplementary code point, in which case the macro will read
    154  * the adjacent matching surrogate as well.
    155  * The result is undefined if the offset points to a single, unpaired surrogate.
    156  * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
    157  *
    158  * @param s const UChar * string
    159  * @param i string offset
    160  * @param c output UChar32 variable
    161  * @see U16_GET
    162  * @stable ICU 2.4
    163  */
    164 #define U16_GET_UNSAFE(s, i, c) { \
    165     (c)=(s)[i]; \
    166     if(U16_IS_SURROGATE(c)) { \
    167         if(U16_IS_SURROGATE_LEAD(c)) { \
    168             (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
    169         } else { \
    170             (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
    171         } \
    172     } \
    173 }
    174 
    175 /**
    176  * Get a code point from a string at a random-access offset,
    177  * without changing the offset.
    178  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    179  *
    180  * The offset may point to either the lead or trail surrogate unit
    181  * for a supplementary code point, in which case the macro will read
    182  * the adjacent matching surrogate as well.
    183  *
    184  * The length can be negative for a NUL-terminated string.
    185  *
    186  * If the offset points to a single, unpaired surrogate, then that itself
    187  * will be returned as the code point.
    188  * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
    189  *
    190  * @param s const UChar * string
    191  * @param start starting string offset (usually 0)
    192  * @param i string offset, must be start<=i<length
    193  * @param length string length
    194  * @param c output UChar32 variable
    195  * @see U16_GET_UNSAFE
    196  * @stable ICU 2.4
    197  */
    198 #define U16_GET(s, start, i, length, c) { \
    199     (c)=(s)[i]; \
    200     if(U16_IS_SURROGATE(c)) { \
    201         uint16_t __c2; \
    202         if(U16_IS_SURROGATE_LEAD(c)) { \
    203             if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
    204                 (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
    205             } \
    206         } else { \
    207             if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
    208                 (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
    209             } \
    210         } \
    211     } \
    212 }
    213 
    214 /* definitions with forward iteration --------------------------------------- */
    215 
    216 /**
    217  * Get a code point from a string at a code point boundary offset,
    218  * and advance the offset to the next code point boundary.
    219  * (Post-incrementing forward iteration.)
    220  * "Unsafe" macro, assumes well-formed UTF-16.
    221  *
    222  * The offset may point to the lead surrogate unit
    223  * for a supplementary code point, in which case the macro will read
    224  * the following trail surrogate as well.
    225  * If the offset points to a trail surrogate, then that itself
    226  * will be returned as the code point.
    227  * The result is undefined if the offset points to a single, unpaired lead surrogate.
    228  *
    229  * @param s const UChar * string
    230  * @param i string offset
    231  * @param c output UChar32 variable
    232  * @see U16_NEXT
    233  * @stable ICU 2.4
    234  */
    235 #define U16_NEXT_UNSAFE(s, i, c) { \
    236     (c)=(s)[(i)++]; \
    237     if(U16_IS_LEAD(c)) { \
    238         (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    239     } \
    240 }
    241 
    242 /**
    243  * Get a code point from a string at a code point boundary offset,
    244  * and advance the offset to the next code point boundary.
    245  * (Post-incrementing forward iteration.)
    246  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    247  *
    248  * The length can be negative for a NUL-terminated string.
    249  *
    250  * The offset may point to the lead surrogate unit
    251  * for a supplementary code point, in which case the macro will read
    252  * the following trail surrogate as well.
    253  * If the offset points to a trail surrogate or
    254  * to a single, unpaired lead surrogate, then that itself
    255  * will be returned as the code point.
    256  *
    257  * @param s const UChar * string
    258  * @param i string offset, must be i<length
    259  * @param length string length
    260  * @param c output UChar32 variable
    261  * @see U16_NEXT_UNSAFE
    262  * @stable ICU 2.4
    263  */
    264 #define U16_NEXT(s, i, length, c) { \
    265     (c)=(s)[(i)++]; \
    266     if(U16_IS_LEAD(c)) { \
    267         uint16_t __c2; \
    268         if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
    269             ++(i); \
    270             (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
    271         } \
    272     } \
    273 }
    274 
    275 /**
    276  * Append a code point to a string, overwriting 1 or 2 code units.
    277  * The offset points to the current end of the string contents
    278  * and is advanced (post-increment).
    279  * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
    280  * Otherwise, the result is undefined.
    281  *
    282  * @param s const UChar * string buffer
    283  * @param i string offset
    284  * @param c code point to append
    285  * @see U16_APPEND
    286  * @stable ICU 2.4
    287  */
    288 #define U16_APPEND_UNSAFE(s, i, c) { \
    289     if((uint32_t)(c)<=0xffff) { \
    290         (s)[(i)++]=(uint16_t)(c); \
    291     } else { \
    292         (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
    293         (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    294     } \
    295 }
    296 
    297 /**
    298  * Append a code point to a string, overwriting 1 or 2 code units.
    299  * The offset points to the current end of the string contents
    300  * and is advanced (post-increment).
    301  * "Safe" macro, checks for a valid code point.
    302  * If a surrogate pair is written, checks for sufficient space in the string.
    303  * If the code point is not valid or a trail surrogate does not fit,
    304  * then isError is set to TRUE.
    305  *
    306  * @param s const UChar * string buffer
    307  * @param i string offset, must be i<capacity
    308  * @param capacity size of the string buffer
    309  * @param c code point to append
    310  * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
    311  * @see U16_APPEND_UNSAFE
    312  * @stable ICU 2.4
    313  */
    314 #define U16_APPEND(s, i, capacity, c, isError) { \
    315     if((uint32_t)(c)<=0xffff) { \
    316         (s)[(i)++]=(uint16_t)(c); \
    317     } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
    318         (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
    319         (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    320     } else /* c>0x10ffff or not enough space */ { \
    321         (isError)=TRUE; \
    322     } \
    323 }
    324 
    325 /**
    326  * Advance the string offset from one code point boundary to the next.
    327  * (Post-incrementing iteration.)
    328  * "Unsafe" macro, assumes well-formed UTF-16.
    329  *
    330  * @param s const UChar * string
    331  * @param i string offset
    332  * @see U16_FWD_1
    333  * @stable ICU 2.4
    334  */
    335 #define U16_FWD_1_UNSAFE(s, i) { \
    336     if(U16_IS_LEAD((s)[(i)++])) { \
    337         ++(i); \
    338     } \
    339 }
    340 
    341 /**
    342  * Advance the string offset from one code point boundary to the next.
    343  * (Post-incrementing iteration.)
    344  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    345  *
    346  * The length can be negative for a NUL-terminated string.
    347  *
    348  * @param s const UChar * string
    349  * @param i string offset, must be i<length
    350  * @param length string length
    351  * @see U16_FWD_1_UNSAFE
    352  * @stable ICU 2.4
    353  */
    354 #define U16_FWD_1(s, i, length) { \
    355     if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \
    356         ++(i); \
    357     } \
    358 }
    359 
    360 /**
    361  * Advance the string offset from one code point boundary to the n-th next one,
    362  * i.e., move forward by n code points.
    363  * (Post-incrementing iteration.)
    364  * "Unsafe" macro, assumes well-formed UTF-16.
    365  *
    366  * @param s const UChar * string
    367  * @param i string offset
    368  * @param n number of code points to skip
    369  * @see U16_FWD_N
    370  * @stable ICU 2.4
    371  */
    372 #define U16_FWD_N_UNSAFE(s, i, n) { \
    373     int32_t __N=(n); \
    374     while(__N>0) { \
    375         U16_FWD_1_UNSAFE(s, i); \
    376         --__N; \
    377     } \
    378 }
    379 
    380 /**
    381  * Advance the string offset from one code point boundary to the n-th next one,
    382  * i.e., move forward by n code points.
    383  * (Post-incrementing iteration.)
    384  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    385  *
    386  * The length can be negative for a NUL-terminated string.
    387  *
    388  * @param s const UChar * string
    389  * @param i int32_t string offset, must be i<length
    390  * @param length int32_t string length
    391  * @param n number of code points to skip
    392  * @see U16_FWD_N_UNSAFE
    393  * @stable ICU 2.4
    394  */
    395 #define U16_FWD_N(s, i, length, n) { \
    396     int32_t __N=(n); \
    397     while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
    398         U16_FWD_1(s, i, length); \
    399         --__N; \
    400     } \
    401 }
    402 
    403 /**
    404  * Adjust a random-access offset to a code point boundary
    405  * at the start of a code point.
    406  * If the offset points to the trail surrogate of a surrogate pair,
    407  * then the offset is decremented.
    408  * Otherwise, it is not modified.
    409  * "Unsafe" macro, assumes well-formed UTF-16.
    410  *
    411  * @param s const UChar * string
    412  * @param i string offset
    413  * @see U16_SET_CP_START
    414  * @stable ICU 2.4
    415  */
    416 #define U16_SET_CP_START_UNSAFE(s, i) { \
    417     if(U16_IS_TRAIL((s)[i])) { \
    418         --(i); \
    419     } \
    420 }
    421 
    422 /**
    423  * Adjust a random-access offset to a code point boundary
    424  * at the start of a code point.
    425  * If the offset points to the trail surrogate of a surrogate pair,
    426  * then the offset is decremented.
    427  * Otherwise, it is not modified.
    428  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    429  *
    430  * @param s const UChar * string
    431  * @param start starting string offset (usually 0)
    432  * @param i string offset, must be start<=i
    433  * @see U16_SET_CP_START_UNSAFE
    434  * @stable ICU 2.4
    435  */
    436 #define U16_SET_CP_START(s, start, i) { \
    437     if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
    438         --(i); \
    439     } \
    440 }
    441 
    442 /* definitions with backward iteration -------------------------------------- */
    443 
    444 /**
    445  * Move the string offset from one code point boundary to the previous one
    446  * and get the code point between them.
    447  * (Pre-decrementing backward iteration.)
    448  * "Unsafe" macro, assumes well-formed UTF-16.
    449  *
    450  * The input offset may be the same as the string length.
    451  * If the offset is behind a trail surrogate unit
    452  * for a supplementary code point, then the macro will read
    453  * the preceding lead surrogate as well.
    454  * If the offset is behind a lead surrogate, then that itself
    455  * will be returned as the code point.
    456  * The result is undefined if the offset is behind a single, unpaired trail surrogate.
    457  *
    458  * @param s const UChar * string
    459  * @param i string offset
    460  * @param c output UChar32 variable
    461  * @see U16_PREV
    462  * @stable ICU 2.4
    463  */
    464 #define U16_PREV_UNSAFE(s, i, c) { \
    465     (c)=(s)[--(i)]; \
    466     if(U16_IS_TRAIL(c)) { \
    467         (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    468     } \
    469 }
    470 
    471 /**
    472  * Move the string offset from one code point boundary to the previous one
    473  * and get the code point between them.
    474  * (Pre-decrementing backward iteration.)
    475  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    476  *
    477  * The input offset may be the same as the string length.
    478  * If the offset is behind a trail surrogate unit
    479  * for a supplementary code point, then the macro will read
    480  * the preceding lead surrogate as well.
    481  * If the offset is behind a lead surrogate or behind a single, unpaired
    482  * trail surrogate, then that itself
    483  * will be returned as the code point.
    484  *
    485  * @param s const UChar * string
    486  * @param start starting string offset (usually 0)
    487  * @param i string offset, must be start<i
    488  * @param c output UChar32 variable
    489  * @see U16_PREV_UNSAFE
    490  * @stable ICU 2.4
    491  */
    492 #define U16_PREV(s, start, i, c) { \
    493     (c)=(s)[--(i)]; \
    494     if(U16_IS_TRAIL(c)) { \
    495         uint16_t __c2; \
    496         if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
    497             --(i); \
    498             (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
    499         } \
    500     } \
    501 }
    502 
    503 /**
    504  * Move the string offset from one code point boundary to the previous one.
    505  * (Pre-decrementing backward iteration.)
    506  * The input offset may be the same as the string length.
    507  * "Unsafe" macro, assumes well-formed UTF-16.
    508  *
    509  * @param s const UChar * string
    510  * @param i string offset
    511  * @see U16_BACK_1
    512  * @stable ICU 2.4
    513  */
    514 #define U16_BACK_1_UNSAFE(s, i) { \
    515     if(U16_IS_TRAIL((s)[--(i)])) { \
    516         --(i); \
    517     } \
    518 }
    519 
    520 /**
    521  * Move the string offset from one code point boundary to the previous one.
    522  * (Pre-decrementing backward iteration.)
    523  * The input offset may be the same as the string length.
    524  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    525  *
    526  * @param s const UChar * string
    527  * @param start starting string offset (usually 0)
    528  * @param i string offset, must be start<i
    529  * @see U16_BACK_1_UNSAFE
    530  * @stable ICU 2.4
    531  */
    532 #define U16_BACK_1(s, start, i) { \
    533     if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
    534         --(i); \
    535     } \
    536 }
    537 
    538 /**
    539  * Move the string offset from one code point boundary to the n-th one before it,
    540  * i.e., move backward by n code points.
    541  * (Pre-decrementing backward iteration.)
    542  * The input offset may be the same as the string length.
    543  * "Unsafe" macro, assumes well-formed UTF-16.
    544  *
    545  * @param s const UChar * string
    546  * @param i string offset
    547  * @param n number of code points to skip
    548  * @see U16_BACK_N
    549  * @stable ICU 2.4
    550  */
    551 #define U16_BACK_N_UNSAFE(s, i, n) { \
    552     int32_t __N=(n); \
    553     while(__N>0) { \
    554         U16_BACK_1_UNSAFE(s, i); \
    555         --__N; \
    556     } \
    557 }
    558 
    559 /**
    560  * Move the string offset from one code point boundary to the n-th one before it,
    561  * i.e., move backward by n code points.
    562  * (Pre-decrementing backward iteration.)
    563  * The input offset may be the same as the string length.
    564  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    565  *
    566  * @param s const UChar * string
    567  * @param start start of string
    568  * @param i string offset, must be start<i
    569  * @param n number of code points to skip
    570  * @see U16_BACK_N_UNSAFE
    571  * @stable ICU 2.4
    572  */
    573 #define U16_BACK_N(s, start, i, n) { \
    574     int32_t __N=(n); \
    575     while(__N>0 && (i)>(start)) { \
    576         U16_BACK_1(s, start, i); \
    577         --__N; \
    578     } \
    579 }
    580 
    581 /**
    582  * Adjust a random-access offset to a code point boundary after a code point.
    583  * If the offset is behind the lead surrogate of a surrogate pair,
    584  * then the offset is incremented.
    585  * Otherwise, it is not modified.
    586  * The input offset may be the same as the string length.
    587  * "Unsafe" macro, assumes well-formed UTF-16.
    588  *
    589  * @param s const UChar * string
    590  * @param i string offset
    591  * @see U16_SET_CP_LIMIT
    592  * @stable ICU 2.4
    593  */
    594 #define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
    595     if(U16_IS_LEAD((s)[(i)-1])) { \
    596         ++(i); \
    597     } \
    598 }
    599 
    600 /**
    601  * Adjust a random-access offset to a code point boundary after a code point.
    602  * If the offset is behind the lead surrogate of a surrogate pair,
    603  * then the offset is incremented.
    604  * Otherwise, it is not modified.
    605  * The input offset may be the same as the string length.
    606  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    607  *
    608  * The length can be negative for a NUL-terminated string.
    609  *
    610  * @param s const UChar * string
    611  * @param start int32_t starting string offset (usually 0)
    612  * @param i int32_t string offset, start<=i<=length
    613  * @param length int32_t string length
    614  * @see U16_SET_CP_LIMIT_UNSAFE
    615  * @stable ICU 2.4
    616  */
    617 #define U16_SET_CP_LIMIT(s, start, i, length) { \
    618     if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
    619         ++(i); \
    620     } \
    621 }
    622 
    623 #endif
    624