Home | History | Annotate | Download | only in unicode
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2004, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  utf16.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999sep09
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 /**
     18  * \file
     19  * \brief C API: 16-bit Unicode handling macros
     20  *
     21  * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
     22  * utf16.h is included by utf.h after unicode/umachine.h
     23  * and some common definitions.
     24  *
     25  * For more information see utf.h and the ICU User Guide Strings chapter
     26  * (http://oss.software.ibm.com/icu/userguide/).
     27  *
     28  * <em>Usage:</em>
     29  * ICU coding guidelines for if() statements should be followed when using these macros.
     30  * Compound statements (curly braces {}) must be used  for if-else-while...
     31  * bodies and all macro statements should be terminated with semicolon.
     32  */
     33 
     34 #ifndef __UTF16_H__
     35 #define __UTF16_H__
     36 
     37 /* utf.h must be included first. */
     38 #ifndef __UTF_H__
     39 #   include "unicode/utf.h"
     40 #endif
     41 
     42 /* single-code point definitions -------------------------------------------- */
     43 
     44 /**
     45  * Does this code unit alone encode a code point (BMP, not a surrogate)?
     46  * @param c 16-bit code unit
     47  * @return TRUE or FALSE
     48  * @stable ICU 2.4
     49  */
     50 #define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
     51 
     52 /**
     53  * Is this code unit a lead surrogate (U+d800..U+dbff)?
     54  * @param c 16-bit code unit
     55  * @return TRUE or FALSE
     56  * @stable ICU 2.4
     57  */
     58 #define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
     59 
     60 /**
     61  * Is this code unit a trail surrogate (U+dc00..U+dfff)?
     62  * @param c 16-bit code unit
     63  * @return TRUE or FALSE
     64  * @stable ICU 2.4
     65  */
     66 #define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
     67 
     68 /**
     69  * Is this code unit a surrogate (U+d800..U+dfff)?
     70  * @param c 16-bit code unit
     71  * @return TRUE or FALSE
     72  * @stable ICU 2.4
     73  */
     74 #define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
     75 
     76 /**
     77  * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
     78  * is it a lead surrogate?
     79  * @param c 16-bit code unit
     80  * @return TRUE or FALSE
     81  * @stable ICU 2.4
     82  */
     83 #define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
     84 
     85 /**
     86  * Helper constant for U16_GET_SUPPLEMENTARY.
     87  * @internal
     88  */
     89 #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
     90 
     91 /**
     92  * Get a supplementary code point value (U+10000..U+10ffff)
     93  * from its lead and trail surrogates.
     94  * The result is undefined if the input values are not
     95  * lead and trail surrogates.
     96  *
     97  * @param lead lead surrogate (U+d800..U+dbff)
     98  * @param trail trail surrogate (U+dc00..U+dfff)
     99  * @return supplementary code point (U+10000..U+10ffff)
    100  * @stable ICU 2.4
    101  */
    102 #define U16_GET_SUPPLEMENTARY(lead, trail) \
    103     (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
    104 
    105 
    106 /**
    107  * Get the lead surrogate (0xd800..0xdbff) for a
    108  * supplementary code point (0x10000..0x10ffff).
    109  * @param supplementary 32-bit code point (U+10000..U+10ffff)
    110  * @return lead surrogate (U+d800..U+dbff) for supplementary
    111  * @stable ICU 2.4
    112  */
    113 #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
    114 
    115 /**
    116  * Get the trail surrogate (0xdc00..0xdfff) for a
    117  * supplementary code point (0x10000..0x10ffff).
    118  * @param supplementary 32-bit code point (U+10000..U+10ffff)
    119  * @return trail surrogate (U+dc00..U+dfff) for supplementary
    120  * @stable ICU 2.4
    121  */
    122 #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
    123 
    124 /**
    125  * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
    126  * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
    127  * @param c 32-bit code point
    128  * @return 1 or 2
    129  * @stable ICU 2.4
    130  */
    131 #define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
    132 
    133 /**
    134  * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
    135  * @return 2
    136  * @stable ICU 2.4
    137  */
    138 #define U16_MAX_LENGTH 2
    139 
    140 /**
    141  * Get a code point from a string at a random-access offset,
    142  * without changing the offset.
    143  * "Unsafe" macro, assumes well-formed UTF-16.
    144  *
    145  * The offset may point to either the lead or trail surrogate unit
    146  * for a supplementary code point, in which case the macro will read
    147  * the adjacent matching surrogate as well.
    148  * The result is undefined if the offset points to a single, unpaired surrogate.
    149  * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
    150  *
    151  * @param s const UChar * string
    152  * @param i string offset
    153  * @param c output UChar32 variable
    154  * @see U16_GET
    155  * @stable ICU 2.4
    156  */
    157 #define U16_GET_UNSAFE(s, i, c) { \
    158     (c)=(s)[i]; \
    159     if(U16_IS_SURROGATE(c)) { \
    160         if(U16_IS_SURROGATE_LEAD(c)) { \
    161             (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
    162         } else { \
    163             (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
    164         } \
    165     } \
    166 }
    167 
    168 /**
    169  * Get a code point from a string at a random-access offset,
    170  * without changing the offset.
    171  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    172  *
    173  * The offset may point to either the lead or trail surrogate unit
    174  * for a supplementary code point, in which case the macro will read
    175  * the adjacent matching surrogate as well.
    176  * If the offset points to a single, unpaired surrogate, then that itself
    177  * will be returned as the code point.
    178  * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
    179  *
    180  * @param s const UChar * string
    181  * @param start starting string offset (usually 0)
    182  * @param i string offset, start<=i<length
    183  * @param length string length
    184  * @param c output UChar32 variable
    185  * @see U16_GET_UNSAFE
    186  * @stable ICU 2.4
    187  */
    188 #define U16_GET(s, start, i, length, c) { \
    189     (c)=(s)[i]; \
    190     if(U16_IS_SURROGATE(c)) { \
    191         uint16_t __c2; \
    192         if(U16_IS_SURROGATE_LEAD(c)) { \
    193             if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
    194                 (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
    195             } \
    196         } else { \
    197             if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
    198                 (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
    199             } \
    200         } \
    201     } \
    202 }
    203 
    204 /* definitions with forward iteration --------------------------------------- */
    205 
    206 /**
    207  * Get a code point from a string at a code point boundary offset,
    208  * and advance the offset to the next code point boundary.
    209  * (Post-incrementing forward iteration.)
    210  * "Unsafe" macro, assumes well-formed UTF-16.
    211  *
    212  * The offset may point to the lead surrogate unit
    213  * for a supplementary code point, in which case the macro will read
    214  * the following trail surrogate as well.
    215  * If the offset points to a trail surrogate, then that itself
    216  * will be returned as the code point.
    217  * The result is undefined if the offset points to a single, unpaired lead surrogate.
    218  *
    219  * @param s const UChar * string
    220  * @param i string offset
    221  * @param c output UChar32 variable
    222  * @see U16_NEXT
    223  * @stable ICU 2.4
    224  */
    225 #define U16_NEXT_UNSAFE(s, i, c) { \
    226     (c)=(s)[(i)++]; \
    227     if(U16_IS_LEAD(c)) { \
    228         (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    229     } \
    230 }
    231 
    232 /**
    233  * Get a code point from a string at a code point boundary offset,
    234  * and advance the offset to the next code point boundary.
    235  * (Post-incrementing forward iteration.)
    236  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    237  *
    238  * The offset may point to the lead surrogate unit
    239  * for a supplementary code point, in which case the macro will read
    240  * the following trail surrogate as well.
    241  * If the offset points to a trail surrogate or
    242  * to a single, unpaired lead surrogate, then that itself
    243  * will be returned as the code point.
    244  *
    245  * @param s const UChar * string
    246  * @param i string offset, i<length
    247  * @param length string length
    248  * @param c output UChar32 variable
    249  * @see U16_NEXT_UNSAFE
    250  * @stable ICU 2.4
    251  */
    252 #define U16_NEXT(s, i, length, c) { \
    253     (c)=(s)[(i)++]; \
    254     if(U16_IS_LEAD(c)) { \
    255         uint16_t __c2; \
    256         if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
    257             ++(i); \
    258             (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
    259         } \
    260     } \
    261 }
    262 
    263 /**
    264  * Append a code point to a string, overwriting 1 or 2 code units.
    265  * The offset points to the current end of the string contents
    266  * and is advanced (post-increment).
    267  * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
    268  * Otherwise, the result is undefined.
    269  *
    270  * @param s const UChar * string buffer
    271  * @param i string offset
    272  * @param c code point to append
    273  * @see U16_APPEND
    274  * @stable ICU 2.4
    275  */
    276 #define U16_APPEND_UNSAFE(s, i, c) { \
    277     if((uint32_t)(c)<=0xffff) { \
    278         (s)[(i)++]=(uint16_t)(c); \
    279     } else { \
    280         (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
    281         (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    282     } \
    283 }
    284 
    285 /**
    286  * Append a code point to a string, overwriting 1 or 2 code units.
    287  * The offset points to the current end of the string contents
    288  * and is advanced (post-increment).
    289  * "Safe" macro, checks for a valid code point.
    290  * If a surrogate pair is written, checks for sufficient space in the string.
    291  * If the code point is not valid or a trail surrogate does not fit,
    292  * then isError is set to TRUE.
    293  *
    294  * @param s const UChar * string buffer
    295  * @param i string offset, i<length
    296  * @param capacity size of the string buffer
    297  * @param c code point to append
    298  * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
    299  * @see U16_APPEND_UNSAFE
    300  * @stable ICU 2.4
    301  */
    302 #define U16_APPEND(s, i, capacity, c, isError) { \
    303     if((uint32_t)(c)<=0xffff) { \
    304         (s)[(i)++]=(uint16_t)(c); \
    305     } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
    306         (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
    307         (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    308     } else /* c>0x10ffff or not enough space */ { \
    309         (isError)=TRUE; \
    310     } \
    311 }
    312 
    313 /**
    314  * Advance the string offset from one code point boundary to the next.
    315  * (Post-incrementing iteration.)
    316  * "Unsafe" macro, assumes well-formed UTF-16.
    317  *
    318  * @param s const UChar * string
    319  * @param i string offset
    320  * @see U16_FWD_1
    321  * @stable ICU 2.4
    322  */
    323 #define U16_FWD_1_UNSAFE(s, i) { \
    324     if(U16_IS_LEAD((s)[(i)++])) { \
    325         ++(i); \
    326     } \
    327 }
    328 
    329 /**
    330  * Advance the string offset from one code point boundary to the next.
    331  * (Post-incrementing iteration.)
    332  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    333  *
    334  * @param s const UChar * string
    335  * @param i string offset, i<length
    336  * @param length string length
    337  * @see U16_FWD_1_UNSAFE
    338  * @stable ICU 2.4
    339  */
    340 #define U16_FWD_1(s, i, length) { \
    341     if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { \
    342         ++(i); \
    343     } \
    344 }
    345 
    346 /**
    347  * Advance the string offset from one code point boundary to the n-th next one,
    348  * i.e., move forward by n code points.
    349  * (Post-incrementing iteration.)
    350  * "Unsafe" macro, assumes well-formed UTF-16.
    351  *
    352  * @param s const UChar * string
    353  * @param i string offset
    354  * @param n number of code points to skip
    355  * @see U16_FWD_N
    356  * @stable ICU 2.4
    357  */
    358 #define U16_FWD_N_UNSAFE(s, i, n) { \
    359     int32_t __N=(n); \
    360     while(__N>0) { \
    361         U16_FWD_1_UNSAFE(s, i); \
    362         --__N; \
    363     } \
    364 }
    365 
    366 /**
    367  * Advance the string offset from one code point boundary to the n-th next one,
    368  * i.e., move forward by n code points.
    369  * (Post-incrementing iteration.)
    370  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    371  *
    372  * @param s const UChar * string
    373  * @param i string offset, i<length
    374  * @param length string length
    375  * @param n number of code points to skip
    376  * @see U16_FWD_N_UNSAFE
    377  * @stable ICU 2.4
    378  */
    379 #define U16_FWD_N(s, i, length, n) { \
    380     int32_t __N=(n); \
    381     while(__N>0 && (i)<(length)) { \
    382         U16_FWD_1(s, i, length); \
    383         --__N; \
    384     } \
    385 }
    386 
    387 /**
    388  * Adjust a random-access offset to a code point boundary
    389  * at the start of a code point.
    390  * If the offset points to the trail surrogate of a surrogate pair,
    391  * then the offset is decremented.
    392  * Otherwise, it is not modified.
    393  * "Unsafe" macro, assumes well-formed UTF-16.
    394  *
    395  * @param s const UChar * string
    396  * @param i string offset
    397  * @see U16_SET_CP_START
    398  * @stable ICU 2.4
    399  */
    400 #define U16_SET_CP_START_UNSAFE(s, i) { \
    401     if(U16_IS_TRAIL((s)[i])) { \
    402         --(i); \
    403     } \
    404 }
    405 
    406 /**
    407  * Adjust a random-access offset to a code point boundary
    408  * at the start of a code point.
    409  * If the offset points to the trail surrogate of a surrogate pair,
    410  * then the offset is decremented.
    411  * Otherwise, it is not modified.
    412  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    413  *
    414  * @param s const UChar * string
    415  * @param start starting string offset (usually 0)
    416  * @param i string offset, start<=i
    417  * @see U16_SET_CP_START_UNSAFE
    418  * @stable ICU 2.4
    419  */
    420 #define U16_SET_CP_START(s, start, i) { \
    421     if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
    422         --(i); \
    423     } \
    424 }
    425 
    426 /* definitions with backward iteration -------------------------------------- */
    427 
    428 /**
    429  * Move the string offset from one code point boundary to the previous one
    430  * and get the code point between them.
    431  * (Pre-decrementing backward iteration.)
    432  * "Unsafe" macro, assumes well-formed UTF-16.
    433  *
    434  * The input offset may be the same as the string length.
    435  * If the offset is behind a trail surrogate unit
    436  * for a supplementary code point, then the macro will read
    437  * the preceding lead surrogate as well.
    438  * If the offset is behind a lead surrogate, then that itself
    439  * will be returned as the code point.
    440  * The result is undefined if the offset is behind a single, unpaired trail surrogate.
    441  *
    442  * @param s const UChar * string
    443  * @param i string offset
    444  * @param c output UChar32 variable
    445  * @see U16_PREV
    446  * @stable ICU 2.4
    447  */
    448 #define U16_PREV_UNSAFE(s, i, c) { \
    449     (c)=(s)[--(i)]; \
    450     if(U16_IS_TRAIL(c)) { \
    451         (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    452     } \
    453 }
    454 
    455 /**
    456  * Move the string offset from one code point boundary to the previous one
    457  * and get the code point between them.
    458  * (Pre-decrementing backward iteration.)
    459  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    460  *
    461  * The input offset may be the same as the string length.
    462  * If the offset is behind a trail surrogate unit
    463  * for a supplementary code point, then the macro will read
    464  * the preceding lead surrogate as well.
    465  * If the offset is behind a lead surrogate or behind a single, unpaired
    466  * trail surrogate, then that itself
    467  * will be returned as the code point.
    468  *
    469  * @param s const UChar * string
    470  * @param start starting string offset (usually 0)
    471  * @param i string offset, start<=i
    472  * @param c output UChar32 variable
    473  * @see U16_PREV_UNSAFE
    474  * @stable ICU 2.4
    475  */
    476 #define U16_PREV(s, start, i, c) { \
    477     (c)=(s)[--(i)]; \
    478     if(U16_IS_TRAIL(c)) { \
    479         uint16_t __c2; \
    480         if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
    481             --(i); \
    482             (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
    483         } \
    484     } \
    485 }
    486 
    487 /**
    488  * Move the string offset from one code point boundary to the previous one.
    489  * (Pre-decrementing backward iteration.)
    490  * The input offset may be the same as the string length.
    491  * "Unsafe" macro, assumes well-formed UTF-16.
    492  *
    493  * @param s const UChar * string
    494  * @param i string offset
    495  * @see U16_BACK_1
    496  * @stable ICU 2.4
    497  */
    498 #define U16_BACK_1_UNSAFE(s, i) { \
    499     if(U16_IS_TRAIL((s)[--(i)])) { \
    500         --(i); \
    501     } \
    502 }
    503 
    504 /**
    505  * Move the string offset from one code point boundary to the previous one.
    506  * (Pre-decrementing backward iteration.)
    507  * The input offset may be the same as the string length.
    508  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    509  *
    510  * @param s const UChar * string
    511  * @param start starting string offset (usually 0)
    512  * @param i string offset, start<=i
    513  * @see U16_BACK_1_UNSAFE
    514  * @stable ICU 2.4
    515  */
    516 #define U16_BACK_1(s, start, i) { \
    517     if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
    518         --(i); \
    519     } \
    520 }
    521 
    522 /**
    523  * Move the string offset from one code point boundary to the n-th one before it,
    524  * i.e., move backward by n code points.
    525  * (Pre-decrementing backward iteration.)
    526  * The input offset may be the same as the string length.
    527  * "Unsafe" macro, assumes well-formed UTF-16.
    528  *
    529  * @param s const UChar * string
    530  * @param i string offset
    531  * @param n number of code points to skip
    532  * @see U16_BACK_N
    533  * @stable ICU 2.4
    534  */
    535 #define U16_BACK_N_UNSAFE(s, i, n) { \
    536     int32_t __N=(n); \
    537     while(__N>0) { \
    538         U16_BACK_1_UNSAFE(s, i); \
    539         --__N; \
    540     } \
    541 }
    542 
    543 /**
    544  * Move the string offset from one code point boundary to the n-th one before it,
    545  * i.e., move backward by n code points.
    546  * (Pre-decrementing backward iteration.)
    547  * The input offset may be the same as the string length.
    548  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    549  *
    550  * @param s const UChar * string
    551  * @param start start of string
    552  * @param i string offset, i<length
    553  * @param n number of code points to skip
    554  * @see U16_BACK_N_UNSAFE
    555  * @stable ICU 2.4
    556  */
    557 #define U16_BACK_N(s, start, i, n) { \
    558     int32_t __N=(n); \
    559     while(__N>0 && (i)>(start)) { \
    560         U16_BACK_1(s, start, i); \
    561         --__N; \
    562     } \
    563 }
    564 
    565 /**
    566  * Adjust a random-access offset to a code point boundary after a code point.
    567  * If the offset is behind the lead surrogate of a surrogate pair,
    568  * then the offset is incremented.
    569  * Otherwise, it is not modified.
    570  * The input offset may be the same as the string length.
    571  * "Unsafe" macro, assumes well-formed UTF-16.
    572  *
    573  * @param s const UChar * string
    574  * @param i string offset
    575  * @see U16_SET_CP_LIMIT
    576  * @stable ICU 2.4
    577  */
    578 #define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
    579     if(U16_IS_LEAD((s)[(i)-1])) { \
    580         ++(i); \
    581     } \
    582 }
    583 
    584 /**
    585  * Adjust a random-access offset to a code point boundary after a code point.
    586  * If the offset is behind the lead surrogate of a surrogate pair,
    587  * then the offset is incremented.
    588  * Otherwise, it is not modified.
    589  * The input offset may be the same as the string length.
    590  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
    591  *
    592  * @param s const UChar * string
    593  * @param start starting string offset (usually 0)
    594  * @param i string offset, start<=i<=length
    595  * @param length string length
    596  * @see U16_SET_CP_LIMIT_UNSAFE
    597  * @stable ICU 2.4
    598  */
    599 #define U16_SET_CP_LIMIT(s, start, i, length) { \
    600     if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
    601         ++(i); \
    602     } \
    603 }
    604 
    605 #endif
    606