Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 * Copyright (c) 1996-2008, International Business Machines
      4 * Corporation and others. All Rights Reserved.
      5 ******************************************************************************
      6 * File unorm.cpp
      7 *
      8 * Created by: Vladimir Weinstein 12052000
      9 *
     10 * Modification history :
     11 *
     12 * Date        Name        Description
     13 * 02/01/01    synwee      Added normalization quickcheck enum and method.
     14 * 02/12/01    synwee      Commented out quickcheck util api has been approved
     15 *                         Added private method for doing FCD checks
     16 * 02/23/01    synwee      Modified quickcheck and checkFCE to run through
     17 *                         string for codepoints < 0x300 for the normalization
     18 *                         mode NFC.
     19 * 05/25/01+   Markus Scherer total rewrite, implement all normalization here
     20 *                         instead of just wrappers around normlzr.cpp,
     21 *                         load unorm.dat, support Unicode 3.1 with
     22 *                         supplementary code points, etc.
     23 */
     24 
     25 #include "unicode/utypes.h"
     26 
     27 #if !UCONFIG_NO_NORMALIZATION
     28 
     29 #include "unicode/udata.h"
     30 #include "unicode/uchar.h"
     31 #include "unicode/ustring.h"
     32 #include "unicode/uiter.h"
     33 #include "unicode/uniset.h"
     34 #include "unicode/usetiter.h"
     35 #include "unicode/unorm.h"
     36 #include "ucln_cmn.h"
     37 #include "unormimp.h"
     38 #include "ucase.h"
     39 #include "cmemory.h"
     40 #include "umutex.h"
     41 #include "utrie2.h"
     42 #include "unicode/uset.h"
     43 #include "udataswp.h"
     44 #include "putilimp.h"
     45 
     46 /*
     47  * Status of tailored normalization
     48  *
     49  * This was done initially for investigation on Unicode public review issue 7
     50  * (http://www.unicode.org/review/). See Jitterbug 2481.
     51  * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
     52  * a permanent feature in ICU 2.6 in support of IDNA which requires true
     53  * Unicode 3.2 normalization.
     54  * (NormalizationCorrections are rolled into IDNA mapping tables.)
     55  *
     56  * Tailored normalization as implemented here allows to "normalize less"
     57  * than full Unicode normalization would.
     58  * Based internally on a UnicodeSet of code points that are
     59  * "excluded from normalization", the normalization functions leave those
     60  * code points alone ("inert"). This means that tailored normalization
     61  * still transforms text into a canonically equivalent form.
     62  * It does not add decompositions to code points that do not have any or
     63  * change decomposition results.
     64  *
     65  * Any function that searches for a safe boundary has not been touched,
     66  * which means that these functions will be over-pessimistic when
     67  * exclusions are applied.
     68  * This should not matter because subsequent checks and normalizations
     69  * do apply the exclusions; only a little more of the text may be processed
     70  * than necessary under exclusions.
     71  *
     72  * Normalization exclusions have the following effect on excluded code points c:
     73  * - c is not decomposed
     74  * - c is not a composition target
     75  * - c does not combine forward or backward for composition
     76  *   except that this is not implemented for Jamo
     77  * - c is treated as having a combining class of 0
     78  */
     79 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     80 
     81 U_NAMESPACE_USE
     82 
     83 /*
     84  * This new implementation of the normalization code loads its data from
     85  * unorm.dat, which is generated with the gennorm tool.
     86  * The format of that file is described in unormimp.h .
     87  */
     88 
     89 /* -------------------------------------------------------------------------- */
     90 
     91 enum {
     92     _STACK_BUFFER_CAPACITY=100
     93 };
     94 
     95 /*
     96  * Constants for the bit fields in the options bit set parameter.
     97  * These need not be public.
     98  * A user only needs to know the currently assigned values.
     99  * The number and positions of reserved bits per field can remain private
    100  * and may change in future implementations.
    101  */
    102 enum {
    103     _NORM_OPTIONS_NX_MASK=0x1f,
    104     _NORM_OPTIONS_UNICODE_MASK=0x60,
    105     _NORM_OPTIONS_SETS_MASK=0x7f,
    106 
    107     _NORM_OPTIONS_UNICODE_SHIFT=5,
    108 
    109     /*
    110      * The following options are used only in some composition functions.
    111      * They use bits 12 and up to preserve lower bits for the available options
    112      * space in unorm_compare() -
    113      * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
    114      */
    115 
    116     /** Options bit 12, for compatibility vs. canonical decomposition. */
    117     _NORM_OPTIONS_COMPAT=0x1000,
    118     /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
    119     _NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000
    120 };
    121 
    122 U_CDECL_BEGIN
    123 static inline UBool
    124 isHangulWithoutJamoT(UChar c) {
    125     c-=HANGUL_BASE;
    126     return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
    127 }
    128 
    129 /* norm32 helpers */
    130 
    131 /* is this a norm32 with a regular index? */
    132 static inline UBool
    133 isNorm32Regular(uint32_t norm32) {
    134     return norm32<_NORM_MIN_SPECIAL;
    135 }
    136 
    137 #if 0  // Code changed to use U16_IS_LEAD(c) instead.
    138 /* is this a norm32 with a special index for a lead surrogate? */
    139 static inline UBool
    140 isNorm32LeadSurrogate(uint32_t norm32) {
    141     return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
    142 }
    143 #endif
    144 
    145 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
    146 static inline UBool
    147 isNorm32HangulOrJamo(uint32_t norm32) {
    148     return norm32>=_NORM_MIN_HANGUL;
    149 }
    150 
    151 /*
    152  * Given isNorm32HangulOrJamo(),
    153  * is this a Hangul syllable or a Jamo?
    154  */
    155 /*static inline UBool
    156 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
    157     return norm32<_NORM_MIN_JAMO_V;
    158 }*/
    159 
    160 /*
    161  * Given norm32 for Jamo V or T,
    162  * is this a Jamo V?
    163  */
    164 static inline UBool
    165 isJamoVTNorm32JamoV(uint32_t norm32) {
    166     return norm32<_NORM_JAMO_V_TOP;
    167 }
    168 U_CDECL_END
    169 
    170 /* load unorm.dat ----------------------------------------------------------- */
    171 
    172 #define UNORM_HARDCODE_DATA 1
    173 
    174 #if UNORM_HARDCODE_DATA
    175 
    176 /* unorm_props_data.c is machine-generated by gennorm --csource */
    177 #include "unorm_props_data.c"
    178 
    179 static const UBool formatVersion_2_2=TRUE;
    180 
    181 #else
    182 
    183 #define DATA_NAME "unorm"
    184 #define DATA_TYPE "icu"
    185 
    186 static UDataMemory *normData=NULL;
    187 static UErrorCode dataErrorCode=U_ZERO_ERROR;
    188 static int8_t haveNormData=0;
    189 
    190 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
    191 static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
    192 
    193 /*
    194  * pointers into the memory-mapped unorm.icu
    195  */
    196 static const uint16_t *extraData=NULL,
    197                       *combiningTable=NULL,
    198                       *canonStartSets=NULL;
    199 
    200 static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
    201 static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
    202 
    203 /* the Unicode version of the normalization data */
    204 static UVersionInfo dataVersion={ 0, 0, 0, 0 };
    205 
    206 #endif
    207 
    208 /* cache UnicodeSets for each combination of exclusion flags */
    209 static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
    210 
    211 U_CDECL_BEGIN
    212 
    213 static UBool U_CALLCONV
    214 unorm_cleanup(void) {
    215     int32_t i;
    216 
    217 #if !UNORM_HARDCODE_DATA
    218     if(normData!=NULL) {
    219         udata_close(normData);
    220         normData=NULL;
    221     }
    222     dataErrorCode=U_ZERO_ERROR;
    223     haveNormData=0;
    224 #endif
    225 
    226     for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
    227         if (nxCache[i]) {
    228             delete nxCache[i];
    229             nxCache[i] = 0;
    230         }
    231     }
    232 
    233     return TRUE;
    234 }
    235 
    236 #if !UNORM_HARDCODE_DATA
    237 
    238 static UBool U_CALLCONV
    239 isAcceptable(void * /* context */,
    240              const char * /* type */, const char * /* name */,
    241              const UDataInfo *pInfo) {
    242     if(
    243         pInfo->size>=20 &&
    244         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
    245         pInfo->charsetFamily==U_CHARSET_FAMILY &&
    246         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Norm" */
    247         pInfo->dataFormat[1]==0x6f &&
    248         pInfo->dataFormat[2]==0x72 &&
    249         pInfo->dataFormat[3]==0x6d &&
    250         pInfo->formatVersion[0]==2 &&
    251         pInfo->formatVersion[2]==UTRIE_SHIFT &&
    252         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
    253     ) {
    254         uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
    255         uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
    256         return TRUE;
    257     } else {
    258         return FALSE;
    259     }
    260 }
    261 
    262 #endif
    263 
    264 static UBool U_CALLCONV
    265 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    266     /* add the start code point to the USet */
    267     const USetAdder *sa=(const USetAdder *)context;
    268     sa->add(sa->set, start);
    269     return TRUE;
    270 }
    271 
    272 U_CDECL_END
    273 
    274 #if !UNORM_HARDCODE_DATA
    275 
    276 static int8_t
    277 loadNormData(UErrorCode &errorCode) {
    278     /* load Unicode normalization data from file */
    279 
    280     /*
    281      * This lazy intialization with double-checked locking (without mutex protection for
    282      * haveNormData==0) is transiently unsafe under certain circumstances.
    283      * Check the readme and use u_init() if necessary.
    284      *
    285      * While u_init() initializes the main normalization data via this functions,
    286      * it does not do so for exclusion sets (which are fully mutexed).
    287      * This is because
    288      * - there can be many exclusion sets
    289      * - they are rarely used
    290      * - they are not usually used in execution paths that are
    291      *   as performance-sensitive as others
    292      *   (e.g., IDNA takes more time than unorm_quickCheck() anyway)
    293      */
    294     if(haveNormData==0) {
    295         UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
    296         UDataMemory *data;
    297 
    298         const int32_t *p=NULL;
    299         const uint8_t *pb;
    300 
    301         if(&errorCode==NULL || U_FAILURE(errorCode)) {
    302             return 0;
    303         }
    304 
    305         /* open the data outside the mutex block */
    306         data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
    307         dataErrorCode=errorCode;
    308         if(U_FAILURE(errorCode)) {
    309             return haveNormData=-1;
    310         }
    311 
    312         p=(const int32_t *)udata_getMemory(data);
    313         pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
    314         utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
    315         _normTrie.getFoldingOffset=getFoldingNormOffset;
    316 
    317         pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
    318         if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
    319             utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
    320         }
    321         pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
    322 
    323         if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
    324             utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
    325             _auxTrie.getFoldingOffset=getFoldingAuxOffset;
    326         }
    327 
    328         if(U_FAILURE(errorCode)) {
    329             dataErrorCode=errorCode;
    330             udata_close(data);
    331             return haveNormData=-1;
    332         }
    333 
    334         /* in the mutex block, set the data for this process */
    335         umtx_lock(NULL);
    336         if(normData==NULL) {
    337             normData=data;
    338             data=NULL;
    339 
    340             uprv_memcpy(&indexes, p, sizeof(indexes));
    341             uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
    342             uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
    343             uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
    344         } else {
    345             p=(const int32_t *)udata_getMemory(normData);
    346         }
    347 
    348         /* initialize some variables */
    349         extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
    350         combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
    351         formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
    352         formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
    353         if(formatVersion_2_1) {
    354             canonStartSets=combiningTable+
    355                 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
    356                 (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
    357         }
    358         haveNormData=1;
    359         ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
    360         umtx_unlock(NULL);
    361 
    362         /* if a different thread set it first, then close the extra data */
    363         if(data!=NULL) {
    364             udata_close(data); /* NULL if it was set correctly */
    365         }
    366     }
    367 
    368     return haveNormData;
    369 }
    370 
    371 #endif
    372 
    373 static inline UBool
    374 _haveData(UErrorCode &errorCode) {
    375 #if UNORM_HARDCODE_DATA
    376     return U_SUCCESS(errorCode);
    377 #else
    378     if(U_FAILURE(errorCode)) {
    379         return FALSE;
    380     } else if(haveNormData>0) {
    381         return TRUE;
    382     } else if(haveNormData<0) {
    383         errorCode=dataErrorCode;
    384         return FALSE;
    385     } else /* haveNormData==0 */ {
    386         return (UBool)(loadNormData(errorCode)>0);
    387     }
    388 #endif
    389 }
    390 
    391 U_CAPI UBool U_EXPORT2
    392 unorm_haveData(UErrorCode *pErrorCode) {
    393     return _haveData(*pErrorCode);
    394 }
    395 
    396 U_CAPI const uint16_t * U_EXPORT2
    397 unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode) {
    398     if(_haveData(*pErrorCode)) {
    399         fcdHighStart=fcdTrie.highStart;
    400         return fcdTrie.index;
    401     } else {
    402         return NULL;
    403     }
    404 }
    405 
    406 /* data access primitives --------------------------------------------------- */
    407 
    408 static inline uint32_t
    409 _getNorm32(UChar c) {
    410     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(&normTrie, c);
    411 }
    412 
    413 static inline uint32_t
    414 _getNorm32FromSurrogatePair(UChar c, UChar c2) {
    415     UChar32 cp=U16_GET_SUPPLEMENTARY(c, c2);
    416     return UTRIE2_GET32_FROM_SUPP(&normTrie, cp);
    417 }
    418 
    419 /*
    420  * get a norm32 from text with complete code points
    421  * (like from decompositions)
    422  */
    423 static inline uint32_t
    424 _getNorm32(const UChar *p, uint32_t mask) {
    425     UChar c=*p;
    426     uint32_t norm32=_getNorm32(c);
    427     if((norm32&mask) && U16_IS_LEAD(c)) {
    428         /* c is a lead surrogate, get the real norm32 */
    429         norm32=_getNorm32FromSurrogatePair(c, *(p+1));
    430     }
    431     return norm32;
    432 }
    433 
    434 static inline uint16_t
    435 _getFCD16(UChar c) {
    436     return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(&fcdTrie, c);
    437 }
    438 
    439 static inline uint16_t
    440 _getFCD16FromSurrogatePair(UChar c, UChar c2) {
    441     UChar32 cp=U16_GET_SUPPLEMENTARY(c, c2);
    442     return UTRIE2_GET16_FROM_SUPP(&fcdTrie, cp);
    443 }
    444 
    445 static inline const uint16_t *
    446 _getExtraData(uint32_t norm32) {
    447     return extraData+(norm32>>_NORM_EXTRA_SHIFT);
    448 }
    449 
    450 /*
    451  * TODO(markus): Revisit if it makes sense for functions like _getNextCC()
    452  * and their call sites, and a fair bit of other code here, to work with UTF-16 code units,
    453  * or whether code simplification would suggest just using UChar32 and maybe UTRIE2_NEXT32().
    454  */
    455 
    456 #if 0
    457 /*
    458  * It is possible to get the FCD data from the main trie if unorm.icu
    459  * was built without the FCD trie, although it is slower.
    460  * This is not implemented because it is hard to test, and because it seems
    461  * unusual to want to use FCD and not build the data file for it.
    462  *
    463  * Untested sample code:
    464  */
    465 static inline uint16_t
    466 _getFCD16FromNormData(UChar32 c) {
    467     uint32_t norm32, fcd;
    468 
    469     norm32=_getNorm32(c);
    470     if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) {
    471         /* get the lead/trail cc from the decomposition data */
    472         const uint16_t *nfd=_getExtraData(norm32);
    473         if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
    474             fcd=nfd[1];
    475         }
    476     } else {
    477         fcd=norm32&_NORM_CC_MASK;
    478         if(fcd!=0) {
    479             /* use the code point cc value for both lead and trail cc's */
    480             fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */
    481         }
    482     }
    483 
    484     return (uint16_t)fcd;
    485 }
    486 #endif
    487 
    488 /* normalization exclusion sets --------------------------------------------- */
    489 
    490 /*
    491  * Normalization exclusion UnicodeSets are used for tailored normalization;
    492  * see the comment near the beginning of this file.
    493  *
    494  * By specifying one or several sets of code points,
    495  * those code points become inert for normalization.
    496  */
    497 
    498 static const UnicodeSet *
    499 internalGetNXHangul(UErrorCode &errorCode) {
    500     /* internal function, does not check for incoming U_FAILURE */
    501     UBool isCached;
    502 
    503     UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached);
    504 
    505     if(!isCached) {
    506         UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3);
    507         if(set==NULL) {
    508             errorCode=U_MEMORY_ALLOCATION_ERROR;
    509             return NULL;
    510         }
    511         // Compact the set for caching.
    512         set->compact();
    513 
    514         umtx_lock(NULL);
    515         if(nxCache[UNORM_NX_HANGUL]==NULL) {
    516             nxCache[UNORM_NX_HANGUL]=set;
    517             set=NULL;
    518             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
    519         }
    520         umtx_unlock(NULL);
    521 
    522         delete set;
    523     }
    524 
    525     return nxCache[UNORM_NX_HANGUL];
    526 }
    527 
    528 /* unorm.cpp 1.116 had and used
    529 static const UnicodeSet *
    530 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
    531     ...
    532 }
    533 */
    534 
    535 /* get and set an exclusion set from a serialized UnicodeSet */
    536 static const UnicodeSet *
    537 internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
    538     /* internal function, does not check for incoming U_FAILURE */
    539     UBool isCached;
    540 
    541     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
    542 
    543     if( !isCached &&
    544         canonStartSets!=NULL &&
    545         canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
    546     ) {
    547         USerializedSet sset;
    548         UnicodeSet *set;
    549         UChar32 start, end;
    550         int32_t i;
    551 
    552         if( !uset_getSerializedSet(
    553                     &sset,
    554                     canonStartSets+canonStartSets[nxIndex],
    555                     canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
    556         ) {
    557             errorCode=U_INVALID_FORMAT_ERROR;
    558             return NULL;
    559         }
    560 
    561         /* turn the serialized set into a UnicodeSet */
    562         set=new UnicodeSet();
    563         if(set==NULL) {
    564             errorCode=U_MEMORY_ALLOCATION_ERROR;
    565             return NULL;
    566         }
    567         for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
    568             set->add(start, end);
    569         }
    570         // Compact the set for caching.
    571         set->compact();
    572 
    573         umtx_lock(NULL);
    574         if(nxCache[options]==NULL) {
    575             nxCache[options]=set;
    576             set=NULL;
    577             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
    578         }
    579         umtx_unlock(NULL);
    580 
    581         delete set;
    582     }
    583 
    584     return nxCache[options];
    585 }
    586 
    587 static const UnicodeSet *
    588 internalGetNXCJKCompat(UErrorCode &errorCode) {
    589     /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
    590     return internalGetSerializedNX(
    591                 UNORM_NX_CJK_COMPAT,
    592                 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
    593                 errorCode);
    594 }
    595 
    596 static const UnicodeSet *
    597 internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
    598     /* internal function, does not check for incoming U_FAILURE */
    599     int32_t nxIndex;
    600 
    601     options&=_NORM_OPTIONS_UNICODE_MASK;
    602     switch(options) {
    603     case 0:
    604         return NULL;
    605     case UNORM_UNICODE_3_2:
    606         /* [:^Age=3.2:] */
    607         nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
    608         break;
    609     default:
    610         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    611         return NULL;
    612     }
    613 
    614     /* build a set with all code points that were not designated by the specified Unicode version */
    615     return internalGetSerializedNX(options, nxIndex, errorCode);
    616 }
    617 
    618 /* Get a decomposition exclusion set. The data must be loaded. */
    619 static const UnicodeSet *
    620 internalGetNX(int32_t options, UErrorCode &errorCode) {
    621     options&=_NORM_OPTIONS_SETS_MASK;
    622 
    623     UBool isCached;
    624 
    625     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
    626 
    627     if(!isCached) {
    628         /* return basic sets */
    629         if(options==UNORM_NX_HANGUL) {
    630             return internalGetNXHangul(errorCode);
    631         }
    632         if(options==UNORM_NX_CJK_COMPAT) {
    633             return internalGetNXCJKCompat(errorCode);
    634         }
    635         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
    636             return internalGetNXUnicode(options, errorCode);
    637         }
    638 
    639         /* build a set from multiple subsets */
    640         UnicodeSet *set;
    641         const UnicodeSet *other;
    642 
    643         set=new UnicodeSet();
    644         if(set==NULL) {
    645             errorCode=U_MEMORY_ALLOCATION_ERROR;
    646             return NULL;
    647         }
    648 
    649         if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) {
    650             set->addAll(*other);
    651         }
    652         if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
    653             set->addAll(*other);
    654         }
    655         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
    656             set->addAll(*other);
    657         }
    658 
    659         if(U_FAILURE(errorCode)) {
    660             delete set;
    661             return NULL;
    662         }
    663         // Compact the set for caching.
    664         set->compact();
    665 
    666         umtx_lock(NULL);
    667         if(nxCache[options]==NULL) {
    668             nxCache[options]=set;
    669             set=NULL;
    670             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
    671         }
    672         umtx_unlock(NULL);
    673 
    674         delete set;
    675     }
    676 
    677     return nxCache[options];
    678 }
    679 
    680 static inline const UnicodeSet *
    681 getNX(int32_t options, UErrorCode &errorCode) {
    682     if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) {
    683         /* incoming failure, or no decomposition exclusions requested */
    684         return NULL;
    685     } else {
    686         return internalGetNX(options, errorCode);
    687     }
    688 }
    689 
    690 U_CFUNC const UnicodeSet *
    691 unorm_getNX(int32_t options, UErrorCode *pErrorCode) {
    692     return getNX(options, *pErrorCode);
    693 }
    694 
    695 static inline UBool
    696 nx_contains(const UnicodeSet *nx, UChar32 c) {
    697     return nx!=NULL && nx->contains(c);
    698 }
    699 
    700 static inline UBool
    701 nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
    702     return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2));
    703 }
    704 
    705 /* other normalization primitives ------------------------------------------- */
    706 
    707 /* get the canonical or compatibility decomposition for one character */
    708 static inline const UChar *
    709 _decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
    710            uint8_t &cc, uint8_t &trailCC) {
    711     const UChar *p=(const UChar *)_getExtraData(norm32);
    712     length=*p++;
    713 
    714     if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) {
    715         /* use compatibility decomposition, skip canonical data */
    716         p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK);
    717         length>>=8;
    718     }
    719 
    720     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
    721         /* get the lead and trail cc's */
    722         UChar bothCCs=*p++;
    723         cc=(uint8_t)(bothCCs>>8);
    724         trailCC=(uint8_t)bothCCs;
    725     } else {
    726         /* lead and trail cc's are both 0 */
    727         cc=trailCC=0;
    728     }
    729 
    730     length&=_NORM_DECOMP_LENGTH_MASK;
    731     return p;
    732 }
    733 
    734 /* get the canonical decomposition for one character */
    735 static inline const UChar *
    736 _decompose(uint32_t norm32, int32_t &length,
    737            uint8_t &cc, uint8_t &trailCC) {
    738     const UChar *p=(const UChar *)_getExtraData(norm32);
    739     length=*p++;
    740 
    741     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
    742         /* get the lead and trail cc's */
    743         UChar bothCCs=*p++;
    744         cc=(uint8_t)(bothCCs>>8);
    745         trailCC=(uint8_t)bothCCs;
    746     } else {
    747         /* lead and trail cc's are both 0 */
    748         cc=trailCC=0;
    749     }
    750 
    751     length&=_NORM_DECOMP_LENGTH_MASK;
    752     return p;
    753 }
    754 
    755 /**
    756  * Get the canonical decomposition for one code point.
    757  * @param c code point
    758  * @param buffer out-only buffer for algorithmic decompositions of Hangul
    759  * @param length out-only, takes the length of the decomposition, if any
    760  * @return pointer to decomposition, or 0 if none
    761  * @internal
    762  */
    763 U_CFUNC const UChar *
    764 unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) {
    765     uint32_t norm32;
    766 
    767     if(c<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
    768         /* trivial case */
    769         return NULL;
    770     }
    771 
    772     norm32=UTRIE2_GET32(&normTrie, c);
    773     if(norm32&_NORM_QC_NFD) {
    774         if(isNorm32HangulOrJamo(norm32)) {
    775             /* Hangul syllable: decompose algorithmically */
    776             UChar c2;
    777 
    778             c-=HANGUL_BASE;
    779 
    780             c2=(UChar)(c%JAMO_T_COUNT);
    781             c/=JAMO_T_COUNT;
    782             if(c2>0) {
    783                 buffer[2]=(UChar)(JAMO_T_BASE+c2);
    784                 *pLength=3;
    785             } else {
    786                 *pLength=2;
    787             }
    788 
    789             buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
    790             buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
    791             return buffer;
    792         } else {
    793             /* normal decomposition */
    794             uint8_t cc, trailCC;
    795             return _decompose(norm32, *pLength, cc, trailCC);
    796         }
    797     } else {
    798         return 0;
    799     }
    800 }
    801 
    802 /*
    803  * get the combining class of (c, c2)=*p++
    804  * before: p<limit  after: p<=limit
    805  * if only one code unit is used, then c2==0
    806  */
    807 static inline uint8_t
    808 _getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
    809     uint32_t norm32;
    810 
    811     c=*p++;
    812     c2=0;
    813     norm32=_getNorm32(c);
    814     if((norm32&_NORM_CC_MASK)==0) {
    815         return 0;
    816     } else if(U16_IS_LEAD(c)) {
    817         /* c is a lead surrogate, get the real norm32 */
    818         if(p!=limit && U16_IS_TRAIL(c2=*p)) {
    819             ++p;
    820             norm32=_getNorm32FromSurrogatePair(c, c2);
    821         } else {
    822             c2=0;
    823             return 0;
    824         }
    825     }
    826     return (uint8_t)(norm32>>_NORM_CC_SHIFT);
    827 }
    828 
    829 /*
    830  * read backwards and get norm32
    831  * return 0 if the character is <minC
    832  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
    833  */
    834 static inline uint32_t
    835 _getPrevNorm32(const UChar *start, const UChar *&src,
    836                uint32_t minC,
    837                UChar &c, UChar &c2) {
    838     c=*--src;
    839     c2=0;
    840 
    841     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
    842     if(c<minC) {
    843         return 0;
    844     } else if(!U_IS_SURROGATE(c)) {
    845         return _getNorm32(c);
    846     } else if(U16_IS_SURROGATE_TRAIL(c) && src!=start && U16_IS_LEAD(c2=*(src-1))) {
    847         --src;
    848         return _getNorm32FromSurrogatePair(c2, c);
    849     } else {
    850         /* unpaired surrogate */
    851         c2=0;
    852         return 0;
    853     }
    854 }
    855 
    856 /*
    857  * get the combining class of (c, c2)=*--p
    858  * before: start<p  after: start<=p
    859  */
    860 static inline uint8_t
    861 _getPrevCC(const UChar *start, const UChar *&p) {
    862     UChar c, c2;
    863 
    864     return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, c, c2)>>_NORM_CC_SHIFT);
    865 }
    866 
    867 /*
    868  * is this a safe boundary character for NF*D?
    869  * (lead cc==0)
    870  */
    871 static inline UBool
    872 _isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
    873     if((norm32&ccOrQCMask)==0) {
    874         return TRUE; /* cc==0 and no decomposition: this is NF*D safe */
    875     }
    876 
    877     /* inspect its decomposition - maybe a Hangul but not a surrogate here */
    878     if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
    879         int32_t length;
    880         uint8_t cc, trailCC;
    881 
    882         /* decomposes, get everything from the variable-length extra data */
    883         _decompose(norm32, decompQCMask, length, cc, trailCC);
    884         return cc==0;
    885     } else {
    886         /* no decomposition (or Hangul), test the cc directly */
    887         return (norm32&_NORM_CC_MASK)==0;
    888     }
    889 }
    890 
    891 /*
    892  * is this (or does its decomposition begin with) a "true starter"?
    893  * (cc==0 and NF*C_YES)
    894  */
    895 static inline UBool
    896 _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
    897     if((norm32&ccOrQCMask)==0) {
    898         return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
    899     }
    900 
    901     /* inspect its decomposition - not a Hangul or a surrogate here */
    902     if((norm32&decompQCMask)!=0) {
    903         const UChar *p;
    904         int32_t length;
    905         uint8_t cc, trailCC;
    906 
    907         /* decomposes, get everything from the variable-length extra data */
    908         p=_decompose(norm32, decompQCMask, length, cc, trailCC);
    909         if(cc==0) {
    910             uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
    911 
    912             /* does it begin with NFC_YES? */
    913             if((_getNorm32(p, qcMask)&qcMask)==0) {
    914                 /* yes, the decomposition begins with a true starter */
    915                 return TRUE;
    916             }
    917         }
    918     }
    919     return FALSE;
    920 }
    921 
    922 /* uchar.h */
    923 U_CAPI uint8_t U_EXPORT2
    924 u_getCombiningClass(UChar32 c) {
    925 #if !UNORM_HARDCODE_DATA
    926     UErrorCode errorCode=U_ZERO_ERROR;
    927     if(_haveData(errorCode)) {
    928 #endif
    929         uint32_t norm32=UTRIE2_GET32(&normTrie, c);
    930         return (uint8_t)(norm32>>_NORM_CC_SHIFT);
    931 #if !UNORM_HARDCODE_DATA
    932     } else {
    933         return 0;
    934     }
    935 #endif
    936 }
    937 
    938 U_CFUNC UBool U_EXPORT2
    939 unorm_internalIsFullCompositionExclusion(UChar32 c) {
    940 #if UNORM_HARDCODE_DATA
    941     if(auxTrie.index!=NULL) {
    942 #else
    943     UErrorCode errorCode=U_ZERO_ERROR;
    944     if(_haveData(errorCode) && auxTrie.index!=NULL) {
    945 #endif
    946         uint16_t aux=UTRIE2_GET16(&auxTrie, c);
    947         return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
    948     } else {
    949         return FALSE;
    950     }
    951 }
    952 
    953 U_CFUNC UBool U_EXPORT2
    954 unorm_isCanonSafeStart(UChar32 c) {
    955 #if UNORM_HARDCODE_DATA
    956     if(auxTrie.index!=NULL) {
    957 #else
    958     UErrorCode errorCode=U_ZERO_ERROR;
    959     if(_haveData(errorCode) && auxTrie.index!=NULL) {
    960 #endif
    961         uint16_t aux=UTRIE2_GET16(&auxTrie, c);
    962         return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
    963     } else {
    964         return FALSE;
    965     }
    966 }
    967 
    968 U_CAPI void U_EXPORT2
    969 unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
    970     if(unorm_haveData(pErrorCode)){
    971         uprv_memcpy(*versionInfo, dataVersion, 4);
    972     }
    973 }
    974 
    975 
    976 U_CAPI UBool U_EXPORT2
    977 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
    978 #if !UNORM_HARDCODE_DATA
    979     UErrorCode errorCode=U_ZERO_ERROR;
    980 #endif
    981     if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
    982 #if !UNORM_HARDCODE_DATA
    983         _haveData(errorCode) &&
    984 #endif
    985         canonStartSets!=NULL
    986     ) {
    987         const uint16_t *table;
    988         int32_t i, start, limit;
    989 
    990         /*
    991          * binary search for c
    992          *
    993          * There are two search tables,
    994          * one for BMP code points and one for supplementary ones.
    995          * See unormimp.h for details.
    996          */
    997         if(c<=0xffff) {
    998             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
    999             start=0;
   1000             limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
   1001 
   1002             /* each entry is a pair { c, result } */
   1003             while(start<limit-2) {
   1004                 i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
   1005                 if(c<table[i]) {
   1006                     limit=i;
   1007                 } else {
   1008                     start=i;
   1009                 }
   1010             }
   1011 
   1012             /* found? */
   1013             if(c==table[start]) {
   1014                 i=table[start+1];
   1015                 if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
   1016                     /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
   1017                     i&=(_NORM_MAX_CANON_SETS-1);
   1018                     return uset_getSerializedSet(fillSet,
   1019                                             canonStartSets+i,
   1020                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
   1021                 } else {
   1022                     /* other result values are BMP code points for single-code point sets */
   1023                     uset_setSerializedToOne(fillSet, (UChar32)i);
   1024                     return TRUE;
   1025                 }
   1026             }
   1027         } else {
   1028             uint16_t high, low, h;
   1029 
   1030             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
   1031                                  canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
   1032             start=0;
   1033             limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
   1034 
   1035             high=(uint16_t)(c>>16);
   1036             low=(uint16_t)c;
   1037 
   1038             /* each entry is a triplet { high(c), low(c), result } */
   1039             while(start<limit-3) {
   1040                 i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
   1041                 h=table[i]&0x1f; /* high word */
   1042                 if(high<h || (high==h && low<table[i+1])) {
   1043                     limit=i;
   1044                 } else {
   1045                     start=i;
   1046                 }
   1047             }
   1048 
   1049             /* found? */
   1050             h=table[start];
   1051             if(high==(h&0x1f) && low==table[start+1]) {
   1052                 i=table[start+2];
   1053                 if((h&0x8000)==0) {
   1054                     /* the result is an index to a USerializedSet */
   1055                     return uset_getSerializedSet(fillSet,
   1056                                             canonStartSets+i,
   1057                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
   1058                 } else {
   1059                     /*
   1060                      * single-code point set {x} in
   1061                      * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
   1062                      */
   1063                     i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
   1064                     uset_setSerializedToOne(fillSet, (UChar32)i);
   1065                     return TRUE;
   1066                 }
   1067             }
   1068         }
   1069     }
   1070 
   1071     return FALSE; /* not found */
   1072 }
   1073 
   1074 U_CAPI int32_t U_EXPORT2
   1075 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
   1076     uint16_t aux;
   1077 
   1078     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1079         return 0;
   1080     }
   1081     if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
   1082         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1083         return 0;
   1084     }
   1085     if(_haveData(*pErrorCode) && auxTrie.index!=NULL) {
   1086         aux=UTRIE2_GET16(&auxTrie, c);
   1087         aux&=_NORM_AUX_FNC_MASK;
   1088     } else {
   1089         aux=0;
   1090     }
   1091     if(aux!=0) {
   1092         const UChar *s;
   1093         int32_t length;
   1094 
   1095         s=(const UChar *)(extraData+aux);
   1096         if(*s<0xff00) {
   1097             /* s points to the single-unit string */
   1098             length=1;
   1099         } else {
   1100             length=*s&0xff;
   1101             ++s;
   1102         }
   1103         if(0<length && length<=destCapacity) {
   1104             uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
   1105         }
   1106         return u_terminateUChars(dest, destCapacity, length, pErrorCode);
   1107     } else {
   1108         return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
   1109     }
   1110 }
   1111 
   1112 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
   1113 U_CAPI UBool U_EXPORT2
   1114 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
   1115     uint32_t norm32, mask;
   1116     uint16_t aux;
   1117 
   1118 #if !UNORM_HARDCODE_DATA
   1119     UErrorCode errorCode=U_ZERO_ERROR;
   1120     if(!_haveData(errorCode)) {
   1121         return FALSE;
   1122     }
   1123 #endif
   1124 
   1125     /* handle trivial cases; set the comparison mask for the normal ones */
   1126     switch(mode) {
   1127     case UNORM_NONE:
   1128         return TRUE;
   1129     case UNORM_NFD:
   1130         mask=_NORM_CC_MASK|_NORM_QC_NFD;
   1131         break;
   1132     case UNORM_NFKD:
   1133         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
   1134         break;
   1135     case UNORM_NFC:
   1136     /* case UNORM_FCC: */
   1137         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
   1138         break;
   1139     case UNORM_NFKC:
   1140         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
   1141         break;
   1142     case UNORM_FCD:
   1143         /* FCD: skippable if lead cc==0 and trail cc<=1 */
   1144         return fcdTrie.index!=NULL && UTRIE2_GET16(&fcdTrie, c)<=1;
   1145     default:
   1146         return FALSE;
   1147     }
   1148 
   1149     /* check conditions (a)..(e), see unormimp.h */
   1150     norm32=UTRIE2_GET32(&normTrie, c);
   1151     if((norm32&mask)!=0) {
   1152         return FALSE; /* fails (a)..(e), not skippable */
   1153     }
   1154 
   1155     if(mode<UNORM_NFC) {
   1156         return TRUE; /* NF*D, passed (a)..(c), is skippable */
   1157     }
   1158 
   1159     /* NF*C/FCC, passed (a)..(e) */
   1160     if((norm32&_NORM_QC_NFD)==0) {
   1161         return TRUE; /* no canonical decomposition, is skippable */
   1162     }
   1163 
   1164     /* check Hangul syllables algorithmically */
   1165     if(isNorm32HangulOrJamo(norm32)) {
   1166         /* Jamo passed (a)..(e) above, must be Hangul */
   1167         return !isHangulWithoutJamoT((UChar)c); /* LVT are skippable, LV are not */
   1168     }
   1169 
   1170     /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
   1171     /* NF*C, test (f) flag */
   1172     if(!formatVersion_2_2 || auxTrie.index==NULL) {
   1173         return FALSE; /* no (f) data, say not skippable to be safe */
   1174     }
   1175 
   1176     aux=UTRIE2_GET16(&auxTrie, c);
   1177     return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
   1178 
   1179     /* } else { FCC, test fcd<=1 instead of the above } */
   1180 }
   1181 
   1182 U_CAPI void U_EXPORT2
   1183 unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
   1184     UChar c;
   1185 
   1186     if(!_haveData(*pErrorCode)) {
   1187         return;
   1188     }
   1189 
   1190     /* add the start code point of each same-value range of each trie */
   1191     utrie2_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
   1192     if(fcdTrie.index!=NULL) {
   1193         utrie2_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
   1194     }
   1195     if(auxTrie.index!=NULL) {
   1196         utrie2_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
   1197     }
   1198 
   1199     /* add Hangul LV syllables and LV+1 because of skippables */
   1200     for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
   1201         sa->add(sa->set, c);
   1202         sa->add(sa->set, c+1);
   1203     }
   1204     sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
   1205 }
   1206 
   1207 U_CFUNC UNormalizationCheckResult U_EXPORT2
   1208 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
   1209     static const uint32_t qcMask[UNORM_MODE_COUNT]={
   1210         0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
   1211     };
   1212 
   1213     uint32_t norm32;
   1214 
   1215 #if !UNORM_HARDCODE_DATA
   1216     UErrorCode errorCode=U_ZERO_ERROR;
   1217     if(!_haveData(errorCode)) {
   1218         return UNORM_YES;
   1219     }
   1220 #endif
   1221 
   1222     norm32=UTRIE2_GET32(&normTrie, c);
   1223     norm32&=qcMask[mode];
   1224 
   1225     if(norm32==0) {
   1226         return UNORM_YES;
   1227     } else if(norm32&_NORM_QC_ANY_NO) {
   1228         return UNORM_NO;
   1229     } else /* _NORM_QC_ANY_MAYBE */ {
   1230         return UNORM_MAYBE;
   1231     }
   1232 }
   1233 
   1234 U_CFUNC uint16_t U_EXPORT2
   1235 unorm_getFCD16FromCodePoint(UChar32 c) {
   1236 #if !UNORM_HARDCODE_DATA
   1237     UErrorCode errorCode;
   1238     errorCode=U_ZERO_ERROR;
   1239 #endif
   1240 
   1241     if(
   1242 #if !UNORM_HARDCODE_DATA
   1243         !_haveData(errorCode) ||
   1244 #endif
   1245         fcdTrie.index==NULL
   1246     ) {
   1247         return 0;
   1248     }
   1249     return UTRIE2_GET16(&fcdTrie, c);
   1250 }
   1251 
   1252 /* reorder UTF-16 in-place -------------------------------------------------- */
   1253 
   1254 /*
   1255  * simpler, single-character version of _mergeOrdered() -
   1256  * bubble-insert one single code point into the preceding string
   1257  * which is already canonically ordered
   1258  * (c, c2) may or may not yet have been inserted at [current..p[
   1259  *
   1260  * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
   1261  *
   1262  * before: [start..current[ is already ordered, and
   1263  *         [current..p[     may or may not hold (c, c2) but
   1264  *                          must be exactly the same length as (c, c2)
   1265  * after: [start..p[ is ordered
   1266  *
   1267  * returns the trailing combining class
   1268  */
   1269 static uint8_t
   1270 _insertOrdered(const UChar *start, UChar *current, UChar *p,
   1271                UChar c, UChar c2, uint8_t cc) {
   1272     const UChar *pBack, *pPreBack;
   1273     UChar *r;
   1274     uint8_t prevCC, trailCC=cc;
   1275 
   1276     if(start<current && cc!=0) {
   1277         /* search for the insertion point where cc>=prevCC */
   1278         pPreBack=pBack=current;
   1279         prevCC=_getPrevCC(start, pPreBack);
   1280         if(cc<prevCC) {
   1281             /* this will be the last code point, so keep its cc */
   1282             trailCC=prevCC;
   1283             pBack=pPreBack;
   1284             while(start<pPreBack) {
   1285                 prevCC=_getPrevCC(start, pPreBack);
   1286                 if(cc>=prevCC) {
   1287                     break;
   1288                 }
   1289                 pBack=pPreBack;
   1290             }
   1291 
   1292             /*
   1293              * this is where we are right now with all these pointers:
   1294              * [start..pPreBack[ 0..? code points that we can ignore
   1295              * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
   1296              * [pBack..current[  0..n code points with >cc, move up to insert (c, c2)
   1297              * [current..p[         1 code point (c, c2) with cc
   1298              */
   1299 
   1300             /* move the code units in between up */
   1301             r=p;
   1302             do {
   1303                 *--r=*--current;
   1304             } while(pBack!=current);
   1305         }
   1306     }
   1307 
   1308     /* insert (c, c2) */
   1309     *current=c;
   1310     if(c2!=0) {
   1311         *(current+1)=c2;
   1312     }
   1313 
   1314     /* we know the cc of the last code point */
   1315     return trailCC;
   1316 }
   1317 
   1318 /*
   1319  * merge two UTF-16 string parts together
   1320  * to canonically order (order by combining classes) their concatenation
   1321  *
   1322  * the two strings may already be adjacent, so that the merging is done in-place
   1323  * if the two strings are not adjacent, then the buffer holding the first one
   1324  * must be large enough
   1325  * the second string may or may not be ordered in itself
   1326  *
   1327  * before: [start..current[ is already ordered, and
   1328  *         [next..limit[    may be ordered in itself, but
   1329  *                          is not in relation to [start..current[
   1330  * after: [start..current+(limit-next)[ is ordered
   1331  *
   1332  * the algorithm is a simple bubble-sort that takes the characters from *next++
   1333  * and inserts them in correct combining class order into the preceding part
   1334  * of the string
   1335  *
   1336  * since this function is called much less often than the single-code point
   1337  * _insertOrdered(), it just uses that for easier maintenance
   1338  * (see file version from before 2001aug31 for a more optimized version)
   1339  *
   1340  * returns the trailing combining class
   1341  */
   1342 static uint8_t
   1343 _mergeOrdered(UChar *start, UChar *current,
   1344               const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
   1345     UChar *r;
   1346     UChar c, c2;
   1347     uint8_t cc, trailCC=0;
   1348     UBool adjacent;
   1349 
   1350     adjacent= current==next;
   1351 
   1352     if(start!=current || !isOrdered) {
   1353         while(next<limit) {
   1354             cc=_getNextCC(next, limit, c, c2);
   1355             if(cc==0) {
   1356                 /* does not bubble back */
   1357                 trailCC=0;
   1358                 if(adjacent) {
   1359                     current=(UChar *)next;
   1360                 } else {
   1361                     *current++=c;
   1362                     if(c2!=0) {
   1363                         *current++=c2;
   1364                     }
   1365                 }
   1366                 if(isOrdered) {
   1367                     break;
   1368                 } else {
   1369                     start=current;
   1370                 }
   1371             } else {
   1372                 r=current+(c2==0 ? 1 : 2);
   1373                 trailCC=_insertOrdered(start, current, r, c, c2, cc);
   1374                 current=r;
   1375             }
   1376         }
   1377     }
   1378 
   1379     if(next==limit) {
   1380         /* we know the cc of the last code point */
   1381         return trailCC;
   1382     } else {
   1383         if(!adjacent) {
   1384             /* copy the second string part */
   1385             do {
   1386                 *current++=*next++;
   1387             } while(next!=limit);
   1388             limit=current;
   1389         }
   1390         return _getPrevCC(start, limit);
   1391     }
   1392 }
   1393 
   1394 /* find the last true starter in [start..src[ and return the pointer to it */
   1395 static const UChar *
   1396 _findPreviousStarter(const UChar *start, const UChar *src,
   1397                      uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
   1398     uint32_t norm32;
   1399     UChar c, c2;
   1400 
   1401     while(start<src) {
   1402         norm32=_getPrevNorm32(start, src, minNoMaybe, c, c2);
   1403         if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
   1404             break;
   1405         }
   1406     }
   1407     return src;
   1408 }
   1409 
   1410 /* find the first true starter in [src..limit[ and return the pointer to it */
   1411 static const UChar *
   1412 _findNextStarter(const UChar *src, const UChar *limit,
   1413                  uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
   1414     const UChar *p;
   1415     uint32_t norm32, ccOrQCMask;
   1416     int32_t length;
   1417     UChar c, c2;
   1418     uint8_t cc, trailCC;
   1419 
   1420     ccOrQCMask=_NORM_CC_MASK|qcMask;
   1421 
   1422     for(;;) {
   1423         if(src==limit) {
   1424             break; /* end of string */
   1425         }
   1426         c=*src;
   1427         if(c<minNoMaybe) {
   1428             break; /* catches NUL terminater, too */
   1429         }
   1430 
   1431         norm32=_getNorm32(c);
   1432         if((norm32&ccOrQCMask)==0) {
   1433             break; /* true starter */
   1434         }
   1435 
   1436         if(U16_IS_LEAD(c)) {
   1437             /* c is a lead surrogate, get the real norm32 */
   1438             if((src+1)==limit || !U16_IS_TRAIL(c2=*(src+1))) {
   1439                 break; /* unmatched first surrogate: counts as a true starter */
   1440             }
   1441             norm32=_getNorm32FromSurrogatePair(c, c2);
   1442 
   1443             if((norm32&ccOrQCMask)==0) {
   1444                 break; /* true starter */
   1445             }
   1446         } else {
   1447             c2=0;
   1448         }
   1449 
   1450         /* (c, c2) is not a true starter but its decomposition may be */
   1451         if(norm32&decompQCMask) {
   1452             /* (c, c2) decomposes, get everything from the variable-length extra data */
   1453             p=_decompose(norm32, decompQCMask, length, cc, trailCC);
   1454 
   1455             /* get the first character's norm32 to check if it is a true starter */
   1456             if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
   1457                 break; /* true starter */
   1458             }
   1459         }
   1460 
   1461         src+= c2==0 ? 1 : 2; /* not a true starter, continue */
   1462     }
   1463 
   1464     return src;
   1465 }
   1466 
   1467 /* make NFD & NFKD ---------------------------------------------------------- */
   1468 
   1469 U_CAPI int32_t U_EXPORT2
   1470 unorm_getDecomposition(UChar32 c, UBool compat,
   1471                        UChar *dest, int32_t destCapacity) {
   1472 #if !UNORM_HARDCODE_DATA
   1473     UErrorCode errorCode=U_ZERO_ERROR;
   1474 #endif
   1475     if( (uint32_t)c<=0x10ffff &&
   1476 #if !UNORM_HARDCODE_DATA
   1477         _haveData(errorCode) &&
   1478 #endif
   1479         ((dest!=NULL && destCapacity>0) || destCapacity==0)
   1480     ) {
   1481         uint32_t norm32, qcMask;
   1482         UChar32 minNoMaybe;
   1483         int32_t length;
   1484 
   1485         /* initialize */
   1486         if(!compat) {
   1487             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
   1488             qcMask=_NORM_QC_NFD;
   1489         } else {
   1490             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
   1491             qcMask=_NORM_QC_NFKD;
   1492         }
   1493 
   1494         if(c<minNoMaybe) {
   1495             /* trivial case */
   1496             if(destCapacity>0) {
   1497                 dest[0]=(UChar)c;
   1498             }
   1499             return -1;
   1500         }
   1501 
   1502         /* data lookup */
   1503         norm32=UTRIE2_GET32(&normTrie, c);
   1504         if((norm32&qcMask)==0) {
   1505             /* simple case: no decomposition */
   1506             if(c<=0xffff) {
   1507                 if(destCapacity>0) {
   1508                     dest[0]=(UChar)c;
   1509                 }
   1510                 return -1;
   1511             } else {
   1512                 if(destCapacity>=2) {
   1513                     dest[0]=UTF16_LEAD(c);
   1514                     dest[1]=UTF16_TRAIL(c);
   1515                 }
   1516                 return -2;
   1517             }
   1518         } else if(isNorm32HangulOrJamo(norm32)) {
   1519             /* Hangul syllable: decompose algorithmically */
   1520             UChar c2;
   1521 
   1522             c-=HANGUL_BASE;
   1523 
   1524             c2=(UChar)(c%JAMO_T_COUNT);
   1525             c/=JAMO_T_COUNT;
   1526             if(c2>0) {
   1527                 if(destCapacity>=3) {
   1528                     dest[2]=(UChar)(JAMO_T_BASE+c2);
   1529                 }
   1530                 length=3;
   1531             } else {
   1532                 length=2;
   1533             }
   1534 
   1535             if(destCapacity>=2) {
   1536                 dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
   1537                 dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
   1538             }
   1539             return length;
   1540         } else {
   1541             /* c decomposes, get everything from the variable-length extra data */
   1542             const UChar *p, *limit;
   1543             uint8_t cc, trailCC;
   1544 
   1545             p=_decompose(norm32, qcMask, length, cc, trailCC);
   1546             if(length<=destCapacity) {
   1547                 limit=p+length;
   1548                 do {
   1549                     *dest++=*p++;
   1550                 } while(p<limit);
   1551             }
   1552             return length;
   1553         }
   1554     } else {
   1555         return 0;
   1556     }
   1557 }
   1558 
   1559 static int32_t
   1560 _decompose(UChar *dest, int32_t destCapacity,
   1561            const UChar *src, int32_t srcLength,
   1562            UBool compat, const UnicodeSet *nx,
   1563            uint8_t &outTrailCC) {
   1564     UChar buffer[3];
   1565     const UChar *limit, *prevSrc, *p;
   1566     uint32_t norm32, ccOrQCMask, qcMask;
   1567     int32_t destIndex, reorderStartIndex, length;
   1568     UChar c, c2, minNoMaybe;
   1569     uint8_t cc, prevCC, trailCC;
   1570 
   1571     if(!compat) {
   1572         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
   1573         qcMask=_NORM_QC_NFD;
   1574     } else {
   1575         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
   1576         qcMask=_NORM_QC_NFKD;
   1577     }
   1578 
   1579     /* initialize */
   1580     ccOrQCMask=_NORM_CC_MASK|qcMask;
   1581     destIndex=reorderStartIndex=0;
   1582     prevCC=0;
   1583 
   1584     /* avoid compiler warnings */
   1585     norm32=0;
   1586     c=0;
   1587     cc=0;
   1588     trailCC=0;
   1589 
   1590     if(srcLength>=0) {
   1591         /* string with length */
   1592         limit=src+srcLength;
   1593     } else /* srcLength==-1 */ {
   1594         /* zero-terminated string */
   1595         limit=NULL;
   1596     }
   1597 
   1598     U_ALIGN_CODE(16);
   1599 
   1600     for(;;) {
   1601         /* count code units below the minimum or with irrelevant data for the quick check */
   1602         prevSrc=src;
   1603         if(limit==NULL) {
   1604             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
   1605                 prevCC=0;
   1606                 ++src;
   1607             }
   1608         } else {
   1609             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
   1610                 prevCC=0;
   1611                 ++src;
   1612             }
   1613         }
   1614 
   1615         /* copy these code units all at once */
   1616         if(src!=prevSrc) {
   1617             length=(int32_t)(src-prevSrc);
   1618             if((destIndex+length)<=destCapacity) {
   1619                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
   1620             }
   1621             destIndex+=length;
   1622             reorderStartIndex=destIndex;
   1623         }
   1624 
   1625         /* end of source reached? */
   1626         if(limit==NULL ? c==0 : src==limit) {
   1627             break;
   1628         }
   1629 
   1630         /* c already contains *src and norm32 is set for it, increment src */
   1631         ++src;
   1632 
   1633         /* check one above-minimum, relevant code unit */
   1634         /*
   1635          * generally, set p and length to the decomposition string
   1636          * in simple cases, p==NULL and (c, c2) will hold the length code units to append
   1637          * in all cases, set cc to the lead and trailCC to the trail combining class
   1638          *
   1639          * the following merge-sort of the current character into the preceding,
   1640          * canonically ordered result text will use the optimized _insertOrdered()
   1641          * if there is only one single code point to process;
   1642          * this is indicated with p==NULL, and (c, c2) is the character to insert
   1643          * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
   1644          * for a supplementary character)
   1645          * otherwise, p[length] is merged in with _mergeOrdered()
   1646          */
   1647         if(isNorm32HangulOrJamo(norm32)) {
   1648             if(nx_contains(nx, c)) {
   1649                 c2=0;
   1650                 p=NULL;
   1651                 length=1;
   1652             } else {
   1653                 /* Hangul syllable: decompose algorithmically */
   1654                 p=buffer;
   1655                 cc=trailCC=0;
   1656 
   1657                 c-=HANGUL_BASE;
   1658 
   1659                 c2=(UChar)(c%JAMO_T_COUNT);
   1660                 c/=JAMO_T_COUNT;
   1661                 if(c2>0) {
   1662                     buffer[2]=(UChar)(JAMO_T_BASE+c2);
   1663                     length=3;
   1664                 } else {
   1665                     length=2;
   1666                 }
   1667 
   1668                 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
   1669                 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
   1670             }
   1671         } else {
   1672             if(isNorm32Regular(norm32)) {
   1673                 c2=0;
   1674                 length=1;
   1675             } else {
   1676                 /* c is a lead surrogate, get the real norm32 */
   1677                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
   1678                     ++src;
   1679                     length=2;
   1680                     norm32=_getNorm32FromSurrogatePair(c, c2);
   1681                 } else {
   1682                     c2=0;
   1683                     length=1;
   1684                     norm32=0;
   1685                 }
   1686             }
   1687 
   1688             /* get the decomposition and the lead and trail cc's */
   1689             if(nx_contains(nx, c, c2)) {
   1690                 /* excluded: norm32==0 */
   1691                 cc=trailCC=0;
   1692                 p=NULL;
   1693             } else if((norm32&qcMask)==0) {
   1694                 /* c does not decompose */
   1695                 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
   1696                 p=NULL;
   1697             } else {
   1698                 /* c decomposes, get everything from the variable-length extra data */
   1699                 p=_decompose(norm32, qcMask, length, cc, trailCC);
   1700                 if(length==1) {
   1701                     /* fastpath a single code unit from decomposition */
   1702                     c=*p;
   1703                     c2=0;
   1704                     p=NULL;
   1705                 }
   1706             }
   1707         }
   1708 
   1709         /* append the decomposition to the destination buffer, assume length>0 */
   1710         if((destIndex+length)<=destCapacity) {
   1711             UChar *reorderSplit=dest+destIndex;
   1712             if(p==NULL) {
   1713                 /* fastpath: single code point */
   1714                 if(cc!=0 && cc<prevCC) {
   1715                     /* (c, c2) is out of order with respect to the preceding text */
   1716                     destIndex+=length;
   1717                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
   1718                 } else {
   1719                     /* just append (c, c2) */
   1720                     dest[destIndex++]=c;
   1721                     if(c2!=0) {
   1722                         dest[destIndex++]=c2;
   1723                     }
   1724                 }
   1725             } else {
   1726                 /* general: multiple code points (ordered by themselves) from decomposition */
   1727                 if(cc!=0 && cc<prevCC) {
   1728                     /* the decomposition is out of order with respect to the preceding text */
   1729                     destIndex+=length;
   1730                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
   1731                 } else {
   1732                     /* just append the decomposition */
   1733                     do {
   1734                         dest[destIndex++]=*p++;
   1735                     } while(--length>0);
   1736                 }
   1737             }
   1738         } else {
   1739             /* buffer overflow */
   1740             /* keep incrementing the destIndex for preflighting */
   1741             destIndex+=length;
   1742         }
   1743 
   1744         prevCC=trailCC;
   1745         if(prevCC==0) {
   1746             reorderStartIndex=destIndex;
   1747         }
   1748     }
   1749 
   1750     outTrailCC=prevCC;
   1751     return destIndex;
   1752 }
   1753 
   1754 U_CAPI int32_t U_EXPORT2
   1755 unorm_decompose(UChar *dest, int32_t destCapacity,
   1756                 const UChar *src, int32_t srcLength,
   1757                 UBool compat, int32_t options,
   1758                 UErrorCode *pErrorCode) {
   1759     const UnicodeSet *nx;
   1760     int32_t destIndex;
   1761     uint8_t trailCC;
   1762 
   1763     if(!_haveData(*pErrorCode)) {
   1764         return 0;
   1765     }
   1766 
   1767     nx=getNX(options, *pErrorCode);
   1768     if(U_FAILURE(*pErrorCode)) {
   1769         return 0;
   1770     }
   1771 
   1772     destIndex=_decompose(dest, destCapacity,
   1773                          src, srcLength,
   1774                          compat, nx,
   1775                          trailCC);
   1776 
   1777     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
   1778 }
   1779 
   1780 /* make NFC & NFKC ---------------------------------------------------------- */
   1781 
   1782 /* get the composition properties of the next character */
   1783 static inline uint32_t
   1784 _getNextCombining(UChar *&p, const UChar *limit,
   1785                   UChar &c, UChar &c2,
   1786                   uint16_t &combiningIndex, uint8_t &cc,
   1787                   const UnicodeSet *nx) {
   1788     uint32_t norm32, combineFlags;
   1789 
   1790     /* get properties */
   1791     c=*p++;
   1792     norm32=_getNorm32(c);
   1793 
   1794     /* preset output values for most characters */
   1795     c2=0;
   1796     combiningIndex=0;
   1797     cc=0;
   1798 
   1799     if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
   1800         return 0;
   1801     } else {
   1802         if(isNorm32Regular(norm32)) {
   1803             /* set cc etc. below */
   1804         } else if(isNorm32HangulOrJamo(norm32)) {
   1805             /* a compatibility decomposition contained Jamos */
   1806             combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
   1807             return norm32&_NORM_COMBINES_ANY;
   1808         } else {
   1809             /* c is a lead surrogate, get the real norm32 */
   1810             if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
   1811                 ++p;
   1812                 norm32=_getNorm32FromSurrogatePair(c, c2);
   1813             } else {
   1814                 c2=0;
   1815                 return 0;
   1816             }
   1817         }
   1818 
   1819         if(nx_contains(nx, c, c2)) {
   1820             return 0; /* excluded: norm32==0 */
   1821         }
   1822 
   1823         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
   1824 
   1825         combineFlags=norm32&_NORM_COMBINES_ANY;
   1826         if(combineFlags!=0) {
   1827             combiningIndex=*(_getExtraData(norm32)-1);
   1828         }
   1829         return combineFlags;
   1830     }
   1831 }
   1832 
   1833 /*
   1834  * given a composition-result starter (c, c2) - which means its cc==0,
   1835  * it combines forward, it has extra data, its norm32!=0,
   1836  * it is not a Hangul or Jamo,
   1837  * get just its combineFwdIndex
   1838  *
   1839  * norm32(c) is special if and only if c2!=0
   1840  */
   1841 static inline uint16_t
   1842 _getCombiningIndexFromStarter(UChar c, UChar c2) {
   1843     uint32_t norm32;
   1844 
   1845     if(c2==0) {
   1846         norm32=_getNorm32(c);
   1847     } else {
   1848         norm32=_getNorm32FromSurrogatePair(c, c2);
   1849     }
   1850     return *(_getExtraData(norm32)-1);
   1851 }
   1852 
   1853 /*
   1854  * Find the recomposition result for
   1855  * a forward-combining character
   1856  * (specified with a pointer to its part of the combiningTable[])
   1857  * and a backward-combining character
   1858  * (specified with its combineBackIndex).
   1859  *
   1860  * If these two characters combine, then set (value, value2)
   1861  * with the code unit(s) of the composition character.
   1862  *
   1863  * Return value:
   1864  * 0    do not combine
   1865  * 1    combine
   1866  * >1   combine, and the composition is a forward-combining starter
   1867  *
   1868  * See unormimp.h for a description of the composition table format.
   1869  */
   1870 static inline uint16_t
   1871 _combine(const uint16_t *table, uint16_t combineBackIndex,
   1872          uint16_t &value, uint16_t &value2) {
   1873     uint16_t key;
   1874 
   1875     /* search in the starter's composition table */
   1876     for(;;) {
   1877         key=*table++;
   1878         if(key>=combineBackIndex) {
   1879             break;
   1880         }
   1881         table+= *table&0x8000 ? 2 : 1;
   1882     }
   1883 
   1884     /* mask off bit 15, the last-entry-in-the-list flag */
   1885     if((key&0x7fff)==combineBackIndex) {
   1886         /* found! combine! */
   1887         value=*table;
   1888 
   1889         /* is the composition a starter that combines forward? */
   1890         key=(uint16_t)((value&0x2000)+1);
   1891 
   1892         /* get the composition result code point from the variable-length result value */
   1893         if(value&0x8000) {
   1894             if(value&0x4000) {
   1895                 /* surrogate pair composition result */
   1896                 value=(uint16_t)((value&0x3ff)|0xd800);
   1897                 value2=*(table+1);
   1898             } else {
   1899                 /* BMP composition result U+2000..U+ffff */
   1900                 value=*(table+1);
   1901                 value2=0;
   1902             }
   1903         } else {
   1904             /* BMP composition result U+0000..U+1fff */
   1905             value&=0x1fff;
   1906             value2=0;
   1907         }
   1908 
   1909         return key;
   1910     } else {
   1911         /* not found */
   1912         return 0;
   1913     }
   1914 }
   1915 
   1916 static inline UBool
   1917 _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
   1918                UBool compat, UChar *dest, const UnicodeSet *nx) {
   1919     if(isJamoVTNorm32JamoV(norm32)) {
   1920         /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
   1921         prev=(UChar)(prev-JAMO_L_BASE);
   1922         if(prev<JAMO_L_COUNT) {
   1923             c=(UChar)(HANGUL_BASE+(prev*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
   1924 
   1925             /* check if the next character is a Jamo T (normal or compatibility) */
   1926             if(src!=limit) {
   1927                 UChar next, t;
   1928 
   1929                 next=*src;
   1930                 if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
   1931                     /* normal Jamo T */
   1932                     ++src;
   1933                     c+=t;
   1934                 } else if(compat) {
   1935                     /* if NFKC, then check for compatibility Jamo T (BMP only) */
   1936                     norm32=_getNorm32(next);
   1937                     if(isNorm32Regular(norm32) && (norm32&_NORM_QC_NFKD)) {
   1938                         const UChar *p;
   1939                         int32_t length;
   1940                         uint8_t cc, trailCC;
   1941 
   1942                         p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC);
   1943                         if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
   1944                             /* compatibility Jamo T */
   1945                             ++src;
   1946                             c+=t;
   1947                         }
   1948                     }
   1949                 }
   1950             }
   1951             if(nx_contains(nx, c)) {
   1952                 if(!isHangulWithoutJamoT(c)) {
   1953                     --src; /* undo ++src from reading the Jamo T */
   1954                 }
   1955                 return FALSE;
   1956             }
   1957             if(dest!=0) {
   1958                 *dest=c;
   1959             }
   1960             return TRUE;
   1961         }
   1962     } else if(isHangulWithoutJamoT(prev)) {
   1963         /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
   1964         c=(UChar)(prev+(c-JAMO_T_BASE));
   1965         if(nx_contains(nx, c)) {
   1966             return FALSE;
   1967         }
   1968         if(dest!=0) {
   1969             *dest=c;
   1970         }
   1971         return TRUE;
   1972     }
   1973     return FALSE;
   1974 }
   1975 
   1976 /*
   1977  * recompose the characters in [p..limit[
   1978  * (which is in NFD - decomposed and canonically ordered),
   1979  * adjust limit, and return the trailing cc
   1980  *
   1981  * since for NFKC we may get Jamos in decompositions, we need to
   1982  * recompose those too
   1983  *
   1984  * note that recomposition never lengthens the text:
   1985  * any character consists of either one or two code units;
   1986  * a composition may contain at most one more code unit than the original starter,
   1987  * while the combining mark that is removed has at least one code unit
   1988  */
   1989 static uint8_t
   1990 _recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
   1991     UChar *starter, *pRemove, *q, *r;
   1992     uint32_t combineFlags;
   1993     UChar c, c2;
   1994     uint16_t combineFwdIndex, combineBackIndex;
   1995     uint16_t result, value, value2;
   1996     uint8_t cc, prevCC;
   1997     UBool starterIsSupplementary;
   1998 
   1999     starter=NULL;                   /* no starter */
   2000     combineFwdIndex=0;              /* will not be used until starter!=NULL - avoid compiler warnings */
   2001     combineBackIndex=0;             /* will always be set if combineFlags!=0 - avoid compiler warnings */
   2002     value=value2=0;                 /* always set by _combine() before used - avoid compiler warnings */
   2003     starterIsSupplementary=FALSE;   /* will not be used until starter!=NULL - avoid compiler warnings */
   2004     prevCC=0;
   2005 
   2006     for(;;) {
   2007         combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
   2008         if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
   2009             if(combineBackIndex&0x8000) {
   2010                 /* c is a Jamo V/T, see if we can compose it with the previous character */
   2011                 /* for the PRI #29 fix, check that there is no intervening combining mark */
   2012                 if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
   2013                     pRemove=NULL; /* NULL while no Hangul composition */
   2014                     combineFlags=0;
   2015                     c2=*starter;
   2016                     if(combineBackIndex==0xfff2) {
   2017                         /* Jamo V, compose with previous Jamo L and following Jamo T */
   2018                         c2=(UChar)(c2-JAMO_L_BASE);
   2019                         if(c2<JAMO_L_COUNT) {
   2020                             pRemove=p-1;
   2021                             c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
   2022                             if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
   2023                                 ++p;
   2024                                 c+=c2;
   2025                             } else {
   2026                                 /* the result is an LV syllable, which is a starter (unlike LVT) */
   2027                                 combineFlags=_NORM_COMBINES_FWD;
   2028                             }
   2029                             if(!nx_contains(nx, c)) {
   2030                                 *starter=c;
   2031                             } else {
   2032                                 /* excluded */
   2033                                 if(!isHangulWithoutJamoT(c)) {
   2034                                     --p; /* undo the ++p from reading the Jamo T */
   2035                                 }
   2036                                 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
   2037                                 pRemove=NULL;
   2038                             }
   2039                         }
   2040 
   2041                     /*
   2042                      * Normally, the following can not occur:
   2043                      * Since the input is in NFD, there are no Hangul LV syllables that
   2044                      * a Jamo T could combine with.
   2045                      * All Jamo Ts are combined above when handling Jamo Vs.
   2046                      *
   2047                      * However, before the PRI #29 fix, this can occur due to
   2048                      * an intervening combining mark between the Hangul LV and the Jamo T.
   2049                      */
   2050                     } else {
   2051                         /* Jamo T, compose with previous Hangul that does not have a Jamo T */
   2052                         if(isHangulWithoutJamoT(c2)) {
   2053                             c2+=(UChar)(c-JAMO_T_BASE);
   2054                             if(!nx_contains(nx, c2)) {
   2055                                 pRemove=p-1;
   2056                                 *starter=c2;
   2057                             }
   2058                         }
   2059                     }
   2060 
   2061                     if(pRemove!=NULL) {
   2062                         /* remove the Jamo(s) */
   2063                         q=pRemove;
   2064                         r=p;
   2065                         while(r<limit) {
   2066                             *q++=*r++;
   2067                         }
   2068                         p=pRemove;
   2069                         limit=q;
   2070                     }
   2071 
   2072                     c2=0; /* c2 held *starter temporarily */
   2073 
   2074                     if(combineFlags!=0) {
   2075                         /*
   2076                          * not starter=NULL because the composition is a Hangul LV syllable
   2077                          * and might combine once more (but only before the PRI #29 fix)
   2078                          */
   2079 
   2080                         /* done? */
   2081                         if(p==limit) {
   2082                             return prevCC;
   2083                         }
   2084 
   2085                         /* the composition is a Hangul LV syllable which is a starter that combines forward */
   2086                         combineFwdIndex=0xfff0;
   2087 
   2088                         /* we combined; continue with looking for compositions */
   2089                         continue;
   2090                     }
   2091                 }
   2092 
   2093                 /*
   2094                  * now: cc==0 and the combining index does not include "forward" ->
   2095                  * the rest of the loop body will reset starter to NULL;
   2096                  * technically, a composed Hangul syllable is a starter, but it
   2097                  * does not combine forward now that we have consumed all eligible Jamos;
   2098                  * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
   2099                  */
   2100 
   2101             } else if(
   2102                 /* the starter is not a Hangul LV or Jamo V/T and */
   2103                 !(combineFwdIndex&0x8000) &&
   2104                 /* the combining mark is not blocked and */
   2105                 ((options&UNORM_BEFORE_PRI_29) ?
   2106                     (prevCC!=cc || prevCC==0) :
   2107                     (prevCC<cc || prevCC==0)) &&
   2108                 /* the starter and the combining mark (c, c2) do combine and */
   2109                 0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
   2110                 /* the composition result is not excluded */
   2111                 !nx_contains(nx, value, value2)
   2112             ) {
   2113                 /* replace the starter with the composition, remove the combining mark */
   2114                 pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */
   2115 
   2116                 /* replace the starter with the composition */
   2117                 *starter=(UChar)value;
   2118                 if(starterIsSupplementary) {
   2119                     if(value2!=0) {
   2120                         /* both are supplementary */
   2121                         *(starter+1)=(UChar)value2;
   2122                     } else {
   2123                         /* the composition is shorter than the starter, move the intermediate characters forward one */
   2124                         starterIsSupplementary=FALSE;
   2125                         q=starter+1;
   2126                         r=q+1;
   2127                         while(r<pRemove) {
   2128                             *q++=*r++;
   2129                         }
   2130                         --pRemove;
   2131                     }
   2132                 } else if(value2!=0) {
   2133                     /* the composition is longer than the starter, move the intermediate characters back one */
   2134                     starterIsSupplementary=TRUE;
   2135                     ++starter; /* temporarily increment for the loop boundary */
   2136                     q=pRemove;
   2137                     r=++pRemove;
   2138                     while(starter<q) {
   2139                         *--r=*--q;
   2140                     }
   2141                     *starter=(UChar)value2;
   2142                     --starter; /* undo the temporary increment */
   2143                 /* } else { both are on the BMP, nothing more to do */
   2144                 }
   2145 
   2146                 /* remove the combining mark by moving the following text over it */
   2147                 if(pRemove<p) {
   2148                     q=pRemove;
   2149                     r=p;
   2150                     while(r<limit) {
   2151                         *q++=*r++;
   2152                     }
   2153                     p=pRemove;
   2154                     limit=q;
   2155                 }
   2156 
   2157                 /* keep prevCC because we removed the combining mark */
   2158 
   2159                 /* done? */
   2160                 if(p==limit) {
   2161                     return prevCC;
   2162                 }
   2163 
   2164                 /* is the composition a starter that combines forward? */
   2165                 if(result>1) {
   2166                     combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
   2167                 } else {
   2168                     starter=NULL;
   2169                 }
   2170 
   2171                 /* we combined; continue with looking for compositions */
   2172                 continue;
   2173             }
   2174         }
   2175 
   2176         /* no combination this time */
   2177         prevCC=cc;
   2178         if(p==limit) {
   2179             return prevCC;
   2180         }
   2181 
   2182         /* if (c, c2) did not combine, then check if it is a starter */
   2183         if(cc==0) {
   2184             /* found a new starter; combineFlags==0 if (c, c2) is excluded */
   2185             if(combineFlags&_NORM_COMBINES_FWD) {
   2186                 /* it may combine with something, prepare for it */
   2187                 if(c2==0) {
   2188                     starterIsSupplementary=FALSE;
   2189                     starter=p-1;
   2190                 } else {
   2191                     starterIsSupplementary=TRUE;
   2192                     starter=p-2;
   2193                 }
   2194                 combineFwdIndex=combineBackIndex;
   2195             } else {
   2196                 /* it will not combine with anything */
   2197                 starter=NULL;
   2198             }
   2199         } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
   2200             /* FCC: no discontiguous compositions; any intervening character blocks */
   2201             starter=NULL;
   2202         }
   2203     }
   2204 }
   2205 
   2206 /* decompose and recompose [prevStarter..src[ */
   2207 static const UChar *
   2208 _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
   2209              const UChar *prevStarter, const UChar *src,
   2210              uint8_t &prevCC,
   2211              int32_t options, const UnicodeSet *nx,
   2212              UErrorCode *pErrorCode) {
   2213     UChar *recomposeLimit;
   2214     uint8_t trailCC;
   2215     UBool compat;
   2216 
   2217     compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
   2218 
   2219     /* decompose [prevStarter..src[ */
   2220     length=_decompose(buffer, bufferCapacity,
   2221                       prevStarter, (int32_t)(src-prevStarter),
   2222                       compat, nx,
   2223                       trailCC);
   2224     if(length>bufferCapacity) {
   2225         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
   2226             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   2227             return NULL;
   2228         }
   2229         length=_decompose(buffer, bufferCapacity,
   2230                           prevStarter, (int32_t)(src-prevStarter),
   2231                           compat, nx,
   2232                           trailCC);
   2233     }
   2234 
   2235     /* recompose the decomposition */
   2236     recomposeLimit=buffer+length;
   2237     if(length>=2) {
   2238         prevCC=_recompose(buffer, recomposeLimit, options, nx);
   2239     }
   2240 
   2241     /* return with a pointer to the recomposition and its length */
   2242     length=(int32_t)(recomposeLimit-buffer);
   2243     return buffer;
   2244 }
   2245 
   2246 static int32_t
   2247 _compose(UChar *dest, int32_t destCapacity,
   2248          const UChar *src, int32_t srcLength,
   2249          int32_t options, const UnicodeSet *nx,
   2250          UErrorCode *pErrorCode) {
   2251     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
   2252     UChar *buffer;
   2253     int32_t bufferCapacity;
   2254 
   2255     const UChar *limit, *prevSrc, *prevStarter;
   2256     uint32_t norm32, ccOrQCMask, qcMask;
   2257     int32_t destIndex, reorderStartIndex, length;
   2258     UChar c, c2, minNoMaybe;
   2259     uint8_t cc, prevCC;
   2260 
   2261     if(options&_NORM_OPTIONS_COMPAT) {
   2262         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
   2263         qcMask=_NORM_QC_NFKC;
   2264     } else {
   2265         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
   2266         qcMask=_NORM_QC_NFC;
   2267     }
   2268 
   2269     /* initialize */
   2270     buffer=stackBuffer;
   2271     bufferCapacity=_STACK_BUFFER_CAPACITY;
   2272 
   2273     /*
   2274      * prevStarter points to the last character before the current one
   2275      * that is a "true" starter with cc==0 and quick check "yes".
   2276      *
   2277      * prevStarter will be used instead of looking for a true starter
   2278      * while incrementally decomposing [prevStarter..prevSrc[
   2279      * in _composePart(). Having a good prevStarter allows to just decompose
   2280      * the entire [prevStarter..prevSrc[.
   2281      *
   2282      * When _composePart() backs out from prevSrc back to prevStarter,
   2283      * then it also backs out destIndex by the same amount.
   2284      * Therefore, at all times, the (prevSrc-prevStarter) source units
   2285      * must correspond 1:1 to destination units counted with destIndex,
   2286      * except for reordering.
   2287      * This is true for the qc "yes" characters copied in the fast loop,
   2288      * and for pure reordering.
   2289      * prevStarter must be set forward to src when this is not true:
   2290      * In _composePart() and after composing a Hangul syllable.
   2291      *
   2292      * This mechanism relies on the assumption that the decomposition of a true starter
   2293      * also begins with a true starter. gennorm/store.c checks for this.
   2294      */
   2295     prevStarter=src;
   2296 
   2297     ccOrQCMask=_NORM_CC_MASK|qcMask;
   2298     destIndex=reorderStartIndex=0;
   2299     prevCC=0;
   2300 
   2301     /* avoid compiler warnings */
   2302     norm32=0;
   2303     c=0;
   2304 
   2305     if(srcLength>=0) {
   2306         /* string with length */
   2307         limit=src+srcLength;
   2308     } else /* srcLength==-1 */ {
   2309         /* zero-terminated string */
   2310         limit=NULL;
   2311     }
   2312 
   2313     U_ALIGN_CODE(16);
   2314 
   2315     for(;;) {
   2316         /* count code units below the minimum or with irrelevant data for the quick check */
   2317         prevSrc=src;
   2318         if(limit==NULL) {
   2319             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
   2320                 prevCC=0;
   2321                 ++src;
   2322             }
   2323         } else {
   2324             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
   2325                 prevCC=0;
   2326                 ++src;
   2327             }
   2328         }
   2329 
   2330         /* copy these code units all at once */
   2331         if(src!=prevSrc) {
   2332             length=(int32_t)(src-prevSrc);
   2333             if((destIndex+length)<=destCapacity) {
   2334                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
   2335             }
   2336             destIndex+=length;
   2337             reorderStartIndex=destIndex;
   2338 
   2339             /* set prevStarter to the last character in the quick check loop */
   2340             prevStarter=src-1;
   2341             if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
   2342                 --prevStarter;
   2343             }
   2344 
   2345             prevSrc=src;
   2346         }
   2347 
   2348         /* end of source reached? */
   2349         if(limit==NULL ? c==0 : src==limit) {
   2350             break;
   2351         }
   2352 
   2353         /* c already contains *src and norm32 is set for it, increment src */
   2354         ++src;
   2355 
   2356         /*
   2357          * source buffer pointers:
   2358          *
   2359          *  all done      quick check   current char  not yet
   2360          *                "yes" but     (c, c2)       processed
   2361          *                may combine
   2362          *                forward
   2363          * [-------------[-------------[-------------[-------------[
   2364          * |             |             |             |             |
   2365          * start         prevStarter   prevSrc       src           limit
   2366          *
   2367          *
   2368          * destination buffer pointers and indexes:
   2369          *
   2370          *  all done      might take    not filled yet
   2371          *                characters for
   2372          *                reordering
   2373          * [-------------[-------------[-------------[
   2374          * |             |             |             |
   2375          * dest      reorderStartIndex destIndex     destCapacity
   2376          */
   2377 
   2378         /* check one above-minimum, relevant code unit */
   2379         /*
   2380          * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
   2381          * check for Jamo V/T, then for surrogates and regular characters
   2382          * c is not a Hangul syllable or Jamo L because
   2383          * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
   2384          */
   2385         if(isNorm32HangulOrJamo(norm32)) {
   2386             /*
   2387              * c is a Jamo V/T:
   2388              * try to compose with the previous character, Jamo V also with a following Jamo T,
   2389              * and set values here right now in case we just continue with the main loop
   2390              */
   2391             prevCC=cc=0;
   2392             reorderStartIndex=destIndex;
   2393 
   2394             if(
   2395                 destIndex>0 &&
   2396                 _composeHangul(
   2397                     *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
   2398                     destIndex<=destCapacity ? dest+(destIndex-1) : 0,
   2399                     nx)
   2400             ) {
   2401                 prevStarter=src;
   2402                 continue;
   2403             }
   2404 
   2405             /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
   2406             c2=0;
   2407             length=1;
   2408             prevStarter=prevSrc;
   2409         } else {
   2410             if(isNorm32Regular(norm32)) {
   2411                 c2=0;
   2412                 length=1;
   2413             } else {
   2414                 /* c is a lead surrogate, get the real norm32 */
   2415                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
   2416                     ++src;
   2417                     length=2;
   2418                     norm32=_getNorm32FromSurrogatePair(c, c2);
   2419                 } else {
   2420                     /* c is an unpaired lead surrogate, nothing to do */
   2421                     c2=0;
   2422                     length=1;
   2423                     norm32=0;
   2424                 }
   2425             }
   2426 
   2427             /* we are looking at the character (c, c2) at [prevSrc..src[ */
   2428             if(nx_contains(nx, c, c2)) {
   2429                 /* excluded: norm32==0 */
   2430                 cc=0;
   2431             } else if((norm32&qcMask)==0) {
   2432                 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
   2433             } else {
   2434                 const UChar *p;
   2435                 uint32_t decompQCMask;
   2436 
   2437                 /*
   2438                  * find appropriate boundaries around this character,
   2439                  * decompose the source text from between the boundaries,
   2440                  * and recompose it
   2441                  *
   2442                  * this puts the intermediate text into the side buffer because
   2443                  * it might be longer than the recomposition end result,
   2444                  * or the destination buffer may be too short or missing
   2445                  *
   2446                  * note that destIndex may be adjusted backwards to account
   2447                  * for source text that passed the quick check but needed to
   2448                  * take part in the recomposition
   2449                  */
   2450                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
   2451 
   2452                 /*
   2453                  * find the last true starter in [prevStarter..src[
   2454                  * it is either the decomposition of the current character (at prevSrc),
   2455                  * or prevStarter
   2456                  */
   2457                 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
   2458                     prevStarter=prevSrc;
   2459                 } else {
   2460                     /* adjust destIndex: back out what had been copied with qc "yes" */
   2461                     destIndex-=(int32_t)(prevSrc-prevStarter);
   2462                 }
   2463 
   2464                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
   2465                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
   2466 
   2467                 /* compose [prevStarter..src[ */
   2468                 p=_composePart(stackBuffer, buffer, bufferCapacity,
   2469                                length,          /* output */
   2470                                prevStarter, src,
   2471                                prevCC,          /* output */
   2472                                options, nx,
   2473                                pErrorCode);
   2474 
   2475                 if(p==NULL) {
   2476                     destIndex=0;   /* an error occurred (out of memory) */
   2477                     break;
   2478                 }
   2479 
   2480                 /* append the recomposed buffer contents to the destination buffer */
   2481                 if((destIndex+length)<=destCapacity) {
   2482                     while(length>0) {
   2483                         dest[destIndex++]=*p++;
   2484                         --length;
   2485                     }
   2486                 } else {
   2487                     /* buffer overflow */
   2488                     /* keep incrementing the destIndex for preflighting */
   2489                     destIndex+=length;
   2490                 }
   2491 
   2492                 /* set the next starter */
   2493                 prevStarter=src;
   2494 
   2495                 continue;
   2496             }
   2497         }
   2498 
   2499         /* append the single code point (c, c2) to the destination buffer */
   2500         if((destIndex+length)<=destCapacity) {
   2501             if(cc!=0 && cc<prevCC) {
   2502                 /* (c, c2) is out of order with respect to the preceding text */
   2503                 UChar *reorderSplit=dest+destIndex;
   2504                 destIndex+=length;
   2505                 prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
   2506             } else {
   2507                 /* just append (c, c2) */
   2508                 dest[destIndex++]=c;
   2509                 if(c2!=0) {
   2510                     dest[destIndex++]=c2;
   2511                 }
   2512                 prevCC=cc;
   2513             }
   2514         } else {
   2515             /* buffer overflow */
   2516             /* keep incrementing the destIndex for preflighting */
   2517             destIndex+=length;
   2518             prevCC=cc;
   2519         }
   2520     }
   2521 
   2522     /* cleanup */
   2523     if(buffer!=stackBuffer) {
   2524         uprv_free(buffer);
   2525     }
   2526 
   2527     return destIndex;
   2528 }
   2529 
   2530 U_CAPI int32_t U_EXPORT2
   2531 unorm_compose(UChar *dest, int32_t destCapacity,
   2532               const UChar *src, int32_t srcLength,
   2533               UBool compat, int32_t options,
   2534               UErrorCode *pErrorCode) {
   2535     const UnicodeSet *nx;
   2536     int32_t destIndex;
   2537 
   2538     if(!_haveData(*pErrorCode)) {
   2539         return 0;
   2540     }
   2541 
   2542     nx=getNX(options, *pErrorCode);
   2543     if(U_FAILURE(*pErrorCode)) {
   2544         return 0;
   2545     }
   2546 
   2547     /* reset options bits that should only be set here or inside _compose() */
   2548     options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
   2549 
   2550     if(compat) {
   2551         options|=_NORM_OPTIONS_COMPAT;
   2552     }
   2553 
   2554     destIndex=_compose(dest, destCapacity,
   2555                        src, srcLength,
   2556                        options, nx,
   2557                        pErrorCode);
   2558 
   2559     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
   2560 }
   2561 
   2562 /* make FCD ----------------------------------------------------------------- */
   2563 
   2564 static const UChar *
   2565 _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
   2566     UChar c, c2;
   2567 
   2568     /*
   2569      * find the first position in [src..limit[ after some cc==0 according to FCD data
   2570      *
   2571      * at the beginning of the loop, we have fcd16 from before src
   2572      *
   2573      * stop at positions:
   2574      * - after trail cc==0
   2575      * - at the end of the source
   2576      * - before lead cc==0
   2577      */
   2578     for(;;) {
   2579         /* stop if trail cc==0 for the previous character */
   2580         if((fcd16&0xff)==0) {
   2581             break;
   2582         }
   2583 
   2584         /* get c=*src - stop at end of string */
   2585         if(src==limit) {
   2586             break;
   2587         }
   2588         c=*src;
   2589 
   2590         /* stop if lead cc==0 for this character */
   2591         if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
   2592             break; /* catches terminating NUL, too */
   2593         }
   2594 
   2595         if(!UTF_IS_FIRST_SURROGATE(c)) {
   2596             if(fcd16<=0xff) {
   2597                 break;
   2598             }
   2599             ++src;
   2600         } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
   2601             /* c is a lead surrogate, get the real fcd16 */
   2602             fcd16=_getFCD16FromSurrogatePair(c, c2);
   2603             if(fcd16<=0xff) {
   2604                 break;
   2605             }
   2606             src+=2;
   2607         } else {
   2608             /* c is an unpaired first surrogate, lead cc==0 */
   2609             break;
   2610         }
   2611     }
   2612 
   2613     return src;
   2614 }
   2615 
   2616 static uint8_t
   2617 _decomposeFCD(const UChar *src, const UChar *decompLimit,
   2618               UChar *dest, int32_t &destIndex, int32_t destCapacity,
   2619               const UnicodeSet *nx) {
   2620     const UChar *p;
   2621     uint32_t norm32;
   2622     int32_t reorderStartIndex, length;
   2623     UChar c, c2;
   2624     uint8_t cc, prevCC, trailCC;
   2625 
   2626     /*
   2627      * canonically decompose [src..decompLimit[
   2628      *
   2629      * all characters in this range have some non-zero cc,
   2630      * directly or in decomposition,
   2631      * so that we do not need to check in the following for quick-check limits etc.
   2632      *
   2633      * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
   2634      *
   2635      * we also do not need to check for c==0 because we have an established decompLimit
   2636      */
   2637     reorderStartIndex=destIndex;
   2638     prevCC=0;
   2639 
   2640     while(src<decompLimit) {
   2641         c=*src++;
   2642         norm32=_getNorm32(c);
   2643         if(isNorm32Regular(norm32)) {
   2644             c2=0;
   2645             length=1;
   2646         } else {
   2647             /*
   2648              * reminder: this function is called with [src..decompLimit[
   2649              * not containing any Hangul/Jamo characters,
   2650              * therefore the only specials are lead surrogates
   2651              */
   2652             /* c is a lead surrogate, get the real norm32 */
   2653             if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
   2654                 ++src;
   2655                 length=2;
   2656                 norm32=_getNorm32FromSurrogatePair(c, c2);
   2657             } else {
   2658                 c2=0;
   2659                 length=1;
   2660                 norm32=0;
   2661             }
   2662         }
   2663 
   2664         /* get the decomposition and the lead and trail cc's */
   2665         if(nx_contains(nx, c, c2)) {
   2666             /* excluded: norm32==0 */
   2667             cc=trailCC=0;
   2668             p=NULL;
   2669         } else if((norm32&_NORM_QC_NFD)==0) {
   2670             /* c does not decompose */
   2671             cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
   2672             p=NULL;
   2673         } else {
   2674             /* c decomposes, get everything from the variable-length extra data */
   2675             p=_decompose(norm32, length, cc, trailCC);
   2676             if(length==1) {
   2677                 /* fastpath a single code unit from decomposition */
   2678                 c=*p;
   2679                 c2=0;
   2680                 p=NULL;
   2681             }
   2682         }
   2683 
   2684         /* append the decomposition to the destination buffer, assume length>0 */
   2685         if((destIndex+length)<=destCapacity) {
   2686             UChar *reorderSplit=dest+destIndex;
   2687             if(p==NULL) {
   2688                 /* fastpath: single code point */
   2689                 if(cc!=0 && cc<prevCC) {
   2690                     /* (c, c2) is out of order with respect to the preceding text */
   2691                     destIndex+=length;
   2692                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
   2693                 } else {
   2694                     /* just append (c, c2) */
   2695                     dest[destIndex++]=c;
   2696                     if(c2!=0) {
   2697                         dest[destIndex++]=c2;
   2698                     }
   2699                 }
   2700             } else {
   2701                 /* general: multiple code points (ordered by themselves) from decomposition */
   2702                 if(cc!=0 && cc<prevCC) {
   2703                     /* the decomposition is out of order with respect to the preceding text */
   2704                     destIndex+=length;
   2705                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
   2706                 } else {
   2707                     /* just append the decomposition */
   2708                     do {
   2709                         dest[destIndex++]=*p++;
   2710                     } while(--length>0);
   2711                 }
   2712             }
   2713         } else {
   2714             /* buffer overflow */
   2715             /* keep incrementing the destIndex for preflighting */
   2716             destIndex+=length;
   2717         }
   2718 
   2719         prevCC=trailCC;
   2720         if(prevCC==0) {
   2721             reorderStartIndex=destIndex;
   2722         }
   2723     }
   2724 
   2725     return prevCC;
   2726 }
   2727 
   2728 static int32_t
   2729 unorm_makeFCD(UChar *dest, int32_t destCapacity,
   2730               const UChar *src, int32_t srcLength,
   2731               const UnicodeSet *nx,
   2732               UErrorCode *pErrorCode) {
   2733     const UChar *limit, *prevSrc, *decompStart;
   2734     int32_t destIndex, length;
   2735     UChar c, c2;
   2736     uint16_t fcd16;
   2737     int16_t prevCC, cc;
   2738 
   2739     if(!_haveData(*pErrorCode)) {
   2740         return 0;
   2741     }
   2742 
   2743     /* initialize */
   2744     decompStart=src;
   2745     destIndex=0;
   2746     prevCC=0;
   2747 
   2748     /* avoid compiler warnings */
   2749     c=0;
   2750     fcd16=0;
   2751 
   2752     if(srcLength>=0) {
   2753         /* string with length */
   2754         limit=src+srcLength;
   2755     } else /* srcLength==-1 */ {
   2756         /* zero-terminated string */
   2757         limit=NULL;
   2758     }
   2759 
   2760     U_ALIGN_CODE(16);
   2761 
   2762     for(;;) {
   2763         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
   2764         prevSrc=src;
   2765         if(limit==NULL) {
   2766             for(;;) {
   2767                 c=*src;
   2768                 if(c<_NORM_MIN_WITH_LEAD_CC) {
   2769                     if(c==0) {
   2770                         break;
   2771                     }
   2772                     prevCC=(int16_t)-c;
   2773                 } else if((fcd16=_getFCD16(c))==0) {
   2774                     prevCC=0;
   2775                 } else {
   2776                     break;
   2777                 }
   2778                 ++src;
   2779             }
   2780         } else {
   2781             for(;;) {
   2782                 if(src==limit) {
   2783                     break;
   2784                 } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
   2785                     prevCC=(int16_t)-c;
   2786                 } else if((fcd16=_getFCD16(c))==0) {
   2787                     prevCC=0;
   2788                 } else {
   2789                     break;
   2790                 }
   2791                 ++src;
   2792             }
   2793         }
   2794 
   2795         /*
   2796          * prevCC has values from the following ranges:
   2797          * 0..0xff - the previous trail combining class
   2798          * <0      - the negative value of the previous code unit;
   2799          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
   2800          *           was deferred so that average text is checked faster
   2801          */
   2802 
   2803         /* copy these code units all at once */
   2804         if(src!=prevSrc) {
   2805             length=(int32_t)(src-prevSrc);
   2806             if((destIndex+length)<=destCapacity) {
   2807                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
   2808             }
   2809             destIndex+=length;
   2810             prevSrc=src;
   2811 
   2812             /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
   2813             if(prevCC<0) {
   2814                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
   2815                 if(!nx_contains(nx, (UChar32)-prevCC)) {
   2816                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
   2817                 } else {
   2818                     prevCC=0; /* excluded: fcd16==0 */
   2819                 }
   2820 
   2821                 /*
   2822                  * set a pointer to this below-U+0300 character;
   2823                  * if prevCC==0 then it will moved to after this character below
   2824                  */
   2825                 decompStart=prevSrc-1;
   2826             }
   2827         }
   2828         /*
   2829          * now:
   2830          * prevSrc==src - used later to adjust destIndex before decomposition
   2831          * prevCC>=0
   2832          */
   2833 
   2834         /* end of source reached? */
   2835         if(limit==NULL ? c==0 : src==limit) {
   2836             break;
   2837         }
   2838 
   2839         /* set a pointer to after the last source position where prevCC==0 */
   2840         if(prevCC==0) {
   2841             decompStart=prevSrc;
   2842         }
   2843 
   2844         /* c already contains *src and fcd16 is set for it, increment src */
   2845         ++src;
   2846 
   2847         /* check one above-minimum, relevant code unit */
   2848         if(UTF_IS_FIRST_SURROGATE(c)) {
   2849             /* c is a lead surrogate, get the real fcd16 */
   2850             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
   2851                 ++src;
   2852                 fcd16=_getFCD16FromSurrogatePair(c, c2);
   2853             } else {
   2854                 c2=0;
   2855                 fcd16=0;
   2856             }
   2857         } else {
   2858             c2=0;
   2859         }
   2860 
   2861         /* we are looking at the character (c, c2) at [prevSrc..src[ */
   2862         if(nx_contains(nx, c, c2)) {
   2863             fcd16=0; /* excluded: fcd16==0 */
   2864         }
   2865 
   2866         /* check the combining order, get the lead cc */
   2867         cc=(int16_t)(fcd16>>8);
   2868         if(cc==0 || cc>=prevCC) {
   2869             /* the order is ok */
   2870             if(cc==0) {
   2871                 decompStart=prevSrc;
   2872             }
   2873             prevCC=(int16_t)(fcd16&0xff);
   2874 
   2875             /* just append (c, c2) */
   2876             length= c2==0 ? 1 : 2;
   2877             if((destIndex+length)<=destCapacity) {
   2878                 dest[destIndex++]=c;
   2879                 if(c2!=0) {
   2880                     dest[destIndex++]=c2;
   2881                 }
   2882             } else {
   2883                 destIndex+=length;
   2884             }
   2885         } else {
   2886             /*
   2887              * back out the part of the source that we copied already but
   2888              * is now going to be decomposed;
   2889              * prevSrc is set to after what was copied
   2890              */
   2891             destIndex-=(int32_t)(prevSrc-decompStart);
   2892 
   2893             /*
   2894              * find the part of the source that needs to be decomposed;
   2895              * to be safe and simple, decompose to before the next character with lead cc==0
   2896              */
   2897             src=_findSafeFCD(src, limit, fcd16);
   2898 
   2899             /*
   2900              * the source text does not fulfill the conditions for FCD;
   2901              * decompose and reorder a limited piece of the text
   2902              */
   2903             prevCC=_decomposeFCD(decompStart, src,
   2904                                  dest, destIndex, destCapacity,
   2905                                  nx);
   2906             decompStart=src;
   2907         }
   2908     }
   2909 
   2910     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
   2911 }
   2912 
   2913 /* quick check functions ---------------------------------------------------- */
   2914 
   2915 static UBool
   2916 unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
   2917     const UChar *limit;
   2918     UChar c, c2;
   2919     uint16_t fcd16;
   2920     int16_t prevCC, cc;
   2921 
   2922     /* initialize */
   2923     prevCC=0;
   2924 
   2925     if(srcLength>=0) {
   2926         /* string with length */
   2927         limit=src+srcLength;
   2928     } else /* srcLength==-1 */ {
   2929         /* zero-terminated string */
   2930         limit=NULL;
   2931     }
   2932 
   2933     U_ALIGN_CODE(16);
   2934 
   2935     for(;;) {
   2936         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
   2937         if(limit==NULL) {
   2938             for(;;) {
   2939                 c=*src++;
   2940                 if(c<_NORM_MIN_WITH_LEAD_CC) {
   2941                     if(c==0) {
   2942                         return TRUE;
   2943                     }
   2944                     /*
   2945                      * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
   2946                      * because chances are good that the next one will have
   2947                      * a leading cc of 0;
   2948                      * _getFCD16(-prevCC) is later called when necessary -
   2949                      * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
   2950                      */
   2951                     prevCC=(int16_t)-c;
   2952                 } else if((fcd16=_getFCD16(c))==0) {
   2953                     prevCC=0;
   2954                 } else {
   2955                     break;
   2956                 }
   2957             }
   2958         } else {
   2959             for(;;) {
   2960                 if(src==limit) {
   2961                     return TRUE;
   2962                 } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
   2963                     prevCC=(int16_t)-c;
   2964                 } else if((fcd16=_getFCD16(c))==0) {
   2965                     prevCC=0;
   2966                 } else {
   2967                     break;
   2968                 }
   2969             }
   2970         }
   2971 
   2972         /* check one above-minimum, relevant code unit */
   2973         if(UTF_IS_FIRST_SURROGATE(c)) {
   2974             /* c is a lead surrogate, get the real fcd16 */
   2975             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
   2976                 ++src;
   2977                 fcd16=_getFCD16FromSurrogatePair(c, c2);
   2978             } else {
   2979                 c2=0;
   2980                 fcd16=0;
   2981             }
   2982         } else {
   2983             c2=0;
   2984         }
   2985 
   2986         if(nx_contains(nx, c, c2)) {
   2987             prevCC=0; /* excluded: fcd16==0 */
   2988             continue;
   2989         }
   2990 
   2991         /*
   2992          * prevCC has values from the following ranges:
   2993          * 0..0xff - the previous trail combining class
   2994          * <0      - the negative value of the previous code unit;
   2995          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
   2996          *           was deferred so that average text is checked faster
   2997          */
   2998 
   2999         /* check the combining order */
   3000         cc=(int16_t)(fcd16>>8);
   3001         if(cc!=0) {
   3002             if(prevCC<0) {
   3003                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
   3004                 if(!nx_contains(nx, (UChar32)-prevCC)) {
   3005                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
   3006                 } else {
   3007                     prevCC=0; /* excluded: fcd16==0 */
   3008                 }
   3009             }
   3010 
   3011             if(cc<prevCC) {
   3012                 return FALSE;
   3013             }
   3014         }
   3015         prevCC=(int16_t)(fcd16&0xff);
   3016     }
   3017 }
   3018 
   3019 static UNormalizationCheckResult
   3020 _quickCheck(const UChar *src,
   3021             int32_t srcLength,
   3022             UNormalizationMode mode,
   3023             UBool allowMaybe,
   3024             const UnicodeSet *nx,
   3025             UErrorCode *pErrorCode) {
   3026     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
   3027     UChar *buffer;
   3028     int32_t bufferCapacity;
   3029 
   3030     const UChar *start, *limit;
   3031     uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
   3032     int32_t options;
   3033     UChar c, c2, minNoMaybe;
   3034     uint8_t cc, prevCC;
   3035     UNormalizationCheckResult result;
   3036 
   3037     /* check arguments */
   3038     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   3039         return UNORM_MAYBE;
   3040     }
   3041 
   3042     if(src==NULL || srcLength<-1) {
   3043         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3044         return UNORM_MAYBE;
   3045     }
   3046 
   3047     if(!_haveData(*pErrorCode)) {
   3048         return UNORM_MAYBE;
   3049     }
   3050 
   3051     /* check for a valid mode and set the quick check minimum and mask */
   3052     switch(mode) {
   3053     case UNORM_NFC:
   3054         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
   3055         qcMask=_NORM_QC_NFC;
   3056         options=0;
   3057         break;
   3058     case UNORM_NFKC:
   3059         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
   3060         qcMask=_NORM_QC_NFKC;
   3061         options=_NORM_OPTIONS_COMPAT;
   3062         break;
   3063     case UNORM_NFD:
   3064         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
   3065         qcMask=_NORM_QC_NFD;
   3066         options=0;
   3067         break;
   3068     case UNORM_NFKD:
   3069         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
   3070         qcMask=_NORM_QC_NFKD;
   3071         options=_NORM_OPTIONS_COMPAT;
   3072         break;
   3073     case UNORM_FCD:
   3074         if(fcdTrie.index==NULL) {
   3075             *pErrorCode=U_UNSUPPORTED_ERROR;
   3076             return UNORM_MAYBE;
   3077         }
   3078         return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
   3079     default:
   3080         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3081         return UNORM_MAYBE;
   3082     }
   3083 
   3084     /* initialize */
   3085     buffer=stackBuffer;
   3086     bufferCapacity=_STACK_BUFFER_CAPACITY;
   3087 
   3088     ccOrQCMask=_NORM_CC_MASK|qcMask;
   3089     result=UNORM_YES;
   3090     prevCC=0;
   3091 
   3092     start=src;
   3093     if(srcLength>=0) {
   3094         /* string with length */
   3095         limit=src+srcLength;
   3096     } else /* srcLength==-1 */ {
   3097         /* zero-terminated string */
   3098         limit=NULL;
   3099     }
   3100 
   3101     U_ALIGN_CODE(16);
   3102 
   3103     for(;;) {
   3104         /* skip a run of code units below the minimum or with irrelevant data for the quick check */
   3105         if(limit==NULL) {
   3106             for(;;) {
   3107                 c=*src++;
   3108                 if(c<minNoMaybe) {
   3109                     if(c==0) {
   3110                         goto endloop; /* break out of outer loop */
   3111                     }
   3112                 } else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
   3113                     break;
   3114                 }
   3115                 prevCC=0;
   3116             }
   3117         } else {
   3118             for(;;) {
   3119                 if(src==limit) {
   3120                     goto endloop; /* break out of outer loop */
   3121                 } else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
   3122                     break;
   3123                 }
   3124                 prevCC=0;
   3125             }
   3126         }
   3127 
   3128         /* check one above-minimum, relevant code unit */
   3129         if(U16_IS_LEAD(c)) {
   3130             /* c is a lead surrogate, get the real norm32 */
   3131             if(src!=limit && U16_IS_TRAIL(c2=*src)) {
   3132                 ++src;
   3133                 norm32=_getNorm32FromSurrogatePair(c, c2);
   3134             } else {
   3135                 c2=0;
   3136                 norm32=0;
   3137             }
   3138         } else {
   3139             c2=0;
   3140         }
   3141 
   3142         if(nx_contains(nx, c, c2)) {
   3143             /* excluded: norm32==0 */
   3144             norm32=0;
   3145         }
   3146 
   3147         /* check the combining order */
   3148         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
   3149         if(cc!=0 && cc<prevCC) {
   3150             result=UNORM_NO;
   3151             break;
   3152         }
   3153         prevCC=cc;
   3154 
   3155         /* check for "no" or "maybe" quick check flags */
   3156         qcNorm32=norm32&qcMask;
   3157         if(qcNorm32&_NORM_QC_ANY_NO) {
   3158             result=UNORM_NO;
   3159             break;
   3160         } else if(qcNorm32!=0) {
   3161             /* "maybe" can only occur for NFC and NFKC */
   3162             if(allowMaybe) {
   3163                 result=UNORM_MAYBE;
   3164             } else {
   3165                 /* normalize a section around here to see if it is really normalized or not */
   3166                 const UChar *prevStarter;
   3167                 uint32_t decompQCMask;
   3168                 int32_t length;
   3169 
   3170                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
   3171 
   3172                 /* find the previous starter */
   3173                 prevStarter=src-1; /* set prevStarter to the beginning of the current character */
   3174                 if(UTF_IS_TRAIL(*prevStarter)) {
   3175                     --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */
   3176                 }
   3177                 prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);
   3178 
   3179                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
   3180                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
   3181 
   3182                 /* decompose and recompose [prevStarter..src[ */
   3183                 _composePart(stackBuffer, buffer, bufferCapacity,
   3184                              length,
   3185                              prevStarter,
   3186                              src,
   3187                              prevCC,
   3188                              options, nx, pErrorCode);
   3189                 if(U_FAILURE(*pErrorCode)) {
   3190                     result=UNORM_MAYBE; /* error (out of memory) */
   3191                     break;
   3192                 }
   3193 
   3194                 /* compare the normalized version with the original */
   3195                 if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
   3196                     result=UNORM_NO; /* normalization differs */
   3197                     break;
   3198                 }
   3199 
   3200                 /* continue after the next starter */
   3201             }
   3202         }
   3203     }
   3204 endloop:
   3205 
   3206     if(buffer!=stackBuffer) {
   3207         uprv_free(buffer);
   3208     }
   3209 
   3210     return result;
   3211 }
   3212 
   3213 U_CAPI UNormalizationCheckResult U_EXPORT2
   3214 unorm_quickCheck(const UChar *src,
   3215                  int32_t srcLength,
   3216                  UNormalizationMode mode,
   3217                  UErrorCode *pErrorCode) {
   3218     return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode);
   3219 }
   3220 
   3221 U_CAPI UNormalizationCheckResult U_EXPORT2
   3222 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
   3223                             UNormalizationMode mode, int32_t options,
   3224                             UErrorCode *pErrorCode) {
   3225     return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode);
   3226 }
   3227 
   3228 U_CFUNC UNormalizationCheckResult
   3229 unorm_internalQuickCheck(const UChar *src,
   3230                          int32_t srcLength,
   3231                          UNormalizationMode mode,
   3232                          UBool allowMaybe,
   3233                          const UnicodeSet *nx,
   3234                          UErrorCode *pErrorCode) {
   3235     return _quickCheck(src, srcLength, mode, allowMaybe, nx, pErrorCode);
   3236 }
   3237 
   3238 U_CAPI UBool U_EXPORT2
   3239 unorm_isNormalized(const UChar *src, int32_t srcLength,
   3240                    UNormalizationMode mode,
   3241                    UErrorCode *pErrorCode) {
   3242     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode));
   3243 }
   3244 
   3245 U_CAPI UBool U_EXPORT2
   3246 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
   3247                               UNormalizationMode mode, int32_t options,
   3248                               UErrorCode *pErrorCode) {
   3249     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode));
   3250 }
   3251 
   3252 /* normalize() API ---------------------------------------------------------- */
   3253 
   3254 /**
   3255  * Internal API for normalizing.
   3256  * Does not check for bad input.
   3257  * Requires _haveData() to be true.
   3258  * @internal
   3259  */
   3260 U_CFUNC int32_t
   3261 unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
   3262                               const UChar *src, int32_t srcLength,
   3263                               UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
   3264                               UErrorCode *pErrorCode) {
   3265     int32_t destLength;
   3266     uint8_t trailCC;
   3267 
   3268     switch(mode) {
   3269     case UNORM_NFD:
   3270         destLength=_decompose(dest, destCapacity,
   3271                               src, srcLength,
   3272                               FALSE, nx, trailCC);
   3273         break;
   3274     case UNORM_NFKD:
   3275         destLength=_decompose(dest, destCapacity,
   3276                               src, srcLength,
   3277                               TRUE, nx, trailCC);
   3278         break;
   3279     case UNORM_NFC:
   3280         destLength=_compose(dest, destCapacity,
   3281                             src, srcLength,
   3282                             options, nx, pErrorCode);
   3283         break;
   3284     case UNORM_NFKC:
   3285         destLength=_compose(dest, destCapacity,
   3286                             src, srcLength,
   3287                             options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
   3288         break;
   3289     case UNORM_FCD:
   3290         if(fcdTrie.index==NULL) {
   3291             *pErrorCode=U_UNSUPPORTED_ERROR;
   3292             return 0;
   3293         }
   3294         return unorm_makeFCD(dest, destCapacity,
   3295                              src, srcLength,
   3296                              nx,
   3297                              pErrorCode);
   3298 #if 0
   3299     case UNORM_FCC:
   3300         destLength=_compose(dest, destCapacity,
   3301                             src, srcLength,
   3302                             options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
   3303         break;
   3304 #endif
   3305     case UNORM_NONE:
   3306         /* just copy the string */
   3307         if(srcLength==-1) {
   3308             srcLength=u_strlen(src);
   3309         }
   3310         if(srcLength>0 && srcLength<=destCapacity) {
   3311             uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
   3312         }
   3313         destLength=srcLength;
   3314         break;
   3315     default:
   3316         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3317         return 0;
   3318     }
   3319 
   3320     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
   3321 }
   3322 
   3323 /**
   3324  * Internal API for normalizing.
   3325  * Does not check for bad input.
   3326  * @internal
   3327  */
   3328 U_CAPI int32_t U_EXPORT2
   3329 unorm_internalNormalize(UChar *dest, int32_t destCapacity,
   3330                         const UChar *src, int32_t srcLength,
   3331                         UNormalizationMode mode, int32_t options,
   3332                         UErrorCode *pErrorCode) {
   3333     const UnicodeSet *nx;
   3334 
   3335     if(!_haveData(*pErrorCode)) {
   3336         return 0;
   3337     }
   3338 
   3339     nx=getNX(options, *pErrorCode);
   3340     if(U_FAILURE(*pErrorCode)) {
   3341         return 0;
   3342     }
   3343 
   3344     /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
   3345     options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
   3346 
   3347     return unorm_internalNormalizeWithNX(dest, destCapacity,
   3348                                          src, srcLength,
   3349                                          mode, options, nx,
   3350                                          pErrorCode);
   3351 }
   3352 
   3353 /** Public API for normalizing. */
   3354 U_CAPI int32_t U_EXPORT2
   3355 unorm_normalize(const UChar *src, int32_t srcLength,
   3356                 UNormalizationMode mode, int32_t options,
   3357                 UChar *dest, int32_t destCapacity,
   3358                 UErrorCode *pErrorCode) {
   3359     /* check argument values */
   3360     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   3361         return 0;
   3362     }
   3363 
   3364     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
   3365         src==NULL || srcLength<-1
   3366     ) {
   3367         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3368         return 0;
   3369     }
   3370 
   3371     /* check for overlapping src and destination */
   3372     if( dest!=NULL &&
   3373         ((src>=dest && src<(dest+destCapacity)) ||
   3374          (srcLength>0 && dest>=src && dest<(src+srcLength)))
   3375     ) {
   3376         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3377         return 0;
   3378     }
   3379 
   3380     return unorm_internalNormalize(dest, destCapacity,
   3381                                    src, srcLength,
   3382                                    mode, options,
   3383                                    pErrorCode);
   3384 }
   3385 
   3386 
   3387 /* iteration functions ------------------------------------------------------ */
   3388 
   3389 /*
   3390  * These iteration functions are the core implementations of the
   3391  * Normalizer class iteration API.
   3392  * They read from a UCharIterator into their own buffer
   3393  * and normalize into the Normalizer iteration buffer.
   3394  * Normalizer itself then iterates over its buffer until that needs to be
   3395  * filled again.
   3396  */
   3397 
   3398 /*
   3399  * ### TODO:
   3400  * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
   3401  * if iteration bounds are reached,
   3402  * try to not call hasNext/hasPrevious and instead check for >=0.
   3403  */
   3404 
   3405 /* backward iteration ------------------------------------------------------- */
   3406 
   3407 /*
   3408  * read backwards and get norm32
   3409  * return 0 if the character is <minC
   3410  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
   3411  */
   3412 static inline uint32_t
   3413 _getPrevNorm32(UCharIterator &src, uint32_t minC, UChar &c, UChar &c2) {
   3414     /* need src.hasPrevious() */
   3415     c=(UChar)src.previous(&src);
   3416     c2=0;
   3417 
   3418     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
   3419     if(c<minC) {
   3420         return 0;
   3421     } else if(!UTF_IS_SURROGATE(c)) {
   3422         return _getNorm32(c);
   3423     } else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
   3424         /* unpaired surrogate */
   3425         return 0;
   3426     } else if(UTF_IS_FIRST_SURROGATE(c2=(UChar)src.previous(&src))) {
   3427         return _getNorm32FromSurrogatePair(c2, c);
   3428     } else {
   3429         /* unpaired second surrogate, undo the c2=src.previous() movement */
   3430         src.move(&src, 1, UITER_CURRENT);
   3431         c2=0;
   3432         return 0;
   3433     }
   3434 }
   3435 
   3436 /*
   3437  * read backwards and check if the character is a previous-iteration boundary
   3438  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
   3439  */
   3440 typedef UBool
   3441 IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
   3442 
   3443 /*
   3444  * for NF*D:
   3445  * read backwards and check if the lead combining class is 0
   3446  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
   3447  */
   3448 static UBool
   3449 _isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
   3450     return _isNFDSafe(_getPrevNorm32(src, minC, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
   3451 }
   3452 
   3453 /*
   3454  * read backwards and check if the character is (or its decomposition begins with)
   3455  * a "true starter" (cc==0 and NF*C_YES)
   3456  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
   3457  */
   3458 static UBool
   3459 _isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
   3460     uint32_t norm32, decompQCMask;
   3461 
   3462     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
   3463     norm32=_getPrevNorm32(src, minC, c, c2);
   3464     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
   3465 }
   3466 
   3467 static int32_t
   3468 _findPreviousIterationBoundary(UCharIterator &src,
   3469                                IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
   3470                                UChar *&buffer, int32_t &bufferCapacity,
   3471                                int32_t &startIndex,
   3472                                UErrorCode *pErrorCode) {
   3473     UChar *stackBuffer;
   3474     UChar c, c2;
   3475     UBool isBoundary;
   3476 
   3477     /* initialize */
   3478     stackBuffer=buffer;
   3479     startIndex=bufferCapacity; /* fill the buffer from the end backwards */
   3480 
   3481     while(src.hasPrevious(&src)) {
   3482         isBoundary=isPrevBoundary(src, minC, mask, c, c2);
   3483 
   3484         /* always write this character to the front of the buffer */
   3485         /* make sure there is enough space in the buffer */
   3486         if(startIndex < (c2==0 ? 1 : 2)) {
   3487             int32_t bufferLength=bufferCapacity;
   3488 
   3489             if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
   3490                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   3491                 src.move(&src, 0, UITER_START);
   3492                 return 0;
   3493             }
   3494 
   3495             /* move the current buffer contents up */
   3496             uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR);
   3497             startIndex+=bufferCapacity-bufferLength;
   3498         }
   3499 
   3500         buffer[--startIndex]=c;
   3501         if(c2!=0) {
   3502             buffer[--startIndex]=c2;
   3503         }
   3504 
   3505         /* stop if this just-copied character is a boundary */
   3506         if(isBoundary) {
   3507             break;
   3508         }
   3509     }
   3510 
   3511     /* return the length of the buffer contents */
   3512     return bufferCapacity-startIndex;
   3513 }
   3514 
   3515 U_CAPI int32_t U_EXPORT2
   3516 unorm_previous(UCharIterator *src,
   3517                UChar *dest, int32_t destCapacity,
   3518                UNormalizationMode mode, int32_t options,
   3519                UBool doNormalize, UBool *pNeededToNormalize,
   3520                UErrorCode *pErrorCode) {
   3521     UChar stackBuffer[100];
   3522     UChar *buffer=NULL;
   3523     IsPrevBoundaryFn *isPreviousBoundary=NULL;
   3524     uint32_t mask=0;
   3525     int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
   3526     int32_t c=0, c2=0;
   3527     UChar minC=0;
   3528 
   3529     /* check argument values */
   3530     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   3531         return 0;
   3532     }
   3533 
   3534     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
   3535         src==NULL
   3536     ) {
   3537         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3538         return 0;
   3539     }
   3540 
   3541     if(!_haveData(*pErrorCode)) {
   3542         return 0;
   3543     }
   3544 
   3545     if(pNeededToNormalize!=NULL) {
   3546         *pNeededToNormalize=FALSE;
   3547     }
   3548 
   3549     switch(mode) {
   3550     case UNORM_FCD:
   3551         if(fcdTrie.index==NULL) {
   3552             *pErrorCode=U_UNSUPPORTED_ERROR;
   3553             return 0;
   3554         }
   3555         /* fall through to NFD */
   3556     case UNORM_NFD:
   3557         isPreviousBoundary=_isPrevNFDSafe;
   3558         minC=_NORM_MIN_WITH_LEAD_CC;
   3559         mask=_NORM_CC_MASK|_NORM_QC_NFD;
   3560         break;
   3561     case UNORM_NFKD:
   3562         isPreviousBoundary=_isPrevNFDSafe;
   3563         minC=_NORM_MIN_WITH_LEAD_CC;
   3564         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
   3565         break;
   3566     case UNORM_NFC:
   3567         isPreviousBoundary=_isPrevTrueStarter;
   3568         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
   3569         mask=_NORM_CC_MASK|_NORM_QC_NFC;
   3570         break;
   3571     case UNORM_NFKC:
   3572         isPreviousBoundary=_isPrevTrueStarter;
   3573         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
   3574         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
   3575         break;
   3576     case UNORM_NONE:
   3577         destLength=0;
   3578         if((c=src->previous(src))>=0) {
   3579             destLength=1;
   3580             if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
   3581                 if(UTF_IS_LEAD(c2)) {
   3582                     if(destCapacity>=2) {
   3583                         dest[1]=(UChar)c; /* trail surrogate */
   3584                         destLength=2;
   3585                     }
   3586                     c=c2; /* lead surrogate to be written below */
   3587                 } else {
   3588                     src->move(src, 1, UITER_CURRENT);
   3589                 }
   3590             }
   3591 
   3592             if(destCapacity>0) {
   3593                 dest[0]=(UChar)c;
   3594             }
   3595         }
   3596         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
   3597     default:
   3598         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3599         return 0;
   3600     }
   3601 
   3602     buffer=stackBuffer;
   3603     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
   3604     bufferLength=_findPreviousIterationBoundary(*src,
   3605                                                 isPreviousBoundary, minC, mask,
   3606                                                 buffer, bufferCapacity,
   3607                                                 startIndex,
   3608                                                 pErrorCode);
   3609     if(bufferLength>0) {
   3610         if(doNormalize) {
   3611             destLength=unorm_internalNormalize(dest, destCapacity,
   3612                                                buffer+startIndex, bufferLength,
   3613                                                mode, options,
   3614                                                pErrorCode);
   3615             if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
   3616                 *pNeededToNormalize=
   3617                     (UBool)(destLength!=bufferLength ||
   3618                             0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
   3619             }
   3620         } else {
   3621             /* just copy the source characters */
   3622             if(destCapacity>0) {
   3623                 uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
   3624             }
   3625             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
   3626         }
   3627     } else {
   3628         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
   3629     }
   3630 
   3631     /* cleanup */
   3632     if(buffer!=stackBuffer) {
   3633         uprv_free(buffer);
   3634     }
   3635 
   3636     return destLength;
   3637 }
   3638 
   3639 /* forward iteration -------------------------------------------------------- */
   3640 
   3641 /*
   3642  * read forward and get norm32
   3643  * return 0 if the character is <minC
   3644  * if c2!=0 then (c2, c) is a surrogate pair
   3645  * always reads complete characters
   3646  */
   3647 static inline uint32_t
   3648 _getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
   3649     uint32_t norm32;
   3650 
   3651     /* need src.hasNext() to be true */
   3652     c=(UChar)src.next(&src);
   3653     c2=0;
   3654 
   3655     if(c<minC) {
   3656         return 0;
   3657     }
   3658 
   3659     norm32=_getNorm32(c);
   3660     if(UTF_IS_FIRST_SURROGATE(c)) {
   3661         if(src.hasNext(&src) && UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) {
   3662             src.move(&src, 1, UITER_CURRENT); /* skip the c2 surrogate */
   3663             if((norm32&mask)==0) {
   3664                 /* irrelevant data */
   3665                 return 0;
   3666             } else {
   3667                 /* norm32 must be a surrogate special */
   3668                 return _getNorm32FromSurrogatePair(c, c2);
   3669             }
   3670         } else {
   3671             /* unmatched surrogate */
   3672             c2=0;
   3673             return 0;
   3674         }
   3675     }
   3676     return norm32;
   3677 }
   3678 
   3679 /*
   3680  * read forward and check if the character is a next-iteration boundary
   3681  * if c2!=0 then (c, c2) is a surrogate pair
   3682  */
   3683 typedef UBool
   3684 IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
   3685 
   3686 /*
   3687  * for NF*D:
   3688  * read forward and check if the lead combining class is 0
   3689  * if c2!=0 then (c, c2) is a surrogate pair
   3690  */
   3691 static UBool
   3692 _isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
   3693     return _isNFDSafe(_getNextNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
   3694 }
   3695 
   3696 /*
   3697  * for NF*C:
   3698  * read forward and check if the character is (or its decomposition begins with)
   3699  * a "true starter" (cc==0 and NF*C_YES)
   3700  * if c2!=0 then (c, c2) is a surrogate pair
   3701  */
   3702 static UBool
   3703 _isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
   3704     uint32_t norm32, decompQCMask;
   3705 
   3706     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
   3707     norm32=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
   3708     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
   3709 }
   3710 
   3711 static int32_t
   3712 _findNextIterationBoundary(UCharIterator &src,
   3713                            IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
   3714                            UChar *&buffer, int32_t &bufferCapacity,
   3715                            UErrorCode *pErrorCode) {
   3716     UChar *stackBuffer;
   3717     int32_t bufferIndex;
   3718     UChar c, c2;
   3719 
   3720     if(!src.hasNext(&src)) {
   3721         return 0;
   3722     }
   3723 
   3724     /* initialize */
   3725     stackBuffer=buffer;
   3726 
   3727     /* get one character and ignore its properties */
   3728     buffer[0]=c=(UChar)src.next(&src);
   3729     bufferIndex=1;
   3730     if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext(&src)) {
   3731         if(UTF_IS_SECOND_SURROGATE(c2=(UChar)src.next(&src))) {
   3732             buffer[bufferIndex++]=c2;
   3733         } else {
   3734             src.move(&src, -1, UITER_CURRENT); /* back out the non-trail-surrogate */
   3735         }
   3736     }
   3737 
   3738     /* get all following characters until we see a boundary */
   3739     /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
   3740     while(src.hasNext(&src)) {
   3741         if(isNextBoundary(src, minC, mask, c, c2)) {
   3742             /* back out the latest movement to stop at the boundary */
   3743             src.move(&src, c2==0 ? -1 : -2, UITER_CURRENT);
   3744             break;
   3745         } else {
   3746             if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
   3747                 /* attempt to grow the buffer */
   3748                 u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
   3749                                        2*bufferCapacity,
   3750                                        bufferIndex)
   3751             ) {
   3752                 buffer[bufferIndex++]=c;
   3753                 if(c2!=0) {
   3754                     buffer[bufferIndex++]=c2;
   3755                 }
   3756             } else {
   3757                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   3758                 src.move(&src, 0, UITER_LIMIT);
   3759                 return 0;
   3760             }
   3761         }
   3762     }
   3763 
   3764     /* return the length of the buffer contents */
   3765     return bufferIndex;
   3766 }
   3767 
   3768 U_CAPI int32_t U_EXPORT2
   3769 unorm_next(UCharIterator *src,
   3770            UChar *dest, int32_t destCapacity,
   3771            UNormalizationMode mode, int32_t options,
   3772            UBool doNormalize, UBool *pNeededToNormalize,
   3773            UErrorCode *pErrorCode) {
   3774     UChar stackBuffer[100];
   3775     UChar *buffer;
   3776     IsNextBoundaryFn *isNextBoundary;
   3777     uint32_t mask;
   3778     int32_t bufferLength, bufferCapacity, destLength;
   3779     int32_t c, c2;
   3780     UChar minC;
   3781 
   3782     /* check argument values */
   3783     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   3784         return 0;
   3785     }
   3786 
   3787     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
   3788         src==NULL
   3789     ) {
   3790         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3791         return 0;
   3792     }
   3793 
   3794     if(!_haveData(*pErrorCode)) {
   3795         return 0;
   3796     }
   3797 
   3798     if(pNeededToNormalize!=NULL) {
   3799         *pNeededToNormalize=FALSE;
   3800     }
   3801 
   3802     switch(mode) {
   3803     case UNORM_FCD:
   3804         if(fcdTrie.index==NULL) {
   3805             *pErrorCode=U_UNSUPPORTED_ERROR;
   3806             return 0;
   3807         }
   3808         /* fall through to NFD */
   3809     case UNORM_NFD:
   3810         isNextBoundary=_isNextNFDSafe;
   3811         minC=_NORM_MIN_WITH_LEAD_CC;
   3812         mask=_NORM_CC_MASK|_NORM_QC_NFD;
   3813         break;
   3814     case UNORM_NFKD:
   3815         isNextBoundary=_isNextNFDSafe;
   3816         minC=_NORM_MIN_WITH_LEAD_CC;
   3817         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
   3818         break;
   3819     case UNORM_NFC:
   3820         isNextBoundary=_isNextTrueStarter;
   3821         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
   3822         mask=_NORM_CC_MASK|_NORM_QC_NFC;
   3823         break;
   3824     case UNORM_NFKC:
   3825         isNextBoundary=_isNextTrueStarter;
   3826         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
   3827         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
   3828         break;
   3829     case UNORM_NONE:
   3830         destLength=0;
   3831         if((c=src->next(src))>=0) {
   3832             destLength=1;
   3833             if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
   3834                 if(UTF_IS_TRAIL(c2)) {
   3835                     if(destCapacity>=2) {
   3836                         dest[1]=(UChar)c2; /* trail surrogate */
   3837                         destLength=2;
   3838                     }
   3839                     /* lead surrogate to be written below */
   3840                 } else {
   3841                     src->move(src, -1, UITER_CURRENT);
   3842                 }
   3843             }
   3844 
   3845             if(destCapacity>0) {
   3846                 dest[0]=(UChar)c;
   3847             }
   3848         }
   3849         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
   3850     default:
   3851         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3852         return 0;
   3853     }
   3854 
   3855     buffer=stackBuffer;
   3856     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
   3857     bufferLength=_findNextIterationBoundary(*src,
   3858                                             isNextBoundary, minC, mask,
   3859                                             buffer, bufferCapacity,
   3860                                             pErrorCode);
   3861     if(bufferLength>0) {
   3862         if(doNormalize) {
   3863             destLength=unorm_internalNormalize(dest, destCapacity,
   3864                                                buffer, bufferLength,
   3865                                                mode, options,
   3866                                                pErrorCode);
   3867             if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
   3868                 *pNeededToNormalize=
   3869                     (UBool)(destLength!=bufferLength ||
   3870                             0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
   3871             }
   3872         } else {
   3873             /* just copy the source characters */
   3874             if(destCapacity>0) {
   3875                 uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
   3876             }
   3877             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
   3878         }
   3879     } else {
   3880         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
   3881     }
   3882 
   3883     /* cleanup */
   3884     if(buffer!=stackBuffer) {
   3885         uprv_free(buffer);
   3886     }
   3887 
   3888     return destLength;
   3889 }
   3890 
   3891 /*
   3892  * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
   3893  * and if not, how hard it would be to improve it.
   3894  * For example, see _findSafeFCD().
   3895  */
   3896 
   3897 /* Concatenation of normalized strings -------------------------------------- */
   3898 
   3899 U_CAPI int32_t U_EXPORT2
   3900 unorm_concatenate(const UChar *left, int32_t leftLength,
   3901                   const UChar *right, int32_t rightLength,
   3902                   UChar *dest, int32_t destCapacity,
   3903                   UNormalizationMode mode, int32_t options,
   3904                   UErrorCode *pErrorCode) {
   3905     UChar stackBuffer[100];
   3906     UChar *buffer;
   3907     int32_t bufferLength, bufferCapacity;
   3908 
   3909     UCharIterator iter;
   3910     int32_t leftBoundary, rightBoundary, destLength;
   3911 
   3912     /* check argument values */
   3913     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   3914         return 0;
   3915     }
   3916 
   3917     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
   3918         left==NULL || leftLength<-1 ||
   3919         right==NULL || rightLength<-1
   3920     ) {
   3921         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3922         return 0;
   3923     }
   3924 
   3925     /* check for overlapping right and destination */
   3926     if( dest!=NULL &&
   3927         ((right>=dest && right<(dest+destCapacity)) ||
   3928          (rightLength>0 && dest>=right && dest<(right+rightLength)))
   3929     ) {
   3930         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   3931         return 0;
   3932     }
   3933 
   3934     /* allow left==dest */
   3935 
   3936     /* set up intermediate buffer */
   3937     buffer=stackBuffer;
   3938     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
   3939 
   3940     /*
   3941      * Input: left[0..leftLength[ + right[0..rightLength[
   3942      *
   3943      * Find normalization-safe boundaries leftBoundary and rightBoundary
   3944      * and copy the end parts together:
   3945      * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
   3946      *
   3947      * dest=left[0..leftBoundary[ +
   3948      *      normalize(buffer) +
   3949      *      right[rightBoundary..rightLength[
   3950      */
   3951 
   3952     /*
   3953      * find a normalization boundary at the end of the left string
   3954      * and copy the end part into the buffer
   3955      */
   3956     uiter_setString(&iter, left, leftLength);
   3957     iter.index=leftLength=iter.length; /* end of left string */
   3958 
   3959     bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
   3960                                 mode, options,
   3961                                 FALSE, NULL,
   3962                                 pErrorCode);
   3963     leftBoundary=iter.index;
   3964     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
   3965         *pErrorCode=U_ZERO_ERROR;
   3966         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
   3967             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   3968             /* dont need to cleanup here since
   3969              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
   3970              */
   3971             return 0;
   3972         }
   3973 
   3974         /* just copy from the left string: we know the boundary already */
   3975         uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
   3976     }
   3977 
   3978     /*
   3979      * find a normalization boundary at the beginning of the right string
   3980      * and concatenate the beginning part to the buffer
   3981      */
   3982     uiter_setString(&iter, right, rightLength);
   3983     rightLength=iter.length; /* in case it was -1 */
   3984 
   3985     rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
   3986                              mode, options,
   3987                              FALSE, NULL,
   3988                              pErrorCode);
   3989     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
   3990         *pErrorCode=U_ZERO_ERROR;
   3991         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, 0)) {
   3992             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   3993             /* dont need to cleanup here since
   3994              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
   3995              */
   3996             return 0;
   3997         }
   3998 
   3999         /* just copy from the right string: we know the boundary already */
   4000         uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
   4001     }
   4002 
   4003     bufferLength+=rightBoundary;
   4004 
   4005     /* copy left[0..leftBoundary[ to dest */
   4006     if(left!=dest && leftBoundary>0 && destCapacity>0) {
   4007         uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
   4008     }
   4009     destLength=leftBoundary;
   4010 
   4011     /* concatenate the normalization of the buffer to dest */
   4012     if(destCapacity>destLength) {
   4013         destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
   4014                                             buffer, bufferLength,
   4015                                             mode, options,
   4016                                             pErrorCode);
   4017     } else {
   4018         destLength+=unorm_internalNormalize(NULL, 0,
   4019                                             buffer, bufferLength,
   4020                                             mode, options,
   4021                                             pErrorCode);
   4022     }
   4023     /*
   4024      * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
   4025      * so we dont check for the error code here..just let it pass through
   4026      */
   4027     /* concatenate right[rightBoundary..rightLength[ to dest */
   4028     right+=rightBoundary;
   4029     rightLength-=rightBoundary;
   4030     if(rightLength>0 && destCapacity>destLength) {
   4031         uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
   4032     }
   4033     destLength+=rightLength;
   4034 
   4035     /* cleanup */
   4036     if(buffer!=stackBuffer) {
   4037         uprv_free(buffer);
   4038     }
   4039 
   4040     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
   4041 }
   4042 
   4043 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   4044