Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  normalizer2impl.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009nov22
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_NORMALIZATION
     20 
     21 #include "unicode/normalizer2.h"
     22 #include "unicode/udata.h"
     23 #include "unicode/ustring.h"
     24 #include "cmemory.h"
     25 #include "mutex.h"
     26 #include "normalizer2impl.h"
     27 #include "uassert.h"
     28 #include "uset_imp.h"
     29 #include "utrie2.h"
     30 
     31 U_NAMESPACE_BEGIN
     32 
     33 // ReorderingBuffer -------------------------------------------------------- ***
     34 
     35 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
     36     int32_t length=str.length();
     37     start=str.getBuffer(destCapacity);
     38     if(start==NULL) {
     39         // getBuffer() already did str.setToBogus()
     40         errorCode=U_MEMORY_ALLOCATION_ERROR;
     41         return FALSE;
     42     }
     43     limit=start+length;
     44     remainingCapacity=str.getCapacity()-length;
     45     reorderStart=start;
     46     if(start==limit) {
     47         lastCC=0;
     48     } else {
     49         setIterator();
     50         lastCC=previousCC();
     51         // Set reorderStart after the last code point with cc<=1 if there is one.
     52         if(lastCC>1) {
     53             while(previousCC()>1) {}
     54         }
     55         reorderStart=codePointLimit;
     56     }
     57     return TRUE;
     58 }
     59 
     60 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
     61     int32_t length=(int32_t)(limit-start);
     62     return
     63         length==(int32_t)(otherLimit-otherStart) &&
     64         0==u_memcmp(start, otherStart, length);
     65 }
     66 
     67 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
     68     if(remainingCapacity<2 && !resize(2, errorCode)) {
     69         return FALSE;
     70     }
     71     if(lastCC<=cc || cc==0) {
     72         limit[0]=U16_LEAD(c);
     73         limit[1]=U16_TRAIL(c);
     74         limit+=2;
     75         lastCC=cc;
     76         if(cc<=1) {
     77             reorderStart=limit;
     78         }
     79     } else {
     80         insert(c, cc);
     81     }
     82     remainingCapacity-=2;
     83     return TRUE;
     84 }
     85 
     86 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
     87                                uint8_t leadCC, uint8_t trailCC,
     88                                UErrorCode &errorCode) {
     89     if(length==0) {
     90         return TRUE;
     91     }
     92     if(remainingCapacity<length && !resize(length, errorCode)) {
     93         return FALSE;
     94     }
     95     remainingCapacity-=length;
     96     if(lastCC<=leadCC || leadCC==0) {
     97         if(trailCC<=1) {
     98             reorderStart=limit+length;
     99         } else if(leadCC<=1) {
    100             reorderStart=limit+1;  // Ok if not a code point boundary.
    101         }
    102         const UChar *sLimit=s+length;
    103         do { *limit++=*s++; } while(s!=sLimit);
    104         lastCC=trailCC;
    105     } else {
    106         int32_t i=0;
    107         UChar32 c;
    108         U16_NEXT(s, i, length, c);
    109         insert(c, leadCC);  // insert first code point
    110         while(i<length) {
    111             U16_NEXT(s, i, length, c);
    112             if(i<length) {
    113                 // s must be in NFD, otherwise we need to use getCC().
    114                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
    115             } else {
    116                 leadCC=trailCC;
    117             }
    118             append(c, leadCC, errorCode);
    119         }
    120     }
    121     return TRUE;
    122 }
    123 
    124 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
    125     int32_t cpLength=U16_LENGTH(c);
    126     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
    127         return FALSE;
    128     }
    129     remainingCapacity-=cpLength;
    130     if(cpLength==1) {
    131         *limit++=(UChar)c;
    132     } else {
    133         limit[0]=U16_LEAD(c);
    134         limit[1]=U16_TRAIL(c);
    135         limit+=2;
    136     }
    137     lastCC=0;
    138     reorderStart=limit;
    139     return TRUE;
    140 }
    141 
    142 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
    143     if(s==sLimit) {
    144         return TRUE;
    145     }
    146     int32_t length=(int32_t)(sLimit-s);
    147     if(remainingCapacity<length && !resize(length, errorCode)) {
    148         return FALSE;
    149     }
    150     u_memcpy(limit, s, length);
    151     limit+=length;
    152     remainingCapacity-=length;
    153     lastCC=0;
    154     reorderStart=limit;
    155     return TRUE;
    156 }
    157 
    158 void ReorderingBuffer::remove() {
    159     reorderStart=limit=start;
    160     remainingCapacity=str.getCapacity();
    161     lastCC=0;
    162 }
    163 
    164 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
    165     if(suffixLength<(limit-start)) {
    166         limit-=suffixLength;
    167         remainingCapacity+=suffixLength;
    168     } else {
    169         limit=start;
    170         remainingCapacity=str.getCapacity();
    171     }
    172     lastCC=0;
    173     reorderStart=limit;
    174 }
    175 
    176 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
    177     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
    178     int32_t length=(int32_t)(limit-start);
    179     str.releaseBuffer(length);
    180     int32_t newCapacity=length+appendLength;
    181     int32_t doubleCapacity=2*str.getCapacity();
    182     if(newCapacity<doubleCapacity) {
    183         newCapacity=doubleCapacity;
    184     }
    185     if(newCapacity<256) {
    186         newCapacity=256;
    187     }
    188     start=str.getBuffer(newCapacity);
    189     if(start==NULL) {
    190         // getBuffer() already did str.setToBogus()
    191         errorCode=U_MEMORY_ALLOCATION_ERROR;
    192         return FALSE;
    193     }
    194     reorderStart=start+reorderStartIndex;
    195     limit=start+length;
    196     remainingCapacity=str.getCapacity()-length;
    197     return TRUE;
    198 }
    199 
    200 void ReorderingBuffer::skipPrevious() {
    201     codePointLimit=codePointStart;
    202     UChar c=*--codePointStart;
    203     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
    204         --codePointStart;
    205     }
    206 }
    207 
    208 uint8_t ReorderingBuffer::previousCC() {
    209     codePointLimit=codePointStart;
    210     if(reorderStart>=codePointStart) {
    211         return 0;
    212     }
    213     UChar32 c=*--codePointStart;
    214     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
    215         return 0;
    216     }
    217 
    218     UChar c2;
    219     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
    220         --codePointStart;
    221         c=U16_GET_SUPPLEMENTARY(c2, c);
    222     }
    223     return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
    224 }
    225 
    226 // Inserts c somewhere before the last character.
    227 // Requires 0<cc<lastCC which implies reorderStart<limit.
    228 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
    229     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
    230     // insert c at codePointLimit, after the character with prevCC<=cc
    231     UChar *q=limit;
    232     UChar *r=limit+=U16_LENGTH(c);
    233     do {
    234         *--r=*--q;
    235     } while(codePointLimit!=q);
    236     writeCodePoint(q, c);
    237     if(cc<=1) {
    238         reorderStart=r;
    239     }
    240 }
    241 
    242 // Normalizer2Impl --------------------------------------------------------- ***
    243 
    244 Normalizer2Impl::~Normalizer2Impl() {
    245     udata_close(memory);
    246     utrie2_close(normTrie);
    247     UTrie2Singleton(fcdTrieSingleton).deleteInstance();
    248 }
    249 
    250 UBool U_CALLCONV
    251 Normalizer2Impl::isAcceptable(void *context,
    252                               const char *type, const char *name,
    253                               const UDataInfo *pInfo) {
    254     if(
    255         pInfo->size>=20 &&
    256         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
    257         pInfo->charsetFamily==U_CHARSET_FAMILY &&
    258         pInfo->dataFormat[0]==0x4e &&    /* dataFormat="Nrm2" */
    259         pInfo->dataFormat[1]==0x72 &&
    260         pInfo->dataFormat[2]==0x6d &&
    261         pInfo->dataFormat[3]==0x32 &&
    262         pInfo->formatVersion[0]==1
    263     ) {
    264         Normalizer2Impl *me=(Normalizer2Impl *)context;
    265         uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
    266         return TRUE;
    267     } else {
    268         return FALSE;
    269     }
    270 }
    271 
    272 void
    273 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
    274     if(U_FAILURE(errorCode)) {
    275         return;
    276     }
    277     memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
    278     if(U_FAILURE(errorCode)) {
    279         return;
    280     }
    281     const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
    282     const int32_t *inIndexes=(const int32_t *)inBytes;
    283     int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
    284     if(indexesLength<=IX_MIN_MAYBE_YES) {
    285         errorCode=U_INVALID_FORMAT_ERROR;  // Not enough indexes.
    286         return;
    287     }
    288 
    289     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
    290     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
    291 
    292     minYesNo=inIndexes[IX_MIN_YES_NO];
    293     minNoNo=inIndexes[IX_MIN_NO_NO];
    294     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
    295     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
    296 
    297     int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
    298     int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
    299     normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
    300                                        inBytes+offset, nextOffset-offset, NULL,
    301                                        &errorCode);
    302     if(U_FAILURE(errorCode)) {
    303         return;
    304     }
    305 
    306     offset=nextOffset;
    307     maybeYesCompositions=(const uint16_t *)(inBytes+offset);
    308     extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
    309 }
    310 
    311 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
    312     UChar32 c;
    313     if(cpStart==(cpLimit-1)) {
    314         c=*cpStart;
    315     } else {
    316         c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
    317     }
    318     uint16_t prevNorm16=getNorm16(c);
    319     if(prevNorm16<=minYesNo) {
    320         return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
    321     } else {
    322         return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
    323     }
    324 }
    325 
    326 U_CDECL_BEGIN
    327 
    328 static UBool U_CALLCONV
    329 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    330     /* add the start code point to the USet */
    331     const USetAdder *sa=(const USetAdder *)context;
    332     sa->add(sa->set, start);
    333     return TRUE;
    334 }
    335 
    336 U_CDECL_END
    337 
    338 void
    339 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
    340     /* add the start code point of each same-value range of each trie */
    341     utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
    342 
    343     /* add Hangul LV syllables and LV+1 because of skippables */
    344     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
    345         sa->add(sa->set, c);
    346         sa->add(sa->set, c+1);
    347     }
    348     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
    349 }
    350 
    351 const UChar *
    352 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
    353                                                 UChar32 minNeedDataCP,
    354                                                 ReorderingBuffer *buffer,
    355                                                 UErrorCode &errorCode) const {
    356     // Make some effort to support NUL-terminated strings reasonably.
    357     // Take the part of the fast quick check loop that does not look up
    358     // data and check the first part of the string.
    359     // After this prefix, determine the string length to simplify the rest
    360     // of the code.
    361     const UChar *prevSrc=src;
    362     UChar c;
    363     while((c=*src++)<minNeedDataCP && c!=0) {}
    364     // Back out the last character for full processing.
    365     // Copy this prefix.
    366     if(--src!=prevSrc) {
    367         if(buffer!=NULL) {
    368             buffer->appendZeroCC(prevSrc, src, errorCode);
    369         }
    370     }
    371     return src;
    372 }
    373 
    374 // Dual functionality:
    375 // buffer!=NULL: normalize
    376 // buffer==NULL: isNormalized/spanQuickCheckYes
    377 const UChar *
    378 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
    379                            ReorderingBuffer *buffer,
    380                            UErrorCode &errorCode) const {
    381     UChar32 minNoCP=minDecompNoCP;
    382     if(limit==NULL) {
    383         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
    384         if(U_FAILURE(errorCode)) {
    385             return src;
    386         }
    387         limit=u_strchr(src, 0);
    388     }
    389 
    390     const UChar *prevSrc;
    391     UChar32 c=0;
    392     uint16_t norm16=0;
    393 
    394     // only for quick check
    395     const UChar *prevBoundary=src;
    396     uint8_t prevCC=0;
    397 
    398     for(;;) {
    399         // count code units below the minimum or with irrelevant data for the quick check
    400         for(prevSrc=src; src!=limit;) {
    401             if( (c=*src)<minNoCP ||
    402                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
    403             ) {
    404                 ++src;
    405             } else if(!U16_IS_SURROGATE(c)) {
    406                 break;
    407             } else {
    408                 UChar c2;
    409                 if(U16_IS_SURROGATE_LEAD(c)) {
    410                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
    411                         c=U16_GET_SUPPLEMENTARY(c, c2);
    412                     }
    413                 } else /* trail surrogate */ {
    414                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
    415                         --src;
    416                         c=U16_GET_SUPPLEMENTARY(c2, c);
    417                     }
    418                 }
    419                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
    420                     src+=U16_LENGTH(c);
    421                 } else {
    422                     break;
    423                 }
    424             }
    425         }
    426         // copy these code units all at once
    427         if(src!=prevSrc) {
    428             if(buffer!=NULL) {
    429                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
    430                     break;
    431                 }
    432             } else {
    433                 prevCC=0;
    434                 prevBoundary=src;
    435             }
    436         }
    437         if(src==limit) {
    438             break;
    439         }
    440 
    441         // Check one above-minimum, relevant code point.
    442         src+=U16_LENGTH(c);
    443         if(buffer!=NULL) {
    444             if(!decompose(c, norm16, *buffer, errorCode)) {
    445                 break;
    446             }
    447         } else {
    448             if(isDecompYes(norm16)) {
    449                 uint8_t cc=getCCFromYesOrMaybe(norm16);
    450                 if(prevCC<=cc || cc==0) {
    451                     prevCC=cc;
    452                     if(cc<=1) {
    453                         prevBoundary=src;
    454                     }
    455                     continue;
    456                 }
    457             }
    458             return prevBoundary;  // "no" or cc out of order
    459         }
    460     }
    461     return src;
    462 }
    463 
    464 // Decompose a short piece of text which is likely to contain characters that
    465 // fail the quick check loop and/or where the quick check loop's overhead
    466 // is unlikely to be amortized.
    467 // Called by the compose() and makeFCD() implementations.
    468 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
    469                                       ReorderingBuffer &buffer,
    470                                       UErrorCode &errorCode) const {
    471     while(src<limit) {
    472         UChar32 c;
    473         uint16_t norm16;
    474         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
    475         if(!decompose(c, norm16, buffer, errorCode)) {
    476             return FALSE;
    477         }
    478     }
    479     return TRUE;
    480 }
    481 
    482 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
    483                                  ReorderingBuffer &buffer,
    484                                  UErrorCode &errorCode) const {
    485     // Only loops for 1:1 algorithmic mappings.
    486     for(;;) {
    487         // get the decomposition and the lead and trail cc's
    488         if(isDecompYes(norm16)) {
    489             // c does not decompose
    490             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
    491         } else if(isHangul(norm16)) {
    492             // Hangul syllable: decompose algorithmically
    493             UChar jamos[3];
    494             return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
    495         } else if(isDecompNoAlgorithmic(norm16)) {
    496             c=mapAlgorithmic(c, norm16);
    497             norm16=getNorm16(c);
    498         } else {
    499             // c decomposes, get everything from the variable-length extra data
    500             const uint16_t *mapping=getMapping(norm16);
    501             uint16_t firstUnit=*mapping++;
    502             int32_t length=firstUnit&MAPPING_LENGTH_MASK;
    503             uint8_t leadCC, trailCC;
    504             trailCC=(uint8_t)(firstUnit>>8);
    505             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
    506                 leadCC=(uint8_t)(*mapping++>>8);
    507             } else {
    508                 leadCC=0;
    509             }
    510             return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode);
    511         }
    512     }
    513 }
    514 
    515 const UChar *
    516 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
    517     const UChar *decomp=NULL;
    518     uint16_t norm16;
    519     for(;;) {
    520         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
    521             // c does not decompose
    522             return decomp;
    523         } else if(isHangul(norm16)) {
    524             // Hangul syllable: decompose algorithmically
    525             length=Hangul::decompose(c, buffer);
    526             return buffer;
    527         } else if(isDecompNoAlgorithmic(norm16)) {
    528             c=mapAlgorithmic(c, norm16);
    529             decomp=buffer;
    530             length=0;
    531             U16_APPEND_UNSAFE(buffer, length, c);
    532         } else {
    533             // c decomposes, get everything from the variable-length extra data
    534             const uint16_t *mapping=getMapping(norm16);
    535             uint16_t firstUnit=*mapping++;
    536             length=firstUnit&MAPPING_LENGTH_MASK;
    537             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
    538                 ++mapping;
    539             }
    540             return (const UChar *)mapping;
    541         }
    542     }
    543 }
    544 
    545 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
    546                                          UBool doDecompose,
    547                                          ReorderingBuffer &buffer,
    548                                          UErrorCode &errorCode) const {
    549     if(doDecompose) {
    550         decompose(src, limit, &buffer, errorCode);
    551         return;
    552     }
    553     // Just merge the strings at the boundary.
    554     ForwardUTrie2StringIterator iter(normTrie, src, limit);
    555     uint8_t firstCC, prevCC, cc;
    556     firstCC=prevCC=cc=getCC(iter.next16());
    557     while(cc!=0) {
    558         prevCC=cc;
    559         cc=getCC(iter.next16());
    560     };
    561     buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) &&
    562         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
    563 }
    564 
    565 // Note: hasDecompBoundary() could be implemented as aliases to
    566 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
    567 // at the cost of building the FCD trie for a decomposition normalizer.
    568 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
    569     for(;;) {
    570         if(c<minDecompNoCP) {
    571             return TRUE;
    572         }
    573         uint16_t norm16=getNorm16(c);
    574         if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
    575             return TRUE;
    576         } else if(norm16>MIN_NORMAL_MAYBE_YES) {
    577             return FALSE;  // ccc!=0
    578         } else if(isDecompNoAlgorithmic(norm16)) {
    579             c=mapAlgorithmic(c, norm16);
    580         } else {
    581             // c decomposes, get everything from the variable-length extra data
    582             const uint16_t *mapping=getMapping(norm16);
    583             uint16_t firstUnit=*mapping++;
    584             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
    585                 return FALSE;
    586             }
    587             if(!before) {
    588                 // decomp after-boundary: same as hasFCDBoundaryAfter(),
    589                 // fcd16<=1 || trailCC==0
    590                 if(firstUnit>0x1ff) {
    591                     return FALSE;  // trailCC>1
    592                 }
    593                 if(firstUnit<=0xff) {
    594                     return TRUE;  // trailCC==0
    595                 }
    596                 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
    597             }
    598             // TRUE if leadCC==0 (hasFCDBoundaryBefore())
    599             return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0;
    600         }
    601     }
    602 }
    603 
    604 /*
    605  * Finds the recomposition result for
    606  * a forward-combining "lead" character,
    607  * specified with a pointer to its compositions list,
    608  * and a backward-combining "trail" character.
    609  *
    610  * If the lead and trail characters combine, then this function returns
    611  * the following "compositeAndFwd" value:
    612  * Bits 21..1  composite character
    613  * Bit      0  set if the composite is a forward-combining starter
    614  * otherwise it returns -1.
    615  *
    616  * The compositions list has (trail, compositeAndFwd) pair entries,
    617  * encoded as either pairs or triples of 16-bit units.
    618  * The last entry has the high bit of its first unit set.
    619  *
    620  * The list is sorted by ascending trail characters (there are no duplicates).
    621  * A linear search is used.
    622  *
    623  * See normalizer2impl.h for a more detailed description
    624  * of the compositions list format.
    625  */
    626 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
    627     uint16_t key1, firstUnit;
    628     if(trail<COMP_1_TRAIL_LIMIT) {
    629         // trail character is 0..33FF
    630         // result entry may have 2 or 3 units
    631         key1=(uint16_t)(trail<<1);
    632         while(key1>(firstUnit=*list)) {
    633             list+=2+(firstUnit&COMP_1_TRIPLE);
    634         }
    635         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
    636             if(firstUnit&COMP_1_TRIPLE) {
    637                 return ((int32_t)list[1]<<16)|list[2];
    638             } else {
    639                 return list[1];
    640             }
    641         }
    642     } else {
    643         // trail character is 3400..10FFFF
    644         // result entry has 3 units
    645         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
    646                         ((trail>>COMP_1_TRAIL_SHIFT))&
    647                          ~COMP_1_TRIPLE);
    648         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
    649         uint16_t secondUnit;
    650         for(;;) {
    651             if(key1>(firstUnit=*list)) {
    652                 list+=2+(firstUnit&COMP_1_TRIPLE);
    653             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
    654                 if(key2>(secondUnit=list[1])) {
    655                     if(firstUnit&COMP_1_LAST_TUPLE) {
    656                         break;
    657                     } else {
    658                         list+=3;
    659                     }
    660                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
    661                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
    662                 } else {
    663                     break;
    664                 }
    665             } else {
    666                 break;
    667             }
    668         }
    669     }
    670     return -1;
    671 }
    672 
    673 /*
    674  * Recomposes the buffer text starting at recomposeStartIndex
    675  * (which is in NFD - decomposed and canonically ordered),
    676  * and truncates the buffer contents.
    677  *
    678  * Note that recomposition never lengthens the text:
    679  * Any character consists of either one or two code units;
    680  * a composition may contain at most one more code unit than the original starter,
    681  * while the combining mark that is removed has at least one code unit.
    682  */
    683 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
    684                                 UBool onlyContiguous) const {
    685     UChar *p=buffer.getStart()+recomposeStartIndex;
    686     UChar *limit=buffer.getLimit();
    687     if(p==limit) {
    688         return;
    689     }
    690 
    691     UChar *starter, *pRemove, *q, *r;
    692     const uint16_t *compositionsList;
    693     UChar32 c, compositeAndFwd;
    694     uint16_t norm16;
    695     uint8_t cc, prevCC;
    696     UBool starterIsSupplementary;
    697 
    698     // Some of the following variables are not used until we have a forward-combining starter
    699     // and are only initialized now to avoid compiler warnings.
    700     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
    701     starter=NULL;
    702     starterIsSupplementary=FALSE;
    703     prevCC=0;
    704 
    705     for(;;) {
    706         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
    707         cc=getCCFromYesOrMaybe(norm16);
    708         if( // this character combines backward and
    709             isMaybe(norm16) &&
    710             // we have seen a starter that combines forward and
    711             compositionsList!=NULL &&
    712             // the backward-combining character is not blocked
    713             (prevCC<cc || prevCC==0)
    714         ) {
    715             if(isJamoVT(norm16)) {
    716                 // c is a Jamo V/T, see if we can compose it with the previous character.
    717                 if(c<Hangul::JAMO_T_BASE) {
    718                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
    719                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
    720                     if(prev<Hangul::JAMO_L_COUNT) {
    721                         pRemove=p-1;
    722                         UChar syllable=(UChar)
    723                             (Hangul::HANGUL_BASE+
    724                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
    725                              Hangul::JAMO_T_COUNT);
    726                         UChar t;
    727                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
    728                             ++p;
    729                             syllable+=t;  // The next character was a Jamo T.
    730                         }
    731                         *starter=syllable;
    732                         // remove the Jamo V/T
    733                         q=pRemove;
    734                         r=p;
    735                         while(r<limit) {
    736                             *q++=*r++;
    737                         }
    738                         limit=q;
    739                         p=pRemove;
    740                     }
    741                 }
    742                 /*
    743                  * No "else" for Jamo T:
    744                  * Since the input is in NFD, there are no Hangul LV syllables that
    745                  * a Jamo T could combine with.
    746                  * All Jamo Ts are combined above when handling Jamo Vs.
    747                  */
    748                 if(p==limit) {
    749                     break;
    750                 }
    751                 compositionsList=NULL;
    752                 continue;
    753             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
    754                 // The starter and the combining mark (c) do combine.
    755                 UChar32 composite=compositeAndFwd>>1;
    756 
    757                 // Replace the starter with the composite, remove the combining mark.
    758                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
    759                 if(starterIsSupplementary) {
    760                     if(U_IS_SUPPLEMENTARY(composite)) {
    761                         // both are supplementary
    762                         starter[0]=U16_LEAD(composite);
    763                         starter[1]=U16_TRAIL(composite);
    764                     } else {
    765                         *starter=(UChar)composite;
    766                         // The composite is shorter than the starter,
    767                         // move the intermediate characters forward one.
    768                         starterIsSupplementary=FALSE;
    769                         q=starter+1;
    770                         r=q+1;
    771                         while(r<pRemove) {
    772                             *q++=*r++;
    773                         }
    774                         --pRemove;
    775                     }
    776                 } else if(U_IS_SUPPLEMENTARY(composite)) {
    777                     // The composite is longer than the starter,
    778                     // move the intermediate characters back one.
    779                     starterIsSupplementary=TRUE;
    780                     ++starter;  // temporarily increment for the loop boundary
    781                     q=pRemove;
    782                     r=++pRemove;
    783                     while(starter<q) {
    784                         *--r=*--q;
    785                     }
    786                     *starter=U16_TRAIL(composite);
    787                     *--starter=U16_LEAD(composite);  // undo the temporary increment
    788                 } else {
    789                     // both are on the BMP
    790                     *starter=(UChar)composite;
    791                 }
    792 
    793                 /* remove the combining mark by moving the following text over it */
    794                 if(pRemove<p) {
    795                     q=pRemove;
    796                     r=p;
    797                     while(r<limit) {
    798                         *q++=*r++;
    799                     }
    800                     limit=q;
    801                     p=pRemove;
    802                 }
    803                 // Keep prevCC because we removed the combining mark.
    804 
    805                 if(p==limit) {
    806                     break;
    807                 }
    808                 // Is the composite a starter that combines forward?
    809                 if(compositeAndFwd&1) {
    810                     compositionsList=
    811                         getCompositionsListForComposite(getNorm16(composite));
    812                 } else {
    813                     compositionsList=NULL;
    814                 }
    815 
    816                 // We combined; continue with looking for compositions.
    817                 continue;
    818             }
    819         }
    820 
    821         // no combination this time
    822         prevCC=cc;
    823         if(p==limit) {
    824             break;
    825         }
    826 
    827         // If c did not combine, then check if it is a starter.
    828         if(cc==0) {
    829             // Found a new starter.
    830             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
    831                 // It may combine with something, prepare for it.
    832                 if(U_IS_BMP(c)) {
    833                     starterIsSupplementary=FALSE;
    834                     starter=p-1;
    835                 } else {
    836                     starterIsSupplementary=TRUE;
    837                     starter=p-2;
    838                 }
    839             }
    840         } else if(onlyContiguous) {
    841             // FCC: no discontiguous compositions; any intervening character blocks.
    842             compositionsList=NULL;
    843         }
    844     }
    845     buffer.setReorderingLimit(limit);
    846 }
    847 
    848 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
    849 // doCompose: normalize
    850 // !doCompose: isNormalized (buffer must be empty and initialized)
    851 UBool
    852 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
    853                          UBool onlyContiguous,
    854                          UBool doCompose,
    855                          ReorderingBuffer &buffer,
    856                          UErrorCode &errorCode) const {
    857     UChar32 minNoMaybeCP=minCompNoMaybeCP;
    858     if(limit==NULL) {
    859         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
    860                                            doCompose ? &buffer : NULL,
    861                                            errorCode);
    862         if(U_FAILURE(errorCode)) {
    863             return FALSE;
    864         }
    865         limit=u_strchr(src, 0);
    866     }
    867 
    868     /*
    869      * prevBoundary points to the last character before the current one
    870      * that has a composition boundary before it with ccc==0 and quick check "yes".
    871      * Keeping track of prevBoundary saves us looking for a composition boundary
    872      * when we find a "no" or "maybe".
    873      *
    874      * When we back out from prevSrc back to prevBoundary,
    875      * then we also remove those same characters (which had been simply copied
    876      * or canonically-order-inserted) from the ReorderingBuffer.
    877      * Therefore, at all times, the [prevBoundary..prevSrc[ source units
    878      * must correspond 1:1 to destination units at the end of the destination buffer.
    879      */
    880     const UChar *prevBoundary=src;
    881     const UChar *prevSrc;
    882     UChar32 c=0;
    883     uint16_t norm16=0;
    884 
    885     // only for isNormalized
    886     uint8_t prevCC=0;
    887 
    888     for(;;) {
    889         // count code units below the minimum or with irrelevant data for the quick check
    890         for(prevSrc=src; src!=limit;) {
    891             if( (c=*src)<minNoMaybeCP ||
    892                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
    893             ) {
    894                 ++src;
    895             } else if(!U16_IS_SURROGATE(c)) {
    896                 break;
    897             } else {
    898                 UChar c2;
    899                 if(U16_IS_SURROGATE_LEAD(c)) {
    900                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
    901                         c=U16_GET_SUPPLEMENTARY(c, c2);
    902                     }
    903                 } else /* trail surrogate */ {
    904                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
    905                         --src;
    906                         c=U16_GET_SUPPLEMENTARY(c2, c);
    907                     }
    908                 }
    909                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
    910                     src+=U16_LENGTH(c);
    911                 } else {
    912                     break;
    913                 }
    914             }
    915         }
    916         // copy these code units all at once
    917         if(src!=prevSrc) {
    918             if(doCompose) {
    919                 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
    920                     break;
    921                 }
    922             } else {
    923                 prevCC=0;
    924             }
    925             if(src==limit) {
    926                 break;
    927             }
    928             // Set prevBoundary to the last character in the quick check loop.
    929             prevBoundary=src-1;
    930             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
    931                 U16_IS_LEAD(*(prevBoundary-1))
    932             ) {
    933                 --prevBoundary;
    934             }
    935             // The start of the current character (c).
    936             prevSrc=src;
    937         } else if(src==limit) {
    938             break;
    939         }
    940 
    941         src+=U16_LENGTH(c);
    942         /*
    943          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
    944          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
    945          * or has ccc!=0.
    946          * Check for Jamo V/T, then for regular characters.
    947          * c is not a Hangul syllable or Jamo L because those have "yes" properties.
    948          */
    949         if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
    950             UChar prev=*(prevSrc-1);
    951             UBool needToDecompose=FALSE;
    952             if(c<Hangul::JAMO_T_BASE) {
    953                 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
    954                 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
    955                 if(prev<Hangul::JAMO_L_COUNT) {
    956                     if(!doCompose) {
    957                         return FALSE;
    958                     }
    959                     UChar syllable=(UChar)
    960                         (Hangul::HANGUL_BASE+
    961                          (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
    962                          Hangul::JAMO_T_COUNT);
    963                     UChar t;
    964                     if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
    965                         ++src;
    966                         syllable+=t;  // The next character was a Jamo T.
    967                         prevBoundary=src;
    968                         buffer.setLastChar(syllable);
    969                         continue;
    970                     }
    971                     // If we see L+V+x where x!=T then we drop to the slow path,
    972                     // decompose and recompose.
    973                     // This is to deal with NFKC finding normal L and V but a
    974                     // compatibility variant of a T. We need to either fully compose that
    975                     // combination here (which would complicate the code and may not work
    976                     // with strange custom data) or use the slow path -- or else our replacing
    977                     // two input characters (L+V) with one output character (LV syllable)
    978                     // would violate the invariant that [prevBoundary..prevSrc[ has the same
    979                     // length as what we appended to the buffer since prevBoundary.
    980                     needToDecompose=TRUE;
    981                 }
    982             } else if(Hangul::isHangulWithoutJamoT(prev)) {
    983                 // c is a Jamo Trailing consonant,
    984                 // compose with previous Hangul LV that does not contain a Jamo T.
    985                 if(!doCompose) {
    986                     return FALSE;
    987                 }
    988                 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
    989                 prevBoundary=src;
    990                 continue;
    991             }
    992             if(!needToDecompose) {
    993                 // The Jamo V/T did not compose into a Hangul syllable.
    994                 if(doCompose) {
    995                     if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
    996                         break;
    997                     }
    998                 } else {
    999                     prevCC=0;
   1000                 }
   1001                 continue;
   1002             }
   1003         }
   1004         /*
   1005          * Source buffer pointers:
   1006          *
   1007          *  all done      quick check   current char  not yet
   1008          *                "yes" but     (c)           processed
   1009          *                may combine
   1010          *                forward
   1011          * [-------------[-------------[-------------[-------------[
   1012          * |             |             |             |             |
   1013          * orig. src     prevBoundary  prevSrc       src           limit
   1014          *
   1015          *
   1016          * Destination buffer pointers inside the ReorderingBuffer:
   1017          *
   1018          *  all done      might take    not filled yet
   1019          *                characters for
   1020          *                reordering
   1021          * [-------------[-------------[-------------[
   1022          * |             |             |             |
   1023          * start         reorderStart  limit         |
   1024          *                             +remainingCap.+
   1025          */
   1026         if(norm16>=MIN_YES_YES_WITH_CC) {
   1027             uint8_t cc=(uint8_t)norm16;  // cc!=0
   1028             if( onlyContiguous &&  // FCC
   1029                 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
   1030                 prevBoundary<prevSrc &&
   1031                 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
   1032                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
   1033                 // passed the quick check "yes && ccc==0" test.
   1034                 // Check whether the last character was a "yesYes" or a "yesNo".
   1035                 // If a "yesNo", then we get its trailing ccc from its
   1036                 // mapping and check for canonical order.
   1037                 // All other cases are ok.
   1038                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
   1039             ) {
   1040                 // Fails FCD test, need to decompose and contiguously recompose.
   1041                 if(!doCompose) {
   1042                     return FALSE;
   1043                 }
   1044             } else if(doCompose) {
   1045                 if(!buffer.append(c, cc, errorCode)) {
   1046                     break;
   1047                 }
   1048                 continue;
   1049             } else if(prevCC<=cc) {
   1050                 prevCC=cc;
   1051                 continue;
   1052             } else {
   1053                 return FALSE;
   1054             }
   1055         } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
   1056             return FALSE;
   1057         }
   1058 
   1059         /*
   1060          * Find appropriate boundaries around this character,
   1061          * decompose the source text from between the boundaries,
   1062          * and recompose it.
   1063          *
   1064          * We may need to remove the last few characters from the ReorderingBuffer
   1065          * to account for source text that was copied or appended
   1066          * but needs to take part in the recomposition.
   1067          */
   1068 
   1069         /*
   1070          * Find the last composition boundary in [prevBoundary..src[.
   1071          * It is either the decomposition of the current character (at prevSrc),
   1072          * or prevBoundary.
   1073          */
   1074         if(hasCompBoundaryBefore(c, norm16)) {
   1075             prevBoundary=prevSrc;
   1076         } else if(doCompose) {
   1077             buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
   1078         }
   1079 
   1080         // Find the next composition boundary in [src..limit[ -
   1081         // modifies src to point to the next starter.
   1082         src=(UChar *)findNextCompBoundary(src, limit);
   1083 
   1084         // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
   1085         int32_t recomposeStartIndex=buffer.length();
   1086         if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
   1087             break;
   1088         }
   1089         recompose(buffer, recomposeStartIndex, onlyContiguous);
   1090         if(!doCompose) {
   1091             if(!buffer.equals(prevBoundary, src)) {
   1092                 return FALSE;
   1093             }
   1094             buffer.remove();
   1095             prevCC=0;
   1096         }
   1097 
   1098         // Move to the next starter. We never need to look back before this point again.
   1099         prevBoundary=src;
   1100     }
   1101     return TRUE;
   1102 }
   1103 
   1104 // Very similar to compose(): Make the same changes in both places if relevant.
   1105 // pQCResult==NULL: spanQuickCheckYes
   1106 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
   1107 const UChar *
   1108 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
   1109                                    UBool onlyContiguous,
   1110                                    UNormalizationCheckResult *pQCResult) const {
   1111     UChar32 minNoMaybeCP=minCompNoMaybeCP;
   1112     if(limit==NULL) {
   1113         UErrorCode errorCode=U_ZERO_ERROR;
   1114         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
   1115         limit=u_strchr(src, 0);
   1116     }
   1117 
   1118     /*
   1119      * prevBoundary points to the last character before the current one
   1120      * that has a composition boundary before it with ccc==0 and quick check "yes".
   1121      */
   1122     const UChar *prevBoundary=src;
   1123     const UChar *prevSrc;
   1124     UChar32 c=0;
   1125     uint16_t norm16=0;
   1126     uint8_t prevCC=0;
   1127 
   1128     for(;;) {
   1129         // count code units below the minimum or with irrelevant data for the quick check
   1130         for(prevSrc=src;;) {
   1131             if(src==limit) {
   1132                 return src;
   1133             }
   1134             if( (c=*src)<minNoMaybeCP ||
   1135                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
   1136             ) {
   1137                 ++src;
   1138             } else if(!U16_IS_SURROGATE(c)) {
   1139                 break;
   1140             } else {
   1141                 UChar c2;
   1142                 if(U16_IS_SURROGATE_LEAD(c)) {
   1143                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
   1144                         c=U16_GET_SUPPLEMENTARY(c, c2);
   1145                     }
   1146                 } else /* trail surrogate */ {
   1147                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
   1148                         --src;
   1149                         c=U16_GET_SUPPLEMENTARY(c2, c);
   1150                     }
   1151                 }
   1152                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
   1153                     src+=U16_LENGTH(c);
   1154                 } else {
   1155                     break;
   1156                 }
   1157             }
   1158         }
   1159         if(src!=prevSrc) {
   1160             // Set prevBoundary to the last character in the quick check loop.
   1161             prevBoundary=src-1;
   1162             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
   1163                 U16_IS_LEAD(*(prevBoundary-1))
   1164             ) {
   1165                 --prevBoundary;
   1166             }
   1167             prevCC=0;
   1168             // The start of the current character (c).
   1169             prevSrc=src;
   1170         }
   1171 
   1172         src+=U16_LENGTH(c);
   1173         /*
   1174          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
   1175          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
   1176          * or has ccc!=0.
   1177          */
   1178         if(isMaybeOrNonZeroCC(norm16)) {
   1179             uint8_t cc=getCCFromYesOrMaybe(norm16);
   1180             if( onlyContiguous &&  // FCC
   1181                 cc!=0 &&
   1182                 prevCC==0 &&
   1183                 prevBoundary<prevSrc &&
   1184                 // prevCC==0 && prevBoundary<prevSrc tell us that
   1185                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
   1186                 // passed the quick check "yes && ccc==0" test.
   1187                 // Check whether the last character was a "yesYes" or a "yesNo".
   1188                 // If a "yesNo", then we get its trailing ccc from its
   1189                 // mapping and check for canonical order.
   1190                 // All other cases are ok.
   1191                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
   1192             ) {
   1193                 // Fails FCD test.
   1194             } else if(prevCC<=cc || cc==0) {
   1195                 prevCC=cc;
   1196                 if(norm16<MIN_YES_YES_WITH_CC) {
   1197                     if(pQCResult!=NULL) {
   1198                         *pQCResult=UNORM_MAYBE;
   1199                     } else {
   1200                         return prevBoundary;
   1201                     }
   1202                 }
   1203                 continue;
   1204             }
   1205         }
   1206         if(pQCResult!=NULL) {
   1207             *pQCResult=UNORM_NO;
   1208         }
   1209         return prevBoundary;
   1210     }
   1211 }
   1212 
   1213 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
   1214                                        UBool doCompose,
   1215                                        UBool onlyContiguous,
   1216                                        ReorderingBuffer &buffer,
   1217                                        UErrorCode &errorCode) const {
   1218     if(!buffer.isEmpty()) {
   1219         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
   1220         if(src!=firstStarterInSrc) {
   1221             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
   1222                                                                     buffer.getLimit());
   1223             UnicodeString middle(lastStarterInDest,
   1224                                  (int32_t)(buffer.getLimit()-lastStarterInDest));
   1225             buffer.removeSuffix((int32_t)(buffer.getLimit()-lastStarterInDest));
   1226             middle.append(src, (int32_t)(firstStarterInSrc-src));
   1227             const UChar *middleStart=middle.getBuffer();
   1228             compose(middleStart, middleStart+middle.length(), onlyContiguous,
   1229                     TRUE, buffer, errorCode);
   1230             if(U_FAILURE(errorCode)) {
   1231                 return;
   1232             }
   1233             src=firstStarterInSrc;
   1234         }
   1235     }
   1236     if(doCompose) {
   1237         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
   1238     } else {
   1239         buffer.appendZeroCC(src, limit, errorCode);
   1240     }
   1241 }
   1242 
   1243 /**
   1244  * Does c have a composition boundary before it?
   1245  * True if its decomposition begins with a character that has
   1246  * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
   1247  * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
   1248  * (isCompYesAndZeroCC()) so we need not decompose.
   1249  */
   1250 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
   1251     for(;;) {
   1252         if(isCompYesAndZeroCC(norm16)) {
   1253             return TRUE;
   1254         } else if(isMaybeOrNonZeroCC(norm16)) {
   1255             return FALSE;
   1256         } else if(isDecompNoAlgorithmic(norm16)) {
   1257             c=mapAlgorithmic(c, norm16);
   1258             norm16=getNorm16(c);
   1259         } else {
   1260             // c decomposes, get everything from the variable-length extra data
   1261             const uint16_t *mapping=getMapping(norm16);
   1262             uint16_t firstUnit=*mapping++;
   1263             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
   1264                 return FALSE;
   1265             }
   1266             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) {
   1267                 return FALSE;  // non-zero leadCC
   1268             }
   1269             int32_t i=0;
   1270             UChar32 c;
   1271             U16_NEXT_UNSAFE(mapping, i, c);
   1272             return isCompYesAndZeroCC(getNorm16(c));
   1273         }
   1274     }
   1275 }
   1276 
   1277 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
   1278     for(;;) {
   1279         uint16_t norm16=getNorm16(c);
   1280         if(isInert(norm16)) {
   1281             return TRUE;
   1282         } else if(norm16<=minYesNo) {
   1283             // Hangul LVT (==minYesNo) has a boundary after it.
   1284             // Hangul LV and non-inert yesYes characters combine forward.
   1285             return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
   1286         } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
   1287             return FALSE;
   1288         } else if(isDecompNoAlgorithmic(norm16)) {
   1289             c=mapAlgorithmic(c, norm16);
   1290         } else {
   1291             // c decomposes, get everything from the variable-length extra data.
   1292             // If testInert, then c must be a yesNo character which has lccc=0,
   1293             // otherwise it could be a noNo.
   1294             const uint16_t *mapping=getMapping(norm16);
   1295             uint16_t firstUnit=*mapping;
   1296             // TRUE if
   1297             //      c is not deleted, and
   1298             //      it and its decomposition do not combine forward, and it has a starter, and
   1299             //      if FCC then trailCC<=1
   1300             return
   1301                 (firstUnit&MAPPING_LENGTH_MASK)!=0 &&
   1302                 (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&
   1303                 (!onlyContiguous || firstUnit<=0x1ff);
   1304         }
   1305     }
   1306 }
   1307 
   1308 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
   1309     BackwardUTrie2StringIterator iter(normTrie, start, p);
   1310     uint16_t norm16;
   1311     do {
   1312         norm16=iter.previous16();
   1313     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
   1314     // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
   1315     // but that's probably not worth the extra cost.
   1316     return iter.codePointStart;
   1317 }
   1318 
   1319 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
   1320     ForwardUTrie2StringIterator iter(normTrie, p, limit);
   1321     uint16_t norm16;
   1322     do {
   1323         norm16=iter.next16();
   1324     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
   1325     return iter.codePointStart;
   1326 }
   1327 
   1328 class FCDTrieSingleton : public UTrie2Singleton {
   1329 public:
   1330     FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
   1331         UTrie2Singleton(s), impl(ni), errorCode(ec) {}
   1332     UTrie2 *getInstance(UErrorCode &errorCode) {
   1333         return UTrie2Singleton::getInstance(createInstance, this, errorCode);
   1334     }
   1335     static void *createInstance(const void *context, UErrorCode &errorCode);
   1336     UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
   1337         if(value!=0) {
   1338             impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode);
   1339         }
   1340         return U_SUCCESS(errorCode);
   1341     }
   1342 
   1343     Normalizer2Impl &impl;
   1344     UTrie2 *newFCDTrie;
   1345     UErrorCode &errorCode;
   1346 };
   1347 
   1348 U_CDECL_BEGIN
   1349 
   1350 // Set the FCD value for a range of same-norm16 characters.
   1351 static UBool U_CALLCONV
   1352 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
   1353     return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value);
   1354 }
   1355 
   1356 // Collect (OR together) the FCD values for a range of supplementary characters,
   1357 // for their lead surrogate code unit.
   1358 static UBool U_CALLCONV
   1359 enumRangeOrValue(const void *context, UChar32 start, UChar32 end, uint32_t value) {
   1360     *((uint32_t *)context)|=value;
   1361     return TRUE;
   1362 }
   1363 
   1364 U_CDECL_END
   1365 
   1366 void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) {
   1367     FCDTrieSingleton *me=(FCDTrieSingleton *)context;
   1368     me->newFCDTrie=utrie2_open(0, 0, &errorCode);
   1369     if(U_SUCCESS(errorCode)) {
   1370         utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me);
   1371         for(UChar lead=0xd800; lead<0xdc00; ++lead) {
   1372             uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead);
   1373             utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue);
   1374             if(oredValue!=0) {
   1375                 // Set a "bad" value for makeFCD() to break the quick check loop
   1376                 // and look up the value for the supplementary code point.
   1377                 // If there is any lccc, then set the worst-case lccc of 1.
   1378                 // The ORed-together value's tccc is already the worst case.
   1379                 if(oredValue>0xff) {
   1380                     oredValue=0x100|(oredValue&0xff);
   1381                 }
   1382                 utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode);
   1383             }
   1384         }
   1385         utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode);
   1386         if(U_SUCCESS(errorCode)) {
   1387             return me->newFCDTrie;
   1388         }
   1389     }
   1390     utrie2_close(me->newFCDTrie);
   1391     return NULL;
   1392 }
   1393 
   1394 void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
   1395                                          UTrie2 *newFCDTrie, UErrorCode &errorCode) const {
   1396     // Only loops for 1:1 algorithmic mappings.
   1397     for(;;) {
   1398         if(norm16>=MIN_NORMAL_MAYBE_YES) {
   1399             norm16&=0xff;
   1400             norm16|=norm16<<8;
   1401         } else if(norm16<=minYesNo || minMaybeYes<=norm16) {
   1402             // no decomposition or Hangul syllable, all zeros
   1403             break;
   1404         } else if(limitNoNo<=norm16) {
   1405             int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1);
   1406             if(start==end) {
   1407                 start+=delta;
   1408                 norm16=getNorm16(start);
   1409             } else {
   1410                 // the same delta leads from different original characters to different mappings
   1411                 do {
   1412                     UChar32 c=start+delta;
   1413                     setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode);
   1414                 } while(++start<=end);
   1415                 break;
   1416             }
   1417         } else {
   1418             // c decomposes, get everything from the variable-length extra data
   1419             const uint16_t *mapping=getMapping(norm16);
   1420             uint16_t firstUnit=*mapping;
   1421             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
   1422                 // A character that is deleted (maps to an empty string) must
   1423                 // get the worst-case lccc and tccc values because arbitrary
   1424                 // characters on both sides will become adjacent.
   1425                 norm16=0x1ff;
   1426             } else {
   1427                 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
   1428                     norm16=mapping[1]&0xff00;  // lccc
   1429                 } else {
   1430                     norm16=0;
   1431                 }
   1432                 norm16|=firstUnit>>8;  // tccc
   1433             }
   1434         }
   1435         utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);
   1436         break;
   1437     }
   1438 }
   1439 
   1440 const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {
   1441     // Logically const: Synchronized instantiation.
   1442     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
   1443     return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode);
   1444 }
   1445 
   1446 // Dual functionality:
   1447 // buffer!=NULL: normalize
   1448 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
   1449 const UChar *
   1450 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
   1451                          ReorderingBuffer *buffer,
   1452                          UErrorCode &errorCode) const {
   1453     if(limit==NULL) {
   1454         src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
   1455         if(U_FAILURE(errorCode)) {
   1456             return src;
   1457         }
   1458         limit=u_strchr(src, 0);
   1459     }
   1460 
   1461     // Note: In this function we use buffer->appendZeroCC() because we track
   1462     // the lead and trail combining classes here, rather than leaving it to
   1463     // the ReorderingBuffer.
   1464     // The exception is the call to decomposeShort() which uses the buffer
   1465     // in the normal way.
   1466 
   1467     const UTrie2 *trie=fcdTrie();
   1468 
   1469     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
   1470     // Similar to the prevBoundary in the compose() implementation.
   1471     const UChar *prevBoundary=src;
   1472     const UChar *prevSrc;
   1473     UChar32 c=0;
   1474     int32_t prevFCD16=0;
   1475     uint16_t fcd16=0;
   1476 
   1477     for(;;) {
   1478         // count code units with lccc==0
   1479         for(prevSrc=src; src!=limit;) {
   1480             if((c=*src)<MIN_CCC_LCCC_CP) {
   1481                 prevFCD16=~c;
   1482                 ++src;
   1483             } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {
   1484                 prevFCD16=fcd16;
   1485                 ++src;
   1486             } else if(!U16_IS_SURROGATE(c)) {
   1487                 break;
   1488             } else {
   1489                 UChar c2;
   1490                 if(U16_IS_SURROGATE_LEAD(c)) {
   1491                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
   1492                         c=U16_GET_SUPPLEMENTARY(c, c2);
   1493                     }
   1494                 } else /* trail surrogate */ {
   1495                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
   1496                         --src;
   1497                         c=U16_GET_SUPPLEMENTARY(c2, c);
   1498                     }
   1499                 }
   1500                 if((fcd16=getFCD16(c))<=0xff) {
   1501                     prevFCD16=fcd16;
   1502                     src+=U16_LENGTH(c);
   1503                 } else {
   1504                     break;
   1505                 }
   1506             }
   1507         }
   1508         // copy these code units all at once
   1509         if(src!=prevSrc) {
   1510             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
   1511                 break;
   1512             }
   1513             if(src==limit) {
   1514                 break;
   1515             }
   1516             prevBoundary=src;
   1517             // We know that the previous character's lccc==0.
   1518             if(prevFCD16<0) {
   1519                 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
   1520                 prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);
   1521                 if(prevFCD16>1) {
   1522                     --prevBoundary;
   1523                 }
   1524             } else {
   1525                 const UChar *p=src-1;
   1526                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
   1527                     --p;
   1528                     // Need to fetch the previous character's FCD value because
   1529                     // prevFCD16 was just for the trail surrogate code point.
   1530                     prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);
   1531                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
   1532                 }
   1533                 if(prevFCD16>1) {
   1534                     prevBoundary=p;
   1535                 }
   1536             }
   1537             // The start of the current character (c).
   1538             prevSrc=src;
   1539         } else if(src==limit) {
   1540             break;
   1541         }
   1542 
   1543         src+=U16_LENGTH(c);
   1544         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
   1545         // Check for proper order, and decompose locally if necessary.
   1546         if((prevFCD16&0xff)<=(fcd16>>8)) {
   1547             // proper order: prev tccc <= current lccc
   1548             if((fcd16&0xff)<=1) {
   1549                 prevBoundary=src;
   1550             }
   1551             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
   1552                 break;
   1553             }
   1554             prevFCD16=fcd16;
   1555             continue;
   1556         } else if(buffer==NULL) {
   1557             return prevBoundary;  // quick check "no"
   1558         } else {
   1559             /*
   1560              * Back out the part of the source that we copied or appended
   1561              * already but is now going to be decomposed.
   1562              * prevSrc is set to after what was copied/appended.
   1563              */
   1564             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
   1565             /*
   1566              * Find the part of the source that needs to be decomposed,
   1567              * up to the next safe boundary.
   1568              */
   1569             src=findNextFCDBoundary(src, limit);
   1570             /*
   1571              * The source text does not fulfill the conditions for FCD.
   1572              * Decompose and reorder a limited piece of the text.
   1573              */
   1574             if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
   1575                 break;
   1576             }
   1577             prevBoundary=src;
   1578             prevFCD16=0;
   1579         }
   1580     }
   1581     return src;
   1582 }
   1583 
   1584 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
   1585                                        UBool doMakeFCD,
   1586                                        ReorderingBuffer &buffer,
   1587                                        UErrorCode &errorCode) const {
   1588     if(!buffer.isEmpty()) {
   1589         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
   1590         if(src!=firstBoundaryInSrc) {
   1591             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
   1592                                                                     buffer.getLimit());
   1593             UnicodeString middle(lastBoundaryInDest,
   1594                                  (int32_t)(buffer.getLimit()-lastBoundaryInDest));
   1595             buffer.removeSuffix((int32_t)(buffer.getLimit()-lastBoundaryInDest));
   1596             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
   1597             const UChar *middleStart=middle.getBuffer();
   1598             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
   1599             if(U_FAILURE(errorCode)) {
   1600                 return;
   1601             }
   1602             src=firstBoundaryInSrc;
   1603         }
   1604     }
   1605     if(doMakeFCD) {
   1606         makeFCD(src, limit, &buffer, errorCode);
   1607     } else {
   1608         buffer.appendZeroCC(src, limit, errorCode);
   1609     }
   1610 }
   1611 
   1612 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
   1613     BackwardUTrie2StringIterator iter(fcdTrie(), start, p);
   1614     uint16_t fcd16;
   1615     do {
   1616         fcd16=iter.previous16();
   1617     } while(fcd16>0xff);
   1618     return iter.codePointStart;
   1619 }
   1620 
   1621 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
   1622     ForwardUTrie2StringIterator iter(fcdTrie(), p, limit);
   1623     uint16_t fcd16;
   1624     do {
   1625         fcd16=iter.next16();
   1626     } while(fcd16>0xff);
   1627     return iter.codePointStart;
   1628 }
   1629 
   1630 U_NAMESPACE_END
   1631 
   1632 // Normalizer2 data swapping ----------------------------------------------- ***
   1633 
   1634 U_NAMESPACE_USE
   1635 
   1636 U_CAPI int32_t U_EXPORT2
   1637 unorm2_swap(const UDataSwapper *ds,
   1638             const void *inData, int32_t length, void *outData,
   1639             UErrorCode *pErrorCode) {
   1640     const UDataInfo *pInfo;
   1641     int32_t headerSize;
   1642 
   1643     const uint8_t *inBytes;
   1644     uint8_t *outBytes;
   1645 
   1646     const int32_t *inIndexes;
   1647     int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
   1648 
   1649     int32_t i, offset, nextOffset, size;
   1650 
   1651     /* udata_swapDataHeader checks the arguments */
   1652     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
   1653     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1654         return 0;
   1655     }
   1656 
   1657     /* check data format and format version */
   1658     pInfo=(const UDataInfo *)((const char *)inData+4);
   1659     if(!(
   1660         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
   1661         pInfo->dataFormat[1]==0x72 &&
   1662         pInfo->dataFormat[2]==0x6d &&
   1663         pInfo->dataFormat[3]==0x32 &&
   1664         pInfo->formatVersion[0]==1
   1665     )) {
   1666         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
   1667                          pInfo->dataFormat[0], pInfo->dataFormat[1],
   1668                          pInfo->dataFormat[2], pInfo->dataFormat[3],
   1669                          pInfo->formatVersion[0]);
   1670         *pErrorCode=U_UNSUPPORTED_ERROR;
   1671         return 0;
   1672     }
   1673 
   1674     inBytes=(const uint8_t *)inData+headerSize;
   1675     outBytes=(uint8_t *)outData+headerSize;
   1676 
   1677     inIndexes=(const int32_t *)inBytes;
   1678 
   1679     if(length>=0) {
   1680         length-=headerSize;
   1681         if(length<sizeof(indexes)) {
   1682             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
   1683                              length);
   1684             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1685             return 0;
   1686         }
   1687     }
   1688 
   1689     /* read the first few indexes */
   1690     for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
   1691         indexes[i]=udata_readInt32(ds, inIndexes[i]);
   1692     }
   1693 
   1694     /* get the total length of the data */
   1695     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
   1696 
   1697     if(length>=0) {
   1698         if(length<size) {
   1699             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
   1700                              length);
   1701             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1702             return 0;
   1703         }
   1704 
   1705         /* copy the data for inaccessible bytes */
   1706         if(inBytes!=outBytes) {
   1707             uprv_memcpy(outBytes, inBytes, size);
   1708         }
   1709 
   1710         offset=0;
   1711 
   1712         /* swap the int32_t indexes[] */
   1713         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
   1714         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
   1715         offset=nextOffset;
   1716 
   1717         /* swap the UTrie2 */
   1718         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
   1719         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
   1720         offset=nextOffset;
   1721 
   1722         /* swap the uint16_t extraData[] */
   1723         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1];
   1724         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
   1725         offset=nextOffset;
   1726 
   1727         U_ASSERT(offset==size);
   1728     }
   1729 
   1730     return headerSize+size;
   1731 }
   1732 
   1733 #endif  // !UCONFIG_NO_NORMALIZATION
   1734