Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2009-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  normalizer2impl.cpp
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009nov22
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_NORMALIZATION
     22 
     23 #include "unicode/normalizer2.h"
     24 #include "unicode/udata.h"
     25 #include "unicode/ustring.h"
     26 #include "unicode/utf16.h"
     27 #include "cmemory.h"
     28 #include "mutex.h"
     29 #include "normalizer2impl.h"
     30 #include "putilimp.h"
     31 #include "uassert.h"
     32 #include "uset_imp.h"
     33 #include "utrie2.h"
     34 #include "uvector.h"
     35 
     36 U_NAMESPACE_BEGIN
     37 
     38 // ReorderingBuffer -------------------------------------------------------- ***
     39 
     40 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
     41     int32_t length=str.length();
     42     start=str.getBuffer(destCapacity);
     43     if(start==NULL) {
     44         // getBuffer() already did str.setToBogus()
     45         errorCode=U_MEMORY_ALLOCATION_ERROR;
     46         return FALSE;
     47     }
     48     limit=start+length;
     49     remainingCapacity=str.getCapacity()-length;
     50     reorderStart=start;
     51     if(start==limit) {
     52         lastCC=0;
     53     } else {
     54         setIterator();
     55         lastCC=previousCC();
     56         // Set reorderStart after the last code point with cc<=1 if there is one.
     57         if(lastCC>1) {
     58             while(previousCC()>1) {}
     59         }
     60         reorderStart=codePointLimit;
     61     }
     62     return TRUE;
     63 }
     64 
     65 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
     66     int32_t length=(int32_t)(limit-start);
     67     return
     68         length==(int32_t)(otherLimit-otherStart) &&
     69         0==u_memcmp(start, otherStart, length);
     70 }
     71 
     72 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
     73     if(remainingCapacity<2 && !resize(2, errorCode)) {
     74         return FALSE;
     75     }
     76     if(lastCC<=cc || cc==0) {
     77         limit[0]=U16_LEAD(c);
     78         limit[1]=U16_TRAIL(c);
     79         limit+=2;
     80         lastCC=cc;
     81         if(cc<=1) {
     82             reorderStart=limit;
     83         }
     84     } else {
     85         insert(c, cc);
     86     }
     87     remainingCapacity-=2;
     88     return TRUE;
     89 }
     90 
     91 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
     92                                uint8_t leadCC, uint8_t trailCC,
     93                                UErrorCode &errorCode) {
     94     if(length==0) {
     95         return TRUE;
     96     }
     97     if(remainingCapacity<length && !resize(length, errorCode)) {
     98         return FALSE;
     99     }
    100     remainingCapacity-=length;
    101     if(lastCC<=leadCC || leadCC==0) {
    102         if(trailCC<=1) {
    103             reorderStart=limit+length;
    104         } else if(leadCC<=1) {
    105             reorderStart=limit+1;  // Ok if not a code point boundary.
    106         }
    107         const UChar *sLimit=s+length;
    108         do { *limit++=*s++; } while(s!=sLimit);
    109         lastCC=trailCC;
    110     } else {
    111         int32_t i=0;
    112         UChar32 c;
    113         U16_NEXT(s, i, length, c);
    114         insert(c, leadCC);  // insert first code point
    115         while(i<length) {
    116             U16_NEXT(s, i, length, c);
    117             if(i<length) {
    118                 // s must be in NFD, otherwise we need to use getCC().
    119                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
    120             } else {
    121                 leadCC=trailCC;
    122             }
    123             append(c, leadCC, errorCode);
    124         }
    125     }
    126     return TRUE;
    127 }
    128 
    129 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
    130     int32_t cpLength=U16_LENGTH(c);
    131     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
    132         return FALSE;
    133     }
    134     remainingCapacity-=cpLength;
    135     if(cpLength==1) {
    136         *limit++=(UChar)c;
    137     } else {
    138         limit[0]=U16_LEAD(c);
    139         limit[1]=U16_TRAIL(c);
    140         limit+=2;
    141     }
    142     lastCC=0;
    143     reorderStart=limit;
    144     return TRUE;
    145 }
    146 
    147 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
    148     if(s==sLimit) {
    149         return TRUE;
    150     }
    151     int32_t length=(int32_t)(sLimit-s);
    152     if(remainingCapacity<length && !resize(length, errorCode)) {
    153         return FALSE;
    154     }
    155     u_memcpy(limit, s, length);
    156     limit+=length;
    157     remainingCapacity-=length;
    158     lastCC=0;
    159     reorderStart=limit;
    160     return TRUE;
    161 }
    162 
    163 void ReorderingBuffer::remove() {
    164     reorderStart=limit=start;
    165     remainingCapacity=str.getCapacity();
    166     lastCC=0;
    167 }
    168 
    169 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
    170     if(suffixLength<(limit-start)) {
    171         limit-=suffixLength;
    172         remainingCapacity+=suffixLength;
    173     } else {
    174         limit=start;
    175         remainingCapacity=str.getCapacity();
    176     }
    177     lastCC=0;
    178     reorderStart=limit;
    179 }
    180 
    181 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
    182     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
    183     int32_t length=(int32_t)(limit-start);
    184     str.releaseBuffer(length);
    185     int32_t newCapacity=length+appendLength;
    186     int32_t doubleCapacity=2*str.getCapacity();
    187     if(newCapacity<doubleCapacity) {
    188         newCapacity=doubleCapacity;
    189     }
    190     if(newCapacity<256) {
    191         newCapacity=256;
    192     }
    193     start=str.getBuffer(newCapacity);
    194     if(start==NULL) {
    195         // getBuffer() already did str.setToBogus()
    196         errorCode=U_MEMORY_ALLOCATION_ERROR;
    197         return FALSE;
    198     }
    199     reorderStart=start+reorderStartIndex;
    200     limit=start+length;
    201     remainingCapacity=str.getCapacity()-length;
    202     return TRUE;
    203 }
    204 
    205 void ReorderingBuffer::skipPrevious() {
    206     codePointLimit=codePointStart;
    207     UChar c=*--codePointStart;
    208     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
    209         --codePointStart;
    210     }
    211 }
    212 
    213 uint8_t ReorderingBuffer::previousCC() {
    214     codePointLimit=codePointStart;
    215     if(reorderStart>=codePointStart) {
    216         return 0;
    217     }
    218     UChar32 c=*--codePointStart;
    219     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
    220         return 0;
    221     }
    222 
    223     UChar c2;
    224     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
    225         --codePointStart;
    226         c=U16_GET_SUPPLEMENTARY(c2, c);
    227     }
    228     return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
    229 }
    230 
    231 // Inserts c somewhere before the last character.
    232 // Requires 0<cc<lastCC which implies reorderStart<limit.
    233 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
    234     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
    235     // insert c at codePointLimit, after the character with prevCC<=cc
    236     UChar *q=limit;
    237     UChar *r=limit+=U16_LENGTH(c);
    238     do {
    239         *--r=*--q;
    240     } while(codePointLimit!=q);
    241     writeCodePoint(q, c);
    242     if(cc<=1) {
    243         reorderStart=r;
    244     }
    245 }
    246 
    247 // Normalizer2Impl --------------------------------------------------------- ***
    248 
    249 struct CanonIterData : public UMemory {
    250     CanonIterData(UErrorCode &errorCode);
    251     ~CanonIterData();
    252     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
    253     UTrie2 *trie;
    254     UVector canonStartSets;  // contains UnicodeSet *
    255 };
    256 
    257 Normalizer2Impl::~Normalizer2Impl() {
    258     delete fCanonIterData;
    259 }
    260 
    261 void
    262 Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
    263                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
    264     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
    265     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
    266 
    267     minYesNo=inIndexes[IX_MIN_YES_NO];
    268     minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
    269     minNoNo=inIndexes[IX_MIN_NO_NO];
    270     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
    271     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
    272 
    273     normTrie=inTrie;
    274 
    275     maybeYesCompositions=inExtraData;
    276     extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
    277 
    278     smallFCD=inSmallFCD;
    279 
    280     // Build tccc180[].
    281     // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
    282     uint8_t bits=0;
    283     for(UChar c=0; c<0x180; bits>>=1) {
    284         if((c&0xff)==0) {
    285             bits=smallFCD[c>>8];  // one byte per 0x100 code points
    286         }
    287         if(bits&1) {
    288             for(int i=0; i<0x20; ++i, ++c) {
    289                 tccc180[c]=(uint8_t)getFCD16FromNormData(c);
    290             }
    291         } else {
    292             uprv_memset(tccc180+c, 0, 0x20);
    293             c+=0x20;
    294         }
    295     }
    296 }
    297 
    298 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
    299     UChar32 c;
    300     if(cpStart==(cpLimit-1)) {
    301         c=*cpStart;
    302     } else {
    303         c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
    304     }
    305     uint16_t prevNorm16=getNorm16(c);
    306     if(prevNorm16<=minYesNo) {
    307         return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
    308     } else {
    309         return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
    310     }
    311 }
    312 
    313 namespace {
    314 
    315 class LcccContext {
    316 public:
    317     LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
    318 
    319     void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
    320         if(impl.isAlgorithmicNoNo(norm16)) {
    321             // Range of code points with same-norm16-value algorithmic decompositions.
    322             // They might have different non-zero FCD16 values.
    323             do {
    324                 uint16_t fcd16=impl.getFCD16(start);
    325                 if(fcd16>0xff) { set.add(start); }
    326             } while(++start<=end);
    327         } else {
    328             uint16_t fcd16=impl.getFCD16(start);
    329             if(fcd16>0xff) { set.add(start, end); }
    330         }
    331     }
    332 
    333 private:
    334     const Normalizer2Impl &impl;
    335     UnicodeSet &set;
    336 };
    337 
    338 struct PropertyStartsContext {
    339     PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
    340             : impl(ni), sa(adder) {}
    341 
    342     const Normalizer2Impl &impl;
    343     const USetAdder *sa;
    344 };
    345 
    346 }  // namespace
    347 
    348 U_CDECL_BEGIN
    349 
    350 static UBool U_CALLCONV
    351 enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    352     ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
    353     return TRUE;
    354 }
    355 
    356 static UBool U_CALLCONV
    357 enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    358     /* add the start code point to the USet */
    359     const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
    360     const USetAdder *sa=ctx->sa;
    361     sa->add(sa->set, start);
    362     if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) {
    363         // Range of code points with same-norm16-value algorithmic decompositions.
    364         // They might have different non-zero FCD16 values.
    365         uint16_t prevFCD16=ctx->impl.getFCD16(start);
    366         while(++start<=end) {
    367             uint16_t fcd16=ctx->impl.getFCD16(start);
    368             if(fcd16!=prevFCD16) {
    369                 sa->add(sa->set, start);
    370                 prevFCD16=fcd16;
    371             }
    372         }
    373     }
    374     return TRUE;
    375 }
    376 
    377 static UBool U_CALLCONV
    378 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    379     /* add the start code point to the USet */
    380     const USetAdder *sa=(const USetAdder *)context;
    381     sa->add(sa->set, start);
    382     return TRUE;
    383 }
    384 
    385 static uint32_t U_CALLCONV
    386 segmentStarterMapper(const void * /*context*/, uint32_t value) {
    387     return value&CANON_NOT_SEGMENT_STARTER;
    388 }
    389 
    390 U_CDECL_END
    391 
    392 void
    393 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
    394     /* add the start code point of each same-value range of each trie */
    395     LcccContext context(*this, set);
    396     utrie2_enum(normTrie, NULL, enumLcccRange, &context);
    397 }
    398 
    399 void
    400 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
    401     /* add the start code point of each same-value range of each trie */
    402     PropertyStartsContext context(*this, sa);
    403     utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
    404 
    405     /* add Hangul LV syllables and LV+1 because of skippables */
    406     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
    407         sa->add(sa->set, c);
    408         sa->add(sa->set, c+1);
    409     }
    410     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
    411 }
    412 
    413 void
    414 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
    415     /* add the start code point of each same-value range of the canonical iterator data trie */
    416     if(ensureCanonIterData(errorCode)) {
    417         // currently only used for the SEGMENT_STARTER property
    418         utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
    419     }
    420 }
    421 
    422 const UChar *
    423 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
    424                                                 UChar32 minNeedDataCP,
    425                                                 ReorderingBuffer *buffer,
    426                                                 UErrorCode &errorCode) const {
    427     // Make some effort to support NUL-terminated strings reasonably.
    428     // Take the part of the fast quick check loop that does not look up
    429     // data and check the first part of the string.
    430     // After this prefix, determine the string length to simplify the rest
    431     // of the code.
    432     const UChar *prevSrc=src;
    433     UChar c;
    434     while((c=*src++)<minNeedDataCP && c!=0) {}
    435     // Back out the last character for full processing.
    436     // Copy this prefix.
    437     if(--src!=prevSrc) {
    438         if(buffer!=NULL) {
    439             buffer->appendZeroCC(prevSrc, src, errorCode);
    440         }
    441     }
    442     return src;
    443 }
    444 
    445 UnicodeString &
    446 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
    447                            UErrorCode &errorCode) const {
    448     if(U_FAILURE(errorCode)) {
    449         dest.setToBogus();
    450         return dest;
    451     }
    452     const UChar *sArray=src.getBuffer();
    453     if(&dest==&src || sArray==NULL) {
    454         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    455         dest.setToBogus();
    456         return dest;
    457     }
    458     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
    459     return dest;
    460 }
    461 
    462 void
    463 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
    464                            UnicodeString &dest,
    465                            int32_t destLengthEstimate,
    466                            UErrorCode &errorCode) const {
    467     if(destLengthEstimate<0 && limit!=NULL) {
    468         destLengthEstimate=(int32_t)(limit-src);
    469     }
    470     dest.remove();
    471     ReorderingBuffer buffer(*this, dest);
    472     if(buffer.init(destLengthEstimate, errorCode)) {
    473         decompose(src, limit, &buffer, errorCode);
    474     }
    475 }
    476 
    477 // Dual functionality:
    478 // buffer!=NULL: normalize
    479 // buffer==NULL: isNormalized/spanQuickCheckYes
    480 const UChar *
    481 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
    482                            ReorderingBuffer *buffer,
    483                            UErrorCode &errorCode) const {
    484     UChar32 minNoCP=minDecompNoCP;
    485     if(limit==NULL) {
    486         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
    487         if(U_FAILURE(errorCode)) {
    488             return src;
    489         }
    490         limit=u_strchr(src, 0);
    491     }
    492 
    493     const UChar *prevSrc;
    494     UChar32 c=0;
    495     uint16_t norm16=0;
    496 
    497     // only for quick check
    498     const UChar *prevBoundary=src;
    499     uint8_t prevCC=0;
    500 
    501     for(;;) {
    502         // count code units below the minimum or with irrelevant data for the quick check
    503         for(prevSrc=src; src!=limit;) {
    504             if( (c=*src)<minNoCP ||
    505                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
    506             ) {
    507                 ++src;
    508             } else if(!U16_IS_SURROGATE(c)) {
    509                 break;
    510             } else {
    511                 UChar c2;
    512                 if(U16_IS_SURROGATE_LEAD(c)) {
    513                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
    514                         c=U16_GET_SUPPLEMENTARY(c, c2);
    515                     }
    516                 } else /* trail surrogate */ {
    517                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
    518                         --src;
    519                         c=U16_GET_SUPPLEMENTARY(c2, c);
    520                     }
    521                 }
    522                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
    523                     src+=U16_LENGTH(c);
    524                 } else {
    525                     break;
    526                 }
    527             }
    528         }
    529         // copy these code units all at once
    530         if(src!=prevSrc) {
    531             if(buffer!=NULL) {
    532                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
    533                     break;
    534                 }
    535             } else {
    536                 prevCC=0;
    537                 prevBoundary=src;
    538             }
    539         }
    540         if(src==limit) {
    541             break;
    542         }
    543 
    544         // Check one above-minimum, relevant code point.
    545         src+=U16_LENGTH(c);
    546         if(buffer!=NULL) {
    547             if(!decompose(c, norm16, *buffer, errorCode)) {
    548                 break;
    549             }
    550         } else {
    551             if(isDecompYes(norm16)) {
    552                 uint8_t cc=getCCFromYesOrMaybe(norm16);
    553                 if(prevCC<=cc || cc==0) {
    554                     prevCC=cc;
    555                     if(cc<=1) {
    556                         prevBoundary=src;
    557                     }
    558                     continue;
    559                 }
    560             }
    561             return prevBoundary;  // "no" or cc out of order
    562         }
    563     }
    564     return src;
    565 }
    566 
    567 // Decompose a short piece of text which is likely to contain characters that
    568 // fail the quick check loop and/or where the quick check loop's overhead
    569 // is unlikely to be amortized.
    570 // Called by the compose() and makeFCD() implementations.
    571 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
    572                                       ReorderingBuffer &buffer,
    573                                       UErrorCode &errorCode) const {
    574     while(src<limit) {
    575         UChar32 c;
    576         uint16_t norm16;
    577         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
    578         if(!decompose(c, norm16, buffer, errorCode)) {
    579             return FALSE;
    580         }
    581     }
    582     return TRUE;
    583 }
    584 
    585 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
    586                                  ReorderingBuffer &buffer,
    587                                  UErrorCode &errorCode) const {
    588     // Only loops for 1:1 algorithmic mappings.
    589     for(;;) {
    590         // get the decomposition and the lead and trail cc's
    591         if(isDecompYes(norm16)) {
    592             // c does not decompose
    593             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
    594         } else if(isHangul(norm16)) {
    595             // Hangul syllable: decompose algorithmically
    596             UChar jamos[3];
    597             return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
    598         } else if(isDecompNoAlgorithmic(norm16)) {
    599             c=mapAlgorithmic(c, norm16);
    600             norm16=getNorm16(c);
    601         } else {
    602             // c decomposes, get everything from the variable-length extra data
    603             const uint16_t *mapping=getMapping(norm16);
    604             uint16_t firstUnit=*mapping;
    605             int32_t length=firstUnit&MAPPING_LENGTH_MASK;
    606             uint8_t leadCC, trailCC;
    607             trailCC=(uint8_t)(firstUnit>>8);
    608             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
    609                 leadCC=(uint8_t)(*(mapping-1)>>8);
    610             } else {
    611                 leadCC=0;
    612             }
    613             return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
    614         }
    615     }
    616 }
    617 
    618 const UChar *
    619 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
    620     const UChar *decomp=NULL;
    621     uint16_t norm16;
    622     for(;;) {
    623         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
    624             // c does not decompose
    625             return decomp;
    626         } else if(isHangul(norm16)) {
    627             // Hangul syllable: decompose algorithmically
    628             length=Hangul::decompose(c, buffer);
    629             return buffer;
    630         } else if(isDecompNoAlgorithmic(norm16)) {
    631             c=mapAlgorithmic(c, norm16);
    632             decomp=buffer;
    633             length=0;
    634             U16_APPEND_UNSAFE(buffer, length, c);
    635         } else {
    636             // c decomposes, get everything from the variable-length extra data
    637             const uint16_t *mapping=getMapping(norm16);
    638             length=*mapping&MAPPING_LENGTH_MASK;
    639             return (const UChar *)mapping+1;
    640         }
    641     }
    642 }
    643 
    644 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
    645 // so that a raw mapping fits that consists of one unit ("rm0")
    646 // plus all but the first two code units of the normal mapping.
    647 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
    648 const UChar *
    649 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
    650     // We do not loop in this method because an algorithmic mapping itself
    651     // becomes a final result rather than having to be decomposed recursively.
    652     uint16_t norm16;
    653     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
    654         // c does not decompose
    655         return NULL;
    656     } else if(isHangul(norm16)) {
    657         // Hangul syllable: decompose algorithmically
    658         Hangul::getRawDecomposition(c, buffer);
    659         length=2;
    660         return buffer;
    661     } else if(isDecompNoAlgorithmic(norm16)) {
    662         c=mapAlgorithmic(c, norm16);
    663         length=0;
    664         U16_APPEND_UNSAFE(buffer, length, c);
    665         return buffer;
    666     } else {
    667         // c decomposes, get everything from the variable-length extra data
    668         const uint16_t *mapping=getMapping(norm16);
    669         uint16_t firstUnit=*mapping;
    670         int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
    671         if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
    672             // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
    673             // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
    674             const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
    675             uint16_t rm0=*rawMapping;
    676             if(rm0<=MAPPING_LENGTH_MASK) {
    677                 length=rm0;
    678                 return (const UChar *)rawMapping-rm0;
    679             } else {
    680                 // Copy the normal mapping and replace its first two code units with rm0.
    681                 buffer[0]=(UChar)rm0;
    682                 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
    683                 length=mLength-1;
    684                 return buffer;
    685             }
    686         } else {
    687             length=mLength;
    688             return (const UChar *)mapping+1;
    689         }
    690     }
    691 }
    692 
    693 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
    694                                          UBool doDecompose,
    695                                          UnicodeString &safeMiddle,
    696                                          ReorderingBuffer &buffer,
    697                                          UErrorCode &errorCode) const {
    698     buffer.copyReorderableSuffixTo(safeMiddle);
    699     if(doDecompose) {
    700         decompose(src, limit, &buffer, errorCode);
    701         return;
    702     }
    703     // Just merge the strings at the boundary.
    704     ForwardUTrie2StringIterator iter(normTrie, src, limit);
    705     uint8_t firstCC, prevCC, cc;
    706     firstCC=prevCC=cc=getCC(iter.next16());
    707     while(cc!=0) {
    708         prevCC=cc;
    709         cc=getCC(iter.next16());
    710     };
    711     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
    712         limit=u_strchr(iter.codePointStart, 0);
    713     }
    714 
    715     if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
    716         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
    717     }
    718 }
    719 
    720 // Note: hasDecompBoundary() could be implemented as aliases to
    721 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
    722 // at the cost of building the FCD trie for a decomposition normalizer.
    723 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
    724     for(;;) {
    725         if(c<minDecompNoCP) {
    726             return TRUE;
    727         }
    728         uint16_t norm16=getNorm16(c);
    729         if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
    730             return TRUE;
    731         } else if(norm16>MIN_NORMAL_MAYBE_YES) {
    732             return FALSE;  // ccc!=0
    733         } else if(isDecompNoAlgorithmic(norm16)) {
    734             c=mapAlgorithmic(c, norm16);
    735         } else {
    736             // c decomposes, get everything from the variable-length extra data
    737             const uint16_t *mapping=getMapping(norm16);
    738             uint16_t firstUnit=*mapping;
    739             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
    740                 return FALSE;
    741             }
    742             if(!before) {
    743                 // decomp after-boundary: same as hasFCDBoundaryAfter(),
    744                 // fcd16<=1 || trailCC==0
    745                 if(firstUnit>0x1ff) {
    746                     return FALSE;  // trailCC>1
    747                 }
    748                 if(firstUnit<=0xff) {
    749                     return TRUE;  // trailCC==0
    750                 }
    751                 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
    752             }
    753             // TRUE if leadCC==0 (hasFCDBoundaryBefore())
    754             return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
    755         }
    756     }
    757 }
    758 
    759 /*
    760  * Finds the recomposition result for
    761  * a forward-combining "lead" character,
    762  * specified with a pointer to its compositions list,
    763  * and a backward-combining "trail" character.
    764  *
    765  * If the lead and trail characters combine, then this function returns
    766  * the following "compositeAndFwd" value:
    767  * Bits 21..1  composite character
    768  * Bit      0  set if the composite is a forward-combining starter
    769  * otherwise it returns -1.
    770  *
    771  * The compositions list has (trail, compositeAndFwd) pair entries,
    772  * encoded as either pairs or triples of 16-bit units.
    773  * The last entry has the high bit of its first unit set.
    774  *
    775  * The list is sorted by ascending trail characters (there are no duplicates).
    776  * A linear search is used.
    777  *
    778  * See normalizer2impl.h for a more detailed description
    779  * of the compositions list format.
    780  */
    781 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
    782     uint16_t key1, firstUnit;
    783     if(trail<COMP_1_TRAIL_LIMIT) {
    784         // trail character is 0..33FF
    785         // result entry may have 2 or 3 units
    786         key1=(uint16_t)(trail<<1);
    787         while(key1>(firstUnit=*list)) {
    788             list+=2+(firstUnit&COMP_1_TRIPLE);
    789         }
    790         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
    791             if(firstUnit&COMP_1_TRIPLE) {
    792                 return ((int32_t)list[1]<<16)|list[2];
    793             } else {
    794                 return list[1];
    795             }
    796         }
    797     } else {
    798         // trail character is 3400..10FFFF
    799         // result entry has 3 units
    800         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
    801                         (((trail>>COMP_1_TRAIL_SHIFT))&
    802                           ~COMP_1_TRIPLE));
    803         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
    804         uint16_t secondUnit;
    805         for(;;) {
    806             if(key1>(firstUnit=*list)) {
    807                 list+=2+(firstUnit&COMP_1_TRIPLE);
    808             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
    809                 if(key2>(secondUnit=list[1])) {
    810                     if(firstUnit&COMP_1_LAST_TUPLE) {
    811                         break;
    812                     } else {
    813                         list+=3;
    814                     }
    815                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
    816                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
    817                 } else {
    818                     break;
    819                 }
    820             } else {
    821                 break;
    822             }
    823         }
    824     }
    825     return -1;
    826 }
    827 
    828 /**
    829   * @param list some character's compositions list
    830   * @param set recursively receives the composites from these compositions
    831   */
    832 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
    833     uint16_t firstUnit;
    834     int32_t compositeAndFwd;
    835     do {
    836         firstUnit=*list;
    837         if((firstUnit&COMP_1_TRIPLE)==0) {
    838             compositeAndFwd=list[1];
    839             list+=2;
    840         } else {
    841             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
    842             list+=3;
    843         }
    844         UChar32 composite=compositeAndFwd>>1;
    845         if((compositeAndFwd&1)!=0) {
    846             addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
    847         }
    848         set.add(composite);
    849     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
    850 }
    851 
    852 /*
    853  * Recomposes the buffer text starting at recomposeStartIndex
    854  * (which is in NFD - decomposed and canonically ordered),
    855  * and truncates the buffer contents.
    856  *
    857  * Note that recomposition never lengthens the text:
    858  * Any character consists of either one or two code units;
    859  * a composition may contain at most one more code unit than the original starter,
    860  * while the combining mark that is removed has at least one code unit.
    861  */
    862 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
    863                                 UBool onlyContiguous) const {
    864     UChar *p=buffer.getStart()+recomposeStartIndex;
    865     UChar *limit=buffer.getLimit();
    866     if(p==limit) {
    867         return;
    868     }
    869 
    870     UChar *starter, *pRemove, *q, *r;
    871     const uint16_t *compositionsList;
    872     UChar32 c, compositeAndFwd;
    873     uint16_t norm16;
    874     uint8_t cc, prevCC;
    875     UBool starterIsSupplementary;
    876 
    877     // Some of the following variables are not used until we have a forward-combining starter
    878     // and are only initialized now to avoid compiler warnings.
    879     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
    880     starter=NULL;
    881     starterIsSupplementary=FALSE;
    882     prevCC=0;
    883 
    884     for(;;) {
    885         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
    886         cc=getCCFromYesOrMaybe(norm16);
    887         if( // this character combines backward and
    888             isMaybe(norm16) &&
    889             // we have seen a starter that combines forward and
    890             compositionsList!=NULL &&
    891             // the backward-combining character is not blocked
    892             (prevCC<cc || prevCC==0)
    893         ) {
    894             if(isJamoVT(norm16)) {
    895                 // c is a Jamo V/T, see if we can compose it with the previous character.
    896                 if(c<Hangul::JAMO_T_BASE) {
    897                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
    898                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
    899                     if(prev<Hangul::JAMO_L_COUNT) {
    900                         pRemove=p-1;
    901                         UChar syllable=(UChar)
    902                             (Hangul::HANGUL_BASE+
    903                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
    904                              Hangul::JAMO_T_COUNT);
    905                         UChar t;
    906                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
    907                             ++p;
    908                             syllable+=t;  // The next character was a Jamo T.
    909                         }
    910                         *starter=syllable;
    911                         // remove the Jamo V/T
    912                         q=pRemove;
    913                         r=p;
    914                         while(r<limit) {
    915                             *q++=*r++;
    916                         }
    917                         limit=q;
    918                         p=pRemove;
    919                     }
    920                 }
    921                 /*
    922                  * No "else" for Jamo T:
    923                  * Since the input is in NFD, there are no Hangul LV syllables that
    924                  * a Jamo T could combine with.
    925                  * All Jamo Ts are combined above when handling Jamo Vs.
    926                  */
    927                 if(p==limit) {
    928                     break;
    929                 }
    930                 compositionsList=NULL;
    931                 continue;
    932             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
    933                 // The starter and the combining mark (c) do combine.
    934                 UChar32 composite=compositeAndFwd>>1;
    935 
    936                 // Replace the starter with the composite, remove the combining mark.
    937                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
    938                 if(starterIsSupplementary) {
    939                     if(U_IS_SUPPLEMENTARY(composite)) {
    940                         // both are supplementary
    941                         starter[0]=U16_LEAD(composite);
    942                         starter[1]=U16_TRAIL(composite);
    943                     } else {
    944                         *starter=(UChar)composite;
    945                         // The composite is shorter than the starter,
    946                         // move the intermediate characters forward one.
    947                         starterIsSupplementary=FALSE;
    948                         q=starter+1;
    949                         r=q+1;
    950                         while(r<pRemove) {
    951                             *q++=*r++;
    952                         }
    953                         --pRemove;
    954                     }
    955                 } else if(U_IS_SUPPLEMENTARY(composite)) {
    956                     // The composite is longer than the starter,
    957                     // move the intermediate characters back one.
    958                     starterIsSupplementary=TRUE;
    959                     ++starter;  // temporarily increment for the loop boundary
    960                     q=pRemove;
    961                     r=++pRemove;
    962                     while(starter<q) {
    963                         *--r=*--q;
    964                     }
    965                     *starter=U16_TRAIL(composite);
    966                     *--starter=U16_LEAD(composite);  // undo the temporary increment
    967                 } else {
    968                     // both are on the BMP
    969                     *starter=(UChar)composite;
    970                 }
    971 
    972                 /* remove the combining mark by moving the following text over it */
    973                 if(pRemove<p) {
    974                     q=pRemove;
    975                     r=p;
    976                     while(r<limit) {
    977                         *q++=*r++;
    978                     }
    979                     limit=q;
    980                     p=pRemove;
    981                 }
    982                 // Keep prevCC because we removed the combining mark.
    983 
    984                 if(p==limit) {
    985                     break;
    986                 }
    987                 // Is the composite a starter that combines forward?
    988                 if(compositeAndFwd&1) {
    989                     compositionsList=
    990                         getCompositionsListForComposite(getNorm16(composite));
    991                 } else {
    992                     compositionsList=NULL;
    993                 }
    994 
    995                 // We combined; continue with looking for compositions.
    996                 continue;
    997             }
    998         }
    999 
   1000         // no combination this time
   1001         prevCC=cc;
   1002         if(p==limit) {
   1003             break;
   1004         }
   1005 
   1006         // If c did not combine, then check if it is a starter.
   1007         if(cc==0) {
   1008             // Found a new starter.
   1009             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
   1010                 // It may combine with something, prepare for it.
   1011                 if(U_IS_BMP(c)) {
   1012                     starterIsSupplementary=FALSE;
   1013                     starter=p-1;
   1014                 } else {
   1015                     starterIsSupplementary=TRUE;
   1016                     starter=p-2;
   1017                 }
   1018             }
   1019         } else if(onlyContiguous) {
   1020             // FCC: no discontiguous compositions; any intervening character blocks.
   1021             compositionsList=NULL;
   1022         }
   1023     }
   1024     buffer.setReorderingLimit(limit);
   1025 }
   1026 
   1027 UChar32
   1028 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
   1029     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
   1030     const uint16_t *list;
   1031     if(isInert(norm16)) {
   1032         return U_SENTINEL;
   1033     } else if(norm16<minYesNoMappingsOnly) {
   1034         if(isJamoL(norm16)) {
   1035             b-=Hangul::JAMO_V_BASE;
   1036             if(0<=b && b<Hangul::JAMO_V_COUNT) {
   1037                 return
   1038                     (Hangul::HANGUL_BASE+
   1039                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
   1040                      Hangul::JAMO_T_COUNT);
   1041             } else {
   1042                 return U_SENTINEL;
   1043             }
   1044         } else if(isHangul(norm16)) {
   1045             b-=Hangul::JAMO_T_BASE;
   1046             if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
   1047                 return a+b;
   1048             } else {
   1049                 return U_SENTINEL;
   1050             }
   1051         } else {
   1052             // 'a' has a compositions list in extraData
   1053             list=extraData+norm16;
   1054             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
   1055                 list+=  // mapping pointer
   1056                     1+  // +1 to skip the first unit with the mapping lenth
   1057                     (*list&MAPPING_LENGTH_MASK);  // + mapping length
   1058             }
   1059         }
   1060     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
   1061         return U_SENTINEL;
   1062     } else {
   1063         list=maybeYesCompositions+norm16-minMaybeYes;
   1064     }
   1065     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
   1066         return U_SENTINEL;
   1067     }
   1068 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
   1069     return combine(list, b)>>1;
   1070 #else
   1071     int32_t compositeAndFwd=combine(list, b);
   1072     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
   1073 #endif
   1074 }
   1075 
   1076 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
   1077 // doCompose: normalize
   1078 // !doCompose: isNormalized (buffer must be empty and initialized)
   1079 UBool
   1080 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
   1081                          UBool onlyContiguous,
   1082                          UBool doCompose,
   1083                          ReorderingBuffer &buffer,
   1084                          UErrorCode &errorCode) const {
   1085     /*
   1086      * prevBoundary points to the last character before the current one
   1087      * that has a composition boundary before it with ccc==0 and quick check "yes".
   1088      * Keeping track of prevBoundary saves us looking for a composition boundary
   1089      * when we find a "no" or "maybe".
   1090      *
   1091      * When we back out from prevSrc back to prevBoundary,
   1092      * then we also remove those same characters (which had been simply copied
   1093      * or canonically-order-inserted) from the ReorderingBuffer.
   1094      * Therefore, at all times, the [prevBoundary..prevSrc[ source units
   1095      * must correspond 1:1 to destination units at the end of the destination buffer.
   1096      */
   1097     const UChar *prevBoundary=src;
   1098     UChar32 minNoMaybeCP=minCompNoMaybeCP;
   1099     if(limit==NULL) {
   1100         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
   1101                                            doCompose ? &buffer : NULL,
   1102                                            errorCode);
   1103         if(U_FAILURE(errorCode)) {
   1104             return FALSE;
   1105         }
   1106         if(prevBoundary<src) {
   1107             // Set prevBoundary to the last character in the prefix.
   1108             prevBoundary=src-1;
   1109         }
   1110         limit=u_strchr(src, 0);
   1111     }
   1112 
   1113     const UChar *prevSrc;
   1114     UChar32 c=0;
   1115     uint16_t norm16=0;
   1116 
   1117     // only for isNormalized
   1118     uint8_t prevCC=0;
   1119 
   1120     for(;;) {
   1121         // count code units below the minimum or with irrelevant data for the quick check
   1122         for(prevSrc=src; src!=limit;) {
   1123             if( (c=*src)<minNoMaybeCP ||
   1124                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
   1125             ) {
   1126                 ++src;
   1127             } else if(!U16_IS_SURROGATE(c)) {
   1128                 break;
   1129             } else {
   1130                 UChar c2;
   1131                 if(U16_IS_SURROGATE_LEAD(c)) {
   1132                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
   1133                         c=U16_GET_SUPPLEMENTARY(c, c2);
   1134                     }
   1135                 } else /* trail surrogate */ {
   1136                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
   1137                         --src;
   1138                         c=U16_GET_SUPPLEMENTARY(c2, c);
   1139                     }
   1140                 }
   1141                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
   1142                     src+=U16_LENGTH(c);
   1143                 } else {
   1144                     break;
   1145                 }
   1146             }
   1147         }
   1148         // copy these code units all at once
   1149         if(src!=prevSrc) {
   1150             if(doCompose) {
   1151                 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
   1152                     break;
   1153                 }
   1154             } else {
   1155                 prevCC=0;
   1156             }
   1157             if(src==limit) {
   1158                 break;
   1159             }
   1160             // Set prevBoundary to the last character in the quick check loop.
   1161             prevBoundary=src-1;
   1162             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
   1163                 U16_IS_LEAD(*(prevBoundary-1))
   1164             ) {
   1165                 --prevBoundary;
   1166             }
   1167             // The start of the current character (c).
   1168             prevSrc=src;
   1169         } else if(src==limit) {
   1170             break;
   1171         }
   1172 
   1173         src+=U16_LENGTH(c);
   1174         /*
   1175          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
   1176          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
   1177          * or has ccc!=0.
   1178          * Check for Jamo V/T, then for regular characters.
   1179          * c is not a Hangul syllable or Jamo L because those have "yes" properties.
   1180          */
   1181         if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
   1182             UChar prev=*(prevSrc-1);
   1183             UBool needToDecompose=FALSE;
   1184             if(c<Hangul::JAMO_T_BASE) {
   1185                 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
   1186                 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
   1187                 if(prev<Hangul::JAMO_L_COUNT) {
   1188                     if(!doCompose) {
   1189                         return FALSE;
   1190                     }
   1191                     UChar syllable=(UChar)
   1192                         (Hangul::HANGUL_BASE+
   1193                          (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
   1194                          Hangul::JAMO_T_COUNT);
   1195                     UChar t;
   1196                     if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
   1197                         ++src;
   1198                         syllable+=t;  // The next character was a Jamo T.
   1199                         prevBoundary=src;
   1200                         buffer.setLastChar(syllable);
   1201                         continue;
   1202                     }
   1203                     // If we see L+V+x where x!=T then we drop to the slow path,
   1204                     // decompose and recompose.
   1205                     // This is to deal with NFKC finding normal L and V but a
   1206                     // compatibility variant of a T. We need to either fully compose that
   1207                     // combination here (which would complicate the code and may not work
   1208                     // with strange custom data) or use the slow path -- or else our replacing
   1209                     // two input characters (L+V) with one output character (LV syllable)
   1210                     // would violate the invariant that [prevBoundary..prevSrc[ has the same
   1211                     // length as what we appended to the buffer since prevBoundary.
   1212                     needToDecompose=TRUE;
   1213                 }
   1214             } else if(Hangul::isHangulWithoutJamoT(prev)) {
   1215                 // c is a Jamo Trailing consonant,
   1216                 // compose with previous Hangul LV that does not contain a Jamo T.
   1217                 if(!doCompose) {
   1218                     return FALSE;
   1219                 }
   1220                 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
   1221                 prevBoundary=src;
   1222                 continue;
   1223             }
   1224             if(!needToDecompose) {
   1225                 // The Jamo V/T did not compose into a Hangul syllable.
   1226                 if(doCompose) {
   1227                     if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
   1228                         break;
   1229                     }
   1230                 } else {
   1231                     prevCC=0;
   1232                 }
   1233                 continue;
   1234             }
   1235         }
   1236         /*
   1237          * Source buffer pointers:
   1238          *
   1239          *  all done      quick check   current char  not yet
   1240          *                "yes" but     (c)           processed
   1241          *                may combine
   1242          *                forward
   1243          * [-------------[-------------[-------------[-------------[
   1244          * |             |             |             |             |
   1245          * orig. src     prevBoundary  prevSrc       src           limit
   1246          *
   1247          *
   1248          * Destination buffer pointers inside the ReorderingBuffer:
   1249          *
   1250          *  all done      might take    not filled yet
   1251          *                characters for
   1252          *                reordering
   1253          * [-------------[-------------[-------------[
   1254          * |             |             |             |
   1255          * start         reorderStart  limit         |
   1256          *                             +remainingCap.+
   1257          */
   1258         if(norm16>=MIN_YES_YES_WITH_CC) {
   1259             uint8_t cc=(uint8_t)norm16;  // cc!=0
   1260             if( onlyContiguous &&  // FCC
   1261                 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
   1262                 prevBoundary<prevSrc &&
   1263                 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
   1264                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
   1265                 // passed the quick check "yes && ccc==0" test.
   1266                 // Check whether the last character was a "yesYes" or a "yesNo".
   1267                 // If a "yesNo", then we get its trailing ccc from its
   1268                 // mapping and check for canonical order.
   1269                 // All other cases are ok.
   1270                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
   1271             ) {
   1272                 // Fails FCD test, need to decompose and contiguously recompose.
   1273                 if(!doCompose) {
   1274                     return FALSE;
   1275                 }
   1276             } else if(doCompose) {
   1277                 if(!buffer.append(c, cc, errorCode)) {
   1278                     break;
   1279                 }
   1280                 continue;
   1281             } else if(prevCC<=cc) {
   1282                 prevCC=cc;
   1283                 continue;
   1284             } else {
   1285                 return FALSE;
   1286             }
   1287         } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
   1288             return FALSE;
   1289         }
   1290 
   1291         /*
   1292          * Find appropriate boundaries around this character,
   1293          * decompose the source text from between the boundaries,
   1294          * and recompose it.
   1295          *
   1296          * We may need to remove the last few characters from the ReorderingBuffer
   1297          * to account for source text that was copied or appended
   1298          * but needs to take part in the recomposition.
   1299          */
   1300 
   1301         /*
   1302          * Find the last composition boundary in [prevBoundary..src[.
   1303          * It is either the decomposition of the current character (at prevSrc),
   1304          * or prevBoundary.
   1305          */
   1306         if(hasCompBoundaryBefore(c, norm16)) {
   1307             prevBoundary=prevSrc;
   1308         } else if(doCompose) {
   1309             buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
   1310         }
   1311 
   1312         // Find the next composition boundary in [src..limit[ -
   1313         // modifies src to point to the next starter.
   1314         src=(UChar *)findNextCompBoundary(src, limit);
   1315 
   1316         // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
   1317         int32_t recomposeStartIndex=buffer.length();
   1318         if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
   1319             break;
   1320         }
   1321         recompose(buffer, recomposeStartIndex, onlyContiguous);
   1322         if(!doCompose) {
   1323             if(!buffer.equals(prevBoundary, src)) {
   1324                 return FALSE;
   1325             }
   1326             buffer.remove();
   1327             prevCC=0;
   1328         }
   1329 
   1330         // Move to the next starter. We never need to look back before this point again.
   1331         prevBoundary=src;
   1332     }
   1333     return TRUE;
   1334 }
   1335 
   1336 // Very similar to compose(): Make the same changes in both places if relevant.
   1337 // pQCResult==NULL: spanQuickCheckYes
   1338 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
   1339 const UChar *
   1340 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
   1341                                    UBool onlyContiguous,
   1342                                    UNormalizationCheckResult *pQCResult) const {
   1343     /*
   1344      * prevBoundary points to the last character before the current one
   1345      * that has a composition boundary before it with ccc==0 and quick check "yes".
   1346      */
   1347     const UChar *prevBoundary=src;
   1348     UChar32 minNoMaybeCP=minCompNoMaybeCP;
   1349     if(limit==NULL) {
   1350         UErrorCode errorCode=U_ZERO_ERROR;
   1351         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
   1352         if(prevBoundary<src) {
   1353             // Set prevBoundary to the last character in the prefix.
   1354             prevBoundary=src-1;
   1355         }
   1356         limit=u_strchr(src, 0);
   1357     }
   1358 
   1359     const UChar *prevSrc;
   1360     UChar32 c=0;
   1361     uint16_t norm16=0;
   1362     uint8_t prevCC=0;
   1363 
   1364     for(;;) {
   1365         // count code units below the minimum or with irrelevant data for the quick check
   1366         for(prevSrc=src;;) {
   1367             if(src==limit) {
   1368                 return src;
   1369             }
   1370             if( (c=*src)<minNoMaybeCP ||
   1371                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
   1372             ) {
   1373                 ++src;
   1374             } else if(!U16_IS_SURROGATE(c)) {
   1375                 break;
   1376             } else {
   1377                 UChar c2;
   1378                 if(U16_IS_SURROGATE_LEAD(c)) {
   1379                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
   1380                         c=U16_GET_SUPPLEMENTARY(c, c2);
   1381                     }
   1382                 } else /* trail surrogate */ {
   1383                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
   1384                         --src;
   1385                         c=U16_GET_SUPPLEMENTARY(c2, c);
   1386                     }
   1387                 }
   1388                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
   1389                     src+=U16_LENGTH(c);
   1390                 } else {
   1391                     break;
   1392                 }
   1393             }
   1394         }
   1395         if(src!=prevSrc) {
   1396             // Set prevBoundary to the last character in the quick check loop.
   1397             prevBoundary=src-1;
   1398             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
   1399                 U16_IS_LEAD(*(prevBoundary-1))
   1400             ) {
   1401                 --prevBoundary;
   1402             }
   1403             prevCC=0;
   1404             // The start of the current character (c).
   1405             prevSrc=src;
   1406         }
   1407 
   1408         src+=U16_LENGTH(c);
   1409         /*
   1410          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
   1411          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
   1412          * or has ccc!=0.
   1413          */
   1414         if(isMaybeOrNonZeroCC(norm16)) {
   1415             uint8_t cc=getCCFromYesOrMaybe(norm16);
   1416             if( onlyContiguous &&  // FCC
   1417                 cc!=0 &&
   1418                 prevCC==0 &&
   1419                 prevBoundary<prevSrc &&
   1420                 // prevCC==0 && prevBoundary<prevSrc tell us that
   1421                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
   1422                 // passed the quick check "yes && ccc==0" test.
   1423                 // Check whether the last character was a "yesYes" or a "yesNo".
   1424                 // If a "yesNo", then we get its trailing ccc from its
   1425                 // mapping and check for canonical order.
   1426                 // All other cases are ok.
   1427                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
   1428             ) {
   1429                 // Fails FCD test.
   1430             } else if(prevCC<=cc || cc==0) {
   1431                 prevCC=cc;
   1432                 if(norm16<MIN_YES_YES_WITH_CC) {
   1433                     if(pQCResult!=NULL) {
   1434                         *pQCResult=UNORM_MAYBE;
   1435                     } else {
   1436                         return prevBoundary;
   1437                     }
   1438                 }
   1439                 continue;
   1440             }
   1441         }
   1442         if(pQCResult!=NULL) {
   1443             *pQCResult=UNORM_NO;
   1444         }
   1445         return prevBoundary;
   1446     }
   1447 }
   1448 
   1449 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
   1450                                        UBool doCompose,
   1451                                        UBool onlyContiguous,
   1452                                        UnicodeString &safeMiddle,
   1453                                        ReorderingBuffer &buffer,
   1454                                        UErrorCode &errorCode) const {
   1455     if(!buffer.isEmpty()) {
   1456         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
   1457         if(src!=firstStarterInSrc) {
   1458             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
   1459                                                                     buffer.getLimit());
   1460             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
   1461             UnicodeString middle(lastStarterInDest, destSuffixLength);
   1462             buffer.removeSuffix(destSuffixLength);
   1463             safeMiddle=middle;
   1464             middle.append(src, (int32_t)(firstStarterInSrc-src));
   1465             const UChar *middleStart=middle.getBuffer();
   1466             compose(middleStart, middleStart+middle.length(), onlyContiguous,
   1467                     TRUE, buffer, errorCode);
   1468             if(U_FAILURE(errorCode)) {
   1469                 return;
   1470             }
   1471             src=firstStarterInSrc;
   1472         }
   1473     }
   1474     if(doCompose) {
   1475         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
   1476     } else {
   1477         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
   1478             limit=u_strchr(src, 0);
   1479         }
   1480         buffer.appendZeroCC(src, limit, errorCode);
   1481     }
   1482 }
   1483 
   1484 /**
   1485  * Does c have a composition boundary before it?
   1486  * True if its decomposition begins with a character that has
   1487  * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
   1488  * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
   1489  * (isCompYesAndZeroCC()) so we need not decompose.
   1490  */
   1491 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
   1492     for(;;) {
   1493         if(isCompYesAndZeroCC(norm16)) {
   1494             return TRUE;
   1495         } else if(isMaybeOrNonZeroCC(norm16)) {
   1496             return FALSE;
   1497         } else if(isDecompNoAlgorithmic(norm16)) {
   1498             c=mapAlgorithmic(c, norm16);
   1499             norm16=getNorm16(c);
   1500         } else {
   1501             // c decomposes, get everything from the variable-length extra data
   1502             const uint16_t *mapping=getMapping(norm16);
   1503             uint16_t firstUnit=*mapping;
   1504             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
   1505                 return FALSE;
   1506             }
   1507             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
   1508                 return FALSE;  // non-zero leadCC
   1509             }
   1510             int32_t i=1;  // skip over the firstUnit
   1511             UChar32 c;
   1512             U16_NEXT_UNSAFE(mapping, i, c);
   1513             return isCompYesAndZeroCC(getNorm16(c));
   1514         }
   1515     }
   1516 }
   1517 
   1518 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
   1519     for(;;) {
   1520         uint16_t norm16=getNorm16(c);
   1521         if(isInert(norm16)) {
   1522             return TRUE;
   1523         } else if(norm16<=minYesNo) {
   1524             // Hangul: norm16==minYesNo
   1525             // Hangul LVT has a boundary after it.
   1526             // Hangul LV and non-inert yesYes characters combine forward.
   1527             return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
   1528         } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
   1529             return FALSE;
   1530         } else if(isDecompNoAlgorithmic(norm16)) {
   1531             c=mapAlgorithmic(c, norm16);
   1532         } else {
   1533             // c decomposes, get everything from the variable-length extra data.
   1534             // If testInert, then c must be a yesNo character which has lccc=0,
   1535             // otherwise it could be a noNo.
   1536             const uint16_t *mapping=getMapping(norm16);
   1537             uint16_t firstUnit=*mapping;
   1538             // TRUE if
   1539             //   not MAPPING_NO_COMP_BOUNDARY_AFTER
   1540             //     (which is set if
   1541             //       c is not deleted, and
   1542             //       it and its decomposition do not combine forward, and it has a starter)
   1543             //   and if FCC then trailCC<=1
   1544             return
   1545                 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
   1546                 (!onlyContiguous || firstUnit<=0x1ff);
   1547         }
   1548     }
   1549 }
   1550 
   1551 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
   1552     BackwardUTrie2StringIterator iter(normTrie, start, p);
   1553     uint16_t norm16;
   1554     do {
   1555         norm16=iter.previous16();
   1556     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
   1557     // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
   1558     // but that's probably not worth the extra cost.
   1559     return iter.codePointStart;
   1560 }
   1561 
   1562 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
   1563     ForwardUTrie2StringIterator iter(normTrie, p, limit);
   1564     uint16_t norm16;
   1565     do {
   1566         norm16=iter.next16();
   1567     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
   1568     return iter.codePointStart;
   1569 }
   1570 
   1571 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
   1572 // still had getFCDTrie() which built and cached an FCD trie.
   1573 // That provided faster access to FCD data than getFCD16FromNormData()
   1574 // but required synchronization and consumed some 10kB of heap memory
   1575 // in any process that uses FCD (e.g., via collation).
   1576 // tccc180[] and smallFCD[] are intended to help with any loss of performance,
   1577 // at least for Latin & CJK.
   1578 
   1579 // Gets the FCD value from the regular normalization data.
   1580 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
   1581     // Only loops for 1:1 algorithmic mappings.
   1582     for(;;) {
   1583         uint16_t norm16=getNorm16(c);
   1584         if(norm16<=minYesNo) {
   1585             // no decomposition or Hangul syllable, all zeros
   1586             return 0;
   1587         } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
   1588             // combining mark
   1589             norm16&=0xff;
   1590             return norm16|(norm16<<8);
   1591         } else if(norm16>=minMaybeYes) {
   1592             return 0;
   1593         } else if(isDecompNoAlgorithmic(norm16)) {
   1594             c=mapAlgorithmic(c, norm16);
   1595         } else {
   1596             // c decomposes, get everything from the variable-length extra data
   1597             const uint16_t *mapping=getMapping(norm16);
   1598             uint16_t firstUnit=*mapping;
   1599             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
   1600                 // A character that is deleted (maps to an empty string) must
   1601                 // get the worst-case lccc and tccc values because arbitrary
   1602                 // characters on both sides will become adjacent.
   1603                 return 0x1ff;
   1604             } else {
   1605                 norm16=firstUnit>>8;  // tccc
   1606                 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
   1607                     norm16|=*(mapping-1)&0xff00;  // lccc
   1608                 }
   1609                 return norm16;
   1610             }
   1611         }
   1612     }
   1613 }
   1614 
   1615 // Dual functionality:
   1616 // buffer!=NULL: normalize
   1617 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
   1618 const UChar *
   1619 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
   1620                          ReorderingBuffer *buffer,
   1621                          UErrorCode &errorCode) const {
   1622     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
   1623     // Similar to the prevBoundary in the compose() implementation.
   1624     const UChar *prevBoundary=src;
   1625     int32_t prevFCD16=0;
   1626     if(limit==NULL) {
   1627         src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
   1628         if(U_FAILURE(errorCode)) {
   1629             return src;
   1630         }
   1631         if(prevBoundary<src) {
   1632             prevBoundary=src;
   1633             // We know that the previous character's lccc==0.
   1634             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
   1635             prevFCD16=getFCD16(*(src-1));
   1636             if(prevFCD16>1) {
   1637                 --prevBoundary;
   1638             }
   1639         }
   1640         limit=u_strchr(src, 0);
   1641     }
   1642 
   1643     // Note: In this function we use buffer->appendZeroCC() because we track
   1644     // the lead and trail combining classes here, rather than leaving it to
   1645     // the ReorderingBuffer.
   1646     // The exception is the call to decomposeShort() which uses the buffer
   1647     // in the normal way.
   1648 
   1649     const UChar *prevSrc;
   1650     UChar32 c=0;
   1651     uint16_t fcd16=0;
   1652 
   1653     for(;;) {
   1654         // count code units with lccc==0
   1655         for(prevSrc=src; src!=limit;) {
   1656             if((c=*src)<MIN_CCC_LCCC_CP) {
   1657                 prevFCD16=~c;
   1658                 ++src;
   1659             } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
   1660                 prevFCD16=0;
   1661                 ++src;
   1662             } else {
   1663                 if(U16_IS_SURROGATE(c)) {
   1664                     UChar c2;
   1665                     if(U16_IS_SURROGATE_LEAD(c)) {
   1666                         if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
   1667                             c=U16_GET_SUPPLEMENTARY(c, c2);
   1668                         }
   1669                     } else /* trail surrogate */ {
   1670                         if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
   1671                             --src;
   1672                             c=U16_GET_SUPPLEMENTARY(c2, c);
   1673                         }
   1674                     }
   1675                 }
   1676                 if((fcd16=getFCD16FromNormData(c))<=0xff) {
   1677                     prevFCD16=fcd16;
   1678                     src+=U16_LENGTH(c);
   1679                 } else {
   1680                     break;
   1681                 }
   1682             }
   1683         }
   1684         // copy these code units all at once
   1685         if(src!=prevSrc) {
   1686             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
   1687                 break;
   1688             }
   1689             if(src==limit) {
   1690                 break;
   1691             }
   1692             prevBoundary=src;
   1693             // We know that the previous character's lccc==0.
   1694             if(prevFCD16<0) {
   1695                 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
   1696                 UChar32 prev=~prevFCD16;
   1697                 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
   1698                 if(prevFCD16>1) {
   1699                     --prevBoundary;
   1700                 }
   1701             } else {
   1702                 const UChar *p=src-1;
   1703                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
   1704                     --p;
   1705                     // Need to fetch the previous character's FCD value because
   1706                     // prevFCD16 was just for the trail surrogate code point.
   1707                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
   1708                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
   1709                 }
   1710                 if(prevFCD16>1) {
   1711                     prevBoundary=p;
   1712                 }
   1713             }
   1714             // The start of the current character (c).
   1715             prevSrc=src;
   1716         } else if(src==limit) {
   1717             break;
   1718         }
   1719 
   1720         src+=U16_LENGTH(c);
   1721         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
   1722         // Check for proper order, and decompose locally if necessary.
   1723         if((prevFCD16&0xff)<=(fcd16>>8)) {
   1724             // proper order: prev tccc <= current lccc
   1725             if((fcd16&0xff)<=1) {
   1726                 prevBoundary=src;
   1727             }
   1728             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
   1729                 break;
   1730             }
   1731             prevFCD16=fcd16;
   1732             continue;
   1733         } else if(buffer==NULL) {
   1734             return prevBoundary;  // quick check "no"
   1735         } else {
   1736             /*
   1737              * Back out the part of the source that we copied or appended
   1738              * already but is now going to be decomposed.
   1739              * prevSrc is set to after what was copied/appended.
   1740              */
   1741             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
   1742             /*
   1743              * Find the part of the source that needs to be decomposed,
   1744              * up to the next safe boundary.
   1745              */
   1746             src=findNextFCDBoundary(src, limit);
   1747             /*
   1748              * The source text does not fulfill the conditions for FCD.
   1749              * Decompose and reorder a limited piece of the text.
   1750              */
   1751             if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
   1752                 break;
   1753             }
   1754             prevBoundary=src;
   1755             prevFCD16=0;
   1756         }
   1757     }
   1758     return src;
   1759 }
   1760 
   1761 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
   1762                                        UBool doMakeFCD,
   1763                                        UnicodeString &safeMiddle,
   1764                                        ReorderingBuffer &buffer,
   1765                                        UErrorCode &errorCode) const {
   1766     if(!buffer.isEmpty()) {
   1767         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
   1768         if(src!=firstBoundaryInSrc) {
   1769             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
   1770                                                                     buffer.getLimit());
   1771             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
   1772             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
   1773             buffer.removeSuffix(destSuffixLength);
   1774             safeMiddle=middle;
   1775             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
   1776             const UChar *middleStart=middle.getBuffer();
   1777             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
   1778             if(U_FAILURE(errorCode)) {
   1779                 return;
   1780             }
   1781             src=firstBoundaryInSrc;
   1782         }
   1783     }
   1784     if(doMakeFCD) {
   1785         makeFCD(src, limit, &buffer, errorCode);
   1786     } else {
   1787         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
   1788             limit=u_strchr(src, 0);
   1789         }
   1790         buffer.appendZeroCC(src, limit, errorCode);
   1791     }
   1792 }
   1793 
   1794 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
   1795     while(start<p && previousFCD16(start, p)>0xff) {}
   1796     return p;
   1797 }
   1798 
   1799 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
   1800     while(p<limit) {
   1801         const UChar *codePointStart=p;
   1802         if(nextFCD16(p, limit)<=0xff) {
   1803             return codePointStart;
   1804         }
   1805     }
   1806     return p;
   1807 }
   1808 
   1809 // CanonicalIterator data -------------------------------------------------- ***
   1810 
   1811 CanonIterData::CanonIterData(UErrorCode &errorCode) :
   1812         trie(utrie2_open(0, 0, &errorCode)),
   1813         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
   1814 
   1815 CanonIterData::~CanonIterData() {
   1816     utrie2_close(trie);
   1817 }
   1818 
   1819 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
   1820     uint32_t canonValue=utrie2_get32(trie, decompLead);
   1821     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
   1822         // origin is the first character whose decomposition starts with
   1823         // the character for which we are setting the value.
   1824         utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
   1825     } else {
   1826         // origin is not the first character, or it is U+0000.
   1827         UnicodeSet *set;
   1828         if((canonValue&CANON_HAS_SET)==0) {
   1829             set=new UnicodeSet;
   1830             if(set==NULL) {
   1831                 errorCode=U_MEMORY_ALLOCATION_ERROR;
   1832                 return;
   1833             }
   1834             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
   1835             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
   1836             utrie2_set32(trie, decompLead, canonValue, &errorCode);
   1837             canonStartSets.addElement(set, errorCode);
   1838             if(firstOrigin!=0) {
   1839                 set->add(firstOrigin);
   1840             }
   1841         } else {
   1842             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
   1843         }
   1844         set->add(origin);
   1845     }
   1846 }
   1847 
   1848 U_CDECL_BEGIN
   1849 
   1850 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
   1851 //     context: the Normalizer2Impl
   1852 static UBool U_CALLCONV
   1853 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
   1854     UErrorCode errorCode = U_ZERO_ERROR;
   1855     if (value != 0) {
   1856         Normalizer2Impl *impl = (Normalizer2Impl *)context;
   1857         impl->makeCanonIterDataFromNorm16(
   1858             start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
   1859     }
   1860     return U_SUCCESS(errorCode);
   1861 }
   1862 
   1863 
   1864 
   1865 // UInitOnce instantiation function for CanonIterData
   1866 
   1867 static void U_CALLCONV
   1868 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
   1869     U_ASSERT(impl->fCanonIterData == NULL);
   1870     impl->fCanonIterData = new CanonIterData(errorCode);
   1871     if (impl->fCanonIterData == NULL) {
   1872         errorCode=U_MEMORY_ALLOCATION_ERROR;
   1873     }
   1874     if (U_SUCCESS(errorCode)) {
   1875         utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
   1876         utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
   1877     }
   1878     if (U_FAILURE(errorCode)) {
   1879         delete impl->fCanonIterData;
   1880         impl->fCanonIterData = NULL;
   1881     }
   1882 }
   1883 
   1884 U_CDECL_END
   1885 
   1886 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
   1887                                                   CanonIterData &newData,
   1888                                                   UErrorCode &errorCode) const {
   1889     if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
   1890         // Inert, or 2-way mapping (including Hangul syllable).
   1891         // We do not write a canonStartSet for any yesNo character.
   1892         // Composites from 2-way mappings are added at runtime from the
   1893         // starter's compositions list, and the other characters in
   1894         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
   1895         // "maybe" characters.
   1896         return;
   1897     }
   1898     for(UChar32 c=start; c<=end; ++c) {
   1899         uint32_t oldValue=utrie2_get32(newData.trie, c);
   1900         uint32_t newValue=oldValue;
   1901         if(norm16>=minMaybeYes) {
   1902             // not a segment starter if it occurs in a decomposition or has cc!=0
   1903             newValue|=CANON_NOT_SEGMENT_STARTER;
   1904             if(norm16<MIN_NORMAL_MAYBE_YES) {
   1905                 newValue|=CANON_HAS_COMPOSITIONS;
   1906             }
   1907         } else if(norm16<minYesNo) {
   1908             newValue|=CANON_HAS_COMPOSITIONS;
   1909         } else {
   1910             // c has a one-way decomposition
   1911             UChar32 c2=c;
   1912             uint16_t norm16_2=norm16;
   1913             while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
   1914                 c2=mapAlgorithmic(c2, norm16_2);
   1915                 norm16_2=getNorm16(c2);
   1916             }
   1917             if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
   1918                 // c decomposes, get everything from the variable-length extra data
   1919                 const uint16_t *mapping=getMapping(norm16_2);
   1920                 uint16_t firstUnit=*mapping;
   1921                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
   1922                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
   1923                     if(c==c2 && (*(mapping-1)&0xff)!=0) {
   1924                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
   1925                     }
   1926                 }
   1927                 // Skip empty mappings (no characters in the decomposition).
   1928                 if(length!=0) {
   1929                     ++mapping;  // skip over the firstUnit
   1930                     // add c to first code point's start set
   1931                     int32_t i=0;
   1932                     U16_NEXT_UNSAFE(mapping, i, c2);
   1933                     newData.addToStartSet(c, c2, errorCode);
   1934                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
   1935                     // one-way mapping. A 2-way mapping is possible here after
   1936                     // intermediate algorithmic mapping.
   1937                     if(norm16_2>=minNoNo) {
   1938                         while(i<length) {
   1939                             U16_NEXT_UNSAFE(mapping, i, c2);
   1940                             uint32_t c2Value=utrie2_get32(newData.trie, c2);
   1941                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
   1942                                 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
   1943                                              &errorCode);
   1944                             }
   1945                         }
   1946                     }
   1947                 }
   1948             } else {
   1949                 // c decomposed to c2 algorithmically; c has cc==0
   1950                 newData.addToStartSet(c, c2, errorCode);
   1951             }
   1952         }
   1953         if(newValue!=oldValue) {
   1954             utrie2_set32(newData.trie, c, newValue, &errorCode);
   1955         }
   1956     }
   1957 }
   1958 
   1959 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
   1960     // Logically const: Synchronized instantiation.
   1961     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
   1962     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
   1963     return U_SUCCESS(errorCode);
   1964 }
   1965 
   1966 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
   1967     return (int32_t)utrie2_get32(fCanonIterData->trie, c);
   1968 }
   1969 
   1970 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
   1971     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
   1972 }
   1973 
   1974 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
   1975     return getCanonValue(c)>=0;
   1976 }
   1977 
   1978 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
   1979     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
   1980     if(canonValue==0) {
   1981         return FALSE;
   1982     }
   1983     set.clear();
   1984     int32_t value=canonValue&CANON_VALUE_MASK;
   1985     if((canonValue&CANON_HAS_SET)!=0) {
   1986         set.addAll(getCanonStartSet(value));
   1987     } else if(value!=0) {
   1988         set.add(value);
   1989     }
   1990     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
   1991         uint16_t norm16=getNorm16(c);
   1992         if(norm16==JAMO_L) {
   1993             UChar32 syllable=
   1994                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
   1995             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
   1996         } else {
   1997             addComposites(getCompositionsList(norm16), set);
   1998         }
   1999     }
   2000     return TRUE;
   2001 }
   2002 
   2003 U_NAMESPACE_END
   2004 
   2005 // Normalizer2 data swapping ----------------------------------------------- ***
   2006 
   2007 U_NAMESPACE_USE
   2008 
   2009 U_CAPI int32_t U_EXPORT2
   2010 unorm2_swap(const UDataSwapper *ds,
   2011             const void *inData, int32_t length, void *outData,
   2012             UErrorCode *pErrorCode) {
   2013     const UDataInfo *pInfo;
   2014     int32_t headerSize;
   2015 
   2016     const uint8_t *inBytes;
   2017     uint8_t *outBytes;
   2018 
   2019     const int32_t *inIndexes;
   2020     int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
   2021 
   2022     int32_t i, offset, nextOffset, size;
   2023 
   2024     /* udata_swapDataHeader checks the arguments */
   2025     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
   2026     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   2027         return 0;
   2028     }
   2029 
   2030     /* check data format and format version */
   2031     pInfo=(const UDataInfo *)((const char *)inData+4);
   2032     if(!(
   2033         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
   2034         pInfo->dataFormat[1]==0x72 &&
   2035         pInfo->dataFormat[2]==0x6d &&
   2036         pInfo->dataFormat[3]==0x32 &&
   2037         (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
   2038     )) {
   2039         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
   2040                          pInfo->dataFormat[0], pInfo->dataFormat[1],
   2041                          pInfo->dataFormat[2], pInfo->dataFormat[3],
   2042                          pInfo->formatVersion[0]);
   2043         *pErrorCode=U_UNSUPPORTED_ERROR;
   2044         return 0;
   2045     }
   2046 
   2047     inBytes=(const uint8_t *)inData+headerSize;
   2048     outBytes=(uint8_t *)outData+headerSize;
   2049 
   2050     inIndexes=(const int32_t *)inBytes;
   2051 
   2052     if(length>=0) {
   2053         length-=headerSize;
   2054         if(length<(int32_t)sizeof(indexes)) {
   2055             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
   2056                              length);
   2057             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   2058             return 0;
   2059         }
   2060     }
   2061 
   2062     /* read the first few indexes */
   2063     for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
   2064         indexes[i]=udata_readInt32(ds, inIndexes[i]);
   2065     }
   2066 
   2067     /* get the total length of the data */
   2068     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
   2069 
   2070     if(length>=0) {
   2071         if(length<size) {
   2072             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
   2073                              length);
   2074             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   2075             return 0;
   2076         }
   2077 
   2078         /* copy the data for inaccessible bytes */
   2079         if(inBytes!=outBytes) {
   2080             uprv_memcpy(outBytes, inBytes, size);
   2081         }
   2082 
   2083         offset=0;
   2084 
   2085         /* swap the int32_t indexes[] */
   2086         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
   2087         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
   2088         offset=nextOffset;
   2089 
   2090         /* swap the UTrie2 */
   2091         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
   2092         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
   2093         offset=nextOffset;
   2094 
   2095         /* swap the uint16_t extraData[] */
   2096         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
   2097         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
   2098         offset=nextOffset;
   2099 
   2100         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
   2101         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
   2102         offset=nextOffset;
   2103 
   2104         U_ASSERT(offset==size);
   2105     }
   2106 
   2107     return headerSize+size;
   2108 }
   2109 
   2110 #endif  // !UCONFIG_NO_NORMALIZATION
   2111