Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2009-2014, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  normalizer2impl.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2009nov22
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_NORMALIZATION
     20 
     21 #include "unicode/normalizer2.h"
     22 #include "unicode/udata.h"
     23 #include "unicode/ustring.h"
     24 #include "unicode/utf16.h"
     25 #include "cmemory.h"
     26 #include "mutex.h"
     27 #include "normalizer2impl.h"
     28 #include "putilimp.h"
     29 #include "uassert.h"
     30 #include "uset_imp.h"
     31 #include "utrie2.h"
     32 #include "uvector.h"
     33 
     34 U_NAMESPACE_BEGIN
     35 
     36 // ReorderingBuffer -------------------------------------------------------- ***
     37 
     38 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
     39     int32_t length=str.length();
     40     start=str.getBuffer(destCapacity);
     41     if(start==NULL) {
     42         // getBuffer() already did str.setToBogus()
     43         errorCode=U_MEMORY_ALLOCATION_ERROR;
     44         return FALSE;
     45     }
     46     limit=start+length;
     47     remainingCapacity=str.getCapacity()-length;
     48     reorderStart=start;
     49     if(start==limit) {
     50         lastCC=0;
     51     } else {
     52         setIterator();
     53         lastCC=previousCC();
     54         // Set reorderStart after the last code point with cc<=1 if there is one.
     55         if(lastCC>1) {
     56             while(previousCC()>1) {}
     57         }
     58         reorderStart=codePointLimit;
     59     }
     60     return TRUE;
     61 }
     62 
     63 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
     64     int32_t length=(int32_t)(limit-start);
     65     return
     66         length==(int32_t)(otherLimit-otherStart) &&
     67         0==u_memcmp(start, otherStart, length);
     68 }
     69 
     70 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
     71     if(remainingCapacity<2 && !resize(2, errorCode)) {
     72         return FALSE;
     73     }
     74     if(lastCC<=cc || cc==0) {
     75         limit[0]=U16_LEAD(c);
     76         limit[1]=U16_TRAIL(c);
     77         limit+=2;
     78         lastCC=cc;
     79         if(cc<=1) {
     80             reorderStart=limit;
     81         }
     82     } else {
     83         insert(c, cc);
     84     }
     85     remainingCapacity-=2;
     86     return TRUE;
     87 }
     88 
     89 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
     90                                uint8_t leadCC, uint8_t trailCC,
     91                                UErrorCode &errorCode) {
     92     if(length==0) {
     93         return TRUE;
     94     }
     95     if(remainingCapacity<length && !resize(length, errorCode)) {
     96         return FALSE;
     97     }
     98     remainingCapacity-=length;
     99     if(lastCC<=leadCC || leadCC==0) {
    100         if(trailCC<=1) {
    101             reorderStart=limit+length;
    102         } else if(leadCC<=1) {
    103             reorderStart=limit+1;  // Ok if not a code point boundary.
    104         }
    105         const UChar *sLimit=s+length;
    106         do { *limit++=*s++; } while(s!=sLimit);
    107         lastCC=trailCC;
    108     } else {
    109         int32_t i=0;
    110         UChar32 c;
    111         U16_NEXT(s, i, length, c);
    112         insert(c, leadCC);  // insert first code point
    113         while(i<length) {
    114             U16_NEXT(s, i, length, c);
    115             if(i<length) {
    116                 // s must be in NFD, otherwise we need to use getCC().
    117                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
    118             } else {
    119                 leadCC=trailCC;
    120             }
    121             append(c, leadCC, errorCode);
    122         }
    123     }
    124     return TRUE;
    125 }
    126 
    127 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
    128     int32_t cpLength=U16_LENGTH(c);
    129     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
    130         return FALSE;
    131     }
    132     remainingCapacity-=cpLength;
    133     if(cpLength==1) {
    134         *limit++=(UChar)c;
    135     } else {
    136         limit[0]=U16_LEAD(c);
    137         limit[1]=U16_TRAIL(c);
    138         limit+=2;
    139     }
    140     lastCC=0;
    141     reorderStart=limit;
    142     return TRUE;
    143 }
    144 
    145 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
    146     if(s==sLimit) {
    147         return TRUE;
    148     }
    149     int32_t length=(int32_t)(sLimit-s);
    150     if(remainingCapacity<length && !resize(length, errorCode)) {
    151         return FALSE;
    152     }
    153     u_memcpy(limit, s, length);
    154     limit+=length;
    155     remainingCapacity-=length;
    156     lastCC=0;
    157     reorderStart=limit;
    158     return TRUE;
    159 }
    160 
    161 void ReorderingBuffer::remove() {
    162     reorderStart=limit=start;
    163     remainingCapacity=str.getCapacity();
    164     lastCC=0;
    165 }
    166 
    167 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
    168     if(suffixLength<(limit-start)) {
    169         limit-=suffixLength;
    170         remainingCapacity+=suffixLength;
    171     } else {
    172         limit=start;
    173         remainingCapacity=str.getCapacity();
    174     }
    175     lastCC=0;
    176     reorderStart=limit;
    177 }
    178 
    179 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
    180     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
    181     int32_t length=(int32_t)(limit-start);
    182     str.releaseBuffer(length);
    183     int32_t newCapacity=length+appendLength;
    184     int32_t doubleCapacity=2*str.getCapacity();
    185     if(newCapacity<doubleCapacity) {
    186         newCapacity=doubleCapacity;
    187     }
    188     if(newCapacity<256) {
    189         newCapacity=256;
    190     }
    191     start=str.getBuffer(newCapacity);
    192     if(start==NULL) {
    193         // getBuffer() already did str.setToBogus()
    194         errorCode=U_MEMORY_ALLOCATION_ERROR;
    195         return FALSE;
    196     }
    197     reorderStart=start+reorderStartIndex;
    198     limit=start+length;
    199     remainingCapacity=str.getCapacity()-length;
    200     return TRUE;
    201 }
    202 
    203 void ReorderingBuffer::skipPrevious() {
    204     codePointLimit=codePointStart;
    205     UChar c=*--codePointStart;
    206     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
    207         --codePointStart;
    208     }
    209 }
    210 
    211 uint8_t ReorderingBuffer::previousCC() {
    212     codePointLimit=codePointStart;
    213     if(reorderStart>=codePointStart) {
    214         return 0;
    215     }
    216     UChar32 c=*--codePointStart;
    217     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
    218         return 0;
    219     }
    220 
    221     UChar c2;
    222     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
    223         --codePointStart;
    224         c=U16_GET_SUPPLEMENTARY(c2, c);
    225     }
    226     return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
    227 }
    228 
    229 // Inserts c somewhere before the last character.
    230 // Requires 0<cc<lastCC which implies reorderStart<limit.
    231 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
    232     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
    233     // insert c at codePointLimit, after the character with prevCC<=cc
    234     UChar *q=limit;
    235     UChar *r=limit+=U16_LENGTH(c);
    236     do {
    237         *--r=*--q;
    238     } while(codePointLimit!=q);
    239     writeCodePoint(q, c);
    240     if(cc<=1) {
    241         reorderStart=r;
    242     }
    243 }
    244 
    245 // Normalizer2Impl --------------------------------------------------------- ***
    246 
    247 struct CanonIterData : public UMemory {
    248     CanonIterData(UErrorCode &errorCode);
    249     ~CanonIterData();
    250     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
    251     UTrie2 *trie;
    252     UVector canonStartSets;  // contains UnicodeSet *
    253 };
    254 
    255 Normalizer2Impl::~Normalizer2Impl() {
    256     delete fCanonIterData;
    257 }
    258 
    259 void
    260 Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
    261                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
    262     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
    263     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
    264 
    265     minYesNo=inIndexes[IX_MIN_YES_NO];
    266     minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
    267     minNoNo=inIndexes[IX_MIN_NO_NO];
    268     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
    269     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
    270 
    271     normTrie=inTrie;
    272 
    273     maybeYesCompositions=inExtraData;
    274     extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
    275 
    276     smallFCD=inSmallFCD;
    277 
    278     // Build tccc180[].
    279     // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
    280     uint8_t bits=0;
    281     for(UChar c=0; c<0x180; bits>>=1) {
    282         if((c&0xff)==0) {
    283             bits=smallFCD[c>>8];  // one byte per 0x100 code points
    284         }
    285         if(bits&1) {
    286             for(int i=0; i<0x20; ++i, ++c) {
    287                 tccc180[c]=(uint8_t)getFCD16FromNormData(c);
    288             }
    289         } else {
    290             uprv_memset(tccc180+c, 0, 0x20);
    291             c+=0x20;
    292         }
    293     }
    294 }
    295 
    296 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
    297     UChar32 c;
    298     if(cpStart==(cpLimit-1)) {
    299         c=*cpStart;
    300     } else {
    301         c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
    302     }
    303     uint16_t prevNorm16=getNorm16(c);
    304     if(prevNorm16<=minYesNo) {
    305         return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
    306     } else {
    307         return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
    308     }
    309 }
    310 
    311 namespace {
    312 
    313 class LcccContext {
    314 public:
    315     LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
    316 
    317     void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
    318         if(impl.isAlgorithmicNoNo(norm16)) {
    319             // Range of code points with same-norm16-value algorithmic decompositions.
    320             // They might have different non-zero FCD16 values.
    321             do {
    322                 uint16_t fcd16=impl.getFCD16(start);
    323                 if(fcd16>0xff) { set.add(start); }
    324             } while(++start<=end);
    325         } else {
    326             uint16_t fcd16=impl.getFCD16(start);
    327             if(fcd16>0xff) { set.add(start, end); }
    328         }
    329     }
    330 
    331 private:
    332     const Normalizer2Impl &impl;
    333     UnicodeSet &set;
    334 };
    335 
    336 struct PropertyStartsContext {
    337     PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
    338             : impl(ni), sa(adder) {}
    339 
    340     const Normalizer2Impl &impl;
    341     const USetAdder *sa;
    342 };
    343 
    344 }  // namespace
    345 
    346 U_CDECL_BEGIN
    347 
    348 static UBool U_CALLCONV
    349 enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    350     ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
    351     return TRUE;
    352 }
    353 
    354 static UBool U_CALLCONV
    355 enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    356     /* add the start code point to the USet */
    357     const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
    358     const USetAdder *sa=ctx->sa;
    359     sa->add(sa->set, start);
    360     if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) {
    361         // Range of code points with same-norm16-value algorithmic decompositions.
    362         // They might have different non-zero FCD16 values.
    363         uint16_t prevFCD16=ctx->impl.getFCD16(start);
    364         while(++start<=end) {
    365             uint16_t fcd16=ctx->impl.getFCD16(start);
    366             if(fcd16!=prevFCD16) {
    367                 sa->add(sa->set, start);
    368                 prevFCD16=fcd16;
    369             }
    370         }
    371     }
    372     return TRUE;
    373 }
    374 
    375 static UBool U_CALLCONV
    376 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    377     /* add the start code point to the USet */
    378     const USetAdder *sa=(const USetAdder *)context;
    379     sa->add(sa->set, start);
    380     return TRUE;
    381 }
    382 
    383 static uint32_t U_CALLCONV
    384 segmentStarterMapper(const void * /*context*/, uint32_t value) {
    385     return value&CANON_NOT_SEGMENT_STARTER;
    386 }
    387 
    388 U_CDECL_END
    389 
    390 void
    391 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
    392     /* add the start code point of each same-value range of each trie */
    393     LcccContext context(*this, set);
    394     utrie2_enum(normTrie, NULL, enumLcccRange, &context);
    395 }
    396 
    397 void
    398 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
    399     /* add the start code point of each same-value range of each trie */
    400     PropertyStartsContext context(*this, sa);
    401     utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
    402 
    403     /* add Hangul LV syllables and LV+1 because of skippables */
    404     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
    405         sa->add(sa->set, c);
    406         sa->add(sa->set, c+1);
    407     }
    408     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
    409 }
    410 
    411 void
    412 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
    413     /* add the start code point of each same-value range of the canonical iterator data trie */
    414     if(ensureCanonIterData(errorCode)) {
    415         // currently only used for the SEGMENT_STARTER property
    416         utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
    417     }
    418 }
    419 
    420 const UChar *
    421 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
    422                                                 UChar32 minNeedDataCP,
    423                                                 ReorderingBuffer *buffer,
    424                                                 UErrorCode &errorCode) const {
    425     // Make some effort to support NUL-terminated strings reasonably.
    426     // Take the part of the fast quick check loop that does not look up
    427     // data and check the first part of the string.
    428     // After this prefix, determine the string length to simplify the rest
    429     // of the code.
    430     const UChar *prevSrc=src;
    431     UChar c;
    432     while((c=*src++)<minNeedDataCP && c!=0) {}
    433     // Back out the last character for full processing.
    434     // Copy this prefix.
    435     if(--src!=prevSrc) {
    436         if(buffer!=NULL) {
    437             buffer->appendZeroCC(prevSrc, src, errorCode);
    438         }
    439     }
    440     return src;
    441 }
    442 
    443 UnicodeString &
    444 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
    445                            UErrorCode &errorCode) const {
    446     if(U_FAILURE(errorCode)) {
    447         dest.setToBogus();
    448         return dest;
    449     }
    450     const UChar *sArray=src.getBuffer();
    451     if(&dest==&src || sArray==NULL) {
    452         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    453         dest.setToBogus();
    454         return dest;
    455     }
    456     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
    457     return dest;
    458 }
    459 
    460 void
    461 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
    462                            UnicodeString &dest,
    463                            int32_t destLengthEstimate,
    464                            UErrorCode &errorCode) const {
    465     if(destLengthEstimate<0 && limit!=NULL) {
    466         destLengthEstimate=(int32_t)(limit-src);
    467     }
    468     dest.remove();
    469     ReorderingBuffer buffer(*this, dest);
    470     if(buffer.init(destLengthEstimate, errorCode)) {
    471         decompose(src, limit, &buffer, errorCode);
    472     }
    473 }
    474 
    475 // Dual functionality:
    476 // buffer!=NULL: normalize
    477 // buffer==NULL: isNormalized/spanQuickCheckYes
    478 const UChar *
    479 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
    480                            ReorderingBuffer *buffer,
    481                            UErrorCode &errorCode) const {
    482     UChar32 minNoCP=minDecompNoCP;
    483     if(limit==NULL) {
    484         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
    485         if(U_FAILURE(errorCode)) {
    486             return src;
    487         }
    488         limit=u_strchr(src, 0);
    489     }
    490 
    491     const UChar *prevSrc;
    492     UChar32 c=0;
    493     uint16_t norm16=0;
    494 
    495     // only for quick check
    496     const UChar *prevBoundary=src;
    497     uint8_t prevCC=0;
    498 
    499     for(;;) {
    500         // count code units below the minimum or with irrelevant data for the quick check
    501         for(prevSrc=src; src!=limit;) {
    502             if( (c=*src)<minNoCP ||
    503                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
    504             ) {
    505                 ++src;
    506             } else if(!U16_IS_SURROGATE(c)) {
    507                 break;
    508             } else {
    509                 UChar c2;
    510                 if(U16_IS_SURROGATE_LEAD(c)) {
    511                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
    512                         c=U16_GET_SUPPLEMENTARY(c, c2);
    513                     }
    514                 } else /* trail surrogate */ {
    515                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
    516                         --src;
    517                         c=U16_GET_SUPPLEMENTARY(c2, c);
    518                     }
    519                 }
    520                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
    521                     src+=U16_LENGTH(c);
    522                 } else {
    523                     break;
    524                 }
    525             }
    526         }
    527         // copy these code units all at once
    528         if(src!=prevSrc) {
    529             if(buffer!=NULL) {
    530                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
    531                     break;
    532                 }
    533             } else {
    534                 prevCC=0;
    535                 prevBoundary=src;
    536             }
    537         }
    538         if(src==limit) {
    539             break;
    540         }
    541 
    542         // Check one above-minimum, relevant code point.
    543         src+=U16_LENGTH(c);
    544         if(buffer!=NULL) {
    545             if(!decompose(c, norm16, *buffer, errorCode)) {
    546                 break;
    547             }
    548         } else {
    549             if(isDecompYes(norm16)) {
    550                 uint8_t cc=getCCFromYesOrMaybe(norm16);
    551                 if(prevCC<=cc || cc==0) {
    552                     prevCC=cc;
    553                     if(cc<=1) {
    554                         prevBoundary=src;
    555                     }
    556                     continue;
    557                 }
    558             }
    559             return prevBoundary;  // "no" or cc out of order
    560         }
    561     }
    562     return src;
    563 }
    564 
    565 // Decompose a short piece of text which is likely to contain characters that
    566 // fail the quick check loop and/or where the quick check loop's overhead
    567 // is unlikely to be amortized.
    568 // Called by the compose() and makeFCD() implementations.
    569 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
    570                                       ReorderingBuffer &buffer,
    571                                       UErrorCode &errorCode) const {
    572     while(src<limit) {
    573         UChar32 c;
    574         uint16_t norm16;
    575         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
    576         if(!decompose(c, norm16, buffer, errorCode)) {
    577             return FALSE;
    578         }
    579     }
    580     return TRUE;
    581 }
    582 
    583 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
    584                                  ReorderingBuffer &buffer,
    585                                  UErrorCode &errorCode) const {
    586     // Only loops for 1:1 algorithmic mappings.
    587     for(;;) {
    588         // get the decomposition and the lead and trail cc's
    589         if(isDecompYes(norm16)) {
    590             // c does not decompose
    591             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
    592         } else if(isHangul(norm16)) {
    593             // Hangul syllable: decompose algorithmically
    594             UChar jamos[3];
    595             return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
    596         } else if(isDecompNoAlgorithmic(norm16)) {
    597             c=mapAlgorithmic(c, norm16);
    598             norm16=getNorm16(c);
    599         } else {
    600             // c decomposes, get everything from the variable-length extra data
    601             const uint16_t *mapping=getMapping(norm16);
    602             uint16_t firstUnit=*mapping;
    603             int32_t length=firstUnit&MAPPING_LENGTH_MASK;
    604             uint8_t leadCC, trailCC;
    605             trailCC=(uint8_t)(firstUnit>>8);
    606             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
    607                 leadCC=(uint8_t)(*(mapping-1)>>8);
    608             } else {
    609                 leadCC=0;
    610             }
    611             return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
    612         }
    613     }
    614 }
    615 
    616 const UChar *
    617 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
    618     const UChar *decomp=NULL;
    619     uint16_t norm16;
    620     for(;;) {
    621         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
    622             // c does not decompose
    623             return decomp;
    624         } else if(isHangul(norm16)) {
    625             // Hangul syllable: decompose algorithmically
    626             length=Hangul::decompose(c, buffer);
    627             return buffer;
    628         } else if(isDecompNoAlgorithmic(norm16)) {
    629             c=mapAlgorithmic(c, norm16);
    630             decomp=buffer;
    631             length=0;
    632             U16_APPEND_UNSAFE(buffer, length, c);
    633         } else {
    634             // c decomposes, get everything from the variable-length extra data
    635             const uint16_t *mapping=getMapping(norm16);
    636             length=*mapping&MAPPING_LENGTH_MASK;
    637             return (const UChar *)mapping+1;
    638         }
    639     }
    640 }
    641 
    642 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
    643 // so that a raw mapping fits that consists of one unit ("rm0")
    644 // plus all but the first two code units of the normal mapping.
    645 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
    646 const UChar *
    647 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
    648     // We do not loop in this method because an algorithmic mapping itself
    649     // becomes a final result rather than having to be decomposed recursively.
    650     uint16_t norm16;
    651     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
    652         // c does not decompose
    653         return NULL;
    654     } else if(isHangul(norm16)) {
    655         // Hangul syllable: decompose algorithmically
    656         Hangul::getRawDecomposition(c, buffer);
    657         length=2;
    658         return buffer;
    659     } else if(isDecompNoAlgorithmic(norm16)) {
    660         c=mapAlgorithmic(c, norm16);
    661         length=0;
    662         U16_APPEND_UNSAFE(buffer, length, c);
    663         return buffer;
    664     } else {
    665         // c decomposes, get everything from the variable-length extra data
    666         const uint16_t *mapping=getMapping(norm16);
    667         uint16_t firstUnit=*mapping;
    668         int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
    669         if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
    670             // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
    671             // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
    672             const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
    673             uint16_t rm0=*rawMapping;
    674             if(rm0<=MAPPING_LENGTH_MASK) {
    675                 length=rm0;
    676                 return (const UChar *)rawMapping-rm0;
    677             } else {
    678                 // Copy the normal mapping and replace its first two code units with rm0.
    679                 buffer[0]=(UChar)rm0;
    680                 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
    681                 length=mLength-1;
    682                 return buffer;
    683             }
    684         } else {
    685             length=mLength;
    686             return (const UChar *)mapping+1;
    687         }
    688     }
    689 }
    690 
    691 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
    692                                          UBool doDecompose,
    693                                          UnicodeString &safeMiddle,
    694                                          ReorderingBuffer &buffer,
    695                                          UErrorCode &errorCode) const {
    696     buffer.copyReorderableSuffixTo(safeMiddle);
    697     if(doDecompose) {
    698         decompose(src, limit, &buffer, errorCode);
    699         return;
    700     }
    701     // Just merge the strings at the boundary.
    702     ForwardUTrie2StringIterator iter(normTrie, src, limit);
    703     uint8_t firstCC, prevCC, cc;
    704     firstCC=prevCC=cc=getCC(iter.next16());
    705     while(cc!=0) {
    706         prevCC=cc;
    707         cc=getCC(iter.next16());
    708     };
    709     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
    710         limit=u_strchr(iter.codePointStart, 0);
    711     }
    712 
    713     if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
    714         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
    715     }
    716 }
    717 
    718 // Note: hasDecompBoundary() could be implemented as aliases to
    719 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
    720 // at the cost of building the FCD trie for a decomposition normalizer.
    721 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
    722     for(;;) {
    723         if(c<minDecompNoCP) {
    724             return TRUE;
    725         }
    726         uint16_t norm16=getNorm16(c);
    727         if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
    728             return TRUE;
    729         } else if(norm16>MIN_NORMAL_MAYBE_YES) {
    730             return FALSE;  // ccc!=0
    731         } else if(isDecompNoAlgorithmic(norm16)) {
    732             c=mapAlgorithmic(c, norm16);
    733         } else {
    734             // c decomposes, get everything from the variable-length extra data
    735             const uint16_t *mapping=getMapping(norm16);
    736             uint16_t firstUnit=*mapping;
    737             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
    738                 return FALSE;
    739             }
    740             if(!before) {
    741                 // decomp after-boundary: same as hasFCDBoundaryAfter(),
    742                 // fcd16<=1 || trailCC==0
    743                 if(firstUnit>0x1ff) {
    744                     return FALSE;  // trailCC>1
    745                 }
    746                 if(firstUnit<=0xff) {
    747                     return TRUE;  // trailCC==0
    748                 }
    749                 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
    750             }
    751             // TRUE if leadCC==0 (hasFCDBoundaryBefore())
    752             return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
    753         }
    754     }
    755 }
    756 
    757 /*
    758  * Finds the recomposition result for
    759  * a forward-combining "lead" character,
    760  * specified with a pointer to its compositions list,
    761  * and a backward-combining "trail" character.
    762  *
    763  * If the lead and trail characters combine, then this function returns
    764  * the following "compositeAndFwd" value:
    765  * Bits 21..1  composite character
    766  * Bit      0  set if the composite is a forward-combining starter
    767  * otherwise it returns -1.
    768  *
    769  * The compositions list has (trail, compositeAndFwd) pair entries,
    770  * encoded as either pairs or triples of 16-bit units.
    771  * The last entry has the high bit of its first unit set.
    772  *
    773  * The list is sorted by ascending trail characters (there are no duplicates).
    774  * A linear search is used.
    775  *
    776  * See normalizer2impl.h for a more detailed description
    777  * of the compositions list format.
    778  */
    779 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
    780     uint16_t key1, firstUnit;
    781     if(trail<COMP_1_TRAIL_LIMIT) {
    782         // trail character is 0..33FF
    783         // result entry may have 2 or 3 units
    784         key1=(uint16_t)(trail<<1);
    785         while(key1>(firstUnit=*list)) {
    786             list+=2+(firstUnit&COMP_1_TRIPLE);
    787         }
    788         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
    789             if(firstUnit&COMP_1_TRIPLE) {
    790                 return ((int32_t)list[1]<<16)|list[2];
    791             } else {
    792                 return list[1];
    793             }
    794         }
    795     } else {
    796         // trail character is 3400..10FFFF
    797         // result entry has 3 units
    798         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
    799                         (((trail>>COMP_1_TRAIL_SHIFT))&
    800                           ~COMP_1_TRIPLE));
    801         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
    802         uint16_t secondUnit;
    803         for(;;) {
    804             if(key1>(firstUnit=*list)) {
    805                 list+=2+(firstUnit&COMP_1_TRIPLE);
    806             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
    807                 if(key2>(secondUnit=list[1])) {
    808                     if(firstUnit&COMP_1_LAST_TUPLE) {
    809                         break;
    810                     } else {
    811                         list+=3;
    812                     }
    813                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
    814                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
    815                 } else {
    816                     break;
    817                 }
    818             } else {
    819                 break;
    820             }
    821         }
    822     }
    823     return -1;
    824 }
    825 
    826 /**
    827   * @param list some character's compositions list
    828   * @param set recursively receives the composites from these compositions
    829   */
    830 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
    831     uint16_t firstUnit;
    832     int32_t compositeAndFwd;
    833     do {
    834         firstUnit=*list;
    835         if((firstUnit&COMP_1_TRIPLE)==0) {
    836             compositeAndFwd=list[1];
    837             list+=2;
    838         } else {
    839             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
    840             list+=3;
    841         }
    842         UChar32 composite=compositeAndFwd>>1;
    843         if((compositeAndFwd&1)!=0) {
    844             addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
    845         }
    846         set.add(composite);
    847     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
    848 }
    849 
    850 /*
    851  * Recomposes the buffer text starting at recomposeStartIndex
    852  * (which is in NFD - decomposed and canonically ordered),
    853  * and truncates the buffer contents.
    854  *
    855  * Note that recomposition never lengthens the text:
    856  * Any character consists of either one or two code units;
    857  * a composition may contain at most one more code unit than the original starter,
    858  * while the combining mark that is removed has at least one code unit.
    859  */
    860 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
    861                                 UBool onlyContiguous) const {
    862     UChar *p=buffer.getStart()+recomposeStartIndex;
    863     UChar *limit=buffer.getLimit();
    864     if(p==limit) {
    865         return;
    866     }
    867 
    868     UChar *starter, *pRemove, *q, *r;
    869     const uint16_t *compositionsList;
    870     UChar32 c, compositeAndFwd;
    871     uint16_t norm16;
    872     uint8_t cc, prevCC;
    873     UBool starterIsSupplementary;
    874 
    875     // Some of the following variables are not used until we have a forward-combining starter
    876     // and are only initialized now to avoid compiler warnings.
    877     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
    878     starter=NULL;
    879     starterIsSupplementary=FALSE;
    880     prevCC=0;
    881 
    882     for(;;) {
    883         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
    884         cc=getCCFromYesOrMaybe(norm16);
    885         if( // this character combines backward and
    886             isMaybe(norm16) &&
    887             // we have seen a starter that combines forward and
    888             compositionsList!=NULL &&
    889             // the backward-combining character is not blocked
    890             (prevCC<cc || prevCC==0)
    891         ) {
    892             if(isJamoVT(norm16)) {
    893                 // c is a Jamo V/T, see if we can compose it with the previous character.
    894                 if(c<Hangul::JAMO_T_BASE) {
    895                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
    896                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
    897                     if(prev<Hangul::JAMO_L_COUNT) {
    898                         pRemove=p-1;
    899                         UChar syllable=(UChar)
    900                             (Hangul::HANGUL_BASE+
    901                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
    902                              Hangul::JAMO_T_COUNT);
    903                         UChar t;
    904                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
    905                             ++p;
    906                             syllable+=t;  // The next character was a Jamo T.
    907                         }
    908                         *starter=syllable;
    909                         // remove the Jamo V/T
    910                         q=pRemove;
    911                         r=p;
    912                         while(r<limit) {
    913                             *q++=*r++;
    914                         }
    915                         limit=q;
    916                         p=pRemove;
    917                     }
    918                 }
    919                 /*
    920                  * No "else" for Jamo T:
    921                  * Since the input is in NFD, there are no Hangul LV syllables that
    922                  * a Jamo T could combine with.
    923                  * All Jamo Ts are combined above when handling Jamo Vs.
    924                  */
    925                 if(p==limit) {
    926                     break;
    927                 }
    928                 compositionsList=NULL;
    929                 continue;
    930             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
    931                 // The starter and the combining mark (c) do combine.
    932                 UChar32 composite=compositeAndFwd>>1;
    933 
    934                 // Replace the starter with the composite, remove the combining mark.
    935                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
    936                 if(starterIsSupplementary) {
    937                     if(U_IS_SUPPLEMENTARY(composite)) {
    938                         // both are supplementary
    939                         starter[0]=U16_LEAD(composite);
    940                         starter[1]=U16_TRAIL(composite);
    941                     } else {
    942                         *starter=(UChar)composite;
    943                         // The composite is shorter than the starter,
    944                         // move the intermediate characters forward one.
    945                         starterIsSupplementary=FALSE;
    946                         q=starter+1;
    947                         r=q+1;
    948                         while(r<pRemove) {
    949                             *q++=*r++;
    950                         }
    951                         --pRemove;
    952                     }
    953                 } else if(U_IS_SUPPLEMENTARY(composite)) {
    954                     // The composite is longer than the starter,
    955                     // move the intermediate characters back one.
    956                     starterIsSupplementary=TRUE;
    957                     ++starter;  // temporarily increment for the loop boundary
    958                     q=pRemove;
    959                     r=++pRemove;
    960                     while(starter<q) {
    961                         *--r=*--q;
    962                     }
    963                     *starter=U16_TRAIL(composite);
    964                     *--starter=U16_LEAD(composite);  // undo the temporary increment
    965                 } else {
    966                     // both are on the BMP
    967                     *starter=(UChar)composite;
    968                 }
    969 
    970                 /* remove the combining mark by moving the following text over it */
    971                 if(pRemove<p) {
    972                     q=pRemove;
    973                     r=p;
    974                     while(r<limit) {
    975                         *q++=*r++;
    976                     }
    977                     limit=q;
    978                     p=pRemove;
    979                 }
    980                 // Keep prevCC because we removed the combining mark.
    981 
    982                 if(p==limit) {
    983                     break;
    984                 }
    985                 // Is the composite a starter that combines forward?
    986                 if(compositeAndFwd&1) {
    987                     compositionsList=
    988                         getCompositionsListForComposite(getNorm16(composite));
    989                 } else {
    990                     compositionsList=NULL;
    991                 }
    992 
    993                 // We combined; continue with looking for compositions.
    994                 continue;
    995             }
    996         }
    997 
    998         // no combination this time
    999         prevCC=cc;
   1000         if(p==limit) {
   1001             break;
   1002         }
   1003 
   1004         // If c did not combine, then check if it is a starter.
   1005         if(cc==0) {
   1006             // Found a new starter.
   1007             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
   1008                 // It may combine with something, prepare for it.
   1009                 if(U_IS_BMP(c)) {
   1010                     starterIsSupplementary=FALSE;
   1011                     starter=p-1;
   1012                 } else {
   1013                     starterIsSupplementary=TRUE;
   1014                     starter=p-2;
   1015                 }
   1016             }
   1017         } else if(onlyContiguous) {
   1018             // FCC: no discontiguous compositions; any intervening character blocks.
   1019             compositionsList=NULL;
   1020         }
   1021     }
   1022     buffer.setReorderingLimit(limit);
   1023 }
   1024 
   1025 UChar32
   1026 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
   1027     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
   1028     const uint16_t *list;
   1029     if(isInert(norm16)) {
   1030         return U_SENTINEL;
   1031     } else if(norm16<minYesNoMappingsOnly) {
   1032         if(isJamoL(norm16)) {
   1033             b-=Hangul::JAMO_V_BASE;
   1034             if(0<=b && b<Hangul::JAMO_V_COUNT) {
   1035                 return
   1036                     (Hangul::HANGUL_BASE+
   1037                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
   1038                      Hangul::JAMO_T_COUNT);
   1039             } else {
   1040                 return U_SENTINEL;
   1041             }
   1042         } else if(isHangul(norm16)) {
   1043             b-=Hangul::JAMO_T_BASE;
   1044             if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
   1045                 return a+b;
   1046             } else {
   1047                 return U_SENTINEL;
   1048             }
   1049         } else {
   1050             // 'a' has a compositions list in extraData
   1051             list=extraData+norm16;
   1052             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
   1053                 list+=  // mapping pointer
   1054                     1+  // +1 to skip the first unit with the mapping lenth
   1055                     (*list&MAPPING_LENGTH_MASK);  // + mapping length
   1056             }
   1057         }
   1058     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
   1059         return U_SENTINEL;
   1060     } else {
   1061         list=maybeYesCompositions+norm16-minMaybeYes;
   1062     }
   1063     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
   1064         return U_SENTINEL;
   1065     }
   1066 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
   1067     return combine(list, b)>>1;
   1068 #else
   1069     int32_t compositeAndFwd=combine(list, b);
   1070     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
   1071 #endif
   1072 }
   1073 
   1074 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
   1075 // doCompose: normalize
   1076 // !doCompose: isNormalized (buffer must be empty and initialized)
   1077 UBool
   1078 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
   1079                          UBool onlyContiguous,
   1080                          UBool doCompose,
   1081                          ReorderingBuffer &buffer,
   1082                          UErrorCode &errorCode) const {
   1083     /*
   1084      * prevBoundary points to the last character before the current one
   1085      * that has a composition boundary before it with ccc==0 and quick check "yes".
   1086      * Keeping track of prevBoundary saves us looking for a composition boundary
   1087      * when we find a "no" or "maybe".
   1088      *
   1089      * When we back out from prevSrc back to prevBoundary,
   1090      * then we also remove those same characters (which had been simply copied
   1091      * or canonically-order-inserted) from the ReorderingBuffer.
   1092      * Therefore, at all times, the [prevBoundary..prevSrc[ source units
   1093      * must correspond 1:1 to destination units at the end of the destination buffer.
   1094      */
   1095     const UChar *prevBoundary=src;
   1096     UChar32 minNoMaybeCP=minCompNoMaybeCP;
   1097     if(limit==NULL) {
   1098         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
   1099                                            doCompose ? &buffer : NULL,
   1100                                            errorCode);
   1101         if(U_FAILURE(errorCode)) {
   1102             return FALSE;
   1103         }
   1104         if(prevBoundary<src) {
   1105             // Set prevBoundary to the last character in the prefix.
   1106             prevBoundary=src-1;
   1107         }
   1108         limit=u_strchr(src, 0);
   1109     }
   1110 
   1111     const UChar *prevSrc;
   1112     UChar32 c=0;
   1113     uint16_t norm16=0;
   1114 
   1115     // only for isNormalized
   1116     uint8_t prevCC=0;
   1117 
   1118     for(;;) {
   1119         // count code units below the minimum or with irrelevant data for the quick check
   1120         for(prevSrc=src; src!=limit;) {
   1121             if( (c=*src)<minNoMaybeCP ||
   1122                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
   1123             ) {
   1124                 ++src;
   1125             } else if(!U16_IS_SURROGATE(c)) {
   1126                 break;
   1127             } else {
   1128                 UChar c2;
   1129                 if(U16_IS_SURROGATE_LEAD(c)) {
   1130                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
   1131                         c=U16_GET_SUPPLEMENTARY(c, c2);
   1132                     }
   1133                 } else /* trail surrogate */ {
   1134                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
   1135                         --src;
   1136                         c=U16_GET_SUPPLEMENTARY(c2, c);
   1137                     }
   1138                 }
   1139                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
   1140                     src+=U16_LENGTH(c);
   1141                 } else {
   1142                     break;
   1143                 }
   1144             }
   1145         }
   1146         // copy these code units all at once
   1147         if(src!=prevSrc) {
   1148             if(doCompose) {
   1149                 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
   1150                     break;
   1151                 }
   1152             } else {
   1153                 prevCC=0;
   1154             }
   1155             if(src==limit) {
   1156                 break;
   1157             }
   1158             // Set prevBoundary to the last character in the quick check loop.
   1159             prevBoundary=src-1;
   1160             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
   1161                 U16_IS_LEAD(*(prevBoundary-1))
   1162             ) {
   1163                 --prevBoundary;
   1164             }
   1165             // The start of the current character (c).
   1166             prevSrc=src;
   1167         } else if(src==limit) {
   1168             break;
   1169         }
   1170 
   1171         src+=U16_LENGTH(c);
   1172         /*
   1173          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
   1174          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
   1175          * or has ccc!=0.
   1176          * Check for Jamo V/T, then for regular characters.
   1177          * c is not a Hangul syllable or Jamo L because those have "yes" properties.
   1178          */
   1179         if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
   1180             UChar prev=*(prevSrc-1);
   1181             UBool needToDecompose=FALSE;
   1182             if(c<Hangul::JAMO_T_BASE) {
   1183                 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
   1184                 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
   1185                 if(prev<Hangul::JAMO_L_COUNT) {
   1186                     if(!doCompose) {
   1187                         return FALSE;
   1188                     }
   1189                     UChar syllable=(UChar)
   1190                         (Hangul::HANGUL_BASE+
   1191                          (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
   1192                          Hangul::JAMO_T_COUNT);
   1193                     UChar t;
   1194                     if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
   1195                         ++src;
   1196                         syllable+=t;  // The next character was a Jamo T.
   1197                         prevBoundary=src;
   1198                         buffer.setLastChar(syllable);
   1199                         continue;
   1200                     }
   1201                     // If we see L+V+x where x!=T then we drop to the slow path,
   1202                     // decompose and recompose.
   1203                     // This is to deal with NFKC finding normal L and V but a
   1204                     // compatibility variant of a T. We need to either fully compose that
   1205                     // combination here (which would complicate the code and may not work
   1206                     // with strange custom data) or use the slow path -- or else our replacing
   1207                     // two input characters (L+V) with one output character (LV syllable)
   1208                     // would violate the invariant that [prevBoundary..prevSrc[ has the same
   1209                     // length as what we appended to the buffer since prevBoundary.
   1210                     needToDecompose=TRUE;
   1211                 }
   1212             } else if(Hangul::isHangulWithoutJamoT(prev)) {
   1213                 // c is a Jamo Trailing consonant,
   1214                 // compose with previous Hangul LV that does not contain a Jamo T.
   1215                 if(!doCompose) {
   1216                     return FALSE;
   1217                 }
   1218                 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
   1219                 prevBoundary=src;
   1220                 continue;
   1221             }
   1222             if(!needToDecompose) {
   1223                 // The Jamo V/T did not compose into a Hangul syllable.
   1224                 if(doCompose) {
   1225                     if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
   1226                         break;
   1227                     }
   1228                 } else {
   1229                     prevCC=0;
   1230                 }
   1231                 continue;
   1232             }
   1233         }
   1234         /*
   1235          * Source buffer pointers:
   1236          *
   1237          *  all done      quick check   current char  not yet
   1238          *                "yes" but     (c)           processed
   1239          *                may combine
   1240          *                forward
   1241          * [-------------[-------------[-------------[-------------[
   1242          * |             |             |             |             |
   1243          * orig. src     prevBoundary  prevSrc       src           limit
   1244          *
   1245          *
   1246          * Destination buffer pointers inside the ReorderingBuffer:
   1247          *
   1248          *  all done      might take    not filled yet
   1249          *                characters for
   1250          *                reordering
   1251          * [-------------[-------------[-------------[
   1252          * |             |             |             |
   1253          * start         reorderStart  limit         |
   1254          *                             +remainingCap.+
   1255          */
   1256         if(norm16>=MIN_YES_YES_WITH_CC) {
   1257             uint8_t cc=(uint8_t)norm16;  // cc!=0
   1258             if( onlyContiguous &&  // FCC
   1259                 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
   1260                 prevBoundary<prevSrc &&
   1261                 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
   1262                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
   1263                 // passed the quick check "yes && ccc==0" test.
   1264                 // Check whether the last character was a "yesYes" or a "yesNo".
   1265                 // If a "yesNo", then we get its trailing ccc from its
   1266                 // mapping and check for canonical order.
   1267                 // All other cases are ok.
   1268                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
   1269             ) {
   1270                 // Fails FCD test, need to decompose and contiguously recompose.
   1271                 if(!doCompose) {
   1272                     return FALSE;
   1273                 }
   1274             } else if(doCompose) {
   1275                 if(!buffer.append(c, cc, errorCode)) {
   1276                     break;
   1277                 }
   1278                 continue;
   1279             } else if(prevCC<=cc) {
   1280                 prevCC=cc;
   1281                 continue;
   1282             } else {
   1283                 return FALSE;
   1284             }
   1285         } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
   1286             return FALSE;
   1287         }
   1288 
   1289         /*
   1290          * Find appropriate boundaries around this character,
   1291          * decompose the source text from between the boundaries,
   1292          * and recompose it.
   1293          *
   1294          * We may need to remove the last few characters from the ReorderingBuffer
   1295          * to account for source text that was copied or appended
   1296          * but needs to take part in the recomposition.
   1297          */
   1298 
   1299         /*
   1300          * Find the last composition boundary in [prevBoundary..src[.
   1301          * It is either the decomposition of the current character (at prevSrc),
   1302          * or prevBoundary.
   1303          */
   1304         if(hasCompBoundaryBefore(c, norm16)) {
   1305             prevBoundary=prevSrc;
   1306         } else if(doCompose) {
   1307             buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
   1308         }
   1309 
   1310         // Find the next composition boundary in [src..limit[ -
   1311         // modifies src to point to the next starter.
   1312         src=(UChar *)findNextCompBoundary(src, limit);
   1313 
   1314         // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
   1315         int32_t recomposeStartIndex=buffer.length();
   1316         if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
   1317             break;
   1318         }
   1319         recompose(buffer, recomposeStartIndex, onlyContiguous);
   1320         if(!doCompose) {
   1321             if(!buffer.equals(prevBoundary, src)) {
   1322                 return FALSE;
   1323             }
   1324             buffer.remove();
   1325             prevCC=0;
   1326         }
   1327 
   1328         // Move to the next starter. We never need to look back before this point again.
   1329         prevBoundary=src;
   1330     }
   1331     return TRUE;
   1332 }
   1333 
   1334 // Very similar to compose(): Make the same changes in both places if relevant.
   1335 // pQCResult==NULL: spanQuickCheckYes
   1336 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
   1337 const UChar *
   1338 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
   1339                                    UBool onlyContiguous,
   1340                                    UNormalizationCheckResult *pQCResult) const {
   1341     /*
   1342      * prevBoundary points to the last character before the current one
   1343      * that has a composition boundary before it with ccc==0 and quick check "yes".
   1344      */
   1345     const UChar *prevBoundary=src;
   1346     UChar32 minNoMaybeCP=minCompNoMaybeCP;
   1347     if(limit==NULL) {
   1348         UErrorCode errorCode=U_ZERO_ERROR;
   1349         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
   1350         if(prevBoundary<src) {
   1351             // Set prevBoundary to the last character in the prefix.
   1352             prevBoundary=src-1;
   1353         }
   1354         limit=u_strchr(src, 0);
   1355     }
   1356 
   1357     const UChar *prevSrc;
   1358     UChar32 c=0;
   1359     uint16_t norm16=0;
   1360     uint8_t prevCC=0;
   1361 
   1362     for(;;) {
   1363         // count code units below the minimum or with irrelevant data for the quick check
   1364         for(prevSrc=src;;) {
   1365             if(src==limit) {
   1366                 return src;
   1367             }
   1368             if( (c=*src)<minNoMaybeCP ||
   1369                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
   1370             ) {
   1371                 ++src;
   1372             } else if(!U16_IS_SURROGATE(c)) {
   1373                 break;
   1374             } else {
   1375                 UChar c2;
   1376                 if(U16_IS_SURROGATE_LEAD(c)) {
   1377                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
   1378                         c=U16_GET_SUPPLEMENTARY(c, c2);
   1379                     }
   1380                 } else /* trail surrogate */ {
   1381                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
   1382                         --src;
   1383                         c=U16_GET_SUPPLEMENTARY(c2, c);
   1384                     }
   1385                 }
   1386                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
   1387                     src+=U16_LENGTH(c);
   1388                 } else {
   1389                     break;
   1390                 }
   1391             }
   1392         }
   1393         if(src!=prevSrc) {
   1394             // Set prevBoundary to the last character in the quick check loop.
   1395             prevBoundary=src-1;
   1396             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
   1397                 U16_IS_LEAD(*(prevBoundary-1))
   1398             ) {
   1399                 --prevBoundary;
   1400             }
   1401             prevCC=0;
   1402             // The start of the current character (c).
   1403             prevSrc=src;
   1404         }
   1405 
   1406         src+=U16_LENGTH(c);
   1407         /*
   1408          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
   1409          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
   1410          * or has ccc!=0.
   1411          */
   1412         if(isMaybeOrNonZeroCC(norm16)) {
   1413             uint8_t cc=getCCFromYesOrMaybe(norm16);
   1414             if( onlyContiguous &&  // FCC
   1415                 cc!=0 &&
   1416                 prevCC==0 &&
   1417                 prevBoundary<prevSrc &&
   1418                 // prevCC==0 && prevBoundary<prevSrc tell us that
   1419                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
   1420                 // passed the quick check "yes && ccc==0" test.
   1421                 // Check whether the last character was a "yesYes" or a "yesNo".
   1422                 // If a "yesNo", then we get its trailing ccc from its
   1423                 // mapping and check for canonical order.
   1424                 // All other cases are ok.
   1425                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
   1426             ) {
   1427                 // Fails FCD test.
   1428             } else if(prevCC<=cc || cc==0) {
   1429                 prevCC=cc;
   1430                 if(norm16<MIN_YES_YES_WITH_CC) {
   1431                     if(pQCResult!=NULL) {
   1432                         *pQCResult=UNORM_MAYBE;
   1433                     } else {
   1434                         return prevBoundary;
   1435                     }
   1436                 }
   1437                 continue;
   1438             }
   1439         }
   1440         if(pQCResult!=NULL) {
   1441             *pQCResult=UNORM_NO;
   1442         }
   1443         return prevBoundary;
   1444     }
   1445 }
   1446 
   1447 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
   1448                                        UBool doCompose,
   1449                                        UBool onlyContiguous,
   1450                                        UnicodeString &safeMiddle,
   1451                                        ReorderingBuffer &buffer,
   1452                                        UErrorCode &errorCode) const {
   1453     if(!buffer.isEmpty()) {
   1454         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
   1455         if(src!=firstStarterInSrc) {
   1456             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
   1457                                                                     buffer.getLimit());
   1458             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
   1459             UnicodeString middle(lastStarterInDest, destSuffixLength);
   1460             buffer.removeSuffix(destSuffixLength);
   1461             safeMiddle=middle;
   1462             middle.append(src, (int32_t)(firstStarterInSrc-src));
   1463             const UChar *middleStart=middle.getBuffer();
   1464             compose(middleStart, middleStart+middle.length(), onlyContiguous,
   1465                     TRUE, buffer, errorCode);
   1466             if(U_FAILURE(errorCode)) {
   1467                 return;
   1468             }
   1469             src=firstStarterInSrc;
   1470         }
   1471     }
   1472     if(doCompose) {
   1473         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
   1474     } else {
   1475         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
   1476             limit=u_strchr(src, 0);
   1477         }
   1478         buffer.appendZeroCC(src, limit, errorCode);
   1479     }
   1480 }
   1481 
   1482 /**
   1483  * Does c have a composition boundary before it?
   1484  * True if its decomposition begins with a character that has
   1485  * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
   1486  * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
   1487  * (isCompYesAndZeroCC()) so we need not decompose.
   1488  */
   1489 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
   1490     for(;;) {
   1491         if(isCompYesAndZeroCC(norm16)) {
   1492             return TRUE;
   1493         } else if(isMaybeOrNonZeroCC(norm16)) {
   1494             return FALSE;
   1495         } else if(isDecompNoAlgorithmic(norm16)) {
   1496             c=mapAlgorithmic(c, norm16);
   1497             norm16=getNorm16(c);
   1498         } else {
   1499             // c decomposes, get everything from the variable-length extra data
   1500             const uint16_t *mapping=getMapping(norm16);
   1501             uint16_t firstUnit=*mapping;
   1502             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
   1503                 return FALSE;
   1504             }
   1505             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
   1506                 return FALSE;  // non-zero leadCC
   1507             }
   1508             int32_t i=1;  // skip over the firstUnit
   1509             UChar32 c;
   1510             U16_NEXT_UNSAFE(mapping, i, c);
   1511             return isCompYesAndZeroCC(getNorm16(c));
   1512         }
   1513     }
   1514 }
   1515 
   1516 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
   1517     for(;;) {
   1518         uint16_t norm16=getNorm16(c);
   1519         if(isInert(norm16)) {
   1520             return TRUE;
   1521         } else if(norm16<=minYesNo) {
   1522             // Hangul: norm16==minYesNo
   1523             // Hangul LVT has a boundary after it.
   1524             // Hangul LV and non-inert yesYes characters combine forward.
   1525             return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
   1526         } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
   1527             return FALSE;
   1528         } else if(isDecompNoAlgorithmic(norm16)) {
   1529             c=mapAlgorithmic(c, norm16);
   1530         } else {
   1531             // c decomposes, get everything from the variable-length extra data.
   1532             // If testInert, then c must be a yesNo character which has lccc=0,
   1533             // otherwise it could be a noNo.
   1534             const uint16_t *mapping=getMapping(norm16);
   1535             uint16_t firstUnit=*mapping;
   1536             // TRUE if
   1537             //   not MAPPING_NO_COMP_BOUNDARY_AFTER
   1538             //     (which is set if
   1539             //       c is not deleted, and
   1540             //       it and its decomposition do not combine forward, and it has a starter)
   1541             //   and if FCC then trailCC<=1
   1542             return
   1543                 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
   1544                 (!onlyContiguous || firstUnit<=0x1ff);
   1545         }
   1546     }
   1547 }
   1548 
   1549 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
   1550     BackwardUTrie2StringIterator iter(normTrie, start, p);
   1551     uint16_t norm16;
   1552     do {
   1553         norm16=iter.previous16();
   1554     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
   1555     // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
   1556     // but that's probably not worth the extra cost.
   1557     return iter.codePointStart;
   1558 }
   1559 
   1560 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
   1561     ForwardUTrie2StringIterator iter(normTrie, p, limit);
   1562     uint16_t norm16;
   1563     do {
   1564         norm16=iter.next16();
   1565     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
   1566     return iter.codePointStart;
   1567 }
   1568 
   1569 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
   1570 // still had getFCDTrie() which built and cached an FCD trie.
   1571 // That provided faster access to FCD data than getFCD16FromNormData()
   1572 // but required synchronization and consumed some 10kB of heap memory
   1573 // in any process that uses FCD (e.g., via collation).
   1574 // tccc180[] and smallFCD[] are intended to help with any loss of performance,
   1575 // at least for Latin & CJK.
   1576 
   1577 // Gets the FCD value from the regular normalization data.
   1578 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
   1579     // Only loops for 1:1 algorithmic mappings.
   1580     for(;;) {
   1581         uint16_t norm16=getNorm16(c);
   1582         if(norm16<=minYesNo) {
   1583             // no decomposition or Hangul syllable, all zeros
   1584             return 0;
   1585         } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
   1586             // combining mark
   1587             norm16&=0xff;
   1588             return norm16|(norm16<<8);
   1589         } else if(norm16>=minMaybeYes) {
   1590             return 0;
   1591         } else if(isDecompNoAlgorithmic(norm16)) {
   1592             c=mapAlgorithmic(c, norm16);
   1593         } else {
   1594             // c decomposes, get everything from the variable-length extra data
   1595             const uint16_t *mapping=getMapping(norm16);
   1596             uint16_t firstUnit=*mapping;
   1597             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
   1598                 // A character that is deleted (maps to an empty string) must
   1599                 // get the worst-case lccc and tccc values because arbitrary
   1600                 // characters on both sides will become adjacent.
   1601                 return 0x1ff;
   1602             } else {
   1603                 norm16=firstUnit>>8;  // tccc
   1604                 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
   1605                     norm16|=*(mapping-1)&0xff00;  // lccc
   1606                 }
   1607                 return norm16;
   1608             }
   1609         }
   1610     }
   1611 }
   1612 
   1613 // Dual functionality:
   1614 // buffer!=NULL: normalize
   1615 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
   1616 const UChar *
   1617 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
   1618                          ReorderingBuffer *buffer,
   1619                          UErrorCode &errorCode) const {
   1620     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
   1621     // Similar to the prevBoundary in the compose() implementation.
   1622     const UChar *prevBoundary=src;
   1623     int32_t prevFCD16=0;
   1624     if(limit==NULL) {
   1625         src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
   1626         if(U_FAILURE(errorCode)) {
   1627             return src;
   1628         }
   1629         if(prevBoundary<src) {
   1630             prevBoundary=src;
   1631             // We know that the previous character's lccc==0.
   1632             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
   1633             prevFCD16=getFCD16(*(src-1));
   1634             if(prevFCD16>1) {
   1635                 --prevBoundary;
   1636             }
   1637         }
   1638         limit=u_strchr(src, 0);
   1639     }
   1640 
   1641     // Note: In this function we use buffer->appendZeroCC() because we track
   1642     // the lead and trail combining classes here, rather than leaving it to
   1643     // the ReorderingBuffer.
   1644     // The exception is the call to decomposeShort() which uses the buffer
   1645     // in the normal way.
   1646 
   1647     const UChar *prevSrc;
   1648     UChar32 c=0;
   1649     uint16_t fcd16=0;
   1650 
   1651     for(;;) {
   1652         // count code units with lccc==0
   1653         for(prevSrc=src; src!=limit;) {
   1654             if((c=*src)<MIN_CCC_LCCC_CP) {
   1655                 prevFCD16=~c;
   1656                 ++src;
   1657             } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
   1658                 prevFCD16=0;
   1659                 ++src;
   1660             } else {
   1661                 if(U16_IS_SURROGATE(c)) {
   1662                     UChar c2;
   1663                     if(U16_IS_SURROGATE_LEAD(c)) {
   1664                         if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
   1665                             c=U16_GET_SUPPLEMENTARY(c, c2);
   1666                         }
   1667                     } else /* trail surrogate */ {
   1668                         if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
   1669                             --src;
   1670                             c=U16_GET_SUPPLEMENTARY(c2, c);
   1671                         }
   1672                     }
   1673                 }
   1674                 if((fcd16=getFCD16FromNormData(c))<=0xff) {
   1675                     prevFCD16=fcd16;
   1676                     src+=U16_LENGTH(c);
   1677                 } else {
   1678                     break;
   1679                 }
   1680             }
   1681         }
   1682         // copy these code units all at once
   1683         if(src!=prevSrc) {
   1684             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
   1685                 break;
   1686             }
   1687             if(src==limit) {
   1688                 break;
   1689             }
   1690             prevBoundary=src;
   1691             // We know that the previous character's lccc==0.
   1692             if(prevFCD16<0) {
   1693                 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
   1694                 UChar32 prev=~prevFCD16;
   1695                 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
   1696                 if(prevFCD16>1) {
   1697                     --prevBoundary;
   1698                 }
   1699             } else {
   1700                 const UChar *p=src-1;
   1701                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
   1702                     --p;
   1703                     // Need to fetch the previous character's FCD value because
   1704                     // prevFCD16 was just for the trail surrogate code point.
   1705                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
   1706                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
   1707                 }
   1708                 if(prevFCD16>1) {
   1709                     prevBoundary=p;
   1710                 }
   1711             }
   1712             // The start of the current character (c).
   1713             prevSrc=src;
   1714         } else if(src==limit) {
   1715             break;
   1716         }
   1717 
   1718         src+=U16_LENGTH(c);
   1719         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
   1720         // Check for proper order, and decompose locally if necessary.
   1721         if((prevFCD16&0xff)<=(fcd16>>8)) {
   1722             // proper order: prev tccc <= current lccc
   1723             if((fcd16&0xff)<=1) {
   1724                 prevBoundary=src;
   1725             }
   1726             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
   1727                 break;
   1728             }
   1729             prevFCD16=fcd16;
   1730             continue;
   1731         } else if(buffer==NULL) {
   1732             return prevBoundary;  // quick check "no"
   1733         } else {
   1734             /*
   1735              * Back out the part of the source that we copied or appended
   1736              * already but is now going to be decomposed.
   1737              * prevSrc is set to after what was copied/appended.
   1738              */
   1739             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
   1740             /*
   1741              * Find the part of the source that needs to be decomposed,
   1742              * up to the next safe boundary.
   1743              */
   1744             src=findNextFCDBoundary(src, limit);
   1745             /*
   1746              * The source text does not fulfill the conditions for FCD.
   1747              * Decompose and reorder a limited piece of the text.
   1748              */
   1749             if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
   1750                 break;
   1751             }
   1752             prevBoundary=src;
   1753             prevFCD16=0;
   1754         }
   1755     }
   1756     return src;
   1757 }
   1758 
   1759 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
   1760                                        UBool doMakeFCD,
   1761                                        UnicodeString &safeMiddle,
   1762                                        ReorderingBuffer &buffer,
   1763                                        UErrorCode &errorCode) const {
   1764     if(!buffer.isEmpty()) {
   1765         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
   1766         if(src!=firstBoundaryInSrc) {
   1767             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
   1768                                                                     buffer.getLimit());
   1769             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
   1770             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
   1771             buffer.removeSuffix(destSuffixLength);
   1772             safeMiddle=middle;
   1773             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
   1774             const UChar *middleStart=middle.getBuffer();
   1775             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
   1776             if(U_FAILURE(errorCode)) {
   1777                 return;
   1778             }
   1779             src=firstBoundaryInSrc;
   1780         }
   1781     }
   1782     if(doMakeFCD) {
   1783         makeFCD(src, limit, &buffer, errorCode);
   1784     } else {
   1785         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
   1786             limit=u_strchr(src, 0);
   1787         }
   1788         buffer.appendZeroCC(src, limit, errorCode);
   1789     }
   1790 }
   1791 
   1792 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
   1793     while(start<p && previousFCD16(start, p)>0xff) {}
   1794     return p;
   1795 }
   1796 
   1797 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
   1798     while(p<limit) {
   1799         const UChar *codePointStart=p;
   1800         if(nextFCD16(p, limit)<=0xff) {
   1801             return codePointStart;
   1802         }
   1803     }
   1804     return p;
   1805 }
   1806 
   1807 // CanonicalIterator data -------------------------------------------------- ***
   1808 
   1809 CanonIterData::CanonIterData(UErrorCode &errorCode) :
   1810         trie(utrie2_open(0, 0, &errorCode)),
   1811         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
   1812 
   1813 CanonIterData::~CanonIterData() {
   1814     utrie2_close(trie);
   1815 }
   1816 
   1817 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
   1818     uint32_t canonValue=utrie2_get32(trie, decompLead);
   1819     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
   1820         // origin is the first character whose decomposition starts with
   1821         // the character for which we are setting the value.
   1822         utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
   1823     } else {
   1824         // origin is not the first character, or it is U+0000.
   1825         UnicodeSet *set;
   1826         if((canonValue&CANON_HAS_SET)==0) {
   1827             set=new UnicodeSet;
   1828             if(set==NULL) {
   1829                 errorCode=U_MEMORY_ALLOCATION_ERROR;
   1830                 return;
   1831             }
   1832             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
   1833             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
   1834             utrie2_set32(trie, decompLead, canonValue, &errorCode);
   1835             canonStartSets.addElement(set, errorCode);
   1836             if(firstOrigin!=0) {
   1837                 set->add(firstOrigin);
   1838             }
   1839         } else {
   1840             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
   1841         }
   1842         set->add(origin);
   1843     }
   1844 }
   1845 
   1846 U_CDECL_BEGIN
   1847 
   1848 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
   1849 //     context: the Normalizer2Impl
   1850 static UBool U_CALLCONV
   1851 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
   1852     UErrorCode errorCode = U_ZERO_ERROR;
   1853     if (value != 0) {
   1854         Normalizer2Impl *impl = (Normalizer2Impl *)context;
   1855         impl->makeCanonIterDataFromNorm16(
   1856             start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
   1857     }
   1858     return U_SUCCESS(errorCode);
   1859 }
   1860 
   1861 
   1862 
   1863 // UInitOnce instantiation function for CanonIterData
   1864 
   1865 static void U_CALLCONV
   1866 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
   1867     U_ASSERT(impl->fCanonIterData == NULL);
   1868     impl->fCanonIterData = new CanonIterData(errorCode);
   1869     if (impl->fCanonIterData == NULL) {
   1870         errorCode=U_MEMORY_ALLOCATION_ERROR;
   1871     }
   1872     if (U_SUCCESS(errorCode)) {
   1873         utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
   1874         utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
   1875     }
   1876     if (U_FAILURE(errorCode)) {
   1877         delete impl->fCanonIterData;
   1878         impl->fCanonIterData = NULL;
   1879     }
   1880 }
   1881 
   1882 U_CDECL_END
   1883 
   1884 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
   1885                                                   CanonIterData &newData,
   1886                                                   UErrorCode &errorCode) const {
   1887     if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
   1888         // Inert, or 2-way mapping (including Hangul syllable).
   1889         // We do not write a canonStartSet for any yesNo character.
   1890         // Composites from 2-way mappings are added at runtime from the
   1891         // starter's compositions list, and the other characters in
   1892         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
   1893         // "maybe" characters.
   1894         return;
   1895     }
   1896     for(UChar32 c=start; c<=end; ++c) {
   1897         uint32_t oldValue=utrie2_get32(newData.trie, c);
   1898         uint32_t newValue=oldValue;
   1899         if(norm16>=minMaybeYes) {
   1900             // not a segment starter if it occurs in a decomposition or has cc!=0
   1901             newValue|=CANON_NOT_SEGMENT_STARTER;
   1902             if(norm16<MIN_NORMAL_MAYBE_YES) {
   1903                 newValue|=CANON_HAS_COMPOSITIONS;
   1904             }
   1905         } else if(norm16<minYesNo) {
   1906             newValue|=CANON_HAS_COMPOSITIONS;
   1907         } else {
   1908             // c has a one-way decomposition
   1909             UChar32 c2=c;
   1910             uint16_t norm16_2=norm16;
   1911             while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
   1912                 c2=mapAlgorithmic(c2, norm16_2);
   1913                 norm16_2=getNorm16(c2);
   1914             }
   1915             if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
   1916                 // c decomposes, get everything from the variable-length extra data
   1917                 const uint16_t *mapping=getMapping(norm16_2);
   1918                 uint16_t firstUnit=*mapping;
   1919                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
   1920                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
   1921                     if(c==c2 && (*(mapping-1)&0xff)!=0) {
   1922                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
   1923                     }
   1924                 }
   1925                 // Skip empty mappings (no characters in the decomposition).
   1926                 if(length!=0) {
   1927                     ++mapping;  // skip over the firstUnit
   1928                     // add c to first code point's start set
   1929                     int32_t i=0;
   1930                     U16_NEXT_UNSAFE(mapping, i, c2);
   1931                     newData.addToStartSet(c, c2, errorCode);
   1932                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
   1933                     // one-way mapping. A 2-way mapping is possible here after
   1934                     // intermediate algorithmic mapping.
   1935                     if(norm16_2>=minNoNo) {
   1936                         while(i<length) {
   1937                             U16_NEXT_UNSAFE(mapping, i, c2);
   1938                             uint32_t c2Value=utrie2_get32(newData.trie, c2);
   1939                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
   1940                                 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
   1941                                              &errorCode);
   1942                             }
   1943                         }
   1944                     }
   1945                 }
   1946             } else {
   1947                 // c decomposed to c2 algorithmically; c has cc==0
   1948                 newData.addToStartSet(c, c2, errorCode);
   1949             }
   1950         }
   1951         if(newValue!=oldValue) {
   1952             utrie2_set32(newData.trie, c, newValue, &errorCode);
   1953         }
   1954     }
   1955 }
   1956 
   1957 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
   1958     // Logically const: Synchronized instantiation.
   1959     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
   1960     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
   1961     return U_SUCCESS(errorCode);
   1962 }
   1963 
   1964 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
   1965     return (int32_t)utrie2_get32(fCanonIterData->trie, c);
   1966 }
   1967 
   1968 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
   1969     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
   1970 }
   1971 
   1972 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
   1973     return getCanonValue(c)>=0;
   1974 }
   1975 
   1976 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
   1977     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
   1978     if(canonValue==0) {
   1979         return FALSE;
   1980     }
   1981     set.clear();
   1982     int32_t value=canonValue&CANON_VALUE_MASK;
   1983     if((canonValue&CANON_HAS_SET)!=0) {
   1984         set.addAll(getCanonStartSet(value));
   1985     } else if(value!=0) {
   1986         set.add(value);
   1987     }
   1988     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
   1989         uint16_t norm16=getNorm16(c);
   1990         if(norm16==JAMO_L) {
   1991             UChar32 syllable=
   1992                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
   1993             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
   1994         } else {
   1995             addComposites(getCompositionsList(norm16), set);
   1996         }
   1997     }
   1998     return TRUE;
   1999 }
   2000 
   2001 U_NAMESPACE_END
   2002 
   2003 // Normalizer2 data swapping ----------------------------------------------- ***
   2004 
   2005 U_NAMESPACE_USE
   2006 
   2007 U_CAPI int32_t U_EXPORT2
   2008 unorm2_swap(const UDataSwapper *ds,
   2009             const void *inData, int32_t length, void *outData,
   2010             UErrorCode *pErrorCode) {
   2011     const UDataInfo *pInfo;
   2012     int32_t headerSize;
   2013 
   2014     const uint8_t *inBytes;
   2015     uint8_t *outBytes;
   2016 
   2017     const int32_t *inIndexes;
   2018     int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
   2019 
   2020     int32_t i, offset, nextOffset, size;
   2021 
   2022     /* udata_swapDataHeader checks the arguments */
   2023     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
   2024     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   2025         return 0;
   2026     }
   2027 
   2028     /* check data format and format version */
   2029     pInfo=(const UDataInfo *)((const char *)inData+4);
   2030     if(!(
   2031         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
   2032         pInfo->dataFormat[1]==0x72 &&
   2033         pInfo->dataFormat[2]==0x6d &&
   2034         pInfo->dataFormat[3]==0x32 &&
   2035         (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
   2036     )) {
   2037         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
   2038                          pInfo->dataFormat[0], pInfo->dataFormat[1],
   2039                          pInfo->dataFormat[2], pInfo->dataFormat[3],
   2040                          pInfo->formatVersion[0]);
   2041         *pErrorCode=U_UNSUPPORTED_ERROR;
   2042         return 0;
   2043     }
   2044 
   2045     inBytes=(const uint8_t *)inData+headerSize;
   2046     outBytes=(uint8_t *)outData+headerSize;
   2047 
   2048     inIndexes=(const int32_t *)inBytes;
   2049 
   2050     if(length>=0) {
   2051         length-=headerSize;
   2052         if(length<(int32_t)sizeof(indexes)) {
   2053             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
   2054                              length);
   2055             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   2056             return 0;
   2057         }
   2058     }
   2059 
   2060     /* read the first few indexes */
   2061     for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
   2062         indexes[i]=udata_readInt32(ds, inIndexes[i]);
   2063     }
   2064 
   2065     /* get the total length of the data */
   2066     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
   2067 
   2068     if(length>=0) {
   2069         if(length<size) {
   2070             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
   2071                              length);
   2072             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   2073             return 0;
   2074         }
   2075 
   2076         /* copy the data for inaccessible bytes */
   2077         if(inBytes!=outBytes) {
   2078             uprv_memcpy(outBytes, inBytes, size);
   2079         }
   2080 
   2081         offset=0;
   2082 
   2083         /* swap the int32_t indexes[] */
   2084         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
   2085         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
   2086         offset=nextOffset;
   2087 
   2088         /* swap the UTrie2 */
   2089         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
   2090         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
   2091         offset=nextOffset;
   2092 
   2093         /* swap the uint16_t extraData[] */
   2094         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
   2095         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
   2096         offset=nextOffset;
   2097 
   2098         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
   2099         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
   2100         offset=nextOffset;
   2101 
   2102         U_ASSERT(offset==size);
   2103     }
   2104 
   2105     return headerSize+size;
   2106 }
   2107 
   2108 #endif  // !UCONFIG_NO_NORMALIZATION
   2109