Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 1996-2012, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ucol.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 * Modification history
     12 * Date        Name      Comments
     13 * 1996-1999   various members of ICU team maintained C API for collation framework
     14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
     15 * 03/01/2001  synwee    Added maxexpansion functionality.
     16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_COLLATION
     22 
     23 #include "unicode/bytestream.h"
     24 #include "unicode/coleitr.h"
     25 #include "unicode/unorm.h"
     26 #include "unicode/udata.h"
     27 #include "unicode/ustring.h"
     28 #include "unicode/utf8.h"
     29 
     30 #include "ucol_imp.h"
     31 #include "bocsu.h"
     32 
     33 #include "normalizer2impl.h"
     34 #include "unorm_it.h"
     35 #include "umutex.h"
     36 #include "cmemory.h"
     37 #include "ucln_in.h"
     38 #include "cstring.h"
     39 #include "utracimp.h"
     40 #include "putilimp.h"
     41 #include "uassert.h"
     42 #include "unicode/coll.h"
     43 
     44 #ifdef UCOL_DEBUG
     45 #include <stdio.h>
     46 #endif
     47 
     48 U_NAMESPACE_USE
     49 
     50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     51 
     52 #define LAST_BYTE_MASK_           0xFF
     53 #define SECOND_LAST_BYTE_SHIFT_   8
     54 
     55 #define ZERO_CC_LIMIT_            0xC0
     56 
     57 // These are static pointers to the NFC/NFD implementation instance.
     58 // Each of them is always the same between calls to u_cleanup
     59 // and therefore writing to it is not synchronized.
     60 // They are cleaned in ucol_cleanup
     61 static const Normalizer2 *g_nfd = NULL;
     62 static const Normalizer2Impl *g_nfcImpl = NULL;
     63 
// These values come from the UCA and are required for
// implicit CE generation and for suppressing sort key compression.
// They should normally be read from the UCA data, but when
// running without the UCA they have to be hard-coded here.
     68 static const int32_t maxRegularPrimary  = 0x7A;
     69 static const int32_t minImplicitPrimary = 0xE0;
     70 static const int32_t maxImplicitPrimary = 0xE4;
     71 
     72 U_CDECL_BEGIN
     73 static UBool U_CALLCONV
     74 ucol_cleanup(void)
     75 {
     76     g_nfd = NULL;
     77     g_nfcImpl = NULL;
     78     return TRUE;
     79 }
     80 
     81 static int32_t U_CALLCONV
     82 _getFoldingOffset(uint32_t data) {
     83     return (int32_t)(data&0xFFFFFF);
     84 }
     85 
     86 U_CDECL_END
     87 
     88 static inline
     89 UBool initializeNFD(UErrorCode *status) {
     90     if (g_nfd != NULL) {
     91         return TRUE;
     92     } else {
     93         // The result is constant, until the library is reloaded.
     94         g_nfd = Normalizer2Factory::getNFDInstance(*status);
     95         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
     96         return U_SUCCESS(*status);
     97     }
     98 }
     99 
    100 // init FCD data
    101 static inline
    102 UBool initializeFCD(UErrorCode *status) {
    103     if (g_nfcImpl != NULL) {
    104         return TRUE;
    105     } else {
    106         // The result is constant, until the library is reloaded.
    107         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
    108         // Note: Alternatively, we could also store this pointer in each collIterate struct,
    109         // same as Normalizer2Factory::getImpl(collIterate->nfd).
    110         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
    111         return U_SUCCESS(*status);
    112     }
    113 }
    114 
    115 static
    116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
    117                               int32_t sourceLen, collIterate *s,
    118                               UErrorCode *status)
    119 {
    120     (s)->string = (s)->pos = sourceString;
    121     (s)->origFlags = 0;
    122     (s)->flags = 0;
    123     if (sourceLen >= 0) {
    124         s->flags |= UCOL_ITER_HASLEN;
    125         (s)->endp = (UChar *)sourceString+sourceLen;
    126     }
    127     else {
    128         /* change to enable easier checking for end of string for fcdpositon */
    129         (s)->endp = NULL;
    130     }
    131     (s)->extendCEs = NULL;
    132     (s)->extendCEsSize = 0;
    133     (s)->CEpos = (s)->toReturn = (s)->CEs;
    134     (s)->offsetBuffer = NULL;
    135     (s)->offsetBufferSize = 0;
    136     (s)->offsetReturn = (s)->offsetStore = NULL;
    137     (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
    138     (s)->coll = (collator);
    139     if (initializeNFD(status)) {
    140         (s)->nfd = g_nfd;
    141     } else {
    142         return;
    143     }
    144     (s)->fcdPosition = 0;
    145     if(collator->normalizationMode == UCOL_ON) {
    146         (s)->flags |= UCOL_ITER_NORM;
    147     }
    148     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
    149         (s)->flags |= UCOL_HIRAGANA_Q;
    150     }
    151     (s)->iterator = NULL;
    152     //(s)->iteratorIndex = 0;
    153 }
    154 
    155 U_CAPI void  U_EXPORT2
    156 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
    157                              int32_t sourceLen, collIterate *s,
    158                              UErrorCode *status) {
    159     /* Out-of-line version for use from other files. */
    160     IInit_collIterate(collator, sourceString, sourceLen, s, status);
    161 }
    162 
    163 U_CAPI collIterate * U_EXPORT2
    164 uprv_new_collIterate(UErrorCode *status) {
    165     if(U_FAILURE(*status)) {
    166         return NULL;
    167     }
    168     collIterate *s = new collIterate;
    169     if(s == NULL) {
    170         *status = U_MEMORY_ALLOCATION_ERROR;
    171         return NULL;
    172     }
    173     return s;
    174 }
    175 
    176 U_CAPI void U_EXPORT2
    177 uprv_delete_collIterate(collIterate *s) {
    178     delete s;
    179 }
    180 
    181 U_CAPI UBool U_EXPORT2
    182 uprv_collIterateAtEnd(collIterate *s) {
    183     return s == NULL || s->pos == s->endp;
    184 }
    185 
    186 /**
    187 * Backup the state of the collIterate struct data
    188 * @param data collIterate to backup
    189 * @param backup storage
    190 */
    191 static
    192 inline void backupState(const collIterate *data, collIterateState *backup)
    193 {
    194     backup->fcdPosition = data->fcdPosition;
    195     backup->flags       = data->flags;
    196     backup->origFlags   = data->origFlags;
    197     backup->pos         = data->pos;
    198     backup->bufferaddress = data->writableBuffer.getBuffer();
    199     backup->buffersize    = data->writableBuffer.length();
    200     backup->iteratorMove = 0;
    201     backup->iteratorIndex = 0;
    202     if(data->iterator != NULL) {
    203         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
    204         backup->iteratorIndex = data->iterator->getState(data->iterator);
    205         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
    206         if(backup->iteratorIndex == UITER_NO_STATE) {
    207             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
    208                 backup->iteratorMove++;
    209                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
    210             }
    211             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    212         }
    213     }
    214 }
    215 
    216 /**
    217 * Loads the state into the collIterate struct data
    218 * @param data collIterate to backup
    219 * @param backup storage
    220 * @param forwards boolean to indicate if forwards iteration is used,
    221 *        false indicates backwards iteration
    222 */
    223 static
    224 inline void loadState(collIterate *data, const collIterateState *backup,
    225                       UBool        forwards)
    226 {
    227     UErrorCode status = U_ZERO_ERROR;
    228     data->flags       = backup->flags;
    229     data->origFlags   = backup->origFlags;
    230     if(data->iterator != NULL) {
    231         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
    232         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
    233         if(backup->iteratorMove != 0) {
    234             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    235         }
    236     }
    237     data->pos         = backup->pos;
    238 
    239     if ((data->flags & UCOL_ITER_INNORMBUF) &&
    240         data->writableBuffer.getBuffer() != backup->bufferaddress) {
    241         /*
    242         this is when a new buffer has been reallocated and we'll have to
    243         calculate the new position.
    244         note the new buffer has to contain the contents of the old buffer.
    245         */
    246         if (forwards) {
    247             data->pos = data->writableBuffer.getTerminatedBuffer() +
    248                                          (data->pos - backup->bufferaddress);
    249         }
    250         else {
    251             /* backwards direction */
    252             int32_t temp = backup->buffersize -
    253                                   (int32_t)(data->pos - backup->bufferaddress);
    254             data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
    255         }
    256     }
    257     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
    258         /*
    259         this is alittle tricky.
    260         if we are initially not in the normalization buffer, even if we
    261         normalize in the later stage, the data in the buffer will be
    262         ignored, since we skip back up to the data string.
    263         however if we are already in the normalization buffer, any
    264         further normalization will pull data into the normalization
    265         buffer and modify the fcdPosition.
    266         since we are keeping the data in the buffer for use, the
    267         fcdPosition can not be reverted back.
    268         arrgghh....
    269         */
    270         data->fcdPosition = backup->fcdPosition;
    271     }
    272 }
    273 
    274 static UBool
    275 reallocCEs(collIterate *data, int32_t newCapacity) {
    276     uint32_t *oldCEs = data->extendCEs;
    277     if(oldCEs == NULL) {
    278         oldCEs = data->CEs;
    279     }
    280     int32_t length = data->CEpos - oldCEs;
    281     uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
    282     if(newCEs == NULL) {
    283         return FALSE;
    284     }
    285     uprv_memcpy(newCEs, oldCEs, length * 4);
    286     uprv_free(data->extendCEs);
    287     data->extendCEs = newCEs;
    288     data->extendCEsSize = newCapacity;
    289     data->CEpos = newCEs + length;
    290     return TRUE;
    291 }
    292 
    293 static UBool
    294 increaseCEsCapacity(collIterate *data) {
    295     int32_t oldCapacity;
    296     if(data->extendCEs != NULL) {
    297         oldCapacity = data->extendCEsSize;
    298     } else {
    299         oldCapacity = LENGTHOF(data->CEs);
    300     }
    301     return reallocCEs(data, 2 * oldCapacity);
    302 }
    303 
    304 static UBool
    305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
    306     int32_t oldCapacity;
    307     if(data->extendCEs != NULL) {
    308         oldCapacity = data->extendCEsSize;
    309     } else {
    310         oldCapacity = LENGTHOF(data->CEs);
    311     }
    312     if(minCapacity <= oldCapacity) {
    313         return TRUE;
    314     }
    315     oldCapacity *= 2;
    316     return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
    317 }
    318 
    319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
    320     if(U_FAILURE(errorCode)) {
    321         return;
    322     }
    323     int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
    324     U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);
    325     if(length >= offsetBufferSize) {
    326         int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
    327         int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4));
    328         if(newBuffer == NULL) {
    329             errorCode = U_MEMORY_ALLOCATION_ERROR;
    330             return;
    331         }
    332         if(length > 0) {
    333             uprv_memcpy(newBuffer, offsetBuffer, length * 4);
    334         }
    335         uprv_free(offsetBuffer);
    336         offsetBuffer = newBuffer;
    337         offsetStore = offsetBuffer + length;
    338         offsetBufferSize = newCapacity;
    339     }
    340     *offsetStore++ = offset;
    341 }
    342 
    343 /*
    344 * collIter_eos()
    345 *     Checks for a collIterate being positioned at the end of
    346 *     its source string.
    347 *
    348 */
    349 static
    350 inline UBool collIter_eos(collIterate *s) {
    351     if(s->flags & UCOL_USE_ITERATOR) {
    352       return !(s->iterator->hasNext(s->iterator));
    353     }
    354     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
    355         // Null terminated string, but not at null, so not at end.
    356         //   Whether in main or normalization buffer doesn't matter.
    357         return FALSE;
    358     }
    359 
    360     // String with length.  Can't be in normalization buffer, which is always
    361     //  null termintated.
    362     if (s->flags & UCOL_ITER_HASLEN) {
    363         return (s->pos == s->endp);
    364     }
    365 
    366     // We are at a null termination, could be either normalization buffer or main string.
    367     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
    368         // At null at end of main string.
    369         return TRUE;
    370     }
    371 
    372     // At null at end of normalization buffer.  Need to check whether there there are
    373     //   any characters left in the main buffer.
    374     if(s->origFlags & UCOL_USE_ITERATOR) {
    375       return !(s->iterator->hasNext(s->iterator));
    376     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
    377         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
    378         return (*s->fcdPosition == 0);
    379     }
    380     else {
    381         // Main string with an end pointer.
    382         return s->fcdPosition == s->endp;
    383     }
    384 }
    385 
    386 /*
    387 * collIter_bos()
    388 *     Checks for a collIterate being positioned at the start of
    389 *     its source string.
    390 *
    391 */
    392 static
    393 inline UBool collIter_bos(collIterate *source) {
    394   // if we're going backwards, we need to know whether there is more in the
    395   // iterator, even if we are in the side buffer
    396   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    397     return !source->iterator->hasPrevious(source->iterator);
    398   }
    399   if (source->pos <= source->string ||
    400       ((source->flags & UCOL_ITER_INNORMBUF) &&
    401       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
    402     return TRUE;
    403   }
    404   return FALSE;
    405 }
    406 
    407 /*static
    408 inline UBool collIter_SimpleBos(collIterate *source) {
    409   // if we're going backwards, we need to know whether there is more in the
    410   // iterator, even if we are in the side buffer
    411   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    412     return !source->iterator->hasPrevious(source->iterator);
    413   }
    414   if (source->pos == source->string) {
    415     return TRUE;
    416   }
    417   return FALSE;
    418 }*/
    419     //return (data->pos == data->string) ||
    420 
    421 
    422 /****************************************************************************/
    423 /* Following are the open/close functions                                   */
    424 /*                                                                          */
    425 /****************************************************************************/
    426 
    427 static UCollator*
    428 ucol_initFromBinary(const uint8_t *bin, int32_t length,
    429                 const UCollator *base,
    430                 UCollator *fillIn,
    431                 UErrorCode *status)
    432 {
    433     UCollator *result = fillIn;
    434     if(U_FAILURE(*status)) {
    435         return NULL;
    436     }
    437     /*
    438     if(base == NULL) {
    439         // we don't support null base yet
    440         *status = U_ILLEGAL_ARGUMENT_ERROR;
    441         return NULL;
    442     }
    443     */
    444     // We need these and we could be running without UCA
    445     uprv_uca_initImplicitConstants(status);
    446     UCATableHeader *colData = (UCATableHeader *)bin;
    447     // do we want version check here? We're trying to figure out whether collators are compatible
    448     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
    449         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
    450         colData->version[0] != UCOL_BUILDER_VERSION)
    451     {
    452         *status = U_COLLATOR_VERSION_MISMATCH;
    453         return NULL;
    454     }
    455     else {
    456         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
    457             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
    458             if(U_FAILURE(*status)){
    459                 return NULL;
    460             }
    461             result->hasRealData = TRUE;
    462         }
    463         else {
    464             if(base) {
    465                 result = ucol_initCollator(base->image, result, base, status);
    466                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
    467                 if(U_FAILURE(*status)){
    468                     return NULL;
    469                 }
    470                 result->hasRealData = FALSE;
    471             }
    472             else {
    473                 *status = U_USELESS_COLLATOR_ERROR;
    474                 return NULL;
    475             }
    476         }
    477         result->freeImageOnClose = FALSE;
    478     }
    479     result->actualLocale = NULL;
    480     result->validLocale = NULL;
    481     result->requestedLocale = NULL;
    482     result->rules = NULL;
    483     result->rulesLength = 0;
    484     result->freeRulesOnClose = FALSE;
    485     result->ucaRules = NULL;
    486     return result;
    487 }
    488 
    489 U_CAPI UCollator* U_EXPORT2
    490 ucol_openBinary(const uint8_t *bin, int32_t length,
    491                 const UCollator *base,
    492                 UErrorCode *status)
    493 {
    494     return ucol_initFromBinary(bin, length, base, NULL, status);
    495 }
    496 
    497 U_CAPI int32_t U_EXPORT2
    498 ucol_cloneBinary(const UCollator *coll,
    499                  uint8_t *buffer, int32_t capacity,
    500                  UErrorCode *status)
    501 {
    502     int32_t length = 0;
    503     if(U_FAILURE(*status)) {
    504         return length;
    505     }
    506     if(capacity < 0) {
    507         *status = U_ILLEGAL_ARGUMENT_ERROR;
    508         return length;
    509     }
    510     if(coll->hasRealData == TRUE) {
    511         length = coll->image->size;
    512         if(length <= capacity) {
    513             uprv_memcpy(buffer, coll->image, length);
    514         } else {
    515             *status = U_BUFFER_OVERFLOW_ERROR;
    516         }
    517     } else {
    518         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    519         if(length <= capacity) {
    520             /* build the UCATableHeader with minimal entries */
    521             /* do not copy the header from the UCA file because its values are wrong! */
    522             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    523 
    524             /* reset everything */
    525             uprv_memset(buffer, 0, length);
    526 
    527             /* set the tailoring-specific values */
    528             UCATableHeader *myData = (UCATableHeader *)buffer;
    529             myData->size = length;
    530 
    531             /* offset for the options, the only part of the data that is present after the header */
    532             myData->options = sizeof(UCATableHeader);
    533 
    534             /* need to always set the expansion value for an upper bound of the options */
    535             myData->expansion = myData->options + sizeof(UColOptionSet);
    536 
    537             myData->magic = UCOL_HEADER_MAGIC;
    538             myData->isBigEndian = U_IS_BIG_ENDIAN;
    539             myData->charSetFamily = U_CHARSET_FAMILY;
    540 
    541             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    542             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    543 
    544             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    545             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    546             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    547             myData->jamoSpecial = coll->image->jamoSpecial;
    548 
    549             /* copy the collator options */
    550             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    551         } else {
    552             *status = U_BUFFER_OVERFLOW_ERROR;
    553         }
    554     }
    555     return length;
    556 }
    557 
    558 U_CAPI UCollator* U_EXPORT2
    559 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
    560 {
    561     UCollator * localCollator;
    562     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
    563     char *stackBufferChars = (char *)stackBuffer;
    564     int32_t imageSize = 0;
    565     int32_t rulesSize = 0;
    566     int32_t rulesPadding = 0;
    567     int32_t defaultReorderCodesSize = 0;
    568     int32_t reorderCodesSize = 0;
    569     uint8_t *image;
    570     UChar *rules;
    571     int32_t* defaultReorderCodes;
    572     int32_t* reorderCodes;
    573     uint8_t* leadBytePermutationTable;
    574     UBool colAllocated = FALSE;
    575     UBool imageAllocated = FALSE;
    576 
    577     if (status == NULL || U_FAILURE(*status)){
    578         return 0;
    579     }
    580     if ((stackBuffer && !pBufferSize) || !coll){
    581        *status = U_ILLEGAL_ARGUMENT_ERROR;
    582         return 0;
    583     }
    584 
    585     if (coll->rules && coll->freeRulesOnClose) {
    586         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
    587         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
    588         bufferSizeNeeded += rulesSize + rulesPadding;
    589     }
    590     // no padding for alignment needed from here since the next two are 4 byte quantities
    591     if (coll->defaultReorderCodes) {
    592         defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
    593         bufferSizeNeeded += defaultReorderCodesSize;
    594     }
    595     if (coll->reorderCodes) {
    596         reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
    597         bufferSizeNeeded += reorderCodesSize;
    598     }
    599     if (coll->leadBytePermutationTable) {
    600         bufferSizeNeeded += 256 * sizeof(uint8_t);
    601     }
    602 
    603     if (stackBuffer && *pBufferSize <= 0) { /* 'preflighting' request - set needed size into *pBufferSize */
    604         *pBufferSize =  bufferSizeNeeded;
    605         return 0;
    606     }
    607 
    608     /* Pointers on 64-bit platforms need to be aligned
    609      * on a 64-bit boundry in memory.
    610      */
    611     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
    612         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
    613         if (*pBufferSize > offsetUp) {
    614             *pBufferSize -= offsetUp;
    615             stackBufferChars += offsetUp;
    616         }
    617         else {
    618             /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
    619             *pBufferSize = 1;
    620         }
    621     }
    622     stackBuffer = (void *)stackBufferChars;
    623 
    624     if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
    625         /* allocate one here...*/
    626         stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
    627         // Null pointer check.
    628         if (stackBufferChars == NULL) {
    629             *status = U_MEMORY_ALLOCATION_ERROR;
    630             return NULL;
    631         }
    632         colAllocated = TRUE;
    633         if (U_SUCCESS(*status)) {
    634             *status = U_SAFECLONE_ALLOCATED_WARNING;
    635         }
    636     }
    637     localCollator = (UCollator *)stackBufferChars;
    638     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
    639     defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
    640     reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
    641     leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
    642 
    643     {
    644         UErrorCode tempStatus = U_ZERO_ERROR;
    645         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
    646     }
    647     if (coll->freeImageOnClose) {
    648         image = (uint8_t *)uprv_malloc(imageSize);
    649         // Null pointer check
    650         if (image == NULL) {
    651             *status = U_MEMORY_ALLOCATION_ERROR;
    652             return NULL;
    653         }
    654         ucol_cloneBinary(coll, image, imageSize, status);
    655         imageAllocated = TRUE;
    656     }
    657     else {
    658         image = (uint8_t *)coll->image;
    659     }
    660     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
    661     if (U_FAILURE(*status)) {
    662         return NULL;
    663     }
    664 
    665     if (coll->rules) {
    666         if (coll->freeRulesOnClose) {
    667             localCollator->rules = u_strcpy(rules, coll->rules);
    668             //bufferEnd += rulesSize;
    669         }
    670         else {
    671             localCollator->rules = coll->rules;
    672         }
    673         localCollator->freeRulesOnClose = FALSE;
    674         localCollator->rulesLength = coll->rulesLength;
    675     }
    676 
    677     // collator reordering
    678     if (coll->defaultReorderCodes) {
    679         localCollator->defaultReorderCodes =
    680             (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
    681         localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
    682         localCollator->freeDefaultReorderCodesOnClose = FALSE;
    683     }
    684     if (coll->reorderCodes) {
    685         localCollator->reorderCodes =
    686             (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
    687         localCollator->reorderCodesLength = coll->reorderCodesLength;
    688         localCollator->freeReorderCodesOnClose = FALSE;
    689     }
    690     if (coll->leadBytePermutationTable) {
    691         localCollator->leadBytePermutationTable =
    692             (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
    693         localCollator->freeLeadBytePermutationTableOnClose = FALSE;
    694     }
    695 
    696     int32_t i;
    697     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
    698         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
    699     }
    700     // zero copies of pointers
    701     localCollator->actualLocale = NULL;
    702     localCollator->validLocale = NULL;
    703     localCollator->requestedLocale = NULL;
    704     localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
    705     localCollator->freeOnClose = colAllocated;
    706     localCollator->freeImageOnClose = imageAllocated;
    707     return localCollator;
    708 }
    709 
    710 U_CAPI void U_EXPORT2
    711 ucol_close(UCollator *coll)
    712 {
    713     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
    714     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
    715     if(coll != NULL) {
    716         // these are always owned by each UCollator struct,
    717         // so we always free them
    718         if(coll->validLocale != NULL) {
    719             uprv_free(coll->validLocale);
    720         }
    721         if(coll->actualLocale != NULL) {
    722             uprv_free(coll->actualLocale);
    723         }
    724         if(coll->requestedLocale != NULL) {
    725             uprv_free(coll->requestedLocale);
    726         }
    727         if(coll->latinOneCEs != NULL) {
    728             uprv_free(coll->latinOneCEs);
    729         }
    730         if(coll->options != NULL && coll->freeOptionsOnClose) {
    731             uprv_free(coll->options);
    732         }
    733         if(coll->rules != NULL && coll->freeRulesOnClose) {
    734             uprv_free((UChar *)coll->rules);
    735         }
    736         if(coll->image != NULL && coll->freeImageOnClose) {
    737             uprv_free((UCATableHeader *)coll->image);
    738         }
    739 
    740         if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
    741             uprv_free(coll->leadBytePermutationTable);
    742         }
    743         if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
    744             uprv_free(coll->defaultReorderCodes);
    745         }
    746         if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
    747             uprv_free(coll->reorderCodes);
    748         }
    749 
    750         if(coll->delegate != NULL) {
    751           delete (Collator*)coll->delegate;
    752         }
    753 
    754         /* Here, it would be advisable to close: */
    755         /* - UData for UCA (unless we stuff it in the root resb */
    756         /* Again, do we need additional housekeeping... HMMM! */
    757         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
    758         if(coll->freeOnClose){
    759             /* for safeClone, if freeOnClose is FALSE,
    760             don't free the other instance data */
    761             uprv_free(coll);
    762         }
    763     }
    764     UTRACE_EXIT();
    765 }
    766 
    767 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
    768 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
    769 U_CFUNC uint8_t* U_EXPORT2
    770 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
    771 {
    772     uint8_t *result = NULL;
    773     if(U_FAILURE(*status)) {
    774         return NULL;
    775     }
    776     if(coll->hasRealData == TRUE) {
    777         *length = coll->image->size;
    778         result = (uint8_t *)uprv_malloc(*length);
    779         /* test for NULL */
    780         if (result == NULL) {
    781             *status = U_MEMORY_ALLOCATION_ERROR;
    782             return NULL;
    783         }
    784         uprv_memcpy(result, coll->image, *length);
    785     } else {
    786         *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    787         result = (uint8_t *)uprv_malloc(*length);
    788         /* test for NULL */
    789         if (result == NULL) {
    790             *status = U_MEMORY_ALLOCATION_ERROR;
    791             return NULL;
    792         }
    793 
    794         /* build the UCATableHeader with minimal entries */
    795         /* do not copy the header from the UCA file because its values are wrong! */
    796         /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    797 
    798         /* reset everything */
    799         uprv_memset(result, 0, *length);
    800 
    801         /* set the tailoring-specific values */
    802         UCATableHeader *myData = (UCATableHeader *)result;
    803         myData->size = *length;
    804 
    805         /* offset for the options, the only part of the data that is present after the header */
    806         myData->options = sizeof(UCATableHeader);
    807 
    808         /* need to always set the expansion value for an upper bound of the options */
    809         myData->expansion = myData->options + sizeof(UColOptionSet);
    810 
    811         myData->magic = UCOL_HEADER_MAGIC;
    812         myData->isBigEndian = U_IS_BIG_ENDIAN;
    813         myData->charSetFamily = U_CHARSET_FAMILY;
    814 
    815         /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    816         uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    817 
    818         uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    819         uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    820         uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    821         myData->jamoSpecial = coll->image->jamoSpecial;
    822 
    823         /* copy the collator options */
    824         uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    825     }
    826     return result;
    827 }
    828 
    829 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    830     if(U_FAILURE(*status)) {
    831         return;
    832     }
    833     result->caseFirst = (UColAttributeValue)opts->caseFirst;
    834     result->caseLevel = (UColAttributeValue)opts->caseLevel;
    835     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
    836     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
    837     if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
    838         return;
    839     }
    840     result->strength = (UColAttributeValue)opts->strength;
    841     result->variableTopValue = opts->variableTopValue;
    842     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
    843     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
    844     result->numericCollation = (UColAttributeValue)opts->numericCollation;
    845     result->caseFirstisDefault = TRUE;
    846     result->caseLevelisDefault = TRUE;
    847     result->frenchCollationisDefault = TRUE;
    848     result->normalizationModeisDefault = TRUE;
    849     result->strengthisDefault = TRUE;
    850     result->variableTopValueisDefault = TRUE;
    851     result->alternateHandlingisDefault = TRUE;
    852     result->hiraganaQisDefault = TRUE;
    853     result->numericCollationisDefault = TRUE;
    854 
    855     ucol_updateInternalState(result, status);
    856 
    857     result->options = opts;
    858 }
    859 
    860 
    861 /**
    862 * Approximate determination if a character is at a contraction end.
    863 * Guaranteed to be TRUE if a character is at the end of a contraction,
    864 * otherwise it is not deterministic.
    865 * @param c character to be determined
    866 * @param coll collator
    867 */
    868 static
    869 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
    870     if (c < coll->minContrEndCP) {
    871         return FALSE;
    872     }
    873 
    874     int32_t  hash = c;
    875     uint8_t  htbyte;
    876     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
    877         if (U16_IS_TRAIL(c)) {
    878             return TRUE;
    879         }
    880         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
    881     }
    882     htbyte = coll->contrEndCP[hash>>3];
    883     return (((htbyte >> (hash & 7)) & 1) == 1);
    884 }
    885 
    886 
    887 
    888 /*
    889 *   i_getCombiningClass()
    890 *        A fast, at least partly inline version of u_getCombiningClass()
    891 *        This is a candidate for further optimization.  Used heavily
    892 *        in contraction processing.
    893 */
    894 static
    895 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
    896     uint8_t sCC = 0;
    897     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
    898         sCC = u_getCombiningClass(c);
    899     }
    900     return sCC;
    901 }
    902 
    903 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    904     UChar c;
    905     UCollator *result = fillIn;
    906     if(U_FAILURE(*status) || image == NULL) {
    907         return NULL;
    908     }
    909 
    910     if(result == NULL) {
    911         result = (UCollator *)uprv_malloc(sizeof(UCollator));
    912         if(result == NULL) {
    913             *status = U_MEMORY_ALLOCATION_ERROR;
    914             return result;
    915         }
    916         result->freeOnClose = TRUE;
    917     } else {
    918         result->freeOnClose = FALSE;
    919     }
    920 
    921     result->delegate = NULL;
    922 
    923     result->image = image;
    924     result->mapping.getFoldingOffset = _getFoldingOffset;
    925     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
    926     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
    927     if(U_FAILURE(*status)) {
    928         if(result->freeOnClose == TRUE) {
    929             uprv_free(result);
    930             result = NULL;
    931         }
    932         return result;
    933     }
    934 
    935     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    936     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
    937     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
    938     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
    939     result->rules = NULL;
    940     result->rulesLength = 0;
    941     result->freeRulesOnClose = FALSE;
    942     result->defaultReorderCodes = NULL;
    943     result->defaultReorderCodesLength = 0;
    944     result->freeDefaultReorderCodesOnClose = FALSE;
    945     result->reorderCodes = NULL;
    946     result->reorderCodesLength = 0;
    947     result->freeReorderCodesOnClose = FALSE;
    948     result->leadBytePermutationTable = NULL;
    949     result->freeLeadBytePermutationTableOnClose = FALSE;
    950 
    951     /* get the version info from UCATableHeader and populate the Collator struct*/
    952     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
    953     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
    954     result->dataVersion[2] = 0;
    955     result->dataVersion[3] = 0;
    956 
    957     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
    958     result->minUnsafeCP = 0;
    959     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
    960         if (ucol_unsafeCP(c, result)) break;
    961     }
    962     result->minUnsafeCP = c;
    963 
    964     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
    965     result->minContrEndCP = 0;
    966     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
    967         if (ucol_contractionEndCP(c, result)) break;
    968     }
    969     result->minContrEndCP = c;
    970 
    971     /* max expansion tables */
    972     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
    973                                          result->image->endExpansionCE);
    974     result->lastEndExpansionCE = result->endExpansionCE +
    975                                  result->image->endExpansionCECount - 1;
    976     result->expansionCESize = (uint8_t*)result->image +
    977                                                result->image->expansionCESize;
    978 
    979 
    980     //result->errorCode = *status;
    981 
    982     result->latinOneCEs = NULL;
    983 
    984     result->latinOneRegenTable = FALSE;
    985     result->latinOneFailed = FALSE;
    986     result->UCA = UCA;
    987 
    988     /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    989     result->ucaRules = NULL;
    990     result->actualLocale = NULL;
    991     result->validLocale = NULL;
    992     result->requestedLocale = NULL;
    993     result->hasRealData = FALSE; // real data lives in .dat file...
    994     result->freeImageOnClose = FALSE;
    995 
    996     /* set attributes */
    997     ucol_setOptionsFromHeader(
    998         result,
    999         (UColOptionSet*)((uint8_t*)result->image+result->image->options),
   1000         status);
   1001     result->freeOptionsOnClose = FALSE;
   1002 
   1003     return result;
   1004 }
   1005 
   1006 /* new Mark's code */
   1007 
   1008 /**
   1009  * For generation of Implicit CEs
   1010  * @author Davis
   1011  *
   1012  * Cleaned up so that changes can be made more easily.
   1013  * Old values:
   1014 # First Implicit: E26A792D
   1015 # Last Implicit: E3DC70C0
   1016 # First CJK: E0030300
   1017 # Last CJK: E0A9DD00
   1018 # First CJK_A: E0A9DF00
   1019 # Last CJK_A: E0DE3100
   1020  */
   1021 /* Following is a port of Mark's code for new treatment of implicits.
    1022  * It is positioned here, since ucol_initUCA needs to initialize the
   1023  * variables below according to the data in the fractional UCA.
   1024  */
   1025 
   1026 /**
   1027  * Function used to:
   1028  * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
   1029  * b) bump any non-CJK characters by 10FFFF.
   1030  * The relevant blocks are:
   1031  * A:    4E00..9FFF; CJK Unified Ideographs
   1032  *       F900..FAFF; CJK Compatibility Ideographs
   1033  * B:    3400..4DBF; CJK Unified Ideographs Extension A
   1034  *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
   1035  * As long as
   1036  *   no new B characters are allocated between 4E00 and FAFF, and
   1037  *   no new A characters are outside of this range,
   1038  * (very high probability) this simple code will work.
   1039  * The reordered blocks are:
   1040  * Block1 is CJK
   1041  * Block2 is CJK_COMPAT_USED
   1042  * Block3 is CJK_A
   1043  * (all contiguous)
   1044  * Any other CJK gets its normal code point
   1045  * Any non-CJK gets +10FFFF
   1046  * When we reorder Block1, we make sure that it is at the very start,
   1047  * so that it will use a 3-byte form.
    1048  * Warning: we only pick up the compatibility characters that are
   1049  * NOT decomposed, so that block is smaller!
   1050  */
   1051 
   1052 // CONSTANTS
   1053 static const UChar32
   1054     NON_CJK_OFFSET = 0x110000,
   1055     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
   1056 
   1057 /**
   1058  * Precomputed by initImplicitConstants()
   1059  */
   1060 static int32_t
   1061     final3Multiplier = 0,
   1062     final4Multiplier = 0,
   1063     final3Count = 0,
   1064     final4Count = 0,
   1065     medialCount = 0,
   1066     min3Primary = 0,
   1067     min4Primary = 0,
   1068     max4Primary = 0,
   1069     minTrail = 0,
   1070     maxTrail = 0,
   1071     max3Trail = 0,
   1072     max4Trail = 0,
   1073     min4Boundary = 0;
   1074 
   1075 static const UChar32
   1076     // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
   1077     // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;  (Unicode 6.1)
   1078     CJK_BASE = 0x4E00,
   1079     CJK_LIMIT = 0x9FCC+1,
   1080     // Unified CJK ideographs in the compatibility ideographs block.
   1081     CJK_COMPAT_USED_BASE = 0xFA0E,
   1082     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
   1083     // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
   1084     // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
   1085     CJK_A_BASE = 0x3400,
   1086     CJK_A_LIMIT = 0x4DB5+1,
   1087     // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
   1088     // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
   1089     CJK_B_BASE = 0x20000,
   1090     CJK_B_LIMIT = 0x2A6D6+1,
   1091     // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
   1092     // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
   1093     CJK_C_BASE = 0x2A700,
   1094     CJK_C_LIMIT = 0x2B734+1,
   1095     // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
   1096     // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
   1097     CJK_D_BASE = 0x2B740,
   1098     CJK_D_LIMIT = 0x2B81D+1;
   1099     // when adding to this list, look for all occurrences (in project)
   1100     // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
   1101 
   1102 static UChar32 swapCJK(UChar32 i) {
   1103     if (i < CJK_A_BASE) {
   1104         // non-CJK
   1105     } else if (i < CJK_A_LIMIT) {
   1106         // Extension A has lower code points than the original Unihan+compat
   1107         // but sorts higher.
   1108         return i - CJK_A_BASE
   1109                 + (CJK_LIMIT - CJK_BASE)
   1110                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1111     } else if (i < CJK_BASE) {
   1112         // non-CJK
   1113     } else if (i < CJK_LIMIT) {
   1114         return i - CJK_BASE;
   1115     } else if (i < CJK_COMPAT_USED_BASE) {
   1116         // non-CJK
   1117     } else if (i < CJK_COMPAT_USED_LIMIT) {
   1118         return i - CJK_COMPAT_USED_BASE
   1119                 + (CJK_LIMIT - CJK_BASE);
   1120     } else if (i < CJK_B_BASE) {
   1121         // non-CJK
   1122     } else if (i < CJK_B_LIMIT) {
   1123         return i; // non-BMP-CJK
   1124     } else if (i < CJK_C_BASE) {
   1125         // non-CJK
   1126     } else if (i < CJK_C_LIMIT) {
   1127         return i; // non-BMP-CJK
   1128     } else if (i < CJK_D_BASE) {
   1129         // non-CJK
   1130     } else if (i < CJK_D_LIMIT) {
   1131         return i; // non-BMP-CJK
   1132     }
   1133     return i + NON_CJK_OFFSET; // non-CJK
   1134 }
   1135 
   1136 U_CAPI UChar32 U_EXPORT2
   1137 uprv_uca_getRawFromCodePoint(UChar32 i) {
   1138     return swapCJK(i)+1;
   1139 }
   1140 
   1141 U_CAPI UChar32 U_EXPORT2
   1142 uprv_uca_getCodePointFromRaw(UChar32 i) {
   1143     i--;
   1144     UChar32 result = 0;
   1145     if(i >= NON_CJK_OFFSET) {
   1146         result = i - NON_CJK_OFFSET;
   1147     } else if(i >= CJK_B_BASE) {
   1148         result = i;
   1149     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
   1150         if(i < CJK_LIMIT - CJK_BASE) {
   1151             result = i + CJK_BASE;
   1152         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
   1153             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
   1154         } else {
   1155             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1156         }
   1157     } else {
   1158         result = -1;
   1159     }
   1160     return result;
   1161 }
   1162 
   1163 // GET IMPLICIT PRIMARY WEIGHTS
   1164 // Return value is left justified primary key
   1165 U_CAPI uint32_t U_EXPORT2
   1166 uprv_uca_getImplicitFromRaw(UChar32 cp) {
   1167     /*
   1168     if (cp < 0 || cp > UCOL_MAX_INPUT) {
   1169         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
   1170     }
   1171     */
   1172     int32_t last0 = cp - min4Boundary;
   1173     if (last0 < 0) {
   1174         int32_t last1 = cp / final3Count;
   1175         last0 = cp % final3Count;
   1176 
   1177         int32_t last2 = last1 / medialCount;
   1178         last1 %= medialCount;
   1179 
   1180         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
   1181         last1 = minTrail + last1; // offset
   1182         last2 = min3Primary + last2; // offset
   1183         /*
   1184         if (last2 >= min4Primary) {
   1185             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
   1186         }
   1187         */
   1188         return (last2 << 24) + (last1 << 16) + (last0 << 8);
   1189     } else {
   1190         int32_t last1 = last0 / final4Count;
   1191         last0 %= final4Count;
   1192 
   1193         int32_t last2 = last1 / medialCount;
   1194         last1 %= medialCount;
   1195 
   1196         int32_t last3 = last2 / medialCount;
   1197         last2 %= medialCount;
   1198 
   1199         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
   1200         last1 = minTrail + last1; // offset
   1201         last2 = minTrail + last2; // offset
   1202         last3 = min4Primary + last3; // offset
   1203         /*
   1204         if (last3 > max4Primary) {
   1205             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
   1206         }
   1207         */
   1208         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
   1209     }
   1210 }
   1211 
   1212 static uint32_t U_EXPORT2
   1213 uprv_uca_getImplicitPrimary(UChar32 cp) {
   1214    //fprintf(stdout, "Incoming: %04x\n", cp);
   1215     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
   1216 
   1217     cp = swapCJK(cp);
   1218     cp++;
   1219     // we now have a range of numbers from 0 to 21FFFF.
   1220 
   1221     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
   1222     //fprintf(stdout, "CJK swapped: %04x\n", cp);
   1223 
   1224     return uprv_uca_getImplicitFromRaw(cp);
   1225 }
   1226 
   1227 /**
   1228  * Converts implicit CE into raw integer ("code point")
   1229  * @param implicit
   1230  * @return -1 if illegal format
   1231  */
   1232 U_CAPI UChar32 U_EXPORT2
   1233 uprv_uca_getRawFromImplicit(uint32_t implicit) {
   1234     UChar32 result;
   1235     UChar32 b3 = implicit & 0xFF;
   1236     UChar32 b2 = (implicit >> 8) & 0xFF;
   1237     UChar32 b1 = (implicit >> 16) & 0xFF;
   1238     UChar32 b0 = (implicit >> 24) & 0xFF;
   1239 
   1240     // simple parameter checks
   1241     if (b0 < min3Primary || b0 > max4Primary
   1242         || b1 < minTrail || b1 > maxTrail)
   1243         return -1;
   1244     // normal offsets
   1245     b1 -= minTrail;
   1246 
   1247     // take care of the final values, and compose
   1248     if (b0 < min4Primary) {
   1249         if (b2 < minTrail || b2 > max3Trail || b3 != 0)
   1250             return -1;
   1251         b2 -= minTrail;
   1252         UChar32 remainder = b2 % final3Multiplier;
   1253         if (remainder != 0)
   1254             return -1;
   1255         b0 -= min3Primary;
   1256         b2 /= final3Multiplier;
   1257         result = ((b0 * medialCount) + b1) * final3Count + b2;
   1258     } else {
   1259         if (b2 < minTrail || b2 > maxTrail
   1260             || b3 < minTrail || b3 > max4Trail)
   1261             return -1;
   1262         b2 -= minTrail;
   1263         b3 -= minTrail;
   1264         UChar32 remainder = b3 % final4Multiplier;
   1265         if (remainder != 0)
   1266             return -1;
   1267         b3 /= final4Multiplier;
   1268         b0 -= min4Primary;
   1269         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
   1270     }
   1271     // final check
   1272     if (result < 0 || result > UCOL_MAX_INPUT)
   1273         return -1;
   1274     return result;
   1275 }
   1276 
   1277 
   1278 static inline int32_t divideAndRoundUp(int a, int b) {
   1279     return 1 + (a-1)/b;
   1280 }
   1281 
   1282 /* this function is either called from initUCA or from genUCA before
   1283  * doing canonical closure for the UCA.
   1284  */
   1285 
   1286 /**
   1287  * Set up to generate implicits.
   1288  * Maintenance Note:  this function may end up being called more than once, due
   1289  *                    to threading races during initialization.  Make sure that
   1290  *                    none of the Constants is ever transiently assigned an
   1291  *                    incorrect value.
   1292  * @param minPrimary
   1293  * @param maxPrimary
   1294  * @param minTrail final byte
   1295  * @param maxTrail final byte
   1296  * @param gap3 the gap we leave for tailoring for 3-byte forms
   1297  * @param gap4 the gap we leave for tailoring for 4-byte forms
   1298  */
   1299 static void initImplicitConstants(int minPrimary, int maxPrimary,
   1300                                     int minTrailIn, int maxTrailIn,
   1301                                     int gap3, int primaries3count,
   1302                                     UErrorCode *status) {
   1303     // some simple parameter checks
   1304     if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
   1305         || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
   1306         || (primaries3count < 1))
   1307     {
   1308         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1309         return;
   1310     };
   1311 
   1312     minTrail = minTrailIn;
   1313     maxTrail = maxTrailIn;
   1314 
   1315     min3Primary = minPrimary;
   1316     max4Primary = maxPrimary;
   1317     // compute constants for use later.
   1318     // number of values we can use in trailing bytes
   1319     // leave room for empty values between AND above, e.g. if gap = 2
   1320     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
   1321     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
   1322     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
   1323     final3Multiplier = gap3 + 1;
   1324     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
   1325     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
   1326 
   1327     // medials can use full range
   1328     medialCount = (maxTrail - minTrail + 1);
   1329     // find out how many values fit in each form
   1330     int32_t threeByteCount = medialCount * final3Count;
   1331     // now determine where the 3/4 boundary is.
   1332     // we use 3 bytes below the boundary, and 4 above
   1333     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
   1334     int32_t primaries4count = primariesAvailable - primaries3count;
   1335 
   1336 
   1337     int32_t min3ByteCoverage = primaries3count * threeByteCount;
   1338     min4Primary = minPrimary + primaries3count;
   1339     min4Boundary = min3ByteCoverage;
   1340     // Now expand out the multiplier for the 4 bytes, and redo.
   1341 
   1342     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
   1343     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
   1344     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
   1345     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
   1346     if (gap4 < 1) {
   1347         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1348         return;
   1349     }
   1350     final4Multiplier = gap4 + 1;
   1351     final4Count = neededPerFinalByte;
   1352     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
   1353 }
   1354 
   1355     /**
   1356      * Supply parameters for generating implicit CEs
   1357      */
   1358 U_CAPI void U_EXPORT2
   1359 uprv_uca_initImplicitConstants(UErrorCode *status) {
   1360     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
   1361     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
   1362     initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
   1363 }
   1364 
   1365 
   1366 /*    collIterNormalize     Incremental Normalization happens here.                       */
   1367 /*                          pick up the range of chars identifed by FCD,                  */
   1368 /*                          normalize it into the collIterate's writable buffer,          */
   1369 /*                          switch the collIterate's state to use the writable buffer.    */
   1370 /*                                                                                        */
   1371 static
   1372 void collIterNormalize(collIterate *collationSource)
   1373 {
   1374     UErrorCode  status = U_ZERO_ERROR;
   1375     const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
   1376     const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
   1377 
   1378     collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
   1379                                     collationSource->writableBuffer,
   1380                                     status);
   1381     if (U_FAILURE(status)) {
   1382 #ifdef UCOL_DEBUG
   1383         fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
   1384 #endif
   1385         return;
   1386     }
   1387 
   1388     collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
   1389     collationSource->origFlags  = collationSource->flags;
   1390     collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1391     collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1392 }
   1393 
   1394 
   1395 // This function takes the iterator and extracts normalized stuff up to the next boundary
   1396 // It is similar in the end results to the collIterNormalize, but for the cases when we
   1397 // use an iterator
   1398 /*static
   1399 inline void normalizeIterator(collIterate *collationSource) {
   1400   UErrorCode status = U_ZERO_ERROR;
   1401   UBool wasNormalized = FALSE;
   1402   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
   1403   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
   1404   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1405     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1406   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
   1407     // reallocate and terminate
   1408     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
   1409                                &collationSource->writableBuffer,
   1410                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
   1411                                0)
   1412     ) {
   1413     #ifdef UCOL_DEBUG
   1414         fprintf(stderr, "normalizeIterator(), out of memory\n");
   1415     #endif
   1416         return;
   1417     }
   1418     status = U_ZERO_ERROR;
   1419     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
   1420     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
   1421     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1422     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1423   }
   1424   // Terminate the buffer - we already checked that it is big enough
   1425   collationSource->writableBuffer[normLen] = 0;
   1426   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
   1427       collationSource->flags |= UCOL_ITER_ALLOCATED;
   1428   }
   1429   collationSource->pos        = collationSource->writableBuffer;
   1430   collationSource->origFlags  = collationSource->flags;
   1431   collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1432   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1433 }*/
   1434 
   1435 
   1436 /* Incremental FCD check and normalize                                                    */
   1437 /*   Called from getNextCE when normalization state is suspect.                           */
   1438 /*   When entering, the state is known to be this:                                        */
   1439 /*      o   We are working in the main buffer of the collIterate, not the side            */
   1440 /*          writable buffer.  When in the side buffer, normalization mode is always off,  */
   1441 /*          so we won't get here.                                                         */
   1442 /*      o   The leading combining class from the current character is 0 or                */
   1443 /*          the trailing combining class of the previous char was zero.                   */
   1444 /*          True because the previous call to this function will have always exited       */
   1445 /*          that way, and we get called for every char where cc might be non-zero.        */
   1446 static
   1447 inline UBool collIterFCD(collIterate *collationSource) {
   1448     const UChar *srcP, *endP;
   1449     uint8_t     leadingCC;
   1450     uint8_t     prevTrailingCC = 0;
   1451     uint16_t    fcd;
   1452     UBool       needNormalize = FALSE;
   1453 
   1454     srcP = collationSource->pos-1;
   1455 
   1456     if (collationSource->flags & UCOL_ITER_HASLEN) {
   1457         endP = collationSource->endp;
   1458     } else {
   1459         endP = NULL;
   1460     }
   1461 
   1462     // Get the trailing combining class of the current character. If it's zero, we are OK.
   1463     fcd = g_nfcImpl->nextFCD16(srcP, endP);
   1464     if (fcd != 0) {
   1465         prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1466 
   1467         if (prevTrailingCC != 0) {
   1468             // The current char has a non-zero trailing CC.  Scan forward until we find
   1469             //   a char with a leading cc of zero.
   1470             while (endP == NULL || srcP != endP)
   1471             {
   1472                 const UChar *savedSrcP = srcP;
   1473 
   1474                 fcd = g_nfcImpl->nextFCD16(srcP, endP);
   1475                 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1476                 if (leadingCC == 0) {
   1477                     srcP = savedSrcP;      // Hit char that is not part of combining sequence.
   1478                                            //   back up over it.  (Could be surrogate pair!)
   1479                     break;
   1480                 }
   1481 
   1482                 if (leadingCC < prevTrailingCC) {
   1483                     needNormalize = TRUE;
   1484                 }
   1485 
   1486                 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1487             }
   1488         }
   1489     }
   1490 
   1491     collationSource->fcdPosition = (UChar *)srcP;
   1492 
   1493     return needNormalize;
   1494 }
   1495 
   1496 /****************************************************************************/
   1497 /* Following are the CE retrieval functions                                 */
   1498 /*                                                                          */
   1499 /****************************************************************************/
   1500 
   1501 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
   1502 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
   1503 
   1504 /* there should be a macro version of this function in the header file */
   1505 /* This is the first function that tries to fetch a collation element  */
   1506 /* If it's not succesfull or it encounters a more difficult situation  */
   1507 /* some more sofisticated and slower functions are invoked             */
   1508 static
   1509 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1510     uint32_t order = 0;
   1511     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
   1512         order = *(collationSource->toReturn++);                         /* if so, return them */
   1513         if(collationSource->CEpos == collationSource->toReturn) {
   1514             collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
   1515         }
   1516         return order;
   1517     }
   1518 
   1519     UChar ch = 0;
   1520     collationSource->offsetReturn = NULL;
   1521 
   1522     do {
   1523         for (;;)                           /* Loop handles case when incremental normalize switches   */
   1524         {                                  /*   to or from the side buffer / original string, and we  */
   1525             /*   need to start again to get the next character.        */
   1526 
   1527             if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
   1528             {
   1529                 // The source string is null terminated and we're not working from the side buffer,
   1530                 //   and we're not normalizing.  This is the fast path.
   1531                 //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
   1532                 ch = *collationSource->pos++;
   1533                 if (ch != 0) {
   1534                     break;
   1535                 }
   1536                 else {
   1537                     return UCOL_NO_MORE_CES;
   1538                 }
   1539             }
   1540 
   1541             if (collationSource->flags & UCOL_ITER_HASLEN) {
   1542                 // Normal path for strings when length is specified.
   1543                 //   (We can't be in side buffer because it is always null terminated.)
   1544                 if (collationSource->pos >= collationSource->endp) {
   1545                     // Ran off of the end of the main source string.  We're done.
   1546                     return UCOL_NO_MORE_CES;
   1547                 }
   1548                 ch = *collationSource->pos++;
   1549             }
   1550             else if(collationSource->flags & UCOL_USE_ITERATOR) {
   1551                 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
   1552                 if(iterCh == U_SENTINEL) {
   1553                     return UCOL_NO_MORE_CES;
   1554                 }
   1555                 ch = (UChar)iterCh;
   1556             }
   1557             else
   1558             {
   1559                 // Null terminated string.
   1560                 ch = *collationSource->pos++;
   1561                 if (ch == 0) {
   1562                     // Ran off end of buffer.
   1563                     if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1564                         // Ran off end of main string. backing up one character.
   1565                         collationSource->pos--;
   1566                         return UCOL_NO_MORE_CES;
   1567                     }
   1568                     else
   1569                     {
   1570                         // Hit null in the normalize side buffer.
   1571                         // Usually this means the end of the normalized data,
   1572                         // except for one odd case: a null followed by combining chars,
   1573                         //   which is the case if we are at the start of the buffer.
   1574                         if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
   1575                             break;
   1576                         }
   1577 
   1578                         //  Null marked end of side buffer.
   1579                         //   Revert to the main string and
   1580                         //   loop back to top to try again to get a character.
   1581                         collationSource->pos   = collationSource->fcdPosition;
   1582                         collationSource->flags = collationSource->origFlags;
   1583                         continue;
   1584                     }
   1585                 }
   1586             }
   1587 
   1588             if(collationSource->flags&UCOL_HIRAGANA_Q) {
   1589                 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
   1590                  * based on whether the previous codepoint was Hiragana or Katakana.
   1591                  */
   1592                 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
   1593                         ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
   1594                     collationSource->flags |= UCOL_WAS_HIRAGANA;
   1595                 } else {
   1596                     collationSource->flags &= ~UCOL_WAS_HIRAGANA;
   1597                 }
   1598             }
   1599 
   1600             // We've got a character.  See if there's any fcd and/or normalization stuff to do.
   1601             //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
   1602             if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
   1603                 break;
   1604             }
   1605 
   1606             if (collationSource->fcdPosition >= collationSource->pos) {
   1607                 // An earlier FCD check has already covered the current character.
   1608                 // We can go ahead and process this char.
   1609                 break;
   1610             }
   1611 
   1612             if (ch < ZERO_CC_LIMIT_ ) {
   1613                 // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
   1614                 break;
   1615             }
   1616 
   1617             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1618                 // We need to peek at the next character in order to tell if we are FCD
   1619                 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
   1620                     // We are at the last char of source string.
   1621                     //  It is always OK for FCD check.
   1622                     break;
   1623                 }
   1624 
   1625                 // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
   1626                 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1627                     break;
   1628                 }
   1629             }
   1630 
   1631 
   1632             // Need a more complete FCD check and possible normalization.
   1633             if (collIterFCD(collationSource)) {
   1634                 collIterNormalize(collationSource);
   1635             }
   1636             if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1637                 //  No normalization was needed.  Go ahead and process the char we already had.
   1638                 break;
   1639             }
   1640 
   1641             // Some normalization happened.  Next loop iteration will pick up a char
   1642             //   from the normalization buffer.
   1643 
   1644         }   // end for (;;)
   1645 
   1646 
   1647         if (ch <= 0xFF) {
   1648             /*  For latin-1 characters we never need to fall back to the UCA table        */
   1649             /*    because all of the UCA data is replicated in the latinOneMapping array  */
   1650             order = coll->latinOneMapping[ch];
   1651             if (order > UCOL_NOT_FOUND) {
   1652                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
   1653             }
   1654         }
   1655         else
   1656         {
   1657             // Always use UCA for Han, Hangul
   1658             // (Han extension A is before main Han block)
   1659             // **** Han compatibility chars ?? ****
   1660             if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   1661                 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
   1662                 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
   1663                     // between the two target ranges; do normal lookup
   1664                     // **** this range is YI, Modifier tone letters, ****
   1665                     // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
   1666                     // **** Latin-D might be tailored, so we need to ****
   1667                     // **** do the normal lookup for these guys.     ****
   1668                     order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1669                 } else {
   1670                     // in one of the target ranges; use UCA
   1671                     order = UCOL_NOT_FOUND;
   1672                 }
   1673             } else {
   1674                 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1675             }
   1676 
   1677             if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
   1678                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
   1679             }
   1680 
   1681             if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
   1682                 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
   1683                 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   1684 
   1685                 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
   1686                     order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
   1687                 }
   1688             }
   1689         }
   1690     } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
   1691 
   1692     if(order == UCOL_NOT_FOUND) {
   1693         order = getImplicit(ch, collationSource);
   1694     }
   1695     return order; /* return the CE */
   1696 }
   1697 
   1698 /* ucol_getNextCE, out-of-line version for use from other files.   */
   1699 U_CAPI uint32_t  U_EXPORT2
   1700 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1701     return ucol_IGetNextCE(coll, collationSource, status);
   1702 }
   1703 
   1704 
   1705 /**
   1706 * Incremental previous normalization happens here. Pick up the range of chars
   1707 * identifed by FCD, normalize it into the collIterate's writable buffer,
   1708 * switch the collIterate's state to use the writable buffer.
   1709 * @param data collation iterator data
   1710 */
   1711 static
   1712 void collPrevIterNormalize(collIterate *data)
   1713 {
   1714     UErrorCode status  = U_ZERO_ERROR;
   1715     const UChar *pEnd   = data->pos;  /* End normalize + 1 */
   1716     const UChar *pStart;
   1717 
   1718     /* Start normalize */
   1719     if (data->fcdPosition == NULL) {
   1720         pStart = data->string;
   1721     }
   1722     else {
   1723         pStart = data->fcdPosition + 1;
   1724     }
   1725 
   1726     int32_t normLen =
   1727         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
   1728                              data->writableBuffer,
   1729                              status).
   1730         length();
   1731     if(U_FAILURE(status)) {
   1732         return;
   1733     }
   1734     /*
   1735     this puts the null termination infront of the normalized string instead
   1736     of the end
   1737     */
   1738     data->writableBuffer.insert(0, (UChar)0);
   1739 
   1740     /*
   1741      * The usual case at this point is that we've got a base
   1742      * character followed by marks that were normalized. If
   1743      * fcdPosition is NULL, that means that we backed up to
   1744      * the beginning of the string and there's no base character.
   1745      *
   1746      * Forward processing will usually normalize when it sees
   1747      * the first mark, so that mark will get it's natural offset
   1748      * and the rest will get the offset of the character following
   1749      * the marks. The base character will also get its natural offset.
   1750      *
   1751      * We write the offset of the base character, if there is one,
   1752      * followed by the offset of the first mark and then the offsets
   1753      * of the rest of the marks.
   1754      */
   1755     int32_t firstMarkOffset = 0;
   1756     int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
   1757     int32_t trailCount      = normLen - 1;
   1758 
   1759     if (data->fcdPosition != NULL) {
   1760         int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
   1761         UChar   baseChar   = *data->fcdPosition;
   1762 
   1763         firstMarkOffset = baseOffset + 1;
   1764 
   1765         /*
   1766          * If the base character is the start of a contraction, forward processing
   1767          * will normalize the marks while checking for the contraction, which means
   1768          * that the offset of the first mark will the same as the other marks.
   1769          *
   1770          * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
   1771          */
   1772         if (baseChar >= 0x100) {
   1773             uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
   1774 
   1775             if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
   1776                 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
   1777             }
   1778 
   1779             if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
   1780                 firstMarkOffset = trailOffset;
   1781             }
   1782         }
   1783 
   1784         data->appendOffset(baseOffset, status);
   1785     }
   1786 
   1787     data->appendOffset(firstMarkOffset, status);
   1788 
   1789     for (int32_t i = 0; i < trailCount; i += 1) {
   1790         data->appendOffset(trailOffset, status);
   1791     }
   1792 
   1793     data->offsetRepeatValue = trailOffset;
   1794 
   1795     data->offsetReturn = data->offsetStore - 1;
   1796     if (data->offsetReturn == data->offsetBuffer) {
   1797         data->offsetStore = data->offsetBuffer;
   1798     }
   1799 
   1800     data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
   1801     data->origFlags  = data->flags;
   1802     data->flags     |= UCOL_ITER_INNORMBUF;
   1803     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   1804 }
   1805 
   1806 
   1807 /**
   1808 * Incremental FCD check for previous iteration and normalize. Called from
   1809 * getPrevCE when normalization state is suspect.
   1810 * When entering, the state is known to be this:
   1811 * o  We are working in the main buffer of the collIterate, not the side
   1812 *    writable buffer. When in the side buffer, normalization mode is always
   1813 *    off, so we won't get here.
   1814 * o  The leading combining class from the current character is 0 or the
   1815 *    trailing combining class of the previous char was zero.
   1816 *    True because the previous call to this function will have always exited
   1817 *    that way, and we get called for every char where cc might be non-zero.
   1818 * @param data collation iterate struct
   1819 * @return normalization status, TRUE for normalization to be done, FALSE
   1820 *         otherwise
   1821 */
   1822 static
   1823 inline UBool collPrevIterFCD(collIterate *data)
   1824 {
   1825     const UChar *src, *start;
   1826     uint8_t     leadingCC;
   1827     uint8_t     trailingCC = 0;
   1828     uint16_t    fcd;
   1829     UBool       result = FALSE;
   1830 
   1831     start = data->string;
   1832     src = data->pos + 1;
   1833 
   1834     /* Get the trailing combining class of the current character. */
   1835     fcd = g_nfcImpl->previousFCD16(start, src);
   1836 
   1837     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1838 
   1839     if (leadingCC != 0) {
   1840         /*
   1841         The current char has a non-zero leading combining class.
   1842         Scan backward until we find a char with a trailing cc of zero.
   1843         */
   1844         for (;;)
   1845         {
   1846             if (start == src) {
   1847                 data->fcdPosition = NULL;
   1848                 return result;
   1849             }
   1850 
   1851             fcd = g_nfcImpl->previousFCD16(start, src);
   1852 
   1853             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1854 
   1855             if (trailingCC == 0) {
   1856                 break;
   1857             }
   1858 
   1859             if (leadingCC < trailingCC) {
   1860                 result = TRUE;
   1861             }
   1862 
   1863             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1864         }
   1865     }
   1866 
   1867     data->fcdPosition = (UChar *)src;
   1868 
   1869     return result;
   1870 }
   1871 
   1872 /** gets a code unit from the string at a given offset
   1873  *  Handles both normal and iterative cases.
   1874  *  No error checking - caller beware!
   1875  */
   1876 static inline
   1877 UChar peekCodeUnit(collIterate *source, int32_t offset) {
   1878     if(source->pos != NULL) {
   1879         return *(source->pos + offset);
   1880     } else if(source->iterator != NULL) {
   1881         UChar32 c;
   1882         if(offset != 0) {
   1883             source->iterator->move(source->iterator, offset, UITER_CURRENT);
   1884             c = source->iterator->next(source->iterator);
   1885             source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
   1886         } else {
   1887             c = source->iterator->current(source->iterator);
   1888         }
   1889         return c >= 0 ? (UChar)c : 0xfffd;  // If the caller works properly, we should never see c<0.
   1890     } else {
   1891         return 0xfffd;
   1892     }
   1893 }
   1894 
   1895 // Code point version. Treats the offset as a _code point_ delta.
   1896 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
   1897 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
   1898 static inline
   1899 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
   1900     UChar32 c;
   1901     if(source->pos != NULL) {
   1902         const UChar *p = source->pos;
   1903         if(offset >= 0) {
   1904             // Skip forward over (offset-1) code points.
   1905             while(--offset >= 0) {
   1906                 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
   1907                     ++p;
   1908                 }
   1909             }
   1910             // Read the code point there.
   1911             c = *p++;
   1912             UChar trail;
   1913             if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
   1914                 c = U16_GET_SUPPLEMENTARY(c, trail);
   1915             }
   1916         } else /* offset<0 */ {
   1917             // Skip backward over (offset-1) code points.
   1918             while(++offset < 0) {
   1919                 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
   1920                     --p;
   1921                 }
   1922             }
   1923             // Read the code point before that.
   1924             c = *--p;
   1925             UChar lead;
   1926             if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
   1927                 c = U16_GET_SUPPLEMENTARY(lead, c);
   1928             }
   1929         }
   1930     } else if(source->iterator != NULL) {
   1931         if(offset >= 0) {
   1932             // Skip forward over (offset-1) code points.
   1933             int32_t fwd = offset;
   1934             while(fwd-- > 0) {
   1935                 uiter_next32(source->iterator);
   1936             }
   1937             // Read the code point there.
   1938             c = uiter_current32(source->iterator);
   1939             // Return to the starting point, skipping backward over (offset-1) code points.
   1940             while(offset-- > 0) {
   1941                 uiter_previous32(source->iterator);
   1942             }
   1943         } else /* offset<0 */ {
   1944             // Read backward, reading offset code points, remember only the last-read one.
   1945             int32_t back = offset;
   1946             do {
   1947                 c = uiter_previous32(source->iterator);
   1948             } while(++back < 0);
   1949             // Return to the starting position, skipping forward over offset code points.
   1950             do {
   1951                 uiter_next32(source->iterator);
   1952             } while(++offset < 0);
   1953         }
   1954     } else {
   1955         c = U_SENTINEL;
   1956     }
   1957     return c;
   1958 }
   1959 
   1960 /**
   1961 * Determines if we are at the start of the data string in the backwards
   1962 * collation iterator
   1963 * @param data collation iterator
   1964 * @return TRUE if we are at the start
   1965 */
   1966 static
   1967 inline UBool isAtStartPrevIterate(collIterate *data) {
   1968     if(data->pos == NULL && data->iterator != NULL) {
   1969         return !data->iterator->hasPrevious(data->iterator);
   1970     }
   1971     //return (collIter_bos(data)) ||
   1972     return (data->pos == data->string) ||
   1973               ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) &&
   1974               *(data->pos - 1) == 0 && data->fcdPosition == NULL);
   1975 }
   1976 
   1977 static
   1978 inline void goBackOne(collIterate *data) {
   1979 # if 0
   1980     // somehow, it looks like we need to keep iterator synced up
   1981     // at all times, as above.
   1982     if(data->pos) {
   1983         data->pos--;
   1984     }
   1985     if(data->iterator) {
   1986         data->iterator->previous(data->iterator);
   1987     }
   1988 #endif
   1989     if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
   1990         data->iterator->previous(data->iterator);
   1991     }
   1992     if(data->pos) {
   1993         data->pos --;
   1994     }
   1995 }
   1996 
   1997 /**
   1998 * Inline function that gets a simple CE.
   1999 * So what it does is that it will first check the expansion buffer. If the
   2000 * expansion buffer is not empty, ie the end pointer to the expansion buffer
   2001 * is different from the string pointer, we return the collation element at the
   2002 * return pointer and decrement it.
   2003 * For more complicated CEs it resorts to getComplicatedCE.
   2004 * @param coll collator data
   2005 * @param data collation iterator struct
   2006 * @param status error status
   2007 */
   2008 static
   2009 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
   2010                                UErrorCode *status)
   2011 {
   2012     uint32_t result = (uint32_t)UCOL_NULLORDER;
   2013 
   2014     if (data->offsetReturn != NULL) {
   2015         if (data->offsetRepeatCount > 0) {
   2016                 data->offsetRepeatCount -= 1;
   2017         } else {
   2018             if (data->offsetReturn == data->offsetBuffer) {
   2019                 data->offsetReturn = NULL;
   2020                 data->offsetStore  = data->offsetBuffer;
   2021             } else {
   2022                 data->offsetReturn -= 1;
   2023             }
   2024         }
   2025     }
   2026 
   2027     if ((data->extendCEs && data->toReturn > data->extendCEs) ||
   2028             (!data->extendCEs && data->toReturn > data->CEs))
   2029     {
   2030         data->toReturn -= 1;
   2031         result = *(data->toReturn);
   2032         if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
   2033             data->CEpos = data->toReturn;
   2034         }
   2035     }
   2036     else {
   2037         UChar ch = 0;
   2038 
   2039         do {
   2040             /*
   2041             Loop handles case when incremental normalize switches to or from the
   2042             side buffer / original string, and we need to start again to get the
   2043             next character.
   2044             */
   2045             for (;;) {
   2046                 if (data->flags & UCOL_ITER_HASLEN) {
   2047                     /*
   2048                     Normal path for strings when length is specified.
   2049                     Not in side buffer because it is always null terminated.
   2050                     */
   2051                     if (data->pos <= data->string) {
   2052                         /* End of the main source string */
   2053                         return UCOL_NO_MORE_CES;
   2054                     }
   2055                     data->pos --;
   2056                     ch = *data->pos;
   2057                 }
   2058                 // we are using an iterator to go back. Pray for us!
   2059                 else if (data->flags & UCOL_USE_ITERATOR) {
   2060                   UChar32 iterCh = data->iterator->previous(data->iterator);
   2061                   if(iterCh == U_SENTINEL) {
   2062                     return UCOL_NO_MORE_CES;
   2063                   } else {
   2064                     ch = (UChar)iterCh;
   2065                   }
   2066                 }
   2067                 else {
   2068                     data->pos --;
   2069                     ch = *data->pos;
   2070                     /* we are in the side buffer. */
   2071                     if (ch == 0) {
   2072                         /*
   2073                         At the start of the normalize side buffer.
   2074                         Go back to string.
   2075                         Because pointer points to the last accessed character,
   2076                         hence we have to increment it by one here.
   2077                         */
   2078                         data->flags = data->origFlags;
   2079                         data->offsetRepeatValue = 0;
   2080 
   2081                          if (data->fcdPosition == NULL) {
   2082                             data->pos = data->string;
   2083                             return UCOL_NO_MORE_CES;
   2084                         }
   2085                         else {
   2086                             data->pos   = data->fcdPosition + 1;
   2087                         }
   2088 
   2089                        continue;
   2090                     }
   2091                 }
   2092 
   2093                 if(data->flags&UCOL_HIRAGANA_Q) {
   2094                   if(ch>=0x3040 && ch<=0x309f) {
   2095                     data->flags |= UCOL_WAS_HIRAGANA;
   2096                   } else {
   2097                     data->flags &= ~UCOL_WAS_HIRAGANA;
   2098                   }
   2099                 }
   2100 
   2101                 /*
   2102                 * got a character to determine if there's fcd and/or normalization
   2103                 * stuff to do.
   2104                 * if the current character is not fcd.
   2105                 * if current character is at the start of the string
   2106                 * Trailing combining class == 0.
   2107                 * Note if pos is in the writablebuffer, norm is always 0
   2108                 */
   2109                 if (ch < ZERO_CC_LIMIT_ ||
   2110                   // this should propel us out of the loop in the iterator case
   2111                     (data->flags & UCOL_ITER_NORM) == 0 ||
   2112                     (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
   2113                     || data->string == data->pos) {
   2114                     break;
   2115                 }
   2116 
   2117                 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   2118                     /* if next character is FCD */
   2119                     if (data->pos == data->string) {
   2120                         /* First char of string is always OK for FCD check */
   2121                         break;
   2122                     }
   2123 
   2124                     /* Not first char of string, do the FCD fast test */
   2125                     if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
   2126                         break;
   2127                     }
   2128                 }
   2129 
   2130                 /* Need a more complete FCD check and possible normalization. */
   2131                 if (collPrevIterFCD(data)) {
   2132                     collPrevIterNormalize(data);
   2133                 }
   2134 
   2135                 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
   2136                     /*  No normalization. Go ahead and process the char. */
   2137                     break;
   2138                 }
   2139 
   2140                 /*
   2141                 Some normalization happened.
   2142                 Next loop picks up a char from the normalization buffer.
   2143                 */
   2144             }
   2145 
   2146             /* attempt to handle contractions, after removal of the backwards
   2147             contraction
   2148             */
   2149             if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
   2150                 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
   2151             } else {
   2152                 if (ch <= 0xFF) {
   2153                     result = coll->latinOneMapping[ch];
   2154                 }
   2155                 else {
   2156                     // Always use UCA for [3400..9FFF], [AC00..D7AF]
   2157                     // **** [FA0E..FA2F] ?? ****
   2158                     if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   2159                         (ch >= 0x3400 && ch <= 0xD7AF)) {
   2160                         if (ch > 0x9FFF && ch < 0xAC00) {
   2161                             // between the two target ranges; do normal lookup
   2162                             // **** this range is YI, Modifier tone letters, ****
   2163                             // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
   2164                             // **** Latin-D might be tailored, so we need to ****
   2165                             // **** do the normal lookup for these guys.     ****
   2166                              result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   2167                         } else {
   2168                             result = UCOL_NOT_FOUND;
   2169                         }
   2170                     } else {
   2171                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   2172                     }
   2173                 }
   2174                 if (result > UCOL_NOT_FOUND) {
   2175                     result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
   2176                 }
   2177                 if (result == UCOL_NOT_FOUND) { // Not found in master list
   2178                     if (!isAtStartPrevIterate(data) &&
   2179                         ucol_contractionEndCP(ch, data->coll))
   2180                     {
   2181                         result = UCOL_CONTRACTION;
   2182                     } else {
   2183                         if(coll->UCA) {
   2184                             result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   2185                         }
   2186                     }
   2187 
   2188                     if (result > UCOL_NOT_FOUND) {
   2189                         if(coll->UCA) {
   2190                             result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
   2191                         }
   2192                     }
   2193                 }
   2194             }
   2195         } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
   2196 
   2197         if(result == UCOL_NOT_FOUND) {
   2198             result = getPrevImplicit(ch, data);
   2199         }
   2200     }
   2201 
   2202     return result;
   2203 }
   2204 
   2205 
   2206 /* Out-of-line wrapper around ucol_IGetPrevCE so that other files can call it. */
   2207 U_CFUNC uint32_t U_EXPORT2
   2208 ucol_getPrevCE(const UCollator *coll, collIterate *data, UErrorCode *status) {
   2209     const uint32_t previousCE = ucol_IGetPrevCE(coll, data, status);
   2210     return previousCE;
   2211 }
   2212 
   2213 
   2214 /*
   2215  * Returns the first CE produced for the single code unit u, or 0 when *status
   2216  * reports a failure after the temporary iterator has been initialized.
   2217  * This should be connected to special Jamo handling.
   2218  */
   2219 U_CFUNC uint32_t U_EXPORT2
   2220 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
   2221     collIterate singleUnitIter;
   2222     IInit_collIterate(coll, &u, 1, &singleUnitIter, status);
   2223     return U_FAILURE(*status) ? 0 : ucol_IGetNextCE(coll, &singleUnitIter, status);
        }
   2224 
   2225 /**
   2226 * Appends ch to the end of the writable buffer, pushing back the null terminator.
   2227 * @param data collIterate struct data
   2228 * @param ch character to be appended
   2229 * @return the position of the new addition, or NULL if no buffer is available
   2230 */
   2231 static
   2232 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
   2233 {
   2234     int32_t oldLength = data->writableBuffer.length();
   2235     const UChar *buffer = data->writableBuffer.append(ch).getTerminatedBuffer();
   2236     return (buffer == NULL) ? NULL : buffer + oldLength;  /* never offset a NULL buffer */
   2237 }
   2238 
   2239 /**
   2240 * Appends the argument string to the end of the writable buffer, pushing back the null terminator.
   2241 * @param data collIterate struct data
   2242 * @param str string to be appended
   2243 * @param length of the string to be appended
   2244 * @return the position of the new addition, or NULL if no buffer is available
   2245 */
   2246 static
   2247 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
   2248 {
   2249     int32_t oldLength = data->writableBuffer.length();
   2250     const UChar *buffer = data->writableBuffer.append(str, length).getTerminatedBuffer();
   2251     return (buffer == NULL) ? NULL : buffer + oldLength;  /* never offset a NULL buffer */
   2252 }
   2253 
   2254 /**
   2255 * Forward-iteration helper for contraction matching: NFD-normalizes the text
   2256 * from the current character (data->pos - 1) up to data->fcdPosition and
   2257 * appends the result to the writable buffer.
   2258 * If the iterator was not already reading from the buffer, the buffer is first
   2259 * reset to the single code unit that precedes the current character.
   2260 * On success data->pos points at the newly appended text, the old flags are
   2261 * saved in origFlags, UCOL_ITER_INNORMBUF is set and UCOL_ITER_NORM and
   2262 * UCOL_ITER_HASLEN are cleared; on a normalization error pos and flags are
   2263 * left as they were.
   2264 * @param data collation iterator data
   2265 */
   2266 static
   2267 inline void normalizeNextContraction(collIterate *data)
   2268 {
   2269     /* data->pos is one past the current character */
   2270     const UChar *segStart = data->pos - 1;
   2271     int32_t      resumeIndex;
   2272 
   2273     if (data->flags & UCOL_ITER_INNORMBUF) {
   2274         /* keep the buffered text and resume right behind it */
   2275         resumeIndex = data->writableBuffer.length();
   2276     } else {
   2277         /* fresh buffer, seeded with the code unit in front of the segment */
   2278         data->writableBuffer.setTo(segStart[-1]);
   2279         resumeIndex = 1;
   2280     }
   2281 
   2282     const UChar  *segLimit  = data->fcdPosition;
   2283     UErrorCode    errorCode = U_ZERO_ERROR;
   2284     UnicodeString segment(FALSE, segStart, (int32_t)(segLimit - segStart));
   2285     data->writableBuffer.append(data->nfd->normalize(segment, errorCode));
   2286     if (U_SUCCESS(errorCode)) {
   2287         data->pos       = data->writableBuffer.getTerminatedBuffer() + resumeIndex;
   2288         data->origFlags = data->flags;
   2289         data->flags     = (data->flags | UCOL_ITER_INNORMBUF) &
   2290                           ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2291     }
   2292 }
   2293 
   2294 /**
   2295 * Contraction character management function that returns the next character
   2296 * for the forwards iterator.
   2297 * Does nothing if the next character is in buffer and not the first character
   2298 * in it.
   2299 * Else it checks next character in data string to see if it is normalizable.
   2300 * If it is not, the character is simply copied into the buffer, else
   2301 * the whole normalized substring is copied into the buffer, including the
   2302 * current character.
   2303 * @param data collation element iterator data
   2304 * @return next character, or (UChar)-1 if appending to the writable buffer failed
   2305 */
   2306 static
   2307 inline UChar getNextNormalizedChar(collIterate *data)
   2308 {
   2309     UChar  nextch;
   2310     UChar  ch;
   2311     // Here we need to add the iterator code. One problem is the way
   2312     // end of string is handled. If we just return next char, it could
   2313     // be the sentinel. Most of the cases already check for this, but we
   2314     // need to be sure.
   2315     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
   2316          /* if no normalization and not in buffer. */
   2317       if(data->flags & UCOL_USE_ITERATOR) {
   2318          return (UChar)data->iterator->next(data->iterator);
   2319       } else {
   2320          return *(data->pos ++);
   2321       }
   2322     }
   2323 
   2324     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
   2325       //normalizeIterator(data);
   2326     //}
   2327 
   2328     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
   2329     if ((innormbuf && *data->pos != 0) ||
   2330         (data->fcdPosition != NULL && !innormbuf &&
   2331         data->pos < data->fcdPosition)) {
   2332         /*
   2333         if next character is in normalized buffer, no further normalization
   2334         is required
   2335         */
   2336         return *(data->pos ++);
   2337     }
   2338 
   2339     if (data->flags & UCOL_ITER_HASLEN) {
   2340         /* in data string */
   2341         if (data->pos + 1 == data->endp) {
   2342             return *(data->pos ++);
   2343         }
   2344     }
   2345     else {
   2346         if (innormbuf) {
   2347           // inside the normalization buffer, but at the end
   2348           // (since we encountered zero). This means, in the
   2349           // case we're using char iterator, that we need to
   2350           // do another round of normalization.
   2351           //if(data->origFlags & UCOL_USE_ITERATOR) {
   2352             // we need to restore original flags,
   2353             // otherwise, we'll lose them
   2354             //data->flags = data->origFlags;
   2355             //normalizeIterator(data);
   2356             //return *(data->pos++);
   2357           //} else {
   2358             /*
   2359             in writable buffer, at this point fcdPosition can not be
   2360             pointing to the end of the data string. see contracting tag.
   2361             */
   2362           if(data->fcdPosition) {
   2363             if (*(data->fcdPosition + 1) == 0 ||
   2364                 data->fcdPosition + 1 == data->endp) {
   2365                 /* at the end of the string, dump it into the normalizer */
   2366                 data->pos = insertBufferEnd(data, *(data->fcdPosition));
   2367                 if (data->pos == NULL) { // must be tested before stepping past the new unit
   2368                     return (UChar)-1; // Return to indicate error.
   2369                 }
   2370                 data->pos ++;
   2371                 return *(data->fcdPosition ++);
   2372             }
   2373             data->pos = data->fcdPosition;
   2374           } else if(data->origFlags & UCOL_USE_ITERATOR) {
   2375             // if we are here, we're using a normalizing iterator.
   2376             // we should just continue further.
   2377             data->flags = data->origFlags;
   2378             data->pos = NULL;
   2379             return (UChar)data->iterator->next(data->iterator);
   2380           }
   2381           //}
   2382         }
   2383         else {
   2384             if (*(data->pos + 1) == 0) {
   2385                 return *(data->pos ++);
   2386             }
   2387         }
   2388     }
   2389 
   2390     ch = *data->pos ++;
   2391     nextch = *data->pos;
   2392 
   2393     /*
   2394     * if the current character is not fcd.
   2395     * Trailing combining class == 0.
   2396     */
   2397     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
   2398         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
   2399          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
   2400             /*
   2401             Need a more complete FCD check and possible normalization.
   2402             normalize substring will be appended to buffer
   2403             */
   2404         if (collIterFCD(data)) {
   2405             normalizeNextContraction(data);
   2406             return *(data->pos ++);
   2407         }
   2408         else if (innormbuf) {
   2409             /* fcdposition shifted even when there's no normalization, if we
   2410             don't input the rest into this, we'll get the wrong position when
   2411             we reach the end of the writableBuffer */
   2412             int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
   2413             data->pos = insertBufferEnd(data, data->pos - 1, length);
   2414             // Check if data->pos received a null pointer
   2415             if (data->pos == NULL) {
   2416                 return (UChar)-1; // Return to indicate error.
   2417             }
   2418             return *(data->pos ++);
   2419         }
   2420     }
   2421 
   2422     if (innormbuf) {
   2423         /*
   2424         no normalization is to be done hence only one character will be
   2425         appended to the buffer.
   2426         */
   2427         data->pos = insertBufferEnd(data, ch);
   2428         if (data->pos == NULL) { // must be tested before stepping past the new unit
   2429             return (UChar)-1; // Return to indicate error.
   2430         }
   2431         data->pos ++;
   2432     }
   2433 
   2434     /* points back to the pos in string */
   2435     return ch;
   2436 }
   2437 
   2438 
   2439 
   2440 /**
   2441 * Installs the skipped characters collected during a discontiguous match so
   2442 * that the iterator reads them next from the writable buffer.
   2443 * - Iterator still in the source string: the current position is stored in
   2444 *   fcdPosition, the flags are saved in origFlags and switched to buffer mode
   2445 *   (UCOL_ITER_INNORMBUF set; UCOL_ITER_NORM, UCOL_ITER_HASLEN and
   2446 *   UCOL_USE_ITERATOR cleared), and the buffer becomes a copy of the argument.
   2447 * - Iterator already in the buffer: the part of the buffer in front of pos is
   2448 *   replaced by the argument.
   2449 * In both cases pos is reset to the start of the writable buffer, so the rest
   2450 * of the CE lookup code can keep running unchanged.
   2451 * @param source data string source
   2452 * @param buffer characters to be placed at the front of the writable buffer
   2453 */
   2454 static
   2455 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
   2456 {
   2457     UnicodeString &normBuf = source->writableBuffer;
   2458 
   2459     if ((source->flags & UCOL_ITER_INNORMBUF) == 0) {
   2460         source->fcdPosition = source->pos;
   2461         source->origFlags   = source->flags;
   2462         source->flags       = (source->flags | UCOL_ITER_INNORMBUF) &
   2463                               ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   2464         normBuf = buffer;
   2465     } else {
   2466         int32_t consumed = source->pos - normBuf.getBuffer();
   2467         normBuf.replace(0, consumed, buffer);
   2468     }
   2469 
   2470     source->pos = normBuf.getTerminatedBuffer();
   2471 }
   2472 
   2473 /**
   2474 * Function to get the discontiguous collation element within the source.
   2475 * Note this function will set the position to the appropriate places.
        * Reads further characters with getNextNormalizedChar() and looks each one up
        * in the contraction table. Characters that do not extend the contraction are
        * collected in a local buffer; when a match is completed that buffer is handed
        * to setDiscontiguosAttribute() so the skipped characters are iterated
        * afterwards. If no match is completed, the iterator state saved on entry is
        * restored (then stepped back by one) and the CE stored for the contraction
        * table position in constart is returned.
   2476 * @param coll current collator used
   2477 * @param source data string source
   2478 * @param constart index to the start character in the contraction table
   2479 * @return discontiguous collation element offset
   2480 */
   2481 static
   2482 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
   2483                                 const UChar *constart)
   2484 {
   2485     /* source->pos currently points to the second combining character after
   2486        the start character */
   2487           const UChar *temppos      = source->pos;
   2488           UnicodeString buffer;      /* characters to re-iterate via setDiscontiguosAttribute() */
   2489     const UChar   *tempconstart = constart;   /* table position currently being searched */
   2490           uint8_t  tempflags    = source->flags;   /* put back if nothing matches */
   2491           UBool    multicontraction = FALSE;
   2492           collIterateState discState;
   2493 
   2494           backupState(source, &discState);
   2495 
            /* seed the buffer with the code point at offset -1 from the current position */
   2496     buffer.setTo(peekCodePoint(source, -1));
   2497     for (;;) {
   2498         UChar    *UCharOffset;
   2499         UChar     schar,
   2500                   tchar;
   2501         uint32_t  result;
   2502 
                /* stop when the input is exhausted or the next code point has
                   combining class 0 */
   2503         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
   2504             || (peekCodeUnit(source, 0) == 0  &&
   2505             //|| (*source->pos == 0  &&
   2506                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
   2507                  source->fcdPosition == NULL ||
   2508                  source->fcdPosition == source->endp ||
   2509                  *(source->fcdPosition) == 0 ||
   2510                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
   2511                  /* end of string in null terminated string or stopped by a
   2512                  null character, note fcd does not always point to a base
   2513                  character after the discontiguous change */
   2514                  u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
   2515                  //u_getCombiningClass(*(source->pos)) == 0) {
   2516             //constart = (UChar *)coll->image + getContractOffset(CE);
   2517             if (multicontraction) {
                        /* a multi-contraction prefix was matched earlier: rewind to the
                           position recorded then and return the CE of the current table */
   2518                 source->pos    = temppos - 1;
   2519                 setDiscontiguosAttribute(source, buffer);
   2520                 return *(coll->contractionCEs +
   2521                                     (tempconstart - coll->contractionIndex));
   2522             }
   2523             constart = tempconstart;
   2524             break;
   2525         }
   2526 
   2527         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
   2528         schar = getNextNormalizedChar(source);
   2529 
                /* walk the table entries until one is not smaller than schar */
   2530         while (schar > (tchar = *UCharOffset)) {
   2531             UCharOffset++;
   2532         }
   2533 
   2534         if (schar != tchar) {
   2535             /* not the correct codepoint. we stuff the current codepoint into
   2536             the discontiguous buffer and try the next character */
   2537             buffer.append(schar);
   2538             continue;
   2539         }
   2540         else {
                    /* a table match is still skipped when schar has the same combining
                       class as the code point returned by peekCodePoint(source, -2) */
   2541             if (u_getCombiningClass(schar) ==
   2542                 u_getCombiningClass(peekCodePoint(source, -2))) {
   2543                 buffer.append(schar);
   2544                 continue;
   2545             }
   2546             result = *(coll->contractionCEs +
   2547                                       (UCharOffset - coll->contractionIndex));
   2548         }
   2549 
   2550         if (result == UCOL_NOT_FOUND) {
   2551           break;
   2552         } else if (isContraction(result)) {
   2553             /* this is a multi-contraction*/
   2554             tempconstart = (UChar *)coll->image + getContractOffset(result);
   2555             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
   2556                 != UCOL_NOT_FOUND) {
   2557                 multicontraction = TRUE;
   2558                 temppos       = source->pos + 1;
   2559             }
   2560         } else {
                    /* final CE found: make the skipped characters the next input */
   2561             setDiscontiguosAttribute(source, buffer);
   2562             return result;
   2563         }
   2564     }
   2565 
   2566     /* no problems simply reverting just like that,
   2567     if we are in string before getting into this function, points back to
   2568     string hence no problem.
   2569     if we are in normalization buffer before getting into this function,
   2570     since we'll never use another normalization within this function, we
   2571     know that fcdposition points to a base character. the normalization buffer
   2572     never change, hence this revert works. */
   2573     loadState(source, &discState, TRUE);
   2574     goBackOne(source);
   2575 
   2576     //source->pos   = temppos - 1;
   2577     source->flags = tempflags;
   2578     return *(coll->contractionCEs + (constart - coll->contractionIndex));
   2579 }
   2580 
   2581 /* Implicit CEs from Mark's getImplicitPrimary code: one CE is stored at CEpos, the other returned. */
   2582 static inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
   2583     const uint32_t implicitPrimary = uprv_uca_getImplicitPrimary(cp);
   2584     *collationSource->CEpos = ((implicitPrimary & 0x0000FFFF) << 16) | 0x000000C0;
   2585     ++collationSource->CEpos;
   2586     ++collationSource->offsetRepeatCount;
   2587     return (implicitPrimary & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
   2588 }
   2589 
   2590 /**
   2591 * Overwrites the buffer's front null terminator with ch and prepends a fresh
   2592 * terminator; data->pos ends up just past ch (index 2 of the buffer).
   2593 * @param data collation element iterator data
   2594 * @param ch character to be inserted
   2595 */
   2596 static inline void insertBufferFront(collIterate *data, UChar ch) {
   2597     UnicodeString &normBuf = data->writableBuffer;
   2598     normBuf.setCharAt(0, ch).insert(0, (UChar)0);
   2599     data->pos = normBuf.getTerminatedBuffer() + 2;
   2600 }
   2601 
   2602 /**
   2603 * Backward-iteration helper for contraction matching: NFD-normalizes the text
   2604 * that starts just after data->fcdPosition (or at the start of the string when
   2605 * fcdPosition is NULL) and ends with the code unit at data->pos, and rebuilds
   2606 * the writable buffer as: NUL, normalized text, saved tail.
   2607 * The saved tail is the code unit following data->pos when UCOL_ITER_HASLEN is
   2608 * set (buffer not used yet), otherwise the old buffer contents after its
   2609 * leading NUL.
   2610 * On success data->pos points just past the normalized text, the old flags are
   2611 * saved in origFlags, UCOL_ITER_INNORMBUF is set and UCOL_ITER_NORM and
   2612 * UCOL_ITER_HASLEN are cleared.  On error the function returns right after the
   2613 * normalization call without touching pos or flags.
   2614 * @param data collation iterator data
   2615 * @param status error code passed to the normalizer
   2616 */
   2617 static
   2618 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
   2619 {
   2620     const UChar *segLimit = data->pos + 1;   /* one past the last unit to normalize */
   2621 
   2622     /* the tail has to be captured before normalize() overwrites the buffer */
   2623     UnicodeString tail;
   2624     if ((data->flags & UCOL_ITER_HASLEN) == 0) {
   2625         tail.setTo(data->writableBuffer, 1);   // after the leading NUL
   2626     } else {
   2627         /* normalization buffer not used yet: pull down the next character */
   2628         tail.setTo(*segLimit);
   2629     }
   2630 
   2631     const UChar *segStart =
   2632         (data->fcdPosition == NULL) ? data->string : data->fcdPosition + 1;
   2633     UnicodeString segment(FALSE, segStart, (int32_t)(segLimit - segStart));
   2634     int32_t normLen =
   2635         data->nfd->normalize(segment, data->writableBuffer, *status).length();
   2636     if (U_FAILURE(*status)) {
   2637         return;
   2638     }
   2639 
   2640     /* the null termination goes in front of the normalized text, not behind it */
   2641     UnicodeString &normBuf = data->writableBuffer;
   2642     normBuf.insert(0, (UChar)0);
   2643     normBuf.append(tail);
   2644     data->pos       = normBuf.getTerminatedBuffer() + 1 + normLen;
   2645     data->origFlags = data->flags;
   2646     data->flags     = (data->flags | UCOL_ITER_INNORMBUF) &
   2647                       ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2648 }
   2654 
   2655 /**
   2656 * Contraction character management function that returns the previous character
   2657 * for the backwards iterator.
   2658 * Does nothing if the previous character is in buffer and not the first
   2659 * character in it.
   2660 * Else it checks previous character in data string to see if it is
   2661 * normalizable.
   2662 * If it is not, the character is simply copied into the buffer, else
   2663 * the whole normalized substring is copied into the buffer, including the
   2664 * current character.
        * Note that the fast path only peeks: it returns the character in front of
        * the current position without moving data->pos (with a character iterator
        * it steps back by one and reads forward again).
   2665 * @param data collation element iterator data
        * @param status error code, handed on to normalizePrevContraction()
   2666 * @return previous character
   2667 */
   2668 static
   2669 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
   2670 {
   2671     UChar  prevch;
   2672     UChar  ch;
   2673     const UChar *start;
   2674     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
   2675     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
   2676         (innormbuf && *(data->pos - 1) != 0)) {
   2677         /*
   2678         if no normalization.
   2679         if previous character is in normalized buffer, no further normalization
   2680         is required
   2681         */
   2682       if(data->flags & UCOL_USE_ITERATOR) {
                /* step back one and read it again, leaving the iterator where it was */
   2683         data->iterator->move(data->iterator, -1, UITER_CURRENT);
   2684         return (UChar)data->iterator->next(data->iterator);
   2685       } else {
   2686         return *(data->pos - 1);
   2687       }
   2688     }
   2689 
   2690     start = data->pos;
   2691     if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
   2692         /* in data string */
   2693         if ((start - 1) == data->string) {
                    /* previous character is the first one of the string */
   2694             return *(start - 1);
   2695         }
   2696         start --;
   2697         ch     = *start;
   2698         prevch = *(start - 1);
   2699     }
   2700     else {
   2701         /*
   2702         in writable buffer, at this point fcdPosition can not be NULL.
   2703         see contracting tag.
   2704         */
   2705         if (data->fcdPosition == data->string) {
   2706             /* at the start of the string, just dump it into the normalizer */
   2707             insertBufferFront(data, *(data->fcdPosition));
   2708             data->fcdPosition = NULL;
   2709             return *(data->pos - 1);
   2710         }
                /* continue in the source string at fcdPosition */
   2711         start  = data->fcdPosition;
   2712         ch     = *start;
   2713         prevch = *(start - 1);
   2714     }
   2715     /*
   2716     * if the current character is not fcd.
   2717     * Trailing combining class == 0.
   2718     */
   2719     if (data->fcdPosition > start &&
   2720        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
   2721     {
   2722         /*
   2723         Need a more complete FCD check and possible normalization.
   2724         normalize substring will be appended to buffer
   2725         */
   2726         const UChar *backuppos = data->pos;
                /* collPrevIterFCD() works from data->pos, so point it at start for the check */
   2727         data->pos = start;
   2728         if (collPrevIterFCD(data)) {
   2729             normalizePrevContraction(data, status);
   2730             return *(data->pos - 1);
   2731         }
                /* no normalization needed: restore pos and step fcdPosition forward by one */
   2732         data->pos = backuppos;
   2733         data->fcdPosition ++;
   2734     }
   2735 
   2736     if (innormbuf) {
   2737     /*
   2738     no normalization is to be done hence only one character will be
   2739     appended to the buffer.
   2740     */
   2741         insertBufferFront(data, ch);
   2742         data->fcdPosition --;
   2743     }
   2744 
   2745     return ch;
   2746 }
   2747 
   2748 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
   2749 /* It is called by getNextCE */
   2750 
   2751 /* The following should be even */
   2752 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
   2753 
   2754 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
   2755     collIterateState entryState;
   2756     backupState(source, &entryState);
   2757     UChar32 cp = ch;
   2758 
   2759     for (;;) {
   2760         // This loop will repeat only in the case of contractions, and only when a contraction
   2761         //   is found and the first CE resulting from that contraction is itself a special
   2762         //   (an expansion, for example.)  All other special CE types are fully handled the
   2763         //   first time through, and the loop exits.
   2764 
   2765         const uint32_t *CEOffset = NULL;
   2766         switch(getCETag(CE)) {
   2767         case NOT_FOUND_TAG:
   2768             /* This one is not found, and we'll let somebody else bother about it... no more games */
   2769             return CE;
   2770         case SPEC_PROC_TAG:
   2771             {
   2772                 // Special processing is getting a CE that is preceded by a certain prefix
   2773                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   2774                 // When we encouter a special processing tag, we go backwards and try to see if
   2775                 // we have a match.
   2776                 // Contraction tables are used - so the whole process is not unlike contraction.
   2777                 // prefix data is stored backwards in the table.
   2778                 const UChar *UCharOffset;
   2779                 UChar schar, tchar;
   2780                 collIterateState prefixState;
   2781                 backupState(source, &prefixState);
   2782                 loadState(source, &entryState, TRUE);
   2783                 goBackOne(source); // We want to look at the point where we entered - actually one
   2784                 // before that...
   2785 
   2786                 for(;;) {
   2787                     // This loop will run once per source string character, for as long as we
   2788                     //  are matching a potential contraction sequence
   2789 
   2790                     // First we position ourselves at the begining of contraction sequence
   2791                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2792                     if (collIter_bos(source)) {
   2793                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2794                         break;
   2795                     }
   2796                     schar = getPrevNormalizedChar(source, status);
   2797                     goBackOne(source);
   2798 
   2799                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2800                         UCharOffset++;
   2801                     }
   2802 
   2803                     if (schar == tchar) {
   2804                         // Found the source string char in the table.
   2805                         //  Pick up the corresponding CE from the table.
   2806                         CE = *(coll->contractionCEs +
   2807                             (UCharOffset - coll->contractionIndex));
   2808                     }
   2809                     else
   2810                     {
   2811                         // Source string char was not in the table.
   2812                         //   We have not found the prefix.
   2813                         CE = *(coll->contractionCEs +
   2814                             (ContractionStart - coll->contractionIndex));
   2815                     }
   2816 
   2817                     if(!isPrefix(CE)) {
   2818                         // The source string char was in the contraction table, and the corresponding
   2819                         //   CE is not a prefix CE.  We found the prefix, break
   2820                         //   out of loop, this CE will end up being returned.  This is the normal
   2821                         //   way out of prefix handling when the source actually contained
   2822                         //   the prefix.
   2823                         break;
   2824                     }
   2825                 }
   2826                 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
   2827                     loadState(source, &prefixState, TRUE);
   2828                     if(source->origFlags & UCOL_USE_ITERATOR) {
   2829                         source->flags = source->origFlags;
   2830                     }
   2831                 } else { // prefix search was a failure, we have to backup all the way to the start
   2832                     loadState(source, &entryState, TRUE);
   2833                 }
   2834                 break;
   2835             }
   2836         case CONTRACTION_TAG:
   2837             {
   2838                 /* This should handle contractions */
   2839                 collIterateState state;
   2840                 backupState(source, &state);
   2841                 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
   2842                 const UChar *UCharOffset;
   2843                 UChar schar, tchar;
   2844 
   2845                 for (;;) {
   2846                     /* This loop will run once per source string character, for as long as we     */
   2847                     /*  are matching a potential contraction sequence                  */
   2848 
   2849                     /* First we position ourselves at the beginning of the contraction sequence */
   2850                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2851 
   2852                     if (collIter_eos(source)) {
   2853                         // Ran off the end of the source string.
   2854                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2855                         // So we'll pick whatever we have at the point...
   2856                         if (CE == UCOL_NOT_FOUND) {
   2857                             // back up the source over all the chars we scanned going into this contraction.
   2858                             CE = firstCE;
   2859                             loadState(source, &state, TRUE);
   2860                             if(source->origFlags & UCOL_USE_ITERATOR) {
   2861                                 source->flags = source->origFlags;
   2862                             }
   2863                         }
   2864                         break;
   2865                     }
   2866 
   2867                     uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
   2868                     uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
   2869 
   2870                     schar = getNextNormalizedChar(source);
   2871                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2872                         UCharOffset++;
   2873                     }
   2874 
   2875                     if (schar == tchar) {
   2876                         // Found the source string char in the contraction table.
   2877                         //  Pick up the corresponding CE from the table.
   2878                         CE = *(coll->contractionCEs +
   2879                             (UCharOffset - coll->contractionIndex));
   2880                     }
   2881                     else
   2882                     {
   2883                         // Source string char was not in contraction table.
   2884                         //   Unless we have a discontiguous contraction, we have finished
   2885                         //   with this contraction.
   2886                         // in order to do the proper detection, we
   2887                         // need to see if we're dealing with a supplementary
   2888                         /* We test whether the next two char are surrogate pairs.
   2889                         * This test is done if the iterator is not NULL.
   2890                         * If there is no surrogate pair, the iterator
   2891                         * goes back one if needed. */
   2892                         UChar32 miss = schar;
   2893                         if (source->iterator) {
   2894                             UChar32 surrNextChar; /* the next char in the iteration to test */
   2895                             int32_t prevPos; /* holds the previous position before move forward of the source iterator */
   2896                             if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
   2897                                 prevPos = source->iterator->index;
   2898                                 surrNextChar = getNextNormalizedChar(source);
   2899                                 if (U16_IS_TRAIL(surrNextChar)) {
   2900                                     miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
   2901                                 } else if (prevPos < source->iterator->index){
   2902                                     goBackOne(source);
   2903                                 }
   2904                             }
   2905                         } else if (U16_IS_LEAD(schar)) {
   2906                             miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
   2907                         }
   2908 
   2909                         uint8_t sCC;
   2910                         if (miss < 0x300 ||
   2911                             maxCC == 0 ||
   2912                             (sCC = i_getCombiningClass(miss, coll)) == 0 ||
   2913                             sCC>maxCC ||
   2914                             (allSame != 0 && sCC == maxCC) ||
   2915                             collIter_eos(source))
   2916                         {
   2917                             //  Contraction can not be discontiguous.
   2918                             goBackOne(source);  // back up the source string by one,
   2919                             //  because  the character we just looked at was
   2920                             //  not part of the contraction.   */
   2921                             if(U_IS_SUPPLEMENTARY(miss)) {
   2922                                 goBackOne(source);
   2923                             }
   2924                             CE = *(coll->contractionCEs +
   2925                                 (ContractionStart - coll->contractionIndex));
   2926                         } else {
   2927                             //
   2928                             // Contraction is possibly discontiguous.
   2929                             //   Scan more of source string looking for a match
   2930                             //
   2931                             UChar tempchar;
   2932                             /* find the next character if schar is not a base character
   2933                             and we are not yet at the end of the string */
   2934                             tempchar = getNextNormalizedChar(source);
   2935                             // probably need another supplementary thingie here
   2936                             goBackOne(source);
   2937                             if (i_getCombiningClass(tempchar, coll) == 0) {
   2938                                 goBackOne(source);
   2939                                 if(U_IS_SUPPLEMENTARY(miss)) {
   2940                                     goBackOne(source);
   2941                                 }
   2942                                 /* Spit out the last char of the string, wasn't tasty enough */
   2943                                 CE = *(coll->contractionCEs +
   2944                                     (ContractionStart - coll->contractionIndex));
   2945                             } else {
   2946                                 CE = getDiscontiguous(coll, source, ContractionStart);
   2947                             }
   2948                         }
   2949                     } // else after if(schar == tchar)
   2950 
   2951                     if(CE == UCOL_NOT_FOUND) {
   2952                         /* The Source string did not match the contraction that we were checking.  */
   2953                         /*  Back up the source position to undo the effects of having partially    */
   2954                         /*   scanned through what ultimately proved to not be a contraction.       */
   2955                         loadState(source, &state, TRUE);
   2956                         CE = firstCE;
   2957                         break;
   2958                     }
   2959 
   2960                     if(!isContraction(CE)) {
   2961                         // The source string char was in the contraction table, and the corresponding
   2962                         //   CE is not a contraction CE.  We completed the contraction, break
   2963                         //   out of loop, this CE will end up being returned.  This is the normal
   2964                         //   way out of contraction handling when the source actually contained
   2965                         //   the contraction.
   2966                         break;
   2967                     }
   2968 
   2969 
   2970                     // The source string char was in the contraction table, and the corresponding
   2971                     //   CE IS a contraction CE.  We will continue looping to check the source
   2972                     //   string for the remaining chars in the contraction.
   2973                     uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
   2974                     if(tempCE != UCOL_NOT_FOUND) {
   2975                         // We have scanned a section of source string for which there is a
   2976                         //  CE from the contraction table.  Remember the CE and scan position, so
   2977                         //  that we can return to this point if further scanning fails to
   2978                         //  match a longer contraction sequence.
   2979                         firstCE = tempCE;
   2980 
   2981                         goBackOne(source);
   2982                         backupState(source, &state);
   2983                         getNextNormalizedChar(source);
   2984 
   2985                         // Another way to do this is:
   2986                         //collIterateState tempState;
   2987                         //backupState(source, &tempState);
   2988                         //goBackOne(source);
   2989                         //backupState(source, &state);
   2990                         //loadState(source, &tempState, TRUE);
   2991 
   2992                         // The problem is that for incomplete contractions we have to remember the previous
   2993                         // position. Before, the only thing I needed to do was state.pos--;
   2994                         // After iterator introduction and especially after introduction of normalizing
   2995                         // iterators, it became much more difficult to decrease the saved state.
   2996                         // I'm not yet sure which of the two methods above is faster.
   2997                     }
   2998                 } // for(;;)
   2999                 break;
   3000             } // case CONTRACTION_TAG:
   3001         case LONG_PRIMARY_TAG:
   3002             {
   3003                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   3004                 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   3005                 source->offsetRepeatCount += 1;
   3006                 return CE;
   3007             }
   3008         case EXPANSION_TAG:
   3009             {
   3010                 /* This should handle expansion. */
   3011                 /* NOTE: we can encounter both continuations and expansions in an expansion! */
   3012                 /* I have to decide where continuations are going to be dealt with */
   3013                 uint32_t size;
   3014                 uint32_t i;    /* general counter */
   3015 
   3016                 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   3017                 size = getExpansionCount(CE);
   3018                 CE = *CEOffset++;
   3019               //source->offsetRepeatCount = -1;
   3020 
   3021                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   3022                     for(i = 1; i<size; i++) {
   3023                         *(source->CEpos++) = *CEOffset++;
   3024                         source->offsetRepeatCount += 1;
   3025                     }
   3026                 } else { /* else, we do */
   3027                     while(*CEOffset != 0) {
   3028                         *(source->CEpos++) = *CEOffset++;
   3029                         source->offsetRepeatCount += 1;
   3030                     }
   3031                 }
   3032 
   3033                 return CE;
   3034             }
   3035         case DIGIT_TAG:
   3036             {
   3037                 /*
   3038                 We do a check to see if we want to collate digits as numbers; if so we generate
   3039                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   3040                 */
   3041                 //uint32_t size;
   3042                 uint32_t i;    /* general counter */
   3043 
   3044                 if (source->coll->numericCollation == UCOL_ON){
   3045                     collIterateState digitState = {0,0,0,0,0,0,0,0,0};
   3046                     UChar32 char32 = 0;
   3047                     int32_t digVal = 0;
   3048 
   3049                     uint32_t digIndx = 0;
   3050                     uint32_t endIndex = 0;
   3051                     uint32_t trailingZeroIndex = 0;
   3052 
   3053                     uint8_t collateVal = 0;
   3054 
   3055                     UBool nonZeroValReached = FALSE;
   3056 
   3057                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
   3058                     /*
   3059                          We parse the source string until we hit a char that's NOT a digit.
   3060                         Use this u_charDigitValue. This might be slow because we have to
   3061                         handle surrogates...
   3062                     */
   3063             /*
   3064                     if (U16_IS_LEAD(ch)){
   3065                       if (!collIter_eos(source)) {
   3066                         backupState(source, &digitState);
   3067                         UChar trail = getNextNormalizedChar(source);
   3068                         if(U16_IS_TRAIL(trail)) {
   3069                           char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   3070                         } else {
   3071                           loadState(source, &digitState, TRUE);
   3072                           char32 = ch;
   3073                         }
   3074                       } else {
   3075                         char32 = ch;
   3076                       }
   3077                     } else {
   3078                       char32 = ch;
   3079                     }
   3080                     digVal = u_charDigitValue(char32);
   3081             */
   3082                     digVal = u_charDigitValue(cp); // if we have arrived here, we have
   3083                     // already processed possible supplementaries that triggered the digit tag -
   3084                     // all supplementaries are marked in the UCA.
   3085                     /*
   3086                         We  pad a zero in front of the first element anyways. This takes
   3087                         care of the (probably) most common case where people are sorting things followed
   3088                         by a single digit
   3089                     */
   3090                     digIndx++;
   3091                     for(;;){
   3092                         // Make sure we have enough space. No longer needed;
   3093                         // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
   3094                         // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
   3095                         // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
   3096 
   3097                         // Skipping over leading zeroes.
   3098                         if (digVal != 0) {
   3099                             nonZeroValReached = TRUE;
   3100                         }
   3101                         if (nonZeroValReached) {
   3102                             /*
   3103                             We parse the digit string into base 100 numbers (this fits into a byte).
   3104                             We only add to the buffer in twos, thus if we are parsing an odd character,
   3105                             that serves as the 'tens' digit while if we are parsing an even one, that
   3106                             is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
   3107                             a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
   3108                             overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3109                             than all the other bytes.
   3110                             */
   3111 
   3112                             if (digIndx % 2 == 1){
   3113                                 collateVal += (uint8_t)digVal;
   3114 
   3115                                 // We don't enter the low-order-digit case unless we've already seen
   3116                                 // the high order, or for the first digit, which is always non-zero.
   3117                                 if (collateVal != 0)
   3118                                     trailingZeroIndex = 0;
   3119 
   3120                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3121                                 collateVal = 0;
   3122                             }
   3123                             else{
   3124                                 // We drop the collation value into the buffer so if we need to do
   3125                                 // a "front patch" we don't have to check to see if we're hitting the
   3126                                 // last element.
   3127                                 collateVal = (uint8_t)(digVal * 10);
   3128 
   3129                                 // Check for trailing zeroes.
   3130                                 if (collateVal == 0)
   3131                                 {
   3132                                     if (!trailingZeroIndex)
   3133                                         trailingZeroIndex = (digIndx/2) + 2;
   3134                                 }
   3135                                 else
   3136                                     trailingZeroIndex = 0;
   3137 
   3138                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3139                             }
   3140                             digIndx++;
   3141                         }
   3142 
   3143                         // Get next character.
   3144                         if (!collIter_eos(source)){
   3145                             ch = getNextNormalizedChar(source);
   3146                             if (U16_IS_LEAD(ch)){
   3147                                 if (!collIter_eos(source)) {
   3148                                     backupState(source, &digitState);
   3149                                     UChar trail = getNextNormalizedChar(source);
   3150                                     if(U16_IS_TRAIL(trail)) {
   3151                                         char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   3152                                     } else {
   3153                                         loadState(source, &digitState, TRUE);
   3154                                         char32 = ch;
   3155                                     }
   3156                                 }
   3157                             } else {
   3158                                 char32 = ch;
   3159                             }
   3160 
   3161                             if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
   3162                                 // Resetting position to point to the next unprocessed char. We
   3163                                 // overshot it when doing our test/set for numbers.
   3164                                 if (char32 > 0xFFFF) { // For surrogates.
   3165                                     loadState(source, &digitState, TRUE);
   3166                                     //goBackOne(source);
   3167                                 }
   3168                                 goBackOne(source);
   3169                                 break;
   3170                             }
   3171                         } else {
   3172                             break;
   3173                         }
   3174                     }
   3175 
   3176                     if (nonZeroValReached == FALSE){
   3177                         digIndx = 2;
   3178                         numTempBuf[2] = 6;
   3179                     }
   3180 
   3181                     endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
   3182                     if (digIndx % 2 != 0){
   3183                         /*
   3184                         We missed a value. Since digIndx isn't even, we stuck too many values into the buffer (this is what
   3185                         we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
   3186                         Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
   3187                         single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
   3188                         */
   3189 
   3190                         for(i = 2; i < endIndex; i++){
   3191                             numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
   3192                                 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
   3193                         }
   3194                         --digIndx;
   3195                     }
   3196 
   3197                     // Subtract one off of the last byte.
   3198                     numTempBuf[endIndex-1] -= 1;
   3199 
   3200                     /*
   3201                     We want to skip over the first two slots in the buffer. The first slot
   3202                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3203                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3204                     */
   3205                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3206                     numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
   3207 
   3208                     // Now transfer the collation key to our collIterate struct.
   3209                     // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
   3210                     //size = ((endIndex+1) & ~1)/2;
   3211                     CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3212                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3213                         UCOL_BYTE_COMMON; // Tertiary weight.
   3214                     i = 2; // Reset the index into the buffer.
   3215                     while(i < endIndex)
   3216                     {
   3217                         uint32_t primWeight = numTempBuf[i++] << 8;
   3218                         if ( i < endIndex)
   3219                             primWeight |= numTempBuf[i++];
   3220                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3221                     }
   3222 
   3223                 } else {
   3224                     // no numeric mode, we'll just switch to whatever we stashed and continue
   3225                     CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   3226                     CE = *CEOffset++;
   3227                     break;
   3228                 }
   3229                 return CE;
   3230             }
   3231             /* various implicits optimization */
   3232         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   3233             /* UCA is filled with these. Tailorings are NOT_FOUND */
   3234             return getImplicit(cp, source);
   3235         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   3236             // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
   3237             return getImplicit(cp, source);
   3238         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3239             {
   3240                 static const uint32_t
   3241                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3242                 //const uint32_t LCount = 19;
   3243                 static const uint32_t VCount = 21;
   3244                 static const uint32_t TCount = 28;
   3245                 //const uint32_t NCount = VCount * TCount;   // 588
   3246                 //const uint32_t SCount = LCount * NCount;   // 11172
   3247                 uint32_t L = ch - SBase;
   3248 
   3249                 // divide into pieces
   3250 
   3251                 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
   3252                 L /= TCount;
   3253                 uint32_t V = L % VCount;
   3254                 L /= VCount;
   3255 
   3256                 // offset them
   3257 
   3258                 L += LBase;
   3259                 V += VBase;
   3260                 T += TBase;
   3261 
   3262                 // return the first CE, but first put the rest into the expansion buffer
   3263                 if (!source->coll->image->jamoSpecial) { // FAST PATH
   3264 
   3265                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   3266                     if (T != TBase) {
   3267                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   3268                     }
   3269 
   3270                     return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   3271 
   3272                 } else { // Jamo is Special
   3273                     // Since Hanguls pass the FCD check, it is
   3274                     // guaranteed that we won't be in
   3275                     // the normalization buffer if something like this happens
   3276 
   3277                     // However, if we are using a uchar iterator and normalization
   3278                     // is ON, the Hangul that lead us here is going to be in that
   3279                     // normalization buffer. Here we want to restore the uchar
   3280                     // iterator state and pull out of the normalization buffer
   3281                     if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
   3282                         source->flags = source->origFlags; // restore the iterator
   3283                         source->pos = NULL;
   3284                     }
   3285 
   3286                     // Move Jamos into normalization buffer
   3287                     UChar *buffer = source->writableBuffer.getBuffer(4);
   3288                     int32_t bufferLength;
   3289                     buffer[0] = (UChar)L;
   3290                     buffer[1] = (UChar)V;
   3291                     if (T != TBase) {
   3292                         buffer[2] = (UChar)T;
   3293                         bufferLength = 3;
   3294                     } else {
   3295                         bufferLength = 2;
   3296                     }
   3297                     source->writableBuffer.releaseBuffer(bufferLength);
   3298 
   3299                     // Indicate where to continue in main input string after exhausting the writableBuffer
   3300                     source->fcdPosition       = source->pos;
   3301 
   3302                     source->pos   = source->writableBuffer.getTerminatedBuffer();
   3303                     source->origFlags   = source->flags;
   3304                     source->flags       |= UCOL_ITER_INNORMBUF;
   3305                     source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   3306 
   3307                     return(UCOL_IGNORABLE);
   3308                 }
   3309             }
   3310         case SURROGATE_TAG:
   3311             /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
   3312             /* two things can happen here: next code point can be a trailing surrogate - we will use it */
   3313             /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
   3314             /* we treat it like an unassigned code point. */
   3315             {
   3316                 UChar trail;
   3317                 collIterateState state;
   3318                 backupState(source, &state);
   3319                 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
   3320                     // we could have stepped one char forward and it might have turned out that it
   3321                     // was not a trail surrogate. In that case, we have to back up.
   3322                     loadState(source, &state, TRUE);
   3323                     return UCOL_NOT_FOUND;
   3324                 } else {
   3325                     /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
   3326                     CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
   3327                     if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
   3328                         // We need to backup
   3329                         loadState(source, &state, TRUE);
   3330                         return CE;
   3331                     }
   3332                     // calculate the supplementary code point value, if surrogate was not tailored
   3333                     cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   3334                 }
   3335             }
   3336             break;
   3337         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   3338             UChar nextChar;
   3339             if( source->flags & UCOL_USE_ITERATOR) {
   3340                 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
   3341                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3342                     source->iterator->next(source->iterator);
   3343                     return getImplicit(cp, source);
   3344                 }
   3345             } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
   3346                       U_IS_TRAIL((nextChar=*source->pos))) {
   3347                 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3348                 source->pos++;
   3349                 return getImplicit(cp, source);
   3350             }
   3351             return UCOL_NOT_FOUND;
   3352         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   3353             return UCOL_NOT_FOUND; /* broken surrogate sequence */
   3354         case CHARSET_TAG:
   3355             /* not yet implemented */
   3356             /* probably after 1.8 */
   3357             return UCOL_NOT_FOUND;
   3358         default:
   3359             *status = U_INTERNAL_PROGRAM_ERROR;
   3360             CE=0;
   3361             break;
   3362     }
   3363     if (CE <= UCOL_NOT_FOUND) break;
   3364   }
   3365   return CE;
   3366 }
   3367 // getPrevImplicit(cp, collationSource): derives two 32-bit CE values from the implicit primary weight of cp.
   3368 // One value is appended to the CE buffer (through CEpos); the other is the return value.
   3369 // now uses Mark's getImplicitPrimary code: the weight r below comes from uprv_uca_getImplicitPrimary(cp)
   3370 static
   3371 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
   3372     uint32_t r = uprv_uca_getImplicitPrimary(cp);
   3373     // Buffer r masked with UCOL_PRIMARYMASK, OR'ed with 0x0505. NOTE(review): 0x05 is presumably the common secondary/tertiary byte - confirm.
   3374     *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
   3375     collationSource->toReturn = collationSource->CEpos; // one past the CE just stored
   3376     // Offset bookkeeping: the branches differ on whether UCOL_ITER_INNORMBUF is set in flags.
   3377     // **** doesn't work if using iterator **** (the else branch derives the offset from pos - string)
   3378     if (collationSource->flags & UCOL_ITER_INNORMBUF) {
   3379         collationSource->offsetRepeatCount = 1;
   3380     } else {
   3381         int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
   3382         // errorCode is local and never inspected, so a failure reported by appendOffset is ignored here.
   3383         UErrorCode errorCode = U_ZERO_ERROR;
   3384         collationSource->appendOffset(firstOffset, errorCode);
   3385         collationSource->appendOffset(firstOffset + 1, errorCode);
   3386         // offsetReturn is set to the slot just before offsetStore; the first buffer slot is then overwritten with firstOffset.
   3387         collationSource->offsetReturn = collationSource->offsetStore - 1;
   3388         *(collationSource->offsetBuffer) = firstOffset;
   3389         if (collationSource->offsetReturn == collationSource->offsetBuffer) {
   3390             collationSource->offsetStore = collationSource->offsetBuffer; // rewind the store pointer to the buffer start
   3391         }
   3392     }
   3393     // Returned value: low 16 bits of r moved to the upper half, OR'ed with 0xC0. NOTE(review): 0xC0 looks like the continuation marker - confirm.
   3394     return ((r & 0x0000FFFF)<<16) | 0x000000C0;
   3395 }
   3396 
   3397 /**
   3398  * This function handles the special CEs like contractions, expansions,
   3399  * surrogates, Thai.
   3400  * It is called by both getPrevCE
   3401  */
   3402 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
   3403                           collIterate *source,
   3404                           UErrorCode *status)
   3405 {
   3406     const uint32_t *CEOffset    = NULL;
   3407           UChar    *UCharOffset = NULL;
   3408           UChar    schar;
   3409     const UChar    *constart    = NULL;
   3410           uint32_t size;
   3411           UChar    buffer[UCOL_MAX_BUFFER];
   3412           uint32_t *endCEBuffer;
   3413           UChar   *strbuffer;
   3414           int32_t noChars = 0;
   3415           int32_t CECount = 0;
   3416 
   3417     for(;;)
   3418     {
   3419         /* the only ces that loops are thai and contractions */
   3420         switch (getCETag(CE))
   3421         {
   3422         case NOT_FOUND_TAG:  /* this tag always returns */
   3423             return CE;
   3424 
   3425         case SPEC_PROC_TAG:
   3426             {
   3427                 // Special processing is getting a CE that is preceded by a certain prefix
   3428                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   3429                 // When we encouter a special processing tag, we go backwards and try to see if
   3430                 // we have a match.
   3431                 // Contraction tables are used - so the whole process is not unlike contraction.
   3432                 // prefix data is stored backwards in the table.
   3433                 const UChar *UCharOffset;
   3434                 UChar schar, tchar;
   3435                 collIterateState prefixState;
   3436                 backupState(source, &prefixState);
   3437                 for(;;) {
   3438                     // This loop will run once per source string character, for as long as we
   3439                     //  are matching a potential contraction sequence
   3440 
   3441                     // First we position ourselves at the begining of contraction sequence
   3442                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   3443 
   3444                     if (collIter_bos(source)) {
   3445                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   3446                         break;
   3447                     }
   3448                     schar = getPrevNormalizedChar(source, status);
   3449                     goBackOne(source);
   3450 
   3451                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   3452                         UCharOffset++;
   3453                     }
   3454 
   3455                     if (schar == tchar) {
   3456                         // Found the source string char in the table.
   3457                         //  Pick up the corresponding CE from the table.
   3458                         CE = *(coll->contractionCEs +
   3459                             (UCharOffset - coll->contractionIndex));
   3460                     }
   3461                     else
   3462                     {
   3463                         // if there is a completely ignorable code point in the middle of
   3464                         // a prefix, we need to act as if it's not there
   3465                         // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
   3466                         // lone surrogates cannot be set to zero as it would break other processing
   3467                         uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   3468                         // it's easy for BMP code points
   3469                         if(isZeroCE == 0) {
   3470                             continue;
   3471                         } else if(U16_IS_SURROGATE(schar)) {
   3472                             // for supplementary code points, we have to check the next one
   3473                             // situations where we are going to ignore
   3474                             // 1. beginning of the string: schar is a lone surrogate
   3475                             // 2. schar is a lone surrogate
   3476                             // 3. schar is a trail surrogate in a valid surrogate sequence
   3477                             //    that is explicitly set to zero.
   3478                             if (!collIter_bos(source)) {
   3479                                 UChar lead;
   3480                                 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
   3481                                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
   3482                                     if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
   3483                                         uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
   3484                                         if(finalCE == 0) {
   3485                                             // this is a real, assigned completely ignorable code point
   3486                                             goBackOne(source);
   3487                                             continue;
   3488                                         }
   3489                                     }
   3490                                 } else {
   3491                                     // lone surrogate, treat like unassigned
   3492                                     return UCOL_NOT_FOUND;
   3493                                 }
   3494                             } else {
   3495                                 // lone surrogate at the beggining, treat like unassigned
   3496                                 return UCOL_NOT_FOUND;
   3497                             }
   3498                         }
   3499                         // Source string char was not in the table.
   3500                         //   We have not found the prefix.
   3501                         CE = *(coll->contractionCEs +
   3502                             (ContractionStart - coll->contractionIndex));
   3503                     }
   3504 
   3505                     if(!isPrefix(CE)) {
   3506                         // The source string char was in the contraction table, and the corresponding
   3507                         //   CE is not a prefix CE.  We found the prefix, break
   3508                         //   out of loop, this CE will end up being returned.  This is the normal
   3509                         //   way out of prefix handling when the source actually contained
   3510                         //   the prefix.
   3511                         break;
   3512                     }
   3513                 }
   3514                 loadState(source, &prefixState, TRUE);
   3515                 break;
   3516             }
   3517 
   3518         case CONTRACTION_TAG: {
   3519             /* to ensure that the backwards and forwards iteration matches, we
   3520             take the current region of most possible match and pass it through
   3521             the forward iteration. this will ensure that the obstinate problem of
   3522             overlapping contractions will not occur.
   3523             */
   3524             schar = peekCodeUnit(source, 0);
   3525             constart = (UChar *)coll->image + getContractOffset(CE);
   3526             if (isAtStartPrevIterate(source)
   3527                 /* commented away contraction end checks after adding the checks
   3528                 in getPrevCE  */) {
   3529                     /* start of string or this is not the end of any contraction */
   3530                     CE = *(coll->contractionCEs +
   3531                         (constart - coll->contractionIndex));
   3532                     break;
   3533             }
   3534             strbuffer = buffer;
   3535             UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
   3536             *(UCharOffset --) = 0;
   3537             noChars = 0;
   3538             // have to swap thai characters
   3539             while (ucol_unsafeCP(schar, coll)) {
   3540                 *(UCharOffset) = schar;
   3541                 noChars++;
   3542                 UCharOffset --;
   3543                 schar = getPrevNormalizedChar(source, status);
   3544                 goBackOne(source);
   3545                 // TODO: when we exhaust the contraction buffer,
   3546                 // it needs to get reallocated. The problem is
   3547                 // that the size depends on the string which is
   3548                 // not iterated over. However, since we're travelling
   3549                 // backwards, we already had to set the iterator at
   3550                 // the end - so we might as well know where we are?
   3551                 if (UCharOffset + 1 == buffer) {
   3552                     /* we have exhausted the buffer */
   3553                     int32_t newsize = 0;
   3554                     if(source->pos) { // actually dealing with a position
   3555                         newsize = (int32_t)(source->pos - source->string + 1);
   3556                     } else { // iterator
   3557                         newsize = 4 * UCOL_MAX_BUFFER;
   3558                     }
   3559                     strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
   3560                         (newsize + UCOL_MAX_BUFFER));
   3561                     /* test for NULL */
   3562                     if (strbuffer == NULL) {
   3563                         *status = U_MEMORY_ALLOCATION_ERROR;
   3564                         return UCOL_NO_MORE_CES;
   3565                     }
   3566                     UCharOffset = strbuffer + newsize;
   3567                     uprv_memcpy(UCharOffset, buffer,
   3568                         UCOL_MAX_BUFFER * sizeof(UChar));
   3569                     UCharOffset --;
   3570                 }
   3571                 if ((source->pos && (source->pos == source->string ||
   3572                     ((source->flags & UCOL_ITER_INNORMBUF) &&
   3573                     *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
   3574                     || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
   3575                         break;
   3576                 }
   3577             }
   3578             /* adds the initial base character to the string */
   3579             *(UCharOffset) = schar;
   3580             noChars++;
   3581 
   3582             int32_t offsetBias;
   3583 
   3584             // **** doesn't work if using iterator ****
   3585             if (source->flags & UCOL_ITER_INNORMBUF) {
   3586                 offsetBias = -1;
   3587             } else {
   3588                 offsetBias = (int32_t)(source->pos - source->string);
   3589             }
   3590 
   3591             /* a new collIterate is used to simplify things, since using the current
   3592             collIterate will mean that the forward and backwards iteration will
   3593             share and change the same buffers. we don't want to get into that. */
   3594             collIterate temp;
   3595             int32_t rawOffset;
   3596 
   3597             IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
   3598             if(U_FAILURE(*status)) {
   3599                 return (uint32_t)UCOL_NULLORDER;
   3600             }
   3601             temp.flags &= ~UCOL_ITER_NORM;
   3602             temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
   3603 
   3604             rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
   3605             CE = ucol_IGetNextCE(coll, &temp, status);
   3606 
   3607             if (source->extendCEs) {
   3608                 endCEBuffer = source->extendCEs + source->extendCEsSize;
   3609                 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
   3610             } else {
   3611                 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
   3612                 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
   3613             }
   3614 
   3615             while (CE != UCOL_NO_MORE_CES) {
   3616                 *(source->CEpos ++) = CE;
   3617 
   3618                 if (offsetBias >= 0) {
   3619                     source->appendOffset(rawOffset + offsetBias, *status);
   3620                 }
   3621 
   3622                 CECount++;
   3623                 if (source->CEpos == endCEBuffer) {
   3624                     /* ran out of CE space, reallocate to new buffer.
   3625                     If reallocation fails, reset pointers and bail out,
   3626                     there's no guarantee of the right character position after
   3627                     this bail*/
   3628                     if (!increaseCEsCapacity(source)) {
   3629                         *status = U_MEMORY_ALLOCATION_ERROR;
   3630                         break;
   3631                     }
   3632 
   3633                     endCEBuffer = source->extendCEs + source->extendCEsSize;
   3634                 }
   3635 
   3636                 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
   3637                     rawOffset = (int32_t)(temp.fcdPosition - temp.string);
   3638                 } else {
   3639                     rawOffset = (int32_t)(temp.pos - temp.string);
   3640                 }
   3641 
   3642                 CE = ucol_IGetNextCE(coll, &temp, status);
   3643             }
   3644 
   3645             if (strbuffer != buffer) {
   3646                 uprv_free(strbuffer);
   3647             }
   3648             if (U_FAILURE(*status)) {
   3649                 return (uint32_t)UCOL_NULLORDER;
   3650             }
   3651 
   3652             if (source->offsetRepeatValue != 0) {
   3653                 if (CECount > noChars) {
   3654                     source->offsetRepeatCount += temp.offsetRepeatCount;
   3655                 } else {
   3656                     // **** does this really skip the right offsets? ****
   3657                     source->offsetReturn -= (noChars - CECount);
   3658                 }
   3659             }
   3660 
   3661             if (offsetBias >= 0) {
   3662                 source->offsetReturn = source->offsetStore - 1;
   3663                 if (source->offsetReturn == source->offsetBuffer) {
   3664                     source->offsetStore = source->offsetBuffer;
   3665                 }
   3666             }
   3667 
   3668             source->toReturn = source->CEpos - 1;
   3669             if (source->toReturn == source->CEs) {
   3670                 source->CEpos = source->CEs;
   3671             }
   3672 
   3673             return *(source->toReturn);
   3674         }
   3675         case LONG_PRIMARY_TAG:
   3676             {
   3677                 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   3678                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   3679                 source->toReturn = source->CEpos - 1;
   3680 
   3681                 if (source->flags & UCOL_ITER_INNORMBUF) {
   3682                     source->offsetRepeatCount = 1;
   3683                 } else {
   3684                     int32_t firstOffset = (int32_t)(source->pos - source->string);
   3685 
   3686                     source->appendOffset(firstOffset, *status);
   3687                     source->appendOffset(firstOffset + 1, *status);
   3688 
   3689                     source->offsetReturn = source->offsetStore - 1;
   3690                     *(source->offsetBuffer) = firstOffset;
   3691                     if (source->offsetReturn == source->offsetBuffer) {
   3692                         source->offsetStore = source->offsetBuffer;
   3693                     }
   3694                 }
   3695 
   3696 
   3697                 return *(source->toReturn);
   3698             }
   3699 
   3700         case EXPANSION_TAG: /* this tag always returns */
   3701             {
   3702             /*
   3703             This should handle expansion.
   3704             NOTE: we can encounter both continuations and expansions in an expansion!
   3705             I have to decide where continuations are going to be dealt with
   3706             */
   3707             int32_t firstOffset = (int32_t)(source->pos - source->string);
   3708 
   3709             // **** doesn't work if using iterator ****
   3710             if (source->offsetReturn != NULL) {
   3711                 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
   3712                     source->offsetStore = source->offsetBuffer;
   3713                 }else {
   3714                   firstOffset = -1;
   3715                 }
   3716             }
   3717 
   3718             /* find the offset to expansion table */
   3719             CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3720             size     = getExpansionCount(CE);
   3721             if (size != 0) {
   3722                 /*
   3723                 if there are less than 16 elements in expansion, we don't terminate
   3724                 */
   3725                 uint32_t count;
   3726 
   3727                 for (count = 0; count < size; count++) {
   3728                     *(source->CEpos ++) = *CEOffset++;
   3729 
   3730                     if (firstOffset >= 0) {
   3731                         source->appendOffset(firstOffset + 1, *status);
   3732                     }
   3733                 }
   3734             } else {
   3735                 /* else, we do */
   3736                 while (*CEOffset != 0) {
   3737                     *(source->CEpos ++) = *CEOffset ++;
   3738 
   3739                     if (firstOffset >= 0) {
   3740                         source->appendOffset(firstOffset + 1, *status);
   3741                     }
   3742                 }
   3743             }
   3744 
   3745             if (firstOffset >= 0) {
   3746                 source->offsetReturn = source->offsetStore - 1;
   3747                 *(source->offsetBuffer) = firstOffset;
   3748                 if (source->offsetReturn == source->offsetBuffer) {
   3749                     source->offsetStore = source->offsetBuffer;
   3750                 }
   3751             } else {
   3752                 source->offsetRepeatCount += size - 1;
   3753             }
   3754 
   3755             source->toReturn = source->CEpos - 1;
   3756             // in case of one element expansion, we
   3757             // want to immediately return CEpos
   3758             if(source->toReturn == source->CEs) {
   3759                 source->CEpos = source->CEs;
   3760             }
   3761 
   3762             return *(source->toReturn);
   3763             }
   3764 
   3765         case DIGIT_TAG:
   3766             {
   3767                 /*
   3768                 We do a check to see if we want to collate digits as numbers; if so we generate
   3769                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   3770                 */
   3771                 uint32_t i;    /* general counter */
   3772 
   3773                 if (source->coll->numericCollation == UCOL_ON){
   3774                     uint32_t digIndx = 0;
   3775                     uint32_t endIndex = 0;
   3776                     uint32_t leadingZeroIndex = 0;
   3777                     uint32_t trailingZeroCount = 0;
   3778 
   3779                     uint8_t collateVal = 0;
   3780 
   3781                     UBool nonZeroValReached = FALSE;
   3782 
   3783                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
   3784                     /*
   3785                     We parse the source string until we hit a char that's NOT a digit.
   3786                     Use this u_charDigitValue. This might be slow because we have to
   3787                     handle surrogates...
   3788                     */
   3789                     /*
   3790                     We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
   3791                     with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
   3792                     element we process when going backward. To determine how long that chunk might be, we may need to make
   3793                     two passes through the loop that collects digits - one to see how long the string is (and how much is
   3794                     leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
   3795                     more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
   3796                     element chunk after resetting the state to the initialState at the right side of the digit string.
   3797                     */
   3798                     uint32_t ceLimit = 0;
   3799                     UChar initial_ch = ch;
   3800                     collIterateState initialState = {0,0,0,0,0,0,0,0,0};
   3801                     backupState(source, &initialState);
   3802 
   3803                     for(;;) {
   3804                         collIterateState state = {0,0,0,0,0,0,0,0,0};
   3805                         UChar32 char32 = 0;
   3806                         int32_t digVal = 0;
   3807 
   3808                         if (U16_IS_TRAIL (ch)) {
   3809                             if (!collIter_bos(source)){
   3810                                 UChar lead = getPrevNormalizedChar(source, status);
   3811                                 if(U16_IS_LEAD(lead)) {
   3812                                     char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3813                                     goBackOne(source);
   3814                                 } else {
   3815                                     char32 = ch;
   3816                                 }
   3817                             } else {
   3818                                 char32 = ch;
   3819                             }
   3820                         } else {
   3821                             char32 = ch;
   3822                         }
   3823                         digVal = u_charDigitValue(char32);
   3824 
   3825                         for(;;) {
   3826                             // Make sure we have enough space. No longer needed;
   3827                             // at this point the largest value of digIndx when we need to save data in numTempBuf
   3828                             // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
   3829                             // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
   3830 
   3831                             // Skip over trailing zeroes, and keep a count of them.
   3832                             if (digVal != 0)
   3833                                 nonZeroValReached = TRUE;
   3834 
   3835                             if (nonZeroValReached) {
   3836                                 /*
   3837                                 We parse the digit string into base 100 numbers (this fits into a byte).
   3838                                 We only add to the buffer in twos, thus if we are parsing an odd character,
   3839                                 that serves as the 'tens' digit while the if we are parsing an even one, that
   3840                                 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
   3841                                 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
   3842                                 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3843                                 than all the other bytes.
   3844 
   3845                                 Since we're doing in this reverse we want to put the first digit encountered into the
   3846                                 ones place and the second digit encountered into the tens place.
   3847                                 */
   3848 
   3849                                 if ((digIndx + trailingZeroCount) % 2 == 1) {
   3850                                     // High-order digit case (tens place)
   3851                                     collateVal += (uint8_t)(digVal * 10);
   3852 
   3853                                     // We cannot set leadingZeroIndex unless it has been set for the
   3854                                     // low-order digit. Therefore, all we can do for the high-order
   3855                                     // digit is turn it off, never on.
   3856                                     // The only time we will have a high digit without a low is for
   3857                                     // the very first non-zero digit, so no zero check is necessary.
   3858                                     if (collateVal != 0)
   3859                                         leadingZeroIndex = 0;
   3860 
   3861                                     // The first pass through, digIndx may exceed the limit, but in that case
   3862                                     // we no longer care about numTempBuf contents since they will be discarded
   3863                                     if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
   3864                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3865                                     }
   3866                                     collateVal = 0;
   3867                                 } else {
   3868                                     // Low-order digit case (ones place)
   3869                                     collateVal = (uint8_t)digVal;
   3870 
   3871                                     // Check for leading zeroes.
   3872                                     if (collateVal == 0) {
   3873                                         if (!leadingZeroIndex)
   3874                                             leadingZeroIndex = (digIndx/2) + 2;
   3875                                     } else
   3876                                         leadingZeroIndex = 0;
   3877 
   3878                                     // No need to write to buffer; the case of a last odd digit
   3879                                     // is handled below.
   3880                                 }
   3881                                 ++digIndx;
   3882                             } else
   3883                                 ++trailingZeroCount;
   3884 
   3885                             if (!collIter_bos(source)) {
   3886                                 ch = getPrevNormalizedChar(source, status);
   3887                                 //goBackOne(source);
   3888                                 if (U16_IS_TRAIL(ch)) {
   3889                                     backupState(source, &state);
   3890                                     if (!collIter_bos(source)) {
   3891                                         goBackOne(source);
   3892                                         UChar lead = getPrevNormalizedChar(source, status);
   3893 
   3894                                         if(U16_IS_LEAD(lead)) {
   3895                                             char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3896                                         } else {
   3897                                             loadState(source, &state, FALSE);
   3898                                             char32 = ch;
   3899                                         }
   3900                                     }
   3901                                 } else
   3902                                     char32 = ch;
   3903 
   3904                                 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
   3905                                     if (char32 > 0xFFFF) {// For surrogates.
   3906                                         loadState(source, &state, FALSE);
   3907                                     }
   3908                                     // Don't need to "reverse" the goBackOne call,
   3909                                     // as this points to the next position to process..
   3910                                     //if (char32 > 0xFFFF) // For surrogates.
   3911                                     //getNextNormalizedChar(source);
   3912                                     break;
   3913                                 }
   3914 
   3915                                 goBackOne(source);
   3916                             }else
   3917                                 break;
   3918                         }
   3919 
   3920                         if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
   3921                             // our collation element is not too big, go ahead and finish with it
   3922                             break;
   3923                         }
   3924                         // our digit string is too long for a collation element;
   3925                         // set the limit for it, reset the state and begin again
   3926                         ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
   3927                         if ( ceLimit == 0 ) {
   3928                             ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
   3929                         }
   3930                         ch = initial_ch;
   3931                         loadState(source, &initialState, FALSE);
   3932                         digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
   3933                         collateVal = 0;
   3934                         nonZeroValReached = FALSE;
   3935                     }
   3936 
   3937                     if (! nonZeroValReached) {
   3938                         digIndx = 2;
   3939                         trailingZeroCount = 0;
   3940                         numTempBuf[2] = 6;
   3941                     }
   3942 
   3943                     if ((digIndx + trailingZeroCount) % 2 != 0) {
   3944                         numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
   3945                         digIndx += 1;       // The implicit leading zero
   3946                     }
   3947                     if (trailingZeroCount % 2 != 0) {
   3948                         // We had to consume one trailing zero for the low digit
   3949                         // of the least significant byte
   3950                         digIndx += 1;       // The trailing zero not in the exponent
   3951                         trailingZeroCount -= 1;
   3952                     }
   3953 
   3954                     endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
   3955 
   3956                     // Subtract one off of the last byte. Really the first byte here, but it's reversed...
   3957                     numTempBuf[2] -= 1;
   3958 
   3959                     /*
   3960                     We want to skip over the first two slots in the buffer. The first slot
   3961                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3962                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3963                     The exponent must be adjusted by the number of leading zeroes, and the number of
   3964                     trailing zeroes.
   3965                     */
   3966                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3967                     uint32_t exponent = (digIndx+trailingZeroCount)/2;
   3968                     if (leadingZeroIndex)
   3969                         exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
   3970                     numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
   3971 
   3972                     // Now transfer the collation key to our collIterate struct.
   3973                     // The total size for our collation key is half of endIndex, rounded up.
   3974                     int32_t size = (endIndex+1)/2;
   3975                     if(!ensureCEsCapacity(source, size)) {
   3976                         return (uint32_t)UCOL_NULLORDER;
   3977                     }
   3978                     *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3979                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3980                         UCOL_BYTE_COMMON; // Tertiary weight.
   3981                     i = endIndex - 1; // Reset the index into the buffer.
   3982                     while(i >= 2) {
   3983                         uint32_t primWeight = numTempBuf[i--] << 8;
   3984                         if ( i >= 2)
   3985                             primWeight |= numTempBuf[i--];
   3986                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3987                     }
   3988 
   3989                     source->toReturn = source->CEpos -1;
   3990                     return *(source->toReturn);
   3991                 } else {
   3992                     CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3993                     CE = *(CEOffset++);
   3994                     break;
   3995                 }
   3996             }
   3997 
   3998         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3999             {
   4000                 static const uint32_t
   4001                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   4002                 //const uint32_t LCount = 19;
   4003                 static const uint32_t VCount = 21;
   4004                 static const uint32_t TCount = 28;
   4005                 //const uint32_t NCount = VCount * TCount;   /* 588 */
   4006                 //const uint32_t SCount = LCount * NCount;   /* 11172 */
   4007 
   4008                 uint32_t L = ch - SBase;
   4009                 /*
   4010                 divide into pieces.
   4011                 we do it in this order since some compilers can do % and / in one
   4012                 operation
   4013                 */
   4014                 uint32_t T = L % TCount;
   4015                 L /= TCount;
   4016                 uint32_t V = L % VCount;
   4017                 L /= VCount;
   4018 
   4019                 /* offset them */
   4020                 L += LBase;
   4021                 V += VBase;
   4022                 T += TBase;
   4023 
   4024                 int32_t firstOffset = (int32_t)(source->pos - source->string);
   4025                 source->appendOffset(firstOffset, *status);
   4026 
   4027                 /*
   4028                  * return the first CE, but first put the rest into the expansion buffer
   4029                  */
   4030                 if (!source->coll->image->jamoSpecial) {
   4031                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   4032                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   4033                     source->appendOffset(firstOffset + 1, *status);
   4034 
   4035                     if (T != TBase) {
   4036                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   4037                         source->appendOffset(firstOffset + 1, *status);
   4038                     }
   4039 
   4040                     source->toReturn = source->CEpos - 1;
   4041 
   4042                     source->offsetReturn = source->offsetStore - 1;
   4043                     if (source->offsetReturn == source->offsetBuffer) {
   4044                         source->offsetStore = source->offsetBuffer;
   4045                     }
   4046 
   4047                     return *(source->toReturn);
   4048                 } else {
   4049                     // Since Hanguls pass the FCD check, it is
   4050                     // guaranteed that we won't be in
   4051                     // the normalization buffer if something like this happens
   4052 
   4053                     // Move Jamos into normalization buffer
   4054                     UChar *tempbuffer = source->writableBuffer.getBuffer(5);
   4055                     int32_t tempbufferLength, jamoOffset;
   4056                     tempbuffer[0] = 0;
   4057                     tempbuffer[1] = (UChar)L;
   4058                     tempbuffer[2] = (UChar)V;
   4059                     if (T != TBase) {
   4060                         tempbuffer[3] = (UChar)T;
   4061                         tempbufferLength = 4;
   4062                     } else {
   4063                         tempbufferLength = 3;
   4064                     }
   4065                     source->writableBuffer.releaseBuffer(tempbufferLength);
   4066 
   4067                     // Indicate where to continue in main input string after exhausting the writableBuffer
   4068                     if (source->pos  == source->string) {
   4069                         jamoOffset = 0;
   4070                         source->fcdPosition = NULL;
   4071                     } else {
   4072                         jamoOffset = source->pos - source->string;
   4073                         source->fcdPosition       = source->pos-1;
   4074                     }
   4075 
   4076                     // Append offsets for the additional chars
   4077                     // (not the 0, and not the L whose offsets match the original Hangul)
   4078                     int32_t jamoRemaining = tempbufferLength - 2;
   4079                     jamoOffset++; // appended offsets should match end of original Hangul
   4080                     while (jamoRemaining-- > 0) {
   4081                         source->appendOffset(jamoOffset, *status);
   4082                     }
   4083 
   4084                     source->offsetRepeatValue = jamoOffset;
   4085 
   4086                     source->offsetReturn = source->offsetStore - 1;
   4087                     if (source->offsetReturn == source->offsetBuffer) {
   4088                         source->offsetStore = source->offsetBuffer;
   4089                     }
   4090 
   4091                     source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
   4092                     source->origFlags         = source->flags;
   4093                     source->flags            |= UCOL_ITER_INNORMBUF;
   4094                     source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   4095 
   4096                     return(UCOL_IGNORABLE);
   4097                 }
   4098             }
   4099 
   4100         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   4101             return getPrevImplicit(ch, source);
   4102 
   4103             // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
   4104         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   4105             return getPrevImplicit(ch, source);
   4106 
   4107         case SURROGATE_TAG:  /* This is a surrogate pair */
   4108             /* essentially an engaged lead surrogate. */
   4109             /* if you have encountered it here, it means that a */
   4110             /* broken sequence was encountered and this is an error */
   4111             return UCOL_NOT_FOUND;
   4112 
   4113         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   4114             return UCOL_NOT_FOUND; /* broken surrogate sequence */
   4115 
   4116         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   4117             {
   4118                 UChar32 cp = 0;
   4119                 UChar  prevChar;
   4120                 const UChar *prev;
   4121                 if (isAtStartPrevIterate(source)) {
   4122                     /* we are at the start of the string, wrong place to be at */
   4123                     return UCOL_NOT_FOUND;
   4124                 }
   4125                 if (source->pos != source->writableBuffer.getBuffer()) {
   4126                     prev     = source->pos - 1;
   4127                 } else {
   4128                     prev     = source->fcdPosition;
   4129                 }
   4130                 prevChar = *prev;
   4131 
   4132                 /* Handles Han and Supplementary characters here.*/
   4133                 if (U16_IS_LEAD(prevChar)) {
   4134                     cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   4135                     source->pos = prev;
   4136                 } else {
   4137                     return UCOL_NOT_FOUND; /* like unassigned */
   4138                 }
   4139 
   4140                 return getPrevImplicit(cp, source);
   4141             }
   4142 
   4143             /* UCA is filled with these. Tailorings are NOT_FOUND */
   4144             /* not yet implemented */
   4145         case CHARSET_TAG:  /* this tag always returns */
   4146             /* probably after 1.8 */
   4147             return UCOL_NOT_FOUND;
   4148 
   4149         default:           /* this tag always returns */
   4150             *status = U_INTERNAL_PROGRAM_ERROR;
   4151             CE=0;
   4152             break;
   4153         }
   4154 
   4155         if (CE <= UCOL_NOT_FOUND) {
   4156             break;
   4157         }
   4158     }
   4159 
   4160     return CE;
   4161 }
   4162 
   4163 /* This should really be a macro                                                                      */
   4164 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
   4165 /* secondaries in French                                                                              */
   4166 /*
   4167 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
   4168   uint8_t temp;
   4169   while(start<end) {
   4170     temp = *start;
   4171     *start++ = *end;
   4172     *end-- = temp;
   4173   }
   4174 }
   4175 */
   4176 
   /*
    * Reverses, in place, the elements lying between two pointers.
    *   TYPE  - element type the pointers refer to
    *   start - modifiable pointer to the first element of the range
    *   end   - modifiable pointer to the last element of the range (inclusive)
    * Both pointer arguments are modified: they walk towards each other and are
    * left where they meet or cross. The macro expands to a brace-enclosed block.
    */
   #define uprv_ucol_reverse_buffer(TYPE, start, end) { \
       TYPE swapTemp; \
       while ((start) < (end)) { \
           swapTemp = *(end); \
           *(end)-- = *(start); \
           *(start)++ = swapTemp; \
       } \
   }
   4185 
   4186 /****************************************************************************/
   4187 /* Following are the sortkey generation functions                           */
   4188 /*                                                                          */
   4189 /****************************************************************************/
   4190 
   4191 /**
   4192  * Merge two sort keys.
   4193  * This is useful, for example, to combine sort keys from first and last names
   4194  * to sort such pairs.
   4195  * Merged sort keys consider on each collation level the first part first entirely,
   4196  * then the second one.
   4197  * It is possible to merge multiple sort keys by consecutively merging
   4198  * another one with the intermediate result.
   4199  *
   4200  * The length of the merge result is the sum of the lengths of the input sort keys
   4201  * minus 1.
   4202  *
   4203  * @param src1 the first sort key
   4204  * @param src1Length the length of the first sort key, including the zero byte at the end;
   4205  *        can be -1 if the function is to find the length
   4206  * @param src2 the second sort key
   4207  * @param src2Length the length of the second sort key, including the zero byte at the end;
   4208  *        can be -1 if the function is to find the length
   4209  * @param dest the buffer where the merged sort key is written,
   4210  *        can be NULL if destCapacity==0
   4211  * @param destCapacity the number of bytes in the dest buffer
   4212  * @return the length of the merged sort key, src1Length+src2Length-1;
   4213  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
   4214  *         in which cases the contents of dest is undefined
   4215  *
   4216  * @draft
   4217  */
   4218 U_CAPI int32_t U_EXPORT2
   4219 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
   4220                    const uint8_t *src2, int32_t src2Length,
   4221                    uint8_t *dest, int32_t destCapacity) {
   4222     int32_t destLength;
   4223     uint8_t b;
   4224 
   4225     /* check arguments */
   4226     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
   4227         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
   4228         destCapacity<0 || (destCapacity>0 && dest==NULL)
   4229     ) {
   4230         /* error, attempt to write a zero byte and return 0 */
   4231         if(dest!=NULL && destCapacity>0) {
   4232             *dest=0;
   4233         }
   4234         return 0;
   4235     }
   4236 
   4237     /* check lengths and capacity */
   4238     if(src1Length<0) {
   4239         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
   4240     }
   4241     if(src2Length<0) {
   4242         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
   4243     }
   4244 
   4245     destLength=src1Length+src2Length-1;
   4246     if(destLength>destCapacity) {
   4247         /* the merged sort key does not fit into the destination */
   4248         return destLength;
   4249     }
   4250 
   4251     /* merge the sort keys with the same number of levels */
   4252     while(*src1!=0 && *src2!=0) { /* while both have another level */
   4253         /* copy level from src1 not including 00 or 01 */
   4254         while((b=*src1)>=2) {
   4255             ++src1;
   4256             *dest++=b;
   4257         }
   4258 
   4259         /* add a 02 merge separator */
   4260         *dest++=2;
   4261 
   4262         /* copy level from src2 not including 00 or 01 */
   4263         while((b=*src2)>=2) {
   4264             ++src2;
   4265             *dest++=b;
   4266         }
   4267 
   4268         /* if both sort keys have another level, then add a 01 level separator and continue */
   4269         if(*src1==1 && *src2==1) {
   4270             ++src1;
   4271             ++src2;
   4272             *dest++=1;
   4273         }
   4274     }
   4275 
   4276     /*
   4277      * here, at least one sort key is finished now, but the other one
   4278      * might have some contents left from containing more levels;
   4279      * that contents is just appended to the result
   4280      */
   4281     if(*src1!=0) {
   4282         /* src1 is not finished, therefore *src2==0, and src1 is appended */
   4283         src2=src1;
   4284     }
   4285     /* append src2, "the other, unfinished sort key" */
   4286     uprv_strcpy((char *)dest, (const char *)src2);
   4287 
   4288     /* trust that neither sort key contained illegally embedded zero bytes */
   4289     return destLength;
   4290 }
   4291 
   4292 U_NAMESPACE_BEGIN
   4293 
   4294 class SortKeyByteSink : public ByteSink {
   4295 public:
   4296     SortKeyByteSink(char *dest, int32_t destCapacity)
   4297             : buffer_(dest), capacity_(destCapacity),
   4298               appended_(0) {
   4299         if (buffer_ == NULL) {
   4300             capacity_ = 0;
   4301         } else if(capacity_ < 0) {
   4302             buffer_ = NULL;
   4303             capacity_ = 0;
   4304         }
   4305     }
   4306     virtual ~SortKeyByteSink();
   4307 
   4308     virtual void Append(const char *bytes, int32_t n);
   4309     void Append(uint32_t b) {
   4310         if (appended_ < capacity_ || Resize(1, appended_)) {
   4311             buffer_[appended_] = (char)b;
   4312         }
   4313         ++appended_;
   4314     }
   4315     void Append(uint32_t b1, uint32_t b2) {
   4316         int32_t a2 = appended_ + 2;
   4317         if (a2 <= capacity_ || Resize(2, appended_)) {
   4318             buffer_[appended_] = (char)b1;
   4319             buffer_[appended_ + 1] = (char)b2;
   4320         } else if(appended_ < capacity_) {
   4321             buffer_[appended_] = (char)b1;
   4322         }
   4323         appended_ = a2;
   4324     }
   4325     virtual char *GetAppendBuffer(int32_t min_capacity,
   4326                                   int32_t desired_capacity_hint,
   4327                                   char *scratch, int32_t scratch_capacity,
   4328                                   int32_t *result_capacity);
   4329     int32_t NumberOfBytesAppended() const { return appended_; }
   4330     /** @return FALSE if memory allocation failed */
   4331     UBool IsOk() const { return buffer_ != NULL; }
   4332 
   4333 protected:
   4334     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0;
   4335     virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;
   4336 
   4337     void SetNotOk() {
   4338         buffer_ = NULL;
   4339         capacity_ = 0;
   4340     }
   4341 
   4342     char *buffer_;
   4343     int32_t capacity_;
   4344     int32_t appended_;
   4345 
   4346 private:
   4347     SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
   4348     SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
   4349 };
   4350 
   4351 SortKeyByteSink::~SortKeyByteSink() {}
   4352 
   4353 void
   4354 SortKeyByteSink::Append(const char *bytes, int32_t n) {
   4355     if (n <= 0 || bytes == NULL) {
   4356         return;
   4357     }
   4358     int32_t length = appended_;
   4359     appended_ += n;
   4360     if ((buffer_ + length) == bytes) {
   4361         return;  // the caller used GetAppendBuffer() and wrote the bytes already
   4362     }
   4363     int32_t available = capacity_ - length;
   4364     if (n <= available) {
   4365         uprv_memcpy(buffer_ + length, bytes, n);
   4366     } else {
   4367         AppendBeyondCapacity(bytes, n, length);
   4368     }
   4369 }
   4370 
   4371 char *
   4372 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
   4373                                  int32_t desired_capacity_hint,
   4374                                  char *scratch,
   4375                                  int32_t scratch_capacity,
   4376                                  int32_t *result_capacity) {
   4377     if (min_capacity < 1 || scratch_capacity < min_capacity) {
   4378         *result_capacity = 0;
   4379         return NULL;
   4380     }
   4381     int32_t available = capacity_ - appended_;
   4382     if (available >= min_capacity) {
   4383         *result_capacity = available;
   4384         return buffer_ + appended_;
   4385     } else if (Resize(desired_capacity_hint, appended_)) {
   4386         *result_capacity = capacity_ - appended_;
   4387         return buffer_ + appended_;
   4388     } else {
   4389         *result_capacity = scratch_capacity;
   4390         return scratch;
   4391     }
   4392 }
   4393 
   4394 class FixedSortKeyByteSink : public SortKeyByteSink {
   4395 public:
   4396     FixedSortKeyByteSink(char *dest, int32_t destCapacity)
   4397             : SortKeyByteSink(dest, destCapacity) {}
   4398     virtual ~FixedSortKeyByteSink();
   4399 
   4400 private:
   4401     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
   4402     virtual UBool Resize(int32_t appendCapacity, int32_t length);
   4403 };
   4404 
   4405 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
   4406 
   4407 void
   4408 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
   4409     // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
   4410     // Fill the buffer completely.
   4411     int32_t available = capacity_ - length;
   4412     if (available > 0) {
   4413         uprv_memcpy(buffer_ + length, bytes, available);
   4414     }
   4415 }
   4416 
   4417 UBool
   4418 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
   4419     return FALSE;
   4420 }
   4421 
   4422 class CollationKeyByteSink : public SortKeyByteSink {
   4423 public:
   4424     CollationKeyByteSink(CollationKey &key)
   4425             : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
   4426               key_(key) {}
   4427     virtual ~CollationKeyByteSink();
   4428 
   4429 private:
   4430     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
   4431     virtual UBool Resize(int32_t appendCapacity, int32_t length);
   4432 
   4433     CollationKey &key_;
   4434 };
   4435 
   4436 CollationKeyByteSink::~CollationKeyByteSink() {}
   4437 
   4438 void
   4439 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
   4440     // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
   4441     if (Resize(n, length)) {
   4442         uprv_memcpy(buffer_ + length, bytes, n);
   4443     }
   4444 }
   4445 
   4446 UBool
   4447 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
   4448     if (buffer_ == NULL) {
   4449         return FALSE;  // allocation failed before already
   4450     }
   4451     int32_t newCapacity = 2 * capacity_;
   4452     int32_t altCapacity = length + 2 * appendCapacity;
   4453     if (newCapacity < altCapacity) {
   4454         newCapacity = altCapacity;
   4455     }
   4456     if (newCapacity < 200) {
   4457         newCapacity = 200;
   4458     }
   4459     uint8_t *newBuffer = key_.reallocate(newCapacity, length);
   4460     if (newBuffer == NULL) {
   4461         SetNotOk();
   4462         return FALSE;
   4463     }
   4464     buffer_ = reinterpret_cast<char *>(newBuffer);
   4465     capacity_ = newCapacity;
   4466     return TRUE;
   4467 }
   4468 
   4469 /**
   4470  * uint8_t byte buffer, similar to CharString but simpler.
   4471  */
   4472 class SortKeyLevel : public UMemory {
   4473 public:
   4474     SortKeyLevel() : len(0), ok(TRUE) {}
   4475     ~SortKeyLevel() {}
   4476 
   4477     /** @return FALSE if memory allocation failed */
   4478     UBool isOk() const { return ok; }
   4479     UBool isEmpty() const { return len == 0; }
   4480     int32_t length() const { return len; }
   4481     const uint8_t *data() const { return buffer.getAlias(); }
   4482     uint8_t operator[](int32_t index) const { return buffer[index]; }
   4483 
   4484     void appendByte(uint32_t b);
   4485 
   4486     void appendTo(ByteSink &sink) const {
   4487         sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len);
   4488     }
   4489 
   4490     uint8_t &lastByte() {
   4491         U_ASSERT(len > 0);
   4492         return buffer[len - 1];
   4493     }
   4494 
   4495     uint8_t *getLastFewBytes(int32_t n) {
   4496         if (ok && len >= n) {
   4497             return buffer.getAlias() + len - n;
   4498         } else {
   4499             return NULL;
   4500         }
   4501     }
   4502 
   4503 private:
   4504     MaybeStackArray<uint8_t, 40> buffer;
   4505     int32_t len;
   4506     UBool ok;
   4507 
   4508     UBool ensureCapacity(int32_t appendCapacity);
   4509 
   4510     SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class
   4511     SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class
   4512 };
   4513 
   4514 void SortKeyLevel::appendByte(uint32_t b) {
   4515     if(len < buffer.getCapacity() || ensureCapacity(1)) {
   4516         buffer[len++] = (uint8_t)b;
   4517     }
   4518 }
   4519 
   4520 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) {
   4521     if(!ok) {
   4522         return FALSE;
   4523     }
   4524     int32_t newCapacity = 2 * buffer.getCapacity();
   4525     int32_t altCapacity = len + 2 * appendCapacity;
   4526     if (newCapacity < altCapacity) {
   4527         newCapacity = altCapacity;
   4528     }
   4529     if (newCapacity < 200) {
   4530         newCapacity = 200;
   4531     }
   4532     if(buffer.resize(newCapacity, len)==NULL) {
   4533         return ok = FALSE;
   4534     }
   4535     return TRUE;
   4536 }
   4537 
   4538 U_NAMESPACE_END
   4539 
   4540 /* sortkey API */
   4541 U_CAPI int32_t U_EXPORT2
   4542 ucol_getSortKey(const    UCollator    *coll,
   4543         const    UChar        *source,
   4544         int32_t        sourceLength,
   4545         uint8_t        *result,
   4546         int32_t        resultLength)
   4547 {
   4548     UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
   4549     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   4550         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
   4551             ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
   4552     }
   4553 
   4554     if(coll->delegate != NULL) {
   4555       return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength);
   4556     }
   4557 
   4558     UErrorCode status = U_ZERO_ERROR;
   4559     int32_t keySize   = 0;
   4560 
   4561     if(source != NULL) {
   4562         // source == NULL is actually an error situation, but we would need to
   4563         // have an error code to return it. Until we introduce a new
   4564         // API, it stays like this
   4565 
   4566         /* this uses the function pointer that is set in updateinternalstate */
   4567         /* currently, there are two funcs: */
   4568         /*ucol_calcSortKey(...);*/
   4569         /*ucol_calcSortKeySimpleTertiary(...);*/
   4570 
   4571         uint8_t noDest[1] = { 0 };
   4572         if(result == NULL) {
   4573             // Distinguish pure preflighting from an allocation error.
   4574             result = noDest;
   4575             resultLength = 0;
   4576         }
   4577         FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength);
   4578         coll->sortKeyGen(coll, source, sourceLength, sink, &status);
   4579         if(U_SUCCESS(status)) {
   4580             keySize = sink.NumberOfBytesAppended();
   4581         }
   4582     }
   4583     UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
   4584     UTRACE_EXIT_STATUS(status);
   4585     return keySize;
   4586 }
   4587 
   4588 U_CFUNC int32_t
   4589 ucol_getCollationKey(const UCollator *coll,
   4590                      const UChar *source, int32_t sourceLength,
   4591                      CollationKey &key,
   4592                      UErrorCode &errorCode) {
   4593     CollationKeyByteSink sink(key);
   4594     coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode);
   4595     return sink.NumberOfBytesAppended();
   4596 }
   4597 
   4598 // Is this primary weight compressible?
   4599 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
   4600 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
   4601 static inline UBool
   4602 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
   4603     return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
   4604 }
   4605 
   4606 static
   4607 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) {
   4608     if (caseShift  == 0) {
   4609         cases.appendByte(UCOL_CASE_BYTE_START);
   4610         caseShift = UCOL_CASE_SHIFT_START;
   4611     }
   4612 }
   4613 
   4614 // Packs the secondary buffer when processing French locale.
   4615 static void
   4616 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
   4617     secondaries += secsize;  // We read the secondary-level bytes back to front.
   4618     uint8_t secondary;
   4619     int32_t count2 = 0;
   4620     int32_t i = 0;
   4621     // we use i here since the key size already accounts for terminators, so we'll discard the increment
   4622     for(i = 0; i<secsize; i++) {
   4623         secondary = *(secondaries-i-1);
   4624         /* This is compression code. */
   4625         if (secondary == UCOL_COMMON2) {
   4626             ++count2;
   4627         } else {
   4628             if (count2 > 0) {
   4629                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4630                     while (count2 > UCOL_TOP_COUNT2) {
   4631                         result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   4632                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4633                     }
   4634                     result.Append(UCOL_COMMON_TOP2 - (count2-1));
   4635                 } else {
   4636                     while (count2 > UCOL_BOT_COUNT2) {
   4637                         result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4638                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4639                     }
   4640                     result.Append(UCOL_COMMON_BOT2 + (count2-1));
   4641                 }
   4642                 count2 = 0;
   4643             }
   4644             result.Append(secondary);
   4645         }
   4646     }
   4647     if (count2 > 0) {
   4648         while (count2 > UCOL_BOT_COUNT2) {
   4649             result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4650             count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4651         }
   4652         result.Append(UCOL_COMMON_BOT2 + (count2-1));
   4653     }
   4654 }
   4655 
   4656 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
   4657 
   4658 /* This is the sortkey work horse function */
        /*
         * Computes the complete sort key for `source` under collator `coll` and
         * writes it to `result`.
         *
         * Primary bytes are written to `result` as the collation elements (CEs)
         * are read; secondary, case, tertiary and quaternary bytes are collected
         * in separate SortKeyLevel buffers and appended afterwards, each level
         * preceded by UCOL_LEVELTERMINATOR. For identical strength the
         * identical-level run is appended last. The key ends with a 0 byte.
         *
         * coll          collator supplying strength, attributes and weight tables
         * source        UTF-16 text to process
         * sourceLength  number of UChars in source, or -1 to have u_strlen() measure it
         * result        sink that receives the sort key bytes
         * status        in/out error code; the function returns immediately if it
         *               already indicates failure, and it is set to
         *               U_MEMORY_ALLOCATION_ERROR when a level buffer or `result`
         *               reports that it is not ok and no other error is pending
         */
   4659 U_CFUNC void U_CALLCONV
   4660 ucol_calcSortKey(const    UCollator    *coll,
   4661         const    UChar        *source,
   4662         int32_t        sourceLength,
   4663         SortKeyByteSink &result,
   4664         UErrorCode *status)
   4665 {
   4666     if(U_FAILURE(*status)) {
   4667         return;
   4668     }
   4669 
            /* Primaries go straight to the output sink; every other level is buffered */
            /* and appended once all CEs have been processed. */
   4670     SortKeyByteSink &primaries = result;
   4671     SortKeyLevel secondaries;
   4672     SortKeyLevel tertiaries;
   4673     SortKeyLevel cases;
   4674     SortKeyLevel quads;
   4675 
   4676     UnicodeString normSource;
   4677 
   4678     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
   4679 
   4680     UColAttributeValue strength = coll->strength;
   4681 
            /* compareXxx is 0 when that level is part of the key and 0xFF when it is not: */
            /* the per-CE tests below are `weight > compareXxx`, which no uint8_t weight */
            /* can satisfy against 0xFF. */
   4682     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
   4683     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
   4684     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
   4685     UBool  compareIdent = (strength == UCOL_IDENTICAL);
   4686     UBool  doCase = (coll->caseLevel == UCOL_ON);
   4687     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
   4688     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
   4689     //UBool  qShifted = shifted && (compareQuad == 0);
   4690     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
   4691 
   4692     uint32_t variableTopValue = coll->variableTopValue;
   4693     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
   4694     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
            // Note: despite their macro-style names, UCOL_COMMON_BOT4, UCOL_HIRAGANA_QUAD
            // and UCOL_BOT_COUNT4 are locals derived per call from the variable top.
   4695     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
   4696     uint8_t UCOL_HIRAGANA_QUAD = 0;
   4697     if(doHiragana) {
   4698         UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
   4699         /* allocate one more space for hiragana, value for hiragana */
   4700     }
   4701     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
   4702 
   4703     /* support for special features like caselevel and funky secondaries */
            /* lastSecondaryLength: secondary bytes appended for the current CE plus its */
            /* continuations (French mode only); caseShift: state passed to doCaseShift() */
            /* and decremented each time a case bit is written. */
   4704     int32_t lastSecondaryLength = 0;
   4705     uint32_t caseShift = 0;
   4706 
   4707     /* If we need to normalize, we'll do it all at once at the beginning! */
            /* Identical strength uses the NFD instance; otherwise the FCD instance is */
            /* used when normalization is on; with normalization off nothing is done. */
   4708     const Normalizer2 *norm2;
   4709     if(compareIdent) {
   4710         norm2 = Normalizer2Factory::getNFDInstance(*status);
   4711     } else if(coll->normalizationMode != UCOL_OFF) {
   4712         norm2 = Normalizer2Factory::getFCDInstance(*status);
   4713     } else {
   4714         norm2 = NULL;
   4715     }
   4716     if(norm2 != NULL) {
   4717         normSource.setTo(FALSE, source, len);
   4718         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   4719         if(qcYesLength != len) {
                    /* Only part of the text passed the quick check: keep that prefix, */
                    /* normalize the remainder onto it, and iterate over the result instead. */
   4720             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   4721             normSource.truncate(qcYesLength);
   4722             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   4723             source = normSource.getBuffer();
   4724             len = normSource.length();
   4725         }
   4726     }
   4727     collIterate s;
   4728     IInit_collIterate(coll, source, len, &s, status);
   4729     if(U_FAILURE(*status)) {
   4730         return;
   4731     }
   4732     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
   4733 
   4734     uint32_t order = 0;
   4735 
   4736     uint8_t primary1 = 0;
   4737     uint8_t primary2 = 0;
   4738     uint8_t secondary = 0;
   4739     uint8_t tertiary = 0;
   4740     uint8_t caseSwitch = coll->caseSwitch;
   4741     uint8_t tertiaryMask = coll->tertiaryMask;
   4742     int8_t tertiaryAddition = coll->tertiaryAddition;
   4743     uint8_t tertiaryTop = coll->tertiaryTop;
   4744     uint8_t tertiaryBottom = coll->tertiaryBottom;
   4745     uint8_t tertiaryCommon = coll->tertiaryCommon;
   4746     uint8_t caseBits = 0;
   4747 
   4748     UBool wasShifted = FALSE;
   4749     UBool notIsContinuation = FALSE;
   4750 
            /* count2/count3/count4: pending run lengths of "common" weights on the */
            /* secondary/tertiary/quaternary level that have not been written yet. */
            /* leadPrimary: lead byte of the current primary compression run (0 = none). */
   4751     uint32_t count2 = 0, count3 = 0, count4 = 0;
   4752     uint8_t leadPrimary = 0;
   4753 
   4754     for(;;) {
   4755         order = ucol_IGetNextCE(coll, &s, status);
   4756         if(order == UCOL_NO_MORE_CES) {
   4757             break;
   4758         }
   4759 
   4760         if(order == 0) {
   4761             continue;
   4762         }
   4763 
   4764         notIsContinuation = !isContinuation(order);
   4765 
   4766         if(notIsContinuation) {
   4767             tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
   4768         } else {
   4769             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   4770         }
   4771 
                /* Peel off the remaining bytes. After the two `>>= 8` steps `order` holds only */
                /* the upper 16 bits of the CE (primary1:primary2); the variable-top comparison */
                /* below is made against that shifted value. */
   4772         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4773         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4774         primary1 = (uint8_t)(order >> 8);
   4775 
                /* Keep the unpermuted lead byte: isCompressible() is asked about it below, */
                /* while the byte written to the key is the permuted one. */
   4776         uint8_t originalPrimary1 = primary1;
   4777         if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
   4778             primary1 = coll->leadBytePermutationTable[primary1];
   4779         }
   4780 
                /* Shifted handling: taken for a non-continuation CE at or below the variable */
                /* top with a non-zero lead byte, for a continuation of a shifted CE, and for */
                /* a CE with zero primary1 that follows a shifted CE. */
   4781         if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
   4782                         || (!notIsContinuation && wasShifted)))
   4783             || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   4784         {
   4785             /* and other ignorables should be removed if following a shifted code point */
   4786             if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
   4787                 /* we should just completely ignore it */
   4788                 continue;
   4789             }
   4790             if(compareQuad == 0) {
                        /* Flush the pending run of common quaternaries before writing the */
                        /* shifted primary bytes onto the quaternary level. */
   4791                 if(count4 > 0) {
   4792                     while (count4 > UCOL_BOT_COUNT4) {
   4793                         quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   4794                         count4 -= UCOL_BOT_COUNT4;
   4795                     }
   4796                     quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
   4797                     count4 = 0;
   4798                 }
   4799                 /* We are dealing with a variable and we're treating them as shifted */
   4800                 /* This is a shifted ignorable */
   4801                 if(primary1 != 0) { /* we need to check this since we could be in continuation */
   4802                     quads.appendByte(primary1);
   4803                 }
   4804                 if(primary2 != 0) {
   4805                     quads.appendByte(primary2);
   4806                 }
   4807             }
   4808             wasShifted = TRUE;
   4809         } else {
   4810             wasShifted = FALSE;
   4811             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   4812             /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
                    /* be zero with non zero primary1.                                                                            */
   4813             /* regular and simple sortkey calc */
   4814             if(primary1 != UCOL_IGNORABLE) {
   4815                 if(notIsContinuation) {
                            /* Primary compression: while consecutive CEs share the same lead byte */
                            /* only primary2 is written; leaving a run writes a single boundary byte */
                            /* chosen by whether the new lead byte is greater or smaller. */
   4816                     if(leadPrimary == primary1) {
   4817                         primaries.Append(primary2);
   4818                     } else {
   4819                         if(leadPrimary != 0) {
   4820                             primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
   4821                         }
   4822                         if(primary2 == UCOL_IGNORABLE) {
   4823                             /* one byter, not compressed */
   4824                             primaries.Append(primary1);
   4825                             leadPrimary = 0;
   4826                         } else if(isCompressible(coll, originalPrimary1)) {
   4827                             /* compress */
   4828                             primaries.Append(leadPrimary = primary1, primary2);
   4829                         } else {
   4830                             leadPrimary = 0;
   4831                             primaries.Append(primary1, primary2);
   4832                         }
   4833                     }
   4834                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   4835                     if(primary2 == UCOL_IGNORABLE) {
   4836                         primaries.Append(primary1);
   4837                     } else {
   4838                         primaries.Append(primary1, primary2);
   4839                     }
   4840                 }
   4841             }
   4842 
   4843             if(secondary > compareSec) {
   4844                 if(!isFrenchSec) {
   4845                     /* This is compression code. */
                            /* Runs of UCOL_COMMON2 are only counted; the run is written as counted */
                            /* bytes just before the next different secondary, taken from the top */
                            /* range when that secondary is above common and from the bottom range */
                            /* otherwise. */
   4846                     if (secondary == UCOL_COMMON2 && notIsContinuation) {
   4847                         ++count2;
   4848                     } else {
   4849                         if (count2 > 0) {
   4850                             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4851                                 while (count2 > UCOL_TOP_COUNT2) {
   4852                                     secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   4853                                     count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4854                                 }
   4855                                 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
   4856                             } else {
   4857                                 while (count2 > UCOL_BOT_COUNT2) {
   4858                                     secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4859                                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4860                                 }
   4861                                 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
   4862                             }
   4863                             count2 = 0;
   4864                         }
   4865                         secondaries.appendByte(secondary);
   4866                     }
   4867                 } else {
   4868                     /* Do the special handling for French secondaries */
   4869                     /* We need to get continuation elements and do intermediate restore */
   4870                     /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
                            /* Secondaries are stored uncompressed here. When a new non-continuation */
                            /* CE starts, the bytes of the previous CE and its continuations are */
                            /* reversed in place so they survive the later whole-level reversal. */
   4871                     if(notIsContinuation) {
   4872                         if (lastSecondaryLength > 1) {
   4873                             uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
   4874                             if (frenchStartPtr != NULL) {
   4875                                 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4876                                 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
   4877                                 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4878                             }
   4879                         }
   4880                         lastSecondaryLength = 1;
   4881                     } else {
   4882                         ++lastSecondaryLength;
   4883                     }
   4884                     secondaries.appendByte(secondary);
   4885                 }
   4886             }
   4887 
   4888             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
   4889                 // do the case level if we need to do it. We don't want to calculate
   4890                 // case level for primary ignorables if we have only primary strength and case level
   4891                 // otherwise we would break well formedness of CEs
                        // NOTE(review): doCaseShift() is defined outside this section; presumably it
                        // opens a new case byte when the current one has no free bits - confirm.
                        // One bit is written when the two case bits (0xC0) of the tertiary are clear,
                        // two bits otherwise; the bit values depend on coll->caseFirst.
   4892                 doCaseShift(cases, caseShift);
   4893                 if(notIsContinuation) {
   4894                     caseBits = (uint8_t)(tertiary & 0xC0);
   4895 
   4896                     if(tertiary != 0) {
   4897                         if(coll->caseFirst == UCOL_UPPER_FIRST) {
   4898                             if((caseBits & 0xC0) == 0) {
   4899                                 cases.lastByte() |= 1 << (--caseShift);
   4900                             } else {
                                        /* ORing in 0 changes no bits; the statement is kept for its */
                                        /* --caseShift side effect. */
   4901                                 cases.lastByte() |= 0 << (--caseShift);
   4902                                 /* second bit */
   4903                                 doCaseShift(cases, caseShift);
   4904                                 cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift);
   4905                             }
   4906                         } else {
   4907                             if((caseBits & 0xC0) == 0) {
   4908                                 cases.lastByte() |= 0 << (--caseShift);
   4909                             } else {
   4910                                 cases.lastByte() |= 1 << (--caseShift);
   4911                                 /* second bit */
   4912                                 doCaseShift(cases, caseShift);
   4913                                 cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift);
   4914                             }
   4915                         }
   4916                     }
   4917                 }
   4918             } else {
   4919                 if(notIsContinuation) {
   4920                     tertiary ^= caseSwitch;
   4921                 }
   4922             }
   4923 
   4924             tertiary &= tertiaryMask;
   4925             if(tertiary > compareTer) {
   4926                 /* This is compression code. */
   4927                 /* sequence size check is included in the if clause */
                        /* Same run-length scheme as for secondaries, using the collator's tertiary */
                        /* top/bottom values and counts; non-common tertiaries are first moved by */
                        /* tertiaryAddition according to which tertiaryCommon value is in effect. */
   4928                 if (tertiary == tertiaryCommon && notIsContinuation) {
   4929                     ++count3;
   4930                 } else {
   4931                     if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   4932                         tertiary += tertiaryAddition;
   4933                     } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   4934                         tertiary -= tertiaryAddition;
   4935                     }
   4936                     if (count3 > 0) {
   4937                         if ((tertiary > tertiaryCommon)) {
   4938                             while (count3 > coll->tertiaryTopCount) {
   4939                                 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
   4940                                 count3 -= (uint32_t)coll->tertiaryTopCount;
   4941                             }
   4942                             tertiaries.appendByte(tertiaryTop - (count3-1));
   4943                         } else {
   4944                             while (count3 > coll->tertiaryBottomCount) {
   4945                                 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
   4946                                 count3 -= (uint32_t)coll->tertiaryBottomCount;
   4947                             }
   4948                             tertiaries.appendByte(tertiaryBottom + (count3-1));
   4949                         }
   4950                         count3 = 0;
   4951                     }
   4952                     tertiaries.appendByte(tertiary);
   4953                 }
   4954             }
   4955 
                    /* Quaternary level for a non-shifted, non-continuation CE: Hiragana (flagged */
                    /* by the iterator) closes the pending common run and writes its own byte; */
                    /* anything else just extends the common run. */
   4956             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
   4957                 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   4958                     if(count4>0) { // Close this part
   4959                         while (count4 > UCOL_BOT_COUNT4) {
   4960                             quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   4961                             count4 -= UCOL_BOT_COUNT4;
   4962                         }
   4963                         quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
   4964                         count4 = 0;
   4965                     }
   4966                     quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana
   4967                 } else { // This wasn't Hiragana, so we can continue adding stuff
   4968                     count4++;
   4969                 }
   4970             }
   4971         }
   4972     }
   4973 
   4974     /* Here, we are generally done with processing */
   4975     /* bailing out would not be too productive */
   4976 
            /* `ok` accumulates the isOk() state of every level buffer and of the sink. */
   4977     UBool ok = TRUE;
   4978     if(U_SUCCESS(*status)) {
   4979         /* we have done all the CE's, now let's put them together to form a key */
   4980         if(compareSec == 0) {
                    /* A trailing run of common secondaries is always written from the bottom range. */
   4981             if (count2 > 0) {
   4982                 while (count2 > UCOL_BOT_COUNT2) {
   4983                     secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4984                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4985                 }
   4986                 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
   4987             }
   4988             result.Append(UCOL_LEVELTERMINATOR);
   4989             if(!secondaries.isOk()) {
   4990                 ok = FALSE;
   4991             } else if(!isFrenchSec) {
   4992                 secondaries.appendTo(result);
   4993             } else {
   4994                 // If there are any unresolved continuation secondaries,
   4995                 // reverse them here so that we can reverse the whole secondary thing.
   4996                 if (lastSecondaryLength > 1) {
   4997                     uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
   4998                     if (frenchStartPtr != NULL) {
   4999                         /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   5000                         uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
   5001                         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   5002                     }
   5003                 }
   5004                 packFrench(secondaries.data(), secondaries.length(), result);
   5005             }
   5006         }
   5007 
   5008         if(doCase) {
   5009             ok &= cases.isOk();
   5010             result.Append(UCOL_LEVELTERMINATOR);
   5011             cases.appendTo(result);
   5012         }
   5013 
   5014         if(compareTer == 0) {
   5015             if (count3 > 0) {
                        /* NOTE(review): this final flush uses `>=` and `tertiaryTop - count3`, while */
                        /* the in-loop flush uses `>` and `tertiaryTop - (count3-1)`; confirm that the */
                        /* asymmetry is intended before changing either. */
   5016                 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
   5017                     while (count3 >= coll->tertiaryTopCount) {
   5018                         tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
   5019                         count3 -= (uint32_t)coll->tertiaryTopCount;
   5020                     }
   5021                     tertiaries.appendByte(tertiaryTop - count3);
   5022                 } else {
   5023                     while (count3 > coll->tertiaryBottomCount) {
   5024                         tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
   5025                         count3 -= (uint32_t)coll->tertiaryBottomCount;
   5026                     }
   5027                     tertiaries.appendByte(tertiaryBottom + (count3-1));
   5028                 }
   5029             }
   5030             ok &= tertiaries.isOk();
   5031             result.Append(UCOL_LEVELTERMINATOR);
   5032             tertiaries.appendTo(result);
   5033 
                    /* Quaternary and identical levels are only emitted inside the tertiary branch. */
   5034             if(compareQuad == 0/*qShifted == TRUE*/) {
   5035                 if(count4 > 0) {
   5036                     while (count4 > UCOL_BOT_COUNT4) {
   5037                         quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   5038                         count4 -= UCOL_BOT_COUNT4;
   5039                     }
   5040                     quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
   5041                 }
   5042                 ok &= quads.isOk();
   5043                 result.Append(UCOL_LEVELTERMINATOR);
   5044                 quads.appendTo(result);
   5045             }
   5046 
   5047             if(compareIdent) {
   5048                 result.Append(UCOL_LEVELTERMINATOR);
   5049                 u_writeIdenticalLevelRun(s.string, len, result);
   5050             }
   5051         }
                /* Terminating zero byte of the sort key. */
   5052         result.Append(0);
   5053     }
   5054 
   5055     /* To avoid memory leak, free the offset buffer if necessary. */
   5056     ucol_freeOffsetBuffer(&s);
   5057 
   5058     ok &= result.IsOk();
   5059     if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
   5060 }
   5061 
   5062 
   5063 U_CFUNC void U_CALLCONV
   5064 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
   5065         const    UChar        *source,
   5066         int32_t        sourceLength,
   5067         SortKeyByteSink &result,
   5068         UErrorCode *status)
   5069 {
   5070     U_ALIGN_CODE(16);
   5071 
   5072     if(U_FAILURE(*status)) {
   5073         return;
   5074     }
   5075 
   5076     SortKeyByteSink &primaries = result;
   5077     SortKeyLevel secondaries;
   5078     SortKeyLevel tertiaries;
   5079 
   5080     UnicodeString normSource;
   5081 
   5082     int32_t len =  sourceLength;
   5083 
   5084     /* If we need to normalize, we'll do it all at once at the beginning! */
   5085     if(coll->normalizationMode != UCOL_OFF) {
   5086         normSource.setTo(len < 0, source, len);
   5087         const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
   5088         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   5089         if(qcYesLength != normSource.length()) {
   5090             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   5091             normSource.truncate(qcYesLength);
   5092             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   5093             source = normSource.getBuffer();
   5094             len = normSource.length();
   5095         }
   5096     }
   5097     collIterate s;
   5098     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   5099     if(U_FAILURE(*status)) {
   5100         return;
   5101     }
   5102     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
   5103 
   5104     uint32_t order = 0;
   5105 
   5106     uint8_t primary1 = 0;
   5107     uint8_t primary2 = 0;
   5108     uint8_t secondary = 0;
   5109     uint8_t tertiary = 0;
   5110     uint8_t caseSwitch = coll->caseSwitch;
   5111     uint8_t tertiaryMask = coll->tertiaryMask;
   5112     int8_t tertiaryAddition = coll->tertiaryAddition;
   5113     uint8_t tertiaryTop = coll->tertiaryTop;
   5114     uint8_t tertiaryBottom = coll->tertiaryBottom;
   5115     uint8_t tertiaryCommon = coll->tertiaryCommon;
   5116 
   5117     UBool notIsContinuation = FALSE;
   5118 
   5119     uint32_t count2 = 0, count3 = 0;
   5120     uint8_t leadPrimary = 0;
   5121 
   5122     for(;;) {
   5123         order = ucol_IGetNextCE(coll, &s, status);
   5124 
   5125         if(order == 0) {
   5126             continue;
   5127         }
   5128 
   5129         if(order == UCOL_NO_MORE_CES) {
   5130             break;
   5131         }
   5132 
   5133         notIsContinuation = !isContinuation(order);
   5134 
   5135         if(notIsContinuation) {
   5136             tertiary = (uint8_t)((order & tertiaryMask));
   5137         } else {
   5138             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   5139         }
   5140 
   5141         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5142         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5143         primary1 = (uint8_t)(order >> 8);
   5144 
   5145         uint8_t originalPrimary1 = primary1;
   5146         if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
   5147             primary1 = coll->leadBytePermutationTable[primary1];
   5148         }
   5149 
   5150         /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   5151         /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   5152         /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
   5153         /* regular and simple sortkey calc */
   5154         if(primary1 != UCOL_IGNORABLE) {
   5155             if(notIsContinuation) {
   5156                 if(leadPrimary == primary1) {
   5157                     primaries.Append(primary2);
   5158                 } else {
   5159                     if(leadPrimary != 0) {
   5160                         primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
   5161                     }
   5162                     if(primary2 == UCOL_IGNORABLE) {
   5163                         /* one byter, not compressed */
   5164                         primaries.Append(primary1);
   5165                         leadPrimary = 0;
   5166                     } else if(isCompressible(coll, originalPrimary1)) {
   5167                         /* compress */
   5168                         primaries.Append(leadPrimary = primary1, primary2);
   5169                     } else {
   5170                         leadPrimary = 0;
   5171                         primaries.Append(primary1, primary2);
   5172                     }
   5173                 }
   5174             } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   5175                 if(primary2 == UCOL_IGNORABLE) {
   5176                     primaries.Append(primary1);
   5177                 } else {
   5178                     primaries.Append(primary1, primary2);
   5179                 }
   5180             }
   5181         }
   5182 
   5183         if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
   5184             /* This is compression code. */
   5185             if (secondary == UCOL_COMMON2 && notIsContinuation) {
   5186                 ++count2;
   5187             } else {
   5188                 if (count2 > 0) {
   5189                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   5190                         while (count2 > UCOL_TOP_COUNT2) {
   5191                             secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   5192                             count2 -= (uint32_t)UCOL_TOP_COUNT2;
   5193                         }
   5194                         secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
   5195                     } else {
   5196                         while (count2 > UCOL_BOT_COUNT2) {
   5197                             secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5198                             count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5199                         }
   5200                         secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
   5201                     }
   5202                     count2 = 0;
   5203                 }
   5204                 secondaries.appendByte(secondary);
   5205             }
   5206         }
   5207 
   5208         if(notIsContinuation) {
   5209             tertiary ^= caseSwitch;
   5210         }
   5211 
   5212         if(tertiary > 0) {
   5213             /* This is compression code. */
   5214             /* sequence size check is included in the if clause */
   5215             if (tertiary == tertiaryCommon && notIsContinuation) {
   5216                 ++count3;
   5217             } else {
   5218                 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   5219                     tertiary += tertiaryAddition;
   5220                 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   5221                     tertiary -= tertiaryAddition;
   5222                 }
   5223                 if (count3 > 0) {
   5224                     if ((tertiary > tertiaryCommon)) {
   5225                         while (count3 > coll->tertiaryTopCount) {
   5226                             tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
   5227                             count3 -= (uint32_t)coll->tertiaryTopCount;
   5228                         }
   5229                         tertiaries.appendByte(tertiaryTop - (count3-1));
   5230                     } else {
   5231                         while (count3 > coll->tertiaryBottomCount) {
   5232                             tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
   5233                             count3 -= (uint32_t)coll->tertiaryBottomCount;
   5234                         }
   5235                         tertiaries.appendByte(tertiaryBottom + (count3-1));
   5236                     }
   5237                     count3 = 0;
   5238                 }
   5239                 tertiaries.appendByte(tertiary);
   5240             }
   5241         }
   5242     }
   5243 
   5244     UBool ok = TRUE;
   5245     if(U_SUCCESS(*status)) {
   5246         /* we have done all the CE's, now let's put them together to form a key */
   5247         if (count2 > 0) {
   5248             while (count2 > UCOL_BOT_COUNT2) {
   5249                 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5250                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5251             }
   5252             secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
   5253         }
   5254         ok &= secondaries.isOk();
   5255         result.Append(UCOL_LEVELTERMINATOR);
   5256         secondaries.appendTo(result);
   5257 
   5258         if (count3 > 0) {
   5259             if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
   5260                 while (count3 >= coll->tertiaryTopCount) {
   5261                     tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
   5262                     count3 -= (uint32_t)coll->tertiaryTopCount;
   5263                 }
   5264                 tertiaries.appendByte(tertiaryTop - count3);
   5265             } else {
   5266                 while (count3 > coll->tertiaryBottomCount) {
   5267                     tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
   5268                     count3 -= (uint32_t)coll->tertiaryBottomCount;
   5269                 }
   5270                 tertiaries.appendByte(tertiaryBottom + (count3-1));
   5271             }
   5272         }
   5273         ok &= tertiaries.isOk();
   5274         result.Append(UCOL_LEVELTERMINATOR);
   5275         tertiaries.appendTo(result);
   5276 
   5277         result.Append(0);
   5278     }
   5279 
   5280     /* To avoid memory leak, free the offset buffer if necessary. */
   5281     ucol_freeOffsetBuffer(&s);
   5282 
   5283     ok &= result.IsOk();
   5284     if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
   5285 }
   5286 
   5287 static inline
   5288 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
   5289     UBool notIsContinuation = !isContinuation(CE);
   5290     uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
   5291     if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
   5292                || (!notIsContinuation && *wasShifted)))
   5293         || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   5294     {
   5295         // The stuff below should probably be in the sortkey code... maybe not...
   5296         if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
   5297             /* we should just completely ignore it */
   5298             *wasShifted = TRUE;
   5299             //continue;
   5300         }
   5301         //*wasShifted = TRUE;
   5302         return TRUE;
   5303     } else {
   5304         *wasShifted = FALSE;
   5305         return FALSE;
   5306     }
   5307 }
   5308 static inline
   5309 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
   5310     if(level < maxLevel) {
   5311         dest[i++] = UCOL_LEVELTERMINATOR;
   5312     } else {
   5313         dest[i++] = 0;
   5314     }
   5315 }
   5316 
   5317 /** enumeration of level identifiers for partial sort key generation */
   5318 enum {
   5319   UCOL_PSK_PRIMARY = 0,
   5320     UCOL_PSK_SECONDARY = 1,
   5321     UCOL_PSK_CASE = 2,
   5322     UCOL_PSK_TERTIARY = 3,
   5323     UCOL_PSK_QUATERNARY = 4,
   5324     UCOL_PSK_QUIN = 5,      /** This is an extra level, not used - but we have three bits to blow */
   5325     UCOL_PSK_IDENTICAL = 6,
   5326     UCOL_PSK_NULL = 7,      /** level for the end of sort key. Will just produce zeros */
   5327     UCOL_PSK_LIMIT
   5328 };
   5329 
   5330 /** collation state enum. *_SHIFT value is how much to shift right
   5331  *  to get the state piece to the right. *_MASK value should be
   5332  *  ANDed with the shifted state. This data is stored in state[1]
   5333  *  field.
   5334  */
   5335 enum {
   5336     UCOL_PSK_LEVEL_SHIFT = 0,      /** level identificator. stores an enum value from above */
   5337     UCOL_PSK_LEVEL_MASK = 7,       /** three bits */
   5338     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */
   5339     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,
   5340     /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
   5341      *  This field is also used to denote that the French secondary level is finished
   5342      */
   5343     UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */
   5344     UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */
   5345     UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */
   5346     UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */
   5347     /** When we do French we need to reverse secondary values. However, continuations
   5348      *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
   5349      */
   5350     UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
   5351     UCOL_PSK_BOCSU_BYTES_MASK = 3,
   5352     UCOL_PSK_CONSUMED_CES_SHIFT = 9,
   5353     UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
   5354 };
   5355 
   5356 // macro calculating the number of expansion CEs available
   5357 #define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn
   5358 
   5359 
   5360 /** main sortkey part procedure. On the first call,
   5361  *  you should pass in a collator, an iterator, empty state
   5362  *  state[0] == state[1] == 0, a buffer to hold results
   5363  *  number of bytes you need and an error code pointer.
   5364  *  Make sure your buffer is big enough to hold the wanted
   5365  *  number of sortkey bytes. I don't check.
   5366  *  The only meaningful status you can get back is
   5367  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
   5368  *  have been dealt a raw deal and that you probably won't
   5369  *  be able to use partial sortkey generation for this
   5370  *  particular combination of string and collator. This
   5371  *  is highly unlikely, but you should still check the error code.
   5372  *  Any other status means that you're not in a sane situation
   5373  *  anymore. After the first call, preserve state values and
   5374  *  use them on subsequent calls to obtain more bytes of a sortkey.
   5375  *  Use until the number of bytes written is smaller than the requested
   5376  *  number of bytes. Generated sortkey is not compatible with the
   5377  *  one generated by ucol_getSortKey, as we don't do any compression.
   5378  *  However, levels are still terminated by a 1 (one) and the sortkey
   5379  *  is terminated by a 0 (zero). Identical level is the same as in the
   5380  *  regular sortkey - internal bocu-1 implementation is used.
   5381  *  For curious, although you cannot do much about this, here is
   5382  *  the structure of state words.
   5383  *  state[0] - iterator state. Depends on the iterator implementation,
   5384  *             but allows the iterator to continue where it stopped in
   5385  *             the last iteration.
   5386  *  state[1] - collation processing state. Here is the distribution
   5387  *             of the bits:
   5388  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
   5389  *             quaternary, quin (we don't use this one), identical and
   5390  *             null (producing only zeroes - first one to terminate the
   5391  *             sortkey and subsequent to fill the buffer).
   5392  *   3       - byte count. Number of bytes written on the primary level.
   5393  *   4       - was shifted. Whether the previous iteration finished in the
   5394  *             shifted state.
   5395  *   5, 6    - French continuation bytes written. See the comment in the enum
   5396  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
   5397  *             the identical level.
   5398  *   9..31   - CEs consumed. Number of getCE or next32 operations performed
   5399  *             since the last successful update of the iterator state.
   5400  */
   5401 U_CAPI int32_t U_EXPORT2
   5402 ucol_nextSortKeyPart(const UCollator *coll,
   5403                      UCharIterator *iter,
   5404                      uint32_t state[2],
   5405                      uint8_t *dest, int32_t count,
   5406                      UErrorCode *status)
   5407 {
   5408     /* error checking */
   5409     if(status==NULL || U_FAILURE(*status)) {
   5410         return 0;
   5411     }
   5412     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
   5413     if( coll==NULL || iter==NULL ||
   5414         state==NULL ||
   5415         count<0 || (count>0 && dest==NULL)
   5416     ) {
   5417         *status=U_ILLEGAL_ARGUMENT_ERROR;
   5418         UTRACE_EXIT_STATUS(status);
   5419         return 0;
   5420     }
   5421 
   5422     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
   5423                   coll, iter, state[0], state[1], dest, count);
   5424 
   5425     if(count==0) {
   5426         /* nothing to do */
   5427         UTRACE_EXIT_VALUE(0);
   5428         return 0;
   5429     }
   5430     /** Setting up situation according to the state we got from the previous iteration */
   5431     // The state of the iterator from the previous invocation
   5432     uint32_t iterState = state[0];
   5433     // Has the last iteration ended in the shifted state
   5434     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
   5435     // What is the current level of the sortkey?
   5436     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
   5437     // Have we written only one byte from a two byte primary in the previous iteration?
   5438     // Also on secondary level - have we finished with the French secondary?
   5439     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
   5440     // number of bytes in the continuation buffer for French
   5441     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
   5442     // Number of bytes already written from a bocsu sequence. Since
   5443     // the longes bocsu sequence is 4 long, this can be up to 3.
   5444     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
   5445     // Number of elements that need to be consumed in this iteration because
   5446     // the iterator returned UITER_NO_STATE at the end of the last iteration,
   5447     // so we had to save the last valid state.
   5448     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
   5449 
   5450     /** values that depend on the collator attributes */
   5451     // strength of the collator.
   5452     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
   5453     // maximal level of the partial sortkey. Need to take whether case level is done
   5454     int32_t maxLevel = 0;
   5455     if(strength < UCOL_TERTIARY) {
   5456         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   5457             maxLevel = UCOL_PSK_CASE;
   5458         } else {
   5459             maxLevel = strength;
   5460         }
   5461     } else {
   5462         if(strength == UCOL_TERTIARY) {
   5463             maxLevel = UCOL_PSK_TERTIARY;
   5464         } else if(strength == UCOL_QUATERNARY) {
   5465             maxLevel = UCOL_PSK_QUATERNARY;
   5466         } else { // identical
   5467             maxLevel = UCOL_IDENTICAL;
   5468         }
   5469     }
   5470     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
   5471     uint8_t UCOL_HIRAGANA_QUAD =
   5472       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
   5473     // Boundary value that decides whether a CE is shifted or not
   5474     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
   5475     // Are we doing French collation?
   5476     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
   5477 
   5478     /** initializing the collation state */
   5479     UBool notIsContinuation = FALSE;
   5480     uint32_t CE = UCOL_NO_MORE_CES;
   5481 
   5482     collIterate s;
   5483     IInit_collIterate(coll, NULL, -1, &s, status);
   5484     if(U_FAILURE(*status)) {
   5485         UTRACE_EXIT_STATUS(*status);
   5486         return 0;
   5487     }
   5488     s.iterator = iter;
   5489     s.flags |= UCOL_USE_ITERATOR;
   5490     // This variable tells us whether we have produced some other levels in this iteration
   5491     // before we moved to the identical level. In that case, we need to switch the
   5492     // type of the iterator.
   5493     UBool doingIdenticalFromStart = FALSE;
   5494     // Normalizing iterator
   5495     // The division for the array length may truncate the array size to
   5496     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   5497     // for all platforms anyway.
   5498     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   5499     UNormIterator *normIter = NULL;
   5500     // If the normalization is turned on for the collator and we are below identical level
   5501     // we will use a FCD normalizing iterator
   5502     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
   5503         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5504         s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
   5505         s.flags &= ~UCOL_ITER_NORM;
   5506         if(U_FAILURE(*status)) {
   5507             UTRACE_EXIT_STATUS(*status);
   5508             return 0;
   5509         }
   5510     } else if(level == UCOL_PSK_IDENTICAL) {
   5511         // for identical level, we need a NFD iterator. We need to instantiate it here, since we
   5512         // will be updating the state - and this cannot be done on an ordinary iterator.
   5513         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5514         s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   5515         s.flags &= ~UCOL_ITER_NORM;
   5516         if(U_FAILURE(*status)) {
   5517             UTRACE_EXIT_STATUS(*status);
   5518             return 0;
   5519         }
   5520         doingIdenticalFromStart = TRUE;
   5521     }
   5522 
   5523     // This is the tentative new state of the iterator. The problem
   5524     // is that the iterator might return an undefined state, in
   5525     // which case we should save the last valid state and increase
   5526     // the iterator skip value.
   5527     uint32_t newState = 0;
   5528 
   5529     // First, we set the iterator to the last valid position
   5530     // from the last iteration. This was saved in state[0].
   5531     if(iterState == 0) {
   5532         /* initial state */
   5533         if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
   5534             s.iterator->move(s.iterator, 0, UITER_LIMIT);
   5535         } else {
   5536             s.iterator->move(s.iterator, 0, UITER_START);
   5537         }
   5538     } else {
   5539         /* reset to previous state */
   5540         s.iterator->setState(s.iterator, iterState, status);
   5541         if(U_FAILURE(*status)) {
   5542             UTRACE_EXIT_STATUS(*status);
   5543             return 0;
   5544         }
   5545     }
   5546 
   5547 
   5548 
   5549     // This variable tells us whether we can attempt to update the state
   5550     // of iterator. Situations where we don't want to update iterator state
   5551     // are the existence of expansion CEs that are not yet processed, and
   5552     // finishing the case level without enough space in the buffer to insert
   5553     // a level terminator.
   5554     UBool canUpdateState = TRUE;
   5555 
   5556     // Consume all the CEs that were consumed at the end of the previous
   5557     // iteration without updating the iterator state. On identical level,
   5558     // consume the code points.
   5559     int32_t counter = cces;
   5560     if(level < UCOL_PSK_IDENTICAL) {
   5561         while(counter-->0) {
   5562             // If we're doing French and we are on the secondary level,
   5563             // we go backwards.
   5564             if(level == UCOL_PSK_SECONDARY && doingFrench) {
   5565                 CE = ucol_IGetPrevCE(coll, &s, status);
   5566             } else {
   5567                 CE = ucol_IGetNextCE(coll, &s, status);
   5568             }
   5569             if(CE==UCOL_NO_MORE_CES) {
   5570                 /* should not happen */
   5571                 *status=U_INTERNAL_PROGRAM_ERROR;
   5572                 UTRACE_EXIT_STATUS(*status);
   5573                 return 0;
   5574             }
   5575             if(uprv_numAvailableExpCEs(s)) {
   5576                 canUpdateState = FALSE;
   5577             }
   5578         }
   5579     } else {
   5580         while(counter-->0) {
   5581             uiter_next32(s.iterator);
   5582         }
   5583     }
   5584 
   5585     // French secondary needs to know whether the iterator state of zero came from previous level OR
   5586     // from a new invocation...
   5587     UBool wasDoingPrimary = FALSE;
   5588     // destination buffer byte counter. When this guy
   5589     // gets to count, we're done with the iteration
   5590     int32_t i = 0;
   5591     // used to count the zero bytes written after we
   5592     // have finished with the sort key
   5593     int32_t j = 0;
   5594 
   5595 
   5596     // Hm.... I think we're ready to plunge in. Basic story is as following:
   5597     // we have a fall through case based on level. This is used for initial
   5598     // positioning on iteration start. Every level processor contains a
   5599     // for(;;) which will be broken when we exhaust all the CEs. Other
   5600     // way to exit is a goto saveState, which happens when we have filled
   5601     // out our buffer.
   5602     switch(level) {
   5603     case UCOL_PSK_PRIMARY:
   5604         wasDoingPrimary = TRUE;
   5605         for(;;) {
   5606             if(i==count) {
   5607                 goto saveState;
   5608             }
   5609             // We should save the state only if we
   5610             // are sure that we are done with the
   5611             // previous iterator state
   5612             if(canUpdateState && byteCountOrFrenchDone == 0) {
   5613                 newState = s.iterator->getState(s.iterator);
   5614                 if(newState != UITER_NO_STATE) {
   5615                     iterState = newState;
   5616                     cces = 0;
   5617                 }
   5618             }
   5619             CE = ucol_IGetNextCE(coll, &s, status);
   5620             cces++;
   5621             if(CE==UCOL_NO_MORE_CES) {
   5622                 // Add the level separator
   5623                 terminatePSKLevel(level, maxLevel, i, dest);
   5624                 byteCountOrFrenchDone=0;
   5625                 // Restart the iteration an move to the
   5626                 // second level
   5627                 s.iterator->move(s.iterator, 0, UITER_START);
   5628                 cces = 0;
   5629                 level = UCOL_PSK_SECONDARY;
   5630                 break;
   5631             }
   5632             if(!isContinuation(CE)){
   5633                 if(coll->leadBytePermutationTable != NULL){
   5634                     CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
   5635                 }
   5636             }
   5637             if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5638                 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
   5639                 if(CE != 0) {
   5640                     if(byteCountOrFrenchDone == 0) {
   5641                         // get the second byte of primary
   5642                         dest[i++]=(uint8_t)(CE >> 8);
   5643                     } else {
   5644                         byteCountOrFrenchDone = 0;
   5645                     }
   5646                     if((CE &=0xff)!=0) {
   5647                         if(i==count) {
   5648                             /* overflow */
   5649                             byteCountOrFrenchDone = 1;
   5650                             cces--;
   5651                             goto saveState;
   5652                         }
   5653                         dest[i++]=(uint8_t)CE;
   5654                     }
   5655                 }
   5656             }
   5657             if(uprv_numAvailableExpCEs(s)) {
   5658                 canUpdateState = FALSE;
   5659             } else {
   5660                 canUpdateState = TRUE;
   5661             }
   5662         }
   5663         /* fall through to next level */
   5664     case UCOL_PSK_SECONDARY:
   5665         if(strength >= UCOL_SECONDARY) {
   5666             if(!doingFrench) {
   5667                 for(;;) {
   5668                     if(i == count) {
   5669                         goto saveState;
   5670                     }
   5671                     // We should save the state only if we
   5672                     // are sure that we are done with the
   5673                     // previous iterator state
   5674                     if(canUpdateState) {
   5675                         newState = s.iterator->getState(s.iterator);
   5676                         if(newState != UITER_NO_STATE) {
   5677                             iterState = newState;
   5678                             cces = 0;
   5679                         }
   5680                     }
   5681                     CE = ucol_IGetNextCE(coll, &s, status);
   5682                     cces++;
   5683                     if(CE==UCOL_NO_MORE_CES) {
   5684                         // Add the level separator
   5685                         terminatePSKLevel(level, maxLevel, i, dest);
   5686                         byteCountOrFrenchDone = 0;
   5687                         // Restart the iteration an move to the
   5688                         // second level
   5689                         s.iterator->move(s.iterator, 0, UITER_START);
   5690                         cces = 0;
   5691                         level = UCOL_PSK_CASE;
   5692                         break;
   5693                     }
   5694                     if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5695                         CE >>= 8; /* get secondary */
   5696                         if(CE != 0) {
   5697                             dest[i++]=(uint8_t)CE;
   5698                         }
   5699                     }
   5700                     if(uprv_numAvailableExpCEs(s)) {
   5701                         canUpdateState = FALSE;
   5702                     } else {
   5703                         canUpdateState = TRUE;
   5704                     }
   5705                 }
   5706             } else { // French secondary processing
   5707                 uint8_t frenchBuff[UCOL_MAX_BUFFER];
   5708                 int32_t frenchIndex = 0;
   5709                 // Here we are going backwards.
   5710                 // If the iterator is at the beggining, it should be
   5711                 // moved to end.
   5712                 if(wasDoingPrimary) {
   5713                     s.iterator->move(s.iterator, 0, UITER_LIMIT);
   5714                     cces = 0;
   5715                 }
   5716                 for(;;) {
   5717                     if(i == count) {
   5718                         goto saveState;
   5719                     }
   5720                     if(canUpdateState) {
   5721                         newState = s.iterator->getState(s.iterator);
   5722                         if(newState != UITER_NO_STATE) {
   5723                             iterState = newState;
   5724                             cces = 0;
   5725                         }
   5726                     }
   5727                     CE = ucol_IGetPrevCE(coll, &s, status);
   5728                     cces++;
   5729                     if(CE==UCOL_NO_MORE_CES) {
   5730                         // Add the level separator
   5731                         terminatePSKLevel(level, maxLevel, i, dest);
   5732                         byteCountOrFrenchDone = 0;
   5733                         // Restart the iteration an move to the next level
   5734                         s.iterator->move(s.iterator, 0, UITER_START);
   5735                         level = UCOL_PSK_CASE;
   5736                         break;
   5737                     }
   5738                     if(isContinuation(CE)) { // if it's a continuation, we want to save it and
   5739                         // reverse when we get a first non-continuation CE.
   5740                         CE >>= 8;
   5741                         frenchBuff[frenchIndex++] = (uint8_t)CE;
   5742                     } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5743                         CE >>= 8; /* get secondary */
   5744                         if(!frenchIndex) {
   5745                             if(CE != 0) {
   5746                                 dest[i++]=(uint8_t)CE;
   5747                             }
   5748                         } else {
   5749                             frenchBuff[frenchIndex++] = (uint8_t)CE;
   5750                             frenchIndex -= usedFrench;
   5751                             usedFrench = 0;
   5752                             while(i < count && frenchIndex) {
   5753                                 dest[i++] = frenchBuff[--frenchIndex];
   5754                                 usedFrench++;
   5755                             }
   5756                         }
   5757                     }
   5758                     if(uprv_numAvailableExpCEs(s)) {
   5759                         canUpdateState = FALSE;
   5760                     } else {
   5761                         canUpdateState = TRUE;
   5762                     }
   5763                 }
   5764             }
   5765         } else {
   5766             level = UCOL_PSK_CASE;
   5767         }
   5768         /* fall through to next level */
   5769     case UCOL_PSK_CASE:
   5770         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   5771             uint32_t caseShift = UCOL_CASE_SHIFT_START;
   5772             uint8_t caseByte = UCOL_CASE_BYTE_START;
   5773             uint8_t caseBits = 0;
   5774 
   5775             for(;;) {
   5776                 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
   5777                 if(i == count) {
   5778                     goto saveState;
   5779                 }
   5780                 // We should save the state only if we
   5781                 // are sure that we are done with the
   5782                 // previous iterator state
   5783                 if(canUpdateState) {
   5784                     newState = s.iterator->getState(s.iterator);
   5785                     if(newState != UITER_NO_STATE) {
   5786                         iterState = newState;
   5787                         cces = 0;
   5788                     }
   5789                 }
   5790                 CE = ucol_IGetNextCE(coll, &s, status);
   5791                 cces++;
   5792                 if(CE==UCOL_NO_MORE_CES) {
   5793                     // On the case level we might have an unfinished
   5794                     // case byte. Add one if it's started.
   5795                     if(caseShift != UCOL_CASE_SHIFT_START) {
   5796                         dest[i++] = caseByte;
   5797                     }
   5798                     cces = 0;
   5799                     // We have finished processing CEs on this level.
   5800                     // However, we don't know if we have enough space
   5801                     // to add a case level terminator.
   5802                     if(i < count) {
   5803                         // Add the level separator
   5804                         terminatePSKLevel(level, maxLevel, i, dest);
   5805                         // Restart the iteration and move to the
   5806                         // next level
   5807                         s.iterator->move(s.iterator, 0, UITER_START);
   5808                         level = UCOL_PSK_TERTIARY;
   5809                     } else {
   5810                         canUpdateState = FALSE;
   5811                     }
   5812                     break;
   5813                 }
   5814 
   5815                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5816                     if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
   5817                         // do the case level if we need to do it. We don't want to calculate
   5818                         // case level for primary ignorables if we have only primary strength and case level
   5819                         // otherwise we would break well formedness of CEs
   5820                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   5821                         caseBits = (uint8_t)(CE & 0xC0);
   5822                         // this copies the case level logic from the
   5823                         // sort key generation code
   5824                         if(CE != 0) {
   5825                             if (caseShift == 0) {
   5826                                 dest[i++] = caseByte;
   5827                                 caseShift = UCOL_CASE_SHIFT_START;
   5828                                 caseByte = UCOL_CASE_BYTE_START;
   5829                             }
   5830                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   5831                                 if((caseBits & 0xC0) == 0) {
   5832                                     caseByte |= 1 << (--caseShift);
   5833                                 } else {
   5834                                     caseByte |= 0 << (--caseShift);
   5835                                     /* second bit */
   5836                                     if(caseShift == 0) {
   5837                                         dest[i++] = caseByte;
   5838                                         caseShift = UCOL_CASE_SHIFT_START;
   5839                                         caseByte = UCOL_CASE_BYTE_START;
   5840                                     }
   5841                                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
   5842                                 }
   5843                             } else {
   5844                                 if((caseBits & 0xC0) == 0) {
   5845                                     caseByte |= 0 << (--caseShift);
   5846                                 } else {
   5847                                     caseByte |= 1 << (--caseShift);
   5848                                     /* second bit */
   5849                                     if(caseShift == 0) {
   5850                                         dest[i++] = caseByte;
   5851                                         caseShift = UCOL_CASE_SHIFT_START;
   5852                                         caseByte = UCOL_CASE_BYTE_START;
   5853                                     }
   5854                                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
   5855                                 }
   5856                             }
   5857                         }
   5858 
   5859                     }
   5860                 }
   5861                 // Not sure this is correct for the case level - revisit
   5862                 if(uprv_numAvailableExpCEs(s)) {
   5863                     canUpdateState = FALSE;
   5864                 } else {
   5865                     canUpdateState = TRUE;
   5866                 }
   5867             }
   5868         } else {
   5869             level = UCOL_PSK_TERTIARY;
   5870         }
   5871         /* fall through to next level */
   5872     case UCOL_PSK_TERTIARY:
   5873         if(strength >= UCOL_TERTIARY) {
   5874             for(;;) {
   5875                 if(i == count) {
   5876                     goto saveState;
   5877                 }
   5878                 // We should save the state only if we
   5879                 // are sure that we are done with the
   5880                 // previous iterator state
   5881                 if(canUpdateState) {
   5882                     newState = s.iterator->getState(s.iterator);
   5883                     if(newState != UITER_NO_STATE) {
   5884                         iterState = newState;
   5885                         cces = 0;
   5886                     }
   5887                 }
   5888                 CE = ucol_IGetNextCE(coll, &s, status);
   5889                 cces++;
   5890                 if(CE==UCOL_NO_MORE_CES) {
   5891                     // Add the level separator
   5892                     terminatePSKLevel(level, maxLevel, i, dest);
   5893                     byteCountOrFrenchDone = 0;
   5894                     // Restart the iteration an move to the
   5895                     // second level
   5896                     s.iterator->move(s.iterator, 0, UITER_START);
   5897                     cces = 0;
   5898                     level = UCOL_PSK_QUATERNARY;
   5899                     break;
   5900                 }
   5901                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5902                     notIsContinuation = !isContinuation(CE);
   5903 
   5904                     if(notIsContinuation) {
   5905                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   5906                         CE ^= coll->caseSwitch;
   5907                         CE &= coll->tertiaryMask;
   5908                     } else {
   5909                         CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   5910                     }
   5911 
   5912                     if(CE != 0) {
   5913                         dest[i++]=(uint8_t)CE;
   5914                     }
   5915                 }
   5916                 if(uprv_numAvailableExpCEs(s)) {
   5917                     canUpdateState = FALSE;
   5918                 } else {
   5919                     canUpdateState = TRUE;
   5920                 }
   5921             }
   5922         } else {
   5923             // if we're not doing tertiary
   5924             // skip to the end
   5925             level = UCOL_PSK_NULL;
   5926         }
   5927         /* fall through to next level */
   5928     case UCOL_PSK_QUATERNARY:
   5929         if(strength >= UCOL_QUATERNARY) {
   5930             for(;;) {
   5931                 if(i == count) {
   5932                     goto saveState;
   5933                 }
   5934                 // We should save the state only if we
   5935                 // are sure that we are done with the
   5936                 // previous iterator state
   5937                 if(canUpdateState) {
   5938                     newState = s.iterator->getState(s.iterator);
   5939                     if(newState != UITER_NO_STATE) {
   5940                         iterState = newState;
   5941                         cces = 0;
   5942                     }
   5943                 }
   5944                 CE = ucol_IGetNextCE(coll, &s, status);
   5945                 cces++;
   5946                 if(CE==UCOL_NO_MORE_CES) {
   5947                     // Add the level separator
   5948                     terminatePSKLevel(level, maxLevel, i, dest);
   5949                     //dest[i++] = UCOL_LEVELTERMINATOR;
   5950                     byteCountOrFrenchDone = 0;
   5951                     // Restart the iteration an move to the
   5952                     // second level
   5953                     s.iterator->move(s.iterator, 0, UITER_START);
   5954                     cces = 0;
   5955                     level = UCOL_PSK_QUIN;
   5956                     break;
   5957                 }
   5958                 if(CE==0)
   5959                     continue;
   5960                 if(isShiftedCE(CE, LVT, &wasShifted)) {
   5961                     CE >>= 16; /* get primary */
   5962                     if(CE != 0) {
   5963                         if(byteCountOrFrenchDone == 0) {
   5964                             dest[i++]=(uint8_t)(CE >> 8);
   5965                         } else {
   5966                             byteCountOrFrenchDone = 0;
   5967                         }
   5968                         if((CE &=0xff)!=0) {
   5969                             if(i==count) {
   5970                                 /* overflow */
   5971                                 byteCountOrFrenchDone = 1;
   5972                                 goto saveState;
   5973                             }
   5974                             dest[i++]=(uint8_t)CE;
   5975                         }
   5976                     }
   5977                 } else {
   5978                     notIsContinuation = !isContinuation(CE);
   5979                     if(notIsContinuation) {
   5980                         if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   5981                             dest[i++] = UCOL_HIRAGANA_QUAD;
   5982                         } else {
   5983                             dest[i++] = 0xFF;
   5984                         }
   5985                     }
   5986                 }
   5987                 if(uprv_numAvailableExpCEs(s)) {
   5988                     canUpdateState = FALSE;
   5989                 } else {
   5990                     canUpdateState = TRUE;
   5991                 }
   5992             }
   5993         } else {
   5994             // if we're not doing quaternary
   5995             // skip to the end
   5996             level = UCOL_PSK_NULL;
   5997         }
   5998         /* fall through to next level */
   5999     case UCOL_PSK_QUIN:
   6000         level = UCOL_PSK_IDENTICAL;
   6001         /* fall through to next level */
   6002     case UCOL_PSK_IDENTICAL:
   6003         if(strength >= UCOL_IDENTICAL) {
   6004             UChar32 first, second;
   6005             int32_t bocsuBytesWritten = 0;
   6006             // We always need to do identical on
   6007             // the NFD form of the string.
   6008             if(normIter == NULL) {
   6009                 // we arrived from the level below and
   6010                 // normalization was not turned on.
   6011                 // therefore, we need to make a fresh NFD iterator
   6012                 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   6013                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   6014             } else if(!doingIdenticalFromStart) {
   6015                 // there is an iterator, but we did some other levels.
   6016                 // therefore, we have a FCD iterator - need to make
   6017                 // a NFD one.
   6018                 // normIter being at the beginning does not guarantee
   6019                 // that the underlying iterator is at the beginning
   6020                 iter->move(iter, 0, UITER_START);
   6021                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   6022             }
   6023             // At this point we have a NFD iterator that is positioned
   6024             // in the right place
   6025             if(U_FAILURE(*status)) {
   6026                 UTRACE_EXIT_STATUS(*status);
   6027                 return 0;
   6028             }
   6029             first = uiter_previous32(s.iterator);
   6030             // maybe we're at the start of the string
   6031             if(first == U_SENTINEL) {
   6032                 first = 0;
   6033             } else {
   6034                 uiter_next32(s.iterator);
   6035             }
   6036 
   6037             j = 0;
   6038             for(;;) {
   6039                 if(i == count) {
   6040                     if(j+1 < bocsuBytesWritten) {
   6041                         bocsuBytesUsed = j+1;
   6042                     }
   6043                     goto saveState;
   6044                 }
   6045 
   6046                 // On identical level, we will always save
   6047                 // the state if we reach this point, since
   6048                 // we don't depend on getNextCE for content
   6049                 // all the content is in our buffer and we
   6050                 // already either stored the full buffer OR
   6051                 // otherwise we won't arrive here.
   6052                 newState = s.iterator->getState(s.iterator);
   6053                 if(newState != UITER_NO_STATE) {
   6054                     iterState = newState;
   6055                     cces = 0;
   6056                 }
   6057 
   6058                 uint8_t buff[4];
   6059                 second = uiter_next32(s.iterator);
   6060                 cces++;
   6061 
   6062                 // end condition for identical level
   6063                 if(second == U_SENTINEL) {
   6064                     terminatePSKLevel(level, maxLevel, i, dest);
   6065                     level = UCOL_PSK_NULL;
   6066                     break;
   6067                 }
   6068                 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
   6069                 first = second;
   6070 
   6071                 j = 0;
   6072                 if(bocsuBytesUsed != 0) {
   6073                     while(bocsuBytesUsed-->0) {
   6074                         j++;
   6075                     }
   6076                 }
   6077 
   6078                 while(i < count && j < bocsuBytesWritten) {
   6079                     dest[i++] = buff[j++];
   6080                 }
   6081             }
   6082 
   6083         } else {
   6084             level = UCOL_PSK_NULL;
   6085         }
   6086         /* fall through to next level */
   6087     case UCOL_PSK_NULL:
   6088         j = i;
   6089         while(j<count) {
   6090             dest[j++]=0;
   6091         }
   6092         break;
   6093     default:
   6094         *status = U_INTERNAL_PROGRAM_ERROR;
   6095         UTRACE_EXIT_STATUS(*status);
   6096         return 0;
   6097     }
   6098 
   6099 saveState:
   6100     // Now we need to return stuff. First we want to see whether we have
   6101     // done everything for the current state of iterator.
   6102     if(byteCountOrFrenchDone
   6103         || canUpdateState == FALSE
   6104         || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
   6105     {
   6106         // Any of above mean that the previous transaction
   6107         // wasn't finished and that we should store the
   6108         // previous iterator state.
   6109         state[0] = iterState;
   6110     } else {
   6111         // The transaction is complete. We will continue in the next iteration.
   6112         state[0] = s.iterator->getState(s.iterator);
   6113         cces = 0;
   6114     }
   6115     // Store the number of bocsu bytes written.
   6116     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
   6117         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6118     }
   6119     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
   6120 
   6121     // Next we put in the level of comparison
   6122     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
   6123 
   6124     // If we are doing French, we need to store whether we have just finished the French level
   6125     if(level == UCOL_PSK_SECONDARY && doingFrench) {
   6126         state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6127     } else {
   6128         state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6129     }
   6130 
   6131     // Was the latest CE shifted
   6132     if(wasShifted) {
   6133         state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
   6134     }
   6135     // Check for cces overflow
   6136     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
   6137         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6138     }
   6139     // Store cces
   6140     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
   6141 
   6142     // Check for French overflow
   6143     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
   6144         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6145     }
   6146     // Store number of bytes written in the French secondary continuation sequence
   6147     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
   6148 
   6149 
   6150     // If we have used normalizing iterator, get rid of it
   6151     if(normIter != NULL) {
   6152         unorm_closeIter(normIter);
   6153     }
   6154 
   6155     /* To avoid memory leak, free the offset buffer if necessary. */
   6156     ucol_freeOffsetBuffer(&s);
   6157 
   6158     // Return number of meaningful sortkey bytes.
   6159     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
   6160                   dest,i, state[0], state[1]);
   6161     UTRACE_EXIT_VALUE(i);
   6162     return i;
   6163 }
   6164 
   6165 /**
   6166  * Produce a bound for a given sortkey and a number of levels.
   6167  */
   6168 U_CAPI int32_t U_EXPORT2
   6169 ucol_getBound(const uint8_t       *source,
   6170         int32_t             sourceLength,
   6171         UColBoundMode       boundType,
   6172         uint32_t            noOfLevels,
   6173         uint8_t             *result,
   6174         int32_t             resultLength,
   6175         UErrorCode          *status)
   6176 {
   6177     // consistency checks
   6178     if(status == NULL || U_FAILURE(*status)) {
   6179         return 0;
   6180     }
   6181     if(source == NULL) {
   6182         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6183         return 0;
   6184     }
   6185 
   6186     int32_t sourceIndex = 0;
   6187     // Scan the string until we skip enough of the key OR reach the end of the key
   6188     do {
   6189         sourceIndex++;
   6190         if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
   6191             noOfLevels--;
   6192         }
   6193     } while (noOfLevels > 0
   6194         && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
   6195 
   6196     if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
   6197         && noOfLevels > 0) {
   6198             *status = U_SORT_KEY_TOO_SHORT_WARNING;
   6199     }
   6200 
   6201 
   6202     // READ ME: this code assumes that the values for boundType
   6203     // enum will not changes. They are set so that the enum value
   6204     // corresponds to the number of extra bytes each bound type
   6205     // needs.
   6206     if(result != NULL && resultLength >= sourceIndex+boundType) {
   6207         uprv_memcpy(result, source, sourceIndex);
   6208         switch(boundType) {
   6209             // Lower bound just gets terminated. No extra bytes
   6210         case UCOL_BOUND_LOWER: // = 0
   6211             break;
   6212             // Upper bound needs one extra byte
   6213         case UCOL_BOUND_UPPER: // = 1
   6214             result[sourceIndex++] = 2;
   6215             break;
   6216             // Upper long bound needs two extra bytes
   6217         case UCOL_BOUND_UPPER_LONG: // = 2
   6218             result[sourceIndex++] = 0xFF;
   6219             result[sourceIndex++] = 0xFF;
   6220             break;
   6221         default:
   6222             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6223             return 0;
   6224         }
   6225         result[sourceIndex++] = 0;
   6226 
   6227         return sourceIndex;
   6228     } else {
   6229         return sourceIndex+boundType+1;
   6230     }
   6231 }
   6232 
   6233 /****************************************************************************/
   6234 /* Following are the functions that deal with the properties of a collator  */
   6235 /* there are new APIs and some compatibility APIs                           */
   6236 /****************************************************************************/
   6237 
   6238 static inline void
   6239 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
   6240                     int32_t *primShift, int32_t *secShift, int32_t *terShift)
   6241 {
   6242     uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
   6243     UBool reverseSecondary = FALSE;
   6244     UBool continuation = isContinuation(CE);
   6245     if(!continuation) {
   6246         tertiary = (uint8_t)((CE & coll->tertiaryMask));
   6247         tertiary ^= coll->caseSwitch;
   6248         reverseSecondary = TRUE;
   6249     } else {
   6250         tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   6251         tertiary &= UCOL_REMOVE_CASE;
   6252         reverseSecondary = FALSE;
   6253     }
   6254 
   6255     secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6256     primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6257     primary1 = (uint8_t)(CE >> 8);
   6258 
   6259     if(primary1 != 0) {
   6260         if (coll->leadBytePermutationTable != NULL && !continuation) {
   6261             primary1 = coll->leadBytePermutationTable[primary1];
   6262         }
   6263 
   6264         coll->latinOneCEs[ch] |= (primary1 << *primShift);
   6265         *primShift -= 8;
   6266     }
   6267     if(primary2 != 0) {
   6268         if(*primShift < 0) {
   6269             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6270             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6271             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6272             return;
   6273         }
   6274         coll->latinOneCEs[ch] |= (primary2 << *primShift);
   6275         *primShift -= 8;
   6276     }
   6277     if(secondary != 0) {
   6278         if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
   6279             coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
   6280             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
   6281         } else { // normal case
   6282             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
   6283         }
   6284         *secShift -= 8;
   6285     }
   6286     if(tertiary != 0) {
   6287         coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
   6288         *terShift -= 8;
   6289     }
   6290 }
   6291 
   6292 static inline UBool
   6293 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
   6294     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
   6295     if(newTable == NULL) {
   6296       *status = U_MEMORY_ALLOCATION_ERROR;
   6297       coll->latinOneFailed = TRUE;
   6298       return FALSE;
   6299     }
   6300     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
   6301     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
   6302     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
   6303     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
   6304     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
   6305     coll->latinOneTableLen = size;
   6306     uprv_free(coll->latinOneCEs);
   6307     coll->latinOneCEs = newTable;
   6308     return TRUE;
   6309 }
   6310 
   6311 static UBool
   6312 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
   6313     UBool result = TRUE;
   6314     if(coll->latinOneCEs == NULL) {
   6315         coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
   6316         if(coll->latinOneCEs == NULL) {
   6317             *status = U_MEMORY_ALLOCATION_ERROR;
   6318             return FALSE;
   6319         }
   6320         coll->latinOneTableLen = UCOL_LATINONETABLELEN;
   6321     }
   6322     UChar ch = 0;
   6323     UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
   6324     // Check for null pointer
   6325     if (U_FAILURE(*status)) {
   6326         ucol_closeElements(it);
   6327         return FALSE;
   6328     }
   6329     uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
   6330 
   6331     int32_t primShift = 24, secShift = 24, terShift = 24;
   6332     uint32_t CE = 0;
   6333     int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
   6334 
   6335     // TODO: make safe if you get more than you wanted...
   6336     for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
   6337         primShift = 24; secShift = 24; terShift = 24;
   6338         if(ch < 0x100) {
   6339             CE = coll->latinOneMapping[ch];
   6340         } else {
   6341             CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   6342             if(CE == UCOL_NOT_FOUND && coll->UCA) {
   6343                 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   6344             }
   6345         }
   6346         if(CE < UCOL_NOT_FOUND) {
   6347             ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6348         } else {
   6349             switch (getCETag(CE)) {
   6350             case EXPANSION_TAG:
   6351             case DIGIT_TAG:
   6352                 ucol_setText(it, &ch, 1, status);
   6353                 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
   6354                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6355                         coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6356                         coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6357                         coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6358                         break;
   6359                     }
   6360                     ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6361                 }
   6362                 break;
   6363             case CONTRACTION_TAG:
   6364                 // here is the trick
   6365                 // F2 is contraction. We do something very similar to contractions
   6366                 // but have two indices, one in the real contraction table and the
   6367                 // other to where we stuffed things. This hopes that we don't have
   6368                 // many contractions (this should work for latin-1 tables).
   6369                 {
   6370                     if((CE & 0x00FFF000) != 0) {
   6371                         *status = U_UNSUPPORTED_ERROR;
   6372                         goto cleanup_after_failure;
   6373                     }
   6374 
   6375                     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   6376 
   6377                     CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
   6378 
   6379                     coll->latinOneCEs[ch] = CE;
   6380                     coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
   6381                     coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
   6382 
   6383                     // We're going to jump into contraction table, pick the elements
   6384                     // and use them
   6385                     do {
   6386                         CE = *(coll->contractionCEs +
   6387                             (UCharOffset - coll->contractionIndex));
   6388                         if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
   6389                             uint32_t size;
   6390                             uint32_t i;    /* general counter */
   6391                             uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   6392                             size = getExpansionCount(CE);
   6393                             //CE = *CEOffset++;
   6394                             if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   6395                                 for(i = 0; i<size; i++) {
   6396                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6397                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6398                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6399                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6400                                         break;
   6401                                     }
   6402                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
   6403                                 }
   6404                             } else { /* else, we do */
   6405                                 while(*CEOffset != 0) {
   6406                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6407                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6408                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6409                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6410                                         break;
   6411                                     }
   6412                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
   6413                                 }
   6414                             }
   6415                             contractionOffset++;
   6416                         } else if(CE < UCOL_NOT_FOUND) {
   6417                             ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
   6418                         } else {
   6419                             coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6420                             coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6421                             coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6422                             contractionOffset++;
   6423                         }
   6424                         UCharOffset++;
   6425                         primShift = 24; secShift = 24; terShift = 24;
   6426                         if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
   6427                             if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
   6428                                 goto cleanup_after_failure;
   6429                             }
   6430                         }
   6431                     } while(*UCharOffset != 0xFFFF);
   6432                 }
   6433                 break;;
   6434             case SPEC_PROC_TAG:
   6435                 {
   6436                     // 0xB7 is a precontext character defined in UCA5.1, a special
   6437                     // handle is implemeted in order to save LatinOne table for
   6438                     // most locales.
   6439                     if (ch==0xb7) {
   6440                         ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6441                     }
   6442                     else {
   6443                         goto cleanup_after_failure;
   6444                     }
   6445                 }
   6446                 break;
   6447             default:
   6448                 goto cleanup_after_failure;
   6449             }
   6450         }
   6451     }
   6452     // compact table
   6453     if(contractionOffset < coll->latinOneTableLen) {
   6454         if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
   6455             goto cleanup_after_failure;
   6456         }
   6457     }
   6458     ucol_closeElements(it);
   6459     return result;
   6460 
   6461 cleanup_after_failure:
   6462     // status should already be set before arriving here.
   6463     coll->latinOneFailed = TRUE;
   6464     ucol_closeElements(it);
   6465     return FALSE;
   6466 }
   6467 
   6468 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
   6469     if(U_SUCCESS(*status)) {
   6470         if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6471             coll->caseSwitch = UCOL_CASE_SWITCH;
   6472         } else {
   6473             coll->caseSwitch = UCOL_NO_CASE_SWITCH;
   6474         }
   6475 
   6476         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
   6477             coll->tertiaryMask = UCOL_REMOVE_CASE;
   6478             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6479             coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
   6480             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
   6481             coll->tertiaryBottom = UCOL_COMMON_BOT3;
   6482         } else {
   6483             coll->tertiaryMask = UCOL_KEEP_CASE;
   6484             coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
   6485             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6486                 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
   6487                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
   6488                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
   6489             } else {
   6490                 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6491                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
   6492                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
   6493             }
   6494         }
   6495 
   6496         /* Set the compression values */
   6497         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
   6498         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
   6499         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
   6500 
   6501         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
   6502             && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
   6503         {
   6504             coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
   6505         } else {
   6506             coll->sortKeyGen = ucol_calcSortKey;
   6507         }
   6508         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
   6509             && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
   6510         {
   6511             if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
   6512                 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
   6513                     //fprintf(stderr, "F");
   6514                     coll->latinOneUse = TRUE;
   6515                 } else {
   6516                     coll->latinOneUse = FALSE;
   6517                 }
   6518                 if(*status == U_UNSUPPORTED_ERROR) {
   6519                     *status = U_ZERO_ERROR;
   6520                 }
   6521             } else { // latin1Table exists and it doesn't need to be regenerated, just use it
   6522                 coll->latinOneUse = TRUE;
   6523             }
   6524         } else {
   6525             coll->latinOneUse = FALSE;
   6526         }
   6527     }
   6528 }
   6529 
   6530 U_CAPI uint32_t  U_EXPORT2
   6531 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
   6532     if(U_FAILURE(*status) || coll == NULL) {
   6533         return 0;
   6534     }
   6535     if(len == -1) {
   6536         len = u_strlen(varTop);
   6537     }
   6538     if(len == 0) {
   6539         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6540         return 0;
   6541     }
   6542 
   6543     if(coll->delegate!=NULL) {
   6544       return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);
   6545     }
   6546 
   6547 
   6548     collIterate s;
   6549     IInit_collIterate(coll, varTop, len, &s, status);
   6550     if(U_FAILURE(*status)) {
   6551         return 0;
   6552     }
   6553 
   6554     uint32_t CE = ucol_IGetNextCE(coll, &s, status);
   6555 
   6556     /* here we check if we have consumed all characters */
   6557     /* you can put in either one character or a contraction */
   6558     /* you shouldn't put more... */
   6559     if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
   6560         *status = U_CE_NOT_FOUND_ERROR;
   6561         return 0;
   6562     }
   6563 
   6564     uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
   6565 
   6566     if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
   6567         *status = U_PRIMARY_TOO_LONG_ERROR;
   6568         return 0;
   6569     }
   6570     if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
   6571         coll->variableTopValueisDefault = FALSE;
   6572         coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
   6573     }
   6574 
   6575     /* To avoid memory leak, free the offset buffer if necessary. */
   6576     ucol_freeOffsetBuffer(&s);
   6577 
   6578     return CE & UCOL_PRIMARYMASK;
   6579 }
   6580 
   6581 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
   6582     if(U_FAILURE(*status) || coll == NULL) {
   6583         return 0;
   6584     }
   6585     if(coll->delegate!=NULL) {
   6586       return ((const Collator*)coll->delegate)->getVariableTop(*status);
   6587     }
   6588     return coll->variableTopValue<<16;
   6589 }
   6590 
   6591 U_CAPI void  U_EXPORT2
   6592 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
   6593     if(U_FAILURE(*status) || coll == NULL) {
   6594         return;
   6595     }
   6596 
   6597     if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
   6598         coll->variableTopValueisDefault = FALSE;
   6599         coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
   6600     }
   6601 }
   6602 /* Attribute setter API */
   6603 U_CAPI void  U_EXPORT2
   6604 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
   6605     if(U_FAILURE(*status) || coll == NULL) {
   6606       return;
   6607     }
   6608 
   6609     if(coll->delegate != NULL) {
   6610       ((Collator*)coll->delegate)->setAttribute(attr,value,*status);
   6611       return;
   6612     }
   6613 
   6614     UColAttributeValue oldFrench = coll->frenchCollation;
   6615     UColAttributeValue oldCaseFirst = coll->caseFirst;
   6616     switch(attr) {
   6617     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
   6618         if(value == UCOL_ON) {
   6619             coll->numericCollation = UCOL_ON;
   6620             coll->numericCollationisDefault = FALSE;
   6621         } else if (value == UCOL_OFF) {
   6622             coll->numericCollation = UCOL_OFF;
   6623             coll->numericCollationisDefault = FALSE;
   6624         } else if (value == UCOL_DEFAULT) {
   6625             coll->numericCollationisDefault = TRUE;
   6626             coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
   6627         } else {
   6628             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6629         }
   6630         break;
   6631     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
   6632         if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) {
   6633             // This attribute is an implementation detail of the CLDR Japanese tailoring.
   6634             // The implementation might change to use a different mechanism
   6635             // to achieve the same Japanese sort order.
   6636             // Since ICU 50, this attribute is not settable any more via API functions.
   6637         } else {
   6638             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6639         }
   6640         break;
   6641     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   6642         if(value == UCOL_ON) {
   6643             coll->frenchCollation = UCOL_ON;
   6644             coll->frenchCollationisDefault = FALSE;
   6645         } else if (value == UCOL_OFF) {
   6646             coll->frenchCollation = UCOL_OFF;
   6647             coll->frenchCollationisDefault = FALSE;
   6648         } else if (value == UCOL_DEFAULT) {
   6649             coll->frenchCollationisDefault = TRUE;
   6650             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
   6651         } else {
   6652             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6653         }
   6654         break;
   6655     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   6656         if(value == UCOL_SHIFTED) {
   6657             coll->alternateHandling = UCOL_SHIFTED;
   6658             coll->alternateHandlingisDefault = FALSE;
   6659         } else if (value == UCOL_NON_IGNORABLE) {
   6660             coll->alternateHandling = UCOL_NON_IGNORABLE;
   6661             coll->alternateHandlingisDefault = FALSE;
   6662         } else if (value == UCOL_DEFAULT) {
   6663             coll->alternateHandlingisDefault = TRUE;
   6664             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
   6665         } else {
   6666             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6667         }
   6668         break;
   6669     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   6670         if(value == UCOL_LOWER_FIRST) {
   6671             coll->caseFirst = UCOL_LOWER_FIRST;
   6672             coll->caseFirstisDefault = FALSE;
   6673         } else if (value == UCOL_UPPER_FIRST) {
   6674             coll->caseFirst = UCOL_UPPER_FIRST;
   6675             coll->caseFirstisDefault = FALSE;
   6676         } else if (value == UCOL_OFF) {
   6677             coll->caseFirst = UCOL_OFF;
   6678             coll->caseFirstisDefault = FALSE;
   6679         } else if (value == UCOL_DEFAULT) {
   6680             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
   6681             coll->caseFirstisDefault = TRUE;
   6682         } else {
   6683             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6684         }
   6685         break;
   6686     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   6687         if(value == UCOL_ON) {
   6688             coll->caseLevel = UCOL_ON;
   6689             coll->caseLevelisDefault = FALSE;
   6690         } else if (value == UCOL_OFF) {
   6691             coll->caseLevel = UCOL_OFF;
   6692             coll->caseLevelisDefault = FALSE;
   6693         } else if (value == UCOL_DEFAULT) {
   6694             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
   6695             coll->caseLevelisDefault = TRUE;
   6696         } else {
   6697             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6698         }
   6699         break;
   6700     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   6701         if(value == UCOL_ON) {
   6702             coll->normalizationMode = UCOL_ON;
   6703             coll->normalizationModeisDefault = FALSE;
   6704             initializeFCD(status);
   6705         } else if (value == UCOL_OFF) {
   6706             coll->normalizationMode = UCOL_OFF;
   6707             coll->normalizationModeisDefault = FALSE;
   6708         } else if (value == UCOL_DEFAULT) {
   6709             coll->normalizationModeisDefault = TRUE;
   6710             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
   6711             if(coll->normalizationMode == UCOL_ON) {
   6712                 initializeFCD(status);
   6713             }
   6714         } else {
   6715             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6716         }
   6717         break;
   6718     case UCOL_STRENGTH:         /* attribute for strength */
   6719         if (value == UCOL_DEFAULT) {
   6720             coll->strengthisDefault = TRUE;
   6721             coll->strength = (UColAttributeValue)coll->options->strength;
   6722         } else if (value <= UCOL_IDENTICAL) {
   6723             coll->strengthisDefault = FALSE;
   6724             coll->strength = value;
   6725         } else {
   6726             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6727         }
   6728         break;
   6729     case UCOL_ATTRIBUTE_COUNT:
   6730     default:
   6731         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6732         break;
   6733     }
   6734     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
   6735         coll->latinOneRegenTable = TRUE;
   6736     } else {
   6737         coll->latinOneRegenTable = FALSE;
   6738     }
   6739     ucol_updateInternalState(coll, status);
   6740 }
   6741 
   6742 U_CAPI UColAttributeValue  U_EXPORT2
   6743 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
   6744     if(U_FAILURE(*status) || coll == NULL) {
   6745       return UCOL_DEFAULT;
   6746     }
   6747 
   6748     if(coll->delegate != NULL) {
   6749       return ((Collator*)coll->delegate)->getAttribute(attr,*status);
   6750     }
   6751 
   6752     switch(attr) {
   6753     case UCOL_NUMERIC_COLLATION:
   6754       return coll->numericCollation;
   6755     case UCOL_HIRAGANA_QUATERNARY_MODE:
   6756       return coll->hiraganaQ;
   6757     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   6758         return coll->frenchCollation;
   6759     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   6760         return coll->alternateHandling;
   6761     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   6762         return coll->caseFirst;
   6763     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   6764         return coll->caseLevel;
   6765     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   6766         return coll->normalizationMode;
   6767     case UCOL_STRENGTH:         /* attribute for strength */
   6768         return coll->strength;
   6769     case UCOL_ATTRIBUTE_COUNT:
   6770     default:
   6771         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6772         break;
   6773     }
   6774     return UCOL_DEFAULT;
   6775 }
   6776 
   6777 U_CAPI void U_EXPORT2
   6778 ucol_setStrength(    UCollator                *coll,
   6779             UCollationStrength        strength)
   6780 {
   6781     UErrorCode status = U_ZERO_ERROR;
   6782     ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
   6783 }
   6784 
   6785 U_CAPI UCollationStrength U_EXPORT2
   6786 ucol_getStrength(const UCollator *coll)
   6787 {
   6788     UErrorCode status = U_ZERO_ERROR;
   6789     return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
   6790 }
   6791 
   6792 U_CAPI int32_t U_EXPORT2
   6793 ucol_getReorderCodes(const UCollator *coll,
   6794                     int32_t *dest,
   6795                     int32_t destCapacity,
   6796                     UErrorCode *status) {
   6797     if (U_FAILURE(*status)) {
   6798         return 0;
   6799     }
   6800 
   6801     if(coll->delegate!=NULL) {
   6802       return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status);
   6803     }
   6804 
   6805     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
   6806         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6807         return 0;
   6808     }
   6809 
   6810 #ifdef UCOL_DEBUG
   6811     printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
   6812     printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
   6813 #endif
   6814 
   6815     if (coll->reorderCodesLength > destCapacity) {
   6816         *status = U_BUFFER_OVERFLOW_ERROR;
   6817         return coll->reorderCodesLength;
   6818     }
   6819     for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
   6820         dest[i] = coll->reorderCodes[i];
   6821     }
   6822     return coll->reorderCodesLength;
   6823 }
   6824 
   6825 U_CAPI void U_EXPORT2
   6826 ucol_setReorderCodes(UCollator* coll,
   6827                     const int32_t* reorderCodes,
   6828                     int32_t reorderCodesLength,
   6829                     UErrorCode *status) {
   6830     if (U_FAILURE(*status)) {
   6831         return;
   6832     }
   6833 
   6834     if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
   6835         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6836         return;
   6837     }
   6838 
   6839     if(coll->delegate!=NULL) {
   6840       ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
   6841       return;
   6842     }
   6843 
   6844     if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
   6845         uprv_free(coll->reorderCodes);
   6846     }
   6847     coll->reorderCodes = NULL;
   6848     coll->reorderCodesLength = 0;
   6849     if (reorderCodesLength == 0) {
   6850         if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
   6851             uprv_free(coll->leadBytePermutationTable);
   6852         }
   6853         coll->leadBytePermutationTable = NULL;
   6854         return;
   6855     }
   6856     coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
   6857     if (coll->reorderCodes == NULL) {
   6858         *status = U_MEMORY_ALLOCATION_ERROR;
   6859         return;
   6860     }
   6861     coll->freeReorderCodesOnClose = TRUE;
   6862     for (int32_t i = 0; i < reorderCodesLength; i++) {
   6863         coll->reorderCodes[i] = reorderCodes[i];
   6864     }
   6865     coll->reorderCodesLength = reorderCodesLength;
   6866     ucol_buildPermutationTable(coll, status);
   6867 }
   6868 
   6869 U_CAPI int32_t U_EXPORT2
   6870 ucol_getEquivalentReorderCodes(int32_t reorderCode,
   6871                     int32_t* dest,
   6872                     int32_t destCapacity,
   6873                     UErrorCode *pErrorCode) {
   6874     bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
   6875     uint16_t leadBytes[256];
   6876     int leadBytesCount;
   6877     int leadByteIndex;
   6878     int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
   6879     int reorderCodesForLeadByteCount;
   6880     int reorderCodeIndex;
   6881 
   6882     int32_t equivalentCodesCount = 0;
   6883     int setIndex;
   6884 
   6885     if (U_FAILURE(*pErrorCode)) {
   6886         return 0;
   6887     }
   6888 
   6889     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
   6890         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   6891         return 0;
   6892     }
   6893 
   6894     uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
   6895 
   6896     const UCollator* uca = ucol_initUCA(pErrorCode);
   6897     if (U_FAILURE(*pErrorCode)) {
   6898 	return 0;
   6899     }
   6900     leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
   6901     for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
   6902         reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
   6903             uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
   6904         for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
   6905             equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
   6906         }
   6907     }
   6908 
   6909     for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
   6910         if (equivalentCodesSet[setIndex] == true) {
   6911             equivalentCodesCount++;
   6912         }
   6913     }
   6914 
   6915     if (destCapacity == 0) {
   6916         return equivalentCodesCount;
   6917     }
   6918 
   6919     equivalentCodesCount = 0;
   6920     for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
   6921         if (equivalentCodesSet[setIndex] == true) {
   6922             dest[equivalentCodesCount++] = setIndex;
   6923             if (equivalentCodesCount >= destCapacity) {
   6924                 break;
   6925             }
   6926         }
   6927     }
   6928     return equivalentCodesCount;
   6929 }
   6930 
   6931 
   6932 /****************************************************************************/
   6933 /* Following are misc functions                                             */
   6934 /* there are new APIs and some compatibility APIs                           */
   6935 /****************************************************************************/
   6936 
   6937 U_CAPI void U_EXPORT2
   6938 ucol_getVersion(const UCollator* coll,
   6939                 UVersionInfo versionInfo)
   6940 {
   6941     if(coll->delegate!=NULL) {
   6942       ((const Collator*)coll->delegate)->getVersion(versionInfo);
   6943       return;
   6944     }
   6945     /* RunTime version  */
   6946     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
   6947     /* Builder version*/
   6948     uint8_t bdVersion = coll->image->version[0];
   6949 
   6950     /* Charset Version. Need to get the version from cnv files
   6951      * makeconv should populate cnv files with version and
   6952      * an api has to be provided in ucnv.h to obtain this version
   6953      */
   6954     uint8_t csVersion = 0;
   6955 
   6956     /* combine the version info */
   6957     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
   6958 
   6959     /* Tailoring rules */
   6960     versionInfo[0] = (uint8_t)(cmbVersion>>8);
   6961     versionInfo[1] = (uint8_t)cmbVersion;
   6962     versionInfo[2] = coll->image->version[1];
   6963     if(coll->UCA) {
   6964         /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
   6965         versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
   6966     } else {
   6967         versionInfo[3] = 0;
   6968     }
   6969 }
   6970 
   6971 
   6972 /* This internal API checks whether a character is tailored or not */
   6973 U_CAPI UBool  U_EXPORT2
   6974 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
   6975     if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
   6976         return FALSE;
   6977     }
   6978 
   6979     uint32_t CE = UCOL_NOT_FOUND;
   6980     const UChar *ContractionStart = NULL;
   6981     if(u < 0x100) { /* latin-1 */
   6982         CE = coll->latinOneMapping[u];
   6983         if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
   6984             return FALSE;
   6985         }
   6986     } else { /* regular */
   6987         CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
   6988     }
   6989 
   6990     if(isContraction(CE)) {
   6991         ContractionStart = (UChar *)coll->image+getContractOffset(CE);
   6992         CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
   6993     }
   6994 
   6995     return (UBool)(CE != UCOL_NOT_FOUND);
   6996 }
   6997 
   6998 
   6999 /****************************************************************************/
   7000 /* Following are the string compare functions                               */
   7001 /*                                                                          */
   7002 /****************************************************************************/
   7003 
   7004 
   7005 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
   7006 /*                     Used by strcoll if strength == identical and strings  */
   7007 /*                     are otherwise equal.                                  */
   7008 /*                                                                           */
   7009 /*                     Comparison must be done on NFD normalized strings.    */
   7010 /*                     FCD is not good enough.                               */
   7011 
   7012 static
   7013 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
   7014 {
   7015     // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
   7016     // of same type, but that doesn't really mean that it will stay that way.
   7017     int32_t            comparison;
   7018 
   7019     if (sColl->flags & UCOL_USE_ITERATOR) {
   7020         // The division for the array length may truncate the array size to
   7021         // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   7022         // for all platforms anyway.
   7023         UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   7024         UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   7025         UNormIterator *sNIt = NULL, *tNIt = NULL;
   7026         sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   7027         tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   7028         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   7029         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   7030         UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
   7031         UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
   7032         comparison = u_strCompareIter(sIt, tIt, TRUE);
   7033         unorm_closeIter(sNIt);
   7034         unorm_closeIter(tNIt);
   7035     } else {
   7036         int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
   7037         const UChar *sBuf = sColl->string;
   7038         int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
   7039         const UChar *tBuf = tColl->string;
   7040 
   7041         if (normalize) {
   7042             *status = U_ZERO_ERROR;
   7043             // Note: We could use Normalizer::compare() or similar, but for short strings
   7044             // which may not be in FCD it might be faster to just NFD them.
   7045             // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
   7046             // NFD'ing immediately might be faster for long strings,
   7047             // but string comparison is usually done on relatively short strings.
   7048             sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
   7049                                   sColl->writableBuffer,
   7050                                   *status);
   7051             tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
   7052                                   tColl->writableBuffer,
   7053                                   *status);
   7054             if(U_FAILURE(*status)) {
   7055                 return UCOL_LESS;
   7056             }
   7057             comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
   7058         } else {
   7059             comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
   7060         }
   7061     }
   7062 
   7063     if (comparison < 0) {
   7064         return UCOL_LESS;
   7065     } else if (comparison == 0) {
   7066         return UCOL_EQUAL;
   7067     } else /* comparison > 0 */ {
   7068         return UCOL_GREATER;
   7069     }
   7070 }
   7071 
   7072 /*  CEBuf - A struct and some inline functions to handle the saving    */
   7073 /*          of CEs in a buffer within ucol_strcoll                     */
   7074 
   7075 #define UCOL_CEBUF_SIZE 512
   7076 typedef struct ucol_CEBuf {
   7077     uint32_t    *buf;
   7078     uint32_t    *endp;
   7079     uint32_t    *pos;
   7080     uint32_t     localArray[UCOL_CEBUF_SIZE];
   7081 } ucol_CEBuf;
   7082 
   7083 
   7084 static
   7085 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
   7086     (b)->buf = (b)->pos = (b)->localArray;
   7087     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
   7088 }
   7089 
   7090 static
   7091 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
   7092     uint32_t  oldSize;
   7093     uint32_t  newSize;
   7094     uint32_t  *newBuf;
   7095 
   7096     ci->flags |= UCOL_ITER_ALLOCATED;
   7097     oldSize = (uint32_t)(b->pos - b->buf);
   7098     newSize = oldSize * 2;
   7099     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
   7100     if(newBuf == NULL) {
   7101         *status = U_MEMORY_ALLOCATION_ERROR;
   7102     }
   7103     else {
   7104         uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
   7105         if (b->buf != b->localArray) {
   7106             uprv_free(b->buf);
   7107         }
   7108         b->buf = newBuf;
   7109         b->endp = b->buf + newSize;
   7110         b->pos  = b->buf + oldSize;
   7111     }
   7112 }
   7113 
   7114 static
   7115 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
   7116     if (b->pos == b->endp) {
   7117         ucol_CEBuf_Expand(b, ci, status);
   7118     }
   7119     if (U_SUCCESS(*status)) {
   7120         *(b)->pos++ = ce;
   7121     }
   7122 }
   7123 
   7124 /* This is a trick string compare function that goes in and uses sortkeys to compare */
   7125 /* It is used when compare gets in trouble and needs to bail out                     */
   7126 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
   7127                                                   collIterate *tColl,
   7128                                                   UErrorCode *status)
   7129 {
   7130     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
   7131     uint8_t *sourceKeyP = sourceKey;
   7132     uint8_t *targetKeyP = targetKey;
   7133     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
   7134     const UCollator *coll = sColl->coll;
   7135     const UChar *source = NULL;
   7136     const UChar *target = NULL;
   7137     int32_t result = UCOL_EQUAL;
   7138     UnicodeString sourceString, targetString;
   7139     int32_t sourceLength;
   7140     int32_t targetLength;
   7141 
   7142     if(sColl->flags & UCOL_USE_ITERATOR) {
   7143         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   7144         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   7145         UChar32 c;
   7146         while((c=sColl->iterator->next(sColl->iterator))>=0) {
   7147             sourceString.append((UChar)c);
   7148         }
   7149         while((c=tColl->iterator->next(tColl->iterator))>=0) {
   7150             targetString.append((UChar)c);
   7151         }
   7152         source = sourceString.getBuffer();
   7153         sourceLength = sourceString.length();
   7154         target = targetString.getBuffer();
   7155         targetLength = targetString.length();
   7156     } else { // no iterators
   7157         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
   7158         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
   7159         source = sColl->string;
   7160         target = tColl->string;
   7161     }
   7162 
   7163 
   7164 
   7165     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7166     if(sourceKeyLen > UCOL_MAX_BUFFER) {
   7167         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
   7168         if(sourceKeyP == NULL) {
   7169             *status = U_MEMORY_ALLOCATION_ERROR;
   7170             goto cleanup_and_do_compare;
   7171         }
   7172         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7173     }
   7174 
   7175     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7176     if(targetKeyLen > UCOL_MAX_BUFFER) {
   7177         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
   7178         if(targetKeyP == NULL) {
   7179             *status = U_MEMORY_ALLOCATION_ERROR;
   7180             goto cleanup_and_do_compare;
   7181         }
   7182         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7183     }
   7184 
   7185     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
   7186 
   7187 cleanup_and_do_compare:
   7188     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
   7189         uprv_free(sourceKeyP);
   7190     }
   7191 
   7192     if(targetKeyP != NULL && targetKeyP != targetKey) {
   7193         uprv_free(targetKeyP);
   7194     }
   7195 
   7196     if(result<0) {
   7197         return UCOL_LESS;
   7198     } else if(result>0) {
   7199         return UCOL_GREATER;
   7200     } else {
   7201         return UCOL_EQUAL;
   7202     }
   7203 }
   7204 
   7205 
   7206 static UCollationResult
   7207 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
   7208 {
   7209     U_ALIGN_CODE(16);
   7210 
   7211     const UCollator *coll = sColl->coll;
   7212 
   7213 
   7214     // setting up the collator parameters
   7215     UColAttributeValue strength = coll->strength;
   7216     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
   7217 
   7218     UBool checkSecTer = initialCheckSecTer;
   7219     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
   7220     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
   7221     UBool checkIdent = (strength == UCOL_IDENTICAL);
   7222     UBool checkCase = (coll->caseLevel == UCOL_ON);
   7223     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
   7224     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
   7225     UBool qShifted = shifted && checkQuad;
   7226     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
   7227 
   7228     if(doHiragana && shifted) {
   7229         return (ucol_compareUsingSortKeys(sColl, tColl, status));
   7230     }
   7231     uint8_t caseSwitch = coll->caseSwitch;
   7232     uint8_t tertiaryMask = coll->tertiaryMask;
   7233 
   7234     // This is the lowest primary value that will not be ignored if shifted
   7235     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
   7236 
   7237     UCollationResult result = UCOL_EQUAL;
   7238     UCollationResult hirResult = UCOL_EQUAL;
   7239 
   7240     // Preparing the CE buffers. They will be filled during the primary phase
   7241     ucol_CEBuf   sCEs;
   7242     ucol_CEBuf   tCEs;
   7243     UCOL_INIT_CEBUF(&sCEs);
   7244     UCOL_INIT_CEBUF(&tCEs);
   7245 
   7246     uint32_t secS = 0, secT = 0;
   7247     uint32_t sOrder=0, tOrder=0;
   7248 
   7249     // Non shifted primary processing is quite simple
   7250     if(!shifted) {
   7251         for(;;) {
   7252 
   7253             // We fetch CEs until we hit a non ignorable primary or end.
   7254             do {
   7255                 // We get the next CE
   7256                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7257                 // Stuff it in the buffer
   7258                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7259                 // And keep just the primary part.
   7260                 sOrder &= UCOL_PRIMARYMASK;
   7261             } while(sOrder == 0);
   7262 
   7263             // see the comments on the above block
   7264             do {
   7265                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7266                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7267                 tOrder &= UCOL_PRIMARYMASK;
   7268             } while(tOrder == 0);
   7269 
   7270             // if both primaries are the same
   7271             if(sOrder == tOrder) {
   7272                 // and there are no more CEs, we advance to the next level
   7273                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7274                     break;
   7275                 }
   7276                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7277                     if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
   7278                         hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
   7279                             ? UCOL_LESS:UCOL_GREATER;
   7280                     }
   7281                 }
   7282             } else {
   7283                 // only need to check one for continuation
   7284                 // if one is then the other must be or the preceding CE would be a prefix of the other
   7285                 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
   7286                     sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
   7287                     tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
   7288                 }
   7289                 // if two primaries are different, we are done
   7290                 result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
   7291                 goto commonReturn;
   7292             }
   7293         } // no primary difference... do the rest from the buffers
   7294     } else { // shifted - do a slightly more complicated processing :)
   7295         for(;;) {
   7296             UBool sInShifted = FALSE;
   7297             UBool tInShifted = FALSE;
   7298             // This version of code can be refactored. However, it seems easier to understand this way.
   7299             // Source loop. Sam as the target loop.
   7300             for(;;) {
   7301                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7302                 if(sOrder == UCOL_NO_MORE_CES) {
   7303                     UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7304                     break;
   7305                 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
   7306                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7307                     continue;
   7308                 } else if(isContinuation(sOrder)) {
   7309                     if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7310                         if(sInShifted) {
   7311                             sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7312                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7313                             continue;
   7314                         } else {
   7315                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7316                             break;
   7317                         }
   7318                     } else { /* Just lower level values */
   7319                         if(sInShifted) {
   7320                             continue;
   7321                         } else {
   7322                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7323                             continue;
   7324                         }
   7325                     }
   7326                 } else { /* regular */
   7327                     if(coll->leadBytePermutationTable != NULL){
   7328                         sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
   7329                     }
   7330                     if((sOrder & UCOL_PRIMARYMASK) > LVT) {
   7331                         UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7332                         break;
   7333                     } else {
   7334                         if((sOrder & UCOL_PRIMARYMASK) > 0) {
   7335                             sInShifted = TRUE;
   7336                             sOrder &= UCOL_PRIMARYMASK;
   7337                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7338                             continue;
   7339                         } else {
   7340                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7341                             sInShifted = FALSE;
   7342                             continue;
   7343                         }
   7344                     }
   7345                 }
   7346             }
   7347             sOrder &= UCOL_PRIMARYMASK;
   7348             sInShifted = FALSE;
   7349 
   7350             for(;;) {
   7351                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7352                 if(tOrder == UCOL_NO_MORE_CES) {
   7353                     UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7354                     break;
   7355                 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
   7356                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7357                     continue;
   7358                 } else if(isContinuation(tOrder)) {
   7359                     if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7360                         if(tInShifted) {
   7361                             tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7362                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7363                             continue;
   7364                         } else {
   7365                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7366                             break;
   7367                         }
   7368                     } else { /* Just lower level values */
   7369                         if(tInShifted) {
   7370                             continue;
   7371                         } else {
   7372                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7373                             continue;
   7374                         }
   7375                     }
   7376                 } else { /* regular */
   7377                     if(coll->leadBytePermutationTable != NULL){
   7378                         tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
   7379                     }
   7380                     if((tOrder & UCOL_PRIMARYMASK) > LVT) {
   7381                         UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7382                         break;
   7383                     } else {
   7384                         if((tOrder & UCOL_PRIMARYMASK) > 0) {
   7385                             tInShifted = TRUE;
   7386                             tOrder &= UCOL_PRIMARYMASK;
   7387                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7388                             continue;
   7389                         } else {
   7390                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7391                             tInShifted = FALSE;
   7392                             continue;
   7393                         }
   7394                     }
   7395                 }
   7396             }
   7397             tOrder &= UCOL_PRIMARYMASK;
   7398             tInShifted = FALSE;
   7399 
   7400             if(sOrder == tOrder) {
   7401                 /*
   7402                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7403                 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
   7404                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
   7405                 ? UCOL_LESS:UCOL_GREATER;
   7406                 }
   7407                 }
   7408                 */
   7409                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7410                     break;
   7411                 } else {
   7412                     sOrder = 0;
   7413                     tOrder = 0;
   7414                     continue;
   7415                 }
   7416             } else {
   7417                 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
   7418                 goto commonReturn;
   7419             }
   7420         } /* no primary difference... do the rest from the buffers */
   7421     }
   7422 
   7423     /* now, we're gonna reexamine collected CEs */
   7424     uint32_t    *sCE;
   7425     uint32_t    *tCE;
   7426 
   7427     /* This is the secondary level of comparison */
   7428     if(checkSecTer) {
   7429         if(!isFrenchSec) { /* normal */
   7430             sCE = sCEs.buf;
   7431             tCE = tCEs.buf;
   7432             for(;;) {
   7433                 while (secS == 0) {
   7434                     secS = *(sCE++) & UCOL_SECONDARYMASK;
   7435                 }
   7436 
   7437                 while(secT == 0) {
   7438                     secT = *(tCE++) & UCOL_SECONDARYMASK;
   7439                 }
   7440 
   7441                 if(secS == secT) {
   7442                     if(secS == UCOL_NO_MORE_CES_SECONDARY) {
   7443                         break;
   7444                     } else {
   7445                         secS = 0; secT = 0;
   7446                         continue;
   7447                     }
   7448                 } else {
   7449                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7450                     goto commonReturn;
   7451                 }
   7452             }
   7453         } else { /* do the French */
   7454             uint32_t *sCESave = NULL;
   7455             uint32_t *tCESave = NULL;
   7456             sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
   7457             tCE = tCEs.pos-2;
   7458             for(;;) {
   7459                 while (secS == 0 && sCE >= sCEs.buf) {
   7460                     if(sCESave == NULL) {
   7461                         secS = *(sCE--);
   7462                         if(isContinuation(secS)) {
   7463                             while(isContinuation(secS = *(sCE--)))
   7464                                 ;
   7465                             /* after this, secS has the start of continuation, and sCEs points before that */
   7466                             sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7467                             sCE+=2;  /* need to point to the first continuation CP */
   7468                             /* However, now you can just continue doing stuff */
   7469                         }
   7470                     } else {
   7471                         secS = *(sCE++);
   7472                         if(!isContinuation(secS)) { /* This means we have finished with this cont */
   7473                             sCE = sCESave;            /* reset the pointer to before continuation */
   7474                             sCESave = NULL;
   7475                             secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
   7476                             continue;
   7477                         }
   7478                     }
   7479                     secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7480                 }
   7481 
   7482                 while(secT == 0 && tCE >= tCEs.buf) {
   7483                     if(tCESave == NULL) {
   7484                         secT = *(tCE--);
   7485                         if(isContinuation(secT)) {
   7486                             while(isContinuation(secT = *(tCE--)))
   7487                                 ;
   7488                             /* after this, secS has the start of continuation, and sCEs points before that */
   7489                             tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7490                             tCE+=2;  /* need to point to the first continuation CP */
   7491                             /* However, now you can just continue doing stuff */
   7492                         }
   7493                     } else {
   7494                         secT = *(tCE++);
   7495                         if(!isContinuation(secT)) { /* This means we have finished with this cont */
   7496                             tCE = tCESave;          /* reset the pointer to before continuation */
   7497                             tCESave = NULL;
   7498                             secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
   7499                             continue;
   7500                         }
   7501                     }
   7502                     secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7503                 }
   7504 
   7505                 if(secS == secT) {
   7506                     if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
   7507                         break;
   7508                     } else {
   7509                         secS = 0; secT = 0;
   7510                         continue;
   7511                     }
   7512                 } else {
   7513                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7514                     goto commonReturn;
   7515                 }
   7516             }
   7517         }
   7518     }
   7519 
   7520     /* doing the case bit */
   7521     if(checkCase) {
   7522         sCE = sCEs.buf;
   7523         tCE = tCEs.buf;
   7524         for(;;) {
   7525             while((secS & UCOL_REMOVE_CASE) == 0) {
   7526                 if(!isContinuation(*sCE++)) {
   7527                     secS =*(sCE-1);
   7528                     if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7529                         // primary ignorables should not be considered on the case level when the strength is primary
   7530                         // otherwise, the CEs stop being well-formed
   7531                         secS &= UCOL_TERT_CASE_MASK;
   7532                         secS ^= caseSwitch;
   7533                     } else {
   7534                         secS = 0;
   7535                     }
   7536                 } else {
   7537                     secS = 0;
   7538                 }
   7539             }
   7540 
   7541             while((secT & UCOL_REMOVE_CASE) == 0) {
   7542                 if(!isContinuation(*tCE++)) {
   7543                     secT = *(tCE-1);
   7544                     if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7545                         // primary ignorables should not be considered on the case level when the strength is primary
   7546                         // otherwise, the CEs stop being well-formed
   7547                         secT &= UCOL_TERT_CASE_MASK;
   7548                         secT ^= caseSwitch;
   7549                     } else {
   7550                         secT = 0;
   7551                     }
   7552                 } else {
   7553                     secT = 0;
   7554                 }
   7555             }
   7556 
   7557             if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
   7558                 result = UCOL_LESS;
   7559                 goto commonReturn;
   7560             } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
   7561                 result = UCOL_GREATER;
   7562                 goto commonReturn;
   7563             }
   7564 
   7565             if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
   7566                 break;
   7567             } else {
   7568                 secS = 0;
   7569                 secT = 0;
   7570             }
   7571         }
   7572     }
   7573 
   7574     /* Tertiary level */
   7575     if(checkTertiary) {
   7576         secS = 0;
   7577         secT = 0;
   7578         sCE = sCEs.buf;
   7579         tCE = tCEs.buf;
   7580         for(;;) {
   7581             while((secS & UCOL_REMOVE_CASE) == 0) {
   7582                 secS = *(sCE++) & tertiaryMask;
   7583                 if(!isContinuation(secS)) {
   7584                     secS ^= caseSwitch;
   7585                 } else {
   7586                     secS &= UCOL_REMOVE_CASE;
   7587                 }
   7588             }
   7589 
   7590             while((secT & UCOL_REMOVE_CASE)  == 0) {
   7591                 secT = *(tCE++) & tertiaryMask;
   7592                 if(!isContinuation(secT)) {
   7593                     secT ^= caseSwitch;
   7594                 } else {
   7595                     secT &= UCOL_REMOVE_CASE;
   7596                 }
   7597             }
   7598 
   7599             if(secS == secT) {
   7600                 if((secS & UCOL_REMOVE_CASE) == 1) {
   7601                     break;
   7602                 } else {
   7603                     secS = 0; secT = 0;
   7604                     continue;
   7605                 }
   7606             } else {
   7607                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7608                 goto commonReturn;
   7609             }
   7610         }
   7611     }
   7612 
   7613 
   7614     if(qShifted /*checkQuad*/) {
   7615         UBool sInShifted = TRUE;
   7616         UBool tInShifted = TRUE;
   7617         secS = 0;
   7618         secT = 0;
   7619         sCE = sCEs.buf;
   7620         tCE = tCEs.buf;
   7621         for(;;) {
   7622             while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
   7623                 secS = *(sCE++);
   7624                 if(isContinuation(secS)) {
   7625                     if(!sInShifted) {
   7626                         continue;
   7627                     }
   7628                 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
   7629                     secS = UCOL_PRIMARYMASK;
   7630                     sInShifted = FALSE;
   7631                 } else {
   7632                     sInShifted = TRUE;
   7633                 }
   7634             }
   7635             secS &= UCOL_PRIMARYMASK;
   7636 
   7637 
   7638             while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
   7639                 secT = *(tCE++);
   7640                 if(isContinuation(secT)) {
   7641                     if(!tInShifted) {
   7642                         continue;
   7643                     }
   7644                 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
   7645                     secT = UCOL_PRIMARYMASK;
   7646                     tInShifted = FALSE;
   7647                 } else {
   7648                     tInShifted = TRUE;
   7649                 }
   7650             }
   7651             secT &= UCOL_PRIMARYMASK;
   7652 
   7653             if(secS == secT) {
   7654                 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
   7655                     break;
   7656                 } else {
   7657                     secS = 0; secT = 0;
   7658                     continue;
   7659                 }
   7660             } else {
   7661                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7662                 goto commonReturn;
   7663             }
   7664         }
   7665     } else if(doHiragana && hirResult != UCOL_EQUAL) {
   7666         // If we're fine on quaternaries, we might be different
   7667         // on Hiragana. This, however, might fail us in shifted.
   7668         result = hirResult;
   7669         goto commonReturn;
   7670     }
   7671 
   7672     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
   7673     /*  as a tiebreaker if all else is equal.                                */
   7674     /*  Getting here  should be quite rare - strings are not identical -     */
   7675     /*     that is checked first, but compared == through all other checks.  */
   7676     if(checkIdent)
   7677     {
   7678         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
   7679         result = ucol_checkIdent(sColl, tColl, TRUE, status);
   7680     }
   7681 
   7682 commonReturn:
   7683     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
   7684         if (sCEs.buf != sCEs.localArray ) {
   7685             uprv_free(sCEs.buf);
   7686         }
   7687         if (tCEs.buf != tCEs.localArray ) {
   7688             uprv_free(tCEs.buf);
   7689         }
   7690     }
   7691 
   7692     return result;
   7693 }
   7694 
   7695 // Convenience overload: wraps both strings in collIterate contexts and hands
   7696 // them to the iterator-based ucol_strcollRegular. Returns UCOL_LESS on failure.
   7697 static UCollationResult
   7698 ucol_strcollRegular(const UCollator *coll,
   7699                     const UChar *source, int32_t sourceLength,
   7700                     const UChar *target, int32_t targetLength,
   7701                     UErrorCode *status)
   7702 {
   7703     collIterate srcIter, tgtIter;
   7704     IInit_collIterate(coll, source, sourceLength, &srcIter, status);
   7705     IInit_collIterate(coll, target, targetLength, &tgtIter, status);
   7706     // The status is inspected only after both contexts have been set up.
   7707     return U_FAILURE(*status) ? UCOL_LESS : ucol_strcollRegular(&srcIter, &tgtIter, status);
   7708 }
   7709 
   7710 /**
   7711  * Latin-1 fast path: tries to complete the contraction that CE stands for
   7712  * with the code unit at s[*index] (len == -1 means s is zero-terminated).
   7713  * On a match, *index is advanced and the matching coll->latinOneCEs entry
   7714  * for the given strength is returned. Without a match the entry at the
   7715  * contraction's base slot is returned, after stepping over code units whose
   7716  * coll->mapping value is 0. An unmatched code unit with any of bits 8..15
   7717  * set yields UCOL_BAIL_OUT_CE.
   7718  */
   7719 static inline uint32_t
   7720 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
   7721                           uint32_t CE, const UChar *s, int32_t *index, int32_t len)
   7722 {
   7723     // Candidate code units, located through the low 12 bits of CE.
   7724     const UChar *candidates = (UChar *)coll->image+getContractOffset(CE&0xFFF);
   7725     // Bits 12..23 of CE hold the contraction's base slot in latinOneCEs.
   7726     const int32_t baseSlot = (CE & 0x00FFF000) >> 12;
   7727     const uint32_t *ceRow = &coll->latinOneCEs[strength*coll->latinOneTableLen+baseSlot];
   7728     int32_t pos = 1; // scan position; never reset, even after skipping an ignorable
   7729 
   7730     for(;;) {
   7731         // Out of input (terminator or explicit length reached): no contraction.
   7732         if((len == -1) ? (s[*index] == 0) : (*index == len)) {
   7733             return ceRow[0];
   7734         }
   7735         const UChar next = s[*index];
   7736 
   7737         // The candidates should be ordered, so skip all that are smaller.
   7738         // NOTE(review): presumably a sentinel ends the list - confirm with the builder.
   7739         UChar candidate = candidates[pos];
   7740         while(next > candidate) {
   7741             candidate = candidates[++pos];
   7742         }
   7743 
   7744         if(next == candidate) {
   7745             (*index)++; // consume the matched code unit
   7746             return ceRow[pos];
   7747         }
   7748         if(next & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
   7749             return UCOL_BAIL_OUT_CE;
   7750         }
   7751         // A zero mapping value marks a completely ignorable: step over it and retry.
   7752         if(UTRIE_GET32_FROM_LEAD(&coll->mapping, next) != 0) {
   7753             return ceRow[0];
   7754         }
   7755         (*index)++;
   7756     }
   7757 }
   7758 
   7759 
   7760 /**
   7761  * This is a fast strcoll, geared towards text in Latin-1.
   7762  * It supports contractions of size two, French secondaries
   7763  * and case switching. You can use it with strengths primary
   7764  * to tertiary. It does not support shifted and case level.
   7765  * It relies on the table built by setupLatin1Table. If it
   7766  * doesn't understand something, it will go to the regular
   7767  * strcoll.
   7768  */
   7769 static UCollationResult
   7770 ucol_strcollUseLatin1( const UCollator    *coll,
   7771               const UChar        *source,
   7772               int32_t            sLen,
   7773               const UChar        *target,
   7774               int32_t            tLen,
   7775               UErrorCode *status)
   7776 {
   7777     U_ALIGN_CODE(16);
   7778     int32_t strength = coll->strength;
   7779 
   7780     int32_t sIndex = 0, tIndex = 0;
   7781     UChar sChar = 0, tChar = 0;
   7782     uint32_t sOrder=0, tOrder=0;
   7783 
   7784     UBool endOfSource = FALSE;
   7785 
   7786     uint32_t *elements = coll->latinOneCEs;
   7787 
   7788     UBool haveContractions = FALSE; // if we have contractions in our string
   7789                                     // we cannot do French secondary
   7790 
   7791     // Do the primary level
   7792     for(;;) {
   7793         while(sOrder==0) { // this loop skips primary ignorables
   7794             // sOrder=getNextlatinOneCE(source);
   7795             if(sLen==-1) {   // handling zero terminated strings
   7796                 sChar=source[sIndex++];
   7797                 if(sChar==0) {
   7798                     endOfSource = TRUE;
   7799                     break;
   7800                 }
   7801             } else {        // handling strings with known length
   7802                 if(sIndex==sLen) {
   7803                     endOfSource = TRUE;
   7804                     break;
   7805                 }
   7806                 sChar=source[sIndex++];
   7807             }
   7808             if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   7809                 //fprintf(stderr, "R");
   7810                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7811             }
   7812             sOrder = elements[sChar];
   7813             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
   7814                 // specials can basically be either contractions or bail-out signs. If we get anything
   7815                 // else, we'll bail out anywasy
   7816                 if(getCETag(sOrder) == CONTRACTION_TAG) {
   7817                     sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
   7818                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
   7819                     // However, if there are contractions in the table, but we always use just one char,
   7820                     // we might be able to do French. This should be checked out.
   7821                 }
   7822                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   7823                     //fprintf(stderr, "S");
   7824                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7825                 }
   7826             }
   7827         }
   7828 
   7829         while(tOrder==0) {  // this loop skips primary ignorables
   7830             // tOrder=getNextlatinOneCE(target);
   7831             if(tLen==-1) {    // handling zero terminated strings
   7832                 tChar=target[tIndex++];
   7833                 if(tChar==0) {
   7834                     if(endOfSource) { // this is different than source loop,
   7835                         // as we already know that source loop is done here,
   7836                         // so we can either finish the primary loop if both
   7837                         // strings are done or anounce the result if only
   7838                         // target is done. Same below.
   7839                         goto endOfPrimLoop;
   7840                     } else {
   7841                         return UCOL_GREATER;
   7842                     }
   7843                 }
   7844             } else {          // handling strings with known length
   7845                 if(tIndex==tLen) {
   7846                     if(endOfSource) {
   7847                         goto endOfPrimLoop;
   7848                     } else {
   7849                         return UCOL_GREATER;
   7850                     }
   7851                 }
   7852                 tChar=target[tIndex++];
   7853             }
   7854             if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   7855                 //fprintf(stderr, "R");
   7856                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7857             }
   7858             tOrder = elements[tChar];
   7859             if(tOrder >= UCOL_NOT_FOUND) {
   7860                 // Handling specials, see the comments for source
   7861                 if(getCETag(tOrder) == CONTRACTION_TAG) {
   7862                     tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
   7863                     haveContractions = TRUE;
   7864                 }
   7865                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   7866                     //fprintf(stderr, "S");
   7867                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7868                 }
   7869             }
   7870         }
   7871         if(endOfSource) { // source is finished, but target is not, say the result.
   7872             return UCOL_LESS;
   7873         }
   7874 
   7875         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
   7876             sOrder = 0; tOrder = 0;
   7877             continue;
   7878         } else {
   7879             // compare current top bytes
   7880             if(((sOrder^tOrder)&0xFF000000)!=0) {
   7881                 // top bytes differ, return difference
   7882                 if(sOrder < tOrder) {
   7883                     return UCOL_LESS;
   7884                 } else if(sOrder > tOrder) {
   7885                     return UCOL_GREATER;
   7886                 }
   7887                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
   7888                 // since we must return enum value
   7889             }
   7890 
   7891             // top bytes match, continue with following bytes
   7892             sOrder<<=8;
   7893             tOrder<<=8;
   7894         }
   7895     }
   7896 
   7897 endOfPrimLoop:
   7898     // after primary loop, we definitely know the sizes of strings,
   7899     // so we set it and use simpler loop for secondaries and tertiaries
   7900     sLen = sIndex; tLen = tIndex;
   7901     if(strength >= UCOL_SECONDARY) {
   7902         // adjust the table beggining
   7903         elements += coll->latinOneTableLen;
   7904         endOfSource = FALSE;
   7905 
   7906         if(coll->frenchCollation == UCOL_OFF) { // non French
   7907             // This loop is a simplified copy of primary loop
   7908             // at this point we know that whole strings are latin-1, so we don't
   7909             // check for that. We also know that we only have contractions as
   7910             // specials.
   7911             sIndex = 0; tIndex = 0;
   7912             for(;;) {
   7913                 while(sOrder==0) {
   7914                     if(sIndex==sLen) {
   7915                         endOfSource = TRUE;
   7916                         break;
   7917                     }
   7918                     sChar=source[sIndex++];
   7919                     sOrder = elements[sChar];
   7920                     if(sOrder > UCOL_NOT_FOUND) {
   7921                         sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
   7922                     }
   7923                 }
   7924 
   7925                 while(tOrder==0) {
   7926                     if(tIndex==tLen) {
   7927                         if(endOfSource) {
   7928                             goto endOfSecLoop;
   7929                         } else {
   7930                             return UCOL_GREATER;
   7931                         }
   7932                     }
   7933                     tChar=target[tIndex++];
   7934                     tOrder = elements[tChar];
   7935                     if(tOrder > UCOL_NOT_FOUND) {
   7936                         tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
   7937                     }
   7938                 }
   7939                 if(endOfSource) {
   7940                     return UCOL_LESS;
   7941                 }
   7942 
   7943                 if(sOrder == tOrder) {
   7944                     sOrder = 0; tOrder = 0;
   7945                     continue;
   7946                 } else {
   7947                     // see primary loop for comments on this
   7948                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   7949                         if(sOrder < tOrder) {
   7950                             return UCOL_LESS;
   7951                         } else if(sOrder > tOrder) {
   7952                             return UCOL_GREATER;
   7953                         }
   7954                     }
   7955                     sOrder<<=8;
   7956                     tOrder<<=8;
   7957                 }
   7958             }
   7959         } else { // French
   7960             if(haveContractions) { // if we have contractions, we have to bail out
   7961                 // since we don't really know how to handle them here
   7962                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7963             }
   7964             // For French, we go backwards
   7965             sIndex = sLen; tIndex = tLen;
   7966             for(;;) {
   7967                 while(sOrder==0) {
   7968                     if(sIndex==0) {
   7969                         endOfSource = TRUE;
   7970                         break;
   7971                     }
   7972                     sChar=source[--sIndex];
   7973                     sOrder = elements[sChar];
   7974                     // don't even look for contractions
   7975                 }
   7976 
   7977                 while(tOrder==0) {
   7978                     if(tIndex==0) {
   7979                         if(endOfSource) {
   7980                             goto endOfSecLoop;
   7981                         } else {
   7982                             return UCOL_GREATER;
   7983                         }
   7984                     }
   7985                     tChar=target[--tIndex];
   7986                     tOrder = elements[tChar];
   7987                     // don't even look for contractions
   7988                 }
   7989                 if(endOfSource) {
   7990                     return UCOL_LESS;
   7991                 }
   7992 
   7993                 if(sOrder == tOrder) {
   7994                     sOrder = 0; tOrder = 0;
   7995                     continue;
   7996                 } else {
   7997                     // see the primary loop for comments
   7998                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   7999                         if(sOrder < tOrder) {
   8000                             return UCOL_LESS;
   8001                         } else if(sOrder > tOrder) {
   8002                             return UCOL_GREATER;
   8003                         }
   8004                     }
   8005                     sOrder<<=8;
   8006                     tOrder<<=8;
   8007                 }
   8008             }
   8009         }
   8010     }
   8011 
   8012 endOfSecLoop:
   8013     if(strength >= UCOL_TERTIARY) {
   8014         // tertiary loop is the same as secondary (except no French)
   8015         elements += coll->latinOneTableLen;
   8016         sIndex = 0; tIndex = 0;
   8017         endOfSource = FALSE;
   8018         for(;;) {
   8019             while(sOrder==0) {
   8020                 if(sIndex==sLen) {
   8021                     endOfSource = TRUE;
   8022                     break;
   8023                 }
   8024                 sChar=source[sIndex++];
   8025                 sOrder = elements[sChar];
   8026                 if(sOrder > UCOL_NOT_FOUND) {
   8027                     sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
   8028                 }
   8029             }
   8030             while(tOrder==0) {
   8031                 if(tIndex==tLen) {
   8032                     if(endOfSource) {
   8033                         return UCOL_EQUAL; // if both strings are at the end, they are equal
   8034                     } else {
   8035                         return UCOL_GREATER;
   8036                     }
   8037                 }
   8038                 tChar=target[tIndex++];
   8039                 tOrder = elements[tChar];
   8040                 if(tOrder > UCOL_NOT_FOUND) {
   8041                     tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
   8042                 }
   8043             }
   8044             if(endOfSource) {
   8045                 return UCOL_LESS;
   8046             }
   8047             if(sOrder == tOrder) {
   8048                 sOrder = 0; tOrder = 0;
   8049                 continue;
   8050             } else {
   8051                 if(((sOrder^tOrder)&0xff000000)!=0) {
   8052                     if(sOrder < tOrder) {
   8053                         return UCOL_LESS;
   8054                     } else if(sOrder > tOrder) {
   8055                         return UCOL_GREATER;
   8056                     }
   8057                 }
   8058                 sOrder<<=8;
   8059                 tOrder<<=8;
   8060             }
   8061         }
   8062     }
   8063     return UCOL_EQUAL;
   8064 }
   8065 
/*
  Support data for a slightly modified version of the U8_NEXT macro defined in
  utf8.h. U8_NEXT requires the length of the UTF-8 string. The variant below
  assumes that the UTF-8 string is NUL-terminated and does not take the length
  as input.

  Note: ucol_strcollUTF8 supports NUL-terminated input. Computing the length of
  a NUL-terminated input string up front would cost extra CPU cycles.
*/

/*
 * Smallest code point that may legally be encoded with a given number of trail
 * bytes; indexed by the trail byte count. Decoded values below the entry are
 * rejected as non-shortest forms.
 */
static const UChar32
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };

/* Replacement values returned for malformed input when the caller asks for error values. */
#define UTF8_ERROR_VALUE_1 0x15
#define UTF8_ERROR_VALUE_2 0x9f
#define UTF_ERROR_VALUE 0xffff

/*
 * Error value table; the decoder indexes it with a trail byte count
 * (0..5) so that the returned value reflects the length of the bad sequence.
 */
static const UChar32
utf8_errorValue[6]={
    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
    0x3ffffff, 0x7fffffff
};
   8086 
/*
 * Out-of-line decoder used by U8_NEXT_NULLTERM for multi-byte sequences that
 * are not handled inline, and for malformed input.
 *
 * s       NUL-terminated UTF-8 bytes. A NUL byte met while reading trail bytes
 *         ends the sequence and marks it illegal; the NUL itself is not consumed.
 * pi      in/out byte index. On entry it is the index of the byte following the
 *         lead byte; on return it is the index after the consumed sequence.
 * c       the lead byte that was already read by the caller.
 * strict  selects the error behavior:
 *           <0   malformed input returns U_SENTINEL
 *           >=0  malformed input returns an entry of utf8_errorValue
 *           -2   additionally accepts encoded surrogate code points
 *           >0   additionally replaces Unicode noncharacters with an error value
 *         NOTE(review): these comparisons assume UBool is a signed type - confirm.
 *
 * Returns the decoded code point, or the error indication described above.
 */
static
UChar32 utf8_nextCharSafeBodyNullTerm(const uint8_t *s, int32_t *pi, UChar32 c, UBool strict) {
    int32_t i=*pi;
    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
    U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */

    if (c) {
        uint8_t trail, illegal=0;

        U8_MASK_LEAD_BYTE((c), count);
        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
        switch(count) {
        /*
         * Cases 3, 2 and 1 deliberately fall through so that one trail byte is
         * consumed per case; cases 5/4 and every error path leave via break.
         */
        case 5:
        case 4:
            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
            illegal=1;
            break;
        case 3:
            trail=s[(i)];
            if (trail==0) {
                /* hit the terminator in the middle of the sequence */
                illegal=1;
                break;
            }
            (c)=((c)<<6)|(trail&0x3f);
            if(c<0x110) {
                /* non-zero unless the top two bits of the trail byte are 10 */
                illegal|=(trail&0xc0)^0x80;
            } else {
                /* code point>0x10ffff, outside Unicode */
                illegal=1;
                break;
            }
            ++(i);
            /* fall through */
        case 2:
            trail=s[(i)];
            if (trail==0) {
                illegal=1;
                break;
            }
            (c)=((c)<<6)|(trail&0x3f);
            illegal|=(trail&0xc0)^0x80;
            ++(i);
            /* fall through */
        case 1:
            trail=s[(i)];
            if (trail==0) {
                illegal=1;
                break;
            }
            (c)=((c)<<6)|(trail&0x3f);
            illegal|=(trail&0xc0)^0x80;
            ++(i);
            break;
        case 0:
            /* not a lead byte at all: report the error without consuming anything more */
            if(strict>=0) {
                return UTF8_ERROR_VALUE_1;
            } else {
                return U_SENTINEL;
            }
        /* no default branch to optimize switch()  - all values are covered */
        }

        /*
         * All the error handling should return a value
         * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
         *
         * Starting with Unicode 3.0.1, non-shortest forms are illegal.
         * Starting with Unicode 3.2, surrogate code points must not be
         * encoded in UTF-8, and there are no irregular sequences any more.
         *
         * U8_ macros (new in ICU 2.4) return negative values for error conditions.
         */

        /* correct sequence - all trail bytes have (b7..b6)==(10)? */
        /* illegal is also set if count>=4 */
        if(illegal || (c)<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2)) {
            /* error handling */
            uint8_t errorCount=count;
            /* don't go beyond this sequence: rescan from the byte after the lead byte */
            i=*pi;
            while(count>0 && U8_IS_TRAIL(s[i])) {
                ++(i);
                --count;
            }
            if(strict>=0) {
                /* errorCount-count is the number of trail bytes that were skipped */
                c=utf8_errorValue[errorCount-count];
            } else {
                c=U_SENTINEL;
            }
        } else if((strict)>0 && U_IS_UNICODE_NONCHAR(c)) {
            /* strict: forbid non-characters like U+fffe */
            c=utf8_errorValue[count];
        }
    }
    *pi=i;
    return c;
}
   8183 
   8184 #define U8_NEXT_NULLTERM(s, i, c) { \
   8185     (c)=(uint8_t)(s)[(i)]; \
   8186     if((c)>=0x80) { \
   8187         uint8_t __t1, __t2; \
   8188         if( /* handle U+1000..U+CFFF inline */ \
   8189             (0xe0<(c) && (c)<=0xec) && \
   8190             (__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 && \
   8191             (__t2=(uint8_t)((s)[(i)+2]-0x80))<= 0x3f && __t2 != 0 \
   8192         ) { \
   8193             /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
   8194             (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
   8195             (i)+=3; \
   8196         } else if( /* handle U+0080..U+07FF inline */ \
   8197             ((c)<0xe0 && (c)>=0xc2) && \
   8198             (__t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && __t1 != 0 \
   8199         ) { \
   8200             (c)=(UChar)((((c)&0x1f)<<6)|__t1); \
   8201             (i)+=2; \
   8202         } else if(U8_IS_LEAD(c)) { \
   8203             /* function call for "complicated" and error cases */ \
   8204             ++(i); \
   8205             (c)=utf8_nextCharSafeBodyNullTerm((const uint8_t *)s, &(i), c, -1); \
   8206         } else { \
   8207             (c)=U_SENTINEL; \
   8208             ++(i); \
   8209         } \
   8210     } else { \
   8211         if ((c)) { \
   8212             ++(i); \
   8213         } \
   8214     } \
   8215 }
   8216 
/*
 * U8_GET_NULLTERM(s, start, i, c): read the code point at byte index i of the
 * NUL-terminated UTF-8 string s into c without modifying i.
 * The index is copied into a local, passed through U8_SET_CP_START together
 * with start, and then decoded with U8_NEXT_NULLTERM, so c follows that macro's
 * conventions (0 at the terminator, U_SENTINEL for malformed input).
 */
#define U8_GET_NULLTERM(s, start, i, c) { \
    int32_t _u8_get_index=(int32_t)(i); \
    U8_SET_CP_START(s, start, _u8_get_index); \
    U8_NEXT_NULLTERM(s, _u8_get_index, c); \
}
   8222 
   8223 
   8224 static UCollationResult
   8225 ucol_strcollRegularUTF8(
   8226                     const UCollator *coll,
   8227                     const char      *source,
   8228                     int32_t         sourceLength,
   8229                     const char      *target,
   8230                     int32_t         targetLength,
   8231                     UErrorCode      *status)
   8232 {
   8233     UCharIterator src;
   8234     UCharIterator tgt;
   8235 
   8236     uiter_setUTF8(&src, source, sourceLength);
   8237     uiter_setUTF8(&tgt, target, targetLength);
   8238 
   8239     // Preparing the context objects for iterating over strings
   8240     collIterate sColl, tColl;
   8241     IInit_collIterate(coll, NULL, -1, &sColl, status);
   8242     IInit_collIterate(coll, NULL, -1, &tColl, status);
   8243     if(U_FAILURE(*status)) {
   8244         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8245         return UCOL_EQUAL;
   8246     }
   8247     // The division for the array length may truncate the array size to
   8248     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   8249     // for all platforms anyway.
   8250     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8251     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8252     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
   8253 
   8254     sColl.iterator = &src;
   8255     sColl.flags |= UCOL_USE_ITERATOR;
   8256     tColl.flags |= UCOL_USE_ITERATOR;
   8257     tColl.iterator = &tgt;
   8258 
   8259     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
   8260         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   8261         sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
   8262         sColl.flags &= ~UCOL_ITER_NORM;
   8263 
   8264         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   8265         tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
   8266         tColl.flags &= ~UCOL_ITER_NORM;
   8267     }
   8268 
   8269     return ucol_strcollRegular(&sColl, &tColl, status);
   8270 }
   8271 
   8272 static inline uint32_t
   8273 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
   8274                           uint32_t CE, const char *s, int32_t *index, int32_t len)
   8275 {
   8276     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
   8277     int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
   8278     int32_t offset = 1;
   8279     UChar32 schar = 0, tchar = 0;
   8280 
   8281     for(;;) {
   8282         if (len == -1) {
   8283             U8_GET_NULLTERM((const uint8_t*)s, 0, *index, schar);
   8284             if (schar == 0) {
   8285                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   8286             }
   8287         } else {
   8288             if (*index == len) {
   8289                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   8290             }
   8291             U8_GET((const uint8_t*)s, 0, *index, len, schar);
   8292         }
   8293         if (schar == -1) {
   8294             schar = 0xfffd;
   8295         }
   8296 
   8297         while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   8298             offset++;
   8299         }
   8300 
   8301         if (schar == tchar) {
   8302             U8_FWD_1(s, *index, len);
   8303             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
   8304         }
   8305         else
   8306         {
   8307             if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
   8308                 return UCOL_BAIL_OUT_CE;
   8309             }
   8310             // skip completely ignorables
   8311             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   8312             if(isZeroCE == 0) { // we have to ignore completely ignorables
   8313                 U8_FWD_1(s, *index, len);
   8314                 continue;
   8315             }
   8316 
   8317             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   8318         }
   8319     }
   8320 }
   8321 
/*
 * Latin-1 fast path for comparing two UTF-8 strings.
 *
 * The strings are compared level by level (primary, then secondary, then
 * tertiary, as far as coll->strength requires) using the precomputed
 * coll->latinOneCEs table, which holds one sub-table of latinOneTableLen
 * entries per level. Within a level the CEs are compared one byte at a time,
 * starting with the top byte.
 *
 * The function gives up and delegates to ucol_strcollRegularUTF8() when
 *  - either string contains a character above U+00FF (malformed UTF-8 is
 *    mapped to U+FFFD and therefore also takes this route),
 *  - a table entry is a special that is not a contraction, or a contraction
 *    lookup asks to bail out,
 *  - French secondary ordering is on and a contraction was seen.
 *
 * coll            collator with a populated latinOneCEs table
 * source / sLen   first string; sLen is its byte length or -1 if NUL-terminated
 * target / tLen   second string; tLen is its byte length or -1 if NUL-terminated
 * status          in/out error code, only passed on to the regular algorithm
 *
 * Returns UCOL_LESS, UCOL_EQUAL or UCOL_GREATER.
 */
static inline UCollationResult
ucol_strcollUseLatin1UTF8(
                const UCollator *coll,
                const char      *source,
                int32_t         sLen,
                const char      *target,
                int32_t         tLen,
                UErrorCode      *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar32 sChar = 0, tChar = 0;
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    // starts at the primary sub-table; advanced by latinOneTableLen per level
    uint32_t *elements = coll->latinOneCEs;

    UBool haveContractions = FALSE; // if we have contractions in our string
                                    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if (sLen==-1) {
                // NUL-terminated input: the terminator is reported as 0 and not consumed
                U8_NEXT_NULLTERM(source, sIndex, sChar);
                if (sChar == 0) {
                    endOfSource = TRUE;
                    sLen = sIndex; // the length is known from here on
                    break;
                }
            } else {
                if (sIndex == sLen) {
                    endOfSource = TRUE;
                    break;
                }
                U8_NEXT(source, sIndex, sLen ,sChar);
            }
            if (sChar == -1) {
                sChar = 0xfffd; // fallback for the bad code
            }
            if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if (tLen == -1) {
                U8_NEXT_NULLTERM(target, tIndex, tChar);
                if (tChar == 0) {
                    if(endOfSource) {
                        // both strings ended without a primary difference
                        tLen = tIndex;
                        goto endOfPrimLoopU8;
                    } else {
                        // target ended first
                        return UCOL_GREATER;
                    }
                }
            } else {
                if (tIndex == tLen) {
                    if(endOfSource) {
                        goto endOfPrimLoopU8;
                    } else {
                        return UCOL_GREATER;
                    }
                }
                U8_NEXT(target, tIndex, tLen, tChar);
            }
            if (tChar == -1) {
                tChar = 0xfffd;
            }
            if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoopU8:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning: move on to the secondary sub-table
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    U_ASSERT(sLen >= 0);
                    U8_NEXT(source, sIndex, sLen, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            goto endOfSecLoopU8;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    U_ASSERT(tLen >= 0);
                    U8_NEXT(target, tIndex, tLen, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards: both strings are read from their ends
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    U8_PREV(source, 0, sIndex, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoopU8;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    U8_PREV(target, 0, tIndex, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoopU8:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        elements += coll->latinOneTableLen; // move on to the tertiary sub-table
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                U_ASSERT(sLen >= 0);
                U8_NEXT(source, sIndex, sLen, sChar);
                U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                U_ASSERT(tLen >= 0);
                U8_NEXT(target, tIndex, tLen, tChar);
                U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                // same byte-wise comparison as in the primary loop
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    return UCOL_EQUAL;
}
   8633 
   8634 U_CAPI UCollationResult U_EXPORT2
   8635 ucol_strcollIter( const UCollator    *coll,
   8636                  UCharIterator *sIter,
   8637                  UCharIterator *tIter,
   8638                  UErrorCode         *status)
   8639 {
   8640     if(!status || U_FAILURE(*status)) {
   8641         return UCOL_EQUAL;
   8642     }
   8643 
   8644     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
   8645     UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
   8646 
   8647     if (sIter == tIter) {
   8648         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8649         return UCOL_EQUAL;
   8650     }
   8651     if(sIter == NULL || tIter == NULL || coll == NULL) {
   8652         *status = U_ILLEGAL_ARGUMENT_ERROR;
   8653         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8654         return UCOL_EQUAL;
   8655     }
   8656 
   8657     UCollationResult result = UCOL_EQUAL;
   8658 
   8659     // Preparing the context objects for iterating over strings
   8660     collIterate sColl, tColl;
   8661     IInit_collIterate(coll, NULL, -1, &sColl, status);
   8662     IInit_collIterate(coll, NULL, -1, &tColl, status);
   8663     if(U_FAILURE(*status)) {
   8664         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8665         return UCOL_EQUAL;
   8666     }
   8667     // The division for the array length may truncate the array size to
   8668     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   8669     // for all platforms anyway.
   8670     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8671     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8672     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
   8673 
   8674     sColl.iterator = sIter;
   8675     sColl.flags |= UCOL_USE_ITERATOR;
   8676     tColl.flags |= UCOL_USE_ITERATOR;
   8677     tColl.iterator = tIter;
   8678 
   8679     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
   8680         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   8681         sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
   8682         sColl.flags &= ~UCOL_ITER_NORM;
   8683 
   8684         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   8685         tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
   8686         tColl.flags &= ~UCOL_ITER_NORM;
   8687     }
   8688 
   8689     UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
   8690 
   8691     while((sChar = sColl.iterator->next(sColl.iterator)) ==
   8692         (tChar = tColl.iterator->next(tColl.iterator))) {
   8693             if(sChar == U_SENTINEL) {
   8694                 result = UCOL_EQUAL;
   8695                 goto end_compare;
   8696             }
   8697     }
   8698 
   8699     if(sChar == U_SENTINEL) {
   8700         tChar = tColl.iterator->previous(tColl.iterator);
   8701     }
   8702 
   8703     if(tChar == U_SENTINEL) {
   8704         sChar = sColl.iterator->previous(sColl.iterator);
   8705     }
   8706 
   8707     sChar = sColl.iterator->previous(sColl.iterator);
   8708     tChar = tColl.iterator->previous(tColl.iterator);
   8709 
   8710     if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
   8711     {
   8712         // We are stopped in the middle of a contraction.
   8713         // Scan backwards through the == part of the string looking for the start of the contraction.
   8714         //   It doesn't matter which string we scan, since they are the same in this region.
   8715         do
   8716         {
   8717             sChar = sColl.iterator->previous(sColl.iterator);
   8718             tChar = tColl.iterator->previous(tColl.iterator);
   8719         }
   8720         while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
   8721     }
   8722 
   8723 
   8724     if(U_SUCCESS(*status)) {
   8725         result = ucol_strcollRegular(&sColl, &tColl, status);
   8726     }
   8727 
   8728 end_compare:
   8729     if(sNormIter || tNormIter) {
   8730         unorm_closeIter(sNormIter);
   8731         unorm_closeIter(tNormIter);
   8732     }
   8733 
   8734     UTRACE_EXIT_VALUE_STATUS(result, *status)
   8735     return result;
   8736 }
   8737 
   8738 
   8739 /*                                                                      */
   8740 /* ucol_strcoll     Main public API string comparison function          */
   8741 /*                                                                      */
   8742 U_CAPI UCollationResult U_EXPORT2
   8743 ucol_strcoll( const UCollator    *coll,
   8744               const UChar        *source,
   8745               int32_t            sourceLength,
   8746               const UChar        *target,
   8747               int32_t            targetLength)
   8748 {
   8749     U_ALIGN_CODE(16);
   8750 
   8751     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
   8752     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   8753         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
   8754         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
   8755         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
   8756     }
   8757 
   8758     if(source == NULL || target == NULL) {
   8759         // do not crash, but return. Should have
   8760         // status argument to return error.
   8761         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8762         return UCOL_EQUAL;
   8763     }
   8764 
   8765     /* Quick check if source and target are same strings. */
   8766     /* They should either both be NULL terminated or the explicit length should be set on both. */
   8767     if (source==target && sourceLength==targetLength) {
   8768         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8769         return UCOL_EQUAL;
   8770     }
   8771 
   8772     if(coll->delegate != NULL) {
   8773       UErrorCode status = U_ZERO_ERROR;
   8774       return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
   8775     }
   8776 
   8777     /* Scan the strings.  Find:                                                             */
   8778     /*    The length of any leading portion that is equal                                   */
   8779     /*    Whether they are exactly equal.  (in which case we just return)                   */
   8780     const UChar    *pSrc    = source;
   8781     const UChar    *pTarg   = target;
   8782     int32_t        equalLength;
   8783 
   8784     if (sourceLength == -1 && targetLength == -1) {
   8785         // Both strings are null terminated.
   8786         //    Scan through any leading equal portion.
   8787         while (*pSrc == *pTarg && *pSrc != 0) {
   8788             pSrc++;
   8789             pTarg++;
   8790         }
   8791         if (*pSrc == 0 && *pTarg == 0) {
   8792             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8793             return UCOL_EQUAL;
   8794         }
   8795         equalLength = (int32_t)(pSrc - source);
   8796     }
   8797     else
   8798     {
   8799         // One or both strings has an explicit length.
   8800         const UChar    *pSrcEnd = source + sourceLength;
   8801         const UChar    *pTargEnd = target + targetLength;
   8802 
   8803         // Scan while the strings are bitwise ==, or until one is exhausted.
   8804         for (;;) {
   8805             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
   8806                 break;
   8807             }
   8808             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
   8809                 break;
   8810             }
   8811             if (*pSrc != *pTarg) {
   8812                 break;
   8813             }
   8814             pSrc++;
   8815             pTarg++;
   8816         }
   8817         equalLength = (int32_t)(pSrc - source);
   8818 
   8819         // If we made it all the way through both strings, we are done.  They are ==
   8820         if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
   8821             (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
   8822         {
   8823             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8824             return UCOL_EQUAL;
   8825         }
   8826     }
   8827     if (equalLength > 0) {
   8828         /* There is an identical portion at the beginning of the two strings.        */
   8829         /*   If the identical portion ends within a contraction or a comibining      */
   8830         /*   character sequence, back up to the start of that sequence.              */
   8831 
   8832         // These values should already be set by the code above.
   8833         //pSrc  = source + equalLength;        /* point to the first differing chars   */
   8834         //pTarg = target + equalLength;
   8835         if ((pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
   8836             (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
   8837         {
   8838             // We are stopped in the middle of a contraction.
   8839             // Scan backwards through the == part of the string looking for the start of the contraction.
   8840             //   It doesn't matter which string we scan, since they are the same in this region.
   8841             do
   8842             {
   8843                 equalLength--;
   8844                 pSrc--;
   8845             }
   8846             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
   8847         }
   8848 
   8849         source += equalLength;
   8850         target += equalLength;
   8851         if (sourceLength > 0) {
   8852             sourceLength -= equalLength;
   8853         }
   8854         if (targetLength > 0) {
   8855             targetLength -= equalLength;
   8856         }
   8857     }
   8858 
   8859     UErrorCode status = U_ZERO_ERROR;
   8860     UCollationResult returnVal;
   8861     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
   8862         returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
   8863     } else {
   8864         returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
   8865     }
   8866     UTRACE_EXIT_VALUE(returnVal);
   8867     return returnVal;
   8868 }
   8869 
   8870 U_CAPI UCollationResult U_EXPORT2
   8871 ucol_strcollUTF8(
   8872         const UCollator *coll,
   8873         const char      *source,
   8874         int32_t         sourceLength,
   8875         const char      *target,
   8876         int32_t         targetLength,
   8877         UErrorCode      *status)
   8878 {
   8879     U_ALIGN_CODE(16);
   8880 
   8881     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
   8882     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   8883         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
   8884         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
   8885         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
   8886     }
   8887 
   8888     if (U_FAILURE(*status)) {
   8889         /* do nothing */
   8890         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
   8891         return UCOL_EQUAL;
   8892     }
   8893 
   8894     if(source == NULL || target == NULL) {
   8895         *status = U_ILLEGAL_ARGUMENT_ERROR;
   8896         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
   8897         return UCOL_EQUAL;
   8898     }
   8899 
   8900     /* Quick check if source and target are same strings. */
   8901     /* They should either both be NULL terminated or the explicit length should be set on both. */
   8902     if (source==target && sourceLength==targetLength) {
   8903         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
   8904         return UCOL_EQUAL;
   8905     }
   8906 
   8907     if(coll->delegate != NULL) {
   8908         return ((const Collator*)coll->delegate)->compareUTF8(
   8909             StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength),
   8910             StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength),
   8911             *status);
   8912     }
   8913 
   8914     /* Scan the strings.  Find:                                                             */
   8915     /*    The length of any leading portion that is equal                                   */
   8916     /*    Whether they are exactly equal.  (in which case we just return)                   */
   8917     const char  *pSrc = source;
   8918     const char  *pTarg = target;
   8919     UBool       bSrcLimit = FALSE;
   8920     UBool       bTargLimit = FALSE;
   8921 
   8922     if (sourceLength == -1 && targetLength == -1) {
   8923         // Both strings are null terminated.
   8924         //    Scan through any leading equal portion.
   8925         while (*pSrc == *pTarg && *pSrc != 0) {
   8926             pSrc++;
   8927             pTarg++;
   8928         }
   8929         if (*pSrc == 0 && *pTarg == 0) {
   8930             UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
   8931             return UCOL_EQUAL;
   8932         }
   8933         bSrcLimit = (*pSrc == 0);
   8934         bTargLimit = (*pTarg == 0);
   8935     }
   8936     else
   8937     {
   8938         // One or both strings has an explicit length.
   8939         const char *pSrcEnd = source + sourceLength;
   8940         const char *pTargEnd = target + targetLength;
   8941 
   8942         // Scan while the strings are bitwise ==, or until one is exhausted.
   8943         for (;;) {
   8944             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
   8945                 break;
   8946             }
   8947             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
   8948                 break;
   8949             }
   8950             if (*pSrc != *pTarg) {
   8951                 break;
   8952             }
   8953             pSrc++;
   8954             pTarg++;
   8955         }
   8956         bSrcLimit = (pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0));
   8957         bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));
   8958 
   8959         // If we made it all the way through both strings, we are done.  They are ==
   8960         if (bSrcLimit &&    /* At end of src string, however it was specified. */
   8961             bTargLimit)     /* and also at end of dest string                  */
   8962         {
   8963             UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
   8964             return UCOL_EQUAL;
   8965         }
   8966     }
   8967 
   8968     U_ASSERT(!(bSrcLimit && bTargLimit));
   8969 
   8970     int32_t    equalLength = pSrc - source;
   8971     UBool       bSawNonLatin1 = FALSE;
   8972 
   8973     if (equalLength > 0) {
   8974         // Align position to the start of UTF-8 code point.
   8975         if (bTargLimit) {
   8976             U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
   8977         } else {
   8978             U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
   8979         }
   8980         pSrc = source + equalLength;
   8981         pTarg = target + equalLength;
   8982     }
   8983 
   8984     if (equalLength > 0) {
   8985         /* There is an identical portion at the beginning of the two strings.        */
   8986         /*   If the identical portion ends within a contraction or a comibining      */
   8987         /*   character sequence, back up to the start of that sequence.              */
   8988         UBool bUnsafeCP = FALSE;
   8989         UChar32 uc32 = -1;
   8990 
   8991         if (!bSrcLimit) {
   8992             if (sourceLength >= 0) {
   8993                 U8_GET((uint8_t*)source, 0, equalLength, sourceLength, uc32);
   8994             } else {
   8995                 U8_GET_NULLTERM((uint8_t*)source, 0, equalLength, uc32);
   8996             }
   8997             if (uc32 == -1) {
   8998                 uc32 = 0xfffd;
   8999                 bSawNonLatin1 |= TRUE;
   9000             } else {
   9001                 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
   9002                     bUnsafeCP = TRUE;
   9003                 }
   9004                 bSawNonLatin1 |= (uc32 > 0xff);
   9005             }
   9006         }
   9007         if (!bTargLimit) {
   9008             if (targetLength >= 0) {
   9009                 U8_GET((uint8_t*)target, 0, equalLength, targetLength, uc32);
   9010             } else {
   9011                 U8_GET_NULLTERM((uint8_t*)target, 0, equalLength, uc32);
   9012             }
   9013             if (uc32 == -1) {
   9014                 uc32 = 0xfffd;
   9015                 bSawNonLatin1 |= TRUE;
   9016             } else {
   9017                 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
   9018                     bUnsafeCP = TRUE;
   9019                 }
   9020                 bSawNonLatin1 |= (uc32 > 0xff);
   9021             }
   9022         }
   9023 
   9024         if (bUnsafeCP) {
   9025             while (equalLength > 0) {
   9026                 // We are stopped in the middle of a contraction.
   9027                 // Scan backwards through the == part of the string looking for the start of the contraction.
   9028                 //   It doesn't matter which string we scan, since they are the same in this region.
   9029                 U8_PREV((uint8_t*)source, 0, equalLength, uc32);
   9030                 bSawNonLatin1 |= (uc32 > 0xff);
   9031                 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
   9032                     break;
   9033                 }
   9034             }
   9035         }
   9036         source += equalLength;
   9037         target += equalLength;
   9038         if (sourceLength > 0) {
   9039             sourceLength -= equalLength;
   9040         }
   9041         if (targetLength > 0) {
   9042             targetLength -= equalLength;
   9043         }
   9044     } else {
   9045         // Lead byte of Latin 1 character is 0x00 - 0xC3
   9046         bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3);
   9047         bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3);
   9048     }
   9049 
   9050     UCollationResult returnVal;
   9051 
   9052     if(!coll->latinOneUse || bSawNonLatin1) {
   9053         returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
   9054     } else {
   9055         returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status);
   9056     }
   9057     UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
   9058     return returnVal;
   9059 }
   9060 
   9061 
   9062 /* convenience function for comparing strings */
   9063 U_CAPI UBool U_EXPORT2
   9064 ucol_greater(    const    UCollator        *coll,
   9065         const    UChar            *source,
   9066         int32_t            sourceLength,
   9067         const    UChar            *target,
   9068         int32_t            targetLength)
   9069 {
   9070     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   9071         == UCOL_GREATER);
   9072 }
   9073 
   9074 /* convenience function for comparing strings */
   9075 U_CAPI UBool U_EXPORT2
   9076 ucol_greaterOrEqual(    const    UCollator    *coll,
   9077             const    UChar        *source,
   9078             int32_t        sourceLength,
   9079             const    UChar        *target,
   9080             int32_t        targetLength)
   9081 {
   9082     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   9083         != UCOL_LESS);
   9084 }
   9085 
   9086 /* convenience function for comparing strings */
   9087 U_CAPI UBool U_EXPORT2
   9088 ucol_equal(        const    UCollator        *coll,
   9089             const    UChar            *source,
   9090             int32_t            sourceLength,
   9091             const    UChar            *target,
   9092             int32_t            targetLength)
   9093 {
   9094     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   9095         == UCOL_EQUAL);
   9096 }
   9097 
   9098 U_CAPI void U_EXPORT2
   9099 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
   9100     if(coll && coll->UCA) {
   9101         uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
   9102     }
   9103 }
   9104 
   9105 #endif /* #if !UCONFIG_NO_COLLATION */
   9106