Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 1996-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ucol.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 * Modification history
     12 * Date        Name      Comments
     13 * 1996-1999   various members of ICU team maintained C API for collation framework
     14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
     15 * 03/01/2001  synwee    Added maxexpansion functionality.
     16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_COLLATION
     22 
     23 #include "unicode/coleitr.h"
     24 #include "unicode/unorm.h"
     25 #include "unicode/udata.h"
     26 #include "unicode/ustring.h"
     27 
     28 #include "ucol_imp.h"
     29 #include "bocsu.h"
     30 
     31 #include "normalizer2impl.h"
     32 #include "unorm_it.h"
     33 #include "umutex.h"
     34 #include "cmemory.h"
     35 #include "ucln_in.h"
     36 #include "cstring.h"
     37 #include "utracimp.h"
     38 #include "putilimp.h"
     39 #include "uassert.h"
     40 
     41 #ifdef UCOL_DEBUG
     42 #include <stdio.h>
     43 #endif
     44 
     45 U_NAMESPACE_USE
     46 
     47 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     48 
     49 #define LAST_BYTE_MASK_           0xFF
     50 #define SECOND_LAST_BYTE_SHIFT_   8
     51 
     52 #define ZERO_CC_LIMIT_            0xC0
     53 
     54 // this is static pointer to the normalizer fcdTrieIndex
     55 // it is always the same between calls to u_cleanup
     56 // and therefore writing to it is not synchronized.
     57 // It is cleaned in ucol_cleanup
     58 static const uint16_t *fcdTrieIndex=NULL;
     59 // Code points at fcdHighStart and above have a zero FCD value.
     60 static UChar32 fcdHighStart = 0;
     61 
// These are values from UCA required for
// implicit generation and suppressing sort key compression.
// They should normally come from the UCA, but if one
// is running without UCA, that could be a problem.
     66 static const int32_t maxRegularPrimary  = 0xA0;
     67 static const int32_t minImplicitPrimary = 0xE0;
     68 static const int32_t maxImplicitPrimary = 0xE4;
     69 
     70 U_CDECL_BEGIN
     71 static UBool U_CALLCONV
     72 ucol_cleanup(void)
     73 {
     74     fcdTrieIndex = NULL;
     75     return TRUE;
     76 }
     77 
     78 static int32_t U_CALLCONV
     79 _getFoldingOffset(uint32_t data) {
     80     return (int32_t)(data&0xFFFFFF);
     81 }
     82 
     83 U_CDECL_END
     84 
     85 static
     86 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
     87                               int32_t sourceLen, collIterate *s,
     88                               UErrorCode *status)
     89 {
     90     (s)->string = (s)->pos = sourceString;
     91     (s)->origFlags = 0;
     92     (s)->flags = 0;
     93     if (sourceLen >= 0) {
     94         s->flags |= UCOL_ITER_HASLEN;
     95         (s)->endp = (UChar *)sourceString+sourceLen;
     96     }
     97     else {
     98         /* change to enable easier checking for end of string for fcdpositon */
     99         (s)->endp = NULL;
    100     }
    101     (s)->extendCEs = NULL;
    102     (s)->extendCEsSize = 0;
    103     (s)->CEpos = (s)->toReturn = (s)->CEs;
    104     (s)->offsetBuffer = NULL;
    105     (s)->offsetBufferSize = 0;
    106     (s)->offsetReturn = (s)->offsetStore = NULL;
    107     (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
    108     (s)->coll = (collator);
    109     (s)->nfd = Normalizer2Factory::getNFDInstance(*status);
    110     (s)->fcdPosition = 0;
    111     if(collator->normalizationMode == UCOL_ON) {
    112         (s)->flags |= UCOL_ITER_NORM;
    113     }
    114     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
    115         (s)->flags |= UCOL_HIRAGANA_Q;
    116     }
    117     (s)->iterator = NULL;
    118     //(s)->iteratorIndex = 0;
    119 }
    120 
    121 U_CAPI void  U_EXPORT2
    122 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
    123                              int32_t sourceLen, collIterate *s,
    124                              UErrorCode *status) {
    125     /* Out-of-line version for use from other files. */
    126     IInit_collIterate(collator, sourceString, sourceLen, s, status);
    127 }
    128 
    129 U_CAPI collIterate * U_EXPORT2
    130 uprv_new_collIterate(UErrorCode *status) {
    131     if(U_FAILURE(*status)) {
    132         return NULL;
    133     }
    134     collIterate *s = new collIterate;
    135     if(s == NULL) {
    136         *status = U_MEMORY_ALLOCATION_ERROR;
    137         return NULL;
    138     }
    139     return s;
    140 }
    141 
    142 U_CAPI void U_EXPORT2
    143 uprv_delete_collIterate(collIterate *s) {
    144     delete s;
    145 }
    146 
    147 U_CAPI UBool U_EXPORT2
    148 uprv_collIterateAtEnd(collIterate *s) {
    149     return s == NULL || s->pos == s->endp;
    150 }
    151 
    152 /**
    153 * Backup the state of the collIterate struct data
    154 * @param data collIterate to backup
    155 * @param backup storage
    156 */
    157 static
    158 inline void backupState(const collIterate *data, collIterateState *backup)
    159 {
    160     backup->fcdPosition = data->fcdPosition;
    161     backup->flags       = data->flags;
    162     backup->origFlags   = data->origFlags;
    163     backup->pos         = data->pos;
    164     backup->bufferaddress = data->writableBuffer.getBuffer();
    165     backup->buffersize    = data->writableBuffer.length();
    166     backup->iteratorMove = 0;
    167     backup->iteratorIndex = 0;
    168     if(data->iterator != NULL) {
    169         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
    170         backup->iteratorIndex = data->iterator->getState(data->iterator);
    171         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
    172         if(backup->iteratorIndex == UITER_NO_STATE) {
    173             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
    174                 backup->iteratorMove++;
    175                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
    176             }
    177             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    178         }
    179     }
    180 }
    181 
    182 /**
    183 * Loads the state into the collIterate struct data
    184 * @param data collIterate to backup
    185 * @param backup storage
    186 * @param forwards boolean to indicate if forwards iteration is used,
    187 *        false indicates backwards iteration
    188 */
    189 static
    190 inline void loadState(collIterate *data, const collIterateState *backup,
    191                       UBool        forwards)
    192 {
    193     UErrorCode status = U_ZERO_ERROR;
    194     data->flags       = backup->flags;
    195     data->origFlags   = backup->origFlags;
    196     if(data->iterator != NULL) {
    197         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
    198         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
    199         if(backup->iteratorMove != 0) {
    200             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    201         }
    202     }
    203     data->pos         = backup->pos;
    204 
    205     if ((data->flags & UCOL_ITER_INNORMBUF) &&
    206         data->writableBuffer.getBuffer() != backup->bufferaddress) {
    207         /*
    208         this is when a new buffer has been reallocated and we'll have to
    209         calculate the new position.
    210         note the new buffer has to contain the contents of the old buffer.
    211         */
    212         if (forwards) {
    213             data->pos = data->writableBuffer.getTerminatedBuffer() +
    214                                          (data->pos - backup->bufferaddress);
    215         }
    216         else {
    217             /* backwards direction */
    218             int32_t temp = backup->buffersize -
    219                                   (int32_t)(data->pos - backup->bufferaddress);
    220             data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
    221         }
    222     }
    223     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
    224         /*
    225         this is alittle tricky.
    226         if we are initially not in the normalization buffer, even if we
    227         normalize in the later stage, the data in the buffer will be
    228         ignored, since we skip back up to the data string.
    229         however if we are already in the normalization buffer, any
    230         further normalization will pull data into the normalization
    231         buffer and modify the fcdPosition.
    232         since we are keeping the data in the buffer for use, the
    233         fcdPosition can not be reverted back.
    234         arrgghh....
    235         */
    236         data->fcdPosition = backup->fcdPosition;
    237     }
    238 }
    239 
    240 static UBool
    241 reallocCEs(collIterate *data, int32_t newCapacity) {
    242     uint32_t *oldCEs = data->extendCEs;
    243     if(oldCEs == NULL) {
    244         oldCEs = data->CEs;
    245     }
    246     int32_t length = data->CEpos - oldCEs;
    247     uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
    248     if(newCEs == NULL) {
    249         return FALSE;
    250     }
    251     uprv_memcpy(newCEs, oldCEs, length * 4);
    252     uprv_free(data->extendCEs);
    253     data->extendCEs = newCEs;
    254     data->extendCEsSize = newCapacity;
    255     data->CEpos = newCEs + length;
    256     return TRUE;
    257 }
    258 
    259 static UBool
    260 increaseCEsCapacity(collIterate *data) {
    261     int32_t oldCapacity;
    262     if(data->extendCEs != NULL) {
    263         oldCapacity = data->extendCEsSize;
    264     } else {
    265         oldCapacity = LENGTHOF(data->CEs);
    266     }
    267     return reallocCEs(data, 2 * oldCapacity);
    268 }
    269 
    270 static UBool
    271 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
    272     int32_t oldCapacity;
    273     if(data->extendCEs != NULL) {
    274         oldCapacity = data->extendCEsSize;
    275     } else {
    276         oldCapacity = LENGTHOF(data->CEs);
    277     }
    278     if(minCapacity <= oldCapacity) {
    279         return TRUE;
    280     }
    281     oldCapacity *= 2;
    282     return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
    283 }
    284 
    285 /*
    286 * collIter_eos()
    287 *     Checks for a collIterate being positioned at the end of
    288 *     its source string.
    289 *
    290 */
    291 static
    292 inline UBool collIter_eos(collIterate *s) {
    293     if(s->flags & UCOL_USE_ITERATOR) {
    294       return !(s->iterator->hasNext(s->iterator));
    295     }
    296     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
    297         // Null terminated string, but not at null, so not at end.
    298         //   Whether in main or normalization buffer doesn't matter.
    299         return FALSE;
    300     }
    301 
    302     // String with length.  Can't be in normalization buffer, which is always
    303     //  null termintated.
    304     if (s->flags & UCOL_ITER_HASLEN) {
    305         return (s->pos == s->endp);
    306     }
    307 
    308     // We are at a null termination, could be either normalization buffer or main string.
    309     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
    310         // At null at end of main string.
    311         return TRUE;
    312     }
    313 
    314     // At null at end of normalization buffer.  Need to check whether there there are
    315     //   any characters left in the main buffer.
    316     if(s->origFlags & UCOL_USE_ITERATOR) {
    317       return !(s->iterator->hasNext(s->iterator));
    318     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
    319         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
    320         return (*s->fcdPosition == 0);
    321     }
    322     else {
    323         // Main string with an end pointer.
    324         return s->fcdPosition == s->endp;
    325     }
    326 }
    327 
    328 /*
    329 * collIter_bos()
    330 *     Checks for a collIterate being positioned at the start of
    331 *     its source string.
    332 *
    333 */
    334 static
    335 inline UBool collIter_bos(collIterate *source) {
    336   // if we're going backwards, we need to know whether there is more in the
    337   // iterator, even if we are in the side buffer
    338   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    339     return !source->iterator->hasPrevious(source->iterator);
    340   }
    341   if (source->pos <= source->string ||
    342       ((source->flags & UCOL_ITER_INNORMBUF) &&
    343       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
    344     return TRUE;
    345   }
    346   return FALSE;
    347 }
    348 
    349 /*static
    350 inline UBool collIter_SimpleBos(collIterate *source) {
    351   // if we're going backwards, we need to know whether there is more in the
    352   // iterator, even if we are in the side buffer
    353   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    354     return !source->iterator->hasPrevious(source->iterator);
    355   }
    356   if (source->pos == source->string) {
    357     return TRUE;
    358   }
    359   return FALSE;
    360 }*/
    361     //return (data->pos == data->string) ||
    362 
    363 
    364 /****************************************************************************/
    365 /* Following are the open/close functions                                   */
    366 /*                                                                          */
    367 /****************************************************************************/
    368 
    369 static UCollator*
    370 ucol_initFromBinary(const uint8_t *bin, int32_t length,
    371                 const UCollator *base,
    372                 UCollator *fillIn,
    373                 UErrorCode *status)
    374 {
    375     UCollator *result = fillIn;
    376     if(U_FAILURE(*status)) {
    377         return NULL;
    378     }
    379     /*
    380     if(base == NULL) {
    381         // we don't support null base yet
    382         *status = U_ILLEGAL_ARGUMENT_ERROR;
    383         return NULL;
    384     }
    385     */
    386     // We need these and we could be running without UCA
    387     uprv_uca_initImplicitConstants(status);
    388     UCATableHeader *colData = (UCATableHeader *)bin;
    389     // do we want version check here? We're trying to figure out whether collators are compatible
    390     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
    391         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
    392         colData->version[0] != UCOL_BUILDER_VERSION)
    393     {
    394         *status = U_COLLATOR_VERSION_MISMATCH;
    395         return NULL;
    396     }
    397     else {
    398         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
    399             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
    400             if(U_FAILURE(*status)){
    401                 return NULL;
    402             }
    403             result->hasRealData = TRUE;
    404         }
    405         else {
    406             if(base) {
    407                 result = ucol_initCollator(base->image, result, base, status);
    408                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
    409                 if(U_FAILURE(*status)){
    410                     return NULL;
    411                 }
    412                 result->hasRealData = FALSE;
    413             }
    414             else {
    415                 *status = U_USELESS_COLLATOR_ERROR;
    416                 return NULL;
    417             }
    418         }
    419         result->freeImageOnClose = FALSE;
    420     }
    421     result->actualLocale = NULL;
    422     result->validLocale = NULL;
    423     result->requestedLocale = NULL;
    424     result->rules = NULL;
    425     result->rulesLength = 0;
    426     result->freeRulesOnClose = FALSE;
    427     result->ucaRules = NULL;
    428     return result;
    429 }
    430 
    431 U_CAPI UCollator* U_EXPORT2
    432 ucol_openBinary(const uint8_t *bin, int32_t length,
    433                 const UCollator *base,
    434                 UErrorCode *status)
    435 {
    436     return ucol_initFromBinary(bin, length, base, NULL, status);
    437 }
    438 
    439 U_CAPI int32_t U_EXPORT2
    440 ucol_cloneBinary(const UCollator *coll,
    441                  uint8_t *buffer, int32_t capacity,
    442                  UErrorCode *status)
    443 {
    444     int32_t length = 0;
    445     if(U_FAILURE(*status)) {
    446         return length;
    447     }
    448     if(capacity < 0) {
    449         *status = U_ILLEGAL_ARGUMENT_ERROR;
    450         return length;
    451     }
    452     if(coll->hasRealData == TRUE) {
    453         length = coll->image->size;
    454         if(length <= capacity) {
    455             uprv_memcpy(buffer, coll->image, length);
    456         } else {
    457             *status = U_BUFFER_OVERFLOW_ERROR;
    458         }
    459     } else {
    460         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    461         if(length <= capacity) {
    462             /* build the UCATableHeader with minimal entries */
    463             /* do not copy the header from the UCA file because its values are wrong! */
    464             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    465 
    466             /* reset everything */
    467             uprv_memset(buffer, 0, length);
    468 
    469             /* set the tailoring-specific values */
    470             UCATableHeader *myData = (UCATableHeader *)buffer;
    471             myData->size = length;
    472 
    473             /* offset for the options, the only part of the data that is present after the header */
    474             myData->options = sizeof(UCATableHeader);
    475 
    476             /* need to always set the expansion value for an upper bound of the options */
    477             myData->expansion = myData->options + sizeof(UColOptionSet);
    478 
    479             myData->magic = UCOL_HEADER_MAGIC;
    480             myData->isBigEndian = U_IS_BIG_ENDIAN;
    481             myData->charSetFamily = U_CHARSET_FAMILY;
    482 
    483             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    484             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    485 
    486             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    487             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    488             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    489             myData->jamoSpecial = coll->image->jamoSpecial;
    490 
    491             /* copy the collator options */
    492             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    493         } else {
    494             *status = U_BUFFER_OVERFLOW_ERROR;
    495         }
    496     }
    497     return length;
    498 }
    499 
    500 U_CAPI UCollator* U_EXPORT2
    501 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
    502 {
    503     UCollator * localCollator;
    504     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
    505     char *stackBufferChars = (char *)stackBuffer;
    506     int32_t imageSize = 0;
    507     int32_t rulesSize = 0;
    508     int32_t rulesPadding = 0;
    509     uint8_t *image;
    510     UChar *rules;
    511     UBool colAllocated = FALSE;
    512     UBool imageAllocated = FALSE;
    513 
    514     if (status == NULL || U_FAILURE(*status)){
    515         return 0;
    516     }
    517     if ((stackBuffer && !pBufferSize) || !coll){
    518        *status = U_ILLEGAL_ARGUMENT_ERROR;
    519         return 0;
    520     }
    521     if (coll->rules && coll->freeRulesOnClose) {
    522         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
    523         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
    524         bufferSizeNeeded += rulesSize + rulesPadding;
    525     }
    526 
    527     if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
    528         *pBufferSize =  bufferSizeNeeded;
    529         return 0;
    530     }
    531 
    532     /* Pointers on 64-bit platforms need to be aligned
    533      * on a 64-bit boundry in memory.
    534      */
    535     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
    536         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
    537         if (*pBufferSize > offsetUp) {
    538             *pBufferSize -= offsetUp;
    539             stackBufferChars += offsetUp;
    540         }
    541         else {
    542             /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
    543             *pBufferSize = 1;
    544         }
    545     }
    546     stackBuffer = (void *)stackBufferChars;
    547 
    548     if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
    549         /* allocate one here...*/
    550         stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
    551         // Null pointer check.
    552         if (stackBufferChars == NULL) {
    553             *status = U_MEMORY_ALLOCATION_ERROR;
    554             return NULL;
    555         }
    556         colAllocated = TRUE;
    557         if (U_SUCCESS(*status)) {
    558             *status = U_SAFECLONE_ALLOCATED_WARNING;
    559         }
    560     }
    561     localCollator = (UCollator *)stackBufferChars;
    562     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
    563     {
    564         UErrorCode tempStatus = U_ZERO_ERROR;
    565         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
    566     }
    567     if (coll->freeImageOnClose) {
    568         image = (uint8_t *)uprv_malloc(imageSize);
    569         // Null pointer check
    570         if (image == NULL) {
    571             *status = U_MEMORY_ALLOCATION_ERROR;
    572             return NULL;
    573         }
    574         ucol_cloneBinary(coll, image, imageSize, status);
    575         imageAllocated = TRUE;
    576     }
    577     else {
    578         image = (uint8_t *)coll->image;
    579     }
    580     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
    581     if (U_FAILURE(*status)) {
    582         return NULL;
    583     }
    584 
    585     if (coll->rules) {
    586         if (coll->freeRulesOnClose) {
    587             localCollator->rules = u_strcpy(rules, coll->rules);
    588             //bufferEnd += rulesSize;
    589         }
    590         else {
    591             localCollator->rules = coll->rules;
    592         }
    593         localCollator->freeRulesOnClose = FALSE;
    594         localCollator->rulesLength = coll->rulesLength;
    595     }
    596 
    597     int32_t i;
    598     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
    599         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
    600     }
    601     // zero copies of pointers
    602     localCollator->actualLocale = NULL;
    603     localCollator->validLocale = NULL;
    604     localCollator->requestedLocale = NULL;
    605     localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
    606     localCollator->freeOnClose = colAllocated;
    607     localCollator->freeImageOnClose = imageAllocated;
    608     return localCollator;
    609 }
    610 
    611 U_CAPI void U_EXPORT2
    612 ucol_close(UCollator *coll)
    613 {
    614     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
    615     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
    616     if(coll != NULL) {
    617         // these are always owned by each UCollator struct,
    618         // so we always free them
    619         if(coll->validLocale != NULL) {
    620             uprv_free(coll->validLocale);
    621         }
    622         if(coll->actualLocale != NULL) {
    623             uprv_free(coll->actualLocale);
    624         }
    625         if(coll->requestedLocale != NULL) {
    626             uprv_free(coll->requestedLocale);
    627         }
    628         if(coll->latinOneCEs != NULL) {
    629             uprv_free(coll->latinOneCEs);
    630         }
    631         if(coll->options != NULL && coll->freeOptionsOnClose) {
    632             uprv_free(coll->options);
    633         }
    634         if(coll->rules != NULL && coll->freeRulesOnClose) {
    635             uprv_free((UChar *)coll->rules);
    636         }
    637         if(coll->image != NULL && coll->freeImageOnClose) {
    638             uprv_free((UCATableHeader *)coll->image);
    639         }
    640 
    641         /* Here, it would be advisable to close: */
    642         /* - UData for UCA (unless we stuff it in the root resb */
    643         /* Again, do we need additional housekeeping... HMMM! */
    644         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
    645         if(coll->freeOnClose){
    646             /* for safeClone, if freeOnClose is FALSE,
    647             don't free the other instance data */
    648             uprv_free(coll);
    649         }
    650     }
    651     UTRACE_EXIT();
    652 }
    653 
    654 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
    655 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
    656 U_CFUNC uint8_t* U_EXPORT2
    657 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
    658 {
    659     uint8_t *result = NULL;
    660     if(U_FAILURE(*status)) {
    661         return NULL;
    662     }
    663     if(coll->hasRealData == TRUE) {
    664         *length = coll->image->size;
    665         result = (uint8_t *)uprv_malloc(*length);
    666         /* test for NULL */
    667         if (result == NULL) {
    668             *status = U_MEMORY_ALLOCATION_ERROR;
    669             return NULL;
    670         }
    671         uprv_memcpy(result, coll->image, *length);
    672     } else {
    673         *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    674         result = (uint8_t *)uprv_malloc(*length);
    675         /* test for NULL */
    676         if (result == NULL) {
    677             *status = U_MEMORY_ALLOCATION_ERROR;
    678             return NULL;
    679         }
    680 
    681         /* build the UCATableHeader with minimal entries */
    682         /* do not copy the header from the UCA file because its values are wrong! */
    683         /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    684 
    685         /* reset everything */
    686         uprv_memset(result, 0, *length);
    687 
    688         /* set the tailoring-specific values */
    689         UCATableHeader *myData = (UCATableHeader *)result;
    690         myData->size = *length;
    691 
    692         /* offset for the options, the only part of the data that is present after the header */
    693         myData->options = sizeof(UCATableHeader);
    694 
    695         /* need to always set the expansion value for an upper bound of the options */
    696         myData->expansion = myData->options + sizeof(UColOptionSet);
    697 
    698         myData->magic = UCOL_HEADER_MAGIC;
    699         myData->isBigEndian = U_IS_BIG_ENDIAN;
    700         myData->charSetFamily = U_CHARSET_FAMILY;
    701 
    702         /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    703         uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    704 
    705         uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    706         uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    707         uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    708         myData->jamoSpecial = coll->image->jamoSpecial;
    709 
    710         /* copy the collator options */
    711         uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    712     }
    713     return result;
    714 }
    715 
    716 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    717     if(U_FAILURE(*status)) {
    718         return;
    719     }
    720     result->caseFirst = (UColAttributeValue)opts->caseFirst;
    721     result->caseLevel = (UColAttributeValue)opts->caseLevel;
    722     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
    723     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
    724     result->strength = (UColAttributeValue)opts->strength;
    725     result->variableTopValue = opts->variableTopValue;
    726     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
    727     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
    728     result->numericCollation = (UColAttributeValue)opts->numericCollation;
    729 
    730     result->caseFirstisDefault = TRUE;
    731     result->caseLevelisDefault = TRUE;
    732     result->frenchCollationisDefault = TRUE;
    733     result->normalizationModeisDefault = TRUE;
    734     result->strengthisDefault = TRUE;
    735     result->variableTopValueisDefault = TRUE;
    736     result->hiraganaQisDefault = TRUE;
    737     result->numericCollationisDefault = TRUE;
    738 
    739     ucol_updateInternalState(result, status);
    740 
    741     result->options = opts;
    742 }
    743 
    744 
    745 /**
    746 * Approximate determination if a character is at a contraction end.
    747 * Guaranteed to be TRUE if a character is at the end of a contraction,
    748 * otherwise it is not deterministic.
    749 * @param c character to be determined
    750 * @param coll collator
    751 */
    752 static
    753 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
    754     if (c < coll->minContrEndCP) {
    755         return FALSE;
    756     }
    757 
    758     int32_t  hash = c;
    759     uint8_t  htbyte;
    760     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
    761         if (U16_IS_TRAIL(c)) {
    762             return TRUE;
    763         }
    764         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
    765     }
    766     htbyte = coll->contrEndCP[hash>>3];
    767     return (((htbyte >> (hash & 7)) & 1) == 1);
    768 }
    769 
    770 
    771 
    772 /*
    773 *   i_getCombiningClass()
    774 *        A fast, at least partly inline version of u_getCombiningClass()
    775 *        This is a candidate for further optimization.  Used heavily
    776 *        in contraction processing.
    777 */
    778 static
    779 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
    780     uint8_t sCC = 0;
    781     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
    782         sCC = u_getCombiningClass(c);
    783     }
    784     return sCC;
    785 }
    786 
    787 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    788     UChar c;
    789     UCollator *result = fillIn;
    790     if(U_FAILURE(*status) || image == NULL) {
    791         return NULL;
    792     }
    793 
    794     if(result == NULL) {
    795         result = (UCollator *)uprv_malloc(sizeof(UCollator));
    796         if(result == NULL) {
    797             *status = U_MEMORY_ALLOCATION_ERROR;
    798             return result;
    799         }
    800         result->freeOnClose = TRUE;
    801     } else {
    802         result->freeOnClose = FALSE;
    803     }
    804 
    805     // init FCD data
    806     if (fcdTrieIndex == NULL) {
    807         // The result is constant, until the library is reloaded.
    808         fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
    809         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
    810     }
    811 
    812     result->image = image;
    813     result->mapping.getFoldingOffset = _getFoldingOffset;
    814     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
    815     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
    816     if(U_FAILURE(*status)) {
    817         if(result->freeOnClose == TRUE) {
    818             uprv_free(result);
    819             result = NULL;
    820         }
    821         return result;
    822     }
    823 
    824     /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
    825     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    826     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
    827     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
    828     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
    829 
    830     result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
    831     result->freeOptionsOnClose = FALSE;
    832 
    833     /* set attributes */
    834     result->caseFirst = (UColAttributeValue)result->options->caseFirst;
    835     result->caseLevel = (UColAttributeValue)result->options->caseLevel;
    836     result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
    837     result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
    838     result->strength = (UColAttributeValue)result->options->strength;
    839     result->variableTopValue = result->options->variableTopValue;
    840     result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
    841     result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
    842     result->numericCollation = (UColAttributeValue)result->options->numericCollation;
    843 
    844     result->caseFirstisDefault = TRUE;
    845     result->caseLevelisDefault = TRUE;
    846     result->frenchCollationisDefault = TRUE;
    847     result->normalizationModeisDefault = TRUE;
    848     result->strengthisDefault = TRUE;
    849     result->variableTopValueisDefault = TRUE;
    850     result->alternateHandlingisDefault = TRUE;
    851     result->hiraganaQisDefault = TRUE;
    852     result->numericCollationisDefault = TRUE;
    853 
    854     /*result->scriptOrder = NULL;*/
    855 
    856     result->rules = NULL;
    857     result->rulesLength = 0;
    858     result->freeRulesOnClose = FALSE;
    859 
    860     /* get the version info from UCATableHeader and populate the Collator struct*/
    861     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
    862     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
    863     result->dataVersion[2] = 0;
    864     result->dataVersion[3] = 0;
    865 
    866     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
    867     result->minUnsafeCP = 0;
    868     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
    869         if (ucol_unsafeCP(c, result)) break;
    870     }
    871     result->minUnsafeCP = c;
    872 
    873     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
    874     result->minContrEndCP = 0;
    875     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
    876         if (ucol_contractionEndCP(c, result)) break;
    877     }
    878     result->minContrEndCP = c;
    879 
    880     /* max expansion tables */
    881     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
    882                                          result->image->endExpansionCE);
    883     result->lastEndExpansionCE = result->endExpansionCE +
    884                                  result->image->endExpansionCECount - 1;
    885     result->expansionCESize = (uint8_t*)result->image +
    886                                                result->image->expansionCESize;
    887 
    888 
    889     //result->errorCode = *status;
    890 
    891     result->latinOneCEs = NULL;
    892 
    893     result->latinOneRegenTable = FALSE;
    894     result->latinOneFailed = FALSE;
    895     result->UCA = UCA;
    896 
    897     ucol_updateInternalState(result, status);
    898 
    899     /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    900     result->ucaRules = NULL;
    901     result->actualLocale = NULL;
    902     result->validLocale = NULL;
    903     result->requestedLocale = NULL;
    904     result->hasRealData = FALSE; // real data lives in .dat file...
    905     result->freeImageOnClose = FALSE;
    906 
    907     return result;
    908 }
    909 
    910 /* new Mark's code */
    911 
    912 /**
    913  * For generation of Implicit CEs
    914  * @author Davis
    915  *
    916  * Cleaned up so that changes can be made more easily.
    917  * Old values:
    918 # First Implicit: E26A792D
    919 # Last Implicit: E3DC70C0
    920 # First CJK: E0030300
    921 # Last CJK: E0A9DD00
    922 # First CJK_A: E0A9DF00
    923 # Last CJK_A: E0DE3100
    924  */
    925 /* Following is a port of Mark's code for new treatment of implicits.
    926  * It is positioned here, since ucol_initUCA need to initialize the
    927  * variables below according to the data in the fractional UCA.
    928  */
    929 
    930 /**
    931  * Function used to:
    932  * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
    933  * b) bump any non-CJK characters by 10FFFF.
    934  * The relevant blocks are:
    935  * A:    4E00..9FFF; CJK Unified Ideographs
    936  *       F900..FAFF; CJK Compatibility Ideographs
    937  * B:    3400..4DBF; CJK Unified Ideographs Extension A
    938  *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
    939  * As long as
    940  *   no new B characters are allocated between 4E00 and FAFF, and
    941  *   no new A characters are outside of this range,
    942  * (very high probability) this simple code will work.
    943  * The reordered blocks are:
    944  * Block1 is CJK
    945  * Block2 is CJK_COMPAT_USED
    946  * Block3 is CJK_A
    947  * (all contiguous)
    948  * Any other CJK gets its normal code point
    949  * Any non-CJK gets +10FFFF
    950  * When we reorder Block1, we make sure that it is at the very start,
    951  * so that it will use a 3-byte form.
    952  * Warning: we only pick up the compatibility characters that are
    953  * NOT decomposed, so that block is smaller!
    954  */
    955 
    956 // CONSTANTS
    957 static const UChar32
    958     NON_CJK_OFFSET = 0x110000,
    959     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
    960 
    961 /**
    962  * Precomputed by initImplicitConstants()
    963  */
    964 static int32_t
    965     final3Multiplier = 0,
    966     final4Multiplier = 0,
    967     final3Count = 0,
    968     final4Count = 0,
    969     medialCount = 0,
    970     min3Primary = 0,
    971     min4Primary = 0,
    972     max4Primary = 0,
    973     minTrail = 0,
    974     maxTrail = 0,
    975     max3Trail = 0,
    976     max4Trail = 0,
    977     min4Boundary = 0;
    978 
    979 static const UChar32
    980     CJK_BASE = 0x4E00,
    981     CJK_LIMIT = 0x9FFF+1,
    982     CJK_COMPAT_USED_BASE = 0xFA0E,
    983     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
    984     CJK_A_BASE = 0x3400,
    985     CJK_A_LIMIT = 0x4DBF+1,
    986     CJK_B_BASE = 0x20000,
    987     CJK_B_LIMIT = 0x2A6DF+1;
    988 
    989 static UChar32 swapCJK(UChar32 i) {
    990 
    991     if (i >= CJK_BASE) {
    992         if (i < CJK_LIMIT)              return i - CJK_BASE;
    993 
    994         if (i < CJK_COMPAT_USED_BASE)   return i + NON_CJK_OFFSET;
    995 
    996         if (i < CJK_COMPAT_USED_LIMIT)  return i - CJK_COMPAT_USED_BASE
    997                                                 + (CJK_LIMIT - CJK_BASE);
    998         if (i < CJK_B_BASE)             return i + NON_CJK_OFFSET;
    999 
   1000         if (i < CJK_B_LIMIT)            return i; // non-BMP-CJK
   1001 
   1002         return i + NON_CJK_OFFSET;  // non-CJK
   1003     }
   1004     if (i < CJK_A_BASE)                 return i + NON_CJK_OFFSET;
   1005 
   1006     if (i < CJK_A_LIMIT)                return i - CJK_A_BASE
   1007                                                 + (CJK_LIMIT - CJK_BASE)
   1008                                                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1009     return i + NON_CJK_OFFSET; // non-CJK
   1010 }
   1011 
   1012 U_CAPI UChar32 U_EXPORT2
   1013 uprv_uca_getRawFromCodePoint(UChar32 i) {
   1014     return swapCJK(i)+1;
   1015 }
   1016 
   1017 U_CAPI UChar32 U_EXPORT2
   1018 uprv_uca_getCodePointFromRaw(UChar32 i) {
   1019     i--;
   1020     UChar32 result = 0;
   1021     if(i >= NON_CJK_OFFSET) {
   1022         result = i - NON_CJK_OFFSET;
   1023     } else if(i >= CJK_B_BASE) {
   1024         result = i;
   1025     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
   1026         if(i < CJK_LIMIT - CJK_BASE) {
   1027             result = i + CJK_BASE;
   1028         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
   1029             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
   1030         } else {
   1031             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1032         }
   1033     } else {
   1034         result = -1;
   1035     }
   1036     return result;
   1037 }
   1038 
   1039 // GET IMPLICIT PRIMARY WEIGHTS
   1040 // Return value is left justified primary key
   1041 U_CAPI uint32_t U_EXPORT2
   1042 uprv_uca_getImplicitFromRaw(UChar32 cp) {
   1043     /*
   1044     if (cp < 0 || cp > UCOL_MAX_INPUT) {
   1045         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
   1046     }
   1047     */
   1048     int32_t last0 = cp - min4Boundary;
   1049     if (last0 < 0) {
   1050         int32_t last1 = cp / final3Count;
   1051         last0 = cp % final3Count;
   1052 
   1053         int32_t last2 = last1 / medialCount;
   1054         last1 %= medialCount;
   1055 
   1056         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
   1057         last1 = minTrail + last1; // offset
   1058         last2 = min3Primary + last2; // offset
   1059         /*
   1060         if (last2 >= min4Primary) {
   1061             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
   1062         }
   1063         */
   1064         return (last2 << 24) + (last1 << 16) + (last0 << 8);
   1065     } else {
   1066         int32_t last1 = last0 / final4Count;
   1067         last0 %= final4Count;
   1068 
   1069         int32_t last2 = last1 / medialCount;
   1070         last1 %= medialCount;
   1071 
   1072         int32_t last3 = last2 / medialCount;
   1073         last2 %= medialCount;
   1074 
   1075         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
   1076         last1 = minTrail + last1; // offset
   1077         last2 = minTrail + last2; // offset
   1078         last3 = min4Primary + last3; // offset
   1079         /*
   1080         if (last3 > max4Primary) {
   1081             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
   1082         }
   1083         */
   1084         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
   1085     }
   1086 }
   1087 
   1088 static uint32_t U_EXPORT2
   1089 uprv_uca_getImplicitPrimary(UChar32 cp) {
   1090     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
   1091 
   1092     cp = swapCJK(cp);
   1093     cp++;
   1094     // we now have a range of numbers from 0 to 21FFFF.
   1095 
   1096     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
   1097 
   1098     return uprv_uca_getImplicitFromRaw(cp);
   1099 }
   1100 
   1101 /**
   1102  * Converts implicit CE into raw integer ("code point")
   1103  * @param implicit
   1104  * @return -1 if illegal format
   1105  */
   1106 U_CAPI UChar32 U_EXPORT2
   1107 uprv_uca_getRawFromImplicit(uint32_t implicit) {
   1108     UChar32 result;
   1109     UChar32 b3 = implicit & 0xFF;
   1110     UChar32 b2 = (implicit >> 8) & 0xFF;
   1111     UChar32 b1 = (implicit >> 16) & 0xFF;
   1112     UChar32 b0 = (implicit >> 24) & 0xFF;
   1113 
   1114     // simple parameter checks
   1115     if (b0 < min3Primary || b0 > max4Primary
   1116         || b1 < minTrail || b1 > maxTrail)
   1117         return -1;
   1118     // normal offsets
   1119     b1 -= minTrail;
   1120 
   1121     // take care of the final values, and compose
   1122     if (b0 < min4Primary) {
   1123         if (b2 < minTrail || b2 > max3Trail || b3 != 0)
   1124             return -1;
   1125         b2 -= minTrail;
   1126         UChar32 remainder = b2 % final3Multiplier;
   1127         if (remainder != 0)
   1128             return -1;
   1129         b0 -= min3Primary;
   1130         b2 /= final3Multiplier;
   1131         result = ((b0 * medialCount) + b1) * final3Count + b2;
   1132     } else {
   1133         if (b2 < minTrail || b2 > maxTrail
   1134             || b3 < minTrail || b3 > max4Trail)
   1135             return -1;
   1136         b2 -= minTrail;
   1137         b3 -= minTrail;
   1138         UChar32 remainder = b3 % final4Multiplier;
   1139         if (remainder != 0)
   1140             return -1;
   1141         b3 /= final4Multiplier;
   1142         b0 -= min4Primary;
   1143         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
   1144     }
   1145     // final check
   1146     if (result < 0 || result > UCOL_MAX_INPUT)
   1147         return -1;
   1148     return result;
   1149 }
   1150 
   1151 
   1152 static inline int32_t divideAndRoundUp(int a, int b) {
   1153     return 1 + (a-1)/b;
   1154 }
   1155 
   1156 /* this function is either called from initUCA or from genUCA before
   1157  * doing canonical closure for the UCA.
   1158  */
   1159 
   1160 /**
   1161  * Set up to generate implicits.
   1162  * Maintenance Note:  this function may end up being called more than once, due
   1163  *                    to threading races during initialization.  Make sure that
   1164  *                    none of the Constants is ever transiently assigned an
   1165  *                    incorrect value.
   1166  * @param minPrimary
   1167  * @param maxPrimary
   1168  * @param minTrail final byte
   1169  * @param maxTrail final byte
   1170  * @param gap3 the gap we leave for tailoring for 3-byte forms
   1171  * @param gap4 the gap we leave for tailoring for 4-byte forms
   1172  */
   1173 static void initImplicitConstants(int minPrimary, int maxPrimary,
   1174                                     int minTrailIn, int maxTrailIn,
   1175                                     int gap3, int primaries3count,
   1176                                     UErrorCode *status) {
   1177     // some simple parameter checks
   1178     if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
   1179         || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
   1180         || (primaries3count < 1))
   1181     {
   1182         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1183         return;
   1184     };
   1185 
   1186     minTrail = minTrailIn;
   1187     maxTrail = maxTrailIn;
   1188 
   1189     min3Primary = minPrimary;
   1190     max4Primary = maxPrimary;
   1191     // compute constants for use later.
   1192     // number of values we can use in trailing bytes
   1193     // leave room for empty values between AND above, e.g. if gap = 2
   1194     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
   1195     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
   1196     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
   1197     final3Multiplier = gap3 + 1;
   1198     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
   1199     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
   1200 
   1201     // medials can use full range
   1202     medialCount = (maxTrail - minTrail + 1);
   1203     // find out how many values fit in each form
   1204     int32_t threeByteCount = medialCount * final3Count;
   1205     // now determine where the 3/4 boundary is.
   1206     // we use 3 bytes below the boundary, and 4 above
   1207     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
   1208     int32_t primaries4count = primariesAvailable - primaries3count;
   1209 
   1210 
   1211     int32_t min3ByteCoverage = primaries3count * threeByteCount;
   1212     min4Primary = minPrimary + primaries3count;
   1213     min4Boundary = min3ByteCoverage;
   1214     // Now expand out the multiplier for the 4 bytes, and redo.
   1215 
   1216     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
   1217     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
   1218     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
   1219     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
   1220     if (gap4 < 1) {
   1221         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1222         return;
   1223     }
   1224     final4Multiplier = gap4 + 1;
   1225     final4Count = neededPerFinalByte;
   1226     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
   1227 }
   1228 
   1229     /**
   1230      * Supply parameters for generating implicit CEs
   1231      */
   1232 U_CAPI void U_EXPORT2
   1233 uprv_uca_initImplicitConstants(UErrorCode *status) {
   1234     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
   1235     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
   1236     initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
   1237 }
   1238 
   1239 
   1240 /*    collIterNormalize     Incremental Normalization happens here.                       */
   1241 /*                          pick up the range of chars identifed by FCD,                  */
   1242 /*                          normalize it into the collIterate's writable buffer,          */
   1243 /*                          switch the collIterate's state to use the writable buffer.    */
   1244 /*                                                                                        */
   1245 static
   1246 void collIterNormalize(collIterate *collationSource)
   1247 {
   1248     UErrorCode  status = U_ZERO_ERROR;
   1249     const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
   1250     const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
   1251 
   1252     collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
   1253                                     collationSource->writableBuffer,
   1254                                     status);
   1255     if (U_FAILURE(status)) {
   1256 #ifdef UCOL_DEBUG
   1257         fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
   1258 #endif
   1259         return;
   1260     }
   1261 
   1262     collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
   1263     collationSource->origFlags  = collationSource->flags;
   1264     collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1265     collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1266 }
   1267 
   1268 
   1269 // This function takes the iterator and extracts normalized stuff up to the next boundary
   1270 // It is similar in the end results to the collIterNormalize, but for the cases when we
   1271 // use an iterator
   1272 /*static
   1273 inline void normalizeIterator(collIterate *collationSource) {
   1274   UErrorCode status = U_ZERO_ERROR;
   1275   UBool wasNormalized = FALSE;
   1276   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
   1277   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
   1278   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1279     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1280   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
   1281     // reallocate and terminate
   1282     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
   1283                                &collationSource->writableBuffer,
   1284                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
   1285                                0)
   1286     ) {
   1287     #ifdef UCOL_DEBUG
   1288         fprintf(stderr, "normalizeIterator(), out of memory\n");
   1289     #endif
   1290         return;
   1291     }
   1292     status = U_ZERO_ERROR;
   1293     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
   1294     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
   1295     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1296     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1297   }
   1298   // Terminate the buffer - we already checked that it is big enough
   1299   collationSource->writableBuffer[normLen] = 0;
   1300   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
   1301       collationSource->flags |= UCOL_ITER_ALLOCATED;
   1302   }
   1303   collationSource->pos        = collationSource->writableBuffer;
   1304   collationSource->origFlags  = collationSource->flags;
   1305   collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1306   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1307 }*/
   1308 
   1309 
   1310 /* Incremental FCD check and normalize                                                    */
   1311 /*   Called from getNextCE when normalization state is suspect.                           */
   1312 /*   When entering, the state is known to be this:                                        */
   1313 /*      o   We are working in the main buffer of the collIterate, not the side            */
   1314 /*          writable buffer.  When in the side buffer, normalization mode is always off,  */
   1315 /*          so we won't get here.                                                         */
   1316 /*      o   The leading combining class from the current character is 0 or                */
   1317 /*          the trailing combining class of the previous char was zero.                   */
   1318 /*          True because the previous call to this function will have always exited       */
   1319 /*          that way, and we get called for every char where cc might be non-zero.        */
   1320 static
   1321 inline UBool collIterFCD(collIterate *collationSource) {
   1322     const UChar *srcP, *endP;
   1323     uint8_t     leadingCC;
   1324     uint8_t     prevTrailingCC = 0;
   1325     uint16_t    fcd;
   1326     UBool       needNormalize = FALSE;
   1327 
   1328     srcP = collationSource->pos-1;
   1329 
   1330     if (collationSource->flags & UCOL_ITER_HASLEN) {
   1331         endP = collationSource->endp;
   1332     } else {
   1333         endP = NULL;
   1334     }
   1335 
   1336     // Get the trailing combining class of the current character.  If it's zero,
   1337     //   we are OK.
   1338     /* trie access */
   1339     fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
   1340     if (fcd != 0) {
   1341         prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1342 
   1343         if (prevTrailingCC != 0) {
   1344             // The current char has a non-zero trailing CC.  Scan forward until we find
   1345             //   a char with a leading cc of zero.
   1346             while (endP == NULL || srcP != endP)
   1347             {
   1348                 const UChar *savedSrcP = srcP;
   1349 
   1350                 /* trie access */
   1351                 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
   1352                 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1353                 if (leadingCC == 0) {
   1354                     srcP = savedSrcP;      // Hit char that is not part of combining sequence.
   1355                                            //   back up over it.  (Could be surrogate pair!)
   1356                     break;
   1357                 }
   1358 
   1359                 if (leadingCC < prevTrailingCC) {
   1360                     needNormalize = TRUE;
   1361                 }
   1362 
   1363                 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1364             }
   1365         }
   1366     }
   1367 
   1368     collationSource->fcdPosition = (UChar *)srcP;
   1369 
   1370     return needNormalize;
   1371 }
   1372 
   1373 /****************************************************************************/
   1374 /* Following are the CE retrieval functions                                 */
   1375 /*                                                                          */
   1376 /****************************************************************************/
   1377 
   1378 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
   1379 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
   1380 
   1381 /* there should be a macro version of this function in the header file */
   1382 /* This is the first function that tries to fetch a collation element  */
   1383 /* If it's not successful or it encounters a more difficult situation  */
   1384 /* some more sophisticated and slower functions are invoked            */
   1385 static
   1386 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1387     uint32_t order = 0;
   1388     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
   1389         order = *(collationSource->toReturn++);                         /* if so, return them */
   1390         if(collationSource->CEpos == collationSource->toReturn) {
   1391             collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
   1392         }
   1393         return order;
   1394     }
   1395 
   1396     UChar ch = 0;
   1397     collationSource->offsetReturn = NULL;
   1398 
   1399     for (;;)                           /* Loop handles case when incremental normalize switches   */
   1400     {                                  /*   to or from the side buffer / original string, and we  */
   1401         /*   need to start again to get the next character.        */
   1402 
   1403         if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
   1404         {
   1405             // The source string is null terminated and we're not working from the side buffer,
   1406             //   and we're not normalizing.  This is the fast path.
   1407             //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
   1408             ch = *collationSource->pos++;
   1409             if (ch != 0) {
   1410                 break;
   1411             }
   1412             else {
   1413                 return UCOL_NO_MORE_CES;
   1414             }
   1415         }
   1416 
   1417         if (collationSource->flags & UCOL_ITER_HASLEN) {
   1418             // Normal path for strings when length is specified.
   1419             //   (We can't be in side buffer because it is always null terminated.)
   1420             if (collationSource->pos >= collationSource->endp) {
   1421                 // Ran off of the end of the main source string.  We're done.
   1422                 return UCOL_NO_MORE_CES;
   1423             }
   1424             ch = *collationSource->pos++;
   1425         }
   1426         else if(collationSource->flags & UCOL_USE_ITERATOR) {
   1427             UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
   1428             if(iterCh == U_SENTINEL) {
   1429                 return UCOL_NO_MORE_CES;
   1430             }
   1431             ch = (UChar)iterCh;
   1432         }
   1433         else
   1434         {
   1435             // Null terminated string.
   1436             ch = *collationSource->pos++;
   1437             if (ch == 0) {
   1438                 // Ran off end of buffer.
   1439                 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1440                     // Ran off end of main string. backing up one character.
   1441                     collationSource->pos--;
   1442                     return UCOL_NO_MORE_CES;
   1443                 }
   1444                 else
   1445                 {
   1446                     // Hit null in the normalize side buffer.
   1447                     // Usually this means the end of the normalized data,
   1448                     // except for one odd case: a null followed by combining chars,
   1449                     //   which is the case if we are at the start of the buffer.
   1450                     if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
   1451                         break;
   1452                     }
   1453 
   1454                     //  Null marked end of side buffer.
   1455                     //   Revert to the main string and
   1456                     //   loop back to top to try again to get a character.
   1457                     collationSource->pos   = collationSource->fcdPosition;
   1458                     collationSource->flags = collationSource->origFlags;
   1459                     continue;
   1460                 }
   1461             }
   1462         }
   1463 
   1464         if(collationSource->flags&UCOL_HIRAGANA_Q) {
   1465             /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
   1466              * based on whether the previous codepoint was Hiragana or Katakana.
   1467              */
   1468             if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
   1469                     ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
   1470                 collationSource->flags |= UCOL_WAS_HIRAGANA;
   1471             } else {
   1472                 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
   1473             }
   1474         }
   1475 
   1476         // We've got a character.  See if there's any fcd and/or normalization stuff to do.
   1477         //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
   1478         if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
   1479             break;
   1480         }
   1481 
   1482         if (collationSource->fcdPosition >= collationSource->pos) {
   1483             // An earlier FCD check has already covered the current character.
   1484             // We can go ahead and process this char.
   1485             break;
   1486         }
   1487 
   1488         if (ch < ZERO_CC_LIMIT_ ) {
   1489             // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
   1490             break;
   1491         }
   1492 
   1493         if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1494             // We need to peek at the next character in order to tell if we are FCD
   1495             if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
   1496                 // We are at the last char of source string.
   1497                 //  It is always OK for FCD check.
   1498                 break;
   1499             }
   1500 
   1501             // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
   1502             if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1503                 break;
   1504             }
   1505         }
   1506 
   1507 
   1508         // Need a more complete FCD check and possible normalization.
   1509         if (collIterFCD(collationSource)) {
   1510             collIterNormalize(collationSource);
   1511         }
   1512         if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1513             //  No normalization was needed.  Go ahead and process the char we already had.
   1514             break;
   1515         }
   1516 
   1517         // Some normalization happened.  Next loop iteration will pick up a char
   1518         //   from the normalization buffer.
   1519 
   1520     }   // end for (;;)
   1521 
   1522 
   1523     if (ch <= 0xFF) {
   1524         /*  For latin-1 characters we never need to fall back to the UCA table        */
   1525         /*    because all of the UCA data is replicated in the latinOneMapping array  */
   1526         order = coll->latinOneMapping[ch];
   1527         if (order > UCOL_NOT_FOUND) {
   1528             order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
   1529         }
   1530     }
   1531     else
   1532     {
   1533         // Always use UCA for Han, Hangul
   1534         // (Han extension A is before main Han block)
   1535         // **** Han compatibility chars ?? ****
   1536         if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   1537             (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
   1538             if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
   1539                 // between the two target ranges; do normal lookup
   1540                 // **** this range is YI, Modifier tone letters, ****
   1541                 // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
   1542                 // **** Latin-D might be tailored, so we need to ****
   1543                 // **** do the normal lookup for these guys.     ****
   1544                 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1545             } else {
   1546                 // in one of the target ranges; use UCA
   1547                 order = UCOL_NOT_FOUND;
   1548             }
   1549         } else {
   1550             order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1551         }
   1552 
   1553         if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
   1554             order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
   1555         }
   1556 
   1557         if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
   1558             /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
   1559             order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   1560 
   1561             if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
   1562                 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
   1563             }
   1564         }
   1565     }
   1566     if(order == UCOL_NOT_FOUND) {
   1567         order = getImplicit(ch, collationSource);
   1568     }
   1569     return order; /* return the CE */
   1570 }
   1571 
   1572 /* ucol_getNextCE, out-of-line version for use from other files.   */
   1573 U_CAPI uint32_t  U_EXPORT2
   1574 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1575     return ucol_IGetNextCE(coll, collationSource, status);
   1576 }
   1577 
   1578 
   1579 /**
   1580 * Incremental previous normalization happens here. Pick up the range of chars
   1581 * identifed by FCD, normalize it into the collIterate's writable buffer,
   1582 * switch the collIterate's state to use the writable buffer.
   1583 * @param data collation iterator data
   1584 */
   1585 static
   1586 void collPrevIterNormalize(collIterate *data)
   1587 {
   1588     UErrorCode status  = U_ZERO_ERROR;
   1589     const UChar *pEnd   = data->pos;  /* End normalize + 1 */
   1590     const UChar *pStart;
   1591 
   1592     /* Start normalize */
   1593     if (data->fcdPosition == NULL) {
   1594         pStart = data->string;
   1595     }
   1596     else {
   1597         pStart = data->fcdPosition + 1;
   1598     }
   1599 
   1600     int32_t normLen =
   1601         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
   1602                              data->writableBuffer,
   1603                              status).
   1604         length();
   1605     if(U_FAILURE(status)) {
   1606         return;
   1607     }
   1608     /*
   1609     this puts the null termination infront of the normalized string instead
   1610     of the end
   1611     */
   1612     data->writableBuffer.insert(0, (UChar)0);
   1613 
   1614     if (data->offsetBuffer == NULL) {
   1615         int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE;
   1616 
   1617         data->offsetBufferSize = len;
   1618         data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len);
   1619         data->offsetStore = data->offsetBuffer;
   1620     } else if(data->offsetBufferSize < normLen) {
   1621         int32_t storeIX = (int32_t)(data->offsetStore - data->offsetBuffer);
   1622         int32_t *tob    = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1));
   1623 
   1624         if (tob != NULL) {
   1625             data->offsetBuffer = tob;
   1626             data->offsetStore = &data->offsetBuffer[storeIX];
   1627             data->offsetBufferSize = normLen + 1;
   1628         }
   1629     }
   1630 
   1631     /*
   1632      * The usual case at this point is that we've got a base
   1633      * character followed by marks that were normalized. If
   1634      * fcdPosition is NULL, that means that we backed up to
   1635      * the beginning of the string and there's no base character.
   1636      *
   1637      * Forward processing will usually normalize when it sees
   1638      * the first mark, so that mark will get it's natural offset
   1639      * and the rest will get the offset of the character following
   1640      * the marks. The base character will also get its natural offset.
   1641      *
   1642      * We write the offset of the base character, if there is one,
   1643      * followed by the offset of the first mark and then the offsets
   1644      * of the rest of the marks.
   1645      */
   1646     int32_t firstMarkOffset = 0;
   1647     int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
   1648     int32_t trailCount      = normLen - 1;
   1649 
   1650     if (data->fcdPosition != NULL) {
   1651         int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
   1652         UChar   baseChar   = *data->fcdPosition;
   1653 
   1654         firstMarkOffset = baseOffset + 1;
   1655 
   1656         /*
   1657          * If the base character is the start of a contraction, forward processing
   1658          * will normalize the marks while checking for the contraction, which means
   1659          * that the offset of the first mark will the same as the other marks.
   1660          *
   1661          * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
   1662          */
   1663         if (baseChar >= 0x100) {
   1664             uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
   1665 
   1666             if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
   1667                 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
   1668             }
   1669 
   1670             if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
   1671                 firstMarkOffset = trailOffset;
   1672             }
   1673         }
   1674 
   1675         *(data->offsetStore++) = baseOffset;
   1676     }
   1677 
   1678     *(data->offsetStore++) = firstMarkOffset;
   1679 
   1680     for (int32_t i = 0; i < trailCount; i += 1) {
   1681         *(data->offsetStore++) = trailOffset;
   1682     }
   1683 
   1684     data->offsetRepeatValue = trailOffset;
   1685 
   1686     data->offsetReturn = data->offsetStore - 1;
   1687     if (data->offsetReturn == data->offsetBuffer) {
   1688         data->offsetStore = data->offsetBuffer;
   1689     }
   1690 
   1691     data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
   1692     data->origFlags  = data->flags;
   1693     data->flags     |= UCOL_ITER_INNORMBUF;
   1694     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   1695 }
   1696 
   1697 
   1698 /**
   1699 * Incremental FCD check for previous iteration and normalize. Called from
   1700 * getPrevCE when normalization state is suspect.
   1701 * When entering, the state is known to be this:
   1702 * o  We are working in the main buffer of the collIterate, not the side
   1703 *    writable buffer. When in the side buffer, normalization mode is always
   1704 *    off, so we won't get here.
   1705 * o  The leading combining class from the current character is 0 or the
   1706 *    trailing combining class of the previous char was zero.
   1707 *    True because the previous call to this function will have always exited
   1708 *    that way, and we get called for every char where cc might be non-zero.
   1709 * @param data collation iterate struct
   1710 * @return normalization status, TRUE for normalization to be done, FALSE
   1711 *         otherwise
   1712 */
   1713 static
   1714 inline UBool collPrevIterFCD(collIterate *data)
   1715 {
   1716     const UChar *src, *start;
   1717     uint8_t     leadingCC;
   1718     uint8_t     trailingCC = 0;
   1719     uint16_t    fcd;
   1720     UBool       result = FALSE;
   1721 
   1722     start = data->string;
   1723     src = data->pos + 1;
   1724 
   1725     /* Get the trailing combining class of the current character. */
   1726     fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
   1727 
   1728     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1729 
   1730     if (leadingCC != 0) {
   1731         /*
   1732         The current char has a non-zero leading combining class.
   1733         Scan backward until we find a char with a trailing cc of zero.
   1734         */
   1735         for (;;)
   1736         {
   1737             if (start == src) {
   1738                 data->fcdPosition = NULL;
   1739                 return result;
   1740             }
   1741 
   1742             fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
   1743 
   1744             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1745 
   1746             if (trailingCC == 0) {
   1747                 break;
   1748             }
   1749 
   1750             if (leadingCC < trailingCC) {
   1751                 result = TRUE;
   1752             }
   1753 
   1754             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1755         }
   1756     }
   1757 
   1758     data->fcdPosition = (UChar *)src;
   1759 
   1760     return result;
   1761 }
   1762 
   1763 /** gets a character from the string at a given offset
   1764  *  Handles both normal and iterative cases.
   1765  *  No error checking - caller beware!
   1766  */
   1767 inline static
   1768 UChar peekCharacter(collIterate *source, int32_t offset) {
   1769     if(source->pos != NULL) {
   1770         return *(source->pos + offset);
   1771     } else if(source->iterator != NULL) {
   1772         if(offset != 0) {
   1773             source->iterator->move(source->iterator, offset, UITER_CURRENT);
   1774             UChar toReturn = (UChar)source->iterator->next(source->iterator);
   1775             source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
   1776             return toReturn;
   1777         } else {
   1778             return (UChar)source->iterator->current(source->iterator);
   1779         }
   1780     } else {
   1781         return (UChar)U_SENTINEL;
   1782     }
   1783 }
   1784 
   1785 /**
   1786 * Determines if we are at the start of the data string in the backwards
   1787 * collation iterator
   1788 * @param data collation iterator
   1789 * @return TRUE if we are at the start
   1790 */
   1791 static
   1792 inline UBool isAtStartPrevIterate(collIterate *data) {
   1793     if(data->pos == NULL && data->iterator != NULL) {
   1794         return !data->iterator->hasPrevious(data->iterator);
   1795     }
   1796     //return (collIter_bos(data)) ||
   1797     return (data->pos == data->string) ||
   1798               ((data->flags & UCOL_ITER_INNORMBUF) &&
   1799               *(data->pos - 1) == 0 && data->fcdPosition == NULL);
   1800 }
   1801 
   1802 static
   1803 inline void goBackOne(collIterate *data) {
   1804 # if 0
   1805     // somehow, it looks like we need to keep iterator synced up
   1806     // at all times, as above.
   1807     if(data->pos) {
   1808         data->pos--;
   1809     }
   1810     if(data->iterator) {
   1811         data->iterator->previous(data->iterator);
   1812     }
   1813 #endif
   1814     if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
   1815         data->iterator->previous(data->iterator);
   1816     }
   1817     if(data->pos) {
   1818         data->pos --;
   1819     }
   1820 }
   1821 
/**
* Inline function that gets the previous collation element (backward iteration).
* It first steps the offset-return bookkeeping back by one entry. Then, if
* there are still buffered CEs to hand out (toReturn is above the start of
* the CE buffer in use), it decrements toReturn and returns that CE.
* Otherwise it fetches the previous character from the string, the side
* buffer or the character iterator, runs the incremental FCD check /
* normalization when needed, and looks the character up.
* For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE.
* @param coll collator data
* @param data collation iterator struct
* @param status error status
* @return the previous CE, or UCOL_NO_MORE_CES when the start of the text
*         has been reached
*/
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /* Step the offset bookkeeping back by one entry (or use up one repeat). */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
                data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                /* Last stored offset consumed: reset the offset buffer. */
                data->offsetReturn = NULL;
				data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* Buffered CEs are pending (in extendCEs if it exists, otherwise in CEs): return the next one. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        /* Buffer start reached: reset the write position as well. */
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        /*
        Loop handles case when incremental normalize switches to or from the
        side buffer / original string, and we need to start again to get the
        next character.
        */
        for (;;) {
            if (data->flags & UCOL_ITER_HASLEN) {
                /*
                Normal path for strings when length is specified.
                Not in side buffer because it is always null terminated.
                */
                if (data->pos <= data->string) {
                    /* End of the main source string */
                    return UCOL_NO_MORE_CES;
                }
                data->pos --;
                ch = *data->pos;
            }
            // we are using an iterator to go back. Pray for us!
            else if (data->flags & UCOL_USE_ITERATOR) {
              UChar32 iterCh = data->iterator->previous(data->iterator);
              if(iterCh == U_SENTINEL) {
                return UCOL_NO_MORE_CES;
              } else {
                ch = (UChar)iterCh;
              }
            }
            else {
                data->pos --;
                ch = *data->pos;
                /* we are in the side buffer. */
                if (ch == 0) {
                    /*
                    At the start of the normalize side buffer.
                    Go back to string.
                    Because pointer points to the last accessed character,
                    hence we have to increment it by one here.
                    */
                    data->flags = data->origFlags;
                    data->offsetRepeatValue = 0;

                     if (data->fcdPosition == NULL) {
                        /* The normalized segment started at the beginning of the string. */
                        data->pos = data->string;
                        return UCOL_NO_MORE_CES;
                    }
                    else {
                        data->pos   = data->fcdPosition + 1;
                    }

                   continue;
                }
            }

            /* Track whether the character just read lies in U+3040..U+309F. */
            if(data->flags&UCOL_HIRAGANA_Q) {
              if(ch>=0x3040 && ch<=0x309f) {
                data->flags |= UCOL_WAS_HIRAGANA;
              } else {
                data->flags &= ~UCOL_WAS_HIRAGANA;
              }
            }

            /*
            * Got a character; decide whether any fcd and/or normalization
            * work is needed. No work is needed if any of these holds:
            *  - ch is below ZERO_CC_LIMIT_ (trailing combining class == 0)
            *  - normalization is off (always the case in the writable buffer)
            *  - fcdPosition is set and is at or before the current position
            *  - the current character is at the start of the string
            */
            if (ch < ZERO_CC_LIMIT_ ||
              // this should propel us out of the loop in the iterator case
                (data->flags & UCOL_ITER_NORM) == 0 ||
                (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                || data->string == data->pos) {
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                /* if next character is FCD */
                if (data->pos == data->string) {
                    /* First char of string is always OK for FCD check */
                    break;
                }

                /* Not first char of string, do the FCD fast test */
                if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }

            /* Need a more complete FCD check and possible normalization. */
            if (collPrevIterFCD(data)) {
                collPrevIterNormalize(data);
            }

            if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /*  No normalization. Go ahead and process the char. */
                break;
            }

            /*
            Some normalization happened.
            Next loop picks up a char from the normalization buffer.
            */
        }

        /* attempt to handle contractions, after removal of the backwards
        contraction
        */
        if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
            result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
        } else {
            if (ch <= 0xFF) {
                /* Characters up to 0xFF are looked up in the latinOneMapping array. */
                result = coll->latinOneMapping[ch];
            }
            else {
                // Always use UCA for [3400..9FFF], [AC00..D7AF]
                // **** [FA0E..FA2F] ?? ****
                if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                    (ch >= 0x3400 && ch <= 0xD7AF)) {
                    if (ch > 0x9FFF && ch < 0xAC00) {
                        // between the two target ranges; do normal lookup
                        // **** this range is YI, Modifier tone letters, ****
                        // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                        // **** Latin-D might be tailored, so we need to ****
                        // **** do the normal lookup for these guys.     ****
                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    } else {
                        // in one of the target ranges; skip the tailoring lookup
                        result = UCOL_NOT_FOUND;
                    }
                } else {
                    result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                }
            }
            /* Values above UCOL_NOT_FOUND are special CEs that need further resolution. */
            if (result > UCOL_NOT_FOUND) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
            }
            if (result == UCOL_NOT_FOUND) { // Not found in master list
                if (!isAtStartPrevIterate(data) &&
                    ucol_contractionEndCP(ch, data->coll))
                {
                    result = UCOL_CONTRACTION;
                } else {
                    if(coll->UCA) {
                        result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                    }
                }

                if (result > UCOL_NOT_FOUND) {
                    if(coll->UCA) {
                        result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                    }
                }
            }
        }

        /* Still nothing found: fall back to getPrevImplicit. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}
   2027 
   2028 
   2029 /*   ucol_getPrevCE, out-of-line version for use from other files.  */
   2030 U_CFUNC uint32_t  U_EXPORT2
   2031 ucol_getPrevCE(const UCollator *coll, collIterate *data,
   2032                         UErrorCode *status) {
   2033     return ucol_IGetPrevCE(coll, data, status);
   2034 }
   2035 
   2036 
   2037 /* this should be connected to special Jamo handling */
   2038 U_CFUNC uint32_t  U_EXPORT2
   2039 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
   2040     collIterate colIt;
   2041     IInit_collIterate(coll, &u, 1, &colIt, status);
   2042     if(U_FAILURE(*status)) {
   2043         return 0;
   2044     }
   2045     return ucol_IGetNextCE(coll, &colIt, status);
   2046 }
   2047 
   2048 /**
   2049 * Inserts the argument character into the end of the buffer pushing back the
   2050 * null terminator.
   2051 * @param data collIterate struct data
   2052 * @param ch character to be appended
   2053 * @return the position of the new addition
   2054 */
   2055 static
   2056 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
   2057 {
   2058     int32_t oldLength = data->writableBuffer.length();
   2059     return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
   2060 }
   2061 
   2062 /**
   2063 * Inserts the argument string into the end of the buffer pushing back the
   2064 * null terminator.
   2065 * @param data collIterate struct data
   2066 * @param string to be appended
   2067 * @param length of the string to be appended
   2068 * @return the position of the new addition
   2069 */
   2070 static
   2071 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
   2072 {
   2073     int32_t oldLength = data->writableBuffer.length();
   2074     return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
   2075 }
   2076 
   2077 /**
   2078 * Special normalization function for contraction in the forwards iterator.
   2079 * This normalization sequence will place the current character at source->pos
   2080 * and its following normalized sequence into the buffer.
   2081 * The fcd position, pos will be changed.
   2082 * pos will now point to positions in the buffer.
   2083 * Flags will be changed accordingly.
   2084 * @param data collation iterator data
   2085 */
   2086 static
   2087 inline void normalizeNextContraction(collIterate *data)
   2088 {
   2089     int32_t     strsize;
   2090     UErrorCode  status     = U_ZERO_ERROR;
   2091     /* because the pointer points to the next character */
   2092     const UChar *pStart    = data->pos - 1;
   2093     const UChar *pEnd;
   2094 
   2095     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
   2096         data->writableBuffer.setTo(*(pStart - 1));
   2097         strsize               = 1;
   2098     }
   2099     else {
   2100         strsize = data->writableBuffer.length();
   2101     }
   2102 
   2103     pEnd = data->fcdPosition;
   2104 
   2105     data->writableBuffer.append(
   2106         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
   2107     if(U_FAILURE(status)) {
   2108         return;
   2109     }
   2110 
   2111     data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
   2112     data->origFlags  = data->flags;
   2113     data->flags     |= UCOL_ITER_INNORMBUF;
   2114     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2115 }
   2116 
   2117 /**
   2118 * Contraction character management function that returns the next character
   2119 * for the forwards iterator.
   2120 * Does nothing if the next character is in buffer and not the first character
   2121 * in it.
   2122 * Else it checks next character in data string to see if it is normalizable.
   2123 * If it is not, the character is simply copied into the buffer, else
   2124 * the whole normalized substring is copied into the buffer, including the
   2125 * current character.
   2126 * @param data collation element iterator data
   2127 * @return next character
   2128 */
   2129 static
   2130 inline UChar getNextNormalizedChar(collIterate *data)
   2131 {
   2132     UChar  nextch;
   2133     UChar  ch;
   2134     // Here we need to add the iterator code. One problem is the way
   2135     // end of string is handled. If we just return next char, it could
   2136     // be the sentinel. Most of the cases already check for this, but we
   2137     // need to be sure.
   2138     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
   2139          /* if no normalization and not in buffer. */
   2140       if(data->flags & UCOL_USE_ITERATOR) {
   2141          return (UChar)data->iterator->next(data->iterator);
   2142       } else {
   2143          return *(data->pos ++);
   2144       }
   2145     }
   2146 
   2147     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
   2148       //normalizeIterator(data);
   2149     //}
   2150 
   2151     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
   2152     if ((innormbuf && *data->pos != 0) ||
   2153         (data->fcdPosition != NULL && !innormbuf &&
   2154         data->pos < data->fcdPosition)) {
   2155         /*
   2156         if next character is in normalized buffer, no further normalization
   2157         is required
   2158         */
   2159         return *(data->pos ++);
   2160     }
   2161 
   2162     if (data->flags & UCOL_ITER_HASLEN) {
   2163         /* in data string */
   2164         if (data->pos + 1 == data->endp) {
   2165             return *(data->pos ++);
   2166         }
   2167     }
   2168     else {
   2169         if (innormbuf) {
   2170           // inside the normalization buffer, but at the end
   2171           // (since we encountered zero). This means, in the
   2172           // case we're using char iterator, that we need to
   2173           // do another round of normalization.
   2174           //if(data->origFlags & UCOL_USE_ITERATOR) {
   2175             // we need to restore original flags,
   2176             // otherwise, we'll lose them
   2177             //data->flags = data->origFlags;
   2178             //normalizeIterator(data);
   2179             //return *(data->pos++);
   2180           //} else {
   2181             /*
   2182             in writable buffer, at this point fcdPosition can not be
   2183             pointing to the end of the data string. see contracting tag.
   2184             */
   2185           if(data->fcdPosition) {
   2186             if (*(data->fcdPosition + 1) == 0 ||
   2187                 data->fcdPosition + 1 == data->endp) {
   2188                 /* at the end of the string, dump it into the normalizer */
   2189                 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
   2190                 // Check if data->pos received a null pointer
   2191                 if (data->pos == NULL) {
   2192                     return (UChar)-1; // Return to indicate error.
   2193                 }
   2194                 return *(data->fcdPosition ++);
   2195             }
   2196             data->pos = data->fcdPosition;
   2197           } else if(data->origFlags & UCOL_USE_ITERATOR) {
   2198             // if we are here, we're using a normalizing iterator.
   2199             // we should just continue further.
   2200             data->flags = data->origFlags;
   2201             data->pos = NULL;
   2202             return (UChar)data->iterator->next(data->iterator);
   2203           }
   2204           //}
   2205         }
   2206         else {
   2207             if (*(data->pos + 1) == 0) {
   2208                 return *(data->pos ++);
   2209             }
   2210         }
   2211     }
   2212 
   2213     ch = *data->pos ++;
   2214     nextch = *data->pos;
   2215 
   2216     /*
   2217     * if the current character is not fcd.
   2218     * Trailing combining class == 0.
   2219     */
   2220     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
   2221         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
   2222          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
   2223             /*
   2224             Need a more complete FCD check and possible normalization.
   2225             normalize substring will be appended to buffer
   2226             */
   2227         if (collIterFCD(data)) {
   2228             normalizeNextContraction(data);
   2229             return *(data->pos ++);
   2230         }
   2231         else if (innormbuf) {
   2232             /* fcdposition shifted even when there's no normalization, if we
   2233             don't input the rest into this, we'll get the wrong position when
   2234             we reach the end of the writableBuffer */
   2235             int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
   2236             data->pos = insertBufferEnd(data, data->pos - 1, length);
   2237             // Check if data->pos received a null pointer
   2238             if (data->pos == NULL) {
   2239                 return (UChar)-1; // Return to indicate error.
   2240             }
   2241             return *(data->pos ++);
   2242         }
   2243     }
   2244 
   2245     if (innormbuf) {
   2246         /*
   2247         no normalization is to be done hence only one character will be
   2248         appended to the buffer.
   2249         */
   2250         data->pos = insertBufferEnd(data, ch) + 1;
   2251         // Check if data->pos received a null pointer
   2252         if (data->pos == NULL) {
   2253             return (UChar)-1; // Return to indicate error.
   2254         }
   2255     }
   2256 
   2257     /* points back to the pos in string */
   2258     return ch;
   2259 }
   2260 
   2261 
   2262 
   2263 /**
   2264 * Function to copy the buffer into writableBuffer and sets the fcd position to
   2265 * the correct position
   2266 * @param source data string source
   2267 * @param buffer character buffer
   2268 */
   2269 static
   2270 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
   2271 {
   2272     /* okay confusing part here. to ensure that the skipped characters are
   2273     considered later, we need to place it in the appropriate position in the
   2274     normalization buffer and reassign the pos pointer. simple case if pos
   2275     reside in string, simply copy to normalization buffer and
   2276     fcdposition = pos, pos = start of normalization buffer. if pos in
   2277     normalization buffer, we'll insert the copy infront of pos and point pos
   2278     to the start of the normalization buffer. why am i doing these copies?
   2279     well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
   2280     not require any changes, which be really painful. */
   2281     if (source->flags & UCOL_ITER_INNORMBUF) {
   2282         int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
   2283         source->writableBuffer.replace(0, replaceLength, buffer);
   2284     }
   2285     else {
   2286         source->fcdPosition  = source->pos;
   2287         source->origFlags    = source->flags;
   2288         source->flags       |= UCOL_ITER_INNORMBUF;
   2289         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   2290         source->writableBuffer = buffer;
   2291     }
   2292 
   2293     source->pos = source->writableBuffer.getTerminatedBuffer();
   2294 }
   2295 
   2296 /**
   2297 * Function to get the discontiguous collation element within the source.
   2298 * Note this function will set the position to the appropriate places.
        * Characters that are read but do not take part in the contraction are
        * collected in a local buffer; when a match is found that buffer is handed
        * to setDiscontiguosAttribute() so the skipped characters are read again
        * later. When no match is found the iterator state saved on entry is
        * restored and stepped back by one.
   2299 * @param coll current collator used
   2300 * @param source data string source
   2301 * @param constart index to the start character in the contraction table
   2302 * @return collation element read from coll->contractionCEs: the one of the
        *         matched entry, or the one stored for the contraction start when
        *         the scan ends without a match
   2303 */
   2304 static
   2305 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
   2306                                 const UChar *constart)
   2307 {
   2308     /* source->pos currently points to the second combining character after
   2309        the start character */
   2310           const UChar *temppos      = source->pos;
   2311           UnicodeString buffer;
   2312     const UChar   *tempconstart = constart;
   2313           uint8_t  tempflags    = source->flags;
   2314           UBool    multicontraction = FALSE;
   2315           collIterateState discState;
   2316 
            /* remember the entry state so the no-match path can undo the scan */
   2317           backupState(source, &discState);
   2318 
            /* seed the skipped-character buffer with the character before the
               current position */
   2319     buffer.setTo(peekCharacter(source, -1));
   2320     for (;;) {
   2321         UChar    *UCharOffset;
   2322         UChar     schar,
   2323                   tchar;
   2324         uint32_t  result;
   2325 
                /* stop scanning when: the length-delimited string is exhausted;
                   or the next character is 0 and that really is the end (not in
                   the normalization buffer, or fcdPosition is NULL / at endp /
                   at a 0 / at a character with combining class 0); or the next
                   character has combining class 0 */
   2326         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
   2327             || (peekCharacter(source, 0) == 0  &&
   2328             //|| (*source->pos == 0  &&
   2329                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
   2330                  source->fcdPosition == NULL ||
   2331                  source->fcdPosition == source->endp ||
   2332                  *(source->fcdPosition) == 0 ||
   2333                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
   2334                  /* end of string in null terminated string or stopped by a
   2335                  null character, note fcd does not always point to a base
   2336                  character after the discontiguous change */
   2337                  u_getCombiningClass(peekCharacter(source, 0)) == 0) {
   2338                  //u_getCombiningClass(*(source->pos)) == 0) {
   2339             //constart = (UChar *)coll->image + getContractOffset(CE);
   2340             if (multicontraction) {
                        /* a multi-contraction prefix was matched earlier: go back
                           to the position recorded then, publish the skipped
                           characters and return the CE of the current table */
   2341                 source->pos    = temppos - 1;
   2342                 setDiscontiguosAttribute(source, buffer);
   2343                 return *(coll->contractionCEs +
   2344                                     (tempconstart - coll->contractionIndex));
   2345             }
   2346             constart = tempconstart;
   2347             break;
   2348         }
   2349 
   2350         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
   2351         schar = getNextNormalizedChar(source);
   2352 
                /* advance through the table entries while they are smaller than
                   schar; NOTE(review): relies on the table being sorted and
                   terminated by a value >= any schar - confirm */
   2353         while (schar > (tchar = *UCharOffset)) {
   2354             UCharOffset++;
   2355         }
   2356 
   2357         if (schar != tchar) {
   2358             /* not the correct codepoint. we stuff the current codepoint into
   2359             the discontiguous buffer and try the next character */
   2360             buffer.append(schar);
   2361             continue;
   2362         }
   2363         else {
                    /* schar is in the table, but it is skipped as well when it
                       has the same combining class as the character two
                       positions back */
   2364             if (u_getCombiningClass(schar) ==
   2365                 u_getCombiningClass(peekCharacter(source, -2))) {
   2366                 //u_getCombiningClass(*(source->pos - 2))) {
   2367                 buffer.append(schar);
   2368                 continue;
   2369             }
   2370             result = *(coll->contractionCEs +
   2371                                       (UCharOffset - coll->contractionIndex));
   2372         }
   2373 
   2374         if (result == UCOL_NOT_FOUND) {
   2375           break;
   2376         } else if (isContraction(result)) {
   2377             /* this is a multi-contraction*/
   2378             tempconstart = (UChar *)coll->image + getContractOffset(result);
                    /* only remember this point as a fallback when the original
                       contraction start has a CE of its own */
   2379             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
   2380                 != UCOL_NOT_FOUND) {
   2381                 multicontraction = TRUE;
   2382                 temppos       = source->pos + 1;
   2383             }
   2384         } else {
                    /* complete match: publish the skipped characters and return */
   2385             setDiscontiguosAttribute(source, buffer);
   2386             return result;
   2387         }
   2388     }
   2389 
   2390     /* no problems simply reverting just like that,
   2391     if we are in string before getting into this function, points back to
   2392     string hence no problem.
   2393     if we are in normalization buffer before getting into this function,
   2394     since we'll never use another normalization within this function, we
   2395     know that fcdposition points to a base character. the normalization buffer
   2396     never change, hence this revert works. */
   2397     loadState(source, &discState, TRUE);
   2398     goBackOne(source);
   2399 
   2400     //source->pos   = temppos - 1;
            /* put back the flags captured on entry */
   2401     source->flags = tempflags;
   2402     return *(coll->contractionCEs + (constart - coll->contractionIndex));
   2403 }
   2404 
   2405 static
   2406 inline UBool isNonChar(UChar32 cp) {
   2407     return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF));
   2408 }
   2409 
   2410 /* now uses Mark's getImplicitPrimary code */
   2411 static
   2412 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
   2413     if(isNonChar(cp)) {
   2414         return 0;
   2415     }
   2416     uint32_t r = uprv_uca_getImplicitPrimary(cp);
   2417     *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
   2418     collationSource->offsetRepeatCount += 1;
   2419     return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
   2420 }
   2421 
   2422 /**
   2423 * Inserts the argument character into the front of the buffer replacing the
   2424 * front null terminator.
   2425 * @param data collation element iterator data
   2426 * @param ch character to be appended
   2427 */
   2428 static
   2429 inline void insertBufferFront(collIterate *data, UChar ch)
   2430 {
   2431     data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
   2432 }
   2433 
   2434 /**
   2435 * Special normalization function for contraction in the previous iterator.
   2436 * This normalization sequence will place the current character at source->pos
   2437 * and its following normalized sequence into the buffer.
   2438 * The fcd position, pos will be changed.
   2439 * pos will now point to positions in the buffer.
   2440 * Flags will be changed accordingly.
   2441 * @param data collation iterator data
   2442 */
   2443 static
   2444 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
   2445 {
   2446     const UChar *pEnd = data->pos + 1;         /* End normalize + 1 */
   2447     const UChar *pStart;
   2448 
   2449     UnicodeString endOfBuffer;
   2450     if (data->flags & UCOL_ITER_HASLEN) {
   2451         /*
   2452         normalization buffer not used yet, we'll pull down the next
   2453         character into the end of the buffer
   2454         */
   2455         endOfBuffer.setTo(*pEnd);
   2456     }
   2457     else {
   2458         endOfBuffer.setTo(data->writableBuffer, 1);  // after the leading NUL
   2459     }
   2460 
   2461     if (data->fcdPosition == NULL) {
   2462         pStart = data->string;
   2463     }
   2464     else {
   2465         pStart = data->fcdPosition + 1;
   2466     }
   2467     int32_t normLen =
   2468         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
   2469                              data->writableBuffer,
   2470                              *status).
   2471         length();
   2472     if(U_FAILURE(*status)) {
   2473         return;
   2474     }
   2475     /*
   2476     this puts the null termination infront of the normalized string instead
   2477     of the end
   2478     */
   2479     data->pos =
   2480         data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
   2481         1 + normLen;
   2482     data->origFlags  = data->flags;
   2483     data->flags     |= UCOL_ITER_INNORMBUF;
   2484     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2485 }
   2486 
   2487 /**
   2488 * Contraction character management function that returns the previous character
   2489 * for the backwards iterator.
   2490 * Does nothing if the previous character is in buffer and not the first
   2491 * character in it.
   2492 * Else it checks previous character in data string to see if it is
   2493 * normalizable.
   2494 * If it is not, the character is simply copied into the buffer, else
   2495 * the whole normalized substring is copied into the buffer, including the
   2496 * current character.
   2497 * @param data collation element iterator data
        * @param status error code; handed on to normalizePrevContraction()
   2498 * @return previous character
   2499 */
   2500 static
   2501 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
   2502 {
   2503     UChar  prevch;
   2504     UChar  ch;
   2505     const UChar *start;
   2506     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
            /* fast path: normalization is off, or we are inside the buffer and
               the character before pos is not the leading 0 */
   2507     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
   2508         (innormbuf && *(data->pos - 1) != 0)) {
   2509         /*
   2510         if no normalization.
   2511         if previous character is in normalized buffer, no further normalization
   2512         is required
   2513         */
   2514       if(data->flags & UCOL_USE_ITERATOR) {
                /* step the iterator back by one and read that character with
                   next(), which moves the iterator forward again */
   2515         data->iterator->move(data->iterator, -1, UITER_CURRENT);
   2516         return (UChar)data->iterator->next(data->iterator);
   2517       } else {
                /* pos itself is not moved here; the caller steps back */
   2518         return *(data->pos - 1);
   2519       }
   2520     }
   2521 
   2522     start = data->pos;
   2523     if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
   2524         /* in data string */
   2525         if ((start - 1) == data->string) {
                    /* first character of the string: nothing precedes it */
   2526             return *(start - 1);
   2527         }
   2528         start --;
   2529         ch     = *start;
   2530         prevch = *(start - 1);
   2531     }
   2532     else {
   2533         /*
   2534         in writable buffer, at this point fcdPosition can not be NULL.
   2535         see contracting tag.
   2536         */
   2537         if (data->fcdPosition == data->string) {
   2538             /* at the start of the string, just dump it into the normalizer */
   2539             insertBufferFront(data, *(data->fcdPosition));
   2540             data->fcdPosition = NULL;
   2541             return *(data->pos - 1);
   2542         }
                /* continue in the data string at the remembered position */
   2543         start  = data->fcdPosition;
   2544         ch     = *start;
   2545         prevch = *(start - 1);
   2546     }
   2547     /*
   2548     * if the current character is not fcd.
   2549     * Trailing combining class == 0.
   2550     */
   2551     if (data->fcdPosition > start &&
   2552        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
   2553     {
   2554         /*
   2555         Need a more complete FCD check and possible normalization.
   2556         normalize substring will be appended to buffer
   2557         */
   2558         const UChar *backuppos = data->pos;
   2559         data->pos = start;
   2560         if (collPrevIterFCD(data)) {
   2561             normalizePrevContraction(data, status);
   2562             return *(data->pos - 1);
   2563         }
                /* FCD check passed: undo the temporary repositioning.
                   NOTE(review): the increment presumably compensates for
                   collPrevIterFCD moving fcdPosition - confirm */
   2564         data->pos = backuppos;
   2565         data->fcdPosition ++;
   2566     }
   2567 
   2568     if (innormbuf) {
   2569     /*
   2570     no normalization is to be done hence only one character will be
   2571     appended to the buffer.
   2572     */
   2573         insertBufferFront(data, ch);
   2574         data->fcdPosition --;
   2575     }
   2576 
   2577     return ch;
   2578 }
   2579 
   2580 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
   2581 /* It is called by getNextCE */
   2582 
   2583 /* The following should be even.
           NOTE(review): presumably the cap on how many digits of one number are
           handled; its use is outside this section - confirm there why the
           value has to be even. */
   2584 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
   2585 
   2586 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
   2587     collIterateState entryState;
   2588     backupState(source, &entryState);
   2589     UChar32 cp = ch;
   2590 
   2591     for (;;) {
   2592         // This loop will repeat only in the case of contractions, and only when a contraction
   2593         //   is found and the first CE resulting from that contraction is itself a special
   2594         //   (an expansion, for example.)  All other special CE types are fully handled the
   2595         //   first time through, and the loop exits.
   2596 
   2597         const uint32_t *CEOffset = NULL;
   2598         switch(getCETag(CE)) {
   2599         case NOT_FOUND_TAG:
   2600             /* This one is not found, and we'll let somebody else bother about it... no more games */
   2601             return CE;
   2602         case SPEC_PROC_TAG:
   2603             {
   2604                 // Special processing is getting a CE that is preceded by a certain prefix
   2605                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   2606                 // When we encouter a special processing tag, we go backwards and try to see if
   2607                 // we have a match.
   2608                 // Contraction tables are used - so the whole process is not unlike contraction.
   2609                 // prefix data is stored backwards in the table.
   2610                 const UChar *UCharOffset;
   2611                 UChar schar, tchar;
   2612                 collIterateState prefixState;
   2613                 backupState(source, &prefixState);
   2614                 loadState(source, &entryState, TRUE);
   2615                 goBackOne(source); // We want to look at the point where we entered - actually one
   2616                 // before that...
   2617 
   2618                 for(;;) {
   2619                     // This loop will run once per source string character, for as long as we
   2620                     //  are matching a potential contraction sequence
   2621 
   2622                     // First we position ourselves at the begining of contraction sequence
   2623                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2624                     if (collIter_bos(source)) {
   2625                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2626                         break;
   2627                     }
   2628                     schar = getPrevNormalizedChar(source, status);
   2629                     goBackOne(source);
   2630 
   2631                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2632                         UCharOffset++;
   2633                     }
   2634 
   2635                     if (schar == tchar) {
   2636                         // Found the source string char in the table.
   2637                         //  Pick up the corresponding CE from the table.
   2638                         CE = *(coll->contractionCEs +
   2639                             (UCharOffset - coll->contractionIndex));
   2640                     }
   2641                     else
   2642                     {
   2643                         // Source string char was not in the table.
   2644                         //   We have not found the prefix.
   2645                         CE = *(coll->contractionCEs +
   2646                             (ContractionStart - coll->contractionIndex));
   2647                     }
   2648 
   2649                     if(!isPrefix(CE)) {
   2650                         // The source string char was in the contraction table, and the corresponding
   2651                         //   CE is not a prefix CE.  We found the prefix, break
   2652                         //   out of loop, this CE will end up being returned.  This is the normal
   2653                         //   way out of prefix handling when the source actually contained
   2654                         //   the prefix.
   2655                         break;
   2656                     }
   2657                 }
   2658                 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
   2659                     loadState(source, &prefixState, TRUE);
   2660                     if(source->origFlags & UCOL_USE_ITERATOR) {
   2661                         source->flags = source->origFlags;
   2662                     }
   2663                 } else { // prefix search was a failure, we have to backup all the way to the start
   2664                     loadState(source, &entryState, TRUE);
   2665                 }
   2666                 break;
   2667             }
   2668         case CONTRACTION_TAG:
   2669             {
   2670                 /* This should handle contractions */
   2671                 collIterateState state;
   2672                 backupState(source, &state);
   2673                 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
   2674                 const UChar *UCharOffset;
   2675                 UChar schar, tchar;
   2676 
   2677                 for (;;) {
   2678                     /* This loop will run once per source string character, for as long as we     */
   2679                     /*  are matching a potential contraction sequence                  */
   2680 
   2681                     /* First we position ourselves at the begining of contraction sequence */
   2682                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2683 
   2684                     if (collIter_eos(source)) {
   2685                         // Ran off the end of the source string.
   2686                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2687                         // So we'll pick whatever we have at the point...
   2688                         if (CE == UCOL_NOT_FOUND) {
   2689                             // back up the source over all the chars we scanned going into this contraction.
   2690                             CE = firstCE;
   2691                             loadState(source, &state, TRUE);
   2692                             if(source->origFlags & UCOL_USE_ITERATOR) {
   2693                                 source->flags = source->origFlags;
   2694                             }
   2695                         }
   2696                         break;
   2697                     }
   2698 
   2699                     uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
   2700                     uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
   2701 
   2702                     schar = getNextNormalizedChar(source);
   2703                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2704                         UCharOffset++;
   2705                     }
   2706 
   2707                     if (schar == tchar) {
   2708                         // Found the source string char in the contraction table.
   2709                         //  Pick up the corresponding CE from the table.
   2710                         CE = *(coll->contractionCEs +
   2711                             (UCharOffset - coll->contractionIndex));
   2712                     }
   2713                     else
   2714                     {
   2715                         // Source string char was not in contraction table.
   2716                         //   Unless we have a discontiguous contraction, we have finished
   2717                         //   with this contraction.
   2718                         // in order to do the proper detection, we
   2719                         // need to see if we're dealing with a supplementary
   2720                         /* We test whether the next two char are surrogate pairs.
   2721                         * This test is done if the iterator is not NULL.
   2722                         * If there is no surrogate pair, the iterator
   2723                         * goes back one if needed. */
   2724                         UChar32 miss = schar;
   2725                         if (source->iterator) {
   2726                             UChar32 surrNextChar; /* the next char in the iteration to test */
   2727                             int32_t prevPos; /* holds the previous position before move forward of the source iterator */
   2728                             if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
   2729                                 prevPos = source->iterator->index;
   2730                                 surrNextChar = getNextNormalizedChar(source);
   2731                                 if (U16_IS_TRAIL(surrNextChar)) {
   2732                                     miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
   2733                                 } else if (prevPos < source->iterator->index){
   2734                                     goBackOne(source);
   2735                                 }
   2736                             }
   2737                         } else if (U16_IS_LEAD(schar)) {
   2738                             miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
   2739                         }
   2740 
   2741                         uint8_t sCC;
   2742                         if (miss < 0x300 ||
   2743                             maxCC == 0 ||
   2744                             (sCC = i_getCombiningClass(miss, coll)) == 0 ||
   2745                             sCC>maxCC ||
   2746                             (allSame != 0 && sCC == maxCC) ||
   2747                             collIter_eos(source))
   2748                         {
   2749                             //  Contraction can not be discontiguous.
   2750                             goBackOne(source);  // back up the source string by one,
   2751                             //  because  the character we just looked at was
   2752                             //  not part of the contraction.   */
   2753                             if(U_IS_SUPPLEMENTARY(miss)) {
   2754                                 goBackOne(source);
   2755                             }
   2756                             CE = *(coll->contractionCEs +
   2757                                 (ContractionStart - coll->contractionIndex));
   2758                         } else {
   2759                             //
   2760                             // Contraction is possibly discontiguous.
   2761                             //   Scan more of source string looking for a match
   2762                             //
   2763                             UChar tempchar;
   2764                             /* find the next character if schar is not a base character
   2765                             and we are not yet at the end of the string */
   2766                             tempchar = getNextNormalizedChar(source);
   2767                             // probably need another supplementary thingie here
   2768                             goBackOne(source);
   2769                             if (i_getCombiningClass(tempchar, coll) == 0) {
   2770                                 goBackOne(source);
   2771                                 if(U_IS_SUPPLEMENTARY(miss)) {
   2772                                     goBackOne(source);
   2773                                 }
   2774                                 /* Spit out the last char of the string, wasn't tasty enough */
   2775                                 CE = *(coll->contractionCEs +
   2776                                     (ContractionStart - coll->contractionIndex));
   2777                             } else {
   2778                                 CE = getDiscontiguous(coll, source, ContractionStart);
   2779                             }
   2780                         }
   2781                     } // else after if(schar == tchar)
   2782 
   2783                     if(CE == UCOL_NOT_FOUND) {
   2784                         /* The Source string did not match the contraction that we were checking.  */
   2785                         /*  Back up the source position to undo the effects of having partially    */
   2786                         /*   scanned through what ultimately proved to not be a contraction.       */
   2787                         loadState(source, &state, TRUE);
   2788                         CE = firstCE;
   2789                         break;
   2790                     }
   2791 
   2792                     if(!isContraction(CE)) {
   2793                         // The source string char was in the contraction table, and the corresponding
   2794                         //   CE is not a contraction CE.  We completed the contraction, break
   2795                         //   out of loop, this CE will end up being returned.  This is the normal
   2796                         //   way out of contraction handling when the source actually contained
   2797                         //   the contraction.
   2798                         break;
   2799                     }
   2800 
   2801 
   2802                     // The source string char was in the contraction table, and the corresponding
   2803                     //   CE is IS  a contraction CE.  We will continue looping to check the source
   2804                     //   string for the remaining chars in the contraction.
   2805                     uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
   2806                     if(tempCE != UCOL_NOT_FOUND) {
   2807                         // We have scanned a a section of source string for which there is a
   2808                         //  CE from the contraction table.  Remember the CE and scan position, so
   2809                         //  that we can return to this point if further scanning fails to
   2810                         //  match a longer contraction sequence.
   2811                         firstCE = tempCE;
   2812 
   2813                         goBackOne(source);
   2814                         backupState(source, &state);
   2815                         getNextNormalizedChar(source);
   2816 
   2817                         // Another way to do this is:
   2818                         //collIterateState tempState;
   2819                         //backupState(source, &tempState);
   2820                         //goBackOne(source);
   2821                         //backupState(source, &state);
   2822                         //loadState(source, &tempState, TRUE);
   2823 
   2824                         // The problem is that for incomplete contractions we have to remember the previous
   2825                         // position. Before, the only thing I needed to do was state.pos--;
   2826                         // After iterator introduction and especially after introduction of normalizing
   2827                         // iterators, it became much more difficult to decrease the saved state.
   2828                         // I'm not yet sure which of the two methods above is faster.
   2829                     }
   2830                 } // for(;;)
   2831                 break;
   2832             } // case CONTRACTION_TAG:
   2833         case LONG_PRIMARY_TAG:
   2834             {
   2835                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   2836                 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   2837                 source->offsetRepeatCount += 1;
   2838                 return CE;
   2839             }
   2840         case EXPANSION_TAG:
   2841             {
   2842                 /* This should handle expansion. */
   2843                 /* NOTE: we can encounter both continuations and expansions in an expansion! */
   2844                 /* I have to decide where continuations are going to be dealt with */
   2845                 uint32_t size;
   2846                 uint32_t i;    /* general counter */
   2847 
   2848                 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   2849                 size = getExpansionCount(CE);
   2850                 CE = *CEOffset++;
   2851 			  //source->offsetRepeatCount = -1;
   2852 
   2853                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   2854                     for(i = 1; i<size; i++) {
   2855                         *(source->CEpos++) = *CEOffset++;
   2856 						source->offsetRepeatCount += 1;
   2857                     }
   2858                 } else { /* else, we do */
   2859                     while(*CEOffset != 0) {
   2860                         *(source->CEpos++) = *CEOffset++;
   2861 						source->offsetRepeatCount += 1;
   2862                     }
   2863                 }
   2864 
   2865                 return CE;
   2866             }
   2867         case DIGIT_TAG:
   2868             {
   2869                 /*
   2870                 We do a check to see if we want to collate digits as numbers; if so we generate
   2871                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   2872                 */
   2873                 //uint32_t size;
   2874                 uint32_t i;    /* general counter */
   2875 
   2876                 if (source->coll->numericCollation == UCOL_ON){
   2877                     collIterateState digitState = {0,0,0,0,0,0,0,0,0};
   2878                     UChar32 char32 = 0;
   2879                     int32_t digVal = 0;
   2880 
   2881                     uint32_t digIndx = 0;
   2882                     uint32_t endIndex = 0;
   2883                     uint32_t trailingZeroIndex = 0;
   2884 
   2885                     uint8_t collateVal = 0;
   2886 
   2887                     UBool nonZeroValReached = FALSE;
   2888 
   2889                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
   2890                     /*
   2891                          We parse the source string until we hit a char that's NOT a digit.
   2892                         Use this u_charDigitValue. This might be slow because we have to
   2893                         handle surrogates...
   2894                     */
   2895             /*
   2896                     if (U16_IS_LEAD(ch)){
   2897                       if (!collIter_eos(source)) {
   2898                         backupState(source, &digitState);
   2899                         UChar trail = getNextNormalizedChar(source);
   2900                         if(U16_IS_TRAIL(trail)) {
   2901                           char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   2902                         } else {
   2903                           loadState(source, &digitState, TRUE);
   2904                           char32 = ch;
   2905                         }
   2906                       } else {
   2907                         char32 = ch;
   2908                       }
   2909                     } else {
   2910                       char32 = ch;
   2911                     }
   2912                     digVal = u_charDigitValue(char32);
   2913             */
   2914                     digVal = u_charDigitValue(cp); // if we have arrived here, we have
   2915                     // already processed possible supplementaries that trigered the digit tag -
   2916                     // all supplementaries are marked in the UCA.
   2917                     /*
   2918                         We  pad a zero in front of the first element anyways. This takes
   2919                         care of the (probably) most common case where people are sorting things followed
   2920                         by a single digit
   2921                     */
   2922                     digIndx++;
   2923                     for(;;){
   2924                         // Make sure we have enough space. No longer needed;
   2925                         // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
   2926                         // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
   2927                         // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
   2928 
   2929                         // Skipping over leading zeroes.
   2930                         if (digVal != 0) {
   2931                             nonZeroValReached = TRUE;
   2932                         }
   2933                         if (nonZeroValReached) {
   2934                             /*
   2935                             We parse the digit string into base 100 numbers (this fits into a byte).
   2936                             We only add to the buffer in twos, thus if we are parsing an odd character,
   2937                             that serves as the 'tens' digit while the if we are parsing an even one, that
   2938                             is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
   2939                             a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
   2940                             overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   2941                             than all the other bytes.
   2942                             */
   2943 
   2944                             if (digIndx % 2 == 1){
   2945                                 collateVal += (uint8_t)digVal;
   2946 
   2947                                 // We don't enter the low-order-digit case unless we've already seen
   2948                                 // the high order, or for the first digit, which is always non-zero.
   2949                                 if (collateVal != 0)
   2950                                     trailingZeroIndex = 0;
   2951 
   2952                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   2953                                 collateVal = 0;
   2954                             }
   2955                             else{
   2956                                 // We drop the collation value into the buffer so if we need to do
   2957                                 // a "front patch" we don't have to check to see if we're hitting the
   2958                                 // last element.
   2959                                 collateVal = (uint8_t)(digVal * 10);
   2960 
   2961                                 // Check for trailing zeroes.
   2962                                 if (collateVal == 0)
   2963                                 {
   2964                                     if (!trailingZeroIndex)
   2965                                         trailingZeroIndex = (digIndx/2) + 2;
   2966                                 }
   2967                                 else
   2968                                     trailingZeroIndex = 0;
   2969 
   2970                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   2971                             }
   2972                             digIndx++;
   2973                         }
   2974 
   2975                         // Get next character.
   2976                         if (!collIter_eos(source)){
   2977                             ch = getNextNormalizedChar(source);
   2978                             if (U16_IS_LEAD(ch)){
   2979                                 if (!collIter_eos(source)) {
   2980                                     backupState(source, &digitState);
   2981                                     UChar trail = getNextNormalizedChar(source);
   2982                                     if(U16_IS_TRAIL(trail)) {
   2983                                         char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   2984                                     } else {
   2985                                         loadState(source, &digitState, TRUE);
   2986                                         char32 = ch;
   2987                                     }
   2988                                 }
   2989                             } else {
   2990                                 char32 = ch;
   2991                             }
   2992 
   2993                             if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
   2994                                 // Resetting position to point to the next unprocessed char. We
   2995                                 // overshot it when doing our test/set for numbers.
   2996                                 if (char32 > 0xFFFF) { // For surrogates.
   2997                                     loadState(source, &digitState, TRUE);
   2998                                     //goBackOne(source);
   2999                                 }
   3000                                 goBackOne(source);
   3001                                 break;
   3002                             }
   3003                         } else {
   3004                             break;
   3005                         }
   3006                     }
   3007 
   3008                     if (nonZeroValReached == FALSE){
   3009                         digIndx = 2;
   3010                         numTempBuf[2] = 6;
   3011                     }
   3012 
   3013                     endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
   3014                     if (digIndx % 2 != 0){
   3015                         /*
   3016                         We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
   3017                         we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
   3018                         Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
   3019                         single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
   3020                         */
   3021 
   3022                         for(i = 2; i < endIndex; i++){
   3023                             numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
   3024                                 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
   3025                         }
   3026                         --digIndx;
   3027                     }
   3028 
   3029                     // Subtract one off of the last byte.
   3030                     numTempBuf[endIndex-1] -= 1;
   3031 
   3032                     /*
   3033                     We want to skip over the first two slots in the buffer. The first slot
   3034                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3035                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3036                     */
   3037                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3038                     numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
   3039 
   3040                     // Now transfer the collation key to our collIterate struct.
   3041                     // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
   3042                     //size = ((endIndex+1) & ~1)/2;
   3043                     CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3044                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3045                         UCOL_BYTE_COMMON; // Tertiary weight.
   3046                     i = 2; // Reset the index into the buffer.
   3047                     while(i < endIndex)
   3048                     {
   3049                         uint32_t primWeight = numTempBuf[i++] << 8;
   3050                         if ( i < endIndex)
   3051                             primWeight |= numTempBuf[i++];
   3052                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3053                     }
   3054 
   3055                 } else {
   3056                     // no numeric mode, we'll just switch to whatever we stashed and continue
   3057                     CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   3058                     CE = *CEOffset++;
   3059                     break;
   3060                 }
   3061                 return CE;
   3062             }
   3063             /* various implicits optimization */
   3064         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   3065             /* UCA is filled with these. Tailorings are NOT_FOUND */
   3066             return getImplicit(cp, source);
   3067         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   3068             // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
   3069             return getImplicit(cp, source);
   3070         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3071             {
   3072                 static const uint32_t
   3073                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3074                 //const uint32_t LCount = 19;
   3075                 static const uint32_t VCount = 21;
   3076                 static const uint32_t TCount = 28;
   3077                 //const uint32_t NCount = VCount * TCount;   // 588
   3078                 //const uint32_t SCount = LCount * NCount;   // 11172
   3079                 uint32_t L = ch - SBase;
   3080 
   3081                 // divide into pieces
   3082 
   3083                 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
   3084                 L /= TCount;
   3085                 uint32_t V = L % VCount;
   3086                 L /= VCount;
   3087 
   3088                 // offset them
   3089 
   3090                 L += LBase;
   3091                 V += VBase;
   3092                 T += TBase;
   3093 
   3094                 // return the first CE, but first put the rest into the expansion buffer
   3095                 if (!source->coll->image->jamoSpecial) { // FAST PATH
   3096 
   3097                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   3098                     if (T != TBase) {
   3099                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   3100                     }
   3101 
   3102                     return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   3103 
   3104                 } else { // Jamo is Special
   3105                     // Since Hanguls pass the FCD check, it is
   3106                     // guaranteed that we won't be in
   3107                     // the normalization buffer if something like this happens
   3108                     // However, if we are using a uchar iterator and normalization
   3109                     // is ON, the Hangul that lead us here is going to be in that
   3110                     // normalization buffer. Here we want to restore the uchar
   3111                     // iterator state and pull out of the normalization buffer
   3112                     if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
   3113                         source->flags = source->origFlags; // restore the iterator
   3114                         source->pos = NULL;
   3115                     }
   3116                     // Move Jamos into normalization buffer
   3117                     UChar *buffer = source->writableBuffer.getBuffer(4);
   3118                     int32_t bufferLength;
   3119                     buffer[0] = (UChar)L;
   3120                     buffer[1] = (UChar)V;
   3121                     if (T != TBase) {
   3122                         buffer[2] = (UChar)T;
   3123                         bufferLength = 3;
   3124                     } else {
   3125                         bufferLength = 2;
   3126                     }
   3127                     source->writableBuffer.releaseBuffer(bufferLength);
   3128 
   3129                     source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
   3130                     //   after exhausting the writableBuffer
   3131                     source->pos   = source->writableBuffer.getTerminatedBuffer();
   3132                     source->origFlags   = source->flags;
   3133                     source->flags       |= UCOL_ITER_INNORMBUF;
   3134                     source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   3135 
   3136                     return(UCOL_IGNORABLE);
   3137                 }
   3138             }
   3139         case SURROGATE_TAG:
   3140             /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
   3141             /* two things can happen here: next code point can be a trailing surrogate - we will use it */
   3142             /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
   3143             /* we return 0 (completely ignorable - per UCA specification */
   3144             {
   3145                 UChar trail;
   3146                 collIterateState state;
   3147                 backupState(source, &state);
   3148                 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
   3149                     // we chould have stepped one char forward and it might have turned that it
   3150                     // was not a trail surrogate. In that case, we have to backup.
   3151                     loadState(source, &state, TRUE);
   3152                     return 0;
   3153                 } else {
   3154                     /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
   3155                     CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
   3156                     if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
   3157                         // We need to backup
   3158                         loadState(source, &state, TRUE);
   3159                         return CE;
   3160                     }
   3161                     // calculate the supplementary code point value, if surrogate was not tailored
   3162                     cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   3163                 }
   3164             }
   3165             break;
   3166         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   3167             UChar nextChar;
   3168             if( source->flags & UCOL_USE_ITERATOR) {
   3169                 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
   3170                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3171                     source->iterator->next(source->iterator);
   3172                     return getImplicit(cp, source);
   3173                 } else {
   3174                     return 0;
   3175                 }
   3176             } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
   3177                 U_IS_TRAIL((nextChar=*source->pos))) {
   3178                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3179                     source->pos++;
   3180                     return getImplicit(cp, source);
   3181             } else {
   3182                 return 0; /* completely ignorable */
   3183             }
   3184         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   3185             return 0; /* broken surrogate sequence */
   3186         case CHARSET_TAG:
   3187             /* not yet implemented */
   3188             /* probably after 1.8 */
   3189             return UCOL_NOT_FOUND;
   3190         default:
   3191             *status = U_INTERNAL_PROGRAM_ERROR;
   3192             CE=0;
   3193             break;
   3194     }
   3195     if (CE <= UCOL_NOT_FOUND) break;
   3196   }
   3197   return CE;
   3198 }
   3199 
   3200 
   3201 /* now uses Mark's getImplicitPrimary code */
   3202 static
   3203 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
   3204     if(isNonChar(cp)) {
   3205         return 0;
   3206     }
   3207 
   3208     uint32_t r = uprv_uca_getImplicitPrimary(cp);
   3209 
   3210     *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
   3211     collationSource->toReturn = collationSource->CEpos;
   3212 
   3213 	if (collationSource->offsetBuffer == NULL) {
   3214 		collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
   3215 		collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
   3216 		collationSource->offsetStore = collationSource->offsetBuffer;
   3217 	}
   3218 
   3219 	// **** doesn't work if using iterator ****
   3220 	if (collationSource->flags & UCOL_ITER_INNORMBUF) {
   3221 	  collationSource->offsetRepeatCount = 1;
   3222 	} else {
   3223 	  int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
   3224 
   3225 	  *(collationSource->offsetStore++) = firstOffset;
   3226 	  *(collationSource->offsetStore++) = firstOffset + 1;
   3227 
   3228 		collationSource->offsetReturn = collationSource->offsetStore - 1;
   3229 		*(collationSource->offsetBuffer) = firstOffset;
   3230 		if (collationSource->offsetReturn == collationSource->offsetBuffer) {
   3231 			collationSource->offsetStore = collationSource->offsetBuffer;
   3232 		}
   3233 	}
   3234 
   3235     return ((r & 0x0000FFFF)<<16) | 0x000000C0;
   3236 }
   3237 
   3238 /**
   3239  * This function handles the special CEs like contractions, expansions,
   3240  * surrogates, Thai.
   3241  * It is called by both getPrevCE
   3242  */
   3243 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
   3244                           collIterate *source,
   3245                           UErrorCode *status)
   3246 {
   3247     const uint32_t *CEOffset    = NULL;
   3248           UChar    *UCharOffset = NULL;
   3249           UChar    schar;
   3250     const UChar    *constart    = NULL;
   3251           uint32_t size;
   3252           UChar    buffer[UCOL_MAX_BUFFER];
   3253           uint32_t *endCEBuffer;
   3254           UChar   *strbuffer;
   3255           int32_t noChars = 0;
   3256           int32_t CECount = 0;
   3257 
   3258     for(;;)
   3259     {
   3260         /* the only ces that loops are thai and contractions */
   3261         switch (getCETag(CE))
   3262         {
   3263         case NOT_FOUND_TAG:  /* this tag always returns */
   3264             return CE;
   3265 
   3266         case SPEC_PROC_TAG:
   3267             {
   3268                 // Special processing is getting a CE that is preceded by a certain prefix
   3269                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   3270                 // When we encouter a special processing tag, we go backwards and try to see if
   3271                 // we have a match.
   3272                 // Contraction tables are used - so the whole process is not unlike contraction.
   3273                 // prefix data is stored backwards in the table.
   3274                 const UChar *UCharOffset;
   3275                 UChar schar, tchar;
   3276                 collIterateState prefixState;
   3277                 backupState(source, &prefixState);
   3278                 for(;;) {
   3279                     // This loop will run once per source string character, for as long as we
   3280                     //  are matching a potential contraction sequence
   3281 
   3282                     // First we position ourselves at the begining of contraction sequence
   3283                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   3284 
   3285                     if (collIter_bos(source)) {
   3286                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   3287                         break;
   3288                     }
   3289                     schar = getPrevNormalizedChar(source, status);
   3290                     goBackOne(source);
   3291 
   3292                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   3293                         UCharOffset++;
   3294                     }
   3295 
   3296                     if (schar == tchar) {
   3297                         // Found the source string char in the table.
   3298                         //  Pick up the corresponding CE from the table.
   3299                         CE = *(coll->contractionCEs +
   3300                             (UCharOffset - coll->contractionIndex));
   3301                     }
   3302                     else
   3303                     {
   3304                         // if there is a completely ignorable code point in the middle of
   3305                         // a prefix, we need to act as if it's not there
   3306                         // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
   3307                         // lone surrogates cannot be set to zero as it would break other processing
   3308                         uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   3309                         // it's easy for BMP code points
   3310                         if(isZeroCE == 0) {
   3311                             continue;
   3312                         } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
   3313                             // for supplementary code points, we have to check the next one
   3314                             // situations where we are going to ignore
   3315                             // 1. beginning of the string: schar is a lone surrogate
   3316                             // 2. schar is a lone surrogate
   3317                             // 3. schar is a trail surrogate in a valid surrogate sequence
   3318                             //    that is explicitly set to zero.
   3319                             if (!collIter_bos(source)) {
   3320                                 UChar lead;
   3321                                 if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
   3322                                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
   3323                                     if(getCETag(isZeroCE) == SURROGATE_TAG) {
   3324                                         uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
   3325                                         if(finalCE == 0) {
   3326                                             // this is a real, assigned completely ignorable code point
   3327                                             goBackOne(source);
   3328                                             continue;
   3329                                         }
   3330                                     }
   3331                                 } else {
   3332                                     // lone surrogate, completely ignorable
   3333                                     continue;
   3334                                 }
   3335                             } else {
   3336                                 // lone surrogate at the beggining, completely ignorable
   3337                                 continue;
   3338                             }
   3339                         }
   3340                         // Source string char was not in the table.
   3341                         //   We have not found the prefix.
   3342                         CE = *(coll->contractionCEs +
   3343                             (ContractionStart - coll->contractionIndex));
   3344                     }
   3345 
   3346                     if(!isPrefix(CE)) {
   3347                         // The source string char was in the contraction table, and the corresponding
   3348                         //   CE is not a prefix CE.  We found the prefix, break
   3349                         //   out of loop, this CE will end up being returned.  This is the normal
   3350                         //   way out of prefix handling when the source actually contained
   3351                         //   the prefix.
   3352                         break;
   3353                     }
   3354                 }
   3355                 loadState(source, &prefixState, TRUE);
   3356                 break;
   3357             }
   3358 
   3359         case CONTRACTION_TAG: {
   3360             /* to ensure that the backwards and forwards iteration matches, we
   3361             take the current region of most possible match and pass it through
   3362             the forward iteration. this will ensure that the obstinate problem of
   3363             overlapping contractions will not occur.
   3364             */
   3365             schar = peekCharacter(source, 0);
   3366             constart = (UChar *)coll->image + getContractOffset(CE);
   3367             if (isAtStartPrevIterate(source)
   3368                 /* commented away contraction end checks after adding the checks
   3369                 in getPrevCE  */) {
   3370                     /* start of string or this is not the end of any contraction */
   3371                     CE = *(coll->contractionCEs +
   3372                         (constart - coll->contractionIndex));
   3373                     break;
   3374             }
   3375             strbuffer = buffer;
   3376             UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
   3377             *(UCharOffset --) = 0;
   3378             noChars = 0;
   3379             // have to swap thai characters
   3380             while (ucol_unsafeCP(schar, coll)) {
   3381                 *(UCharOffset) = schar;
   3382                 noChars++;
   3383                 UCharOffset --;
   3384                 schar = getPrevNormalizedChar(source, status);
   3385                 goBackOne(source);
   3386                 // TODO: when we exhaust the contraction buffer,
   3387                 // it needs to get reallocated. The problem is
   3388                 // that the size depends on the string which is
   3389                 // not iterated over. However, since we're travelling
   3390                 // backwards, we already had to set the iterator at
   3391                 // the end - so we might as well know where we are?
   3392                 if (UCharOffset + 1 == buffer) {
   3393                     /* we have exhausted the buffer */
   3394                     int32_t newsize = 0;
   3395                     if(source->pos) { // actually dealing with a position
   3396                         newsize = (int32_t)(source->pos - source->string + 1);
   3397                     } else { // iterator
   3398                         newsize = 4 * UCOL_MAX_BUFFER;
   3399                     }
   3400                     strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
   3401                         (newsize + UCOL_MAX_BUFFER));
   3402                     /* test for NULL */
   3403                     if (strbuffer == NULL) {
   3404                         *status = U_MEMORY_ALLOCATION_ERROR;
   3405                         return UCOL_NO_MORE_CES;
   3406                     }
   3407                     UCharOffset = strbuffer + newsize;
   3408                     uprv_memcpy(UCharOffset, buffer,
   3409                         UCOL_MAX_BUFFER * sizeof(UChar));
   3410                     UCharOffset --;
   3411                 }
   3412                 if ((source->pos && (source->pos == source->string ||
   3413                     ((source->flags & UCOL_ITER_INNORMBUF) &&
   3414                     *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
   3415                     || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
   3416                         break;
   3417                 }
   3418             }
   3419             /* adds the initial base character to the string */
   3420             *(UCharOffset) = schar;
   3421             noChars++;
   3422 
   3423             int32_t offsetBias;
   3424 
   3425             // **** doesn't work if using iterator ****
   3426             if (source->flags & UCOL_ITER_INNORMBUF) {
   3427                 offsetBias = -1;
   3428             } else {
   3429                 offsetBias = (int32_t)(source->pos - source->string);
   3430             }
   3431 
   3432             /* a new collIterate is used to simplify things, since using the current
   3433             collIterate will mean that the forward and backwards iteration will
   3434             share and change the same buffers. we don't want to get into that. */
   3435             collIterate temp;
   3436             int32_t rawOffset;
   3437 
   3438             IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
   3439             if(U_FAILURE(*status)) {
   3440                 return UCOL_NULLORDER;
   3441             }
   3442             temp.flags &= ~UCOL_ITER_NORM;
   3443             temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
   3444 
   3445             rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
   3446             CE = ucol_IGetNextCE(coll, &temp, status);
   3447 
   3448             if (source->extendCEs) {
   3449                 endCEBuffer = source->extendCEs + source->extendCEsSize;
   3450                 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
   3451             } else {
   3452                 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
   3453                 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
   3454             }
   3455 
   3456             if (source->offsetBuffer == NULL) {
   3457                 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
   3458                 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
   3459                 source->offsetStore = source->offsetBuffer;
   3460             }
   3461 
   3462             while (CE != UCOL_NO_MORE_CES) {
   3463                 *(source->CEpos ++) = CE;
   3464 
   3465                 if (offsetBias >= 0) {
   3466                     *(source->offsetStore ++) = rawOffset + offsetBias;
   3467                 }
   3468 
   3469                 CECount++;
   3470                 if (source->CEpos == endCEBuffer) {
   3471                     /* ran out of CE space, reallocate to new buffer.
   3472                     If reallocation fails, reset pointers and bail out,
   3473                     there's no guarantee of the right character position after
   3474                     this bail*/
   3475                     if (!increaseCEsCapacity(source)) {
   3476                         *status = U_MEMORY_ALLOCATION_ERROR;
   3477                         if (strbuffer != buffer) {
   3478                             uprv_free(strbuffer);
   3479                         }
   3480 
   3481                         return (uint32_t)UCOL_NULLORDER;
   3482                     }
   3483 
   3484                     endCEBuffer = source->extendCEs + source->extendCEsSize;
   3485                 }
   3486 
   3487                 if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) {
   3488                     int32_t  storeIX = (int32_t)(source->offsetStore - source->offsetBuffer);
   3489                     int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer,
   3490                         sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
   3491 
   3492                     if (tob != NULL) {
   3493                         source->offsetBuffer = tob;
   3494                         source->offsetStore = &source->offsetBuffer[storeIX];
   3495                         source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE;
   3496                     } else {
   3497                         // memory error...
   3498                         *status = U_MEMORY_ALLOCATION_ERROR;
   3499                         source->CEpos = source->CEs;
   3500 
   3501                         if (strbuffer != buffer) {
   3502                             uprv_free(strbuffer);
   3503                         }
   3504 
   3505                         return (uint32_t) UCOL_NULLORDER;
   3506                     }
   3507                 }
   3508 
   3509                 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
   3510                     rawOffset = (int32_t)(temp.fcdPosition - temp.string);
   3511                 } else {
   3512                     rawOffset = (int32_t)(temp.pos - temp.string);
   3513                 }
   3514 
   3515                 CE = ucol_IGetNextCE(coll, &temp, status);
   3516             }
   3517 
   3518 			if (source->offsetRepeatValue != 0) {
   3519                 if (CECount > noChars) {
   3520 				    source->offsetRepeatCount += temp.offsetRepeatCount;
   3521                 } else {
   3522                     // **** does this really skip the right offsets? ****
   3523                     source->offsetReturn -= (noChars - CECount);
   3524                 }
   3525 			}
   3526 
   3527             if (strbuffer != buffer) {
   3528                 uprv_free(strbuffer);
   3529             }
   3530 
   3531             if (offsetBias >= 0) {
   3532                 source->offsetReturn = source->offsetStore - 1;
   3533                 if (source->offsetReturn == source->offsetBuffer) {
   3534                     source->offsetStore = source->offsetBuffer;
   3535                 }
   3536             }
   3537 
   3538             source->toReturn = source->CEpos - 1;
   3539             if (source->toReturn == source->CEs) {
   3540                 source->CEpos = source->CEs;
   3541             }
   3542 
   3543             return *(source->toReturn);
   3544         }
   3545         case LONG_PRIMARY_TAG:
   3546             {
   3547                 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   3548                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   3549                 source->toReturn = source->CEpos - 1;
   3550 
   3551 				if (source->offsetBuffer == NULL) {
   3552 					source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
   3553 					source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
   3554 					source->offsetStore = source->offsetBuffer;
   3555 				}
   3556 
   3557 				if (source->flags & UCOL_ITER_INNORMBUF) {
   3558                     source->offsetRepeatCount = 1;
   3559 				} else {
   3560 				  int32_t firstOffset = (int32_t)(source->pos - source->string);
   3561 
   3562 				  *(source->offsetStore++) = firstOffset;
   3563 				  *(source->offsetStore++) = firstOffset + 1;
   3564 
   3565 					source->offsetReturn = source->offsetStore - 1;
   3566 					*(source->offsetBuffer) = firstOffset;
   3567 					if (source->offsetReturn == source->offsetBuffer) {
   3568 						source->offsetStore = source->offsetBuffer;
   3569 					}
   3570 				}
   3571 
   3572 
   3573                 return *(source->toReturn);
   3574             }
   3575 
   3576         case EXPANSION_TAG: /* this tag always returns */
   3577             {
   3578             /*
   3579             This should handle expansion.
   3580             NOTE: we can encounter both continuations and expansions in an expansion!
   3581             I have to decide where continuations are going to be dealt with
   3582             */
   3583             int32_t firstOffset = (int32_t)(source->pos - source->string);
   3584 
   3585             // **** doesn't work if using iterator ****
   3586             if (source->offsetReturn != NULL) {
   3587                 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
   3588                     source->offsetStore = source->offsetBuffer;
   3589                 }else {
   3590                   firstOffset = -1;
   3591                 }
   3592             }
   3593 
   3594             if (source->offsetBuffer == NULL) {
   3595                 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
   3596                 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
   3597                 source->offsetStore = source->offsetBuffer;
   3598             }
   3599 
   3600             /* find the offset to expansion table */
   3601             CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3602             size     = getExpansionCount(CE);
   3603             if (size != 0) {
   3604                 /*
   3605                 if there are less than 16 elements in expansion, we don't terminate
   3606                 */
   3607                 uint32_t count;
   3608 
   3609                 for (count = 0; count < size; count++) {
   3610                     *(source->CEpos ++) = *CEOffset++;
   3611 
   3612                     if (firstOffset >= 0) {
   3613                         *(source->offsetStore ++) = firstOffset + 1;
   3614                     }
   3615                 }
   3616             } else {
   3617                 /* else, we do */
   3618                 while (*CEOffset != 0) {
   3619                     *(source->CEpos ++) = *CEOffset ++;
   3620 
   3621                     if (firstOffset >= 0) {
   3622                         *(source->offsetStore ++) = firstOffset + 1;
   3623                     }
   3624                 }
   3625             }
   3626 
   3627             if (firstOffset >= 0) {
   3628                 source->offsetReturn = source->offsetStore - 1;
   3629                 *(source->offsetBuffer) = firstOffset;
   3630                 if (source->offsetReturn == source->offsetBuffer) {
   3631                     source->offsetStore = source->offsetBuffer;
   3632                 }
   3633             } else {
   3634                 source->offsetRepeatCount += size - 1;
   3635             }
   3636 
   3637             source->toReturn = source->CEpos - 1;
   3638             // in case of one element expansion, we
   3639             // want to immediately return CEpos
   3640             if(source->toReturn == source->CEs) {
   3641                 source->CEpos = source->CEs;
   3642             }
   3643 
   3644             return *(source->toReturn);
   3645             }
   3646 
   3647         case DIGIT_TAG:
   3648             {
   3649                 /*
   3650                 We do a check to see if we want to collate digits as numbers; if so we generate
   3651                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   3652                 */
   3653                 uint32_t i;    /* general counter */
   3654 
   3655                 if (source->coll->numericCollation == UCOL_ON){
   3656                     uint32_t digIndx = 0;
   3657                     uint32_t endIndex = 0;
   3658                     uint32_t leadingZeroIndex = 0;
   3659                     uint32_t trailingZeroCount = 0;
   3660 
   3661                     uint8_t collateVal = 0;
   3662 
   3663                     UBool nonZeroValReached = FALSE;
   3664 
   3665                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
   3666                     /*
   3667                     We parse the source string until we hit a char that's NOT a digit.
   3668                     Use this u_charDigitValue. This might be slow because we have to
   3669                     handle surrogates...
   3670                     */
   3671                     /*
   3672                     We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
   3673                     with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
   3674                     element we process when going backward. To determine how long that chunk might be, we may need to make
   3675                     two passes through the loop that collects digits - one to see how long the string is (and how much is
   3676                     leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
   3677                     more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
   3678                     element chunk after resetting the state to the initialState at the right side of the digit string.
   3679                     */
   3680                     uint32_t ceLimit = 0;
   3681                     UChar initial_ch = ch;
   3682                     collIterateState initialState = {0,0,0,0,0,0,0,0,0};
   3683                     backupState(source, &initialState);
   3684 
   3685                     for(;;) {
   3686                         collIterateState state = {0,0,0,0,0,0,0,0,0};
   3687                         UChar32 char32 = 0;
   3688                         int32_t digVal = 0;
   3689 
   3690                         if (U16_IS_TRAIL (ch)) {
   3691                             if (!collIter_bos(source)){
   3692                                 UChar lead = getPrevNormalizedChar(source, status);
   3693                                 if(U16_IS_LEAD(lead)) {
   3694                                     char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3695                                     goBackOne(source);
   3696                                 } else {
   3697                                     char32 = ch;
   3698                                 }
   3699                             } else {
   3700                                 char32 = ch;
   3701                             }
   3702                         } else {
   3703                             char32 = ch;
   3704                         }
   3705                         digVal = u_charDigitValue(char32);
   3706 
   3707                         for(;;) {
   3708                             // Make sure we have enough space. No longer needed;
   3709                             // at this point the largest value of digIndx when we need to save data in numTempBuf
   3710                             // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
   3711                             // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
   3712 
   3713                             // Skip over trailing zeroes, and keep a count of them.
   3714                             if (digVal != 0)
   3715                                 nonZeroValReached = TRUE;
   3716 
   3717                             if (nonZeroValReached) {
   3718                                 /*
   3719                                 We parse the digit string into base 100 numbers (this fits into a byte).
   3720                                 We only add to the buffer in twos, thus if we are parsing an odd character,
   3721                                 that serves as the 'tens' digit while the if we are parsing an even one, that
   3722                                 that serves as the 'tens' digit while if we are parsing an even one, that
   3723                                 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
   3724                                 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3725                                 than all the other bytes.
   3726 
   3727                                 Since we're doing in this reverse we want to put the first digit encountered into the
   3728                                 Since we're doing this in reverse, we want to put the first digit encountered into the
   3729                                 */
   3730 
   3731                                 if ((digIndx + trailingZeroCount) % 2 == 1) {
   3732                                     // High-order digit case (tens place)
   3733                                     collateVal += (uint8_t)(digVal * 10);
   3734 
   3735                                     // We cannot set leadingZeroIndex unless it has been set for the
   3736                                     // low-order digit. Therefore, all we can do for the high-order
   3737                                     // digit is turn it off, never on.
   3738                                     // The only time we will have a high digit without a low is for
   3739                                     // the very first non-zero digit, so no zero check is necessary.
   3740                                     if (collateVal != 0)
   3741                                         leadingZeroIndex = 0;
   3742 
   3743                                     // The first pass through, digIndx may exceed the limit, but in that case
   3744                                     // we no longer care about numTempBuf contents since they will be discarded
   3745                                     if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
   3746                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3747                                     }
   3748                                     collateVal = 0;
   3749                                 } else {
   3750                                     // Low-order digit case (ones place)
   3751                                     collateVal = (uint8_t)digVal;
   3752 
   3753                                     // Check for leading zeroes.
   3754                                     if (collateVal == 0) {
   3755                                         if (!leadingZeroIndex)
   3756                                             leadingZeroIndex = (digIndx/2) + 2;
   3757                                     } else
   3758                                         leadingZeroIndex = 0;
   3759 
   3760                                     // No need to write to buffer; the case of a last odd digit
   3761                                     // is handled below.
   3762                                 }
   3763                                 ++digIndx;
   3764                             } else
   3765                                 ++trailingZeroCount;
   3766 
   3767                             if (!collIter_bos(source)) {
   3768                                 ch = getPrevNormalizedChar(source, status);
   3769                                 //goBackOne(source);
   3770                                 if (U16_IS_TRAIL(ch)) {
   3771                                     backupState(source, &state);
   3772                                     if (!collIter_bos(source)) {
   3773                                         goBackOne(source);
   3774                                         UChar lead = getPrevNormalizedChar(source, status);
   3775 
   3776                                         if(U16_IS_LEAD(lead)) {
   3777                                             char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3778                                         } else {
   3779                                             loadState(source, &state, FALSE);
   3780                                             char32 = ch;
   3781                                         }
   3782                                     }
   3783                                 } else
   3784                                     char32 = ch;
   3785 
   3786                                 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
   3787                                     if (char32 > 0xFFFF) {// For surrogates.
   3788                                         loadState(source, &state, FALSE);
   3789                                     }
   3790                                     // Don't need to "reverse" the goBackOne call,
   3791                                     // as this points to the next position to process..
   3792                                     //if (char32 > 0xFFFF) // For surrogates.
   3793                                     //getNextNormalizedChar(source);
   3794                                     break;
   3795                                 }
   3796 
   3797                                 goBackOne(source);
   3798                             }else
   3799                                 break;
   3800                         }
   3801 
   3802                         if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
   3803                             // our collation element is not too big, go ahead and finish with it
   3804                             break;
   3805                         }
   3806                         // our digit string is too long for a collation element;
   3807                         // set the limit for it, reset the state and begin again
   3808                         ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
   3809                         if ( ceLimit == 0 ) {
   3810                             ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
   3811                         }
   3812                         ch = initial_ch;
   3813                         loadState(source, &initialState, FALSE);
   3814                         digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
   3815                         collateVal = 0;
   3816                         nonZeroValReached = FALSE;
   3817                     }
   3818 
   3819                     if (! nonZeroValReached) {
   3820                         digIndx = 2;
   3821                         trailingZeroCount = 0;
   3822                         numTempBuf[2] = 6;
   3823                     }
   3824 
   3825                     if ((digIndx + trailingZeroCount) % 2 != 0) {
   3826                         numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
   3827                         digIndx += 1;       // The implicit leading zero
   3828                     }
   3829                     if (trailingZeroCount % 2 != 0) {
   3830                         // We had to consume one trailing zero for the low digit
   3831                         // of the least significant byte
   3832                         digIndx += 1;       // The trailing zero not in the exponent
   3833                         trailingZeroCount -= 1;
   3834                     }
   3835 
   3836                     endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
   3837 
   3838                     // Subtract one off of the last byte. Really the first byte here, but it's reversed...
   3839                     numTempBuf[2] -= 1;
   3840 
   3841                     /*
   3842                     We want to skip over the first two slots in the buffer. The first slot
   3843                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3844                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3845                     The exponent must be adjusted by the number of leading zeroes, and the number of
   3846                     trailing zeroes.
   3847                     */
   3848                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3849                     uint32_t exponent = (digIndx+trailingZeroCount)/2;
   3850                     if (leadingZeroIndex)
   3851                         exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
   3852                     numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
   3853 
   3854                     // Now transfer the collation key to our collIterate struct.
   3855                     // The total size for our collation key is half of endIndex, rounded up.
   3856                     int32_t size = (endIndex+1)/2;
   3857                     if(!ensureCEsCapacity(source, size)) {
   3858                         return UCOL_NULLORDER;
   3859                     }
   3860                     *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3861                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3862                         UCOL_BYTE_COMMON; // Tertiary weight.
   3863                     i = endIndex - 1; // Reset the index into the buffer.
   3864                     while(i >= 2) {
   3865                         uint32_t primWeight = numTempBuf[i--] << 8;
   3866                         if ( i >= 2)
   3867                             primWeight |= numTempBuf[i--];
   3868                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3869                     }
   3870 
   3871                     source->toReturn = source->CEpos -1;
   3872                     return *(source->toReturn);
   3873                 } else {
   3874                     CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3875                     CE = *(CEOffset++);
   3876                     break;
   3877                 }
   3878             }
   3879 
   3880         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3881             {
   3882                 static const uint32_t
   3883                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3884                 //const uint32_t LCount = 19;
   3885                 static const uint32_t VCount = 21;
   3886                 static const uint32_t TCount = 28;
   3887                 //const uint32_t NCount = VCount * TCount;   /* 588 */
   3888                 //const uint32_t SCount = LCount * NCount;   /* 11172 */
   3889 
   3890                 uint32_t L = ch - SBase;
   3891                 /*
   3892                 divide into pieces.
   3893                 we do it in this order since some compilers can do % and / in one
   3894                 operation
   3895                 */
   3896                 uint32_t T = L % TCount;
   3897                 L /= TCount;
   3898                 uint32_t V = L % VCount;
   3899                 L /= VCount;
   3900 
   3901                 /* offset them */
   3902                 L += LBase;
   3903                 V += VBase;
   3904                 T += TBase;
   3905 
   3906 				if (source->offsetBuffer == NULL) {
   3907 					source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
   3908 					source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
   3909 					source->offsetStore = source->offsetBuffer;
   3910 				}
   3911 
   3912 			  int32_t firstOffset = (int32_t)(source->pos - source->string);
   3913 
   3914 			  *(source->offsetStore++) = firstOffset;
   3915 
   3916                 /*
   3917                  * return the first CE, but first put the rest into the expansion buffer
   3918                  */
   3919                 if (!source->coll->image->jamoSpecial) {
   3920                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   3921                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   3922 					*(source->offsetStore++) = firstOffset + 1;
   3923 
   3924 					if (T != TBase) {
   3925                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   3926 					    *(source->offsetStore++) = firstOffset + 1;
   3927 					}
   3928 
   3929                     source->toReturn = source->CEpos - 1;
   3930 
   3931 					source->offsetReturn = source->offsetStore - 1;
   3932 					if (source->offsetReturn == source->offsetBuffer) {
   3933 						source->offsetStore = source->offsetBuffer;
   3934 					}
   3935 
   3936 					return *(source->toReturn);
   3937                 } else {
   3938                     // Since Hanguls pass the FCD check, it is
   3939                     // guaranteed that we won't be in
   3940                     // the normalization buffer if something like this happens
   3941                     // Move Jamos into normalization buffer
   3942                     /*
   3943                     Move the Jamos into the
   3944                     normalization buffer
   3945                     */
   3946                     UChar *tempbuffer = source->writableBuffer.getBuffer(5);
   3947                     int32_t tempbufferLength;
   3948                     tempbuffer[0] = 0;
   3949                     tempbuffer[1] = (UChar)L;
   3950                     tempbuffer[2] = (UChar)V;
   3951                     if (T != TBase) {
   3952                         tempbuffer[3] = (UChar)T;
   3953                         tempbufferLength = 4;
   3954                     } else {
   3955                         tempbufferLength = 3;
   3956                     }
   3957                     source->writableBuffer.releaseBuffer(tempbufferLength);
   3958 
   3959                     /*
   3960                     Indicate where to continue in main input string after exhausting
   3961                     the writableBuffer
   3962                     */
   3963                     if (source->pos  == source->string) {
   3964                         source->fcdPosition = NULL;
   3965                     } else {
   3966                         source->fcdPosition       = source->pos-1;
   3967                     }
   3968 
   3969                     source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
   3970                     source->origFlags         = source->flags;
   3971                     source->flags            |= UCOL_ITER_INNORMBUF;
   3972                     source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   3973 
   3974                     return(UCOL_IGNORABLE);
   3975                 }
   3976             }
   3977 
   3978         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   3979             return getPrevImplicit(ch, source);
   3980 
   3981             // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
   3982         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   3983             return getPrevImplicit(ch, source);
   3984 
   3985         case SURROGATE_TAG:  /* This is a surrogate pair */
   3986             /* essentialy an engaged lead surrogate. */
   3987             /* if you have encountered it here, it means that a */
   3988             /* broken sequence was encountered and this is an error */
   3989             return 0;
   3990 
   3991         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   3992             return 0; /* broken surrogate sequence */
   3993 
   3994         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   3995             {
   3996                 UChar32 cp = 0;
   3997                 UChar  prevChar;
   3998                 const UChar *prev;
   3999                 if (isAtStartPrevIterate(source)) {
   4000                     /* we are at the start of the string, wrong place to be at */
   4001                     return 0;
   4002                 }
   4003                 if (source->pos != source->writableBuffer.getBuffer()) {
   4004                     prev     = source->pos - 1;
   4005                 } else {
   4006                     prev     = source->fcdPosition;
   4007                 }
   4008                 prevChar = *prev;
   4009 
   4010                 /* Handles Han and Supplementary characters here.*/
   4011                 if (U16_IS_LEAD(prevChar)) {
   4012                     cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   4013                     source->pos = prev;
   4014                 } else {
   4015                     return 0; /* completely ignorable */
   4016                 }
   4017 
   4018                 return getPrevImplicit(cp, source);
   4019             }
   4020 
   4021             /* UCA is filled with these. Tailorings are NOT_FOUND */
   4022             /* not yet implemented */
   4023         case CHARSET_TAG:  /* this tag always returns */
   4024             /* probably after 1.8 */
   4025             return UCOL_NOT_FOUND;
   4026 
   4027         default:           /* this tag always returns */
   4028             *status = U_INTERNAL_PROGRAM_ERROR;
   4029             CE=0;
   4030             break;
   4031         }
   4032 
   4033         if (CE <= UCOL_NOT_FOUND) {
   4034             break;
   4035         }
   4036     }
   4037 
   4038     return CE;
   4039 }
   4040 
   4041 /* This should really be a macro        */
   4042 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
   4043 /* anyway */
   4044 static
   4045 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
   4046 #ifdef UCOL_DEBUG
   4047     fprintf(stderr, ".");
   4048 #endif
   4049     uint8_t *newStart = NULL;
   4050     uint32_t offset = (uint32_t)(*secondaries-secStart);
   4051 
   4052     if(secStart==second) {
   4053         newStart=(uint8_t*)uprv_malloc(newSize);
   4054         if(newStart==NULL) {
   4055             *status = U_MEMORY_ALLOCATION_ERROR;
   4056             return NULL;
   4057         }
   4058         uprv_memcpy(newStart, secStart, *secondaries-secStart);
   4059     } else {
   4060         newStart=(uint8_t*)uprv_realloc(secStart, newSize);
   4061         if(newStart==NULL) {
   4062             *status = U_MEMORY_ALLOCATION_ERROR;
   4063             /* Since we're reallocating, return original reference so we don't loose it. */
   4064             return secStart;
   4065         }
   4066     }
   4067     *secondaries=newStart+offset;
   4068     *secSize=newSize;
   4069     return newStart;
   4070 }
   4071 
   4072 
   4073 /* This should really be a macro                                                                      */
   4074 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
   4075 /* secondaries in French                                                                              */
   4076 /*
   4077 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
   4078   uint8_t temp;
   4079   while(start<end) {
   4080     temp = *start;
   4081     *start++ = *end;
   4082     *end-- = temp;
   4083   }
   4084 }
   4085 */
   4086 
   4087 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \
   4088   TYPE tempA; \
   4089 while((start)<(end)) { \
   4090     tempA = *(start); \
   4091     *(start)++ = *(end); \
   4092     *(end)-- = tempA; \
   4093 } \
   4094 }
   4095 
   4096 /****************************************************************************/
   4097 /* Following are the sortkey generation functions                           */
   4098 /*                                                                          */
   4099 /****************************************************************************/
   4100 
   4101 /**
   4102  * Merge two sort keys.
   4103  * This is useful, for example, to combine sort keys from first and last names
   4104  * to sort such pairs.
   4105  * Merged sort keys consider on each collation level the first part first entirely,
   4106  * then the second one.
   4107  * It is possible to merge multiple sort keys by consecutively merging
   4108  * another one with the intermediate result.
   4109  *
   4110  * The length of the merge result is the sum of the lengths of the input sort keys
   4111  * minus 1.
   4112  *
   4113  * @param src1 the first sort key
   4114  * @param src1Length the length of the first sort key, including the zero byte at the end;
   4115  *        can be -1 if the function is to find the length
   4116  * @param src2 the second sort key
   4117  * @param src2Length the length of the second sort key, including the zero byte at the end;
   4118  *        can be -1 if the function is to find the length
   4119  * @param dest the buffer where the merged sort key is written,
   4120  *        can be NULL if destCapacity==0
   4121  * @param destCapacity the number of bytes in the dest buffer
   4122  * @return the length of the merged sort key, src1Length+src2Length-1;
   4123  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
   4124  *         in which cases the contents of dest is undefined
   4125  *
   4126  * @draft
   4127  */
   4128 U_CAPI int32_t U_EXPORT2
   4129 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
   4130                    const uint8_t *src2, int32_t src2Length,
   4131                    uint8_t *dest, int32_t destCapacity) {
   4132     int32_t destLength;
   4133     uint8_t b;
   4134 
   4135     /* check arguments */
   4136     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
   4137         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
   4138         destCapacity<0 || (destCapacity>0 && dest==NULL)
   4139     ) {
   4140         /* error, attempt to write a zero byte and return 0 */
   4141         if(dest!=NULL && destCapacity>0) {
   4142             *dest=0;
   4143         }
   4144         return 0;
   4145     }
   4146 
   4147     /* check lengths and capacity */
   4148     if(src1Length<0) {
   4149         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
   4150     }
   4151     if(src2Length<0) {
   4152         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
   4153     }
   4154 
   4155     destLength=src1Length+src2Length-1;
   4156     if(destLength>destCapacity) {
   4157         /* the merged sort key does not fit into the destination */
   4158         return destLength;
   4159     }
   4160 
   4161     /* merge the sort keys with the same number of levels */
   4162     while(*src1!=0 && *src2!=0) { /* while both have another level */
   4163         /* copy level from src1 not including 00 or 01 */
   4164         while((b=*src1)>=2) {
   4165             ++src1;
   4166             *dest++=b;
   4167         }
   4168 
   4169         /* add a 02 merge separator */
   4170         *dest++=2;
   4171 
   4172         /* copy level from src2 not including 00 or 01 */
   4173         while((b=*src2)>=2) {
   4174             ++src2;
   4175             *dest++=b;
   4176         }
   4177 
   4178         /* if both sort keys have another level, then add a 01 level separator and continue */
   4179         if(*src1==1 && *src2==1) {
   4180             ++src1;
   4181             ++src2;
   4182             *dest++=1;
   4183         }
   4184     }
   4185 
   4186     /*
   4187      * here, at least one sort key is finished now, but the other one
   4188      * might have some contents left from containing more levels;
   4189      * that contents is just appended to the result
   4190      */
   4191     if(*src1!=0) {
   4192         /* src1 is not finished, therefore *src2==0, and src1 is appended */
   4193         src2=src1;
   4194     }
   4195     /* append src2, "the other, unfinished sort key" */
   4196     uprv_strcpy((char *)dest, (const char *)src2);
   4197 
   4198     /* trust that neither sort key contained illegally embedded zero bytes */
   4199     return destLength;
   4200 }
   4201 
   4202 /* sortkey API */
   4203 U_CAPI int32_t U_EXPORT2
   4204 ucol_getSortKey(const    UCollator    *coll,
   4205         const    UChar        *source,
   4206         int32_t        sourceLength,
   4207         uint8_t        *result,
   4208         int32_t        resultLength)
   4209 {
   4210     UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
   4211     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   4212         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
   4213             ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
   4214     }
   4215 
   4216     UErrorCode status = U_ZERO_ERROR;
   4217     int32_t keySize   = 0;
   4218 
   4219     if(source != NULL) {
   4220         // source == NULL is actually an error situation, but we would need to
   4221         // have an error code to return it. Until we introduce a new
   4222         // API, it stays like this
   4223 
   4224         /* this uses the function pointer that is set in updateinternalstate */
   4225         /* currently, there are two funcs: */
   4226         /*ucol_calcSortKey(...);*/
   4227         /*ucol_calcSortKeySimpleTertiary(...);*/
   4228 
   4229         keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
   4230         //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) {
   4231             // That's not good. Something unusual happened.
   4232             // We don't know how much we initialized before we failed.
   4233             // NULL terminate for safety.
   4234             // We have no way say that we have generated a partial sort key.
   4235             //result[0] = 0;
   4236             //keySize = 0;
   4237         //}
   4238     }
   4239     UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
   4240     UTRACE_EXIT_STATUS(status);
   4241     return keySize;
   4242 }
   4243 
   4244 /* this function is called by the C++ API for sortkey generation */
   4245 U_CFUNC int32_t
   4246 ucol_getSortKeyWithAllocation(const UCollator *coll,
   4247                               const UChar *source, int32_t sourceLength,
   4248                               uint8_t **pResult,
   4249                               UErrorCode *pErrorCode) {
   4250     *pResult = 0;
   4251     return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
   4252 }
   4253 
   4254 #define UCOL_FSEC_BUF_SIZE 256
   4255 
   4256 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
   4257 /* or if we run out of space while making a sortkey and want to return ASAP                                   */
   4258 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
   4259     UErrorCode status = U_ZERO_ERROR;
   4260     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
   4261     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
   4262     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
   4263     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
   4264     UBool  compareIdent = (strength == UCOL_IDENTICAL);
   4265     UBool  doCase = (coll->caseLevel == UCOL_ON);
   4266     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
   4267     //UBool  qShifted = shifted  && (compareQuad == 0);
   4268     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
   4269     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
   4270     uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
   4271     uint8_t *fSecs = fSecsBuff;
   4272     uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
   4273     uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
   4274 
   4275     uint32_t variableTopValue = coll->variableTopValue;
   4276     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
   4277     if(doHiragana) {
   4278         UCOL_COMMON_BOT4++;
   4279         /* allocate one more space for hiragana */
   4280     }
   4281     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
   4282 
   4283     uint32_t order = UCOL_NO_MORE_CES;
   4284     uint8_t primary1 = 0;
   4285     uint8_t primary2 = 0;
   4286     uint8_t secondary = 0;
   4287     uint8_t tertiary = 0;
   4288     int32_t caseShift = 0;
   4289     uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
   4290 
   4291     uint8_t caseSwitch = coll->caseSwitch;
   4292     uint8_t tertiaryMask = coll->tertiaryMask;
   4293     uint8_t tertiaryCommon = coll->tertiaryCommon;
   4294 
   4295     UBool wasShifted = FALSE;
   4296     UBool notIsContinuation = FALSE;
   4297     uint8_t leadPrimary = 0;
   4298 
   4299 
   4300     for(;;) {
   4301         order = ucol_IGetNextCE(coll, s, &status);
   4302         if(order == UCOL_NO_MORE_CES) {
   4303             break;
   4304         }
   4305 
   4306         if(order == 0) {
   4307             continue;
   4308         }
   4309 
   4310         notIsContinuation = !isContinuation(order);
   4311 
   4312 
   4313         if(notIsContinuation) {
   4314             tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
   4315         } else {
   4316             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   4317         }
   4318         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4319         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4320         primary1 = (uint8_t)(order >> 8);
   4321 
   4322 
   4323         if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
   4324             || (!notIsContinuation && wasShifted))
   4325             || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
   4326                 /* and other ignorables should be removed if following a shifted code point */
   4327                 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
   4328                     /* we should just completely ignore it */
   4329                     continue;
   4330                 }
   4331                 if(compareQuad == 0) {
   4332                     if(c4 > 0) {
   4333                         currentSize += (c2/UCOL_BOT_COUNT4)+1;
   4334                         c4 = 0;
   4335                     }
   4336                     currentSize++;
   4337                     if(primary2 != 0) {
   4338                         currentSize++;
   4339                     }
   4340                 }
   4341                 wasShifted = TRUE;
   4342         } else {
   4343             wasShifted = FALSE;
   4344             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   4345             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
   4346             /* calculate sortkey size */
   4347             if(primary1 != UCOL_IGNORABLE) {
   4348                 if(notIsContinuation) {
   4349                     if(leadPrimary == primary1) {
   4350                         currentSize++;
   4351                     } else {
   4352                         if(leadPrimary != 0) {
   4353                             currentSize++;
   4354                         }
   4355                         if(primary2 == UCOL_IGNORABLE) {
   4356                             /* one byter, not compressed */
   4357                             currentSize++;
   4358                             leadPrimary = 0;
   4359                         }
   4360                         else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
   4361                             //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
   4362                             //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
   4363                             (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary))
   4364                         {
   4365                             /* not compressible */
   4366                             leadPrimary = 0;
   4367                             currentSize+=2;
   4368                         }
   4369                         else { /* compress */
   4370                             leadPrimary = primary1;
   4371                             currentSize+=2;
   4372                         }
   4373                     }
   4374                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   4375                     currentSize++;
   4376                     if(primary2 != UCOL_IGNORABLE) {
   4377                         currentSize++;
   4378                     }
   4379                 }
   4380             }
   4381 
   4382             if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
   4383                 if(!isFrenchSec){
   4384                     if (secondary == UCOL_COMMON2 && notIsContinuation) {
   4385                         c2++;
   4386                     } else {
   4387                         if(c2 > 0) {
   4388                             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4389                                 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
   4390                             } else {
   4391                                 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
   4392                             }
   4393                             c2 = 0;
   4394                         }
   4395                         currentSize++;
   4396                     }
   4397                 } else {
   4398                     fSecs[fSecsLen++] = secondary;
   4399                     if(fSecsLen == fSecsMaxLen) {
   4400                         uint8_t *fSecsTemp;
   4401                         if(fSecs == fSecsBuff) {
   4402                             fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen);
   4403                         } else {
   4404                             fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
   4405                         }
   4406                         if(fSecsTemp == NULL) {
   4407                             status = U_MEMORY_ALLOCATION_ERROR;
   4408                             return 0;
   4409                         }
   4410                         fSecs = fSecsTemp;
   4411                         fSecsMaxLen *= 2;
   4412                     }
   4413                     if(notIsContinuation) {
   4414                         if (frenchStartPtr != NULL) {
   4415                             /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4416                             uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4417                             frenchStartPtr = NULL;
   4418                         }
   4419                     } else {
   4420                         if (frenchStartPtr == NULL) {
   4421                             frenchStartPtr = fSecs+fSecsLen-2;
   4422                         }
   4423                         frenchEndPtr = fSecs+fSecsLen-1;
   4424                     }
   4425                 }
   4426             }
   4427 
   4428             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
   4429                 // do the case level if we need to do it. We don't want to calculate
   4430                 // case level for primary ignorables if we have only primary strength and case level
   4431                 // otherwise we would break well formedness of CEs
   4432                 if (caseShift  == 0) {
   4433                     currentSize++;
   4434                     caseShift = UCOL_CASE_SHIFT_START;
   4435                 }
   4436                 if((tertiary&0x3F) > 0 && notIsContinuation) {
   4437                     caseShift--;
   4438                     if((tertiary &0xC0) != 0) {
   4439                         if (caseShift  == 0) {
   4440                             currentSize++;
   4441                             caseShift = UCOL_CASE_SHIFT_START;
   4442                         }
   4443                         caseShift--;
   4444                     }
   4445                 }
   4446             } else {
   4447                 if(notIsContinuation) {
   4448                     tertiary ^= caseSwitch;
   4449                 }
   4450             }
   4451 
   4452             tertiary &= tertiaryMask;
   4453             if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
   4454                 if (tertiary == tertiaryCommon && notIsContinuation) {
   4455                     c3++;
   4456                 } else {
   4457                     if(c3 > 0) {
   4458                         if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
   4459                             || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
   4460                                 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
   4461                         } else {
   4462                             currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
   4463                         }
   4464                         c3 = 0;
   4465                     }
   4466                     currentSize++;
   4467                 }
   4468             }
   4469 
   4470             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
   4471                 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   4472                     if(c4>0) { // Close this part
   4473                         currentSize += (c4/UCOL_BOT_COUNT4)+1;
   4474                         c4 = 0;
   4475                     }
   4476                     currentSize++; // Add the Hiragana
   4477                 } else { // This wasn't Hiragana, so we can continue adding stuff
   4478                     c4++;
   4479                 }
   4480             }
   4481         }
   4482     }
   4483 
   4484     if(!isFrenchSec){
   4485         if(c2 > 0) {
   4486             currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
   4487         }
   4488     } else {
   4489         uint32_t i = 0;
   4490         if(frenchStartPtr != NULL) {
   4491             uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4492         }
   4493         for(i = 0; i<fSecsLen; i++) {
   4494             secondary = *(fSecs+fSecsLen-i-1);
   4495             /* This is compression code. */
   4496             if (secondary == UCOL_COMMON2) {
   4497                 ++c2;
   4498             } else {
   4499                 if(c2 > 0) {
   4500                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4501                         currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
   4502                     } else {
   4503                         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
   4504                     }
   4505                     c2 = 0;
   4506                 }
   4507                 currentSize++;
   4508             }
   4509         }
   4510         if(c2 > 0) {
   4511             currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
   4512         }
   4513         if(fSecs != fSecsBuff) {
   4514             uprv_free(fSecs);
   4515         }
   4516     }
   4517 
   4518     if(c3 > 0) {
   4519         currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
   4520     }
   4521 
   4522     if(c4 > 0  && compareQuad == 0) {
   4523         currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
   4524     }
   4525 
   4526     if(compareIdent) {
   4527         currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
   4528     }
   4529     return currentSize;
   4530 }
   4531 
   4532 static
   4533 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
   4534     if (caseShift  == 0) {
   4535         *(*cases)++ = UCOL_CASE_BYTE_START;
   4536         caseShift = UCOL_CASE_SHIFT_START;
   4537     }
   4538 }
   4539 
   4540 // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we
   4541 // know how many values we wanted to add, even if we didn't add them all
   4542 static
   4543 inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
   4544     size++;
   4545     if(primaries < limit) {
   4546         *(primaries)++ = value;
   4547     }
   4548 }
   4549 
   4550 // Packs the secondary buffer when processing French locale. Adds the terminator.
   4551 static
   4552 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
   4553     uint8_t secondary;
   4554     int32_t count2 = 0;
   4555     uint32_t i = 0, size = 0;
   4556     // we use i here since the key size already accounts for terminators, so we'll discard the increment
   4557     addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
   4558     /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
   4559     if(frenchStartPtr != NULL) {
   4560         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4561     }
   4562     for(i = 0; i<*secsize; i++) {
   4563         secondary = *(secondaries-i-1);
   4564         /* This is compression code. */
   4565         if (secondary == UCOL_COMMON2) {
   4566             ++count2;
   4567         } else {
   4568             if (count2 > 0) {
   4569                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4570                     while (count2 > UCOL_TOP_COUNT2) {
   4571                         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
   4572                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4573                     }
   4574                     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
   4575                 } else {
   4576                     while (count2 > UCOL_BOT_COUNT2) {
   4577                         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   4578                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4579                     }
   4580                     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   4581                 }
   4582                 count2 = 0;
   4583             }
   4584             addWithIncrement(primaries, primEnd, size, secondary);
   4585         }
   4586     }
   4587     if (count2 > 0) {
   4588         while (count2 > UCOL_BOT_COUNT2) {
   4589             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   4590             count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4591         }
   4592         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   4593     }
   4594     *secsize = size;
   4595     return primaries;
   4596 }
   4597 
   4598 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
   4599 
   4600 /* This is the sortkey work horse function */
        /**
         * Builds the sort key for source. Primary bytes are written straight into the
         * result buffer; the secondary, case, tertiary and quaternary bytes are collected
         * in side buffers and appended afterwards, each level preceded by
         * UCOL_LEVELTERMINATOR. The key always ends with a 0 byte.
         *
         * @param coll             collator supplying strength and the attribute settings read below
         * @param source           text to process
         * @param sourceLength     length of source, or -1 to have it measured with u_strlen
         * @param result           in/out: *result is the destination buffer. When
         *                         allocateSKBuffer is TRUE, *result is replaced on exit by a
         *                         uprv_malloc'ed buffer of exactly the returned size
         *                         (presumably owned by the caller - confirm against callers).
         * @param resultLength     capacity of *result in bytes
         * @param allocateSKBuffer TRUE: this function may grow/allocate the key buffer;
         *                         FALSE: a too-small buffer yields U_BUFFER_OVERFLOW_ERROR
         * @param status           error code; nothing is done if it already indicates a failure
         * @return the key size including the terminating 0 byte; the value of
         *         ucol_getSortKeySize() when resultLength is 0 or there is no buffer;
         *         DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY when a buffer could not be grown;
         *         0 when status was already a failure on entry.
         */
   4601 U_CFUNC int32_t U_CALLCONV
   4602 ucol_calcSortKey(const    UCollator    *coll,
   4603         const    UChar        *source,
   4604         int32_t        sourceLength,
   4605         uint8_t        **result,
   4606         uint32_t        resultLength,
   4607         UBool allocateSKBuffer,
   4608         UErrorCode *status)
   4609 {
   4610     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
   4611 
   4612     uint32_t i = 0; /* general purpose counter */
   4613 
   4614     /* Stack allocated buffers for buffers we use */
   4615     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
   4616 
   4617     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
   4618 
   4619     if(U_FAILURE(*status)) {
   4620         return 0;
   4621     }
   4622 
            /* No caller buffer but we may allocate: start on the stack buffer and copy out at the end. */
   4623     if(primaries == NULL && allocateSKBuffer == TRUE) {
   4624         primaries = *result = prim;
   4625         resultLength = UCOL_PRIMARY_MAX_BUFFER;
   4626     }
   4627 
   4628     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
   4629       caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
   4630 
   4631     uint32_t sortKeySize = 1; /* it is always \0 terminated */
   4632 
   4633     UnicodeString normSource;
   4634 
   4635     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
   4636 
   4637     UColAttributeValue strength = coll->strength;
   4638 
            /* compareXxx is 0 when the level is part of the key and 0xFF when it is not, */
            /* so "byte > compareXxx" can never be true for a level that is switched off. */
   4639     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
   4640     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
   4641     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
   4642     UBool  compareIdent = (strength == UCOL_IDENTICAL);
   4643     UBool  doCase = (coll->caseLevel == UCOL_ON);
   4644     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
   4645     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
   4646     //UBool  qShifted = shifted && (compareQuad == 0);
   4647     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
   4648     /*const uint8_t *scriptOrder = coll->scriptOrder;*/
   4649 
   4650     uint32_t variableTopValue = coll->variableTopValue;
   4651     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
   4652     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
   4653     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
   4654     uint8_t UCOL_HIRAGANA_QUAD = 0;
   4655     if(doHiragana) {
   4656         UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
   4657         /* allocate one more space for hiragana, value for hiragana */
   4658     }
   4659     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
   4660 
   4661     /* support for special features like caselevel and funky secondaries */
   4662     uint8_t *frenchStartPtr = NULL;
   4663     uint8_t *frenchEndPtr = NULL;
   4664     uint32_t caseShift = 0;
   4665 
            /* one byte (the level terminator) for every level that will be appended after the primaries */
   4666     sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
   4667 
   4668     /* If we need to normalize, we'll do it all at once at the beginning! */
   4669     const Normalizer2 *norm2;
   4670     if(compareIdent) {
   4671         norm2 = Normalizer2Factory::getNFDInstance(*status);
   4672     } else if(coll->normalizationMode != UCOL_OFF) {
   4673         norm2 = Normalizer2Factory::getFCDInstance(*status);
   4674     } else {
   4675         norm2 = NULL;
   4676     }
   4677     if(norm2 != NULL) {
   4678         normSource.setTo(FALSE, source, len);
   4679         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   4680         if(qcYesLength != len) {
                    /* only the part after the quick-check-yes prefix is normalized; */
                    /* from here on source/len refer to the normalized copy */
   4681             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   4682             normSource.truncate(qcYesLength);
   4683             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   4684             source = normSource.getBuffer();
   4685             len = normSource.length();
   4686         }
   4687     }
   4688     collIterate s;
   4689     IInit_collIterate(coll, source, len, &s, status);
   4690     if(U_FAILURE(*status)) {
   4691         return 0;
   4692     }
   4693     if(source == normSource.getBuffer()) {
                /* the text was normalized above, so clear the iterator's UCOL_ITER_NORM flag */
   4694         s.flags &= ~UCOL_ITER_NORM;
   4695     }
   4696 
            /* nothing to write into: only report the size */
   4697     if(resultLength == 0 || primaries == NULL) {
   4698         return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
   4699     }
            /* The loop below may write two primary bytes before it re-checks the limit, */
            /* so the "safe end" is kept short of the real end of the buffer.            */
   4700     uint8_t *primarySafeEnd = primaries + resultLength - 1;
   4701     if(strength > UCOL_PRIMARY) {
   4702         primarySafeEnd--;
   4703     }
   4704 
   4705     uint32_t minBufferSize = UCOL_MAX_BUFFER;
   4706 
   4707     uint8_t *primStart = primaries;
   4708     uint8_t *secStart = secondaries;
   4709     uint8_t *terStart = tertiaries;
   4710     uint8_t *caseStart = cases;
   4711     uint8_t *quadStart = quads;
   4712 
   4713     uint32_t order = 0;
   4714 
   4715     uint8_t primary1 = 0;
   4716     uint8_t primary2 = 0;
   4717     uint8_t secondary = 0;
   4718     uint8_t tertiary = 0;
   4719     uint8_t caseSwitch = coll->caseSwitch;
   4720     uint8_t tertiaryMask = coll->tertiaryMask;
   4721     int8_t tertiaryAddition = coll->tertiaryAddition;
   4722     uint8_t tertiaryTop = coll->tertiaryTop;
   4723     uint8_t tertiaryBottom = coll->tertiaryBottom;
   4724     uint8_t tertiaryCommon = coll->tertiaryCommon;
   4725     uint8_t caseBits = 0;
   4726 
   4727     UBool finished = FALSE;
   4728     UBool wasShifted = FALSE;
   4729     UBool notIsContinuation = FALSE;
   4730 
   4731     uint32_t prevBuffSize = 0;
   4732 
            /* run lengths of "common" weights not yet written, one counter per compressed level */
   4733     uint32_t count2 = 0, count3 = 0, count4 = 0;
   4734     uint8_t leadPrimary = 0;
   4735 
            /* Outer loop: one pass per side-buffer size. The inner loop consumes CEs until    */
            /* minBufferSize of them were seen, then the side buffers are doubled and we go on. */
   4736     for(;;) {
   4737         for(i=prevBuffSize; i<minBufferSize; ++i) {
   4738 
   4739             order = ucol_IGetNextCE(coll, &s, status);
   4740             if(order == UCOL_NO_MORE_CES) {
   4741                 finished = TRUE;
   4742                 break;
   4743             }
   4744 
   4745             if(order == 0) {
   4746                 continue;
   4747             }
   4748 
   4749             notIsContinuation = !isContinuation(order);
   4750 
   4751             if(notIsContinuation) {
   4752                 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
   4753             } else {
   4754                 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   4755             }
   4756 
                    /* order is shifted in place: after these three lines it holds the 16 primary bits */
   4757             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4758             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4759             primary1 = (uint8_t)(order >> 8);
   4760 
   4761             /*if(notIsContinuation && scriptOrder != NULL) {
   4762             primary1 = scriptOrder[primary1];
   4763             }*/
   4764 
   4765             if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
   4766                 || (!notIsContinuation && wasShifted))
   4767                 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   4768             {
   4769                 /* and other ignorables should be removed if following a shifted code point */
   4770                 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
   4771                     /* we should just completely ignore it */
   4772                     continue;
   4773                 }
   4774                 if(compareQuad == 0) {
   4775                     if(count4 > 0) {
   4776                         while (count4 > UCOL_BOT_COUNT4) {
   4777                             *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   4778                             count4 -= UCOL_BOT_COUNT4;
   4779                         }
   4780                         *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
   4781                         count4 = 0;
   4782                     }
   4783                     /* We are dealing with a variable and we're treating them as shifted */
   4784                     /* This is a shifted ignorable */
   4785                     if(primary1 != 0) { /* we need to check this since we could be in continuation */
   4786                         *quads++ = primary1;
   4787                     }
   4788                     if(primary2 != 0) {
   4789                         *quads++ = primary2;
   4790                     }
   4791                 }
   4792                 wasShifted = TRUE;
   4793             } else {
   4794                 wasShifted = FALSE;
   4795                 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   4796                 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
                        /* presumably be 0 (see the "one byter" case below).                                                               */
   4797                 /* regular and simple sortkey calc */
   4798                 if(primary1 != UCOL_IGNORABLE) {
   4799                     if(notIsContinuation) {
   4800                         if(leadPrimary == primary1) {
                                    /* same lead byte as the previous primary: only the second byte is stored */
   4801                             *primaries++ = primary2;
   4802                         } else {
   4803                             if(leadPrimary != 0) {
                                        /* close the compressed run with a marker that sorts on the correct side */
   4804                                 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
   4805                             }
   4806                             if(primary2 == UCOL_IGNORABLE) {
   4807                                 /* one byter, not compressed */
   4808                                 *primaries++ = primary1;
   4809                                 leadPrimary = 0;
   4810                             } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
   4811                                 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
   4812                                 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
   4813                                     /* not compressible */
   4814                                     leadPrimary = 0;
   4815                                     *primaries++ = primary1;
   4816                                     if(primaries <= primarySafeEnd) {
   4817                                         *primaries++ = primary2;
   4818                                     }
   4819                             } else { /* compress */
   4820                                 *primaries++ = leadPrimary = primary1;
   4821                                 if(primaries <= primarySafeEnd) {
   4822                                     *primaries++ = primary2;
   4823                                 }
   4824                             }
   4825                         }
   4826                     } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   4827                         *primaries++ = primary1;
   4828                         if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) {
   4829                                 *primaries++ = primary2; /* second part */
   4830                         }
   4831                     }
   4832                 }
   4833 
   4834                 if(secondary > compareSec) {
   4835                     if(!isFrenchSec) {
   4836                         /* This is compression code. */
   4837                         if (secondary == UCOL_COMMON2 && notIsContinuation) {
   4838                             ++count2;
   4839                         } else {
   4840                             if (count2 > 0) {
   4841                                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4842                                     while (count2 > UCOL_TOP_COUNT2) {
   4843                                         *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   4844                                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4845                                     }
   4846                                     *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
   4847                                 } else {
   4848                                     while (count2 > UCOL_BOT_COUNT2) {
   4849                                         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4850                                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4851                                     }
   4852                                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   4853                                 }
   4854                                 count2 = 0;
   4855                             }
   4856                             *secondaries++ = secondary;
   4857                         }
   4858                     } else {
   4859                         *secondaries++ = secondary;
   4860                         /* Do the special handling for French secondaries */
   4861                         /* We need to get continuation elements and do intermediate restore */
   4862                         /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
   4863                         if(notIsContinuation) {
   4864                             if (frenchStartPtr != NULL) {
   4865                                 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4866                                 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4867                                 frenchStartPtr = NULL;
   4868                             }
   4869                         } else {
   4870                             if (frenchStartPtr == NULL) {
   4871                                 frenchStartPtr = secondaries - 2;
   4872                             }
   4873                             frenchEndPtr = secondaries-1;
   4874                         }
   4875                     }
   4876                 }
   4877 
   4878                 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
   4879                     // do the case level if we need to do it. We don't want to calculate
   4880                     // case level for primary ignorables if we have only primary strength and case level
   4881                     // otherwise we would break well formedness of CEs
   4882                     doCaseShift(&cases, caseShift);
   4883                     if(notIsContinuation) {
   4884                         caseBits = (uint8_t)(tertiary & 0xC0);
   4885 
   4886                         if(tertiary != 0) {
   4887                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   4888                                 if((caseBits & 0xC0) == 0) {
   4889                                     *(cases-1) |= 1 << (--caseShift);
   4890                                 } else {
   4891                                     *(cases-1) |= 0 << (--caseShift);
   4892                                     /* second bit */
   4893                                     doCaseShift(&cases, caseShift);
   4894                                     *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
   4895                                 }
   4896                             } else {
   4897                                 if((caseBits & 0xC0) == 0) {
   4898                                     *(cases-1) |= 0 << (--caseShift);
   4899                                 } else {
   4900                                     *(cases-1) |= 1 << (--caseShift);
   4901                                     /* second bit */
   4902                                     doCaseShift(&cases, caseShift);
   4903                                     *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
   4904                                 }
   4905                             }
   4906                         }
   4907 
   4908                     }
   4909                 } else {
   4910                     if(notIsContinuation) {
   4911                         tertiary ^= caseSwitch;
   4912                     }
   4913                 }
   4914 
   4915                 tertiary &= tertiaryMask;
   4916                 if(tertiary > compareTer) {
   4917                     /* This is compression code. */
   4918                     /* sequence size check is included in the if clause */
   4919                     if (tertiary == tertiaryCommon && notIsContinuation) {
   4920                         ++count3;
   4921                     } else {
   4922                         if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   4923                             tertiary += tertiaryAddition;
   4924                         } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   4925                             tertiary -= tertiaryAddition;
   4926                         }
   4927                         if (count3 > 0) {
   4928                             if ((tertiary > tertiaryCommon)) {
   4929                                 while (count3 > coll->tertiaryTopCount) {
   4930                                     *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   4931                                     count3 -= (uint32_t)coll->tertiaryTopCount;
   4932                                 }
   4933                                 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
   4934                             } else {
   4935                                 while (count3 > coll->tertiaryBottomCount) {
   4936                                     *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   4937                                     count3 -= (uint32_t)coll->tertiaryBottomCount;
   4938                                 }
   4939                                 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   4940                             }
   4941                             count3 = 0;
   4942                         }
   4943                         *tertiaries++ = tertiary;
   4944                     }
   4945                 }
   4946 
   4947                 if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
   4948                     if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   4949                         if(count4>0) { // Close this part
   4950                             while (count4 > UCOL_BOT_COUNT4) {
   4951                                 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   4952                                 count4 -= UCOL_BOT_COUNT4;
   4953                             }
   4954                             *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
   4955                             count4 = 0;
   4956                         }
   4957                         *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
   4958                     } else { // This wasn't Hiragana, so we can continue adding stuff
   4959                         count4++;
   4960                     }
   4961                 }
   4962             }
   4963 
   4964             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
   4965                 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
                            /* restart the iterator and only measure the key, so the caller learns the required size */
   4966                     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   4967                     if(U_FAILURE(*status)) {
   4968                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   4969                         finished = TRUE;
   4970                         break;
   4971                     }
   4972                     if(source == normSource.getBuffer()) {
   4973                         s.flags &= ~UCOL_ITER_NORM;
   4974                     }
   4975                     sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
   4976                     *status = U_BUFFER_OVERFLOW_ERROR;
   4977                     finished = TRUE;
   4978                     break;
   4979                 } else { /* It's much nicer if we can actually reallocate */
   4980                     int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart));
   4981                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
   4982                     if(U_SUCCESS(*status)) {
   4983                         *result = primStart;
   4984                         primarySafeEnd = primStart + resultLength - 1;
   4985                         if(strength > UCOL_PRIMARY) {
   4986                             primarySafeEnd--;
   4987                         }
   4988                     } else {
   4989                         /* We ran out of memory!? We can't recover. */
   4990                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   4991                         finished = TRUE;
   4992                         break;
   4993                     }
   4994                 }
   4995             }
   4996         }
   4997         if(finished) {
   4998             break;
   4999         } else {
   5000             prevBuffSize = minBufferSize;
   5001 
                    /* the French pointers point into the secondary buffer: keep them as offsets across the reallocation */
   5002             uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
   5003             if (frenchStartPtr != NULL) {
   5004                 frenchStartOffset = (uint32_t)(frenchStartPtr - secStart);
   5005                 frenchEndOffset = (uint32_t)(frenchEndPtr - secStart);
   5006             }
   5007             secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
   5008             terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
   5009             caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
   5010             quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
   5011             if(U_FAILURE(*status)) {
   5012                 /* We ran out of memory!? We can't recover. */
   5013                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5014                 break;
   5015             }
   5016             if (frenchStartPtr != NULL) {
   5017                 frenchStartPtr = secStart + frenchStartOffset;
   5018                 frenchEndPtr = secStart + frenchEndOffset;
   5019             }
   5020             minBufferSize *= 2;
   5021         }
   5022     }
   5023 
   5024     /* Here, we are generally done with processing */
   5025     /* bailing out would not be too productive */
   5026 
   5027     if(U_SUCCESS(*status)) {
   5028         sortKeySize += (uint32_t)(primaries - primStart);
   5029         /* we have done all the CE's, now let's put them together to form a key */
   5030         if(compareSec == 0) {
   5031             if (count2 > 0) {
   5032                 while (count2 > UCOL_BOT_COUNT2) {
   5033                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5034                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5035                 }
   5036                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   5037             }
   5038             uint32_t secsize = (uint32_t)(secondaries-secStart);
   5039             if(!isFrenchSec) { // Regular situation, we know the length of secondaries
   5040                 sortKeySize += secsize;
   5041                 if(sortKeySize <= resultLength) {
   5042                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5043                     uprv_memcpy(primaries, secStart, secsize);
   5044                     primaries += secsize;
   5045                 } else {
   5046                     if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
   5047                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5048                         if(U_SUCCESS(*status)) {
   5049                             *result = primStart;
   5050                             *(primaries++) = UCOL_LEVELTERMINATOR;
   5051                             uprv_memcpy(primaries, secStart, secsize);
   5052                             primaries += secsize;
   5053                         }
   5054                         else {
   5055                             /* We ran out of memory!? We can't recover. */
   5056                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5057                             goto cleanup;
   5058                         }
   5059                     } else {
   5060                         *status = U_BUFFER_OVERFLOW_ERROR;
   5061                     }
   5062                 }
   5063             } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
   5064                 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
   5065                 sortKeySize += secsize;
   5066                 if(sortKeySize <= resultLength) { // if we managed to pack fine
   5067                     primaries = newPrim; // update the primary pointer
   5068                 } else { // overflow, need to reallocate and redo
   5069                     if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
   5070                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5071                         if(U_SUCCESS(*status)) {
                                    /* keep *result in step with the buffer, as every other reallocation site does; */
                                    /* later reallocateBuffer() calls pass *result as the current buffer start      */
                                    *result = primStart;
                                    /* NOTE(review): secsize was already overwritten by the first packFrench() call */
                                    /* above; confirm that packFrench() may be run a second time with that value.   */
   5072                             primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
   5073                         }
   5074                         else {
   5075                             /* We ran out of memory!? We can't recover. */
   5076                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5077                             goto cleanup;
   5078                         }
   5079                     } else {
   5080                         *status = U_BUFFER_OVERFLOW_ERROR;
   5081                     }
   5082                 }
   5083             }
   5084         }
   5085 
   5086         if(doCase) {
   5087             uint32_t casesize = (uint32_t)(cases - caseStart);
   5088             sortKeySize += casesize;
   5089             if(sortKeySize <= resultLength) {
   5090                 *(primaries++) = UCOL_LEVELTERMINATOR;
   5091                 uprv_memcpy(primaries, caseStart, casesize);
   5092                 primaries += casesize;
   5093             } else {
   5094                 if(allocateSKBuffer == TRUE) {
   5095                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5096                     if(U_SUCCESS(*status)) {
   5097                         *result = primStart;
   5098                         *(primaries++) = UCOL_LEVELTERMINATOR;
   5099                         uprv_memcpy(primaries, caseStart, casesize);
                                /* step past the copied level so the following levels and the final 0 are appended, not written over it */
                                primaries += casesize;
   5100                     }
   5101                     else {
   5102                         /* We ran out of memory!? We can't recover. */
   5103                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5104                         goto cleanup;
   5105                     }
   5106                 } else {
   5107                     *status = U_BUFFER_OVERFLOW_ERROR;
   5108                 }
   5109             }
   5110         }
   5111 
   5112         if(compareTer == 0) {
   5113             if (count3 > 0) {
                        /* NOTE(review): this final flush uses ">=" and "tertiaryTop - count3" where the in-loop flush */
                        /* uses ">" and "count3-1". Left untouched because changing it would change generated keys.    */
   5114                 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
   5115                     while (count3 >= coll->tertiaryTopCount) {
   5116                         *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   5117                         count3 -= (uint32_t)coll->tertiaryTopCount;
   5118                     }
   5119                     *tertiaries++ = (uint8_t)(tertiaryTop - count3);
   5120                 } else {
   5121                     while (count3 > coll->tertiaryBottomCount) {
   5122                         *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   5123                         count3 -= (uint32_t)coll->tertiaryBottomCount;
   5124                     }
   5125                     *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   5126                 }
   5127             }
   5128             uint32_t tersize = (uint32_t)(tertiaries - terStart);
   5129             sortKeySize += tersize;
   5130             if(sortKeySize <= resultLength) {
   5131                 *(primaries++) = UCOL_LEVELTERMINATOR;
   5132                 uprv_memcpy(primaries, terStart, tersize);
   5133                 primaries += tersize;
   5134             } else {
   5135                 if(allocateSKBuffer == TRUE) {
   5136                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5137                     if(U_SUCCESS(*status)) {
   5138                         *result = primStart;
   5139                         *(primaries++) = UCOL_LEVELTERMINATOR;
   5140                         uprv_memcpy(primaries, terStart, tersize);
                                /* step past the copied level (same as the non-reallocating branch) */
                                primaries += tersize;
   5141                     }
   5142                     else {
   5143                         /* We ran out of memory!? We can't recover. */
   5144                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5145                         goto cleanup;
   5146                     }
   5147                 } else {
   5148                     *status = U_BUFFER_OVERFLOW_ERROR;
   5149                 }
   5150             }
   5151 
   5152             if(compareQuad == 0/*qShifted == TRUE*/) {
   5153                 if(count4 > 0) {
   5154                     while (count4 > UCOL_BOT_COUNT4) {
   5155                         *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   5156                         count4 -= UCOL_BOT_COUNT4;
   5157                     }
   5158                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
   5159                 }
   5160                 uint32_t quadsize = (uint32_t)(quads - quadStart);
   5161                 sortKeySize += quadsize;
   5162                 if(sortKeySize <= resultLength) {
   5163                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5164                     uprv_memcpy(primaries, quadStart, quadsize);
   5165                     primaries += quadsize;
   5166                 } else {
   5167                     if(allocateSKBuffer == TRUE) {
   5168                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5169                         if(U_SUCCESS(*status)) {
   5170                             *result = primStart;
   5171                             *(primaries++) = UCOL_LEVELTERMINATOR;
   5172                             uprv_memcpy(primaries, quadStart, quadsize);
                                    /* step past the copied level (same as the non-reallocating branch) */
                                    primaries += quadsize;
   5173                         }
   5174                         else {
   5175                             /* We ran out of memory!? We can't recover. */
   5176                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5177                             goto cleanup;
   5178                         }
   5179                     } else {
   5180                         *status = U_BUFFER_OVERFLOW_ERROR;
   5181                     }
   5182                 }
   5183             }
   5184 
   5185             if(compareIdent) {
   5186                 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
   5187                 if(sortKeySize <= resultLength) {
   5188                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5189                     primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
   5190                 } else {
   5191                     if(allocateSKBuffer == TRUE) {
   5192                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
   5193                         if(U_SUCCESS(*status)) {
   5194                             *result = primStart;
   5195                             *(primaries++) = UCOL_LEVELTERMINATOR;
                                    /* advance by the bytes written; otherwise the final 0 below lands on the */
                                    /* first byte of the identical level                                      */
   5196                             primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
   5197                         }
   5198                         else {
   5199                             /* We ran out of memory!? We can't recover. */
   5200                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5201                             goto cleanup;
   5202                         }
   5203                     } else {
   5204                         *status = U_BUFFER_OVERFLOW_ERROR;
   5205                     }
   5206                 }
   5207             }
   5208         }
   5209         *(primaries++) = '\0';
   5210     }
   5211 
            /* When we own the buffer, hand back an exactly sized uprv_malloc'ed copy and */
            /* release the working buffer unless it is the stack array.                    */
   5212     if(allocateSKBuffer == TRUE) {
   5213         *result = (uint8_t*)uprv_malloc(sortKeySize);
   5214         /* test for NULL */
   5215         if (*result == NULL) {
   5216             *status = U_MEMORY_ALLOCATION_ERROR;
   5217             goto cleanup;
   5218         }
   5219         uprv_memcpy(*result, primStart, sortKeySize);
   5220         if(primStart != prim) {
   5221             uprv_free(primStart);
   5222         }
   5223     }
   5224 
   5225 cleanup:
   5226     if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
   5227         /* NULL terminate for safety */
   5228         **result = 0;
   5229     }
            /* the four side buffers are always grown together, so checking one of them is enough */
   5230     if(terStart != tert) {
   5231         uprv_free(terStart);
   5232         uprv_free(secStart);
   5233         uprv_free(caseStart);
   5234         uprv_free(quadStart);
   5235     }
   5236 
   5237     /* To avoid memory leak, free the offset buffer if necessary. */
   5238     ucol_freeOffsetBuffer(&s);
   5239 
   5240     return sortKeySize;
   5241 }
   5242 
   5243 
   5244 U_CFUNC int32_t U_CALLCONV
   5245 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
   5246         const    UChar        *source,
   5247         int32_t        sourceLength,
   5248         uint8_t        **result,
   5249         uint32_t        resultLength,
   5250         UBool allocateSKBuffer,
   5251         UErrorCode *status)
   5252 {
   5253     U_ALIGN_CODE(16);
   5254 
   5255     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
   5256     uint32_t i = 0; /* general purpose counter */
   5257 
   5258     /* Stack allocated buffers for buffers we use */
   5259     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
   5260 
   5261     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
   5262 
   5263     if(U_FAILURE(*status)) {
   5264         return 0;
   5265     }
   5266 
   5267     if(primaries == NULL && allocateSKBuffer == TRUE) {
   5268         primaries = *result = prim;
   5269         resultLength = UCOL_PRIMARY_MAX_BUFFER;
   5270     }
   5271 
   5272     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
   5273 
   5274     uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
   5275 
   5276     UnicodeString normSource;
   5277 
   5278     int32_t len =  sourceLength;
   5279 
   5280     /* If we need to normalize, we'll do it all at once at the beginning! */
   5281     if(coll->normalizationMode != UCOL_OFF) {
   5282         normSource.setTo(len < 0, source, len);
   5283         const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
   5284         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   5285         if(qcYesLength != normSource.length()) {
   5286             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   5287             normSource.truncate(qcYesLength);
   5288             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   5289             source = normSource.getBuffer();
   5290             len = normSource.length();
   5291         }
   5292     }
   5293     collIterate s;
   5294     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   5295     if(U_FAILURE(*status)) {
   5296         return 0;
   5297     }
   5298     if(source == normSource.getBuffer()) {
   5299         s.flags &= ~UCOL_ITER_NORM;
   5300     }
   5301 
   5302     if(resultLength == 0 || primaries == NULL) {
   5303         return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
   5304     }
   5305 
   5306     uint8_t *primarySafeEnd = primaries + resultLength - 2;
   5307 
   5308     uint32_t minBufferSize = UCOL_MAX_BUFFER;
   5309 
   5310     uint8_t *primStart = primaries;
   5311     uint8_t *secStart = secondaries;
   5312     uint8_t *terStart = tertiaries;
   5313 
   5314     uint32_t order = 0;
   5315 
   5316     uint8_t primary1 = 0;
   5317     uint8_t primary2 = 0;
   5318     uint8_t secondary = 0;
   5319     uint8_t tertiary = 0;
   5320     uint8_t caseSwitch = coll->caseSwitch;
   5321     uint8_t tertiaryMask = coll->tertiaryMask;
   5322     int8_t tertiaryAddition = coll->tertiaryAddition;
   5323     uint8_t tertiaryTop = coll->tertiaryTop;
   5324     uint8_t tertiaryBottom = coll->tertiaryBottom;
   5325     uint8_t tertiaryCommon = coll->tertiaryCommon;
   5326 
   5327     uint32_t prevBuffSize = 0;
   5328 
   5329     UBool finished = FALSE;
   5330     UBool notIsContinuation = FALSE;
   5331 
   5332     uint32_t count2 = 0, count3 = 0;
   5333     uint8_t leadPrimary = 0;
   5334 
   5335     for(;;) {
   5336         for(i=prevBuffSize; i<minBufferSize; ++i) {
   5337 
   5338             order = ucol_IGetNextCE(coll, &s, status);
   5339 
   5340             if(order == 0) {
   5341                 continue;
   5342             }
   5343 
   5344             if(order == UCOL_NO_MORE_CES) {
   5345                 finished = TRUE;
   5346                 break;
   5347             }
   5348 
   5349             notIsContinuation = !isContinuation(order);
   5350 
   5351             if(notIsContinuation) {
   5352                 tertiary = (uint8_t)((order & tertiaryMask));
   5353             } else {
   5354                 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   5355             }
   5356             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5357             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5358             primary1 = (uint8_t)(order >> 8);
   5359 
   5360             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   5361             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
   5362             /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
   5363             /* regular and simple sortkey calc */
   5364             if(primary1 != UCOL_IGNORABLE) {
   5365                 if(notIsContinuation) {
   5366                     if(leadPrimary == primary1) {
   5367                         *primaries++ = primary2;
   5368                     } else {
   5369                         if(leadPrimary != 0) {
   5370                             *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
   5371                         }
   5372                         if(primary2 == UCOL_IGNORABLE) {
   5373                             /* one byter, not compressed */
   5374                             *primaries++ = primary1;
   5375                             leadPrimary = 0;
   5376                         } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
   5377                             //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
   5378                             //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
   5379                             (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
   5380                                 /* not compressible */
   5381                                 leadPrimary = 0;
   5382                                 *primaries++ = primary1;
   5383                                 *primaries++ = primary2;
   5384                         } else { /* compress */
   5385                             *primaries++ = leadPrimary = primary1;
   5386                             *primaries++ = primary2;
   5387                         }
   5388                     }
   5389                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   5390                     *primaries++ = primary1;
   5391                     if(primary2 != UCOL_IGNORABLE) {
   5392                         *primaries++ = primary2; /* second part */
   5393                     }
   5394                 }
   5395             }
   5396 
   5397             if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
   5398                 /* This is compression code. */
   5399                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
   5400                     ++count2;
   5401                 } else {
   5402                     if (count2 > 0) {
   5403                         if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   5404                             while (count2 > UCOL_TOP_COUNT2) {
   5405                                 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   5406                                 count2 -= (uint32_t)UCOL_TOP_COUNT2;
   5407                             }
   5408                             *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
   5409                         } else {
   5410                             while (count2 > UCOL_BOT_COUNT2) {
   5411                                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5412                                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5413                             }
   5414                             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   5415                         }
   5416                         count2 = 0;
   5417                     }
   5418                     *secondaries++ = secondary;
   5419                 }
   5420             }
   5421 
   5422             if(notIsContinuation) {
   5423                 tertiary ^= caseSwitch;
   5424             }
   5425 
   5426             if(tertiary > 0) {
   5427                 /* This is compression code. */
   5428                 /* sequence size check is included in the if clause */
   5429                 if (tertiary == tertiaryCommon && notIsContinuation) {
   5430                     ++count3;
   5431                 } else {
   5432                     if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   5433                         tertiary += tertiaryAddition;
   5434                     } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   5435                         tertiary -= tertiaryAddition;
   5436                     }
   5437                     if (count3 > 0) {
   5438                         if ((tertiary > tertiaryCommon)) {
   5439                             while (count3 > coll->tertiaryTopCount) {
   5440                                 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   5441                                 count3 -= (uint32_t)coll->tertiaryTopCount;
   5442                             }
   5443                             *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
   5444                         } else {
   5445                             while (count3 > coll->tertiaryBottomCount) {
   5446                                 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   5447                                 count3 -= (uint32_t)coll->tertiaryBottomCount;
   5448                             }
   5449                             *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   5450                         }
   5451                         count3 = 0;
   5452                     }
   5453                     *tertiaries++ = tertiary;
   5454                 }
   5455             }
   5456 
   5457             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
   5458                 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
   5459                     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   5460                     if(U_FAILURE(*status)) {
   5461                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5462                         finished = TRUE;
   5463                         break;
   5464                     }
   5465                     if(source == normSource.getBuffer()) {
   5466                         s.flags &= ~UCOL_ITER_NORM;
   5467                     }
   5468                     sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
   5469                     *status = U_BUFFER_OVERFLOW_ERROR;
   5470                     finished = TRUE;
   5471                     break;
   5472                 } else { /* It's much nicer if we can actually reallocate */
   5473                     int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart));
   5474                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
   5475                     if(U_SUCCESS(*status)) {
   5476                         *result = primStart;
   5477                         primarySafeEnd = primStart + resultLength - 2;
   5478                     } else {
   5479                         /* We ran out of memory!? We can't recover. */
   5480                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5481                         finished = TRUE;
   5482                         break;
   5483                     }
   5484                 }
   5485             }
   5486         }
   5487         if(finished) {
   5488             break;
   5489         } else {
   5490             prevBuffSize = minBufferSize;
   5491             secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
   5492             terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
   5493             minBufferSize *= 2;
   5494             if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
   5495                 /* We ran out of memory!? We can't recover. */
   5496                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5497                 break;
   5498             }
   5499         }
   5500     }
   5501 
   5502     if(U_SUCCESS(*status)) {
   5503         sortKeySize += (uint32_t)(primaries - primStart);
   5504         /* we have done all the CE's, now let's put them together to form a key */
   5505         if (count2 > 0) {
   5506             while (count2 > UCOL_BOT_COUNT2) {
   5507                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5508                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5509             }
   5510             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   5511         }
   5512         uint32_t secsize = (uint32_t)(secondaries-secStart);
   5513         sortKeySize += secsize;
   5514         if(sortKeySize <= resultLength) {
   5515             *(primaries++) = UCOL_LEVELTERMINATOR;
   5516             uprv_memcpy(primaries, secStart, secsize);
   5517             primaries += secsize;
   5518         } else {
   5519             if(allocateSKBuffer == TRUE) {
   5520                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5521                 if(U_SUCCESS(*status)) {
   5522                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5523                     *result = primStart;
   5524                     uprv_memcpy(primaries, secStart, secsize);
   5525                 }
   5526                 else {
   5527                     /* We ran out of memory!? We can't recover. */
   5528                     sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5529                     goto cleanup;
   5530                 }
   5531             } else {
   5532                 *status = U_BUFFER_OVERFLOW_ERROR;
   5533             }
   5534         }
   5535 
   5536         if (count3 > 0) {
   5537             if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
   5538                 while (count3 >= coll->tertiaryTopCount) {
   5539                     *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   5540                     count3 -= (uint32_t)coll->tertiaryTopCount;
   5541                 }
   5542                 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
   5543             } else {
   5544                 while (count3 > coll->tertiaryBottomCount) {
   5545                     *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   5546                     count3 -= (uint32_t)coll->tertiaryBottomCount;
   5547                 }
   5548                 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   5549             }
   5550         }
   5551         uint32_t tersize = (uint32_t)(tertiaries - terStart);
   5552         sortKeySize += tersize;
   5553         if(sortKeySize <= resultLength) {
   5554             *(primaries++) = UCOL_LEVELTERMINATOR;
   5555             uprv_memcpy(primaries, terStart, tersize);
   5556             primaries += tersize;
   5557         } else {
   5558             if(allocateSKBuffer == TRUE) {
   5559                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5560                 if(U_SUCCESS(*status)) {
   5561                     *result = primStart;
   5562                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5563                     uprv_memcpy(primaries, terStart, tersize);
   5564                 }
   5565                 else {
   5566                     /* We ran out of memory!? We can't recover. */
   5567                     sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5568                     goto cleanup;
   5569                 }
   5570             } else {
   5571                 *status = U_MEMORY_ALLOCATION_ERROR;
   5572             }
   5573         }
   5574 
   5575         *(primaries++) = '\0';
   5576     }
   5577 
   5578     if(allocateSKBuffer == TRUE) {
   5579         *result = (uint8_t*)uprv_malloc(sortKeySize);
   5580         /* test for NULL */
   5581         if (*result == NULL) {
   5582             *status = U_MEMORY_ALLOCATION_ERROR;
   5583             goto cleanup;
   5584         }
   5585         uprv_memcpy(*result, primStart, sortKeySize);
   5586         if(primStart != prim) {
   5587             uprv_free(primStart);
   5588         }
   5589     }
   5590 
   5591 cleanup:
   5592     if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
   5593         /* NULL terminate for safety */
   5594         **result = 0;
   5595     }
   5596     if(terStart != tert) {
   5597         uprv_free(terStart);
   5598         uprv_free(secStart);
   5599     }
   5600 
   5601     /* To avoid memory leak, free the offset buffer if necessary. */
   5602     ucol_freeOffsetBuffer(&s);
   5603 
   5604     return sortKeySize;
   5605 }
   5606 
   5607 static inline
   5608 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
   5609     UBool notIsContinuation = !isContinuation(CE);
   5610     uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
   5611     if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
   5612         || (!notIsContinuation && *wasShifted))
   5613         || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   5614     {
   5615         // The stuff below should probably be in the sortkey code... maybe not...
   5616         if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
   5617             /* we should just completely ignore it */
   5618             *wasShifted = TRUE;
   5619             //continue;
   5620         }
   5621         //*wasShifted = TRUE;
   5622         return TRUE;
   5623     } else {
   5624         *wasShifted = FALSE;
   5625         return FALSE;
   5626     }
   5627 }
   5628 static inline
   5629 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
   5630     if(level < maxLevel) {
   5631         dest[i++] = UCOL_LEVELTERMINATOR;
   5632     } else {
   5633         dest[i++] = 0;
   5634     }
   5635 }
   5636 
   5637 /** enumeration of level identifiers for partial sort key generation */
   5638 enum {
   5639   UCOL_PSK_PRIMARY = 0,
   5640     UCOL_PSK_SECONDARY = 1,
   5641     UCOL_PSK_CASE = 2,
   5642     UCOL_PSK_TERTIARY = 3,
   5643     UCOL_PSK_QUATERNARY = 4,
   5644     UCOL_PSK_QUIN = 5,      /** This is an extra level, not used - but we have three bits to blow */
   5645     UCOL_PSK_IDENTICAL = 6,
   5646     UCOL_PSK_NULL = 7,      /** level for the end of sort key. Will just produce zeros */
   5647     UCOL_PSK_LIMIT
   5648 };
   5649 
   5650 /** collation state enum. *_SHIFT value is how much to shift right
   5651  *  to get the state piece to the right. *_MASK value should be
   5652  *  ANDed with the shifted state. This data is stored in state[1]
   5653  *  field.
   5654  */
   5655 enum {
   5656     UCOL_PSK_LEVEL_SHIFT = 0,      /** level identificator. stores an enum value from above */
   5657     UCOL_PSK_LEVEL_MASK = 7,       /** three bits */
   5658     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */
   5659     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,
   5660     /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
   5661      *  This field is also used to denote that the French secondary level is finished
   5662      */
   5663     UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */
   5664     UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */
   5665     UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */
   5666     UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */
   5667     /** When we do French we need to reverse secondary values. However, continuations
   5668      *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
   5669      */
   5670     UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
   5671     UCOL_PSK_BOCSU_BYTES_MASK = 3,
   5672     UCOL_PSK_CONSUMED_CES_SHIFT = 9,
   5673     UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
   5674 };
   5675 
   5676 // macro calculating the number of expansion CEs available
   5677 #define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn
   5678 
   5679 
   5680 /** main sortkey part procedure. On the first call,
   5681  *  you should pass in a collator, an iterator, empty state
   5682  *  state[0] == state[1] == 0, a buffer to hold results
   5683  *  number of bytes you need and an error code pointer.
   5684  *  Make sure your buffer is big enough to hold the wanted
   5685  *  number of sortkey bytes. I don't check.
   5686  *  The only meaningful status you can get back is
   5687  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
   5688  *  have been dealt a raw deal and that you probably won't
   5689  *  be able to use partial sortkey generation for this
   5690  *  particular combination of string and collator. This
   5691  *  is highly unlikely, but you should still check the error code.
   5692  *  Any other status means that you're not in a sane situation
   5693  *  anymore. After the first call, preserve state values and
   5694  *  use them on subsequent calls to obtain more bytes of a sortkey.
   5695  *  Use until the number of bytes written is smaller than the requested
   5696  *  number of bytes. Generated sortkey is not compatible with the
   5697  *  one generated by ucol_getSortKey, as we don't do any compression.
   5698  *  However, levels are still terminated by a 1 (one) and the sortkey
   5699  *  is terminated by a 0 (zero). Identical level is the same as in the
   5700  *  regular sortkey - internal bocu-1 implementation is used.
   5701  *  For curious, although you cannot do much about this, here is
   5702  *  the structure of state words.
   5703  *  state[0] - iterator state. Depends on the iterator implementation,
   5704  *             but allows the iterator to continue where it stopped in
   5705  *             the last iteration.
   5706  *  state[1] - collation processing state. Here is the distribution
   5707  *             of the bits:
   5708  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
   5709  *             quaternary, quin (we don't use this one), identical and
   5710  *             null (producing only zeroes - first one to terminate the
   5711  *             sortkey and subsequent to fill the buffer).
   5712  *   3       - byte count. Number of bytes written on the primary level.
   5713  *   4       - was shifted. Whether the previous iteration finished in the
   5714  *             shifted state.
   5715  *   5, 6    - French continuation bytes written. See the comment in the enum
   5716  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
   5717  *             the identical level.
   5718  *   9..31   - CEs consumed. Number of getCE or next32 operations performed
   5719  *             since the last successful update of the iterator state.
   5720  */
   5721 U_CAPI int32_t U_EXPORT2
   5722 ucol_nextSortKeyPart(const UCollator *coll,
   5723                      UCharIterator *iter,
   5724                      uint32_t state[2],
   5725                      uint8_t *dest, int32_t count,
   5726                      UErrorCode *status)
   5727 {
   5728     /* error checking */
   5729     if(status==NULL || U_FAILURE(*status)) {
   5730         return 0;
   5731     }
   5732     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
   5733     if( coll==NULL || iter==NULL ||
   5734         state==NULL ||
   5735         count<0 || (count>0 && dest==NULL)
   5736     ) {
   5737         *status=U_ILLEGAL_ARGUMENT_ERROR;
   5738         UTRACE_EXIT_STATUS(status);
   5739         return 0;
   5740     }
   5741 
   5742     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
   5743                   coll, iter, state[0], state[1], dest, count);
   5744 
   5745     if(count==0) {
   5746         /* nothing to do */
   5747         UTRACE_EXIT_VALUE(0);
   5748         return 0;
   5749     }
   5750     /** Setting up situation according to the state we got from the previous iteration */
   5751     // The state of the iterator from the previous invocation
   5752     uint32_t iterState = state[0];
   5753     // Has the last iteration ended in the shifted state
   5754     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
   5755     // What is the current level of the sortkey?
   5756     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
   5757     // Have we written only one byte from a two byte primary in the previous iteration?
   5758     // Also on secondary level - have we finished with the French secondary?
   5759     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
   5760     // number of bytes in the continuation buffer for French
   5761     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
   5762     // Number of bytes already written from a bocsu sequence. Since
   5763     // the longes bocsu sequence is 4 long, this can be up to 3.
   5764     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
   5765     // Number of elements that need to be consumed in this iteration because
   5766     // the iterator returned UITER_NO_STATE at the end of the last iteration,
   5767     // so we had to save the last valid state.
   5768     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
   5769 
   5770     /** values that depend on the collator attributes */
   5771     // strength of the collator.
   5772     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
   5773     // maximal level of the partial sortkey. Need to take whether case level is done
   5774     int32_t maxLevel = 0;
   5775     if(strength < UCOL_TERTIARY) {
   5776         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   5777             maxLevel = UCOL_PSK_CASE;
   5778         } else {
   5779             maxLevel = strength;
   5780         }
   5781     } else {
   5782         if(strength == UCOL_TERTIARY) {
   5783             maxLevel = UCOL_PSK_TERTIARY;
   5784         } else if(strength == UCOL_QUATERNARY) {
   5785             maxLevel = UCOL_PSK_QUATERNARY;
   5786         } else { // identical
   5787             maxLevel = UCOL_IDENTICAL;
   5788         }
   5789     }
   5790     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
   5791     uint8_t UCOL_HIRAGANA_QUAD =
   5792       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
   5793     // Boundary value that decides whether a CE is shifted or not
   5794     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
   5795     // Are we doing French collation?
   5796     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
   5797 
   5798     /** initializing the collation state */
   5799     UBool notIsContinuation = FALSE;
   5800     uint32_t CE = UCOL_NO_MORE_CES;
   5801 
   5802     collIterate s;
   5803     IInit_collIterate(coll, NULL, -1, &s, status);
   5804     if(U_FAILURE(*status)) {
   5805         UTRACE_EXIT_STATUS(*status);
   5806         return 0;
   5807     }
   5808     s.iterator = iter;
   5809     s.flags |= UCOL_USE_ITERATOR;
   5810     // This variable tells us whether we have produced some other levels in this iteration
   5811     // before we moved to the identical level. In that case, we need to switch the
   5812     // type of the iterator.
   5813     UBool doingIdenticalFromStart = FALSE;
   5814     // Normalizing iterator
   5815     // The division for the array length may truncate the array size to
   5816     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   5817     // for all platforms anyway.
   5818     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   5819     UNormIterator *normIter = NULL;
   5820     // If the normalization is turned on for the collator and we are below identical level
   5821     // we will use a FCD normalizing iterator
   5822     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
   5823         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5824         s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
   5825         s.flags &= ~UCOL_ITER_NORM;
   5826         if(U_FAILURE(*status)) {
   5827             UTRACE_EXIT_STATUS(*status);
   5828             return 0;
   5829         }
   5830     } else if(level == UCOL_PSK_IDENTICAL) {
   5831         // for identical level, we need a NFD iterator. We need to instantiate it here, since we
   5832         // will be updating the state - and this cannot be done on an ordinary iterator.
   5833         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5834         s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   5835         s.flags &= ~UCOL_ITER_NORM;
   5836         if(U_FAILURE(*status)) {
   5837             UTRACE_EXIT_STATUS(*status);
   5838             return 0;
   5839         }
   5840         doingIdenticalFromStart = TRUE;
   5841     }
   5842 
   5843     // This is the tentative new state of the iterator. The problem
   5844     // is that the iterator might return an undefined state, in
   5845     // which case we should save the last valid state and increase
   5846     // the iterator skip value.
   5847     uint32_t newState = 0;
   5848 
   5849     // First, we set the iterator to the last valid position
   5850     // from the last iteration. This was saved in state[0].
   5851     if(iterState == 0) {
   5852         /* initial state */
   5853         if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
   5854             s.iterator->move(s.iterator, 0, UITER_LIMIT);
   5855         } else {
   5856             s.iterator->move(s.iterator, 0, UITER_START);
   5857         }
   5858     } else {
   5859         /* reset to previous state */
   5860         s.iterator->setState(s.iterator, iterState, status);
   5861         if(U_FAILURE(*status)) {
   5862             UTRACE_EXIT_STATUS(*status);
   5863             return 0;
   5864         }
   5865     }
   5866 
   5867 
   5868 
   5869     // This variable tells us whether we can attempt to update the state
   5870     // of iterator. Situations where we don't want to update iterator state
   5871     // are the existence of expansion CEs that are not yet processed, and
   5872     // finishing the case level without enough space in the buffer to insert
   5873     // a level terminator.
   5874     UBool canUpdateState = TRUE;
   5875 
   5876     // Consume all the CEs that were consumed at the end of the previous
   5877     // iteration without updating the iterator state. On identical level,
   5878     // consume the code points.
   5879     int32_t counter = cces;
   5880     if(level < UCOL_PSK_IDENTICAL) {
   5881         while(counter-->0) {
   5882             // If we're doing French and we are on the secondary level,
   5883             // we go backwards.
   5884             if(level == UCOL_PSK_SECONDARY && doingFrench) {
   5885                 CE = ucol_IGetPrevCE(coll, &s, status);
   5886             } else {
   5887                 CE = ucol_IGetNextCE(coll, &s, status);
   5888             }
   5889             if(CE==UCOL_NO_MORE_CES) {
   5890                 /* should not happen */
   5891                 *status=U_INTERNAL_PROGRAM_ERROR;
   5892                 UTRACE_EXIT_STATUS(*status);
   5893                 return 0;
   5894             }
   5895             if(uprv_numAvailableExpCEs(s)) {
   5896                 canUpdateState = FALSE;
   5897             }
   5898         }
   5899     } else {
   5900         while(counter-->0) {
   5901             uiter_next32(s.iterator);
   5902         }
   5903     }
   5904 
   5905     // French secondary needs to know whether the iterator state of zero came from previous level OR
   5906     // from a new invocation...
   5907     UBool wasDoingPrimary = FALSE;
   5908     // destination buffer byte counter. When this guy
   5909     // gets to count, we're done with the iteration
   5910     int32_t i = 0;
   5911     // used to count the zero bytes written after we
   5912     // have finished with the sort key
   5913     int32_t j = 0;
   5914 
   5915 
   5916     // Hm.... I think we're ready to plunge in. Basic story is as following:
   5917     // we have a fall through case based on level. This is used for initial
   5918     // positioning on iteration start. Every level processor contains a
   5919     // for(;;) which will be broken when we exhaust all the CEs. Other
   5920     // way to exit is a goto saveState, which happens when we have filled
   5921     // out our buffer.
   5922     switch(level) {
   5923     case UCOL_PSK_PRIMARY:
   5924         wasDoingPrimary = TRUE;
   5925         for(;;) {
   5926             if(i==count) {
   5927                 goto saveState;
   5928             }
   5929             // We should save the state only if we
   5930             // are sure that we are done with the
   5931             // previous iterator state
   5932             if(canUpdateState && byteCountOrFrenchDone == 0) {
   5933                 newState = s.iterator->getState(s.iterator);
   5934                 if(newState != UITER_NO_STATE) {
   5935                     iterState = newState;
   5936                     cces = 0;
   5937                 }
   5938             }
   5939             CE = ucol_IGetNextCE(coll, &s, status);
   5940             cces++;
   5941             if(CE==UCOL_NO_MORE_CES) {
   5942                 // Add the level separator
   5943                 terminatePSKLevel(level, maxLevel, i, dest);
   5944                 byteCountOrFrenchDone=0;
   5945                 // Restart the iteration and move to the
   5946                 // second level
   5947                 s.iterator->move(s.iterator, 0, UITER_START);
   5948                 cces = 0;
   5949                 level = UCOL_PSK_SECONDARY;
   5950                 break;
   5951             }
   5952             if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5953                 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
   5954                 if(CE != 0) {
   5955                     if(byteCountOrFrenchDone == 0) {
   5956                         // get the second byte of primary
   5957                         dest[i++]=(uint8_t)(CE >> 8);
   5958                     } else {
   5959                         byteCountOrFrenchDone = 0;
   5960                     }
   5961                     if((CE &=0xff)!=0) {
   5962                         if(i==count) {
   5963                             /* overflow */
   5964                             byteCountOrFrenchDone = 1;
   5965                             cces--;
   5966                             goto saveState;
   5967                         }
   5968                         dest[i++]=(uint8_t)CE;
   5969                     }
   5970                 }
   5971             }
   5972             if(uprv_numAvailableExpCEs(s)) {
   5973                 canUpdateState = FALSE;
   5974             } else {
   5975                 canUpdateState = TRUE;
   5976             }
   5977         }
   5978         /* fall through to next level */
   5979     case UCOL_PSK_SECONDARY:
   5980         if(strength >= UCOL_SECONDARY) {
   5981             if(!doingFrench) {
   5982                 for(;;) {
   5983                     if(i == count) {
   5984                         goto saveState;
   5985                     }
   5986                     // We should save the state only if we
   5987                     // are sure that we are done with the
   5988                     // previous iterator state
   5989                     if(canUpdateState) {
   5990                         newState = s.iterator->getState(s.iterator);
   5991                         if(newState != UITER_NO_STATE) {
   5992                             iterState = newState;
   5993                             cces = 0;
   5994                         }
   5995                     }
   5996                     CE = ucol_IGetNextCE(coll, &s, status);
   5997                     cces++;
   5998                     if(CE==UCOL_NO_MORE_CES) {
   5999                         // Add the level separator
   6000                         terminatePSKLevel(level, maxLevel, i, dest);
   6001                         byteCountOrFrenchDone = 0;
   6002                         // Restart the iteration and move to the
   6003                         // next level
   6004                         s.iterator->move(s.iterator, 0, UITER_START);
   6005                         cces = 0;
   6006                         level = UCOL_PSK_CASE;
   6007                         break;
   6008                     }
   6009                     if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6010                         CE >>= 8; /* get secondary */
   6011                         if(CE != 0) {
   6012                             dest[i++]=(uint8_t)CE;
   6013                         }
   6014                     }
   6015                     if(uprv_numAvailableExpCEs(s)) {
   6016                         canUpdateState = FALSE;
   6017                     } else {
   6018                         canUpdateState = TRUE;
   6019                     }
   6020                 }
   6021             } else { // French secondary processing
   6022                 uint8_t frenchBuff[UCOL_MAX_BUFFER];
   6023                 int32_t frenchIndex = 0;
   6024                 // Here we are going backwards.
   6025                 // If the iterator is at the beginning, it should be
   6026                 // moved to end.
   6027                 if(wasDoingPrimary) {
   6028                     s.iterator->move(s.iterator, 0, UITER_LIMIT);
   6029                     cces = 0;
   6030                 }
   6031                 for(;;) {
   6032                     if(i == count) {
   6033                         goto saveState;
   6034                     }
   6035                     if(canUpdateState) {
   6036                         newState = s.iterator->getState(s.iterator);
   6037                         if(newState != UITER_NO_STATE) {
   6038                             iterState = newState;
   6039                             cces = 0;
   6040                         }
   6041                     }
   6042                     CE = ucol_IGetPrevCE(coll, &s, status);
   6043                     cces++;
   6044                     if(CE==UCOL_NO_MORE_CES) {
   6045                         // Add the level separator
   6046                         terminatePSKLevel(level, maxLevel, i, dest);
   6047                         byteCountOrFrenchDone = 0;
   6048                         // Restart the iteration and move to the next level
   6049                         s.iterator->move(s.iterator, 0, UITER_START);
   6050                         level = UCOL_PSK_CASE;
   6051                         break;
   6052                     }
   6053                     if(isContinuation(CE)) { // if it's a continuation, we want to save it and
   6054                         // reverse when we get a first non-continuation CE.
   6055                         CE >>= 8;
   6056                         frenchBuff[frenchIndex++] = (uint8_t)CE;
   6057                     } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6058                         CE >>= 8; /* get secondary */
   6059                         if(!frenchIndex) {
   6060                             if(CE != 0) {
   6061                                 dest[i++]=(uint8_t)CE;
   6062                             }
   6063                         } else {
   6064                             frenchBuff[frenchIndex++] = (uint8_t)CE;
   6065                             frenchIndex -= usedFrench;
   6066                             usedFrench = 0;
   6067                             while(i < count && frenchIndex) {
   6068                                 dest[i++] = frenchBuff[--frenchIndex];
   6069                                 usedFrench++;
   6070                             }
   6071                         }
   6072                     }
   6073                     if(uprv_numAvailableExpCEs(s)) {
   6074                         canUpdateState = FALSE;
   6075                     } else {
   6076                         canUpdateState = TRUE;
   6077                     }
   6078                 }
   6079             }
   6080         } else {
   6081             level = UCOL_PSK_CASE;
   6082         }
   6083         /* fall through to next level */
   6084     case UCOL_PSK_CASE:
   6085         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   6086             uint32_t caseShift = UCOL_CASE_SHIFT_START;
   6087             uint8_t caseByte = UCOL_CASE_BYTE_START;
   6088             uint8_t caseBits = 0;
   6089 
   6090             for(;;) {
   6091                 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
   6092                 if(i == count) {
   6093                     goto saveState;
   6094                 }
   6095                 // We should save the state only if we
   6096                 // are sure that we are done with the
   6097                 // previous iterator state
   6098                 if(canUpdateState) {
   6099                     newState = s.iterator->getState(s.iterator);
   6100                     if(newState != UITER_NO_STATE) {
   6101                         iterState = newState;
   6102                         cces = 0;
   6103                     }
   6104                 }
   6105                 CE = ucol_IGetNextCE(coll, &s, status);
   6106                 cces++;
   6107                 if(CE==UCOL_NO_MORE_CES) {
   6108                     // On the case level we might have an unfinished
   6109                     // case byte. Add one if it's started.
   6110                     if(caseShift != UCOL_CASE_SHIFT_START) {
   6111                         dest[i++] = caseByte;
   6112                     }
   6113                     cces = 0;
   6114                     // We have finished processing CEs on this level.
   6115                     // However, we don't know if we have enough space
   6116                     // to add a case level terminator.
   6117                     if(i < count) {
   6118                         // Add the level separator
   6119                         terminatePSKLevel(level, maxLevel, i, dest);
   6120                         // Restart the iteration and move to the
   6121                         // next level
   6122                         s.iterator->move(s.iterator, 0, UITER_START);
   6123                         level = UCOL_PSK_TERTIARY;
   6124                     } else {
   6125                         canUpdateState = FALSE;
   6126                     }
   6127                     break;
   6128                 }
   6129 
   6130                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6131                     if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
   6132                         // do the case level if we need to do it. We don't want to calculate
   6133                         // case level for primary ignorables if we have only primary strength and case level
   6134                         // otherwise we would break well formedness of CEs
   6135                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   6136                         caseBits = (uint8_t)(CE & 0xC0);
   6137                         // this copies the case level logic from the
   6138                         // sort key generation code
   6139                         if(CE != 0) {
   6140                             if (caseShift == 0) {
   6141                                 dest[i++] = caseByte;
   6142                                 caseShift = UCOL_CASE_SHIFT_START;
   6143                                 caseByte = UCOL_CASE_BYTE_START;
   6144                             }
   6145                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6146                                 if((caseBits & 0xC0) == 0) {
   6147                                     caseByte |= 1 << (--caseShift);
   6148                                 } else {
   6149                                     caseByte |= 0 << (--caseShift);
   6150                                     /* second bit */
   6151                                     if(caseShift == 0) {
   6152                                         dest[i++] = caseByte;
   6153                                         caseShift = UCOL_CASE_SHIFT_START;
   6154                                         caseByte = UCOL_CASE_BYTE_START;
   6155                                     }
   6156                                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
   6157                                 }
   6158                             } else {
   6159                                 if((caseBits & 0xC0) == 0) {
   6160                                     caseByte |= 0 << (--caseShift);
   6161                                 } else {
   6162                                     caseByte |= 1 << (--caseShift);
   6163                                     /* second bit */
   6164                                     if(caseShift == 0) {
   6165                                         dest[i++] = caseByte;
   6166                                         caseShift = UCOL_CASE_SHIFT_START;
   6167                                         caseByte = UCOL_CASE_BYTE_START;
   6168                                     }
   6169                                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
   6170                                 }
   6171                             }
   6172                         }
   6173 
   6174                     }
   6175                 }
   6176                 // Not sure this is correct for the case level - revisit
   6177                 if(uprv_numAvailableExpCEs(s)) {
   6178                     canUpdateState = FALSE;
   6179                 } else {
   6180                     canUpdateState = TRUE;
   6181                 }
   6182             }
   6183         } else {
   6184             level = UCOL_PSK_TERTIARY;
   6185         }
   6186         /* fall through to next level */
   6187     case UCOL_PSK_TERTIARY:
   6188         if(strength >= UCOL_TERTIARY) {
   6189             for(;;) {
   6190                 if(i == count) {
   6191                     goto saveState;
   6192                 }
   6193                 // We should save the state only if we
   6194                 // are sure that we are done with the
   6195                 // previous iterator state
   6196                 if(canUpdateState) {
   6197                     newState = s.iterator->getState(s.iterator);
   6198                     if(newState != UITER_NO_STATE) {
   6199                         iterState = newState;
   6200                         cces = 0;
   6201                     }
   6202                 }
   6203                 CE = ucol_IGetNextCE(coll, &s, status);
   6204                 cces++;
   6205                 if(CE==UCOL_NO_MORE_CES) {
   6206                     // Add the level separator
   6207                     terminatePSKLevel(level, maxLevel, i, dest);
   6208                     byteCountOrFrenchDone = 0;
   6209                     // Restart the iteration and move to the
   6210                     // next level
   6211                     s.iterator->move(s.iterator, 0, UITER_START);
   6212                     cces = 0;
   6213                     level = UCOL_PSK_QUATERNARY;
   6214                     break;
   6215                 }
   6216                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6217                     notIsContinuation = !isContinuation(CE);
   6218 
   6219                     if(notIsContinuation) {
   6220                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   6221                         CE ^= coll->caseSwitch;
   6222                         CE &= coll->tertiaryMask;
   6223                     } else {
   6224                         CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   6225                     }
   6226 
   6227                     if(CE != 0) {
   6228                         dest[i++]=(uint8_t)CE;
   6229                     }
   6230                 }
   6231                 if(uprv_numAvailableExpCEs(s)) {
   6232                     canUpdateState = FALSE;
   6233                 } else {
   6234                     canUpdateState = TRUE;
   6235                 }
   6236             }
   6237         } else {
   6238             // if we're not doing tertiary
   6239             // skip to the end
   6240             level = UCOL_PSK_NULL;
   6241         }
   6242         /* fall through to next level */
   6243     case UCOL_PSK_QUATERNARY:
   6244         if(strength >= UCOL_QUATERNARY) {
   6245             for(;;) {
   6246                 if(i == count) {
   6247                     goto saveState;
   6248                 }
   6249                 // We should save the state only if we
   6250                 // are sure that we are done with the
   6251                 // previous iterator state
   6252                 if(canUpdateState) {
   6253                     newState = s.iterator->getState(s.iterator);
   6254                     if(newState != UITER_NO_STATE) {
   6255                         iterState = newState;
   6256                         cces = 0;
   6257                     }
   6258                 }
   6259                 CE = ucol_IGetNextCE(coll, &s, status);
   6260                 cces++;
   6261                 if(CE==UCOL_NO_MORE_CES) {
   6262                     // Add the level separator
   6263                     terminatePSKLevel(level, maxLevel, i, dest);
   6264                     //dest[i++] = UCOL_LEVELTERMINATOR;
   6265                     byteCountOrFrenchDone = 0;
   6266                     // Restart the iteration and move to the
   6267                     // next level
   6268                     s.iterator->move(s.iterator, 0, UITER_START);
   6269                     cces = 0;
   6270                     level = UCOL_PSK_QUIN;
   6271                     break;
   6272                 }
   6273                 if(CE==0)
   6274                     continue;
   6275                 if(isShiftedCE(CE, LVT, &wasShifted)) {
   6276                     CE >>= 16; /* get primary */
   6277                     if(CE != 0) {
   6278                         if(byteCountOrFrenchDone == 0) {
   6279                             dest[i++]=(uint8_t)(CE >> 8);
   6280                         } else {
   6281                             byteCountOrFrenchDone = 0;
   6282                         }
   6283                         if((CE &=0xff)!=0) {
   6284                             if(i==count) {
   6285                                 /* overflow */
   6286                                 byteCountOrFrenchDone = 1;
   6287                                 goto saveState;
   6288                             }
   6289                             dest[i++]=(uint8_t)CE;
   6290                         }
   6291                     }
   6292                 } else {
   6293                     notIsContinuation = !isContinuation(CE);
   6294                     if(notIsContinuation) {
   6295                         if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   6296                             dest[i++] = UCOL_HIRAGANA_QUAD;
   6297                         } else {
   6298                             dest[i++] = 0xFF;
   6299                         }
   6300                     }
   6301                 }
   6302                 if(uprv_numAvailableExpCEs(s)) {
   6303                     canUpdateState = FALSE;
   6304                 } else {
   6305                     canUpdateState = TRUE;
   6306                 }
   6307             }
   6308         } else {
   6309             // if we're not doing quaternary
   6310             // skip to the end
   6311             level = UCOL_PSK_NULL;
   6312         }
   6313         /* fall through to next level */
   6314     case UCOL_PSK_QUIN:
   6315         level = UCOL_PSK_IDENTICAL;
   6316         /* fall through to next level */
   6317     case UCOL_PSK_IDENTICAL:
   6318         if(strength >= UCOL_IDENTICAL) {
   6319             UChar32 first, second;
   6320             int32_t bocsuBytesWritten = 0;
   6321             // We always need to do identical on
   6322             // the NFD form of the string.
   6323             if(normIter == NULL) {
   6324                 // we arrived from the level below and
   6325                 // normalization was not turned on.
   6326                 // therefore, we need to make a fresh NFD iterator
   6327                 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   6328                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   6329             } else if(!doingIdenticalFromStart) {
   6330                 // there is an iterator, but we did some other levels.
   6331                 // therefore, we have a FCD iterator - need to make
   6332                 // a NFD one.
   6333                 // normIter being at the beginning does not guarantee
   6334                 // that the underlying iterator is at the beginning
   6335                 iter->move(iter, 0, UITER_START);
   6336                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   6337             }
   6338             // At this point we have a NFD iterator that is positioned
   6339             // in the right place
   6340             if(U_FAILURE(*status)) {
   6341                 UTRACE_EXIT_STATUS(*status);
   6342                 return 0;
   6343             }
   6344             first = uiter_previous32(s.iterator);
   6345             // maybe we're at the start of the string
   6346             if(first == U_SENTINEL) {
   6347                 first = 0;
   6348             } else {
   6349                 uiter_next32(s.iterator);
   6350             }
   6351 
   6352             j = 0;
   6353             for(;;) {
   6354                 if(i == count) {
   6355                     if(j+1 < bocsuBytesWritten) {
   6356                         bocsuBytesUsed = j+1;
   6357                     }
   6358                     goto saveState;
   6359                 }
   6360 
   6361                 // On identical level, we will always save
   6362                 // the state if we reach this point, since
   6363                 // we don't depend on getNextCE for content
   6364                 // all the content is in our buffer and we
   6365                 // already either stored the full buffer OR
   6366                 // otherwise we won't arrive here.
   6367                 newState = s.iterator->getState(s.iterator);
   6368                 if(newState != UITER_NO_STATE) {
   6369                     iterState = newState;
   6370                     cces = 0;
   6371                 }
   6372 
   6373                 uint8_t buff[4];
   6374                 second = uiter_next32(s.iterator);
   6375                 cces++;
   6376 
   6377                 // end condition for identical level
   6378                 if(second == U_SENTINEL) {
   6379                     terminatePSKLevel(level, maxLevel, i, dest);
   6380                     level = UCOL_PSK_NULL;
   6381                     break;
   6382                 }
   6383                 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
   6384                 first = second;
   6385 
   6386                 j = 0;
   6387                 if(bocsuBytesUsed != 0) {
   6388                     while(bocsuBytesUsed-->0) {
   6389                         j++;
   6390                     }
   6391                 }
   6392 
   6393                 while(i < count && j < bocsuBytesWritten) {
   6394                     dest[i++] = buff[j++];
   6395                 }
   6396             }
   6397 
   6398         } else {
   6399             level = UCOL_PSK_NULL;
   6400         }
   6401         /* fall through to next level */
   6402     case UCOL_PSK_NULL:
   6403         j = i;
   6404         while(j<count) {
   6405             dest[j++]=0;
   6406         }
   6407         break;
   6408     default:
   6409         *status = U_INTERNAL_PROGRAM_ERROR;
   6410         UTRACE_EXIT_STATUS(*status);
   6411         return 0;
   6412     }
   6413 
   6414 saveState:
   6415     // Now we need to return stuff. First we want to see whether we have
   6416     // done everything for the current state of iterator.
   6417     if(byteCountOrFrenchDone
   6418         || canUpdateState == FALSE
   6419         || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
   6420     {
   6421         // Any of above mean that the previous transaction
   6422         // wasn't finished and that we should store the
   6423         // previous iterator state.
   6424         state[0] = iterState;
   6425     } else {
   6426         // The transaction is complete. We will continue in the next iteration.
   6427         state[0] = s.iterator->getState(s.iterator);
   6428         cces = 0;
   6429     }
   6430     // Store the number of bocsu bytes written.
   6431     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
   6432         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6433     }
   6434     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
   6435 
   6436     // Next we put in the level of comparison
   6437     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
   6438 
   6439     // If we are doing French, we need to store whether we have just finished the French level
   6440     if(level == UCOL_PSK_SECONDARY && doingFrench) {
   6441         state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6442     } else {
   6443         state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6444     }
   6445 
   6446     // Was the latest CE shifted
   6447     if(wasShifted) {
   6448         state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
   6449     }
   6450     // Check for cces overflow
   6451     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
   6452         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6453     }
   6454     // Store cces
   6455     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
   6456 
   6457     // Check for French overflow
   6458     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
   6459         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6460     }
   6461     // Store number of bytes written in the French secondary continuation sequence
   6462     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
   6463 
   6464 
   6465     // If we have used normalizing iterator, get rid of it
   6466     if(normIter != NULL) {
   6467         unorm_closeIter(normIter);
   6468     }
   6469 
   6470     /* To avoid memory leak, free the offset buffer if necessary. */
   6471     ucol_freeOffsetBuffer(&s);
   6472 
   6473     // Return number of meaningful sortkey bytes.
   6474     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
   6475                   dest,i, state[0], state[1]);
   6476     UTRACE_EXIT_VALUE(i);
   6477     return i;
   6478 }
   6479 
   6480 /**
   6481  * Produce a bound for a given sortkey and a number of levels.
   6482  */
   6483 U_CAPI int32_t U_EXPORT2
   6484 ucol_getBound(const uint8_t       *source,
   6485         int32_t             sourceLength,
   6486         UColBoundMode       boundType,
   6487         uint32_t            noOfLevels,
   6488         uint8_t             *result,
   6489         int32_t             resultLength,
   6490         UErrorCode          *status)
   6491 {
   6492     // consistency checks
   6493     if(status == NULL || U_FAILURE(*status)) {
   6494         return 0;
   6495     }
   6496     if(source == NULL) {
   6497         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6498         return 0;
   6499     }
   6500 
   6501     int32_t sourceIndex = 0;
   6502     // Scan the string until we skip enough of the key OR reach the end of the key
   6503     do {
   6504         sourceIndex++;
   6505         if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
   6506             noOfLevels--;
   6507         }
   6508     } while (noOfLevels > 0
   6509         && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
   6510 
   6511     if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
   6512         && noOfLevels > 0) {
   6513             *status = U_SORT_KEY_TOO_SHORT_WARNING;
   6514     }
   6515 
   6516 
   6517     // READ ME: this code assumes that the values for boundType
   6518     // enum will not changes. They are set so that the enum value
   6519     // corresponds to the number of extra bytes each bound type
   6520     // needs.
   6521     if(result != NULL && resultLength >= sourceIndex+boundType) {
   6522         uprv_memcpy(result, source, sourceIndex);
   6523         switch(boundType) {
   6524             // Lower bound just gets terminated. No extra bytes
   6525         case UCOL_BOUND_LOWER: // = 0
   6526             break;
   6527             // Upper bound needs one extra byte
   6528         case UCOL_BOUND_UPPER: // = 1
   6529             result[sourceIndex++] = 2;
   6530             break;
   6531             // Upper long bound needs two extra bytes
   6532         case UCOL_BOUND_UPPER_LONG: // = 2
   6533             result[sourceIndex++] = 0xFF;
   6534             result[sourceIndex++] = 0xFF;
   6535             break;
   6536         default:
   6537             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6538             return 0;
   6539         }
   6540         result[sourceIndex++] = 0;
   6541 
   6542         return sourceIndex;
   6543     } else {
   6544         return sourceIndex+boundType+1;
   6545     }
   6546 }
   6547 
   6548 /****************************************************************************/
   6549 /* Following are the functions that deal with the properties of a collator  */
   6550 /* there are new APIs and some compatibility APIs                           */
   6551 /****************************************************************************/
   6552 
   6553 static inline void
   6554 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
   6555                     int32_t *primShift, int32_t *secShift, int32_t *terShift)
   6556 {
   6557     uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
   6558     UBool reverseSecondary = FALSE;
   6559     if(!isContinuation(CE)) {
   6560         tertiary = (uint8_t)((CE & coll->tertiaryMask));
   6561         tertiary ^= coll->caseSwitch;
   6562         reverseSecondary = TRUE;
   6563     } else {
   6564         tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   6565         tertiary &= UCOL_REMOVE_CASE;
   6566         reverseSecondary = FALSE;
   6567     }
   6568 
   6569     secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6570     primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6571     primary1 = (uint8_t)(CE >> 8);
   6572 
   6573     if(primary1 != 0) {
   6574         coll->latinOneCEs[ch] |= (primary1 << *primShift);
   6575         *primShift -= 8;
   6576     }
   6577     if(primary2 != 0) {
   6578         if(*primShift < 0) {
   6579             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6580             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6581             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6582             return;
   6583         }
   6584         coll->latinOneCEs[ch] |= (primary2 << *primShift);
   6585         *primShift -= 8;
   6586     }
   6587     if(secondary != 0) {
   6588         if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
   6589             coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
   6590             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
   6591         } else { // normal case
   6592             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
   6593         }
   6594         *secShift -= 8;
   6595     }
   6596     if(tertiary != 0) {
   6597         coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
   6598         *terShift -= 8;
   6599     }
   6600 }
   6601 
   6602 static inline UBool
   6603 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
   6604     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
   6605     if(newTable == NULL) {
   6606       *status = U_MEMORY_ALLOCATION_ERROR;
   6607       coll->latinOneFailed = TRUE;
   6608       return FALSE;
   6609     }
   6610     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
   6611     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
   6612     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
   6613     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
   6614     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
   6615     coll->latinOneTableLen = size;
   6616     uprv_free(coll->latinOneCEs);
   6617     coll->latinOneCEs = newTable;
   6618     return TRUE;
   6619 }
   6620 
   6621 static UBool
   6622 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
   6623     UBool result = TRUE;
   6624     if(coll->latinOneCEs == NULL) {
   6625         coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
   6626         if(coll->latinOneCEs == NULL) {
   6627             *status = U_MEMORY_ALLOCATION_ERROR;
   6628             return FALSE;
   6629         }
   6630         coll->latinOneTableLen = UCOL_LATINONETABLELEN;
   6631     }
   6632     UChar ch = 0;
   6633     UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
   6634     // Check for null pointer
   6635     if (U_FAILURE(*status)) {
   6636         return FALSE;
   6637     }
   6638     uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
   6639 
   6640     int32_t primShift = 24, secShift = 24, terShift = 24;
   6641     uint32_t CE = 0;
   6642     int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
   6643 
   6644     // TODO: make safe if you get more than you wanted...
   6645     for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
   6646         primShift = 24; secShift = 24; terShift = 24;
   6647         if(ch < 0x100) {
   6648             CE = coll->latinOneMapping[ch];
   6649         } else {
   6650             CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   6651             if(CE == UCOL_NOT_FOUND && coll->UCA) {
   6652                 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   6653             }
   6654         }
   6655         if(CE < UCOL_NOT_FOUND) {
   6656             ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6657         } else {
   6658             switch (getCETag(CE)) {
   6659             case EXPANSION_TAG:
   6660             case DIGIT_TAG:
   6661                 ucol_setText(it, &ch, 1, status);
   6662                 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
   6663                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6664                         coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6665                         coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6666                         coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6667                         break;
   6668                     }
   6669                     ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6670                 }
   6671                 break;
   6672             case CONTRACTION_TAG:
   6673                 // here is the trick
   6674                 // F2 is contraction. We do something very similar to contractions
   6675                 // but have two indices, one in the real contraction table and the
   6676                 // other to where we stuffed things. This hopes that we don't have
   6677                 // many contractions (this should work for latin-1 tables).
   6678                 {
   6679                     if((CE & 0x00FFF000) != 0) {
   6680                         *status = U_UNSUPPORTED_ERROR;
   6681                         goto cleanup_after_failure;
   6682                     }
   6683 
   6684                     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   6685 
   6686                     CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
   6687 
   6688                     coll->latinOneCEs[ch] = CE;
   6689                     coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
   6690                     coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
   6691 
   6692                     // We're going to jump into contraction table, pick the elements
   6693                     // and use them
   6694                     do {
   6695                         CE = *(coll->contractionCEs +
   6696                             (UCharOffset - coll->contractionIndex));
   6697                         if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
   6698                             uint32_t size;
   6699                             uint32_t i;    /* general counter */
   6700                             uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   6701                             size = getExpansionCount(CE);
   6702                             //CE = *CEOffset++;
   6703                             if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   6704                                 for(i = 0; i<size; i++) {
   6705                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6706                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6707                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6708                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6709                                         break;
   6710                                     }
   6711                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
   6712                                 }
   6713                             } else { /* else, we do */
   6714                                 while(*CEOffset != 0) {
   6715                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6716                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6717                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6718                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6719                                         break;
   6720                                     }
   6721                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
   6722                                 }
   6723                             }
   6724                             contractionOffset++;
   6725                         } else if(CE < UCOL_NOT_FOUND) {
   6726                             ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
   6727                         } else {
   6728                             coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6729                             coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6730                             coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6731                             contractionOffset++;
   6732                         }
   6733                         UCharOffset++;
   6734                         primShift = 24; secShift = 24; terShift = 24;
   6735                         if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
   6736                             if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
   6737                                 goto cleanup_after_failure;
   6738                             }
   6739                         }
   6740                     } while(*UCharOffset != 0xFFFF);
   6741                 }
   6742                 break;;
   6743             case SPEC_PROC_TAG:
   6744                 {
   6745                     // 0xB7 is a precontext character defined in UCA5.1, a special
   6746                     // handle is implemeted in order to save LatinOne table for
   6747                     // most locales.
   6748                     if (ch==0xb7) {
   6749                         ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6750                     }
   6751                     else {
   6752                         goto cleanup_after_failure;
   6753                     }
   6754                 }
   6755                 break;
   6756             default:
   6757                 goto cleanup_after_failure;
   6758             }
   6759         }
   6760     }
   6761     // compact table
   6762     if(contractionOffset < coll->latinOneTableLen) {
   6763         if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
   6764             goto cleanup_after_failure;
   6765         }
   6766     }
   6767     ucol_closeElements(it);
   6768     return result;
   6769 
   6770 cleanup_after_failure:
   6771     // status should already be set before arriving here.
   6772     coll->latinOneFailed = TRUE;
   6773     ucol_closeElements(it);
   6774     return FALSE;
   6775 }
   6776 
   6777 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
   6778     if(U_SUCCESS(*status)) {
   6779         if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6780             coll->caseSwitch = UCOL_CASE_SWITCH;
   6781         } else {
   6782             coll->caseSwitch = UCOL_NO_CASE_SWITCH;
   6783         }
   6784 
   6785         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
   6786             coll->tertiaryMask = UCOL_REMOVE_CASE;
   6787             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6788             coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
   6789             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
   6790             coll->tertiaryBottom = UCOL_COMMON_BOT3;
   6791         } else {
   6792             coll->tertiaryMask = UCOL_KEEP_CASE;
   6793             coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
   6794             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6795                 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
   6796                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
   6797                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
   6798             } else {
   6799                 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6800                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
   6801                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
   6802             }
   6803         }
   6804 
   6805         /* Set the compression values */
   6806         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
   6807         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
   6808         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
   6809 
   6810         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
   6811             && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
   6812         {
   6813             coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
   6814         } else {
   6815             coll->sortKeyGen = ucol_calcSortKey;
   6816         }
   6817         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
   6818             && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
   6819         {
   6820             if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
   6821                 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
   6822                     //fprintf(stderr, "F");
   6823                     coll->latinOneUse = TRUE;
   6824                 } else {
   6825                     coll->latinOneUse = FALSE;
   6826                 }
   6827                 if(*status == U_UNSUPPORTED_ERROR) {
   6828                     *status = U_ZERO_ERROR;
   6829                 }
   6830             } else { // latin1Table exists and it doesn't need to be regenerated, just use it
   6831                 coll->latinOneUse = TRUE;
   6832             }
   6833         } else {
   6834             coll->latinOneUse = FALSE;
   6835         }
   6836     }
   6837 }
   6838 
   6839 U_CAPI uint32_t  U_EXPORT2
   6840 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
   6841     if(U_FAILURE(*status) || coll == NULL) {
   6842         return 0;
   6843     }
   6844     if(len == -1) {
   6845         len = u_strlen(varTop);
   6846     }
   6847     if(len == 0) {
   6848         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6849         return 0;
   6850     }
   6851 
   6852     collIterate s;
   6853     IInit_collIterate(coll, varTop, len, &s, status);
   6854     if(U_FAILURE(*status)) {
   6855         return 0;
   6856     }
   6857 
   6858     uint32_t CE = ucol_IGetNextCE(coll, &s, status);
   6859 
   6860     /* here we check if we have consumed all characters */
   6861     /* you can put in either one character or a contraction */
   6862     /* you shouldn't put more... */
   6863     if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
   6864         *status = U_CE_NOT_FOUND_ERROR;
   6865         return 0;
   6866     }
   6867 
   6868     uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
   6869 
   6870     if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
   6871         *status = U_PRIMARY_TOO_LONG_ERROR;
   6872         return 0;
   6873     }
   6874     if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
   6875         coll->variableTopValueisDefault = FALSE;
   6876         coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
   6877     }
   6878 
   6879     /* To avoid memory leak, free the offset buffer if necessary. */
   6880     ucol_freeOffsetBuffer(&s);
   6881 
   6882     return CE & UCOL_PRIMARYMASK;
   6883 }
   6884 
   6885 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
   6886     if(U_FAILURE(*status) || coll == NULL) {
   6887         return 0;
   6888     }
   6889     return coll->variableTopValue<<16;
   6890 }
   6891 
   6892 U_CAPI void  U_EXPORT2
   6893 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
   6894     if(U_FAILURE(*status) || coll == NULL) {
   6895         return;
   6896     }
   6897 
   6898     if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
   6899         coll->variableTopValueisDefault = FALSE;
   6900         coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
   6901     }
   6902 }
   6903 /* Attribute setter API */
   6904 U_CAPI void  U_EXPORT2
   6905 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
   6906     if(U_FAILURE(*status) || coll == NULL) {
   6907       return;
   6908     }
   6909     UColAttributeValue oldFrench = coll->frenchCollation;
   6910     UColAttributeValue oldCaseFirst = coll->caseFirst;
   6911     switch(attr) {
   6912     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
   6913         if(value == UCOL_ON) {
   6914             coll->numericCollation = UCOL_ON;
   6915             coll->numericCollationisDefault = FALSE;
   6916         } else if (value == UCOL_OFF) {
   6917             coll->numericCollation = UCOL_OFF;
   6918             coll->numericCollationisDefault = FALSE;
   6919         } else if (value == UCOL_DEFAULT) {
   6920             coll->numericCollationisDefault = TRUE;
   6921             coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
   6922         } else {
   6923             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6924         }
   6925         break;
   6926     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
   6927         if(value == UCOL_ON) {
   6928             coll->hiraganaQ = UCOL_ON;
   6929             coll->hiraganaQisDefault = FALSE;
   6930         } else if (value == UCOL_OFF) {
   6931             coll->hiraganaQ = UCOL_OFF;
   6932             coll->hiraganaQisDefault = FALSE;
   6933         } else if (value == UCOL_DEFAULT) {
   6934             coll->hiraganaQisDefault = TRUE;
   6935             coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
   6936         } else {
   6937             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6938         }
   6939         break;
   6940     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   6941         if(value == UCOL_ON) {
   6942             coll->frenchCollation = UCOL_ON;
   6943             coll->frenchCollationisDefault = FALSE;
   6944         } else if (value == UCOL_OFF) {
   6945             coll->frenchCollation = UCOL_OFF;
   6946             coll->frenchCollationisDefault = FALSE;
   6947         } else if (value == UCOL_DEFAULT) {
   6948             coll->frenchCollationisDefault = TRUE;
   6949             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
   6950         } else {
   6951             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6952         }
   6953         break;
   6954     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   6955         if(value == UCOL_SHIFTED) {
   6956             coll->alternateHandling = UCOL_SHIFTED;
   6957             coll->alternateHandlingisDefault = FALSE;
   6958         } else if (value == UCOL_NON_IGNORABLE) {
   6959             coll->alternateHandling = UCOL_NON_IGNORABLE;
   6960             coll->alternateHandlingisDefault = FALSE;
   6961         } else if (value == UCOL_DEFAULT) {
   6962             coll->alternateHandlingisDefault = TRUE;
   6963             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
   6964         } else {
   6965             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6966         }
   6967         break;
   6968     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   6969         if(value == UCOL_LOWER_FIRST) {
   6970             coll->caseFirst = UCOL_LOWER_FIRST;
   6971             coll->caseFirstisDefault = FALSE;
   6972         } else if (value == UCOL_UPPER_FIRST) {
   6973             coll->caseFirst = UCOL_UPPER_FIRST;
   6974             coll->caseFirstisDefault = FALSE;
   6975         } else if (value == UCOL_OFF) {
   6976             coll->caseFirst = UCOL_OFF;
   6977             coll->caseFirstisDefault = FALSE;
   6978         } else if (value == UCOL_DEFAULT) {
   6979             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
   6980             coll->caseFirstisDefault = TRUE;
   6981         } else {
   6982             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6983         }
   6984         break;
   6985     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   6986         if(value == UCOL_ON) {
   6987             coll->caseLevel = UCOL_ON;
   6988             coll->caseLevelisDefault = FALSE;
   6989         } else if (value == UCOL_OFF) {
   6990             coll->caseLevel = UCOL_OFF;
   6991             coll->caseLevelisDefault = FALSE;
   6992         } else if (value == UCOL_DEFAULT) {
   6993             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
   6994             coll->caseLevelisDefault = TRUE;
   6995         } else {
   6996             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6997         }
   6998         break;
   6999     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   7000         if(value == UCOL_ON) {
   7001             coll->normalizationMode = UCOL_ON;
   7002             coll->normalizationModeisDefault = FALSE;
   7003         } else if (value == UCOL_OFF) {
   7004             coll->normalizationMode = UCOL_OFF;
   7005             coll->normalizationModeisDefault = FALSE;
   7006         } else if (value == UCOL_DEFAULT) {
   7007             coll->normalizationModeisDefault = TRUE;
   7008             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
   7009         } else {
   7010             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7011         }
   7012         break;
   7013     case UCOL_STRENGTH:         /* attribute for strength */
   7014         if (value == UCOL_DEFAULT) {
   7015             coll->strengthisDefault = TRUE;
   7016             coll->strength = (UColAttributeValue)coll->options->strength;
   7017         } else if (value <= UCOL_IDENTICAL) {
   7018             coll->strengthisDefault = FALSE;
   7019             coll->strength = value;
   7020         } else {
   7021             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7022         }
   7023         break;
   7024     case UCOL_ATTRIBUTE_COUNT:
   7025     default:
   7026         *status = U_ILLEGAL_ARGUMENT_ERROR;
   7027         break;
   7028     }
   7029     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
   7030         coll->latinOneRegenTable = TRUE;
   7031     } else {
   7032         coll->latinOneRegenTable = FALSE;
   7033     }
   7034     ucol_updateInternalState(coll, status);
   7035 }
   7036 
   7037 U_CAPI UColAttributeValue  U_EXPORT2
   7038 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
   7039     if(U_FAILURE(*status) || coll == NULL) {
   7040       return UCOL_DEFAULT;
   7041     }
   7042     switch(attr) {
   7043     case UCOL_NUMERIC_COLLATION:
   7044       return coll->numericCollation;
   7045     case UCOL_HIRAGANA_QUATERNARY_MODE:
   7046       return coll->hiraganaQ;
   7047     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   7048         return coll->frenchCollation;
   7049     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   7050         return coll->alternateHandling;
   7051     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   7052         return coll->caseFirst;
   7053     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   7054         return coll->caseLevel;
   7055     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   7056         return coll->normalizationMode;
   7057     case UCOL_STRENGTH:         /* attribute for strength */
   7058         return coll->strength;
   7059     case UCOL_ATTRIBUTE_COUNT:
   7060     default:
   7061         *status = U_ILLEGAL_ARGUMENT_ERROR;
   7062         break;
   7063     }
   7064     return UCOL_DEFAULT;
   7065 }
   7066 
   7067 U_CAPI void U_EXPORT2
   7068 ucol_setStrength(    UCollator                *coll,
   7069             UCollationStrength        strength)
   7070 {
   7071     UErrorCode status = U_ZERO_ERROR;
   7072     ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
   7073 }
   7074 
   7075 U_CAPI UCollationStrength U_EXPORT2
   7076 ucol_getStrength(const UCollator *coll)
   7077 {
   7078     UErrorCode status = U_ZERO_ERROR;
   7079     return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
   7080 }
   7081 
   7082 /****************************************************************************/
   7083 /* Following are misc functions                                             */
   7084 /* there are new APIs and some compatibility APIs                           */
   7085 /****************************************************************************/
   7086 
   7087 U_CAPI void U_EXPORT2
   7088 ucol_getVersion(const UCollator* coll,
   7089                 UVersionInfo versionInfo)
   7090 {
   7091     /* RunTime version  */
   7092     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
   7093     /* Builder version*/
   7094     uint8_t bdVersion = coll->image->version[0];
   7095 
   7096     /* Charset Version. Need to get the version from cnv files
   7097      * makeconv should populate cnv files with version and
   7098      * an api has to be provided in ucnv.h to obtain this version
   7099      */
   7100     uint8_t csVersion = 0;
   7101 
   7102     /* combine the version info */
   7103     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
   7104 
   7105     /* Tailoring rules */
   7106     versionInfo[0] = (uint8_t)(cmbVersion>>8);
   7107     versionInfo[1] = (uint8_t)cmbVersion;
   7108     versionInfo[2] = coll->image->version[1];
   7109     if(coll->UCA) {
   7110         /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
   7111         versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
   7112     } else {
   7113         versionInfo[3] = 0;
   7114     }
   7115 }
   7116 
   7117 
   7118 /* This internal API checks whether a character is tailored or not */
   7119 U_CAPI UBool  U_EXPORT2
   7120 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
   7121     if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
   7122         return FALSE;
   7123     }
   7124 
   7125     uint32_t CE = UCOL_NOT_FOUND;
   7126     const UChar *ContractionStart = NULL;
   7127     if(u < 0x100) { /* latin-1 */
   7128         CE = coll->latinOneMapping[u];
   7129         if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
   7130             return FALSE;
   7131         }
   7132     } else { /* regular */
   7133         CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
   7134     }
   7135 
   7136     if(isContraction(CE)) {
   7137         ContractionStart = (UChar *)coll->image+getContractOffset(CE);
   7138         CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
   7139     }
   7140 
   7141     return (UBool)(CE != UCOL_NOT_FOUND);
   7142 }
   7143 
   7144 
   7145 /****************************************************************************/
   7146 /* Following are the string compare functions                               */
   7147 /*                                                                          */
   7148 /****************************************************************************/
   7149 
   7150 
   7151 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
   7152 /*                     Used by strcoll if strength == identical and strings  */
   7153 /*                     are otherwise equal.                                  */
   7154 /*                                                                           */
   7155 /*                     Comparison must be done on NFD normalized strings.    */
   7156 /*                     FCD is not good enough.                               */
   7157 
   7158 static
   7159 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
   7160 {
   7161     // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
   7162     // of same type, but that doesn't really mean that it will stay that way.
   7163     int32_t            comparison;
   7164 
   7165     if (sColl->flags & UCOL_USE_ITERATOR) {
   7166         // The division for the array length may truncate the array size to
   7167         // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   7168         // for all platforms anyway.
   7169         UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   7170         UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   7171         UNormIterator *sNIt = NULL, *tNIt = NULL;
   7172         sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   7173         tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   7174         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   7175         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   7176         UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
   7177         UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
   7178         comparison = u_strCompareIter(sIt, tIt, TRUE);
   7179         unorm_closeIter(sNIt);
   7180         unorm_closeIter(tNIt);
   7181     } else {
   7182         int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
   7183         const UChar *sBuf = sColl->string;
   7184         int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
   7185         const UChar *tBuf = tColl->string;
   7186 
   7187         if (normalize) {
   7188             *status = U_ZERO_ERROR;
   7189             // Note: We could use Normalizer::compare() or similar, but for short strings
   7190             // which may not be in FCD it might be faster to just NFD them.
   7191             // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
   7192             // NFD'ing immediately might be faster for long strings,
   7193             // but string comparison is usually done on relatively short strings.
   7194             sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
   7195                                   sColl->writableBuffer,
   7196                                   *status);
   7197             tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
   7198                                   tColl->writableBuffer,
   7199                                   *status);
   7200             if(U_FAILURE(*status)) {
   7201                 return UCOL_LESS;
   7202             }
   7203             comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
   7204         } else {
   7205             comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
   7206         }
   7207     }
   7208 
   7209     if (comparison < 0) {
   7210         return UCOL_LESS;
   7211     } else if (comparison == 0) {
   7212         return UCOL_EQUAL;
   7213     } else /* comparison > 0 */ {
   7214         return UCOL_GREATER;
   7215     }
   7216 }
   7217 
   7218 /*  CEBuf - A struct and some inline functions to handle the saving    */
   7219 /*          of CEs in a buffer within ucol_strcoll                     */
   7220 
   7221 #define UCOL_CEBUF_SIZE 512
   7222 typedef struct ucol_CEBuf {
   7223     uint32_t    *buf;
   7224     uint32_t    *endp;
   7225     uint32_t    *pos;
   7226     uint32_t     localArray[UCOL_CEBUF_SIZE];
   7227 } ucol_CEBuf;
   7228 
   7229 
   7230 static
   7231 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
   7232     (b)->buf = (b)->pos = (b)->localArray;
   7233     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
   7234 }
   7235 
   7236 static
   7237 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
   7238     uint32_t  oldSize;
   7239     uint32_t  newSize;
   7240     uint32_t  *newBuf;
   7241 
   7242     ci->flags |= UCOL_ITER_ALLOCATED;
   7243     oldSize = (uint32_t)(b->pos - b->buf);
   7244     newSize = oldSize * 2;
   7245     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
   7246     if(newBuf == NULL) {
   7247         *status = U_MEMORY_ALLOCATION_ERROR;
   7248     }
   7249     else {
   7250         uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
   7251         if (b->buf != b->localArray) {
   7252             uprv_free(b->buf);
   7253         }
   7254         b->buf = newBuf;
   7255         b->endp = b->buf + newSize;
   7256         b->pos  = b->buf + oldSize;
   7257     }
   7258 }
   7259 
   7260 static
   7261 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
   7262     if (b->pos == b->endp) {
   7263         ucol_CEBuf_Expand(b, ci, status);
   7264     }
   7265     if (U_SUCCESS(*status)) {
   7266         *(b)->pos++ = ce;
   7267     }
   7268 }
   7269 
   7270 /* This is a trick string compare function that goes in and uses sortkeys to compare */
   7271 /* It is used when compare gets in trouble and needs to bail out                     */
   7272 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
   7273                                                   collIterate *tColl,
   7274                                                   UErrorCode *status)
   7275 {
   7276     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
   7277     uint8_t *sourceKeyP = sourceKey;
   7278     uint8_t *targetKeyP = targetKey;
   7279     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
   7280     const UCollator *coll = sColl->coll;
   7281     const UChar *source = NULL;
   7282     const UChar *target = NULL;
   7283     int32_t result = UCOL_EQUAL;
   7284     UnicodeString sourceString, targetString;
   7285     int32_t sourceLength;
   7286     int32_t targetLength;
   7287 
   7288     if(sColl->flags & UCOL_USE_ITERATOR) {
   7289         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   7290         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   7291         UChar32 c;
   7292         while((c=sColl->iterator->next(sColl->iterator))>=0) {
   7293             sourceString.append((UChar)c);
   7294         }
   7295         while((c=tColl->iterator->next(tColl->iterator))>=0) {
   7296             targetString.append((UChar)c);
   7297         }
   7298         source = sourceString.getBuffer();
   7299         sourceLength = sourceString.length();
   7300         target = targetString.getBuffer();
   7301         targetLength = targetString.length();
   7302     } else { // no iterators
   7303         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
   7304         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
   7305         source = sColl->string;
   7306         target = tColl->string;
   7307     }
   7308 
   7309 
   7310 
   7311     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7312     if(sourceKeyLen > UCOL_MAX_BUFFER) {
   7313         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
   7314         if(sourceKeyP == NULL) {
   7315             *status = U_MEMORY_ALLOCATION_ERROR;
   7316             goto cleanup_and_do_compare;
   7317         }
   7318         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7319     }
   7320 
   7321     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7322     if(targetKeyLen > UCOL_MAX_BUFFER) {
   7323         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
   7324         if(targetKeyP == NULL) {
   7325             *status = U_MEMORY_ALLOCATION_ERROR;
   7326             goto cleanup_and_do_compare;
   7327         }
   7328         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7329     }
   7330 
   7331     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
   7332 
   7333 cleanup_and_do_compare:
   7334     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
   7335         uprv_free(sourceKeyP);
   7336     }
   7337 
   7338     if(targetKeyP != NULL && targetKeyP != targetKey) {
   7339         uprv_free(targetKeyP);
   7340     }
   7341 
   7342     if(result<0) {
   7343         return UCOL_LESS;
   7344     } else if(result>0) {
   7345         return UCOL_GREATER;
   7346     } else {
   7347         return UCOL_EQUAL;
   7348     }
   7349 }
   7350 
   7351 
   7352 static UCollationResult
   7353 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
   7354 {
   7355     U_ALIGN_CODE(16);
   7356 
   7357     const UCollator *coll = sColl->coll;
   7358 
   7359 
   7360     // setting up the collator parameters
   7361     UColAttributeValue strength = coll->strength;
   7362     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
   7363 
   7364     UBool checkSecTer = initialCheckSecTer;
   7365     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
   7366     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
   7367     UBool checkIdent = (strength == UCOL_IDENTICAL);
   7368     UBool checkCase = (coll->caseLevel == UCOL_ON);
   7369     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
   7370     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
   7371     UBool qShifted = shifted && checkQuad;
   7372     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
   7373 
   7374     if(doHiragana && shifted) {
   7375         return (ucol_compareUsingSortKeys(sColl, tColl, status));
   7376     }
   7377     uint8_t caseSwitch = coll->caseSwitch;
   7378     uint8_t tertiaryMask = coll->tertiaryMask;
   7379 
   7380     // This is the lowest primary value that will not be ignored if shifted
   7381     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
   7382 
   7383     UCollationResult result = UCOL_EQUAL;
   7384     UCollationResult hirResult = UCOL_EQUAL;
   7385 
   7386     // Preparing the CE buffers. They will be filled during the primary phase
   7387     ucol_CEBuf   sCEs;
   7388     ucol_CEBuf   tCEs;
   7389     UCOL_INIT_CEBUF(&sCEs);
   7390     UCOL_INIT_CEBUF(&tCEs);
   7391 
   7392     uint32_t secS = 0, secT = 0;
   7393     uint32_t sOrder=0, tOrder=0;
   7394 
   7395     // Non shifted primary processing is quite simple
   7396     if(!shifted) {
   7397         for(;;) {
   7398 
   7399             // We fetch CEs until we hit a non ignorable primary or end.
   7400             do {
   7401                 // We get the next CE
   7402                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7403                 // Stuff it in the buffer
   7404                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7405                 // And keep just the primary part.
   7406                 sOrder &= UCOL_PRIMARYMASK;
   7407             } while(sOrder == 0);
   7408 
   7409             // see the comments on the above block
   7410             do {
   7411                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7412                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7413                 tOrder &= UCOL_PRIMARYMASK;
   7414             } while(tOrder == 0);
   7415 
   7416             // if both primaries are the same
   7417             if(sOrder == tOrder) {
   7418                 // and there are no more CEs, we advance to the next level
   7419                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7420                     break;
   7421                 }
   7422                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7423                     if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
   7424                         hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
   7425                             ? UCOL_LESS:UCOL_GREATER;
   7426                     }
   7427                 }
   7428             } else {
   7429                 // if two primaries are different, we are done
   7430                 result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
   7431                 goto commonReturn;
   7432             }
   7433         } // no primary difference... do the rest from the buffers
   7434     } else { // shifted - do a slightly more complicated processing :)
   7435         for(;;) {
   7436             UBool sInShifted = FALSE;
   7437             UBool tInShifted = FALSE;
   7438             // This version of code can be refactored. However, it seems easier to understand this way.
   7439             // Source loop. Sam as the target loop.
   7440             for(;;) {
   7441                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7442                 if(sOrder == UCOL_NO_MORE_CES) {
   7443                     UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7444                     break;
   7445                 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
   7446                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7447                     continue;
   7448                 } else if(isContinuation(sOrder)) {
   7449                     if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7450                         if(sInShifted) {
   7451                             sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7452                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7453                             continue;
   7454                         } else {
   7455                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7456                             break;
   7457                         }
   7458                     } else { /* Just lower level values */
   7459                         if(sInShifted) {
   7460                             continue;
   7461                         } else {
   7462                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7463                             continue;
   7464                         }
   7465                     }
   7466                 } else { /* regular */
   7467                     if((sOrder & UCOL_PRIMARYMASK) > LVT) {
   7468                         UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7469                         break;
   7470                     } else {
   7471                         if((sOrder & UCOL_PRIMARYMASK) > 0) {
   7472                             sInShifted = TRUE;
   7473                             sOrder &= UCOL_PRIMARYMASK;
   7474                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7475                             continue;
   7476                         } else {
   7477                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7478                             sInShifted = FALSE;
   7479                             continue;
   7480                         }
   7481                     }
   7482                 }
   7483             }
   7484             sOrder &= UCOL_PRIMARYMASK;
   7485             sInShifted = FALSE;
   7486 
   7487             for(;;) {
   7488                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7489                 if(tOrder == UCOL_NO_MORE_CES) {
   7490                     UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7491                     break;
   7492                 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
   7493                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7494                     continue;
   7495                 } else if(isContinuation(tOrder)) {
   7496                     if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7497                         if(tInShifted) {
   7498                             tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7499                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7500                             continue;
   7501                         } else {
   7502                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7503                             break;
   7504                         }
   7505                     } else { /* Just lower level values */
   7506                         if(tInShifted) {
   7507                             continue;
   7508                         } else {
   7509                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7510                             continue;
   7511                         }
   7512                     }
   7513                 } else { /* regular */
   7514                     if((tOrder & UCOL_PRIMARYMASK) > LVT) {
   7515                         UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7516                         break;
   7517                     } else {
   7518                         if((tOrder & UCOL_PRIMARYMASK) > 0) {
   7519                             tInShifted = TRUE;
   7520                             tOrder &= UCOL_PRIMARYMASK;
   7521                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7522                             continue;
   7523                         } else {
   7524                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7525                             tInShifted = FALSE;
   7526                             continue;
   7527                         }
   7528                     }
   7529                 }
   7530             }
   7531             tOrder &= UCOL_PRIMARYMASK;
   7532             tInShifted = FALSE;
   7533 
   7534             if(sOrder == tOrder) {
   7535                 /*
   7536                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7537                 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
   7538                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
   7539                 ? UCOL_LESS:UCOL_GREATER;
   7540                 }
   7541                 }
   7542                 */
   7543                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7544                     break;
   7545                 } else {
   7546                     sOrder = 0;
   7547                     tOrder = 0;
   7548                     continue;
   7549                 }
   7550             } else {
   7551                 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
   7552                 goto commonReturn;
   7553             }
   7554         } /* no primary difference... do the rest from the buffers */
   7555     }
   7556 
   7557     /* now, we're gonna reexamine collected CEs */
   7558     uint32_t    *sCE;
   7559     uint32_t    *tCE;
   7560 
   7561     /* This is the secondary level of comparison */
   7562     if(checkSecTer) {
   7563         if(!isFrenchSec) { /* normal */
   7564             sCE = sCEs.buf;
   7565             tCE = tCEs.buf;
   7566             for(;;) {
   7567                 while (secS == 0) {
   7568                     secS = *(sCE++) & UCOL_SECONDARYMASK;
   7569                 }
   7570 
   7571                 while(secT == 0) {
   7572                     secT = *(tCE++) & UCOL_SECONDARYMASK;
   7573                 }
   7574 
   7575                 if(secS == secT) {
   7576                     if(secS == UCOL_NO_MORE_CES_SECONDARY) {
   7577                         break;
   7578                     } else {
   7579                         secS = 0; secT = 0;
   7580                         continue;
   7581                     }
   7582                 } else {
   7583                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7584                     goto commonReturn;
   7585                 }
   7586             }
   7587         } else { /* do the French */
   7588             uint32_t *sCESave = NULL;
   7589             uint32_t *tCESave = NULL;
   7590             sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
   7591             tCE = tCEs.pos-2;
   7592             for(;;) {
   7593                 while (secS == 0 && sCE >= sCEs.buf) {
   7594                     if(sCESave == 0) {
   7595                         secS = *(sCE--);
   7596                         if(isContinuation(secS)) {
   7597                             while(isContinuation(secS = *(sCE--)))
   7598                                 ;
   7599                             /* after this, secS has the start of continuation, and sCEs points before that */
   7600                             sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7601                             sCE+=2;  /* need to point to the first continuation CP */
   7602                             /* However, now you can just continue doing stuff */
   7603                         }
   7604                     } else {
   7605                         secS = *(sCE++);
   7606                         if(!isContinuation(secS)) { /* This means we have finished with this cont */
   7607                             sCE = sCESave;            /* reset the pointer to before continuation */
   7608                             sCESave = 0;
   7609                             continue;
   7610                         }
   7611                     }
   7612                     secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7613                 }
   7614 
   7615                 while(secT == 0 && tCE >= tCEs.buf) {
   7616                     if(tCESave == 0) {
   7617                         secT = *(tCE--);
   7618                         if(isContinuation(secT)) {
   7619                             while(isContinuation(secT = *(tCE--)))
   7620                                 ;
   7621                             /* after this, secS has the start of continuation, and sCEs points before that */
   7622                             tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7623                             tCE+=2;  /* need to point to the first continuation CP */
   7624                             /* However, now you can just continue doing stuff */
   7625                         }
   7626                     } else {
   7627                         secT = *(tCE++);
   7628                         if(!isContinuation(secT)) { /* This means we have finished with this cont */
   7629                             tCE = tCESave;          /* reset the pointer to before continuation */
   7630                             tCESave = 0;
   7631                             continue;
   7632                         }
   7633                     }
   7634                     secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7635                 }
   7636 
   7637                 if(secS == secT) {
   7638                     if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
   7639                         break;
   7640                     } else {
   7641                         secS = 0; secT = 0;
   7642                         continue;
   7643                     }
   7644                 } else {
   7645                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7646                     goto commonReturn;
   7647                 }
   7648             }
   7649         }
   7650     }
   7651 
   7652     /* doing the case bit */
   7653     if(checkCase) {
   7654         sCE = sCEs.buf;
   7655         tCE = tCEs.buf;
   7656         for(;;) {
   7657             while((secS & UCOL_REMOVE_CASE) == 0) {
   7658                 if(!isContinuation(*sCE++)) {
   7659                     secS =*(sCE-1);
   7660                     if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7661                         // primary ignorables should not be considered on the case level when the strength is primary
   7662                         // otherwise, the CEs stop being well-formed
   7663                         secS &= UCOL_TERT_CASE_MASK;
   7664                         secS ^= caseSwitch;
   7665                     } else {
   7666                         secS = 0;
   7667                     }
   7668                 } else {
   7669                     secS = 0;
   7670                 }
   7671             }
   7672 
   7673             while((secT & UCOL_REMOVE_CASE) == 0) {
   7674                 if(!isContinuation(*tCE++)) {
   7675                     secT = *(tCE-1);
   7676                     if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7677                         // primary ignorables should not be considered on the case level when the strength is primary
   7678                         // otherwise, the CEs stop being well-formed
   7679                         secT &= UCOL_TERT_CASE_MASK;
   7680                         secT ^= caseSwitch;
   7681                     } else {
   7682                         secT = 0;
   7683                     }
   7684                 } else {
   7685                     secT = 0;
   7686                 }
   7687             }
   7688 
   7689             if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
   7690                 result = UCOL_LESS;
   7691                 goto commonReturn;
   7692             } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
   7693                 result = UCOL_GREATER;
   7694                 goto commonReturn;
   7695             }
   7696 
   7697             if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
   7698                 break;
   7699             } else {
   7700                 secS = 0;
   7701                 secT = 0;
   7702             }
   7703         }
   7704     }
   7705 
   7706     /* Tertiary level */
   7707     if(checkTertiary) {
   7708         secS = 0;
   7709         secT = 0;
   7710         sCE = sCEs.buf;
   7711         tCE = tCEs.buf;
   7712         for(;;) {
   7713             while((secS & UCOL_REMOVE_CASE) == 0) {
   7714                 secS = *(sCE++) & tertiaryMask;
   7715                 if(!isContinuation(secS)) {
   7716                     secS ^= caseSwitch;
   7717                 } else {
   7718                     secS &= UCOL_REMOVE_CASE;
   7719                 }
   7720             }
   7721 
   7722             while((secT & UCOL_REMOVE_CASE)  == 0) {
   7723                 secT = *(tCE++) & tertiaryMask;
   7724                 if(!isContinuation(secT)) {
   7725                     secT ^= caseSwitch;
   7726                 } else {
   7727                     secT &= UCOL_REMOVE_CASE;
   7728                 }
   7729             }
   7730 
   7731             if(secS == secT) {
   7732                 if((secS & UCOL_REMOVE_CASE) == 1) {
   7733                     break;
   7734                 } else {
   7735                     secS = 0; secT = 0;
   7736                     continue;
   7737                 }
   7738             } else {
   7739                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7740                 goto commonReturn;
   7741             }
   7742         }
   7743     }
   7744 
   7745 
   7746     if(qShifted /*checkQuad*/) {
   7747         UBool sInShifted = TRUE;
   7748         UBool tInShifted = TRUE;
   7749         secS = 0;
   7750         secT = 0;
   7751         sCE = sCEs.buf;
   7752         tCE = tCEs.buf;
   7753         for(;;) {
   7754             while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
   7755                 secS = *(sCE++);
   7756                 if(isContinuation(secS)) {
   7757                     if(!sInShifted) {
   7758                         continue;
   7759                     }
   7760                 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
   7761                     secS = UCOL_PRIMARYMASK;
   7762                     sInShifted = FALSE;
   7763                 } else {
   7764                     sInShifted = TRUE;
   7765                 }
   7766             }
   7767             secS &= UCOL_PRIMARYMASK;
   7768 
   7769 
   7770             while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
   7771                 secT = *(tCE++);
   7772                 if(isContinuation(secT)) {
   7773                     if(!tInShifted) {
   7774                         continue;
   7775                     }
   7776                 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
   7777                     secT = UCOL_PRIMARYMASK;
   7778                     tInShifted = FALSE;
   7779                 } else {
   7780                     tInShifted = TRUE;
   7781                 }
   7782             }
   7783             secT &= UCOL_PRIMARYMASK;
   7784 
   7785             if(secS == secT) {
   7786                 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
   7787                     break;
   7788                 } else {
   7789                     secS = 0; secT = 0;
   7790                     continue;
   7791                 }
   7792             } else {
   7793                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7794                 goto commonReturn;
   7795             }
   7796         }
   7797     } else if(doHiragana && hirResult != UCOL_EQUAL) {
   7798         // If we're fine on quaternaries, we might be different
   7799         // on Hiragana. This, however, might fail us in shifted.
   7800         result = hirResult;
   7801         goto commonReturn;
   7802     }
   7803 
   7804     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
   7805     /*  as a tiebreaker if all else is equal.                                */
   7806     /*  Getting here  should be quite rare - strings are not identical -     */
   7807     /*     that is checked first, but compared == through all other checks.  */
   7808     if(checkIdent)
   7809     {
   7810         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
   7811         result = ucol_checkIdent(sColl, tColl, TRUE, status);
   7812     }
   7813 
   7814 commonReturn:
   7815     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
   7816         if (sCEs.buf != sCEs.localArray ) {
   7817             uprv_free(sCEs.buf);
   7818         }
   7819         if (tCEs.buf != tCEs.localArray ) {
   7820             uprv_free(tCEs.buf);
   7821         }
   7822     }
   7823 
   7824     return result;
   7825 }
   7826 
   7827 static UCollationResult
   7828 ucol_strcollRegular(const UCollator *coll,
   7829                     const UChar *source, int32_t sourceLength,
   7830                     const UChar *target, int32_t targetLength,
   7831                     UErrorCode *status) {
   7832     collIterate sColl, tColl;
   7833     // Preparing the context objects for iterating over strings
   7834     IInit_collIterate(coll, source, sourceLength, &sColl, status);
   7835     IInit_collIterate(coll, target, targetLength, &tColl, status);
   7836     if(U_FAILURE(*status)) {
   7837         return UCOL_LESS;
   7838     }
   7839     return ucol_strcollRegular(&sColl, &tColl, status);
   7840 }
   7841 
   7842 static inline uint32_t
   7843 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
   7844                           uint32_t CE, const UChar *s, int32_t *index, int32_t len)
   7845 {
   7846     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
   7847     int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
   7848     int32_t offset = 1;
   7849     UChar schar = 0, tchar = 0;
   7850 
   7851     for(;;) {
   7852         if(len == -1) {
   7853             if(s[*index] == 0) { // end of string
   7854                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   7855             } else {
   7856                 schar = s[*index];
   7857             }
   7858         } else {
   7859             if(*index == len) {
   7860                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   7861             } else {
   7862                 schar = s[*index];
   7863             }
   7864         }
   7865 
   7866         while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   7867             offset++;
   7868         }
   7869 
   7870         if (schar == tchar) {
   7871             (*index)++;
   7872             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
   7873         }
   7874         else
   7875         {
   7876             if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
   7877                 return UCOL_BAIL_OUT_CE;
   7878             }
   7879             // skip completely ignorables
   7880             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   7881             if(isZeroCE == 0) { // we have to ignore completely ignorables
   7882                 (*index)++;
   7883                 continue;
   7884             }
   7885 
   7886             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   7887         }
   7888     }
   7889 }
   7890 
   7891 
   7892 /**
   7893  * This is a fast strcoll, geared towards text in Latin-1.
   7894  * It supports contractions of size two, French secondaries
   7895  * and case switching. You can use it with strengths primary
   7896  * to tertiary. It does not support shifted and case level.
   7897  * It relies on the table build by setupLatin1Table. If it
   7898  * doesn't understand something, it will go to the regular
   7899  * strcoll.
   7900  */
   7901 static UCollationResult
   7902 ucol_strcollUseLatin1( const UCollator    *coll,
   7903               const UChar        *source,
   7904               int32_t            sLen,
   7905               const UChar        *target,
   7906               int32_t            tLen,
   7907               UErrorCode *status)
   7908 {
   7909     U_ALIGN_CODE(16);
   7910     int32_t strength = coll->strength;
   7911 
   7912     int32_t sIndex = 0, tIndex = 0;
   7913     UChar sChar = 0, tChar = 0;
   7914     uint32_t sOrder=0, tOrder=0;
   7915 
   7916     UBool endOfSource = FALSE;
   7917 
   7918     uint32_t *elements = coll->latinOneCEs;
   7919 
   7920     UBool haveContractions = FALSE; // if we have contractions in our string
   7921                                     // we cannot do French secondary
   7922 
   7923     // Do the primary level
   7924     for(;;) {
   7925         while(sOrder==0) { // this loop skips primary ignorables
   7926             // sOrder=getNextlatinOneCE(source);
   7927             if(sLen==-1) {   // handling zero terminated strings
   7928                 sChar=source[sIndex++];
   7929                 if(sChar==0) {
   7930                     endOfSource = TRUE;
   7931                     break;
   7932                 }
   7933             } else {        // handling strings with known length
   7934                 if(sIndex==sLen) {
   7935                     endOfSource = TRUE;
   7936                     break;
   7937                 }
   7938                 sChar=source[sIndex++];
   7939             }
   7940             if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   7941                 //fprintf(stderr, "R");
   7942                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7943             }
   7944             sOrder = elements[sChar];
   7945             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
   7946                 // specials can basically be either contractions or bail-out signs. If we get anything
   7947                 // else, we'll bail out anywasy
   7948                 if(getCETag(sOrder) == CONTRACTION_TAG) {
   7949                     sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
   7950                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
   7951                     // However, if there are contractions in the table, but we always use just one char,
   7952                     // we might be able to do French. This should be checked out.
   7953                 }
   7954                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   7955                     //fprintf(stderr, "S");
   7956                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7957                 }
   7958             }
   7959         }
   7960 
   7961         while(tOrder==0) {  // this loop skips primary ignorables
   7962             // tOrder=getNextlatinOneCE(target);
   7963             if(tLen==-1) {    // handling zero terminated strings
   7964                 tChar=target[tIndex++];
   7965                 if(tChar==0) {
   7966                     if(endOfSource) { // this is different than source loop,
   7967                         // as we already know that source loop is done here,
   7968                         // so we can either finish the primary loop if both
   7969                         // strings are done or anounce the result if only
   7970                         // target is done. Same below.
   7971                         goto endOfPrimLoop;
   7972                     } else {
   7973                         return UCOL_GREATER;
   7974                     }
   7975                 }
   7976             } else {          // handling strings with known length
   7977                 if(tIndex==tLen) {
   7978                     if(endOfSource) {
   7979                         goto endOfPrimLoop;
   7980                     } else {
   7981                         return UCOL_GREATER;
   7982                     }
   7983                 }
   7984                 tChar=target[tIndex++];
   7985             }
   7986             if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   7987                 //fprintf(stderr, "R");
   7988                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7989             }
   7990             tOrder = elements[tChar];
   7991             if(tOrder >= UCOL_NOT_FOUND) {
   7992                 // Handling specials, see the comments for source
   7993                 if(getCETag(tOrder) == CONTRACTION_TAG) {
   7994                     tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
   7995                     haveContractions = TRUE;
   7996                 }
   7997                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   7998                     //fprintf(stderr, "S");
   7999                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8000                 }
   8001             }
   8002         }
   8003         if(endOfSource) { // source is finished, but target is not, say the result.
   8004             return UCOL_LESS;
   8005         }
   8006 
   8007         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
   8008             sOrder = 0; tOrder = 0;
   8009             continue;
   8010         } else {
   8011             // compare current top bytes
   8012             if(((sOrder^tOrder)&0xFF000000)!=0) {
   8013                 // top bytes differ, return difference
   8014                 if(sOrder < tOrder) {
   8015                     return UCOL_LESS;
   8016                 } else if(sOrder > tOrder) {
   8017                     return UCOL_GREATER;
   8018                 }
   8019                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
   8020                 // since we must return enum value
   8021             }
   8022 
   8023             // top bytes match, continue with following bytes
   8024             sOrder<<=8;
   8025             tOrder<<=8;
   8026         }
   8027     }
   8028 
   8029 endOfPrimLoop:
   8030     // after primary loop, we definitely know the sizes of strings,
   8031     // so we set it and use simpler loop for secondaries and tertiaries
   8032     sLen = sIndex; tLen = tIndex;
   8033     if(strength >= UCOL_SECONDARY) {
   8034         // adjust the table beggining
   8035         elements += coll->latinOneTableLen;
   8036         endOfSource = FALSE;
   8037 
   8038         if(coll->frenchCollation == UCOL_OFF) { // non French
   8039             // This loop is a simplified copy of primary loop
   8040             // at this point we know that whole strings are latin-1, so we don't
   8041             // check for that. We also know that we only have contractions as
   8042             // specials.
   8043             sIndex = 0; tIndex = 0;
   8044             for(;;) {
   8045                 while(sOrder==0) {
   8046                     if(sIndex==sLen) {
   8047                         endOfSource = TRUE;
   8048                         break;
   8049                     }
   8050                     sChar=source[sIndex++];
   8051                     sOrder = elements[sChar];
   8052                     if(sOrder > UCOL_NOT_FOUND) {
   8053                         sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
   8054                     }
   8055                 }
   8056 
   8057                 while(tOrder==0) {
   8058                     if(tIndex==tLen) {
   8059                         if(endOfSource) {
   8060                             goto endOfSecLoop;
   8061                         } else {
   8062                             return UCOL_GREATER;
   8063                         }
   8064                     }
   8065                     tChar=target[tIndex++];
   8066                     tOrder = elements[tChar];
   8067                     if(tOrder > UCOL_NOT_FOUND) {
   8068                         tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
   8069                     }
   8070                 }
   8071                 if(endOfSource) {
   8072                     return UCOL_LESS;
   8073                 }
   8074 
   8075                 if(sOrder == tOrder) {
   8076                     sOrder = 0; tOrder = 0;
   8077                     continue;
   8078                 } else {
   8079                     // see primary loop for comments on this
   8080                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   8081                         if(sOrder < tOrder) {
   8082                             return UCOL_LESS;
   8083                         } else if(sOrder > tOrder) {
   8084                             return UCOL_GREATER;
   8085                         }
   8086                     }
   8087                     sOrder<<=8;
   8088                     tOrder<<=8;
   8089                 }
   8090             }
   8091         } else { // French
   8092             if(haveContractions) { // if we have contractions, we have to bail out
   8093                 // since we don't really know how to handle them here
   8094                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8095             }
   8096             // For French, we go backwards
   8097             sIndex = sLen; tIndex = tLen;
   8098             for(;;) {
   8099                 while(sOrder==0) {
   8100                     if(sIndex==0) {
   8101                         endOfSource = TRUE;
   8102                         break;
   8103                     }
   8104                     sChar=source[--sIndex];
   8105                     sOrder = elements[sChar];
   8106                     // don't even look for contractions
   8107                 }
   8108 
   8109                 while(tOrder==0) {
   8110                     if(tIndex==0) {
   8111                         if(endOfSource) {
   8112                             goto endOfSecLoop;
   8113                         } else {
   8114                             return UCOL_GREATER;
   8115                         }
   8116                     }
   8117                     tChar=target[--tIndex];
   8118                     tOrder = elements[tChar];
   8119                     // don't even look for contractions
   8120                 }
   8121                 if(endOfSource) {
   8122                     return UCOL_LESS;
   8123                 }
   8124 
   8125                 if(sOrder == tOrder) {
   8126                     sOrder = 0; tOrder = 0;
   8127                     continue;
   8128                 } else {
   8129                     // see the primary loop for comments
   8130                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   8131                         if(sOrder < tOrder) {
   8132                             return UCOL_LESS;
   8133                         } else if(sOrder > tOrder) {
   8134                             return UCOL_GREATER;
   8135                         }
   8136                     }
   8137                     sOrder<<=8;
   8138                     tOrder<<=8;
   8139                 }
   8140             }
   8141         }
   8142     }
   8143 
   8144 endOfSecLoop:
   8145     if(strength >= UCOL_TERTIARY) {
   8146         // tertiary loop is the same as secondary (except no French)
   8147         elements += coll->latinOneTableLen;
   8148         sIndex = 0; tIndex = 0;
   8149         endOfSource = FALSE;
   8150         for(;;) {
   8151             while(sOrder==0) {
   8152                 if(sIndex==sLen) {
   8153                     endOfSource = TRUE;
   8154                     break;
   8155                 }
   8156                 sChar=source[sIndex++];
   8157                 sOrder = elements[sChar];
   8158                 if(sOrder > UCOL_NOT_FOUND) {
   8159                     sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
   8160                 }
   8161             }
   8162             while(tOrder==0) {
   8163                 if(tIndex==tLen) {
   8164                     if(endOfSource) {
   8165                         return UCOL_EQUAL; // if both strings are at the end, they are equal
   8166                     } else {
   8167                         return UCOL_GREATER;
   8168                     }
   8169                 }
   8170                 tChar=target[tIndex++];
   8171                 tOrder = elements[tChar];
   8172                 if(tOrder > UCOL_NOT_FOUND) {
   8173                     tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
   8174                 }
   8175             }
   8176             if(endOfSource) {
   8177                 return UCOL_LESS;
   8178             }
   8179             if(sOrder == tOrder) {
   8180                 sOrder = 0; tOrder = 0;
   8181                 continue;
   8182             } else {
   8183                 if(((sOrder^tOrder)&0xff000000)!=0) {
   8184                     if(sOrder < tOrder) {
   8185                         return UCOL_LESS;
   8186                     } else if(sOrder > tOrder) {
   8187                         return UCOL_GREATER;
   8188                     }
   8189                 }
   8190                 sOrder<<=8;
   8191                 tOrder<<=8;
   8192             }
   8193         }
   8194     }
   8195     return UCOL_EQUAL;
   8196 }
   8197 
   8198 
   8199 U_CAPI UCollationResult U_EXPORT2
   8200 ucol_strcollIter( const UCollator    *coll,
   8201                  UCharIterator *sIter,
   8202                  UCharIterator *tIter,
   8203                  UErrorCode         *status)
   8204 {
   8205     if(!status || U_FAILURE(*status)) {
   8206         return UCOL_EQUAL;
   8207     }
   8208 
   8209     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
   8210     UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
   8211 
   8212     if (sIter == tIter) {
   8213         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8214         return UCOL_EQUAL;
   8215     }
   8216     if(sIter == NULL || tIter == NULL || coll == NULL) {
   8217         *status = U_ILLEGAL_ARGUMENT_ERROR;
   8218         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8219         return UCOL_EQUAL;
   8220     }
   8221 
   8222     UCollationResult result = UCOL_EQUAL;
   8223 
   8224     // Preparing the context objects for iterating over strings
   8225     collIterate sColl, tColl;
   8226     IInit_collIterate(coll, NULL, -1, &sColl, status);
   8227     IInit_collIterate(coll, NULL, -1, &tColl, status);
   8228     if(U_FAILURE(*status)) {
   8229         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8230         return UCOL_EQUAL;
   8231     }
   8232     // The division for the array length may truncate the array size to
   8233     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   8234     // for all platforms anyway.
   8235     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8236     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8237     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
   8238 
   8239     sColl.iterator = sIter;
   8240     sColl.flags |= UCOL_USE_ITERATOR;
   8241     tColl.flags |= UCOL_USE_ITERATOR;
   8242     tColl.iterator = tIter;
   8243 
   8244     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
   8245         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   8246         sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
   8247         sColl.flags &= ~UCOL_ITER_NORM;
   8248 
   8249         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   8250         tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
   8251         tColl.flags &= ~UCOL_ITER_NORM;
   8252     }
   8253 
   8254     UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
   8255 
   8256     while((sChar = sColl.iterator->next(sColl.iterator)) ==
   8257         (tChar = tColl.iterator->next(tColl.iterator))) {
   8258             if(sChar == U_SENTINEL) {
   8259                 result = UCOL_EQUAL;
   8260                 goto end_compare;
   8261             }
   8262     }
   8263 
   8264     if(sChar == U_SENTINEL) {
   8265         tChar = tColl.iterator->previous(tColl.iterator);
   8266     }
   8267 
   8268     if(tChar == U_SENTINEL) {
   8269         sChar = sColl.iterator->previous(sColl.iterator);
   8270     }
   8271 
   8272     sChar = sColl.iterator->previous(sColl.iterator);
   8273     tChar = tColl.iterator->previous(tColl.iterator);
   8274 
   8275     if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
   8276     {
   8277         // We are stopped in the middle of a contraction.
   8278         // Scan backwards through the == part of the string looking for the start of the contraction.
   8279         //   It doesn't matter which string we scan, since they are the same in this region.
   8280         do
   8281         {
   8282             sChar = sColl.iterator->previous(sColl.iterator);
   8283             tChar = tColl.iterator->previous(tColl.iterator);
   8284         }
   8285         while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
   8286     }
   8287 
   8288 
   8289     if(U_SUCCESS(*status)) {
   8290         result = ucol_strcollRegular(&sColl, &tColl, status);
   8291     }
   8292 
   8293 end_compare:
   8294     if(sNormIter || tNormIter) {
   8295         unorm_closeIter(sNormIter);
   8296         unorm_closeIter(tNormIter);
   8297     }
   8298 
   8299     UTRACE_EXIT_VALUE_STATUS(result, *status)
   8300     return result;
   8301 }
   8302 
   8303 
   8304 /*                                                                      */
   8305 /* ucol_strcoll     Main public API string comparison function          */
   8306 /*                                                                      */
   8307 U_CAPI UCollationResult U_EXPORT2
   8308 ucol_strcoll( const UCollator    *coll,
   8309               const UChar        *source,
   8310               int32_t            sourceLength,
   8311               const UChar        *target,
   8312               int32_t            targetLength)
   8313 {
   8314     U_ALIGN_CODE(16);
   8315 
   8316     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
   8317     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   8318         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
   8319         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
   8320         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
   8321     }
   8322 
   8323     if(source == NULL || target == NULL) {
   8324         // do not crash, but return. Should have
   8325         // status argument to return error.
   8326         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8327         return UCOL_EQUAL;
   8328     }
   8329 
   8330     /* Quick check if source and target are same strings. */
   8331     /* They should either both be NULL terminated or the explicit length should be set on both. */
   8332     if (source==target && sourceLength==targetLength) {
   8333         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8334         return UCOL_EQUAL;
   8335     }
   8336 
   8337     /* Scan the strings.  Find:                                                             */
   8338     /*    The length of any leading portion that is equal                                   */
   8339     /*    Whether they are exactly equal.  (in which case we just return)                   */
   8340     const UChar    *pSrc    = source;
   8341     const UChar    *pTarg   = target;
   8342     int32_t        equalLength;
   8343 
   8344     if (sourceLength == -1 && targetLength == -1) {
   8345         // Both strings are null terminated.
   8346         //    Scan through any leading equal portion.
   8347         while (*pSrc == *pTarg && *pSrc != 0) {
   8348             pSrc++;
   8349             pTarg++;
   8350         }
   8351         if (*pSrc == 0 && *pTarg == 0) {
   8352             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8353             return UCOL_EQUAL;
   8354         }
   8355         equalLength = (int32_t)(pSrc - source);
   8356     }
   8357     else
   8358     {
   8359         // One or both strings has an explicit length.
   8360         const UChar    *pSrcEnd = source + sourceLength;
   8361         const UChar    *pTargEnd = target + targetLength;
   8362 
   8363         // Scan while the strings are bitwise ==, or until one is exhausted.
   8364         for (;;) {
   8365             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
   8366                 break;
   8367             }
   8368             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
   8369                 break;
   8370             }
   8371             if (*pSrc != *pTarg) {
   8372                 break;
   8373             }
   8374             pSrc++;
   8375             pTarg++;
   8376         }
   8377         equalLength = (int32_t)(pSrc - source);
   8378 
   8379         // If we made it all the way through both strings, we are done.  They are ==
   8380         if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
   8381             (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
   8382         {
   8383             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8384             return UCOL_EQUAL;
   8385         }
   8386     }
   8387     if (equalLength > 0) {
   8388         /* There is an identical portion at the beginning of the two strings.        */
   8389         /*   If the identical portion ends within a contraction or a comibining      */
   8390         /*   character sequence, back up to the start of that sequence.              */
   8391 
   8392         // These values should already be set by the code above.
   8393         //pSrc  = source + equalLength;        /* point to the first differing chars   */
   8394         //pTarg = target + equalLength;
   8395         if (pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
   8396             pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
   8397         {
   8398             // We are stopped in the middle of a contraction.
   8399             // Scan backwards through the == part of the string looking for the start of the contraction.
   8400             //   It doesn't matter which string we scan, since they are the same in this region.
   8401             do
   8402             {
   8403                 equalLength--;
   8404                 pSrc--;
   8405             }
   8406             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
   8407         }
   8408 
   8409         source += equalLength;
   8410         target += equalLength;
   8411         if (sourceLength > 0) {
   8412             sourceLength -= equalLength;
   8413         }
   8414         if (targetLength > 0) {
   8415             targetLength -= equalLength;
   8416         }
   8417     }
   8418 
   8419     UErrorCode status = U_ZERO_ERROR;
   8420     UCollationResult returnVal;
   8421     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
   8422         returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
   8423     } else {
   8424         returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
   8425     }
   8426     UTRACE_EXIT_VALUE(returnVal);
   8427     return returnVal;
   8428 }
   8429 
   8430 /* convenience function for comparing strings */
   8431 U_CAPI UBool U_EXPORT2
   8432 ucol_greater(    const    UCollator        *coll,
   8433         const    UChar            *source,
   8434         int32_t            sourceLength,
   8435         const    UChar            *target,
   8436         int32_t            targetLength)
   8437 {
   8438     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8439         == UCOL_GREATER);
   8440 }
   8441 
   8442 /* convenience function for comparing strings */
   8443 U_CAPI UBool U_EXPORT2
   8444 ucol_greaterOrEqual(    const    UCollator    *coll,
   8445             const    UChar        *source,
   8446             int32_t        sourceLength,
   8447             const    UChar        *target,
   8448             int32_t        targetLength)
   8449 {
   8450     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8451         != UCOL_LESS);
   8452 }
   8453 
   8454 /* convenience function for comparing strings */
   8455 U_CAPI UBool U_EXPORT2
   8456 ucol_equal(        const    UCollator        *coll,
   8457             const    UChar            *source,
   8458             int32_t            sourceLength,
   8459             const    UChar            *target,
   8460             int32_t            targetLength)
   8461 {
   8462     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8463         == UCOL_EQUAL);
   8464 }
   8465 
   8466 U_CAPI void U_EXPORT2
   8467 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
   8468     if(coll && coll->UCA) {
   8469         uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
   8470     }
   8471 }
   8472 
   8473 #endif /* #if !UCONFIG_NO_COLLATION */
   8474