Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 1996-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ucol.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 * Modification history
     12 * Date        Name      Comments
     13 * 1996-1999   various members of ICU team maintained C API for collation framework
     14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
     15 * 03/01/2001  synwee    Added maxexpansion functionality.
     16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_COLLATION
     22 
     23 #include "unicode/coleitr.h"
     24 #include "unicode/unorm.h"
     25 #include "unicode/udata.h"
     26 #include "unicode/ustring.h"
     27 
     28 #include "ucol_imp.h"
     29 #include "bocsu.h"
     30 
     31 #include "normalizer2impl.h"
     32 #include "unorm_it.h"
     33 #include "umutex.h"
     34 #include "cmemory.h"
     35 #include "ucln_in.h"
     36 #include "cstring.h"
     37 #include "utracimp.h"
     38 #include "putilimp.h"
     39 #include "uassert.h"
     40 
     41 #ifdef UCOL_DEBUG
     42 #include <stdio.h>
     43 #endif
     44 
     45 U_NAMESPACE_USE
     46 
     47 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     48 
     49 #define LAST_BYTE_MASK_           0xFF
     50 #define SECOND_LAST_BYTE_SHIFT_   8
     51 
     52 #define ZERO_CC_LIMIT_            0xC0
     53 
// This is a static pointer to the normalizer's fcdTrieIndex.
// It is always the same between calls to u_cleanup,
// and therefore writing to it is not synchronized.
// It is reset in ucol_cleanup.
     58 static const uint16_t *fcdTrieIndex=NULL;
     59 // Code points at fcdHighStart and above have a zero FCD value.
     60 static UChar32 fcdHighStart = 0;
     61 
// These are values from the UCA that are required for
// implicit generation and for suppressing sort key compression.
// They should regularly be in the UCA, but if one
// is running without the UCA, it could be a problem.
     66 static const int32_t maxRegularPrimary  = 0x7A;
     67 static const int32_t minImplicitPrimary = 0xE0;
     68 static const int32_t maxImplicitPrimary = 0xE4;
     69 
     70 U_CDECL_BEGIN
     71 static UBool U_CALLCONV
     72 ucol_cleanup(void)
     73 {
     74     fcdTrieIndex = NULL;
     75     return TRUE;
     76 }
     77 
     78 static int32_t U_CALLCONV
     79 _getFoldingOffset(uint32_t data) {
     80     return (int32_t)(data&0xFFFFFF);
     81 }
     82 
     83 U_CDECL_END
     84 
     85 // init FCD data
     86 static inline
     87 UBool initializeFCD(UErrorCode *status) {
     88     if (fcdTrieIndex != NULL) {
     89         return TRUE;
     90     } else {
     91         // The result is constant, until the library is reloaded.
     92         fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
     93         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
     94         return U_SUCCESS(*status);
     95     }
     96 }
     97 
     98 static
     99 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
    100                               int32_t sourceLen, collIterate *s,
    101                               UErrorCode *status)
    102 {
    103     (s)->string = (s)->pos = sourceString;
    104     (s)->origFlags = 0;
    105     (s)->flags = 0;
    106     if (sourceLen >= 0) {
    107         s->flags |= UCOL_ITER_HASLEN;
    108         (s)->endp = (UChar *)sourceString+sourceLen;
    109     }
    110     else {
    111         /* change to enable easier checking for end of string for fcdpositon */
    112         (s)->endp = NULL;
    113     }
    114     (s)->extendCEs = NULL;
    115     (s)->extendCEsSize = 0;
    116     (s)->CEpos = (s)->toReturn = (s)->CEs;
    117     (s)->offsetBuffer = NULL;
    118     (s)->offsetBufferSize = 0;
    119     (s)->offsetReturn = (s)->offsetStore = NULL;
    120     (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
    121     (s)->coll = (collator);
    122     (s)->nfd = Normalizer2Factory::getNFDInstance(*status);
    123     (s)->fcdPosition = 0;
    124     if(collator->normalizationMode == UCOL_ON) {
    125         (s)->flags |= UCOL_ITER_NORM;
    126     }
    127     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
    128         (s)->flags |= UCOL_HIRAGANA_Q;
    129     }
    130     (s)->iterator = NULL;
    131     //(s)->iteratorIndex = 0;
    132 }
    133 
    134 U_CAPI void  U_EXPORT2
    135 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
    136                              int32_t sourceLen, collIterate *s,
    137                              UErrorCode *status) {
    138     /* Out-of-line version for use from other files. */
    139     IInit_collIterate(collator, sourceString, sourceLen, s, status);
    140 }
    141 
    142 U_CAPI collIterate * U_EXPORT2
    143 uprv_new_collIterate(UErrorCode *status) {
    144     if(U_FAILURE(*status)) {
    145         return NULL;
    146     }
    147     collIterate *s = new collIterate;
    148     if(s == NULL) {
    149         *status = U_MEMORY_ALLOCATION_ERROR;
    150         return NULL;
    151     }
    152     return s;
    153 }
    154 
    155 U_CAPI void U_EXPORT2
    156 uprv_delete_collIterate(collIterate *s) {
    157     delete s;
    158 }
    159 
    160 U_CAPI UBool U_EXPORT2
    161 uprv_collIterateAtEnd(collIterate *s) {
    162     return s == NULL || s->pos == s->endp;
    163 }
    164 
    165 /**
    166 * Backup the state of the collIterate struct data
    167 * @param data collIterate to backup
    168 * @param backup storage
    169 */
    170 static
    171 inline void backupState(const collIterate *data, collIterateState *backup)
    172 {
    173     backup->fcdPosition = data->fcdPosition;
    174     backup->flags       = data->flags;
    175     backup->origFlags   = data->origFlags;
    176     backup->pos         = data->pos;
    177     backup->bufferaddress = data->writableBuffer.getBuffer();
    178     backup->buffersize    = data->writableBuffer.length();
    179     backup->iteratorMove = 0;
    180     backup->iteratorIndex = 0;
    181     if(data->iterator != NULL) {
    182         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
    183         backup->iteratorIndex = data->iterator->getState(data->iterator);
    184         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
    185         if(backup->iteratorIndex == UITER_NO_STATE) {
    186             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
    187                 backup->iteratorMove++;
    188                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
    189             }
    190             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    191         }
    192     }
    193 }
    194 
    195 /**
    196 * Loads the state into the collIterate struct data
    197 * @param data collIterate to backup
    198 * @param backup storage
    199 * @param forwards boolean to indicate if forwards iteration is used,
    200 *        false indicates backwards iteration
    201 */
    202 static
    203 inline void loadState(collIterate *data, const collIterateState *backup,
    204                       UBool        forwards)
    205 {
    206     UErrorCode status = U_ZERO_ERROR;
    207     data->flags       = backup->flags;
    208     data->origFlags   = backup->origFlags;
    209     if(data->iterator != NULL) {
    210         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
    211         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
    212         if(backup->iteratorMove != 0) {
    213             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    214         }
    215     }
    216     data->pos         = backup->pos;
    217 
    218     if ((data->flags & UCOL_ITER_INNORMBUF) &&
    219         data->writableBuffer.getBuffer() != backup->bufferaddress) {
    220         /*
    221         this is when a new buffer has been reallocated and we'll have to
    222         calculate the new position.
    223         note the new buffer has to contain the contents of the old buffer.
    224         */
    225         if (forwards) {
    226             data->pos = data->writableBuffer.getTerminatedBuffer() +
    227                                          (data->pos - backup->bufferaddress);
    228         }
    229         else {
    230             /* backwards direction */
    231             int32_t temp = backup->buffersize -
    232                                   (int32_t)(data->pos - backup->bufferaddress);
    233             data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
    234         }
    235     }
    236     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
    237         /*
    238         this is alittle tricky.
    239         if we are initially not in the normalization buffer, even if we
    240         normalize in the later stage, the data in the buffer will be
    241         ignored, since we skip back up to the data string.
    242         however if we are already in the normalization buffer, any
    243         further normalization will pull data into the normalization
    244         buffer and modify the fcdPosition.
    245         since we are keeping the data in the buffer for use, the
    246         fcdPosition can not be reverted back.
    247         arrgghh....
    248         */
    249         data->fcdPosition = backup->fcdPosition;
    250     }
    251 }
    252 
    253 static UBool
    254 reallocCEs(collIterate *data, int32_t newCapacity) {
    255     uint32_t *oldCEs = data->extendCEs;
    256     if(oldCEs == NULL) {
    257         oldCEs = data->CEs;
    258     }
    259     int32_t length = data->CEpos - oldCEs;
    260     uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
    261     if(newCEs == NULL) {
    262         return FALSE;
    263     }
    264     uprv_memcpy(newCEs, oldCEs, length * 4);
    265     uprv_free(data->extendCEs);
    266     data->extendCEs = newCEs;
    267     data->extendCEsSize = newCapacity;
    268     data->CEpos = newCEs + length;
    269     return TRUE;
    270 }
    271 
    272 static UBool
    273 increaseCEsCapacity(collIterate *data) {
    274     int32_t oldCapacity;
    275     if(data->extendCEs != NULL) {
    276         oldCapacity = data->extendCEsSize;
    277     } else {
    278         oldCapacity = LENGTHOF(data->CEs);
    279     }
    280     return reallocCEs(data, 2 * oldCapacity);
    281 }
    282 
    283 static UBool
    284 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
    285     int32_t oldCapacity;
    286     if(data->extendCEs != NULL) {
    287         oldCapacity = data->extendCEsSize;
    288     } else {
    289         oldCapacity = LENGTHOF(data->CEs);
    290     }
    291     if(minCapacity <= oldCapacity) {
    292         return TRUE;
    293     }
    294     oldCapacity *= 2;
    295     return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
    296 }
    297 
    298 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
    299     if(U_FAILURE(errorCode)) {
    300         return;
    301     }
    302     int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
    303     if(length >= offsetBufferSize) {
    304         int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
    305         int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4));
    306         if(newBuffer == NULL) {
    307             errorCode = U_MEMORY_ALLOCATION_ERROR;
    308             return;
    309         }
    310         if(length > 0) {
    311             uprv_memcpy(newBuffer, offsetBuffer, length * 4);
    312         }
    313         uprv_free(offsetBuffer);
    314         offsetBuffer = newBuffer;
    315         offsetStore = offsetBuffer + length;
    316         offsetBufferSize = newCapacity;
    317     }
    318     *offsetStore++ = offset;
    319 }
    320 
    321 /*
    322 * collIter_eos()
    323 *     Checks for a collIterate being positioned at the end of
    324 *     its source string.
    325 *
    326 */
    327 static
    328 inline UBool collIter_eos(collIterate *s) {
    329     if(s->flags & UCOL_USE_ITERATOR) {
    330       return !(s->iterator->hasNext(s->iterator));
    331     }
    332     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
    333         // Null terminated string, but not at null, so not at end.
    334         //   Whether in main or normalization buffer doesn't matter.
    335         return FALSE;
    336     }
    337 
    338     // String with length.  Can't be in normalization buffer, which is always
    339     //  null termintated.
    340     if (s->flags & UCOL_ITER_HASLEN) {
    341         return (s->pos == s->endp);
    342     }
    343 
    344     // We are at a null termination, could be either normalization buffer or main string.
    345     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
    346         // At null at end of main string.
    347         return TRUE;
    348     }
    349 
    350     // At null at end of normalization buffer.  Need to check whether there there are
    351     //   any characters left in the main buffer.
    352     if(s->origFlags & UCOL_USE_ITERATOR) {
    353       return !(s->iterator->hasNext(s->iterator));
    354     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
    355         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
    356         return (*s->fcdPosition == 0);
    357     }
    358     else {
    359         // Main string with an end pointer.
    360         return s->fcdPosition == s->endp;
    361     }
    362 }
    363 
    364 /*
    365 * collIter_bos()
    366 *     Checks for a collIterate being positioned at the start of
    367 *     its source string.
    368 *
    369 */
    370 static
    371 inline UBool collIter_bos(collIterate *source) {
    372   // if we're going backwards, we need to know whether there is more in the
    373   // iterator, even if we are in the side buffer
    374   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    375     return !source->iterator->hasPrevious(source->iterator);
    376   }
    377   if (source->pos <= source->string ||
    378       ((source->flags & UCOL_ITER_INNORMBUF) &&
    379       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
    380     return TRUE;
    381   }
    382   return FALSE;
    383 }
    384 
    385 /*static
    386 inline UBool collIter_SimpleBos(collIterate *source) {
    387   // if we're going backwards, we need to know whether there is more in the
    388   // iterator, even if we are in the side buffer
    389   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    390     return !source->iterator->hasPrevious(source->iterator);
    391   }
    392   if (source->pos == source->string) {
    393     return TRUE;
    394   }
    395   return FALSE;
    396 }*/
    397     //return (data->pos == data->string) ||
    398 
    399 
    400 /****************************************************************************/
    401 /* Following are the open/close functions                                   */
    402 /*                                                                          */
    403 /****************************************************************************/
    404 
    405 static UCollator*
    406 ucol_initFromBinary(const uint8_t *bin, int32_t length,
    407                 const UCollator *base,
    408                 UCollator *fillIn,
    409                 UErrorCode *status)
    410 {
    411     UCollator *result = fillIn;
    412     if(U_FAILURE(*status)) {
    413         return NULL;
    414     }
    415     /*
    416     if(base == NULL) {
    417         // we don't support null base yet
    418         *status = U_ILLEGAL_ARGUMENT_ERROR;
    419         return NULL;
    420     }
    421     */
    422     // We need these and we could be running without UCA
    423     uprv_uca_initImplicitConstants(status);
    424     UCATableHeader *colData = (UCATableHeader *)bin;
    425     // do we want version check here? We're trying to figure out whether collators are compatible
    426     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
    427         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
    428         colData->version[0] != UCOL_BUILDER_VERSION)
    429     {
    430         *status = U_COLLATOR_VERSION_MISMATCH;
    431         return NULL;
    432     }
    433     else {
    434         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
    435             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
    436             if(U_FAILURE(*status)){
    437                 return NULL;
    438             }
    439             result->hasRealData = TRUE;
    440         }
    441         else {
    442             if(base) {
    443                 result = ucol_initCollator(base->image, result, base, status);
    444                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
    445                 if(U_FAILURE(*status)){
    446                     return NULL;
    447                 }
    448                 result->hasRealData = FALSE;
    449             }
    450             else {
    451                 *status = U_USELESS_COLLATOR_ERROR;
    452                 return NULL;
    453             }
    454         }
    455         result->freeImageOnClose = FALSE;
    456     }
    457     result->actualLocale = NULL;
    458     result->validLocale = NULL;
    459     result->requestedLocale = NULL;
    460     result->rules = NULL;
    461     result->rulesLength = 0;
    462     result->freeRulesOnClose = FALSE;
    463     result->ucaRules = NULL;
    464     return result;
    465 }
    466 
    467 U_CAPI UCollator* U_EXPORT2
    468 ucol_openBinary(const uint8_t *bin, int32_t length,
    469                 const UCollator *base,
    470                 UErrorCode *status)
    471 {
    472     return ucol_initFromBinary(bin, length, base, NULL, status);
    473 }
    474 
    475 U_CAPI int32_t U_EXPORT2
    476 ucol_cloneBinary(const UCollator *coll,
    477                  uint8_t *buffer, int32_t capacity,
    478                  UErrorCode *status)
    479 {
    480     int32_t length = 0;
    481     if(U_FAILURE(*status)) {
    482         return length;
    483     }
    484     if(capacity < 0) {
    485         *status = U_ILLEGAL_ARGUMENT_ERROR;
    486         return length;
    487     }
    488     if(coll->hasRealData == TRUE) {
    489         length = coll->image->size;
    490         if(length <= capacity) {
    491             uprv_memcpy(buffer, coll->image, length);
    492         } else {
    493             *status = U_BUFFER_OVERFLOW_ERROR;
    494         }
    495     } else {
    496         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    497         if(length <= capacity) {
    498             /* build the UCATableHeader with minimal entries */
    499             /* do not copy the header from the UCA file because its values are wrong! */
    500             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    501 
    502             /* reset everything */
    503             uprv_memset(buffer, 0, length);
    504 
    505             /* set the tailoring-specific values */
    506             UCATableHeader *myData = (UCATableHeader *)buffer;
    507             myData->size = length;
    508 
    509             /* offset for the options, the only part of the data that is present after the header */
    510             myData->options = sizeof(UCATableHeader);
    511 
    512             /* need to always set the expansion value for an upper bound of the options */
    513             myData->expansion = myData->options + sizeof(UColOptionSet);
    514 
    515             myData->magic = UCOL_HEADER_MAGIC;
    516             myData->isBigEndian = U_IS_BIG_ENDIAN;
    517             myData->charSetFamily = U_CHARSET_FAMILY;
    518 
    519             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    520             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    521 
    522             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    523             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    524             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    525             myData->jamoSpecial = coll->image->jamoSpecial;
    526 
    527             /* copy the collator options */
    528             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    529         } else {
    530             *status = U_BUFFER_OVERFLOW_ERROR;
    531         }
    532     }
    533     return length;
    534 }
    535 
    536 U_CAPI UCollator* U_EXPORT2
    537 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
    538 {
    539     UCollator * localCollator;
    540     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
    541     char *stackBufferChars = (char *)stackBuffer;
    542     int32_t imageSize = 0;
    543     int32_t rulesSize = 0;
    544     int32_t rulesPadding = 0;
    545     uint8_t *image;
    546     UChar *rules;
    547     UBool colAllocated = FALSE;
    548     UBool imageAllocated = FALSE;
    549 
    550     if (status == NULL || U_FAILURE(*status)){
    551         return 0;
    552     }
    553     if ((stackBuffer && !pBufferSize) || !coll){
    554        *status = U_ILLEGAL_ARGUMENT_ERROR;
    555         return 0;
    556     }
    557     if (coll->rules && coll->freeRulesOnClose) {
    558         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
    559         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
    560         bufferSizeNeeded += rulesSize + rulesPadding;
    561     }
    562 
    563     if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
    564         *pBufferSize =  bufferSizeNeeded;
    565         return 0;
    566     }
    567 
    568     /* Pointers on 64-bit platforms need to be aligned
    569      * on a 64-bit boundry in memory.
    570      */
    571     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
    572         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
    573         if (*pBufferSize > offsetUp) {
    574             *pBufferSize -= offsetUp;
    575             stackBufferChars += offsetUp;
    576         }
    577         else {
    578             /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
    579             *pBufferSize = 1;
    580         }
    581     }
    582     stackBuffer = (void *)stackBufferChars;
    583 
    584     if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
    585         /* allocate one here...*/
    586         stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
    587         // Null pointer check.
    588         if (stackBufferChars == NULL) {
    589             *status = U_MEMORY_ALLOCATION_ERROR;
    590             return NULL;
    591         }
    592         colAllocated = TRUE;
    593         if (U_SUCCESS(*status)) {
    594             *status = U_SAFECLONE_ALLOCATED_WARNING;
    595         }
    596     }
    597     localCollator = (UCollator *)stackBufferChars;
    598     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
    599     {
    600         UErrorCode tempStatus = U_ZERO_ERROR;
    601         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
    602     }
    603     if (coll->freeImageOnClose) {
    604         image = (uint8_t *)uprv_malloc(imageSize);
    605         // Null pointer check
    606         if (image == NULL) {
    607             *status = U_MEMORY_ALLOCATION_ERROR;
    608             return NULL;
    609         }
    610         ucol_cloneBinary(coll, image, imageSize, status);
    611         imageAllocated = TRUE;
    612     }
    613     else {
    614         image = (uint8_t *)coll->image;
    615     }
    616     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
    617     if (U_FAILURE(*status)) {
    618         return NULL;
    619     }
    620 
    621     if (coll->rules) {
    622         if (coll->freeRulesOnClose) {
    623             localCollator->rules = u_strcpy(rules, coll->rules);
    624             //bufferEnd += rulesSize;
    625         }
    626         else {
    627             localCollator->rules = coll->rules;
    628         }
    629         localCollator->freeRulesOnClose = FALSE;
    630         localCollator->rulesLength = coll->rulesLength;
    631     }
    632 
    633     int32_t i;
    634     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
    635         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
    636     }
    637     // zero copies of pointers
    638     localCollator->actualLocale = NULL;
    639     localCollator->validLocale = NULL;
    640     localCollator->requestedLocale = NULL;
    641     localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
    642     localCollator->freeOnClose = colAllocated;
    643     localCollator->freeImageOnClose = imageAllocated;
    644     return localCollator;
    645 }
    646 
    647 U_CAPI void U_EXPORT2
    648 ucol_close(UCollator *coll)
    649 {
    650     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
    651     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
    652     if(coll != NULL) {
    653         // these are always owned by each UCollator struct,
    654         // so we always free them
    655         if(coll->validLocale != NULL) {
    656             uprv_free(coll->validLocale);
    657         }
    658         if(coll->actualLocale != NULL) {
    659             uprv_free(coll->actualLocale);
    660         }
    661         if(coll->requestedLocale != NULL) {
    662             uprv_free(coll->requestedLocale);
    663         }
    664         if(coll->latinOneCEs != NULL) {
    665             uprv_free(coll->latinOneCEs);
    666         }
    667         if(coll->options != NULL && coll->freeOptionsOnClose) {
    668             uprv_free(coll->options);
    669         }
    670         if(coll->rules != NULL && coll->freeRulesOnClose) {
    671             uprv_free((UChar *)coll->rules);
    672         }
    673         if(coll->image != NULL && coll->freeImageOnClose) {
    674             uprv_free((UCATableHeader *)coll->image);
    675         }
    676         if(coll->leadBytePermutationTable != NULL) {
    677             uprv_free(coll->leadBytePermutationTable);
    678         }
    679         if(coll->reorderCodes != NULL) {
    680             uprv_free(coll->reorderCodes);
    681         }
    682 
    683         /* Here, it would be advisable to close: */
    684         /* - UData for UCA (unless we stuff it in the root resb */
    685         /* Again, do we need additional housekeeping... HMMM! */
    686         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
    687         if(coll->freeOnClose){
    688             /* for safeClone, if freeOnClose is FALSE,
    689             don't free the other instance data */
    690             uprv_free(coll);
    691         }
    692     }
    693     UTRACE_EXIT();
    694 }
    695 
    696 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
    697 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
    698 U_CFUNC uint8_t* U_EXPORT2
    699 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
    700 {
    701     uint8_t *result = NULL;
    702     if(U_FAILURE(*status)) {
    703         return NULL;
    704     }
    705     if(coll->hasRealData == TRUE) {
    706         *length = coll->image->size;
    707         result = (uint8_t *)uprv_malloc(*length);
    708         /* test for NULL */
    709         if (result == NULL) {
    710             *status = U_MEMORY_ALLOCATION_ERROR;
    711             return NULL;
    712         }
    713         uprv_memcpy(result, coll->image, *length);
    714     } else {
    715         *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    716         result = (uint8_t *)uprv_malloc(*length);
    717         /* test for NULL */
    718         if (result == NULL) {
    719             *status = U_MEMORY_ALLOCATION_ERROR;
    720             return NULL;
    721         }
    722 
    723         /* build the UCATableHeader with minimal entries */
    724         /* do not copy the header from the UCA file because its values are wrong! */
    725         /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    726 
    727         /* reset everything */
    728         uprv_memset(result, 0, *length);
    729 
    730         /* set the tailoring-specific values */
    731         UCATableHeader *myData = (UCATableHeader *)result;
    732         myData->size = *length;
    733 
    734         /* offset for the options, the only part of the data that is present after the header */
    735         myData->options = sizeof(UCATableHeader);
    736 
    737         /* need to always set the expansion value for an upper bound of the options */
    738         myData->expansion = myData->options + sizeof(UColOptionSet);
    739 
    740         myData->magic = UCOL_HEADER_MAGIC;
    741         myData->isBigEndian = U_IS_BIG_ENDIAN;
    742         myData->charSetFamily = U_CHARSET_FAMILY;
    743 
    744         /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    745         uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    746 
    747         uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    748         uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    749         uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    750         myData->jamoSpecial = coll->image->jamoSpecial;
    751 
    752         /* copy the collator options */
    753         uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    754     }
    755     return result;
    756 }
    757 
    758 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    759     if(U_FAILURE(*status)) {
    760         return;
    761     }
    762     result->caseFirst = (UColAttributeValue)opts->caseFirst;
    763     result->caseLevel = (UColAttributeValue)opts->caseLevel;
    764     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
    765     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
    766     if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
    767         return;
    768     }
    769     result->strength = (UColAttributeValue)opts->strength;
    770     result->variableTopValue = opts->variableTopValue;
    771     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
    772     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
    773     result->numericCollation = (UColAttributeValue)opts->numericCollation;
    774     result->caseFirstisDefault = TRUE;
    775     result->caseLevelisDefault = TRUE;
    776     result->frenchCollationisDefault = TRUE;
    777     result->normalizationModeisDefault = TRUE;
    778     result->strengthisDefault = TRUE;
    779     result->variableTopValueisDefault = TRUE;
    780     result->alternateHandlingisDefault = TRUE;
    781     result->hiraganaQisDefault = TRUE;
    782     result->numericCollationisDefault = TRUE;
    783 
    784     ucol_updateInternalState(result, status);
    785 
    786     result->options = opts;
    787 }
    788 
    789 
    790 /**
    791 * Approximate determination if a character is at a contraction end.
    792 * Guaranteed to be TRUE if a character is at the end of a contraction,
    793 * otherwise it is not deterministic.
    794 * @param c character to be determined
    795 * @param coll collator
    796 */
    797 static
    798 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
    799     if (c < coll->minContrEndCP) {
    800         return FALSE;
    801     }
    802 
    803     int32_t  hash = c;
    804     uint8_t  htbyte;
    805     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
    806         if (U16_IS_TRAIL(c)) {
    807             return TRUE;
    808         }
    809         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
    810     }
    811     htbyte = coll->contrEndCP[hash>>3];
    812     return (((htbyte >> (hash & 7)) & 1) == 1);
    813 }
    814 
    815 
    816 
    817 /*
    818 *   i_getCombiningClass()
    819 *        A fast, at least partly inline version of u_getCombiningClass()
    820 *        This is a candidate for further optimization.  Used heavily
    821 *        in contraction processing.
    822 */
    823 static
    824 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
    825     uint8_t sCC = 0;
    826     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
    827         sCC = u_getCombiningClass(c);
    828     }
    829     return sCC;
    830 }
    831 
    832 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    833     UChar c;
    834     UCollator *result = fillIn;
    835     if(U_FAILURE(*status) || image == NULL) {
    836         return NULL;
    837     }
    838 
    839     if(result == NULL) {
    840         result = (UCollator *)uprv_malloc(sizeof(UCollator));
    841         if(result == NULL) {
    842             *status = U_MEMORY_ALLOCATION_ERROR;
    843             return result;
    844         }
    845         result->freeOnClose = TRUE;
    846     } else {
    847         result->freeOnClose = FALSE;
    848     }
    849 
    850     result->image = image;
    851     result->mapping.getFoldingOffset = _getFoldingOffset;
    852     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
    853     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
    854     if(U_FAILURE(*status)) {
    855         if(result->freeOnClose == TRUE) {
    856             uprv_free(result);
    857             result = NULL;
    858         }
    859         return result;
    860     }
    861 
    862     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    863     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
    864     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
    865     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
    866     result->rules = NULL;
    867     result->rulesLength = 0;
    868     result->freeRulesOnClose = FALSE;
    869     result->reorderCodes = NULL;
    870     result->reorderCodesLength = 0;
    871     result->leadBytePermutationTable = NULL;
    872 
    873     /* get the version info from UCATableHeader and populate the Collator struct*/
    874     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
    875     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
    876     result->dataVersion[2] = 0;
    877     result->dataVersion[3] = 0;
    878 
    879     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
    880     result->minUnsafeCP = 0;
    881     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
    882         if (ucol_unsafeCP(c, result)) break;
    883     }
    884     result->minUnsafeCP = c;
    885 
    886     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
    887     result->minContrEndCP = 0;
    888     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
    889         if (ucol_contractionEndCP(c, result)) break;
    890     }
    891     result->minContrEndCP = c;
    892 
    893     /* max expansion tables */
    894     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
    895                                          result->image->endExpansionCE);
    896     result->lastEndExpansionCE = result->endExpansionCE +
    897                                  result->image->endExpansionCECount - 1;
    898     result->expansionCESize = (uint8_t*)result->image +
    899                                                result->image->expansionCESize;
    900 
    901 
    902     //result->errorCode = *status;
    903 
    904     result->latinOneCEs = NULL;
    905 
    906     result->latinOneRegenTable = FALSE;
    907     result->latinOneFailed = FALSE;
    908     result->UCA = UCA;
    909 
    910     /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    911     result->ucaRules = NULL;
    912     result->actualLocale = NULL;
    913     result->validLocale = NULL;
    914     result->requestedLocale = NULL;
    915     result->hasRealData = FALSE; // real data lives in .dat file...
    916     result->freeImageOnClose = FALSE;
    917 
    918     /* set attributes */
    919     ucol_setOptionsFromHeader(
    920         result,
    921         (UColOptionSet*)((uint8_t*)result->image+result->image->options),
    922         status);
    923     result->freeOptionsOnClose = FALSE;
    924 
    925     return result;
    926 }
    927 
    928 /* new Mark's code */
    929 
    930 /**
    931  * For generation of Implicit CEs
    932  * @author Davis
    933  *
    934  * Cleaned up so that changes can be made more easily.
    935  * Old values:
    936 # First Implicit: E26A792D
    937 # Last Implicit: E3DC70C0
    938 # First CJK: E0030300
    939 # Last CJK: E0A9DD00
    940 # First CJK_A: E0A9DF00
    941 # Last CJK_A: E0DE3100
    942  */
    943 /* Following is a port of Mark's code for new treatment of implicits.
    944  * It is positioned here, since ucol_initUCA need to initialize the
    945  * variables below according to the data in the fractional UCA.
    946  */
    947 
    948 /**
    949  * Function used to:
    950  * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
    951  * b) bump any non-CJK characters by 10FFFF.
    952  * The relevant blocks are:
    953  * A:    4E00..9FFF; CJK Unified Ideographs
    954  *       F900..FAFF; CJK Compatibility Ideographs
    955  * B:    3400..4DBF; CJK Unified Ideographs Extension A
    956  *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
    957  * As long as
    958  *   no new B characters are allocated between 4E00 and FAFF, and
    959  *   no new A characters are outside of this range,
    960  * (very high probability) this simple code will work.
    961  * The reordered blocks are:
    962  * Block1 is CJK
    963  * Block2 is CJK_COMPAT_USED
    964  * Block3 is CJK_A
    965  * (all contiguous)
    966  * Any other CJK gets its normal code point
    967  * Any non-CJK gets +10FFFF
    968  * When we reorder Block1, we make sure that it is at the very start,
    969  * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
    971  * NOT decomposed, so that block is smaller!
    972  */
    973 
    974 // CONSTANTS
    975 static const UChar32
    976     NON_CJK_OFFSET = 0x110000,
    977     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
    978 
    979 /**
    980  * Precomputed by initImplicitConstants()
    981  */
    982 static int32_t
    983     final3Multiplier = 0,
    984     final4Multiplier = 0,
    985     final3Count = 0,
    986     final4Count = 0,
    987     medialCount = 0,
    988     min3Primary = 0,
    989     min4Primary = 0,
    990     max4Primary = 0,
    991     minTrail = 0,
    992     maxTrail = 0,
    993     max3Trail = 0,
    994     max4Trail = 0,
    995     min4Boundary = 0;
    996 
    997 static const UChar32
    998     // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    999     // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
   1000     CJK_BASE = 0x4E00,
   1001     CJK_LIMIT = 0x9FCB+1,
   1002     // Unified CJK ideographs in the compatibility ideographs block.
   1003     CJK_COMPAT_USED_BASE = 0xFA0E,
   1004     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
   1005     // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
   1006     // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
   1007     CJK_A_BASE = 0x3400,
   1008     CJK_A_LIMIT = 0x4DB5+1,
   1009     // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
   1010     // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
   1011     CJK_B_BASE = 0x20000,
   1012     CJK_B_LIMIT = 0x2A6D6+1,
   1013     // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
   1014     // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
   1015     CJK_C_BASE = 0x2A700,
   1016     CJK_C_LIMIT = 0x2B734+1,
   1017     // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
   1018     // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
   1019     CJK_D_BASE = 0x2B740,
   1020     CJK_D_LIMIT = 0x2B81D+1;
   1021     // when adding to this list, look for all occurrences (in project)
   1022     // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
   1023 
   1024 static UChar32 swapCJK(UChar32 i) {
   1025     if (i < CJK_A_BASE) {
   1026         // non-CJK
   1027     } else if (i < CJK_A_LIMIT) {
   1028         // Extension A has lower code points than the original Unihan+compat
   1029         // but sorts higher.
   1030         return i - CJK_A_BASE
   1031                 + (CJK_LIMIT - CJK_BASE)
   1032                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1033     } else if (i < CJK_BASE) {
   1034         // non-CJK
   1035     } else if (i < CJK_LIMIT) {
   1036         return i - CJK_BASE;
   1037     } else if (i < CJK_COMPAT_USED_BASE) {
   1038         // non-CJK
   1039     } else if (i < CJK_COMPAT_USED_LIMIT) {
   1040         return i - CJK_COMPAT_USED_BASE
   1041                 + (CJK_LIMIT - CJK_BASE);
   1042     } else if (i < CJK_B_BASE) {
   1043         // non-CJK
   1044     } else if (i < CJK_B_LIMIT) {
   1045         return i; // non-BMP-CJK
   1046     } else if (i < CJK_C_BASE) {
   1047         // non-CJK
   1048     } else if (i < CJK_C_LIMIT) {
   1049         return i; // non-BMP-CJK
   1050     } else if (i < CJK_D_BASE) {
   1051         // non-CJK
   1052     } else if (i < CJK_D_LIMIT) {
   1053         return i; // non-BMP-CJK
   1054     }
   1055     return i + NON_CJK_OFFSET; // non-CJK
   1056 }
   1057 
   1058 U_CAPI UChar32 U_EXPORT2
   1059 uprv_uca_getRawFromCodePoint(UChar32 i) {
   1060     return swapCJK(i)+1;
   1061 }
   1062 
   1063 U_CAPI UChar32 U_EXPORT2
   1064 uprv_uca_getCodePointFromRaw(UChar32 i) {
   1065     i--;
   1066     UChar32 result = 0;
   1067     if(i >= NON_CJK_OFFSET) {
   1068         result = i - NON_CJK_OFFSET;
   1069     } else if(i >= CJK_B_BASE) {
   1070         result = i;
   1071     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
   1072         if(i < CJK_LIMIT - CJK_BASE) {
   1073             result = i + CJK_BASE;
   1074         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
   1075             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
   1076         } else {
   1077             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1078         }
   1079     } else {
   1080         result = -1;
   1081     }
   1082     return result;
   1083 }
   1084 
   1085 // GET IMPLICIT PRIMARY WEIGHTS
   1086 // Return value is left justified primary key
   1087 U_CAPI uint32_t U_EXPORT2
   1088 uprv_uca_getImplicitFromRaw(UChar32 cp) {
   1089     /*
   1090     if (cp < 0 || cp > UCOL_MAX_INPUT) {
   1091         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
   1092     }
   1093     */
   1094     int32_t last0 = cp - min4Boundary;
   1095     if (last0 < 0) {
   1096         int32_t last1 = cp / final3Count;
   1097         last0 = cp % final3Count;
   1098 
   1099         int32_t last2 = last1 / medialCount;
   1100         last1 %= medialCount;
   1101 
   1102         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
   1103         last1 = minTrail + last1; // offset
   1104         last2 = min3Primary + last2; // offset
   1105         /*
   1106         if (last2 >= min4Primary) {
   1107             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
   1108         }
   1109         */
   1110         return (last2 << 24) + (last1 << 16) + (last0 << 8);
   1111     } else {
   1112         int32_t last1 = last0 / final4Count;
   1113         last0 %= final4Count;
   1114 
   1115         int32_t last2 = last1 / medialCount;
   1116         last1 %= medialCount;
   1117 
   1118         int32_t last3 = last2 / medialCount;
   1119         last2 %= medialCount;
   1120 
   1121         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
   1122         last1 = minTrail + last1; // offset
   1123         last2 = minTrail + last2; // offset
   1124         last3 = min4Primary + last3; // offset
   1125         /*
   1126         if (last3 > max4Primary) {
   1127             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
   1128         }
   1129         */
   1130         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
   1131     }
   1132 }
   1133 
   1134 static uint32_t U_EXPORT2
   1135 uprv_uca_getImplicitPrimary(UChar32 cp) {
   1136    //fprintf(stdout, "Incoming: %04x\n", cp);
   1137     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
   1138 
   1139     cp = swapCJK(cp);
   1140     cp++;
   1141     // we now have a range of numbers from 0 to 21FFFF.
   1142 
   1143     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
   1144     //fprintf(stdout, "CJK swapped: %04x\n", cp);
   1145 
   1146     return uprv_uca_getImplicitFromRaw(cp);
   1147 }
   1148 
   1149 /**
   1150  * Converts implicit CE into raw integer ("code point")
   1151  * @param implicit
   1152  * @return -1 if illegal format
   1153  */
   1154 U_CAPI UChar32 U_EXPORT2
   1155 uprv_uca_getRawFromImplicit(uint32_t implicit) {
   1156     UChar32 result;
   1157     UChar32 b3 = implicit & 0xFF;
   1158     UChar32 b2 = (implicit >> 8) & 0xFF;
   1159     UChar32 b1 = (implicit >> 16) & 0xFF;
   1160     UChar32 b0 = (implicit >> 24) & 0xFF;
   1161 
   1162     // simple parameter checks
   1163     if (b0 < min3Primary || b0 > max4Primary
   1164         || b1 < minTrail || b1 > maxTrail)
   1165         return -1;
   1166     // normal offsets
   1167     b1 -= minTrail;
   1168 
   1169     // take care of the final values, and compose
   1170     if (b0 < min4Primary) {
   1171         if (b2 < minTrail || b2 > max3Trail || b3 != 0)
   1172             return -1;
   1173         b2 -= minTrail;
   1174         UChar32 remainder = b2 % final3Multiplier;
   1175         if (remainder != 0)
   1176             return -1;
   1177         b0 -= min3Primary;
   1178         b2 /= final3Multiplier;
   1179         result = ((b0 * medialCount) + b1) * final3Count + b2;
   1180     } else {
   1181         if (b2 < minTrail || b2 > maxTrail
   1182             || b3 < minTrail || b3 > max4Trail)
   1183             return -1;
   1184         b2 -= minTrail;
   1185         b3 -= minTrail;
   1186         UChar32 remainder = b3 % final4Multiplier;
   1187         if (remainder != 0)
   1188             return -1;
   1189         b3 /= final4Multiplier;
   1190         b0 -= min4Primary;
   1191         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
   1192     }
   1193     // final check
   1194     if (result < 0 || result > UCOL_MAX_INPUT)
   1195         return -1;
   1196     return result;
   1197 }
   1198 
   1199 
   1200 static inline int32_t divideAndRoundUp(int a, int b) {
   1201     return 1 + (a-1)/b;
   1202 }
   1203 
   1204 /* this function is either called from initUCA or from genUCA before
   1205  * doing canonical closure for the UCA.
   1206  */
   1207 
   1208 /**
   1209  * Set up to generate implicits.
   1210  * Maintenance Note:  this function may end up being called more than once, due
   1211  *                    to threading races during initialization.  Make sure that
   1212  *                    none of the Constants is ever transiently assigned an
   1213  *                    incorrect value.
   1214  * @param minPrimary
   1215  * @param maxPrimary
   1216  * @param minTrail final byte
   1217  * @param maxTrail final byte
   1218  * @param gap3 the gap we leave for tailoring for 3-byte forms
   1219  * @param gap4 the gap we leave for tailoring for 4-byte forms
   1220  */
   1221 static void initImplicitConstants(int minPrimary, int maxPrimary,
   1222                                     int minTrailIn, int maxTrailIn,
   1223                                     int gap3, int primaries3count,
   1224                                     UErrorCode *status) {
   1225     // some simple parameter checks
   1226     if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
   1227         || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
   1228         || (primaries3count < 1))
   1229     {
   1230         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1231         return;
   1232     };
   1233 
   1234     minTrail = minTrailIn;
   1235     maxTrail = maxTrailIn;
   1236 
   1237     min3Primary = minPrimary;
   1238     max4Primary = maxPrimary;
   1239     // compute constants for use later.
   1240     // number of values we can use in trailing bytes
   1241     // leave room for empty values between AND above, e.g. if gap = 2
   1242     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
   1243     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
   1244     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
   1245     final3Multiplier = gap3 + 1;
   1246     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
   1247     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
   1248 
   1249     // medials can use full range
   1250     medialCount = (maxTrail - minTrail + 1);
   1251     // find out how many values fit in each form
   1252     int32_t threeByteCount = medialCount * final3Count;
   1253     // now determine where the 3/4 boundary is.
   1254     // we use 3 bytes below the boundary, and 4 above
   1255     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
   1256     int32_t primaries4count = primariesAvailable - primaries3count;
   1257 
   1258 
   1259     int32_t min3ByteCoverage = primaries3count * threeByteCount;
   1260     min4Primary = minPrimary + primaries3count;
   1261     min4Boundary = min3ByteCoverage;
   1262     // Now expand out the multiplier for the 4 bytes, and redo.
   1263 
   1264     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
   1265     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
   1266     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
   1267     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
   1268     if (gap4 < 1) {
   1269         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1270         return;
   1271     }
   1272     final4Multiplier = gap4 + 1;
   1273     final4Count = neededPerFinalByte;
   1274     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
   1275 }
   1276 
   1277     /**
   1278      * Supply parameters for generating implicit CEs
   1279      */
   1280 U_CAPI void U_EXPORT2
   1281 uprv_uca_initImplicitConstants(UErrorCode *status) {
   1282     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
   1283     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
   1284     initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
   1285 }
   1286 
   1287 
   1288 /*    collIterNormalize     Incremental Normalization happens here.                       */
   1289 /*                          pick up the range of chars identifed by FCD,                  */
   1290 /*                          normalize it into the collIterate's writable buffer,          */
   1291 /*                          switch the collIterate's state to use the writable buffer.    */
   1292 /*                                                                                        */
   1293 static
   1294 void collIterNormalize(collIterate *collationSource)
   1295 {
   1296     UErrorCode  status = U_ZERO_ERROR;
   1297     const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
   1298     const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
   1299 
   1300     collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
   1301                                     collationSource->writableBuffer,
   1302                                     status);
   1303     if (U_FAILURE(status)) {
   1304 #ifdef UCOL_DEBUG
   1305         fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
   1306 #endif
   1307         return;
   1308     }
   1309 
   1310     collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
   1311     collationSource->origFlags  = collationSource->flags;
   1312     collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1313     collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1314 }
   1315 
   1316 
   1317 // This function takes the iterator and extracts normalized stuff up to the next boundary
   1318 // It is similar in the end results to the collIterNormalize, but for the cases when we
   1319 // use an iterator
   1320 /*static
   1321 inline void normalizeIterator(collIterate *collationSource) {
   1322   UErrorCode status = U_ZERO_ERROR;
   1323   UBool wasNormalized = FALSE;
   1324   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
   1325   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
   1326   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1327     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1328   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
   1329     // reallocate and terminate
   1330     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
   1331                                &collationSource->writableBuffer,
   1332                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
   1333                                0)
   1334     ) {
   1335     #ifdef UCOL_DEBUG
   1336         fprintf(stderr, "normalizeIterator(), out of memory\n");
   1337     #endif
   1338         return;
   1339     }
   1340     status = U_ZERO_ERROR;
   1341     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
   1342     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
   1343     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1344     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1345   }
   1346   // Terminate the buffer - we already checked that it is big enough
   1347   collationSource->writableBuffer[normLen] = 0;
   1348   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
   1349       collationSource->flags |= UCOL_ITER_ALLOCATED;
   1350   }
   1351   collationSource->pos        = collationSource->writableBuffer;
   1352   collationSource->origFlags  = collationSource->flags;
   1353   collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1354   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1355 }*/
   1356 
   1357 
   1358 /* Incremental FCD check and normalize                                                    */
   1359 /*   Called from getNextCE when normalization state is suspect.                           */
   1360 /*   When entering, the state is known to be this:                                        */
   1361 /*      o   We are working in the main buffer of the collIterate, not the side            */
   1362 /*          writable buffer.  When in the side buffer, normalization mode is always off,  */
   1363 /*          so we won't get here.                                                         */
   1364 /*      o   The leading combining class from the current character is 0 or                */
   1365 /*          the trailing combining class of the previous char was zero.                   */
   1366 /*          True because the previous call to this function will have always exited       */
   1367 /*          that way, and we get called for every char where cc might be non-zero.        */
   1368 static
   1369 inline UBool collIterFCD(collIterate *collationSource) {
   1370     const UChar *srcP, *endP;
   1371     uint8_t     leadingCC;
   1372     uint8_t     prevTrailingCC = 0;
   1373     uint16_t    fcd;
   1374     UBool       needNormalize = FALSE;
   1375 
   1376     srcP = collationSource->pos-1;
   1377 
   1378     if (collationSource->flags & UCOL_ITER_HASLEN) {
   1379         endP = collationSource->endp;
   1380     } else {
   1381         endP = NULL;
   1382     }
   1383 
   1384     // Get the trailing combining class of the current character.  If it's zero,
   1385     //   we are OK.
   1386     /* trie access */
   1387     fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
   1388     if (fcd != 0) {
   1389         prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1390 
   1391         if (prevTrailingCC != 0) {
   1392             // The current char has a non-zero trailing CC.  Scan forward until we find
   1393             //   a char with a leading cc of zero.
   1394             while (endP == NULL || srcP != endP)
   1395             {
   1396                 const UChar *savedSrcP = srcP;
   1397 
   1398                 /* trie access */
   1399                 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
   1400                 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1401                 if (leadingCC == 0) {
   1402                     srcP = savedSrcP;      // Hit char that is not part of combining sequence.
   1403                                            //   back up over it.  (Could be surrogate pair!)
   1404                     break;
   1405                 }
   1406 
   1407                 if (leadingCC < prevTrailingCC) {
   1408                     needNormalize = TRUE;
   1409                 }
   1410 
   1411                 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1412             }
   1413         }
   1414     }
   1415 
   1416     collationSource->fcdPosition = (UChar *)srcP;
   1417 
   1418     return needNormalize;
   1419 }
   1420 
   1421 /****************************************************************************/
   1422 /* Following are the CE retrieval functions                                 */
   1423 /*                                                                          */
   1424 /****************************************************************************/
   1425 
/* Forward declarations of two file-local helpers; both take a code point and */
/* the collIterate being processed and return a uint32_t.                     */
/* NOTE(review): presumably the implicit-CE lookups for forward and backward  */
/* iteration - confirm against the definitions.                               */
static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
   1428 
   1429 /* there should be a macro version of this function in the header file */
   1430 /* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
   1433 static
   1434 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1435     uint32_t order = 0;
   1436     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
   1437         order = *(collationSource->toReturn++);                         /* if so, return them */
   1438         if(collationSource->CEpos == collationSource->toReturn) {
   1439             collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
   1440         }
   1441         return order;
   1442     }
   1443 
   1444     UChar ch = 0;
   1445     collationSource->offsetReturn = NULL;
   1446 
   1447     do {
   1448         for (;;)                           /* Loop handles case when incremental normalize switches   */
   1449         {                                  /*   to or from the side buffer / original string, and we  */
   1450             /*   need to start again to get the next character.        */
   1451 
   1452             if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
   1453             {
   1454                 // The source string is null terminated and we're not working from the side buffer,
   1455                 //   and we're not normalizing.  This is the fast path.
   1456                 //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
   1457                 ch = *collationSource->pos++;
   1458                 if (ch != 0) {
   1459                     break;
   1460                 }
   1461                 else {
   1462                     return UCOL_NO_MORE_CES;
   1463                 }
   1464             }
   1465 
   1466             if (collationSource->flags & UCOL_ITER_HASLEN) {
   1467                 // Normal path for strings when length is specified.
   1468                 //   (We can't be in side buffer because it is always null terminated.)
   1469                 if (collationSource->pos >= collationSource->endp) {
   1470                     // Ran off of the end of the main source string.  We're done.
   1471                     return UCOL_NO_MORE_CES;
   1472                 }
   1473                 ch = *collationSource->pos++;
   1474             }
   1475             else if(collationSource->flags & UCOL_USE_ITERATOR) {
   1476                 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
   1477                 if(iterCh == U_SENTINEL) {
   1478                     return UCOL_NO_MORE_CES;
   1479                 }
   1480                 ch = (UChar)iterCh;
   1481             }
   1482             else
   1483             {
   1484                 // Null terminated string.
   1485                 ch = *collationSource->pos++;
   1486                 if (ch == 0) {
   1487                     // Ran off end of buffer.
   1488                     if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1489                         // Ran off end of main string. backing up one character.
   1490                         collationSource->pos--;
   1491                         return UCOL_NO_MORE_CES;
   1492                     }
   1493                     else
   1494                     {
   1495                         // Hit null in the normalize side buffer.
   1496                         // Usually this means the end of the normalized data,
   1497                         // except for one odd case: a null followed by combining chars,
   1498                         //   which is the case if we are at the start of the buffer.
   1499                         if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
   1500                             break;
   1501                         }
   1502 
   1503                         //  Null marked end of side buffer.
   1504                         //   Revert to the main string and
   1505                         //   loop back to top to try again to get a character.
   1506                         collationSource->pos   = collationSource->fcdPosition;
   1507                         collationSource->flags = collationSource->origFlags;
   1508                         continue;
   1509                     }
   1510                 }
   1511             }
   1512 
   1513             if(collationSource->flags&UCOL_HIRAGANA_Q) {
   1514                 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
   1515                  * based on whether the previous codepoint was Hiragana or Katakana.
   1516                  */
   1517                 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
   1518                         ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
   1519                     collationSource->flags |= UCOL_WAS_HIRAGANA;
   1520                 } else {
   1521                     collationSource->flags &= ~UCOL_WAS_HIRAGANA;
   1522                 }
   1523             }
   1524 
   1525             // We've got a character.  See if there's any fcd and/or normalization stuff to do.
   1526             //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
   1527             if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
   1528                 break;
   1529             }
   1530 
   1531             if (collationSource->fcdPosition >= collationSource->pos) {
   1532                 // An earlier FCD check has already covered the current character.
   1533                 // We can go ahead and process this char.
   1534                 break;
   1535             }
   1536 
   1537             if (ch < ZERO_CC_LIMIT_ ) {
   1538                 // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
   1539                 break;
   1540             }
   1541 
   1542             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1543                 // We need to peek at the next character in order to tell if we are FCD
   1544                 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
   1545                     // We are at the last char of source string.
   1546                     //  It is always OK for FCD check.
   1547                     break;
   1548                 }
   1549 
   1550                 // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
   1551                 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1552                     break;
   1553                 }
   1554             }
   1555 
   1556 
   1557             // Need a more complete FCD check and possible normalization.
   1558             if (collIterFCD(collationSource)) {
   1559                 collIterNormalize(collationSource);
   1560             }
   1561             if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1562                 //  No normalization was needed.  Go ahead and process the char we already had.
   1563                 break;
   1564             }
   1565 
   1566             // Some normalization happened.  Next loop iteration will pick up a char
   1567             //   from the normalization buffer.
   1568 
   1569         }   // end for (;;)
   1570 
   1571 
   1572         if (ch <= 0xFF) {
   1573             /*  For latin-1 characters we never need to fall back to the UCA table        */
   1574             /*    because all of the UCA data is replicated in the latinOneMapping array  */
   1575             order = coll->latinOneMapping[ch];
   1576             if (order > UCOL_NOT_FOUND) {
   1577                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
   1578             }
   1579         }
   1580         else
   1581         {
   1582             // Always use UCA for Han, Hangul
   1583             // (Han extension A is before main Han block)
   1584             // **** Han compatibility chars ?? ****
   1585             if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   1586                 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
   1587                 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
   1588                     // between the two target ranges; do normal lookup
   1589                     // **** this range is YI, Modifier tone letters, ****
   1590                     // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
   1591                     // **** Latin-D might be tailored, so we need to ****
   1592                     // **** do the normal lookup for these guys.     ****
   1593                     order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1594                 } else {
   1595                     // in one of the target ranges; use UCA
   1596                     order = UCOL_NOT_FOUND;
   1597                 }
   1598             } else {
   1599                 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1600             }
   1601 
   1602             if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
   1603                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
   1604             }
   1605 
   1606             if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
   1607                 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
   1608                 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   1609 
   1610                 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
   1611                     order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
   1612                 }
   1613             }
   1614         }
   1615     } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
   1616 
   1617     if(order == UCOL_NOT_FOUND) {
   1618         order = getImplicit(ch, collationSource);
   1619     }
   1620     return order; /* return the CE */
   1621 }
   1622 
   1623 /* Out-of-line wrapper around the inline ucol_IGetNextCE, for use from other files. */
   1624 U_CAPI uint32_t U_EXPORT2 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1625     const uint32_t nextCE = ucol_IGetNextCE(coll, collationSource, status);
   1626     return nextCE;
   1627 }
   1628 
   1629 
   1630 /**
   1631 * Incremental previous normalization happens here. Pick up the range of chars
   1632 * identified by FCD, normalize it into the collIterate's writable buffer,
   1633 * switch the collIterate's state to use the writable buffer.
        * Also records source offsets for the normalized code units via appendOffset.
        * If normalization reports a failure, the function returns before touching
        * pos, flags or the offset buffer.
   1634 * @param data collation iterator data
   1635 */
   1636 static
   1637 void collPrevIterNormalize(collIterate *data)
   1638 {
   1639     UErrorCode status  = U_ZERO_ERROR;
   1640     const UChar *pEnd   = data->pos;  /* End normalize + 1 */
   1641     const UChar *pStart;
   1642 
   1643     /* Start normalize */
   1644     if (data->fcdPosition == NULL) {
   1645         pStart = data->string;
   1646     }
   1647     else {
   1648         pStart = data->fcdPosition + 1;
   1649     }
   1650 
            /* The length passed below is (pEnd - pStart) + 1, i.e. the code unit at pEnd is included. */
   1651     int32_t normLen =
   1652         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
   1653                              data->writableBuffer,
   1654                              status).
   1655         length();
   1656     if(U_FAILURE(status)) {
   1657         return;
   1658     }
   1659     /*
   1660     this puts the null termination in front of the normalized string instead
   1661     of the end
   1662     */
   1663     data->writableBuffer.insert(0, (UChar)0);
   1664 
   1665     /*
   1666      * The usual case at this point is that we've got a base
   1667      * character followed by marks that were normalized. If
   1668      * fcdPosition is NULL, that means that we backed up to
   1669      * the beginning of the string and there's no base character.
   1670      *
   1671      * Forward processing will usually normalize when it sees
   1672      * the first mark, so that mark will get its natural offset
   1673      * and the rest will get the offset of the character following
   1674      * the marks. The base character will also get its natural offset.
   1675      *
   1676      * We write the offset of the base character, if there is one,
   1677      * followed by the offset of the first mark and then the offsets
   1678      * of the rest of the marks.
   1679      */
   1680     int32_t firstMarkOffset = 0;
   1681     int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
   1682     int32_t trailCount      = normLen - 1;
   1683 
   1684     if (data->fcdPosition != NULL) {
   1685         int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
   1686         UChar   baseChar   = *data->fcdPosition;
   1687 
   1688         firstMarkOffset = baseOffset + 1;
   1689 
   1690         /*
   1691          * If the base character is the start of a contraction, forward processing
   1692          * will normalize the marks while checking for the contraction, which means
   1693          * that the offset of the first mark will be the same as the other marks.
   1694          *
   1695          * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
   1696          */
   1697         if (baseChar >= 0x100) {
   1698             uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
   1699 
   1700             if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
   1701                 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
   1702             }
   1703 
   1704             if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
   1705                 firstMarkOffset = trailOffset;
   1706             }
   1707         }
   1708 
   1709         data->appendOffset(baseOffset, status);
   1710     }
   1711 
   1712     data->appendOffset(firstMarkOffset, status);
   1713 
   1714     for (int32_t i = 0; i < trailCount; i += 1) {
   1715         data->appendOffset(trailOffset, status);
   1716     }
   1717 
   1718     data->offsetRepeatValue = trailOffset;
   1719 
            /* Backward iteration consumes the stored offsets from the last one written. */
   1720     data->offsetReturn = data->offsetStore - 1;
   1721     if (data->offsetReturn == data->offsetBuffer) {
   1722         data->offsetStore = data->offsetBuffer;
   1723     }
   1724 
            /* Continue from the end of the side buffer: skip the leading NUL (+1) and the normLen normalized units. */
            /* The previous flags are saved in origFlags before the side-buffer flags are applied. */
   1725     data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
   1726     data->origFlags  = data->flags;
   1727     data->flags     |= UCOL_ITER_INNORMBUF;
   1728     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   1729 }
   1730 
   1731 
   1732 /**
   1733 * Incremental FCD check for previous iteration and normalize. Called from
   1734 * getPrevCE when normalization state is suspect.
   1735 * When entering, the state is known to be this:
   1736 * o  We are working in the main buffer of the collIterate, not the side
   1737 *    writable buffer. When in the side buffer, normalization mode is always
   1738 *    off, so we won't get here.
   1739 * o  The leading combining class from the current character is 0 or the
   1740 *    trailing combining class of the previous char was zero.
   1741 *    True because the previous call to this function will have always exited
   1742 *    that way, and we get called for every char where cc might be non-zero.
        * On return data->fcdPosition is set: to NULL when the backward scan reached
        * the start of the string, otherwise to the position where the scan stopped.
   1743 * @param data collation iterate struct
   1744 * @return normalization status, TRUE for normalization to be done, FALSE
   1745 *         otherwise
   1746 */
   1747 static
   1748 inline UBool collPrevIterFCD(collIterate *data)
   1749 {
   1750     const UChar *src, *start;
   1751     uint8_t     leadingCC;
   1752     uint8_t     trailingCC = 0;
   1753     uint16_t    fcd;
   1754     UBool       result = FALSE;
   1755 
   1756     start = data->string;
   1757     src = data->pos + 1;
   1758 
   1759     /* Get the FCD value of the current character; only its leading combining class is used here. */
            /* NOTE(review): unorm_prevFCD16 appears to move src backward past what it reads */
            /* (the loop below depends on src changing) - confirm against its definition.    */
   1760     fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
   1761 
            /* High byte of the FCD value: leading combining class. */
   1762     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1763 
   1764     if (leadingCC != 0) {
   1765         /*
   1766         The current char has a non-zero leading combining class.
   1767         Scan backward until we find a char with a trailing cc of zero.
   1768         */
   1769         for (;;)
   1770         {
   1771             if (start == src) {
                        /* Reached the start of the string without finding a trailing cc of zero. */
   1772                 data->fcdPosition = NULL;
   1773                 return result;
   1774             }
   1775 
   1776             fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
   1777 
                    /* Low byte of the FCD value: trailing combining class. */
   1778             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1779 
   1780             if (trailingCC == 0) {
   1781                 break;
   1782             }
   1783 
                    /* The following char's leading cc is lower than this char's trailing cc: normalization is needed. */
   1784             if (leadingCC < trailingCC) {
   1785                 result = TRUE;
   1786             }
   1787 
   1788             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1789         }
   1790     }
   1791 
   1792     data->fcdPosition = (UChar *)src;
   1793 
   1794     return result;
   1795 }
   1796 
   1797 /** Peeks at the code unit `offset` units away from the current position.
   1798  *  Supports both pointer-based text and iterator-based text.
   1799  *  Does no bounds checking; yields U+FFFD when nothing can be read.
   1800  */
   1801 static inline UChar
   1802 peekCodeUnit(collIterate *source, int32_t offset) {
   1803     if(source->pos != NULL) {
   1804         return source->pos[offset];
   1805     }
   1806     if(source->iterator == NULL) {
   1807         return 0xfffd;
   1808     }
   1809     UChar32 peeked;
   1810     if(offset == 0) {
   1811         peeked = source->iterator->current(source->iterator);
   1812     } else {
   1813         source->iterator->move(source->iterator, offset, UITER_CURRENT);
   1814         peeked = source->iterator->next(source->iterator);
   1815         source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
   1816     }
   1817     return peeked < 0 ? 0xfffd : (UChar)peeked;  // A well-behaved caller never makes us see a negative value.
   1818 }
   1819 
   1820 // Code-point flavour of peekCodeUnit: the offset is a delta in _code points_, not code units.
   1821 // U16_FWD_1_UNSAFE and similar are unusable here because the text might not be well-formed UTF-16.
   1822 // U16_FWD_1 and similar are unusable here because the start and limit of the buffer are unknown.
   1823 static inline UChar32
   1824 peekCodePoint(collIterate *source, int32_t offset) {
   1825     if(source->pos != NULL) {
   1826         const UChar *p = source->pos;
   1827         UChar32 c;
   1828         if(offset >= 0) {
   1829             // Step forward across `offset` code points; a lead+trail pair counts as one.
   1830             for(int32_t remaining = offset; remaining > 0; --remaining) {
   1831                 const UChar unit = *p++;
   1832                 if(U16_IS_LEAD(unit) && U16_IS_TRAIL(*p)) {
   1833                     ++p;
   1834                 }
   1835             }
   1836             // Decode the code point that starts at p.
   1837             c = *p;
   1838             if(U16_IS_LEAD(c) && U16_IS_TRAIL(p[1])) {
   1839                 c = U16_GET_SUPPLEMENTARY(c, p[1]);
   1840             }
   1841         } else /* offset<0 */ {
   1842             // Step backward across all but the last of the requested code points.
   1843             for(int32_t remaining = offset + 1; remaining < 0; ++remaining) {
   1844                 --p;
   1845                 if(U16_IS_TRAIL(*p) && U16_IS_LEAD(p[-1])) {
   1846                     --p;
   1847                 }
   1848             }
   1849             // Decode the code point that ends just before p.
   1850             --p;
   1851             c = *p;
   1852             if(U16_IS_TRAIL(c) && U16_IS_LEAD(p[-1])) {
   1853                 c = U16_GET_SUPPLEMENTARY(p[-1], c);
   1854             }
   1855         }
   1856         return c;
   1857     }
   1858 
   1859     // No raw pointer: use the character iterator, if there is one.
   1860     if(source->iterator == NULL) {
   1861         return U_SENTINEL;
   1862     }
   1863     UChar32 c = U_SENTINEL;
   1864     if(offset >= 0) {
   1865         // Advance `offset` code points, read the current one, then walk back the same distance.
   1866         for(int32_t i = 0; i < offset; ++i) {
   1867             uiter_next32(source->iterator);
   1868         }
   1869         c = uiter_current32(source->iterator);
   1870         for(int32_t i = 0; i < offset; ++i) {
   1871             uiter_previous32(source->iterator);
   1872         }
   1873     } else /* offset<0 */ {
   1874         // Read backward -offset code points keeping only the last one, then walk forward again.
   1875         for(int32_t i = offset; i < 0; ++i) {
   1876             c = uiter_previous32(source->iterator);
   1877         }
   1878         for(int32_t i = offset; i < 0; ++i) {
   1879             uiter_next32(source->iterator);
   1880         }
   1881     }
   1882     return c;
   1883 }
   1884 
   1885 /**
   1886 * Tells whether the backwards collation iterator has reached the start of its data.
   1887 * @param data collation iterator
   1888 * @return TRUE if we are at the start
   1889 */
   1890 static inline UBool isAtStartPrevIterate(collIterate *data) {
   1891     if(data->pos == NULL && data->iterator != NULL) {
   1892         return !data->iterator->hasPrevious(data->iterator);   /* iterator-driven text */
   1893     }
   1894     if(data->pos == data->string) {
   1895         return TRUE;                                           /* at the start of the main string */
   1896     }
   1897     /* In the side buffer, a preceding NUL together with a NULL fcdPosition also counts as the start. */
   1898     const UBool inSideBuffer = (data->flags & UCOL_ITER_INNORMBUF) != 0;
   1899     return inSideBuffer && data->pos[-1] == 0 && data->fcdPosition == NULL;
   1900 }
   1901 
   1902 /*
   1903  * Steps the collation iterator back by one code unit.
   1904  * The character iterator is moved only when UCOL_USE_ITERATOR is set and an
   1905  * iterator is present; the raw position pointer is moved whenever it is non-NULL.
   1906  * A variant that moved the iterator unconditionally (on the theory that the
   1907  * iterator has to be kept synced up at all times) used to sit here, compiled
   1908  * out; it is not reproduced.
   1909  */
   1910 static inline void
   1911 goBackOne(collIterate *data) {
   1912     const UBool usesIterator = (data->flags & UCOL_USE_ITERATOR) != 0;
   1913     if(usesIterator && data->iterator != NULL) {
   1914         data->iterator->previous(data->iterator);
   1915     }
   1916 
   1917     if(data->pos != NULL) {
   1918         --data->pos;
   1919     }
   1920 }
   1921 
   1922 /**
   1923 * Inline function that gets a simple CE.
   1924 * So what it does is that it will first check the expansion buffer. If the
   1925 * expansion buffer is not empty, ie the end pointer to the expansion buffer
   1926 * is different from the string pointer, we return the collation element at the
   1927 * return pointer and decrement it.
   1928 * For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE.
   1929 * @param coll collator data
   1930 * @param data collation iterator struct
   1931 * @param status error status
        * @return the previous CE, or UCOL_NO_MORE_CES when the start of the text
        *         has been reached
   1932 */
   1933 static
   1934 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
   1935                                UErrorCode *status)
   1936 {
   1937     uint32_t result = (uint32_t)UCOL_NULLORDER;
   1938 
            /* Step the stored-offset cursor back: use up a pending repeat first, otherwise */
            /* move offsetReturn down, clearing it once the start of offsetBuffer is reached. */
   1939     if (data->offsetReturn != NULL) {
   1940         if (data->offsetRepeatCount > 0) {
   1941                 data->offsetRepeatCount -= 1;
   1942         } else {
   1943             if (data->offsetReturn == data->offsetBuffer) {
   1944                 data->offsetReturn = NULL;
   1945                 data->offsetStore  = data->offsetBuffer;
   1946             } else {
   1947                 data->offsetReturn -= 1;
   1948             }
   1949         }
   1950     }
   1951 
            /* CEs are still buffered (in extendCEs if present, otherwise in CEs): hand out the last one. */
   1952     if ((data->extendCEs && data->toReturn > data->extendCEs) ||
   1953             (!data->extendCEs && data->toReturn > data->CEs))
   1954     {
   1955         data->toReturn -= 1;
   1956         result = *(data->toReturn);
   1957         if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
   1958             data->CEpos = data->toReturn;
   1959         }
   1960     }
   1961     else {
   1962         UChar ch = 0;
   1963 
                /* Outer loop: repeated while a code unit in the Hangul range maps to an ignorable CE (see the while condition). */
   1964         do {
   1965             /*
   1966             Loop handles case when incremental normalize switches to or from the
   1967             side buffer / original string, and we need to start again to get the
   1968             next character.
   1969             */
   1970             for (;;) {
   1971                 if (data->flags & UCOL_ITER_HASLEN) {
   1972                     /*
   1973                     Normal path for strings when length is specified.
   1974                     Not in side buffer because it is always null terminated.
   1975                     */
   1976                     if (data->pos <= data->string) {
   1977                         /* End of the main source string */
   1978                         return UCOL_NO_MORE_CES;
   1979                     }
   1980                     data->pos --;
   1981                     ch = *data->pos;
   1982                 }
   1983                 // we are using an iterator to go back. Pray for us!
   1984                 else if (data->flags & UCOL_USE_ITERATOR) {
   1985                   UChar32 iterCh = data->iterator->previous(data->iterator);
   1986                   if(iterCh == U_SENTINEL) {
   1987                     return UCOL_NO_MORE_CES;
   1988                   } else {
   1989                     ch = (UChar)iterCh;
   1990                   }
   1991                 }
   1992                 else {
   1993                     data->pos --;
   1994                     ch = *data->pos;
   1995                     /* we are in the side buffer. */
   1996                     if (ch == 0) {
   1997                         /*
   1998                         At the start of the normalize side buffer.
   1999                         Go back to string.
   2000                         Because pointer points to the last accessed character,
   2001                         hence we have to increment it by one here.
   2002                         */
   2003                         data->flags = data->origFlags;
   2004                         data->offsetRepeatValue = 0;
   2005 
   2006                          if (data->fcdPosition == NULL) {
   2007                             data->pos = data->string;
   2008                             return UCOL_NO_MORE_CES;
   2009                         }
   2010                         else {
   2011                             data->pos   = data->fcdPosition + 1;
   2012                         }
   2013 
   2014                        continue;
   2015                     }
   2016                 }
   2017 
                        /* Unlike the forward path, the whole range 0x3040..0x309f sets the flag here. */
   2018                 if(data->flags&UCOL_HIRAGANA_Q) {
   2019                   if(ch>=0x3040 && ch<=0x309f) {
   2020                     data->flags |= UCOL_WAS_HIRAGANA;
   2021                   } else {
   2022                     data->flags &= ~UCOL_WAS_HIRAGANA;
   2023                   }
   2024                 }
   2025 
   2026                 /*
   2027                 * got a character to determine if there's fcd and/or normalization
   2028                 * stuff to do.
   2029                 * if the current character is not fcd.
   2030                 * if current character is at the start of the string
   2031                 * Trailing combining class == 0.
   2032                 * Note if pos is in the writablebuffer, norm is always 0
   2033                 */
   2034                 if (ch < ZERO_CC_LIMIT_ ||
   2035                   // this should propel us out of the loop in the iterator case
   2036                     (data->flags & UCOL_ITER_NORM) == 0 ||
   2037                     (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
   2038                     || data->string == data->pos) {
   2039                     break;
   2040                 }
   2041 
   2042                 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   2043                     /* if next character is FCD */
   2044                     if (data->pos == data->string) {
   2045                         /* First char of string is always OK for FCD check */
   2046                         break;
   2047                     }
   2048 
   2049                     /* Not first char of string, do the FCD fast test */
   2050                     if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
   2051                         break;
   2052                     }
   2053                 }
   2054 
   2055                 /* Need a more complete FCD check and possible normalization. */
   2056                 if (collPrevIterFCD(data)) {
   2057                     collPrevIterNormalize(data);
   2058                 }
   2059 
   2060                 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
   2061                     /*  No normalization. Go ahead and process the char. */
   2062                     break;
   2063                 }
   2064 
   2065                 /*
   2066                 Some normalization happened.
   2067                 Next loop picks up a char from the normalization buffer.
   2068                 */
   2069             }
   2070 
   2071             /* attempt to handle contractions, after removal of the backwards
   2072             contraction
   2073             */
   2074             if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
   2075                 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
   2076             } else {
   2077                 if (ch <= 0xFF) {
   2078                     result = coll->latinOneMapping[ch];
   2079                 }
   2080                 else {
   2081                     // Always use UCA for [3400..9FFF], [AC00..D7AF]
   2082                     // **** [FA0E..FA2F] ?? ****
   2083                     if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   2084                         (ch >= 0x3400 && ch <= 0xD7AF)) {
   2085                         if (ch > 0x9FFF && ch < 0xAC00) {
   2086                             // between the two target ranges; do normal lookup
   2087                             // **** this range is YI, Modifier tone letters, ****
   2088                             // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
   2089                             // **** Latin-D might be tailored, so we need to ****
   2090                             // **** do the normal lookup for these guys.     ****
   2091                              result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   2092                         } else {
   2093                             result = UCOL_NOT_FOUND;
   2094                         }
   2095                     } else {
   2096                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   2097                     }
   2098                 }
                        /* Values above UCOL_NOT_FOUND are special CEs and are resolved by ucol_prv_getSpecialPrevCE. */
   2099                 if (result > UCOL_NOT_FOUND) {
   2100                     result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
   2101                 }
   2102                 if (result == UCOL_NOT_FOUND) { // Not found in master list
   2103                     if (!isAtStartPrevIterate(data) &&
   2104                         ucol_contractionEndCP(ch, data->coll))
   2105                     {
   2106                         result = UCOL_CONTRACTION;
   2107                     } else {
   2108                         if(coll->UCA) {
   2109                             result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   2110                         }
   2111                     }
   2112 
   2113                     if (result > UCOL_NOT_FOUND) {
   2114                         if(coll->UCA) {
   2115                             result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
   2116                         }
   2117                     }
   2118                 }
   2119             }
   2120         } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
   2121 
                /* Neither the tailoring nor the UCA produced a CE: fall back to getPrevImplicit. */
   2122         if(result == UCOL_NOT_FOUND) {
   2123             result = getPrevImplicit(ch, data);
   2124         }
   2125     }
   2126 
   2127     return result;
   2128 }
   2129 
   2130 
   2131 /* Out-of-line wrapper around the inline ucol_IGetPrevCE, for use from other files. */
   2132 U_CFUNC uint32_t U_EXPORT2 ucol_getPrevCE(const UCollator *coll,
   2133                                           collIterate *data, UErrorCode *status) {
   2134     const uint32_t prevCE = ucol_IGetPrevCE(coll, data, status);
   2135     return prevCE;
   2136 }
   2137 
   2138 
   2139 /* Fetches the first CE for a one-code-unit text consisting of u; yields 0 when setting up
   2140  * the iterator reports a failure.  (This should be connected to special Jamo handling.) */
   2141 U_CFUNC uint32_t U_EXPORT2
   2142 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
   2143     collIterate singleUnitIter;
   2144     IInit_collIterate(coll, &u, 1, &singleUnitIter, status);
   2145     const UBool initFailed = U_FAILURE(*status);
   2146     /* Only run the iterator when it was set up without error. */
   2147     return initFailed ? 0 : ucol_IGetNextCE(coll, &singleUnitIter, status);
   2148 }
   2149 
   2150 /**
   2151 * Inserts the argument character into the end of the buffer pushing back the
   2152 * null terminator.
   2153 * @param data collIterate struct data
   2154 * @param ch character to be appended
   2155 * @return the position of the new addition, or NULL when no terminated buffer
        *         could be obtained after the append
   2156 */
   2157 static
   2158 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
   2159 {
   2160     int32_t oldLength = data->writableBuffer.length();
            const UChar *terminated = data->writableBuffer.append(ch).getTerminatedBuffer();
            // Never offset a missing buffer: NULL + oldLength would look like a valid
            // position and defeat the NULL tests performed by the callers.
            if (terminated == NULL) {
                return NULL;
            }
            return terminated + oldLength;
   2162 }
   2163 
   2164 /**
   2165 * Inserts the argument string into the end of the buffer pushing back the
   2166 * null terminator.
   2167 * @param data collIterate struct data
   2168 * @param string to be appended
   2169 * @param length of the string to be appended
   2170 * @return the position of the new addition, or NULL when no terminated buffer
        *         could be obtained after the append
   2171 */
   2172 static
   2173 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
   2174 {
   2175     int32_t oldLength = data->writableBuffer.length();
            const UChar *terminated = data->writableBuffer.append(str, length).getTerminatedBuffer();
            // Never offset a missing buffer: NULL + oldLength would look like a valid
            // position and defeat the NULL tests performed by the callers.
            if (terminated == NULL) {
                return NULL;
            }
            return terminated + oldLength;
   2177 }
   2178 
   2179 /**
        * Forward-iteration helper used while matching contractions: normalizes a
        * stretch of the source text into data->writableBuffer and moves the
        * iterator position into that buffer.
        * The stretch begins at the character just before data->pos and ends at
        * data->fcdPosition. If the iterator is not yet inside the normalization
        * buffer, the buffer is first reset to the single code unit preceding that
        * stretch; otherwise the normalized text is added behind the existing
        * buffer content.
        * On success data->pos points at the newly added text, the previous flags
        * are saved in data->origFlags, UCOL_ITER_INNORMBUF is set and
        * UCOL_ITER_NORM / UCOL_ITER_HASLEN are cleared. If the normalizer reports
        * a failure the function returns without updating pos or the flags.
        * @param data collation iterator data
        */
        static
        inline void normalizeNextContraction(collIterate *data)
        {
            /* data->pos already points to the next character, so step back one */
            const UChar *segmentStart = data->pos - 1;
            int32_t      resumeOffset;

            if ((data->flags & UCOL_ITER_INNORMBUF) != 0) {
                /* keep what is buffered and continue behind it */
                resumeOffset = data->writableBuffer.length();
            }
            else {
                /* start a fresh buffer holding only the preceding code unit */
                data->writableBuffer.setTo(*(segmentStart - 1));
                resumeOffset = 1;
            }

            UErrorCode status = U_ZERO_ERROR;
            const UnicodeString segment(FALSE, segmentStart,
                                        (int32_t)(data->fcdPosition - segmentStart));
            data->writableBuffer.append(data->nfd->normalize(segment, status));
            if (U_FAILURE(status)) {
                return;
            }

            data->pos       = data->writableBuffer.getTerminatedBuffer() + resumeOffset;
            data->origFlags = data->flags;
            data->flags    |= UCOL_ITER_INNORMBUF;
            data->flags    &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
        }
   2218 
   2219 /**
   2220 * Contraction character management function that returns the next character
   2221 * for the forwards iterator.
   2222 * Does nothing if the next character is in buffer and not the first character
   2223 * in it.
   2224 * Else it checks next character in data string to see if it is normalizable.
   2225 * If it is not, the character is simply copied into the buffer, else
   2226 * the whole normalized substring is copied into the buffer, including the
   2227 * current character.
        * If adding to the normalization buffer fails (no buffer position is
        * returned, or the buffer has become bogus), data->pos is set to NULL and
        * (UChar)-1 is returned.
   2228 * @param data collation element iterator data
   2229 * @return next character, or (UChar)-1 on a buffer failure
   2230 */
   2231 static
   2232 inline UChar getNextNormalizedChar(collIterate *data)
   2233 {
   2234     UChar  nextch;
   2235     UChar  ch;
   2236     // Here we need to add the iterator code. One problem is the way
   2237     // end of string is handled. If we just return next char, it could
   2238     // be the sentinel. Most of the cases already check for this, but we
   2239     // need to be sure.
   2240     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
   2241          /* if no normalization and not in buffer. */
   2242       if(data->flags & UCOL_USE_ITERATOR) {
   2243          return (UChar)data->iterator->next(data->iterator);
   2244       } else {
   2245          return *(data->pos ++);
   2246       }
   2247     }
   2248 
   2249     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
   2250       //normalizeIterator(data);
   2251     //}
   2252 
   2253     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
   2254     if ((innormbuf && *data->pos != 0) ||
   2255         (data->fcdPosition != NULL && !innormbuf &&
   2256         data->pos < data->fcdPosition)) {
   2257         /*
   2258         if next character is in normalized buffer, no further normalization
   2259         is required
   2260         */
   2261         return *(data->pos ++);
   2262     }
   2263 
   2264     if (data->flags & UCOL_ITER_HASLEN) {
   2265         /* in data string */
   2266         if (data->pos + 1 == data->endp) {
   2267             return *(data->pos ++);
   2268         }
   2269     }
   2270     else {
   2271         if (innormbuf) {
   2272           // inside the normalization buffer, but at the end
   2273           // (since we encountered zero). This means, in the
   2274           // case we're using char iterator, that we need to
   2275           // do another round of normalization.
   2276           //if(data->origFlags & UCOL_USE_ITERATOR) {
   2277             // we need to restore original flags,
   2278             // otherwise, we'll lose them
   2279             //data->flags = data->origFlags;
   2280             //normalizeIterator(data);
   2281             //return *(data->pos++);
   2282           //} else {
   2283             /*
   2284             in writable buffer, at this point fcdPosition can not be
   2285             pointing to the end of the data string. see contracting tag.
   2286             */
   2287           if(data->fcdPosition) {
   2288             if (*(data->fcdPosition + 1) == 0 ||
   2289                 data->fcdPosition + 1 == data->endp) {
   2290                 /* at the end of the string, dump it into the normalizer */
                        const UChar *appended = insertBufferEnd(data, *(data->fcdPosition));
                        // Validate BEFORE stepping past the appended unit: once + 1 has
                        // been applied, a failed append can no longer compare equal to NULL.
                        if (appended == NULL || data->writableBuffer.isBogus()) {
                            data->pos = NULL;
   2294                     return (UChar)-1; // Return to indicate error.
   2295                 }
                        data->pos = appended + 1;
   2296                 return *(data->fcdPosition ++);
   2297             }
   2298             data->pos = data->fcdPosition;
   2299           } else if(data->origFlags & UCOL_USE_ITERATOR) {
   2300             // if we are here, we're using a normalizing iterator.
   2301             // we should just continue further.
   2302             data->flags = data->origFlags;
   2303             data->pos = NULL;
   2304             return (UChar)data->iterator->next(data->iterator);
   2305           }
   2306           //}
   2307         }
   2308         else {
   2309             if (*(data->pos + 1) == 0) {
   2310                 return *(data->pos ++);
   2311             }
   2312         }
   2313     }
   2314 
   2315     ch = *data->pos ++;
   2316     nextch = *data->pos;
   2317 
   2318     /*
   2319     * if the current character is not fcd.
   2320     * Trailing combining class == 0.
   2321     */
   2322     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
   2323         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
   2324          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
   2325             /*
   2326             Need a more complete FCD check and possible normalization.
   2327             normalize substring will be appended to buffer
   2328             */
   2329         if (collIterFCD(data)) {
   2330             normalizeNextContraction(data);
   2331             return *(data->pos ++);
   2332         }
   2333         else if (innormbuf) {
   2334             /* fcdposition shifted even when there's no normalization, if we
   2335             don't input the rest into this, we'll get the wrong position when
   2336             we reach the end of the writableBuffer */
   2337             int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
   2338             data->pos = insertBufferEnd(data, data->pos - 1, length);
                    // A bogus buffer counts as a failure too, so the position is
                    // never dereferenced after an unsuccessful append.
                    if (data->pos == NULL || data->writableBuffer.isBogus()) {
                        data->pos = NULL;
   2341                 return (UChar)-1; // Return to indicate error.
   2342             }
   2343             return *(data->pos ++);
   2344         }
   2345     }
   2346 
   2347     if (innormbuf) {
   2348         /*
   2349         no normalization is to be done hence only one character will be
   2350         appended to the buffer.
   2351         */
                const UChar *appended = insertBufferEnd(data, ch);
                // Validate BEFORE stepping past the appended unit: once + 1 has been
                // applied, a failed append can no longer compare equal to NULL.
                if (appended == NULL || data->writableBuffer.isBogus()) {
                    data->pos = NULL;
   2355             return (UChar)-1; // Return to indicate error.
   2356         }
                data->pos = appended + 1;
   2357     }
   2358 
   2359     /* points back to the pos in string */
   2360     return ch;
   2361 }
   2362 
   2363 
   2364 
   2365 /**
        * Places the characters collected in `buffer` into source->writableBuffer so
        * that they are read next, and points source->pos at the start of that
        * buffer.
        *
        * Two situations are handled:
        *  - The iterator is still reading the source string: the current position
        *    is remembered in fcdPosition, the flags are saved in origFlags, the
        *    iterator is switched to the normalization buffer (UCOL_ITER_INNORMBUF
        *    set; UCOL_ITER_NORM, UCOL_ITER_HASLEN and UCOL_USE_ITERATOR cleared)
        *    and the buffer becomes a copy of `buffer`.
        *  - The iterator is already inside the normalization buffer: the part of
        *    the buffer in front of pos is replaced by `buffer`.
        *
        * The copies exist so that the surrounding CE-fetching code can keep reading
        * from pos without any special casing.
        * @param source data string source
        * @param buffer character buffer
        */
        static
        inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
        {
            if ((source->flags & UCOL_ITER_INNORMBUF) == 0) {
                /* reading from the string: switch over to the buffer */
                source->fcdPosition    = source->pos;
                source->origFlags      = source->flags;
                source->flags         |= UCOL_ITER_INNORMBUF;
                source->flags         &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
                source->writableBuffer = buffer;
            }
            else {
                /* already in the buffer: overwrite everything in front of pos */
                const int32_t consumed = source->pos - source->writableBuffer.getBuffer();
                source->writableBuffer.replace(0, consumed, buffer);
            }

            source->pos = source->writableBuffer.getTerminatedBuffer();
        }
   2397 
   2398 /**
   2399 * Function to get the discontiguous collation element within the source.
   2400 * Note this function will set the position to the appropriate places.
        * Characters that are read but do not extend the match are collected in a
        * local buffer; when a match is returned from inside the loop that buffer is
        * handed to setDiscontiguosAttribute(). When the loop is left without a
        * match, the iterator state saved on entry is reloaded, moved back by one
        * and the entry flags are restored.
   2401 * @param coll current collator used
   2402 * @param source data string source
   2403 * @param constart index to the start character in the contraction table
   2404 * @return a value read from coll->contractionCEs: the entry of the matched
        *         character, or the entry belonging to the contraction start that
        *         is current when the scan gives up
   2405 */
   2406 static
   2407 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
   2408                                 const UChar *constart)
   2409 {
   2410     /* source->pos currently points to the second combining character after
   2411        the start character */
   2412           const UChar *temppos      = source->pos;
   2413           UnicodeString buffer;
   2414     const UChar   *tempconstart = constart;
   2415           uint8_t  tempflags    = source->flags;
   2416           UBool    multicontraction = FALSE;
   2417           collIterateState discState;
   2418 
            /* remember where we started so that a failed scan can be undone */
   2419           backupState(source, &discState);
   2420 
            /* the skipped-character buffer starts with the result of
               peekCodePoint(source, -1) -- presumably the code point right before
               the current position (helper not visible here) */
   2421     buffer.setTo(peekCodePoint(source, -1));
   2422     for (;;) {
   2423         UChar    *UCharOffset;
   2424         UChar     schar,
   2425                   tchar;
   2426         uint32_t  result;
   2427 
                /* stop scanning at the end of the text or when the upcoming code
                   point has combining class 0 */
   2428         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
   2429             || (peekCodeUnit(source, 0) == 0  &&
   2430             //|| (*source->pos == 0  &&
   2431                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
   2432                  source->fcdPosition == NULL ||
   2433                  source->fcdPosition == source->endp ||
   2434                  *(source->fcdPosition) == 0 ||
   2435                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
   2436                  /* end of string in null terminated string or stopped by a
   2437                  null character, note fcd does not always point to a base
   2438                  character after the discontiguous change */
   2439                  u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
   2440                  //u_getCombiningClass(*(source->pos)) == 0) {
   2441             //constart = (UChar *)coll->image + getContractOffset(CE);
   2442             if (multicontraction) {
                        /* an earlier multi-contraction match was recorded: go back
                           to the position remembered for it and hand over the
                           skipped characters */
   2443                 source->pos    = temppos - 1;
   2444                 setDiscontiguosAttribute(source, buffer);
   2445                 return *(coll->contractionCEs +
   2446                                     (tempconstart - coll->contractionIndex));
   2447             }
   2448             constart = tempconstart;
   2449             break;
   2450         }
   2451 
   2452         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
   2453         schar = getNextNormalizedChar(source);
   2454 
                /* advance through the table until an entry that is not smaller
                   than schar is reached */
   2455         while (schar > (tchar = *UCharOffset)) {
   2456             UCharOffset++;
   2457         }
   2458 
   2459         if (schar != tchar) {
   2460             /* not the correct codepoint. we stuff the current codepoint into
   2461             the discontiguous buffer and try the next character */
   2462             buffer.append(schar);
   2463             continue;
   2464         }
   2465         else {
                    /* a table hit is not used when schar has the same combining
                       class as peekCodePoint(source, -2); the character is then
                       treated like any other skipped character */
   2466             if (u_getCombiningClass(schar) ==
   2467                 u_getCombiningClass(peekCodePoint(source, -2))) {
   2468                 buffer.append(schar);
   2469                 continue;
   2470             }
   2471             result = *(coll->contractionCEs +
   2472                                       (UCharOffset - coll->contractionIndex));
   2473         }
   2474 
   2475         if (result == UCOL_NOT_FOUND) {
   2476           break;
   2477         } else if (isContraction(result)) {
   2478             /* this is a multi-contraction*/
   2479             tempconstart = (UChar *)coll->image + getContractOffset(result);
   2480             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
   2481                 != UCOL_NOT_FOUND) {
   2482                 multicontraction = TRUE;
   2483                 temppos       = source->pos + 1;
   2484             }
   2485         } else {
                    /* final CE found: make the skipped characters available for
                       later reading and return it */
   2486             setDiscontiguosAttribute(source, buffer);
   2487             return result;
   2488         }
   2489     }
   2490 
   2491     /* no problems simply reverting just like that,
   2492     if we are in string before getting into this function, points back to
   2493     string hence no problem.
   2494     if we are in normalization buffer before getting into this function,
   2495     since we'll never use another normalization within this function, we
   2496     know that fcdposition points to a base character. the normalization buffer
   2497     never change, hence this revert works. */
   2498     loadState(source, &discState, TRUE);
   2499     goBackOne(source);
   2500 
   2501     //source->pos   = temppos - 1;
   2502     source->flags = tempflags;
   2503     return *(coll->contractionCEs + (constart - coll->contractionIndex));
   2504 }
   2505 
   2506 /* now uses Mark's getImplicitPrimary code */
        /**
        * Builds the two collation elements for an implicitly weighted code point
        * from the value returned by uprv_uca_getImplicitPrimary().
        * The second element (low 16 bits of that value shifted into the upper
        * half, combined with 0xC0) is stored at collationSource->CEpos, which is
        * advanced by one; offsetRepeatCount is incremented by one as well.
        * @param cp code point to weight
        * @param collationSource iterator whose CE buffer receives the second element
        * @return the first element: the bits selected by UCOL_PRIMARYMASK combined
        *         with 0x00000505 (this was 'order')
        */
        static
        inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource)
        {
            const uint32_t implicitPrimary = uprv_uca_getImplicitPrimary(cp);
            const uint32_t queuedCE = ((implicitPrimary & 0x0000FFFF)<<16) | 0x000000C0;
            const uint32_t firstCE  = (implicitPrimary & UCOL_PRIMARYMASK) | 0x00000505;

            *collationSource->CEpos = queuedCE;
            ++collationSource->CEpos;
            ++collationSource->offsetRepeatCount;
            return firstCE;
        }
   2514 
   2515 /**
        * Puts a character at the front of the backwards-iteration buffer.
        * The code unit at index 0 (the front null terminator) is overwritten with
        * ch, a new NUL is inserted in front of it, and data->pos is set two units
        * past the start of the terminated buffer, i.e. just behind ch.
        * @param data collation element iterator data
        * @param ch character to be placed at the front
        */
        static
        inline void insertBufferFront(collIterate *data, UChar ch)
        {
            UnicodeString &normBuffer = data->writableBuffer;

            normBuffer.setCharAt(0, ch);
            normBuffer.insert(0, (UChar)0);
            data->pos = normBuffer.getTerminatedBuffer() + 2;
        }
   2526 
   2527 /**
   2528 * Special normalization function for contraction in the previous iterator.
   2529 * This normalization sequence will place the current character at source->pos
   2530 * and its following normalized sequence into the buffer.
   2531 * The fcd position, pos will be changed.
   2532 * pos will now point to positions in the buffer.
   2533 * Flags will be changed accordingly.
        * The resulting buffer layout is: a leading NUL, the normalized text, then
        * the text that has to stay behind it (either the next character of the
        * string or the previous buffer content without its leading NUL).
        * Nothing but the buffer is modified when normalization fails.
   2534 * @param data collation iterator data
        * @param status receives the error code of the normalization call
   2535 */
   2536 static
   2537 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
   2538 {
   2539     const UChar *pEnd = data->pos + 1;         /* End normalize + 1 */
   2540     const UChar *pStart;
   2541 
            /* saved first, because the normalize() call below writes its output
               into data->writableBuffer */
   2542     UnicodeString endOfBuffer;
   2543     if (data->flags & UCOL_ITER_HASLEN) {
   2544         /*
   2545         normalization buffer not used yet, we'll pull down the next
   2546         character into the end of the buffer
   2547         */
   2548         endOfBuffer.setTo(*pEnd);
   2549     }
   2550     else {
   2551         endOfBuffer.setTo(data->writableBuffer, 1);  // after the leading NUL
   2552     }
   2553 
            /* the text to normalize starts at the beginning of the string or just
               after fcdPosition */
   2554     if (data->fcdPosition == NULL) {
   2555         pStart = data->string;
   2556     }
   2557     else {
   2558         pStart = data->fcdPosition + 1;
   2559     }
   2560     int32_t normLen =
   2561         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
   2562                              data->writableBuffer,
   2563                              *status).
   2564         length();
   2565     if(U_FAILURE(*status)) {
   2566         return;
   2567     }
   2568     /*
   2569     this puts the null termination in front of the normalized string instead
   2570     of the end
   2571     */
            /* pos ends up just behind the normalized text: 1 for the leading NUL
               plus normLen code units */
   2572     data->pos =
   2573         data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
   2574         1 + normLen;
   2575     data->origFlags  = data->flags;
   2576     data->flags     |= UCOL_ITER_INNORMBUF;
   2577     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2578 }
   2579 
   2580 /**
   2581 * Contraction character management function that returns the previous character
   2582 * for the backwards iterator.
   2583 * Does nothing if the previous character is in buffer and not the first
   2584 * character in it.
   2585 * Else it checks previous character in data string to see if it is
   2586 * normalizable.
   2587 * If it is not, the character is simply copied into the buffer, else
   2588 * the whole normalized substring is copied into the buffer, including the
   2589 * current character.
   2590 * @param data collation element iterator data
        * @param status error code; only passed on to normalizePrevContraction
   2591 * @return previous character
   2592 */
   2593 static
   2594 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
   2595 {
   2596     UChar  prevch;
   2597     UChar  ch;
   2598     const UChar *start;
   2599     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
   2600     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
   2601         (innormbuf && *(data->pos - 1) != 0)) {
   2602         /*
   2603         if no normalization.
   2604         if previous character is in normalized buffer, no further normalization
   2605         is required
   2606         */
   2607       if(data->flags & UCOL_USE_ITERATOR) {
                /* step the character iterator back by one and read forward again,
                   so the iterator index is the same after the call */
   2608         data->iterator->move(data->iterator, -1, UITER_CURRENT);
   2609         return (UChar)data->iterator->next(data->iterator);
   2610       } else {
   2611         return *(data->pos - 1);
   2612       }
   2613     }
   2614 
   2615     start = data->pos;
   2616     if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
   2617         /* in data string */
   2618         if ((start - 1) == data->string) {
                    /* first character of the string: nothing precedes it that
                       would need checking */
   2619             return *(start - 1);
   2620         }
   2621         start --;
   2622         ch     = *start;
   2623         prevch = *(start - 1);
   2624     }
   2625     else {
   2626         /*
   2627         in writable buffer, at this point fcdPosition can not be NULL.
   2628         see contracting tag.
   2629         */
   2630         if (data->fcdPosition == data->string) {
   2631             /* at the start of the string, just dump it into the normalizer */
   2632             insertBufferFront(data, *(data->fcdPosition));
   2633             data->fcdPosition = NULL;
   2634             return *(data->pos - 1);
   2635         }
   2636         start  = data->fcdPosition;
   2637         ch     = *start;
   2638         prevch = *(start - 1);
   2639     }
   2640     /*
   2641     * if the current character is not fcd.
   2642     * Trailing combining class == 0.
   2643     */
   2644     if (data->fcdPosition > start &&
   2645        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
   2646     {
   2647         /*
   2648         Need a more complete FCD check and possible normalization.
   2649         normalize substring will be appended to buffer
   2650         */
                /* pos is moved to start only for the FCD check and put back when
                   no normalization turns out to be needed */
   2651         const UChar *backuppos = data->pos;
   2652         data->pos = start;
   2653         if (collPrevIterFCD(data)) {
   2654             normalizePrevContraction(data, status);
   2655             return *(data->pos - 1);
   2656         }
   2657         data->pos = backuppos;
   2658         data->fcdPosition ++;
   2659     }
   2660 
   2661     if (innormbuf) {
   2662     /*
   2663     no normalization is to be done hence only one character will be
   2664     appended to the buffer.
   2665     */
   2666         insertBufferFront(data, ch);
   2667         data->fcdPosition --;
   2668     }
   2669 
   2670     return ch;
   2671 }
   2672 
   2673 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
   2674 /* It is called by getNextCE */
   2675 
   2676 /* The following should be even */
   2677 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
   2678 
   2679 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
   2680     collIterateState entryState;
   2681     backupState(source, &entryState);
   2682     UChar32 cp = ch;
   2683 
   2684     for (;;) {
   2685         // This loop will repeat only in the case of contractions, and only when a contraction
   2686         //   is found and the first CE resulting from that contraction is itself a special
   2687         //   (an expansion, for example.)  All other special CE types are fully handled the
   2688         //   first time through, and the loop exits.
   2689 
   2690         const uint32_t *CEOffset = NULL;
   2691         switch(getCETag(CE)) {
   2692         case NOT_FOUND_TAG:
   2693             /* This one is not found, and we'll let somebody else bother about it... no more games */
   2694             return CE;
   2695         case SPEC_PROC_TAG:
   2696             {
   2697                 // Special processing is getting a CE that is preceded by a certain prefix
   2698                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   2699                 // When we encouter a special processing tag, we go backwards and try to see if
   2700                 // we have a match.
   2701                 // Contraction tables are used - so the whole process is not unlike contraction.
   2702                 // prefix data is stored backwards in the table.
   2703                 const UChar *UCharOffset;
   2704                 UChar schar, tchar;
   2705                 collIterateState prefixState;
   2706                 backupState(source, &prefixState);
   2707                 loadState(source, &entryState, TRUE);
   2708                 goBackOne(source); // We want to look at the point where we entered - actually one
   2709                 // before that...
   2710 
   2711                 for(;;) {
   2712                     // This loop will run once per source string character, for as long as we
   2713                     //  are matching a potential contraction sequence
   2714 
   2715                     // First we position ourselves at the begining of contraction sequence
   2716                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2717                     if (collIter_bos(source)) {
   2718                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2719                         break;
   2720                     }
   2721                     schar = getPrevNormalizedChar(source, status);
   2722                     goBackOne(source);
   2723 
   2724                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2725                         UCharOffset++;
   2726                     }
   2727 
   2728                     if (schar == tchar) {
   2729                         // Found the source string char in the table.
   2730                         //  Pick up the corresponding CE from the table.
   2731                         CE = *(coll->contractionCEs +
   2732                             (UCharOffset - coll->contractionIndex));
   2733                     }
   2734                     else
   2735                     {
   2736                         // Source string char was not in the table.
   2737                         //   We have not found the prefix.
   2738                         CE = *(coll->contractionCEs +
   2739                             (ContractionStart - coll->contractionIndex));
   2740                     }
   2741 
   2742                     if(!isPrefix(CE)) {
   2743                         // The source string char was in the contraction table, and the corresponding
   2744                         //   CE is not a prefix CE.  We found the prefix, break
   2745                         //   out of loop, this CE will end up being returned.  This is the normal
   2746                         //   way out of prefix handling when the source actually contained
   2747                         //   the prefix.
   2748                         break;
   2749                     }
   2750                 }
   2751                 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
   2752                     loadState(source, &prefixState, TRUE);
   2753                     if(source->origFlags & UCOL_USE_ITERATOR) {
   2754                         source->flags = source->origFlags;
   2755                     }
   2756                 } else { // prefix search was a failure, we have to backup all the way to the start
   2757                     loadState(source, &entryState, TRUE);
   2758                 }
   2759                 break;
   2760             }
   2761         case CONTRACTION_TAG:
   2762             {
   2763                 /* This should handle contractions */
   2764                 collIterateState state;
   2765                 backupState(source, &state);
   2766                 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
   2767                 const UChar *UCharOffset;
   2768                 UChar schar, tchar;
   2769 
   2770                 for (;;) {
   2771                     /* This loop will run once per source string character, for as long as we     */
   2772                     /*  are matching a potential contraction sequence                  */
   2773 
   2774                     /* First we position ourselves at the begining of contraction sequence */
   2775                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2776 
   2777                     if (collIter_eos(source)) {
   2778                         // Ran off the end of the source string.
   2779                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2780                         // So we'll pick whatever we have at the point...
   2781                         if (CE == UCOL_NOT_FOUND) {
   2782                             // back up the source over all the chars we scanned going into this contraction.
   2783                             CE = firstCE;
   2784                             loadState(source, &state, TRUE);
   2785                             if(source->origFlags & UCOL_USE_ITERATOR) {
   2786                                 source->flags = source->origFlags;
   2787                             }
   2788                         }
   2789                         break;
   2790                     }
   2791 
   2792                     uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
   2793                     uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
   2794 
   2795                     schar = getNextNormalizedChar(source);
   2796                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2797                         UCharOffset++;
   2798                     }
   2799 
   2800                     if (schar == tchar) {
   2801                         // Found the source string char in the contraction table.
   2802                         //  Pick up the corresponding CE from the table.
   2803                         CE = *(coll->contractionCEs +
   2804                             (UCharOffset - coll->contractionIndex));
   2805                     }
   2806                     else
   2807                     {
   2808                         // Source string char was not in contraction table.
   2809                         //   Unless we have a discontiguous contraction, we have finished
   2810                         //   with this contraction.
   2811                         // in order to do the proper detection, we
   2812                         // need to see if we're dealing with a supplementary
   2813                         /* We test whether the next two char are surrogate pairs.
   2814                         * This test is done if the iterator is not NULL.
   2815                         * If there is no surrogate pair, the iterator
   2816                         * goes back one if needed. */
   2817                         UChar32 miss = schar;
   2818                         if (source->iterator) {
   2819                             UChar32 surrNextChar; /* the next char in the iteration to test */
   2820                             int32_t prevPos; /* holds the previous position before move forward of the source iterator */
   2821                             if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
   2822                                 prevPos = source->iterator->index;
   2823                                 surrNextChar = getNextNormalizedChar(source);
   2824                                 if (U16_IS_TRAIL(surrNextChar)) {
   2825                                     miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
   2826                                 } else if (prevPos < source->iterator->index){
   2827                                     goBackOne(source);
   2828                                 }
   2829                             }
   2830                         } else if (U16_IS_LEAD(schar)) {
   2831                             miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
   2832                         }
   2833 
   2834                         uint8_t sCC;
   2835                         if (miss < 0x300 ||
   2836                             maxCC == 0 ||
   2837                             (sCC = i_getCombiningClass(miss, coll)) == 0 ||
   2838                             sCC>maxCC ||
   2839                             (allSame != 0 && sCC == maxCC) ||
   2840                             collIter_eos(source))
   2841                         {
   2842                             //  Contraction can not be discontiguous.
   2843                             goBackOne(source);  // back up the source string by one,
   2844                             //  because  the character we just looked at was
   2845                             //  not part of the contraction.   */
   2846                             if(U_IS_SUPPLEMENTARY(miss)) {
   2847                                 goBackOne(source);
   2848                             }
   2849                             CE = *(coll->contractionCEs +
   2850                                 (ContractionStart - coll->contractionIndex));
   2851                         } else {
   2852                             //
   2853                             // Contraction is possibly discontiguous.
   2854                             //   Scan more of source string looking for a match
   2855                             //
   2856                             UChar tempchar;
   2857                             /* find the next character if schar is not a base character
   2858                             and we are not yet at the end of the string */
   2859                             tempchar = getNextNormalizedChar(source);
   2860                             // probably need another supplementary thingie here
   2861                             goBackOne(source);
   2862                             if (i_getCombiningClass(tempchar, coll) == 0) {
   2863                                 goBackOne(source);
   2864                                 if(U_IS_SUPPLEMENTARY(miss)) {
   2865                                     goBackOne(source);
   2866                                 }
   2867                                 /* Spit out the last char of the string, wasn't tasty enough */
   2868                                 CE = *(coll->contractionCEs +
   2869                                     (ContractionStart - coll->contractionIndex));
   2870                             } else {
   2871                                 CE = getDiscontiguous(coll, source, ContractionStart);
   2872                             }
   2873                         }
   2874                     } // else after if(schar == tchar)
   2875 
   2876                     if(CE == UCOL_NOT_FOUND) {
   2877                         /* The Source string did not match the contraction that we were checking.  */
   2878                         /*  Back up the source position to undo the effects of having partially    */
   2879                         /*   scanned through what ultimately proved to not be a contraction.       */
   2880                         loadState(source, &state, TRUE);
   2881                         CE = firstCE;
   2882                         break;
   2883                     }
   2884 
   2885                     if(!isContraction(CE)) {
   2886                         // The source string char was in the contraction table, and the corresponding
   2887                         //   CE is not a contraction CE.  We completed the contraction, break
   2888                         //   out of loop, this CE will end up being returned.  This is the normal
   2889                         //   way out of contraction handling when the source actually contained
   2890                         //   the contraction.
   2891                         break;
   2892                     }
   2893 
   2894 
   2895                     // The source string char was in the contraction table, and the corresponding
   2896                     //   CE is IS  a contraction CE.  We will continue looping to check the source
   2897                     //   string for the remaining chars in the contraction.
   2898                     uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
   2899                     if(tempCE != UCOL_NOT_FOUND) {
   2900                         // We have scanned a a section of source string for which there is a
   2901                         //  CE from the contraction table.  Remember the CE and scan position, so
   2902                         //  that we can return to this point if further scanning fails to
   2903                         //  match a longer contraction sequence.
   2904                         firstCE = tempCE;
   2905 
   2906                         goBackOne(source);
   2907                         backupState(source, &state);
   2908                         getNextNormalizedChar(source);
   2909 
   2910                         // Another way to do this is:
   2911                         //collIterateState tempState;
   2912                         //backupState(source, &tempState);
   2913                         //goBackOne(source);
   2914                         //backupState(source, &state);
   2915                         //loadState(source, &tempState, TRUE);
   2916 
   2917                         // The problem is that for incomplete contractions we have to remember the previous
   2918                         // position. Before, the only thing I needed to do was state.pos--;
   2919                         // After iterator introduction and especially after introduction of normalizing
   2920                         // iterators, it became much more difficult to decrease the saved state.
   2921                         // I'm not yet sure which of the two methods above is faster.
   2922                     }
   2923                 } // for(;;)
   2924                 break;
   2925             } // case CONTRACTION_TAG:
   2926         case LONG_PRIMARY_TAG:
   2927             {
   2928                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   2929                 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   2930                 source->offsetRepeatCount += 1;
   2931                 return CE;
   2932             }
   2933         case EXPANSION_TAG:
   2934             {
   2935                 /* This should handle expansion. */
   2936                 /* NOTE: we can encounter both continuations and expansions in an expansion! */
   2937                 /* I have to decide where continuations are going to be dealt with */
   2938                 uint32_t size;
   2939                 uint32_t i;    /* general counter */
   2940 
   2941                 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   2942                 size = getExpansionCount(CE);
   2943                 CE = *CEOffset++;
   2944               //source->offsetRepeatCount = -1;
   2945 
   2946                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   2947                     for(i = 1; i<size; i++) {
   2948                         *(source->CEpos++) = *CEOffset++;
   2949                         source->offsetRepeatCount += 1;
   2950                     }
   2951                 } else { /* else, we do */
   2952                     while(*CEOffset != 0) {
   2953                         *(source->CEpos++) = *CEOffset++;
   2954                         source->offsetRepeatCount += 1;
   2955                     }
   2956                 }
   2957 
   2958                 return CE;
   2959             }
   2960         case DIGIT_TAG:
   2961             {
   2962                 /*
   2963                 We do a check to see if we want to collate digits as numbers; if so we generate
   2964                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   2965                 */
   2966                 //uint32_t size;
   2967                 uint32_t i;    /* general counter */
   2968 
   2969                 if (source->coll->numericCollation == UCOL_ON){
   2970                     collIterateState digitState = {0,0,0,0,0,0,0,0,0};
   2971                     UChar32 char32 = 0;
   2972                     int32_t digVal = 0;
   2973 
   2974                     uint32_t digIndx = 0;
   2975                     uint32_t endIndex = 0;
   2976                     uint32_t trailingZeroIndex = 0;
   2977 
   2978                     uint8_t collateVal = 0;
   2979 
   2980                     UBool nonZeroValReached = FALSE;
   2981 
   2982                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
   2983                     /*
   2984                          We parse the source string until we hit a char that's NOT a digit.
   2985                         Use this u_charDigitValue. This might be slow because we have to
   2986                         handle surrogates...
   2987                     */
   2988             /*
   2989                     if (U16_IS_LEAD(ch)){
   2990                       if (!collIter_eos(source)) {
   2991                         backupState(source, &digitState);
   2992                         UChar trail = getNextNormalizedChar(source);
   2993                         if(U16_IS_TRAIL(trail)) {
   2994                           char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   2995                         } else {
   2996                           loadState(source, &digitState, TRUE);
   2997                           char32 = ch;
   2998                         }
   2999                       } else {
   3000                         char32 = ch;
   3001                       }
   3002                     } else {
   3003                       char32 = ch;
   3004                     }
   3005                     digVal = u_charDigitValue(char32);
   3006             */
   3007                     digVal = u_charDigitValue(cp); // if we have arrived here, we have
   3008                     // already processed possible supplementaries that trigered the digit tag -
   3009                     // all supplementaries are marked in the UCA.
   3010                     /*
   3011                         We  pad a zero in front of the first element anyways. This takes
   3012                         care of the (probably) most common case where people are sorting things followed
   3013                         by a single digit
   3014                     */
   3015                     digIndx++;
   3016                     for(;;){
   3017                         // Make sure we have enough space. No longer needed;
   3018                         // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
   3019                         // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
   3020                         // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
   3021 
   3022                         // Skipping over leading zeroes.
   3023                         if (digVal != 0) {
   3024                             nonZeroValReached = TRUE;
   3025                         }
   3026                         if (nonZeroValReached) {
   3027                             /*
   3028                             We parse the digit string into base 100 numbers (this fits into a byte).
   3029                             We only add to the buffer in twos, thus if we are parsing an odd character,
   3030                             that serves as the 'tens' digit while the if we are parsing an even one, that
   3031                             is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
   3032                             a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
   3033                             overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3034                             than all the other bytes.
   3035                             */
   3036 
   3037                             if (digIndx % 2 == 1){
   3038                                 collateVal += (uint8_t)digVal;
   3039 
   3040                                 // We don't enter the low-order-digit case unless we've already seen
   3041                                 // the high order, or for the first digit, which is always non-zero.
   3042                                 if (collateVal != 0)
   3043                                     trailingZeroIndex = 0;
   3044 
   3045                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3046                                 collateVal = 0;
   3047                             }
   3048                             else{
   3049                                 // We drop the collation value into the buffer so if we need to do
   3050                                 // a "front patch" we don't have to check to see if we're hitting the
   3051                                 // last element.
   3052                                 collateVal = (uint8_t)(digVal * 10);
   3053 
   3054                                 // Check for trailing zeroes.
   3055                                 if (collateVal == 0)
   3056                                 {
   3057                                     if (!trailingZeroIndex)
   3058                                         trailingZeroIndex = (digIndx/2) + 2;
   3059                                 }
   3060                                 else
   3061                                     trailingZeroIndex = 0;
   3062 
   3063                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3064                             }
   3065                             digIndx++;
   3066                         }
   3067 
   3068                         // Get next character.
   3069                         if (!collIter_eos(source)){
   3070                             ch = getNextNormalizedChar(source);
   3071                             if (U16_IS_LEAD(ch)){
   3072                                 if (!collIter_eos(source)) {
   3073                                     backupState(source, &digitState);
   3074                                     UChar trail = getNextNormalizedChar(source);
   3075                                     if(U16_IS_TRAIL(trail)) {
   3076                                         char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   3077                                     } else {
   3078                                         loadState(source, &digitState, TRUE);
   3079                                         char32 = ch;
   3080                                     }
   3081                                 }
   3082                             } else {
   3083                                 char32 = ch;
   3084                             }
   3085 
   3086                             if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
   3087                                 // Resetting position to point to the next unprocessed char. We
   3088                                 // overshot it when doing our test/set for numbers.
   3089                                 if (char32 > 0xFFFF) { // For surrogates.
   3090                                     loadState(source, &digitState, TRUE);
   3091                                     //goBackOne(source);
   3092                                 }
   3093                                 goBackOne(source);
   3094                                 break;
   3095                             }
   3096                         } else {
   3097                             break;
   3098                         }
   3099                     }
   3100 
   3101                     if (nonZeroValReached == FALSE){
   3102                         digIndx = 2;
   3103                         numTempBuf[2] = 6;
   3104                     }
   3105 
   3106                     endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
   3107                     if (digIndx % 2 != 0){
   3108                         /*
   3109                         We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
   3110                         we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
   3111                         Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
   3112                         single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
   3113                         */
   3114 
   3115                         for(i = 2; i < endIndex; i++){
   3116                             numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
   3117                                 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
   3118                         }
   3119                         --digIndx;
   3120                     }
   3121 
   3122                     // Subtract one off of the last byte.
   3123                     numTempBuf[endIndex-1] -= 1;
   3124 
   3125                     /*
   3126                     We want to skip over the first two slots in the buffer. The first slot
   3127                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3128                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3129                     */
   3130                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3131                     numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
   3132 
   3133                     // Now transfer the collation key to our collIterate struct.
   3134                     // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
   3135                     //size = ((endIndex+1) & ~1)/2;
   3136                     CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3137                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3138                         UCOL_BYTE_COMMON; // Tertiary weight.
   3139                     i = 2; // Reset the index into the buffer.
   3140                     while(i < endIndex)
   3141                     {
   3142                         uint32_t primWeight = numTempBuf[i++] << 8;
   3143                         if ( i < endIndex)
   3144                             primWeight |= numTempBuf[i++];
   3145                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3146                     }
   3147 
   3148                 } else {
   3149                     // no numeric mode, we'll just switch to whatever we stashed and continue
   3150                     CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   3151                     CE = *CEOffset++;
   3152                     break;
   3153                 }
   3154                 return CE;
   3155             }
   3156             /* various implicits optimization */
   3157         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   3158             /* UCA is filled with these. Tailorings are NOT_FOUND */
   3159             return getImplicit(cp, source);
   3160         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   3161             // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
   3162             return getImplicit(cp, source);
   3163         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3164             {
   3165                 static const uint32_t
   3166                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3167                 //const uint32_t LCount = 19;
   3168                 static const uint32_t VCount = 21;
   3169                 static const uint32_t TCount = 28;
   3170                 //const uint32_t NCount = VCount * TCount;   // 588
   3171                 //const uint32_t SCount = LCount * NCount;   // 11172
   3172                 uint32_t L = ch - SBase;
   3173 
   3174                 // divide into pieces
   3175 
   3176                 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
   3177                 L /= TCount;
   3178                 uint32_t V = L % VCount;
   3179                 L /= VCount;
   3180 
   3181                 // offset them
   3182 
   3183                 L += LBase;
   3184                 V += VBase;
   3185                 T += TBase;
   3186 
   3187                 // return the first CE, but first put the rest into the expansion buffer
   3188                 if (!source->coll->image->jamoSpecial) { // FAST PATH
   3189 
   3190                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   3191                     if (T != TBase) {
   3192                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   3193                     }
   3194 
   3195                     return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   3196 
   3197                 } else { // Jamo is Special
   3198                     // Since Hanguls pass the FCD check, it is
   3199                     // guaranteed that we won't be in
   3200                     // the normalization buffer if something like this happens
   3201 
   3202                     // However, if we are using a uchar iterator and normalization
   3203                     // is ON, the Hangul that lead us here is going to be in that
   3204                     // normalization buffer. Here we want to restore the uchar
   3205                     // iterator state and pull out of the normalization buffer
   3206                     if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
   3207                         source->flags = source->origFlags; // restore the iterator
   3208                         source->pos = NULL;
   3209                     }
   3210 
   3211                     // Move Jamos into normalization buffer
   3212                     UChar *buffer = source->writableBuffer.getBuffer(4);
   3213                     int32_t bufferLength;
   3214                     buffer[0] = (UChar)L;
   3215                     buffer[1] = (UChar)V;
   3216                     if (T != TBase) {
   3217                         buffer[2] = (UChar)T;
   3218                         bufferLength = 3;
   3219                     } else {
   3220                         bufferLength = 2;
   3221                     }
   3222                     source->writableBuffer.releaseBuffer(bufferLength);
   3223 
   3224                     // Indicate where to continue in main input string after exhausting the writableBuffer
   3225                     source->fcdPosition       = source->pos;
   3226 
   3227                     source->pos   = source->writableBuffer.getTerminatedBuffer();
   3228                     source->origFlags   = source->flags;
   3229                     source->flags       |= UCOL_ITER_INNORMBUF;
   3230                     source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   3231 
   3232                     return(UCOL_IGNORABLE);
   3233                 }
   3234             }
   3235         case SURROGATE_TAG:
   3236             /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
   3237             /* two things can happen here: next code point can be a trailing surrogate - we will use it */
   3238             /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
   3239             /* we treat it like an unassigned code point. */
   3240             {
   3241                 UChar trail;
   3242                 collIterateState state;
   3243                 backupState(source, &state);
   3244                 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
   3245                     // we chould have stepped one char forward and it might have turned that it
   3246                     // was not a trail surrogate. In that case, we have to backup.
   3247                     loadState(source, &state, TRUE);
   3248                     return UCOL_NOT_FOUND;
   3249                 } else {
   3250                     /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
   3251                     CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
   3252                     if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
   3253                         // We need to backup
   3254                         loadState(source, &state, TRUE);
   3255                         return CE;
   3256                     }
   3257                     // calculate the supplementary code point value, if surrogate was not tailored
   3258                     cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   3259                 }
   3260             }
   3261             break;
   3262         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   3263             UChar nextChar;
   3264             if( source->flags & UCOL_USE_ITERATOR) {
   3265                 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
   3266                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3267                     source->iterator->next(source->iterator);
   3268                     return getImplicit(cp, source);
   3269                 }
   3270             } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
   3271                       U_IS_TRAIL((nextChar=*source->pos))) {
   3272                 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3273                 source->pos++;
   3274                 return getImplicit(cp, source);
   3275             }
   3276             return UCOL_NOT_FOUND;
   3277         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   3278             return UCOL_NOT_FOUND; /* broken surrogate sequence */
   3279         case CHARSET_TAG:
   3280             /* not yet implemented */
   3281             /* probably after 1.8 */
   3282             return UCOL_NOT_FOUND;
   3283         default:
   3284             *status = U_INTERNAL_PROGRAM_ERROR;
   3285             CE=0;
   3286             break;
   3287     }
   3288     if (CE <= UCOL_NOT_FOUND) break;
   3289   }
   3290   return CE;
   3291 }
   3292 
   3293 
   3294 /* now uses Mark's getImplicitPrimary code */
   3295 static
   3296 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
   3297     uint32_t r = uprv_uca_getImplicitPrimary(cp);
   3298 
   3299     *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
   3300     collationSource->toReturn = collationSource->CEpos;
   3301 
   3302     // **** doesn't work if using iterator ****
   3303     if (collationSource->flags & UCOL_ITER_INNORMBUF) {
   3304         collationSource->offsetRepeatCount = 1;
   3305     } else {
   3306         int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
   3307 
   3308         UErrorCode errorCode = U_ZERO_ERROR;
   3309         collationSource->appendOffset(firstOffset, errorCode);
   3310         collationSource->appendOffset(firstOffset + 1, errorCode);
   3311 
   3312         collationSource->offsetReturn = collationSource->offsetStore - 1;
   3313         *(collationSource->offsetBuffer) = firstOffset;
   3314         if (collationSource->offsetReturn == collationSource->offsetBuffer) {
   3315             collationSource->offsetStore = collationSource->offsetBuffer;
   3316         }
   3317     }
   3318 
   3319     return ((r & 0x0000FFFF)<<16) | 0x000000C0;
   3320 }
   3321 
   3322 /**
   3323  * This function handles the special CEs like contractions, expansions,
   3324  * surrogates, Thai.
   3325  * It is called by both getPrevCE
   3326  */
   3327 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
   3328                           collIterate *source,
   3329                           UErrorCode *status)
   3330 {
   3331     const uint32_t *CEOffset    = NULL;
   3332           UChar    *UCharOffset = NULL;
   3333           UChar    schar;
   3334     const UChar    *constart    = NULL;
   3335           uint32_t size;
   3336           UChar    buffer[UCOL_MAX_BUFFER];
   3337           uint32_t *endCEBuffer;
   3338           UChar   *strbuffer;
   3339           int32_t noChars = 0;
   3340           int32_t CECount = 0;
   3341 
   3342     for(;;)
   3343     {
   3344         /* the only ces that loops are thai and contractions */
   3345         switch (getCETag(CE))
   3346         {
   3347         case NOT_FOUND_TAG:  /* this tag always returns */
   3348             return CE;
   3349 
   3350         case SPEC_PROC_TAG:
   3351             {
   3352                 // Special processing is getting a CE that is preceded by a certain prefix
   3353                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   3354                 // When we encouter a special processing tag, we go backwards and try to see if
   3355                 // we have a match.
   3356                 // Contraction tables are used - so the whole process is not unlike contraction.
   3357                 // prefix data is stored backwards in the table.
   3358                 const UChar *UCharOffset;
   3359                 UChar schar, tchar;
   3360                 collIterateState prefixState;
   3361                 backupState(source, &prefixState);
   3362                 for(;;) {
   3363                     // This loop will run once per source string character, for as long as we
   3364                     //  are matching a potential contraction sequence
   3365 
   3366                     // First we position ourselves at the begining of contraction sequence
   3367                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   3368 
   3369                     if (collIter_bos(source)) {
   3370                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   3371                         break;
   3372                     }
   3373                     schar = getPrevNormalizedChar(source, status);
   3374                     goBackOne(source);
   3375 
   3376                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   3377                         UCharOffset++;
   3378                     }
   3379 
   3380                     if (schar == tchar) {
   3381                         // Found the source string char in the table.
   3382                         //  Pick up the corresponding CE from the table.
   3383                         CE = *(coll->contractionCEs +
   3384                             (UCharOffset - coll->contractionIndex));
   3385                     }
   3386                     else
   3387                     {
   3388                         // if there is a completely ignorable code point in the middle of
   3389                         // a prefix, we need to act as if it's not there
   3390                         // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
   3391                         // lone surrogates cannot be set to zero as it would break other processing
   3392                         uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   3393                         // it's easy for BMP code points
   3394                         if(isZeroCE == 0) {
   3395                             continue;
   3396                         } else if(U16_IS_SURROGATE(schar)) {
   3397                             // for supplementary code points, we have to check the next one
   3398                             // situations where we are going to ignore
   3399                             // 1. beginning of the string: schar is a lone surrogate
   3400                             // 2. schar is a lone surrogate
   3401                             // 3. schar is a trail surrogate in a valid surrogate sequence
   3402                             //    that is explicitly set to zero.
   3403                             if (!collIter_bos(source)) {
   3404                                 UChar lead;
   3405                                 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
   3406                                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
   3407                                     if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
   3408                                         uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
   3409                                         if(finalCE == 0) {
   3410                                             // this is a real, assigned completely ignorable code point
   3411                                             goBackOne(source);
   3412                                             continue;
   3413                                         }
   3414                                     }
   3415                                 } else {
   3416                                     // lone surrogate, treat like unassigned
   3417                                     return UCOL_NOT_FOUND;
   3418                                 }
   3419                             } else {
   3420                                 // lone surrogate at the beginning, treat like unassigned
   3421                                 return UCOL_NOT_FOUND;
   3422                             }
   3423                         }
   3424                         // Source string char was not in the table.
   3425                         //   We have not found the prefix.
   3426                         CE = *(coll->contractionCEs +
   3427                             (ContractionStart - coll->contractionIndex));
   3428                     }
   3429 
   3430                     if(!isPrefix(CE)) {
   3431                         // The source string char was in the contraction table, and the corresponding
   3432                         //   CE is not a prefix CE.  We found the prefix, break
   3433                         //   out of loop, this CE will end up being returned.  This is the normal
   3434                         //   way out of prefix handling when the source actually contained
   3435                         //   the prefix.
   3436                         break;
   3437                     }
   3438                 }
   3439                 loadState(source, &prefixState, TRUE);
   3440                 break;
   3441             }
   3442 
   3443         case CONTRACTION_TAG: {
   3444             /* to ensure that the backwards and forwards iteration matches, we
   3445             take the current region of most possible match and pass it through
   3446             the forward iteration. this will ensure that the obstinate problem of
   3447             overlapping contractions will not occur.
   3448             */
   3449             schar = peekCodeUnit(source, 0);
   3450             constart = (UChar *)coll->image + getContractOffset(CE);
   3451             if (isAtStartPrevIterate(source)
   3452                 /* commented away contraction end checks after adding the checks
   3453                 in getPrevCE  */) {
   3454                     /* start of string or this is not the end of any contraction */
   3455                     CE = *(coll->contractionCEs +
   3456                         (constart - coll->contractionIndex));
   3457                     break;
   3458             }
   3459             strbuffer = buffer;
   3460             UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
   3461             *(UCharOffset --) = 0;
   3462             noChars = 0;
   3463             // have to swap thai characters
   3464             while (ucol_unsafeCP(schar, coll)) {
   3465                 *(UCharOffset) = schar;
   3466                 noChars++;
   3467                 UCharOffset --;
   3468                 schar = getPrevNormalizedChar(source, status);
   3469                 goBackOne(source);
   3470                 // TODO: when we exhaust the contraction buffer,
   3471                 // it needs to get reallocated. The problem is
   3472                 // that the size depends on the string which is
   3473                 // not iterated over. However, since we're travelling
   3474                 // backwards, we already had to set the iterator at
   3475                 // the end - so we might as well know where we are?
   3476                 if (UCharOffset + 1 == buffer) {
   3477                     /* we have exhausted the buffer */
   3478                     int32_t newsize = 0;
   3479                     if(source->pos) { // actually dealing with a position
   3480                         newsize = (int32_t)(source->pos - source->string + 1);
   3481                     } else { // iterator
   3482                         newsize = 4 * UCOL_MAX_BUFFER;
   3483                     }
   3484                     strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
   3485                         (newsize + UCOL_MAX_BUFFER));
   3486                     /* test for NULL */
   3487                     if (strbuffer == NULL) {
   3488                         *status = U_MEMORY_ALLOCATION_ERROR;
   3489                         return UCOL_NO_MORE_CES;
   3490                     }
   3491                     UCharOffset = strbuffer + newsize;
   3492                     uprv_memcpy(UCharOffset, buffer,
   3493                         UCOL_MAX_BUFFER * sizeof(UChar));
   3494                     UCharOffset --;
   3495                 }
   3496                 if ((source->pos && (source->pos == source->string ||
   3497                     ((source->flags & UCOL_ITER_INNORMBUF) &&
   3498                     *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
   3499                     || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
   3500                         break;
   3501                 }
   3502             }
   3503             /* adds the initial base character to the string */
   3504             *(UCharOffset) = schar;
   3505             noChars++;
   3506 
   3507             int32_t offsetBias;
   3508 
   3509             // **** doesn't work if using iterator ****
   3510             if (source->flags & UCOL_ITER_INNORMBUF) {
   3511                 offsetBias = -1;
   3512             } else {
   3513                 offsetBias = (int32_t)(source->pos - source->string);
   3514             }
   3515 
   3516             /* a new collIterate is used to simplify things, since using the current
   3517             collIterate will mean that the forward and backwards iteration will
   3518             share and change the same buffers. we don't want to get into that. */
   3519             collIterate temp;
   3520             int32_t rawOffset;
   3521 
   3522             IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
   3523             if(U_FAILURE(*status)) {
   3524                 return UCOL_NULLORDER;
   3525             }
   3526             temp.flags &= ~UCOL_ITER_NORM;
   3527             temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
   3528 
   3529             rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
   3530             CE = ucol_IGetNextCE(coll, &temp, status);
   3531 
   3532             if (source->extendCEs) {
   3533                 endCEBuffer = source->extendCEs + source->extendCEsSize;
   3534                 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
   3535             } else {
   3536                 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
   3537                 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
   3538             }
   3539 
   3540             while (CE != UCOL_NO_MORE_CES) {
   3541                 *(source->CEpos ++) = CE;
   3542 
   3543                 if (offsetBias >= 0) {
   3544                     source->appendOffset(rawOffset + offsetBias, *status);
   3545                 }
   3546 
   3547                 CECount++;
   3548                 if (source->CEpos == endCEBuffer) {
   3549                     /* ran out of CE space, reallocate to new buffer.
   3550                     If reallocation fails, reset pointers and bail out,
   3551                     there's no guarantee of the right character position after
   3552                     this bail*/
   3553                     if (!increaseCEsCapacity(source)) {
   3554                         *status = U_MEMORY_ALLOCATION_ERROR;
   3555                         break;
   3556                     }
   3557 
   3558                     endCEBuffer = source->extendCEs + source->extendCEsSize;
   3559                 }
   3560 
   3561                 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
   3562                     rawOffset = (int32_t)(temp.fcdPosition - temp.string);
   3563                 } else {
   3564                     rawOffset = (int32_t)(temp.pos - temp.string);
   3565                 }
   3566 
   3567                 CE = ucol_IGetNextCE(coll, &temp, status);
   3568             }
   3569 
   3570             if (strbuffer != buffer) {
   3571                 uprv_free(strbuffer);
   3572             }
   3573             if (U_FAILURE(*status)) {
   3574                 return (uint32_t)UCOL_NULLORDER;
   3575             }
   3576 
   3577             if (source->offsetRepeatValue != 0) {
   3578                 if (CECount > noChars) {
   3579                     source->offsetRepeatCount += temp.offsetRepeatCount;
   3580                 } else {
   3581                     // **** does this really skip the right offsets? ****
   3582                     source->offsetReturn -= (noChars - CECount);
   3583                 }
   3584             }
   3585 
   3586             if (offsetBias >= 0) {
   3587                 source->offsetReturn = source->offsetStore - 1;
   3588                 if (source->offsetReturn == source->offsetBuffer) {
   3589                     source->offsetStore = source->offsetBuffer;
   3590                 }
   3591             }
   3592 
   3593             source->toReturn = source->CEpos - 1;
   3594             if (source->toReturn == source->CEs) {
   3595                 source->CEpos = source->CEs;
   3596             }
   3597 
   3598             return *(source->toReturn);
   3599         }
   3600         case LONG_PRIMARY_TAG:
   3601             {
   3602                 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   3603                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   3604                 source->toReturn = source->CEpos - 1;
   3605 
   3606                 if (source->flags & UCOL_ITER_INNORMBUF) {
   3607                     source->offsetRepeatCount = 1;
   3608                 } else {
   3609                     int32_t firstOffset = (int32_t)(source->pos - source->string);
   3610 
   3611                     source->appendOffset(firstOffset, *status);
   3612                     source->appendOffset(firstOffset + 1, *status);
   3613 
   3614                     source->offsetReturn = source->offsetStore - 1;
   3615                     *(source->offsetBuffer) = firstOffset;
   3616                     if (source->offsetReturn == source->offsetBuffer) {
   3617                         source->offsetStore = source->offsetBuffer;
   3618                     }
   3619                 }
   3620 
   3621 
   3622                 return *(source->toReturn);
   3623             }
   3624 
   3625         case EXPANSION_TAG: /* this tag always returns */
   3626             {
   3627             /*
   3628             This should handle expansion.
   3629             NOTE: we can encounter both continuations and expansions in an expansion!
   3630             I have to decide where continuations are going to be dealt with
   3631             */
   3632             int32_t firstOffset = (int32_t)(source->pos - source->string);
   3633 
   3634             // **** doesn't work if using iterator ****
   3635             if (source->offsetReturn != NULL) {
   3636                 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
   3637                     source->offsetStore = source->offsetBuffer;
   3638                 }else {
   3639                   firstOffset = -1;
   3640                 }
   3641             }
   3642 
   3643             /* find the offset to expansion table */
   3644             CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3645             size     = getExpansionCount(CE);
   3646             if (size != 0) {
   3647                 /*
   3648                 if there are less than 16 elements in expansion, we don't terminate
   3649                 */
   3650                 uint32_t count;
   3651 
   3652                 for (count = 0; count < size; count++) {
   3653                     *(source->CEpos ++) = *CEOffset++;
   3654 
   3655                     if (firstOffset >= 0) {
   3656                         source->appendOffset(firstOffset + 1, *status);
   3657                     }
   3658                 }
   3659             } else {
   3660                 /* else, we do */
   3661                 while (*CEOffset != 0) {
   3662                     *(source->CEpos ++) = *CEOffset ++;
   3663 
   3664                     if (firstOffset >= 0) {
   3665                         source->appendOffset(firstOffset + 1, *status);
   3666                     }
   3667                 }
   3668             }
   3669 
   3670             if (firstOffset >= 0) {
   3671                 source->offsetReturn = source->offsetStore - 1;
   3672                 *(source->offsetBuffer) = firstOffset;
   3673                 if (source->offsetReturn == source->offsetBuffer) {
   3674                     source->offsetStore = source->offsetBuffer;
   3675                 }
   3676             } else {
   3677                 source->offsetRepeatCount += size - 1;
   3678             }
   3679 
   3680             source->toReturn = source->CEpos - 1;
   3681             // in case of one element expansion, we
   3682             // want to immediately return CEpos
   3683             if(source->toReturn == source->CEs) {
   3684                 source->CEpos = source->CEs;
   3685             }
   3686 
   3687             return *(source->toReturn);
   3688             }
   3689 
   3690         case DIGIT_TAG:
   3691             {
   3692                 /*
   3693                 We do a check to see if we want to collate digits as numbers; if so we generate
   3694                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   3695                 */
   3696                 uint32_t i;    /* general counter */
   3697 
   3698                 if (source->coll->numericCollation == UCOL_ON){
   3699                     uint32_t digIndx = 0;
   3700                     uint32_t endIndex = 0;
   3701                     uint32_t leadingZeroIndex = 0;
   3702                     uint32_t trailingZeroCount = 0;
   3703 
   3704                     uint8_t collateVal = 0;
   3705 
   3706                     UBool nonZeroValReached = FALSE;
   3707 
   3708                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
   3709                     /*
   3710                     We parse the source string until we hit a char that's NOT a digit.
   3711                     Use this u_charDigitValue. This might be slow because we have to
   3712                     handle surrogates...
   3713                     */
   3714                     /*
   3715                     We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
   3716                     with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
   3717                     element we process when going backward. To determine how long that chunk might be, we may need to make
   3718                     two passes through the loop that collects digits - one to see how long the string is (and how much is
   3719                     leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
   3720                     more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
   3721                     element chunk after resetting the state to the initialState at the right side of the digit string.
   3722                     */
   3723                     uint32_t ceLimit = 0;
   3724                     UChar initial_ch = ch;
   3725                     collIterateState initialState = {0,0,0,0,0,0,0,0,0};
   3726                     backupState(source, &initialState);
   3727 
   3728                     for(;;) {
   3729                         collIterateState state = {0,0,0,0,0,0,0,0,0};
   3730                         UChar32 char32 = 0;
   3731                         int32_t digVal = 0;
   3732 
   3733                         if (U16_IS_TRAIL (ch)) {
   3734                             if (!collIter_bos(source)){
   3735                                 UChar lead = getPrevNormalizedChar(source, status);
   3736                                 if(U16_IS_LEAD(lead)) {
   3737                                     char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3738                                     goBackOne(source);
   3739                                 } else {
   3740                                     char32 = ch;
   3741                                 }
   3742                             } else {
   3743                                 char32 = ch;
   3744                             }
   3745                         } else {
   3746                             char32 = ch;
   3747                         }
   3748                         digVal = u_charDigitValue(char32);
   3749 
   3750                         for(;;) {
   3751                             // Make sure we have enough space. No longer needed;
   3752                             // at this point the largest value of digIndx when we need to save data in numTempBuf
   3753                             // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
   3754                             // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
   3755 
   3756                             // Skip over trailing zeroes, and keep a count of them.
   3757                             if (digVal != 0)
   3758                                 nonZeroValReached = TRUE;
   3759 
   3760                             if (nonZeroValReached) {
   3761                                 /*
   3762                                 We parse the digit string into base 100 numbers (this fits into a byte).
   3763                                 We only add to the buffer in twos, thus if we are parsing an odd character,
   3764                                 that serves as the 'tens' digit while the if we are parsing an even one, that
   3765                                 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
   3766                                 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
   3767                                 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3768                                 than all the other bytes.
   3769 
   3770                                 Since we're doing in this reverse we want to put the first digit encountered into the
   3771                                 ones place and the second digit encountered into the tens place.
   3772                                 */
   3773 
   3774                                 if ((digIndx + trailingZeroCount) % 2 == 1) {
   3775                                     // High-order digit case (tens place)
   3776                                     collateVal += (uint8_t)(digVal * 10);
   3777 
   3778                                     // We cannot set leadingZeroIndex unless it has been set for the
   3779                                     // low-order digit. Therefore, all we can do for the high-order
   3780                                     // digit is turn it off, never on.
   3781                                     // The only time we will have a high digit without a low is for
   3782                                     // the very first non-zero digit, so no zero check is necessary.
   3783                                     if (collateVal != 0)
   3784                                         leadingZeroIndex = 0;
   3785 
   3786                                     // The first pass through, digIndx may exceed the limit, but in that case
   3787                                     // we no longer care about numTempBuf contents since they will be discarded
   3788                                     if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
   3789                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3790                                     }
   3791                                     collateVal = 0;
   3792                                 } else {
   3793                                     // Low-order digit case (ones place)
   3794                                     collateVal = (uint8_t)digVal;
   3795 
   3796                                     // Check for leading zeroes.
   3797                                     if (collateVal == 0) {
   3798                                         if (!leadingZeroIndex)
   3799                                             leadingZeroIndex = (digIndx/2) + 2;
   3800                                     } else
   3801                                         leadingZeroIndex = 0;
   3802 
   3803                                     // No need to write to buffer; the case of a last odd digit
   3804                                     // is handled below.
   3805                                 }
   3806                                 ++digIndx;
   3807                             } else
   3808                                 ++trailingZeroCount;
   3809 
   3810                             if (!collIter_bos(source)) {
   3811                                 ch = getPrevNormalizedChar(source, status);
   3812                                 //goBackOne(source);
   3813                                 if (U16_IS_TRAIL(ch)) {
   3814                                     backupState(source, &state);
   3815                                     if (!collIter_bos(source)) {
   3816                                         goBackOne(source);
   3817                                         UChar lead = getPrevNormalizedChar(source, status);
   3818 
   3819                                         if(U16_IS_LEAD(lead)) {
   3820                                             char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3821                                         } else {
   3822                                             loadState(source, &state, FALSE);
   3823                                             char32 = ch;
   3824                                         }
   3825                                     }
   3826                                 } else
   3827                                     char32 = ch;
   3828 
   3829                                 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
   3830                                     if (char32 > 0xFFFF) {// For surrogates.
   3831                                         loadState(source, &state, FALSE);
   3832                                     }
   3833                                     // Don't need to "reverse" the goBackOne call,
   3834                                     // as this points to the next position to process..
   3835                                     //if (char32 > 0xFFFF) // For surrogates.
   3836                                     //getNextNormalizedChar(source);
   3837                                     break;
   3838                                 }
   3839 
   3840                                 goBackOne(source);
   3841                             }else
   3842                                 break;
   3843                         }
   3844 
   3845                         if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
   3846                             // our collation element is not too big, go ahead and finish with it
   3847                             break;
   3848                         }
   3849                         // our digit string is too long for a collation element;
   3850                         // set the limit for it, reset the state and begin again
   3851                         ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
   3852                         if ( ceLimit == 0 ) {
   3853                             ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
   3854                         }
   3855                         ch = initial_ch;
   3856                         loadState(source, &initialState, FALSE);
   3857                         digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
   3858                         collateVal = 0;
   3859                         nonZeroValReached = FALSE;
   3860                     }
   3861 
   3862                     if (! nonZeroValReached) {
   3863                         digIndx = 2;
   3864                         trailingZeroCount = 0;
   3865                         numTempBuf[2] = 6;
   3866                     }
   3867 
   3868                     if ((digIndx + trailingZeroCount) % 2 != 0) {
   3869                         numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
   3870                         digIndx += 1;       // The implicit leading zero
   3871                     }
   3872                     if (trailingZeroCount % 2 != 0) {
   3873                         // We had to consume one trailing zero for the low digit
   3874                         // of the least significant byte
   3875                         digIndx += 1;       // The trailing zero not in the exponent
   3876                         trailingZeroCount -= 1;
   3877                     }
   3878 
   3879                     endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
   3880 
   3881                     // Subtract one off of the last byte. Really the first byte here, but it's reversed...
   3882                     numTempBuf[2] -= 1;
   3883 
   3884                     /*
   3885                     We want to skip over the first two slots in the buffer. The first slot
   3886                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3887                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3888                     The exponent must be adjusted by the number of leading zeroes, and the number of
   3889                     trailing zeroes.
   3890                     */
   3891                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3892                     uint32_t exponent = (digIndx+trailingZeroCount)/2;
   3893                     if (leadingZeroIndex)
   3894                         exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
   3895                     numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
   3896 
   3897                     // Now transfer the collation key to our collIterate struct.
   3898                     // The total size for our collation key is half of endIndex, rounded up.
   3899                     int32_t size = (endIndex+1)/2;
   3900                     if(!ensureCEsCapacity(source, size)) {
   3901                         return UCOL_NULLORDER;
   3902                     }
   3903                     *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3904                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3905                         UCOL_BYTE_COMMON; // Tertiary weight.
   3906                     i = endIndex - 1; // Reset the index into the buffer.
   3907                     while(i >= 2) {
   3908                         uint32_t primWeight = numTempBuf[i--] << 8;
   3909                         if ( i >= 2)
   3910                             primWeight |= numTempBuf[i--];
   3911                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3912                     }
   3913 
   3914                     source->toReturn = source->CEpos -1;
   3915                     return *(source->toReturn);
   3916                 } else {
   3917                     CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3918                     CE = *(CEOffset++);
   3919                     break;
   3920                 }
   3921             }
   3922 
   3923         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3924             {
   3925                 static const uint32_t
   3926                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3927                 //const uint32_t LCount = 19;
   3928                 static const uint32_t VCount = 21;
   3929                 static const uint32_t TCount = 28;
   3930                 //const uint32_t NCount = VCount * TCount;   /* 588 */
   3931                 //const uint32_t SCount = LCount * NCount;   /* 11172 */
   3932 
   3933                 uint32_t L = ch - SBase;
   3934                 /*
   3935                 divide into pieces.
   3936                 we do it in this order since some compilers can do % and / in one
   3937                 operation
   3938                 */
   3939                 uint32_t T = L % TCount;
   3940                 L /= TCount;
   3941                 uint32_t V = L % VCount;
   3942                 L /= VCount;
   3943 
   3944                 /* offset them */
   3945                 L += LBase;
   3946                 V += VBase;
   3947                 T += TBase;
   3948 
   3949                 int32_t firstOffset = (int32_t)(source->pos - source->string);
   3950                 source->appendOffset(firstOffset, *status);
   3951 
   3952                 /*
   3953                  * return the first CE, but first put the rest into the expansion buffer
   3954                  */
   3955                 if (!source->coll->image->jamoSpecial) {
   3956                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   3957                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   3958                     source->appendOffset(firstOffset + 1, *status);
   3959 
   3960                     if (T != TBase) {
   3961                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   3962                         source->appendOffset(firstOffset + 1, *status);
   3963                     }
   3964 
   3965                     source->toReturn = source->CEpos - 1;
   3966 
   3967                     source->offsetReturn = source->offsetStore - 1;
   3968                     if (source->offsetReturn == source->offsetBuffer) {
   3969                         source->offsetStore = source->offsetBuffer;
   3970                     }
   3971 
   3972                     return *(source->toReturn);
   3973                 } else {
   3974                     // Since Hanguls pass the FCD check, it is
   3975                     // guaranteed that we won't be in
   3976                     // the normalization buffer if something like this happens
   3977 
   3978                     // Move Jamos into normalization buffer
   3979                     UChar *tempbuffer = source->writableBuffer.getBuffer(5);
   3980                     int32_t tempbufferLength, jamoOffset;
   3981                     tempbuffer[0] = 0;
   3982                     tempbuffer[1] = (UChar)L;
   3983                     tempbuffer[2] = (UChar)V;
   3984                     if (T != TBase) {
   3985                         tempbuffer[3] = (UChar)T;
   3986                         tempbufferLength = 4;
   3987                     } else {
   3988                         tempbufferLength = 3;
   3989                     }
   3990                     source->writableBuffer.releaseBuffer(tempbufferLength);
   3991 
   3992                     // Indicate where to continue in main input string after exhausting the writableBuffer
   3993                     if (source->pos  == source->string) {
   3994                         jamoOffset = 0;
   3995                         source->fcdPosition = NULL;
   3996                     } else {
   3997                         jamoOffset = source->pos - source->string;
   3998                         source->fcdPosition       = source->pos-1;
   3999                     }
   4000 
   4001 					// Append offsets for the additional chars
   4002 					// (not the 0, and not the L whose offsets match the original Hangul)
   4003                     int32_t jamoRemaining = tempbufferLength - 2;
   4004                     jamoOffset++; // appended offsets should match end of original Hangul
   4005                     while (jamoRemaining-- > 0) {
   4006                         source->appendOffset(jamoOffset, *status);
   4007                     }
   4008 
   4009                     source->offsetRepeatValue = jamoOffset;
   4010 
   4011                     source->offsetReturn = source->offsetStore - 1;
   4012                     if (source->offsetReturn == source->offsetBuffer) {
   4013                         source->offsetStore = source->offsetBuffer;
   4014                     }
   4015 
   4016                     source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
   4017                     source->origFlags         = source->flags;
   4018                     source->flags            |= UCOL_ITER_INNORMBUF;
   4019                     source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   4020 
   4021                     return(UCOL_IGNORABLE);
   4022                 }
   4023             }
   4024 
   4025         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   4026             return getPrevImplicit(ch, source);
   4027 
   4028             // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
   4029         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   4030             return getPrevImplicit(ch, source);
   4031 
   4032         case SURROGATE_TAG:  /* This is a surrogate pair */
   4033             /* essentially an engaged lead surrogate. */
   4034             /* if you have encountered it here, it means that a */
   4035             /* broken sequence was encountered and this is an error */
   4036             return UCOL_NOT_FOUND;
   4037 
   4038         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   4039             return UCOL_NOT_FOUND; /* broken surrogate sequence */
   4040 
   4041         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   4042             {
   4043                 UChar32 cp = 0;
   4044                 UChar  prevChar;
   4045                 const UChar *prev;
   4046                 if (isAtStartPrevIterate(source)) {
   4047                     /* we are at the start of the string, wrong place to be at */
   4048                     return UCOL_NOT_FOUND;
   4049                 }
   4050                 if (source->pos != source->writableBuffer.getBuffer()) {
   4051                     prev     = source->pos - 1;
   4052                 } else {
   4053                     prev     = source->fcdPosition;
   4054                 }
   4055                 prevChar = *prev;
   4056 
   4057                 /* Handles Han and Supplementary characters here.*/
   4058                 if (U16_IS_LEAD(prevChar)) {
   4059                     cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   4060                     source->pos = prev;
   4061                 } else {
   4062                     return UCOL_NOT_FOUND; /* like unassigned */
   4063                 }
   4064 
   4065                 return getPrevImplicit(cp, source);
   4066             }
   4067 
   4068             /* UCA is filled with these. Tailorings are NOT_FOUND */
   4069             /* not yet implemented */
   4070         case CHARSET_TAG:  /* this tag always returns */
   4071             /* probably after 1.8 */
   4072             return UCOL_NOT_FOUND;
   4073 
   4074         default:           /* this tag always returns */
   4075             *status = U_INTERNAL_PROGRAM_ERROR;
   4076             CE=0;
   4077             break;
   4078         }
   4079 
   4080         if (CE <= UCOL_NOT_FOUND) {
   4081             break;
   4082         }
   4083     }
   4084 
   4085     return CE;
   4086 }
   4087 
   4088 /* This should really be a macro        */
   4089 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
   4090 /* anyway */
   4091 static
   4092 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
   4093 #ifdef UCOL_DEBUG
   4094     fprintf(stderr, ".");
   4095 #endif
   4096     uint8_t *newStart = NULL;
   4097     uint32_t offset = (uint32_t)(*secondaries-secStart);
   4098 
   4099     if(secStart==second) {
   4100         newStart=(uint8_t*)uprv_malloc(newSize);
   4101         if(newStart==NULL) {
   4102             *status = U_MEMORY_ALLOCATION_ERROR;
   4103             return NULL;
   4104         }
   4105         uprv_memcpy(newStart, secStart, *secondaries-secStart);
   4106     } else {
   4107         newStart=(uint8_t*)uprv_realloc(secStart, newSize);
   4108         if(newStart==NULL) {
   4109             *status = U_MEMORY_ALLOCATION_ERROR;
   4110             /* Since we're reallocating, return original reference so we don't loose it. */
   4111             return secStart;
   4112         }
   4113     }
   4114     *secondaries=newStart+offset;
   4115     *secSize=newSize;
   4116     return newStart;
   4117 }
   4118 
   4119 
   4120 /* This should really be a macro                                                                      */
   4121 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
   4122 /* secondaries in French                                                                              */
   4123 /*
   4124 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
   4125   uint8_t temp;
   4126   while(start<end) {
   4127     temp = *start;
   4128     *start++ = *end;
   4129     *end-- = temp;
   4130   }
   4131 }
   4132 */
   4133 
   4134 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \
   4135   TYPE tempA; \
   4136 while((start)<(end)) { \
   4137     tempA = *(start); \
   4138     *(start)++ = *(end); \
   4139     *(end)-- = tempA; \
   4140 } \
   4141 }
   4142 
   4143 /****************************************************************************/
   4144 /* Following are the sortkey generation functions                           */
   4145 /*                                                                          */
   4146 /****************************************************************************/
   4147 
   4148 /**
   4149  * Merge two sort keys.
   4150  * This is useful, for example, to combine sort keys from first and last names
   4151  * to sort such pairs.
   4152  * Merged sort keys consider on each collation level the first part first entirely,
   4153  * then the second one.
   4154  * It is possible to merge multiple sort keys by consecutively merging
   4155  * another one with the intermediate result.
   4156  *
   4157  * The length of the merge result is the sum of the lengths of the input sort keys
   4158  * minus 1.
   4159  *
   4160  * @param src1 the first sort key
   4161  * @param src1Length the length of the first sort key, including the zero byte at the end;
   4162  *        can be -1 if the function is to find the length
   4163  * @param src2 the second sort key
   4164  * @param src2Length the length of the second sort key, including the zero byte at the end;
   4165  *        can be -1 if the function is to find the length
   4166  * @param dest the buffer where the merged sort key is written,
   4167  *        can be NULL if destCapacity==0
   4168  * @param destCapacity the number of bytes in the dest buffer
   4169  * @return the length of the merged sort key, src1Length+src2Length-1;
   4170  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
   4171  *         in which cases the contents of dest is undefined
   4172  *
   4173  * @draft
   4174  */
   4175 U_CAPI int32_t U_EXPORT2
   4176 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
   4177                    const uint8_t *src2, int32_t src2Length,
   4178                    uint8_t *dest, int32_t destCapacity) {
   4179     int32_t destLength;
   4180     uint8_t b;
   4181 
   4182     /* check arguments */
   4183     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
   4184         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
   4185         destCapacity<0 || (destCapacity>0 && dest==NULL)
   4186     ) {
   4187         /* error, attempt to write a zero byte and return 0 */
   4188         if(dest!=NULL && destCapacity>0) {
   4189             *dest=0;
   4190         }
   4191         return 0;
   4192     }
   4193 
   4194     /* check lengths and capacity */
   4195     if(src1Length<0) {
   4196         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
   4197     }
   4198     if(src2Length<0) {
   4199         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
   4200     }
   4201 
   4202     destLength=src1Length+src2Length-1;
   4203     if(destLength>destCapacity) {
   4204         /* the merged sort key does not fit into the destination */
   4205         return destLength;
   4206     }
   4207 
   4208     /* merge the sort keys with the same number of levels */
   4209     while(*src1!=0 && *src2!=0) { /* while both have another level */
   4210         /* copy level from src1 not including 00 or 01 */
   4211         while((b=*src1)>=2) {
   4212             ++src1;
   4213             *dest++=b;
   4214         }
   4215 
   4216         /* add a 02 merge separator */
   4217         *dest++=2;
   4218 
   4219         /* copy level from src2 not including 00 or 01 */
   4220         while((b=*src2)>=2) {
   4221             ++src2;
   4222             *dest++=b;
   4223         }
   4224 
   4225         /* if both sort keys have another level, then add a 01 level separator and continue */
   4226         if(*src1==1 && *src2==1) {
   4227             ++src1;
   4228             ++src2;
   4229             *dest++=1;
   4230         }
   4231     }
   4232 
   4233     /*
   4234      * here, at least one sort key is finished now, but the other one
   4235      * might have some contents left from containing more levels;
   4236      * that contents is just appended to the result
   4237      */
   4238     if(*src1!=0) {
   4239         /* src1 is not finished, therefore *src2==0, and src1 is appended */
   4240         src2=src1;
   4241     }
   4242     /* append src2, "the other, unfinished sort key" */
   4243     uprv_strcpy((char *)dest, (const char *)src2);
   4244 
   4245     /* trust that neither sort key contained illegally embedded zero bytes */
   4246     return destLength;
   4247 }
   4248 
   4249 /* sortkey API */
   4250 U_CAPI int32_t U_EXPORT2
   4251 ucol_getSortKey(const    UCollator    *coll,
   4252         const    UChar        *source,
   4253         int32_t        sourceLength,
   4254         uint8_t        *result,
   4255         int32_t        resultLength)
   4256 {
   4257     UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
   4258     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   4259         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
   4260             ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
   4261     }
   4262 
   4263     UErrorCode status = U_ZERO_ERROR;
   4264     int32_t keySize   = 0;
   4265 
   4266     if(source != NULL) {
   4267         // source == NULL is actually an error situation, but we would need to
   4268         // have an error code to return it. Until we introduce a new
   4269         // API, it stays like this
   4270 
   4271         /* this uses the function pointer that is set in updateinternalstate */
   4272         /* currently, there are two funcs: */
   4273         /*ucol_calcSortKey(...);*/
   4274         /*ucol_calcSortKeySimpleTertiary(...);*/
   4275 
   4276         keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
   4277         //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) {
   4278             // That's not good. Something unusual happened.
   4279             // We don't know how much we initialized before we failed.
   4280             // NULL terminate for safety.
   4281             // We have no way say that we have generated a partial sort key.
   4282             //result[0] = 0;
   4283             //keySize = 0;
   4284         //}
   4285     }
   4286     UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
   4287     UTRACE_EXIT_STATUS(status);
   4288     return keySize;
   4289 }
   4290 
   4291 /* this function is called by the C++ API for sortkey generation */
   4292 U_CFUNC int32_t
   4293 ucol_getSortKeyWithAllocation(const UCollator *coll,
   4294                               const UChar *source, int32_t sourceLength,
   4295                               uint8_t **pResult,
   4296                               UErrorCode *pErrorCode) {
   4297     *pResult = 0;
   4298     return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
   4299 }
   4300 
   4301 #define UCOL_FSEC_BUF_SIZE 256
   4302 
   4303 // Is this primary weight compressible?
   4304 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
   4305 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
   4306 static inline UBool
   4307 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
   4308     return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
   4309 }
   4310 
   4311 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
   4312 /* or if we run out of space while making a sortkey and want to return ASAP                                   */
   4313 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
   4314     UErrorCode status = U_ZERO_ERROR;
   4315     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
   4316     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
   4317     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
   4318     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
   4319     UBool  compareIdent = (strength == UCOL_IDENTICAL);
   4320     UBool  doCase = (coll->caseLevel == UCOL_ON);
   4321     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
   4322     //UBool  qShifted = shifted  && (compareQuad == 0);
   4323     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
   4324     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
   4325     uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
   4326     uint8_t *fSecs = fSecsBuff;
   4327     uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
   4328     uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
   4329 
   4330     uint32_t variableTopValue = coll->variableTopValue;
   4331     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
   4332     if(doHiragana) {
   4333         UCOL_COMMON_BOT4++;
   4334         /* allocate one more space for hiragana */
   4335     }
   4336     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
   4337 
   4338     uint32_t order = UCOL_NO_MORE_CES;
   4339     uint8_t primary1 = 0;
   4340     uint8_t primary2 = 0;
   4341     uint8_t secondary = 0;
   4342     uint8_t tertiary = 0;
   4343     int32_t caseShift = 0;
   4344     uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
   4345 
   4346     uint8_t caseSwitch = coll->caseSwitch;
   4347     uint8_t tertiaryMask = coll->tertiaryMask;
   4348     uint8_t tertiaryCommon = coll->tertiaryCommon;
   4349 
   4350     UBool wasShifted = FALSE;
   4351     UBool notIsContinuation = FALSE;
   4352     uint8_t leadPrimary = 0;
   4353 
   4354 
   4355     for(;;) {
   4356         order = ucol_IGetNextCE(coll, s, &status);
   4357         if(order == UCOL_NO_MORE_CES) {
   4358             break;
   4359         }
   4360 
   4361         if(order == 0) {
   4362             continue;
   4363         }
   4364 
   4365         notIsContinuation = !isContinuation(order);
   4366 
   4367 
   4368         if(notIsContinuation) {
   4369             tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
   4370         } else {
   4371             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   4372         }
   4373         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4374         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4375         primary1 = (uint8_t)(order >> 8);
   4376 
   4377         /* no need to permute since the actual code values don't matter
   4378         if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
   4379             primary1 = coll->leadBytePermutationTable[primary1];
   4380         }
   4381         */
   4382 
   4383         if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
   4384                       || (!notIsContinuation && wasShifted)))
   4385             || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
   4386                 /* and other ignorables should be removed if following a shifted code point */
   4387                 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
   4388                     /* we should just completely ignore it */
   4389                     continue;
   4390                 }
   4391                 if(compareQuad == 0) {
   4392                     if(c4 > 0) {
   4393                         currentSize += (c2/UCOL_BOT_COUNT4)+1;
   4394                         c4 = 0;
   4395                     }
   4396                     currentSize++;
   4397                     if(primary2 != 0) {
   4398                         currentSize++;
   4399                     }
   4400                 }
   4401                 wasShifted = TRUE;
   4402         } else {
   4403             wasShifted = FALSE;
   4404             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   4405             /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   4406             /* calculate sortkey size */
   4407             if(primary1 != UCOL_IGNORABLE) {
   4408                 if(notIsContinuation) {
   4409                     if(leadPrimary == primary1) {
   4410                         currentSize++;
   4411                     } else {
   4412                         if(leadPrimary != 0) {
   4413                             currentSize++;
   4414                         }
   4415                         if(primary2 == UCOL_IGNORABLE) {
   4416                             /* one byter, not compressed */
   4417                             currentSize++;
   4418                             leadPrimary = 0;
   4419                         } else if(isCompressible(coll, primary1)) {
   4420                             /* compress */
   4421                             leadPrimary = primary1;
   4422                             currentSize+=2;
   4423                         } else {
   4424                             leadPrimary = 0;
   4425                             currentSize+=2;
   4426                         }
   4427                     }
   4428                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   4429                     currentSize++;
   4430                     if(primary2 != UCOL_IGNORABLE) {
   4431                         currentSize++;
   4432                     }
   4433                 }
   4434             }
   4435 
   4436             if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
   4437                 if(!isFrenchSec){
   4438                     if (secondary == UCOL_COMMON2 && notIsContinuation) {
   4439                         c2++;
   4440                     } else {
   4441                         if(c2 > 0) {
   4442                             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4443                                 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
   4444                             } else {
   4445                                 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
   4446                             }
   4447                             c2 = 0;
   4448                         }
   4449                         currentSize++;
   4450                     }
   4451                 } else {
   4452                     fSecs[fSecsLen++] = secondary;
   4453                     if(fSecsLen == fSecsMaxLen) {
   4454                         uint8_t *fSecsTemp;
   4455                         if(fSecs == fSecsBuff) {
   4456                             fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen);
   4457                         } else {
   4458                             fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
   4459                         }
   4460                         if(fSecsTemp == NULL) {
   4461                             status = U_MEMORY_ALLOCATION_ERROR;
   4462                             return 0;
   4463                         }
   4464                         fSecs = fSecsTemp;
   4465                         fSecsMaxLen *= 2;
   4466                     }
   4467                     if(notIsContinuation) {
   4468                         if (frenchStartPtr != NULL) {
   4469                             /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4470                             uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4471                             frenchStartPtr = NULL;
   4472                         }
   4473                     } else {
   4474                         if (frenchStartPtr == NULL) {
   4475                             frenchStartPtr = fSecs+fSecsLen-2;
   4476                         }
   4477                         frenchEndPtr = fSecs+fSecsLen-1;
   4478                     }
   4479                 }
   4480             }
   4481 
   4482             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
   4483                 // do the case level if we need to do it. We don't want to calculate
   4484                 // case level for primary ignorables if we have only primary strength and case level
   4485                 // otherwise we would break well formedness of CEs
   4486                 if (caseShift  == 0) {
   4487                     currentSize++;
   4488                     caseShift = UCOL_CASE_SHIFT_START;
   4489                 }
   4490                 if((tertiary&0x3F) > 0 && notIsContinuation) {
   4491                     caseShift--;
   4492                     if((tertiary &0xC0) != 0) {
   4493                         if (caseShift  == 0) {
   4494                             currentSize++;
   4495                             caseShift = UCOL_CASE_SHIFT_START;
   4496                         }
   4497                         caseShift--;
   4498                     }
   4499                 }
   4500             } else {
   4501                 if(notIsContinuation) {
   4502                     tertiary ^= caseSwitch;
   4503                 }
   4504             }
   4505 
   4506             tertiary &= tertiaryMask;
   4507             if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
   4508                 if (tertiary == tertiaryCommon && notIsContinuation) {
   4509                     c3++;
   4510                 } else {
   4511                     if(c3 > 0) {
   4512                         if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
   4513                             || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
   4514                                 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
   4515                         } else {
   4516                             currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
   4517                         }
   4518                         c3 = 0;
   4519                     }
   4520                     currentSize++;
   4521                 }
   4522             }
   4523 
   4524             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
   4525                 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   4526                     if(c4>0) { // Close this part
   4527                         currentSize += (c4/UCOL_BOT_COUNT4)+1;
   4528                         c4 = 0;
   4529                     }
   4530                     currentSize++; // Add the Hiragana
   4531                 } else { // This wasn't Hiragana, so we can continue adding stuff
   4532                     c4++;
   4533                 }
   4534             }
   4535         }
   4536     }
   4537 
   4538     if(!isFrenchSec){
   4539         if(c2 > 0) {
   4540             currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
   4541         }
   4542     } else {
   4543         uint32_t i = 0;
   4544         if(frenchStartPtr != NULL) {
   4545             uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4546         }
   4547         for(i = 0; i<fSecsLen; i++) {
   4548             secondary = *(fSecs+fSecsLen-i-1);
   4549             /* This is compression code. */
   4550             if (secondary == UCOL_COMMON2) {
   4551                 ++c2;
   4552             } else {
   4553                 if(c2 > 0) {
   4554                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4555                         currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
   4556                     } else {
   4557                         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
   4558                     }
   4559                     c2 = 0;
   4560                 }
   4561                 currentSize++;
   4562             }
   4563         }
   4564         if(c2 > 0) {
   4565             currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
   4566         }
   4567         if(fSecs != fSecsBuff) {
   4568             uprv_free(fSecs);
   4569         }
   4570     }
   4571 
   4572     if(c3 > 0) {
   4573         currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
   4574     }
   4575 
   4576     if(c4 > 0  && compareQuad == 0) {
   4577         currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
   4578     }
   4579 
   4580     if(compareIdent) {
   4581         currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
   4582     }
   4583     return currentSize;
   4584 }
   4585 
   4586 static
   4587 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
   4588     if (caseShift  == 0) {
   4589         *(*cases)++ = UCOL_CASE_BYTE_START;
   4590         caseShift = UCOL_CASE_SHIFT_START;
   4591     }
   4592 }
   4593 
   4594 // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we
   4595 // know how many values we wanted to add, even if we didn't add them all
   4596 static
   4597 inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
   4598     size++;
   4599     if(primaries < limit) {
   4600         *(primaries)++ = value;
   4601     }
   4602 }
   4603 
   4604 // Packs the secondary buffer when processing French locale. Adds the terminator.
   4605 static
   4606 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
   4607     uint8_t secondary;
   4608     int32_t count2 = 0;
   4609     uint32_t i = 0, size = 0;
   4610     // we use i here since the key size already accounts for terminators, so we'll discard the increment
   4611     addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
   4612     /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
   4613     if(frenchStartPtr != NULL) {
   4614         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4615     }
   4616     for(i = 0; i<*secsize; i++) {
   4617         secondary = *(secondaries-i-1);
   4618         /* This is compression code. */
   4619         if (secondary == UCOL_COMMON2) {
   4620             ++count2;
   4621         } else {
   4622             if (count2 > 0) {
   4623                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4624                     while (count2 > UCOL_TOP_COUNT2) {
   4625                         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
   4626                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4627                     }
   4628                     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
   4629                 } else {
   4630                     while (count2 > UCOL_BOT_COUNT2) {
   4631                         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   4632                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4633                     }
   4634                     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   4635                 }
   4636                 count2 = 0;
   4637             }
   4638             addWithIncrement(primaries, primEnd, size, secondary);
   4639         }
   4640     }
   4641     if (count2 > 0) {
   4642         while (count2 > UCOL_BOT_COUNT2) {
   4643             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   4644             count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4645         }
   4646         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   4647     }
   4648     *secsize = size;
   4649     return primaries;
   4650 }
   4651 
   4652 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
   4653 
   4654 /* This is the sortkey work horse function */
   4655 U_CFUNC int32_t U_CALLCONV
   4656 ucol_calcSortKey(const    UCollator    *coll,
   4657         const    UChar        *source,
   4658         int32_t        sourceLength,
   4659         uint8_t        **result,
   4660         uint32_t        resultLength,
   4661         UBool allocateSKBuffer,
   4662         UErrorCode *status)
   4663 {
   4664     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
   4665 
   4666     uint32_t i = 0; /* general purpose counter */
   4667 
   4668     /* Stack allocated buffers for buffers we use */
   4669     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
   4670 
   4671     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
   4672 
   4673     if(U_FAILURE(*status)) {
   4674         return 0;
   4675     }
   4676 
   4677     if(primaries == NULL && allocateSKBuffer == TRUE) {
   4678         primaries = *result = prim;
   4679         resultLength = UCOL_PRIMARY_MAX_BUFFER;
   4680     }
   4681 
   4682     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
   4683       caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
   4684 
   4685     uint32_t sortKeySize = 1; /* it is always \0 terminated */
   4686 
   4687     UnicodeString normSource;
   4688 
   4689     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
   4690 
   4691     UColAttributeValue strength = coll->strength;
   4692 
   4693     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
   4694     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
   4695     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
   4696     UBool  compareIdent = (strength == UCOL_IDENTICAL);
   4697     UBool  doCase = (coll->caseLevel == UCOL_ON);
   4698     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
   4699     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
   4700     //UBool  qShifted = shifted && (compareQuad == 0);
   4701     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
   4702 
   4703     uint32_t variableTopValue = coll->variableTopValue;
   4704     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
   4705     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
   4706     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
   4707     uint8_t UCOL_HIRAGANA_QUAD = 0;
   4708     if(doHiragana) {
   4709         UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
   4710         /* allocate one more space for hiragana, value for hiragana */
   4711     }
   4712     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
   4713 
   4714     /* support for special features like caselevel and funky secondaries */
   4715     uint8_t *frenchStartPtr = NULL;
   4716     uint8_t *frenchEndPtr = NULL;
   4717     uint32_t caseShift = 0;
   4718 
   4719     sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
   4720 
   4721     /* If we need to normalize, we'll do it all at once at the beginning! */
   4722     const Normalizer2 *norm2;
   4723     if(compareIdent) {
   4724         norm2 = Normalizer2Factory::getNFDInstance(*status);
   4725     } else if(coll->normalizationMode != UCOL_OFF) {
   4726         norm2 = Normalizer2Factory::getFCDInstance(*status);
   4727     } else {
   4728         norm2 = NULL;
   4729     }
   4730     if(norm2 != NULL) {
   4731         normSource.setTo(FALSE, source, len);
   4732         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   4733         if(qcYesLength != len) {
   4734             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   4735             normSource.truncate(qcYesLength);
   4736             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   4737             source = normSource.getBuffer();
   4738             len = normSource.length();
   4739         }
   4740     }
   4741     collIterate s;
   4742     IInit_collIterate(coll, source, len, &s, status);
   4743     if(U_FAILURE(*status)) {
   4744         return 0;
   4745     }
   4746     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
   4747 
   4748     if(resultLength == 0 || primaries == NULL) {
   4749         return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
   4750     }
   4751     uint8_t *primarySafeEnd = primaries + resultLength - 1;
   4752     if(strength > UCOL_PRIMARY) {
   4753         primarySafeEnd--;
   4754     }
   4755 
   4756     uint32_t minBufferSize = UCOL_MAX_BUFFER;
   4757 
   4758     uint8_t *primStart = primaries;
   4759     uint8_t *secStart = secondaries;
   4760     uint8_t *terStart = tertiaries;
   4761     uint8_t *caseStart = cases;
   4762     uint8_t *quadStart = quads;
   4763 
   4764     uint32_t order = 0;
   4765 
   4766     uint8_t primary1 = 0;
   4767     uint8_t primary2 = 0;
   4768     uint8_t secondary = 0;
   4769     uint8_t tertiary = 0;
   4770     uint8_t caseSwitch = coll->caseSwitch;
   4771     uint8_t tertiaryMask = coll->tertiaryMask;
   4772     int8_t tertiaryAddition = coll->tertiaryAddition;
   4773     uint8_t tertiaryTop = coll->tertiaryTop;
   4774     uint8_t tertiaryBottom = coll->tertiaryBottom;
   4775     uint8_t tertiaryCommon = coll->tertiaryCommon;
   4776     uint8_t caseBits = 0;
   4777 
   4778     UBool finished = FALSE;
   4779     UBool wasShifted = FALSE;
   4780     UBool notIsContinuation = FALSE;
   4781 
   4782     uint32_t prevBuffSize = 0;
   4783 
   4784     uint32_t count2 = 0, count3 = 0, count4 = 0;
   4785     uint8_t leadPrimary = 0;
   4786 
   4787     for(;;) {
   4788         for(i=prevBuffSize; i<minBufferSize; ++i) {
   4789 
   4790             order = ucol_IGetNextCE(coll, &s, status);
   4791             if(order == UCOL_NO_MORE_CES) {
   4792                 finished = TRUE;
   4793                 break;
   4794             }
   4795 
   4796             if(order == 0) {
   4797                 continue;
   4798             }
   4799 
   4800             notIsContinuation = !isContinuation(order);
   4801 
   4802             if(notIsContinuation) {
   4803                 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
   4804             } else {
   4805                 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   4806             }
   4807 
   4808             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4809             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4810             primary1 = (uint8_t)(order >> 8);
   4811 
   4812             uint8_t originalPrimary1 = primary1;
   4813             if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
   4814                 primary1 = coll->leadBytePermutationTable[primary1];
   4815             }
   4816 
   4817             if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
   4818                            || (!notIsContinuation && wasShifted)))
   4819                 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   4820             {
   4821                 /* and other ignorables should be removed if following a shifted code point */
   4822                 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
   4823                     /* we should just completely ignore it */
   4824                     continue;
   4825                 }
   4826                 if(compareQuad == 0) {
   4827                     if(count4 > 0) {
   4828                         while (count4 > UCOL_BOT_COUNT4) {
   4829                             *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   4830                             count4 -= UCOL_BOT_COUNT4;
   4831                         }
   4832                         *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
   4833                         count4 = 0;
   4834                     }
   4835                     /* We are dealing with a variable and we're treating them as shifted */
   4836                     /* This is a shifted ignorable */
   4837                     if(primary1 != 0) { /* we need to check this since we could be in continuation */
   4838                         *quads++ = primary1;
   4839                     }
   4840                     if(primary2 != 0) {
   4841                         *quads++ = primary2;
   4842                     }
   4843                 }
   4844                 wasShifted = TRUE;
   4845             } else {
   4846                 wasShifted = FALSE;
   4847                 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   4848                 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   4849                 /* regular and simple sortkey calc */
   4850                 if(primary1 != UCOL_IGNORABLE) {
   4851                     if(notIsContinuation) {
   4852                         if(leadPrimary == primary1) {
   4853                             *primaries++ = primary2;
   4854                         } else {
   4855                             if(leadPrimary != 0) {
   4856                                 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
   4857                             }
   4858                             if(primary2 == UCOL_IGNORABLE) {
   4859                                 /* one byter, not compressed */
   4860                                 *primaries++ = primary1;
   4861                                 leadPrimary = 0;
   4862                             } else if(isCompressible(coll, originalPrimary1)) {
   4863                                 /* compress */
   4864                                 *primaries++ = leadPrimary = primary1;
   4865                                 if(primaries <= primarySafeEnd) {
   4866                                     *primaries++ = primary2;
   4867                                 }
   4868                             } else {
   4869                                 leadPrimary = 0;
   4870                                 *primaries++ = primary1;
   4871                                 if(primaries <= primarySafeEnd) {
   4872                                     *primaries++ = primary2;
   4873                                 }
   4874                             }
   4875                         }
   4876                     } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   4877                         *primaries++ = primary1;
   4878                         if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) {
   4879                                 *primaries++ = primary2; /* second part */
   4880                         }
   4881                     }
   4882                 }
   4883 
   4884                 if(secondary > compareSec) {
   4885                     if(!isFrenchSec) {
   4886                         /* This is compression code. */
   4887                         if (secondary == UCOL_COMMON2 && notIsContinuation) {
   4888                             ++count2;
   4889                         } else {
   4890                             if (count2 > 0) {
   4891                                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4892                                     while (count2 > UCOL_TOP_COUNT2) {
   4893                                         *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   4894                                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4895                                     }
   4896                                     *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
   4897                                 } else {
   4898                                     while (count2 > UCOL_BOT_COUNT2) {
   4899                                         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4900                                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4901                                     }
   4902                                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   4903                                 }
   4904                                 count2 = 0;
   4905                             }
   4906                             *secondaries++ = secondary;
   4907                         }
   4908                     } else {
   4909                         *secondaries++ = secondary;
   4910                         /* Do the special handling for French secondaries */
   4911                         /* We need to get continuation elements and do intermediate restore */
   4912                         /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
   4913                         if(notIsContinuation) {
   4914                             if (frenchStartPtr != NULL) {
   4915                                 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4916                                 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4917                                 frenchStartPtr = NULL;
   4918                             }
   4919                         } else {
   4920                             if (frenchStartPtr == NULL) {
   4921                                 frenchStartPtr = secondaries - 2;
   4922                             }
   4923                             frenchEndPtr = secondaries-1;
   4924                         }
   4925                     }
   4926                 }
   4927 
   4928                 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
   4929                     // do the case level if we need to do it. We don't want to calculate
   4930                     // case level for primary ignorables if we have only primary strength and case level
   4931                     // otherwise we would break well formedness of CEs
   4932                     doCaseShift(&cases, caseShift);
   4933                     if(notIsContinuation) {
   4934                         caseBits = (uint8_t)(tertiary & 0xC0);
   4935 
   4936                         if(tertiary != 0) {
   4937                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   4938                                 if((caseBits & 0xC0) == 0) {
   4939                                     *(cases-1) |= 1 << (--caseShift);
   4940                                 } else {
   4941                                     *(cases-1) |= 0 << (--caseShift);
   4942                                     /* second bit */
   4943                                     doCaseShift(&cases, caseShift);
   4944                                     *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
   4945                                 }
   4946                             } else {
   4947                                 if((caseBits & 0xC0) == 0) {
   4948                                     *(cases-1) |= 0 << (--caseShift);
   4949                                 } else {
   4950                                     *(cases-1) |= 1 << (--caseShift);
   4951                                     /* second bit */
   4952                                     doCaseShift(&cases, caseShift);
   4953                                     *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
   4954                                 }
   4955                             }
   4956                         }
   4957 
   4958                     }
   4959                 } else {
   4960                     if(notIsContinuation) {
   4961                         tertiary ^= caseSwitch;
   4962                     }
   4963                 }
   4964 
   4965                 tertiary &= tertiaryMask;
   4966                 if(tertiary > compareTer) {
   4967                     /* This is compression code. */
   4968                     /* sequence size check is included in the if clause */
   4969                     if (tertiary == tertiaryCommon && notIsContinuation) {
   4970                         ++count3;
   4971                     } else {
   4972                         if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   4973                             tertiary += tertiaryAddition;
   4974                         } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   4975                             tertiary -= tertiaryAddition;
   4976                         }
   4977                         if (count3 > 0) {
   4978                             if ((tertiary > tertiaryCommon)) {
   4979                                 while (count3 > coll->tertiaryTopCount) {
   4980                                     *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   4981                                     count3 -= (uint32_t)coll->tertiaryTopCount;
   4982                                 }
   4983                                 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
   4984                             } else {
   4985                                 while (count3 > coll->tertiaryBottomCount) {
   4986                                     *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   4987                                     count3 -= (uint32_t)coll->tertiaryBottomCount;
   4988                                 }
   4989                                 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   4990                             }
   4991                             count3 = 0;
   4992                         }
   4993                         *tertiaries++ = tertiary;
   4994                     }
   4995                 }
   4996 
   4997                 if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
   4998                     if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   4999                         if(count4>0) { // Close this part
   5000                             while (count4 > UCOL_BOT_COUNT4) {
   5001                                 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   5002                                 count4 -= UCOL_BOT_COUNT4;
   5003                             }
   5004                             *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
   5005                             count4 = 0;
   5006                         }
   5007                         *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
   5008                     } else { // This wasn't Hiragana, so we can continue adding stuff
   5009                         count4++;
   5010                     }
   5011                 }
   5012             }
   5013 
   5014             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
   5015                 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
   5016                     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   5017                     if(U_FAILURE(*status)) {
   5018                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5019                         finished = TRUE;
   5020                         break;
   5021                     }
   5022                     s.flags &= ~UCOL_ITER_NORM;
   5023                     sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
   5024                     *status = U_BUFFER_OVERFLOW_ERROR;
   5025                     finished = TRUE;
   5026                     break;
   5027                 } else { /* It's much nicer if we can actually reallocate */
   5028                     int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart));
   5029                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
   5030                     if(U_SUCCESS(*status)) {
   5031                         *result = primStart;
   5032                         primarySafeEnd = primStart + resultLength - 1;
   5033                         if(strength > UCOL_PRIMARY) {
   5034                             primarySafeEnd--;
   5035                         }
   5036                     } else {
   5037                         /* We ran out of memory!? We can't recover. */
   5038                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5039                         finished = TRUE;
   5040                         break;
   5041                     }
   5042                 }
   5043             }
   5044         }
   5045         if(finished) {
   5046             break;
   5047         } else {
   5048             prevBuffSize = minBufferSize;
   5049 
   5050             uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
   5051             if (frenchStartPtr != NULL) {
   5052                 frenchStartOffset = (uint32_t)(frenchStartPtr - secStart);
   5053                 frenchEndOffset = (uint32_t)(frenchEndPtr - secStart);
   5054             }
   5055             secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
   5056             terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
   5057             caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
   5058             quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
   5059             if(U_FAILURE(*status)) {
   5060                 /* We ran out of memory!? We can't recover. */
   5061                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5062                 break;
   5063             }
   5064             if (frenchStartPtr != NULL) {
   5065                 frenchStartPtr = secStart + frenchStartOffset;
   5066                 frenchEndPtr = secStart + frenchEndOffset;
   5067             }
   5068             minBufferSize *= 2;
   5069         }
   5070     }
   5071 
   5072     /* Here, we are generally done with processing */
   5073     /* bailing out would not be too productive */
   5074 
   5075     if(U_SUCCESS(*status)) {
   5076         sortKeySize += (uint32_t)(primaries - primStart);
   5077         /* we have done all the CE's, now let's put them together to form a key */
   5078         if(compareSec == 0) {
   5079             if (count2 > 0) {
   5080                 while (count2 > UCOL_BOT_COUNT2) {
   5081                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5082                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5083                 }
   5084                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   5085             }
   5086             uint32_t secsize = (uint32_t)(secondaries-secStart);
   5087             if(!isFrenchSec) { // Regular situation, we know the length of secondaries
   5088                 sortKeySize += secsize;
   5089                 if(sortKeySize <= resultLength) {
   5090                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5091                     uprv_memcpy(primaries, secStart, secsize);
   5092                     primaries += secsize;
   5093                 } else {
   5094                     if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
   5095                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5096                         if(U_SUCCESS(*status)) {
   5097                             *result = primStart;
   5098                             *(primaries++) = UCOL_LEVELTERMINATOR;
   5099                             uprv_memcpy(primaries, secStart, secsize);
   5100                             primaries += secsize;
   5101                         }
   5102                         else {
   5103                             /* We ran out of memory!? We can't recover. */
   5104                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5105                             goto cleanup;
   5106                         }
   5107                     } else {
   5108                         *status = U_BUFFER_OVERFLOW_ERROR;
   5109                     }
   5110                 }
   5111             } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
   5112                 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
   5113                 sortKeySize += secsize;
   5114                 if(sortKeySize <= resultLength) { // if we managed to pack fine
   5115                     primaries = newPrim; // update the primary pointer
   5116                 } else { // overflow, need to reallocate and redo
   5117                     if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
   5118                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5119                         if(U_SUCCESS(*status)) {
   5120                             primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
   5121                         }
   5122                         else {
   5123                             /* We ran out of memory!? We can't recover. */
   5124                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5125                             goto cleanup;
   5126                         }
   5127                     } else {
   5128                         *status = U_BUFFER_OVERFLOW_ERROR;
   5129                     }
   5130                 }
   5131             }
   5132         }
   5133 
   5134         if(doCase) {
   5135             uint32_t casesize = (uint32_t)(cases - caseStart);
   5136             sortKeySize += casesize;
   5137             if(sortKeySize <= resultLength) {
   5138                 *(primaries++) = UCOL_LEVELTERMINATOR;
   5139                 uprv_memcpy(primaries, caseStart, casesize);
   5140                 primaries += casesize;
   5141             } else {
   5142                 if(allocateSKBuffer == TRUE) {
   5143                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5144                     if(U_SUCCESS(*status)) {
   5145                         *result = primStart;
   5146                         *(primaries++) = UCOL_LEVELTERMINATOR;
   5147                         uprv_memcpy(primaries, caseStart, casesize);
   5148                     }
   5149                     else {
   5150                         /* We ran out of memory!? We can't recover. */
   5151                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5152                         goto cleanup;
   5153                     }
   5154                 } else {
   5155                     *status = U_BUFFER_OVERFLOW_ERROR;
   5156                 }
   5157             }
   5158         }
   5159 
   5160         if(compareTer == 0) {
   5161             if (count3 > 0) {
   5162                 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
   5163                     while (count3 >= coll->tertiaryTopCount) {
   5164                         *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   5165                         count3 -= (uint32_t)coll->tertiaryTopCount;
   5166                     }
   5167                     *tertiaries++ = (uint8_t)(tertiaryTop - count3);
   5168                 } else {
   5169                     while (count3 > coll->tertiaryBottomCount) {
   5170                         *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   5171                         count3 -= (uint32_t)coll->tertiaryBottomCount;
   5172                     }
   5173                     *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   5174                 }
   5175             }
   5176             uint32_t tersize = (uint32_t)(tertiaries - terStart);
   5177             sortKeySize += tersize;
   5178             if(sortKeySize <= resultLength) {
   5179                 *(primaries++) = UCOL_LEVELTERMINATOR;
   5180                 uprv_memcpy(primaries, terStart, tersize);
   5181                 primaries += tersize;
   5182             } else {
   5183                 if(allocateSKBuffer == TRUE) {
   5184                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5185                     if(U_SUCCESS(*status)) {
   5186                         *result = primStart;
   5187                         *(primaries++) = UCOL_LEVELTERMINATOR;
   5188                         uprv_memcpy(primaries, terStart, tersize);
   5189                     }
   5190                     else {
   5191                         /* We ran out of memory!? We can't recover. */
   5192                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5193                         goto cleanup;
   5194                     }
   5195                 } else {
   5196                     *status = U_BUFFER_OVERFLOW_ERROR;
   5197                 }
   5198             }
   5199 
   5200             if(compareQuad == 0/*qShifted == TRUE*/) {
   5201                 if(count4 > 0) {
   5202                     while (count4 > UCOL_BOT_COUNT4) {
   5203                         *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   5204                         count4 -= UCOL_BOT_COUNT4;
   5205                     }
   5206                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
   5207                 }
   5208                 uint32_t quadsize = (uint32_t)(quads - quadStart);
   5209                 sortKeySize += quadsize;
   5210                 if(sortKeySize <= resultLength) {
   5211                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5212                     uprv_memcpy(primaries, quadStart, quadsize);
   5213                     primaries += quadsize;
   5214                 } else {
   5215                     if(allocateSKBuffer == TRUE) {
   5216                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5217                         if(U_SUCCESS(*status)) {
   5218                             *result = primStart;
   5219                             *(primaries++) = UCOL_LEVELTERMINATOR;
   5220                             uprv_memcpy(primaries, quadStart, quadsize);
   5221                         }
   5222                         else {
   5223                             /* We ran out of memory!? We can't recover. */
   5224                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5225                             goto cleanup;
   5226                         }
   5227                     } else {
   5228                         *status = U_BUFFER_OVERFLOW_ERROR;
   5229                     }
   5230                 }
   5231             }
   5232 
   5233             if(compareIdent) {
   5234                 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
   5235                 if(sortKeySize <= resultLength) {
   5236                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5237                     primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
   5238                 } else {
   5239                     if(allocateSKBuffer == TRUE) {
   5240                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
   5241                         if(U_SUCCESS(*status)) {
   5242                             *result = primStart;
   5243                             *(primaries++) = UCOL_LEVELTERMINATOR;
   5244                             u_writeIdenticalLevelRun(s.string, len, primaries);
   5245                         }
   5246                         else {
   5247                             /* We ran out of memory!? We can't recover. */
   5248                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5249                             goto cleanup;
   5250                         }
   5251                     } else {
   5252                         *status = U_BUFFER_OVERFLOW_ERROR;
   5253                     }
   5254                 }
   5255             }
   5256         }
   5257         *(primaries++) = '\0';
   5258     }
   5259 
   5260     if(allocateSKBuffer == TRUE) {
   5261         *result = (uint8_t*)uprv_malloc(sortKeySize);
   5262         /* test for NULL */
   5263         if (*result == NULL) {
   5264             *status = U_MEMORY_ALLOCATION_ERROR;
   5265             goto cleanup;
   5266         }
   5267         uprv_memcpy(*result, primStart, sortKeySize);
   5268         if(primStart != prim) {
   5269             uprv_free(primStart);
   5270         }
   5271     }
   5272 
   5273 cleanup:
   5274     if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
   5275         /* NULL terminate for safety */
   5276         **result = 0;
   5277     }
   5278     if(terStart != tert) {
   5279         uprv_free(terStart);
   5280         uprv_free(secStart);
   5281         uprv_free(caseStart);
   5282         uprv_free(quadStart);
   5283     }
   5284 
   5285     /* To avoid memory leak, free the offset buffer if necessary. */
   5286     ucol_freeOffsetBuffer(&s);
   5287 
   5288     return sortKeySize;
   5289 }
   5290 
   5291 
   5292 U_CFUNC int32_t U_CALLCONV
   5293 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
   5294         const    UChar        *source,
   5295         int32_t        sourceLength,
   5296         uint8_t        **result,
   5297         uint32_t        resultLength,
   5298         UBool allocateSKBuffer,
   5299         UErrorCode *status)
   5300 {
   5301     U_ALIGN_CODE(16);
   5302 
   5303     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
   5304     uint32_t i = 0; /* general purpose counter */
   5305 
   5306     /* Stack allocated buffers for buffers we use */
   5307     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
   5308 
   5309     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
   5310 
   5311     if(U_FAILURE(*status)) {
   5312         return 0;
   5313     }
   5314 
   5315     if(primaries == NULL && allocateSKBuffer == TRUE) {
   5316         primaries = *result = prim;
   5317         resultLength = UCOL_PRIMARY_MAX_BUFFER;
   5318     }
   5319 
   5320     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
   5321 
   5322     uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
   5323 
   5324     UnicodeString normSource;
   5325 
   5326     int32_t len =  sourceLength;
   5327 
   5328     /* If we need to normalize, we'll do it all at once at the beginning! */
   5329     if(coll->normalizationMode != UCOL_OFF) {
   5330         normSource.setTo(len < 0, source, len);
   5331         const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
   5332         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   5333         if(qcYesLength != normSource.length()) {
   5334             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   5335             normSource.truncate(qcYesLength);
   5336             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   5337             source = normSource.getBuffer();
   5338             len = normSource.length();
   5339         }
   5340     }
   5341     collIterate s;
   5342     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   5343     if(U_FAILURE(*status)) {
   5344         return 0;
   5345     }
   5346     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
   5347 
   5348     if(resultLength == 0 || primaries == NULL) {
   5349         return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
   5350     }
   5351 
   5352     uint8_t *primarySafeEnd = primaries + resultLength - 2;
   5353 
   5354     uint32_t minBufferSize = UCOL_MAX_BUFFER;
   5355 
   5356     uint8_t *primStart = primaries;
   5357     uint8_t *secStart = secondaries;
   5358     uint8_t *terStart = tertiaries;
   5359 
   5360     uint32_t order = 0;
   5361 
   5362     uint8_t primary1 = 0;
   5363     uint8_t primary2 = 0;
   5364     uint8_t secondary = 0;
   5365     uint8_t tertiary = 0;
   5366     uint8_t caseSwitch = coll->caseSwitch;
   5367     uint8_t tertiaryMask = coll->tertiaryMask;
   5368     int8_t tertiaryAddition = coll->tertiaryAddition;
   5369     uint8_t tertiaryTop = coll->tertiaryTop;
   5370     uint8_t tertiaryBottom = coll->tertiaryBottom;
   5371     uint8_t tertiaryCommon = coll->tertiaryCommon;
   5372 
   5373     uint32_t prevBuffSize = 0;
   5374 
   5375     UBool finished = FALSE;
   5376     UBool notIsContinuation = FALSE;
   5377 
   5378     uint32_t count2 = 0, count3 = 0;
   5379     uint8_t leadPrimary = 0;
   5380 
   5381     for(;;) {
   5382         for(i=prevBuffSize; i<minBufferSize; ++i) {
   5383 
   5384             order = ucol_IGetNextCE(coll, &s, status);
   5385 
   5386             if(order == 0) {
   5387                 continue;
   5388             }
   5389 
   5390             if(order == UCOL_NO_MORE_CES) {
   5391                 finished = TRUE;
   5392                 break;
   5393             }
   5394 
   5395             notIsContinuation = !isContinuation(order);
   5396 
   5397             if(notIsContinuation) {
   5398                 tertiary = (uint8_t)((order & tertiaryMask));
   5399             } else {
   5400                 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   5401             }
   5402 
   5403             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5404             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5405             primary1 = (uint8_t)(order >> 8);
   5406 
   5407             uint8_t originalPrimary1 = primary1;
   5408             if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
   5409                 primary1 = coll->leadBytePermutationTable[primary1];
   5410             }
   5411 
   5412             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   5413             /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   5414             /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
   5415             /* regular and simple sortkey calc */
   5416             if(primary1 != UCOL_IGNORABLE) {
   5417                 if(notIsContinuation) {
   5418                     if(leadPrimary == primary1) {
   5419                         *primaries++ = primary2;
   5420                     } else {
   5421                         if(leadPrimary != 0) {
   5422                             *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
   5423                         }
   5424                         if(primary2 == UCOL_IGNORABLE) {
   5425                             /* one byter, not compressed */
   5426                             *primaries++ = primary1;
   5427                             leadPrimary = 0;
   5428                         } else if(isCompressible(coll, originalPrimary1)) {
   5429                             /* compress */
   5430                             *primaries++ = leadPrimary = primary1;
   5431                             *primaries++ = primary2;
   5432                         } else {
   5433                             leadPrimary = 0;
   5434                             *primaries++ = primary1;
   5435                             *primaries++ = primary2;
   5436                         }
   5437                     }
   5438                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   5439                     *primaries++ = primary1;
   5440                     if(primary2 != UCOL_IGNORABLE) {
   5441                         *primaries++ = primary2; /* second part */
   5442                     }
   5443                 }
   5444             }
   5445 
   5446             if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
   5447                 /* This is compression code. */
   5448                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
   5449                     ++count2;
   5450                 } else {
   5451                     if (count2 > 0) {
   5452                         if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   5453                             while (count2 > UCOL_TOP_COUNT2) {
   5454                                 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   5455                                 count2 -= (uint32_t)UCOL_TOP_COUNT2;
   5456                             }
   5457                             *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
   5458                         } else {
   5459                             while (count2 > UCOL_BOT_COUNT2) {
   5460                                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5461                                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5462                             }
   5463                             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   5464                         }
   5465                         count2 = 0;
   5466                     }
   5467                     *secondaries++ = secondary;
   5468                 }
   5469             }
   5470 
   5471             if(notIsContinuation) {
   5472                 tertiary ^= caseSwitch;
   5473             }
   5474 
   5475             if(tertiary > 0) {
   5476                 /* This is compression code. */
   5477                 /* sequence size check is included in the if clause */
   5478                 if (tertiary == tertiaryCommon && notIsContinuation) {
   5479                     ++count3;
   5480                 } else {
   5481                     if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   5482                         tertiary += tertiaryAddition;
   5483                     } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   5484                         tertiary -= tertiaryAddition;
   5485                     }
   5486                     if (count3 > 0) {
   5487                         if ((tertiary > tertiaryCommon)) {
   5488                             while (count3 > coll->tertiaryTopCount) {
   5489                                 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   5490                                 count3 -= (uint32_t)coll->tertiaryTopCount;
   5491                             }
   5492                             *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
   5493                         } else {
   5494                             while (count3 > coll->tertiaryBottomCount) {
   5495                                 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   5496                                 count3 -= (uint32_t)coll->tertiaryBottomCount;
   5497                             }
   5498                             *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   5499                         }
   5500                         count3 = 0;
   5501                     }
   5502                     *tertiaries++ = tertiary;
   5503                 }
   5504             }
   5505 
   5506             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
   5507                 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
   5508                     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   5509                     if(U_FAILURE(*status)) {
   5510                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5511                         finished = TRUE;
   5512                         break;
   5513                     }
   5514                     s.flags &= ~UCOL_ITER_NORM;
   5515                     sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
   5516                     *status = U_BUFFER_OVERFLOW_ERROR;
   5517                     finished = TRUE;
   5518                     break;
   5519                 } else { /* It's much nicer if we can actually reallocate */
   5520                     int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart));
   5521                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
   5522                     if(U_SUCCESS(*status)) {
   5523                         *result = primStart;
   5524                         primarySafeEnd = primStart + resultLength - 2;
   5525                     } else {
   5526                         /* We ran out of memory!? We can't recover. */
   5527                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5528                         finished = TRUE;
   5529                         break;
   5530                     }
   5531                 }
   5532             }
   5533         }
   5534         if(finished) {
   5535             break;
   5536         } else {
   5537             prevBuffSize = minBufferSize;
   5538             secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
   5539             terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
   5540             minBufferSize *= 2;
   5541             if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
   5542                 /* We ran out of memory!? We can't recover. */
   5543                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5544                 break;
   5545             }
   5546         }
   5547     }
   5548 
   5549     if(U_SUCCESS(*status)) {
   5550         sortKeySize += (uint32_t)(primaries - primStart);
   5551         /* we have done all the CE's, now let's put them together to form a key */
   5552         if (count2 > 0) {
   5553             while (count2 > UCOL_BOT_COUNT2) {
   5554                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5555                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5556             }
   5557             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   5558         }
   5559         uint32_t secsize = (uint32_t)(secondaries-secStart);
   5560         sortKeySize += secsize;
   5561         if(sortKeySize <= resultLength) {
   5562             *(primaries++) = UCOL_LEVELTERMINATOR;
   5563             uprv_memcpy(primaries, secStart, secsize);
   5564             primaries += secsize;
   5565         } else {
   5566             if(allocateSKBuffer == TRUE) {
   5567                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5568                 if(U_SUCCESS(*status)) {
   5569                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5570                     *result = primStart;
   5571                     uprv_memcpy(primaries, secStart, secsize);
   5572                 }
   5573                 else {
   5574                     /* We ran out of memory!? We can't recover. */
   5575                     sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5576                     goto cleanup;
   5577                 }
   5578             } else {
   5579                 *status = U_BUFFER_OVERFLOW_ERROR;
   5580             }
   5581         }
   5582 
   5583         if (count3 > 0) {
   5584             if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
   5585                 while (count3 >= coll->tertiaryTopCount) {
   5586                     *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   5587                     count3 -= (uint32_t)coll->tertiaryTopCount;
   5588                 }
   5589                 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
   5590             } else {
   5591                 while (count3 > coll->tertiaryBottomCount) {
   5592                     *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   5593                     count3 -= (uint32_t)coll->tertiaryBottomCount;
   5594                 }
   5595                 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   5596             }
   5597         }
   5598         uint32_t tersize = (uint32_t)(tertiaries - terStart);
   5599         sortKeySize += tersize;
   5600         if(sortKeySize <= resultLength) {
   5601             *(primaries++) = UCOL_LEVELTERMINATOR;
   5602             uprv_memcpy(primaries, terStart, tersize);
   5603             primaries += tersize;
   5604         } else {
   5605             if(allocateSKBuffer == TRUE) {
   5606                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5607                 if(U_SUCCESS(*status)) {
   5608                     *result = primStart;
   5609                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5610                     uprv_memcpy(primaries, terStart, tersize);
   5611                 }
   5612                 else {
   5613                     /* We ran out of memory!? We can't recover. */
   5614                     sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5615                     goto cleanup;
   5616                 }
   5617             } else {
   5618                 *status = U_BUFFER_OVERFLOW_ERROR;
   5619             }
   5620         }
   5621 
   5622         *(primaries++) = '\0';
   5623     }
   5624 
   5625     if(allocateSKBuffer == TRUE) {
   5626         *result = (uint8_t*)uprv_malloc(sortKeySize);
   5627         /* test for NULL */
   5628         if (*result == NULL) {
   5629             *status = U_MEMORY_ALLOCATION_ERROR;
   5630             goto cleanup;
   5631         }
   5632         uprv_memcpy(*result, primStart, sortKeySize);
   5633         if(primStart != prim) {
   5634             uprv_free(primStart);
   5635         }
   5636     }
   5637 
   5638 cleanup:
   5639     if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
   5640         /* NULL terminate for safety */
   5641         **result = 0;
   5642     }
   5643     if(terStart != tert) {
   5644         uprv_free(terStart);
   5645         uprv_free(secStart);
   5646     }
   5647 
   5648     /* To avoid memory leak, free the offset buffer if necessary. */
   5649     ucol_freeOffsetBuffer(&s);
   5650 
   5651     return sortKeySize;
   5652 }
   5653 
   5654 static inline
   5655 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
   5656     UBool notIsContinuation = !isContinuation(CE);
   5657     uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
   5658     if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
   5659                || (!notIsContinuation && *wasShifted)))
   5660         || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   5661     {
   5662         // The stuff below should probably be in the sortkey code... maybe not...
   5663         if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
   5664             /* we should just completely ignore it */
   5665             *wasShifted = TRUE;
   5666             //continue;
   5667         }
   5668         //*wasShifted = TRUE;
   5669         return TRUE;
   5670     } else {
   5671         *wasShifted = FALSE;
   5672         return FALSE;
   5673     }
   5674 }
   5675 static inline
   5676 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
   5677     if(level < maxLevel) {
   5678         dest[i++] = UCOL_LEVELTERMINATOR;
   5679     } else {
   5680         dest[i++] = 0;
   5681     }
   5682 }
   5683 
   5684 /** enumeration of level identifiers for partial sort key generation */
   5685 enum {
   5686   UCOL_PSK_PRIMARY = 0,
   5687     UCOL_PSK_SECONDARY = 1,
   5688     UCOL_PSK_CASE = 2,
   5689     UCOL_PSK_TERTIARY = 3,
   5690     UCOL_PSK_QUATERNARY = 4,
   5691     UCOL_PSK_QUIN = 5,      /** This is an extra level, not used - but we have three bits to blow */
   5692     UCOL_PSK_IDENTICAL = 6,
   5693     UCOL_PSK_NULL = 7,      /** level for the end of sort key. Will just produce zeros */
   5694     UCOL_PSK_LIMIT
   5695 };
   5696 
   5697 /** collation state enum. *_SHIFT value is how much to shift right
   5698  *  to get the state piece to the right. *_MASK value should be
   5699  *  ANDed with the shifted state. This data is stored in state[1]
   5700  *  field.
   5701  */
   5702 enum {
   5703     UCOL_PSK_LEVEL_SHIFT = 0,      /** level identificator. stores an enum value from above */
   5704     UCOL_PSK_LEVEL_MASK = 7,       /** three bits */
   5705     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */
   5706     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,
   5707     /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
   5708      *  This field is also used to denote that the French secondary level is finished
   5709      */
   5710     UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */
   5711     UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */
   5712     UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */
   5713     UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */
   5714     /** When we do French we need to reverse secondary values. However, continuations
   5715      *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
   5716      */
   5717     UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
   5718     UCOL_PSK_BOCSU_BYTES_MASK = 3,
   5719     UCOL_PSK_CONSUMED_CES_SHIFT = 9,
   5720     UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
   5721 };
   5722 
   5723 // macro calculating the number of expansion CEs available
   5724 #define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn
   5725 
   5726 
   5727 /** main sortkey part procedure. On the first call,
   5728  *  you should pass in a collator, an iterator, empty state
   5729  *  state[0] == state[1] == 0, a buffer to hold results
   5730  *  number of bytes you need and an error code pointer.
   5731  *  Make sure your buffer is big enough to hold the wanted
   5732  *  number of sortkey bytes. I don't check.
   5733  *  The only meaningful status you can get back is
   5734  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
   5735  *  have been dealt a raw deal and that you probably won't
   5736  *  be able to use partial sortkey generation for this
   5737  *  particular combination of string and collator. This
   5738  *  is highly unlikely, but you should still check the error code.
   5739  *  Any other status means that you're not in a sane situation
   5740  *  anymore. After the first call, preserve state values and
   5741  *  use them on subsequent calls to obtain more bytes of a sortkey.
   5742  *  Use until the number of bytes written is smaller than the requested
   5743  *  number of bytes. Generated sortkey is not compatible with the
   5744  *  one generated by ucol_getSortKey, as we don't do any compression.
   5745  *  However, levels are still terminated by a 1 (one) and the sortkey
   5746  *  is terminated by a 0 (zero). Identical level is the same as in the
   5747  *  regular sortkey - internal bocu-1 implementation is used.
   5748  *  For curious, although you cannot do much about this, here is
   5749  *  the structure of state words.
   5750  *  state[0] - iterator state. Depends on the iterator implementation,
   5751  *             but allows the iterator to continue where it stopped in
   5752  *             the last iteration.
   5753  *  state[1] - collation processing state. Here is the distribution
   5754  *             of the bits:
   5755  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
   5756  *             quaternary, quin (we don't use this one), identical and
   5757  *             null (producing only zeroes - first one to terminate the
   5758  *             sortkey and subsequent to fill the buffer).
   5759  *   3       - byte count. Number of bytes written on the primary level.
   5760  *   4       - was shifted. Whether the previous iteration finished in the
   5761  *             shifted state.
   5762  *   5, 6    - French continuation bytes written. See the comment in the enum
   5763  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
   5764  *             the identical level.
   5765  *   9..31   - CEs consumed. Number of getCE or next32 operations performed
   5766  *             since the last successful update of the iterator state.
   5767  */
   5768 U_CAPI int32_t U_EXPORT2
   5769 ucol_nextSortKeyPart(const UCollator *coll,
   5770                      UCharIterator *iter,
   5771                      uint32_t state[2],
   5772                      uint8_t *dest, int32_t count,
   5773                      UErrorCode *status)
   5774 {
   5775     /* error checking */
   5776     if(status==NULL || U_FAILURE(*status)) {
   5777         return 0;
   5778     }
   5779     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
   5780     if( coll==NULL || iter==NULL ||
   5781         state==NULL ||
   5782         count<0 || (count>0 && dest==NULL)
   5783     ) {
   5784         *status=U_ILLEGAL_ARGUMENT_ERROR;
   5785         UTRACE_EXIT_STATUS(status);
   5786         return 0;
   5787     }
   5788 
   5789     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
   5790                   coll, iter, state[0], state[1], dest, count);
   5791 
   5792     if(count==0) {
   5793         /* nothing to do */
   5794         UTRACE_EXIT_VALUE(0);
   5795         return 0;
   5796     }
   5797     /** Setting up situation according to the state we got from the previous iteration */
   5798     // The state of the iterator from the previous invocation
   5799     uint32_t iterState = state[0];
   5800     // Has the last iteration ended in the shifted state
   5801     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
   5802     // What is the current level of the sortkey?
   5803     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
   5804     // Have we written only one byte from a two byte primary in the previous iteration?
   5805     // Also on secondary level - have we finished with the French secondary?
   5806     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
   5807     // number of bytes in the continuation buffer for French
   5808     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
   5809     // Number of bytes already written from a bocsu sequence. Since
   5810     // the longes bocsu sequence is 4 long, this can be up to 3.
   5811     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
   5812     // Number of elements that need to be consumed in this iteration because
   5813     // the iterator returned UITER_NO_STATE at the end of the last iteration,
   5814     // so we had to save the last valid state.
   5815     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
   5816 
   5817     /** values that depend on the collator attributes */
   5818     // strength of the collator.
   5819     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
   5820     // maximal level of the partial sortkey. Need to take into account whether case level is done
   5821     int32_t maxLevel = 0;
   5822     if(strength < UCOL_TERTIARY) {
   5823         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   5824             maxLevel = UCOL_PSK_CASE;
   5825         } else {
   5826             maxLevel = strength;
   5827         }
   5828     } else {
   5829         if(strength == UCOL_TERTIARY) {
   5830             maxLevel = UCOL_PSK_TERTIARY;
   5831         } else if(strength == UCOL_QUATERNARY) {
   5832             maxLevel = UCOL_PSK_QUATERNARY;
   5833         } else { // identical
   5834             maxLevel = UCOL_IDENTICAL;
   5835         }
   5836     }
   5837     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
   5838     uint8_t UCOL_HIRAGANA_QUAD =
   5839       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
   5840     // Boundary value that decides whether a CE is shifted or not
   5841     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
   5842     // Are we doing French collation?
   5843     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
   5844 
   5845     /** initializing the collation state */
   5846     UBool notIsContinuation = FALSE;
   5847     uint32_t CE = UCOL_NO_MORE_CES;
   5848 
   5849     collIterate s;
   5850     IInit_collIterate(coll, NULL, -1, &s, status);
   5851     if(U_FAILURE(*status)) {
   5852         UTRACE_EXIT_STATUS(*status);
   5853         return 0;
   5854     }
   5855     s.iterator = iter;
   5856     s.flags |= UCOL_USE_ITERATOR;
   5857     // This variable tells us whether we have produced some other levels in this iteration
   5858     // before we moved to the identical level. In that case, we need to switch the
   5859     // type of the iterator.
   5860     UBool doingIdenticalFromStart = FALSE;
   5861     // Normalizing iterator
   5862     // The division for the array length may truncate the array size to
   5863     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   5864     // for all platforms anyway.
   5865     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   5866     UNormIterator *normIter = NULL;
   5867     // If the normalization is turned on for the collator and we are below identical level
   5868     // we will use a FCD normalizing iterator
   5869     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
   5870         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5871         s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
   5872         s.flags &= ~UCOL_ITER_NORM;
   5873         if(U_FAILURE(*status)) {
   5874             UTRACE_EXIT_STATUS(*status);
   5875             return 0;
   5876         }
   5877     } else if(level == UCOL_PSK_IDENTICAL) {
   5878         // for identical level, we need a NFD iterator. We need to instantiate it here, since we
   5879         // will be updating the state - and this cannot be done on an ordinary iterator.
   5880         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5881         s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   5882         s.flags &= ~UCOL_ITER_NORM;
   5883         if(U_FAILURE(*status)) {
   5884             UTRACE_EXIT_STATUS(*status);
   5885             return 0;
   5886         }
   5887         doingIdenticalFromStart = TRUE;
   5888     }
   5889 
   5890     // This is the tentative new state of the iterator. The problem
   5891     // is that the iterator might return an undefined state, in
   5892     // which case we should save the last valid state and increase
   5893     // the iterator skip value.
   5894     uint32_t newState = 0;
   5895 
   5896     // First, we set the iterator to the last valid position
   5897     // from the last iteration. This was saved in state[0].
   5898     if(iterState == 0) {
   5899         /* initial state */
   5900         if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
   5901             s.iterator->move(s.iterator, 0, UITER_LIMIT);
   5902         } else {
   5903             s.iterator->move(s.iterator, 0, UITER_START);
   5904         }
   5905     } else {
   5906         /* reset to previous state */
   5907         s.iterator->setState(s.iterator, iterState, status);
   5908         if(U_FAILURE(*status)) {
   5909             UTRACE_EXIT_STATUS(*status);
   5910             return 0;
   5911         }
   5912     }
   5913 
   5914 
   5915 
   5916     // This variable tells us whether we can attempt to update the state
   5917     // of iterator. Situations where we don't want to update iterator state
   5918     // are the existence of expansion CEs that are not yet processed, and
   5919     // finishing the case level without enough space in the buffer to insert
   5920     // a level terminator.
   5921     UBool canUpdateState = TRUE;
   5922 
   5923     // Consume all the CEs that were consumed at the end of the previous
   5924     // iteration without updating the iterator state. On identical level,
   5925     // consume the code points.
   5926     int32_t counter = cces;
   5927     if(level < UCOL_PSK_IDENTICAL) {
   5928         while(counter-->0) {
   5929             // If we're doing French and we are on the secondary level,
   5930             // we go backwards.
   5931             if(level == UCOL_PSK_SECONDARY && doingFrench) {
   5932                 CE = ucol_IGetPrevCE(coll, &s, status);
   5933             } else {
   5934                 CE = ucol_IGetNextCE(coll, &s, status);
   5935             }
   5936             if(CE==UCOL_NO_MORE_CES) {
   5937                 /* should not happen */
   5938                 *status=U_INTERNAL_PROGRAM_ERROR;
   5939                 UTRACE_EXIT_STATUS(*status);
   5940                 return 0;
   5941             }
   5942             if(uprv_numAvailableExpCEs(s)) {
   5943                 canUpdateState = FALSE;
   5944             }
   5945         }
   5946     } else {
   5947         while(counter-->0) {
   5948             uiter_next32(s.iterator);
   5949         }
   5950     }
   5951 
   5952     // French secondary needs to know whether the iterator state of zero came from previous level OR
   5953     // from a new invocation...
   5954     UBool wasDoingPrimary = FALSE;
   5955     // destination buffer byte counter. When this guy
   5956     // gets to count, we're done with the iteration
   5957     int32_t i = 0;
   5958     // used to count the zero bytes written after we
   5959     // have finished with the sort key
   5960     int32_t j = 0;
   5961 
   5962 
   5963     // Hm.... I think we're ready to plunge in. Basic story is as follows:
   5964     // we have a fall through case based on level. This is used for initial
   5965     // positioning on iteration start. Every level processor contains a
   5966     // for(;;) which will be broken when we exhaust all the CEs. Other
   5967     // way to exit is a goto saveState, which happens when we have filled
   5968     // out our buffer.
   5969     switch(level) {
   5970     case UCOL_PSK_PRIMARY:
   5971         wasDoingPrimary = TRUE;
   5972         for(;;) {
   5973             if(i==count) {
   5974                 goto saveState;
   5975             }
   5976             // We should save the state only if we
   5977             // are sure that we are done with the
   5978             // previous iterator state
   5979             if(canUpdateState && byteCountOrFrenchDone == 0) {
   5980                 newState = s.iterator->getState(s.iterator);
   5981                 if(newState != UITER_NO_STATE) {
   5982                     iterState = newState;
   5983                     cces = 0;
   5984                 }
   5985             }
   5986             CE = ucol_IGetNextCE(coll, &s, status);
   5987             cces++;
   5988             if(CE==UCOL_NO_MORE_CES) {
   5989                 // Add the level separator
   5990                 terminatePSKLevel(level, maxLevel, i, dest);
   5991                 byteCountOrFrenchDone=0;
   5992                 // Restart the iteration and move to the
   5993                 // second level
   5994                 s.iterator->move(s.iterator, 0, UITER_START);
   5995                 cces = 0;
   5996                 level = UCOL_PSK_SECONDARY;
   5997                 break;
   5998             }
   5999             if(!isContinuation(CE)){
   6000                 if(coll->leadBytePermutationTable != NULL){
   6001                     CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
   6002                 }
   6003             }
   6004             if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6005                 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
   6006                 if(CE != 0) {
   6007                     if(byteCountOrFrenchDone == 0) {
   6008                         // get the second byte of primary
   6009                         dest[i++]=(uint8_t)(CE >> 8);
   6010                     } else {
   6011                         byteCountOrFrenchDone = 0;
   6012                     }
   6013                     if((CE &=0xff)!=0) {
   6014                         if(i==count) {
   6015                             /* overflow */
   6016                             byteCountOrFrenchDone = 1;
   6017                             cces--;
   6018                             goto saveState;
   6019                         }
   6020                         dest[i++]=(uint8_t)CE;
   6021                     }
   6022                 }
   6023             }
   6024             if(uprv_numAvailableExpCEs(s)) {
   6025                 canUpdateState = FALSE;
   6026             } else {
   6027                 canUpdateState = TRUE;
   6028             }
   6029         }
   6030         /* fall through to next level */
   6031     case UCOL_PSK_SECONDARY:
   6032         if(strength >= UCOL_SECONDARY) {
   6033             if(!doingFrench) {
   6034                 for(;;) {
   6035                     if(i == count) {
   6036                         goto saveState;
   6037                     }
   6038                     // We should save the state only if we
   6039                     // are sure that we are done with the
   6040                     // previous iterator state
   6041                     if(canUpdateState) {
   6042                         newState = s.iterator->getState(s.iterator);
   6043                         if(newState != UITER_NO_STATE) {
   6044                             iterState = newState;
   6045                             cces = 0;
   6046                         }
   6047                     }
   6048                     CE = ucol_IGetNextCE(coll, &s, status);
   6049                     cces++;
   6050                     if(CE==UCOL_NO_MORE_CES) {
   6051                         // Add the level separator
   6052                         terminatePSKLevel(level, maxLevel, i, dest);
   6053                         byteCountOrFrenchDone = 0;
   6054                         // Restart the iteration and move to the
   6055                         // next level
   6056                         s.iterator->move(s.iterator, 0, UITER_START);
   6057                         cces = 0;
   6058                         level = UCOL_PSK_CASE;
   6059                         break;
   6060                     }
   6061                     if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6062                         CE >>= 8; /* get secondary */
   6063                         if(CE != 0) {
   6064                             dest[i++]=(uint8_t)CE;
   6065                         }
   6066                     }
   6067                     if(uprv_numAvailableExpCEs(s)) {
   6068                         canUpdateState = FALSE;
   6069                     } else {
   6070                         canUpdateState = TRUE;
   6071                     }
   6072                 }
   6073             } else { // French secondary processing
   6074                 uint8_t frenchBuff[UCOL_MAX_BUFFER];
   6075                 int32_t frenchIndex = 0;
   6076                 // Here we are going backwards.
   6077                 // If the iterator is at the beginning, it should be
   6078                 // moved to end.
   6079                 if(wasDoingPrimary) {
   6080                     s.iterator->move(s.iterator, 0, UITER_LIMIT);
   6081                     cces = 0;
   6082                 }
   6083                 for(;;) {
   6084                     if(i == count) {
   6085                         goto saveState;
   6086                     }
   6087                     if(canUpdateState) {
   6088                         newState = s.iterator->getState(s.iterator);
   6089                         if(newState != UITER_NO_STATE) {
   6090                             iterState = newState;
   6091                             cces = 0;
   6092                         }
   6093                     }
   6094                     CE = ucol_IGetPrevCE(coll, &s, status);
   6095                     cces++;
   6096                     if(CE==UCOL_NO_MORE_CES) {
   6097                         // Add the level separator
   6098                         terminatePSKLevel(level, maxLevel, i, dest);
   6099                         byteCountOrFrenchDone = 0;
   6100                         // Restart the iteration and move to the next level
   6101                         s.iterator->move(s.iterator, 0, UITER_START);
   6102                         level = UCOL_PSK_CASE;
   6103                         break;
   6104                     }
   6105                     if(isContinuation(CE)) { // if it's a continuation, we want to save it and
   6106                         // reverse when we get a first non-continuation CE.
   6107                         CE >>= 8;
   6108                         frenchBuff[frenchIndex++] = (uint8_t)CE;
   6109                     } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6110                         CE >>= 8; /* get secondary */
   6111                         if(!frenchIndex) {
   6112                             if(CE != 0) {
   6113                                 dest[i++]=(uint8_t)CE;
   6114                             }
   6115                         } else {
   6116                             frenchBuff[frenchIndex++] = (uint8_t)CE;
   6117                             frenchIndex -= usedFrench;
   6118                             usedFrench = 0;
   6119                             while(i < count && frenchIndex) {
   6120                                 dest[i++] = frenchBuff[--frenchIndex];
   6121                                 usedFrench++;
   6122                             }
   6123                         }
   6124                     }
   6125                     if(uprv_numAvailableExpCEs(s)) {
   6126                         canUpdateState = FALSE;
   6127                     } else {
   6128                         canUpdateState = TRUE;
   6129                     }
   6130                 }
   6131             }
   6132         } else {
   6133             level = UCOL_PSK_CASE;
   6134         }
   6135         /* fall through to next level */
   6136     case UCOL_PSK_CASE:
   6137         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   6138             uint32_t caseShift = UCOL_CASE_SHIFT_START;
   6139             uint8_t caseByte = UCOL_CASE_BYTE_START;
   6140             uint8_t caseBits = 0;
   6141 
   6142             for(;;) {
   6143                 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
   6144                 if(i == count) {
   6145                     goto saveState;
   6146                 }
   6147                 // We should save the state only if we
   6148                 // are sure that we are done with the
   6149                 // previous iterator state
   6150                 if(canUpdateState) {
   6151                     newState = s.iterator->getState(s.iterator);
   6152                     if(newState != UITER_NO_STATE) {
   6153                         iterState = newState;
   6154                         cces = 0;
   6155                     }
   6156                 }
   6157                 CE = ucol_IGetNextCE(coll, &s, status);
   6158                 cces++;
   6159                 if(CE==UCOL_NO_MORE_CES) {
   6160                     // On the case level we might have an unfinished
   6161                     // case byte. Add one if it's started.
   6162                     if(caseShift != UCOL_CASE_SHIFT_START) {
   6163                         dest[i++] = caseByte;
   6164                     }
   6165                     cces = 0;
   6166                     // We have finished processing CEs on this level.
   6167                     // However, we don't know if we have enough space
   6168                     // to add a case level terminator.
   6169                     if(i < count) {
   6170                         // Add the level separator
   6171                         terminatePSKLevel(level, maxLevel, i, dest);
   6172                         // Restart the iteration and move to the
   6173                         // next level
   6174                         s.iterator->move(s.iterator, 0, UITER_START);
   6175                         level = UCOL_PSK_TERTIARY;
   6176                     } else {
   6177                         canUpdateState = FALSE;
   6178                     }
   6179                     break;
   6180                 }
   6181 
   6182                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6183                     if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
   6184                         // do the case level if we need to do it. We don't want to calculate
   6185                         // case level for primary ignorables if we have only primary strength and case level
   6186                         // otherwise we would break well formedness of CEs
   6187                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   6188                         caseBits = (uint8_t)(CE & 0xC0);
   6189                         // this copies the case level logic from the
   6190                         // sort key generation code
   6191                         if(CE != 0) {
   6192                             if (caseShift == 0) {
   6193                                 dest[i++] = caseByte;
   6194                                 caseShift = UCOL_CASE_SHIFT_START;
   6195                                 caseByte = UCOL_CASE_BYTE_START;
   6196                             }
   6197                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6198                                 if((caseBits & 0xC0) == 0) {
   6199                                     caseByte |= 1 << (--caseShift);
   6200                                 } else {
   6201                                     caseByte |= 0 << (--caseShift);
   6202                                     /* second bit */
   6203                                     if(caseShift == 0) {
   6204                                         dest[i++] = caseByte;
   6205                                         caseShift = UCOL_CASE_SHIFT_START;
   6206                                         caseByte = UCOL_CASE_BYTE_START;
   6207                                     }
   6208                                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
   6209                                 }
   6210                             } else {
   6211                                 if((caseBits & 0xC0) == 0) {
   6212                                     caseByte |= 0 << (--caseShift);
   6213                                 } else {
   6214                                     caseByte |= 1 << (--caseShift);
   6215                                     /* second bit */
   6216                                     if(caseShift == 0) {
   6217                                         dest[i++] = caseByte;
   6218                                         caseShift = UCOL_CASE_SHIFT_START;
   6219                                         caseByte = UCOL_CASE_BYTE_START;
   6220                                     }
   6221                                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
   6222                                 }
   6223                             }
   6224                         }
   6225 
   6226                     }
   6227                 }
   6228                 // Not sure this is correct for the case level - revisit
   6229                 if(uprv_numAvailableExpCEs(s)) {
   6230                     canUpdateState = FALSE;
   6231                 } else {
   6232                     canUpdateState = TRUE;
   6233                 }
   6234             }
   6235         } else {
   6236             level = UCOL_PSK_TERTIARY;
   6237         }
   6238         /* fall through to next level */
   6239     case UCOL_PSK_TERTIARY:
   6240         if(strength >= UCOL_TERTIARY) {
   6241             for(;;) {
   6242                 if(i == count) {
   6243                     goto saveState;
   6244                 }
   6245                 // We should save the state only if we
   6246                 // are sure that we are done with the
   6247                 // previous iterator state
   6248                 if(canUpdateState) {
   6249                     newState = s.iterator->getState(s.iterator);
   6250                     if(newState != UITER_NO_STATE) {
   6251                         iterState = newState;
   6252                         cces = 0;
   6253                     }
   6254                 }
   6255                 CE = ucol_IGetNextCE(coll, &s, status);
   6256                 cces++;
   6257                 if(CE==UCOL_NO_MORE_CES) {
   6258                     // Add the level separator
   6259                     terminatePSKLevel(level, maxLevel, i, dest);
   6260                     byteCountOrFrenchDone = 0;
   6261                     // Restart the iteration and move to the
   6262                     // next level
   6263                     s.iterator->move(s.iterator, 0, UITER_START);
   6264                     cces = 0;
   6265                     level = UCOL_PSK_QUATERNARY;
   6266                     break;
   6267                 }
   6268                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6269                     notIsContinuation = !isContinuation(CE);
   6270 
   6271                     if(notIsContinuation) {
   6272                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   6273                         CE ^= coll->caseSwitch;
   6274                         CE &= coll->tertiaryMask;
   6275                     } else {
   6276                         CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   6277                     }
   6278 
   6279                     if(CE != 0) {
   6280                         dest[i++]=(uint8_t)CE;
   6281                     }
   6282                 }
   6283                 if(uprv_numAvailableExpCEs(s)) {
   6284                     canUpdateState = FALSE;
   6285                 } else {
   6286                     canUpdateState = TRUE;
   6287                 }
   6288             }
   6289         } else {
   6290             // if we're not doing tertiary
   6291             // skip to the end
   6292             level = UCOL_PSK_NULL;
   6293         }
   6294         /* fall through to next level */
   6295     case UCOL_PSK_QUATERNARY:
   6296         if(strength >= UCOL_QUATERNARY) {
   6297             for(;;) {
   6298                 if(i == count) {
   6299                     goto saveState;
   6300                 }
   6301                 // We should save the state only if we
   6302                 // are sure that we are done with the
   6303                 // previous iterator state
   6304                 if(canUpdateState) {
   6305                     newState = s.iterator->getState(s.iterator);
   6306                     if(newState != UITER_NO_STATE) {
   6307                         iterState = newState;
   6308                         cces = 0;
   6309                     }
   6310                 }
   6311                 CE = ucol_IGetNextCE(coll, &s, status);
   6312                 cces++;
   6313                 if(CE==UCOL_NO_MORE_CES) {
   6314                     // Add the level separator
   6315                     terminatePSKLevel(level, maxLevel, i, dest);
   6316                     //dest[i++] = UCOL_LEVELTERMINATOR;
   6317                     byteCountOrFrenchDone = 0;
   6318                     // Restart the iteration and move to the
   6319                     // next level
   6320                     s.iterator->move(s.iterator, 0, UITER_START);
   6321                     cces = 0;
   6322                     level = UCOL_PSK_QUIN;
   6323                     break;
   6324                 }
   6325                 if(CE==0)
   6326                     continue;
   6327                 if(isShiftedCE(CE, LVT, &wasShifted)) {
   6328                     CE >>= 16; /* get primary */
   6329                     if(CE != 0) {
   6330                         if(byteCountOrFrenchDone == 0) {
   6331                             dest[i++]=(uint8_t)(CE >> 8);
   6332                         } else {
   6333                             byteCountOrFrenchDone = 0;
   6334                         }
   6335                         if((CE &=0xff)!=0) {
   6336                             if(i==count) {
   6337                                 /* overflow */
   6338                                 byteCountOrFrenchDone = 1;
   6339                                 goto saveState;
   6340                             }
   6341                             dest[i++]=(uint8_t)CE;
   6342                         }
   6343                     }
   6344                 } else {
   6345                     notIsContinuation = !isContinuation(CE);
   6346                     if(notIsContinuation) {
   6347                         if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   6348                             dest[i++] = UCOL_HIRAGANA_QUAD;
   6349                         } else {
   6350                             dest[i++] = 0xFF;
   6351                         }
   6352                     }
   6353                 }
   6354                 if(uprv_numAvailableExpCEs(s)) {
   6355                     canUpdateState = FALSE;
   6356                 } else {
   6357                     canUpdateState = TRUE;
   6358                 }
   6359             }
   6360         } else {
   6361             // if we're not doing quaternary
   6362             // skip to the end
   6363             level = UCOL_PSK_NULL;
   6364         }
   6365         /* fall through to next level */
   6366     case UCOL_PSK_QUIN:
   6367         level = UCOL_PSK_IDENTICAL;
   6368         /* fall through to next level */
   6369     case UCOL_PSK_IDENTICAL:
   6370         if(strength >= UCOL_IDENTICAL) {
   6371             UChar32 first, second;
   6372             int32_t bocsuBytesWritten = 0;
   6373             // We always need to do identical on
   6374             // the NFD form of the string.
   6375             if(normIter == NULL) {
   6376                 // we arrived from the level below and
   6377                 // normalization was not turned on.
   6378                 // therefore, we need to make a fresh NFD iterator
   6379                 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   6380                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   6381             } else if(!doingIdenticalFromStart) {
   6382                 // there is an iterator, but we did some other levels.
   6383                 // therefore, we have a FCD iterator - need to make
   6384                 // a NFD one.
   6385                 // normIter being at the beginning does not guarantee
   6386                 // that the underlying iterator is at the beginning
   6387                 iter->move(iter, 0, UITER_START);
   6388                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   6389             }
   6390             // At this point we have a NFD iterator that is positioned
   6391             // in the right place
   6392             if(U_FAILURE(*status)) {
   6393                 UTRACE_EXIT_STATUS(*status);
   6394                 return 0;
   6395             }
   6396             first = uiter_previous32(s.iterator);
   6397             // maybe we're at the start of the string
   6398             if(first == U_SENTINEL) {
   6399                 first = 0;
   6400             } else {
   6401                 uiter_next32(s.iterator);
   6402             }
   6403 
   6404             j = 0;
   6405             for(;;) {
   6406                 if(i == count) {
   6407                     if(j+1 < bocsuBytesWritten) {
   6408                         bocsuBytesUsed = j+1;
   6409                     }
   6410                     goto saveState;
   6411                 }
   6412 
   6413                 // On identical level, we will always save
   6414                 // the state if we reach this point, since
   6415                 // we don't depend on getNextCE for content
   6416                 // all the content is in our buffer and we
   6417                 // already either stored the full buffer OR
   6418                 // otherwise we won't arrive here.
   6419                 newState = s.iterator->getState(s.iterator);
   6420                 if(newState != UITER_NO_STATE) {
   6421                     iterState = newState;
   6422                     cces = 0;
   6423                 }
   6424 
   6425                 uint8_t buff[4];
   6426                 second = uiter_next32(s.iterator);
   6427                 cces++;
   6428 
   6429                 // end condition for identical level
   6430                 if(second == U_SENTINEL) {
   6431                     terminatePSKLevel(level, maxLevel, i, dest);
   6432                     level = UCOL_PSK_NULL;
   6433                     break;
   6434                 }
   6435                 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
   6436                 first = second;
   6437 
   6438                 j = 0;
   6439                 if(bocsuBytesUsed != 0) {
   6440                     while(bocsuBytesUsed-->0) {
   6441                         j++;
   6442                     }
   6443                 }
   6444 
   6445                 while(i < count && j < bocsuBytesWritten) {
   6446                     dest[i++] = buff[j++];
   6447                 }
   6448             }
   6449 
   6450         } else {
   6451             level = UCOL_PSK_NULL;
   6452         }
   6453         /* fall through to next level */
   6454     case UCOL_PSK_NULL:
   6455         j = i;
   6456         while(j<count) {
   6457             dest[j++]=0;
   6458         }
   6459         break;
   6460     default:
   6461         *status = U_INTERNAL_PROGRAM_ERROR;
   6462         UTRACE_EXIT_STATUS(*status);
   6463         return 0;
   6464     }
   6465 
   6466 saveState:
   6467     // Now we need to return stuff. First we want to see whether we have
   6468     // done everything for the current state of iterator.
   6469     if(byteCountOrFrenchDone
   6470         || canUpdateState == FALSE
   6471         || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
   6472     {
   6473         // Any of above mean that the previous transaction
   6474         // wasn't finished and that we should store the
   6475         // previous iterator state.
   6476         state[0] = iterState;
   6477     } else {
   6478         // The transaction is complete. We will continue in the next iteration.
   6479         state[0] = s.iterator->getState(s.iterator);
   6480         cces = 0;
   6481     }
   6482     // Store the number of bocsu bytes written.
   6483     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
   6484         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6485     }
   6486     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
   6487 
   6488     // Next we put in the level of comparison
   6489     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
   6490 
   6491     // If we are doing French, we need to store whether we have just finished the French level
   6492     if(level == UCOL_PSK_SECONDARY && doingFrench) {
   6493         state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6494     } else {
   6495         state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6496     }
   6497 
   6498     // Was the latest CE shifted
   6499     if(wasShifted) {
   6500         state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
   6501     }
   6502     // Check for cces overflow
   6503     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
   6504         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6505     }
   6506     // Store cces
   6507     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
   6508 
   6509     // Check for French overflow
   6510     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
   6511         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6512     }
   6513     // Store number of bytes written in the French secondary continuation sequence
   6514     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
   6515 
   6516 
   6517     // If we have used normalizing iterator, get rid of it
   6518     if(normIter != NULL) {
   6519         unorm_closeIter(normIter);
   6520     }
   6521 
   6522     /* To avoid memory leak, free the offset buffer if necessary. */
   6523     ucol_freeOffsetBuffer(&s);
   6524 
   6525     // Return number of meaningful sortkey bytes.
   6526     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
   6527                   dest,i, state[0], state[1]);
   6528     UTRACE_EXIT_VALUE(i);
   6529     return i;
   6530 }
   6531 
   6532 /**
   6533  * Produce a bound for a given sortkey and a number of levels.
   6534  */
   6535 U_CAPI int32_t U_EXPORT2
   6536 ucol_getBound(const uint8_t       *source,
   6537         int32_t             sourceLength,
   6538         UColBoundMode       boundType,
   6539         uint32_t            noOfLevels,
   6540         uint8_t             *result,
   6541         int32_t             resultLength,
   6542         UErrorCode          *status)
   6543 {
   6544     // consistency checks
   6545     if(status == NULL || U_FAILURE(*status)) {
   6546         return 0;
   6547     }
   6548     if(source == NULL) {
   6549         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6550         return 0;
   6551     }
   6552 
   6553     int32_t sourceIndex = 0;
   6554     // Scan the string until we skip enough of the key OR reach the end of the key
   6555     do {
   6556         sourceIndex++;
   6557         if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
   6558             noOfLevels--;
   6559         }
   6560     } while (noOfLevels > 0
   6561         && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
   6562 
   6563     if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
   6564         && noOfLevels > 0) {
   6565             *status = U_SORT_KEY_TOO_SHORT_WARNING;
   6566     }
   6567 
   6568 
   6569     // READ ME: this code assumes that the values for boundType
   6570     // enum will not changes. They are set so that the enum value
   6571     // corresponds to the number of extra bytes each bound type
   6572     // needs.
   6573     if(result != NULL && resultLength >= sourceIndex+boundType) {
   6574         uprv_memcpy(result, source, sourceIndex);
   6575         switch(boundType) {
   6576             // Lower bound just gets terminated. No extra bytes
   6577         case UCOL_BOUND_LOWER: // = 0
   6578             break;
   6579             // Upper bound needs one extra byte
   6580         case UCOL_BOUND_UPPER: // = 1
   6581             result[sourceIndex++] = 2;
   6582             break;
   6583             // Upper long bound needs two extra bytes
   6584         case UCOL_BOUND_UPPER_LONG: // = 2
   6585             result[sourceIndex++] = 0xFF;
   6586             result[sourceIndex++] = 0xFF;
   6587             break;
   6588         default:
   6589             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6590             return 0;
   6591         }
   6592         result[sourceIndex++] = 0;
   6593 
   6594         return sourceIndex;
   6595     } else {
   6596         return sourceIndex+boundType+1;
   6597     }
   6598 }
   6599 
   6600 /****************************************************************************/
   6601 /* Following are the functions that deal with the properties of a collator  */
   6602 /* there are new APIs and some compatibility APIs                           */
   6603 /****************************************************************************/
   6604 
   6605 static inline void
   6606 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
   6607                     int32_t *primShift, int32_t *secShift, int32_t *terShift)
   6608 {
   6609     uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
   6610     UBool reverseSecondary = FALSE;
   6611     UBool continuation = isContinuation(CE);
   6612     if(!continuation) {
   6613         tertiary = (uint8_t)((CE & coll->tertiaryMask));
   6614         tertiary ^= coll->caseSwitch;
   6615         reverseSecondary = TRUE;
   6616     } else {
   6617         tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   6618         tertiary &= UCOL_REMOVE_CASE;
   6619         reverseSecondary = FALSE;
   6620     }
   6621 
   6622     secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6623     primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6624     primary1 = (uint8_t)(CE >> 8);
   6625 
   6626     if(primary1 != 0) {
   6627         if (coll->leadBytePermutationTable != NULL && !continuation) {
   6628             primary1 = coll->leadBytePermutationTable[primary1];
   6629         }
   6630 
   6631         coll->latinOneCEs[ch] |= (primary1 << *primShift);
   6632         *primShift -= 8;
   6633     }
   6634     if(primary2 != 0) {
   6635         if(*primShift < 0) {
   6636             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6637             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6638             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6639             return;
   6640         }
   6641         coll->latinOneCEs[ch] |= (primary2 << *primShift);
   6642         *primShift -= 8;
   6643     }
   6644     if(secondary != 0) {
   6645         if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
   6646             coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
   6647             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
   6648         } else { // normal case
   6649             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
   6650         }
   6651         *secShift -= 8;
   6652     }
   6653     if(tertiary != 0) {
   6654         coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
   6655         *terShift -= 8;
   6656     }
   6657 }
   6658 
   6659 static inline UBool
   6660 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
   6661     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
   6662     if(newTable == NULL) {
   6663       *status = U_MEMORY_ALLOCATION_ERROR;
   6664       coll->latinOneFailed = TRUE;
   6665       return FALSE;
   6666     }
   6667     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
   6668     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
   6669     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
   6670     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
   6671     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
   6672     coll->latinOneTableLen = size;
   6673     uprv_free(coll->latinOneCEs);
   6674     coll->latinOneCEs = newTable;
   6675     return TRUE;
   6676 }
   6677 
   6678 static UBool
   6679 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
   6680     UBool result = TRUE;
   6681     if(coll->latinOneCEs == NULL) {
   6682         coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
   6683         if(coll->latinOneCEs == NULL) {
   6684             *status = U_MEMORY_ALLOCATION_ERROR;
   6685             return FALSE;
   6686         }
   6687         coll->latinOneTableLen = UCOL_LATINONETABLELEN;
   6688     }
   6689     UChar ch = 0;
   6690     UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
   6691     // Check for null pointer
   6692     if (U_FAILURE(*status)) {
   6693         return FALSE;
   6694     }
   6695     uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
   6696 
   6697     int32_t primShift = 24, secShift = 24, terShift = 24;
   6698     uint32_t CE = 0;
   6699     int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
   6700 
   6701     // TODO: make safe if you get more than you wanted...
   6702     for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
   6703         primShift = 24; secShift = 24; terShift = 24;
   6704         if(ch < 0x100) {
   6705             CE = coll->latinOneMapping[ch];
   6706         } else {
   6707             CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   6708             if(CE == UCOL_NOT_FOUND && coll->UCA) {
   6709                 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   6710             }
   6711         }
   6712         if(CE < UCOL_NOT_FOUND) {
   6713             ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6714         } else {
   6715             switch (getCETag(CE)) {
   6716             case EXPANSION_TAG:
   6717             case DIGIT_TAG:
   6718                 ucol_setText(it, &ch, 1, status);
   6719                 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
   6720                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6721                         coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6722                         coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6723                         coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6724                         break;
   6725                     }
   6726                     ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6727                 }
   6728                 break;
   6729             case CONTRACTION_TAG:
   6730                 // here is the trick
   6731                 // F2 is contraction. We do something very similar to contractions
   6732                 // but have two indices, one in the real contraction table and the
   6733                 // other to where we stuffed things. This hopes that we don't have
   6734                 // many contractions (this should work for latin-1 tables).
   6735                 {
   6736                     if((CE & 0x00FFF000) != 0) {
   6737                         *status = U_UNSUPPORTED_ERROR;
   6738                         goto cleanup_after_failure;
   6739                     }
   6740 
   6741                     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   6742 
   6743                     CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
   6744 
   6745                     coll->latinOneCEs[ch] = CE;
   6746                     coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
   6747                     coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
   6748 
   6749                     // We're going to jump into contraction table, pick the elements
   6750                     // and use them
   6751                     do {
   6752                         CE = *(coll->contractionCEs +
   6753                             (UCharOffset - coll->contractionIndex));
   6754                         if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
   6755                             uint32_t size;
   6756                             uint32_t i;    /* general counter */
   6757                             uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   6758                             size = getExpansionCount(CE);
   6759                             //CE = *CEOffset++;
   6760                             if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   6761                                 for(i = 0; i<size; i++) {
   6762                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6763                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6764                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6765                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6766                                         break;
   6767                                     }
   6768                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
   6769                                 }
   6770                             } else { /* else, we do */
   6771                                 while(*CEOffset != 0) {
   6772                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6773                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6774                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6775                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6776                                         break;
   6777                                     }
   6778                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
   6779                                 }
   6780                             }
   6781                             contractionOffset++;
   6782                         } else if(CE < UCOL_NOT_FOUND) {
   6783                             ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
   6784                         } else {
   6785                             coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6786                             coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6787                             coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6788                             contractionOffset++;
   6789                         }
   6790                         UCharOffset++;
   6791                         primShift = 24; secShift = 24; terShift = 24;
   6792                         if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
   6793                             if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
   6794                                 goto cleanup_after_failure;
   6795                             }
   6796                         }
   6797                     } while(*UCharOffset != 0xFFFF);
   6798                 }
   6799                 break;;
   6800             case SPEC_PROC_TAG:
   6801                 {
   6802                     // 0xB7 is a precontext character defined in UCA5.1, a special
   6803                     // handle is implemeted in order to save LatinOne table for
   6804                     // most locales.
   6805                     if (ch==0xb7) {
   6806                         ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6807                     }
   6808                     else {
   6809                         goto cleanup_after_failure;
   6810                     }
   6811                 }
   6812                 break;
   6813             default:
   6814                 goto cleanup_after_failure;
   6815             }
   6816         }
   6817     }
   6818     // compact table
   6819     if(contractionOffset < coll->latinOneTableLen) {
   6820         if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
   6821             goto cleanup_after_failure;
   6822         }
   6823     }
   6824     ucol_closeElements(it);
   6825     return result;
   6826 
   6827 cleanup_after_failure:
   6828     // status should already be set before arriving here.
   6829     coll->latinOneFailed = TRUE;
   6830     ucol_closeElements(it);
   6831     return FALSE;
   6832 }
   6833 
   6834 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
   6835     if(U_SUCCESS(*status)) {
   6836         if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6837             coll->caseSwitch = UCOL_CASE_SWITCH;
   6838         } else {
   6839             coll->caseSwitch = UCOL_NO_CASE_SWITCH;
   6840         }
   6841 
   6842         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
   6843             coll->tertiaryMask = UCOL_REMOVE_CASE;
   6844             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6845             coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
   6846             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
   6847             coll->tertiaryBottom = UCOL_COMMON_BOT3;
   6848         } else {
   6849             coll->tertiaryMask = UCOL_KEEP_CASE;
   6850             coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
   6851             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6852                 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
   6853                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
   6854                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
   6855             } else {
   6856                 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6857                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
   6858                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
   6859             }
   6860         }
   6861 
   6862         /* Set the compression values */
   6863         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
   6864         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
   6865         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
   6866 
   6867         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
   6868             && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
   6869         {
   6870             coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
   6871         } else {
   6872             coll->sortKeyGen = ucol_calcSortKey;
   6873         }
   6874         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
   6875             && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
   6876         {
   6877             if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
   6878                 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
   6879                     //fprintf(stderr, "F");
   6880                     coll->latinOneUse = TRUE;
   6881                 } else {
   6882                     coll->latinOneUse = FALSE;
   6883                 }
   6884                 if(*status == U_UNSUPPORTED_ERROR) {
   6885                     *status = U_ZERO_ERROR;
   6886                 }
   6887             } else { // latin1Table exists and it doesn't need to be regenerated, just use it
   6888                 coll->latinOneUse = TRUE;
   6889             }
   6890         } else {
   6891             coll->latinOneUse = FALSE;
   6892         }
   6893     }
   6894 }
   6895 
   6896 U_CAPI uint32_t  U_EXPORT2
   6897 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
   6898     if(U_FAILURE(*status) || coll == NULL) {
   6899         return 0;
   6900     }
   6901     if(len == -1) {
   6902         len = u_strlen(varTop);
   6903     }
   6904     if(len == 0) {
   6905         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6906         return 0;
   6907     }
   6908 
   6909     collIterate s;
   6910     IInit_collIterate(coll, varTop, len, &s, status);
   6911     if(U_FAILURE(*status)) {
   6912         return 0;
   6913     }
   6914 
   6915     uint32_t CE = ucol_IGetNextCE(coll, &s, status);
   6916 
   6917     /* here we check if we have consumed all characters */
   6918     /* you can put in either one character or a contraction */
   6919     /* you shouldn't put more... */
   6920     if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
   6921         *status = U_CE_NOT_FOUND_ERROR;
   6922         return 0;
   6923     }
   6924 
   6925     uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
   6926 
   6927     if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
   6928         *status = U_PRIMARY_TOO_LONG_ERROR;
   6929         return 0;
   6930     }
   6931     if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
   6932         coll->variableTopValueisDefault = FALSE;
   6933         coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
   6934     }
   6935 
   6936     /* To avoid memory leak, free the offset buffer if necessary. */
   6937     ucol_freeOffsetBuffer(&s);
   6938 
   6939     return CE & UCOL_PRIMARYMASK;
   6940 }
   6941 
   6942 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
   6943     if(U_FAILURE(*status) || coll == NULL) {
   6944         return 0;
   6945     }
   6946     return coll->variableTopValue<<16;
   6947 }
   6948 
   6949 U_CAPI void  U_EXPORT2
   6950 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
   6951     if(U_FAILURE(*status) || coll == NULL) {
   6952         return;
   6953     }
   6954 
   6955     if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
   6956         coll->variableTopValueisDefault = FALSE;
   6957         coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
   6958     }
   6959 }
   6960 /* Attribute setter API */
   6961 U_CAPI void  U_EXPORT2
   6962 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
   6963     if(U_FAILURE(*status) || coll == NULL) {
   6964       return;
   6965     }
   6966     UColAttributeValue oldFrench = coll->frenchCollation;
   6967     UColAttributeValue oldCaseFirst = coll->caseFirst;
   6968     switch(attr) {
   6969     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
   6970         if(value == UCOL_ON) {
   6971             coll->numericCollation = UCOL_ON;
   6972             coll->numericCollationisDefault = FALSE;
   6973         } else if (value == UCOL_OFF) {
   6974             coll->numericCollation = UCOL_OFF;
   6975             coll->numericCollationisDefault = FALSE;
   6976         } else if (value == UCOL_DEFAULT) {
   6977             coll->numericCollationisDefault = TRUE;
   6978             coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
   6979         } else {
   6980             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6981         }
   6982         break;
   6983     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
   6984         if(value == UCOL_ON) {
   6985             coll->hiraganaQ = UCOL_ON;
   6986             coll->hiraganaQisDefault = FALSE;
   6987         } else if (value == UCOL_OFF) {
   6988             coll->hiraganaQ = UCOL_OFF;
   6989             coll->hiraganaQisDefault = FALSE;
   6990         } else if (value == UCOL_DEFAULT) {
   6991             coll->hiraganaQisDefault = TRUE;
   6992             coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
   6993         } else {
   6994             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6995         }
   6996         break;
   6997     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   6998         if(value == UCOL_ON) {
   6999             coll->frenchCollation = UCOL_ON;
   7000             coll->frenchCollationisDefault = FALSE;
   7001         } else if (value == UCOL_OFF) {
   7002             coll->frenchCollation = UCOL_OFF;
   7003             coll->frenchCollationisDefault = FALSE;
   7004         } else if (value == UCOL_DEFAULT) {
   7005             coll->frenchCollationisDefault = TRUE;
   7006             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
   7007         } else {
   7008             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7009         }
   7010         break;
   7011     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   7012         if(value == UCOL_SHIFTED) {
   7013             coll->alternateHandling = UCOL_SHIFTED;
   7014             coll->alternateHandlingisDefault = FALSE;
   7015         } else if (value == UCOL_NON_IGNORABLE) {
   7016             coll->alternateHandling = UCOL_NON_IGNORABLE;
   7017             coll->alternateHandlingisDefault = FALSE;
   7018         } else if (value == UCOL_DEFAULT) {
   7019             coll->alternateHandlingisDefault = TRUE;
   7020             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
   7021         } else {
   7022             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7023         }
   7024         break;
   7025     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   7026         if(value == UCOL_LOWER_FIRST) {
   7027             coll->caseFirst = UCOL_LOWER_FIRST;
   7028             coll->caseFirstisDefault = FALSE;
   7029         } else if (value == UCOL_UPPER_FIRST) {
   7030             coll->caseFirst = UCOL_UPPER_FIRST;
   7031             coll->caseFirstisDefault = FALSE;
   7032         } else if (value == UCOL_OFF) {
   7033             coll->caseFirst = UCOL_OFF;
   7034             coll->caseFirstisDefault = FALSE;
   7035         } else if (value == UCOL_DEFAULT) {
   7036             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
   7037             coll->caseFirstisDefault = TRUE;
   7038         } else {
   7039             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7040         }
   7041         break;
   7042     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   7043         if(value == UCOL_ON) {
   7044             coll->caseLevel = UCOL_ON;
   7045             coll->caseLevelisDefault = FALSE;
   7046         } else if (value == UCOL_OFF) {
   7047             coll->caseLevel = UCOL_OFF;
   7048             coll->caseLevelisDefault = FALSE;
   7049         } else if (value == UCOL_DEFAULT) {
   7050             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
   7051             coll->caseLevelisDefault = TRUE;
   7052         } else {
   7053             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7054         }
   7055         break;
   7056     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   7057         if(value == UCOL_ON) {
   7058             coll->normalizationMode = UCOL_ON;
   7059             coll->normalizationModeisDefault = FALSE;
   7060             initializeFCD(status);
   7061         } else if (value == UCOL_OFF) {
   7062             coll->normalizationMode = UCOL_OFF;
   7063             coll->normalizationModeisDefault = FALSE;
   7064         } else if (value == UCOL_DEFAULT) {
   7065             coll->normalizationModeisDefault = TRUE;
   7066             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
   7067             if(coll->normalizationMode == UCOL_ON) {
   7068                 initializeFCD(status);
   7069             }
   7070         } else {
   7071             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7072         }
   7073         break;
   7074     case UCOL_STRENGTH:         /* attribute for strength */
   7075         if (value == UCOL_DEFAULT) {
   7076             coll->strengthisDefault = TRUE;
   7077             coll->strength = (UColAttributeValue)coll->options->strength;
   7078         } else if (value <= UCOL_IDENTICAL) {
   7079             coll->strengthisDefault = FALSE;
   7080             coll->strength = value;
   7081         } else {
   7082             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7083         }
   7084         break;
   7085     case UCOL_ATTRIBUTE_COUNT:
   7086     default:
   7087         *status = U_ILLEGAL_ARGUMENT_ERROR;
   7088         break;
   7089     }
   7090     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
   7091         coll->latinOneRegenTable = TRUE;
   7092     } else {
   7093         coll->latinOneRegenTable = FALSE;
   7094     }
   7095     ucol_updateInternalState(coll, status);
   7096 }
   7097 
   7098 U_CAPI UColAttributeValue  U_EXPORT2
   7099 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
   7100     if(U_FAILURE(*status) || coll == NULL) {
   7101       return UCOL_DEFAULT;
   7102     }
   7103     switch(attr) {
   7104     case UCOL_NUMERIC_COLLATION:
   7105       return coll->numericCollation;
   7106     case UCOL_HIRAGANA_QUATERNARY_MODE:
   7107       return coll->hiraganaQ;
   7108     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   7109         return coll->frenchCollation;
   7110     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   7111         return coll->alternateHandling;
   7112     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   7113         return coll->caseFirst;
   7114     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   7115         return coll->caseLevel;
   7116     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   7117         return coll->normalizationMode;
   7118     case UCOL_STRENGTH:         /* attribute for strength */
   7119         return coll->strength;
   7120     case UCOL_ATTRIBUTE_COUNT:
   7121     default:
   7122         *status = U_ILLEGAL_ARGUMENT_ERROR;
   7123         break;
   7124     }
   7125     return UCOL_DEFAULT;
   7126 }
   7127 
   7128 U_CAPI void U_EXPORT2
   7129 ucol_setStrength(    UCollator                *coll,
   7130             UCollationStrength        strength)
   7131 {
   7132     UErrorCode status = U_ZERO_ERROR;
   7133     ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
   7134 }
   7135 
   7136 U_CAPI UCollationStrength U_EXPORT2
   7137 ucol_getStrength(const UCollator *coll)
   7138 {
   7139     UErrorCode status = U_ZERO_ERROR;
   7140     return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
   7141 }
   7142 
   7143 U_INTERNAL int32_t U_EXPORT2
   7144 ucol_getReorderCodes(const UCollator *coll,
   7145                     int32_t *dest,
   7146                     int32_t destCapacity,
   7147                     UErrorCode *pErrorCode) {
   7148     if (U_FAILURE(*pErrorCode)) {
   7149         return 0;
   7150     }
   7151 
   7152     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
   7153         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   7154         return 0;
   7155     }
   7156 
   7157     if (coll->reorderCodesLength > destCapacity) {
   7158         *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
   7159         return coll->reorderCodesLength;
   7160     }
   7161     for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
   7162         dest[i] = coll->reorderCodes[i];
   7163     }
   7164     return coll->reorderCodesLength;
   7165 }
   7166 
   7167 U_INTERNAL void U_EXPORT2
   7168 ucol_setReorderCodes(UCollator *coll,
   7169                     const int32_t *reorderCodes,
   7170                     int32_t reorderCodesLength,
   7171                     UErrorCode *pErrorCode) {
   7172     if (U_FAILURE(*pErrorCode)) {
   7173         return;
   7174     }
   7175 
   7176     if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
   7177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   7178         return;
   7179     }
   7180 
   7181     uprv_free(coll->reorderCodes);
   7182     coll->reorderCodes = NULL;
   7183     coll->reorderCodesLength = 0;
   7184     if (reorderCodesLength == 0) {
   7185         uprv_free(coll->leadBytePermutationTable);
   7186         coll->leadBytePermutationTable = NULL;
   7187         return;
   7188     }
   7189     coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
   7190     if (coll->reorderCodes == NULL) {
   7191         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
   7192         return;
   7193     }
   7194     for (int32_t i = 0; i < reorderCodesLength; i++) {
   7195         coll->reorderCodes[i] = reorderCodes[i];
   7196     }
   7197     coll->reorderCodesLength = reorderCodesLength;
   7198     ucol_buildPermutationTable(coll, pErrorCode);
   7199     if (U_FAILURE(*pErrorCode)) {
   7200         uprv_free(coll->reorderCodes);
   7201         coll->reorderCodes = NULL;
   7202         coll->reorderCodesLength = 0;
   7203     }
   7204 }
   7205 
   7206 
   7207 /****************************************************************************/
   7208 /* Following are misc functions                                             */
   7209 /* there are new APIs and some compatibility APIs                           */
   7210 /****************************************************************************/
   7211 
   7212 U_CAPI void U_EXPORT2
   7213 ucol_getVersion(const UCollator* coll,
   7214                 UVersionInfo versionInfo)
   7215 {
   7216     /* RunTime version  */
   7217     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
   7218     /* Builder version*/
   7219     uint8_t bdVersion = coll->image->version[0];
   7220 
   7221     /* Charset Version. Need to get the version from cnv files
   7222      * makeconv should populate cnv files with version and
   7223      * an api has to be provided in ucnv.h to obtain this version
   7224      */
   7225     uint8_t csVersion = 0;
   7226 
   7227     /* combine the version info */
   7228     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
   7229 
   7230     /* Tailoring rules */
   7231     versionInfo[0] = (uint8_t)(cmbVersion>>8);
   7232     versionInfo[1] = (uint8_t)cmbVersion;
   7233     versionInfo[2] = coll->image->version[1];
   7234     if(coll->UCA) {
   7235         /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
   7236         versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
   7237     } else {
   7238         versionInfo[3] = 0;
   7239     }
   7240 }
   7241 
   7242 
   7243 /* This internal API checks whether a character is tailored or not */
   7244 U_CAPI UBool  U_EXPORT2
   7245 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
   7246     if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
   7247         return FALSE;
   7248     }
   7249 
   7250     uint32_t CE = UCOL_NOT_FOUND;
   7251     const UChar *ContractionStart = NULL;
   7252     if(u < 0x100) { /* latin-1 */
   7253         CE = coll->latinOneMapping[u];
   7254         if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
   7255             return FALSE;
   7256         }
   7257     } else { /* regular */
   7258         CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
   7259     }
   7260 
   7261     if(isContraction(CE)) {
   7262         ContractionStart = (UChar *)coll->image+getContractOffset(CE);
   7263         CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
   7264     }
   7265 
   7266     return (UBool)(CE != UCOL_NOT_FOUND);
   7267 }
   7268 
   7269 
   7270 /****************************************************************************/
   7271 /* Following are the string compare functions                               */
   7272 /*                                                                          */
   7273 /****************************************************************************/
   7274 
   7275 
   7276 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
   7277 /*                     Used by strcoll if strength == identical and strings  */
   7278 /*                     are otherwise equal.                                  */
   7279 /*                                                                           */
   7280 /*                     Comparison must be done on NFD normalized strings.    */
   7281 /*                     FCD is not good enough.                               */
   7282 
   7283 static
   7284 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
   7285 {
   7286     // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
   7287     // of same type, but that doesn't really mean that it will stay that way.
   7288     int32_t            comparison;
   7289 
   7290     if (sColl->flags & UCOL_USE_ITERATOR) {
   7291         // The division for the array length may truncate the array size to
   7292         // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   7293         // for all platforms anyway.
   7294         UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   7295         UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   7296         UNormIterator *sNIt = NULL, *tNIt = NULL;
   7297         sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   7298         tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   7299         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   7300         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   7301         UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
   7302         UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
   7303         comparison = u_strCompareIter(sIt, tIt, TRUE);
   7304         unorm_closeIter(sNIt);
   7305         unorm_closeIter(tNIt);
   7306     } else {
   7307         int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
   7308         const UChar *sBuf = sColl->string;
   7309         int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
   7310         const UChar *tBuf = tColl->string;
   7311 
   7312         if (normalize) {
   7313             *status = U_ZERO_ERROR;
   7314             // Note: We could use Normalizer::compare() or similar, but for short strings
   7315             // which may not be in FCD it might be faster to just NFD them.
   7316             // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
   7317             // NFD'ing immediately might be faster for long strings,
   7318             // but string comparison is usually done on relatively short strings.
   7319             sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
   7320                                   sColl->writableBuffer,
   7321                                   *status);
   7322             tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
   7323                                   tColl->writableBuffer,
   7324                                   *status);
   7325             if(U_FAILURE(*status)) {
   7326                 return UCOL_LESS;
   7327             }
   7328             comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
   7329         } else {
   7330             comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
   7331         }
   7332     }
   7333 
   7334     if (comparison < 0) {
   7335         return UCOL_LESS;
   7336     } else if (comparison == 0) {
   7337         return UCOL_EQUAL;
   7338     } else /* comparison > 0 */ {
   7339         return UCOL_GREATER;
   7340     }
   7341 }
   7342 
   7343 /*  CEBuf - A struct and some inline functions to handle the saving    */
   7344 /*          of CEs in a buffer within ucol_strcoll                     */
   7345 
   7346 #define UCOL_CEBUF_SIZE 512
   7347 typedef struct ucol_CEBuf {
   7348     uint32_t    *buf;
   7349     uint32_t    *endp;
   7350     uint32_t    *pos;
   7351     uint32_t     localArray[UCOL_CEBUF_SIZE];
   7352 } ucol_CEBuf;
   7353 
   7354 
   7355 static
   7356 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
   7357     (b)->buf = (b)->pos = (b)->localArray;
   7358     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
   7359 }
   7360 
   7361 static
   7362 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
   7363     uint32_t  oldSize;
   7364     uint32_t  newSize;
   7365     uint32_t  *newBuf;
   7366 
   7367     ci->flags |= UCOL_ITER_ALLOCATED;
   7368     oldSize = (uint32_t)(b->pos - b->buf);
   7369     newSize = oldSize * 2;
   7370     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
   7371     if(newBuf == NULL) {
   7372         *status = U_MEMORY_ALLOCATION_ERROR;
   7373     }
   7374     else {
   7375         uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
   7376         if (b->buf != b->localArray) {
   7377             uprv_free(b->buf);
   7378         }
   7379         b->buf = newBuf;
   7380         b->endp = b->buf + newSize;
   7381         b->pos  = b->buf + oldSize;
   7382     }
   7383 }
   7384 
   7385 static
   7386 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
   7387     if (b->pos == b->endp) {
   7388         ucol_CEBuf_Expand(b, ci, status);
   7389     }
   7390     if (U_SUCCESS(*status)) {
   7391         *(b)->pos++ = ce;
   7392     }
   7393 }
   7394 
   7395 /* This is a trick string compare function that goes in and uses sortkeys to compare */
   7396 /* It is used when compare gets in trouble and needs to bail out                     */
   7397 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
   7398                                                   collIterate *tColl,
   7399                                                   UErrorCode *status)
   7400 {
   7401     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
   7402     uint8_t *sourceKeyP = sourceKey;
   7403     uint8_t *targetKeyP = targetKey;
   7404     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
   7405     const UCollator *coll = sColl->coll;
   7406     const UChar *source = NULL;
   7407     const UChar *target = NULL;
   7408     int32_t result = UCOL_EQUAL;
   7409     UnicodeString sourceString, targetString;
   7410     int32_t sourceLength;
   7411     int32_t targetLength;
   7412 
   7413     if(sColl->flags & UCOL_USE_ITERATOR) {
   7414         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   7415         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   7416         UChar32 c;
   7417         while((c=sColl->iterator->next(sColl->iterator))>=0) {
   7418             sourceString.append((UChar)c);
   7419         }
   7420         while((c=tColl->iterator->next(tColl->iterator))>=0) {
   7421             targetString.append((UChar)c);
   7422         }
   7423         source = sourceString.getBuffer();
   7424         sourceLength = sourceString.length();
   7425         target = targetString.getBuffer();
   7426         targetLength = targetString.length();
   7427     } else { // no iterators
   7428         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
   7429         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
   7430         source = sColl->string;
   7431         target = tColl->string;
   7432     }
   7433 
   7434 
   7435 
   7436     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7437     if(sourceKeyLen > UCOL_MAX_BUFFER) {
   7438         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
   7439         if(sourceKeyP == NULL) {
   7440             *status = U_MEMORY_ALLOCATION_ERROR;
   7441             goto cleanup_and_do_compare;
   7442         }
   7443         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7444     }
   7445 
   7446     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7447     if(targetKeyLen > UCOL_MAX_BUFFER) {
   7448         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
   7449         if(targetKeyP == NULL) {
   7450             *status = U_MEMORY_ALLOCATION_ERROR;
   7451             goto cleanup_and_do_compare;
   7452         }
   7453         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7454     }
   7455 
   7456     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
   7457 
   7458 cleanup_and_do_compare:
   7459     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
   7460         uprv_free(sourceKeyP);
   7461     }
   7462 
   7463     if(targetKeyP != NULL && targetKeyP != targetKey) {
   7464         uprv_free(targetKeyP);
   7465     }
   7466 
   7467     if(result<0) {
   7468         return UCOL_LESS;
   7469     } else if(result>0) {
   7470         return UCOL_GREATER;
   7471     } else {
   7472         return UCOL_EQUAL;
   7473     }
   7474 }
   7475 
   7476 
   7477 static UCollationResult
   7478 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
   7479 {
   7480     U_ALIGN_CODE(16);
   7481 
   7482     const UCollator *coll = sColl->coll;
   7483 
   7484 
   7485     // setting up the collator parameters
   7486     UColAttributeValue strength = coll->strength;
   7487     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
   7488 
   7489     UBool checkSecTer = initialCheckSecTer;
   7490     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
   7491     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
   7492     UBool checkIdent = (strength == UCOL_IDENTICAL);
   7493     UBool checkCase = (coll->caseLevel == UCOL_ON);
   7494     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
   7495     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
   7496     UBool qShifted = shifted && checkQuad;
   7497     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
   7498 
   7499     if(doHiragana && shifted) {
   7500         return (ucol_compareUsingSortKeys(sColl, tColl, status));
   7501     }
   7502     uint8_t caseSwitch = coll->caseSwitch;
   7503     uint8_t tertiaryMask = coll->tertiaryMask;
   7504 
   7505     // This is the lowest primary value that will not be ignored if shifted
   7506     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
   7507 
   7508     UCollationResult result = UCOL_EQUAL;
   7509     UCollationResult hirResult = UCOL_EQUAL;
   7510 
   7511     // Preparing the CE buffers. They will be filled during the primary phase
   7512     ucol_CEBuf   sCEs;
   7513     ucol_CEBuf   tCEs;
   7514     UCOL_INIT_CEBUF(&sCEs);
   7515     UCOL_INIT_CEBUF(&tCEs);
   7516 
   7517     uint32_t secS = 0, secT = 0;
   7518     uint32_t sOrder=0, tOrder=0;
   7519 
   7520     // Non shifted primary processing is quite simple
   7521     if(!shifted) {
   7522         for(;;) {
   7523 
   7524             // We fetch CEs until we hit a non ignorable primary or end.
   7525             do {
   7526                 // We get the next CE
   7527                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7528                 // Stuff it in the buffer
   7529                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7530                 // And keep just the primary part.
   7531                 sOrder &= UCOL_PRIMARYMASK;
   7532             } while(sOrder == 0);
   7533 
   7534             // see the comments on the above block
   7535             do {
   7536                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7537                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7538                 tOrder &= UCOL_PRIMARYMASK;
   7539             } while(tOrder == 0);
   7540 
   7541             // if both primaries are the same
   7542             if(sOrder == tOrder) {
   7543                 // and there are no more CEs, we advance to the next level
   7544                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7545                     break;
   7546                 }
   7547                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7548                     if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
   7549                         hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
   7550                             ? UCOL_LESS:UCOL_GREATER;
   7551                     }
   7552                 }
   7553             } else {
   7554                 // only need to check one for continuation
   7555                 // if one is then the other must be or the preceding CE would be a prefix of the other
   7556                 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
   7557                     sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
   7558                     tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
   7559                 }
   7560                 // if two primaries are different, we are done
   7561                 result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
   7562                 goto commonReturn;
   7563             }
   7564         } // no primary difference... do the rest from the buffers
   7565     } else { // shifted - do a slightly more complicated processing :)
   7566         for(;;) {
   7567             UBool sInShifted = FALSE;
   7568             UBool tInShifted = FALSE;
   7569             // This version of code can be refactored. However, it seems easier to understand this way.
   7570             // Source loop. Sam as the target loop.
   7571             for(;;) {
   7572                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7573                 if(sOrder == UCOL_NO_MORE_CES) {
   7574                     UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7575                     break;
   7576                 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
   7577                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7578                     continue;
   7579                 } else if(isContinuation(sOrder)) {
   7580                     if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7581                         if(sInShifted) {
   7582                             sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7583                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7584                             continue;
   7585                         } else {
   7586                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7587                             break;
   7588                         }
   7589                     } else { /* Just lower level values */
   7590                         if(sInShifted) {
   7591                             continue;
   7592                         } else {
   7593                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7594                             continue;
   7595                         }
   7596                     }
   7597                 } else { /* regular */
   7598                     if(coll->leadBytePermutationTable != NULL){
   7599                         sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
   7600                     }
   7601                     if((sOrder & UCOL_PRIMARYMASK) > LVT) {
   7602                         UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7603                         break;
   7604                     } else {
   7605                         if((sOrder & UCOL_PRIMARYMASK) > 0) {
   7606                             sInShifted = TRUE;
   7607                             sOrder &= UCOL_PRIMARYMASK;
   7608                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7609                             continue;
   7610                         } else {
   7611                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7612                             sInShifted = FALSE;
   7613                             continue;
   7614                         }
   7615                     }
   7616                 }
   7617             }
   7618             sOrder &= UCOL_PRIMARYMASK;
   7619             sInShifted = FALSE;
   7620 
   7621             for(;;) {
   7622                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7623                 if(tOrder == UCOL_NO_MORE_CES) {
   7624                     UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7625                     break;
   7626                 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
   7627                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7628                     continue;
   7629                 } else if(isContinuation(tOrder)) {
   7630                     if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7631                         if(tInShifted) {
   7632                             tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7633                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7634                             continue;
   7635                         } else {
   7636                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7637                             break;
   7638                         }
   7639                     } else { /* Just lower level values */
   7640                         if(tInShifted) {
   7641                             continue;
   7642                         } else {
   7643                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7644                             continue;
   7645                         }
   7646                     }
   7647                 } else { /* regular */
   7648                     if(coll->leadBytePermutationTable != NULL){
   7649                         tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
   7650                     }
   7651                     if((tOrder & UCOL_PRIMARYMASK) > LVT) {
   7652                         UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7653                         break;
   7654                     } else {
   7655                         if((tOrder & UCOL_PRIMARYMASK) > 0) {
   7656                             tInShifted = TRUE;
   7657                             tOrder &= UCOL_PRIMARYMASK;
   7658                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7659                             continue;
   7660                         } else {
   7661                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7662                             tInShifted = FALSE;
   7663                             continue;
   7664                         }
   7665                     }
   7666                 }
   7667             }
   7668             tOrder &= UCOL_PRIMARYMASK;
   7669             tInShifted = FALSE;
   7670 
   7671             if(sOrder == tOrder) {
   7672                 /*
   7673                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7674                 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
   7675                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
   7676                 ? UCOL_LESS:UCOL_GREATER;
   7677                 }
   7678                 }
   7679                 */
   7680                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7681                     break;
   7682                 } else {
   7683                     sOrder = 0;
   7684                     tOrder = 0;
   7685                     continue;
   7686                 }
   7687             } else {
   7688                 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
   7689                 goto commonReturn;
   7690             }
   7691         } /* no primary difference... do the rest from the buffers */
   7692     }
   7693 
   7694     /* now, we're gonna reexamine collected CEs */
   7695     uint32_t    *sCE;
   7696     uint32_t    *tCE;
   7697 
   7698     /* This is the secondary level of comparison */
   7699     if(checkSecTer) {
   7700         if(!isFrenchSec) { /* normal */
   7701             sCE = sCEs.buf;
   7702             tCE = tCEs.buf;
   7703             for(;;) {
   7704                 while (secS == 0) {
   7705                     secS = *(sCE++) & UCOL_SECONDARYMASK;
   7706                 }
   7707 
   7708                 while(secT == 0) {
   7709                     secT = *(tCE++) & UCOL_SECONDARYMASK;
   7710                 }
   7711 
   7712                 if(secS == secT) {
   7713                     if(secS == UCOL_NO_MORE_CES_SECONDARY) {
   7714                         break;
   7715                     } else {
   7716                         secS = 0; secT = 0;
   7717                         continue;
   7718                     }
   7719                 } else {
   7720                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7721                     goto commonReturn;
   7722                 }
   7723             }
   7724         } else { /* do the French */
   7725             uint32_t *sCESave = NULL;
   7726             uint32_t *tCESave = NULL;
   7727             sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
   7728             tCE = tCEs.pos-2;
   7729             for(;;) {
   7730                 while (secS == 0 && sCE >= sCEs.buf) {
   7731                     if(sCESave == NULL) {
   7732                         secS = *(sCE--);
   7733                         if(isContinuation(secS)) {
   7734                             while(isContinuation(secS = *(sCE--)))
   7735                                 ;
   7736                             /* after this, secS has the start of continuation, and sCEs points before that */
   7737                             sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7738                             sCE+=2;  /* need to point to the first continuation CP */
   7739                             /* However, now you can just continue doing stuff */
   7740                         }
   7741                     } else {
   7742                         secS = *(sCE++);
   7743                         if(!isContinuation(secS)) { /* This means we have finished with this cont */
   7744                             sCE = sCESave;            /* reset the pointer to before continuation */
   7745                             sCESave = NULL;
   7746                             secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
   7747                             continue;
   7748                         }
   7749                     }
   7750                     secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7751                 }
   7752 
   7753                 while(secT == 0 && tCE >= tCEs.buf) {
   7754                     if(tCESave == NULL) {
   7755                         secT = *(tCE--);
   7756                         if(isContinuation(secT)) {
   7757                             while(isContinuation(secT = *(tCE--)))
   7758                                 ;
   7759                             /* after this, secS has the start of continuation, and sCEs points before that */
   7760                             tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7761                             tCE+=2;  /* need to point to the first continuation CP */
   7762                             /* However, now you can just continue doing stuff */
   7763                         }
   7764                     } else {
   7765                         secT = *(tCE++);
   7766                         if(!isContinuation(secT)) { /* This means we have finished with this cont */
   7767                             tCE = tCESave;          /* reset the pointer to before continuation */
   7768                             tCESave = NULL;
   7769                             secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
   7770                             continue;
   7771                         }
   7772                     }
   7773                     secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7774                 }
   7775 
   7776                 if(secS == secT) {
   7777                     if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
   7778                         break;
   7779                     } else {
   7780                         secS = 0; secT = 0;
   7781                         continue;
   7782                     }
   7783                 } else {
   7784                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7785                     goto commonReturn;
   7786                 }
   7787             }
   7788         }
   7789     }
   7790 
   7791     /* doing the case bit */
   7792     if(checkCase) {
   7793         sCE = sCEs.buf;
   7794         tCE = tCEs.buf;
   7795         for(;;) {
   7796             while((secS & UCOL_REMOVE_CASE) == 0) {
   7797                 if(!isContinuation(*sCE++)) {
   7798                     secS =*(sCE-1);
   7799                     if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7800                         // primary ignorables should not be considered on the case level when the strength is primary
   7801                         // otherwise, the CEs stop being well-formed
   7802                         secS &= UCOL_TERT_CASE_MASK;
   7803                         secS ^= caseSwitch;
   7804                     } else {
   7805                         secS = 0;
   7806                     }
   7807                 } else {
   7808                     secS = 0;
   7809                 }
   7810             }
   7811 
   7812             while((secT & UCOL_REMOVE_CASE) == 0) {
   7813                 if(!isContinuation(*tCE++)) {
   7814                     secT = *(tCE-1);
   7815                     if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7816                         // primary ignorables should not be considered on the case level when the strength is primary
   7817                         // otherwise, the CEs stop being well-formed
   7818                         secT &= UCOL_TERT_CASE_MASK;
   7819                         secT ^= caseSwitch;
   7820                     } else {
   7821                         secT = 0;
   7822                     }
   7823                 } else {
   7824                     secT = 0;
   7825                 }
   7826             }
   7827 
   7828             if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
   7829                 result = UCOL_LESS;
   7830                 goto commonReturn;
   7831             } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
   7832                 result = UCOL_GREATER;
   7833                 goto commonReturn;
   7834             }
   7835 
   7836             if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
   7837                 break;
   7838             } else {
   7839                 secS = 0;
   7840                 secT = 0;
   7841             }
   7842         }
   7843     }
   7844 
   7845     /* Tertiary level */
   7846     if(checkTertiary) {
   7847         secS = 0;
   7848         secT = 0;
   7849         sCE = sCEs.buf;
   7850         tCE = tCEs.buf;
   7851         for(;;) {
   7852             while((secS & UCOL_REMOVE_CASE) == 0) {
   7853                 secS = *(sCE++) & tertiaryMask;
   7854                 if(!isContinuation(secS)) {
   7855                     secS ^= caseSwitch;
   7856                 } else {
   7857                     secS &= UCOL_REMOVE_CASE;
   7858                 }
   7859             }
   7860 
   7861             while((secT & UCOL_REMOVE_CASE)  == 0) {
   7862                 secT = *(tCE++) & tertiaryMask;
   7863                 if(!isContinuation(secT)) {
   7864                     secT ^= caseSwitch;
   7865                 } else {
   7866                     secT &= UCOL_REMOVE_CASE;
   7867                 }
   7868             }
   7869 
   7870             if(secS == secT) {
   7871                 if((secS & UCOL_REMOVE_CASE) == 1) {
   7872                     break;
   7873                 } else {
   7874                     secS = 0; secT = 0;
   7875                     continue;
   7876                 }
   7877             } else {
   7878                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7879                 goto commonReturn;
   7880             }
   7881         }
   7882     }
   7883 
   7884 
   7885     if(qShifted /*checkQuad*/) {
   7886         UBool sInShifted = TRUE;
   7887         UBool tInShifted = TRUE;
   7888         secS = 0;
   7889         secT = 0;
   7890         sCE = sCEs.buf;
   7891         tCE = tCEs.buf;
   7892         for(;;) {
   7893             while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
   7894                 secS = *(sCE++);
   7895                 if(isContinuation(secS)) {
   7896                     if(!sInShifted) {
   7897                         continue;
   7898                     }
   7899                 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
   7900                     secS = UCOL_PRIMARYMASK;
   7901                     sInShifted = FALSE;
   7902                 } else {
   7903                     sInShifted = TRUE;
   7904                 }
   7905             }
   7906             secS &= UCOL_PRIMARYMASK;
   7907 
   7908 
   7909             while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
   7910                 secT = *(tCE++);
   7911                 if(isContinuation(secT)) {
   7912                     if(!tInShifted) {
   7913                         continue;
   7914                     }
   7915                 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
   7916                     secT = UCOL_PRIMARYMASK;
   7917                     tInShifted = FALSE;
   7918                 } else {
   7919                     tInShifted = TRUE;
   7920                 }
   7921             }
   7922             secT &= UCOL_PRIMARYMASK;
   7923 
   7924             if(secS == secT) {
   7925                 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
   7926                     break;
   7927                 } else {
   7928                     secS = 0; secT = 0;
   7929                     continue;
   7930                 }
   7931             } else {
   7932                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7933                 goto commonReturn;
   7934             }
   7935         }
   7936     } else if(doHiragana && hirResult != UCOL_EQUAL) {
   7937         // If we're fine on quaternaries, we might be different
   7938         // on Hiragana. This, however, might fail us in shifted.
   7939         result = hirResult;
   7940         goto commonReturn;
   7941     }
   7942 
   7943     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
   7944     /*  as a tiebreaker if all else is equal.                                */
   7945     /*  Getting here  should be quite rare - strings are not identical -     */
   7946     /*     that is checked first, but compared == through all other checks.  */
   7947     if(checkIdent)
   7948     {
   7949         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
   7950         result = ucol_checkIdent(sColl, tColl, TRUE, status);
   7951     }
   7952 
   7953 commonReturn:
   7954     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
   7955         if (sCEs.buf != sCEs.localArray ) {
   7956             uprv_free(sCEs.buf);
   7957         }
   7958         if (tCEs.buf != tCEs.localArray ) {
   7959             uprv_free(tCEs.buf);
   7960         }
   7961     }
   7962 
   7963     return result;
   7964 }
   7965 
   7966 static UCollationResult
   7967 ucol_strcollRegular(const UCollator *coll,
   7968                     const UChar *source, int32_t sourceLength,
   7969                     const UChar *target, int32_t targetLength,
   7970                     UErrorCode *status) {
   7971     collIterate sColl, tColl;
   7972     // Preparing the context objects for iterating over strings
   7973     IInit_collIterate(coll, source, sourceLength, &sColl, status);
   7974     IInit_collIterate(coll, target, targetLength, &tColl, status);
   7975     if(U_FAILURE(*status)) {
   7976         return UCOL_LESS;
   7977     }
   7978     return ucol_strcollRegular(&sColl, &tColl, status);
   7979 }
   7980 
   7981 static inline uint32_t
   7982 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
   7983                           uint32_t CE, const UChar *s, int32_t *index, int32_t len)
   7984 {
   7985     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
   7986     int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
   7987     int32_t offset = 1;
   7988     UChar schar = 0, tchar = 0;
   7989 
   7990     for(;;) {
   7991         if(len == -1) {
   7992             if(s[*index] == 0) { // end of string
   7993                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   7994             } else {
   7995                 schar = s[*index];
   7996             }
   7997         } else {
   7998             if(*index == len) {
   7999                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   8000             } else {
   8001                 schar = s[*index];
   8002             }
   8003         }
   8004 
   8005         while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   8006             offset++;
   8007         }
   8008 
   8009         if (schar == tchar) {
   8010             (*index)++;
   8011             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
   8012         }
   8013         else
   8014         {
   8015             if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
   8016                 return UCOL_BAIL_OUT_CE;
   8017             }
   8018             // skip completely ignorables
   8019             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   8020             if(isZeroCE == 0) { // we have to ignore completely ignorables
   8021                 (*index)++;
   8022                 continue;
   8023             }
   8024 
   8025             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   8026         }
   8027     }
   8028 }
   8029 
   8030 
   8031 /**
   8032  * This is a fast strcoll, geared towards text in Latin-1.
   8033  * It supports contractions of size two, French secondaries
   8034  * and case switching. You can use it with strengths primary
   8035  * to tertiary. It does not support shifted and case level.
   8036  * It relies on the table build by setupLatin1Table. If it
   8037  * doesn't understand something, it will go to the regular
   8038  * strcoll.
   8039  */
   8040 static UCollationResult
   8041 ucol_strcollUseLatin1( const UCollator    *coll,
   8042               const UChar        *source,
   8043               int32_t            sLen,
   8044               const UChar        *target,
   8045               int32_t            tLen,
   8046               UErrorCode *status)
   8047 {
   8048     U_ALIGN_CODE(16);
   8049     int32_t strength = coll->strength;
   8050 
   8051     int32_t sIndex = 0, tIndex = 0;
   8052     UChar sChar = 0, tChar = 0;
   8053     uint32_t sOrder=0, tOrder=0;
   8054 
   8055     UBool endOfSource = FALSE;
   8056 
   8057     uint32_t *elements = coll->latinOneCEs;
   8058 
   8059     UBool haveContractions = FALSE; // if we have contractions in our string
   8060                                     // we cannot do French secondary
   8061 
   8062     // Do the primary level
   8063     for(;;) {
   8064         while(sOrder==0) { // this loop skips primary ignorables
   8065             // sOrder=getNextlatinOneCE(source);
   8066             if(sLen==-1) {   // handling zero terminated strings
   8067                 sChar=source[sIndex++];
   8068                 if(sChar==0) {
   8069                     endOfSource = TRUE;
   8070                     break;
   8071                 }
   8072             } else {        // handling strings with known length
   8073                 if(sIndex==sLen) {
   8074                     endOfSource = TRUE;
   8075                     break;
   8076                 }
   8077                 sChar=source[sIndex++];
   8078             }
   8079             if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   8080                 //fprintf(stderr, "R");
   8081                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8082             }
   8083             sOrder = elements[sChar];
   8084             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
   8085                 // specials can basically be either contractions or bail-out signs. If we get anything
   8086                 // else, we'll bail out anywasy
   8087                 if(getCETag(sOrder) == CONTRACTION_TAG) {
   8088                     sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
   8089                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
   8090                     // However, if there are contractions in the table, but we always use just one char,
   8091                     // we might be able to do French. This should be checked out.
   8092                 }
   8093                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   8094                     //fprintf(stderr, "S");
   8095                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8096                 }
   8097             }
   8098         }
   8099 
   8100         while(tOrder==0) {  // this loop skips primary ignorables
   8101             // tOrder=getNextlatinOneCE(target);
   8102             if(tLen==-1) {    // handling zero terminated strings
   8103                 tChar=target[tIndex++];
   8104                 if(tChar==0) {
   8105                     if(endOfSource) { // this is different than source loop,
   8106                         // as we already know that source loop is done here,
   8107                         // so we can either finish the primary loop if both
   8108                         // strings are done or anounce the result if only
   8109                         // target is done. Same below.
   8110                         goto endOfPrimLoop;
   8111                     } else {
   8112                         return UCOL_GREATER;
   8113                     }
   8114                 }
   8115             } else {          // handling strings with known length
   8116                 if(tIndex==tLen) {
   8117                     if(endOfSource) {
   8118                         goto endOfPrimLoop;
   8119                     } else {
   8120                         return UCOL_GREATER;
   8121                     }
   8122                 }
   8123                 tChar=target[tIndex++];
   8124             }
   8125             if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   8126                 //fprintf(stderr, "R");
   8127                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8128             }
   8129             tOrder = elements[tChar];
   8130             if(tOrder >= UCOL_NOT_FOUND) {
   8131                 // Handling specials, see the comments for source
   8132                 if(getCETag(tOrder) == CONTRACTION_TAG) {
   8133                     tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
   8134                     haveContractions = TRUE;
   8135                 }
   8136                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   8137                     //fprintf(stderr, "S");
   8138                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8139                 }
   8140             }
   8141         }
   8142         if(endOfSource) { // source is finished, but target is not, say the result.
   8143             return UCOL_LESS;
   8144         }
   8145 
   8146         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
   8147             sOrder = 0; tOrder = 0;
   8148             continue;
   8149         } else {
   8150             // compare current top bytes
   8151             if(((sOrder^tOrder)&0xFF000000)!=0) {
   8152                 // top bytes differ, return difference
   8153                 if(sOrder < tOrder) {
   8154                     return UCOL_LESS;
   8155                 } else if(sOrder > tOrder) {
   8156                     return UCOL_GREATER;
   8157                 }
   8158                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
   8159                 // since we must return enum value
   8160             }
   8161 
   8162             // top bytes match, continue with following bytes
   8163             sOrder<<=8;
   8164             tOrder<<=8;
   8165         }
   8166     }
   8167 
   8168 endOfPrimLoop:
   8169     // after primary loop, we definitely know the sizes of strings,
   8170     // so we set it and use simpler loop for secondaries and tertiaries
   8171     sLen = sIndex; tLen = tIndex;
   8172     if(strength >= UCOL_SECONDARY) {
   8173         // adjust the table beggining
   8174         elements += coll->latinOneTableLen;
   8175         endOfSource = FALSE;
   8176 
   8177         if(coll->frenchCollation == UCOL_OFF) { // non French
   8178             // This loop is a simplified copy of primary loop
   8179             // at this point we know that whole strings are latin-1, so we don't
   8180             // check for that. We also know that we only have contractions as
   8181             // specials.
   8182             sIndex = 0; tIndex = 0;
   8183             for(;;) {
   8184                 while(sOrder==0) {
   8185                     if(sIndex==sLen) {
   8186                         endOfSource = TRUE;
   8187                         break;
   8188                     }
   8189                     sChar=source[sIndex++];
   8190                     sOrder = elements[sChar];
   8191                     if(sOrder > UCOL_NOT_FOUND) {
   8192                         sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
   8193                     }
   8194                 }
   8195 
   8196                 while(tOrder==0) {
   8197                     if(tIndex==tLen) {
   8198                         if(endOfSource) {
   8199                             goto endOfSecLoop;
   8200                         } else {
   8201                             return UCOL_GREATER;
   8202                         }
   8203                     }
   8204                     tChar=target[tIndex++];
   8205                     tOrder = elements[tChar];
   8206                     if(tOrder > UCOL_NOT_FOUND) {
   8207                         tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
   8208                     }
   8209                 }
   8210                 if(endOfSource) {
   8211                     return UCOL_LESS;
   8212                 }
   8213 
   8214                 if(sOrder == tOrder) {
   8215                     sOrder = 0; tOrder = 0;
   8216                     continue;
   8217                 } else {
   8218                     // see primary loop for comments on this
   8219                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   8220                         if(sOrder < tOrder) {
   8221                             return UCOL_LESS;
   8222                         } else if(sOrder > tOrder) {
   8223                             return UCOL_GREATER;
   8224                         }
   8225                     }
   8226                     sOrder<<=8;
   8227                     tOrder<<=8;
   8228                 }
   8229             }
   8230         } else { // French
   8231             if(haveContractions) { // if we have contractions, we have to bail out
   8232                 // since we don't really know how to handle them here
   8233                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8234             }
   8235             // For French, we go backwards
   8236             sIndex = sLen; tIndex = tLen;
   8237             for(;;) {
   8238                 while(sOrder==0) {
   8239                     if(sIndex==0) {
   8240                         endOfSource = TRUE;
   8241                         break;
   8242                     }
   8243                     sChar=source[--sIndex];
   8244                     sOrder = elements[sChar];
   8245                     // don't even look for contractions
   8246                 }
   8247 
   8248                 while(tOrder==0) {
   8249                     if(tIndex==0) {
   8250                         if(endOfSource) {
   8251                             goto endOfSecLoop;
   8252                         } else {
   8253                             return UCOL_GREATER;
   8254                         }
   8255                     }
   8256                     tChar=target[--tIndex];
   8257                     tOrder = elements[tChar];
   8258                     // don't even look for contractions
   8259                 }
   8260                 if(endOfSource) {
   8261                     return UCOL_LESS;
   8262                 }
   8263 
   8264                 if(sOrder == tOrder) {
   8265                     sOrder = 0; tOrder = 0;
   8266                     continue;
   8267                 } else {
   8268                     // see the primary loop for comments
   8269                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   8270                         if(sOrder < tOrder) {
   8271                             return UCOL_LESS;
   8272                         } else if(sOrder > tOrder) {
   8273                             return UCOL_GREATER;
   8274                         }
   8275                     }
   8276                     sOrder<<=8;
   8277                     tOrder<<=8;
   8278                 }
   8279             }
   8280         }
   8281     }
   8282 
   8283 endOfSecLoop:
   8284     if(strength >= UCOL_TERTIARY) {
   8285         // tertiary loop is the same as secondary (except no French)
   8286         elements += coll->latinOneTableLen;
   8287         sIndex = 0; tIndex = 0;
   8288         endOfSource = FALSE;
   8289         for(;;) {
   8290             while(sOrder==0) {
   8291                 if(sIndex==sLen) {
   8292                     endOfSource = TRUE;
   8293                     break;
   8294                 }
   8295                 sChar=source[sIndex++];
   8296                 sOrder = elements[sChar];
   8297                 if(sOrder > UCOL_NOT_FOUND) {
   8298                     sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
   8299                 }
   8300             }
   8301             while(tOrder==0) {
   8302                 if(tIndex==tLen) {
   8303                     if(endOfSource) {
   8304                         return UCOL_EQUAL; // if both strings are at the end, they are equal
   8305                     } else {
   8306                         return UCOL_GREATER;
   8307                     }
   8308                 }
   8309                 tChar=target[tIndex++];
   8310                 tOrder = elements[tChar];
   8311                 if(tOrder > UCOL_NOT_FOUND) {
   8312                     tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
   8313                 }
   8314             }
   8315             if(endOfSource) {
   8316                 return UCOL_LESS;
   8317             }
   8318             if(sOrder == tOrder) {
   8319                 sOrder = 0; tOrder = 0;
   8320                 continue;
   8321             } else {
   8322                 if(((sOrder^tOrder)&0xff000000)!=0) {
   8323                     if(sOrder < tOrder) {
   8324                         return UCOL_LESS;
   8325                     } else if(sOrder > tOrder) {
   8326                         return UCOL_GREATER;
   8327                     }
   8328                 }
   8329                 sOrder<<=8;
   8330                 tOrder<<=8;
   8331             }
   8332         }
   8333     }
   8334     return UCOL_EQUAL;
   8335 }
   8336 
   8337 
   8338 U_CAPI UCollationResult U_EXPORT2
   8339 ucol_strcollIter( const UCollator    *coll,
   8340                  UCharIterator *sIter,
   8341                  UCharIterator *tIter,
   8342                  UErrorCode         *status)
   8343 {
   8344     if(!status || U_FAILURE(*status)) {
   8345         return UCOL_EQUAL;
   8346     }
   8347 
   8348     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
   8349     UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
   8350 
   8351     if (sIter == tIter) {
   8352         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8353         return UCOL_EQUAL;
   8354     }
   8355     if(sIter == NULL || tIter == NULL || coll == NULL) {
   8356         *status = U_ILLEGAL_ARGUMENT_ERROR;
   8357         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8358         return UCOL_EQUAL;
   8359     }
   8360 
   8361     UCollationResult result = UCOL_EQUAL;
   8362 
   8363     // Preparing the context objects for iterating over strings
   8364     collIterate sColl, tColl;
   8365     IInit_collIterate(coll, NULL, -1, &sColl, status);
   8366     IInit_collIterate(coll, NULL, -1, &tColl, status);
   8367     if(U_FAILURE(*status)) {
   8368         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8369         return UCOL_EQUAL;
   8370     }
   8371     // The division for the array length may truncate the array size to
   8372     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   8373     // for all platforms anyway.
   8374     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8375     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8376     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
   8377 
   8378     sColl.iterator = sIter;
   8379     sColl.flags |= UCOL_USE_ITERATOR;
   8380     tColl.flags |= UCOL_USE_ITERATOR;
   8381     tColl.iterator = tIter;
   8382 
   8383     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
   8384         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   8385         sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
   8386         sColl.flags &= ~UCOL_ITER_NORM;
   8387 
   8388         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   8389         tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
   8390         tColl.flags &= ~UCOL_ITER_NORM;
   8391     }
   8392 
   8393     UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
   8394 
   8395     while((sChar = sColl.iterator->next(sColl.iterator)) ==
   8396         (tChar = tColl.iterator->next(tColl.iterator))) {
   8397             if(sChar == U_SENTINEL) {
   8398                 result = UCOL_EQUAL;
   8399                 goto end_compare;
   8400             }
   8401     }
   8402 
   8403     if(sChar == U_SENTINEL) {
   8404         tChar = tColl.iterator->previous(tColl.iterator);
   8405     }
   8406 
   8407     if(tChar == U_SENTINEL) {
   8408         sChar = sColl.iterator->previous(sColl.iterator);
   8409     }
   8410 
   8411     sChar = sColl.iterator->previous(sColl.iterator);
   8412     tChar = tColl.iterator->previous(tColl.iterator);
   8413 
   8414     if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
   8415     {
   8416         // We are stopped in the middle of a contraction.
   8417         // Scan backwards through the == part of the string looking for the start of the contraction.
   8418         //   It doesn't matter which string we scan, since they are the same in this region.
   8419         do
   8420         {
   8421             sChar = sColl.iterator->previous(sColl.iterator);
   8422             tChar = tColl.iterator->previous(tColl.iterator);
   8423         }
   8424         while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
   8425     }
   8426 
   8427 
   8428     if(U_SUCCESS(*status)) {
   8429         result = ucol_strcollRegular(&sColl, &tColl, status);
   8430     }
   8431 
   8432 end_compare:
   8433     if(sNormIter || tNormIter) {
   8434         unorm_closeIter(sNormIter);
   8435         unorm_closeIter(tNormIter);
   8436     }
   8437 
   8438     UTRACE_EXIT_VALUE_STATUS(result, *status)
   8439     return result;
   8440 }
   8441 
   8442 
   8443 /*                                                                      */
   8444 /* ucol_strcoll     Main public API string comparison function          */
   8445 /*                                                                      */
   8446 U_CAPI UCollationResult U_EXPORT2
   8447 ucol_strcoll( const UCollator    *coll,
   8448               const UChar        *source,
   8449               int32_t            sourceLength,
   8450               const UChar        *target,
   8451               int32_t            targetLength)
   8452 {
   8453     U_ALIGN_CODE(16);
   8454 
   8455     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
   8456     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   8457         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
   8458         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
   8459         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
   8460     }
   8461 
   8462     if(source == NULL || target == NULL) {
   8463         // do not crash, but return. Should have
   8464         // status argument to return error.
   8465         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8466         return UCOL_EQUAL;
   8467     }
   8468 
   8469     /* Quick check if source and target are same strings. */
   8470     /* They should either both be NULL terminated or the explicit length should be set on both. */
   8471     if (source==target && sourceLength==targetLength) {
   8472         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8473         return UCOL_EQUAL;
   8474     }
   8475 
   8476     /* Scan the strings.  Find:                                                             */
   8477     /*    The length of any leading portion that is equal                                   */
   8478     /*    Whether they are exactly equal.  (in which case we just return)                   */
   8479     const UChar    *pSrc    = source;
   8480     const UChar    *pTarg   = target;
   8481     int32_t        equalLength;
   8482 
   8483     if (sourceLength == -1 && targetLength == -1) {
   8484         // Both strings are null terminated.
   8485         //    Scan through any leading equal portion.
   8486         while (*pSrc == *pTarg && *pSrc != 0) {
   8487             pSrc++;
   8488             pTarg++;
   8489         }
   8490         if (*pSrc == 0 && *pTarg == 0) {
   8491             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8492             return UCOL_EQUAL;
   8493         }
   8494         equalLength = (int32_t)(pSrc - source);
   8495     }
   8496     else
   8497     {
   8498         // One or both strings has an explicit length.
   8499         const UChar    *pSrcEnd = source + sourceLength;
   8500         const UChar    *pTargEnd = target + targetLength;
   8501 
   8502         // Scan while the strings are bitwise ==, or until one is exhausted.
   8503         for (;;) {
   8504             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
   8505                 break;
   8506             }
   8507             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
   8508                 break;
   8509             }
   8510             if (*pSrc != *pTarg) {
   8511                 break;
   8512             }
   8513             pSrc++;
   8514             pTarg++;
   8515         }
   8516         equalLength = (int32_t)(pSrc - source);
   8517 
   8518         // If we made it all the way through both strings, we are done.  They are ==
   8519         if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
   8520             (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
   8521         {
   8522             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8523             return UCOL_EQUAL;
   8524         }
   8525     }
   8526     if (equalLength > 0) {
   8527         /* There is an identical portion at the beginning of the two strings.        */
   8528         /*   If the identical portion ends within a contraction or a combining       */
   8529         /*   character sequence, back up to the start of that sequence.              */
   8530 
   8531         // These values should already be set by the code above.
   8532         //pSrc  = source + equalLength;        /* point to the first differing chars   */
   8533         //pTarg = target + equalLength;
   8534         if ((pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
   8535             (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
   8536         {
   8537             // We are stopped in the middle of a contraction.
   8538             // Scan backwards through the == part of the string looking for the start of the contraction.
   8539             //   It doesn't matter which string we scan, since they are the same in this region.
   8540             do
   8541             {
   8542                 equalLength--;
   8543                 pSrc--;
   8544             }
   8545             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
   8546         }
   8547 
   8548         source += equalLength;
   8549         target += equalLength;
   8550         if (sourceLength > 0) {
   8551             sourceLength -= equalLength;
   8552         }
   8553         if (targetLength > 0) {
   8554             targetLength -= equalLength;
   8555         }
   8556     }
   8557 
   8558     UErrorCode status = U_ZERO_ERROR;
   8559     UCollationResult returnVal;
   8560     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
   8561         returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
   8562     } else {
   8563         returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
   8564     }
   8565     UTRACE_EXIT_VALUE(returnVal);
   8566     return returnVal;
   8567 }
   8568 
   8569 /* Convenience predicate: true exactly when ucol_strcoll() yields UCOL_GREATER */
   8570 /* for (source, target). All arguments are handed to ucol_strcoll() unchanged.  */
   8571 U_CAPI UBool U_EXPORT2
   8572 ucol_greater(const UCollator *coll,
   8573              const UChar *source, int32_t sourceLength,
   8574              const UChar *target, int32_t targetLength)
   8575 {
   8576     const UCollationResult order =
   8577         ucol_strcoll(coll, source, sourceLength, target, targetLength);
   8578     return (order == UCOL_GREATER);
   8579 }
   8580 
   8581 /* Convenience predicate: true whenever ucol_strcoll() yields anything other   */
   8582 /* than UCOL_LESS for (source, target). Arguments are passed through unchanged. */
   8583 U_CAPI UBool U_EXPORT2
   8584 ucol_greaterOrEqual(const UCollator *coll,
   8585                     const UChar *source, int32_t sourceLength,
   8586                     const UChar *target, int32_t targetLength)
   8587 {
   8588     const UCollationResult order =
   8589         ucol_strcoll(coll, source, sourceLength, target, targetLength);
   8590     return (order != UCOL_LESS);
   8591 }
   8592 
   8593 /* Convenience predicate: true exactly when ucol_strcoll() yields UCOL_EQUAL   */
   8594 /* for (source, target). All arguments are handed to ucol_strcoll() unchanged.  */
   8595 U_CAPI UBool U_EXPORT2
   8596 ucol_equal(const UCollator *coll,
   8597            const UChar *source, int32_t sourceLength,
   8598            const UChar *target, int32_t targetLength)
   8599 {
   8600     const UCollationResult order =
   8601         ucol_strcoll(coll, source, sourceLength, target, targetLength);
   8602     return (order == UCOL_EQUAL);
   8603 }
   8604 
   8605 /* Copies coll->UCA->image->UCAVersion into info; info is left untouched when coll or coll->UCA is NULL. */
   8606 U_CAPI void U_EXPORT2
   8607 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
   8608     if(coll == NULL || coll->UCA == NULL) { return; }
   8609     uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
   8610 }
   8611 
   8612 #endif /* #if !UCONFIG_NO_COLLATION */
   8613