Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 1996-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ucol.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 * Modification history
     12 * Date        Name      Comments
     13 * 1996-1999   various members of ICU team maintained C API for collation framework
     14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
     15 * 03/01/2001  synwee    Added maxexpansion functionality.
     16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_COLLATION
     22 
     23 #include "unicode/coleitr.h"
     24 #include "unicode/unorm.h"
     25 #include "unicode/udata.h"
     26 #include "unicode/ustring.h"
     27 
     28 #include "ucol_imp.h"
     29 #include "bocsu.h"
     30 
     31 #include "normalizer2impl.h"
     32 #include "unorm_it.h"
     33 #include "umutex.h"
     34 #include "cmemory.h"
     35 #include "ucln_in.h"
     36 #include "cstring.h"
     37 #include "utracimp.h"
     38 #include "putilimp.h"
     39 #include "uassert.h"
     40 
     41 #ifdef UCOL_DEBUG
     42 #include <stdio.h>
     43 #endif
     44 
     45 U_NAMESPACE_USE
     46 
     47 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     48 
     49 #define LAST_BYTE_MASK_           0xFF
     50 #define SECOND_LAST_BYTE_SHIFT_   8
     51 
     52 #define ZERO_CC_LIMIT_            0xC0
     53 
     54 // this is static pointer to the normalizer fcdTrieIndex
     55 // it is always the same between calls to u_cleanup
     56 // and therefore writing to it is not synchronized.
     57 // It is cleaned in ucol_cleanup
     58 static const uint16_t *fcdTrieIndex=NULL;
     59 // Code points at fcdHighStart and above have a zero FCD value.
     60 static UChar32 fcdHighStart = 0;
     61 
// These are values from the UCA that are required for
// implicit CE generation and for suppressing sort key compression.
// They should normally come from the UCA data, but if one
// is running without the UCA, that could be a problem.
     66 static const int32_t maxRegularPrimary  = 0x7A;
     67 static const int32_t minImplicitPrimary = 0xE0;
     68 static const int32_t maxImplicitPrimary = 0xE4;
     69 
     70 U_CDECL_BEGIN
     71 static UBool U_CALLCONV
     72 ucol_cleanup(void)
     73 {
     74     fcdTrieIndex = NULL;
     75     return TRUE;
     76 }
     77 
     78 static int32_t U_CALLCONV
     79 _getFoldingOffset(uint32_t data) {
     80     return (int32_t)(data&0xFFFFFF);
     81 }
     82 
     83 U_CDECL_END
     84 
     85 // init FCD data
     86 static inline
     87 UBool initializeFCD(UErrorCode *status) {
     88     if (fcdTrieIndex != NULL) {
     89         return TRUE;
     90     } else {
     91         // The result is constant, until the library is reloaded.
     92         fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
     93         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
     94         return U_SUCCESS(*status);
     95     }
     96 }
     97 
     98 static
     99 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
    100                               int32_t sourceLen, collIterate *s,
    101                               UErrorCode *status)
    102 {
    103     (s)->string = (s)->pos = sourceString;
    104     (s)->origFlags = 0;
    105     (s)->flags = 0;
    106     if (sourceLen >= 0) {
    107         s->flags |= UCOL_ITER_HASLEN;
    108         (s)->endp = (UChar *)sourceString+sourceLen;
    109     }
    110     else {
    111         /* change to enable easier checking for end of string for fcdpositon */
    112         (s)->endp = NULL;
    113     }
    114     (s)->extendCEs = NULL;
    115     (s)->extendCEsSize = 0;
    116     (s)->CEpos = (s)->toReturn = (s)->CEs;
    117     (s)->offsetBuffer = NULL;
    118     (s)->offsetBufferSize = 0;
    119     (s)->offsetReturn = (s)->offsetStore = NULL;
    120     (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
    121     (s)->coll = (collator);
    122     (s)->nfd = Normalizer2Factory::getNFDInstance(*status);
    123     (s)->fcdPosition = 0;
    124     if(collator->normalizationMode == UCOL_ON) {
    125         (s)->flags |= UCOL_ITER_NORM;
    126     }
    127     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
    128         (s)->flags |= UCOL_HIRAGANA_Q;
    129     }
    130     (s)->iterator = NULL;
    131     //(s)->iteratorIndex = 0;
    132 }
    133 
    134 U_CAPI void  U_EXPORT2
    135 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
    136                              int32_t sourceLen, collIterate *s,
    137                              UErrorCode *status) {
    138     /* Out-of-line version for use from other files. */
    139     IInit_collIterate(collator, sourceString, sourceLen, s, status);
    140 }
    141 
    142 U_CAPI collIterate * U_EXPORT2
    143 uprv_new_collIterate(UErrorCode *status) {
    144     if(U_FAILURE(*status)) {
    145         return NULL;
    146     }
    147     collIterate *s = new collIterate;
    148     if(s == NULL) {
    149         *status = U_MEMORY_ALLOCATION_ERROR;
    150         return NULL;
    151     }
    152     return s;
    153 }
    154 
    155 U_CAPI void U_EXPORT2
    156 uprv_delete_collIterate(collIterate *s) {
    157     delete s;
    158 }
    159 
    160 U_CAPI UBool U_EXPORT2
    161 uprv_collIterateAtEnd(collIterate *s) {
    162     return s == NULL || s->pos == s->endp;
    163 }
    164 
    165 /**
    166 * Backup the state of the collIterate struct data
    167 * @param data collIterate to backup
    168 * @param backup storage
    169 */
    170 static
    171 inline void backupState(const collIterate *data, collIterateState *backup)
    172 {
    173     backup->fcdPosition = data->fcdPosition;
    174     backup->flags       = data->flags;
    175     backup->origFlags   = data->origFlags;
    176     backup->pos         = data->pos;
    177     backup->bufferaddress = data->writableBuffer.getBuffer();
    178     backup->buffersize    = data->writableBuffer.length();
    179     backup->iteratorMove = 0;
    180     backup->iteratorIndex = 0;
    181     if(data->iterator != NULL) {
    182         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
    183         backup->iteratorIndex = data->iterator->getState(data->iterator);
    184         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
    185         if(backup->iteratorIndex == UITER_NO_STATE) {
    186             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
    187                 backup->iteratorMove++;
    188                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
    189             }
    190             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    191         }
    192     }
    193 }
    194 
    195 /**
    196 * Loads the state into the collIterate struct data
    197 * @param data collIterate to backup
    198 * @param backup storage
    199 * @param forwards boolean to indicate if forwards iteration is used,
    200 *        false indicates backwards iteration
    201 */
    202 static
    203 inline void loadState(collIterate *data, const collIterateState *backup,
    204                       UBool        forwards)
    205 {
    206     UErrorCode status = U_ZERO_ERROR;
    207     data->flags       = backup->flags;
    208     data->origFlags   = backup->origFlags;
    209     if(data->iterator != NULL) {
    210         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
    211         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
    212         if(backup->iteratorMove != 0) {
    213             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    214         }
    215     }
    216     data->pos         = backup->pos;
    217 
    218     if ((data->flags & UCOL_ITER_INNORMBUF) &&
    219         data->writableBuffer.getBuffer() != backup->bufferaddress) {
    220         /*
    221         this is when a new buffer has been reallocated and we'll have to
    222         calculate the new position.
    223         note the new buffer has to contain the contents of the old buffer.
    224         */
    225         if (forwards) {
    226             data->pos = data->writableBuffer.getTerminatedBuffer() +
    227                                          (data->pos - backup->bufferaddress);
    228         }
    229         else {
    230             /* backwards direction */
    231             int32_t temp = backup->buffersize -
    232                                   (int32_t)(data->pos - backup->bufferaddress);
    233             data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
    234         }
    235     }
    236     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
    237         /*
    238         this is alittle tricky.
    239         if we are initially not in the normalization buffer, even if we
    240         normalize in the later stage, the data in the buffer will be
    241         ignored, since we skip back up to the data string.
    242         however if we are already in the normalization buffer, any
    243         further normalization will pull data into the normalization
    244         buffer and modify the fcdPosition.
    245         since we are keeping the data in the buffer for use, the
    246         fcdPosition can not be reverted back.
    247         arrgghh....
    248         */
    249         data->fcdPosition = backup->fcdPosition;
    250     }
    251 }
    252 
    253 static UBool
    254 reallocCEs(collIterate *data, int32_t newCapacity) {
    255     uint32_t *oldCEs = data->extendCEs;
    256     if(oldCEs == NULL) {
    257         oldCEs = data->CEs;
    258     }
    259     int32_t length = data->CEpos - oldCEs;
    260     uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
    261     if(newCEs == NULL) {
    262         return FALSE;
    263     }
    264     uprv_memcpy(newCEs, oldCEs, length * 4);
    265     uprv_free(data->extendCEs);
    266     data->extendCEs = newCEs;
    267     data->extendCEsSize = newCapacity;
    268     data->CEpos = newCEs + length;
    269     return TRUE;
    270 }
    271 
    272 static UBool
    273 increaseCEsCapacity(collIterate *data) {
    274     int32_t oldCapacity;
    275     if(data->extendCEs != NULL) {
    276         oldCapacity = data->extendCEsSize;
    277     } else {
    278         oldCapacity = LENGTHOF(data->CEs);
    279     }
    280     return reallocCEs(data, 2 * oldCapacity);
    281 }
    282 
    283 static UBool
    284 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
    285     int32_t oldCapacity;
    286     if(data->extendCEs != NULL) {
    287         oldCapacity = data->extendCEsSize;
    288     } else {
    289         oldCapacity = LENGTHOF(data->CEs);
    290     }
    291     if(minCapacity <= oldCapacity) {
    292         return TRUE;
    293     }
    294     oldCapacity *= 2;
    295     return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
    296 }
    297 
    298 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
    299     if(U_FAILURE(errorCode)) {
    300         return;
    301     }
    302     int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
    303     if(length >= offsetBufferSize) {
    304         int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
    305         int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4));
    306         if(newBuffer == NULL) {
    307             errorCode = U_MEMORY_ALLOCATION_ERROR;
    308             return;
    309         }
    310         if(length > 0) {
    311             uprv_memcpy(newBuffer, offsetBuffer, length * 4);
    312         }
    313         uprv_free(offsetBuffer);
    314         offsetBuffer = newBuffer;
    315         offsetStore = offsetBuffer + length;
    316         offsetBufferSize = newCapacity;
    317     }
    318     *offsetStore++ = offset;
    319 }
    320 
    321 /*
    322 * collIter_eos()
    323 *     Checks for a collIterate being positioned at the end of
    324 *     its source string.
    325 *
    326 */
    327 static
    328 inline UBool collIter_eos(collIterate *s) {
    329     if(s->flags & UCOL_USE_ITERATOR) {
    330       return !(s->iterator->hasNext(s->iterator));
    331     }
    332     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
    333         // Null terminated string, but not at null, so not at end.
    334         //   Whether in main or normalization buffer doesn't matter.
    335         return FALSE;
    336     }
    337 
    338     // String with length.  Can't be in normalization buffer, which is always
    339     //  null termintated.
    340     if (s->flags & UCOL_ITER_HASLEN) {
    341         return (s->pos == s->endp);
    342     }
    343 
    344     // We are at a null termination, could be either normalization buffer or main string.
    345     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
    346         // At null at end of main string.
    347         return TRUE;
    348     }
    349 
    350     // At null at end of normalization buffer.  Need to check whether there there are
    351     //   any characters left in the main buffer.
    352     if(s->origFlags & UCOL_USE_ITERATOR) {
    353       return !(s->iterator->hasNext(s->iterator));
    354     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
    355         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
    356         return (*s->fcdPosition == 0);
    357     }
    358     else {
    359         // Main string with an end pointer.
    360         return s->fcdPosition == s->endp;
    361     }
    362 }
    363 
    364 /*
    365 * collIter_bos()
    366 *     Checks for a collIterate being positioned at the start of
    367 *     its source string.
    368 *
    369 */
    370 static
    371 inline UBool collIter_bos(collIterate *source) {
    372   // if we're going backwards, we need to know whether there is more in the
    373   // iterator, even if we are in the side buffer
    374   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    375     return !source->iterator->hasPrevious(source->iterator);
    376   }
    377   if (source->pos <= source->string ||
    378       ((source->flags & UCOL_ITER_INNORMBUF) &&
    379       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
    380     return TRUE;
    381   }
    382   return FALSE;
    383 }
    384 
    385 /*static
    386 inline UBool collIter_SimpleBos(collIterate *source) {
    387   // if we're going backwards, we need to know whether there is more in the
    388   // iterator, even if we are in the side buffer
    389   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    390     return !source->iterator->hasPrevious(source->iterator);
    391   }
    392   if (source->pos == source->string) {
    393     return TRUE;
    394   }
    395   return FALSE;
    396 }*/
    397     //return (data->pos == data->string) ||
    398 
    399 
    400 /****************************************************************************/
    401 /* Following are the open/close functions                                   */
    402 /*                                                                          */
    403 /****************************************************************************/
    404 
    405 static UCollator*
    406 ucol_initFromBinary(const uint8_t *bin, int32_t length,
    407                 const UCollator *base,
    408                 UCollator *fillIn,
    409                 UErrorCode *status)
    410 {
    411     UCollator *result = fillIn;
    412     if(U_FAILURE(*status)) {
    413         return NULL;
    414     }
    415     /*
    416     if(base == NULL) {
    417         // we don't support null base yet
    418         *status = U_ILLEGAL_ARGUMENT_ERROR;
    419         return NULL;
    420     }
    421     */
    422     // We need these and we could be running without UCA
    423     uprv_uca_initImplicitConstants(status);
    424     UCATableHeader *colData = (UCATableHeader *)bin;
    425     // do we want version check here? We're trying to figure out whether collators are compatible
    426     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
    427         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
    428         colData->version[0] != UCOL_BUILDER_VERSION)
    429     {
    430         *status = U_COLLATOR_VERSION_MISMATCH;
    431         return NULL;
    432     }
    433     else {
    434         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
    435             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
    436             if(U_FAILURE(*status)){
    437                 return NULL;
    438             }
    439             result->hasRealData = TRUE;
    440         }
    441         else {
    442             if(base) {
    443                 result = ucol_initCollator(base->image, result, base, status);
    444                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
    445                 if(U_FAILURE(*status)){
    446                     return NULL;
    447                 }
    448                 result->hasRealData = FALSE;
    449             }
    450             else {
    451                 *status = U_USELESS_COLLATOR_ERROR;
    452                 return NULL;
    453             }
    454         }
    455         result->freeImageOnClose = FALSE;
    456     }
    457     result->actualLocale = NULL;
    458     result->validLocale = NULL;
    459     result->requestedLocale = NULL;
    460     result->rules = NULL;
    461     result->rulesLength = 0;
    462     result->freeRulesOnClose = FALSE;
    463     result->ucaRules = NULL;
    464     return result;
    465 }
    466 
    467 U_CAPI UCollator* U_EXPORT2
    468 ucol_openBinary(const uint8_t *bin, int32_t length,
    469                 const UCollator *base,
    470                 UErrorCode *status)
    471 {
    472     return ucol_initFromBinary(bin, length, base, NULL, status);
    473 }
    474 
    475 U_CAPI int32_t U_EXPORT2
    476 ucol_cloneBinary(const UCollator *coll,
    477                  uint8_t *buffer, int32_t capacity,
    478                  UErrorCode *status)
    479 {
    480     int32_t length = 0;
    481     if(U_FAILURE(*status)) {
    482         return length;
    483     }
    484     if(capacity < 0) {
    485         *status = U_ILLEGAL_ARGUMENT_ERROR;
    486         return length;
    487     }
    488     if(coll->hasRealData == TRUE) {
    489         length = coll->image->size;
    490         if(length <= capacity) {
    491             uprv_memcpy(buffer, coll->image, length);
    492         } else {
    493             *status = U_BUFFER_OVERFLOW_ERROR;
    494         }
    495     } else {
    496         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    497         if(length <= capacity) {
    498             /* build the UCATableHeader with minimal entries */
    499             /* do not copy the header from the UCA file because its values are wrong! */
    500             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    501 
    502             /* reset everything */
    503             uprv_memset(buffer, 0, length);
    504 
    505             /* set the tailoring-specific values */
    506             UCATableHeader *myData = (UCATableHeader *)buffer;
    507             myData->size = length;
    508 
    509             /* offset for the options, the only part of the data that is present after the header */
    510             myData->options = sizeof(UCATableHeader);
    511 
    512             /* need to always set the expansion value for an upper bound of the options */
    513             myData->expansion = myData->options + sizeof(UColOptionSet);
    514 
    515             myData->magic = UCOL_HEADER_MAGIC;
    516             myData->isBigEndian = U_IS_BIG_ENDIAN;
    517             myData->charSetFamily = U_CHARSET_FAMILY;
    518 
    519             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    520             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    521 
    522             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    523             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    524             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    525             myData->jamoSpecial = coll->image->jamoSpecial;
    526 
    527             /* copy the collator options */
    528             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    529         } else {
    530             *status = U_BUFFER_OVERFLOW_ERROR;
    531         }
    532     }
    533     return length;
    534 }
    535 
    536 U_CAPI UCollator* U_EXPORT2
    537 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
    538 {
    539     UCollator * localCollator;
    540     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
    541     char *stackBufferChars = (char *)stackBuffer;
    542     int32_t imageSize = 0;
    543     int32_t rulesSize = 0;
    544     int32_t rulesPadding = 0;
    545     uint8_t *image;
    546     UChar *rules;
    547     UBool colAllocated = FALSE;
    548     UBool imageAllocated = FALSE;
    549 
    550     if (status == NULL || U_FAILURE(*status)){
    551         return 0;
    552     }
    553     if ((stackBuffer && !pBufferSize) || !coll){
    554        *status = U_ILLEGAL_ARGUMENT_ERROR;
    555         return 0;
    556     }
    557     if (coll->rules && coll->freeRulesOnClose) {
    558         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
    559         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
    560         bufferSizeNeeded += rulesSize + rulesPadding;
    561     }
    562 
    563     if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
    564         *pBufferSize =  bufferSizeNeeded;
    565         return 0;
    566     }
    567 
    568     /* Pointers on 64-bit platforms need to be aligned
    569      * on a 64-bit boundry in memory.
    570      */
    571     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
    572         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
    573         if (*pBufferSize > offsetUp) {
    574             *pBufferSize -= offsetUp;
    575             stackBufferChars += offsetUp;
    576         }
    577         else {
    578             /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
    579             *pBufferSize = 1;
    580         }
    581     }
    582     stackBuffer = (void *)stackBufferChars;
    583 
    584     if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
    585         /* allocate one here...*/
    586         stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
    587         // Null pointer check.
    588         if (stackBufferChars == NULL) {
    589             *status = U_MEMORY_ALLOCATION_ERROR;
    590             return NULL;
    591         }
    592         colAllocated = TRUE;
    593         if (U_SUCCESS(*status)) {
    594             *status = U_SAFECLONE_ALLOCATED_WARNING;
    595         }
    596     }
    597     localCollator = (UCollator *)stackBufferChars;
    598     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
    599     {
    600         UErrorCode tempStatus = U_ZERO_ERROR;
    601         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
    602     }
    603     if (coll->freeImageOnClose) {
    604         image = (uint8_t *)uprv_malloc(imageSize);
    605         // Null pointer check
    606         if (image == NULL) {
    607             *status = U_MEMORY_ALLOCATION_ERROR;
    608             return NULL;
    609         }
    610         ucol_cloneBinary(coll, image, imageSize, status);
    611         imageAllocated = TRUE;
    612     }
    613     else {
    614         image = (uint8_t *)coll->image;
    615     }
    616     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
    617     if (U_FAILURE(*status)) {
    618         return NULL;
    619     }
    620 
    621     if (coll->rules) {
    622         if (coll->freeRulesOnClose) {
    623             localCollator->rules = u_strcpy(rules, coll->rules);
    624             //bufferEnd += rulesSize;
    625         }
    626         else {
    627             localCollator->rules = coll->rules;
    628         }
    629         localCollator->freeRulesOnClose = FALSE;
    630         localCollator->rulesLength = coll->rulesLength;
    631     }
    632 
    633     int32_t i;
    634     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
    635         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
    636     }
    637     // zero copies of pointers
    638     localCollator->actualLocale = NULL;
    639     localCollator->validLocale = NULL;
    640     localCollator->requestedLocale = NULL;
    641     localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
    642     localCollator->freeOnClose = colAllocated;
    643     localCollator->freeImageOnClose = imageAllocated;
    644     return localCollator;
    645 }
    646 
    647 U_CAPI void U_EXPORT2
    648 ucol_close(UCollator *coll)
    649 {
    650     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
    651     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
    652     if(coll != NULL) {
    653         // these are always owned by each UCollator struct,
    654         // so we always free them
    655         if(coll->validLocale != NULL) {
    656             uprv_free(coll->validLocale);
    657         }
    658         if(coll->actualLocale != NULL) {
    659             uprv_free(coll->actualLocale);
    660         }
    661         if(coll->requestedLocale != NULL) {
    662             uprv_free(coll->requestedLocale);
    663         }
    664         if(coll->latinOneCEs != NULL) {
    665             uprv_free(coll->latinOneCEs);
    666         }
    667         if(coll->options != NULL && coll->freeOptionsOnClose) {
    668             uprv_free(coll->options);
    669         }
    670         if(coll->rules != NULL && coll->freeRulesOnClose) {
    671             uprv_free((UChar *)coll->rules);
    672         }
    673         if(coll->image != NULL && coll->freeImageOnClose) {
    674             uprv_free((UCATableHeader *)coll->image);
    675         }
    676         if(coll->leadBytePermutationTable != NULL) {
    677             uprv_free(coll->leadBytePermutationTable);
    678         }
    679         if(coll->reorderCodes != NULL) {
    680             uprv_free(coll->reorderCodes);
    681         }
    682 
    683         /* Here, it would be advisable to close: */
    684         /* - UData for UCA (unless we stuff it in the root resb */
    685         /* Again, do we need additional housekeeping... HMMM! */
    686         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
    687         if(coll->freeOnClose){
    688             /* for safeClone, if freeOnClose is FALSE,
    689             don't free the other instance data */
    690             uprv_free(coll);
    691         }
    692     }
    693     UTRACE_EXIT();
    694 }
    695 
    696 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
    697 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
    698 U_CFUNC uint8_t* U_EXPORT2
    699 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
    700 {
    701     uint8_t *result = NULL;
    702     if(U_FAILURE(*status)) {
    703         return NULL;
    704     }
    705     if(coll->hasRealData == TRUE) {
    706         *length = coll->image->size;
    707         result = (uint8_t *)uprv_malloc(*length);
    708         /* test for NULL */
    709         if (result == NULL) {
    710             *status = U_MEMORY_ALLOCATION_ERROR;
    711             return NULL;
    712         }
    713         uprv_memcpy(result, coll->image, *length);
    714     } else {
    715         *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    716         result = (uint8_t *)uprv_malloc(*length);
    717         /* test for NULL */
    718         if (result == NULL) {
    719             *status = U_MEMORY_ALLOCATION_ERROR;
    720             return NULL;
    721         }
    722 
    723         /* build the UCATableHeader with minimal entries */
    724         /* do not copy the header from the UCA file because its values are wrong! */
    725         /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    726 
    727         /* reset everything */
    728         uprv_memset(result, 0, *length);
    729 
    730         /* set the tailoring-specific values */
    731         UCATableHeader *myData = (UCATableHeader *)result;
    732         myData->size = *length;
    733 
    734         /* offset for the options, the only part of the data that is present after the header */
    735         myData->options = sizeof(UCATableHeader);
    736 
    737         /* need to always set the expansion value for an upper bound of the options */
    738         myData->expansion = myData->options + sizeof(UColOptionSet);
    739 
    740         myData->magic = UCOL_HEADER_MAGIC;
    741         myData->isBigEndian = U_IS_BIG_ENDIAN;
    742         myData->charSetFamily = U_CHARSET_FAMILY;
    743 
    744         /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    745         uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    746 
    747         uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    748         uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    749         uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    750         myData->jamoSpecial = coll->image->jamoSpecial;
    751 
    752         /* copy the collator options */
    753         uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    754     }
    755     return result;
    756 }
    757 
    758 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    759     if(U_FAILURE(*status)) {
    760         return;
    761     }
    762     result->caseFirst = (UColAttributeValue)opts->caseFirst;
    763     result->caseLevel = (UColAttributeValue)opts->caseLevel;
    764     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
    765     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
    766     if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
    767         return;
    768     }
    769     result->strength = (UColAttributeValue)opts->strength;
    770     result->variableTopValue = opts->variableTopValue;
    771     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
    772     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
    773     result->numericCollation = (UColAttributeValue)opts->numericCollation;
    774     result->caseFirstisDefault = TRUE;
    775     result->caseLevelisDefault = TRUE;
    776     result->frenchCollationisDefault = TRUE;
    777     result->normalizationModeisDefault = TRUE;
    778     result->strengthisDefault = TRUE;
    779     result->variableTopValueisDefault = TRUE;
    780     result->alternateHandlingisDefault = TRUE;
    781     result->hiraganaQisDefault = TRUE;
    782     result->numericCollationisDefault = TRUE;
    783 
    784     ucol_updateInternalState(result, status);
    785 
    786     result->options = opts;
    787 }
    788 
    789 
    790 /**
    791 * Approximate determination if a character is at a contraction end.
    792 * Guaranteed to be TRUE if a character is at the end of a contraction,
    793 * otherwise it is not deterministic.
    794 * @param c character to be determined
    795 * @param coll collator
    796 */
    797 static
    798 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
    799     if (c < coll->minContrEndCP) {
    800         return FALSE;
    801     }
    802 
    803     int32_t  hash = c;
    804     uint8_t  htbyte;
    805     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
    806         if (U16_IS_TRAIL(c)) {
    807             return TRUE;
    808         }
    809         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
    810     }
    811     htbyte = coll->contrEndCP[hash>>3];
    812     return (((htbyte >> (hash & 7)) & 1) == 1);
    813 }
    814 
    815 
    816 
    817 /*
    818 *   i_getCombiningClass()
    819 *        A fast, at least partly inline version of u_getCombiningClass()
    820 *        This is a candidate for further optimization.  Used heavily
    821 *        in contraction processing.
    822 */
    823 static
    824 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
    825     uint8_t sCC = 0;
    826     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
    827         sCC = u_getCombiningClass(c);
    828     }
    829     return sCC;
    830 }
    831 
    832 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    833     UChar c;
    834     UCollator *result = fillIn;
    835     if(U_FAILURE(*status) || image == NULL) {
    836         return NULL;
    837     }
    838 
    839     if(result == NULL) {
    840         result = (UCollator *)uprv_malloc(sizeof(UCollator));
    841         if(result == NULL) {
    842             *status = U_MEMORY_ALLOCATION_ERROR;
    843             return result;
    844         }
    845         result->freeOnClose = TRUE;
    846     } else {
    847         result->freeOnClose = FALSE;
    848     }
    849 
    850     result->image = image;
    851     result->mapping.getFoldingOffset = _getFoldingOffset;
    852     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
    853     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
    854     if(U_FAILURE(*status)) {
    855         if(result->freeOnClose == TRUE) {
    856             uprv_free(result);
    857             result = NULL;
    858         }
    859         return result;
    860     }
    861 
    862     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    863     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
    864     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
    865     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
    866     result->rules = NULL;
    867     result->rulesLength = 0;
    868     result->freeRulesOnClose = FALSE;
    869     result->reorderCodes = NULL;
    870     result->reorderCodesLength = 0;
    871     result->leadBytePermutationTable = NULL;
    872 
    873     /* get the version info from UCATableHeader and populate the Collator struct*/
    874     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
    875     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
    876     result->dataVersion[2] = 0;
    877     result->dataVersion[3] = 0;
    878 
    879     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
    880     result->minUnsafeCP = 0;
    881     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
    882         if (ucol_unsafeCP(c, result)) break;
    883     }
    884     result->minUnsafeCP = c;
    885 
    886     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
    887     result->minContrEndCP = 0;
    888     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
    889         if (ucol_contractionEndCP(c, result)) break;
    890     }
    891     result->minContrEndCP = c;
    892 
    893     /* max expansion tables */
    894     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
    895                                          result->image->endExpansionCE);
    896     result->lastEndExpansionCE = result->endExpansionCE +
    897                                  result->image->endExpansionCECount - 1;
    898     result->expansionCESize = (uint8_t*)result->image +
    899                                                result->image->expansionCESize;
    900 
    901 
    902     //result->errorCode = *status;
    903 
    904     result->latinOneCEs = NULL;
    905 
    906     result->latinOneRegenTable = FALSE;
    907     result->latinOneFailed = FALSE;
    908     result->UCA = UCA;
    909 
    910     /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    911     result->ucaRules = NULL;
    912     result->actualLocale = NULL;
    913     result->validLocale = NULL;
    914     result->requestedLocale = NULL;
    915     result->hasRealData = FALSE; // real data lives in .dat file...
    916     result->freeImageOnClose = FALSE;
    917 
    918     /* set attributes */
    919     ucol_setOptionsFromHeader(
    920         result,
    921         (UColOptionSet*)((uint8_t*)result->image+result->image->options),
    922         status);
    923     result->freeOptionsOnClose = FALSE;
    924 
    925     return result;
    926 }
    927 
    928 /* new Mark's code */
    929 
    930 /**
    931  * For generation of Implicit CEs
    932  * @author Davis
    933  *
    934  * Cleaned up so that changes can be made more easily.
    935  * Old values:
    936 # First Implicit: E26A792D
    937 # Last Implicit: E3DC70C0
    938 # First CJK: E0030300
    939 # Last CJK: E0A9DD00
    940 # First CJK_A: E0A9DF00
    941 # Last CJK_A: E0DE3100
    942  */
    943 /* Following is a port of Mark's code for new treatment of implicits.
    944  * It is positioned here, since ucol_initUCA need to initialize the
    945  * variables below according to the data in the fractional UCA.
    946  */
    947 
    948 /**
    949  * Function used to:
    950  * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
    951  * b) bump any non-CJK characters by 10FFFF.
    952  * The relevant blocks are:
    953  * A:    4E00..9FFF; CJK Unified Ideographs
    954  *       F900..FAFF; CJK Compatibility Ideographs
    955  * B:    3400..4DBF; CJK Unified Ideographs Extension A
    956  *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
    957  * As long as
    958  *   no new B characters are allocated between 4E00 and FAFF, and
    959  *   no new A characters are outside of this range,
    960  * (very high probability) this simple code will work.
    961  * The reordered blocks are:
    962  * Block1 is CJK
    963  * Block2 is CJK_COMPAT_USED
    964  * Block3 is CJK_A
    965  * (all contiguous)
    966  * Any other CJK gets its normal code point
    967  * Any non-CJK gets +10FFFF
    968  * When we reorder Block1, we make sure that it is at the very start,
    969  * so that it will use a 3-byte form.
     970  * Warning: we only pick up the compatibility characters that are
     971  * NOT decomposed, so that block is smaller!
    972  */
    973 
    974 // CONSTANTS
    975 static const UChar32
    976     NON_CJK_OFFSET = 0x110000,
    977     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
    978 
    979 /**
    980  * Precomputed by initImplicitConstants()
    981  */
    982 static int32_t
    983     final3Multiplier = 0,
    984     final4Multiplier = 0,
    985     final3Count = 0,
    986     final4Count = 0,
    987     medialCount = 0,
    988     min3Primary = 0,
    989     min4Primary = 0,
    990     max4Primary = 0,
    991     minTrail = 0,
    992     maxTrail = 0,
    993     max3Trail = 0,
    994     max4Trail = 0,
    995     min4Boundary = 0;
    996 
    997 static const UChar32
    998     // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    999     // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
   1000     CJK_BASE = 0x4E00,
   1001     CJK_LIMIT = 0x9FCB+1,
   1002     // Unified CJK ideographs in the compatibility ideographs block.
   1003     CJK_COMPAT_USED_BASE = 0xFA0E,
   1004     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
   1005     // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
   1006     // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
   1007     CJK_A_BASE = 0x3400,
   1008     CJK_A_LIMIT = 0x4DB5+1,
   1009     // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
   1010     // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
   1011     CJK_B_BASE = 0x20000,
   1012     CJK_B_LIMIT = 0x2A6D6+1,
   1013     // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
   1014     // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
   1015     CJK_C_BASE = 0x2A700,
   1016     CJK_C_LIMIT = 0x2B734+1,
   1017     // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
   1018     // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
   1019     CJK_D_BASE = 0x2B740,
   1020     CJK_D_LIMIT = 0x2B81D+1;
   1021     // when adding to this list, look for all occurrences (in project)
   1022     // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
   1023 
   1024 static UChar32 swapCJK(UChar32 i) {
   1025     if (i < CJK_A_BASE) {
   1026         // non-CJK
   1027     } else if (i < CJK_A_LIMIT) {
   1028         // Extension A has lower code points than the original Unihan+compat
   1029         // but sorts higher.
   1030         return i - CJK_A_BASE
   1031                 + (CJK_LIMIT - CJK_BASE)
   1032                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1033     } else if (i < CJK_BASE) {
   1034         // non-CJK
   1035     } else if (i < CJK_LIMIT) {
   1036         return i - CJK_BASE;
   1037     } else if (i < CJK_COMPAT_USED_BASE) {
   1038         // non-CJK
   1039     } else if (i < CJK_COMPAT_USED_LIMIT) {
   1040         return i - CJK_COMPAT_USED_BASE
   1041                 + (CJK_LIMIT - CJK_BASE);
   1042     } else if (i < CJK_B_BASE) {
   1043         // non-CJK
   1044     } else if (i < CJK_B_LIMIT) {
   1045         return i; // non-BMP-CJK
   1046     } else if (i < CJK_C_BASE) {
   1047         // non-CJK
   1048     } else if (i < CJK_C_LIMIT) {
   1049         return i; // non-BMP-CJK
   1050     } else if (i < CJK_D_BASE) {
   1051         // non-CJK
   1052     } else if (i < CJK_D_LIMIT) {
   1053         return i; // non-BMP-CJK
   1054     }
   1055     return i + NON_CJK_OFFSET; // non-CJK
   1056 }
   1057 
   1058 U_CAPI UChar32 U_EXPORT2
   1059 uprv_uca_getRawFromCodePoint(UChar32 i) {
   1060     return swapCJK(i)+1;
   1061 }
   1062 
   1063 U_CAPI UChar32 U_EXPORT2
   1064 uprv_uca_getCodePointFromRaw(UChar32 i) {
   1065     i--;
   1066     UChar32 result = 0;
   1067     if(i >= NON_CJK_OFFSET) {
   1068         result = i - NON_CJK_OFFSET;
   1069     } else if(i >= CJK_B_BASE) {
   1070         result = i;
   1071     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
   1072         if(i < CJK_LIMIT - CJK_BASE) {
   1073             result = i + CJK_BASE;
   1074         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
   1075             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
   1076         } else {
   1077             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1078         }
   1079     } else {
   1080         result = -1;
   1081     }
   1082     return result;
   1083 }
   1084 
   1085 // GET IMPLICIT PRIMARY WEIGHTS
   1086 // Return value is left justified primary key
   1087 U_CAPI uint32_t U_EXPORT2
   1088 uprv_uca_getImplicitFromRaw(UChar32 cp) {
   1089     /*
   1090     if (cp < 0 || cp > UCOL_MAX_INPUT) {
   1091         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
   1092     }
   1093     */
   1094     int32_t last0 = cp - min4Boundary;
   1095     if (last0 < 0) {
   1096         int32_t last1 = cp / final3Count;
   1097         last0 = cp % final3Count;
   1098 
   1099         int32_t last2 = last1 / medialCount;
   1100         last1 %= medialCount;
   1101 
   1102         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
   1103         last1 = minTrail + last1; // offset
   1104         last2 = min3Primary + last2; // offset
   1105         /*
   1106         if (last2 >= min4Primary) {
   1107             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
   1108         }
   1109         */
   1110         return (last2 << 24) + (last1 << 16) + (last0 << 8);
   1111     } else {
   1112         int32_t last1 = last0 / final4Count;
   1113         last0 %= final4Count;
   1114 
   1115         int32_t last2 = last1 / medialCount;
   1116         last1 %= medialCount;
   1117 
   1118         int32_t last3 = last2 / medialCount;
   1119         last2 %= medialCount;
   1120 
   1121         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
   1122         last1 = minTrail + last1; // offset
   1123         last2 = minTrail + last2; // offset
   1124         last3 = min4Primary + last3; // offset
   1125         /*
   1126         if (last3 > max4Primary) {
   1127             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
   1128         }
   1129         */
   1130         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
   1131     }
   1132 }
   1133 
   1134 static uint32_t U_EXPORT2
   1135 uprv_uca_getImplicitPrimary(UChar32 cp) {
   1136    //fprintf(stdout, "Incoming: %04x\n", cp);
   1137     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
   1138 
   1139     cp = swapCJK(cp);
   1140     cp++;
   1141     // we now have a range of numbers from 0 to 21FFFF.
   1142 
   1143     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
   1144     //fprintf(stdout, "CJK swapped: %04x\n", cp);
   1145 
   1146     return uprv_uca_getImplicitFromRaw(cp);
   1147 }
   1148 
   1149 /**
   1150  * Converts implicit CE into raw integer ("code point")
   1151  * @param implicit
   1152  * @return -1 if illegal format
   1153  */
   1154 U_CAPI UChar32 U_EXPORT2
   1155 uprv_uca_getRawFromImplicit(uint32_t implicit) {
   1156     UChar32 result;
   1157     UChar32 b3 = implicit & 0xFF;
   1158     UChar32 b2 = (implicit >> 8) & 0xFF;
   1159     UChar32 b1 = (implicit >> 16) & 0xFF;
   1160     UChar32 b0 = (implicit >> 24) & 0xFF;
   1161 
   1162     // simple parameter checks
   1163     if (b0 < min3Primary || b0 > max4Primary
   1164         || b1 < minTrail || b1 > maxTrail)
   1165         return -1;
   1166     // normal offsets
   1167     b1 -= minTrail;
   1168 
   1169     // take care of the final values, and compose
   1170     if (b0 < min4Primary) {
   1171         if (b2 < minTrail || b2 > max3Trail || b3 != 0)
   1172             return -1;
   1173         b2 -= minTrail;
   1174         UChar32 remainder = b2 % final3Multiplier;
   1175         if (remainder != 0)
   1176             return -1;
   1177         b0 -= min3Primary;
   1178         b2 /= final3Multiplier;
   1179         result = ((b0 * medialCount) + b1) * final3Count + b2;
   1180     } else {
   1181         if (b2 < minTrail || b2 > maxTrail
   1182             || b3 < minTrail || b3 > max4Trail)
   1183             return -1;
   1184         b2 -= minTrail;
   1185         b3 -= minTrail;
   1186         UChar32 remainder = b3 % final4Multiplier;
   1187         if (remainder != 0)
   1188             return -1;
   1189         b3 /= final4Multiplier;
   1190         b0 -= min4Primary;
   1191         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
   1192     }
   1193     // final check
   1194     if (result < 0 || result > UCOL_MAX_INPUT)
   1195         return -1;
   1196     return result;
   1197 }
   1198 
   1199 
   1200 static inline int32_t divideAndRoundUp(int a, int b) {
   1201     return 1 + (a-1)/b;
   1202 }
   1203 
   1204 /* this function is either called from initUCA or from genUCA before
   1205  * doing canonical closure for the UCA.
   1206  */
   1207 
   1208 /**
   1209  * Set up to generate implicits.
   1210  * Maintenance Note:  this function may end up being called more than once, due
   1211  *                    to threading races during initialization.  Make sure that
   1212  *                    none of the Constants is ever transiently assigned an
   1213  *                    incorrect value.
   1214  * @param minPrimary
   1215  * @param maxPrimary
   1216  * @param minTrail final byte
   1217  * @param maxTrail final byte
   1218  * @param gap3 the gap we leave for tailoring for 3-byte forms
   1219  * @param gap4 the gap we leave for tailoring for 4-byte forms
   1220  */
   1221 static void initImplicitConstants(int minPrimary, int maxPrimary,
   1222                                     int minTrailIn, int maxTrailIn,
   1223                                     int gap3, int primaries3count,
   1224                                     UErrorCode *status) {
   1225     // some simple parameter checks
   1226     if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
   1227         || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
   1228         || (primaries3count < 1))
   1229     {
   1230         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1231         return;
   1232     };
   1233 
   1234     minTrail = minTrailIn;
   1235     maxTrail = maxTrailIn;
   1236 
   1237     min3Primary = minPrimary;
   1238     max4Primary = maxPrimary;
   1239     // compute constants for use later.
   1240     // number of values we can use in trailing bytes
   1241     // leave room for empty values between AND above, e.g. if gap = 2
   1242     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
   1243     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
   1244     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
   1245     final3Multiplier = gap3 + 1;
   1246     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
   1247     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
   1248 
   1249     // medials can use full range
   1250     medialCount = (maxTrail - minTrail + 1);
   1251     // find out how many values fit in each form
   1252     int32_t threeByteCount = medialCount * final3Count;
   1253     // now determine where the 3/4 boundary is.
   1254     // we use 3 bytes below the boundary, and 4 above
   1255     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
   1256     int32_t primaries4count = primariesAvailable - primaries3count;
   1257 
   1258 
   1259     int32_t min3ByteCoverage = primaries3count * threeByteCount;
   1260     min4Primary = minPrimary + primaries3count;
   1261     min4Boundary = min3ByteCoverage;
   1262     // Now expand out the multiplier for the 4 bytes, and redo.
   1263 
   1264     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
   1265     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
   1266     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
   1267     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
   1268     if (gap4 < 1) {
   1269         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1270         return;
   1271     }
   1272     final4Multiplier = gap4 + 1;
   1273     final4Count = neededPerFinalByte;
   1274     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
   1275 }
   1276 
   1277     /**
   1278      * Supply parameters for generating implicit CEs
   1279      */
   1280 U_CAPI void U_EXPORT2
   1281 uprv_uca_initImplicitConstants(UErrorCode *status) {
   1282     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
   1283     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
   1284     initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
   1285 }
   1286 
   1287 
   1288 /*    collIterNormalize     Incremental Normalization happens here.                       */
   1289 /*                          pick up the range of chars identifed by FCD,                  */
   1290 /*                          normalize it into the collIterate's writable buffer,          */
   1291 /*                          switch the collIterate's state to use the writable buffer.    */
   1292 /*                                                                                        */
   1293 static
   1294 void collIterNormalize(collIterate *collationSource)
   1295 {
   1296     UErrorCode  status = U_ZERO_ERROR;
   1297     const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
   1298     const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
   1299 
   1300     collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
   1301                                     collationSource->writableBuffer,
   1302                                     status);
   1303     if (U_FAILURE(status)) {
   1304 #ifdef UCOL_DEBUG
   1305         fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
   1306 #endif
   1307         return;
   1308     }
   1309 
   1310     collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
   1311     collationSource->origFlags  = collationSource->flags;
   1312     collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1313     collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1314 }
   1315 
   1316 
   1317 // This function takes the iterator and extracts normalized stuff up to the next boundary
   1318 // It is similar in the end results to the collIterNormalize, but for the cases when we
   1319 // use an iterator
   1320 /*static
   1321 inline void normalizeIterator(collIterate *collationSource) {
   1322   UErrorCode status = U_ZERO_ERROR;
   1323   UBool wasNormalized = FALSE;
   1324   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
   1325   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
   1326   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1327     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1328   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
   1329     // reallocate and terminate
   1330     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
   1331                                &collationSource->writableBuffer,
   1332                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
   1333                                0)
   1334     ) {
   1335     #ifdef UCOL_DEBUG
   1336         fprintf(stderr, "normalizeIterator(), out of memory\n");
   1337     #endif
   1338         return;
   1339     }
   1340     status = U_ZERO_ERROR;
   1341     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
   1342     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
   1343     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1344     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1345   }
   1346   // Terminate the buffer - we already checked that it is big enough
   1347   collationSource->writableBuffer[normLen] = 0;
   1348   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
   1349       collationSource->flags |= UCOL_ITER_ALLOCATED;
   1350   }
   1351   collationSource->pos        = collationSource->writableBuffer;
   1352   collationSource->origFlags  = collationSource->flags;
   1353   collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1354   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1355 }*/
   1356 
   1357 
   1358 /* Incremental FCD check and normalize                                                    */
   1359 /*   Called from getNextCE when normalization state is suspect.                           */
   1360 /*   When entering, the state is known to be this:                                        */
   1361 /*      o   We are working in the main buffer of the collIterate, not the side            */
   1362 /*          writable buffer.  When in the side buffer, normalization mode is always off,  */
   1363 /*          so we won't get here.                                                         */
   1364 /*      o   The leading combining class from the current character is 0 or                */
   1365 /*          the trailing combining class of the previous char was zero.                   */
   1366 /*          True because the previous call to this function will have always exited       */
   1367 /*          that way, and we get called for every char where cc might be non-zero.        */
   1368 static
   1369 inline UBool collIterFCD(collIterate *collationSource) {
   1370     const UChar *srcP, *endP;
   1371     uint8_t     leadingCC;
   1372     uint8_t     prevTrailingCC = 0;
   1373     uint16_t    fcd;
   1374     UBool       needNormalize = FALSE;
   1375 
   1376     srcP = collationSource->pos-1;
   1377 
   1378     if (collationSource->flags & UCOL_ITER_HASLEN) {
   1379         endP = collationSource->endp;
   1380     } else {
   1381         endP = NULL;
   1382     }
   1383 
   1384     // Get the trailing combining class of the current character.  If it's zero,
   1385     //   we are OK.
   1386     /* trie access */
   1387     fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
   1388     if (fcd != 0) {
   1389         prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1390 
   1391         if (prevTrailingCC != 0) {
   1392             // The current char has a non-zero trailing CC.  Scan forward until we find
   1393             //   a char with a leading cc of zero.
   1394             while (endP == NULL || srcP != endP)
   1395             {
   1396                 const UChar *savedSrcP = srcP;
   1397 
   1398                 /* trie access */
   1399                 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
   1400                 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1401                 if (leadingCC == 0) {
   1402                     srcP = savedSrcP;      // Hit char that is not part of combining sequence.
   1403                                            //   back up over it.  (Could be surrogate pair!)
   1404                     break;
   1405                 }
   1406 
   1407                 if (leadingCC < prevTrailingCC) {
   1408                     needNormalize = TRUE;
   1409                 }
   1410 
   1411                 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1412             }
   1413         }
   1414     }
   1415 
   1416     collationSource->fcdPosition = (UChar *)srcP;
   1417 
   1418     return needNormalize;
   1419 }
   1420 
   1421 /****************************************************************************/
   1422 /* Following are the CE retrieval functions                                 */
   1423 /*                                                                          */
   1424 /****************************************************************************/
   1425 
   1426 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
   1427 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
   1428 
   1429 /* there should be a macro version of this function in the header file */
   1430 /* This is the first function that tries to fetch a collation element  */
    1431 /* If it's not successful or it encounters a more difficult situation  */
    1432 /* some more sophisticated and slower functions are invoked            */
   1433 static
   1434 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1435     uint32_t order = 0;
   1436     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
   1437         order = *(collationSource->toReturn++);                         /* if so, return them */
   1438         if(collationSource->CEpos == collationSource->toReturn) {
   1439             collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
   1440         }
   1441         return order;
   1442     }
   1443 
   1444     UChar ch = 0;
   1445     collationSource->offsetReturn = NULL;
   1446 
   1447     for (;;)                           /* Loop handles case when incremental normalize switches   */
   1448     {                                  /*   to or from the side buffer / original string, and we  */
   1449         /*   need to start again to get the next character.        */
   1450 
   1451         if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
   1452         {
   1453             // The source string is null terminated and we're not working from the side buffer,
   1454             //   and we're not normalizing.  This is the fast path.
   1455             //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
   1456             ch = *collationSource->pos++;
   1457             if (ch != 0) {
   1458                 break;
   1459             }
   1460             else {
   1461                 return UCOL_NO_MORE_CES;
   1462             }
   1463         }
   1464 
   1465         if (collationSource->flags & UCOL_ITER_HASLEN) {
   1466             // Normal path for strings when length is specified.
   1467             //   (We can't be in side buffer because it is always null terminated.)
   1468             if (collationSource->pos >= collationSource->endp) {
   1469                 // Ran off of the end of the main source string.  We're done.
   1470                 return UCOL_NO_MORE_CES;
   1471             }
   1472             ch = *collationSource->pos++;
   1473         }
   1474         else if(collationSource->flags & UCOL_USE_ITERATOR) {
   1475             UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
   1476             if(iterCh == U_SENTINEL) {
   1477                 return UCOL_NO_MORE_CES;
   1478             }
   1479             ch = (UChar)iterCh;
   1480         }
   1481         else
   1482         {
   1483             // Null terminated string.
   1484             ch = *collationSource->pos++;
   1485             if (ch == 0) {
   1486                 // Ran off end of buffer.
   1487                 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1488                     // Ran off end of main string. backing up one character.
   1489                     collationSource->pos--;
   1490                     return UCOL_NO_MORE_CES;
   1491                 }
   1492                 else
   1493                 {
   1494                     // Hit null in the normalize side buffer.
   1495                     // Usually this means the end of the normalized data,
   1496                     // except for one odd case: a null followed by combining chars,
   1497                     //   which is the case if we are at the start of the buffer.
   1498                     if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
   1499                         break;
   1500                     }
   1501 
   1502                     //  Null marked end of side buffer.
   1503                     //   Revert to the main string and
   1504                     //   loop back to top to try again to get a character.
   1505                     collationSource->pos   = collationSource->fcdPosition;
   1506                     collationSource->flags = collationSource->origFlags;
   1507                     continue;
   1508                 }
   1509             }
   1510         }
   1511 
   1512         if(collationSource->flags&UCOL_HIRAGANA_Q) {
   1513             /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
   1514              * based on whether the previous codepoint was Hiragana or Katakana.
   1515              */
   1516             if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
   1517                     ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
   1518                 collationSource->flags |= UCOL_WAS_HIRAGANA;
   1519             } else {
   1520                 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
   1521             }
   1522         }
   1523 
   1524         // We've got a character.  See if there's any fcd and/or normalization stuff to do.
   1525         //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
   1526         if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
   1527             break;
   1528         }
   1529 
   1530         if (collationSource->fcdPosition >= collationSource->pos) {
   1531             // An earlier FCD check has already covered the current character.
   1532             // We can go ahead and process this char.
   1533             break;
   1534         }
   1535 
   1536         if (ch < ZERO_CC_LIMIT_ ) {
   1537             // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
   1538             break;
   1539         }
   1540 
   1541         if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1542             // We need to peek at the next character in order to tell if we are FCD
   1543             if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
   1544                 // We are at the last char of source string.
   1545                 //  It is always OK for FCD check.
   1546                 break;
   1547             }
   1548 
   1549             // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
   1550             if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1551                 break;
   1552             }
   1553         }
   1554 
   1555 
   1556         // Need a more complete FCD check and possible normalization.
   1557         if (collIterFCD(collationSource)) {
   1558             collIterNormalize(collationSource);
   1559         }
   1560         if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1561             //  No normalization was needed.  Go ahead and process the char we already had.
   1562             break;
   1563         }
   1564 
   1565         // Some normalization happened.  Next loop iteration will pick up a char
   1566         //   from the normalization buffer.
   1567 
   1568     }   // end for (;;)
   1569 
   1570 
   1571     if (ch <= 0xFF) {
   1572         /*  For latin-1 characters we never need to fall back to the UCA table        */
   1573         /*    because all of the UCA data is replicated in the latinOneMapping array  */
   1574         order = coll->latinOneMapping[ch];
   1575         if (order > UCOL_NOT_FOUND) {
   1576             order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
   1577         }
   1578     }
   1579     else
   1580     {
   1581         // Always use UCA for Han, Hangul
   1582         // (Han extension A is before main Han block)
   1583         // **** Han compatibility chars ?? ****
   1584         if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   1585             (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
   1586             if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
   1587                 // between the two target ranges; do normal lookup
   1588                 // **** this range is YI, Modifier tone letters, ****
   1589                 // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
   1590                 // **** Latin-D might be tailored, so we need to ****
   1591                 // **** do the normal lookup for these guys.     ****
   1592                 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1593             } else {
   1594                 // in one of the target ranges; use UCA
   1595                 order = UCOL_NOT_FOUND;
   1596             }
   1597         } else {
   1598             order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1599         }
   1600 
   1601         if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
   1602             order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
   1603         }
   1604 
   1605         if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
   1606             /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
   1607             order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   1608 
   1609             if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
   1610                 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
   1611             }
   1612         }
   1613     }
   1614     if(order == UCOL_NOT_FOUND) {
   1615         order = getImplicit(ch, collationSource);
   1616     }
   1617     return order; /* return the CE */
   1618 }
   1619 
   1620 /* ucol_getNextCE, out-of-line version for use from other files.   */
        /* Forwards all three arguments unchanged to the inline ucol_IGetNextCE() and returns its result. */
   1621 U_CAPI uint32_t  U_EXPORT2
   1622 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1623     return ucol_IGetNextCE(coll, collationSource, status);
   1624 }
   1625 
   1626 
   1627 /**
   1628 * Incremental previous normalization happens here. Pick up the range of chars
   1629 * identified by FCD, normalize it into the collIterate's writable buffer,
   1630 * switch the collIterate's state to use the writable buffer.
        *
        * The range runs from just after data->fcdPosition (or from data->string when
        * fcdPosition is NULL) up to and including the character at data->pos.
        * Besides the normalized text, one source offset per normalized character
        * (plus one for the base character, if any) is recorded via appendOffset().
        * If normalization fails, the function returns without switching the
        * iterator to the writable buffer.
   1631 * @param data collation iterator data
   1632 */
   1633 static
   1634 void collPrevIterNormalize(collIterate *data)
   1635 {
   1636     UErrorCode status  = U_ZERO_ERROR;
   1637     const UChar *pEnd   = data->pos;  /* End normalize + 1 */
   1638     const UChar *pStart;
   1639 
   1640     /* Start normalize */
   1641     if (data->fcdPosition == NULL) {
   1642         pStart = data->string;
   1643     }
   1644     else {
   1645         pStart = data->fcdPosition + 1;
   1646     }
   1647 
            // The source range is inclusive of *pEnd, hence the "+ 1" on the length.
            // UnicodeString(FALSE, ...) presumably aliases the text rather than copying it -- confirm against the UnicodeString API.
   1648     int32_t normLen =
   1649         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
   1650                              data->writableBuffer,
   1651                              status).
   1652         length();
   1653     if(U_FAILURE(status)) {
   1654         return;
   1655     }
   1656     /*
   1657     this puts the null termination in front of the normalized string instead
   1658     of the end
   1659     */
   1660     data->writableBuffer.insert(0, (UChar)0);
   1661 
   1662     /*
   1663      * The usual case at this point is that we've got a base
   1664      * character followed by marks that were normalized. If
   1665      * fcdPosition is NULL, that means that we backed up to
   1666      * the beginning of the string and there's no base character.
   1667      *
   1668      * Forward processing will usually normalize when it sees
   1669      * the first mark, so that mark will get its natural offset
   1670      * and the rest will get the offset of the character following
   1671      * the marks. The base character will also get its natural offset.
   1672      *
   1673      * We write the offset of the base character, if there is one,
   1674      * followed by the offset of the first mark and then the offsets
   1675      * of the rest of the marks.
   1676      */
   1677     int32_t firstMarkOffset = 0;
   1678     int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
   1679     int32_t trailCount      = normLen - 1;
   1680 
   1681     if (data->fcdPosition != NULL) {
   1682         int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
   1683         UChar   baseChar   = *data->fcdPosition;
   1684 
   1685         firstMarkOffset = baseOffset + 1;
   1686 
   1687         /*
   1688          * If the base character is the start of a contraction, forward processing
   1689          * will normalize the marks while checking for the contraction, which means
   1690          * that the offset of the first mark will be the same as the other marks.
   1691          *
   1692          * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
   1693          */
   1694         if (baseChar >= 0x100) {
   1695             uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
   1696 
   1697             if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
   1698                 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
   1699             }
   1700 
   1701             if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
   1702                 firstMarkOffset = trailOffset;
   1703             }
   1704         }
   1705 
   1706         data->appendOffset(baseOffset, status);
   1707     }
   1708 
            // NOTE(review): status is not checked after any of the appendOffset() calls in this function.
   1709     data->appendOffset(firstMarkOffset, status);
   1710 
   1711     for (int32_t i = 0; i < trailCount; i += 1) {
   1712         data->appendOffset(trailOffset, status);
   1713     }
   1714 
   1715     data->offsetRepeatValue = trailOffset;
   1716 
            // Point offsetReturn at the last offset written; if that is the very first
            // slot of offsetBuffer, offsetStore is rewound to the start of the buffer.
   1717     data->offsetReturn = data->offsetStore - 1;
   1718     if (data->offsetReturn == data->offsetBuffer) {
   1719         data->offsetStore = data->offsetBuffer;
   1720     }
   1721 
            // Buffer layout is now: NUL, normLen normalized chars, terminator.
            // pos is set one past the last normalized char, ready for backward reads.
   1722     data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
   1723     data->origFlags  = data->flags;
   1724     data->flags     |= UCOL_ITER_INNORMBUF;
   1725     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   1726 }
   1727 
   1728 
   1729 /**
   1730 * Incremental FCD check for previous iteration and normalize. Called from
   1731 * getPrevCE when normalization state is suspect.
   1732 * When entering, the state is known to be this:
   1733 * o  We are working in the main buffer of the collIterate, not the side
   1734 *    writable buffer. When in the side buffer, normalization mode is always
   1735 *    off, so we won't get here.
   1736 * o  The leading combining class from the current character is 0 or the
   1737 *    trailing combining class of the previous char was zero.
   1738 *    True because the previous call to this function will have always exited
   1739 *    that way, and we get called for every char where cc might be non-zero.
        * Side effect: data->fcdPosition is set to NULL when the backward scan reaches
        * the start of the string, otherwise to the position where the scan stopped.
   1740 * @param data collation iterate struct
   1741 * @return normalization status, TRUE for normalization to be done, FALSE
   1742 *         otherwise
   1743 */
   1744 static
   1745 inline UBool collPrevIterFCD(collIterate *data)
   1746 {
   1747     const UChar *src, *start;
   1748     uint8_t     leadingCC;
   1749     uint8_t     trailingCC = 0;
   1750     uint16_t    fcd;
   1751     UBool       result = FALSE;
   1752 
   1753     start = data->string;
   1754     src = data->pos + 1;
   1755 
   1756     /* Get the FCD value of the current character; its high byte is the leading combining class. */
            /* unorm_prevFCD16 presumably steps src back over the character it reads
               (src is not modified anywhere else in this function) -- confirm. */
   1757     fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
   1758 
   1759     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1760 
   1761     if (leadingCC != 0) {
   1762         /*
   1763         The current char has a non-zero leading combining class.
   1764         Scan backward until we find a char with a trailing cc of zero.
   1765         */
   1766         for (;;)
   1767         {
   1768             if (start == src) {
                        /* Reached the start of the string without finding a char whose trailing cc is zero. */
   1769                 data->fcdPosition = NULL;
   1770                 return result;
   1771             }
   1772 
   1773             fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
   1774 
   1775             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1776 
   1777             if (trailingCC == 0) {
   1778                 break;
   1779             }
   1780 
                    /* leadingCC belongs to the char after this one: a larger trailing cc
                       before a smaller leading cc is flagged as needing normalization. */
   1781             if (leadingCC < trailingCC) {
   1782                 result = TRUE;
   1783             }
   1784 
   1785             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1786         }
   1787     }
   1788 
   1789     data->fcdPosition = (UChar *)src;
   1790 
   1791     return result;
   1792 }
   1793 
   1794 /** gets a code unit from the string at a given offset
   1795  *  Handles both normal and iterative cases.
   1796  *  No error checking - caller beware!
         *
         *  When source->pos is non-NULL the unit is read directly at pos + offset.
         *  Otherwise, if an iterator is present, the unit is read through it and the
         *  iterator is moved back to where it was. Returns 0xfffd when the iterator
         *  yields a negative value or when there is neither a pointer nor an iterator.
   1797  */
   1798 static inline
   1799 UChar peekCodeUnit(collIterate *source, int32_t offset) {
   1800     if(source->pos != NULL) {
   1801         return *(source->pos + offset);
   1802     } else if(source->iterator != NULL) {
   1803         UChar32 c;
   1804         if(offset != 0) {
                    // Move to the target, read with next() (which advances by one),
                    // then undo both movements with a single move of -offset-1.
   1805             source->iterator->move(source->iterator, offset, UITER_CURRENT);
   1806             c = source->iterator->next(source->iterator);
   1807             source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
   1808         } else {
   1809             c = source->iterator->current(source->iterator);
   1810         }
   1811         return c >= 0 ? (UChar)c : 0xfffd;  // If the caller works properly, we should never see c<0.
   1812     } else {
   1813         return 0xfffd;
   1814     }
   1815 }
   1816 
   1817 // Code point version. Treats the offset as a _code point_ delta.
   1818 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
   1819 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
        // Returns the code point found (an unpaired surrogate is returned as itself in the
        // pointer case), or U_SENTINEL when source has neither a pointer nor an iterator.
        // In the pointer case source->pos is not modified; in the iterator case the iterator
        // is stepped back by the same number of code points it was advanced.
        // No bounds are checked in the pointer case; the caller must keep offset inside the text.
   1820 static inline
   1821 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
   1822     UChar32 c;
   1823     if(source->pos != NULL) {
   1824         const UChar *p = source->pos;
   1825         if(offset >= 0) {
   1826             // Skip forward over offset code points.
   1827             while(--offset >= 0) {
   1828                 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
   1829                     ++p;
   1830                 }
   1831             }
   1832             // Read the code point there.
   1833             c = *p++;
   1834             UChar trail;
   1835             if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
   1836                 c = U16_GET_SUPPLEMENTARY(c, trail);
   1837             }
   1838         } else /* offset<0 */ {
   1839             // Skip backward over (-offset-1) code points.
   1840             while(++offset < 0) {
   1841                 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
   1842                     --p;
   1843                 }
   1844             }
   1845             // Read the code point before that.
   1846             c = *--p;
   1847             UChar lead;
   1848             if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
   1849                 c = U16_GET_SUPPLEMENTARY(lead, c);
   1850             }
   1851         }
   1852     } else if(source->iterator != NULL) {
   1853         if(offset >= 0) {
   1854             // Skip forward over offset code points.
   1855             int32_t fwd = offset;
   1856             while(fwd-- > 0) {
   1857                 uiter_next32(source->iterator);
   1858             }
   1859             // Read the code point there.
   1860             c = uiter_current32(source->iterator);
   1861             // Return to the starting point, skipping backward over offset code points.
   1862             while(offset-- > 0) {
   1863                 uiter_previous32(source->iterator);
   1864             }
   1865         } else /* offset<0 */ {
   1866             // Read backward, reading -offset code points, remember only the last-read one.
   1867             int32_t back = offset;
   1868             do {
   1869                 c = uiter_previous32(source->iterator);
   1870             } while(++back < 0);
   1871             // Return to the starting position, skipping forward over -offset code points.
   1872             do {
   1873                 uiter_next32(source->iterator);
   1874             } while(++offset < 0);
   1875         }
   1876     } else {
   1877         c = U_SENTINEL;
   1878     }
   1879     return c;
   1880 }
   1881 
   1882 /**
   1883 * Determines if we are at the start of the data string in the backwards
   1884 * collation iterator
        * Three situations count as "at the start":
        *  - there is no text pointer but there is an iterator, and the iterator
        *    reports no previous element;
        *  - data->pos equals data->string;
        *  - we are in the normalization side buffer, the unit before pos is the
        *    NUL that marks the front of that buffer, and fcdPosition is NULL.
   1885 * @param data collation iterator
   1886 * @return TRUE if we are at the start
   1887 */
   1888 static
   1889 inline UBool isAtStartPrevIterate(collIterate *data) {
   1890     if(data->pos == NULL && data->iterator != NULL) {
   1891         return !data->iterator->hasPrevious(data->iterator);
   1892     }
   1893     //return (collIter_bos(data)) ||
   1894     return (data->pos == data->string) ||
   1895               ((data->flags & UCOL_ITER_INNORMBUF) &&
   1896               *(data->pos - 1) == 0 && data->fcdPosition == NULL);
   1897 }
   1898 
   1899 static
   1900 inline void goBackOne(collIterate *data) {
            // Steps the iteration position back by one code unit.
            // The character iterator is stepped only when UCOL_USE_ITERATOR is set in
            // data->flags; the text pointer is decremented whenever it is non-NULL.
            // No check is made against the start of the text.
   1901 # if 0
   1902     // somehow, it looks like we need to keep iterator synced up
   1903     // at all times, as above.
   1904     if(data->pos) {
   1905         data->pos--;
   1906     }
   1907     if(data->iterator) {
   1908         data->iterator->previous(data->iterator);
   1909     }
   1910 #endif
   1911     if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
   1912         data->iterator->previous(data->iterator);
   1913     }
   1914     if(data->pos) {
   1915         data->pos --;
   1916     }
   1917 }
   1918 
   1919 /**
   1920 * Inline function that gets a simple CE.
   1921 * So what it does is that it will first check the expansion buffer. If the
   1922 * expansion buffer is not empty, ie the end pointer to the expansion buffer
   1923 * is different from the string pointer, we return the collation element at the
   1924 * return pointer and decrement it.
   1925 * For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE.
   1926 * @param coll collator data
   1927 * @param data collation iterator struct
   1928 * @param status error status; only passed on to ucol_prv_getSpecialPrevCE
        * @return the previous CE, or UCOL_NO_MORE_CES when the start of the text
        *         has been reached
   1929 */
   1930 static
   1931 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
   1932                                UErrorCode *status)
   1933 {
   1934     uint32_t result = (uint32_t)UCOL_NULLORDER;
   1935 
            // Offset bookkeeping: consume one pending repeat if there is one; otherwise
            // step offsetReturn back one slot, or mark the offset buffer empty once
            // offsetReturn has reached its first slot.
   1936     if (data->offsetReturn != NULL) {
   1937         if (data->offsetRepeatCount > 0) {
   1938                 data->offsetRepeatCount -= 1;
   1939         } else {
   1940             if (data->offsetReturn == data->offsetBuffer) {
   1941                 data->offsetReturn = NULL;
   1942                 data->offsetStore  = data->offsetBuffer;
   1943             } else {
   1944                 data->offsetReturn -= 1;
   1945             }
   1946         }
   1947     }
   1948 
            // CEs are still buffered (toReturn is above the start of the buffer in use):
            // hand them back in reverse order.
   1949     if ((data->extendCEs && data->toReturn > data->extendCEs) ||
   1950             (!data->extendCEs && data->toReturn > data->CEs))
   1951     {
   1952         data->toReturn -= 1;
   1953         result = *(data->toReturn);
   1954         if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
                    // Back at the bottom of the buffer: make CEpos equal to toReturn.
   1955             data->CEpos = data->toReturn;
   1956         }
   1957     }
   1958     else {
   1959         UChar ch = 0;
   1960 
   1961         /*
   1962         Loop handles case when incremental normalize switches to or from the
   1963         side buffer / original string, and we need to start again to get the
   1964         next character.
   1965         */
   1966         for (;;) {
   1967             if (data->flags & UCOL_ITER_HASLEN) {
   1968                 /*
   1969                 Normal path for strings when length is specified.
   1970                 Not in side buffer because it is always null terminated.
   1971                 */
   1972                 if (data->pos <= data->string) {
   1973                     /* End of the main source string */
   1974                     return UCOL_NO_MORE_CES;
   1975                 }
   1976                 data->pos --;
   1977                 ch = *data->pos;
   1978             }
   1979             // we are using an iterator to go back. Pray for us!
   1980             else if (data->flags & UCOL_USE_ITERATOR) {
   1981               UChar32 iterCh = data->iterator->previous(data->iterator);
   1982               if(iterCh == U_SENTINEL) {
   1983                 return UCOL_NO_MORE_CES;
   1984               } else {
   1985                 ch = (UChar)iterCh;
   1986               }
   1987             }
   1988             else {
   1989                 data->pos --;
   1990                 ch = *data->pos;
   1991                 /* we are in the side buffer. */
   1992                 if (ch == 0) {
   1993                     /*
   1994                     At the start of the normalize side buffer.
   1995                     Go back to string.
   1996                     Because pointer points to the last accessed character,
   1997                     hence we have to increment it by one here.
   1998                     */
   1999                     data->flags = data->origFlags;
   2000                     data->offsetRepeatValue = 0;
   2001 
   2002                      if (data->fcdPosition == NULL) {
   2003                         data->pos = data->string;
   2004                         return UCOL_NO_MORE_CES;
   2005                     }
   2006                     else {
   2007                         data->pos   = data->fcdPosition + 1;
   2008                     }
   2009 
   2010                    continue;
   2011                 }
   2012             }
   2013 
                    // NOTE(review): this range test (0x3040..0x309f) is coarser than the
                    // Hiragana test in ucol_IGetNextCE, which treats 0x3097..0x309c
                    // differently -- confirm whether the difference is intended.
   2014             if(data->flags&UCOL_HIRAGANA_Q) {
   2015               if(ch>=0x3040 && ch<=0x309f) {
   2016                 data->flags |= UCOL_WAS_HIRAGANA;
   2017               } else {
   2018                 data->flags &= ~UCOL_WAS_HIRAGANA;
   2019               }
   2020             }
   2021 
   2022             /*
   2023             * got a character to determine if there's fcd and/or normalization
   2024             * stuff to do.
   2025             * if the current character is not fcd.
   2026             * if current character is at the start of the string
   2027             * Trailing combining class == 0.
   2028             * Note if pos is in the writablebuffer, norm is always 0
   2029             */
   2030             if (ch < ZERO_CC_LIMIT_ ||
   2031               // this should propel us out of the loop in the iterator case
   2032                 (data->flags & UCOL_ITER_NORM) == 0 ||
   2033                 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
   2034                 || data->string == data->pos) {
   2035                 break;
   2036             }
   2037 
   2038             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   2039                 /* if next character is FCD */
   2040                 if (data->pos == data->string) {
   2041                     /* First char of string is always OK for FCD check */
   2042                     break;
   2043                 }
   2044 
   2045                 /* Not first char of string, do the FCD fast test */
   2046                 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
   2047                     break;
   2048                 }
   2049             }
   2050 
   2051             /* Need a more complete FCD check and possible normalization. */
   2052             if (collPrevIterFCD(data)) {
   2053                 collPrevIterNormalize(data);
   2054             }
   2055 
   2056             if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
   2057                 /*  No normalization. Go ahead and process the char. */
   2058                 break;
   2059             }
   2060 
   2061             /*
   2062             Some normalization happened.
   2063             Next loop picks up a char from the normalization buffer.
   2064             */
   2065         }
   2066 
   2067         /* attempt to handle contractions, after removal of the backwards
   2068         contraction
   2069         */
   2070         if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
   2071             result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
   2072         } else {
   2073             if (ch <= 0xFF) {
   2074                 result = coll->latinOneMapping[ch];
   2075             }
   2076             else {
   2077                 // Always use UCA for [3400..9FFF], [AC00..D7AF]
   2078                 // **** [FA0E..FA2F] ?? ****
   2079                 if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   2080                     (ch >= 0x3400 && ch <= 0xD7AF)) {
   2081                     if (ch > 0x9FFF && ch < 0xAC00) {
   2082                         // between the two target ranges; do normal lookup
   2083                         // **** this range is YI, Modifier tone letters, ****
   2084                         // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
   2085                         // **** Latin-D might be tailored, so we need to ****
   2086                         // **** do the normal lookup for these guys.     ****
   2087                          result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   2088                     } else {
   2089                         result = UCOL_NOT_FOUND;
   2090                     }
   2091                 } else {
   2092                     result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   2093                 }
   2094             }
   2095             if (result > UCOL_NOT_FOUND) {
   2096                 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
   2097             }
   2098             if (result == UCOL_NOT_FOUND) { // Not found in master list
   2099                 if (!isAtStartPrevIterate(data) &&
   2100                     ucol_contractionEndCP(ch, data->coll))
   2101                 {
   2102                     result = UCOL_CONTRACTION;
   2103                 } else {
   2104                     if(coll->UCA) {
   2105                         result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   2106                     }
   2107                 }
   2108 
                        // NOTE(review): if coll->UCA is NULL here, a special value
                        // (e.g. the UCOL_CONTRACTION set just above) is left unresolved
                        // and is what gets returned.
   2109                 if (result > UCOL_NOT_FOUND) {
   2110                     if(coll->UCA) {
   2111                         result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
   2112                     }
   2113                 }
   2114             }
   2115         }
   2116 
   2117         if(result == UCOL_NOT_FOUND) {
                    // Nothing found in any table: fall back to getPrevImplicit().
   2118             result = getPrevImplicit(ch, data);
   2119         }
   2120     }
   2121 
   2122     return result;
   2123 }
   2124 
   2125 
   2126 /*   ucol_getPrevCE, out-of-line version for use from other files.  */
        /*   Forwards all three arguments unchanged to the inline ucol_IGetPrevCE() and returns its result. */
   2127 U_CFUNC uint32_t  U_EXPORT2
   2128 ucol_getPrevCE(const UCollator *coll, collIterate *data,
   2129                         UErrorCode *status) {
   2130     return ucol_IGetPrevCE(coll, data, status);
   2131 }
   2132 
   2133 
   2134 /* this should be connected to special Jamo handling */
        /* Returns the first CE produced for the single code unit u: a temporary       */
        /* collIterate is initialised over that one unit (length 1) and the first      */
        /* result of ucol_IGetNextCE() is returned. Returns 0 if the initialisation    */
        /* leaves *status in a failure state.                                          */
   2135 U_CFUNC uint32_t  U_EXPORT2
   2136 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
   2137     collIterate colIt;
   2138     IInit_collIterate(coll, &u, 1, &colIt, status);
   2139     if(U_FAILURE(*status)) {
   2140         return 0;
   2141     }
   2142     return ucol_IGetNextCE(coll, &colIt, status);
   2143 }
   2144 
   2145 /**
   2146 * Inserts the argument character into the end of the buffer pushing back the
   2147 * null terminator.
   2148 * @param data collIterate struct data
   2149 * @param ch character to be appended
   2150 * @return pointer to the appended character inside the buffer returned by
        *         getTerminatedBuffer() (buffer start + length before the append).
        *         Presumably only valid until writableBuffer is modified again -- confirm.
   2151 */
   2152 static
   2153 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
   2154 {
   2155     int32_t oldLength = data->writableBuffer.length();
   2156     return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
   2157 }
   2158 
   2159 /**
   2160 * Inserts the argument string into the end of the buffer pushing back the
   2161 * null terminator.
   2162 * @param data collIterate struct data
   2163 * @param str string to be appended
   2164 * @param length length of the string to be appended
   2165 * @return pointer to the first appended character inside the buffer returned by
        *         getTerminatedBuffer() (buffer start + length before the append).
        *         Presumably only valid until writableBuffer is modified again -- confirm.
   2166 */
   2167 static
   2168 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
   2169 {
   2170     int32_t oldLength = data->writableBuffer.length();
   2171     return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
   2172 }
   2173 
   2174 /**
   2175 * Special normalization function for contraction in the forwards iterator.
   2176 * This normalization sequence will place the current character at source->pos
   2177 * and its following normalized sequence into the buffer.
   2178 * The fcd position, pos will be changed.
   2179 * pos will now point to positions in the buffer.
   2180 * Flags will be changed accordingly.
        * The text normalized is the range from data->pos - 1 up to (not including)
        * data->fcdPosition; its normalized form is appended to the writable buffer.
        * NOTE(review): the writable buffer is modified before the normalization
        * status is checked; on failure the function returns with the buffer
        * changed but pos and flags untouched.
   2181 * @param data collation iterator data
   2182 */
   2183 static
   2184 inline void normalizeNextContraction(collIterate *data)
   2185 {
   2186     int32_t     strsize;
   2187     UErrorCode  status     = U_ZERO_ERROR;
   2188     /* because the pointer points to the next character */
   2189     const UChar *pStart    = data->pos - 1;
   2190     const UChar *pEnd;
   2191 
   2192     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /* Not yet in the side buffer: start it with the single code unit that
                   precedes pStart in the main string. */
   2193         data->writableBuffer.setTo(*(pStart - 1));
   2194         strsize               = 1;
   2195     }
   2196     else {
                /* Already in the side buffer: keep its contents and append after them. */
   2197         strsize = data->writableBuffer.length();
   2198     }
   2199 
   2200     pEnd = data->fcdPosition;
   2201 
   2202     data->writableBuffer.append(
   2203         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
   2204     if(U_FAILURE(status)) {
   2205         return;
   2206     }
   2207 
            /* Continue reading at the first newly appended (normalized) code unit. */
   2208     data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
   2209     data->origFlags  = data->flags;
   2210     data->flags     |= UCOL_ITER_INNORMBUF;
   2211     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2212 }
   2213 
   2214 /**
   2215 * Contraction character management function that returns the next character
   2216 * for the forwards iterator.
   2217 * Does nothing if the next character is in buffer and not the first character
   2218 * in it.
   2219 * Else it checks next character in data string to see if it is normalizable.
   2220 * If it is not, the character is simply copied into the buffer, else
   2221 * the whole normalized substring is copied into the buffer, including the
   2222 * current character.
   2223 * @param data collation element iterator data
   2224 * @return next character
   2225 */
   2226 static
   2227 inline UChar getNextNormalizedChar(collIterate *data)
   2228 {
   2229     UChar  nextch;
   2230     UChar  ch;
   2231     // Here we need to add the iterator code. One problem is the way
   2232     // end of string is handled. If we just return next char, it could
   2233     // be the sentinel. Most of the cases already check for this, but we
   2234     // need to be sure.
   2235     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
   2236          /* if no normalization and not in buffer. */
   2237       if(data->flags & UCOL_USE_ITERATOR) {
   2238          return (UChar)data->iterator->next(data->iterator);
   2239       } else {
   2240          return *(data->pos ++);
   2241       }
   2242     }
   2243 
   2244     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
   2245       //normalizeIterator(data);
   2246     //}
   2247 
   2248     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
   2249     if ((innormbuf && *data->pos != 0) ||
   2250         (data->fcdPosition != NULL && !innormbuf &&
   2251         data->pos < data->fcdPosition)) {
   2252         /*
   2253         if next character is in normalized buffer, no further normalization
   2254         is required
   2255         */
   2256         return *(data->pos ++);
   2257     }
   2258 
   2259     if (data->flags & UCOL_ITER_HASLEN) {
   2260         /* in data string */
   2261         if (data->pos + 1 == data->endp) {
   2262             return *(data->pos ++);
   2263         }
   2264     }
   2265     else {
   2266         if (innormbuf) {
   2267           // inside the normalization buffer, but at the end
   2268           // (since we encountered zero). This means, in the
   2269           // case we're using char iterator, that we need to
   2270           // do another round of normalization.
   2271           //if(data->origFlags & UCOL_USE_ITERATOR) {
   2272             // we need to restore original flags,
   2273             // otherwise, we'll lose them
   2274             //data->flags = data->origFlags;
   2275             //normalizeIterator(data);
   2276             //return *(data->pos++);
   2277           //} else {
   2278             /*
   2279             in writable buffer, at this point fcdPosition can not be
   2280             pointing to the end of the data string. see contracting tag.
   2281             */
   2282           if(data->fcdPosition) {
   2283             if (*(data->fcdPosition + 1) == 0 ||
   2284                 data->fcdPosition + 1 == data->endp) {
   2285                 /* at the end of the string, dump it into the normalizer */
   2286                 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
   2287                 // Check if data->pos received a null pointer
   2288                 if (data->pos == NULL) {
   2289                     return (UChar)-1; // Return to indicate error.
   2290                 }
   2291                 return *(data->fcdPosition ++);
   2292             }
   2293             data->pos = data->fcdPosition;
   2294           } else if(data->origFlags & UCOL_USE_ITERATOR) {
   2295             // if we are here, we're using a normalizing iterator.
   2296             // we should just continue further.
   2297             data->flags = data->origFlags;
   2298             data->pos = NULL;
   2299             return (UChar)data->iterator->next(data->iterator);
   2300           }
   2301           //}
   2302         }
   2303         else {
   2304             if (*(data->pos + 1) == 0) {
   2305                 return *(data->pos ++);
   2306             }
   2307         }
   2308     }
   2309 
   2310     ch = *data->pos ++;
   2311     nextch = *data->pos;
   2312 
   2313     /*
   2314     * if the current character is not fcd.
   2315     * Trailing combining class == 0.
   2316     */
   2317     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
   2318         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
   2319          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
   2320             /*
   2321             Need a more complete FCD check and possible normalization.
   2322             normalize substring will be appended to buffer
   2323             */
   2324         if (collIterFCD(data)) {
   2325             normalizeNextContraction(data);
   2326             return *(data->pos ++);
   2327         }
   2328         else if (innormbuf) {
   2329             /* fcdposition shifted even when there's no normalization, if we
   2330             don't input the rest into this, we'll get the wrong position when
   2331             we reach the end of the writableBuffer */
   2332             int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
   2333             data->pos = insertBufferEnd(data, data->pos - 1, length);
   2334             // Check if data->pos received a null pointer
   2335             if (data->pos == NULL) {
   2336                 return (UChar)-1; // Return to indicate error.
   2337             }
   2338             return *(data->pos ++);
   2339         }
   2340     }
   2341 
   2342     if (innormbuf) {
   2343         /*
   2344         no normalization is to be done hence only one character will be
   2345         appended to the buffer.
   2346         */
   2347         data->pos = insertBufferEnd(data, ch) + 1;
   2348         // Check if data->pos received a null pointer
   2349         if (data->pos == NULL) {
   2350             return (UChar)-1; // Return to indicate error.
   2351         }
   2352     }
   2353 
   2354     /* points back to the pos in string */
   2355     return ch;
   2356 }
   2357 
   2358 
   2359 
   2360 /**
   2361 * Function to copy the buffer into writableBuffer and sets the fcd position to
   2362 * the correct position
   2363 * @param source data string source
   2364 * @param buffer character buffer
   2365 */
   2366 static
   2367 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
   2368 {
   2369     /* okay confusing part here. to ensure that the skipped characters are
   2370     considered later, we need to place it in the appropriate position in the
   2371     normalization buffer and reassign the pos pointer. simple case if pos
   2372     reside in string, simply copy to normalization buffer and
   2373     fcdposition = pos, pos = start of normalization buffer. if pos in
   2374     normalization buffer, we'll insert the copy infront of pos and point pos
   2375     to the start of the normalization buffer. why am i doing these copies?
   2376     well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
   2377     not require any changes, which be really painful. */
   2378     if (source->flags & UCOL_ITER_INNORMBUF) {
   2379         int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
   2380         source->writableBuffer.replace(0, replaceLength, buffer);
   2381     }
   2382     else {
   2383         source->fcdPosition  = source->pos;
   2384         source->origFlags    = source->flags;
   2385         source->flags       |= UCOL_ITER_INNORMBUF;
   2386         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   2387         source->writableBuffer = buffer;
   2388     }
   2389 
   2390     source->pos = source->writableBuffer.getTerminatedBuffer();
   2391 }
   2392 
   2393 /**
   2394 * Function to get the discontiguos collation element within the source.
   2395 * Note this function will set the position to the appropriate places.
   2396 * @param coll current collator used
   2397 * @param source data string source
   2398 * @param constart index to the start character in the contraction table
   2399 * @return discontiguos collation element offset
   2400 */
   2401 static
   2402 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
   2403                                 const UChar *constart)
   2404 {
   2405     /* source->pos currently points to the second combining character after
   2406        the start character */
   2407           const UChar *temppos      = source->pos;
   2408           UnicodeString buffer;
   2409     const UChar   *tempconstart = constart;
   2410           uint8_t  tempflags    = source->flags;
   2411           UBool    multicontraction = FALSE;
   2412           collIterateState discState;
   2413 
   2414           backupState(source, &discState);
   2415 
   2416     buffer.setTo(peekCodePoint(source, -1));
   2417     for (;;) {
   2418         UChar    *UCharOffset;
   2419         UChar     schar,
   2420                   tchar;
   2421         uint32_t  result;
   2422 
   2423         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
   2424             || (peekCodeUnit(source, 0) == 0  &&
   2425             //|| (*source->pos == 0  &&
   2426                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
   2427                  source->fcdPosition == NULL ||
   2428                  source->fcdPosition == source->endp ||
   2429                  *(source->fcdPosition) == 0 ||
   2430                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
   2431                  /* end of string in null terminated string or stopped by a
   2432                  null character, note fcd does not always point to a base
   2433                  character after the discontiguos change */
   2434                  u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
   2435                  //u_getCombiningClass(*(source->pos)) == 0) {
   2436             //constart = (UChar *)coll->image + getContractOffset(CE);
   2437             if (multicontraction) {
   2438                 source->pos    = temppos - 1;
   2439                 setDiscontiguosAttribute(source, buffer);
   2440                 return *(coll->contractionCEs +
   2441                                     (tempconstart - coll->contractionIndex));
   2442             }
   2443             constart = tempconstart;
   2444             break;
   2445         }
   2446 
   2447         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
   2448         schar = getNextNormalizedChar(source);
   2449 
   2450         while (schar > (tchar = *UCharOffset)) {
   2451             UCharOffset++;
   2452         }
   2453 
   2454         if (schar != tchar) {
   2455             /* not the correct codepoint. we stuff the current codepoint into
   2456             the discontiguos buffer and try the next character */
   2457             buffer.append(schar);
   2458             continue;
   2459         }
   2460         else {
   2461             if (u_getCombiningClass(schar) ==
   2462                 u_getCombiningClass(peekCodePoint(source, -2))) {
   2463                 buffer.append(schar);
   2464                 continue;
   2465             }
   2466             result = *(coll->contractionCEs +
   2467                                       (UCharOffset - coll->contractionIndex));
   2468         }
   2469 
   2470         if (result == UCOL_NOT_FOUND) {
   2471           break;
   2472         } else if (isContraction(result)) {
   2473             /* this is a multi-contraction*/
   2474             tempconstart = (UChar *)coll->image + getContractOffset(result);
   2475             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
   2476                 != UCOL_NOT_FOUND) {
   2477                 multicontraction = TRUE;
   2478                 temppos       = source->pos + 1;
   2479             }
   2480         } else {
   2481             setDiscontiguosAttribute(source, buffer);
   2482             return result;
   2483         }
   2484     }
   2485 
   2486     /* no problems simply reverting just like that,
   2487     if we are in string before getting into this function, points back to
   2488     string hence no problem.
   2489     if we are in normalization buffer before getting into this function,
   2490     since we'll never use another normalization within this function, we
   2491     know that fcdposition points to a base character. the normalization buffer
   2492     never change, hence this revert works. */
   2493     loadState(source, &discState, TRUE);
   2494     goBackOne(source);
   2495 
   2496     //source->pos   = temppos - 1;
   2497     source->flags = tempflags;
   2498     return *(coll->contractionCEs + (constart - coll->contractionIndex));
   2499 }
   2500 
   2501 /* now uses Mark's getImplicitPrimary code */
   2502 static
   2503 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
   2504     uint32_t r = uprv_uca_getImplicitPrimary(cp);
   2505     *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
   2506     collationSource->offsetRepeatCount += 1;
   2507     return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
   2508 }
   2509 
   2510 /**
   2511 * Inserts the argument character into the front of the buffer replacing the
   2512 * front null terminator.
   2513 * @param data collation element iterator data
   2514 * @param ch character to be appended
   2515 */
   2516 static
   2517 inline void insertBufferFront(collIterate *data, UChar ch)
   2518 {
   2519     data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
   2520 }
   2521 
   2522 /**
   2523 * Special normalization function for contraction in the previous iterator.
   2524 * This normalization sequence will place the current character at source->pos
   2525 * and its following normalized sequence into the buffer.
   2526 * The fcd position, pos will be changed.
   2527 * pos will now point to positions in the buffer.
   2528 * Flags will be changed accordingly.
   2529 * @param data collation iterator data
   2530 */
   2531 static
   2532 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
   2533 {
   2534     const UChar *pEnd = data->pos + 1;         /* End normalize + 1 */
   2535     const UChar *pStart;
   2536 
   2537     UnicodeString endOfBuffer;
   2538     if (data->flags & UCOL_ITER_HASLEN) {
   2539         /*
   2540         normalization buffer not used yet, we'll pull down the next
   2541         character into the end of the buffer
   2542         */
   2543         endOfBuffer.setTo(*pEnd);
   2544     }
   2545     else {
   2546         endOfBuffer.setTo(data->writableBuffer, 1);  // after the leading NUL
   2547     }
   2548 
   2549     if (data->fcdPosition == NULL) {
   2550         pStart = data->string;
   2551     }
   2552     else {
   2553         pStart = data->fcdPosition + 1;
   2554     }
   2555     int32_t normLen =
   2556         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
   2557                              data->writableBuffer,
   2558                              *status).
   2559         length();
   2560     if(U_FAILURE(*status)) {
   2561         return;
   2562     }
   2563     /*
   2564     this puts the null termination infront of the normalized string instead
   2565     of the end
   2566     */
   2567     data->pos =
   2568         data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
   2569         1 + normLen;
   2570     data->origFlags  = data->flags;
   2571     data->flags     |= UCOL_ITER_INNORMBUF;
   2572     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2573 }
   2574 
   2575 /**
   2576 * Contraction character management function that returns the previous character
   2577 * for the backwards iterator.
   2578 * Does nothing if the previous character is in buffer and not the first
   2579 * character in it.
   2580 * Else it checks previous character in data string to see if it is
   2581 * normalizable.
   2582 * If it is not, the character is simply copied into the buffer, else
   2583 * the whole normalized substring is copied into the buffer, including the
   2584 * current character.
   2585 * @param data collation element iterator data
   2586 * @return previous character
   2587 */
   2588 static
   2589 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
   2590 {
   2591     UChar  prevch;
   2592     UChar  ch;
   2593     const UChar *start;
   2594     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
   2595     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
   2596         (innormbuf && *(data->pos - 1) != 0)) {
   2597         /*
   2598         if no normalization.
   2599         if previous character is in normalized buffer, no further normalization
   2600         is required
   2601         */
   2602       if(data->flags & UCOL_USE_ITERATOR) {
   2603         data->iterator->move(data->iterator, -1, UITER_CURRENT);
   2604         return (UChar)data->iterator->next(data->iterator);
   2605       } else {
   2606         return *(data->pos - 1);
   2607       }
   2608     }
   2609 
   2610     start = data->pos;
   2611     if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
   2612         /* in data string */
   2613         if ((start - 1) == data->string) {
   2614             return *(start - 1);
   2615         }
   2616         start --;
   2617         ch     = *start;
   2618         prevch = *(start - 1);
   2619     }
   2620     else {
   2621         /*
   2622         in writable buffer, at this point fcdPosition can not be NULL.
   2623         see contracting tag.
   2624         */
   2625         if (data->fcdPosition == data->string) {
   2626             /* at the start of the string, just dump it into the normalizer */
   2627             insertBufferFront(data, *(data->fcdPosition));
   2628             data->fcdPosition = NULL;
   2629             return *(data->pos - 1);
   2630         }
   2631         start  = data->fcdPosition;
   2632         ch     = *start;
   2633         prevch = *(start - 1);
   2634     }
   2635     /*
   2636     * if the current character is not fcd.
   2637     * Trailing combining class == 0.
   2638     */
   2639     if (data->fcdPosition > start &&
   2640        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
   2641     {
   2642         /*
   2643         Need a more complete FCD check and possible normalization.
   2644         normalize substring will be appended to buffer
   2645         */
   2646         const UChar *backuppos = data->pos;
   2647         data->pos = start;
   2648         if (collPrevIterFCD(data)) {
   2649             normalizePrevContraction(data, status);
   2650             return *(data->pos - 1);
   2651         }
   2652         data->pos = backuppos;
   2653         data->fcdPosition ++;
   2654     }
   2655 
   2656     if (innormbuf) {
   2657     /*
   2658     no normalization is to be done hence only one character will be
   2659     appended to the buffer.
   2660     */
   2661         insertBufferFront(data, ch);
   2662         data->fcdPosition --;
   2663     }
   2664 
   2665     return ch;
   2666 }
   2667 
   2668 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
   2669 /* It is called by getNextCE */
   2670 
/* The following should be even */
/* NOTE(review): the reason for the evenness requirement is not visible next to
   this definition; presumably the code that uses this limit handles digits in
   pairs -- confirm against that code before changing the value. */
#define UCOL_MAX_DIGITS_FOR_NUMBER 254
   2673 
   2674 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
   2675     collIterateState entryState;
   2676     backupState(source, &entryState);
   2677     UChar32 cp = ch;
   2678 
   2679     for (;;) {
   2680         // This loop will repeat only in the case of contractions, and only when a contraction
   2681         //   is found and the first CE resulting from that contraction is itself a special
   2682         //   (an expansion, for example.)  All other special CE types are fully handled the
   2683         //   first time through, and the loop exits.
   2684 
   2685         const uint32_t *CEOffset = NULL;
   2686         switch(getCETag(CE)) {
   2687         case NOT_FOUND_TAG:
   2688             /* This one is not found, and we'll let somebody else bother about it... no more games */
   2689             return CE;
   2690         case SPEC_PROC_TAG:
   2691             {
   2692                 // Special processing is getting a CE that is preceded by a certain prefix
   2693                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   2694                 // When we encouter a special processing tag, we go backwards and try to see if
   2695                 // we have a match.
   2696                 // Contraction tables are used - so the whole process is not unlike contraction.
   2697                 // prefix data is stored backwards in the table.
   2698                 const UChar *UCharOffset;
   2699                 UChar schar, tchar;
   2700                 collIterateState prefixState;
   2701                 backupState(source, &prefixState);
   2702                 loadState(source, &entryState, TRUE);
   2703                 goBackOne(source); // We want to look at the point where we entered - actually one
   2704                 // before that...
   2705 
   2706                 for(;;) {
   2707                     // This loop will run once per source string character, for as long as we
   2708                     //  are matching a potential contraction sequence
   2709 
   2710                     // First we position ourselves at the begining of contraction sequence
   2711                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2712                     if (collIter_bos(source)) {
   2713                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2714                         break;
   2715                     }
   2716                     schar = getPrevNormalizedChar(source, status);
   2717                     goBackOne(source);
   2718 
   2719                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2720                         UCharOffset++;
   2721                     }
   2722 
   2723                     if (schar == tchar) {
   2724                         // Found the source string char in the table.
   2725                         //  Pick up the corresponding CE from the table.
   2726                         CE = *(coll->contractionCEs +
   2727                             (UCharOffset - coll->contractionIndex));
   2728                     }
   2729                     else
   2730                     {
   2731                         // Source string char was not in the table.
   2732                         //   We have not found the prefix.
   2733                         CE = *(coll->contractionCEs +
   2734                             (ContractionStart - coll->contractionIndex));
   2735                     }
   2736 
   2737                     if(!isPrefix(CE)) {
   2738                         // The source string char was in the contraction table, and the corresponding
   2739                         //   CE is not a prefix CE.  We found the prefix, break
   2740                         //   out of loop, this CE will end up being returned.  This is the normal
   2741                         //   way out of prefix handling when the source actually contained
   2742                         //   the prefix.
   2743                         break;
   2744                     }
   2745                 }
   2746                 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
   2747                     loadState(source, &prefixState, TRUE);
   2748                     if(source->origFlags & UCOL_USE_ITERATOR) {
   2749                         source->flags = source->origFlags;
   2750                     }
   2751                 } else { // prefix search was a failure, we have to backup all the way to the start
   2752                     loadState(source, &entryState, TRUE);
   2753                 }
   2754                 break;
   2755             }
   2756         case CONTRACTION_TAG:
   2757             {
   2758                 /* This should handle contractions */
   2759                 collIterateState state;
   2760                 backupState(source, &state);
   2761                 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
   2762                 const UChar *UCharOffset;
   2763                 UChar schar, tchar;
   2764 
   2765                 for (;;) {
   2766                     /* This loop will run once per source string character, for as long as we     */
   2767                     /*  are matching a potential contraction sequence                  */
   2768 
   2769                     /* First we position ourselves at the begining of contraction sequence */
   2770                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2771 
   2772                     if (collIter_eos(source)) {
   2773                         // Ran off the end of the source string.
   2774                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2775                         // So we'll pick whatever we have at the point...
   2776                         if (CE == UCOL_NOT_FOUND) {
   2777                             // back up the source over all the chars we scanned going into this contraction.
   2778                             CE = firstCE;
   2779                             loadState(source, &state, TRUE);
   2780                             if(source->origFlags & UCOL_USE_ITERATOR) {
   2781                                 source->flags = source->origFlags;
   2782                             }
   2783                         }
   2784                         break;
   2785                     }
   2786 
   2787                     uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
   2788                     uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
   2789 
   2790                     schar = getNextNormalizedChar(source);
   2791                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2792                         UCharOffset++;
   2793                     }
   2794 
   2795                     if (schar == tchar) {
   2796                         // Found the source string char in the contraction table.
   2797                         //  Pick up the corresponding CE from the table.
   2798                         CE = *(coll->contractionCEs +
   2799                             (UCharOffset - coll->contractionIndex));
   2800                     }
   2801                     else
   2802                     {
   2803                         // Source string char was not in contraction table.
   2804                         //   Unless we have a discontiguous contraction, we have finished
   2805                         //   with this contraction.
   2806                         // in order to do the proper detection, we
   2807                         // need to see if we're dealing with a supplementary
   2808                         /* We test whether the next two char are surrogate pairs.
   2809                         * This test is done if the iterator is not NULL.
   2810                         * If there is no surrogate pair, the iterator
   2811                         * goes back one if needed. */
   2812                         UChar32 miss = schar;
   2813                         if (source->iterator) {
   2814                             UChar32 surrNextChar; /* the next char in the iteration to test */
   2815                             int32_t prevPos; /* holds the previous position before move forward of the source iterator */
   2816                             if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
   2817                                 prevPos = source->iterator->index;
   2818                                 surrNextChar = getNextNormalizedChar(source);
   2819                                 if (U16_IS_TRAIL(surrNextChar)) {
   2820                                     miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
   2821                                 } else if (prevPos < source->iterator->index){
   2822                                     goBackOne(source);
   2823                                 }
   2824                             }
   2825                         } else if (U16_IS_LEAD(schar)) {
   2826                             miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
   2827                         }
   2828 
   2829                         uint8_t sCC;
   2830                         if (miss < 0x300 ||
   2831                             maxCC == 0 ||
   2832                             (sCC = i_getCombiningClass(miss, coll)) == 0 ||
   2833                             sCC>maxCC ||
   2834                             (allSame != 0 && sCC == maxCC) ||
   2835                             collIter_eos(source))
   2836                         {
   2837                             //  Contraction can not be discontiguous.
   2838                             goBackOne(source);  // back up the source string by one,
   2839                             //  because  the character we just looked at was
   2840                             //  not part of the contraction.   */
   2841                             if(U_IS_SUPPLEMENTARY(miss)) {
   2842                                 goBackOne(source);
   2843                             }
   2844                             CE = *(coll->contractionCEs +
   2845                                 (ContractionStart - coll->contractionIndex));
   2846                         } else {
   2847                             //
   2848                             // Contraction is possibly discontiguous.
   2849                             //   Scan more of source string looking for a match
   2850                             //
   2851                             UChar tempchar;
   2852                             /* find the next character if schar is not a base character
   2853                             and we are not yet at the end of the string */
   2854                             tempchar = getNextNormalizedChar(source);
   2855                             // probably need another supplementary thingie here
   2856                             goBackOne(source);
   2857                             if (i_getCombiningClass(tempchar, coll) == 0) {
   2858                                 goBackOne(source);
   2859                                 if(U_IS_SUPPLEMENTARY(miss)) {
   2860                                     goBackOne(source);
   2861                                 }
   2862                                 /* Spit out the last char of the string, wasn't tasty enough */
   2863                                 CE = *(coll->contractionCEs +
   2864                                     (ContractionStart - coll->contractionIndex));
   2865                             } else {
   2866                                 CE = getDiscontiguous(coll, source, ContractionStart);
   2867                             }
   2868                         }
   2869                     } // else after if(schar == tchar)
   2870 
   2871                     if(CE == UCOL_NOT_FOUND) {
   2872                         /* The Source string did not match the contraction that we were checking.  */
   2873                         /*  Back up the source position to undo the effects of having partially    */
   2874                         /*   scanned through what ultimately proved to not be a contraction.       */
   2875                         loadState(source, &state, TRUE);
   2876                         CE = firstCE;
   2877                         break;
   2878                     }
   2879 
   2880                     if(!isContraction(CE)) {
   2881                         // The source string char was in the contraction table, and the corresponding
   2882                         //   CE is not a contraction CE.  We completed the contraction, break
   2883                         //   out of loop, this CE will end up being returned.  This is the normal
   2884                         //   way out of contraction handling when the source actually contained
   2885                         //   the contraction.
   2886                         break;
   2887                     }
   2888 
   2889 
   2890                     // The source string char was in the contraction table, and the corresponding
   2891                     //   CE is IS  a contraction CE.  We will continue looping to check the source
   2892                     //   string for the remaining chars in the contraction.
   2893                     uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
   2894                     if(tempCE != UCOL_NOT_FOUND) {
   2895                         // We have scanned a a section of source string for which there is a
   2896                         //  CE from the contraction table.  Remember the CE and scan position, so
   2897                         //  that we can return to this point if further scanning fails to
   2898                         //  match a longer contraction sequence.
   2899                         firstCE = tempCE;
   2900 
   2901                         goBackOne(source);
   2902                         backupState(source, &state);
   2903                         getNextNormalizedChar(source);
   2904 
   2905                         // Another way to do this is:
   2906                         //collIterateState tempState;
   2907                         //backupState(source, &tempState);
   2908                         //goBackOne(source);
   2909                         //backupState(source, &state);
   2910                         //loadState(source, &tempState, TRUE);
   2911 
   2912                         // The problem is that for incomplete contractions we have to remember the previous
   2913                         // position. Before, the only thing I needed to do was state.pos--;
   2914                         // After iterator introduction and especially after introduction of normalizing
   2915                         // iterators, it became much more difficult to decrease the saved state.
   2916                         // I'm not yet sure which of the two methods above is faster.
   2917                     }
   2918                 } // for(;;)
   2919                 break;
   2920             } // case CONTRACTION_TAG:
   2921         case LONG_PRIMARY_TAG:
   2922             {
   2923                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   2924                 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   2925                 source->offsetRepeatCount += 1;
   2926                 return CE;
   2927             }
   2928         case EXPANSION_TAG:
   2929             {
   2930                 /* This should handle expansion. */
   2931                 /* NOTE: we can encounter both continuations and expansions in an expansion! */
   2932                 /* I have to decide where continuations are going to be dealt with */
   2933                 uint32_t size;
   2934                 uint32_t i;    /* general counter */
   2935 
   2936                 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   2937                 size = getExpansionCount(CE);
   2938                 CE = *CEOffset++;
   2939               //source->offsetRepeatCount = -1;
   2940 
   2941                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   2942                     for(i = 1; i<size; i++) {
   2943                         *(source->CEpos++) = *CEOffset++;
   2944                         source->offsetRepeatCount += 1;
   2945                     }
   2946                 } else { /* else, we do */
   2947                     while(*CEOffset != 0) {
   2948                         *(source->CEpos++) = *CEOffset++;
   2949                         source->offsetRepeatCount += 1;
   2950                     }
   2951                 }
   2952 
   2953                 return CE;
   2954             }
   2955         case DIGIT_TAG:
   2956             {
   2957                 /*
   2958                 We do a check to see if we want to collate digits as numbers; if so we generate
   2959                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   2960                 */
   2961                 //uint32_t size;
   2962                 uint32_t i;    /* general counter */
   2963 
   2964                 if (source->coll->numericCollation == UCOL_ON){
   2965                     collIterateState digitState = {0,0,0,0,0,0,0,0,0};
   2966                     UChar32 char32 = 0;
   2967                     int32_t digVal = 0;
   2968 
   2969                     uint32_t digIndx = 0;
   2970                     uint32_t endIndex = 0;
   2971                     uint32_t trailingZeroIndex = 0;
   2972 
   2973                     uint8_t collateVal = 0;
   2974 
   2975                     UBool nonZeroValReached = FALSE;
   2976 
   2977                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
   2978                     /*
   2979                          We parse the source string until we hit a char that's NOT a digit.
   2980                         Use this u_charDigitValue. This might be slow because we have to
   2981                         handle surrogates...
   2982                     */
   2983             /*
   2984                     if (U16_IS_LEAD(ch)){
   2985                       if (!collIter_eos(source)) {
   2986                         backupState(source, &digitState);
   2987                         UChar trail = getNextNormalizedChar(source);
   2988                         if(U16_IS_TRAIL(trail)) {
   2989                           char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   2990                         } else {
   2991                           loadState(source, &digitState, TRUE);
   2992                           char32 = ch;
   2993                         }
   2994                       } else {
   2995                         char32 = ch;
   2996                       }
   2997                     } else {
   2998                       char32 = ch;
   2999                     }
   3000                     digVal = u_charDigitValue(char32);
   3001             */
   3002                     digVal = u_charDigitValue(cp); // if we have arrived here, we have
   3003                     // already processed possible supplementaries that trigered the digit tag -
   3004                     // all supplementaries are marked in the UCA.
   3005                     /*
   3006                         We  pad a zero in front of the first element anyways. This takes
   3007                         care of the (probably) most common case where people are sorting things followed
   3008                         by a single digit
   3009                     */
   3010                     digIndx++;
   3011                     for(;;){
   3012                         // Make sure we have enough space. No longer needed;
   3013                         // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
   3014                         // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
   3015                         // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
   3016 
   3017                         // Skipping over leading zeroes.
   3018                         if (digVal != 0) {
   3019                             nonZeroValReached = TRUE;
   3020                         }
   3021                         if (nonZeroValReached) {
   3022                             /*
   3023                             We parse the digit string into base 100 numbers (this fits into a byte).
   3024                             We only add to the buffer in twos, thus if we are parsing an odd character,
   3025                             that serves as the 'tens' digit while the if we are parsing an even one, that
   3026                             is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
   3027                             a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
   3028                             overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3029                             than all the other bytes.
   3030                             */
   3031 
   3032                             if (digIndx % 2 == 1){
   3033                                 collateVal += (uint8_t)digVal;
   3034 
   3035                                 // We don't enter the low-order-digit case unless we've already seen
   3036                                 // the high order, or for the first digit, which is always non-zero.
   3037                                 if (collateVal != 0)
   3038                                     trailingZeroIndex = 0;
   3039 
   3040                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3041                                 collateVal = 0;
   3042                             }
   3043                             else{
   3044                                 // We drop the collation value into the buffer so if we need to do
   3045                                 // a "front patch" we don't have to check to see if we're hitting the
   3046                                 // last element.
   3047                                 collateVal = (uint8_t)(digVal * 10);
   3048 
   3049                                 // Check for trailing zeroes.
   3050                                 if (collateVal == 0)
   3051                                 {
   3052                                     if (!trailingZeroIndex)
   3053                                         trailingZeroIndex = (digIndx/2) + 2;
   3054                                 }
   3055                                 else
   3056                                     trailingZeroIndex = 0;
   3057 
   3058                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3059                             }
   3060                             digIndx++;
   3061                         }
   3062 
   3063                         // Get next character.
   3064                         if (!collIter_eos(source)){
   3065                             ch = getNextNormalizedChar(source);
   3066                             if (U16_IS_LEAD(ch)){
   3067                                 if (!collIter_eos(source)) {
   3068                                     backupState(source, &digitState);
   3069                                     UChar trail = getNextNormalizedChar(source);
   3070                                     if(U16_IS_TRAIL(trail)) {
   3071                                         char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   3072                                     } else {
   3073                                         loadState(source, &digitState, TRUE);
   3074                                         char32 = ch;
   3075                                     }
   3076                                 }
   3077                             } else {
   3078                                 char32 = ch;
   3079                             }
   3080 
   3081                             if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
   3082                                 // Resetting position to point to the next unprocessed char. We
   3083                                 // overshot it when doing our test/set for numbers.
   3084                                 if (char32 > 0xFFFF) { // For surrogates.
   3085                                     loadState(source, &digitState, TRUE);
   3086                                     //goBackOne(source);
   3087                                 }
   3088                                 goBackOne(source);
   3089                                 break;
   3090                             }
   3091                         } else {
   3092                             break;
   3093                         }
   3094                     }
   3095 
   3096                     if (nonZeroValReached == FALSE){
   3097                         digIndx = 2;
   3098                         numTempBuf[2] = 6;
   3099                     }
   3100 
   3101                     endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
   3102                     if (digIndx % 2 != 0){
   3103                         /*
   3104                         We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
   3105                         we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
   3106                         Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
   3107                         single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
   3108                         */
   3109 
   3110                         for(i = 2; i < endIndex; i++){
   3111                             numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
   3112                                 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
   3113                         }
   3114                         --digIndx;
   3115                     }
   3116 
   3117                     // Subtract one off of the last byte.
   3118                     numTempBuf[endIndex-1] -= 1;
   3119 
   3120                     /*
   3121                     We want to skip over the first two slots in the buffer. The first slot
   3122                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3123                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3124                     */
   3125                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3126                     numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
   3127 
   3128                     // Now transfer the collation key to our collIterate struct.
   3129                     // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
   3130                     //size = ((endIndex+1) & ~1)/2;
   3131                     CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3132                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3133                         UCOL_BYTE_COMMON; // Tertiary weight.
   3134                     i = 2; // Reset the index into the buffer.
   3135                     while(i < endIndex)
   3136                     {
   3137                         uint32_t primWeight = numTempBuf[i++] << 8;
   3138                         if ( i < endIndex)
   3139                             primWeight |= numTempBuf[i++];
   3140                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3141                     }
   3142 
   3143                 } else {
   3144                     // no numeric mode, we'll just switch to whatever we stashed and continue
   3145                     CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   3146                     CE = *CEOffset++;
   3147                     break;
   3148                 }
   3149                 return CE;
   3150             }
   3151             /* various implicits optimization */
   3152         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   3153             /* UCA is filled with these. Tailorings are NOT_FOUND */
   3154             return getImplicit(cp, source);
   3155         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   3156             // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
   3157             return getImplicit(cp, source);
   3158         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3159             {
   3160                 static const uint32_t
   3161                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3162                 //const uint32_t LCount = 19;
   3163                 static const uint32_t VCount = 21;
   3164                 static const uint32_t TCount = 28;
   3165                 //const uint32_t NCount = VCount * TCount;   // 588
   3166                 //const uint32_t SCount = LCount * NCount;   // 11172
   3167                 uint32_t L = ch - SBase;
   3168 
   3169                 // divide into pieces
   3170 
   3171                 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
   3172                 L /= TCount;
   3173                 uint32_t V = L % VCount;
   3174                 L /= VCount;
   3175 
   3176                 // offset them
   3177 
   3178                 L += LBase;
   3179                 V += VBase;
   3180                 T += TBase;
   3181 
   3182                 // return the first CE, but first put the rest into the expansion buffer
   3183                 if (!source->coll->image->jamoSpecial) { // FAST PATH
   3184 
   3185                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   3186                     if (T != TBase) {
   3187                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   3188                     }
   3189 
   3190                     return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   3191 
   3192                 } else { // Jamo is Special
   3193                     // Since Hanguls pass the FCD check, it is
   3194                     // guaranteed that we won't be in
   3195                     // the normalization buffer if something like this happens
   3196                     // However, if we are using a uchar iterator and normalization
   3197                     // is ON, the Hangul that lead us here is going to be in that
   3198                     // normalization buffer. Here we want to restore the uchar
   3199                     // iterator state and pull out of the normalization buffer
   3200                     if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
   3201                         source->flags = source->origFlags; // restore the iterator
   3202                         source->pos = NULL;
   3203                     }
   3204                     // Move Jamos into normalization buffer
   3205                     UChar *buffer = source->writableBuffer.getBuffer(4);
   3206                     int32_t bufferLength;
   3207                     buffer[0] = (UChar)L;
   3208                     buffer[1] = (UChar)V;
   3209                     if (T != TBase) {
   3210                         buffer[2] = (UChar)T;
   3211                         bufferLength = 3;
   3212                     } else {
   3213                         bufferLength = 2;
   3214                     }
   3215                     source->writableBuffer.releaseBuffer(bufferLength);
   3216 
   3217                     source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
   3218                     //   after exhausting the writableBuffer
   3219                     source->pos   = source->writableBuffer.getTerminatedBuffer();
   3220                     source->origFlags   = source->flags;
   3221                     source->flags       |= UCOL_ITER_INNORMBUF;
   3222                     source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   3223 
   3224                     return(UCOL_IGNORABLE);
   3225                 }
   3226             }
   3227         case SURROGATE_TAG:
   3228             /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
   3229             /* two things can happen here: next code point can be a trailing surrogate - we will use it */
   3230             /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
   3231             /* we treat it like an unassigned code point. */
   3232             {
   3233                 UChar trail;
   3234                 collIterateState state;
   3235                 backupState(source, &state);
   3236                 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
   3237                     // we chould have stepped one char forward and it might have turned that it
   3238                     // was not a trail surrogate. In that case, we have to backup.
   3239                     loadState(source, &state, TRUE);
   3240                     return UCOL_NOT_FOUND;
   3241                 } else {
   3242                     /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
   3243                     CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
   3244                     if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
   3245                         // We need to backup
   3246                         loadState(source, &state, TRUE);
   3247                         return CE;
   3248                     }
   3249                     // calculate the supplementary code point value, if surrogate was not tailored
   3250                     cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   3251                 }
   3252             }
   3253             break;
   3254         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   3255             UChar nextChar;
   3256             if( source->flags & UCOL_USE_ITERATOR) {
   3257                 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
   3258                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3259                     source->iterator->next(source->iterator);
   3260                     return getImplicit(cp, source);
   3261                 }
   3262             } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
   3263                       U_IS_TRAIL((nextChar=*source->pos))) {
   3264                 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3265                 source->pos++;
   3266                 return getImplicit(cp, source);
   3267             }
   3268             return UCOL_NOT_FOUND;
   3269         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   3270             return UCOL_NOT_FOUND; /* broken surrogate sequence */
   3271         case CHARSET_TAG:
   3272             /* not yet implemented */
   3273             /* probably after 1.8 */
   3274             return UCOL_NOT_FOUND;
   3275         default:
   3276             *status = U_INTERNAL_PROGRAM_ERROR;
   3277             CE=0;
   3278             break;
   3279     }
   3280     if (CE <= UCOL_NOT_FOUND) break;
   3281   }
   3282   return CE;
   3283 }
   3284 
   3285 
   3286 /* now uses Mark's getImplicitPrimary code */
        /*
         * Produces the two collation elements for an implicitly weighted code point.
         *
         * The 32-bit value r obtained from uprv_uca_getImplicitPrimary(cp) is split
         * across two CEs:
         *   - (r & UCOL_PRIMARYMASK) | 0x00000505 is written into the CE buffer at
         *     CEpos (which is advanced), and toReturn is set to the new CEpos;
         *   - ((r & 0x0000FFFF) << 16) | 0x000000C0 is returned to the caller.
         *
         * NOTE(review): 0x0505 looks like common secondary/tertiary bytes and 0xC0
         * looks like the continuation marker - confirm against the UCOL_* constants.
         * NOTE(review): judging by the name, this is the backward-iteration
         * counterpart of getImplicit - confirm against the callers.
         *
         * Besides the CE buffer, the function also updates the offset bookkeeping
         * fields of the iterator (offsetRepeatCount, or offsetStore / offsetReturn /
         * offsetBuffer).
         *
         * @param cp               code point passed to uprv_uca_getImplicitPrimary()
         * @param collationSource  iterator state; its CEpos, toReturn and offset
         *                         fields are modified
         * @return the CE built from the low 16 bits of r, as described above
         */
   3287 static
   3288 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
   3289     uint32_t r = uprv_uca_getImplicitPrimary(cp);
   3290 
            // Store the CE built from the masked upper part of r and make toReturn
            // point just past it.
   3291     *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
   3292     collationSource->toReturn = collationSource->CEpos;
   3293 
   3294     // **** doesn't work if using iterator ****
            // (The else branch computes an offset as pos - string, i.e. it relies on
            // the string pointers of the collIterate.)
   3295     if (collationSource->flags & UCOL_ITER_INNORMBUF) {
                // UCOL_ITER_INNORMBUF is set: only the repeat count is updated.
   3296         collationSource->offsetRepeatCount = 1;
   3297     } else {
                // Offset of the current position relative to the start of the string.
   3298         int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
   3299 
                // Append two consecutive offsets: firstOffset and firstOffset + 1.
                // NOTE(review): errorCode is never examined after these calls; if
                // appendOffset() can fail, the pointer updates that follow run
                // regardless - confirm that this is safe.
   3300         UErrorCode errorCode = U_ZERO_ERROR;
   3301         collationSource->appendOffset(firstOffset, errorCode);
   3302         collationSource->appendOffset(firstOffset + 1, errorCode);
   3303 
                // offsetReturn is set one element below offsetStore, and the first
                // element of offsetBuffer is overwritten with firstOffset.
   3304         collationSource->offsetReturn = collationSource->offsetStore - 1;
   3305         *(collationSource->offsetBuffer) = firstOffset;
                // If offsetReturn ended up at the first buffer element, offsetStore is
                // reset to the start of the buffer as well.
   3306         if (collationSource->offsetReturn == collationSource->offsetBuffer) {
   3307             collationSource->offsetStore = collationSource->offsetBuffer;
   3308         }
   3309     }
   3310 
            // The low 16 bits of r, shifted into the upper half, combined with 0xC0.
   3311     return ((r & 0x0000FFFF)<<16) | 0x000000C0;
   3312 }
   3313 
   3314 /**
   3315  * This function handles the special CEs like contractions, expansions,
   3316  * surrogates, Thai.
   3317  * It is called by getPrevCE.
   3318  */
   3319 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
   3320                           collIterate *source,
   3321                           UErrorCode *status)
   3322 {
   3323     const uint32_t *CEOffset    = NULL;
   3324           UChar    *UCharOffset = NULL;
   3325           UChar    schar;
   3326     const UChar    *constart    = NULL;
   3327           uint32_t size;
   3328           UChar    buffer[UCOL_MAX_BUFFER];
   3329           uint32_t *endCEBuffer;
   3330           UChar   *strbuffer;
   3331           int32_t noChars = 0;
   3332           int32_t CECount = 0;
   3333 
   3334     for(;;)
   3335     {
   3336         /* the only ces that loops are thai and contractions */
   3337         switch (getCETag(CE))
   3338         {
   3339         case NOT_FOUND_TAG:  /* this tag always returns */
   3340             return CE;
   3341 
   3342         case SPEC_PROC_TAG:
   3343             {
   3344                 // Special processing is getting a CE that is preceded by a certain prefix
   3345                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   3346                 // When we encouter a special processing tag, we go backwards and try to see if
   3347                 // we have a match.
   3348                 // Contraction tables are used - so the whole process is not unlike contraction.
   3349                 // prefix data is stored backwards in the table.
   3350                 const UChar *UCharOffset;
   3351                 UChar schar, tchar;
   3352                 collIterateState prefixState;
   3353                 backupState(source, &prefixState);
   3354                 for(;;) {
   3355                     // This loop will run once per source string character, for as long as we
   3356                     //  are matching a potential contraction sequence
   3357 
   3358                     // First we position ourselves at the begining of contraction sequence
   3359                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   3360 
   3361                     if (collIter_bos(source)) {
   3362                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   3363                         break;
   3364                     }
   3365                     schar = getPrevNormalizedChar(source, status);
   3366                     goBackOne(source);
   3367 
   3368                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   3369                         UCharOffset++;
   3370                     }
   3371 
   3372                     if (schar == tchar) {
   3373                         // Found the source string char in the table.
   3374                         //  Pick up the corresponding CE from the table.
   3375                         CE = *(coll->contractionCEs +
   3376                             (UCharOffset - coll->contractionIndex));
   3377                     }
   3378                     else
   3379                     {
   3380                         // if there is a completely ignorable code point in the middle of
   3381                         // a prefix, we need to act as if it's not there
   3382                         // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
   3383                         // lone surrogates cannot be set to zero as it would break other processing
   3384                         uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   3385                         // it's easy for BMP code points
   3386                         if(isZeroCE == 0) {
   3387                             continue;
   3388                         } else if(U16_IS_SURROGATE(schar)) {
   3389                             // for supplementary code points, we have to check the next one
   3390                             // situations where we are going to ignore
   3391                             // 1. beginning of the string: schar is a lone surrogate
   3392                             // 2. schar is a lone surrogate
   3393                             // 3. schar is a trail surrogate in a valid surrogate sequence
   3394                             //    that is explicitly set to zero.
   3395                             if (!collIter_bos(source)) {
   3396                                 UChar lead;
   3397                                 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
   3398                                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
   3399                                     if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
   3400                                         uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
   3401                                         if(finalCE == 0) {
   3402                                             // this is a real, assigned completely ignorable code point
   3403                                             goBackOne(source);
   3404                                             continue;
   3405                                         }
   3406                                     }
   3407                                 } else {
   3408                                     // lone surrogate, treat like unassigned
   3409                                     return UCOL_NOT_FOUND;
   3410                                 }
   3411                             } else {
   3412                                 // lone surrogate at the beginning, treat like unassigned
   3413                                 return UCOL_NOT_FOUND;
   3414                             }
   3415                         }
   3416                         // Source string char was not in the table.
   3417                         //   We have not found the prefix.
   3418                         CE = *(coll->contractionCEs +
   3419                             (ContractionStart - coll->contractionIndex));
   3420                     }
   3421 
   3422                     if(!isPrefix(CE)) {
   3423                         // The source string char was in the contraction table, and the corresponding
   3424                         //   CE is not a prefix CE.  We found the prefix, break
   3425                         //   out of loop, this CE will end up being returned.  This is the normal
   3426                         //   way out of prefix handling when the source actually contained
   3427                         //   the prefix.
   3428                         break;
   3429                     }
   3430                 }
   3431                 loadState(source, &prefixState, TRUE);
   3432                 break;
   3433             }
   3434 
   3435         case CONTRACTION_TAG: {
   3436             /* to ensure that the backwards and forwards iteration matches, we
   3437             take the current region of most possible match and pass it through
   3438             the forward iteration. this will ensure that the obstinate problem of
   3439             overlapping contractions will not occur.
   3440             */
   3441             schar = peekCodeUnit(source, 0);
   3442             constart = (UChar *)coll->image + getContractOffset(CE);
   3443             if (isAtStartPrevIterate(source)
   3444                 /* commented away contraction end checks after adding the checks
   3445                 in getPrevCE  */) {
   3446                     /* start of string or this is not the end of any contraction */
   3447                     CE = *(coll->contractionCEs +
   3448                         (constart - coll->contractionIndex));
   3449                     break;
   3450             }
   3451             strbuffer = buffer;
   3452             UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
   3453             *(UCharOffset --) = 0;
   3454             noChars = 0;
   3455             // have to swap thai characters
   3456             while (ucol_unsafeCP(schar, coll)) {
   3457                 *(UCharOffset) = schar;
   3458                 noChars++;
   3459                 UCharOffset --;
   3460                 schar = getPrevNormalizedChar(source, status);
   3461                 goBackOne(source);
   3462                 // TODO: when we exhaust the contraction buffer,
   3463                 // it needs to get reallocated. The problem is
   3464                 // that the size depends on the string which is
   3465                 // not iterated over. However, since we're travelling
   3466                 // backwards, we already had to set the iterator at
   3467                 // the end - so we might as well know where we are?
   3468                 if (UCharOffset + 1 == buffer) {
   3469                     /* we have exhausted the buffer */
   3470                     int32_t newsize = 0;
   3471                     if(source->pos) { // actually dealing with a position
   3472                         newsize = (int32_t)(source->pos - source->string + 1);
   3473                     } else { // iterator
   3474                         newsize = 4 * UCOL_MAX_BUFFER;
   3475                     }
   3476                     strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
   3477                         (newsize + UCOL_MAX_BUFFER));
   3478                     /* test for NULL */
   3479                     if (strbuffer == NULL) {
   3480                         *status = U_MEMORY_ALLOCATION_ERROR;
   3481                         return UCOL_NO_MORE_CES;
   3482                     }
   3483                     UCharOffset = strbuffer + newsize;
   3484                     uprv_memcpy(UCharOffset, buffer,
   3485                         UCOL_MAX_BUFFER * sizeof(UChar));
   3486                     UCharOffset --;
   3487                 }
   3488                 if ((source->pos && (source->pos == source->string ||
   3489                     ((source->flags & UCOL_ITER_INNORMBUF) &&
   3490                     *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
   3491                     || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
   3492                         break;
   3493                 }
   3494             }
   3495             /* adds the initial base character to the string */
   3496             *(UCharOffset) = schar;
   3497             noChars++;
   3498 
   3499             int32_t offsetBias;
   3500 
   3501             // **** doesn't work if using iterator ****
   3502             if (source->flags & UCOL_ITER_INNORMBUF) {
   3503                 offsetBias = -1;
   3504             } else {
   3505                 offsetBias = (int32_t)(source->pos - source->string);
   3506             }
   3507 
   3508             /* a new collIterate is used to simplify things, since using the current
   3509             collIterate will mean that the forward and backwards iteration will
   3510             share and change the same buffers. we don't want to get into that. */
   3511             collIterate temp;
   3512             int32_t rawOffset;
   3513 
   3514             IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
   3515             if(U_FAILURE(*status)) {
   3516                 return UCOL_NULLORDER;
   3517             }
   3518             temp.flags &= ~UCOL_ITER_NORM;
   3519             temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
   3520 
   3521             rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
   3522             CE = ucol_IGetNextCE(coll, &temp, status);
   3523 
   3524             if (source->extendCEs) {
   3525                 endCEBuffer = source->extendCEs + source->extendCEsSize;
   3526                 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
   3527             } else {
   3528                 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
   3529                 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
   3530             }
   3531 
   3532             while (CE != UCOL_NO_MORE_CES) {
   3533                 *(source->CEpos ++) = CE;
   3534 
   3535                 if (offsetBias >= 0) {
   3536                     source->appendOffset(rawOffset + offsetBias, *status);
   3537                 }
   3538 
   3539                 CECount++;
   3540                 if (source->CEpos == endCEBuffer) {
   3541                     /* ran out of CE space, reallocate to new buffer.
   3542                     If reallocation fails, reset pointers and bail out,
   3543                     there's no guarantee of the right character position after
   3544                     this bail*/
   3545                     if (!increaseCEsCapacity(source)) {
   3546                         *status = U_MEMORY_ALLOCATION_ERROR;
   3547                         break;
   3548                     }
   3549 
   3550                     endCEBuffer = source->extendCEs + source->extendCEsSize;
   3551                 }
   3552 
   3553                 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
   3554                     rawOffset = (int32_t)(temp.fcdPosition - temp.string);
   3555                 } else {
   3556                     rawOffset = (int32_t)(temp.pos - temp.string);
   3557                 }
   3558 
   3559                 CE = ucol_IGetNextCE(coll, &temp, status);
   3560             }
   3561 
   3562             if (strbuffer != buffer) {
   3563                 uprv_free(strbuffer);
   3564             }
   3565             if (U_FAILURE(*status)) {
   3566                 return (uint32_t)UCOL_NULLORDER;
   3567             }
   3568 
   3569             if (source->offsetRepeatValue != 0) {
   3570                 if (CECount > noChars) {
   3571                     source->offsetRepeatCount += temp.offsetRepeatCount;
   3572                 } else {
   3573                     // **** does this really skip the right offsets? ****
   3574                     source->offsetReturn -= (noChars - CECount);
   3575                 }
   3576             }
   3577 
   3578             if (offsetBias >= 0) {
   3579                 source->offsetReturn = source->offsetStore - 1;
   3580                 if (source->offsetReturn == source->offsetBuffer) {
   3581                     source->offsetStore = source->offsetBuffer;
   3582                 }
   3583             }
   3584 
   3585             source->toReturn = source->CEpos - 1;
   3586             if (source->toReturn == source->CEs) {
   3587                 source->CEpos = source->CEs;
   3588             }
   3589 
   3590             return *(source->toReturn);
   3591         }
   3592         case LONG_PRIMARY_TAG:
   3593             {
   3594                 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   3595                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   3596                 source->toReturn = source->CEpos - 1;
   3597 
   3598                 if (source->flags & UCOL_ITER_INNORMBUF) {
   3599                     source->offsetRepeatCount = 1;
   3600                 } else {
   3601                     int32_t firstOffset = (int32_t)(source->pos - source->string);
   3602 
   3603                     source->appendOffset(firstOffset, *status);
   3604                     source->appendOffset(firstOffset + 1, *status);
   3605 
   3606                     source->offsetReturn = source->offsetStore - 1;
   3607                     *(source->offsetBuffer) = firstOffset;
   3608                     if (source->offsetReturn == source->offsetBuffer) {
   3609                         source->offsetStore = source->offsetBuffer;
   3610                     }
   3611                 }
   3612 
   3613 
   3614                 return *(source->toReturn);
   3615             }
   3616 
   3617         case EXPANSION_TAG: /* this tag always returns */
   3618             {
   3619             /*
   3620             This should handle expansion.
   3621             NOTE: we can encounter both continuations and expansions in an expansion!
   3622             I have to decide where continuations are going to be dealt with
   3623             */
   3624             int32_t firstOffset = (int32_t)(source->pos - source->string);
   3625 
   3626             // **** doesn't work if using iterator ****
   3627             if (source->offsetReturn != NULL) {
   3628                 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
   3629                     source->offsetStore = source->offsetBuffer;
   3630                 }else {
   3631                   firstOffset = -1;
   3632                 }
   3633             }
   3634 
   3635             /* find the offset to expansion table */
   3636             CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3637             size     = getExpansionCount(CE);
   3638             if (size != 0) {
   3639                 /*
   3640                 if there are less than 16 elements in expansion, we don't terminate
   3641                 */
   3642                 uint32_t count;
   3643 
   3644                 for (count = 0; count < size; count++) {
   3645                     *(source->CEpos ++) = *CEOffset++;
   3646 
   3647                     if (firstOffset >= 0) {
   3648                         source->appendOffset(firstOffset + 1, *status);
   3649                     }
   3650                 }
   3651             } else {
   3652                 /* else, we do */
   3653                 while (*CEOffset != 0) {
   3654                     *(source->CEpos ++) = *CEOffset ++;
   3655 
   3656                     if (firstOffset >= 0) {
   3657                         source->appendOffset(firstOffset + 1, *status);
   3658                     }
   3659                 }
   3660             }
   3661 
   3662             if (firstOffset >= 0) {
   3663                 source->offsetReturn = source->offsetStore - 1;
   3664                 *(source->offsetBuffer) = firstOffset;
   3665                 if (source->offsetReturn == source->offsetBuffer) {
   3666                     source->offsetStore = source->offsetBuffer;
   3667                 }
   3668             } else {
   3669                 source->offsetRepeatCount += size - 1;
   3670             }
   3671 
   3672             source->toReturn = source->CEpos - 1;
   3673             // in case of one element expansion, we
   3674             // want to immediately return CEpos
   3675             if(source->toReturn == source->CEs) {
   3676                 source->CEpos = source->CEs;
   3677             }
   3678 
   3679             return *(source->toReturn);
   3680             }
   3681 
   3682         case DIGIT_TAG:
   3683             {
   3684                 /*
   3685                 We do a check to see if we want to collate digits as numbers; if so we generate
   3686                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   3687                 */
   3688                 uint32_t i;    /* general counter */
   3689 
   3690                 if (source->coll->numericCollation == UCOL_ON){
   3691                     uint32_t digIndx = 0;
   3692                     uint32_t endIndex = 0;
   3693                     uint32_t leadingZeroIndex = 0;
   3694                     uint32_t trailingZeroCount = 0;
   3695 
   3696                     uint8_t collateVal = 0;
   3697 
   3698                     UBool nonZeroValReached = FALSE;
   3699 
   3700                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
   3701                     /*
   3702                     We parse the source string until we hit a char that's NOT a digit.
   3703                     Use this u_charDigitValue. This might be slow because we have to
   3704                     handle surrogates...
   3705                     */
   3706                     /*
   3707                     We need to break up the digit string into collation elements of UCOL_MAX_DIGITS_FOR_NUMBER digits or less,
   3708                     with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
   3709                     element we process when going backward. To determine how long that chunk might be, we may need to make
   3710                     two passes through the loop that collects digits - one to see how long the string is (and how much is
   3711                     leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
   3712                     more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
   3713                     element chunk after resetting the state to the initialState at the right side of the digit string.
   3714                     */
   3715                     uint32_t ceLimit = 0;
   3716                     UChar initial_ch = ch;
   3717                     collIterateState initialState = {0,0,0,0,0,0,0,0,0};
   3718                     backupState(source, &initialState);
   3719 
   3720                     for(;;) {
   3721                         collIterateState state = {0,0,0,0,0,0,0,0,0};
   3722                         UChar32 char32 = 0;
   3723                         int32_t digVal = 0;
   3724 
   3725                         if (U16_IS_TRAIL (ch)) {
   3726                             if (!collIter_bos(source)){
   3727                                 UChar lead = getPrevNormalizedChar(source, status);
   3728                                 if(U16_IS_LEAD(lead)) {
   3729                                     char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3730                                     goBackOne(source);
   3731                                 } else {
   3732                                     char32 = ch;
   3733                                 }
   3734                             } else {
   3735                                 char32 = ch;
   3736                             }
   3737                         } else {
   3738                             char32 = ch;
   3739                         }
   3740                         digVal = u_charDigitValue(char32);
   3741 
   3742                         for(;;) {
   3743                             // Make sure we have enough space. No longer needed;
   3744                             // at this point the largest value of digIndx when we need to save data in numTempBuf
   3745                             // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
   3746                             // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
   3747 
   3748                             // Skip over trailing zeroes, and keep a count of them.
   3749                             if (digVal != 0)
   3750                                 nonZeroValReached = TRUE;
   3751 
   3752                             if (nonZeroValReached) {
   3753                                 /*
   3754                                 We parse the digit string into base 100 numbers (this fits into a byte).
   3755                                 We only add to the buffer in twos, thus if we are parsing an odd character,
   3756                                 that serves as the 'tens' digit while the if we are parsing an even one, that
   3757                                 that serves as the 'tens' digit, while if we are parsing an even one, that
   3758                                 is the 'ones' digit. We dump the parsed base 100 value (collateVal) into
   3759                                 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3760                                 than all the other bytes.
   3761 
   3762                                 Since we're doing in this reverse we want to put the first digit encountered into the
   3763                                 Since we're doing this in reverse, we want to put the first digit encountered into the
   3764                                 */
   3765 
   3766                                 if ((digIndx + trailingZeroCount) % 2 == 1) {
   3767                                     // High-order digit case (tens place)
   3768                                     collateVal += (uint8_t)(digVal * 10);
   3769 
   3770                                     // We cannot set leadingZeroIndex unless it has been set for the
   3771                                     // low-order digit. Therefore, all we can do for the high-order
   3772                                     // digit is turn it off, never on.
   3773                                     // The only time we will have a high digit without a low is for
   3774                                     // the very first non-zero digit, so no zero check is necessary.
   3775                                     if (collateVal != 0)
   3776                                         leadingZeroIndex = 0;
   3777 
   3778                                     // The first pass through, digIndx may exceed the limit, but in that case
   3779                                     // we no longer care about numTempBuf contents since they will be discarded
   3780                                     if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
   3781                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3782                                     }
   3783                                     collateVal = 0;
   3784                                 } else {
   3785                                     // Low-order digit case (ones place)
   3786                                     collateVal = (uint8_t)digVal;
   3787 
   3788                                     // Check for leading zeroes.
   3789                                     if (collateVal == 0) {
   3790                                         if (!leadingZeroIndex)
   3791                                             leadingZeroIndex = (digIndx/2) + 2;
   3792                                     } else
   3793                                         leadingZeroIndex = 0;
   3794 
   3795                                     // No need to write to buffer; the case of a last odd digit
   3796                                     // is handled below.
   3797                                 }
   3798                                 ++digIndx;
   3799                             } else
   3800                                 ++trailingZeroCount;
   3801 
   3802                             if (!collIter_bos(source)) {
   3803                                 ch = getPrevNormalizedChar(source, status);
   3804                                 //goBackOne(source);
   3805                                 if (U16_IS_TRAIL(ch)) {
   3806                                     backupState(source, &state);
   3807                                     if (!collIter_bos(source)) {
   3808                                         goBackOne(source);
   3809                                         UChar lead = getPrevNormalizedChar(source, status);
   3810 
   3811                                         if(U16_IS_LEAD(lead)) {
   3812                                             char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3813                                         } else {
   3814                                             loadState(source, &state, FALSE);
   3815                                             char32 = ch;
   3816                                         }
   3817                                     }
   3818                                 } else
   3819                                     char32 = ch;
   3820 
   3821                                 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
   3822                                     if (char32 > 0xFFFF) {// For surrogates.
   3823                                         loadState(source, &state, FALSE);
   3824                                     }
   3825                                     // Don't need to "reverse" the goBackOne call,
   3826                                     // as this points to the next position to process..
   3827                                     //if (char32 > 0xFFFF) // For surrogates.
   3828                                     //getNextNormalizedChar(source);
   3829                                     break;
   3830                                 }
   3831 
   3832                                 goBackOne(source);
   3833                             }else
   3834                                 break;
   3835                         }
   3836 
   3837                         if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
   3838                             // our collation element is not too big, go ahead and finish with it
   3839                             break;
   3840                         }
   3841                         // our digit string is too long for a collation element;
   3842                         // set the limit for it, reset the state and begin again
   3843                         ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
   3844                         if ( ceLimit == 0 ) {
   3845                             ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
   3846                         }
   3847                         ch = initial_ch;
   3848                         loadState(source, &initialState, FALSE);
   3849                         digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
   3850                         collateVal = 0;
   3851                         nonZeroValReached = FALSE;
   3852                     }
   3853 
   3854                     if (! nonZeroValReached) {
   3855                         digIndx = 2;
   3856                         trailingZeroCount = 0;
   3857                         numTempBuf[2] = 6;
   3858                     }
   3859 
   3860                     if ((digIndx + trailingZeroCount) % 2 != 0) {
   3861                         numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
   3862                         digIndx += 1;       // The implicit leading zero
   3863                     }
   3864                     if (trailingZeroCount % 2 != 0) {
   3865                         // We had to consume one trailing zero for the low digit
   3866                         // of the least significant byte
   3867                         digIndx += 1;       // The trailing zero not in the exponent
   3868                         trailingZeroCount -= 1;
   3869                     }
   3870 
   3871                     endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
   3872 
   3873                     // Subtract one off of the last byte. Really the first byte here, but it's reversed...
   3874                     numTempBuf[2] -= 1;
   3875 
   3876                     /*
   3877                     We want to skip over the first two slots in the buffer. The first slot
   3878                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3879                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3880                     The exponent must be adjusted by the number of leading zeroes, and the number of
   3881                     trailing zeroes.
   3882                     */
   3883                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3884                     uint32_t exponent = (digIndx+trailingZeroCount)/2;
   3885                     if (leadingZeroIndex)
   3886                         exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
   3887                     numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
   3888 
   3889                     // Now transfer the collation key to our collIterate struct.
   3890                     // The total size for our collation key is half of endIndex, rounded up.
   3891                     int32_t size = (endIndex+1)/2;
   3892                     if(!ensureCEsCapacity(source, size)) {
   3893                         return UCOL_NULLORDER;
   3894                     }
   3895                     *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3896                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3897                         UCOL_BYTE_COMMON; // Tertiary weight.
   3898                     i = endIndex - 1; // Reset the index into the buffer.
   3899                     while(i >= 2) {
   3900                         uint32_t primWeight = numTempBuf[i--] << 8;
   3901                         if ( i >= 2)
   3902                             primWeight |= numTempBuf[i--];
   3903                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3904                     }
   3905 
   3906                     source->toReturn = source->CEpos -1;
   3907                     return *(source->toReturn);
   3908                 } else {
   3909                     CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3910                     CE = *(CEOffset++);
   3911                     break;
   3912                 }
   3913             }
   3914 
   3915         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3916             {
   3917                 static const uint32_t
   3918                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3919                 //const uint32_t LCount = 19;
   3920                 static const uint32_t VCount = 21;
   3921                 static const uint32_t TCount = 28;
   3922                 //const uint32_t NCount = VCount * TCount;   /* 588 */
   3923                 //const uint32_t SCount = LCount * NCount;   /* 11172 */
   3924 
   3925                 uint32_t L = ch - SBase;
   3926                 /*
   3927                 divide into pieces.
   3928                 we do it in this order since some compilers can do % and / in one
   3929                 operation
   3930                 */
   3931                 uint32_t T = L % TCount;
   3932                 L /= TCount;
   3933                 uint32_t V = L % VCount;
   3934                 L /= VCount;
   3935 
   3936                 /* offset them */
   3937                 L += LBase;
   3938                 V += VBase;
   3939                 T += TBase;
   3940 
   3941                 int32_t firstOffset = (int32_t)(source->pos - source->string);
   3942                 source->appendOffset(firstOffset, *status);
   3943 
   3944                 /*
   3945                  * return the first CE, but first put the rest into the expansion buffer
   3946                  */
   3947                 if (!source->coll->image->jamoSpecial) {
   3948                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   3949                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   3950                     source->appendOffset(firstOffset + 1, *status);
   3951 
   3952                     if (T != TBase) {
   3953                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   3954                         source->appendOffset(firstOffset + 1, *status);
   3955                     }
   3956 
   3957                     source->toReturn = source->CEpos - 1;
   3958 
   3959                     source->offsetReturn = source->offsetStore - 1;
   3960                     if (source->offsetReturn == source->offsetBuffer) {
   3961                         source->offsetStore = source->offsetBuffer;
   3962                     }
   3963 
   3964                     return *(source->toReturn);
   3965                 } else {
   3966                     // Since Hanguls pass the FCD check, it is
   3967                     // guaranteed that we won't be in
   3968                     // the normalization buffer if something like this happens
   3969                     // Move Jamos into normalization buffer
   3970                     /*
   3971                     Move the Jamos into the
   3972                     normalization buffer
   3973                     */
   3974                     UChar *tempbuffer = source->writableBuffer.getBuffer(5);
   3975                     int32_t tempbufferLength;
   3976                     tempbuffer[0] = 0;
   3977                     tempbuffer[1] = (UChar)L;
   3978                     tempbuffer[2] = (UChar)V;
   3979                     if (T != TBase) {
   3980                         tempbuffer[3] = (UChar)T;
   3981                         tempbufferLength = 4;
   3982                     } else {
   3983                         tempbufferLength = 3;
   3984                     }
   3985                     source->writableBuffer.releaseBuffer(tempbufferLength);
   3986 
   3987                     /*
   3988                     Indicate where to continue in main input string after exhausting
   3989                     the writableBuffer
   3990                     */
   3991                     if (source->pos  == source->string) {
   3992                         source->fcdPosition = NULL;
   3993                     } else {
   3994                         source->fcdPosition       = source->pos-1;
   3995                     }
   3996 
   3997                     source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
   3998                     source->origFlags         = source->flags;
   3999                     source->flags            |= UCOL_ITER_INNORMBUF;
   4000                     source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   4001 
   4002                     return(UCOL_IGNORABLE);
   4003                 }
   4004             }
   4005 
   4006         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   4007             return getPrevImplicit(ch, source);
   4008 
   4009             // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
   4010         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   4011             return getPrevImplicit(ch, source);
   4012 
   4013         case SURROGATE_TAG:  /* This is a surrogate pair */
   4014             /* essentially an engaged lead surrogate. */
   4015             /* if you have encountered it here, it means that a */
   4016             /* broken sequence was encountered and this is an error */
   4017             return UCOL_NOT_FOUND;
   4018 
   4019         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   4020             return UCOL_NOT_FOUND; /* broken surrogate sequence */
   4021 
   4022         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   4023             {
   4024                 UChar32 cp = 0;
   4025                 UChar  prevChar;
   4026                 const UChar *prev;
   4027                 if (isAtStartPrevIterate(source)) {
   4028                     /* we are at the start of the string, wrong place to be at */
   4029                     return UCOL_NOT_FOUND;
   4030                 }
   4031                 if (source->pos != source->writableBuffer.getBuffer()) {
   4032                     prev     = source->pos - 1;
   4033                 } else {
   4034                     prev     = source->fcdPosition;
   4035                 }
   4036                 prevChar = *prev;
   4037 
   4038                 /* Handles Han and Supplementary characters here.*/
   4039                 if (U16_IS_LEAD(prevChar)) {
   4040                     cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   4041                     source->pos = prev;
   4042                 } else {
   4043                     return UCOL_NOT_FOUND; /* like unassigned */
   4044                 }
   4045 
   4046                 return getPrevImplicit(cp, source);
   4047             }
   4048 
   4049             /* UCA is filled with these. Tailorings are NOT_FOUND */
   4050             /* not yet implemented */
   4051         case CHARSET_TAG:  /* this tag always returns */
   4052             /* probably after 1.8 */
   4053             return UCOL_NOT_FOUND;
   4054 
   4055         default:           /* this tag always returns */
   4056             *status = U_INTERNAL_PROGRAM_ERROR;
   4057             CE=0;
   4058             break;
   4059         }
   4060 
   4061         if (CE <= UCOL_NOT_FOUND) {
   4062             break;
   4063         }
   4064     }
   4065 
   4066     return CE;
   4067 }
   4068 
   4069 /* This should really be a macro        */
   4070 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
   4071 /* anyway */
   4072 static
   4073 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
   4074 #ifdef UCOL_DEBUG
   4075     fprintf(stderr, ".");
   4076 #endif
   4077     uint8_t *newStart = NULL;
   4078     uint32_t offset = (uint32_t)(*secondaries-secStart);
   4079 
   4080     if(secStart==second) {
   4081         newStart=(uint8_t*)uprv_malloc(newSize);
   4082         if(newStart==NULL) {
   4083             *status = U_MEMORY_ALLOCATION_ERROR;
   4084             return NULL;
   4085         }
   4086         uprv_memcpy(newStart, secStart, *secondaries-secStart);
   4087     } else {
   4088         newStart=(uint8_t*)uprv_realloc(secStart, newSize);
   4089         if(newStart==NULL) {
   4090             *status = U_MEMORY_ALLOCATION_ERROR;
   4091             /* Since we're reallocating, return original reference so we don't loose it. */
   4092             return secStart;
   4093         }
   4094     }
   4095     *secondaries=newStart+offset;
   4096     *secSize=newSize;
   4097     return newStart;
   4098 }
   4099 
   4100 
   4101 /* This should really be a macro                                                                      */
   4102 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
   4103 /* secondaries in French                                                                              */
   4104 /*
   4105 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
   4106   uint8_t temp;
   4107   while(start<end) {
   4108     temp = *start;
   4109     *start++ = *end;
   4110     *end-- = temp;
   4111   }
   4112 }
   4113 */
   4114 
   4115 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \
   4116   TYPE tempA; \
   4117 while((start)<(end)) { \
   4118     tempA = *(start); \
   4119     *(start)++ = *(end); \
   4120     *(end)-- = tempA; \
   4121 } \
   4122 }
   4123 
   4124 /****************************************************************************/
   4125 /* Following are the sortkey generation functions                           */
   4126 /*                                                                          */
   4127 /****************************************************************************/
   4128 
   4129 /**
   4130  * Merge two sort keys.
   4131  * This is useful, for example, to combine sort keys from first and last names
   4132  * to sort such pairs.
   4133  * Merged sort keys consider on each collation level the first part first entirely,
   4134  * then the second one.
   4135  * It is possible to merge multiple sort keys by consecutively merging
   4136  * another one with the intermediate result.
   4137  *
   4138  * The length of the merge result is the sum of the lengths of the input sort keys
   4139  * minus 1.
   4140  *
   4141  * @param src1 the first sort key
   4142  * @param src1Length the length of the first sort key, including the zero byte at the end;
   4143  *        can be -1 if the function is to find the length
   4144  * @param src2 the second sort key
   4145  * @param src2Length the length of the second sort key, including the zero byte at the end;
   4146  *        can be -1 if the function is to find the length
   4147  * @param dest the buffer where the merged sort key is written,
   4148  *        can be NULL if destCapacity==0
   4149  * @param destCapacity the number of bytes in the dest buffer
   4150  * @return the length of the merged sort key, src1Length+src2Length-1;
   4151  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
   4152  *         in which cases the contents of dest is undefined
   4153  *
   4154  * @draft
   4155  */
   4156 U_CAPI int32_t U_EXPORT2
   4157 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
   4158                    const uint8_t *src2, int32_t src2Length,
   4159                    uint8_t *dest, int32_t destCapacity) {
   4160     int32_t destLength;
   4161     uint8_t b;
   4162 
   4163     /* check arguments */
   4164     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
   4165         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
   4166         destCapacity<0 || (destCapacity>0 && dest==NULL)
   4167     ) {
   4168         /* error, attempt to write a zero byte and return 0 */
   4169         if(dest!=NULL && destCapacity>0) {
   4170             *dest=0;
   4171         }
   4172         return 0;
   4173     }
   4174 
   4175     /* check lengths and capacity */
   4176     if(src1Length<0) {
   4177         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
   4178     }
   4179     if(src2Length<0) {
   4180         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
   4181     }
   4182 
   4183     destLength=src1Length+src2Length-1;
   4184     if(destLength>destCapacity) {
   4185         /* the merged sort key does not fit into the destination */
   4186         return destLength;
   4187     }
   4188 
   4189     /* merge the sort keys with the same number of levels */
   4190     while(*src1!=0 && *src2!=0) { /* while both have another level */
   4191         /* copy level from src1 not including 00 or 01 */
   4192         while((b=*src1)>=2) {
   4193             ++src1;
   4194             *dest++=b;
   4195         }
   4196 
   4197         /* add a 02 merge separator */
   4198         *dest++=2;
   4199 
   4200         /* copy level from src2 not including 00 or 01 */
   4201         while((b=*src2)>=2) {
   4202             ++src2;
   4203             *dest++=b;
   4204         }
   4205 
   4206         /* if both sort keys have another level, then add a 01 level separator and continue */
   4207         if(*src1==1 && *src2==1) {
   4208             ++src1;
   4209             ++src2;
   4210             *dest++=1;
   4211         }
   4212     }
   4213 
   4214     /*
   4215      * here, at least one sort key is finished now, but the other one
   4216      * might have some contents left from containing more levels;
   4217      * that contents is just appended to the result
   4218      */
   4219     if(*src1!=0) {
   4220         /* src1 is not finished, therefore *src2==0, and src1 is appended */
   4221         src2=src1;
   4222     }
   4223     /* append src2, "the other, unfinished sort key" */
   4224     uprv_strcpy((char *)dest, (const char *)src2);
   4225 
   4226     /* trust that neither sort key contained illegally embedded zero bytes */
   4227     return destLength;
   4228 }
   4229 
   4230 /* sortkey API */
   4231 U_CAPI int32_t U_EXPORT2
   4232 ucol_getSortKey(const    UCollator    *coll,
   4233         const    UChar        *source,
   4234         int32_t        sourceLength,
   4235         uint8_t        *result,
   4236         int32_t        resultLength)
   4237 {
   4238     UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
   4239     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   4240         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
   4241             ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
   4242     }
   4243 
   4244     UErrorCode status = U_ZERO_ERROR;
   4245     int32_t keySize   = 0;
   4246 
   4247     if(source != NULL) {
   4248         // source == NULL is actually an error situation, but we would need to
   4249         // have an error code to return it. Until we introduce a new
   4250         // API, it stays like this
   4251 
   4252         /* this uses the function pointer that is set in updateinternalstate */
   4253         /* currently, there are two funcs: */
   4254         /*ucol_calcSortKey(...);*/
   4255         /*ucol_calcSortKeySimpleTertiary(...);*/
   4256 
   4257         keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
   4258         //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) {
   4259             // That's not good. Something unusual happened.
   4260             // We don't know how much we initialized before we failed.
   4261             // NULL terminate for safety.
   4262             // We have no way say that we have generated a partial sort key.
   4263             //result[0] = 0;
   4264             //keySize = 0;
   4265         //}
   4266     }
   4267     UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
   4268     UTRACE_EXIT_STATUS(status);
   4269     return keySize;
   4270 }
   4271 
   4272 /* this function is called by the C++ API for sortkey generation */
   4273 U_CFUNC int32_t
   4274 ucol_getSortKeyWithAllocation(const UCollator *coll,
   4275                               const UChar *source, int32_t sourceLength,
   4276                               uint8_t **pResult,
   4277                               UErrorCode *pErrorCode) {
   4278     *pResult = 0;
   4279     return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
   4280 }
   4281 
   4282 #define UCOL_FSEC_BUF_SIZE 256
   4283 
   4284 // Is this primary weight compressible?
   4285 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
   4286 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
   4287 static inline UBool
   4288 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
   4289     return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
   4290 }
   4291 
   4292 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
   4293 /* or if we run out of space while making a sortkey and want to return ASAP                                   */
   4294 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
   4295     UErrorCode status = U_ZERO_ERROR;
   4296     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
   4297     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
   4298     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
   4299     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
   4300     UBool  compareIdent = (strength == UCOL_IDENTICAL);
   4301     UBool  doCase = (coll->caseLevel == UCOL_ON);
   4302     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
   4303     //UBool  qShifted = shifted  && (compareQuad == 0);
   4304     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
   4305     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
   4306     uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
   4307     uint8_t *fSecs = fSecsBuff;
   4308     uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
   4309     uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
   4310 
   4311     uint32_t variableTopValue = coll->variableTopValue;
   4312     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
   4313     if(doHiragana) {
   4314         UCOL_COMMON_BOT4++;
   4315         /* allocate one more space for hiragana */
   4316     }
   4317     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
   4318 
   4319     uint32_t order = UCOL_NO_MORE_CES;
   4320     uint8_t primary1 = 0;
   4321     uint8_t primary2 = 0;
   4322     uint8_t secondary = 0;
   4323     uint8_t tertiary = 0;
   4324     int32_t caseShift = 0;
   4325     uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
   4326 
   4327     uint8_t caseSwitch = coll->caseSwitch;
   4328     uint8_t tertiaryMask = coll->tertiaryMask;
   4329     uint8_t tertiaryCommon = coll->tertiaryCommon;
   4330 
   4331     UBool wasShifted = FALSE;
   4332     UBool notIsContinuation = FALSE;
   4333     uint8_t leadPrimary = 0;
   4334 
   4335 
   4336     for(;;) {
   4337         order = ucol_IGetNextCE(coll, s, &status);
   4338         if(order == UCOL_NO_MORE_CES) {
   4339             break;
   4340         }
   4341 
   4342         if(order == 0) {
   4343             continue;
   4344         }
   4345 
   4346         notIsContinuation = !isContinuation(order);
   4347 
   4348 
   4349         if(notIsContinuation) {
   4350             tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
   4351         } else {
   4352             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   4353         }
   4354         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4355         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4356         primary1 = (uint8_t)(order >> 8);
   4357 
   4358         /* no need to permute since the actual code values don't matter
   4359         if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
   4360             primary1 = coll->leadBytePermutationTable[primary1];
   4361         }
   4362         */
   4363 
   4364         if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
   4365                       || (!notIsContinuation && wasShifted)))
   4366             || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
   4367                 /* and other ignorables should be removed if following a shifted code point */
   4368                 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
   4369                     /* we should just completely ignore it */
   4370                     continue;
   4371                 }
   4372                 if(compareQuad == 0) {
   4373                     if(c4 > 0) {
   4374                         currentSize += (c2/UCOL_BOT_COUNT4)+1;
   4375                         c4 = 0;
   4376                     }
   4377                     currentSize++;
   4378                     if(primary2 != 0) {
   4379                         currentSize++;
   4380                     }
   4381                 }
   4382                 wasShifted = TRUE;
   4383         } else {
   4384             wasShifted = FALSE;
   4385             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   4386             /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   4387             /* calculate sortkey size */
   4388             if(primary1 != UCOL_IGNORABLE) {
   4389                 if(notIsContinuation) {
   4390                     if(leadPrimary == primary1) {
   4391                         currentSize++;
   4392                     } else {
   4393                         if(leadPrimary != 0) {
   4394                             currentSize++;
   4395                         }
   4396                         if(primary2 == UCOL_IGNORABLE) {
   4397                             /* one byter, not compressed */
   4398                             currentSize++;
   4399                             leadPrimary = 0;
   4400                         } else if(isCompressible(coll, primary1)) {
   4401                             /* compress */
   4402                             leadPrimary = primary1;
   4403                             currentSize+=2;
   4404                         } else {
   4405                             leadPrimary = 0;
   4406                             currentSize+=2;
   4407                         }
   4408                     }
   4409                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   4410                     currentSize++;
   4411                     if(primary2 != UCOL_IGNORABLE) {
   4412                         currentSize++;
   4413                     }
   4414                 }
   4415             }
   4416 
   4417             if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
   4418                 if(!isFrenchSec){
   4419                     if (secondary == UCOL_COMMON2 && notIsContinuation) {
   4420                         c2++;
   4421                     } else {
   4422                         if(c2 > 0) {
   4423                             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4424                                 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
   4425                             } else {
   4426                                 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
   4427                             }
   4428                             c2 = 0;
   4429                         }
   4430                         currentSize++;
   4431                     }
   4432                 } else {
   4433                     fSecs[fSecsLen++] = secondary;
   4434                     if(fSecsLen == fSecsMaxLen) {
   4435                         uint8_t *fSecsTemp;
   4436                         if(fSecs == fSecsBuff) {
   4437                             fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen);
   4438                         } else {
   4439                             fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
   4440                         }
   4441                         if(fSecsTemp == NULL) {
   4442                             status = U_MEMORY_ALLOCATION_ERROR;
   4443                             return 0;
   4444                         }
   4445                         fSecs = fSecsTemp;
   4446                         fSecsMaxLen *= 2;
   4447                     }
   4448                     if(notIsContinuation) {
   4449                         if (frenchStartPtr != NULL) {
   4450                             /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4451                             uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4452                             frenchStartPtr = NULL;
   4453                         }
   4454                     } else {
   4455                         if (frenchStartPtr == NULL) {
   4456                             frenchStartPtr = fSecs+fSecsLen-2;
   4457                         }
   4458                         frenchEndPtr = fSecs+fSecsLen-1;
   4459                     }
   4460                 }
   4461             }
   4462 
   4463             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
   4464                 // do the case level if we need to do it. We don't want to calculate
   4465                 // case level for primary ignorables if we have only primary strength and case level
   4466                 // otherwise we would break well formedness of CEs
   4467                 if (caseShift  == 0) {
   4468                     currentSize++;
   4469                     caseShift = UCOL_CASE_SHIFT_START;
   4470                 }
   4471                 if((tertiary&0x3F) > 0 && notIsContinuation) {
   4472                     caseShift--;
   4473                     if((tertiary &0xC0) != 0) {
   4474                         if (caseShift  == 0) {
   4475                             currentSize++;
   4476                             caseShift = UCOL_CASE_SHIFT_START;
   4477                         }
   4478                         caseShift--;
   4479                     }
   4480                 }
   4481             } else {
   4482                 if(notIsContinuation) {
   4483                     tertiary ^= caseSwitch;
   4484                 }
   4485             }
   4486 
   4487             tertiary &= tertiaryMask;
   4488             if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
   4489                 if (tertiary == tertiaryCommon && notIsContinuation) {
   4490                     c3++;
   4491                 } else {
   4492                     if(c3 > 0) {
   4493                         if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
   4494                             || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
   4495                                 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
   4496                         } else {
   4497                             currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
   4498                         }
   4499                         c3 = 0;
   4500                     }
   4501                     currentSize++;
   4502                 }
   4503             }
   4504 
   4505             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
   4506                 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   4507                     if(c4>0) { // Close this part
   4508                         currentSize += (c4/UCOL_BOT_COUNT4)+1;
   4509                         c4 = 0;
   4510                     }
   4511                     currentSize++; // Add the Hiragana
   4512                 } else { // This wasn't Hiragana, so we can continue adding stuff
   4513                     c4++;
   4514                 }
   4515             }
   4516         }
   4517     }
   4518 
   4519     if(!isFrenchSec){
   4520         if(c2 > 0) {
   4521             currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
   4522         }
   4523     } else {
   4524         uint32_t i = 0;
   4525         if(frenchStartPtr != NULL) {
   4526             uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4527         }
   4528         for(i = 0; i<fSecsLen; i++) {
   4529             secondary = *(fSecs+fSecsLen-i-1);
   4530             /* This is compression code. */
   4531             if (secondary == UCOL_COMMON2) {
   4532                 ++c2;
   4533             } else {
   4534                 if(c2 > 0) {
   4535                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4536                         currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
   4537                     } else {
   4538                         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
   4539                     }
   4540                     c2 = 0;
   4541                 }
   4542                 currentSize++;
   4543             }
   4544         }
   4545         if(c2 > 0) {
   4546             currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
   4547         }
   4548         if(fSecs != fSecsBuff) {
   4549             uprv_free(fSecs);
   4550         }
   4551     }
   4552 
   4553     if(c3 > 0) {
   4554         currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
   4555     }
   4556 
   4557     if(c4 > 0  && compareQuad == 0) {
   4558         currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
   4559     }
   4560 
   4561     if(compareIdent) {
   4562         currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
   4563     }
   4564     return currentSize;
   4565 }
   4566 
   4567 static
   4568 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
   4569     if (caseShift  == 0) {
   4570         *(*cases)++ = UCOL_CASE_BYTE_START;
   4571         caseShift = UCOL_CASE_SHIFT_START;
   4572     }
   4573 }
   4574 
   4575 // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we
   4576 // know how many values we wanted to add, even if we didn't add them all
   4577 static
   4578 inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
   4579     size++;
   4580     if(primaries < limit) {
   4581         *(primaries)++ = value;
   4582     }
   4583 }
   4584 
   4585 // Packs the secondary buffer when processing French locale. Adds the terminator.
   4586 static
   4587 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
   4588     uint8_t secondary;
   4589     int32_t count2 = 0;
   4590     uint32_t i = 0, size = 0;
   4591     // we use i here since the key size already accounts for terminators, so we'll discard the increment
   4592     addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
   4593     /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
   4594     if(frenchStartPtr != NULL) {
   4595         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4596     }
   4597     for(i = 0; i<*secsize; i++) {
   4598         secondary = *(secondaries-i-1);
   4599         /* This is compression code. */
   4600         if (secondary == UCOL_COMMON2) {
   4601             ++count2;
   4602         } else {
   4603             if (count2 > 0) {
   4604                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4605                     while (count2 > UCOL_TOP_COUNT2) {
   4606                         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
   4607                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4608                     }
   4609                     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
   4610                 } else {
   4611                     while (count2 > UCOL_BOT_COUNT2) {
   4612                         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   4613                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4614                     }
   4615                     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   4616                 }
   4617                 count2 = 0;
   4618             }
   4619             addWithIncrement(primaries, primEnd, size, secondary);
   4620         }
   4621     }
   4622     if (count2 > 0) {
   4623         while (count2 > UCOL_BOT_COUNT2) {
   4624             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   4625             count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4626         }
   4627         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   4628     }
   4629     *secsize = size;
   4630     return primaries;
   4631 }
   4632 
   4633 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 /* value ucol_calcSortKey stores in sortKeySize when it cannot recover: re-initializing the iterator failed or a buffer reallocation failed */
   4634 
   4635 /* This is the sortkey work horse function */
   4636 U_CFUNC int32_t U_CALLCONV
   4637 ucol_calcSortKey(const    UCollator    *coll,
   4638         const    UChar        *source,
   4639         int32_t        sourceLength,
   4640         uint8_t        **result,
   4641         uint32_t        resultLength,
   4642         UBool allocateSKBuffer,
   4643         UErrorCode *status)
   4644 {
   4645     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
   4646 
   4647     uint32_t i = 0; /* general purpose counter */
   4648 
   4649     /* Stack allocated buffers for buffers we use */
   4650     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
   4651 
   4652     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
   4653 
   4654     if(U_FAILURE(*status)) {
   4655         return 0;
   4656     }
   4657 
   4658     if(primaries == NULL && allocateSKBuffer == TRUE) {
   4659         primaries = *result = prim;
   4660         resultLength = UCOL_PRIMARY_MAX_BUFFER;
   4661     }
   4662 
   4663     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
   4664       caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
   4665 
   4666     uint32_t sortKeySize = 1; /* it is always \0 terminated */
   4667 
   4668     UnicodeString normSource;
   4669 
   4670     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
   4671 
   4672     UColAttributeValue strength = coll->strength;
   4673 
   4674     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
   4675     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
   4676     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
   4677     UBool  compareIdent = (strength == UCOL_IDENTICAL);
   4678     UBool  doCase = (coll->caseLevel == UCOL_ON);
   4679     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
   4680     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
   4681     //UBool  qShifted = shifted && (compareQuad == 0);
   4682     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
   4683 
   4684     uint32_t variableTopValue = coll->variableTopValue;
   4685     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
   4686     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
   4687     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
   4688     uint8_t UCOL_HIRAGANA_QUAD = 0;
   4689     if(doHiragana) {
   4690         UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
   4691         /* allocate one more space for hiragana, value for hiragana */
   4692     }
   4693     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
   4694 
   4695     /* support for special features like caselevel and funky secondaries */
   4696     uint8_t *frenchStartPtr = NULL;
   4697     uint8_t *frenchEndPtr = NULL;
   4698     uint32_t caseShift = 0;
   4699 
   4700     sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
   4701 
   4702     /* If we need to normalize, we'll do it all at once at the beginning! */
   4703     const Normalizer2 *norm2;
   4704     if(compareIdent) {
   4705         norm2 = Normalizer2Factory::getNFDInstance(*status);
   4706     } else if(coll->normalizationMode != UCOL_OFF) {
   4707         norm2 = Normalizer2Factory::getFCDInstance(*status);
   4708     } else {
   4709         norm2 = NULL;
   4710     }
   4711     if(norm2 != NULL) {
   4712         normSource.setTo(FALSE, source, len);
   4713         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   4714         if(qcYesLength != len) {
   4715             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   4716             normSource.truncate(qcYesLength);
   4717             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   4718             source = normSource.getBuffer();
   4719             len = normSource.length();
   4720         }
   4721     }
   4722     collIterate s;
   4723     IInit_collIterate(coll, source, len, &s, status);
   4724     if(U_FAILURE(*status)) {
   4725         return 0;
   4726     }
   4727     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
   4728 
   4729     if(resultLength == 0 || primaries == NULL) {
   4730         return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
   4731     }
   4732     uint8_t *primarySafeEnd = primaries + resultLength - 1;
   4733     if(strength > UCOL_PRIMARY) {
   4734         primarySafeEnd--;
   4735     }
   4736 
   4737     uint32_t minBufferSize = UCOL_MAX_BUFFER;
   4738 
   4739     uint8_t *primStart = primaries;
   4740     uint8_t *secStart = secondaries;
   4741     uint8_t *terStart = tertiaries;
   4742     uint8_t *caseStart = cases;
   4743     uint8_t *quadStart = quads;
   4744 
   4745     uint32_t order = 0;
   4746 
   4747     uint8_t primary1 = 0;
   4748     uint8_t primary2 = 0;
   4749     uint8_t secondary = 0;
   4750     uint8_t tertiary = 0;
   4751     uint8_t caseSwitch = coll->caseSwitch;
   4752     uint8_t tertiaryMask = coll->tertiaryMask;
   4753     int8_t tertiaryAddition = coll->tertiaryAddition;
   4754     uint8_t tertiaryTop = coll->tertiaryTop;
   4755     uint8_t tertiaryBottom = coll->tertiaryBottom;
   4756     uint8_t tertiaryCommon = coll->tertiaryCommon;
   4757     uint8_t caseBits = 0;
   4758 
   4759     UBool finished = FALSE;
   4760     UBool wasShifted = FALSE;
   4761     UBool notIsContinuation = FALSE;
   4762 
   4763     uint32_t prevBuffSize = 0;
   4764 
   4765     uint32_t count2 = 0, count3 = 0, count4 = 0;
   4766     uint8_t leadPrimary = 0;
   4767 
   4768     for(;;) {
   4769         for(i=prevBuffSize; i<minBufferSize; ++i) {
   4770 
   4771             order = ucol_IGetNextCE(coll, &s, status);
   4772             if(order == UCOL_NO_MORE_CES) {
   4773                 finished = TRUE;
   4774                 break;
   4775             }
   4776 
   4777             if(order == 0) {
   4778                 continue;
   4779             }
   4780 
   4781             notIsContinuation = !isContinuation(order);
   4782 
   4783             if(notIsContinuation) {
   4784                 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
   4785             } else {
   4786                 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   4787             }
   4788 
   4789             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4790             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4791             primary1 = (uint8_t)(order >> 8);
   4792 
   4793             uint8_t originalPrimary1 = primary1;
   4794             if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
   4795                 primary1 = coll->leadBytePermutationTable[primary1];
   4796             }
   4797 
   4798             if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
   4799                            || (!notIsContinuation && wasShifted)))
   4800                 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   4801             {
   4802                 /* and other ignorables should be removed if following a shifted code point */
   4803                 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
   4804                     /* we should just completely ignore it */
   4805                     continue;
   4806                 }
   4807                 if(compareQuad == 0) {
   4808                     if(count4 > 0) {
   4809                         while (count4 > UCOL_BOT_COUNT4) {
   4810                             *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   4811                             count4 -= UCOL_BOT_COUNT4;
   4812                         }
   4813                         *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
   4814                         count4 = 0;
   4815                     }
   4816                     /* We are dealing with a variable and we're treating them as shifted */
   4817                     /* This is a shifted ignorable */
   4818                     if(primary1 != 0) { /* we need to check this since we could be in continuation */
   4819                         *quads++ = primary1;
   4820                     }
   4821                     if(primary2 != 0) {
   4822                         *quads++ = primary2;
   4823                     }
   4824                 }
   4825                 wasShifted = TRUE;
   4826             } else {
   4827                 wasShifted = FALSE;
   4828                 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   4829                 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   4830                 /* regular and simple sortkey calc */
   4831                 if(primary1 != UCOL_IGNORABLE) {
   4832                     if(notIsContinuation) {
   4833                         if(leadPrimary == primary1) {
   4834                             *primaries++ = primary2;
   4835                         } else {
   4836                             if(leadPrimary != 0) {
   4837                                 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
   4838                             }
   4839                             if(primary2 == UCOL_IGNORABLE) {
   4840                                 /* one byter, not compressed */
   4841                                 *primaries++ = primary1;
   4842                                 leadPrimary = 0;
   4843                             } else if(isCompressible(coll, originalPrimary1)) {
   4844                                 /* compress */
   4845                                 *primaries++ = leadPrimary = primary1;
   4846                                 if(primaries <= primarySafeEnd) {
   4847                                     *primaries++ = primary2;
   4848                                 }
   4849                             } else {
   4850                                 leadPrimary = 0;
   4851                                 *primaries++ = primary1;
   4852                                 if(primaries <= primarySafeEnd) {
   4853                                     *primaries++ = primary2;
   4854                                 }
   4855                             }
   4856                         }
   4857                     } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   4858                         *primaries++ = primary1;
   4859                         if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) {
   4860                                 *primaries++ = primary2; /* second part */
   4861                         }
   4862                     }
   4863                 }
   4864 
   4865                 if(secondary > compareSec) {
   4866                     if(!isFrenchSec) {
   4867                         /* This is compression code. */
   4868                         if (secondary == UCOL_COMMON2 && notIsContinuation) {
   4869                             ++count2;
   4870                         } else {
   4871                             if (count2 > 0) {
   4872                                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4873                                     while (count2 > UCOL_TOP_COUNT2) {
   4874                                         *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   4875                                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4876                                     }
   4877                                     *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
   4878                                 } else {
   4879                                     while (count2 > UCOL_BOT_COUNT2) {
   4880                                         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4881                                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4882                                     }
   4883                                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   4884                                 }
   4885                                 count2 = 0;
   4886                             }
   4887                             *secondaries++ = secondary;
   4888                         }
   4889                     } else {
   4890                         *secondaries++ = secondary;
   4891                         /* Do the special handling for French secondaries */
   4892                         /* We need to get continuation elements and do intermediate restore */
   4893                         /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
   4894                         if(notIsContinuation) {
   4895                             if (frenchStartPtr != NULL) {
   4896                                 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4897                                 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4898                                 frenchStartPtr = NULL;
   4899                             }
   4900                         } else {
   4901                             if (frenchStartPtr == NULL) {
   4902                                 frenchStartPtr = secondaries - 2;
   4903                             }
   4904                             frenchEndPtr = secondaries-1;
   4905                         }
   4906                     }
   4907                 }
   4908 
   4909                 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
   4910                     // do the case level if we need to do it. We don't want to calculate
   4911                     // case level for primary ignorables if we have only primary strength and case level
   4912                     // otherwise we would break well formedness of CEs
   4913                     doCaseShift(&cases, caseShift);
   4914                     if(notIsContinuation) {
   4915                         caseBits = (uint8_t)(tertiary & 0xC0);
   4916 
   4917                         if(tertiary != 0) {
   4918                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   4919                                 if((caseBits & 0xC0) == 0) {
   4920                                     *(cases-1) |= 1 << (--caseShift);
   4921                                 } else {
   4922                                     *(cases-1) |= 0 << (--caseShift);
   4923                                     /* second bit */
   4924                                     doCaseShift(&cases, caseShift);
   4925                                     *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
   4926                                 }
   4927                             } else {
   4928                                 if((caseBits & 0xC0) == 0) {
   4929                                     *(cases-1) |= 0 << (--caseShift);
   4930                                 } else {
   4931                                     *(cases-1) |= 1 << (--caseShift);
   4932                                     /* second bit */
   4933                                     doCaseShift(&cases, caseShift);
   4934                                     *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
   4935                                 }
   4936                             }
   4937                         }
   4938 
   4939                     }
   4940                 } else {
   4941                     if(notIsContinuation) {
   4942                         tertiary ^= caseSwitch;
   4943                     }
   4944                 }
   4945 
   4946                 tertiary &= tertiaryMask;
   4947                 if(tertiary > compareTer) {
   4948                     /* This is compression code. */
   4949                     /* sequence size check is included in the if clause */
   4950                     if (tertiary == tertiaryCommon && notIsContinuation) {
   4951                         ++count3;
   4952                     } else {
   4953                         if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   4954                             tertiary += tertiaryAddition;
   4955                         } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   4956                             tertiary -= tertiaryAddition;
   4957                         }
   4958                         if (count3 > 0) {
   4959                             if ((tertiary > tertiaryCommon)) {
   4960                                 while (count3 > coll->tertiaryTopCount) {
   4961                                     *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   4962                                     count3 -= (uint32_t)coll->tertiaryTopCount;
   4963                                 }
   4964                                 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
   4965                             } else {
   4966                                 while (count3 > coll->tertiaryBottomCount) {
   4967                                     *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   4968                                     count3 -= (uint32_t)coll->tertiaryBottomCount;
   4969                                 }
   4970                                 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   4971                             }
   4972                             count3 = 0;
   4973                         }
   4974                         *tertiaries++ = tertiary;
   4975                     }
   4976                 }
   4977 
   4978                 if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
   4979                     if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   4980                         if(count4>0) { // Close this part
   4981                             while (count4 > UCOL_BOT_COUNT4) {
   4982                                 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   4983                                 count4 -= UCOL_BOT_COUNT4;
   4984                             }
   4985                             *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
   4986                             count4 = 0;
   4987                         }
   4988                         *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
   4989                     } else { // This wasn't Hiragana, so we can continue adding stuff
   4990                         count4++;
   4991                     }
   4992                 }
   4993             }
   4994 
   4995             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
   4996                 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
   4997                     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   4998                     if(U_FAILURE(*status)) {
   4999                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5000                         finished = TRUE;
   5001                         break;
   5002                     }
   5003                     s.flags &= ~UCOL_ITER_NORM;
   5004                     sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
   5005                     *status = U_BUFFER_OVERFLOW_ERROR;
   5006                     finished = TRUE;
   5007                     break;
   5008                 } else { /* It's much nicer if we can actually reallocate */
   5009                     int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart));
   5010                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
   5011                     if(U_SUCCESS(*status)) {
   5012                         *result = primStart;
   5013                         primarySafeEnd = primStart + resultLength - 1;
   5014                         if(strength > UCOL_PRIMARY) {
   5015                             primarySafeEnd--;
   5016                         }
   5017                     } else {
   5018                         /* We ran out of memory!? We can't recover. */
   5019                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5020                         finished = TRUE;
   5021                         break;
   5022                     }
   5023                 }
   5024             }
   5025         }
   5026         if(finished) {
   5027             break;
   5028         } else {
   5029             prevBuffSize = minBufferSize;
   5030 
   5031             uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
   5032             if (frenchStartPtr != NULL) {
   5033                 frenchStartOffset = (uint32_t)(frenchStartPtr - secStart);
   5034                 frenchEndOffset = (uint32_t)(frenchEndPtr - secStart);
   5035             }
   5036             secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
   5037             terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
   5038             caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
   5039             quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
   5040             if(U_FAILURE(*status)) {
   5041                 /* We ran out of memory!? We can't recover. */
   5042                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5043                 break;
   5044             }
   5045             if (frenchStartPtr != NULL) {
   5046                 frenchStartPtr = secStart + frenchStartOffset;
   5047                 frenchEndPtr = secStart + frenchEndOffset;
   5048             }
   5049             minBufferSize *= 2;
   5050         }
   5051     }
   5052 
   5053     /* Here, we are generally done with processing */
   5054     /* bailing out would not be too productive */
   5055 
   5056     if(U_SUCCESS(*status)) {
   5057         sortKeySize += (uint32_t)(primaries - primStart);
   5058         /* we have done all the CE's, now let's put them together to form a key */
   5059         if(compareSec == 0) {
   5060             if (count2 > 0) {
   5061                 while (count2 > UCOL_BOT_COUNT2) {
   5062                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5063                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5064                 }
   5065                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   5066             }
   5067             uint32_t secsize = (uint32_t)(secondaries-secStart);
   5068             if(!isFrenchSec) { // Regular situation, we know the length of secondaries
   5069                 sortKeySize += secsize;
   5070                 if(sortKeySize <= resultLength) {
   5071                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5072                     uprv_memcpy(primaries, secStart, secsize);
   5073                     primaries += secsize;
   5074                 } else {
   5075                     if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
   5076                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5077                         if(U_SUCCESS(*status)) {
   5078                             *result = primStart;
   5079                             *(primaries++) = UCOL_LEVELTERMINATOR;
   5080                             uprv_memcpy(primaries, secStart, secsize);
   5081                             primaries += secsize;
   5082                         }
   5083                         else {
   5084                             /* We ran out of memory!? We can't recover. */
   5085                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5086                             goto cleanup;
   5087                         }
   5088                     } else {
   5089                         *status = U_BUFFER_OVERFLOW_ERROR;
   5090                     }
   5091                 }
   5092             } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
   5093                 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
   5094                 sortKeySize += secsize;
   5095                 if(sortKeySize <= resultLength) { // if we managed to pack fine
   5096                     primaries = newPrim; // update the primary pointer
   5097                 } else { // overflow, need to reallocate and redo
   5098                     if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
   5099                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5100                         if(U_SUCCESS(*status)) {
   5101                             primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
   5102                         }
   5103                         else {
   5104                             /* We ran out of memory!? We can't recover. */
   5105                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5106                             goto cleanup;
   5107                         }
   5108                     } else {
   5109                         *status = U_BUFFER_OVERFLOW_ERROR;
   5110                     }
   5111                 }
   5112             }
   5113         }
   5114 
   5115         if(doCase) {
   5116             uint32_t casesize = (uint32_t)(cases - caseStart);
   5117             sortKeySize += casesize;
   5118             if(sortKeySize <= resultLength) {
   5119                 *(primaries++) = UCOL_LEVELTERMINATOR;
   5120                 uprv_memcpy(primaries, caseStart, casesize);
   5121                 primaries += casesize;
   5122             } else {
   5123                 if(allocateSKBuffer == TRUE) {
   5124                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5125                     if(U_SUCCESS(*status)) {
   5126                         *result = primStart;
   5127                         *(primaries++) = UCOL_LEVELTERMINATOR;
   5128                         uprv_memcpy(primaries, caseStart, casesize);
   5129                     }
   5130                     else {
   5131                         /* We ran out of memory!? We can't recover. */
   5132                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5133                         goto cleanup;
   5134                     }
   5135                 } else {
   5136                     *status = U_BUFFER_OVERFLOW_ERROR;
   5137                 }
   5138             }
   5139         }
   5140 
   5141         if(compareTer == 0) {
   5142             if (count3 > 0) {
   5143                 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
   5144                     while (count3 >= coll->tertiaryTopCount) {
   5145                         *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   5146                         count3 -= (uint32_t)coll->tertiaryTopCount;
   5147                     }
   5148                     *tertiaries++ = (uint8_t)(tertiaryTop - count3);
   5149                 } else {
   5150                     while (count3 > coll->tertiaryBottomCount) {
   5151                         *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   5152                         count3 -= (uint32_t)coll->tertiaryBottomCount;
   5153                     }
   5154                     *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   5155                 }
   5156             }
   5157             uint32_t tersize = (uint32_t)(tertiaries - terStart);
   5158             sortKeySize += tersize;
   5159             if(sortKeySize <= resultLength) {
   5160                 *(primaries++) = UCOL_LEVELTERMINATOR;
   5161                 uprv_memcpy(primaries, terStart, tersize);
   5162                 primaries += tersize;
   5163             } else {
   5164                 if(allocateSKBuffer == TRUE) {
   5165                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5166                     if(U_SUCCESS(*status)) {
   5167                         *result = primStart;
   5168                         *(primaries++) = UCOL_LEVELTERMINATOR;
   5169                         uprv_memcpy(primaries, terStart, tersize);
   5170                     }
   5171                     else {
   5172                         /* We ran out of memory!? We can't recover. */
   5173                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5174                         goto cleanup;
   5175                     }
   5176                 } else {
   5177                     *status = U_BUFFER_OVERFLOW_ERROR;
   5178                 }
   5179             }
   5180 
   5181             if(compareQuad == 0/*qShifted == TRUE*/) {
   5182                 if(count4 > 0) {
   5183                     while (count4 > UCOL_BOT_COUNT4) {
   5184                         *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   5185                         count4 -= UCOL_BOT_COUNT4;
   5186                     }
   5187                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
   5188                 }
   5189                 uint32_t quadsize = (uint32_t)(quads - quadStart);
   5190                 sortKeySize += quadsize;
   5191                 if(sortKeySize <= resultLength) {
   5192                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5193                     uprv_memcpy(primaries, quadStart, quadsize);
   5194                     primaries += quadsize;
   5195                 } else {
   5196                     if(allocateSKBuffer == TRUE) {
   5197                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5198                         if(U_SUCCESS(*status)) {
   5199                             *result = primStart;
   5200                             *(primaries++) = UCOL_LEVELTERMINATOR;
   5201                             uprv_memcpy(primaries, quadStart, quadsize);
   5202                         }
   5203                         else {
   5204                             /* We ran out of memory!? We can't recover. */
   5205                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5206                             goto cleanup;
   5207                         }
   5208                     } else {
   5209                         *status = U_BUFFER_OVERFLOW_ERROR;
   5210                     }
   5211                 }
   5212             }
   5213 
   5214             if(compareIdent) {
   5215                 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
   5216                 if(sortKeySize <= resultLength) {
   5217                     *(primaries++) = UCOL_LEVELTERMINATOR;
   5218                     primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
   5219                 } else {
   5220                     if(allocateSKBuffer == TRUE) {
   5221                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
   5222                         if(U_SUCCESS(*status)) {
   5223                             *result = primStart;
   5224                             *(primaries++) = UCOL_LEVELTERMINATOR;
   5225                             u_writeIdenticalLevelRun(s.string, len, primaries);
   5226                         }
   5227                         else {
   5228                             /* We ran out of memory!? We can't recover. */
   5229                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5230                             goto cleanup;
   5231                         }
   5232                     } else {
   5233                         *status = U_BUFFER_OVERFLOW_ERROR;
   5234                     }
   5235                 }
   5236             }
   5237         }
   5238         *(primaries++) = '\0';
   5239     }
   5240 
   5241     if(allocateSKBuffer == TRUE) {
   5242         *result = (uint8_t*)uprv_malloc(sortKeySize);
   5243         /* test for NULL */
   5244         if (*result == NULL) {
   5245             *status = U_MEMORY_ALLOCATION_ERROR;
   5246             goto cleanup;
   5247         }
   5248         uprv_memcpy(*result, primStart, sortKeySize);
   5249         if(primStart != prim) {
   5250             uprv_free(primStart);
   5251         }
   5252     }
   5253 
   5254 cleanup:
   5255     if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
   5256         /* NULL terminate for safety */
   5257         **result = 0;
   5258     }
   5259     if(terStart != tert) {
   5260         uprv_free(terStart);
   5261         uprv_free(secStart);
   5262         uprv_free(caseStart);
   5263         uprv_free(quadStart);
   5264     }
   5265 
   5266     /* To avoid memory leak, free the offset buffer if necessary. */
   5267     ucol_freeOffsetBuffer(&s);
   5268 
   5269     return sortKeySize;
   5270 }
   5271 
   5272 
   5273 /**
   5274  * Builds a three-level (primary, secondary, tertiary) sort key for source.
   5275  * Primary bytes are written straight into *result; secondary and tertiary
   5276  * bytes are gathered in side buffers (runs of the common weight are
   5277  * compressed into count bytes) and appended after the primaries, each level
   5278  * introduced by UCOL_LEVELTERMINATOR. A 0 byte ends the key.
   5279  *
   5280  * @param coll             collator that supplies the CEs and tertiary settings
   5281  * @param source           text to build the key for
   5282  * @param sourceLength     length of source (negative is treated as NUL-terminated)
   5283  * @param result           in/out destination buffer; with allocateSKBuffer it
   5284  *                         is replaced by a uprv_malloc()ed copy of the key
   5285  * @param resultLength     capacity of *result in bytes
   5286  * @param allocateSKBuffer TRUE: grow buffers on demand; FALSE: set
   5287  *                         U_BUFFER_OVERFLOW_ERROR when *result is too small
   5288  * @param status           in/out ICU error code
   5289  * @return key size in bytes, or the size required when the buffer is missing
   5290  *         or too small; 0 if status is a failure on entry or setup fails;
   5291  *         DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY if a buffer cannot be grown
   5292  */
   5293 U_CFUNC int32_t U_CALLCONV
   5294 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
   5295         const UChar *source, int32_t sourceLength,
   5296         uint8_t **result, uint32_t resultLength,
   5297         UBool allocateSKBuffer, UErrorCode *status)
   5298 {
   5299     U_ALIGN_CODE(16);
   5300
   5301     uint32_t i = 0; /* general purpose counter */
   5302     /* Stack allocated buffers for buffers we use */
   5303     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
   5304     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
   5305
   5306     if(U_FAILURE(*status)) {
   5307         return 0;
   5308     }
   5309     /* No caller buffer, but we may allocate: start out in the stack buffer. */
   5310     if(primaries == NULL && allocateSKBuffer == TRUE) {
   5311         primaries = *result = prim;
   5312         resultLength = UCOL_PRIMARY_MAX_BUFFER;
   5313     }
   5314
   5315     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
   5316     uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
   5317     UnicodeString normSource;
   5318     int32_t len =  sourceLength;
   5319
   5320     /* If we need to normalize, we'll do it all at once at the beginning! */
   5321     if(coll->normalizationMode != UCOL_OFF) {
   5322         normSource.setTo(len < 0, source, len);
   5323         const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
   5324         if(U_FAILURE(*status)) {
   5325             return 0; /* do not touch norm2 when the FCD instance could not be obtained */
   5326         }
   5327         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   5328         if(qcYesLength != normSource.length()) {
   5329             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   5330             normSource.truncate(qcYesLength);
   5331             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   5332             source = normSource.getBuffer();
   5333             len = normSource.length();
   5334         }
   5335     }
   5336     collIterate s;
   5337     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   5338     if(U_FAILURE(*status)) {
   5339         return 0;
   5340     }
   5341     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
   5342
   5343     /* Without a usable buffer we can only report how big the key would be. */
   5344     if(resultLength == 0 || primaries == NULL) {
   5345         return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
   5346     }
   5347
   5348     uint8_t *primarySafeEnd = primaries + resultLength - 2;
   5349     uint32_t minBufferSize = UCOL_MAX_BUFFER;
   5350     uint32_t prevBuffSize = 0;
   5351
   5352     uint8_t *primStart = primaries;
   5353     uint8_t *secStart = secondaries;
   5354     uint8_t *terStart = tertiaries;
   5355
   5356     uint32_t order = 0;
   5357     uint8_t primary1 = 0;
   5358     uint8_t primary2 = 0;
   5359     uint8_t secondary = 0;
   5360     uint8_t tertiary = 0;
   5361     uint8_t caseSwitch = coll->caseSwitch;
   5362     uint8_t tertiaryMask = coll->tertiaryMask;
   5363     int8_t tertiaryAddition = coll->tertiaryAddition;
   5364     uint8_t tertiaryTop = coll->tertiaryTop;
   5365     uint8_t tertiaryBottom = coll->tertiaryBottom;
   5366     uint8_t tertiaryCommon = coll->tertiaryCommon;
   5367
   5368     UBool finished = FALSE;
   5369     UBool notIsContinuation = FALSE;
   5370     uint32_t count2 = 0, count3 = 0;
   5371     uint8_t leadPrimary = 0;
   5372
   5373     for(;;) {
   5374         for(i=prevBuffSize; i<minBufferSize; ++i) {
   5375             order = ucol_IGetNextCE(coll, &s, status);
   5376             if(order == 0) {
   5377                 continue;
   5378             }
   5379             if(order == UCOL_NO_MORE_CES) {
   5380                 finished = TRUE;
   5381                 break;
   5382             }
   5383
   5384             notIsContinuation = !isContinuation(order);
   5385             if(notIsContinuation) {
   5386                 tertiary = (uint8_t)((order & tertiaryMask));
   5387             } else {
   5388                 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   5389             }
   5390
   5391             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5392             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5393             primary1 = (uint8_t)(order >> 8);
   5394
   5395             uint8_t originalPrimary1 = primary1;
   5396             if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
   5397                 primary1 = coll->leadBytePermutationTable[primary1];
   5398             }
   5399
   5400             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   5401             /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   5402             /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
   5403             /* regular and simple sortkey calc */
   5404             if(primary1 != UCOL_IGNORABLE) {
   5405                 if(notIsContinuation) {
   5406                     if(leadPrimary == primary1) {
   5407                         *primaries++ = primary2;
   5408                     } else {
   5409                         if(leadPrimary != 0) {
   5410                             *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
   5411                         }
   5412                         if(primary2 == UCOL_IGNORABLE) {
   5413                             /* one byter, not compressed */
   5414                             *primaries++ = primary1;
   5415                             leadPrimary = 0;
   5416                         } else if(isCompressible(coll, originalPrimary1)) {
   5417                             /* compress */
   5418                             *primaries++ = leadPrimary = primary1;
   5419                             *primaries++ = primary2;
   5420                         } else {
   5421                             leadPrimary = 0;
   5422                             *primaries++ = primary1;
   5423                             *primaries++ = primary2;
   5424                         }
   5425                     }
   5426                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   5427                     *primaries++ = primary1;
   5428                     if(primary2 != UCOL_IGNORABLE) {
   5429                         *primaries++ = primary2; /* second part */
   5430                     }
   5431                 }
   5432             }
   5433
   5434             if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
   5435                 /* This is compression code. */
   5436                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
   5437                     ++count2;
   5438                 } else {
   5439                     if (count2 > 0) {
   5440                         if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   5441                             while (count2 > UCOL_TOP_COUNT2) {
   5442                                 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   5443                                 count2 -= (uint32_t)UCOL_TOP_COUNT2;
   5444                             }
   5445                             *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
   5446                         } else {
   5447                             while (count2 > UCOL_BOT_COUNT2) {
   5448                                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5449                                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5450                             }
   5451                             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   5452                         }
   5453                         count2 = 0;
   5454                     }
   5455                     *secondaries++ = secondary;
   5456                 }
   5457             }
   5458
   5459             if(notIsContinuation) {
   5460                 tertiary ^= caseSwitch;
   5461             }
   5462
   5463             if(tertiary > 0) {
   5464                 /* This is compression code. */
   5465                 /* sequence size check is included in the if clause */
   5466                 if (tertiary == tertiaryCommon && notIsContinuation) {
   5467                     ++count3;
   5468                 } else {
   5469                     if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   5470                         tertiary += tertiaryAddition;
   5471                     } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   5472                         tertiary -= tertiaryAddition;
   5473                     }
   5474                     if (count3 > 0) {
   5475                         if ((tertiary > tertiaryCommon)) {
   5476                             while (count3 > coll->tertiaryTopCount) {
   5477                                 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   5478                                 count3 -= (uint32_t)coll->tertiaryTopCount;
   5479                             }
   5480                             *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
   5481                         } else {
   5482                             while (count3 > coll->tertiaryBottomCount) {
   5483                                 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   5484                                 count3 -= (uint32_t)coll->tertiaryBottomCount;
   5485                             }
   5486                             *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   5487                         }
   5488                         count3 = 0;
   5489                     }
   5490                     *tertiaries++ = tertiary;
   5491                 }
   5492             }
   5493
   5494             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
   5495                 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
   5496                     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   5497                     if(U_FAILURE(*status)) {
   5498                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5499                         finished = TRUE;
   5500                         break;
   5501                     }
   5502                     s.flags &= ~UCOL_ITER_NORM;
   5503                     sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
   5504                     *status = U_BUFFER_OVERFLOW_ERROR;
   5505                     finished = TRUE;
   5506                     break;
   5507                 } else { /* It's much nicer if we can actually reallocate */
   5508                     int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart));
   5509                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
   5510                     if(U_SUCCESS(*status)) {
   5511                         *result = primStart;
   5512                         primarySafeEnd = primStart + resultLength - 2;
   5513                     } else {
   5514                         /* We ran out of memory!? We can't recover. */
   5515                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5516                         finished = TRUE;
   5517                         break;
   5518                     }
   5519                 }
   5520             }
   5521         }
   5522         if(finished) {
   5523             break;
   5524         }
   5525         prevBuffSize = minBufferSize;
   5526         secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
   5527         terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
   5528         minBufferSize *= 2;
   5529         if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
   5530             /* We ran out of memory!? We can't recover. */
   5531             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5532             break;
   5533         }
   5534     }
   5535
   5536     if(U_SUCCESS(*status)) {
   5537         sortKeySize += (uint32_t)(primaries - primStart);
   5538         /* we have done all the CE's, now let's put them together to form a key */
   5539         if (count2 > 0) {
   5540             while (count2 > UCOL_BOT_COUNT2) {
   5541                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5542                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5543             }
   5544             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
   5545         }
   5546         uint32_t secsize = (uint32_t)(secondaries-secStart);
   5547         sortKeySize += secsize;
   5548         if(sortKeySize <= resultLength) {
   5549             *(primaries++) = UCOL_LEVELTERMINATOR;
   5550             uprv_memcpy(primaries, secStart, secsize);
   5551             primaries += secsize;
   5552         } else if(allocateSKBuffer == TRUE) {
   5553             primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5554             if(U_FAILURE(*status)) {
   5555                 /* We ran out of memory!? We can't recover. */
   5556                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5557                 goto cleanup;
   5558             }
   5559             *result = primStart;
   5560             *(primaries++) = UCOL_LEVELTERMINATOR;
   5561             uprv_memcpy(primaries, secStart, secsize);
   5562             primaries += secsize; /* step past the copied level, or the tertiaries would overwrite it */
   5563         } else {
   5564             *status = U_BUFFER_OVERFLOW_ERROR;
   5565         }
   5566
   5567         if (count3 > 0) {
   5568             if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
   5569                 while (count3 >= coll->tertiaryTopCount) {
   5570                     *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
   5571                     count3 -= (uint32_t)coll->tertiaryTopCount;
   5572                 }
   5573                 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
   5574             } else {
   5575                 while (count3 > coll->tertiaryBottomCount) {
   5576                     *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
   5577                     count3 -= (uint32_t)coll->tertiaryBottomCount;
   5578                 }
   5579                 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
   5580             }
   5581         }
   5582         uint32_t tersize = (uint32_t)(tertiaries - terStart);
   5583         sortKeySize += tersize;
   5584         if(sortKeySize <= resultLength) {
   5585             *(primaries++) = UCOL_LEVELTERMINATOR;
   5586             uprv_memcpy(primaries, terStart, tersize);
   5587             primaries += tersize;
   5588         } else if(allocateSKBuffer == TRUE) {
   5589             primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
   5590             if(U_FAILURE(*status)) {
   5591                 /* We ran out of memory!? We can't recover. */
   5592                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
   5593                 goto cleanup;
   5594             }
   5595             *result = primStart;
   5596             *(primaries++) = UCOL_LEVELTERMINATOR;
   5597             uprv_memcpy(primaries, terStart, tersize);
   5598             primaries += tersize; /* step past the copied level, or the final 0 would land inside it */
   5599         } else {
   5600             *status = U_BUFFER_OVERFLOW_ERROR;
   5601         }
   5602
   5603         *(primaries++) = '\0';
   5604     }
   5605
   5606     if(allocateSKBuffer == TRUE) {
   5607         *result = (uint8_t*)uprv_malloc(sortKeySize);
   5608         /* test for NULL */
   5609         if (*result == NULL) {
   5610             *status = U_MEMORY_ALLOCATION_ERROR;
   5611             goto cleanup;
   5612         }
   5613         uprv_memcpy(*result, primStart, sortKeySize);
   5614         if(primStart != prim) {
   5615             uprv_free(primStart);
   5616         }
   5617     }
   5618
   5619 cleanup:
   5620     if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
   5621         /* NULL terminate for safety */
   5622         **result = 0;
   5623     }
   5624     if(terStart != tert) {
   5625         uprv_free(terStart);
   5626         uprv_free(secStart);
   5627     }
   5628
   5629     /* To avoid memory leak, free the offset buffer if necessary. */
   5630     ucol_freeOffsetBuffer(&s);
   5631
   5632     return sortKeySize;
   5633 }
   5634 
   5635 /* Tells whether CE counts as shifted. LVT is the boundary value (0 turns the
   5636  * boundary test off); *wasShifted carries the verdict from one CE to the next. */
   5637 static inline
   5638 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
   5639     const UBool continuation = isContinuation(CE);
   5640     const uint8_t leadByte = (uint8_t)((CE >> 24) & 0xFF);
   5641
   5642     /* amendment to the UCA: primary ignorables after shifted CEs stay shifted */
   5643     if(*wasShifted && leadByte == 0) {
   5644         return TRUE; /* flag is already set, leave it alone */
   5645     }
   5646
   5647     UBool shifted = FALSE;
   5648     if(LVT != 0) {
   5649         shifted = continuation ? (*wasShifted != 0)
   5650                                : ((CE & 0xFFFF0000) <= LVT && leadByte > 0);
   5651     }
   5652     /* remember the verdict so that following continuation CEs can reuse it */
   5653     *wasShifted = shifted;
   5654     return shifted;
   5655 }
   5656 /* Closes a level at dest[i] and advances i: writes UCOL_LEVELTERMINATOR while
   5657  * level is still below maxLevel, otherwise a 0 byte. */
   5658 static inline
   5659 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
   5660     const UBool moreLevels = (level < maxLevel);
   5661     dest[i] = (uint8_t)(moreLevels ? UCOL_LEVELTERMINATOR : 0);
   5662     ++i;
   5663 }
   5664 
   5665 /** Level identifiers used by partial sort key generation. */
   5666 enum {
   5667     UCOL_PSK_PRIMARY    = 0,
   5668     UCOL_PSK_SECONDARY  = 1,
   5669     UCOL_PSK_CASE       = 2,
   5670     UCOL_PSK_TERTIARY   = 3,
   5671     UCOL_PSK_QUATERNARY = 4,
   5672     UCOL_PSK_QUIN       = 5,  /* extra level, not used; the three-bit level field has room for it */
   5673     UCOL_PSK_IDENTICAL  = 6,
   5674     UCOL_PSK_NULL       = 7,  /* end of the sort key; only zeros are produced at this level */
   5675     UCOL_PSK_LIMIT      = 8
   5676 };
   5677 
   5678 /** Layout of the collation state word kept in state[1]. To read a field,
   5679  *  shift the word right by its *_SHIFT value and AND the result with the
   5680  *  matching *_MASK value.
   5681  */
   5682 enum {
   5683     /* level identifier: holds a value of the level enum above, three bits */
   5684     UCOL_PSK_LEVEL_SHIFT = 0,
   5685     UCOL_PSK_LEVEL_MASK = 7,
   5686     /* one bit: number of primary or quaternary bytes already written (0 or 1,
   5687      * since we get up to two bytes from primary or quaternary); the same bit
   5688      * also denotes that the French secondary level is finished */
   5689     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
   5690     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,
   5691     /* one bit, Boolean: was the last value shifted */
   5692     UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
   5693     UCOL_PSK_WAS_SHIFTED_MASK = 1,
   5694     /* two bits: French bytes already written. French reverses secondaries but
   5695      * continuations keep their order, so abc1c2c3de has to become edc1c2c3ba */
   5696     UCOL_PSK_USED_FRENCH_SHIFT = 5,
   5697     UCOL_PSK_USED_FRENCH_MASK = 3,
   5698     UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
   5699     UCOL_PSK_BOCSU_BYTES_MASK = 3,
   5700     UCOL_PSK_CONSUMED_CES_SHIFT = 9,
   5701     UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
   5702 };
   5703 
   5704 // number of expansion CEs still available in s (CEpos minus toReturn); parenthesized so it composes safely in larger expressions
   5705 #define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
   5706 
   5707 
   5708 /** main sortkey part procedure. On the first call,
   5709  *  you should pass in a collator, an iterator, empty state
   5710  *  state[0] == state[1] == 0, a buffer to hold results
   5711  *  number of bytes you need and an error code pointer.
   5712  *  Make sure your buffer is big enough to hold the wanted
   5713  *  number of sortkey bytes. I don't check.
   5714  *  The only meaningful status you can get back is
   5715  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
   5716  *  have been dealt a raw deal and that you probably won't
   5717  *  be able to use partial sortkey generation for this
   5718  *  particular combination of string and collator. This
   5719  *  is highly unlikely, but you should still check the error code.
   5720  *  Any other status means that you're not in a sane situation
   5721  *  anymore. After the first call, preserve state values and
   5722  *  use them on subsequent calls to obtain more bytes of a sortkey.
   5723  *  Use until the number of bytes written is smaller than the requested
   5724  *  number of bytes. Generated sortkey is not compatible with the
   5725  *  one generated by ucol_getSortKey, as we don't do any compression.
   5726  *  However, levels are still terminated by a 1 (one) and the sortkey
   5727  *  is terminated by a 0 (zero). Identical level is the same as in the
   5728  *  regular sortkey - internal bocu-1 implementation is used.
   5729  *  For curious, although you cannot do much about this, here is
   5730  *  the structure of state words.
   5731  *  state[0] - iterator state. Depends on the iterator implementation,
   5732  *             but allows the iterator to continue where it stopped in
   5733  *             the last iteration.
   5734  *  state[1] - collation processing state. Here is the distribution
   5735  *             of the bits:
   5736  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
   5737  *             quaternary, quin (we don't use this one), identical and
   5738  *             null (producing only zeroes - first one to terminate the
   5739  *             sortkey and subsequent to fill the buffer).
   5740  *   3       - byte count. Number of bytes written on the primary level.
   5741  *   4       - was shifted. Whether the previous iteration finished in the
   5742  *             shifted state.
   5743  *   5, 6    - French continuation bytes written. See the comment in the enum
   5744  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
   5745  *             the identical level.
   5746  *   9..31   - CEs consumed. Number of getCE or next32 operations performed
   5747  *             since the last successful update of the iterator state.
   5748  */
   5749 U_CAPI int32_t U_EXPORT2
   5750 ucol_nextSortKeyPart(const UCollator *coll,
   5751                      UCharIterator *iter,
   5752                      uint32_t state[2],
   5753                      uint8_t *dest, int32_t count,
   5754                      UErrorCode *status)
   5755 {
   5756     /* error checking */
   5757     if(status==NULL || U_FAILURE(*status)) {
   5758         return 0;
   5759     }
   5760     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
   5761     if( coll==NULL || iter==NULL ||
   5762         state==NULL ||
   5763         count<0 || (count>0 && dest==NULL)
   5764     ) {
   5765         *status=U_ILLEGAL_ARGUMENT_ERROR;
   5766         UTRACE_EXIT_STATUS(status);
   5767         return 0;
   5768     }
   5769 
   5770     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
   5771                   coll, iter, state[0], state[1], dest, count);
   5772 
   5773     if(count==0) {
   5774         /* nothing to do */
   5775         UTRACE_EXIT_VALUE(0);
   5776         return 0;
   5777     }
   5778     /** Setting up situation according to the state we got from the previous iteration */
   5779     // The state of the iterator from the previous invocation
   5780     uint32_t iterState = state[0];
   5781     // Has the last iteration ended in the shifted state
   5782     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
   5783     // What is the current level of the sortkey?
   5784     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
   5785     // Have we written only one byte from a two byte primary in the previous iteration?
   5786     // Also on secondary level - have we finished with the French secondary?
   5787     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
   5788     // number of bytes in the continuation buffer for French
   5789     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
   5790     // Number of bytes already written from a bocsu sequence. Since
   5791     // the longes bocsu sequence is 4 long, this can be up to 3.
   5792     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
   5793     // Number of elements that need to be consumed in this iteration because
   5794     // the iterator returned UITER_NO_STATE at the end of the last iteration,
   5795     // so we had to save the last valid state.
   5796     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
   5797 
   5798     /** values that depend on the collator attributes */
   5799     // strength of the collator.
   5800     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
   5801     // maximal level of the partial sortkey. Need to take whether case level is done
   5802     int32_t maxLevel = 0;
   5803     if(strength < UCOL_TERTIARY) {
   5804         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   5805             maxLevel = UCOL_PSK_CASE;
   5806         } else {
   5807             maxLevel = strength;
   5808         }
   5809     } else {
   5810         if(strength == UCOL_TERTIARY) {
   5811             maxLevel = UCOL_PSK_TERTIARY;
   5812         } else if(strength == UCOL_QUATERNARY) {
   5813             maxLevel = UCOL_PSK_QUATERNARY;
   5814         } else { // identical
   5815             maxLevel = UCOL_IDENTICAL;
   5816         }
   5817     }
   5818     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
   5819     uint8_t UCOL_HIRAGANA_QUAD =
   5820       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
   5821     // Boundary value that decides whether a CE is shifted or not
   5822     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
   5823     // Are we doing French collation?
   5824     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
   5825 
   5826     /** initializing the collation state */
   5827     UBool notIsContinuation = FALSE;
   5828     uint32_t CE = UCOL_NO_MORE_CES;
   5829 
   5830     collIterate s;
   5831     IInit_collIterate(coll, NULL, -1, &s, status);
   5832     if(U_FAILURE(*status)) {
   5833         UTRACE_EXIT_STATUS(*status);
   5834         return 0;
   5835     }
   5836     s.iterator = iter;
   5837     s.flags |= UCOL_USE_ITERATOR;
   5838     // This variable tells us whether we have produced some other levels in this iteration
   5839     // before we moved to the identical level. In that case, we need to switch the
   5840     // type of the iterator.
   5841     UBool doingIdenticalFromStart = FALSE;
   5842     // Normalizing iterator
   5843     // The division for the array length may truncate the array size to
   5844     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   5845     // for all platforms anyway.
   5846     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   5847     UNormIterator *normIter = NULL;
   5848     // If the normalization is turned on for the collator and we are below identical level
   5849     // we will use a FCD normalizing iterator
   5850     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
   5851         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5852         s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
   5853         s.flags &= ~UCOL_ITER_NORM;
   5854         if(U_FAILURE(*status)) {
   5855             UTRACE_EXIT_STATUS(*status);
   5856             return 0;
   5857         }
   5858     } else if(level == UCOL_PSK_IDENTICAL) {
   5859         // for identical level, we need a NFD iterator. We need to instantiate it here, since we
   5860         // will be updating the state - and this cannot be done on an ordinary iterator.
   5861         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5862         s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   5863         s.flags &= ~UCOL_ITER_NORM;
   5864         if(U_FAILURE(*status)) {
   5865             UTRACE_EXIT_STATUS(*status);
   5866             return 0;
   5867         }
   5868         doingIdenticalFromStart = TRUE;
   5869     }
   5870 
   5871     // This is the tentative new state of the iterator. The problem
   5872     // is that the iterator might return an undefined state, in
   5873     // which case we should save the last valid state and increase
   5874     // the iterator skip value.
   5875     uint32_t newState = 0;
   5876 
   5877     // First, we set the iterator to the last valid position
   5878     // from the last iteration. This was saved in state[0].
   5879     if(iterState == 0) {
   5880         /* initial state */
   5881         if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
   5882             s.iterator->move(s.iterator, 0, UITER_LIMIT);
   5883         } else {
   5884             s.iterator->move(s.iterator, 0, UITER_START);
   5885         }
   5886     } else {
   5887         /* reset to previous state */
   5888         s.iterator->setState(s.iterator, iterState, status);
   5889         if(U_FAILURE(*status)) {
   5890             UTRACE_EXIT_STATUS(*status);
   5891             return 0;
   5892         }
   5893     }
   5894 
   5895 
   5896 
   5897     // This variable tells us whether we can attempt to update the state
   5898     // of iterator. Situations where we don't want to update iterator state
   5899     // are the existence of expansion CEs that are not yet processed, and
   5900     // finishing the case level without enough space in the buffer to insert
   5901     // a level terminator.
   5902     UBool canUpdateState = TRUE;
   5903 
   5904     // Consume all the CEs that were consumed at the end of the previous
   5905     // iteration without updating the iterator state. On identical level,
   5906     // consume the code points.
   5907     int32_t counter = cces;
   5908     if(level < UCOL_PSK_IDENTICAL) {
   5909         while(counter-->0) {
   5910             // If we're doing French and we are on the secondary level,
   5911             // we go backwards.
   5912             if(level == UCOL_PSK_SECONDARY && doingFrench) {
   5913                 CE = ucol_IGetPrevCE(coll, &s, status);
   5914             } else {
   5915                 CE = ucol_IGetNextCE(coll, &s, status);
   5916             }
   5917             if(CE==UCOL_NO_MORE_CES) {
   5918                 /* should not happen */
   5919                 *status=U_INTERNAL_PROGRAM_ERROR;
   5920                 UTRACE_EXIT_STATUS(*status);
   5921                 return 0;
   5922             }
   5923             if(uprv_numAvailableExpCEs(s)) {
   5924                 canUpdateState = FALSE;
   5925             }
   5926         }
   5927     } else {
   5928         while(counter-->0) {
   5929             uiter_next32(s.iterator);
   5930         }
   5931     }
   5932 
   5933     // French secondary needs to know whether the iterator state of zero came from previous level OR
   5934     // from a new invocation...
   5935     UBool wasDoingPrimary = FALSE;
   5936     // destination buffer byte counter. When this guy
   5937     // gets to count, we're done with the iteration
   5938     int32_t i = 0;
   5939     // used to count the zero bytes written after we
   5940     // have finished with the sort key
   5941     int32_t j = 0;
   5942 
   5943 
   5944     // Hm.... I think we're ready to plunge in. Basic story is as following:
   5945     // we have a fall through case based on level. This is used for initial
   5946     // positioning on iteration start. Every level processor contains a
   5947     // for(;;) which will be broken when we exhaust all the CEs. Other
   5948     // way to exit is a goto saveState, which happens when we have filled
   5949     // out our buffer.
   5950     switch(level) {
   5951     case UCOL_PSK_PRIMARY:
   5952         wasDoingPrimary = TRUE;
   5953         for(;;) {
   5954             if(i==count) {
   5955                 goto saveState;
   5956             }
   5957             // We should save the state only if we
   5958             // are sure that we are done with the
   5959             // previous iterator state
   5960             if(canUpdateState && byteCountOrFrenchDone == 0) {
   5961                 newState = s.iterator->getState(s.iterator);
   5962                 if(newState != UITER_NO_STATE) {
   5963                     iterState = newState;
   5964                     cces = 0;
   5965                 }
   5966             }
   5967             CE = ucol_IGetNextCE(coll, &s, status);
   5968             cces++;
   5969             if(CE==UCOL_NO_MORE_CES) {
   5970                 // Add the level separator
   5971                 terminatePSKLevel(level, maxLevel, i, dest);
   5972                 byteCountOrFrenchDone=0;
   5973                 // Restart the iteration and move to the
   5974                 // second level
   5975                 s.iterator->move(s.iterator, 0, UITER_START);
   5976                 cces = 0;
   5977                 level = UCOL_PSK_SECONDARY;
   5978                 break;
   5979             }
   5980             if(!isContinuation(CE)){
   5981                 if(coll->leadBytePermutationTable != NULL){
   5982                     CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
   5983                 }
   5984             }
   5985             if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5986                 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
   5987                 if(CE != 0) {
   5988                     if(byteCountOrFrenchDone == 0) {
   5989                         // get the second byte of primary
   5990                         dest[i++]=(uint8_t)(CE >> 8);
   5991                     } else {
   5992                         byteCountOrFrenchDone = 0;
   5993                     }
   5994                     if((CE &=0xff)!=0) {
   5995                         if(i==count) {
   5996                             /* overflow */
   5997                             byteCountOrFrenchDone = 1;
   5998                             cces--;
   5999                             goto saveState;
   6000                         }
   6001                         dest[i++]=(uint8_t)CE;
   6002                     }
   6003                 }
   6004             }
   6005             if(uprv_numAvailableExpCEs(s)) {
   6006                 canUpdateState = FALSE;
   6007             } else {
   6008                 canUpdateState = TRUE;
   6009             }
   6010         }
   6011         /* fall through to next level */
   6012     case UCOL_PSK_SECONDARY:
   6013         if(strength >= UCOL_SECONDARY) {
   6014             if(!doingFrench) {
   6015                 for(;;) {
   6016                     if(i == count) {
   6017                         goto saveState;
   6018                     }
   6019                     // We should save the state only if we
   6020                     // are sure that we are done with the
   6021                     // previous iterator state
   6022                     if(canUpdateState) {
   6023                         newState = s.iterator->getState(s.iterator);
   6024                         if(newState != UITER_NO_STATE) {
   6025                             iterState = newState;
   6026                             cces = 0;
   6027                         }
   6028                     }
   6029                     CE = ucol_IGetNextCE(coll, &s, status);
   6030                     cces++;
   6031                     if(CE==UCOL_NO_MORE_CES) {
   6032                         // Add the level separator
   6033                         terminatePSKLevel(level, maxLevel, i, dest);
   6034                         byteCountOrFrenchDone = 0;
   6035                         // Restart the iteration and move to the
   6036                         // next level
   6037                         s.iterator->move(s.iterator, 0, UITER_START);
   6038                         cces = 0;
   6039                         level = UCOL_PSK_CASE;
   6040                         break;
   6041                     }
   6042                     if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6043                         CE >>= 8; /* get secondary */
   6044                         if(CE != 0) {
   6045                             dest[i++]=(uint8_t)CE;
   6046                         }
   6047                     }
   6048                     if(uprv_numAvailableExpCEs(s)) {
   6049                         canUpdateState = FALSE;
   6050                     } else {
   6051                         canUpdateState = TRUE;
   6052                     }
   6053                 }
   6054             } else { // French secondary processing
   6055                 uint8_t frenchBuff[UCOL_MAX_BUFFER];
   6056                 int32_t frenchIndex = 0;
   6057                 // Here we are going backwards.
   6058                 // If the iterator is at the beggining, it should be
   6059                 // If the iterator is at the beginning, it should be
   6060                 if(wasDoingPrimary) {
   6061                     s.iterator->move(s.iterator, 0, UITER_LIMIT);
   6062                     cces = 0;
   6063                 }
   6064                 for(;;) {
   6065                     if(i == count) {
   6066                         goto saveState;
   6067                     }
   6068                     if(canUpdateState) {
   6069                         newState = s.iterator->getState(s.iterator);
   6070                         if(newState != UITER_NO_STATE) {
   6071                             iterState = newState;
   6072                             cces = 0;
   6073                         }
   6074                     }
   6075                     CE = ucol_IGetPrevCE(coll, &s, status);
   6076                     cces++;
   6077                     if(CE==UCOL_NO_MORE_CES) {
   6078                         // Add the level separator
   6079                         terminatePSKLevel(level, maxLevel, i, dest);
   6080                         byteCountOrFrenchDone = 0;
   6081                         // Restart the iteration and move to the next level
   6082                         s.iterator->move(s.iterator, 0, UITER_START);
   6083                         level = UCOL_PSK_CASE;
   6084                         break;
   6085                     }
   6086                     if(isContinuation(CE)) { // if it's a continuation, we want to save it and
   6087                         // reverse when we get a first non-continuation CE.
   6088                         CE >>= 8;
   6089                         frenchBuff[frenchIndex++] = (uint8_t)CE;
   6090                     } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6091                         CE >>= 8; /* get secondary */
   6092                         if(!frenchIndex) {
   6093                             if(CE != 0) {
   6094                                 dest[i++]=(uint8_t)CE;
   6095                             }
   6096                         } else {
   6097                             frenchBuff[frenchIndex++] = (uint8_t)CE;
   6098                             frenchIndex -= usedFrench;
   6099                             usedFrench = 0;
   6100                             while(i < count && frenchIndex) {
   6101                                 dest[i++] = frenchBuff[--frenchIndex];
   6102                                 usedFrench++;
   6103                             }
   6104                         }
   6105                     }
   6106                     if(uprv_numAvailableExpCEs(s)) {
   6107                         canUpdateState = FALSE;
   6108                     } else {
   6109                         canUpdateState = TRUE;
   6110                     }
   6111                 }
   6112             }
   6113         } else {
   6114             level = UCOL_PSK_CASE;
   6115         }
   6116         /* fall through to next level */
   6117     case UCOL_PSK_CASE:
   6118         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   6119             uint32_t caseShift = UCOL_CASE_SHIFT_START;
   6120             uint8_t caseByte = UCOL_CASE_BYTE_START;
   6121             uint8_t caseBits = 0;
   6122 
   6123             for(;;) {
   6124                 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
   6125                 if(i == count) {
   6126                     goto saveState;
   6127                 }
   6128                 // We should save the state only if we
   6129                 // are sure that we are done with the
   6130                 // previous iterator state
   6131                 if(canUpdateState) {
   6132                     newState = s.iterator->getState(s.iterator);
   6133                     if(newState != UITER_NO_STATE) {
   6134                         iterState = newState;
   6135                         cces = 0;
   6136                     }
   6137                 }
   6138                 CE = ucol_IGetNextCE(coll, &s, status);
   6139                 cces++;
   6140                 if(CE==UCOL_NO_MORE_CES) {
   6141                     // On the case level we might have an unfinished
   6142                     // case byte. Add one if it's started.
   6143                     if(caseShift != UCOL_CASE_SHIFT_START) {
   6144                         dest[i++] = caseByte;
   6145                     }
   6146                     cces = 0;
   6147                     // We have finished processing CEs on this level.
   6148                     // However, we don't know if we have enough space
   6149                     // to add a case level terminator.
   6150                     if(i < count) {
   6151                         // Add the level separator
   6152                         terminatePSKLevel(level, maxLevel, i, dest);
   6153                         // Restart the iteration and move to the
   6154                         // next level
   6155                         s.iterator->move(s.iterator, 0, UITER_START);
   6156                         level = UCOL_PSK_TERTIARY;
   6157                     } else {
   6158                         canUpdateState = FALSE;
   6159                     }
   6160                     break;
   6161                 }
   6162 
   6163                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6164                     if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
   6165                         // do the case level if we need to do it. We don't want to calculate
   6166                         // case level for primary ignorables if we have only primary strength and case level
   6167                         // otherwise we would break well formedness of CEs
   6168                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   6169                         caseBits = (uint8_t)(CE & 0xC0);
   6170                         // this copies the case level logic from the
   6171                         // sort key generation code
   6172                         if(CE != 0) {
   6173                             if (caseShift == 0) {
   6174                                 dest[i++] = caseByte;
   6175                                 caseShift = UCOL_CASE_SHIFT_START;
   6176                                 caseByte = UCOL_CASE_BYTE_START;
   6177                             }
   6178                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6179                                 if((caseBits & 0xC0) == 0) {
   6180                                     caseByte |= 1 << (--caseShift);
   6181                                 } else {
   6182                                     caseByte |= 0 << (--caseShift);
   6183                                     /* second bit */
   6184                                     if(caseShift == 0) {
   6185                                         dest[i++] = caseByte;
   6186                                         caseShift = UCOL_CASE_SHIFT_START;
   6187                                         caseByte = UCOL_CASE_BYTE_START;
   6188                                     }
   6189                                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
   6190                                 }
   6191                             } else {
   6192                                 if((caseBits & 0xC0) == 0) {
   6193                                     caseByte |= 0 << (--caseShift);
   6194                                 } else {
   6195                                     caseByte |= 1 << (--caseShift);
   6196                                     /* second bit */
   6197                                     if(caseShift == 0) {
   6198                                         dest[i++] = caseByte;
   6199                                         caseShift = UCOL_CASE_SHIFT_START;
   6200                                         caseByte = UCOL_CASE_BYTE_START;
   6201                                     }
   6202                                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
   6203                                 }
   6204                             }
   6205                         }
   6206 
   6207                     }
   6208                 }
   6209                 // Not sure this is correct for the case level - revisit
   6210                 if(uprv_numAvailableExpCEs(s)) {
   6211                     canUpdateState = FALSE;
   6212                 } else {
   6213                     canUpdateState = TRUE;
   6214                 }
   6215             }
   6216         } else {
   6217             level = UCOL_PSK_TERTIARY;
   6218         }
   6219         /* fall through to next level */
   6220     case UCOL_PSK_TERTIARY:
   6221         if(strength >= UCOL_TERTIARY) {
   6222             for(;;) {
   6223                 if(i == count) {
   6224                     goto saveState;
   6225                 }
   6226                 // We should save the state only if we
   6227                 // are sure that we are done with the
   6228                 // previous iterator state
   6229                 if(canUpdateState) {
   6230                     newState = s.iterator->getState(s.iterator);
   6231                     if(newState != UITER_NO_STATE) {
   6232                         iterState = newState;
   6233                         cces = 0;
   6234                     }
   6235                 }
   6236                 CE = ucol_IGetNextCE(coll, &s, status);
   6237                 cces++;
   6238                 if(CE==UCOL_NO_MORE_CES) {
   6239                     // Add the level separator
   6240                     terminatePSKLevel(level, maxLevel, i, dest);
   6241                     byteCountOrFrenchDone = 0;
   6242                     // Restart the iteration and move to the
   6243                     // next level
   6244                     s.iterator->move(s.iterator, 0, UITER_START);
   6245                     cces = 0;
   6246                     level = UCOL_PSK_QUATERNARY;
   6247                     break;
   6248                 }
   6249                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   6250                     notIsContinuation = !isContinuation(CE);
   6251 
   6252                     if(notIsContinuation) {
   6253                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   6254                         CE ^= coll->caseSwitch;
   6255                         CE &= coll->tertiaryMask;
   6256                     } else {
   6257                         CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   6258                     }
   6259 
   6260                     if(CE != 0) {
   6261                         dest[i++]=(uint8_t)CE;
   6262                     }
   6263                 }
   6264                 if(uprv_numAvailableExpCEs(s)) {
   6265                     canUpdateState = FALSE;
   6266                 } else {
   6267                     canUpdateState = TRUE;
   6268                 }
   6269             }
   6270         } else {
   6271             // if we're not doing tertiary
   6272             // skip to the end
   6273             level = UCOL_PSK_NULL;
   6274         }
   6275         /* fall through to next level */
   6276     case UCOL_PSK_QUATERNARY:
   6277         if(strength >= UCOL_QUATERNARY) {
   6278             for(;;) {
   6279                 if(i == count) {
   6280                     goto saveState;
   6281                 }
   6282                 // We should save the state only if we
   6283                 // are sure that we are done with the
   6284                 // previous iterator state
   6285                 if(canUpdateState) {
   6286                     newState = s.iterator->getState(s.iterator);
   6287                     if(newState != UITER_NO_STATE) {
   6288                         iterState = newState;
   6289                         cces = 0;
   6290                     }
   6291                 }
   6292                 CE = ucol_IGetNextCE(coll, &s, status);
   6293                 cces++;
   6294                 if(CE==UCOL_NO_MORE_CES) {
   6295                     // Add the level separator
   6296                     terminatePSKLevel(level, maxLevel, i, dest);
   6297                     //dest[i++] = UCOL_LEVELTERMINATOR;
   6298                     byteCountOrFrenchDone = 0;
   6299                     // Restart the iteration and move to the
   6300                     // next level
   6301                     s.iterator->move(s.iterator, 0, UITER_START);
   6302                     cces = 0;
   6303                     level = UCOL_PSK_QUIN;
   6304                     break;
   6305                 }
   6306                 if(CE==0)
   6307                     continue;
   6308                 if(isShiftedCE(CE, LVT, &wasShifted)) {
   6309                     CE >>= 16; /* get primary */
   6310                     if(CE != 0) {
   6311                         if(byteCountOrFrenchDone == 0) {
   6312                             dest[i++]=(uint8_t)(CE >> 8);
   6313                         } else {
   6314                             byteCountOrFrenchDone = 0;
   6315                         }
   6316                         if((CE &=0xff)!=0) {
   6317                             if(i==count) {
   6318                                 /* overflow */
   6319                                 byteCountOrFrenchDone = 1;
   6320                                 goto saveState;
   6321                             }
   6322                             dest[i++]=(uint8_t)CE;
   6323                         }
   6324                     }
   6325                 } else {
   6326                     notIsContinuation = !isContinuation(CE);
   6327                     if(notIsContinuation) {
   6328                         if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   6329                             dest[i++] = UCOL_HIRAGANA_QUAD;
   6330                         } else {
   6331                             dest[i++] = 0xFF;
   6332                         }
   6333                     }
   6334                 }
   6335                 if(uprv_numAvailableExpCEs(s)) {
   6336                     canUpdateState = FALSE;
   6337                 } else {
   6338                     canUpdateState = TRUE;
   6339                 }
   6340             }
   6341         } else {
   6342             // if we're not doing quaternary
   6343             // skip to the end
   6344             level = UCOL_PSK_NULL;
   6345         }
   6346         /* fall through to next level */
   6347     case UCOL_PSK_QUIN:
   6348         level = UCOL_PSK_IDENTICAL;
   6349         /* fall through to next level */
   6350     case UCOL_PSK_IDENTICAL:
   6351         if(strength >= UCOL_IDENTICAL) {
   6352             UChar32 first, second;
   6353             int32_t bocsuBytesWritten = 0;
   6354             // We always need to do identical on
   6355             // the NFD form of the string.
   6356             if(normIter == NULL) {
   6357                 // we arrived from the level below and
   6358                 // normalization was not turned on.
   6359                 // therefore, we need to make a fresh NFD iterator
   6360                 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   6361                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   6362             } else if(!doingIdenticalFromStart) {
   6363                 // there is an iterator, but we did some other levels.
   6364                 // therefore, we have a FCD iterator - need to make
   6365                 // a NFD one.
   6366                 // normIter being at the beginning does not guarantee
   6367                 // that the underlying iterator is at the beginning
   6368                 iter->move(iter, 0, UITER_START);
   6369                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   6370             }
   6371             // At this point we have a NFD iterator that is positioned
   6372             // in the right place
   6373             if(U_FAILURE(*status)) {
   6374                 UTRACE_EXIT_STATUS(*status);
   6375                 return 0;
   6376             }
   6377             first = uiter_previous32(s.iterator);
   6378             // maybe we're at the start of the string
   6379             if(first == U_SENTINEL) {
   6380                 first = 0;
   6381             } else {
   6382                 uiter_next32(s.iterator);
   6383             }
   6384 
   6385             j = 0;
   6386             for(;;) {
   6387                 if(i == count) {
   6388                     if(j+1 < bocsuBytesWritten) {
   6389                         bocsuBytesUsed = j+1;
   6390                     }
   6391                     goto saveState;
   6392                 }
   6393 
   6394                 // On identical level, we will always save
   6395                 // the state if we reach this point, since
   6396                 // we don't depend on getNextCE for content
   6397                 // all the content is in our buffer and we
   6398                 // already either stored the full buffer OR
   6399                 // otherwise we won't arrive here.
   6400                 newState = s.iterator->getState(s.iterator);
   6401                 if(newState != UITER_NO_STATE) {
   6402                     iterState = newState;
   6403                     cces = 0;
   6404                 }
   6405 
   6406                 uint8_t buff[4];
   6407                 second = uiter_next32(s.iterator);
   6408                 cces++;
   6409 
   6410                 // end condition for identical level
   6411                 if(second == U_SENTINEL) {
   6412                     terminatePSKLevel(level, maxLevel, i, dest);
   6413                     level = UCOL_PSK_NULL;
   6414                     break;
   6415                 }
   6416                 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
   6417                 first = second;
   6418 
   6419                 j = 0;
   6420                 if(bocsuBytesUsed != 0) {
   6421                     while(bocsuBytesUsed-->0) {
   6422                         j++;
   6423                     }
   6424                 }
   6425 
   6426                 while(i < count && j < bocsuBytesWritten) {
   6427                     dest[i++] = buff[j++];
   6428                 }
   6429             }
   6430 
   6431         } else {
   6432             level = UCOL_PSK_NULL;
   6433         }
   6434         /* fall through to next level */
   6435     case UCOL_PSK_NULL:
   6436         j = i;
   6437         while(j<count) {
   6438             dest[j++]=0;
   6439         }
   6440         break;
   6441     default:
   6442         *status = U_INTERNAL_PROGRAM_ERROR;
   6443         UTRACE_EXIT_STATUS(*status);
   6444         return 0;
   6445     }
   6446 
   6447 saveState:
   6448     // Now we need to return stuff. First we want to see whether we have
   6449     // done everything for the current state of iterator.
   6450     if(byteCountOrFrenchDone
   6451         || canUpdateState == FALSE
   6452         || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
   6453     {
   6454         // Any of above mean that the previous transaction
   6455         // wasn't finished and that we should store the
   6456         // previous iterator state.
   6457         state[0] = iterState;
   6458     } else {
   6459         // The transaction is complete. We will continue in the next iteration.
   6460         state[0] = s.iterator->getState(s.iterator);
   6461         cces = 0;
   6462     }
   6463     // Store the number of bocsu bytes written.
   6464     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
   6465         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6466     }
   6467     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
   6468 
   6469     // Next we put in the level of comparison
   6470     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
   6471 
   6472     // If we are doing French, we need to store whether we have just finished the French level
   6473     if(level == UCOL_PSK_SECONDARY && doingFrench) {
   6474         state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6475     } else {
   6476         state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6477     }
   6478 
   6479     // Was the latest CE shifted
   6480     if(wasShifted) {
   6481         state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
   6482     }
   6483     // Check for cces overflow
   6484     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
   6485         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6486     }
   6487     // Store cces
   6488     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
   6489 
   6490     // Check for French overflow
   6491     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
   6492         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6493     }
   6494     // Store number of bytes written in the French secondary continuation sequence
   6495     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
   6496 
   6497 
   6498     // If we have used normalizing iterator, get rid of it
   6499     if(normIter != NULL) {
   6500         unorm_closeIter(normIter);
   6501     }
   6502 
   6503     /* To avoid memory leak, free the offset buffer if necessary. */
   6504     ucol_freeOffsetBuffer(&s);
   6505 
   6506     // Return number of meaningful sortkey bytes.
   6507     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
   6508                   dest,i, state[0], state[1]);
   6509     UTRACE_EXIT_VALUE(i);
   6510     return i;
   6511 }
   6512 
   6513 /**
   6514  * Produce a bound for a given sortkey and a number of levels.
   6515  */
   6516 U_CAPI int32_t U_EXPORT2
   6517 ucol_getBound(const uint8_t       *source,
   6518         int32_t             sourceLength,
   6519         UColBoundMode       boundType,
   6520         uint32_t            noOfLevels,
   6521         uint8_t             *result,
   6522         int32_t             resultLength,
   6523         UErrorCode          *status)
   6524 {
   6525     // consistency checks
   6526     if(status == NULL || U_FAILURE(*status)) {
   6527         return 0;
   6528     }
   6529     if(source == NULL) {
   6530         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6531         return 0;
   6532     }
   6533 
   6534     int32_t sourceIndex = 0;
   6535     // Scan the string until we skip enough of the key OR reach the end of the key
   6536     do {
   6537         sourceIndex++;
   6538         if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
   6539             noOfLevels--;
   6540         }
   6541     } while (noOfLevels > 0
   6542         && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
   6543 
   6544     if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
   6545         && noOfLevels > 0) {
   6546             *status = U_SORT_KEY_TOO_SHORT_WARNING;
   6547     }
   6548 
   6549 
   6550     // READ ME: this code assumes that the values for boundType
   6551     // enum will not changes. They are set so that the enum value
   6552     // corresponds to the number of extra bytes each bound type
   6553     // needs.
   6554     if(result != NULL && resultLength >= sourceIndex+boundType) {
   6555         uprv_memcpy(result, source, sourceIndex);
   6556         switch(boundType) {
   6557             // Lower bound just gets terminated. No extra bytes
   6558         case UCOL_BOUND_LOWER: // = 0
   6559             break;
   6560             // Upper bound needs one extra byte
   6561         case UCOL_BOUND_UPPER: // = 1
   6562             result[sourceIndex++] = 2;
   6563             break;
   6564             // Upper long bound needs two extra bytes
   6565         case UCOL_BOUND_UPPER_LONG: // = 2
   6566             result[sourceIndex++] = 0xFF;
   6567             result[sourceIndex++] = 0xFF;
   6568             break;
   6569         default:
   6570             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6571             return 0;
   6572         }
   6573         result[sourceIndex++] = 0;
   6574 
   6575         return sourceIndex;
   6576     } else {
   6577         return sourceIndex+boundType+1;
   6578     }
   6579 }
   6580 
   6581 /****************************************************************************/
   6582 /* Following are the functions that deal with the properties of a collator  */
   6583 /* there are new APIs and some compatibility APIs                           */
   6584 /****************************************************************************/
   6585 
   6586 static inline void
   6587 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
   6588                     int32_t *primShift, int32_t *secShift, int32_t *terShift)
   6589 {
   6590     uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
   6591     UBool reverseSecondary = FALSE;
   6592     UBool continuation = isContinuation(CE);
   6593     if(!continuation) {
   6594         tertiary = (uint8_t)((CE & coll->tertiaryMask));
   6595         tertiary ^= coll->caseSwitch;
   6596         reverseSecondary = TRUE;
   6597     } else {
   6598         tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   6599         tertiary &= UCOL_REMOVE_CASE;
   6600         reverseSecondary = FALSE;
   6601     }
   6602 
   6603     secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6604     primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6605     primary1 = (uint8_t)(CE >> 8);
   6606 
   6607     if(primary1 != 0) {
   6608         if (coll->leadBytePermutationTable != NULL && !continuation) {
   6609             primary1 = coll->leadBytePermutationTable[primary1];
   6610         }
   6611 
   6612         coll->latinOneCEs[ch] |= (primary1 << *primShift);
   6613         *primShift -= 8;
   6614     }
   6615     if(primary2 != 0) {
   6616         if(*primShift < 0) {
   6617             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6618             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6619             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6620             return;
   6621         }
   6622         coll->latinOneCEs[ch] |= (primary2 << *primShift);
   6623         *primShift -= 8;
   6624     }
   6625     if(secondary != 0) {
   6626         if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
   6627             coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
   6628             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
   6629         } else { // normal case
   6630             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
   6631         }
   6632         *secShift -= 8;
   6633     }
   6634     if(tertiary != 0) {
   6635         coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
   6636         *terShift -= 8;
   6637     }
   6638 }
   6639 
   6640 static inline UBool
   6641 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
   6642     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
   6643     if(newTable == NULL) {
   6644       *status = U_MEMORY_ALLOCATION_ERROR;
   6645       coll->latinOneFailed = TRUE;
   6646       return FALSE;
   6647     }
   6648     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
   6649     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
   6650     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
   6651     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
   6652     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
   6653     coll->latinOneTableLen = size;
   6654     uprv_free(coll->latinOneCEs);
   6655     coll->latinOneCEs = newTable;
   6656     return TRUE;
   6657 }
   6658 
/*
 * Builds (or rebuilds) the Latin-1 fast path table of coll.
 *
 * Allocates the table (three rows of UCOL_LATINONETABLELEN entries) if it
 * does not exist yet, clears it, and then fills one entry per code unit from
 * 0 to UCOL_ENDOFLATINONERANGE. Contractions get additional entries appended
 * after UCOL_ENDOFLATINONERANGE; the table is grown on demand and compacted
 * to its used size at the end.
 *
 * Returns TRUE when the table was built. Returns FALSE on failure; paths that
 * reach cleanup_after_failure also set coll->latinOneFailed.
 */
static UBool
ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
    UBool result = TRUE;
    if(coll->latinOneCEs == NULL) {
        coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
        if(coll->latinOneCEs == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return FALSE;
        }
        coll->latinOneTableLen = UCOL_LATINONETABLELEN;
    }
    UChar ch = 0;
    // Iterator used below to expand characters whose CE is an expansion or digit.
    UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
    // Bail out if the iterator could not be opened
    if (U_FAILURE(*status)) {
        return FALSE;
    }
    uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);

    // Bit positions for the next primary/secondary/tertiary byte of the current entry.
    int32_t primShift = 24, secShift = 24, terShift = 24;
    uint32_t CE = 0;
    // Index of the next free entry used for contraction results.
    int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;

    // TODO: make safe if you get more than you wanted...
    for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
        primShift = 24; secShift = 24; terShift = 24;
        if(ch < 0x100) {
            CE = coll->latinOneMapping[ch];
        } else {
            // Look up in the tailoring first, then fall back to the UCA mapping.
            CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            if(CE == UCOL_NOT_FOUND && coll->UCA) {
                CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
            }
        }
        if(CE < UCOL_NOT_FOUND) {
            // Plain CE: store it directly.
            ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
        } else {
            switch (getCETag(CE)) {
            case EXPANSION_TAG:
            case DIGIT_TAG:
                // Let the element iterator produce the CEs and pack them one by one.
                ucol_setText(it, &ch, 1, status);
                while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                        // Too many weights for one entry: mark it as bail-out.
                        coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        break;
                    }
                    ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                }
                break;
            case CONTRACTION_TAG:
                // here is the trick
                // F2 is contraction. We do something very similar to contractions
                // but have two indices, one in the real contraction table and the
                // other to where we stuffed things. This hopes that we don't have
                // many contractions (this should work for latin-1 tables).
                {
                    // Bits 12..23 must be free; they receive the latin-1 table offset below.
                    if((CE & 0x00FFF000) != 0) {
                        *status = U_UNSUPPORTED_ERROR;
                        goto cleanup_after_failure;
                    }

                    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);

                    CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table

                    coll->latinOneCEs[ch] = CE;
                    coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
                    coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;

                    // We're going to jump into contraction table, pick the elements
                    // and use them
                    do {
                        CE = *(coll->contractionCEs +
                            (UCharOffset - coll->contractionIndex));
                        if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
                            uint32_t size;
                            uint32_t i;    /* general counter */
                            uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
                            size = getExpansionCount(CE);
                            //CE = *CEOffset++;
                            if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
                                for(i = 0; i<size; i++) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            } else { /* else, we do */
                                while(*CEOffset != 0) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            }
                            contractionOffset++;
                        } else if(CE < UCOL_NOT_FOUND) {
                            ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
                        } else {
                            // Any other special CE inside a contraction cannot be handled here.
                            coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            contractionOffset++;
                        }
                        UCharOffset++;
                        // Each contraction result starts a fresh entry.
                        primShift = 24; secShift = 24; terShift = 24;
                        if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
                            if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
                                goto cleanup_after_failure;
                            }
                        }
                    } while(*UCharOffset != 0xFFFF);
                }
                break;;
            case SPEC_PROC_TAG:
                {
                    // 0xB7 is a precontext character defined in UCA5.1, a special
                    // handle is implemented in order to save LatinOne table for
                    // most locales.
                    if (ch==0xb7) {
                        ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                    }
                    else {
                        goto cleanup_after_failure;
                    }
                }
                break;
            default:
                goto cleanup_after_failure;
            }
        }
    }
    // compact table
    if(contractionOffset < coll->latinOneTableLen) {
        if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
            goto cleanup_after_failure;
        }
    }
    ucol_closeElements(it);
    return result;

cleanup_after_failure:
    // NOTE(review): *status is not set on every path that jumps here (the
    // SPEC_PROC_TAG and default cases leave it untouched), so callers must
    // rely on the FALSE return value rather than on *status.
    coll->latinOneFailed = TRUE;
    ucol_closeElements(it);
    return FALSE;
}
   6814 
   6815 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
   6816     if(U_SUCCESS(*status)) {
   6817         if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6818             coll->caseSwitch = UCOL_CASE_SWITCH;
   6819         } else {
   6820             coll->caseSwitch = UCOL_NO_CASE_SWITCH;
   6821         }
   6822 
   6823         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
   6824             coll->tertiaryMask = UCOL_REMOVE_CASE;
   6825             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6826             coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
   6827             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
   6828             coll->tertiaryBottom = UCOL_COMMON_BOT3;
   6829         } else {
   6830             coll->tertiaryMask = UCOL_KEEP_CASE;
   6831             coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
   6832             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6833                 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
   6834                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
   6835                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
   6836             } else {
   6837                 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6838                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
   6839                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
   6840             }
   6841         }
   6842 
   6843         /* Set the compression values */
   6844         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
   6845         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
   6846         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
   6847 
   6848         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
   6849             && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
   6850         {
   6851             coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
   6852         } else {
   6853             coll->sortKeyGen = ucol_calcSortKey;
   6854         }
   6855         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
   6856             && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
   6857         {
   6858             if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
   6859                 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
   6860                     //fprintf(stderr, "F");
   6861                     coll->latinOneUse = TRUE;
   6862                 } else {
   6863                     coll->latinOneUse = FALSE;
   6864                 }
   6865                 if(*status == U_UNSUPPORTED_ERROR) {
   6866                     *status = U_ZERO_ERROR;
   6867                 }
   6868             } else { // latin1Table exists and it doesn't need to be regenerated, just use it
   6869                 coll->latinOneUse = TRUE;
   6870             }
   6871         } else {
   6872             coll->latinOneUse = FALSE;
   6873         }
   6874     }
   6875 }
   6876 
   6877 U_CAPI uint32_t  U_EXPORT2
   6878 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
   6879     if(U_FAILURE(*status) || coll == NULL) {
   6880         return 0;
   6881     }
   6882     if(len == -1) {
   6883         len = u_strlen(varTop);
   6884     }
   6885     if(len == 0) {
   6886         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6887         return 0;
   6888     }
   6889 
   6890     collIterate s;
   6891     IInit_collIterate(coll, varTop, len, &s, status);
   6892     if(U_FAILURE(*status)) {
   6893         return 0;
   6894     }
   6895 
   6896     uint32_t CE = ucol_IGetNextCE(coll, &s, status);
   6897 
   6898     /* here we check if we have consumed all characters */
   6899     /* you can put in either one character or a contraction */
   6900     /* you shouldn't put more... */
   6901     if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
   6902         *status = U_CE_NOT_FOUND_ERROR;
   6903         return 0;
   6904     }
   6905 
   6906     uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
   6907 
   6908     if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
   6909         *status = U_PRIMARY_TOO_LONG_ERROR;
   6910         return 0;
   6911     }
   6912     if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
   6913         coll->variableTopValueisDefault = FALSE;
   6914         coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
   6915     }
   6916 
   6917     /* To avoid memory leak, free the offset buffer if necessary. */
   6918     ucol_freeOffsetBuffer(&s);
   6919 
   6920     return CE & UCOL_PRIMARYMASK;
   6921 }
   6922 
   6923 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
   6924     if(U_FAILURE(*status) || coll == NULL) {
   6925         return 0;
   6926     }
   6927     return coll->variableTopValue<<16;
   6928 }
   6929 
   6930 U_CAPI void  U_EXPORT2
   6931 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
   6932     if(U_FAILURE(*status) || coll == NULL) {
   6933         return;
   6934     }
   6935 
   6936     if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
   6937         coll->variableTopValueisDefault = FALSE;
   6938         coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
   6939     }
   6940 }
   6941 /* Attribute setter API */
   6942 U_CAPI void  U_EXPORT2
   6943 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
   6944     if(U_FAILURE(*status) || coll == NULL) {
   6945       return;
   6946     }
   6947     UColAttributeValue oldFrench = coll->frenchCollation;
   6948     UColAttributeValue oldCaseFirst = coll->caseFirst;
   6949     switch(attr) {
   6950     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
   6951         if(value == UCOL_ON) {
   6952             coll->numericCollation = UCOL_ON;
   6953             coll->numericCollationisDefault = FALSE;
   6954         } else if (value == UCOL_OFF) {
   6955             coll->numericCollation = UCOL_OFF;
   6956             coll->numericCollationisDefault = FALSE;
   6957         } else if (value == UCOL_DEFAULT) {
   6958             coll->numericCollationisDefault = TRUE;
   6959             coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
   6960         } else {
   6961             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6962         }
   6963         break;
   6964     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
   6965         if(value == UCOL_ON) {
   6966             coll->hiraganaQ = UCOL_ON;
   6967             coll->hiraganaQisDefault = FALSE;
   6968         } else if (value == UCOL_OFF) {
   6969             coll->hiraganaQ = UCOL_OFF;
   6970             coll->hiraganaQisDefault = FALSE;
   6971         } else if (value == UCOL_DEFAULT) {
   6972             coll->hiraganaQisDefault = TRUE;
   6973             coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
   6974         } else {
   6975             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6976         }
   6977         break;
   6978     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   6979         if(value == UCOL_ON) {
   6980             coll->frenchCollation = UCOL_ON;
   6981             coll->frenchCollationisDefault = FALSE;
   6982         } else if (value == UCOL_OFF) {
   6983             coll->frenchCollation = UCOL_OFF;
   6984             coll->frenchCollationisDefault = FALSE;
   6985         } else if (value == UCOL_DEFAULT) {
   6986             coll->frenchCollationisDefault = TRUE;
   6987             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
   6988         } else {
   6989             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6990         }
   6991         break;
   6992     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   6993         if(value == UCOL_SHIFTED) {
   6994             coll->alternateHandling = UCOL_SHIFTED;
   6995             coll->alternateHandlingisDefault = FALSE;
   6996         } else if (value == UCOL_NON_IGNORABLE) {
   6997             coll->alternateHandling = UCOL_NON_IGNORABLE;
   6998             coll->alternateHandlingisDefault = FALSE;
   6999         } else if (value == UCOL_DEFAULT) {
   7000             coll->alternateHandlingisDefault = TRUE;
   7001             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
   7002         } else {
   7003             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7004         }
   7005         break;
   7006     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   7007         if(value == UCOL_LOWER_FIRST) {
   7008             coll->caseFirst = UCOL_LOWER_FIRST;
   7009             coll->caseFirstisDefault = FALSE;
   7010         } else if (value == UCOL_UPPER_FIRST) {
   7011             coll->caseFirst = UCOL_UPPER_FIRST;
   7012             coll->caseFirstisDefault = FALSE;
   7013         } else if (value == UCOL_OFF) {
   7014             coll->caseFirst = UCOL_OFF;
   7015             coll->caseFirstisDefault = FALSE;
   7016         } else if (value == UCOL_DEFAULT) {
   7017             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
   7018             coll->caseFirstisDefault = TRUE;
   7019         } else {
   7020             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7021         }
   7022         break;
   7023     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   7024         if(value == UCOL_ON) {
   7025             coll->caseLevel = UCOL_ON;
   7026             coll->caseLevelisDefault = FALSE;
   7027         } else if (value == UCOL_OFF) {
   7028             coll->caseLevel = UCOL_OFF;
   7029             coll->caseLevelisDefault = FALSE;
   7030         } else if (value == UCOL_DEFAULT) {
   7031             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
   7032             coll->caseLevelisDefault = TRUE;
   7033         } else {
   7034             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7035         }
   7036         break;
   7037     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   7038         if(value == UCOL_ON) {
   7039             coll->normalizationMode = UCOL_ON;
   7040             coll->normalizationModeisDefault = FALSE;
   7041             initializeFCD(status);
   7042         } else if (value == UCOL_OFF) {
   7043             coll->normalizationMode = UCOL_OFF;
   7044             coll->normalizationModeisDefault = FALSE;
   7045         } else if (value == UCOL_DEFAULT) {
   7046             coll->normalizationModeisDefault = TRUE;
   7047             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
   7048             if(coll->normalizationMode == UCOL_ON) {
   7049                 initializeFCD(status);
   7050             }
   7051         } else {
   7052             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7053         }
   7054         break;
   7055     case UCOL_STRENGTH:         /* attribute for strength */
   7056         if (value == UCOL_DEFAULT) {
   7057             coll->strengthisDefault = TRUE;
   7058             coll->strength = (UColAttributeValue)coll->options->strength;
   7059         } else if (value <= UCOL_IDENTICAL) {
   7060             coll->strengthisDefault = FALSE;
   7061             coll->strength = value;
   7062         } else {
   7063             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   7064         }
   7065         break;
   7066     case UCOL_ATTRIBUTE_COUNT:
   7067     default:
   7068         *status = U_ILLEGAL_ARGUMENT_ERROR;
   7069         break;
   7070     }
   7071     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
   7072         coll->latinOneRegenTable = TRUE;
   7073     } else {
   7074         coll->latinOneRegenTable = FALSE;
   7075     }
   7076     ucol_updateInternalState(coll, status);
   7077 }
   7078 
   7079 U_CAPI UColAttributeValue  U_EXPORT2
   7080 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
   7081     if(U_FAILURE(*status) || coll == NULL) {
   7082       return UCOL_DEFAULT;
   7083     }
   7084     switch(attr) {
   7085     case UCOL_NUMERIC_COLLATION:
   7086       return coll->numericCollation;
   7087     case UCOL_HIRAGANA_QUATERNARY_MODE:
   7088       return coll->hiraganaQ;
   7089     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   7090         return coll->frenchCollation;
   7091     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   7092         return coll->alternateHandling;
   7093     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   7094         return coll->caseFirst;
   7095     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   7096         return coll->caseLevel;
   7097     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   7098         return coll->normalizationMode;
   7099     case UCOL_STRENGTH:         /* attribute for strength */
   7100         return coll->strength;
   7101     case UCOL_ATTRIBUTE_COUNT:
   7102     default:
   7103         *status = U_ILLEGAL_ARGUMENT_ERROR;
   7104         break;
   7105     }
   7106     return UCOL_DEFAULT;
   7107 }
   7108 
   7109 U_CAPI void U_EXPORT2
   7110 ucol_setStrength(    UCollator                *coll,
   7111             UCollationStrength        strength)
   7112 {
   7113     UErrorCode status = U_ZERO_ERROR;
   7114     ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
   7115 }
   7116 
   7117 U_CAPI UCollationStrength U_EXPORT2
   7118 ucol_getStrength(const UCollator *coll)
   7119 {
   7120     UErrorCode status = U_ZERO_ERROR;
   7121     return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
   7122 }
   7123 
   7124 U_INTERNAL int32_t U_EXPORT2
   7125 ucol_getReorderCodes(const UCollator *coll,
   7126                     int32_t *dest,
   7127                     int32_t destCapacity,
   7128                     UErrorCode *pErrorCode) {
   7129     if (U_FAILURE(*pErrorCode)) {
   7130         return 0;
   7131     }
   7132 
   7133     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
   7134         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   7135         return 0;
   7136     }
   7137 
   7138     if (coll->reorderCodesLength > destCapacity) {
   7139         *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
   7140         return coll->reorderCodesLength;
   7141     }
   7142     for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
   7143         dest[i] = coll->reorderCodes[i];
   7144     }
   7145     return coll->reorderCodesLength;
   7146 }
   7147 
   7148 U_INTERNAL void U_EXPORT2
   7149 ucol_setReorderCodes(UCollator *coll,
   7150                     const int32_t *reorderCodes,
   7151                     int32_t reorderCodesLength,
   7152                     UErrorCode *pErrorCode) {
   7153     if (U_FAILURE(*pErrorCode)) {
   7154         return;
   7155     }
   7156 
   7157     if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
   7158         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   7159         return;
   7160     }
   7161 
   7162     uprv_free(coll->reorderCodes);
   7163     coll->reorderCodes = NULL;
   7164     coll->reorderCodesLength = 0;
   7165     if (reorderCodesLength == 0) {
   7166         uprv_free(coll->leadBytePermutationTable);
   7167         coll->leadBytePermutationTable = NULL;
   7168         return;
   7169     }
   7170     coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
   7171     if (coll->reorderCodes == NULL) {
   7172         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
   7173         return;
   7174     }
   7175     for (int32_t i = 0; i < reorderCodesLength; i++) {
   7176         coll->reorderCodes[i] = reorderCodes[i];
   7177     }
   7178     coll->reorderCodesLength = reorderCodesLength;
   7179     ucol_buildPermutationTable(coll, pErrorCode);
   7180     if (U_FAILURE(*pErrorCode)) {
   7181         uprv_free(coll->reorderCodes);
   7182         coll->reorderCodes = NULL;
   7183         coll->reorderCodesLength = 0;
   7184     }
   7185 }
   7186 
   7187 
   7188 /****************************************************************************/
   7189 /* Following are misc functions                                             */
   7190 /* there are new APIs and some compatibility APIs                           */
   7191 /****************************************************************************/
   7192 
   7193 U_CAPI void U_EXPORT2
   7194 ucol_getVersion(const UCollator* coll,
   7195                 UVersionInfo versionInfo)
   7196 {
   7197     /* RunTime version  */
   7198     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
   7199     /* Builder version*/
   7200     uint8_t bdVersion = coll->image->version[0];
   7201 
   7202     /* Charset Version. Need to get the version from cnv files
   7203      * makeconv should populate cnv files with version and
   7204      * an api has to be provided in ucnv.h to obtain this version
   7205      */
   7206     uint8_t csVersion = 0;
   7207 
   7208     /* combine the version info */
   7209     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
   7210 
   7211     /* Tailoring rules */
   7212     versionInfo[0] = (uint8_t)(cmbVersion>>8);
   7213     versionInfo[1] = (uint8_t)cmbVersion;
   7214     versionInfo[2] = coll->image->version[1];
   7215     if(coll->UCA) {
   7216         /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
   7217         versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
   7218     } else {
   7219         versionInfo[3] = 0;
   7220     }
   7221 }
   7222 
   7223 
   7224 /* This internal API checks whether a character is tailored or not */
   7225 U_CAPI UBool  U_EXPORT2
   7226 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
   7227     if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
   7228         return FALSE;
   7229     }
   7230 
   7231     uint32_t CE = UCOL_NOT_FOUND;
   7232     const UChar *ContractionStart = NULL;
   7233     if(u < 0x100) { /* latin-1 */
   7234         CE = coll->latinOneMapping[u];
   7235         if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
   7236             return FALSE;
   7237         }
   7238     } else { /* regular */
   7239         CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
   7240     }
   7241 
   7242     if(isContraction(CE)) {
   7243         ContractionStart = (UChar *)coll->image+getContractOffset(CE);
   7244         CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
   7245     }
   7246 
   7247     return (UBool)(CE != UCOL_NOT_FOUND);
   7248 }
   7249 
   7250 
   7251 /****************************************************************************/
   7252 /* Following are the string compare functions                               */
   7253 /*                                                                          */
   7254 /****************************************************************************/
   7255 
   7256 
   7257 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
   7258 /*                     Used by strcoll if strength == identical and strings  */
   7259 /*                     are otherwise equal.                                  */
   7260 /*                                                                           */
   7261 /*                     Comparison must be done on NFD normalized strings.    */
   7262 /*                     FCD is not good enough.                               */
   7263 
   7264 static
   7265 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
   7266 {
   7267     // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
   7268     // of same type, but that doesn't really mean that it will stay that way.
   7269     int32_t            comparison;
   7270 
   7271     if (sColl->flags & UCOL_USE_ITERATOR) {
   7272         // The division for the array length may truncate the array size to
   7273         // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   7274         // for all platforms anyway.
   7275         UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   7276         UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   7277         UNormIterator *sNIt = NULL, *tNIt = NULL;
   7278         sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   7279         tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   7280         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   7281         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   7282         UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
   7283         UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
   7284         comparison = u_strCompareIter(sIt, tIt, TRUE);
   7285         unorm_closeIter(sNIt);
   7286         unorm_closeIter(tNIt);
   7287     } else {
   7288         int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
   7289         const UChar *sBuf = sColl->string;
   7290         int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
   7291         const UChar *tBuf = tColl->string;
   7292 
   7293         if (normalize) {
   7294             *status = U_ZERO_ERROR;
   7295             // Note: We could use Normalizer::compare() or similar, but for short strings
   7296             // which may not be in FCD it might be faster to just NFD them.
   7297             // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
   7298             // NFD'ing immediately might be faster for long strings,
   7299             // but string comparison is usually done on relatively short strings.
   7300             sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
   7301                                   sColl->writableBuffer,
   7302                                   *status);
   7303             tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
   7304                                   tColl->writableBuffer,
   7305                                   *status);
   7306             if(U_FAILURE(*status)) {
   7307                 return UCOL_LESS;
   7308             }
   7309             comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
   7310         } else {
   7311             comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
   7312         }
   7313     }
   7314 
   7315     if (comparison < 0) {
   7316         return UCOL_LESS;
   7317     } else if (comparison == 0) {
   7318         return UCOL_EQUAL;
   7319     } else /* comparison > 0 */ {
   7320         return UCOL_GREATER;
   7321     }
   7322 }
   7323 
   7324 /*  CEBuf - A struct and some inline functions to handle the saving    */
   7325 /*          of CEs in a buffer within ucol_strcoll                     */
   7326 
   7327 #define UCOL_CEBUF_SIZE 512
   7328 typedef struct ucol_CEBuf {
   7329     uint32_t    *buf;
   7330     uint32_t    *endp;
   7331     uint32_t    *pos;
   7332     uint32_t     localArray[UCOL_CEBUF_SIZE];
   7333 } ucol_CEBuf;
   7334 
   7335 
   7336 static
   7337 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
   7338     (b)->buf = (b)->pos = (b)->localArray;
   7339     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
   7340 }
   7341 
   7342 static
   7343 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
   7344     uint32_t  oldSize;
   7345     uint32_t  newSize;
   7346     uint32_t  *newBuf;
   7347 
   7348     ci->flags |= UCOL_ITER_ALLOCATED;
   7349     oldSize = (uint32_t)(b->pos - b->buf);
   7350     newSize = oldSize * 2;
   7351     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
   7352     if(newBuf == NULL) {
   7353         *status = U_MEMORY_ALLOCATION_ERROR;
   7354     }
   7355     else {
   7356         uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
   7357         if (b->buf != b->localArray) {
   7358             uprv_free(b->buf);
   7359         }
   7360         b->buf = newBuf;
   7361         b->endp = b->buf + newSize;
   7362         b->pos  = b->buf + oldSize;
   7363     }
   7364 }
   7365 
   7366 static
   7367 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
   7368     if (b->pos == b->endp) {
   7369         ucol_CEBuf_Expand(b, ci, status);
   7370     }
   7371     if (U_SUCCESS(*status)) {
   7372         *(b)->pos++ = ce;
   7373     }
   7374 }
   7375 
   7376 /* This is a trick string compare function that goes in and uses sortkeys to compare */
   7377 /* It is used when compare gets in trouble and needs to bail out                     */
   7378 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
   7379                                                   collIterate *tColl,
   7380                                                   UErrorCode *status)
   7381 {
   7382     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
   7383     uint8_t *sourceKeyP = sourceKey;
   7384     uint8_t *targetKeyP = targetKey;
   7385     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
   7386     const UCollator *coll = sColl->coll;
   7387     const UChar *source = NULL;
   7388     const UChar *target = NULL;
   7389     int32_t result = UCOL_EQUAL;
   7390     UnicodeString sourceString, targetString;
   7391     int32_t sourceLength;
   7392     int32_t targetLength;
   7393 
   7394     if(sColl->flags & UCOL_USE_ITERATOR) {
   7395         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   7396         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   7397         UChar32 c;
   7398         while((c=sColl->iterator->next(sColl->iterator))>=0) {
   7399             sourceString.append((UChar)c);
   7400         }
   7401         while((c=tColl->iterator->next(tColl->iterator))>=0) {
   7402             targetString.append((UChar)c);
   7403         }
   7404         source = sourceString.getBuffer();
   7405         sourceLength = sourceString.length();
   7406         target = targetString.getBuffer();
   7407         targetLength = targetString.length();
   7408     } else { // no iterators
   7409         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
   7410         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
   7411         source = sColl->string;
   7412         target = tColl->string;
   7413     }
   7414 
   7415 
   7416 
   7417     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7418     if(sourceKeyLen > UCOL_MAX_BUFFER) {
   7419         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
   7420         if(sourceKeyP == NULL) {
   7421             *status = U_MEMORY_ALLOCATION_ERROR;
   7422             goto cleanup_and_do_compare;
   7423         }
   7424         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7425     }
   7426 
   7427     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7428     if(targetKeyLen > UCOL_MAX_BUFFER) {
   7429         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
   7430         if(targetKeyP == NULL) {
   7431             *status = U_MEMORY_ALLOCATION_ERROR;
   7432             goto cleanup_and_do_compare;
   7433         }
   7434         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7435     }
   7436 
   7437     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
   7438 
   7439 cleanup_and_do_compare:
   7440     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
   7441         uprv_free(sourceKeyP);
   7442     }
   7443 
   7444     if(targetKeyP != NULL && targetKeyP != targetKey) {
   7445         uprv_free(targetKeyP);
   7446     }
   7447 
   7448     if(result<0) {
   7449         return UCOL_LESS;
   7450     } else if(result>0) {
   7451         return UCOL_GREATER;
   7452     } else {
   7453         return UCOL_EQUAL;
   7454     }
   7455 }
   7456 
   7457 
   7458 static UCollationResult
   7459 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
   7460 {
   7461     U_ALIGN_CODE(16);
   7462 
   7463     const UCollator *coll = sColl->coll;
   7464 
   7465 
   7466     // setting up the collator parameters
   7467     UColAttributeValue strength = coll->strength;
   7468     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
   7469 
   7470     UBool checkSecTer = initialCheckSecTer;
   7471     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
   7472     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
   7473     UBool checkIdent = (strength == UCOL_IDENTICAL);
   7474     UBool checkCase = (coll->caseLevel == UCOL_ON);
   7475     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
   7476     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
   7477     UBool qShifted = shifted && checkQuad;
   7478     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
   7479 
   7480     if(doHiragana && shifted) {
   7481         return (ucol_compareUsingSortKeys(sColl, tColl, status));
   7482     }
   7483     uint8_t caseSwitch = coll->caseSwitch;
   7484     uint8_t tertiaryMask = coll->tertiaryMask;
   7485 
   7486     // This is the lowest primary value that will not be ignored if shifted
   7487     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
   7488 
   7489     UCollationResult result = UCOL_EQUAL;
   7490     UCollationResult hirResult = UCOL_EQUAL;
   7491 
   7492     // Preparing the CE buffers. They will be filled during the primary phase
   7493     ucol_CEBuf   sCEs;
   7494     ucol_CEBuf   tCEs;
   7495     UCOL_INIT_CEBUF(&sCEs);
   7496     UCOL_INIT_CEBUF(&tCEs);
   7497 
   7498     uint32_t secS = 0, secT = 0;
   7499     uint32_t sOrder=0, tOrder=0;
   7500 
   7501     // Non shifted primary processing is quite simple
   7502     if(!shifted) {
   7503         for(;;) {
   7504 
   7505             // We fetch CEs until we hit a non ignorable primary or end.
   7506             do {
   7507                 // We get the next CE
   7508                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7509                 // Stuff it in the buffer
   7510                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7511                 // And keep just the primary part.
   7512                 sOrder &= UCOL_PRIMARYMASK;
   7513             } while(sOrder == 0);
   7514 
   7515             // see the comments on the above block
   7516             do {
   7517                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7518                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7519                 tOrder &= UCOL_PRIMARYMASK;
   7520             } while(tOrder == 0);
   7521 
   7522             // if both primaries are the same
   7523             if(sOrder == tOrder) {
   7524                 // and there are no more CEs, we advance to the next level
   7525                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7526                     break;
   7527                 }
   7528                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7529                     if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
   7530                         hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
   7531                             ? UCOL_LESS:UCOL_GREATER;
   7532                     }
   7533                 }
   7534             } else {
   7535                 // only need to check one for continuation
   7536                 // if one is then the other must be or the preceding CE would be a prefix of the other
   7537                 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
   7538                     sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
   7539                     tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
   7540                 }
   7541                 // if two primaries are different, we are done
   7542                 result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
   7543                 goto commonReturn;
   7544             }
   7545         } // no primary difference... do the rest from the buffers
   7546     } else { // shifted - do a slightly more complicated processing :)
   7547         for(;;) {
   7548             UBool sInShifted = FALSE;
   7549             UBool tInShifted = FALSE;
   7550             // This version of code can be refactored. However, it seems easier to understand this way.
   7551             // Source loop. Sam as the target loop.
   7552             for(;;) {
   7553                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7554                 if(sOrder == UCOL_NO_MORE_CES) {
   7555                     UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7556                     break;
   7557                 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
   7558                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7559                     continue;
   7560                 } else if(isContinuation(sOrder)) {
   7561                     if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7562                         if(sInShifted) {
   7563                             sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7564                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7565                             continue;
   7566                         } else {
   7567                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7568                             break;
   7569                         }
   7570                     } else { /* Just lower level values */
   7571                         if(sInShifted) {
   7572                             continue;
   7573                         } else {
   7574                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7575                             continue;
   7576                         }
   7577                     }
   7578                 } else { /* regular */
   7579                     if(coll->leadBytePermutationTable != NULL){
   7580                         sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
   7581                     }
   7582                     if((sOrder & UCOL_PRIMARYMASK) > LVT) {
   7583                         UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7584                         break;
   7585                     } else {
   7586                         if((sOrder & UCOL_PRIMARYMASK) > 0) {
   7587                             sInShifted = TRUE;
   7588                             sOrder &= UCOL_PRIMARYMASK;
   7589                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7590                             continue;
   7591                         } else {
   7592                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7593                             sInShifted = FALSE;
   7594                             continue;
   7595                         }
   7596                     }
   7597                 }
   7598             }
   7599             sOrder &= UCOL_PRIMARYMASK;
   7600             sInShifted = FALSE;
   7601 
   7602             for(;;) {
   7603                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7604                 if(tOrder == UCOL_NO_MORE_CES) {
   7605                     UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7606                     break;
   7607                 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
   7608                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7609                     continue;
   7610                 } else if(isContinuation(tOrder)) {
   7611                     if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7612                         if(tInShifted) {
   7613                             tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7614                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7615                             continue;
   7616                         } else {
   7617                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7618                             break;
   7619                         }
   7620                     } else { /* Just lower level values */
   7621                         if(tInShifted) {
   7622                             continue;
   7623                         } else {
   7624                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7625                             continue;
   7626                         }
   7627                     }
   7628                 } else { /* regular */
   7629                     if(coll->leadBytePermutationTable != NULL){
   7630                         tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
   7631                     }
   7632                     if((tOrder & UCOL_PRIMARYMASK) > LVT) {
   7633                         UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7634                         break;
   7635                     } else {
   7636                         if((tOrder & UCOL_PRIMARYMASK) > 0) {
   7637                             tInShifted = TRUE;
   7638                             tOrder &= UCOL_PRIMARYMASK;
   7639                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7640                             continue;
   7641                         } else {
   7642                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7643                             tInShifted = FALSE;
   7644                             continue;
   7645                         }
   7646                     }
   7647                 }
   7648             }
   7649             tOrder &= UCOL_PRIMARYMASK;
   7650             tInShifted = FALSE;
   7651 
   7652             if(sOrder == tOrder) {
   7653                 /*
   7654                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7655                 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
   7656                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
   7657                 ? UCOL_LESS:UCOL_GREATER;
   7658                 }
   7659                 }
   7660                 */
   7661                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7662                     break;
   7663                 } else {
   7664                     sOrder = 0;
   7665                     tOrder = 0;
   7666                     continue;
   7667                 }
   7668             } else {
   7669                 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
   7670                 goto commonReturn;
   7671             }
   7672         } /* no primary difference... do the rest from the buffers */
   7673     }
   7674 
   7675     /* now, we're gonna reexamine collected CEs */
   7676     uint32_t    *sCE;
   7677     uint32_t    *tCE;
   7678 
   7679     /* This is the secondary level of comparison */
   7680     if(checkSecTer) {
   7681         if(!isFrenchSec) { /* normal */
   7682             sCE = sCEs.buf;
   7683             tCE = tCEs.buf;
   7684             for(;;) {
   7685                 while (secS == 0) {
   7686                     secS = *(sCE++) & UCOL_SECONDARYMASK;
   7687                 }
   7688 
   7689                 while(secT == 0) {
   7690                     secT = *(tCE++) & UCOL_SECONDARYMASK;
   7691                 }
   7692 
   7693                 if(secS == secT) {
   7694                     if(secS == UCOL_NO_MORE_CES_SECONDARY) {
   7695                         break;
   7696                     } else {
   7697                         secS = 0; secT = 0;
   7698                         continue;
   7699                     }
   7700                 } else {
   7701                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7702                     goto commonReturn;
   7703                 }
   7704             }
   7705         } else { /* do the French */
   7706             uint32_t *sCESave = NULL;
   7707             uint32_t *tCESave = NULL;
   7708             sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
   7709             tCE = tCEs.pos-2;
   7710             for(;;) {
   7711                 while (secS == 0 && sCE >= sCEs.buf) {
   7712                     if(sCESave == NULL) {
   7713                         secS = *(sCE--);
   7714                         if(isContinuation(secS)) {
   7715                             while(isContinuation(secS = *(sCE--)))
   7716                                 ;
   7717                             /* after this, secS has the start of continuation, and sCEs points before that */
   7718                             sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7719                             sCE+=2;  /* need to point to the first continuation CP */
   7720                             /* However, now you can just continue doing stuff */
   7721                         }
   7722                     } else {
   7723                         secS = *(sCE++);
   7724                         if(!isContinuation(secS)) { /* This means we have finished with this cont */
   7725                             sCE = sCESave;            /* reset the pointer to before continuation */
   7726                             sCESave = NULL;
   7727                             secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
   7728                             continue;
   7729                         }
   7730                     }
   7731                     secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7732                 }
   7733 
   7734                 while(secT == 0 && tCE >= tCEs.buf) {
   7735                     if(tCESave == NULL) {
   7736                         secT = *(tCE--);
   7737                         if(isContinuation(secT)) {
   7738                             while(isContinuation(secT = *(tCE--)))
   7739                                 ;
   7740                             /* after this, secS has the start of continuation, and sCEs points before that */
   7741                             tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7742                             tCE+=2;  /* need to point to the first continuation CP */
   7743                             /* However, now you can just continue doing stuff */
   7744                         }
   7745                     } else {
   7746                         secT = *(tCE++);
   7747                         if(!isContinuation(secT)) { /* This means we have finished with this cont */
   7748                             tCE = tCESave;          /* reset the pointer to before continuation */
   7749                             tCESave = NULL;
   7750                             secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
   7751                             continue;
   7752                         }
   7753                     }
   7754                     secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7755                 }
   7756 
   7757                 if(secS == secT) {
   7758                     if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
   7759                         break;
   7760                     } else {
   7761                         secS = 0; secT = 0;
   7762                         continue;
   7763                     }
   7764                 } else {
   7765                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7766                     goto commonReturn;
   7767                 }
   7768             }
   7769         }
   7770     }
   7771 
   7772     /* doing the case bit */
   7773     if(checkCase) {
   7774         sCE = sCEs.buf;
   7775         tCE = tCEs.buf;
   7776         for(;;) {
   7777             while((secS & UCOL_REMOVE_CASE) == 0) {
   7778                 if(!isContinuation(*sCE++)) {
   7779                     secS =*(sCE-1);
   7780                     if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7781                         // primary ignorables should not be considered on the case level when the strength is primary
   7782                         // otherwise, the CEs stop being well-formed
   7783                         secS &= UCOL_TERT_CASE_MASK;
   7784                         secS ^= caseSwitch;
   7785                     } else {
   7786                         secS = 0;
   7787                     }
   7788                 } else {
   7789                     secS = 0;
   7790                 }
   7791             }
   7792 
   7793             while((secT & UCOL_REMOVE_CASE) == 0) {
   7794                 if(!isContinuation(*tCE++)) {
   7795                     secT = *(tCE-1);
   7796                     if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7797                         // primary ignorables should not be considered on the case level when the strength is primary
   7798                         // otherwise, the CEs stop being well-formed
   7799                         secT &= UCOL_TERT_CASE_MASK;
   7800                         secT ^= caseSwitch;
   7801                     } else {
   7802                         secT = 0;
   7803                     }
   7804                 } else {
   7805                     secT = 0;
   7806                 }
   7807             }
   7808 
   7809             if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
   7810                 result = UCOL_LESS;
   7811                 goto commonReturn;
   7812             } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
   7813                 result = UCOL_GREATER;
   7814                 goto commonReturn;
   7815             }
   7816 
   7817             if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
   7818                 break;
   7819             } else {
   7820                 secS = 0;
   7821                 secT = 0;
   7822             }
   7823         }
   7824     }
   7825 
   7826     /* Tertiary level */
   7827     if(checkTertiary) {
   7828         secS = 0;
   7829         secT = 0;
   7830         sCE = sCEs.buf;
   7831         tCE = tCEs.buf;
   7832         for(;;) {
   7833             while((secS & UCOL_REMOVE_CASE) == 0) {
   7834                 secS = *(sCE++) & tertiaryMask;
   7835                 if(!isContinuation(secS)) {
   7836                     secS ^= caseSwitch;
   7837                 } else {
   7838                     secS &= UCOL_REMOVE_CASE;
   7839                 }
   7840             }
   7841 
   7842             while((secT & UCOL_REMOVE_CASE)  == 0) {
   7843                 secT = *(tCE++) & tertiaryMask;
   7844                 if(!isContinuation(secT)) {
   7845                     secT ^= caseSwitch;
   7846                 } else {
   7847                     secT &= UCOL_REMOVE_CASE;
   7848                 }
   7849             }
   7850 
   7851             if(secS == secT) {
   7852                 if((secS & UCOL_REMOVE_CASE) == 1) {
   7853                     break;
   7854                 } else {
   7855                     secS = 0; secT = 0;
   7856                     continue;
   7857                 }
   7858             } else {
   7859                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7860                 goto commonReturn;
   7861             }
   7862         }
   7863     }
   7864 
   7865 
   7866     if(qShifted /*checkQuad*/) {
   7867         UBool sInShifted = TRUE;
   7868         UBool tInShifted = TRUE;
   7869         secS = 0;
   7870         secT = 0;
   7871         sCE = sCEs.buf;
   7872         tCE = tCEs.buf;
   7873         for(;;) {
   7874             while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
   7875                 secS = *(sCE++);
   7876                 if(isContinuation(secS)) {
   7877                     if(!sInShifted) {
   7878                         continue;
   7879                     }
   7880                 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
   7881                     secS = UCOL_PRIMARYMASK;
   7882                     sInShifted = FALSE;
   7883                 } else {
   7884                     sInShifted = TRUE;
   7885                 }
   7886             }
   7887             secS &= UCOL_PRIMARYMASK;
   7888 
   7889 
   7890             while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
   7891                 secT = *(tCE++);
   7892                 if(isContinuation(secT)) {
   7893                     if(!tInShifted) {
   7894                         continue;
   7895                     }
   7896                 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
   7897                     secT = UCOL_PRIMARYMASK;
   7898                     tInShifted = FALSE;
   7899                 } else {
   7900                     tInShifted = TRUE;
   7901                 }
   7902             }
   7903             secT &= UCOL_PRIMARYMASK;
   7904 
   7905             if(secS == secT) {
   7906                 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
   7907                     break;
   7908                 } else {
   7909                     secS = 0; secT = 0;
   7910                     continue;
   7911                 }
   7912             } else {
   7913                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7914                 goto commonReturn;
   7915             }
   7916         }
   7917     } else if(doHiragana && hirResult != UCOL_EQUAL) {
   7918         // If we're fine on quaternaries, we might be different
   7919         // on Hiragana. This, however, might fail us in shifted.
   7920         result = hirResult;
   7921         goto commonReturn;
   7922     }
   7923 
   7924     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
   7925     /*  as a tiebreaker if all else is equal.                                */
   7926     /*  Getting here  should be quite rare - strings are not identical -     */
   7927     /*     that is checked first, but compared == through all other checks.  */
   7928     if(checkIdent)
   7929     {
   7930         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
   7931         result = ucol_checkIdent(sColl, tColl, TRUE, status);
   7932     }
   7933 
   7934 commonReturn:
   7935     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
   7936         if (sCEs.buf != sCEs.localArray ) {
   7937             uprv_free(sCEs.buf);
   7938         }
   7939         if (tCEs.buf != tCEs.localArray ) {
   7940             uprv_free(tCEs.buf);
   7941         }
   7942     }
   7943 
   7944     return result;
   7945 }
   7946 
   7947 static UCollationResult
   7948 ucol_strcollRegular(const UCollator *coll,
   7949                     const UChar *source, int32_t sourceLength,
   7950                     const UChar *target, int32_t targetLength,
   7951                     UErrorCode *status) {
   7952     collIterate sColl, tColl;
   7953     // Preparing the context objects for iterating over strings
   7954     IInit_collIterate(coll, source, sourceLength, &sColl, status);
   7955     IInit_collIterate(coll, target, targetLength, &tColl, status);
   7956     if(U_FAILURE(*status)) {
   7957         return UCOL_LESS;
   7958     }
   7959     return ucol_strcollRegular(&sColl, &tColl, status);
   7960 }
   7961 
   7962 static inline uint32_t
   7963 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
   7964                           uint32_t CE, const UChar *s, int32_t *index, int32_t len)
   7965 {
   7966     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
   7967     int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
   7968     int32_t offset = 1;
   7969     UChar schar = 0, tchar = 0;
   7970 
   7971     for(;;) {
   7972         if(len == -1) {
   7973             if(s[*index] == 0) { // end of string
   7974                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   7975             } else {
   7976                 schar = s[*index];
   7977             }
   7978         } else {
   7979             if(*index == len) {
   7980                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   7981             } else {
   7982                 schar = s[*index];
   7983             }
   7984         }
   7985 
   7986         while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   7987             offset++;
   7988         }
   7989 
   7990         if (schar == tchar) {
   7991             (*index)++;
   7992             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
   7993         }
   7994         else
   7995         {
   7996             if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
   7997                 return UCOL_BAIL_OUT_CE;
   7998             }
   7999             // skip completely ignorables
   8000             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   8001             if(isZeroCE == 0) { // we have to ignore completely ignorables
   8002                 (*index)++;
   8003                 continue;
   8004             }
   8005 
   8006             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   8007         }
   8008     }
   8009 }
   8010 
   8011 
   8012 /**
   8013  * This is a fast strcoll, geared towards text in Latin-1.
   8014  * It supports contractions of size two, French secondaries
   8015  * and case switching. You can use it with strengths primary
   8016  * to tertiary. It does not support shifted and case level.
   8017  * It relies on the table built by setupLatin1Table. If it
   8018  * doesn't understand something, it will go to the regular
   8019  * strcoll.
         *
         * The table (coll->latinOneCEs) is read one level at a time; moving to
         * the next level advances the table pointer by coll->latinOneTableLen.
         * A value read from the table is compared top byte first and then
         * shifted left by 8 bits, so the following byte is compared on the next
         * pass; a value of 0 makes the loops fetch the next character.
         *
         * @param coll   collator supplying strength, frenchCollation and the table
         * @param source source text
         * @param sLen   length of source, or -1 if it is NUL-terminated
         * @param target target text
         * @param tLen   length of target, or -1 if it is NUL-terminated
         * @param status only forwarded to ucol_strcollRegular when falling back
         * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER, or whatever
         *         ucol_strcollRegular returns when the fast path gives up
   8020  */
   8021 static UCollationResult
   8022 ucol_strcollUseLatin1( const UCollator    *coll,
   8023               const UChar        *source,
   8024               int32_t            sLen,
   8025               const UChar        *target,
   8026               int32_t            tLen,
   8027               UErrorCode *status)
   8028 {
   8029     U_ALIGN_CODE(16);
   8030     int32_t strength = coll->strength;
   8031 
   8032     int32_t sIndex = 0, tIndex = 0;
   8033     UChar sChar = 0, tChar = 0;
   8034     uint32_t sOrder=0, tOrder=0;
   8035 
   8036     UBool endOfSource = FALSE;
   8037 
            // start with the first (primary) section of the table
   8038     uint32_t *elements = coll->latinOneCEs;
   8039 
   8040     UBool haveContractions = FALSE; // if we have contractions in our string
   8041                                     // we cannot do French secondary
   8042 
   8043     // Do the primary level
   8044     for(;;) {
   8045         while(sOrder==0) { // this loop skips primary ignorables
   8046             // sOrder=getNextlatinOneCE(source);
   8047             if(sLen==-1) {   // handling zero terminated strings
                        // note: sIndex is advanced even when the terminator is read
   8048                 sChar=source[sIndex++];
   8049                 if(sChar==0) {
   8050                     endOfSource = TRUE;
   8051                     break;
   8052                 }
   8053             } else {        // handling strings with known length
   8054                 if(sIndex==sLen) {
   8055                     endOfSource = TRUE;
   8056                     break;
   8057                 }
   8058                 sChar=source[sIndex++];
   8059             }
   8060             if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   8061                 //fprintf(stderr, "R");
   8062                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8063             }
   8064             sOrder = elements[sChar];
   8065             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
   8066                 // specials can basically be either contractions or bail-out signs. If we get anything
   8067                 // else, we'll bail out anyway
   8068                 if(getCETag(sOrder) == CONTRACTION_TAG) {
   8069                     sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
   8070                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
   8071                     // However, if there are contractions in the table, but we always use just one char,
   8072                     // we might be able to do French. This should be checked out.
   8073                 }
                        // still a special after contraction handling (or never a contraction): give up
   8074                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   8075                     //fprintf(stderr, "S");
   8076                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8077                 }
   8078             }
   8079         }
   8080 
   8081         while(tOrder==0) {  // this loop skips primary ignorables
   8082             // tOrder=getNextlatinOneCE(target);
   8083             if(tLen==-1) {    // handling zero terminated strings
   8084                 tChar=target[tIndex++];
   8085                 if(tChar==0) {
   8086                     if(endOfSource) { // this is different than source loop,
   8087                         // as we already know that source loop is done here,
   8088                         // so we can either finish the primary loop if both
   8089                         // strings are done or announce the result if only
   8090                         // target is done. Same below.
   8091                         goto endOfPrimLoop;
   8092                     } else {
   8093                         return UCOL_GREATER;
   8094                     }
   8095                 }
   8096             } else {          // handling strings with known length
   8097                 if(tIndex==tLen) {
   8098                     if(endOfSource) {
   8099                         goto endOfPrimLoop;
   8100                     } else {
   8101                         return UCOL_GREATER;
   8102                     }
   8103                 }
   8104                 tChar=target[tIndex++];
   8105             }
   8106             if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   8107                 //fprintf(stderr, "R");
   8108                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8109             }
   8110             tOrder = elements[tChar];
   8111             if(tOrder >= UCOL_NOT_FOUND) {
   8112                 // Handling specials, see the comments for source
   8113                 if(getCETag(tOrder) == CONTRACTION_TAG) {
   8114                     tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
   8115                     haveContractions = TRUE;
   8116                 }
   8117                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   8118                     //fprintf(stderr, "S");
   8119                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8120                 }
   8121             }
   8122         }
   8123         if(endOfSource) { // source is finished, but target is not, say the result.
   8124             return UCOL_LESS;
   8125         }
   8126 
   8127         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
   8128             sOrder = 0; tOrder = 0;
   8129             continue;
   8130         } else {
   8131             // compare current top bytes
   8132             if(((sOrder^tOrder)&0xFF000000)!=0) {
   8133                 // top bytes differ, return difference
   8134                 if(sOrder < tOrder) {
   8135                     return UCOL_LESS;
   8136                 } else if(sOrder > tOrder) {
   8137                     return UCOL_GREATER;
   8138                 }
   8139                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
   8140                 // since we must return enum value
   8141             }
   8142 
   8143             // top bytes match, continue with following bytes
                    // (a value that becomes 0 after the shift makes its loop above fetch the next character)
   8144             sOrder<<=8;
   8145             tOrder<<=8;
   8146         }
   8147     }
   8148 
   8149 endOfPrimLoop:
   8150     // after primary loop, we definitely know the sizes of strings,
   8151     // so we set it and use simpler loop for secondaries and tertiaries
            // NOTE(review): for NUL-terminated input the index was incremented when the
            // terminator was read, so these lengths include the terminator; presumably
            // harmless because the table entry for U+0000 is expected to be 0 - confirm.
   8152     sLen = sIndex; tLen = tIndex;
   8153     if(strength >= UCOL_SECONDARY) {
   8154         // adjust the table beginning
   8155         elements += coll->latinOneTableLen;
   8156         endOfSource = FALSE;
   8157 
   8158         if(coll->frenchCollation == UCOL_OFF) { // non French
   8159             // This loop is a simplified copy of primary loop
   8160             // at this point we know that whole strings are latin-1, so we don't
   8161             // check for that. We also know that we only have contractions as
   8162             // specials.
   8163             sIndex = 0; tIndex = 0;
   8164             for(;;) {
   8165                 while(sOrder==0) {
   8166                     if(sIndex==sLen) {
   8167                         endOfSource = TRUE;
   8168                         break;
   8169                     }
   8170                     sChar=source[sIndex++];
   8171                     sOrder = elements[sChar];
                            // NOTE(review): strict '>' here, while the primary loop tests '>=' - confirm intent
   8172                     if(sOrder > UCOL_NOT_FOUND) {
   8173                         sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
   8174                     }
   8175                 }
   8176 
   8177                 while(tOrder==0) {
   8178                     if(tIndex==tLen) {
   8179                         if(endOfSource) {
   8180                             goto endOfSecLoop;
   8181                         } else {
   8182                             return UCOL_GREATER;
   8183                         }
   8184                     }
   8185                     tChar=target[tIndex++];
   8186                     tOrder = elements[tChar];
   8187                     if(tOrder > UCOL_NOT_FOUND) {
   8188                         tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
   8189                     }
   8190                 }
   8191                 if(endOfSource) {
   8192                     return UCOL_LESS;
   8193                 }
   8194 
   8195                 if(sOrder == tOrder) {
   8196                     sOrder = 0; tOrder = 0;
   8197                     continue;
   8198                 } else {
   8199                     // see primary loop for comments on this
   8200                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   8201                         if(sOrder < tOrder) {
   8202                             return UCOL_LESS;
   8203                         } else if(sOrder > tOrder) {
   8204                             return UCOL_GREATER;
   8205                         }
   8206                     }
   8207                     sOrder<<=8;
   8208                     tOrder<<=8;
   8209                 }
   8210             }
   8211         } else { // French
   8212             if(haveContractions) { // if we have contractions, we have to bail out
   8213                 // since we don't really know how to handle them here
                        // (sLen and tLen passed here are the values recomputed after the primary loop)
   8214                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   8215             }
   8216             // For French, we go backwards
   8217             sIndex = sLen; tIndex = tLen;
   8218             for(;;) {
   8219                 while(sOrder==0) {
   8220                     if(sIndex==0) {
   8221                         endOfSource = TRUE;
   8222                         break;
   8223                     }
   8224                     sChar=source[--sIndex];
   8225                     sOrder = elements[sChar];
   8226                     // don't even look for contractions
   8227                 }
   8228 
   8229                 while(tOrder==0) {
   8230                     if(tIndex==0) {
   8231                         if(endOfSource) {
   8232                             goto endOfSecLoop;
   8233                         } else {
   8234                             return UCOL_GREATER;
   8235                         }
   8236                     }
   8237                     tChar=target[--tIndex];
   8238                     tOrder = elements[tChar];
   8239                     // don't even look for contractions
   8240                 }
   8241                 if(endOfSource) {
   8242                     return UCOL_LESS;
   8243                 }
   8244 
   8245                 if(sOrder == tOrder) {
   8246                     sOrder = 0; tOrder = 0;
   8247                     continue;
   8248                 } else {
   8249                     // see the primary loop for comments
   8250                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   8251                         if(sOrder < tOrder) {
   8252                             return UCOL_LESS;
   8253                         } else if(sOrder > tOrder) {
   8254                             return UCOL_GREATER;
   8255                         }
   8256                     }
   8257                     sOrder<<=8;
   8258                     tOrder<<=8;
   8259                 }
   8260             }
   8261         }
   8262     }
   8263 
   8264 endOfSecLoop:
   8265     if(strength >= UCOL_TERTIARY) {
   8266         // tertiary loop is the same as secondary (except no French)
                // advance to the next table section; the strings are always walked forwards here
   8267         elements += coll->latinOneTableLen;
   8268         sIndex = 0; tIndex = 0;
   8269         endOfSource = FALSE;
   8270         for(;;) {
   8271             while(sOrder==0) {
   8272                 if(sIndex==sLen) {
   8273                     endOfSource = TRUE;
   8274                     break;
   8275                 }
   8276                 sChar=source[sIndex++];
   8277                 sOrder = elements[sChar];
   8278                 if(sOrder > UCOL_NOT_FOUND) {
   8279                     sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
   8280                 }
   8281             }
   8282             while(tOrder==0) {
   8283                 if(tIndex==tLen) {
   8284                     if(endOfSource) {
   8285                         return UCOL_EQUAL; // if both strings are at the end, they are equal
   8286                     } else {
   8287                         return UCOL_GREATER;
   8288                     }
   8289                 }
   8290                 tChar=target[tIndex++];
   8291                 tOrder = elements[tChar];
   8292                 if(tOrder > UCOL_NOT_FOUND) {
   8293                     tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
   8294                 }
   8295             }
   8296             if(endOfSource) {
   8297                 return UCOL_LESS;
   8298             }
   8299             if(sOrder == tOrder) {
   8300                 sOrder = 0; tOrder = 0;
   8301                 continue;
   8302             } else {
                        // same byte-by-byte comparison as in the primary loop
   8303                 if(((sOrder^tOrder)&0xff000000)!=0) {
   8304                     if(sOrder < tOrder) {
   8305                         return UCOL_LESS;
   8306                     } else if(sOrder > tOrder) {
   8307                         return UCOL_GREATER;
   8308                     }
   8309                 }
   8310                 sOrder<<=8;
   8311                 tOrder<<=8;
   8312             }
   8313         }
   8314     }
            // reached only when strength is below tertiary and no difference was found
   8315     return UCOL_EQUAL;
   8316 }
   8317 
   8318 
   8319 U_CAPI UCollationResult U_EXPORT2
   8320 ucol_strcollIter( const UCollator    *coll,
   8321                  UCharIterator *sIter,
   8322                  UCharIterator *tIter,
   8323                  UErrorCode         *status)
   8324 {
   8325     if(!status || U_FAILURE(*status)) {
   8326         return UCOL_EQUAL;
   8327     }
   8328 
   8329     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
   8330     UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
   8331 
   8332     if (sIter == tIter) {
   8333         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8334         return UCOL_EQUAL;
   8335     }
   8336     if(sIter == NULL || tIter == NULL || coll == NULL) {
   8337         *status = U_ILLEGAL_ARGUMENT_ERROR;
   8338         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8339         return UCOL_EQUAL;
   8340     }
   8341 
   8342     UCollationResult result = UCOL_EQUAL;
   8343 
   8344     // Preparing the context objects for iterating over strings
   8345     collIterate sColl, tColl;
   8346     IInit_collIterate(coll, NULL, -1, &sColl, status);
   8347     IInit_collIterate(coll, NULL, -1, &tColl, status);
   8348     if(U_FAILURE(*status)) {
   8349         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8350         return UCOL_EQUAL;
   8351     }
   8352     // The division for the array length may truncate the array size to
   8353     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   8354     // for all platforms anyway.
   8355     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8356     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8357     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
   8358 
   8359     sColl.iterator = sIter;
   8360     sColl.flags |= UCOL_USE_ITERATOR;
   8361     tColl.flags |= UCOL_USE_ITERATOR;
   8362     tColl.iterator = tIter;
   8363 
   8364     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
   8365         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   8366         sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
   8367         sColl.flags &= ~UCOL_ITER_NORM;
   8368 
   8369         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   8370         tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
   8371         tColl.flags &= ~UCOL_ITER_NORM;
   8372     }
   8373 
   8374     UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
   8375 
   8376     while((sChar = sColl.iterator->next(sColl.iterator)) ==
   8377         (tChar = tColl.iterator->next(tColl.iterator))) {
   8378             if(sChar == U_SENTINEL) {
   8379                 result = UCOL_EQUAL;
   8380                 goto end_compare;
   8381             }
   8382     }
   8383 
   8384     if(sChar == U_SENTINEL) {
   8385         tChar = tColl.iterator->previous(tColl.iterator);
   8386     }
   8387 
   8388     if(tChar == U_SENTINEL) {
   8389         sChar = sColl.iterator->previous(sColl.iterator);
   8390     }
   8391 
   8392     sChar = sColl.iterator->previous(sColl.iterator);
   8393     tChar = tColl.iterator->previous(tColl.iterator);
   8394 
   8395     if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
   8396     {
   8397         // We are stopped in the middle of a contraction.
   8398         // Scan backwards through the == part of the string looking for the start of the contraction.
   8399         //   It doesn't matter which string we scan, since they are the same in this region.
   8400         do
   8401         {
   8402             sChar = sColl.iterator->previous(sColl.iterator);
   8403             tChar = tColl.iterator->previous(tColl.iterator);
   8404         }
   8405         while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
   8406     }
   8407 
   8408 
   8409     if(U_SUCCESS(*status)) {
   8410         result = ucol_strcollRegular(&sColl, &tColl, status);
   8411     }
   8412 
   8413 end_compare:
   8414     if(sNormIter || tNormIter) {
   8415         unorm_closeIter(sNormIter);
   8416         unorm_closeIter(tNormIter);
   8417     }
   8418 
   8419     UTRACE_EXIT_VALUE_STATUS(result, *status)
   8420     return result;
   8421 }
   8422 
   8423 
   8424 /*                                                                      */
   8425 /* ucol_strcoll     Main public API string comparison function          */
   8426 /*                                                                      */
   8427 U_CAPI UCollationResult U_EXPORT2
   8428 ucol_strcoll( const UCollator    *coll,
   8429               const UChar        *source,
   8430               int32_t            sourceLength,
   8431               const UChar        *target,
   8432               int32_t            targetLength)
   8433 {
   8434     U_ALIGN_CODE(16);
   8435 
   8436     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
   8437     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   8438         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
   8439         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
   8440         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
   8441     }
   8442 
   8443     if(source == NULL || target == NULL) {
   8444         // do not crash, but return. Should have
   8445         // status argument to return error.
   8446         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8447         return UCOL_EQUAL;
   8448     }
   8449 
   8450     /* Quick check if source and target are same strings. */
   8451     /* They should either both be NULL terminated or the explicit length should be set on both. */
   8452     if (source==target && sourceLength==targetLength) {
   8453         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8454         return UCOL_EQUAL;
   8455     }
   8456 
   8457     /* Scan the strings.  Find:                                                             */
   8458     /*    The length of any leading portion that is equal                                   */
   8459     /*    Whether they are exactly equal.  (in which case we just return)                   */
   8460     const UChar    *pSrc    = source;
   8461     const UChar    *pTarg   = target;
   8462     int32_t        equalLength;
   8463 
   8464     if (sourceLength == -1 && targetLength == -1) {
   8465         // Both strings are null terminated.
   8466         //    Scan through any leading equal portion.
   8467         while (*pSrc == *pTarg && *pSrc != 0) {
   8468             pSrc++;
   8469             pTarg++;
   8470         }
   8471         if (*pSrc == 0 && *pTarg == 0) {
   8472             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8473             return UCOL_EQUAL;
   8474         }
   8475         equalLength = (int32_t)(pSrc - source);
   8476     }
   8477     else
   8478     {
   8479         // One or both strings has an explicit length.
   8480         const UChar    *pSrcEnd = source + sourceLength;
   8481         const UChar    *pTargEnd = target + targetLength;
   8482 
   8483         // Scan while the strings are bitwise ==, or until one is exhausted.
   8484         for (;;) {
   8485             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
   8486                 break;
   8487             }
   8488             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
   8489                 break;
   8490             }
   8491             if (*pSrc != *pTarg) {
   8492                 break;
   8493             }
   8494             pSrc++;
   8495             pTarg++;
   8496         }
   8497         equalLength = (int32_t)(pSrc - source);
   8498 
   8499         // If we made it all the way through both strings, we are done.  They are ==
   8500         if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
   8501             (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
   8502         {
   8503             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8504             return UCOL_EQUAL;
   8505         }
   8506     }
   8507     if (equalLength > 0) {
   8508         /* There is an identical portion at the beginning of the two strings.        */
   8509         /*   If the identical portion ends within a contraction or a comibining      */
   8510         /*   character sequence, back up to the start of that sequence.              */
   8511 
   8512         // These values should already be set by the code above.
   8513         //pSrc  = source + equalLength;        /* point to the first differing chars   */
   8514         //pTarg = target + equalLength;
   8515         if ((pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
   8516             (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
   8517         {
   8518             // We are stopped in the middle of a contraction.
   8519             // Scan backwards through the == part of the string looking for the start of the contraction.
   8520             //   It doesn't matter which string we scan, since they are the same in this region.
   8521             do
   8522             {
   8523                 equalLength--;
   8524                 pSrc--;
   8525             }
   8526             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
   8527         }
   8528 
   8529         source += equalLength;
   8530         target += equalLength;
   8531         if (sourceLength > 0) {
   8532             sourceLength -= equalLength;
   8533         }
   8534         if (targetLength > 0) {
   8535             targetLength -= equalLength;
   8536         }
   8537     }
   8538 
   8539     UErrorCode status = U_ZERO_ERROR;
   8540     UCollationResult returnVal;
   8541     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
   8542         returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
   8543     } else {
   8544         returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
   8545     }
   8546     UTRACE_EXIT_VALUE(returnVal);
   8547     return returnVal;
   8548 }
   8549 
   8550 /* convenience function for comparing strings */
   8551 U_CAPI UBool U_EXPORT2
   8552 ucol_greater(    const    UCollator        *coll,
   8553         const    UChar            *source,
   8554         int32_t            sourceLength,
   8555         const    UChar            *target,
   8556         int32_t            targetLength)
   8557 {
   8558     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8559         == UCOL_GREATER);
   8560 }
   8561 
   8562 /* convenience function for comparing strings */
   8563 U_CAPI UBool U_EXPORT2
   8564 ucol_greaterOrEqual(    const    UCollator    *coll,
   8565             const    UChar        *source,
   8566             int32_t        sourceLength,
   8567             const    UChar        *target,
   8568             int32_t        targetLength)
   8569 {
   8570     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8571         != UCOL_LESS);
   8572 }
   8573 
   8574 /* convenience function for comparing strings */
   8575 U_CAPI UBool U_EXPORT2
   8576 ucol_equal(        const    UCollator        *coll,
   8577             const    UChar            *source,
   8578             int32_t            sourceLength,
   8579             const    UChar            *target,
   8580             int32_t            targetLength)
   8581 {
   8582     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8583         == UCOL_EQUAL);
   8584 }
   8585 
   8586 U_CAPI void U_EXPORT2
   8587 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
   8588     if(coll && coll->UCA) {
   8589         uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
   8590     }
   8591 }
   8592 
   8593 #endif /* #if !UCONFIG_NO_COLLATION */
   8594