Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 1996-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ucol.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 * Modification history
     12 * Date        Name      Comments
     13 * 1996-1999   various members of ICU team maintained C API for collation framework
     14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
     15 * 03/01/2001  synwee    Added maxexpansion functionality.
     16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_COLLATION
     22 
     23 #include "unicode/bytestream.h"
     24 #include "unicode/coleitr.h"
     25 #include "unicode/unorm.h"
     26 #include "unicode/udata.h"
     27 #include "unicode/ustring.h"
     28 
     29 #include "ucol_imp.h"
     30 #include "bocsu.h"
     31 
     32 #include "normalizer2impl.h"
     33 #include "unorm_it.h"
     34 #include "umutex.h"
     35 #include "cmemory.h"
     36 #include "ucln_in.h"
     37 #include "cstring.h"
     38 #include "utracimp.h"
     39 #include "putilimp.h"
     40 #include "uassert.h"
     41 
     42 #ifdef UCOL_DEBUG
     43 #include <stdio.h>
     44 #endif
     45 
     46 U_NAMESPACE_USE
     47 
/** Number of elements in a true array (not a pointer), converted to int32_t. */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

/** Mask that keeps only the lowest 8 bits of a value. */
#define LAST_BYTE_MASK_           0xFF
/** NOTE(review): presumably the shift that moves the second-lowest byte into the lowest position - confirm at the use sites. */
#define SECOND_LAST_BYTE_SHIFT_   8

/** NOTE(review): presumably code units below this limit have a zero combining class - confirm at the use sites. */
#define ZERO_CC_LIMIT_            0xC0
     54 
// This is a static pointer to the normalizer's fcdTrieIndex.
// It is always the same between calls to u_cleanup,
// and therefore writing to it is not synchronized.
// It is reset to NULL in ucol_cleanup.
static const uint16_t *fcdTrieIndex=NULL;
// Code points at fcdHighStart and above have a zero FCD value.
static UChar32 fcdHighStart = 0;
     62 
// These are values from the UCA that are required for
// implicit generation and for suppressing sort key compression.
// They should regularly be in the UCA, but if one
// is running without the UCA, it could be a problem.
static const int32_t maxRegularPrimary  = 0x7A;
static const int32_t minImplicitPrimary = 0xE0;
static const int32_t maxImplicitPrimary = 0xE4;
     70 
     71 U_CDECL_BEGIN
     72 static UBool U_CALLCONV
     73 ucol_cleanup(void)
     74 {
     75     fcdTrieIndex = NULL;
     76     return TRUE;
     77 }
     78 
     79 static int32_t U_CALLCONV
     80 _getFoldingOffset(uint32_t data) {
     81     return (int32_t)(data&0xFFFFFF);
     82 }
     83 
     84 U_CDECL_END
     85 
     86 // init FCD data
     87 static inline
     88 UBool initializeFCD(UErrorCode *status) {
     89     if (fcdTrieIndex != NULL) {
     90         return TRUE;
     91     } else {
     92         // The result is constant, until the library is reloaded.
     93         fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
     94         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
     95         return U_SUCCESS(*status);
     96     }
     97 }
     98 
     99 static
    100 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
    101                               int32_t sourceLen, collIterate *s,
    102                               UErrorCode *status)
    103 {
    104     (s)->string = (s)->pos = sourceString;
    105     (s)->origFlags = 0;
    106     (s)->flags = 0;
    107     if (sourceLen >= 0) {
    108         s->flags |= UCOL_ITER_HASLEN;
    109         (s)->endp = (UChar *)sourceString+sourceLen;
    110     }
    111     else {
    112         /* change to enable easier checking for end of string for fcdpositon */
    113         (s)->endp = NULL;
    114     }
    115     (s)->extendCEs = NULL;
    116     (s)->extendCEsSize = 0;
    117     (s)->CEpos = (s)->toReturn = (s)->CEs;
    118     (s)->offsetBuffer = NULL;
    119     (s)->offsetBufferSize = 0;
    120     (s)->offsetReturn = (s)->offsetStore = NULL;
    121     (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
    122     (s)->coll = (collator);
    123     (s)->nfd = Normalizer2Factory::getNFDInstance(*status);
    124     (s)->fcdPosition = 0;
    125     if(collator->normalizationMode == UCOL_ON) {
    126         (s)->flags |= UCOL_ITER_NORM;
    127     }
    128     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
    129         (s)->flags |= UCOL_HIRAGANA_Q;
    130     }
    131     (s)->iterator = NULL;
    132     //(s)->iteratorIndex = 0;
    133 }
    134 
    135 U_CAPI void  U_EXPORT2
    136 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
    137                              int32_t sourceLen, collIterate *s,
    138                              UErrorCode *status) {
    139     /* Out-of-line version for use from other files. */
    140     IInit_collIterate(collator, sourceString, sourceLen, s, status);
    141 }
    142 
    143 U_CAPI collIterate * U_EXPORT2
    144 uprv_new_collIterate(UErrorCode *status) {
    145     if(U_FAILURE(*status)) {
    146         return NULL;
    147     }
    148     collIterate *s = new collIterate;
    149     if(s == NULL) {
    150         *status = U_MEMORY_ALLOCATION_ERROR;
    151         return NULL;
    152     }
    153     return s;
    154 }
    155 
    156 U_CAPI void U_EXPORT2
    157 uprv_delete_collIterate(collIterate *s) {
    158     delete s;
    159 }
    160 
    161 U_CAPI UBool U_EXPORT2
    162 uprv_collIterateAtEnd(collIterate *s) {
    163     return s == NULL || s->pos == s->endp;
    164 }
    165 
    166 /**
    167 * Backup the state of the collIterate struct data
    168 * @param data collIterate to backup
    169 * @param backup storage
    170 */
    171 static
    172 inline void backupState(const collIterate *data, collIterateState *backup)
    173 {
    174     backup->fcdPosition = data->fcdPosition;
    175     backup->flags       = data->flags;
    176     backup->origFlags   = data->origFlags;
    177     backup->pos         = data->pos;
    178     backup->bufferaddress = data->writableBuffer.getBuffer();
    179     backup->buffersize    = data->writableBuffer.length();
    180     backup->iteratorMove = 0;
    181     backup->iteratorIndex = 0;
    182     if(data->iterator != NULL) {
    183         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
    184         backup->iteratorIndex = data->iterator->getState(data->iterator);
    185         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
    186         if(backup->iteratorIndex == UITER_NO_STATE) {
    187             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
    188                 backup->iteratorMove++;
    189                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
    190             }
    191             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    192         }
    193     }
    194 }
    195 
    196 /**
    197 * Loads the state into the collIterate struct data
    198 * @param data collIterate to backup
    199 * @param backup storage
    200 * @param forwards boolean to indicate if forwards iteration is used,
    201 *        false indicates backwards iteration
    202 */
    203 static
    204 inline void loadState(collIterate *data, const collIterateState *backup,
    205                       UBool        forwards)
    206 {
    207     UErrorCode status = U_ZERO_ERROR;
    208     data->flags       = backup->flags;
    209     data->origFlags   = backup->origFlags;
    210     if(data->iterator != NULL) {
    211         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
    212         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
    213         if(backup->iteratorMove != 0) {
    214             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    215         }
    216     }
    217     data->pos         = backup->pos;
    218 
    219     if ((data->flags & UCOL_ITER_INNORMBUF) &&
    220         data->writableBuffer.getBuffer() != backup->bufferaddress) {
    221         /*
    222         this is when a new buffer has been reallocated and we'll have to
    223         calculate the new position.
    224         note the new buffer has to contain the contents of the old buffer.
    225         */
    226         if (forwards) {
    227             data->pos = data->writableBuffer.getTerminatedBuffer() +
    228                                          (data->pos - backup->bufferaddress);
    229         }
    230         else {
    231             /* backwards direction */
    232             int32_t temp = backup->buffersize -
    233                                   (int32_t)(data->pos - backup->bufferaddress);
    234             data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
    235         }
    236     }
    237     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
    238         /*
    239         this is alittle tricky.
    240         if we are initially not in the normalization buffer, even if we
    241         normalize in the later stage, the data in the buffer will be
    242         ignored, since we skip back up to the data string.
    243         however if we are already in the normalization buffer, any
    244         further normalization will pull data into the normalization
    245         buffer and modify the fcdPosition.
    246         since we are keeping the data in the buffer for use, the
    247         fcdPosition can not be reverted back.
    248         arrgghh....
    249         */
    250         data->fcdPosition = backup->fcdPosition;
    251     }
    252 }
    253 
    254 static UBool
    255 reallocCEs(collIterate *data, int32_t newCapacity) {
    256     uint32_t *oldCEs = data->extendCEs;
    257     if(oldCEs == NULL) {
    258         oldCEs = data->CEs;
    259     }
    260     int32_t length = data->CEpos - oldCEs;
    261     uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
    262     if(newCEs == NULL) {
    263         return FALSE;
    264     }
    265     uprv_memcpy(newCEs, oldCEs, length * 4);
    266     uprv_free(data->extendCEs);
    267     data->extendCEs = newCEs;
    268     data->extendCEsSize = newCapacity;
    269     data->CEpos = newCEs + length;
    270     return TRUE;
    271 }
    272 
    273 static UBool
    274 increaseCEsCapacity(collIterate *data) {
    275     int32_t oldCapacity;
    276     if(data->extendCEs != NULL) {
    277         oldCapacity = data->extendCEsSize;
    278     } else {
    279         oldCapacity = LENGTHOF(data->CEs);
    280     }
    281     return reallocCEs(data, 2 * oldCapacity);
    282 }
    283 
    284 static UBool
    285 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
    286     int32_t oldCapacity;
    287     if(data->extendCEs != NULL) {
    288         oldCapacity = data->extendCEsSize;
    289     } else {
    290         oldCapacity = LENGTHOF(data->CEs);
    291     }
    292     if(minCapacity <= oldCapacity) {
    293         return TRUE;
    294     }
    295     oldCapacity *= 2;
    296     return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
    297 }
    298 
    299 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
    300     if(U_FAILURE(errorCode)) {
    301         return;
    302     }
    303     int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
    304     if(length >= offsetBufferSize) {
    305         int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
    306         int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4));
    307         if(newBuffer == NULL) {
    308             errorCode = U_MEMORY_ALLOCATION_ERROR;
    309             return;
    310         }
    311         if(length > 0) {
    312             uprv_memcpy(newBuffer, offsetBuffer, length * 4);
    313         }
    314         uprv_free(offsetBuffer);
    315         offsetBuffer = newBuffer;
    316         offsetStore = offsetBuffer + length;
    317         offsetBufferSize = newCapacity;
    318     }
    319     *offsetStore++ = offset;
    320 }
    321 
    322 /*
    323 * collIter_eos()
    324 *     Checks for a collIterate being positioned at the end of
    325 *     its source string.
    326 *
    327 */
    328 static
    329 inline UBool collIter_eos(collIterate *s) {
    330     if(s->flags & UCOL_USE_ITERATOR) {
    331       return !(s->iterator->hasNext(s->iterator));
    332     }
    333     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
    334         // Null terminated string, but not at null, so not at end.
    335         //   Whether in main or normalization buffer doesn't matter.
    336         return FALSE;
    337     }
    338 
    339     // String with length.  Can't be in normalization buffer, which is always
    340     //  null termintated.
    341     if (s->flags & UCOL_ITER_HASLEN) {
    342         return (s->pos == s->endp);
    343     }
    344 
    345     // We are at a null termination, could be either normalization buffer or main string.
    346     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
    347         // At null at end of main string.
    348         return TRUE;
    349     }
    350 
    351     // At null at end of normalization buffer.  Need to check whether there there are
    352     //   any characters left in the main buffer.
    353     if(s->origFlags & UCOL_USE_ITERATOR) {
    354       return !(s->iterator->hasNext(s->iterator));
    355     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
    356         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
    357         return (*s->fcdPosition == 0);
    358     }
    359     else {
    360         // Main string with an end pointer.
    361         return s->fcdPosition == s->endp;
    362     }
    363 }
    364 
    365 /*
    366 * collIter_bos()
    367 *     Checks for a collIterate being positioned at the start of
    368 *     its source string.
    369 *
    370 */
    371 static
    372 inline UBool collIter_bos(collIterate *source) {
    373   // if we're going backwards, we need to know whether there is more in the
    374   // iterator, even if we are in the side buffer
    375   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    376     return !source->iterator->hasPrevious(source->iterator);
    377   }
    378   if (source->pos <= source->string ||
    379       ((source->flags & UCOL_ITER_INNORMBUF) &&
    380       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
    381     return TRUE;
    382   }
    383   return FALSE;
    384 }
    385 
    386 /*static
    387 inline UBool collIter_SimpleBos(collIterate *source) {
    388   // if we're going backwards, we need to know whether there is more in the
    389   // iterator, even if we are in the side buffer
    390   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    391     return !source->iterator->hasPrevious(source->iterator);
    392   }
    393   if (source->pos == source->string) {
    394     return TRUE;
    395   }
    396   return FALSE;
    397 }*/
    398     //return (data->pos == data->string) ||
    399 
    400 
    401 /****************************************************************************/
    402 /* Following are the open/close functions                                   */
    403 /*                                                                          */
    404 /****************************************************************************/
    405 
    406 static UCollator*
    407 ucol_initFromBinary(const uint8_t *bin, int32_t length,
    408                 const UCollator *base,
    409                 UCollator *fillIn,
    410                 UErrorCode *status)
    411 {
    412     UCollator *result = fillIn;
    413     if(U_FAILURE(*status)) {
    414         return NULL;
    415     }
    416     /*
    417     if(base == NULL) {
    418         // we don't support null base yet
    419         *status = U_ILLEGAL_ARGUMENT_ERROR;
    420         return NULL;
    421     }
    422     */
    423     // We need these and we could be running without UCA
    424     uprv_uca_initImplicitConstants(status);
    425     UCATableHeader *colData = (UCATableHeader *)bin;
    426     // do we want version check here? We're trying to figure out whether collators are compatible
    427     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
    428         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
    429         colData->version[0] != UCOL_BUILDER_VERSION)
    430     {
    431         *status = U_COLLATOR_VERSION_MISMATCH;
    432         return NULL;
    433     }
    434     else {
    435         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
    436             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
    437             if(U_FAILURE(*status)){
    438                 return NULL;
    439             }
    440             result->hasRealData = TRUE;
    441         }
    442         else {
    443             if(base) {
    444                 result = ucol_initCollator(base->image, result, base, status);
    445                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
    446                 if(U_FAILURE(*status)){
    447                     return NULL;
    448                 }
    449                 result->hasRealData = FALSE;
    450             }
    451             else {
    452                 *status = U_USELESS_COLLATOR_ERROR;
    453                 return NULL;
    454             }
    455         }
    456         result->freeImageOnClose = FALSE;
    457     }
    458     result->actualLocale = NULL;
    459     result->validLocale = NULL;
    460     result->requestedLocale = NULL;
    461     result->rules = NULL;
    462     result->rulesLength = 0;
    463     result->freeRulesOnClose = FALSE;
    464     result->ucaRules = NULL;
    465     return result;
    466 }
    467 
    468 U_CAPI UCollator* U_EXPORT2
    469 ucol_openBinary(const uint8_t *bin, int32_t length,
    470                 const UCollator *base,
    471                 UErrorCode *status)
    472 {
    473     return ucol_initFromBinary(bin, length, base, NULL, status);
    474 }
    475 
    476 U_CAPI int32_t U_EXPORT2
    477 ucol_cloneBinary(const UCollator *coll,
    478                  uint8_t *buffer, int32_t capacity,
    479                  UErrorCode *status)
    480 {
    481     int32_t length = 0;
    482     if(U_FAILURE(*status)) {
    483         return length;
    484     }
    485     if(capacity < 0) {
    486         *status = U_ILLEGAL_ARGUMENT_ERROR;
    487         return length;
    488     }
    489     if(coll->hasRealData == TRUE) {
    490         length = coll->image->size;
    491         if(length <= capacity) {
    492             uprv_memcpy(buffer, coll->image, length);
    493         } else {
    494             *status = U_BUFFER_OVERFLOW_ERROR;
    495         }
    496     } else {
    497         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    498         if(length <= capacity) {
    499             /* build the UCATableHeader with minimal entries */
    500             /* do not copy the header from the UCA file because its values are wrong! */
    501             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    502 
    503             /* reset everything */
    504             uprv_memset(buffer, 0, length);
    505 
    506             /* set the tailoring-specific values */
    507             UCATableHeader *myData = (UCATableHeader *)buffer;
    508             myData->size = length;
    509 
    510             /* offset for the options, the only part of the data that is present after the header */
    511             myData->options = sizeof(UCATableHeader);
    512 
    513             /* need to always set the expansion value for an upper bound of the options */
    514             myData->expansion = myData->options + sizeof(UColOptionSet);
    515 
    516             myData->magic = UCOL_HEADER_MAGIC;
    517             myData->isBigEndian = U_IS_BIG_ENDIAN;
    518             myData->charSetFamily = U_CHARSET_FAMILY;
    519 
    520             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    521             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    522 
    523             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    524             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    525             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    526             myData->jamoSpecial = coll->image->jamoSpecial;
    527 
    528             /* copy the collator options */
    529             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    530         } else {
    531             *status = U_BUFFER_OVERFLOW_ERROR;
    532         }
    533     }
    534     return length;
    535 }
    536 
    537 U_CAPI UCollator* U_EXPORT2
    538 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
    539 {
    540     UCollator * localCollator;
    541     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
    542     char *stackBufferChars = (char *)stackBuffer;
    543     int32_t imageSize = 0;
    544     int32_t rulesSize = 0;
    545     int32_t rulesPadding = 0;
    546     int32_t defaultReorderCodesSize = 0;
    547     int32_t reorderCodesSize = 0;
    548     uint8_t *image;
    549     UChar *rules;
    550     int32_t* defaultReorderCodes;
    551     int32_t* reorderCodes;
    552     uint8_t* leadBytePermutationTable;
    553     UBool colAllocated = FALSE;
    554     UBool imageAllocated = FALSE;
    555 
    556     if (status == NULL || U_FAILURE(*status)){
    557         return 0;
    558     }
    559     if ((stackBuffer && !pBufferSize) || !coll){
    560        *status = U_ILLEGAL_ARGUMENT_ERROR;
    561         return 0;
    562     }
    563 
    564     if (coll->rules && coll->freeRulesOnClose) {
    565         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
    566         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
    567         bufferSizeNeeded += rulesSize + rulesPadding;
    568     }
    569     // no padding for alignment needed from here since the next two are 4 byte quantities
    570     if (coll->defaultReorderCodes) {
    571         defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
    572         bufferSizeNeeded += defaultReorderCodesSize;
    573     }
    574     if (coll->reorderCodes) {
    575         reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
    576         bufferSizeNeeded += reorderCodesSize;
    577     }
    578     if (coll->leadBytePermutationTable) {
    579         bufferSizeNeeded += 256 * sizeof(uint8_t);
    580     }
    581 
    582     if (stackBuffer && *pBufferSize <= 0) { /* 'preflighting' request - set needed size into *pBufferSize */
    583         *pBufferSize =  bufferSizeNeeded;
    584         return 0;
    585     }
    586 
    587     /* Pointers on 64-bit platforms need to be aligned
    588      * on a 64-bit boundry in memory.
    589      */
    590     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
    591         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
    592         if (*pBufferSize > offsetUp) {
    593             *pBufferSize -= offsetUp;
    594             stackBufferChars += offsetUp;
    595         }
    596         else {
    597             /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
    598             *pBufferSize = 1;
    599         }
    600     }
    601     stackBuffer = (void *)stackBufferChars;
    602 
    603     if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
    604         /* allocate one here...*/
    605         stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
    606         // Null pointer check.
    607         if (stackBufferChars == NULL) {
    608             *status = U_MEMORY_ALLOCATION_ERROR;
    609             return NULL;
    610         }
    611         colAllocated = TRUE;
    612         if (U_SUCCESS(*status)) {
    613             *status = U_SAFECLONE_ALLOCATED_WARNING;
    614         }
    615     }
    616     localCollator = (UCollator *)stackBufferChars;
    617     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
    618     defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
    619     reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
    620     leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
    621 
    622     {
    623         UErrorCode tempStatus = U_ZERO_ERROR;
    624         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
    625     }
    626     if (coll->freeImageOnClose) {
    627         image = (uint8_t *)uprv_malloc(imageSize);
    628         // Null pointer check
    629         if (image == NULL) {
    630             *status = U_MEMORY_ALLOCATION_ERROR;
    631             return NULL;
    632         }
    633         ucol_cloneBinary(coll, image, imageSize, status);
    634         imageAllocated = TRUE;
    635     }
    636     else {
    637         image = (uint8_t *)coll->image;
    638     }
    639     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
    640     if (U_FAILURE(*status)) {
    641         return NULL;
    642     }
    643 
    644     if (coll->rules) {
    645         if (coll->freeRulesOnClose) {
    646             localCollator->rules = u_strcpy(rules, coll->rules);
    647             //bufferEnd += rulesSize;
    648         }
    649         else {
    650             localCollator->rules = coll->rules;
    651         }
    652         localCollator->freeRulesOnClose = FALSE;
    653         localCollator->rulesLength = coll->rulesLength;
    654     }
    655 
    656     // collator reordering
    657     if (coll->defaultReorderCodes) {
    658         localCollator->defaultReorderCodes =
    659             (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
    660         localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
    661         localCollator->freeDefaultReorderCodesOnClose = FALSE;
    662     }
    663     if (coll->reorderCodes) {
    664         localCollator->reorderCodes =
    665             (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
    666         localCollator->reorderCodesLength = coll->reorderCodesLength;
    667         localCollator->freeReorderCodesOnClose = FALSE;
    668     }
    669     if (coll->leadBytePermutationTable) {
    670         localCollator->leadBytePermutationTable =
    671             (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
    672         localCollator->freeLeadBytePermutationTableOnClose = FALSE;
    673     }
    674 
    675     int32_t i;
    676     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
    677         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
    678     }
    679     // zero copies of pointers
    680     localCollator->actualLocale = NULL;
    681     localCollator->validLocale = NULL;
    682     localCollator->requestedLocale = NULL;
    683     localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
    684     localCollator->freeOnClose = colAllocated;
    685     localCollator->freeImageOnClose = imageAllocated;
    686     return localCollator;
    687 }
    688 
    689 U_CAPI void U_EXPORT2
    690 ucol_close(UCollator *coll)
    691 {
    692     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
    693     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
    694     if(coll != NULL) {
    695         // these are always owned by each UCollator struct,
    696         // so we always free them
    697         if(coll->validLocale != NULL) {
    698             uprv_free(coll->validLocale);
    699         }
    700         if(coll->actualLocale != NULL) {
    701             uprv_free(coll->actualLocale);
    702         }
    703         if(coll->requestedLocale != NULL) {
    704             uprv_free(coll->requestedLocale);
    705         }
    706         if(coll->latinOneCEs != NULL) {
    707             uprv_free(coll->latinOneCEs);
    708         }
    709         if(coll->options != NULL && coll->freeOptionsOnClose) {
    710             uprv_free(coll->options);
    711         }
    712         if(coll->rules != NULL && coll->freeRulesOnClose) {
    713             uprv_free((UChar *)coll->rules);
    714         }
    715         if(coll->image != NULL && coll->freeImageOnClose) {
    716             uprv_free((UCATableHeader *)coll->image);
    717         }
    718 
    719         if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
    720             uprv_free(coll->leadBytePermutationTable);
    721         }
    722         if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
    723             uprv_free(coll->defaultReorderCodes);
    724         }
    725         if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
    726             uprv_free(coll->reorderCodes);
    727         }
    728 
    729         /* Here, it would be advisable to close: */
    730         /* - UData for UCA (unless we stuff it in the root resb */
    731         /* Again, do we need additional housekeeping... HMMM! */
    732         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
    733         if(coll->freeOnClose){
    734             /* for safeClone, if freeOnClose is FALSE,
    735             don't free the other instance data */
    736             uprv_free(coll);
    737         }
    738     }
    739     UTRACE_EXIT();
    740 }
    741 
    742 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
    743 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
    744 U_CFUNC uint8_t* U_EXPORT2
    745 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
    746 {
    747     uint8_t *result = NULL;
    748     if(U_FAILURE(*status)) {
    749         return NULL;
    750     }
    751     if(coll->hasRealData == TRUE) {
    752         *length = coll->image->size;
    753         result = (uint8_t *)uprv_malloc(*length);
    754         /* test for NULL */
    755         if (result == NULL) {
    756             *status = U_MEMORY_ALLOCATION_ERROR;
    757             return NULL;
    758         }
    759         uprv_memcpy(result, coll->image, *length);
    760     } else {
    761         *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    762         result = (uint8_t *)uprv_malloc(*length);
    763         /* test for NULL */
    764         if (result == NULL) {
    765             *status = U_MEMORY_ALLOCATION_ERROR;
    766             return NULL;
    767         }
    768 
    769         /* build the UCATableHeader with minimal entries */
    770         /* do not copy the header from the UCA file because its values are wrong! */
    771         /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    772 
    773         /* reset everything */
    774         uprv_memset(result, 0, *length);
    775 
    776         /* set the tailoring-specific values */
    777         UCATableHeader *myData = (UCATableHeader *)result;
    778         myData->size = *length;
    779 
    780         /* offset for the options, the only part of the data that is present after the header */
    781         myData->options = sizeof(UCATableHeader);
    782 
    783         /* need to always set the expansion value for an upper bound of the options */
    784         myData->expansion = myData->options + sizeof(UColOptionSet);
    785 
    786         myData->magic = UCOL_HEADER_MAGIC;
    787         myData->isBigEndian = U_IS_BIG_ENDIAN;
    788         myData->charSetFamily = U_CHARSET_FAMILY;
    789 
    790         /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    791         uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    792 
    793         uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    794         uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    795         uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    796         myData->jamoSpecial = coll->image->jamoSpecial;
    797 
    798         /* copy the collator options */
    799         uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    800     }
    801     return result;
    802 }
    803 
    804 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    805     if(U_FAILURE(*status)) {
    806         return;
    807     }
    808     result->caseFirst = (UColAttributeValue)opts->caseFirst;
    809     result->caseLevel = (UColAttributeValue)opts->caseLevel;
    810     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
    811     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
    812     if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
    813         return;
    814     }
    815     result->strength = (UColAttributeValue)opts->strength;
    816     result->variableTopValue = opts->variableTopValue;
    817     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
    818     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
    819     result->numericCollation = (UColAttributeValue)opts->numericCollation;
    820     result->caseFirstisDefault = TRUE;
    821     result->caseLevelisDefault = TRUE;
    822     result->frenchCollationisDefault = TRUE;
    823     result->normalizationModeisDefault = TRUE;
    824     result->strengthisDefault = TRUE;
    825     result->variableTopValueisDefault = TRUE;
    826     result->alternateHandlingisDefault = TRUE;
    827     result->hiraganaQisDefault = TRUE;
    828     result->numericCollationisDefault = TRUE;
    829 
    830     ucol_updateInternalState(result, status);
    831 
    832     result->options = opts;
    833 }
    834 
    835 
    836 /**
    837 * Approximate determination if a character is at a contraction end.
    838 * Guaranteed to be TRUE if a character is at the end of a contraction,
    839 * otherwise it is not deterministic.
    840 * @param c character to be determined
    841 * @param coll collator
    842 */
    843 static
    844 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
    845     if (c < coll->minContrEndCP) {
    846         return FALSE;
    847     }
    848 
    849     int32_t  hash = c;
    850     uint8_t  htbyte;
    851     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
    852         if (U16_IS_TRAIL(c)) {
    853             return TRUE;
    854         }
    855         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
    856     }
    857     htbyte = coll->contrEndCP[hash>>3];
    858     return (((htbyte >> (hash & 7)) & 1) == 1);
    859 }
    860 
    861 
    862 
    863 /*
    864 *   i_getCombiningClass()
    865 *        A fast, at least partly inline version of u_getCombiningClass()
    866 *        This is a candidate for further optimization.  Used heavily
    867 *        in contraction processing.
    868 */
    869 static
    870 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
    871     uint8_t sCC = 0;
    872     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
    873         sCC = u_getCombiningClass(c);
    874     }
    875     return sCC;
    876 }
    877 
    878 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    879     UChar c;
    880     UCollator *result = fillIn;
    881     if(U_FAILURE(*status) || image == NULL) {
    882         return NULL;
    883     }
    884 
    885     if(result == NULL) {
    886         result = (UCollator *)uprv_malloc(sizeof(UCollator));
    887         if(result == NULL) {
    888             *status = U_MEMORY_ALLOCATION_ERROR;
    889             return result;
    890         }
    891         result->freeOnClose = TRUE;
    892     } else {
    893         result->freeOnClose = FALSE;
    894     }
    895 
    896     result->image = image;
    897     result->mapping.getFoldingOffset = _getFoldingOffset;
    898     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
    899     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
    900     if(U_FAILURE(*status)) {
    901         if(result->freeOnClose == TRUE) {
    902             uprv_free(result);
    903             result = NULL;
    904         }
    905         return result;
    906     }
    907 
    908     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    909     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
    910     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
    911     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
    912     result->rules = NULL;
    913     result->rulesLength = 0;
    914     result->freeRulesOnClose = FALSE;
    915     result->defaultReorderCodes = NULL;
    916     result->defaultReorderCodesLength = 0;
    917     result->freeDefaultReorderCodesOnClose = FALSE;
    918     result->reorderCodes = NULL;
    919     result->reorderCodesLength = 0;
    920     result->freeReorderCodesOnClose = FALSE;
    921     result->leadBytePermutationTable = NULL;
    922     result->freeLeadBytePermutationTableOnClose = FALSE;
    923 
    924     /* get the version info from UCATableHeader and populate the Collator struct*/
    925     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
    926     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
    927     result->dataVersion[2] = 0;
    928     result->dataVersion[3] = 0;
    929 
    930     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
    931     result->minUnsafeCP = 0;
    932     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
    933         if (ucol_unsafeCP(c, result)) break;
    934     }
    935     result->minUnsafeCP = c;
    936 
    937     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
    938     result->minContrEndCP = 0;
    939     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
    940         if (ucol_contractionEndCP(c, result)) break;
    941     }
    942     result->minContrEndCP = c;
    943 
    944     /* max expansion tables */
    945     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
    946                                          result->image->endExpansionCE);
    947     result->lastEndExpansionCE = result->endExpansionCE +
    948                                  result->image->endExpansionCECount - 1;
    949     result->expansionCESize = (uint8_t*)result->image +
    950                                                result->image->expansionCESize;
    951 
    952 
    953     //result->errorCode = *status;
    954 
    955     result->latinOneCEs = NULL;
    956 
    957     result->latinOneRegenTable = FALSE;
    958     result->latinOneFailed = FALSE;
    959     result->UCA = UCA;
    960 
    961     /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    962     result->ucaRules = NULL;
    963     result->actualLocale = NULL;
    964     result->validLocale = NULL;
    965     result->requestedLocale = NULL;
    966     result->hasRealData = FALSE; // real data lives in .dat file...
    967     result->freeImageOnClose = FALSE;
    968 
    969     /* set attributes */
    970     ucol_setOptionsFromHeader(
    971         result,
    972         (UColOptionSet*)((uint8_t*)result->image+result->image->options),
    973         status);
    974     result->freeOptionsOnClose = FALSE;
    975 
    976     return result;
    977 }
    978 
    979 /* new Mark's code */
    980 
    981 /**
    982  * For generation of Implicit CEs
    983  * @author Davis
    984  *
    985  * Cleaned up so that changes can be made more easily.
    986  * Old values:
    987 # First Implicit: E26A792D
    988 # Last Implicit: E3DC70C0
    989 # First CJK: E0030300
    990 # Last CJK: E0A9DD00
    991 # First CJK_A: E0A9DF00
    992 # Last CJK_A: E0DE3100
    993  */
    994 /* Following is a port of Mark's code for new treatment of implicits.
    995  * It is positioned here, since ucol_initUCA need to initialize the
    996  * variables below according to the data in the fractional UCA.
    997  */
    998 
    999 /**
   1000  * Function used to:
   1001  * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
   1002  * b) bump any non-CJK characters by 10FFFF.
   1003  * The relevant blocks are:
   1004  * A:    4E00..9FFF; CJK Unified Ideographs
   1005  *       F900..FAFF; CJK Compatibility Ideographs
   1006  * B:    3400..4DBF; CJK Unified Ideographs Extension A
   1007  *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
   1008  * As long as
   1009  *   no new B characters are allocated between 4E00 and FAFF, and
   1010  *   no new A characters are outside of this range,
   1011  * (very high probability) this simple code will work.
   1012  * The reordered blocks are:
   1013  * Block1 is CJK
   1014  * Block2 is CJK_COMPAT_USED
   1015  * Block3 is CJK_A
   1016  * (all contiguous)
   1017  * Any other CJK gets its normal code point
   1018  * Any non-CJK gets +10FFFF
   1019  * When we reorder Block1, we make sure that it is at the very start,
   1020  * so that it will use a 3-byte form.
   1021  * Warning: we only pick up the compatibility characters that are
   1022  * NOT decomposed, so that block is smaller!
   1023  */
   1024 
   1025 // CONSTANTS
   1026 static const UChar32
   1027     NON_CJK_OFFSET = 0x110000,
   1028     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
   1029 
   1030 /**
   1031  * Precomputed by initImplicitConstants()
   1032  */
   1033 static int32_t
   1034     final3Multiplier = 0,
   1035     final4Multiplier = 0,
   1036     final3Count = 0,
   1037     final4Count = 0,
   1038     medialCount = 0,
   1039     min3Primary = 0,
   1040     min4Primary = 0,
   1041     max4Primary = 0,
   1042     minTrail = 0,
   1043     maxTrail = 0,
   1044     max3Trail = 0,
   1045     max4Trail = 0,
   1046     min4Boundary = 0;
   1047 
   1048 static const UChar32
   1049     // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
   1050     // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
   1051     CJK_BASE = 0x4E00,
   1052     CJK_LIMIT = 0x9FCB+1,
   1053     // Unified CJK ideographs in the compatibility ideographs block.
   1054     CJK_COMPAT_USED_BASE = 0xFA0E,
   1055     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
   1056     // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
   1057     // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
   1058     CJK_A_BASE = 0x3400,
   1059     CJK_A_LIMIT = 0x4DB5+1,
   1060     // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
   1061     // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
   1062     CJK_B_BASE = 0x20000,
   1063     CJK_B_LIMIT = 0x2A6D6+1,
   1064     // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
   1065     // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
   1066     CJK_C_BASE = 0x2A700,
   1067     CJK_C_LIMIT = 0x2B734+1,
   1068     // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
   1069     // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
   1070     CJK_D_BASE = 0x2B740,
   1071     CJK_D_LIMIT = 0x2B81D+1;
   1072     // when adding to this list, look for all occurrences (in project)
   1073     // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
   1074 
   1075 static UChar32 swapCJK(UChar32 i) {
   1076     if (i < CJK_A_BASE) {
   1077         // non-CJK
   1078     } else if (i < CJK_A_LIMIT) {
   1079         // Extension A has lower code points than the original Unihan+compat
   1080         // but sorts higher.
   1081         return i - CJK_A_BASE
   1082                 + (CJK_LIMIT - CJK_BASE)
   1083                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1084     } else if (i < CJK_BASE) {
   1085         // non-CJK
   1086     } else if (i < CJK_LIMIT) {
   1087         return i - CJK_BASE;
   1088     } else if (i < CJK_COMPAT_USED_BASE) {
   1089         // non-CJK
   1090     } else if (i < CJK_COMPAT_USED_LIMIT) {
   1091         return i - CJK_COMPAT_USED_BASE
   1092                 + (CJK_LIMIT - CJK_BASE);
   1093     } else if (i < CJK_B_BASE) {
   1094         // non-CJK
   1095     } else if (i < CJK_B_LIMIT) {
   1096         return i; // non-BMP-CJK
   1097     } else if (i < CJK_C_BASE) {
   1098         // non-CJK
   1099     } else if (i < CJK_C_LIMIT) {
   1100         return i; // non-BMP-CJK
   1101     } else if (i < CJK_D_BASE) {
   1102         // non-CJK
   1103     } else if (i < CJK_D_LIMIT) {
   1104         return i; // non-BMP-CJK
   1105     }
   1106     return i + NON_CJK_OFFSET; // non-CJK
   1107 }
   1108 
   1109 U_CAPI UChar32 U_EXPORT2
   1110 uprv_uca_getRawFromCodePoint(UChar32 i) {
   1111     return swapCJK(i)+1;
   1112 }
   1113 
   1114 U_CAPI UChar32 U_EXPORT2
   1115 uprv_uca_getCodePointFromRaw(UChar32 i) {
   1116     i--;
   1117     UChar32 result = 0;
   1118     if(i >= NON_CJK_OFFSET) {
   1119         result = i - NON_CJK_OFFSET;
   1120     } else if(i >= CJK_B_BASE) {
   1121         result = i;
   1122     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
   1123         if(i < CJK_LIMIT - CJK_BASE) {
   1124             result = i + CJK_BASE;
   1125         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
   1126             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
   1127         } else {
   1128             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1129         }
   1130     } else {
   1131         result = -1;
   1132     }
   1133     return result;
   1134 }
   1135 
   1136 // GET IMPLICIT PRIMARY WEIGHTS
   1137 // Return value is left justified primary key
   1138 U_CAPI uint32_t U_EXPORT2
   1139 uprv_uca_getImplicitFromRaw(UChar32 cp) {
   1140     /*
   1141     if (cp < 0 || cp > UCOL_MAX_INPUT) {
   1142         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
   1143     }
   1144     */
   1145     int32_t last0 = cp - min4Boundary;
   1146     if (last0 < 0) {
   1147         int32_t last1 = cp / final3Count;
   1148         last0 = cp % final3Count;
   1149 
   1150         int32_t last2 = last1 / medialCount;
   1151         last1 %= medialCount;
   1152 
   1153         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
   1154         last1 = minTrail + last1; // offset
   1155         last2 = min3Primary + last2; // offset
   1156         /*
   1157         if (last2 >= min4Primary) {
   1158             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
   1159         }
   1160         */
   1161         return (last2 << 24) + (last1 << 16) + (last0 << 8);
   1162     } else {
   1163         int32_t last1 = last0 / final4Count;
   1164         last0 %= final4Count;
   1165 
   1166         int32_t last2 = last1 / medialCount;
   1167         last1 %= medialCount;
   1168 
   1169         int32_t last3 = last2 / medialCount;
   1170         last2 %= medialCount;
   1171 
   1172         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
   1173         last1 = minTrail + last1; // offset
   1174         last2 = minTrail + last2; // offset
   1175         last3 = min4Primary + last3; // offset
   1176         /*
   1177         if (last3 > max4Primary) {
   1178             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
   1179         }
   1180         */
   1181         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
   1182     }
   1183 }
   1184 
   1185 static uint32_t U_EXPORT2
   1186 uprv_uca_getImplicitPrimary(UChar32 cp) {
   1187    //fprintf(stdout, "Incoming: %04x\n", cp);
   1188     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
   1189 
   1190     cp = swapCJK(cp);
   1191     cp++;
   1192     // we now have a range of numbers from 0 to 21FFFF.
   1193 
   1194     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
   1195     //fprintf(stdout, "CJK swapped: %04x\n", cp);
   1196 
   1197     return uprv_uca_getImplicitFromRaw(cp);
   1198 }
   1199 
   1200 /**
   1201  * Converts implicit CE into raw integer ("code point")
   1202  * @param implicit
   1203  * @return -1 if illegal format
   1204  */
   1205 U_CAPI UChar32 U_EXPORT2
   1206 uprv_uca_getRawFromImplicit(uint32_t implicit) {
   1207     UChar32 result;
   1208     UChar32 b3 = implicit & 0xFF;
   1209     UChar32 b2 = (implicit >> 8) & 0xFF;
   1210     UChar32 b1 = (implicit >> 16) & 0xFF;
   1211     UChar32 b0 = (implicit >> 24) & 0xFF;
   1212 
   1213     // simple parameter checks
   1214     if (b0 < min3Primary || b0 > max4Primary
   1215         || b1 < minTrail || b1 > maxTrail)
   1216         return -1;
   1217     // normal offsets
   1218     b1 -= minTrail;
   1219 
   1220     // take care of the final values, and compose
   1221     if (b0 < min4Primary) {
   1222         if (b2 < minTrail || b2 > max3Trail || b3 != 0)
   1223             return -1;
   1224         b2 -= minTrail;
   1225         UChar32 remainder = b2 % final3Multiplier;
   1226         if (remainder != 0)
   1227             return -1;
   1228         b0 -= min3Primary;
   1229         b2 /= final3Multiplier;
   1230         result = ((b0 * medialCount) + b1) * final3Count + b2;
   1231     } else {
   1232         if (b2 < minTrail || b2 > maxTrail
   1233             || b3 < minTrail || b3 > max4Trail)
   1234             return -1;
   1235         b2 -= minTrail;
   1236         b3 -= minTrail;
   1237         UChar32 remainder = b3 % final4Multiplier;
   1238         if (remainder != 0)
   1239             return -1;
   1240         b3 /= final4Multiplier;
   1241         b0 -= min4Primary;
   1242         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
   1243     }
   1244     // final check
   1245     if (result < 0 || result > UCOL_MAX_INPUT)
   1246         return -1;
   1247     return result;
   1248 }
   1249 
   1250 
   1251 static inline int32_t divideAndRoundUp(int a, int b) {
   1252     return 1 + (a-1)/b;
   1253 }
   1254 
   1255 /* this function is either called from initUCA or from genUCA before
   1256  * doing canonical closure for the UCA.
   1257  */
   1258 
   1259 /**
   1260  * Set up to generate implicits.
   1261  * Maintenance Note:  this function may end up being called more than once, due
   1262  *                    to threading races during initialization.  Make sure that
   1263  *                    none of the Constants is ever transiently assigned an
   1264  *                    incorrect value.
   1265  * @param minPrimary
   1266  * @param maxPrimary
   1267  * @param minTrail final byte
   1268  * @param maxTrail final byte
   1269  * @param gap3 the gap we leave for tailoring for 3-byte forms
   1270  * @param gap4 the gap we leave for tailoring for 4-byte forms
   1271  */
   1272 static void initImplicitConstants(int minPrimary, int maxPrimary,
   1273                                     int minTrailIn, int maxTrailIn,
   1274                                     int gap3, int primaries3count,
   1275                                     UErrorCode *status) {
   1276     // some simple parameter checks
   1277     if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
   1278         || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
   1279         || (primaries3count < 1))
   1280     {
   1281         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1282         return;
   1283     };
   1284 
   1285     minTrail = minTrailIn;
   1286     maxTrail = maxTrailIn;
   1287 
   1288     min3Primary = minPrimary;
   1289     max4Primary = maxPrimary;
   1290     // compute constants for use later.
   1291     // number of values we can use in trailing bytes
   1292     // leave room for empty values between AND above, e.g. if gap = 2
   1293     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
   1294     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
   1295     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
   1296     final3Multiplier = gap3 + 1;
   1297     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
   1298     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
   1299 
   1300     // medials can use full range
   1301     medialCount = (maxTrail - minTrail + 1);
   1302     // find out how many values fit in each form
   1303     int32_t threeByteCount = medialCount * final3Count;
   1304     // now determine where the 3/4 boundary is.
   1305     // we use 3 bytes below the boundary, and 4 above
   1306     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
   1307     int32_t primaries4count = primariesAvailable - primaries3count;
   1308 
   1309 
   1310     int32_t min3ByteCoverage = primaries3count * threeByteCount;
   1311     min4Primary = minPrimary + primaries3count;
   1312     min4Boundary = min3ByteCoverage;
   1313     // Now expand out the multiplier for the 4 bytes, and redo.
   1314 
   1315     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
   1316     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
   1317     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
   1318     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
   1319     if (gap4 < 1) {
   1320         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1321         return;
   1322     }
   1323     final4Multiplier = gap4 + 1;
   1324     final4Count = neededPerFinalByte;
   1325     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
   1326 }
   1327 
   1328     /**
   1329      * Supply parameters for generating implicit CEs
   1330      */
   1331 U_CAPI void U_EXPORT2
   1332 uprv_uca_initImplicitConstants(UErrorCode *status) {
   1333     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
   1334     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
   1335     initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
   1336 }
   1337 
   1338 
   1339 /*    collIterNormalize     Incremental Normalization happens here.                       */
   1340 /*                          pick up the range of chars identifed by FCD,                  */
   1341 /*                          normalize it into the collIterate's writable buffer,          */
   1342 /*                          switch the collIterate's state to use the writable buffer.    */
   1343 /*                                                                                        */
   1344 static
   1345 void collIterNormalize(collIterate *collationSource)
   1346 {
   1347     UErrorCode  status = U_ZERO_ERROR;
   1348     const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
   1349     const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
   1350 
   1351     collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
   1352                                     collationSource->writableBuffer,
   1353                                     status);
   1354     if (U_FAILURE(status)) {
   1355 #ifdef UCOL_DEBUG
   1356         fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
   1357 #endif
   1358         return;
   1359     }
   1360 
   1361     collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
   1362     collationSource->origFlags  = collationSource->flags;
   1363     collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1364     collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1365 }
   1366 
   1367 
   1368 // This function takes the iterator and extracts normalized stuff up to the next boundary
   1369 // It is similar in the end results to the collIterNormalize, but for the cases when we
   1370 // use an iterator
   1371 /*static
   1372 inline void normalizeIterator(collIterate *collationSource) {
   1373   UErrorCode status = U_ZERO_ERROR;
   1374   UBool wasNormalized = FALSE;
   1375   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
   1376   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
   1377   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1378     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1379   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
   1380     // reallocate and terminate
   1381     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
   1382                                &collationSource->writableBuffer,
   1383                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
   1384                                0)
   1385     ) {
   1386     #ifdef UCOL_DEBUG
   1387         fprintf(stderr, "normalizeIterator(), out of memory\n");
   1388     #endif
   1389         return;
   1390     }
   1391     status = U_ZERO_ERROR;
   1392     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
   1393     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
   1394     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1395     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1396   }
   1397   // Terminate the buffer - we already checked that it is big enough
   1398   collationSource->writableBuffer[normLen] = 0;
   1399   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
   1400       collationSource->flags |= UCOL_ITER_ALLOCATED;
   1401   }
   1402   collationSource->pos        = collationSource->writableBuffer;
   1403   collationSource->origFlags  = collationSource->flags;
   1404   collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1405   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1406 }*/
   1407 
   1408 
   1409 /* Incremental FCD check and normalize                                                    */
   1410 /*   Called from getNextCE when normalization state is suspect.                           */
   1411 /*   When entering, the state is known to be this:                                        */
   1412 /*      o   We are working in the main buffer of the collIterate, not the side            */
   1413 /*          writable buffer.  When in the side buffer, normalization mode is always off,  */
   1414 /*          so we won't get here.                                                         */
   1415 /*      o   The leading combining class from the current character is 0 or                */
   1416 /*          the trailing combining class of the previous char was zero.                   */
   1417 /*          True because the previous call to this function will have always exited       */
   1418 /*          that way, and we get called for every char where cc might be non-zero.        */
   1419 static
   1420 inline UBool collIterFCD(collIterate *collationSource) {
   1421     // Re-examine the character just before pos.
   1422     const UChar *cursor = collationSource->pos - 1;
   1423 
   1424     // Only a string with a stored length supplies a scan limit; otherwise NULL is passed.
   1425     const UChar *limit = NULL;
   1426     if (collationSource->flags & UCOL_ITER_HASLEN) {
   1427         limit = collationSource->endp;
   1428     }
   1429 
   1430     UBool outOfOrder = FALSE;   // becomes TRUE when a leading cc is below the preceding trailing cc
   1431 
   1432     /* trie access: FCD value of the current character (cursor moves past it) */
   1433     uint16_t fcdValue = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, cursor, limit);
   1434 
   1435     // Low byte = trailing combining class.  When it is zero (which includes
   1436     //   fcdValue == 0) there is nothing more to scan.
   1437     uint8_t lastTrailCC = (uint8_t)(fcdValue & LAST_BYTE_MASK_);
   1438 
   1439     if (lastTrailCC != 0) {
   1440         // Walk forward until a character with a leading cc of zero shows up,
   1441         //   or until the limit (when there is one) is reached.
   1442         for (;;) {
   1443             if (limit != NULL && cursor == limit) {
   1444                 break;
   1445             }
   1446 
   1447             const UChar *charStart = cursor;
   1448 
   1449             /* trie access */
   1450             fcdValue = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, cursor, limit);
   1451 
   1452             const uint8_t leadCC = (uint8_t)(fcdValue >> SECOND_LAST_BYTE_SHIFT_);
   1453             if (leadCC == 0) {
   1454                 // Not part of the combining sequence: back up over the whole
   1455                 //   character (it could be a surrogate pair).
   1456                 cursor = charStart;
   1457                 break;
   1458             }
   1459 
   1460             if (leadCC < lastTrailCC) {
   1461                 outOfOrder = TRUE;
   1462             }
   1463             lastTrailCC = (uint8_t)(fcdValue & LAST_BYTE_MASK_);
   1464         }
   1465     }
   1466 
   1467     // Remember how far this check reached.
   1468     collationSource->fcdPosition = (UChar *)cursor;
   1469     return outOfOrder;
   1470 }
   1471 
   1472 /****************************************************************************/
   1473 /* Following are the CE retrieval functions                                 */
   1474 /*                                                                          */
   1475 /****************************************************************************/
   1476 
   1477 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
   1478 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
   1479 
   1480 /* there should be a macro version of this function in the header file */
   1481 /* This is the first function that tries to fetch a collation element  */
   1482 /* If it's not successful or it encounters a more difficult situation  */
   1483 /* some more sophisticated and slower functions are invoked            */
   1484 static
   1485 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1486     uint32_t order = 0;
   1487     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
   1488         order = *(collationSource->toReturn++);                         /* if so, return them */
   1489         if(collationSource->CEpos == collationSource->toReturn) {       /* buffer drained: rewind both pointers to its start */
   1490             collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
   1491         }
   1492         return order;
   1493     }
   1494     // Nothing buffered: fetch the next code unit and look up its CE.
   1495     UChar ch = 0;
   1496     collationSource->offsetReturn = NULL;
   1497 
   1498     do {                                   /* repeated only for the Hangul-range ignorable case tested at the bottom */
   1499         for (;;)                           /* Loop handles case when incremental normalize switches   */
   1500         {                                  /*   to or from the side buffer / original string, and we  */
   1501             /*   need to start again to get the next character.        */
   1502 
   1503             if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
   1504             {
   1505                 // The source string is null terminated and we're not working from the side buffer,
   1506                 //   and we're not normalizing.  This is the fast path.
   1507                 //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
   1508                 ch = *collationSource->pos++;
   1509                 if (ch != 0) {
   1510                     break;
   1511                 }
   1512                 else {
   1513                     return UCOL_NO_MORE_CES;
   1514                 }
   1515             }
   1516             // Slower paths: the iterator flags decide where the next code unit comes from.
   1517             if (collationSource->flags & UCOL_ITER_HASLEN) {
   1518                 // Normal path for strings when length is specified.
   1519                 //   (We can't be in side buffer because it is always null terminated.)
   1520                 if (collationSource->pos >= collationSource->endp) {
   1521                     // Ran off of the end of the main source string.  We're done.
   1522                     return UCOL_NO_MORE_CES;
   1523                 }
   1524                 ch = *collationSource->pos++;
   1525             }
   1526             else if(collationSource->flags & UCOL_USE_ITERATOR) {
   1527                 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
   1528                 if(iterCh == U_SENTINEL) {
   1529                     return UCOL_NO_MORE_CES;
   1530                 }
   1531                 ch = (UChar)iterCh;        // the UChar32 result is narrowed to a single UChar
   1532             }
   1533             else
   1534             {
   1535                 // Null terminated string.
   1536                 ch = *collationSource->pos++;
   1537                 if (ch == 0) {
   1538                     // Ran off end of buffer.
   1539                     if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1540                         // Ran off end of main string. backing up one character.
   1541                         collationSource->pos--;
   1542                         return UCOL_NO_MORE_CES;
   1543                     }
   1544                     else
   1545                     {
   1546                         // Hit null in the normalize side buffer.
   1547                         // Usually this means the end of the normalized data,
   1548                         // except for one odd case: a null followed by combining chars,
   1549                         //   which is the case if we are at the start of the buffer.
   1550                         if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
   1551                             break;
   1552                         }
   1553 
   1554                         //  Null marked end of side buffer.
   1555                         //   Revert to the main string and
   1556                         //   loop back to top to try again to get a character.
   1557                         collationSource->pos   = collationSource->fcdPosition;
   1558                         collationSource->flags = collationSource->origFlags;
   1559                         continue;
   1560                     }
   1561                 }
   1562             }
   1563 
   1564             if(collationSource->flags&UCOL_HIRAGANA_Q) {
   1565                 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
   1566                  * based on whether the previous codepoint was Hiragana or Katakana.
   1567                  */
   1568                 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
   1569                         ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
   1570                     collationSource->flags |= UCOL_WAS_HIRAGANA;
   1571                 } else {
   1572                     collationSource->flags &= ~UCOL_WAS_HIRAGANA;
   1573                 }
   1574             }
   1575 
   1576             // We've got a character.  See if there's any fcd and/or normalization stuff to do.
   1577             //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
   1578             if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
   1579                 break;
   1580             }
   1581 
   1582             if (collationSource->fcdPosition >= collationSource->pos) {
   1583                 // An earlier FCD check has already covered the current character.
   1584                 // We can go ahead and process this char.
   1585                 break;
   1586             }
   1587 
   1588             if (ch < ZERO_CC_LIMIT_ ) {
   1589                 // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
   1590                 break;
   1591             }
   1592 
   1593             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1594                 // We need to peek at the next character in order to tell if we are FCD
   1595                 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
   1596                     // We are at the last char of source string.
   1597                     //  It is always OK for FCD check.
   1598                     break;
   1599                 }
   1600 
   1601                 // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
   1602                 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1603                     break;
   1604                 }
   1605             }
   1606 
   1607 
   1608             // Need a more complete FCD check and possible normalization.
   1609             if (collIterFCD(collationSource)) {
   1610                 collIterNormalize(collationSource);
   1611             }
   1612             if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1613                 //  No normalization was needed.  Go ahead and process the char we already had.
   1614                 break;
   1615             }
   1616 
   1617             // Some normalization happened.  Next loop iteration will pick up a char
   1618             //   from the normalization buffer.
   1619 
   1620         }   // end for (;;)
   1621 
   1622         // ch now holds the code unit to map: Latin-1 uses latinOneMapping, everything else the tries.
   1623         if (ch <= 0xFF) {
   1624             /*  For latin-1 characters we never need to fall back to the UCA table        */
   1625             /*    because all of the UCA data is replicated in the latinOneMapping array  */
   1626             order = coll->latinOneMapping[ch];
   1627             if (order > UCOL_NOT_FOUND) {
   1628                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
   1629             }
   1630         }
   1631         else
   1632         {
   1633             // Always use UCA for Han, Hangul
   1634             // (Han extension A is before main Han block)
   1635             // **** Han compatibility chars ?? ****
   1636             if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   1637                 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
   1638                 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
   1639                     // between the two target ranges; do normal lookup
   1640                     // **** this range is YI, Modifier tone letters, ****
   1641                     // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
   1642                     // **** Latin-D might be tailored, so we need to ****
   1643                     // **** do the normal lookup for these guys.     ****
   1644                     order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1645                 } else {
   1646                     // in one of the target ranges; use UCA
   1647                     order = UCOL_NOT_FOUND;
   1648                 }
   1649             } else {
   1650                 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1651             }
   1652 
   1653             if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
   1654                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
   1655             }
   1656 
   1657             if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
   1658                 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
   1659                 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   1660 
   1661                 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
   1662                     order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
   1663                 }
   1664             }
   1665         }
   1666     } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL ); /* ignorable CE for a Hangul-range code unit: fetch the next one */
   1667     // Still UCOL_NOT_FOUND after the table lookups: let getImplicit supply the CE.
   1668     if(order == UCOL_NOT_FOUND) {
   1669         order = getImplicit(ch, collationSource);
   1670     }
   1671     return order; /* return the CE */
   1672 }
   1673 
   1674 /* ucol_getNextCE: exported, out-of-line wrapper around the inline ucol_IGetNextCE, for use from other files. */
   1675 U_CAPI uint32_t  U_EXPORT2
   1676 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1677     return ucol_IGetNextCE(coll, collationSource, status);
   1678 }
   1679 
   1680 
   1681 /**
   1682 * Incremental previous normalization happens here. Pick up the range of chars
   1683 * identified by FCD, normalize it into the collIterate's writable buffer,
   1684 * switch the collIterate's state to use the writable buffer.
   1685 * @param data collation iterator data
   1686 */
   1687 static
   1688 void collPrevIterNormalize(collIterate *data)
   1689 {
   1690     UErrorCode status  = U_ZERO_ERROR;
   1691     const UChar *pEnd   = data->pos;  /* Last code unit to normalize (inclusive, see the +1 in the length below) */
   1692     const UChar *pStart;
   1693 
   1694     /* Start normalize: the string start, or the code unit after fcdPosition */
   1695     if (data->fcdPosition == NULL) {
   1696         pStart = data->string;
   1697     }
   1698     else {
   1699         pStart = data->fcdPosition + 1;
   1700     }
   1701 
   1702     int32_t normLen =
   1703         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
   1704                              data->writableBuffer,
   1705                              status).
   1706         length();
   1707     if(U_FAILURE(status)) {
   1708         return;  /* bail out before any offsets or iterator flags are changed */
   1709     }
   1710     /*
   1711     this puts the null termination in front of the normalized string instead
   1712     of at the end
   1713     */
   1714     data->writableBuffer.insert(0, (UChar)0);
   1715 
   1716     /*
   1717      * The usual case at this point is that we've got a base
   1718      * character followed by marks that were normalized. If
   1719      * fcdPosition is NULL, that means that we backed up to
   1720      * the beginning of the string and there's no base character.
   1721      *
   1722      * Forward processing will usually normalize when it sees
   1723      * the first mark, so that mark will get its natural offset
   1724      * and the rest will get the offset of the character following
   1725      * the marks. The base character will also get its natural offset.
   1726      *
   1727      * We write the offset of the base character, if there is one,
   1728      * followed by the offset of the first mark and then the offsets
   1729      * of the rest of the marks.
   1730      */
   1731     int32_t firstMarkOffset = 0;
   1732     int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
   1733     int32_t trailCount      = normLen - 1;
   1734 
   1735     if (data->fcdPosition != NULL) {
   1736         int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
   1737         UChar   baseChar   = *data->fcdPosition;
   1738 
   1739         firstMarkOffset = baseOffset + 1;
   1740 
   1741         /*
   1742          * If the base character is the start of a contraction, forward processing
   1743          * will normalize the marks while checking for the contraction, which means
   1744          * that the offset of the first mark will be the same as the other marks.
   1745          *
   1746          * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
   1747          */
   1748         if (baseChar >= 0x100) {
   1749             uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
   1750 
   1751             if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
   1752                 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
   1753             }
   1754 
   1755             if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
   1756                 firstMarkOffset = trailOffset;
   1757             }
   1758         }
   1759 
   1760         data->appendOffset(baseOffset, status);
   1761     }
   1762 
   1763     data->appendOffset(firstMarkOffset, status);
   1764 
   1765     for (int32_t i = 0; i < trailCount; i += 1) {
   1766         data->appendOffset(trailOffset, status);
   1767     }
   1768 
   1769     data->offsetRepeatValue = trailOffset;
   1770 
   1771     data->offsetReturn = data->offsetStore - 1;
   1772     if (data->offsetReturn == data->offsetBuffer) {
   1773         data->offsetStore = data->offsetBuffer;
   1774     }
   1775     /* Resume at the end of the side buffer: index 0 is the inserted NUL, followed by normLen units. */
   1776     data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
   1777     data->origFlags  = data->flags;  /* saved so the main-string flags can be put back later */
   1778     data->flags     |= UCOL_ITER_INNORMBUF;
   1779     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   1780 }
   1781 
   1782 
   1783 /**
   1784 * Backward counterpart of the incremental FCD check, used by getPrevCE when
   1785 * the normalization state of the text before the current position is suspect.
   1786 * Starting with the character at data->pos, walks backward over characters
   1787 * while the trailing combining class of the preceding character is non-zero,
   1788 * and records in data->fcdPosition where the walk stopped (NULL when it
   1789 * reached the start of the string).
   1790 * Expected state on entry:
   1791 * o  The iterator is reading the main string, not the side writable buffer
   1792 *    (normalization mode is always off while in the side buffer).
   1793 * o  Either the current character's leading combining class or the previous
   1794 *    character's trailing combining class is zero, because every earlier call
   1795 *    exited in that state.
   1796 * @param data collation iterate struct
   1797 * @return TRUE if a combining-class inversion was seen, so the range must be
   1798 *         normalized; FALSE otherwise
   1799 */
   1800 static
   1801 inline UBool collPrevIterFCD(collIterate *data)
   1802 {
   1803     const UChar *const textStart = data->string;
   1804     const UChar *cursor          = data->pos + 1;
   1805     UBool        needNormalize   = FALSE;
   1806 
   1807     /* Get the leading combining class of the current character. */
   1808     uint16_t fcdValue = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, textStart, cursor);
   1809     uint8_t  leadCC   = (uint8_t)(fcdValue >> SECOND_LAST_BYTE_SHIFT_);
   1810 
   1811     if (leadCC == 0) {
   1812         /* A leading class of zero needs no backward scan. */
   1813         data->fcdPosition = (UChar *)cursor;
   1814         return needNormalize;
   1815     }
   1816 
   1817     /*
   1818     The current char has a non-zero leading combining class.
   1819     Scan backward until we find a char with a trailing cc of zero.
   1820     */
   1821     while (cursor != textStart) {
   1822         fcdValue = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, textStart, cursor);
   1823 
   1824         const uint8_t trailCC = (uint8_t)(fcdValue & LAST_BYTE_MASK_);
   1825         if (trailCC == 0) {
   1826             /* Found the boundary: remember where the scan stopped. */
   1827             data->fcdPosition = (UChar *)cursor;
   1828             return needNormalize;
   1829         }
   1830 
   1831         if (leadCC < trailCC) {
   1832             /* The following char's leading class is below this char's trailing class. */
   1833             needNormalize = TRUE;
   1834         }
   1835 
   1836         /* This char becomes the "following" one for the next step back. */
   1837         leadCC = (uint8_t)(fcdValue >> SECOND_LAST_BYTE_SHIFT_);
   1838     }
   1839 
   1840     /*
   1841     Backed up all the way to the start of the string without finding a
   1842     char whose trailing cc is zero.
   1843     */
   1844     data->fcdPosition = NULL;
   1845     return needNormalize;
   1846 }
   1847 
   1848 /** Reads the code unit that lies offset units from the current position,
   1849  *  using the string pointer if it is set and otherwise the iterator.
   1850  *  Returns 0xfffd if neither is set. No error checking - caller beware!
   1851  */
   1852 static inline
   1853 UChar peekCodeUnit(collIterate *source, int32_t offset) {
   1854     if(source->pos != NULL) {
   1855         return source->pos[offset];
   1856     }
   1857     if(source->iterator == NULL) {
   1858         return 0xfffd;
   1859     }
   1860     UChar32 unit;
   1861     if(offset == 0) {
   1862         unit = source->iterator->current(source->iterator);
   1863     } else {
   1864         source->iterator->move(source->iterator, offset, UITER_CURRENT);
   1865         unit = source->iterator->next(source->iterator);
   1866         source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
   1867     }
   1868     return unit < 0 ? 0xfffd : (UChar)unit;  // With a well-behaved caller unit<0 never happens.
   1869 }
   1870 
   1871 // Code point version of peekCodeUnit. Treats the offset as a _code point_ delta.
   1872 // U16_FWD_1_UNSAFE and similar are avoided because the text might not be well-formed UTF-16;
   1873 // U16_FWD_1 and similar are avoided because the start and limit of the buffer are not known.
   1874 static inline
   1875 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
   1876     UChar32 c = U_SENTINEL;    // stays U_SENTINEL when there is neither a string pointer nor an iterator
   1877     if(source->pos != NULL) {
   1878         const UChar *p = source->pos;
   1879         if(offset >= 0) {
   1880             // Skip forward over offset code points.
   1881             for(int32_t skipped = 0; skipped < offset; ++skipped) {
   1882                 const UChar unit = *p++;
   1883                 if(U16_IS_LEAD(unit) && U16_IS_TRAIL(*p)) {
   1884                     ++p;
   1885                 }
   1886             }
   1887             // Read the code point there, pairing a lead surrogate with a following trail.
   1888             c = *p;
   1889             if(U16_IS_LEAD(c)) {
   1890                 const UChar trail = p[1];
   1891                 if(U16_IS_TRAIL(trail)) {
   1892                     c = U16_GET_SUPPLEMENTARY(c, trail);
   1893                 }
   1894             }
   1895         } else /* offset<0 */ {
   1896             // Skip backward over (-offset-1) code points.
   1897             for(int32_t remaining = offset + 1; remaining < 0; ++remaining) {
   1898                 const UChar unit = *--p;
   1899                 if(U16_IS_TRAIL(unit) && U16_IS_LEAD(*(p - 1))) {
   1900                     --p;
   1901                 }
   1902             }
   1903             // Read the code point before that, pairing a trail surrogate with a preceding lead.
   1904             c = *--p;
   1905             if(U16_IS_TRAIL(c)) {
   1906                 const UChar lead = *(p - 1);
   1907                 if(U16_IS_LEAD(lead)) {
   1908                     c = U16_GET_SUPPLEMENTARY(lead, c);
   1909                 }
   1910             }
   1911         }
   1912     } else if(source->iterator != NULL) {
   1913         if(offset >= 0) {
   1914             // Skip forward over offset code points, read the one there,
   1915             // then step backward the same number of code points.
   1916             for(int32_t i = 0; i < offset; ++i) {
   1917                 uiter_next32(source->iterator);
   1918             }
   1919             c = uiter_current32(source->iterator);
   1920             for(int32_t i = 0; i < offset; ++i) {
   1921                 uiter_previous32(source->iterator);
   1922             }
   1923         } else /* offset<0 */ {
   1924             // Read backward -offset code points, keeping only the last one read,
   1925             // then step forward the same number of code points.
   1926             for(int32_t i = offset; i < 0; ++i) {
   1927                 c = uiter_previous32(source->iterator);
   1928             }
   1929             for(int32_t i = offset; i < 0; ++i) {
   1930                 uiter_next32(source->iterator);
   1931             }
   1932         }
   1933     }
   1934     return c;
   1935 }
   1935 
   1936 /**
   1937 * Tells whether the backwards collation iterator is at the start of its data:
   1938 * for iterator input (no string pointer) that means there is no previous element.
   1939 * @param data collation iterator
   1940 * @return TRUE if we are at the start
   1941 */
   1942 static
   1943 inline UBool isAtStartPrevIterate(collIterate *data) {
   1944     if(data->pos == NULL && data->iterator != NULL) {
   1945         return !data->iterator->hasPrevious(data->iterator);
   1946     }
   1947     if(data->pos == data->string) { return TRUE; }
   1948     // Otherwise: in the side buffer, just after its leading NUL, with fcdPosition NULL.
   1949     const UBool inSideBuffer = (data->flags & UCOL_ITER_INNORMBUF) != 0;
   1950     return inSideBuffer && data->pos[-1] == 0 && data->fcdPosition == NULL;
   1951 }
   1952 
   1953 /**
   1954 * Steps the iterator state one code unit backward.  The UCharIterator is
   1955 * moved only when the UCOL_USE_ITERATOR flag is set; the string pointer is
   1956 * decremented whenever it is non-NULL.
   1957 * (A formerly disabled variant moved both of them unconditionally.)
   1958 * @param data collation iterator
   1959 */
   1960 static
   1961 inline void goBackOne(collIterate *data) {
   1962     const UBool useIterator = (data->flags & UCOL_USE_ITERATOR) != 0;
   1963 
   1964     if(useIterator && data->iterator != NULL) {
   1965         data->iterator->previous(data->iterator);
   1966     }
   1967 
   1968     if(data->pos != NULL) {
   1969         --data->pos;
   1970     }
   1971 }
   1972 
   1973 /**
   1974 * Inline function that gets a simple CE.
   1975 * So what it does is that it will first check the expansion buffer. If the
   1976 * expansion buffer is not empty, ie the end pointer to the expansion buffer
   1977 * is different from the string pointer, we return the collation element at the
   1978 * return pointer and decrement it.
   1979 * For more complicated CEs it resorts to getComplicatedCE.
   1980 * @param coll collator data
   1981 * @param data collation iterator struct
   1982 * @param status error status
   1983 */
   1984 static
   1985 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
   1986                                UErrorCode *status)
   1987 {
   1988     uint32_t result = (uint32_t)UCOL_NULLORDER;
   1989 
   1990     if (data->offsetReturn != NULL) {
   1991         if (data->offsetRepeatCount > 0) {
   1992                 data->offsetRepeatCount -= 1;
   1993         } else {
   1994             if (data->offsetReturn == data->offsetBuffer) {
   1995                 data->offsetReturn = NULL;
   1996                 data->offsetStore  = data->offsetBuffer;
   1997             } else {
   1998                 data->offsetReturn -= 1;
   1999             }
   2000         }
   2001     }
   2002 
   2003     if ((data->extendCEs && data->toReturn > data->extendCEs) ||
   2004             (!data->extendCEs && data->toReturn > data->CEs))
   2005     {
   2006         data->toReturn -= 1;
   2007         result = *(data->toReturn);
   2008         if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
   2009             data->CEpos = data->toReturn;
   2010         }
   2011     }
   2012     else {
   2013         UChar ch = 0;
   2014 
   2015         do {
   2016             /*
   2017             Loop handles case when incremental normalize switches to or from the
   2018             side buffer / original string, and we need to start again to get the
   2019             next character.
   2020             */
   2021             for (;;) {
   2022                 if (data->flags & UCOL_ITER_HASLEN) {
   2023                     /*
   2024                     Normal path for strings when length is specified.
   2025                     Not in side buffer because it is always null terminated.
   2026                     */
   2027                     if (data->pos <= data->string) {
   2028                         /* End of the main source string */
   2029                         return UCOL_NO_MORE_CES;
   2030                     }
   2031                     data->pos --;
   2032                     ch = *data->pos;
   2033                 }
   2034                 // we are using an iterator to go back. Pray for us!
   2035                 else if (data->flags & UCOL_USE_ITERATOR) {
   2036                   UChar32 iterCh = data->iterator->previous(data->iterator);
   2037                   if(iterCh == U_SENTINEL) {
   2038                     return UCOL_NO_MORE_CES;
   2039                   } else {
   2040                     ch = (UChar)iterCh;
   2041                   }
   2042                 }
   2043                 else {
   2044                     data->pos --;
   2045                     ch = *data->pos;
   2046                     /* we are in the side buffer. */
   2047                     if (ch == 0) {
   2048                         /*
   2049                         At the start of the normalize side buffer.
   2050                         Go back to string.
   2051                         Because pointer points to the last accessed character,
   2052                         hence we have to increment it by one here.
   2053                         */
   2054                         data->flags = data->origFlags;
   2055                         data->offsetRepeatValue = 0;
   2056 
   2057                          if (data->fcdPosition == NULL) {
   2058                             data->pos = data->string;
   2059                             return UCOL_NO_MORE_CES;
   2060                         }
   2061                         else {
   2062                             data->pos   = data->fcdPosition + 1;
   2063                         }
   2064 
   2065                        continue;
   2066                     }
   2067                 }
   2068 
   2069                 if(data->flags&UCOL_HIRAGANA_Q) {
   2070                   if(ch>=0x3040 && ch<=0x309f) {
   2071                     data->flags |= UCOL_WAS_HIRAGANA;
   2072                   } else {
   2073                     data->flags &= ~UCOL_WAS_HIRAGANA;
   2074                   }
   2075                 }
   2076 
   2077                 /*
   2078                 * got a character to determine if there's fcd and/or normalization
   2079                 * stuff to do.
   2080                 * if the current character is not fcd.
   2081                 * if current character is at the start of the string
   2082                 * Trailing combining class == 0.
   2083                 * Note if pos is in the writablebuffer, norm is always 0
   2084                 */
   2085                 if (ch < ZERO_CC_LIMIT_ ||
   2086                   // this should propel us out of the loop in the iterator case
   2087                     (data->flags & UCOL_ITER_NORM) == 0 ||
   2088                     (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
   2089                     || data->string == data->pos) {
   2090                     break;
   2091                 }
   2092 
   2093                 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   2094                     /* if next character is FCD */
   2095                     if (data->pos == data->string) {
   2096                         /* First char of string is always OK for FCD check */
   2097                         break;
   2098                     }
   2099 
   2100                     /* Not first char of string, do the FCD fast test */
   2101                     if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
   2102                         break;
   2103                     }
   2104                 }
   2105 
   2106                 /* Need a more complete FCD check and possible normalization. */
   2107                 if (collPrevIterFCD(data)) {
   2108                     collPrevIterNormalize(data);
   2109                 }
   2110 
   2111                 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
   2112                     /*  No normalization. Go ahead and process the char. */
   2113                     break;
   2114                 }
   2115 
   2116                 /*
   2117                 Some normalization happened.
   2118                 Next loop picks up a char from the normalization buffer.
   2119                 */
   2120             }
   2121 
   2122             /* attempt to handle contractions, after removal of the backwards
   2123             contraction
   2124             */
   2125             if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
   2126                 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
   2127             } else {
   2128                 if (ch <= 0xFF) {
   2129                     result = coll->latinOneMapping[ch];
   2130                 }
   2131                 else {
   2132                     // Always use UCA for [3400..9FFF], [AC00..D7AF]
   2133                     // **** [FA0E..FA2F] ?? ****
   2134                     if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   2135                         (ch >= 0x3400 && ch <= 0xD7AF)) {
   2136                         if (ch > 0x9FFF && ch < 0xAC00) {
   2137                             // between the two target ranges; do normal lookup
   2138                             // **** this range is YI, Modifier tone letters, ****
   2139                             // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
   2140                             // **** Latin-D might be tailored, so we need to ****
   2141                             // **** do the normal lookup for these guys.     ****
   2142                              result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   2143                         } else {
   2144                             result = UCOL_NOT_FOUND;
   2145                         }
   2146                     } else {
   2147                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   2148                     }
   2149                 }
   2150                 if (result > UCOL_NOT_FOUND) {
   2151                     result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
   2152                 }
   2153                 if (result == UCOL_NOT_FOUND) { // Not found in master list
   2154                     if (!isAtStartPrevIterate(data) &&
   2155                         ucol_contractionEndCP(ch, data->coll))
   2156                     {
   2157                         result = UCOL_CONTRACTION;
   2158                     } else {
   2159                         if(coll->UCA) {
   2160                             result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   2161                         }
   2162                     }
   2163 
   2164                     if (result > UCOL_NOT_FOUND) {
   2165                         if(coll->UCA) {
   2166                             result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
   2167                         }
   2168                     }
   2169                 }
   2170             }
   2171         } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
   2172 
   2173         if(result == UCOL_NOT_FOUND) {
   2174             result = getPrevImplicit(ch, data);
   2175         }
   2176     }
   2177 
   2178     return result;
   2179 }
   2180 
   2181 
   2182 /*   ucol_getPrevCE, out-of-line version for use from other files.  */
        /*   Passes coll, data and status through unchanged to               */
        /*   ucol_IGetPrevCE and returns whatever that call returns.         */
   2183 U_CFUNC uint32_t  U_EXPORT2
   2184 ucol_getPrevCE(const UCollator *coll, collIterate *data,
   2185                         UErrorCode *status) {
   2186     return ucol_IGetPrevCE(coll, data, status);
   2187 }
   2188 
   2189 
   2190 /* this should be connected to special Jamo handling */
        /*
         * Returns the first collation element produced for the single code unit u.
         * A temporary collIterate is initialized over &u with length 1. If *status
         * indicates failure after that initialization, 0 is returned; otherwise the
         * result of a single ucol_IGetNextCE call on the temporary iterator is
         * returned.
         */
   2191 U_CFUNC uint32_t  U_EXPORT2
   2192 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
   2193     collIterate colIt;
   2194     IInit_collIterate(coll, &u, 1, &colIt, status);
   2195     if(U_FAILURE(*status)) {
   2196         return 0;
   2197     }
   2198     return ucol_IGetNextCE(coll, &colIt, status);
   2199 }
   2200 
   2201 /**
   2202 * Appends the argument character to the end of data->writableBuffer, pushing
   2203 * back the null terminator.
        * The returned pointer is the terminated buffer plus the length the buffer
        * had before the append, i.e. the address of the newly appended character.
        * NOTE(review): the old length is added to whatever getTerminatedBuffer()
        * returns; if that call can return NULL (confirm against the UnicodeString
        * documentation), the value returned from here would not be NULL.
   2204 * @param data collIterate struct data
   2205 * @param ch character to be appended
   2206 * @return the position of the new addition
   2207 */
   2208 static
   2209 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
   2210 {
            /* remember where the appended character will land */
   2211     int32_t oldLength = data->writableBuffer.length();
   2212     return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
   2213 }
   2214 
   2215 /**
   2216 * Appends the argument string to the end of data->writableBuffer, pushing
   2217 * back the null terminator.
        * The returned pointer is the terminated buffer plus the length the buffer
        * had before the append, i.e. the address of the first appended code unit.
        * NOTE(review): the old length is added to whatever getTerminatedBuffer()
        * returns; if that call can return NULL (confirm against the UnicodeString
        * documentation), the value returned from here would not be NULL.
   2218 * @param data collIterate struct data
   2219 * @param str string to be appended
   2220 * @param length length of the string to be appended
   2221 * @return the position of the new addition
   2222 */
   2223 static
   2224 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
   2225 {
            /* remember where the appended text will start */
   2226     int32_t oldLength = data->writableBuffer.length();
   2227     return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
   2228 }
   2229 
   2230 /**
   2231 * Special normalization function for contraction in the forwards iterator.
   2232 * The text from data->pos - 1 up to (not including) data->fcdPosition is
   2233 * NFD-normalized and appended to data->writableBuffer.
   2234 * data->fcdPosition is read here but not modified.
   2235 * data->pos will now point to a position in the buffer.
   2236 * Flags will be changed accordingly (old flags are saved in data->origFlags).
        * If normalization reports a failure, the function returns early, leaving
        * data->pos and the flags untouched; the error code is a local variable and
        * is not passed back to the caller.
   2237 * @param data collation iterator data
   2238 */
   2239 static
   2240 inline void normalizeNextContraction(collIterate *data)
   2241 {
   2242     int32_t     strsize;
   2243     UErrorCode  status     = U_ZERO_ERROR;
   2244     /* because the pointer points to the next character */
   2245     const UChar *pStart    = data->pos - 1;
   2246     const UChar *pEnd;
   2247 
   2248     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /* not yet in the buffer: seed it with the single code unit that
                   precedes pStart, so one unit sits in front of the new text */
   2249         data->writableBuffer.setTo(*(pStart - 1));
   2250         strsize               = 1;
   2251     }
   2252     else {
                /* already in the buffer: keep its content and append after it */
   2253         strsize = data->writableBuffer.length();
   2254     }
   2255 
   2256     pEnd = data->fcdPosition;
   2257 
            /* append the NFD form of [pStart, pEnd) to the buffer */
   2258     data->writableBuffer.append(
   2259         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
   2260     if(U_FAILURE(status)) {
   2261         return;
   2262     }
   2263 
            /* pos = first code unit after the content that preceded the append */
   2264     data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
   2265     data->origFlags  = data->flags;
   2266     data->flags     |= UCOL_ITER_INNORMBUF;
   2267     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2268 }
   2269 
   2270 /**
   2271 * Contraction character management function that returns the next character
   2272 * for the forwards iterator.
   2273 * Does nothing if the next character is in buffer and not the first character
   2274 * in it.
   2275 * Else it checks next character in data string to see if it is normalizable.
   2276 * If it is not, the character is simply copied into the buffer, else
   2277 * the whole normalized substring is copied into the buffer, including the
   2278 * current character.
        * When neither UCOL_ITER_NORM nor UCOL_ITER_INNORMBUF is set, the character
        * comes straight from the UCharIterator (UCOL_USE_ITERATOR) or from
        * *data->pos, with no FCD check at all.
        * Some paths return (UChar)-1 as an error indication; see the NOTE(review)
        * comments at those paths.
   2279 * @param data collation element iterator data
   2280 * @return next character
   2281 */
   2282 static
   2283 inline UChar getNextNormalizedChar(collIterate *data)
   2284 {
   2285     UChar  nextch;
   2286     UChar  ch;
   2287     // Here we need to add the iterator code. One problem is the way
   2288     // end of string is handled. If we just return next char, it could
   2289     // be the sentinel. Most of the cases already check for this, but we
   2290     // need to be sure.
   2291     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
   2292          /* if no normalization and not in buffer. */
   2293       if(data->flags & UCOL_USE_ITERATOR) {
   2294          return (UChar)data->iterator->next(data->iterator);
   2295       } else {
   2296          return *(data->pos ++);
   2297       }
   2298     }
   2299 
   2300     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
   2301       //normalizeIterator(data);
   2302     //}
   2303 
            // Snapshot of the in-buffer flag on entry; later branches use this
            // value even after data->flags has been changed.
   2304     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
            // Either we are in the buffer and not at its terminating 0, or we are
            // in the string and still before the already-checked fcdPosition.
   2305     if ((innormbuf && *data->pos != 0) ||
   2306         (data->fcdPosition != NULL && !innormbuf &&
   2307         data->pos < data->fcdPosition)) {
   2308         /*
   2309         if next character is in normalized buffer, no further normalization
   2310         is required
   2311         */
   2312         return *(data->pos ++);
   2313     }
   2314 
   2315     if (data->flags & UCOL_ITER_HASLEN) {
   2316         /* in data string */
                /* last code unit before endp: return it without looking ahead */
   2317         if (data->pos + 1 == data->endp) {
   2318             return *(data->pos ++);
   2319         }
   2320     }
   2321     else {
   2322         if (innormbuf) {
   2323           // inside the normalization buffer, but at the end
   2324           // (since we encountered zero). This means, in the
   2325           // case we're using char iterator, that we need to
   2326           // do another round of normalization.
   2327           //if(data->origFlags & UCOL_USE_ITERATOR) {
   2328             // we need to restore original flags,
   2329             // otherwise, we'll lose them
   2330             //data->flags = data->origFlags;
   2331             //normalizeIterator(data);
   2332             //return *(data->pos++);
   2333           //} else {
   2334             /*
   2335             in writable buffer, at this point fcdPosition can not be
   2336             pointing to the end of the data string. see contracting tag.
   2337             */
   2338           if(data->fcdPosition) {
                    // fcdPosition is the last code unit of the string (followed by
                    // a 0 or by endp): copy it to the buffer and return it.
   2339             if (*(data->fcdPosition + 1) == 0 ||
   2340                 data->fcdPosition + 1 == data->endp) {
   2341                 /* at the end of the string, dump it into the normalizer */
   2342                 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
   2343                 // Check if data->pos received a null pointer
                        // NOTE(review): this test runs after 1 has been added to
                        // the pointer returned by insertBufferEnd, so it does not
                        // test that returned pointer itself.
   2344                 if (data->pos == NULL) {
   2345                     return (UChar)-1; // Return to indicate error.
   2346                 }
   2347                 return *(data->fcdPosition ++);
   2348             }
                    // otherwise continue reading from the string at fcdPosition
   2349             data->pos = data->fcdPosition;
   2350           } else if(data->origFlags & UCOL_USE_ITERATOR) {
   2351             // if we are here, we're using a normalizing iterator.
   2352             // we should just continue further.
   2353             data->flags = data->origFlags;
   2354             data->pos = NULL;
   2355             return (UChar)data->iterator->next(data->iterator);
   2356           }
   2357           //}
   2358         }
   2359         else {
                    /* in the null-terminated string: last code unit before the 0 */
   2360             if (*(data->pos + 1) == 0) {
   2361                 return *(data->pos ++);
   2362             }
   2363         }
   2364     }
   2365 
            /* consume the current code unit and peek at the one after it */
   2366     ch = *data->pos ++;
   2367     nextch = *data->pos;
   2368 
   2369     /*
   2370     * if the current character is not fcd.
   2371     * Trailing combining class == 0.
   2372     */
   2373     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
   2374         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
   2375          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
   2376             /*
   2377             Need a more complete FCD check and possible normalization.
   2378             normalize substring will be appended to buffer
   2379             */
   2380         if (collIterFCD(data)) {
                    /* normalizeNextContraction repositions data->pos into the buffer */
   2381             normalizeNextContraction(data);
   2382             return *(data->pos ++);
   2383         }
   2384         else if (innormbuf) {
   2385             /* fcdposition shifted even when there's no normalization, if we
   2386             don't input the rest into this, we'll get the wrong position when
   2387             we reach the end of the writableBuffer */
                    /* copy the code units from pos - 1 through fcdPosition - 1 */
   2388             int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
   2389             data->pos = insertBufferEnd(data, data->pos - 1, length);
   2390             // Check if data->pos received a null pointer
   2391             if (data->pos == NULL) {
   2392                 return (UChar)-1; // Return to indicate error.
   2393             }
   2394             return *(data->pos ++);
   2395         }
   2396     }
   2397 
   2398     if (innormbuf) {
   2399         /*
   2400         no normalization is to be done hence only one character will be
   2401         appended to the buffer.
   2402         */
   2403         data->pos = insertBufferEnd(data, ch) + 1;
   2404         // Check if data->pos received a null pointer
                // NOTE(review): as above, the test runs after the + 1, so it does
                // not test the pointer returned by insertBufferEnd itself.
   2405         if (data->pos == NULL) {
   2406             return (UChar)-1; // Return to indicate error.
   2407         }
   2408     }
   2409 
   2410     /* points back to the pos in string */
   2411     return ch;
   2412 }
   2413 
   2414 
   2415 
   2416 /**
   2417 * Function to copy the buffer into writableBuffer and set source->pos to the
   2418 * start of writableBuffer. When the iterator was not yet in the normalization
        * buffer, source->fcdPosition is also set to the old source->pos and the
        * flags are switched to in-buffer mode (old flags saved in origFlags).
   2419 * @param source data string source
   2420 * @param buffer character buffer
   2421 */
   2422 static
   2423 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
   2424 {
   2425     /* okay confusing part here. to ensure that the skipped characters are
   2426     considered later, we need to place them in the appropriate position in the
   2427     normalization buffer and reassign the pos pointer. simple case: if pos
   2428     resides in the string, simply copy to the normalization buffer and set
   2429     fcdposition = pos, pos = start of normalization buffer. if pos is in the
   2430     normalization buffer, we'll insert the copy in front of pos and point pos
   2431     to the start of the normalization buffer. why am i doing these copies?
   2432     well, so that the whole chunk of code in getNextCE, ucol_prv_getSpecialCE does
   2433     not require any changes, which would be really painful. */
   2434     if (source->flags & UCOL_ITER_INNORMBUF) {
                /* replace everything in front of pos (the first replaceLength code
                   units of the buffer) with the given text */
   2435         int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
   2436         source->writableBuffer.replace(0, replaceLength, buffer);
   2437     }
   2438     else {
                /* remember the string position, switch to in-buffer mode and make
                   the given text the whole buffer content */
   2439         source->fcdPosition  = source->pos;
   2440         source->origFlags    = source->flags;
   2441         source->flags       |= UCOL_ITER_INNORMBUF;
   2442         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   2443         source->writableBuffer = buffer;
   2444     }
   2445 
            /* in both cases iteration resumes at the start of the buffer */
   2446     source->pos = source->writableBuffer.getTerminatedBuffer();
   2447 }
   2448 
   2449 /**
   2450 * Function to get the discontiguous collation element within the source.
   2451 * Note this function will set the position to the appropriate places.
        * Characters that are read but do not match are collected in a local buffer;
        * when a match is returned, that buffer is handed to
        * setDiscontiguosAttribute so the skipped characters are iterated later.
        * When nothing matches, the iterator state saved on entry is restored.
   2452 * @param coll current collator used
   2453 * @param source data string source
   2454 * @param constart pointer to the start of the contraction table entry
   2455 * @return the value read from coll->contractionCEs for the matched table
        *         position, or for the constart position when no match was found
   2456 */
   2457 static
   2458 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
   2459                                 const UChar *constart)
   2460 {
   2461     /* source->pos currently points to the second combining character after
   2462        the start character */
   2463           const UChar *temppos      = source->pos;
   2464           UnicodeString buffer;
   2465     const UChar   *tempconstart = constart;
   2466           uint8_t  tempflags    = source->flags;
   2467           UBool    multicontraction = FALSE;
   2468           collIterateState discState;
   2469 
                  /* saved so the no-match exit at the bottom can rewind */
   2470           backupState(source, &discState);
   2471 
            /* the skipped-character buffer starts with the code point before pos */
   2472     buffer.setTo(peekCodePoint(source, -1));
   2473     for (;;) {
   2474         UChar    *UCharOffset;
   2475         UChar     schar,
   2476                   tchar;
   2477         uint32_t  result;
   2478 
                /* Stop scanning when: the length-delimited string is exhausted; or
                   the current code unit is 0 and there is nothing further to read
                   (see the fcdPosition tests); or the current code point has
                   combining class 0. */
   2479         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
   2480             || (peekCodeUnit(source, 0) == 0  &&
   2481             //|| (*source->pos == 0  &&
   2482                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
   2483                  source->fcdPosition == NULL ||
   2484                  source->fcdPosition == source->endp ||
   2485                  *(source->fcdPosition) == 0 ||
   2486                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
   2487                  /* end of string in null terminated string or stopped by a
   2488                  null character, note fcd does not always point to a base
   2489                  character after the discontiguous change */
   2490                  u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
   2491                  //u_getCombiningClass(*(source->pos)) == 0) {
   2492             //constart = (UChar *)coll->image + getContractOffset(CE);
   2493             if (multicontraction) {
                        /* an earlier multi-contraction step recorded a usable
                           position: resume from there and return that entry */
   2494                 source->pos    = temppos - 1;
   2495                 setDiscontiguosAttribute(source, buffer);
   2496                 return *(coll->contractionCEs +
   2497                                     (tempconstart - coll->contractionIndex));
   2498             }
   2499             constart = tempconstart;
   2500             break;
   2501         }
   2502 
   2503         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
   2504         schar = getNextNormalizedChar(source);
   2505 
                /* advance through the table entries while they are smaller than schar */
   2506         while (schar > (tchar = *UCharOffset)) {
   2507             UCharOffset++;
   2508         }
   2509 
   2510         if (schar != tchar) {
   2511             /* not the correct codepoint. we stuff the current codepoint into
   2512             the discontiguous buffer and try the next character */
   2513             buffer.append(schar);
   2514             continue;
   2515         }
   2516         else {
                    /* table match, but schar has the same combining class as the
                       code point at offset -2: treat it like a non-match */
   2517             if (u_getCombiningClass(schar) ==
   2518                 u_getCombiningClass(peekCodePoint(source, -2))) {
   2519                 buffer.append(schar);
   2520                 continue;
   2521             }
   2522             result = *(coll->contractionCEs +
   2523                                       (UCharOffset - coll->contractionIndex));
   2524         }
   2525 
   2526         if (result == UCOL_NOT_FOUND) {
   2527           break;
   2528         } else if (isContraction(result)) {
   2529             /* this is a multi-contraction*/
   2530             tempconstart = (UChar *)coll->image + getContractOffset(result);
                    /* only record a resume point if the entry at constart itself
                       holds something other than UCOL_NOT_FOUND */
   2531             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
   2532                 != UCOL_NOT_FOUND) {
   2533                 multicontraction = TRUE;
   2534                 temppos       = source->pos + 1;
   2535             }
   2536         } else {
                    /* final CE found: queue the skipped characters and return it */
   2537             setDiscontiguosAttribute(source, buffer);
   2538             return result;
   2539         }
   2540     }
   2541 
   2542     /* no problems simply reverting just like that,
   2543     if we are in string before getting into this function, points back to
   2544     string hence no problem.
   2545     if we are in normalization buffer before getting into this function,
   2546     since we'll never use another normalization within this function, we
   2547     know that fcdposition points to a base character. the normalization buffer
   2548     never change, hence this revert works. */
   2549     loadState(source, &discState, TRUE);
   2550     goBackOne(source);
   2551 
   2552     //source->pos   = temppos - 1;
   2553     source->flags = tempflags;
   2554     return *(coll->contractionCEs + (constart - coll->contractionIndex));
   2555 }
   2556 
   2557 /* now uses Mark's getImplicitPrimary code */
        /*
         * Builds the implicit CEs for code point cp from the 32-bit value r returned
         * by uprv_uca_getImplicitPrimary(cp). The low 16 bits of r, shifted into the
         * upper half and OR-ed with 0xC0, are stored at *collationSource->CEpos
         * (which is then advanced), and offsetRepeatCount is incremented. The return
         * value is (r & UCOL_PRIMARYMASK) | 0x505.
         * NOTE(review): 0xC0 presumably marks the stored CE as a continuation and
         * 0x505 presumably supplies default secondary/tertiary weights -- confirm
         * against the CE layout in ucol_imp.h.
         */
   2558 static
   2559 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
   2560     uint32_t r = uprv_uca_getImplicitPrimary(cp);
   2561     *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
   2562     collationSource->offsetRepeatCount += 1;
   2563     return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
   2564 }
   2565 
   2566 /**
   2567 * Inserts the argument character into the front of the buffer replacing the
   2568 * front null terminator.
        * The code unit at index 0 of data->writableBuffer is overwritten with ch and
        * a new 0 is then inserted in front of it, so the buffer starts with 0, ch.
        * data->pos is set to index 2 of the terminated buffer, i.e. just past ch,
        * so *(data->pos - 1) is ch.
   2569 * @param data collation element iterator data
   2570 * @param ch character to be inserted
   2571 */
   2572 static
   2573 inline void insertBufferFront(collIterate *data, UChar ch)
   2574 {
   2575     data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
   2576 }
   2577 
   2578 /**
   2579 * Special normalization function for contraction in the previous iterator.
   2580 * The text from the normalization start (data->string, or data->fcdPosition + 1
   2581 * when fcdPosition is set) through data->pos is NFD-normalized into
   2582 * data->writableBuffer, with a 0 placed in front of it and the previously
   2583 * pending text placed after it.
   2584 * data->pos will now point into the buffer, just past the normalized text.
   2585 * Flags will be changed accordingly (old flags are saved in data->origFlags).
        * If normalization fails, the function returns early without touching
        * data->pos or the flags.
   2586 * @param data collation iterator data
        * @param status receives the error code from the normalizer
   2587 */
   2588 static
   2589 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
   2590 {
   2591     const UChar *pEnd = data->pos + 1;         /* End normalize + 1 */
   2592     const UChar *pStart;
   2593 
            /* text that must follow the newly normalized part in the buffer */
   2594     UnicodeString endOfBuffer;
   2595     if (data->flags & UCOL_ITER_HASLEN) {
   2596         /*
   2597         normalization buffer not used yet, we'll pull down the next
   2598         character into the end of the buffer
   2599         */
   2600         endOfBuffer.setTo(*pEnd);
   2601     }
   2602     else {
   2603         endOfBuffer.setTo(data->writableBuffer, 1);  // after the leading NUL
   2604     }
   2605 
   2606     if (data->fcdPosition == NULL) {
   2607         pStart = data->string;
   2608     }
   2609     else {
   2610         pStart = data->fcdPosition + 1;
   2611     }
            /* normalize [pStart, pEnd) into writableBuffer; the content saved in
               endOfBuffer above is appended again further down */
   2612     int32_t normLen =
   2613         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
   2614                              data->writableBuffer,
   2615                              *status).
   2616         length();
   2617     if(U_FAILURE(*status)) {
   2618         return;
   2619     }
   2620     /*
   2621     this puts the null termination in front of the normalized string instead
   2622     of the end
   2623     */
            /* layout afterwards: 0, normalized text (normLen units), endOfBuffer;
               pos = index 1 + normLen, i.e. just past the normalized text */
   2624     data->pos =
   2625         data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
   2626         1 + normLen;
   2627     data->origFlags  = data->flags;
   2628     data->flags     |= UCOL_ITER_INNORMBUF;
   2629     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2630 }
   2630 
   2631 /**
   2632 * Contraction character management function that returns the previous character
   2633 * for the backwards iterator.
   2634 * Does nothing if the previous character is in buffer and not the first
   2635 * character in it.
   2636 * Else it checks previous character in data string to see if it is
   2637 * normalizable.
   2638 * If it is not, the character is simply copied into the buffer, else
   2639 * the whole normalized substring is copied into the buffer, including the
   2640 * current character.
        * On the fast path (no normalization flags set, or in the buffer with a
        * non-zero code unit before pos) the character before the current position
        * is returned without moving data->pos.
   2641 * @param data collation element iterator data
        * @param status passed on to normalizePrevContraction when normalization runs
   2642 * @return previous character
   2643 */
   2644 static
   2645 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
   2646 {
   2647     UChar  prevch;
   2648     UChar  ch;
   2649     const UChar *start;
            /* snapshot of the in-buffer flag on entry */
   2650     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
   2651     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
   2652         (innormbuf && *(data->pos - 1) != 0)) {
   2653         /*
   2654         if no normalization.
   2655         if previous character is in normalized buffer, no further normalization
   2656         is required
   2657         */
   2658       if(data->flags & UCOL_USE_ITERATOR) {
                /* step back one from the current position, then read with next() */
   2659         data->iterator->move(data->iterator, -1, UITER_CURRENT);
   2660         return (UChar)data->iterator->next(data->iterator);
   2661       } else {
   2662         return *(data->pos - 1);
   2663       }
   2664     }
   2665 
   2666     start = data->pos;
   2667     if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
   2668         /* in data string */
                /* the previous character is the first one of the string */
   2669         if ((start - 1) == data->string) {
   2670             return *(start - 1);
   2671         }
   2672         start --;
   2673         ch     = *start;
   2674         prevch = *(start - 1);
   2675     }
   2676     else {
   2677         /*
   2678         in writable buffer, at this point fcdPosition can not be NULL.
   2679         see contracting tag.
   2680         */
   2681         if (data->fcdPosition == data->string) {
   2682             /* at the start of the string, just dump it into the normalizer */
   2683             insertBufferFront(data, *(data->fcdPosition));
   2684             data->fcdPosition = NULL;
   2685             return *(data->pos - 1);
   2686         }
   2687         start  = data->fcdPosition;
   2688         ch     = *start;
   2689         prevch = *(start - 1);
   2690     }
   2691     /*
   2692     * if the current character is not fcd.
   2693     * Trailing combining class == 0.
   2694     */
            /* NOTE(review): in the writable-buffer branch above, start was just set
               equal to data->fcdPosition, so "fcdPosition > start" can only hold
               after the data-string branch -- confirm this is intended. */
   2695     if (data->fcdPosition > start &&
   2696        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
   2697     {
   2698         /*
   2699         Need a more complete FCD check and possible normalization.
   2700         normalize substring will be appended to buffer
   2701         */
                /* run the FCD check from start; restore pos if no normalization */
   2702         const UChar *backuppos = data->pos;
   2703         data->pos = start;
   2704         if (collPrevIterFCD(data)) {
                    /* normalizePrevContraction repositions data->pos into the buffer */
   2705             normalizePrevContraction(data, status);
   2706             return *(data->pos - 1);
   2707         }
   2708         data->pos = backuppos;
   2709         data->fcdPosition ++;
   2710     }
   2711 
   2712     if (innormbuf) {
   2713     /*
   2714     no normalization is to be done hence only one character will be
   2715     prepended to the buffer.
   2716     */
   2717         insertBufferFront(data, ch);
   2718         data->fcdPosition --;
   2719     }
   2720 
   2721     return ch;
   2722 }
   2723 
   2724 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
   2725 /* It is called by getNextCE */
   2726 
   2727 /* The following should be even */
        /* NOTE(review): the uses of this limit are outside this excerpt; presumably */
        /* digits are consumed in pairs, which is why the value must stay even --     */
        /* confirm against the code that reads UCOL_MAX_DIGITS_FOR_NUMBER.            */
   2728 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
   2729 
   2730 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
   2731     collIterateState entryState;
   2732     backupState(source, &entryState);
   2733     UChar32 cp = ch;
   2734 
   2735     for (;;) {
   2736         // This loop will repeat only in the case of contractions, and only when a contraction
   2737         //   is found and the first CE resulting from that contraction is itself a special
   2738         //   (an expansion, for example.)  All other special CE types are fully handled the
   2739         //   first time through, and the loop exits.
   2740 
   2741         const uint32_t *CEOffset = NULL;
   2742         switch(getCETag(CE)) {
   2743         case NOT_FOUND_TAG:
   2744             /* This one is not found, and we'll let somebody else bother about it... no more games */
   2745             return CE;
   2746         case SPEC_PROC_TAG:
   2747             {
   2748                 // Special processing is getting a CE that is preceded by a certain prefix
   2749                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   2750                 // When we encouter a special processing tag, we go backwards and try to see if
   2751                 // we have a match.
   2752                 // Contraction tables are used - so the whole process is not unlike contraction.
   2753                 // prefix data is stored backwards in the table.
   2754                 const UChar *UCharOffset;
   2755                 UChar schar, tchar;
   2756                 collIterateState prefixState;
   2757                 backupState(source, &prefixState);
   2758                 loadState(source, &entryState, TRUE);
   2759                 goBackOne(source); // We want to look at the point where we entered - actually one
   2760                 // before that...
   2761 
   2762                 for(;;) {
   2763                     // This loop will run once per source string character, for as long as we
   2764                     //  are matching a potential contraction sequence
   2765 
   2766                     // First we position ourselves at the begining of contraction sequence
   2767                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2768                     if (collIter_bos(source)) {
   2769                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2770                         break;
   2771                     }
   2772                     schar = getPrevNormalizedChar(source, status);
   2773                     goBackOne(source);
   2774 
   2775                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2776                         UCharOffset++;
   2777                     }
   2778 
   2779                     if (schar == tchar) {
   2780                         // Found the source string char in the table.
   2781                         //  Pick up the corresponding CE from the table.
   2782                         CE = *(coll->contractionCEs +
   2783                             (UCharOffset - coll->contractionIndex));
   2784                     }
   2785                     else
   2786                     {
   2787                         // Source string char was not in the table.
   2788                         //   We have not found the prefix.
   2789                         CE = *(coll->contractionCEs +
   2790                             (ContractionStart - coll->contractionIndex));
   2791                     }
   2792 
   2793                     if(!isPrefix(CE)) {
   2794                         // The source string char was in the contraction table, and the corresponding
   2795                         //   CE is not a prefix CE.  We found the prefix, break
   2796                         //   out of loop, this CE will end up being returned.  This is the normal
   2797                         //   way out of prefix handling when the source actually contained
   2798                         //   the prefix.
   2799                         break;
   2800                     }
   2801                 }
   2802                 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
   2803                     loadState(source, &prefixState, TRUE);
   2804                     if(source->origFlags & UCOL_USE_ITERATOR) {
   2805                         source->flags = source->origFlags;
   2806                     }
   2807                 } else { // prefix search was a failure, we have to backup all the way to the start
   2808                     loadState(source, &entryState, TRUE);
   2809                 }
   2810                 break;
   2811             }
   2812         case CONTRACTION_TAG:
   2813             {
   2814                 /* This should handle contractions */
   2815                 collIterateState state;
   2816                 backupState(source, &state);
   2817                 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
   2818                 const UChar *UCharOffset;
   2819                 UChar schar, tchar;
   2820 
   2821                 for (;;) {
   2822                     /* This loop will run once per source string character, for as long as we     */
   2823                     /*  are matching a potential contraction sequence                  */
   2824 
   2825                     /* First we position ourselves at the begining of contraction sequence */
   2826                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2827 
   2828                     if (collIter_eos(source)) {
   2829                         // Ran off the end of the source string.
   2830                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2831                         // So we'll pick whatever we have at the point...
   2832                         if (CE == UCOL_NOT_FOUND) {
   2833                             // back up the source over all the chars we scanned going into this contraction.
   2834                             CE = firstCE;
   2835                             loadState(source, &state, TRUE);
   2836                             if(source->origFlags & UCOL_USE_ITERATOR) {
   2837                                 source->flags = source->origFlags;
   2838                             }
   2839                         }
   2840                         break;
   2841                     }
   2842 
   2843                     uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
   2844                     uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
   2845 
   2846                     schar = getNextNormalizedChar(source);
   2847                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2848                         UCharOffset++;
   2849                     }
   2850 
   2851                     if (schar == tchar) {
   2852                         // Found the source string char in the contraction table.
   2853                         //  Pick up the corresponding CE from the table.
   2854                         CE = *(coll->contractionCEs +
   2855                             (UCharOffset - coll->contractionIndex));
   2856                     }
   2857                     else
   2858                     {
   2859                         // Source string char was not in contraction table.
   2860                         //   Unless we have a discontiguous contraction, we have finished
   2861                         //   with this contraction.
   2862                         // in order to do the proper detection, we
   2863                         // need to see if we're dealing with a supplementary
   2864                         /* We test whether the next two char are surrogate pairs.
   2865                         * This test is done if the iterator is not NULL.
   2866                         * If there is no surrogate pair, the iterator
   2867                         * goes back one if needed. */
   2868                         UChar32 miss = schar;
   2869                         if (source->iterator) {
   2870                             UChar32 surrNextChar; /* the next char in the iteration to test */
   2871                             int32_t prevPos; /* holds the previous position before move forward of the source iterator */
   2872                             if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
   2873                                 prevPos = source->iterator->index;
   2874                                 surrNextChar = getNextNormalizedChar(source);
   2875                                 if (U16_IS_TRAIL(surrNextChar)) {
   2876                                     miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
   2877                                 } else if (prevPos < source->iterator->index){
   2878                                     goBackOne(source);
   2879                                 }
   2880                             }
   2881                         } else if (U16_IS_LEAD(schar)) {
   2882                             miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
   2883                         }
   2884 
   2885                         uint8_t sCC;
   2886                         if (miss < 0x300 ||
   2887                             maxCC == 0 ||
   2888                             (sCC = i_getCombiningClass(miss, coll)) == 0 ||
   2889                             sCC>maxCC ||
   2890                             (allSame != 0 && sCC == maxCC) ||
   2891                             collIter_eos(source))
   2892                         {
   2893                             //  Contraction can not be discontiguous.
   2894                             goBackOne(source);  // back up the source string by one,
   2895                             //  because  the character we just looked at was
   2896                             //  not part of the contraction.   */
   2897                             if(U_IS_SUPPLEMENTARY(miss)) {
   2898                                 goBackOne(source);
   2899                             }
   2900                             CE = *(coll->contractionCEs +
   2901                                 (ContractionStart - coll->contractionIndex));
   2902                         } else {
   2903                             //
   2904                             // Contraction is possibly discontiguous.
   2905                             //   Scan more of source string looking for a match
   2906                             //
   2907                             UChar tempchar;
   2908                             /* find the next character if schar is not a base character
   2909                             and we are not yet at the end of the string */
   2910                             tempchar = getNextNormalizedChar(source);
   2911                             // probably need another supplementary thingie here
   2912                             goBackOne(source);
   2913                             if (i_getCombiningClass(tempchar, coll) == 0) {
   2914                                 goBackOne(source);
   2915                                 if(U_IS_SUPPLEMENTARY(miss)) {
   2916                                     goBackOne(source);
   2917                                 }
   2918                                 /* Spit out the last char of the string, wasn't tasty enough */
   2919                                 CE = *(coll->contractionCEs +
   2920                                     (ContractionStart - coll->contractionIndex));
   2921                             } else {
   2922                                 CE = getDiscontiguous(coll, source, ContractionStart);
   2923                             }
   2924                         }
   2925                     } // else after if(schar == tchar)
   2926 
   2927                     if(CE == UCOL_NOT_FOUND) {
   2928                         /* The Source string did not match the contraction that we were checking.  */
   2929                         /*  Back up the source position to undo the effects of having partially    */
   2930                         /*   scanned through what ultimately proved to not be a contraction.       */
   2931                         loadState(source, &state, TRUE);
   2932                         CE = firstCE;
   2933                         break;
   2934                     }
   2935 
   2936                     if(!isContraction(CE)) {
   2937                         // The source string char was in the contraction table, and the corresponding
   2938                         //   CE is not a contraction CE.  We completed the contraction, break
   2939                         //   out of loop, this CE will end up being returned.  This is the normal
   2940                         //   way out of contraction handling when the source actually contained
   2941                         //   the contraction.
   2942                         break;
   2943                     }
   2944 
   2945 
   2946                     // The source string char was in the contraction table, and the corresponding
   2947                     //   CE is IS  a contraction CE.  We will continue looping to check the source
   2948                     //   string for the remaining chars in the contraction.
   2949                     uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
   2950                     if(tempCE != UCOL_NOT_FOUND) {
   2951                         // We have scanned a a section of source string for which there is a
   2952                         //  CE from the contraction table.  Remember the CE and scan position, so
   2953                         //  that we can return to this point if further scanning fails to
   2954                         //  match a longer contraction sequence.
   2955                         firstCE = tempCE;
   2956 
   2957                         goBackOne(source);
   2958                         backupState(source, &state);
   2959                         getNextNormalizedChar(source);
   2960 
   2961                         // Another way to do this is:
   2962                         //collIterateState tempState;
   2963                         //backupState(source, &tempState);
   2964                         //goBackOne(source);
   2965                         //backupState(source, &state);
   2966                         //loadState(source, &tempState, TRUE);
   2967 
   2968                         // The problem is that for incomplete contractions we have to remember the previous
   2969                         // position. Before, the only thing I needed to do was state.pos--;
   2970                         // After iterator introduction and especially after introduction of normalizing
   2971                         // iterators, it became much more difficult to decrease the saved state.
   2972                         // I'm not yet sure which of the two methods above is faster.
   2973                     }
   2974                 } // for(;;)
   2975                 break;
   2976             } // case CONTRACTION_TAG:
   2977         case LONG_PRIMARY_TAG:
   2978             {
   2979                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   2980                 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   2981                 source->offsetRepeatCount += 1;
   2982                 return CE;
   2983             }
   2984         case EXPANSION_TAG:
   2985             {
   2986                 /* This should handle expansion. */
   2987                 /* NOTE: we can encounter both continuations and expansions in an expansion! */
   2988                 /* I have to decide where continuations are going to be dealt with */
   2989                 uint32_t size;
   2990                 uint32_t i;    /* general counter */
   2991 
   2992                 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   2993                 size = getExpansionCount(CE);
   2994                 CE = *CEOffset++;
   2995               //source->offsetRepeatCount = -1;
   2996 
   2997                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   2998                     for(i = 1; i<size; i++) {
   2999                         *(source->CEpos++) = *CEOffset++;
   3000                         source->offsetRepeatCount += 1;
   3001                     }
   3002                 } else { /* else, we do */
   3003                     while(*CEOffset != 0) {
   3004                         *(source->CEpos++) = *CEOffset++;
   3005                         source->offsetRepeatCount += 1;
   3006                     }
   3007                 }
   3008 
   3009                 return CE;
   3010             }
   3011         case DIGIT_TAG:
   3012             {
   3013                 /*
   3014                 We do a check to see if we want to collate digits as numbers; if so we generate
   3015                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   3016                 */
   3017                 //uint32_t size;
   3018                 uint32_t i;    /* general counter */
   3019 
   3020                 if (source->coll->numericCollation == UCOL_ON){
   3021                     collIterateState digitState = {0,0,0,0,0,0,0,0,0};
   3022                     UChar32 char32 = 0;
   3023                     int32_t digVal = 0;
   3024 
   3025                     uint32_t digIndx = 0;
   3026                     uint32_t endIndex = 0;
   3027                     uint32_t trailingZeroIndex = 0;
   3028 
   3029                     uint8_t collateVal = 0;
   3030 
   3031                     UBool nonZeroValReached = FALSE;
   3032 
   3033                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
   3034                     /*
   3035                          We parse the source string until we hit a char that's NOT a digit.
   3036                         Use this u_charDigitValue. This might be slow because we have to
   3037                         handle surrogates...
   3038                     */
   3039             /*
   3040                     if (U16_IS_LEAD(ch)){
   3041                       if (!collIter_eos(source)) {
   3042                         backupState(source, &digitState);
   3043                         UChar trail = getNextNormalizedChar(source);
   3044                         if(U16_IS_TRAIL(trail)) {
   3045                           char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   3046                         } else {
   3047                           loadState(source, &digitState, TRUE);
   3048                           char32 = ch;
   3049                         }
   3050                       } else {
   3051                         char32 = ch;
   3052                       }
   3053                     } else {
   3054                       char32 = ch;
   3055                     }
   3056                     digVal = u_charDigitValue(char32);
   3057             */
   3058                     digVal = u_charDigitValue(cp); // if we have arrived here, we have
   3059                     // already processed possible supplementaries that trigered the digit tag -
   3060                     // all supplementaries are marked in the UCA.
   3061                     /*
   3062                         We  pad a zero in front of the first element anyways. This takes
   3063                         care of the (probably) most common case where people are sorting things followed
   3064                         by a single digit
   3065                     */
   3066                     digIndx++;
   3067                     for(;;){
   3068                         // Make sure we have enough space. No longer needed;
   3069                         // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
   3070                         // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
   3071                         // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
   3072 
   3073                         // Skipping over leading zeroes.
   3074                         if (digVal != 0) {
   3075                             nonZeroValReached = TRUE;
   3076                         }
   3077                         if (nonZeroValReached) {
   3078                             /*
   3079                             We parse the digit string into base 100 numbers (this fits into a byte).
   3080                             We only add to the buffer in twos, thus if we are parsing an odd character,
   3081                             that serves as the 'tens' digit while the if we are parsing an even one, that
   3082                             is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
   3083                             a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
   3084                             overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3085                             than all the other bytes.
   3086                             */
   3087 
   3088                             if (digIndx % 2 == 1){
   3089                                 collateVal += (uint8_t)digVal;
   3090 
   3091                                 // We don't enter the low-order-digit case unless we've already seen
   3092                                 // the high order, or for the first digit, which is always non-zero.
   3093                                 if (collateVal != 0)
   3094                                     trailingZeroIndex = 0;
   3095 
   3096                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3097                                 collateVal = 0;
   3098                             }
   3099                             else{
   3100                                 // We drop the collation value into the buffer so if we need to do
   3101                                 // a "front patch" we don't have to check to see if we're hitting the
   3102                                 // last element.
   3103                                 collateVal = (uint8_t)(digVal * 10);
   3104 
   3105                                 // Check for trailing zeroes.
   3106                                 if (collateVal == 0)
   3107                                 {
   3108                                     if (!trailingZeroIndex)
   3109                                         trailingZeroIndex = (digIndx/2) + 2;
   3110                                 }
   3111                                 else
   3112                                     trailingZeroIndex = 0;
   3113 
   3114                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3115                             }
   3116                             digIndx++;
   3117                         }
   3118 
   3119                         // Get next character.
   3120                         if (!collIter_eos(source)){
   3121                             ch = getNextNormalizedChar(source);
   3122                             if (U16_IS_LEAD(ch)){
   3123                                 if (!collIter_eos(source)) {
   3124                                     backupState(source, &digitState);
   3125                                     UChar trail = getNextNormalizedChar(source);
   3126                                     if(U16_IS_TRAIL(trail)) {
   3127                                         char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   3128                                     } else {
   3129                                         loadState(source, &digitState, TRUE);
   3130                                         char32 = ch;
   3131                                     }
   3132                                 }
   3133                             } else {
   3134                                 char32 = ch;
   3135                             }
   3136 
   3137                             if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
   3138                                 // Resetting position to point to the next unprocessed char. We
   3139                                 // overshot it when doing our test/set for numbers.
   3140                                 if (char32 > 0xFFFF) { // For surrogates.
   3141                                     loadState(source, &digitState, TRUE);
   3142                                     //goBackOne(source);
   3143                                 }
   3144                                 goBackOne(source);
   3145                                 break;
   3146                             }
   3147                         } else {
   3148                             break;
   3149                         }
   3150                     }
   3151 
   3152                     if (nonZeroValReached == FALSE){
   3153                         digIndx = 2;
   3154                         numTempBuf[2] = 6;
   3155                     }
   3156 
   3157                     endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
   3158                     if (digIndx % 2 != 0){
   3159                         /*
   3160                         We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
   3161                         we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
   3162                         Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
   3163                         single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
   3164                         */
   3165 
   3166                         for(i = 2; i < endIndex; i++){
   3167                             numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
   3168                                 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
   3169                         }
   3170                         --digIndx;
   3171                     }
   3172 
   3173                     // Subtract one off of the last byte.
   3174                     numTempBuf[endIndex-1] -= 1;
   3175 
   3176                     /*
   3177                     We want to skip over the first two slots in the buffer. The first slot
   3178                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3179                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3180                     */
   3181                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3182                     numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
   3183 
   3184                     // Now transfer the collation key to our collIterate struct.
   3185                     // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
   3186                     //size = ((endIndex+1) & ~1)/2;
   3187                     CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3188                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3189                         UCOL_BYTE_COMMON; // Tertiary weight.
   3190                     i = 2; // Reset the index into the buffer.
   3191                     while(i < endIndex)
   3192                     {
   3193                         uint32_t primWeight = numTempBuf[i++] << 8;
   3194                         if ( i < endIndex)
   3195                             primWeight |= numTempBuf[i++];
   3196                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3197                     }
   3198 
   3199                 } else {
   3200                     // no numeric mode, we'll just switch to whatever we stashed and continue
   3201                     CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   3202                     CE = *CEOffset++;
   3203                     break;
   3204                 }
   3205                 return CE;
   3206             }
   3207             /* various implicits optimization */
   3208         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   3209             /* UCA is filled with these. Tailorings are NOT_FOUND */
   3210             return getImplicit(cp, source);
   3211         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   3212             // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
   3213             return getImplicit(cp, source);
   3214         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3215             {
   3216                 static const uint32_t
   3217                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3218                 //const uint32_t LCount = 19;
   3219                 static const uint32_t VCount = 21;
   3220                 static const uint32_t TCount = 28;
   3221                 //const uint32_t NCount = VCount * TCount;   // 588
   3222                 //const uint32_t SCount = LCount * NCount;   // 11172
   3223                 uint32_t L = ch - SBase;
   3224 
   3225                 // divide into pieces
   3226 
   3227                 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
   3228                 L /= TCount;
   3229                 uint32_t V = L % VCount;
   3230                 L /= VCount;
   3231 
   3232                 // offset them
   3233 
   3234                 L += LBase;
   3235                 V += VBase;
   3236                 T += TBase;
   3237 
   3238                 // return the first CE, but first put the rest into the expansion buffer
   3239                 if (!source->coll->image->jamoSpecial) { // FAST PATH
   3240 
   3241                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   3242                     if (T != TBase) {
   3243                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   3244                     }
   3245 
   3246                     return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   3247 
   3248                 } else { // Jamo is Special
   3249                     // Since Hanguls pass the FCD check, it is
   3250                     // guaranteed that we won't be in
   3251                     // the normalization buffer if something like this happens
   3252 
   3253                     // However, if we are using a uchar iterator and normalization
   3254                     // is ON, the Hangul that lead us here is going to be in that
   3255                     // normalization buffer. Here we want to restore the uchar
   3256                     // iterator state and pull out of the normalization buffer
   3257                     if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
   3258                         source->flags = source->origFlags; // restore the iterator
   3259                         source->pos = NULL;
   3260                     }
   3261 
   3262                     // Move Jamos into normalization buffer
   3263                     UChar *buffer = source->writableBuffer.getBuffer(4);
   3264                     int32_t bufferLength;
   3265                     buffer[0] = (UChar)L;
   3266                     buffer[1] = (UChar)V;
   3267                     if (T != TBase) {
   3268                         buffer[2] = (UChar)T;
   3269                         bufferLength = 3;
   3270                     } else {
   3271                         bufferLength = 2;
   3272                     }
   3273                     source->writableBuffer.releaseBuffer(bufferLength);
   3274 
   3275                     // Indicate where to continue in main input string after exhausting the writableBuffer
   3276                     source->fcdPosition       = source->pos;
   3277 
   3278                     source->pos   = source->writableBuffer.getTerminatedBuffer();
   3279                     source->origFlags   = source->flags;
   3280                     source->flags       |= UCOL_ITER_INNORMBUF;
   3281                     source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   3282 
   3283                     return(UCOL_IGNORABLE);
   3284                 }
   3285             }
   3286         case SURROGATE_TAG:
   3287             /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
   3288             /* two things can happen here: next code point can be a trailing surrogate - we will use it */
   3289             /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
   3290             /* we treat it like an unassigned code point. */
   3291             {
   3292                 UChar trail;
   3293                 collIterateState state;
   3294                 backupState(source, &state);
   3295                 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
   3296                     // we chould have stepped one char forward and it might have turned that it
   3297                     // was not a trail surrogate. In that case, we have to backup.
   3298                     loadState(source, &state, TRUE);
   3299                     return UCOL_NOT_FOUND;
   3300                 } else {
   3301                     /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
   3302                     CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
   3303                     if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
   3304                         // We need to backup
   3305                         loadState(source, &state, TRUE);
   3306                         return CE;
   3307                     }
   3308                     // calculate the supplementary code point value, if surrogate was not tailored
   3309                     cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   3310                 }
   3311             }
   3312             break;
   3313         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   3314             UChar nextChar;
   3315             if( source->flags & UCOL_USE_ITERATOR) {
   3316                 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
   3317                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3318                     source->iterator->next(source->iterator);
   3319                     return getImplicit(cp, source);
   3320                 }
   3321             } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
   3322                       U_IS_TRAIL((nextChar=*source->pos))) {
   3323                 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3324                 source->pos++;
   3325                 return getImplicit(cp, source);
   3326             }
   3327             return UCOL_NOT_FOUND;
   3328         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   3329             return UCOL_NOT_FOUND; /* broken surrogate sequence */
   3330         case CHARSET_TAG:
   3331             /* not yet implemented */
   3332             /* probably after 1.8 */
   3333             return UCOL_NOT_FOUND;
   3334         default:
   3335             *status = U_INTERNAL_PROGRAM_ERROR;
   3336             CE=0;
   3337             break;
   3338     }
   3339     if (CE <= UCOL_NOT_FOUND) break;
   3340   }
   3341   return CE;
   3342 }
   3343 
   3344 
   3345 /* now uses Mark's getImplicitPrimary code */
   3346 static
   3347 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
   3348     uint32_t r = uprv_uca_getImplicitPrimary(cp);
   3349 
   3350     *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
   3351     collationSource->toReturn = collationSource->CEpos;
   3352 
   3353     // **** doesn't work if using iterator ****
   3354     if (collationSource->flags & UCOL_ITER_INNORMBUF) {
   3355         collationSource->offsetRepeatCount = 1;
   3356     } else {
   3357         int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
   3358 
   3359         UErrorCode errorCode = U_ZERO_ERROR;
   3360         collationSource->appendOffset(firstOffset, errorCode);
   3361         collationSource->appendOffset(firstOffset + 1, errorCode);
   3362 
   3363         collationSource->offsetReturn = collationSource->offsetStore - 1;
   3364         *(collationSource->offsetBuffer) = firstOffset;
   3365         if (collationSource->offsetReturn == collationSource->offsetBuffer) {
   3366             collationSource->offsetStore = collationSource->offsetBuffer;
   3367         }
   3368     }
   3369 
   3370     return ((r & 0x0000FFFF)<<16) | 0x000000C0;
   3371 }
   3372 
   3373 /**
   3374  * This function handles the special CEs like contractions, expansions,
   3375  * surrogates, Thai.
   3376  * It is called by both getPrevCE
   3377  */
   3378 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
   3379                           collIterate *source,
   3380                           UErrorCode *status)
   3381 {
   3382     const uint32_t *CEOffset    = NULL;
   3383           UChar    *UCharOffset = NULL;
   3384           UChar    schar;
   3385     const UChar    *constart    = NULL;
   3386           uint32_t size;
   3387           UChar    buffer[UCOL_MAX_BUFFER];
   3388           uint32_t *endCEBuffer;
   3389           UChar   *strbuffer;
   3390           int32_t noChars = 0;
   3391           int32_t CECount = 0;
   3392 
   3393     for(;;)
   3394     {
   3395         /* the only ces that loops are thai and contractions */
   3396         switch (getCETag(CE))
   3397         {
   3398         case NOT_FOUND_TAG:  /* this tag always returns */
   3399             return CE;
   3400 
   3401         case SPEC_PROC_TAG:
   3402             {
   3403                 // Special processing is getting a CE that is preceded by a certain prefix
   3404                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   3405                 // When we encouter a special processing tag, we go backwards and try to see if
   3406                 // we have a match.
   3407                 // Contraction tables are used - so the whole process is not unlike contraction.
   3408                 // prefix data is stored backwards in the table.
   3409                 const UChar *UCharOffset;
   3410                 UChar schar, tchar;
   3411                 collIterateState prefixState;
   3412                 backupState(source, &prefixState);
   3413                 for(;;) {
   3414                     // This loop will run once per source string character, for as long as we
   3415                     //  are matching a potential contraction sequence
   3416 
   3417                     // First we position ourselves at the begining of contraction sequence
   3418                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   3419 
   3420                     if (collIter_bos(source)) {
   3421                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   3422                         break;
   3423                     }
   3424                     schar = getPrevNormalizedChar(source, status);
   3425                     goBackOne(source);
   3426 
   3427                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   3428                         UCharOffset++;
   3429                     }
   3430 
   3431                     if (schar == tchar) {
   3432                         // Found the source string char in the table.
   3433                         //  Pick up the corresponding CE from the table.
   3434                         CE = *(coll->contractionCEs +
   3435                             (UCharOffset - coll->contractionIndex));
   3436                     }
   3437                     else
   3438                     {
   3439                         // if there is a completely ignorable code point in the middle of
   3440                         // a prefix, we need to act as if it's not there
   3441                         // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
   3442                         // lone surrogates cannot be set to zero as it would break other processing
   3443                         uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   3444                         // it's easy for BMP code points
   3445                         if(isZeroCE == 0) {
   3446                             continue;
   3447                         } else if(U16_IS_SURROGATE(schar)) {
   3448                             // for supplementary code points, we have to check the next one
   3449                             // situations where we are going to ignore
   3450                             // 1. beginning of the string: schar is a lone surrogate
   3451                             // 2. schar is a lone surrogate
   3452                             // 3. schar is a trail surrogate in a valid surrogate sequence
   3453                             //    that is explicitly set to zero.
   3454                             if (!collIter_bos(source)) {
   3455                                 UChar lead;
   3456                                 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
   3457                                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
   3458                                     if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
   3459                                         uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
   3460                                         if(finalCE == 0) {
   3461                                             // this is a real, assigned completely ignorable code point
   3462                                             goBackOne(source);
   3463                                             continue;
   3464                                         }
   3465                                     }
   3466                                 } else {
   3467                                     // lone surrogate, treat like unassigned
   3468                                     return UCOL_NOT_FOUND;
   3469                                 }
   3470                             } else {
   3471                                 // lone surrogate at the beggining, treat like unassigned
   3472                                 return UCOL_NOT_FOUND;
   3473                             }
   3474                         }
   3475                         // Source string char was not in the table.
   3476                         //   We have not found the prefix.
   3477                         CE = *(coll->contractionCEs +
   3478                             (ContractionStart - coll->contractionIndex));
   3479                     }
   3480 
   3481                     if(!isPrefix(CE)) {
   3482                         // The source string char was in the contraction table, and the corresponding
   3483                         //   CE is not a prefix CE.  We found the prefix, break
   3484                         //   out of loop, this CE will end up being returned.  This is the normal
   3485                         //   way out of prefix handling when the source actually contained
   3486                         //   the prefix.
   3487                         break;
   3488                     }
   3489                 }
   3490                 loadState(source, &prefixState, TRUE);
   3491                 break;
   3492             }
   3493 
   3494         case CONTRACTION_TAG: {
   3495             /* to ensure that the backwards and forwards iteration matches, we
   3496             take the current region of most possible match and pass it through
   3497             the forward iteration. this will ensure that the obstinate problem of
   3498             overlapping contractions will not occur.
   3499             */
   3500             schar = peekCodeUnit(source, 0);
   3501             constart = (UChar *)coll->image + getContractOffset(CE);
   3502             if (isAtStartPrevIterate(source)
   3503                 /* commented away contraction end checks after adding the checks
   3504                 in getPrevCE  */) {
   3505                     /* start of string or this is not the end of any contraction */
   3506                     CE = *(coll->contractionCEs +
   3507                         (constart - coll->contractionIndex));
   3508                     break;
   3509             }
   3510             strbuffer = buffer;
   3511             UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
   3512             *(UCharOffset --) = 0;
   3513             noChars = 0;
   3514             // have to swap thai characters
   3515             while (ucol_unsafeCP(schar, coll)) {
   3516                 *(UCharOffset) = schar;
   3517                 noChars++;
   3518                 UCharOffset --;
   3519                 schar = getPrevNormalizedChar(source, status);
   3520                 goBackOne(source);
   3521                 // TODO: when we exhaust the contraction buffer,
   3522                 // it needs to get reallocated. The problem is
   3523                 // that the size depends on the string which is
   3524                 // not iterated over. However, since we're travelling
   3525                 // backwards, we already had to set the iterator at
   3526                 // the end - so we might as well know where we are?
   3527                 if (UCharOffset + 1 == buffer) {
   3528                     /* we have exhausted the buffer */
   3529                     int32_t newsize = 0;
   3530                     if(source->pos) { // actually dealing with a position
   3531                         newsize = (int32_t)(source->pos - source->string + 1);
   3532                     } else { // iterator
   3533                         newsize = 4 * UCOL_MAX_BUFFER;
   3534                     }
   3535                     strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
   3536                         (newsize + UCOL_MAX_BUFFER));
   3537                     /* test for NULL */
   3538                     if (strbuffer == NULL) {
   3539                         *status = U_MEMORY_ALLOCATION_ERROR;
   3540                         return UCOL_NO_MORE_CES;
   3541                     }
   3542                     UCharOffset = strbuffer + newsize;
   3543                     uprv_memcpy(UCharOffset, buffer,
   3544                         UCOL_MAX_BUFFER * sizeof(UChar));
   3545                     UCharOffset --;
   3546                 }
   3547                 if ((source->pos && (source->pos == source->string ||
   3548                     ((source->flags & UCOL_ITER_INNORMBUF) &&
   3549                     *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
   3550                     || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
   3551                         break;
   3552                 }
   3553             }
   3554             /* adds the initial base character to the string */
   3555             *(UCharOffset) = schar;
   3556             noChars++;
   3557 
   3558             int32_t offsetBias;
   3559 
   3560             // **** doesn't work if using iterator ****
   3561             if (source->flags & UCOL_ITER_INNORMBUF) {
   3562                 offsetBias = -1;
   3563             } else {
   3564                 offsetBias = (int32_t)(source->pos - source->string);
   3565             }
   3566 
   3567             /* a new collIterate is used to simplify things, since using the current
   3568             collIterate will mean that the forward and backwards iteration will
   3569             share and change the same buffers. we don't want to get into that. */
   3570             collIterate temp;
   3571             int32_t rawOffset;
   3572 
   3573             IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
   3574             if(U_FAILURE(*status)) {
   3575                 return UCOL_NULLORDER;
   3576             }
   3577             temp.flags &= ~UCOL_ITER_NORM;
   3578             temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
   3579 
   3580             rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
   3581             CE = ucol_IGetNextCE(coll, &temp, status);
   3582 
   3583             if (source->extendCEs) {
   3584                 endCEBuffer = source->extendCEs + source->extendCEsSize;
   3585                 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
   3586             } else {
   3587                 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
   3588                 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
   3589             }
   3590 
   3591             while (CE != UCOL_NO_MORE_CES) {
   3592                 *(source->CEpos ++) = CE;
   3593 
   3594                 if (offsetBias >= 0) {
   3595                     source->appendOffset(rawOffset + offsetBias, *status);
   3596                 }
   3597 
   3598                 CECount++;
   3599                 if (source->CEpos == endCEBuffer) {
   3600                     /* ran out of CE space, reallocate to new buffer.
   3601                     If reallocation fails, reset pointers and bail out,
   3602                     there's no guarantee of the right character position after
   3603                     this bail*/
   3604                     if (!increaseCEsCapacity(source)) {
   3605                         *status = U_MEMORY_ALLOCATION_ERROR;
   3606                         break;
   3607                     }
   3608 
   3609                     endCEBuffer = source->extendCEs + source->extendCEsSize;
   3610                 }
   3611 
   3612                 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
   3613                     rawOffset = (int32_t)(temp.fcdPosition - temp.string);
   3614                 } else {
   3615                     rawOffset = (int32_t)(temp.pos - temp.string);
   3616                 }
   3617 
   3618                 CE = ucol_IGetNextCE(coll, &temp, status);
   3619             }
   3620 
   3621             if (strbuffer != buffer) {
   3622                 uprv_free(strbuffer);
   3623             }
   3624             if (U_FAILURE(*status)) {
   3625                 return (uint32_t)UCOL_NULLORDER;
   3626             }
   3627 
   3628             if (source->offsetRepeatValue != 0) {
   3629                 if (CECount > noChars) {
   3630                     source->offsetRepeatCount += temp.offsetRepeatCount;
   3631                 } else {
   3632                     // **** does this really skip the right offsets? ****
   3633                     source->offsetReturn -= (noChars - CECount);
   3634                 }
   3635             }
   3636 
   3637             if (offsetBias >= 0) {
   3638                 source->offsetReturn = source->offsetStore - 1;
   3639                 if (source->offsetReturn == source->offsetBuffer) {
   3640                     source->offsetStore = source->offsetBuffer;
   3641                 }
   3642             }
   3643 
   3644             source->toReturn = source->CEpos - 1;
   3645             if (source->toReturn == source->CEs) {
   3646                 source->CEpos = source->CEs;
   3647             }
   3648 
   3649             return *(source->toReturn);
   3650         }
   3651         case LONG_PRIMARY_TAG:
   3652             {
   3653                 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   3654                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   3655                 source->toReturn = source->CEpos - 1;
   3656 
   3657                 if (source->flags & UCOL_ITER_INNORMBUF) {
   3658                     source->offsetRepeatCount = 1;
   3659                 } else {
   3660                     int32_t firstOffset = (int32_t)(source->pos - source->string);
   3661 
   3662                     source->appendOffset(firstOffset, *status);
   3663                     source->appendOffset(firstOffset + 1, *status);
   3664 
   3665                     source->offsetReturn = source->offsetStore - 1;
   3666                     *(source->offsetBuffer) = firstOffset;
   3667                     if (source->offsetReturn == source->offsetBuffer) {
   3668                         source->offsetStore = source->offsetBuffer;
   3669                     }
   3670                 }
   3671 
   3672 
   3673                 return *(source->toReturn);
   3674             }
   3675 
   3676         case EXPANSION_TAG: /* this tag always returns */
   3677             {
   3678             /*
   3679             This should handle expansion.
   3680             NOTE: we can encounter both continuations and expansions in an expansion!
   3681             I have to decide where continuations are going to be dealt with
   3682             */
   3683             int32_t firstOffset = (int32_t)(source->pos - source->string);
   3684 
   3685             // **** doesn't work if using iterator ****
   3686             if (source->offsetReturn != NULL) {
   3687                 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
   3688                     source->offsetStore = source->offsetBuffer;
   3689                 }else {
   3690                   firstOffset = -1;
   3691                 }
   3692             }
   3693 
   3694             /* find the offset to expansion table */
   3695             CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3696             size     = getExpansionCount(CE);
   3697             if (size != 0) {
   3698                 /*
   3699                 if there are less than 16 elements in expansion, we don't terminate
   3700                 */
   3701                 uint32_t count;
   3702 
   3703                 for (count = 0; count < size; count++) {
   3704                     *(source->CEpos ++) = *CEOffset++;
   3705 
   3706                     if (firstOffset >= 0) {
   3707                         source->appendOffset(firstOffset + 1, *status);
   3708                     }
   3709                 }
   3710             } else {
   3711                 /* else, we do */
   3712                 while (*CEOffset != 0) {
   3713                     *(source->CEpos ++) = *CEOffset ++;
   3714 
   3715                     if (firstOffset >= 0) {
   3716                         source->appendOffset(firstOffset + 1, *status);
   3717                     }
   3718                 }
   3719             }
   3720 
   3721             if (firstOffset >= 0) {
   3722                 source->offsetReturn = source->offsetStore - 1;
   3723                 *(source->offsetBuffer) = firstOffset;
   3724                 if (source->offsetReturn == source->offsetBuffer) {
   3725                     source->offsetStore = source->offsetBuffer;
   3726                 }
   3727             } else {
   3728                 source->offsetRepeatCount += size - 1;
   3729             }
   3730 
   3731             source->toReturn = source->CEpos - 1;
   3732             // in case of one element expansion, we
   3733             // want to immediately return CEpos
   3734             if(source->toReturn == source->CEs) {
   3735                 source->CEpos = source->CEs;
   3736             }
   3737 
   3738             return *(source->toReturn);
   3739             }
   3740 
   3741         case DIGIT_TAG:
   3742             {
   3743                 /*
   3744                 We do a check to see if we want to collate digits as numbers; if so we generate
   3745                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   3746                 */
   3747                 uint32_t i;    /* general counter */
   3748 
   3749                 if (source->coll->numericCollation == UCOL_ON){
   3750                     uint32_t digIndx = 0;
   3751                     uint32_t endIndex = 0;
   3752                     uint32_t leadingZeroIndex = 0;
   3753                     uint32_t trailingZeroCount = 0;
   3754 
   3755                     uint8_t collateVal = 0;
   3756 
   3757                     UBool nonZeroValReached = FALSE;
   3758 
   3759                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
   3760                     /*
   3761                     We parse the source string until we hit a char that's NOT a digit.
   3762                     Use this u_charDigitValue. This might be slow because we have to
   3763                     handle surrogates...
   3764                     */
   3765                     /*
   3766                     We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
   3767                     with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
   3768                     element we process when going backward. To determine how long that chunk might be, we may need to make
   3769                     two passes through the loop that collects digits - one to see how long the string is (and how much is
   3770                     leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
   3771                     more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
   3772                     element chunk after resetting the state to the initialState at the right side of the digit string.
   3773                     */
   3774                     uint32_t ceLimit = 0;
   3775                     UChar initial_ch = ch;
   3776                     collIterateState initialState = {0,0,0,0,0,0,0,0,0};
   3777                     backupState(source, &initialState);
   3778 
   3779                     for(;;) {
   3780                         collIterateState state = {0,0,0,0,0,0,0,0,0};
   3781                         UChar32 char32 = 0;
   3782                         int32_t digVal = 0;
   3783 
   3784                         if (U16_IS_TRAIL (ch)) {
   3785                             if (!collIter_bos(source)){
   3786                                 UChar lead = getPrevNormalizedChar(source, status);
   3787                                 if(U16_IS_LEAD(lead)) {
   3788                                     char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3789                                     goBackOne(source);
   3790                                 } else {
   3791                                     char32 = ch;
   3792                                 }
   3793                             } else {
   3794                                 char32 = ch;
   3795                             }
   3796                         } else {
   3797                             char32 = ch;
   3798                         }
   3799                         digVal = u_charDigitValue(char32);
   3800 
   3801                         for(;;) {
   3802                             // Make sure we have enough space. No longer needed;
   3803                             // at this point the largest value of digIndx when we need to save data in numTempBuf
   3804                             // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
   3805                             // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
   3806 
   3807                             // Skip over trailing zeroes, and keep a count of them.
   3808                             if (digVal != 0)
   3809                                 nonZeroValReached = TRUE;
   3810 
   3811                             if (nonZeroValReached) {
   3812                                 /*
   3813                                 We parse the digit string into base 100 numbers (this fits into a byte).
   3814                                 We only add to the buffer in twos, thus if we are parsing an odd character,
   3815                                 that serves as the 'tens' digit while the if we are parsing an even one, that
   3816                                 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
   3817                                 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
   3818                                 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3819                                 than all the other bytes.
   3820 
   3821                                 Since we're doing in this reverse we want to put the first digit encountered into the
   3822                                 ones place and the second digit encountered into the tens place.
   3823                                 */
   3824 
   3825                                 if ((digIndx + trailingZeroCount) % 2 == 1) {
   3826                                     // High-order digit case (tens place)
   3827                                     collateVal += (uint8_t)(digVal * 10);
   3828 
   3829                                     // We cannot set leadingZeroIndex unless it has been set for the
   3830                                     // low-order digit. Therefore, all we can do for the high-order
   3831                                     // digit is turn it off, never on.
   3832                                     // The only time we will have a high digit without a low is for
   3833                                     // the very first non-zero digit, so no zero check is necessary.
   3834                                     if (collateVal != 0)
   3835                                         leadingZeroIndex = 0;
   3836 
   3837                                     // The first pass through, digIndx may exceed the limit, but in that case
   3838                                     // we no longer care about numTempBuf contents since they will be discarded
   3839                                     if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
   3840                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3841                                     }
   3842                                     collateVal = 0;
   3843                                 } else {
   3844                                     // Low-order digit case (ones place)
   3845                                     collateVal = (uint8_t)digVal;
   3846 
   3847                                     // Check for leading zeroes.
   3848                                     if (collateVal == 0) {
   3849                                         if (!leadingZeroIndex)
   3850                                             leadingZeroIndex = (digIndx/2) + 2;
   3851                                     } else
   3852                                         leadingZeroIndex = 0;
   3853 
   3854                                     // No need to write to buffer; the case of a last odd digit
   3855                                     // is handled below.
   3856                                 }
   3857                                 ++digIndx;
   3858                             } else
   3859                                 ++trailingZeroCount;
   3860 
   3861                             if (!collIter_bos(source)) {
   3862                                 ch = getPrevNormalizedChar(source, status);
   3863                                 //goBackOne(source);
   3864                                 if (U16_IS_TRAIL(ch)) {
   3865                                     backupState(source, &state);
   3866                                     if (!collIter_bos(source)) {
   3867                                         goBackOne(source);
   3868                                         UChar lead = getPrevNormalizedChar(source, status);
   3869 
   3870                                         if(U16_IS_LEAD(lead)) {
   3871                                             char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3872                                         } else {
   3873                                             loadState(source, &state, FALSE);
   3874                                             char32 = ch;
   3875                                         }
   3876                                     }
   3877                                 } else
   3878                                     char32 = ch;
   3879 
   3880                                 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
   3881                                     if (char32 > 0xFFFF) {// For surrogates.
   3882                                         loadState(source, &state, FALSE);
   3883                                     }
   3884                                     // Don't need to "reverse" the goBackOne call,
   3885                                     // as this points to the next position to process..
   3886                                     //if (char32 > 0xFFFF) // For surrogates.
   3887                                     //getNextNormalizedChar(source);
   3888                                     break;
   3889                                 }
   3890 
   3891                                 goBackOne(source);
   3892                             }else
   3893                                 break;
   3894                         }
   3895 
   3896                         if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
   3897                             // our collation element is not too big, go ahead and finish with it
   3898                             break;
   3899                         }
   3900                         // our digit string is too long for a collation element;
   3901                         // set the limit for it, reset the state and begin again
   3902                         ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
   3903                         if ( ceLimit == 0 ) {
   3904                             ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
   3905                         }
   3906                         ch = initial_ch;
   3907                         loadState(source, &initialState, FALSE);
   3908                         digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
   3909                         collateVal = 0;
   3910                         nonZeroValReached = FALSE;
   3911                     }
   3912 
   3913                     if (! nonZeroValReached) {
   3914                         digIndx = 2;
   3915                         trailingZeroCount = 0;
   3916                         numTempBuf[2] = 6;
   3917                     }
   3918 
   3919                     if ((digIndx + trailingZeroCount) % 2 != 0) {
   3920                         numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
   3921                         digIndx += 1;       // The implicit leading zero
   3922                     }
   3923                     if (trailingZeroCount % 2 != 0) {
   3924                         // We had to consume one trailing zero for the low digit
   3925                         // of the least significant byte
   3926                         digIndx += 1;       // The trailing zero not in the exponent
   3927                         trailingZeroCount -= 1;
   3928                     }
   3929 
   3930                     endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
   3931 
   3932                     // Subtract one off of the last byte. Really the first byte here, but it's reversed...
   3933                     numTempBuf[2] -= 1;
   3934 
   3935                     /*
   3936                     We want to skip over the first two slots in the buffer. The first slot
   3937                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3938                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3939                     The exponent must be adjusted by the number of leading zeroes, and the number of
   3940                     trailing zeroes.
   3941                     */
   3942                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3943                     uint32_t exponent = (digIndx+trailingZeroCount)/2;
   3944                     if (leadingZeroIndex)
   3945                         exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
   3946                     numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
   3947 
   3948                     // Now transfer the collation key to our collIterate struct.
   3949                     // The total size for our collation key is half of endIndex, rounded up.
   3950                     int32_t size = (endIndex+1)/2;
   3951                     if(!ensureCEsCapacity(source, size)) {
   3952                         return UCOL_NULLORDER;
   3953                     }
   3954                     *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3955                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3956                         UCOL_BYTE_COMMON; // Tertiary weight.
   3957                     i = endIndex - 1; // Reset the index into the buffer.
   3958                     while(i >= 2) {
   3959                         uint32_t primWeight = numTempBuf[i--] << 8;
   3960                         if ( i >= 2)
   3961                             primWeight |= numTempBuf[i--];
   3962                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3963                     }
   3964 
   3965                     source->toReturn = source->CEpos -1;
   3966                     return *(source->toReturn);
   3967                 } else {
   3968                     CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3969                     CE = *(CEOffset++);
   3970                     break;
   3971                 }
   3972             }
   3973 
   3974         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3975             {
   3976                 static const uint32_t
   3977                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3978                 //const uint32_t LCount = 19;
   3979                 static const uint32_t VCount = 21;
   3980                 static const uint32_t TCount = 28;
   3981                 //const uint32_t NCount = VCount * TCount;   /* 588 */
   3982                 //const uint32_t SCount = LCount * NCount;   /* 11172 */
   3983 
   3984                 uint32_t L = ch - SBase;
   3985                 /*
   3986                 divide into pieces.
   3987                 we do it in this order since some compilers can do % and / in one
   3988                 operation
   3989                 */
   3990                 uint32_t T = L % TCount;
   3991                 L /= TCount;
   3992                 uint32_t V = L % VCount;
   3993                 L /= VCount;
   3994 
   3995                 /* offset them */
   3996                 L += LBase;
   3997                 V += VBase;
   3998                 T += TBase;
   3999 
   4000                 int32_t firstOffset = (int32_t)(source->pos - source->string);
   4001                 source->appendOffset(firstOffset, *status);
   4002 
   4003                 /*
   4004                  * return the first CE, but first put the rest into the expansion buffer
   4005                  */
   4006                 if (!source->coll->image->jamoSpecial) {
   4007                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   4008                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   4009                     source->appendOffset(firstOffset + 1, *status);
   4010 
   4011                     if (T != TBase) {
   4012                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   4013                         source->appendOffset(firstOffset + 1, *status);
   4014                     }
   4015 
   4016                     source->toReturn = source->CEpos - 1;
   4017 
   4018                     source->offsetReturn = source->offsetStore - 1;
   4019                     if (source->offsetReturn == source->offsetBuffer) {
   4020                         source->offsetStore = source->offsetBuffer;
   4021                     }
   4022 
   4023                     return *(source->toReturn);
   4024                 } else {
   4025                     // Since Hanguls pass the FCD check, it is
   4026                     // guaranteed that we won't be in
   4027                     // the normalization buffer if something like this happens
   4028 
   4029                     // Move Jamos into normalization buffer
   4030                     UChar *tempbuffer = source->writableBuffer.getBuffer(5);
   4031                     int32_t tempbufferLength, jamoOffset;
   4032                     tempbuffer[0] = 0;
   4033                     tempbuffer[1] = (UChar)L;
   4034                     tempbuffer[2] = (UChar)V;
   4035                     if (T != TBase) {
   4036                         tempbuffer[3] = (UChar)T;
   4037                         tempbufferLength = 4;
   4038                     } else {
   4039                         tempbufferLength = 3;
   4040                     }
   4041                     source->writableBuffer.releaseBuffer(tempbufferLength);
   4042 
   4043                     // Indicate where to continue in main input string after exhausting the writableBuffer
   4044                     if (source->pos  == source->string) {
   4045                         jamoOffset = 0;
   4046                         source->fcdPosition = NULL;
   4047                     } else {
   4048                         jamoOffset = source->pos - source->string;
   4049                         source->fcdPosition       = source->pos-1;
   4050                     }
   4051 
   4052                     // Append offsets for the additional chars
   4053                     // (not the 0, and not the L whose offsets match the original Hangul)
   4054                     int32_t jamoRemaining = tempbufferLength - 2;
   4055                     jamoOffset++; // appended offsets should match end of original Hangul
   4056                     while (jamoRemaining-- > 0) {
   4057                         source->appendOffset(jamoOffset, *status);
   4058                     }
   4059 
   4060                     source->offsetRepeatValue = jamoOffset;
   4061 
   4062                     source->offsetReturn = source->offsetStore - 1;
   4063                     if (source->offsetReturn == source->offsetBuffer) {
   4064                         source->offsetStore = source->offsetBuffer;
   4065                     }
   4066 
   4067                     source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
   4068                     source->origFlags         = source->flags;
   4069                     source->flags            |= UCOL_ITER_INNORMBUF;
   4070                     source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   4071 
   4072                     return(UCOL_IGNORABLE);
   4073                 }
   4074             }
   4075 
   4076         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   4077             return getPrevImplicit(ch, source);
   4078 
   4079             // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
   4080         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   4081             return getPrevImplicit(ch, source);
   4082 
   4083         case SURROGATE_TAG:  /* This is a surrogate pair */
   4084             /* essentially an engaged lead surrogate. */
   4085             /* if you have encountered it here, it means that a */
   4086             /* broken sequence was encountered and this is an error */
   4087             return UCOL_NOT_FOUND;
   4088 
   4089         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   4090             return UCOL_NOT_FOUND; /* broken surrogate sequence */
   4091 
   4092         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   4093             {
   4094                 UChar32 cp = 0;
   4095                 UChar  prevChar;
   4096                 const UChar *prev;
   4097                 if (isAtStartPrevIterate(source)) {
   4098                     /* we are at the start of the string, wrong place to be at */
   4099                     return UCOL_NOT_FOUND;
   4100                 }
   4101                 if (source->pos != source->writableBuffer.getBuffer()) {
   4102                     prev     = source->pos - 1;
   4103                 } else {
   4104                     prev     = source->fcdPosition;
   4105                 }
   4106                 prevChar = *prev;
   4107 
   4108                 /* Handles Han and Supplementary characters here.*/
   4109                 if (U16_IS_LEAD(prevChar)) {
   4110                     cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   4111                     source->pos = prev;
   4112                 } else {
   4113                     return UCOL_NOT_FOUND; /* like unassigned */
   4114                 }
   4115 
   4116                 return getPrevImplicit(cp, source);
   4117             }
   4118 
   4119             /* UCA is filled with these. Tailorings are NOT_FOUND */
   4120             /* not yet implemented */
   4121         case CHARSET_TAG:  /* this tag always returns */
   4122             /* probably after 1.8 */
   4123             return UCOL_NOT_FOUND;
   4124 
   4125         default:           /* this tag always returns */
   4126             *status = U_INTERNAL_PROGRAM_ERROR;
   4127             CE=0;
   4128             break;
   4129         }
   4130 
   4131         if (CE <= UCOL_NOT_FOUND) {
   4132             break;
   4133         }
   4134     }
   4135 
   4136     return CE;
   4137 }
   4138 
   4139 /* This should really be a macro                                                                      */
   4140 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
   4141 /* secondaries in French                                                                              */
   4142 /*
   4143 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
   4144   uint8_t temp;
   4145   while(start<end) {
   4146     temp = *start;
   4147     *start++ = *end;
   4148     *end-- = temp;
   4149   }
   4150 }
   4151 */
   4152 
   4153 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \
   4154   TYPE tempA; \
   4155 while((start)<(end)) { \
   4156     tempA = *(start); \
   4157     *(start)++ = *(end); \
   4158     *(end)-- = tempA; \
   4159 } \
   4160 }
   4161 
   4162 /****************************************************************************/
   4163 /* Following are the sortkey generation functions                           */
   4164 /*                                                                          */
   4165 /****************************************************************************/
   4166 
   4167 /**
   4168  * Merge two sort keys.
   4169  * This is useful, for example, to combine sort keys from first and last names
   4170  * to sort such pairs.
   4171  * Merged sort keys consider on each collation level the first part first entirely,
   4172  * then the second one.
   4173  * It is possible to merge multiple sort keys by consecutively merging
   4174  * another one with the intermediate result.
   4175  *
   4176  * The length of the merge result is the sum of the lengths of the input sort keys
   4177  * minus 1.
   4178  *
   4179  * @param src1 the first sort key
   4180  * @param src1Length the length of the first sort key, including the zero byte at the end;
   4181  *        can be -1 if the function is to find the length
   4182  * @param src2 the second sort key
   4183  * @param src2Length the length of the second sort key, including the zero byte at the end;
   4184  *        can be -1 if the function is to find the length
   4185  * @param dest the buffer where the merged sort key is written,
   4186  *        can be NULL if destCapacity==0
   4187  * @param destCapacity the number of bytes in the dest buffer
   4188  * @return the length of the merged sort key, src1Length+src2Length-1;
   4189  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
   4190  *         in which cases the contents of dest is undefined
   4191  *
   4192  * @draft
   4193  */
   4194 U_CAPI int32_t U_EXPORT2
   4195 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
   4196                    const uint8_t *src2, int32_t src2Length,
   4197                    uint8_t *dest, int32_t destCapacity) {
   4198     int32_t destLength;
   4199     uint8_t b;
   4200 
   4201     /* check arguments */
   4202     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
   4203         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
   4204         destCapacity<0 || (destCapacity>0 && dest==NULL)
   4205     ) {
   4206         /* error, attempt to write a zero byte and return 0 */
   4207         if(dest!=NULL && destCapacity>0) {
   4208             *dest=0;
   4209         }
   4210         return 0;
   4211     }
   4212 
   4213     /* check lengths and capacity */
   4214     if(src1Length<0) {
   4215         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
   4216     }
   4217     if(src2Length<0) {
   4218         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
   4219     }
   4220 
   4221     destLength=src1Length+src2Length-1;
   4222     if(destLength>destCapacity) {
   4223         /* the merged sort key does not fit into the destination */
   4224         return destLength;
   4225     }
   4226 
   4227     /* merge the sort keys with the same number of levels */
   4228     while(*src1!=0 && *src2!=0) { /* while both have another level */
   4229         /* copy level from src1 not including 00 or 01 */
   4230         while((b=*src1)>=2) {
   4231             ++src1;
   4232             *dest++=b;
   4233         }
   4234 
   4235         /* add a 02 merge separator */
   4236         *dest++=2;
   4237 
   4238         /* copy level from src2 not including 00 or 01 */
   4239         while((b=*src2)>=2) {
   4240             ++src2;
   4241             *dest++=b;
   4242         }
   4243 
   4244         /* if both sort keys have another level, then add a 01 level separator and continue */
   4245         if(*src1==1 && *src2==1) {
   4246             ++src1;
   4247             ++src2;
   4248             *dest++=1;
   4249         }
   4250     }
   4251 
   4252     /*
   4253      * here, at least one sort key is finished now, but the other one
   4254      * might have some contents left from containing more levels;
   4255      * that contents is just appended to the result
   4256      */
   4257     if(*src1!=0) {
   4258         /* src1 is not finished, therefore *src2==0, and src1 is appended */
   4259         src2=src1;
   4260     }
   4261     /* append src2, "the other, unfinished sort key" */
   4262     uprv_strcpy((char *)dest, (const char *)src2);
   4263 
   4264     /* trust that neither sort key contained illegally embedded zero bytes */
   4265     return destLength;
   4266 }
   4267 
   4268 U_NAMESPACE_BEGIN
   4269 
   4270 class SortKeyByteSink : public ByteSink {
   4271 public:
   4272     static const uint32_t FILL_ORIGINAL_BUFFER = 1;
   4273     static const uint32_t DONT_GROW = 2;
   4274     SortKeyByteSink(char *dest, int32_t destCapacity, uint32_t flags=0)
   4275             : ownedBuffer_(NULL), buffer_(dest), capacity_(destCapacity),
   4276               appended_(0),
   4277               fill_(flags & FILL_ORIGINAL_BUFFER),
   4278               grow_((flags & DONT_GROW) == 0) {
   4279         if (buffer_ == NULL || capacity_ < 0) {
   4280             buffer_ = reinterpret_cast<char *>(&lastResortByte_);
   4281             capacity_ = 0;
   4282         }
   4283     }
   4284     virtual ~SortKeyByteSink() { uprv_free(ownedBuffer_); }
   4285 
   4286     virtual void Append(const char *bytes, int32_t n);
   4287     void Append(const uint8_t *bytes, int32_t n) { Append(reinterpret_cast<const char *>(bytes), n); }
   4288     void Append(uint8_t b) {
   4289         if (appended_ < capacity_) {
   4290             buffer_[appended_++] = (char)b;
   4291         } else {
   4292             Append(&b, 1);
   4293         }
   4294     }
   4295     void Append(uint8_t b1, uint8_t b2) {
   4296         int32_t a2 = appended_ + 2;
   4297         if (a2 <= capacity_) {
   4298             buffer_[appended_] = (char)b1;
   4299             buffer_[appended_ + 1] = (char)b2;
   4300             appended_ = a2;
   4301         } else {
   4302             char bytes[2] = { (char)b1, (char)b2 };
   4303             Append(bytes, 2);
   4304         }
   4305     }
   4306     void Append(const SortKeyByteSink &other) { Append(other.buffer_, other.appended_); }
   4307     virtual char *GetAppendBuffer(int32_t min_capacity,
   4308                                   int32_t desired_capacity_hint,
   4309                                   char *scratch, int32_t scratch_capacity,
   4310                                   int32_t *result_capacity);
   4311     int32_t NumberOfBytesAppended() const { return appended_; }
   4312     uint8_t &LastByte() {
   4313         if (buffer_ != NULL && appended_ > 0) {
   4314             return reinterpret_cast<uint8_t *>(buffer_)[appended_ - 1];
   4315         } else {
   4316             return lastResortByte_;
   4317         }
   4318     }
   4319     uint8_t *GetLastFewBytes(int32_t n) {
   4320         if (buffer_ != NULL && appended_ >= n) {
   4321             return reinterpret_cast<uint8_t *>(buffer_) + appended_ - n;
   4322         } else {
   4323             return NULL;
   4324         }
   4325     }
   4326     char *GetBuffer() { return buffer_; }
   4327     uint8_t *GetUnsignedBuffer() { return reinterpret_cast<uint8_t *>(buffer_); }
   4328     uint8_t *OrphanUnsignedBuffer(int32_t &orphanedCapacity);
   4329     UBool IsOk() const { return buffer_ != NULL; }  // otherwise out-of-memory
   4330 
   4331 private:
   4332     SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
   4333     SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
   4334 
   4335     UBool Resize(int32_t appendCapacity, int32_t length);
   4336     void SetNotOk() {
   4337         buffer_ = NULL;
   4338         capacity_ = 0;
   4339     }
   4340 
   4341     static uint8_t lastResortByte_;  // last-resort return value from LastByte()
   4342 
   4343     char *ownedBuffer_;
   4344     char *buffer_;
   4345     int32_t capacity_;
   4346     int32_t appended_;
   4347     UBool fill_;
   4348     UBool grow_;
   4349 };
   4350 
   4351 uint8_t SortKeyByteSink::lastResortByte_ = 0;
   4352 
   4353 void
   4354 SortKeyByteSink::Append(const char *bytes, int32_t n) {
   4355     if (n <= 0) {
   4356         return;
   4357     }
   4358     int32_t length = appended_;
   4359     appended_ += n;
   4360     if ((buffer_ + length) == bytes) {
   4361         return;  // the caller used GetAppendBuffer() and wrote the bytes already
   4362     }
   4363     if (buffer_ == NULL) {
   4364         return;  // allocation failed before already
   4365     }
   4366     int32_t available = capacity_ - length;
   4367     if (bytes == NULL) {
   4368         // assume that the caller failed to allocate memory
   4369         if (fill_) {
   4370             if (n > available) {
   4371                 n = available;
   4372             }
   4373             uprv_memset(buffer_, 0, n);
   4374         }
   4375         SetNotOk();  // propagate the out-of-memory error
   4376         return;
   4377     }
   4378     if (n > available) {
   4379         if (fill_ && available > 0) {
   4380             // Fill the original buffer completely.
   4381             uprv_memcpy(buffer_ + length, bytes, available);
   4382             bytes += available;
   4383             length += available;
   4384             n -= available;
   4385             available = 0;
   4386         }
   4387         fill_ = FALSE;
   4388         if (!Resize(n, length)) {
   4389             SetNotOk();
   4390             return;
   4391         }
   4392     }
   4393     uprv_memcpy(buffer_ + length, bytes, n);
   4394 }
   4395 
   4396 char *
   4397 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
   4398                                  int32_t desired_capacity_hint,
   4399                                  char *scratch,
   4400                                  int32_t scratch_capacity,
   4401                                  int32_t *result_capacity) {
   4402     if (min_capacity < 1 || scratch_capacity < min_capacity) {
   4403         *result_capacity = 0;
   4404         return NULL;
   4405     }
   4406     int32_t available = capacity_ - appended_;
   4407     if (available >= min_capacity) {
   4408         *result_capacity = available;
   4409         return buffer_ + appended_;
   4410     } else if (Resize(desired_capacity_hint, appended_)) {
   4411         *result_capacity = capacity_ - appended_;
   4412         return buffer_ + appended_;
   4413     } else {
   4414         *result_capacity = scratch_capacity;
   4415         return scratch;
   4416     }
   4417 }
   4418 
   4419 UBool
   4420 SortKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
   4421     if (!grow_) {
   4422         return FALSE;
   4423     }
   4424     int32_t newCapacity = 2 * capacity_;
   4425     int32_t altCapacity = length + 2 * appendCapacity;
   4426     if (newCapacity < altCapacity) {
   4427         newCapacity = altCapacity;
   4428     }
   4429     if (newCapacity < 1024) {
   4430         newCapacity = 1024;
   4431     }
   4432     char *newBuffer = (char *)uprv_malloc(newCapacity);
   4433     if (newBuffer == NULL) {
   4434         return FALSE;
   4435     }
   4436     uprv_memcpy(newBuffer, buffer_, length);
   4437     uprv_free(ownedBuffer_);
   4438     ownedBuffer_ = buffer_ = newBuffer;
   4439     capacity_ = newCapacity;
   4440     return TRUE;
   4441 }
   4442 
   4443 uint8_t *
   4444 SortKeyByteSink::OrphanUnsignedBuffer(int32_t &orphanedCapacity) {
   4445     if (buffer_ == NULL || appended_ == 0) {
   4446         orphanedCapacity = 0;
   4447         return NULL;
   4448     }
   4449     if (ownedBuffer_ != NULL) {
   4450         // orphan & forget the ownedBuffer_
   4451         uint8_t *returnBuffer = reinterpret_cast<uint8_t *>(ownedBuffer_);
   4452         ownedBuffer_ = buffer_ = NULL;
   4453         orphanedCapacity = capacity_;
   4454         capacity_ = appended_ = 0;
   4455         return returnBuffer;
   4456     }
   4457     // clone the buffer_
   4458     uint8_t *newBuffer = (uint8_t *)uprv_malloc(appended_);
   4459     if (newBuffer == NULL) {
   4460         orphanedCapacity = 0;
   4461         return NULL;
   4462     }
   4463     uprv_memcpy(newBuffer, buffer_, appended_);
   4464     orphanedCapacity = appended_;
   4465     return newBuffer;
   4466 }
   4467 
   4468 U_NAMESPACE_END
   4469 
   4470 /* sortkey API */
   4471 U_CAPI int32_t U_EXPORT2
   4472 ucol_getSortKey(const    UCollator    *coll,
   4473         const    UChar        *source,
   4474         int32_t        sourceLength,
   4475         uint8_t        *result,
   4476         int32_t        resultLength)
   4477 {
   4478     UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
   4479     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   4480         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
   4481             ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
   4482     }
   4483 
   4484     UErrorCode status = U_ZERO_ERROR;
   4485     int32_t keySize   = 0;
   4486 
   4487     if(source != NULL) {
   4488         // source == NULL is actually an error situation, but we would need to
   4489         // have an error code to return it. Until we introduce a new
   4490         // API, it stays like this
   4491 
   4492         /* this uses the function pointer that is set in updateinternalstate */
   4493         /* currently, there are two funcs: */
   4494         /*ucol_calcSortKey(...);*/
   4495         /*ucol_calcSortKeySimpleTertiary(...);*/
   4496 
   4497         SortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength,
   4498                              SortKeyByteSink::FILL_ORIGINAL_BUFFER | SortKeyByteSink::DONT_GROW);
   4499         coll->sortKeyGen(coll, source, sourceLength, sink, &status);
   4500         keySize = sink.NumberOfBytesAppended();
   4501     }
   4502     UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
   4503     UTRACE_EXIT_STATUS(status);
   4504     return keySize;
   4505 }
   4506 
   4507 /* this function is called by the C++ API for sortkey generation */
   4508 U_CFUNC int32_t
   4509 ucol_getSortKeyWithAllocation(const UCollator *coll,
   4510                               const UChar *source, int32_t sourceLength,
   4511                               uint8_t *&result, int32_t &resultCapacity,
   4512                               UErrorCode *pErrorCode) {
   4513     SortKeyByteSink sink(reinterpret_cast<char *>(result), resultCapacity);
   4514     coll->sortKeyGen(coll, source, sourceLength, sink, pErrorCode);
   4515     int32_t resultLen = sink.NumberOfBytesAppended();
   4516     if (U_SUCCESS(*pErrorCode)) {
   4517         if (!sink.IsOk()) {
   4518             *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
   4519         } else if (result != sink.GetUnsignedBuffer()) {
   4520             result = sink.OrphanUnsignedBuffer(resultCapacity);
   4521         }
   4522     }
   4523     return resultLen;
   4524 }
   4525 
   4526 // Is this primary weight compressible?
   4527 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
   4528 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
   4529 static inline UBool
   4530 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
   4531     return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
   4532 }
   4533 
   4534 static
   4535 inline void doCaseShift(SortKeyByteSink &cases, uint32_t &caseShift) {
   4536     if (caseShift  == 0) {
   4537         cases.Append(UCOL_CASE_BYTE_START);
   4538         caseShift = UCOL_CASE_SHIFT_START;
   4539     }
   4540 }
   4541 
   4542 // Packs the secondary buffer when processing French locale.
   4543 static void
   4544 packFrench(uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
   4545     secondaries += secsize;  // We read the secondary-level bytes back to front.
   4546     uint8_t secondary;
   4547     int32_t count2 = 0;
   4548     int32_t i = 0;
   4549     // we use i here since the key size already accounts for terminators, so we'll discard the increment
   4550     for(i = 0; i<secsize; i++) {
   4551         secondary = *(secondaries-i-1);
   4552         /* This is compression code. */
   4553         if (secondary == UCOL_COMMON2) {
   4554             ++count2;
   4555         } else {
   4556             if (count2 > 0) {
   4557                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4558                     while (count2 > UCOL_TOP_COUNT2) {
   4559                         result.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
   4560                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4561                     }
   4562                     result.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
   4563                 } else {
   4564                     while (count2 > UCOL_BOT_COUNT2) {
   4565                         result.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   4566                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4567                     }
   4568                     result.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   4569                 }
   4570                 count2 = 0;
   4571             }
   4572             result.Append(secondary);
   4573         }
   4574     }
   4575     if (count2 > 0) {
   4576         while (count2 > UCOL_BOT_COUNT2) {
   4577             result.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   4578             count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4579         }
   4580         result.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   4581     }
   4582 }
   4583 
   4584 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
   4585 
   4586 /* This is the sortkey work horse function */
   4587 U_CFUNC void U_CALLCONV
   4588 ucol_calcSortKey(const    UCollator    *coll,
   4589         const    UChar        *source,
   4590         int32_t        sourceLength,
   4591         SortKeyByteSink &result,
   4592         UErrorCode *status)
   4593 {
   4594     if(U_FAILURE(*status)) {
   4595         return;
   4596     }
   4597 
   4598     /* Stack allocated buffers for buffers we use */
   4599     char second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
   4600     char caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
   4601 
   4602     SortKeyByteSink &primaries = result;
   4603     SortKeyByteSink secondaries(second, LENGTHOF(second));
   4604     SortKeyByteSink tertiaries(tert, LENGTHOF(tert));
   4605     SortKeyByteSink cases(caseB, LENGTHOF(caseB));
   4606     SortKeyByteSink quads(quad, LENGTHOF(quad));
   4607 
   4608     UnicodeString normSource;
   4609 
   4610     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
   4611 
   4612     UColAttributeValue strength = coll->strength;
   4613 
   4614     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
   4615     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
   4616     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
   4617     UBool  compareIdent = (strength == UCOL_IDENTICAL);
   4618     UBool  doCase = (coll->caseLevel == UCOL_ON);
   4619     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
   4620     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
   4621     //UBool  qShifted = shifted && (compareQuad == 0);
   4622     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
   4623 
   4624     uint32_t variableTopValue = coll->variableTopValue;
   4625     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
   4626     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
   4627     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
   4628     uint8_t UCOL_HIRAGANA_QUAD = 0;
   4629     if(doHiragana) {
   4630         UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
   4631         /* allocate one more space for hiragana, value for hiragana */
   4632     }
   4633     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
   4634 
   4635     /* support for special features like caselevel and funky secondaries */
   4636     int32_t lastSecondaryLength = 0;
   4637     uint32_t caseShift = 0;
   4638 
   4639     /* If we need to normalize, we'll do it all at once at the beginning! */
   4640     const Normalizer2 *norm2;
   4641     if(compareIdent) {
   4642         norm2 = Normalizer2Factory::getNFDInstance(*status);
   4643     } else if(coll->normalizationMode != UCOL_OFF) {
   4644         norm2 = Normalizer2Factory::getFCDInstance(*status);
   4645     } else {
   4646         norm2 = NULL;
   4647     }
   4648     if(norm2 != NULL) {
   4649         normSource.setTo(FALSE, source, len);
   4650         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   4651         if(qcYesLength != len) {
   4652             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   4653             normSource.truncate(qcYesLength);
   4654             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   4655             source = normSource.getBuffer();
   4656             len = normSource.length();
   4657         }
   4658     }
   4659     collIterate s;
   4660     IInit_collIterate(coll, source, len, &s, status);
   4661     if(U_FAILURE(*status)) {
   4662         return;
   4663     }
   4664     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
   4665 
   4666     uint32_t order = 0;
   4667 
   4668     uint8_t primary1 = 0;
   4669     uint8_t primary2 = 0;
   4670     uint8_t secondary = 0;
   4671     uint8_t tertiary = 0;
   4672     uint8_t caseSwitch = coll->caseSwitch;
   4673     uint8_t tertiaryMask = coll->tertiaryMask;
   4674     int8_t tertiaryAddition = coll->tertiaryAddition;
   4675     uint8_t tertiaryTop = coll->tertiaryTop;
   4676     uint8_t tertiaryBottom = coll->tertiaryBottom;
   4677     uint8_t tertiaryCommon = coll->tertiaryCommon;
   4678     uint8_t caseBits = 0;
   4679 
   4680     UBool wasShifted = FALSE;
   4681     UBool notIsContinuation = FALSE;
   4682 
   4683     uint32_t count2 = 0, count3 = 0, count4 = 0;
   4684     uint8_t leadPrimary = 0;
   4685 
   4686     for(;;) {
   4687         order = ucol_IGetNextCE(coll, &s, status);
   4688         if(order == UCOL_NO_MORE_CES) {
   4689             break;
   4690         }
   4691 
   4692         if(order == 0) {
   4693             continue;
   4694         }
   4695 
   4696         notIsContinuation = !isContinuation(order);
   4697 
   4698         if(notIsContinuation) {
   4699             tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
   4700         } else {
   4701             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   4702         }
   4703 
   4704         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4705         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4706         primary1 = (uint8_t)(order >> 8);
   4707 
   4708         uint8_t originalPrimary1 = primary1;
   4709         if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
   4710             primary1 = coll->leadBytePermutationTable[primary1];
   4711         }
   4712 
   4713         if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
   4714                         || (!notIsContinuation && wasShifted)))
   4715             || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   4716         {
   4717             /* and other ignorables should be removed if following a shifted code point */
   4718             if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
   4719                 /* we should just completely ignore it */
   4720                 continue;
   4721             }
   4722             if(compareQuad == 0) {
   4723                 if(count4 > 0) {
   4724                     while (count4 > UCOL_BOT_COUNT4) {
   4725                         quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4));
   4726                         count4 -= UCOL_BOT_COUNT4;
   4727                     }
   4728                     quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1)));
   4729                     count4 = 0;
   4730                 }
   4731                 /* We are dealing with a variable and we're treating them as shifted */
   4732                 /* This is a shifted ignorable */
   4733                 if(primary1 != 0) { /* we need to check this since we could be in continuation */
   4734                     quads.Append(primary1);
   4735                 }
   4736                 if(primary2 != 0) {
   4737                     quads.Append(primary2);
   4738                 }
   4739             }
   4740             wasShifted = TRUE;
   4741         } else {
   4742             wasShifted = FALSE;
   4743             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   4744             /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   4745             /* regular and simple sortkey calc */
   4746             if(primary1 != UCOL_IGNORABLE) {
   4747                 if(notIsContinuation) {
   4748                     if(leadPrimary == primary1) {
   4749                         primaries.Append(primary2);
   4750                     } else {
   4751                         if(leadPrimary != 0) {
   4752                             primaries.Append((uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN));
   4753                         }
   4754                         if(primary2 == UCOL_IGNORABLE) {
   4755                             /* one byter, not compressed */
   4756                             primaries.Append(primary1);
   4757                             leadPrimary = 0;
   4758                         } else if(isCompressible(coll, originalPrimary1)) {
   4759                             /* compress */
   4760                             primaries.Append(leadPrimary = primary1, primary2);
   4761                         } else {
   4762                             leadPrimary = 0;
   4763                             primaries.Append(primary1, primary2);
   4764                         }
   4765                     }
   4766                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   4767                     if(primary2 == UCOL_IGNORABLE) {
   4768                         primaries.Append(primary1);
   4769                     } else {
   4770                         primaries.Append(primary1, primary2);
   4771                     }
   4772                 }
   4773             }
   4774 
   4775             if(secondary > compareSec) {
   4776                 if(!isFrenchSec) {
   4777                     /* This is compression code. */
   4778                     if (secondary == UCOL_COMMON2 && notIsContinuation) {
   4779                         ++count2;
   4780                     } else {
   4781                         if (count2 > 0) {
   4782                             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4783                                 while (count2 > UCOL_TOP_COUNT2) {
   4784                                     secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
   4785                                     count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4786                                 }
   4787                                 secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
   4788                             } else {
   4789                                 while (count2 > UCOL_BOT_COUNT2) {
   4790                                     secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   4791                                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4792                                 }
   4793                                 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   4794                             }
   4795                             count2 = 0;
   4796                         }
   4797                         secondaries.Append(secondary);
   4798                     }
   4799                 } else {
   4800                     /* Do the special handling for French secondaries */
   4801                     /* We need to get continuation elements and do intermediate restore */
   4802                     /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
   4803                     if(notIsContinuation) {
   4804                         if (lastSecondaryLength > 1) {
   4805                             uint8_t *frenchStartPtr = secondaries.GetLastFewBytes(lastSecondaryLength);
   4806                             if (frenchStartPtr != NULL) {
   4807                                 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4808                                 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
   4809                                 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4810                             }
   4811                         }
   4812                         lastSecondaryLength = 1;
   4813                     } else {
   4814                         ++lastSecondaryLength;
   4815                     }
   4816                     secondaries.Append(secondary);
   4817                 }
   4818             }
   4819 
   4820             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
   4821                 // do the case level if we need to do it. We don't want to calculate
   4822                 // case level for primary ignorables if we have only primary strength and case level
   4823                 // otherwise we would break well formedness of CEs
   4824                 doCaseShift(cases, caseShift);
   4825                 if(notIsContinuation) {
   4826                     caseBits = (uint8_t)(tertiary & 0xC0);
   4827 
   4828                     if(tertiary != 0) {
   4829                         if(coll->caseFirst == UCOL_UPPER_FIRST) {
   4830                             if((caseBits & 0xC0) == 0) {
   4831                                 cases.LastByte() |= 1 << (--caseShift);
   4832                             } else {
   4833                                 cases.LastByte() |= 0 << (--caseShift);
   4834                                 /* second bit */
   4835                                 doCaseShift(cases, caseShift);
   4836                                 cases.LastByte() |= ((caseBits>>6)&1) << (--caseShift);
   4837                             }
   4838                         } else {
   4839                             if((caseBits & 0xC0) == 0) {
   4840                                 cases.LastByte() |= 0 << (--caseShift);
   4841                             } else {
   4842                                 cases.LastByte() |= 1 << (--caseShift);
   4843                                 /* second bit */
   4844                                 doCaseShift(cases, caseShift);
   4845                                 cases.LastByte() |= ((caseBits>>7)&1) << (--caseShift);
   4846                             }
   4847                         }
   4848                     }
   4849                 }
   4850             } else {
   4851                 if(notIsContinuation) {
   4852                     tertiary ^= caseSwitch;
   4853                 }
   4854             }
   4855 
   4856             tertiary &= tertiaryMask;
   4857             if(tertiary > compareTer) {
   4858                 /* This is compression code. */
   4859                 /* sequence size check is included in the if clause */
   4860                 if (tertiary == tertiaryCommon && notIsContinuation) {
   4861                     ++count3;
   4862                 } else {
   4863                     if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   4864                         tertiary += tertiaryAddition;
   4865                     } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   4866                         tertiary -= tertiaryAddition;
   4867                     }
   4868                     if (count3 > 0) {
   4869                         if ((tertiary > tertiaryCommon)) {
   4870                             while (count3 > coll->tertiaryTopCount) {
   4871                                 tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
   4872                                 count3 -= (uint32_t)coll->tertiaryTopCount;
   4873                             }
   4874                             tertiaries.Append((uint8_t)(tertiaryTop - (count3-1)));
   4875                         } else {
   4876                             while (count3 > coll->tertiaryBottomCount) {
   4877                                 tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
   4878                                 count3 -= (uint32_t)coll->tertiaryBottomCount;
   4879                             }
   4880                             tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
   4881                         }
   4882                         count3 = 0;
   4883                     }
   4884                     tertiaries.Append(tertiary);
   4885                 }
   4886             }
   4887 
   4888             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
   4889                 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   4890                     if(count4>0) { // Close this part
   4891                         while (count4 > UCOL_BOT_COUNT4) {
   4892                             quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4));
   4893                             count4 -= UCOL_BOT_COUNT4;
   4894                         }
   4895                         quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1)));
   4896                         count4 = 0;
   4897                     }
   4898                     quads.Append(UCOL_HIRAGANA_QUAD); // Add the Hiragana
   4899                 } else { // This wasn't Hiragana, so we can continue adding stuff
   4900                     count4++;
   4901                 }
   4902             }
   4903         }
   4904     }
   4905 
   4906     /* Here, we are generally done with processing */
   4907     /* bailing out would not be too productive */
   4908 
   4909     if(U_SUCCESS(*status)) {
   4910         /* we have done all the CE's, now let's put them together to form a key */
   4911         if(compareSec == 0) {
   4912             if (count2 > 0) {
   4913                 while (count2 > UCOL_BOT_COUNT2) {
   4914                     secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   4915                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4916                 }
   4917                 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   4918             }
   4919             result.Append(UCOL_LEVELTERMINATOR);
   4920             if(!isFrenchSec || !secondaries.IsOk()) {
   4921                 result.Append(secondaries);
   4922             } else {
   4923                 // If there are any unresolved continuation secondaries,
   4924                 // reverse them here so that we can reverse the whole secondary thing.
   4925                 if (lastSecondaryLength > 1) {
   4926                     uint8_t *frenchStartPtr = secondaries.GetLastFewBytes(lastSecondaryLength);
   4927                     if (frenchStartPtr != NULL) {
   4928                         /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4929                         uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
   4930                         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4931                     }
   4932                 }
   4933                 packFrench(secondaries.GetUnsignedBuffer(), secondaries.NumberOfBytesAppended(), result);
   4934             }
   4935         }
   4936 
   4937         if(doCase) {
   4938             result.Append(UCOL_LEVELTERMINATOR);
   4939             result.Append(cases);
   4940         }
   4941 
   4942         if(compareTer == 0) {
   4943             if (count3 > 0) {
   4944                 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
   4945                     while (count3 >= coll->tertiaryTopCount) {
   4946                         tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
   4947                         count3 -= (uint32_t)coll->tertiaryTopCount;
   4948                     }
   4949                     tertiaries.Append((uint8_t)(tertiaryTop - count3));
   4950                 } else {
   4951                     while (count3 > coll->tertiaryBottomCount) {
   4952                         tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
   4953                         count3 -= (uint32_t)coll->tertiaryBottomCount;
   4954                     }
   4955                     tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
   4956                 }
   4957             }
   4958             result.Append(UCOL_LEVELTERMINATOR);
   4959             result.Append(tertiaries);
   4960 
   4961             if(compareQuad == 0/*qShifted == TRUE*/) {
   4962                 if(count4 > 0) {
   4963                     while (count4 > UCOL_BOT_COUNT4) {
   4964                         quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4));
   4965                         count4 -= UCOL_BOT_COUNT4;
   4966                     }
   4967                     quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1)));
   4968                 }
   4969                 result.Append(UCOL_LEVELTERMINATOR);
   4970                 result.Append(quads);
   4971             }
   4972 
   4973             if(compareIdent) {
   4974                 result.Append(UCOL_LEVELTERMINATOR);
   4975                 u_writeIdenticalLevelRun(s.string, len, result);
   4976             }
   4977         }
   4978         result.Append(0);
   4979     }
   4980 
   4981     /* To avoid memory leak, free the offset buffer if necessary. */
   4982     ucol_freeOffsetBuffer(&s);
   4983 }
   4984 
   4985 
   4986 U_CFUNC void U_CALLCONV
   4987 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
   4988         const    UChar        *source,
   4989         int32_t        sourceLength,
   4990         SortKeyByteSink &result,
   4991         UErrorCode *status)
   4992 {
   4993     U_ALIGN_CODE(16);
   4994 
   4995     if(U_FAILURE(*status)) {
   4996         return;
   4997     }
   4998 
   4999     /* Stack allocated buffers for buffers we use */
   5000     char second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
   5001 
   5002     SortKeyByteSink &primaries = result;
   5003     SortKeyByteSink secondaries(second, LENGTHOF(second));
   5004     SortKeyByteSink tertiaries(tert, LENGTHOF(tert));
   5005 
   5006     UnicodeString normSource;
   5007 
   5008     int32_t len =  sourceLength;
   5009 
   5010     /* If we need to normalize, we'll do it all at once at the beginning! */
   5011     if(coll->normalizationMode != UCOL_OFF) {
   5012         normSource.setTo(len < 0, source, len);
   5013         const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
   5014         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   5015         if(qcYesLength != normSource.length()) {
   5016             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   5017             normSource.truncate(qcYesLength);
   5018             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   5019             source = normSource.getBuffer();
   5020             len = normSource.length();
   5021         }
   5022     }
   5023     collIterate s;
   5024     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   5025     if(U_FAILURE(*status)) {
   5026         return;
   5027     }
   5028     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
   5029 
   5030     uint32_t order = 0;
   5031 
   5032     uint8_t primary1 = 0;
   5033     uint8_t primary2 = 0;
   5034     uint8_t secondary = 0;
   5035     uint8_t tertiary = 0;
   5036     uint8_t caseSwitch = coll->caseSwitch;
   5037     uint8_t tertiaryMask = coll->tertiaryMask;
   5038     int8_t tertiaryAddition = coll->tertiaryAddition;
   5039     uint8_t tertiaryTop = coll->tertiaryTop;
   5040     uint8_t tertiaryBottom = coll->tertiaryBottom;
   5041     uint8_t tertiaryCommon = coll->tertiaryCommon;
   5042 
   5043     UBool notIsContinuation = FALSE;
   5044 
   5045     uint32_t count2 = 0, count3 = 0;
   5046     uint8_t leadPrimary = 0;
   5047 
   5048     for(;;) {
   5049         order = ucol_IGetNextCE(coll, &s, status);
   5050 
   5051         if(order == 0) {
   5052             continue;
   5053         }
   5054 
   5055         if(order == UCOL_NO_MORE_CES) {
   5056             break;
   5057         }
   5058 
   5059         notIsContinuation = !isContinuation(order);
   5060 
   5061         if(notIsContinuation) {
   5062             tertiary = (uint8_t)((order & tertiaryMask));
   5063         } else {
   5064             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   5065         }
   5066 
   5067         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5068         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5069         primary1 = (uint8_t)(order >> 8);
   5070 
   5071         uint8_t originalPrimary1 = primary1;
   5072         if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
   5073             primary1 = coll->leadBytePermutationTable[primary1];
   5074         }
   5075 
   5076         /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   5077         /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   5078         /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
   5079         /* regular and simple sortkey calc */
   5080         if(primary1 != UCOL_IGNORABLE) {
   5081             if(notIsContinuation) {
   5082                 if(leadPrimary == primary1) {
   5083                     primaries.Append(primary2);
   5084                 } else {
   5085                     if(leadPrimary != 0) {
   5086                         primaries.Append((uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN));
   5087                     }
   5088                     if(primary2 == UCOL_IGNORABLE) {
   5089                         /* one byter, not compressed */
   5090                         primaries.Append(primary1);
   5091                         leadPrimary = 0;
   5092                     } else if(isCompressible(coll, originalPrimary1)) {
   5093                         /* compress */
   5094                         primaries.Append(leadPrimary = primary1, primary2);
   5095                     } else {
   5096                         leadPrimary = 0;
   5097                         primaries.Append(primary1, primary2);
   5098                     }
   5099                 }
   5100             } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   5101                 if(primary2 == UCOL_IGNORABLE) {
   5102                     primaries.Append(primary1);
   5103                 } else {
   5104                     primaries.Append(primary1, primary2);
   5105                 }
   5106             }
   5107         }
   5108 
   5109         if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
   5110             /* This is compression code. */
   5111             if (secondary == UCOL_COMMON2 && notIsContinuation) {
   5112                 ++count2;
   5113             } else {
   5114                 if (count2 > 0) {
   5115                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   5116                         while (count2 > UCOL_TOP_COUNT2) {
   5117                             secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
   5118                             count2 -= (uint32_t)UCOL_TOP_COUNT2;
   5119                         }
   5120                         secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
   5121                     } else {
   5122                         while (count2 > UCOL_BOT_COUNT2) {
   5123                             secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   5124                             count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5125                         }
   5126                         secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   5127                     }
   5128                     count2 = 0;
   5129                 }
   5130                 secondaries.Append(secondary);
   5131             }
   5132         }
   5133 
   5134         if(notIsContinuation) {
   5135             tertiary ^= caseSwitch;
   5136         }
   5137 
   5138         if(tertiary > 0) {
   5139             /* This is compression code. */
   5140             /* sequence size check is included in the if clause */
   5141             if (tertiary == tertiaryCommon && notIsContinuation) {
   5142                 ++count3;
   5143             } else {
   5144                 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   5145                     tertiary += tertiaryAddition;
   5146                 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   5147                     tertiary -= tertiaryAddition;
   5148                 }
   5149                 if (count3 > 0) {
   5150                     if ((tertiary > tertiaryCommon)) {
   5151                         while (count3 > coll->tertiaryTopCount) {
   5152                             tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
   5153                             count3 -= (uint32_t)coll->tertiaryTopCount;
   5154                         }
   5155                         tertiaries.Append((uint8_t)(tertiaryTop - (count3-1)));
   5156                     } else {
   5157                         while (count3 > coll->tertiaryBottomCount) {
   5158                             tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
   5159                             count3 -= (uint32_t)coll->tertiaryBottomCount;
   5160                         }
   5161                         tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
   5162                     }
   5163                     count3 = 0;
   5164                 }
   5165                 tertiaries.Append(tertiary);
   5166             }
   5167         }
   5168     }
   5169 
   5170     if(U_SUCCESS(*status)) {
   5171         /* we have done all the CE's, now let's put them together to form a key */
   5172         if (count2 > 0) {
   5173             while (count2 > UCOL_BOT_COUNT2) {
   5174                 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
   5175                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5176             }
   5177             secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
   5178         }
   5179         result.Append(UCOL_LEVELTERMINATOR);
   5180         result.Append(secondaries);
   5181 
   5182         if (count3 > 0) {
   5183             if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
   5184                 while (count3 >= coll->tertiaryTopCount) {
   5185                     tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
   5186                     count3 -= (uint32_t)coll->tertiaryTopCount;
   5187                 }
   5188                 tertiaries.Append((uint8_t)(tertiaryTop - count3));
   5189             } else {
   5190                 while (count3 > coll->tertiaryBottomCount) {
   5191                     tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
   5192                     count3 -= (uint32_t)coll->tertiaryBottomCount;
   5193                 }
   5194                 tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
   5195             }
   5196         }
   5197         result.Append(UCOL_LEVELTERMINATOR);
   5198         result.Append(tertiaries);
   5199 
   5200         result.Append(0);
   5201     }
   5202 
   5203     /* To avoid memory leak, free the offset buffer if necessary. */
   5204     ucol_freeOffsetBuffer(&s);
   5205 
   5206     if (U_SUCCESS(*status) && !result.IsOk()) {
   5207         *status = U_BUFFER_OVERFLOW_ERROR;
   5208     }
   5209 }
   5210 
   5211 static inline
   5212 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
   5213     UBool notIsContinuation = !isContinuation(CE);
   5214     uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
   5215     if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
   5216                || (!notIsContinuation && *wasShifted)))
   5217         || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   5218     {
   5219         // The stuff below should probably be in the sortkey code... maybe not...
   5220         if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
   5221             /* we should just completely ignore it */
   5222             *wasShifted = TRUE;
   5223             //continue;
   5224         }
   5225         //*wasShifted = TRUE;
   5226         return TRUE;
   5227     } else {
   5228         *wasShifted = FALSE;
   5229         return FALSE;
   5230     }
   5231 }
   5232 static inline
   5233 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
   5234     if(level < maxLevel) {
   5235         dest[i++] = UCOL_LEVELTERMINATOR;
   5236     } else {
   5237         dest[i++] = 0;
   5238     }
   5239 }
   5240 
   5241 /** enumeration of level identifiers for partial sort key generation */
   5242 enum {
   5243   UCOL_PSK_PRIMARY = 0,
   5244     UCOL_PSK_SECONDARY = 1,
   5245     UCOL_PSK_CASE = 2,
   5246     UCOL_PSK_TERTIARY = 3,
   5247     UCOL_PSK_QUATERNARY = 4,
   5248     UCOL_PSK_QUIN = 5,      /** This is an extra level, not used - but we have three bits to blow */
   5249     UCOL_PSK_IDENTICAL = 6,
   5250     UCOL_PSK_NULL = 7,      /** level for the end of sort key. Will just produce zeros */
   5251     UCOL_PSK_LIMIT
   5252 };
   5253 
   5254 /** Collation state bit fields. A *_SHIFT value is how far to shift state[1]
   5255  *  right to bring the field down to bit 0; the matching *_MASK value is
   5256  *  then ANDed with the shifted state to isolate the field. All of these
   5257  *  fields are stored in state[1].
   5258  */
   5259 enum {
   5260     UCOL_PSK_LEVEL_SHIFT = 0,      /** level identifier: stores one of the UCOL_PSK_* level values (UCOL_PSK_PRIMARY..UCOL_PSK_NULL) */
   5261     UCOL_PSK_LEVEL_MASK = 7,       /** three bits */
   5262     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */
   5263     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,
   5264     /** can be only 0 or 1, since we get up to two bytes from primary or quaternary.
   5265      *  This field is also used to denote that the French secondary level is finished.
   5266      */
   5267     UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */
   5268     UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */
   5269     UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */
   5270     UCOL_PSK_USED_FRENCH_MASK = 3, /** two bits (values 0..3). See comment just below */
   5271     /** When we do French we need to reverse secondary values. However, continuations
   5272      *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
   5273      */
   5274     UCOL_PSK_BOCSU_BYTES_SHIFT = 7, /** bytes of a bocsu sequence already written on the identical level */
   5275     UCOL_PSK_BOCSU_BYTES_MASK = 3,  /** two bits (values 0..3) */
   5276     UCOL_PSK_CONSUMED_CES_SHIFT = 9, /** CEs (code points on the identical level) consumed since the last saved iterator state */
   5277     UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF /** 19 bits; with the shift of 9 this covers bits 9..27 of state[1] */
   5278 };
   5279 
   // Number of expansion CEs still available in the collation iterator `s`,
   // computed as the difference between its CEpos and toReturn members.
   // The expansion is wrapped in parentheses so that the macro acts as a single
   // operand inside larger expressions such as `!uprv_numAvailableExpCEs(s)` or
   // `uprv_numAvailableExpCEs(s) * n`.
   // Note: `s` is evaluated twice; do not pass an expression with side effects.
   #define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
   5282 
   5283 
   5284 /** main sortkey part procedure. On the first call,
   5285  *  you should pass in a collator, an iterator, empty state
   5286  *  state[0] == state[1] == 0, a buffer to hold results
   5287  *  number of bytes you need and an error code pointer.
   5288  *  Make sure your buffer is big enough to hold the wanted
   5289  *  number of sortkey bytes. I don't check.
   5290  *  The only meaningful status you can get back is
   5291  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
   5292  *  have been dealt a raw deal and that you probably won't
   5293  *  be able to use partial sortkey generation for this
   5294  *  particular combination of string and collator. This
   5295  *  is highly unlikely, but you should still check the error code.
   5296  *  Any other status means that you're not in a sane situation
   5297  *  anymore. After the first call, preserve state values and
   5298  *  use them on subsequent calls to obtain more bytes of a sortkey.
   5299  *  Use until the number of bytes written is smaller than the requested
   5300  *  number of bytes. Generated sortkey is not compatible with the
   5301  *  one generated by ucol_getSortKey, as we don't do any compression.
   5302  *  However, levels are still terminated by a 1 (one) and the sortkey
   5303  *  is terminated by a 0 (zero). Identical level is the same as in the
   5304  *  regular sortkey - internal bocu-1 implementation is used.
   5305  *  For the curious, although you cannot do much about this, here is
   5306  *  the structure of the state words.
   5307  *  state[0] - iterator state. Depends on the iterator implementation,
   5308  *             but allows the iterator to continue where it stopped in
   5309  *             the last iteration.
   5310  *  state[1] - collation processing state. Here is the distribution
   5311  *             of the bits:
   5312  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
   5313  *             quaternary, quin (we don't use this one), identical and
   5314  *             null (producing only zeroes - first one to terminate the
   5315  *             sortkey and subsequent to fill the buffer).
   5316  *   3       - byte count. Number of bytes written on the primary level.
   5317  *   4       - was shifted. Whether the previous iteration finished in the
   5318  *             shifted state.
   5319  *   5, 6    - French continuation bytes written. See the comment in the enum
   5320  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
   5321  *             the identical level.
   5322  *   9..31   - CEs consumed. Number of getCE or next32 operations performed
   5323  *             since the last successful update of the iterator state.
   5324  */
   5325 U_CAPI int32_t U_EXPORT2
   5326 ucol_nextSortKeyPart(const UCollator *coll,
   5327                      UCharIterator *iter,
   5328                      uint32_t state[2],
   5329                      uint8_t *dest, int32_t count,
   5330                      UErrorCode *status)
   5331 {
   5332     /* error checking */
   5333     if(status==NULL || U_FAILURE(*status)) {
   5334         return 0;
   5335     }
   5336     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
   5337     if( coll==NULL || iter==NULL ||
   5338         state==NULL ||
   5339         count<0 || (count>0 && dest==NULL)
   5340     ) {
   5341         *status=U_ILLEGAL_ARGUMENT_ERROR;
   5342         UTRACE_EXIT_STATUS(status);
   5343         return 0;
   5344     }
   5345 
   5346     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
   5347                   coll, iter, state[0], state[1], dest, count);
   5348 
   5349     if(count==0) {
   5350         /* nothing to do */
   5351         UTRACE_EXIT_VALUE(0);
   5352         return 0;
   5353     }
   5354     /** Setting up situation according to the state we got from the previous iteration */
   5355     // The state of the iterator from the previous invocation
   5356     uint32_t iterState = state[0];
   5357     // Has the last iteration ended in the shifted state
   5358     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
   5359     // What is the current level of the sortkey?
   5360     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
   5361     // Have we written only one byte from a two byte primary in the previous iteration?
   5362     // Also on secondary level - have we finished with the French secondary?
   5363     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
   5364     // number of bytes in the continuation buffer for French
   5365     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
   5366     // Number of bytes already written from a bocsu sequence. Since
   5367     // the longes bocsu sequence is 4 long, this can be up to 3.
   5368     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
   5369     // Number of elements that need to be consumed in this iteration because
   5370     // the iterator returned UITER_NO_STATE at the end of the last iteration,
   5371     // so we had to save the last valid state.
   5372     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
   5373 
   5374     /** values that depend on the collator attributes */
   5375     // strength of the collator.
   5376     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
   5377     // maximal level of the partial sortkey. Need to take whether case level is done
   5378     int32_t maxLevel = 0;
   5379     if(strength < UCOL_TERTIARY) {
   5380         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   5381             maxLevel = UCOL_PSK_CASE;
   5382         } else {
   5383             maxLevel = strength;
   5384         }
   5385     } else {
   5386         if(strength == UCOL_TERTIARY) {
   5387             maxLevel = UCOL_PSK_TERTIARY;
   5388         } else if(strength == UCOL_QUATERNARY) {
   5389             maxLevel = UCOL_PSK_QUATERNARY;
   5390         } else { // identical
   5391             maxLevel = UCOL_IDENTICAL;
   5392         }
   5393     }
   5394     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
   5395     uint8_t UCOL_HIRAGANA_QUAD =
   5396       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
   5397     // Boundary value that decides whether a CE is shifted or not
   5398     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
   5399     // Are we doing French collation?
   5400     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
   5401 
   5402     /** initializing the collation state */
   5403     UBool notIsContinuation = FALSE;
   5404     uint32_t CE = UCOL_NO_MORE_CES;
   5405 
   5406     collIterate s;
   5407     IInit_collIterate(coll, NULL, -1, &s, status);
   5408     if(U_FAILURE(*status)) {
   5409         UTRACE_EXIT_STATUS(*status);
   5410         return 0;
   5411     }
   5412     s.iterator = iter;
   5413     s.flags |= UCOL_USE_ITERATOR;
   5414     // This variable tells us whether we have produced some other levels in this iteration
   5415     // before we moved to the identical level. In that case, we need to switch the
   5416     // type of the iterator.
   5417     UBool doingIdenticalFromStart = FALSE;
   5418     // Normalizing iterator
   5419     // The division for the array length may truncate the array size to
   5420     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   5421     // for all platforms anyway.
   5422     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   5423     UNormIterator *normIter = NULL;
   5424     // If the normalization is turned on for the collator and we are below identical level
   5425     // we will use a FCD normalizing iterator
   5426     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
   5427         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5428         s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
   5429         s.flags &= ~UCOL_ITER_NORM;
   5430         if(U_FAILURE(*status)) {
   5431             UTRACE_EXIT_STATUS(*status);
   5432             return 0;
   5433         }
   5434     } else if(level == UCOL_PSK_IDENTICAL) {
   5435         // for identical level, we need a NFD iterator. We need to instantiate it here, since we
   5436         // will be updating the state - and this cannot be done on an ordinary iterator.
   5437         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5438         s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   5439         s.flags &= ~UCOL_ITER_NORM;
   5440         if(U_FAILURE(*status)) {
   5441             UTRACE_EXIT_STATUS(*status);
   5442             return 0;
   5443         }
   5444         doingIdenticalFromStart = TRUE;
   5445     }
   5446 
   5447     // This is the tentative new state of the iterator. The problem
   5448     // is that the iterator might return an undefined state, in
   5449     // which case we should save the last valid state and increase
   5450     // the iterator skip value.
   5451     uint32_t newState = 0;
   5452 
   5453     // First, we set the iterator to the last valid position
   5454     // from the last iteration. This was saved in state[0].
   5455     if(iterState == 0) {
   5456         /* initial state */
   5457         if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
   5458             s.iterator->move(s.iterator, 0, UITER_LIMIT);
   5459         } else {
   5460             s.iterator->move(s.iterator, 0, UITER_START);
   5461         }
   5462     } else {
   5463         /* reset to previous state */
   5464         s.iterator->setState(s.iterator, iterState, status);
   5465         if(U_FAILURE(*status)) {
   5466             UTRACE_EXIT_STATUS(*status);
   5467             return 0;
   5468         }
   5469     }
   5470 
   5471 
   5472 
   5473     // This variable tells us whether we can attempt to update the state
   5474     // of iterator. Situations where we don't want to update iterator state
   5475     // are the existence of expansion CEs that are not yet processed, and
   5476     // finishing the case level without enough space in the buffer to insert
   5477     // a level terminator.
   5478     UBool canUpdateState = TRUE;
   5479 
   5480     // Consume all the CEs that were consumed at the end of the previous
   5481     // iteration without updating the iterator state. On identical level,
   5482     // consume the code points.
   5483     int32_t counter = cces;
   5484     if(level < UCOL_PSK_IDENTICAL) {
   5485         while(counter-->0) {
   5486             // If we're doing French and we are on the secondary level,
   5487             // we go backwards.
   5488             if(level == UCOL_PSK_SECONDARY && doingFrench) {
   5489                 CE = ucol_IGetPrevCE(coll, &s, status);
   5490             } else {
   5491                 CE = ucol_IGetNextCE(coll, &s, status);
   5492             }
   5493             if(CE==UCOL_NO_MORE_CES) {
   5494                 /* should not happen */
   5495                 *status=U_INTERNAL_PROGRAM_ERROR;
   5496                 UTRACE_EXIT_STATUS(*status);
   5497                 return 0;
   5498             }
   5499             if(uprv_numAvailableExpCEs(s)) {
   5500                 canUpdateState = FALSE;
   5501             }
   5502         }
   5503     } else {
   5504         while(counter-->0) {
   5505             uiter_next32(s.iterator);
   5506         }
   5507     }
   5508 
   5509     // French secondary needs to know whether the iterator state of zero came from previous level OR
   5510     // from a new invocation...
   5511     UBool wasDoingPrimary = FALSE;
   5512     // destination buffer byte counter. When this guy
   5513     // gets to count, we're done with the iteration
   5514     int32_t i = 0;
   5515     // used to count the zero bytes written after we
   5516     // have finished with the sort key
   5517     int32_t j = 0;
   5518 
   5519 
   5520     // Hm.... I think we're ready to plunge in. Basic story is as following:
   5521     // we have a fall through case based on level. This is used for initial
   5522     // positioning on iteration start. Every level processor contains a
   5523     // for(;;) which will be broken when we exhaust all the CEs. Other
   5524     // way to exit is a goto saveState, which happens when we have filled
   5525     // out our buffer.
   5526     switch(level) {
   5527     case UCOL_PSK_PRIMARY:
   5528         wasDoingPrimary = TRUE;
   5529         for(;;) {
   5530             if(i==count) {
   5531                 goto saveState;
   5532             }
   5533             // We should save the state only if we
   5534             // are sure that we are done with the
   5535             // previous iterator state
   5536             if(canUpdateState && byteCountOrFrenchDone == 0) {
   5537                 newState = s.iterator->getState(s.iterator);
   5538                 if(newState != UITER_NO_STATE) {
   5539                     iterState = newState;
   5540                     cces = 0;
   5541                 }
   5542             }
   5543             CE = ucol_IGetNextCE(coll, &s, status);
   5544             cces++;
   5545             if(CE==UCOL_NO_MORE_CES) {
   5546                 // Add the level separator
   5547                 terminatePSKLevel(level, maxLevel, i, dest);
   5548                 byteCountOrFrenchDone=0;
   5549                 // Restart the iteration an move to the
   5550                 // second level
   5551                 s.iterator->move(s.iterator, 0, UITER_START);
   5552                 cces = 0;
   5553                 level = UCOL_PSK_SECONDARY;
   5554                 break;
   5555             }
   5556             if(!isContinuation(CE)){
   5557                 if(coll->leadBytePermutationTable != NULL){
   5558                     CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
   5559                 }
   5560             }
   5561             if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5562                 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
   5563                 if(CE != 0) {
   5564                     if(byteCountOrFrenchDone == 0) {
   5565                         // get the second byte of primary
   5566                         dest[i++]=(uint8_t)(CE >> 8);
   5567                     } else {
   5568                         byteCountOrFrenchDone = 0;
   5569                     }
   5570                     if((CE &=0xff)!=0) {
   5571                         if(i==count) {
   5572                             /* overflow */
   5573                             byteCountOrFrenchDone = 1;
   5574                             cces--;
   5575                             goto saveState;
   5576                         }
   5577                         dest[i++]=(uint8_t)CE;
   5578                     }
   5579                 }
   5580             }
   5581             if(uprv_numAvailableExpCEs(s)) {
   5582                 canUpdateState = FALSE;
   5583             } else {
   5584                 canUpdateState = TRUE;
   5585             }
   5586         }
   5587         /* fall through to next level */
   5588     case UCOL_PSK_SECONDARY:
   5589         if(strength >= UCOL_SECONDARY) {
   5590             if(!doingFrench) {
   5591                 for(;;) {
   5592                     if(i == count) {
   5593                         goto saveState;
   5594                     }
   5595                     // We should save the state only if we
   5596                     // are sure that we are done with the
   5597                     // previous iterator state
   5598                     if(canUpdateState) {
   5599                         newState = s.iterator->getState(s.iterator);
   5600                         if(newState != UITER_NO_STATE) {
   5601                             iterState = newState;
   5602                             cces = 0;
   5603                         }
   5604                     }
   5605                     CE = ucol_IGetNextCE(coll, &s, status);
   5606                     cces++;
   5607                     if(CE==UCOL_NO_MORE_CES) {
   5608                         // Add the level separator
   5609                         terminatePSKLevel(level, maxLevel, i, dest);
   5610                         byteCountOrFrenchDone = 0;
   5611                         // Restart the iteration an move to the
   5612                         // second level
   5613                         s.iterator->move(s.iterator, 0, UITER_START);
   5614                         cces = 0;
   5615                         level = UCOL_PSK_CASE;
   5616                         break;
   5617                     }
   5618                     if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5619                         CE >>= 8; /* get secondary */
   5620                         if(CE != 0) {
   5621                             dest[i++]=(uint8_t)CE;
   5622                         }
   5623                     }
   5624                     if(uprv_numAvailableExpCEs(s)) {
   5625                         canUpdateState = FALSE;
   5626                     } else {
   5627                         canUpdateState = TRUE;
   5628                     }
   5629                 }
   5630             } else { // French secondary processing
   5631                 uint8_t frenchBuff[UCOL_MAX_BUFFER];
   5632                 int32_t frenchIndex = 0;
   5633                 // Here we are going backwards.
   5634                 // If the iterator is at the beggining, it should be
   5635                 // moved to end.
   5636                 if(wasDoingPrimary) {
   5637                     s.iterator->move(s.iterator, 0, UITER_LIMIT);
   5638                     cces = 0;
   5639                 }
   5640                 for(;;) {
   5641                     if(i == count) {
   5642                         goto saveState;
   5643                     }
   5644                     if(canUpdateState) {
   5645                         newState = s.iterator->getState(s.iterator);
   5646                         if(newState != UITER_NO_STATE) {
   5647                             iterState = newState;
   5648                             cces = 0;
   5649                         }
   5650                     }
   5651                     CE = ucol_IGetPrevCE(coll, &s, status);
   5652                     cces++;
   5653                     if(CE==UCOL_NO_MORE_CES) {
   5654                         // Add the level separator
   5655                         terminatePSKLevel(level, maxLevel, i, dest);
   5656                         byteCountOrFrenchDone = 0;
   5657                         // Restart the iteration an move to the next level
   5658                         s.iterator->move(s.iterator, 0, UITER_START);
   5659                         level = UCOL_PSK_CASE;
   5660                         break;
   5661                     }
   5662                     if(isContinuation(CE)) { // if it's a continuation, we want to save it and
   5663                         // reverse when we get a first non-continuation CE.
   5664                         CE >>= 8;
   5665                         frenchBuff[frenchIndex++] = (uint8_t)CE;
   5666                     } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5667                         CE >>= 8; /* get secondary */
   5668                         if(!frenchIndex) {
   5669                             if(CE != 0) {
   5670                                 dest[i++]=(uint8_t)CE;
   5671                             }
   5672                         } else {
   5673                             frenchBuff[frenchIndex++] = (uint8_t)CE;
   5674                             frenchIndex -= usedFrench;
   5675                             usedFrench = 0;
   5676                             while(i < count && frenchIndex) {
   5677                                 dest[i++] = frenchBuff[--frenchIndex];
   5678                                 usedFrench++;
   5679                             }
   5680                         }
   5681                     }
   5682                     if(uprv_numAvailableExpCEs(s)) {
   5683                         canUpdateState = FALSE;
   5684                     } else {
   5685                         canUpdateState = TRUE;
   5686                     }
   5687                 }
   5688             }
   5689         } else {
   5690             level = UCOL_PSK_CASE;
   5691         }
   5692         /* fall through to next level */
   5693     case UCOL_PSK_CASE:
   5694         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   5695             uint32_t caseShift = UCOL_CASE_SHIFT_START;
   5696             uint8_t caseByte = UCOL_CASE_BYTE_START;
   5697             uint8_t caseBits = 0;
   5698 
   5699             for(;;) {
   5700                 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
   5701                 if(i == count) {
   5702                     goto saveState;
   5703                 }
   5704                 // We should save the state only if we
   5705                 // are sure that we are done with the
   5706                 // previous iterator state
   5707                 if(canUpdateState) {
   5708                     newState = s.iterator->getState(s.iterator);
   5709                     if(newState != UITER_NO_STATE) {
   5710                         iterState = newState;
   5711                         cces = 0;
   5712                     }
   5713                 }
   5714                 CE = ucol_IGetNextCE(coll, &s, status);
   5715                 cces++;
   5716                 if(CE==UCOL_NO_MORE_CES) {
   5717                     // On the case level we might have an unfinished
   5718                     // case byte. Add one if it's started.
   5719                     if(caseShift != UCOL_CASE_SHIFT_START) {
   5720                         dest[i++] = caseByte;
   5721                     }
   5722                     cces = 0;
   5723                     // We have finished processing CEs on this level.
   5724                     // However, we don't know if we have enough space
   5725                     // to add a case level terminator.
   5726                     if(i < count) {
   5727                         // Add the level separator
   5728                         terminatePSKLevel(level, maxLevel, i, dest);
   5729                         // Restart the iteration and move to the
   5730                         // next level
   5731                         s.iterator->move(s.iterator, 0, UITER_START);
   5732                         level = UCOL_PSK_TERTIARY;
   5733                     } else {
   5734                         canUpdateState = FALSE;
   5735                     }
   5736                     break;
   5737                 }
   5738 
   5739                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5740                     if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
   5741                         // do the case level if we need to do it. We don't want to calculate
   5742                         // case level for primary ignorables if we have only primary strength and case level
   5743                         // otherwise we would break well formedness of CEs
   5744                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   5745                         caseBits = (uint8_t)(CE & 0xC0);
   5746                         // this copies the case level logic from the
   5747                         // sort key generation code
   5748                         if(CE != 0) {
   5749                             if (caseShift == 0) {
   5750                                 dest[i++] = caseByte;
   5751                                 caseShift = UCOL_CASE_SHIFT_START;
   5752                                 caseByte = UCOL_CASE_BYTE_START;
   5753                             }
   5754                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   5755                                 if((caseBits & 0xC0) == 0) {
   5756                                     caseByte |= 1 << (--caseShift);
   5757                                 } else {
   5758                                     caseByte |= 0 << (--caseShift);
   5759                                     /* second bit */
   5760                                     if(caseShift == 0) {
   5761                                         dest[i++] = caseByte;
   5762                                         caseShift = UCOL_CASE_SHIFT_START;
   5763                                         caseByte = UCOL_CASE_BYTE_START;
   5764                                     }
   5765                                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
   5766                                 }
   5767                             } else {
   5768                                 if((caseBits & 0xC0) == 0) {
   5769                                     caseByte |= 0 << (--caseShift);
   5770                                 } else {
   5771                                     caseByte |= 1 << (--caseShift);
   5772                                     /* second bit */
   5773                                     if(caseShift == 0) {
   5774                                         dest[i++] = caseByte;
   5775                                         caseShift = UCOL_CASE_SHIFT_START;
   5776                                         caseByte = UCOL_CASE_BYTE_START;
   5777                                     }
   5778                                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
   5779                                 }
   5780                             }
   5781                         }
   5782 
   5783                     }
   5784                 }
   5785                 // Not sure this is correct for the case level - revisit
   5786                 if(uprv_numAvailableExpCEs(s)) {
   5787                     canUpdateState = FALSE;
   5788                 } else {
   5789                     canUpdateState = TRUE;
   5790                 }
   5791             }
   5792         } else {
   5793             level = UCOL_PSK_TERTIARY;
   5794         }
   5795         /* fall through to next level */
   5796     case UCOL_PSK_TERTIARY:
   5797         if(strength >= UCOL_TERTIARY) {
   5798             for(;;) {
   5799                 if(i == count) {
   5800                     goto saveState;
   5801                 }
   5802                 // We should save the state only if we
   5803                 // are sure that we are done with the
   5804                 // previous iterator state
   5805                 if(canUpdateState) {
   5806                     newState = s.iterator->getState(s.iterator);
   5807                     if(newState != UITER_NO_STATE) {
   5808                         iterState = newState;
   5809                         cces = 0;
   5810                     }
   5811                 }
   5812                 CE = ucol_IGetNextCE(coll, &s, status);
   5813                 cces++;
   5814                 if(CE==UCOL_NO_MORE_CES) {
   5815                     // Add the level separator
   5816                     terminatePSKLevel(level, maxLevel, i, dest);
   5817                     byteCountOrFrenchDone = 0;
   5818                     // Restart the iteration an move to the
   5819                     // second level
   5820                     s.iterator->move(s.iterator, 0, UITER_START);
   5821                     cces = 0;
   5822                     level = UCOL_PSK_QUATERNARY;
   5823                     break;
   5824                 }
   5825                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5826                     notIsContinuation = !isContinuation(CE);
   5827 
   5828                     if(notIsContinuation) {
   5829                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   5830                         CE ^= coll->caseSwitch;
   5831                         CE &= coll->tertiaryMask;
   5832                     } else {
   5833                         CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   5834                     }
   5835 
   5836                     if(CE != 0) {
   5837                         dest[i++]=(uint8_t)CE;
   5838                     }
   5839                 }
   5840                 if(uprv_numAvailableExpCEs(s)) {
   5841                     canUpdateState = FALSE;
   5842                 } else {
   5843                     canUpdateState = TRUE;
   5844                 }
   5845             }
   5846         } else {
   5847             // if we're not doing tertiary
   5848             // skip to the end
   5849             level = UCOL_PSK_NULL;
   5850         }
   5851         /* fall through to next level */
   5852     case UCOL_PSK_QUATERNARY:
   5853         if(strength >= UCOL_QUATERNARY) {
   5854             for(;;) {
   5855                 if(i == count) {
   5856                     goto saveState;
   5857                 }
   5858                 // We should save the state only if we
   5859                 // are sure that we are done with the
   5860                 // previous iterator state
   5861                 if(canUpdateState) {
   5862                     newState = s.iterator->getState(s.iterator);
   5863                     if(newState != UITER_NO_STATE) {
   5864                         iterState = newState;
   5865                         cces = 0;
   5866                     }
   5867                 }
   5868                 CE = ucol_IGetNextCE(coll, &s, status);
   5869                 cces++;
   5870                 if(CE==UCOL_NO_MORE_CES) {
   5871                     // Add the level separator
   5872                     terminatePSKLevel(level, maxLevel, i, dest);
   5873                     //dest[i++] = UCOL_LEVELTERMINATOR;
   5874                     byteCountOrFrenchDone = 0;
   5875                     // Restart the iteration an move to the
   5876                     // second level
   5877                     s.iterator->move(s.iterator, 0, UITER_START);
   5878                     cces = 0;
   5879                     level = UCOL_PSK_QUIN;
   5880                     break;
   5881                 }
   5882                 if(CE==0)
   5883                     continue;
   5884                 if(isShiftedCE(CE, LVT, &wasShifted)) {
   5885                     CE >>= 16; /* get primary */
   5886                     if(CE != 0) {
   5887                         if(byteCountOrFrenchDone == 0) {
   5888                             dest[i++]=(uint8_t)(CE >> 8);
   5889                         } else {
   5890                             byteCountOrFrenchDone = 0;
   5891                         }
   5892                         if((CE &=0xff)!=0) {
   5893                             if(i==count) {
   5894                                 /* overflow */
   5895                                 byteCountOrFrenchDone = 1;
   5896                                 goto saveState;
   5897                             }
   5898                             dest[i++]=(uint8_t)CE;
   5899                         }
   5900                     }
   5901                 } else {
   5902                     notIsContinuation = !isContinuation(CE);
   5903                     if(notIsContinuation) {
   5904                         if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   5905                             dest[i++] = UCOL_HIRAGANA_QUAD;
   5906                         } else {
   5907                             dest[i++] = 0xFF;
   5908                         }
   5909                     }
   5910                 }
   5911                 if(uprv_numAvailableExpCEs(s)) {
   5912                     canUpdateState = FALSE;
   5913                 } else {
   5914                     canUpdateState = TRUE;
   5915                 }
   5916             }
   5917         } else {
   5918             // if we're not doing quaternary
   5919             // skip to the end
   5920             level = UCOL_PSK_NULL;
   5921         }
   5922         /* fall through to next level */
   5923     case UCOL_PSK_QUIN:
   5924         level = UCOL_PSK_IDENTICAL;
   5925         /* fall through to next level */
   5926     case UCOL_PSK_IDENTICAL:
   5927         if(strength >= UCOL_IDENTICAL) {
   5928             UChar32 first, second;
   5929             int32_t bocsuBytesWritten = 0;
   5930             // We always need to do identical on
   5931             // the NFD form of the string.
   5932             if(normIter == NULL) {
   5933                 // we arrived from the level below and
   5934                 // normalization was not turned on.
   5935                 // therefore, we need to make a fresh NFD iterator
   5936                 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5937                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   5938             } else if(!doingIdenticalFromStart) {
   5939                 // there is an iterator, but we did some other levels.
   5940                 // therefore, we have a FCD iterator - need to make
   5941                 // a NFD one.
   5942                 // normIter being at the beginning does not guarantee
   5943                 // that the underlying iterator is at the beginning
   5944                 iter->move(iter, 0, UITER_START);
   5945                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   5946             }
   5947             // At this point we have a NFD iterator that is positioned
   5948             // in the right place
   5949             if(U_FAILURE(*status)) {
   5950                 UTRACE_EXIT_STATUS(*status);
   5951                 return 0;
   5952             }
   5953             first = uiter_previous32(s.iterator);
   5954             // maybe we're at the start of the string
   5955             if(first == U_SENTINEL) {
   5956                 first = 0;
   5957             } else {
   5958                 uiter_next32(s.iterator);
   5959             }
   5960 
   5961             j = 0;
   5962             for(;;) {
   5963                 if(i == count) {
   5964                     if(j+1 < bocsuBytesWritten) {
   5965                         bocsuBytesUsed = j+1;
   5966                     }
   5967                     goto saveState;
   5968                 }
   5969 
   5970                 // On identical level, we will always save
   5971                 // the state if we reach this point, since
   5972                 // we don't depend on getNextCE for content
   5973                 // all the content is in our buffer and we
   5974                 // already either stored the full buffer OR
   5975                 // otherwise we won't arrive here.
   5976                 newState = s.iterator->getState(s.iterator);
   5977                 if(newState != UITER_NO_STATE) {
   5978                     iterState = newState;
   5979                     cces = 0;
   5980                 }
   5981 
   5982                 uint8_t buff[4];
   5983                 second = uiter_next32(s.iterator);
   5984                 cces++;
   5985 
   5986                 // end condition for identical level
   5987                 if(second == U_SENTINEL) {
   5988                     terminatePSKLevel(level, maxLevel, i, dest);
   5989                     level = UCOL_PSK_NULL;
   5990                     break;
   5991                 }
   5992                 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
   5993                 first = second;
   5994 
   5995                 j = 0;
   5996                 if(bocsuBytesUsed != 0) {
   5997                     while(bocsuBytesUsed-->0) {
   5998                         j++;
   5999                     }
   6000                 }
   6001 
   6002                 while(i < count && j < bocsuBytesWritten) {
   6003                     dest[i++] = buff[j++];
   6004                 }
   6005             }
   6006 
   6007         } else {
   6008             level = UCOL_PSK_NULL;
   6009         }
   6010         /* fall through to next level */
   6011     case UCOL_PSK_NULL:
   6012         j = i;
   6013         while(j<count) {
   6014             dest[j++]=0;
   6015         }
   6016         break;
   6017     default:
   6018         *status = U_INTERNAL_PROGRAM_ERROR;
   6019         UTRACE_EXIT_STATUS(*status);
   6020         return 0;
   6021     }
   6022 
   6023 saveState:
   6024     // Now we need to return stuff. First we want to see whether we have
   6025     // done everything for the current state of iterator.
   6026     if(byteCountOrFrenchDone
   6027         || canUpdateState == FALSE
   6028         || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
   6029     {
   6030         // Any of above mean that the previous transaction
   6031         // wasn't finished and that we should store the
   6032         // previous iterator state.
   6033         state[0] = iterState;
   6034     } else {
   6035         // The transaction is complete. We will continue in the next iteration.
   6036         state[0] = s.iterator->getState(s.iterator);
   6037         cces = 0;
   6038     }
   6039     // Store the number of bocsu bytes written.
   6040     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
   6041         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6042     }
   6043     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
   6044 
   6045     // Next we put in the level of comparison
   6046     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
   6047 
   6048     // If we are doing French, we need to store whether we have just finished the French level
   6049     if(level == UCOL_PSK_SECONDARY && doingFrench) {
   6050         state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6051     } else {
   6052         state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6053     }
   6054 
   6055     // Was the latest CE shifted
   6056     if(wasShifted) {
   6057         state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
   6058     }
   6059     // Check for cces overflow
   6060     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
   6061         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6062     }
   6063     // Store cces
   6064     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
   6065 
   6066     // Check for French overflow
   6067     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
   6068         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6069     }
   6070     // Store number of bytes written in the French secondary continuation sequence
   6071     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
   6072 
   6073 
   6074     // If we have used normalizing iterator, get rid of it
   6075     if(normIter != NULL) {
   6076         unorm_closeIter(normIter);
   6077     }
   6078 
   6079     /* To avoid memory leak, free the offset buffer if necessary. */
   6080     ucol_freeOffsetBuffer(&s);
   6081 
   6082     // Return number of meaningful sortkey bytes.
   6083     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
   6084                   dest,i, state[0], state[1]);
   6085     UTRACE_EXIT_VALUE(i);
   6086     return i;
   6087 }
   6088 
   6089 /**
   6090  * Produce a bound for a given sortkey and a number of levels.
   6091  */
   6092 U_CAPI int32_t U_EXPORT2
   6093 ucol_getBound(const uint8_t       *source,
   6094         int32_t             sourceLength,
   6095         UColBoundMode       boundType,
   6096         uint32_t            noOfLevels,
   6097         uint8_t             *result,
   6098         int32_t             resultLength,
   6099         UErrorCode          *status)
   6100 {
   6101     // consistency checks
   6102     if(status == NULL || U_FAILURE(*status)) {
   6103         return 0;
   6104     }
   6105     if(source == NULL) {
   6106         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6107         return 0;
   6108     }
   6109 
   6110     int32_t sourceIndex = 0;
   6111     // Scan the string until we skip enough of the key OR reach the end of the key
   6112     do {
   6113         sourceIndex++;
   6114         if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
   6115             noOfLevels--;
   6116         }
   6117     } while (noOfLevels > 0
   6118         && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
   6119 
   6120     if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
   6121         && noOfLevels > 0) {
   6122             *status = U_SORT_KEY_TOO_SHORT_WARNING;
   6123     }
   6124 
   6125 
   6126     // READ ME: this code assumes that the values for boundType
   6127     // enum will not changes. They are set so that the enum value
   6128     // corresponds to the number of extra bytes each bound type
   6129     // needs.
   6130     if(result != NULL && resultLength >= sourceIndex+boundType) {
   6131         uprv_memcpy(result, source, sourceIndex);
   6132         switch(boundType) {
   6133             // Lower bound just gets terminated. No extra bytes
   6134         case UCOL_BOUND_LOWER: // = 0
   6135             break;
   6136             // Upper bound needs one extra byte
   6137         case UCOL_BOUND_UPPER: // = 1
   6138             result[sourceIndex++] = 2;
   6139             break;
   6140             // Upper long bound needs two extra bytes
   6141         case UCOL_BOUND_UPPER_LONG: // = 2
   6142             result[sourceIndex++] = 0xFF;
   6143             result[sourceIndex++] = 0xFF;
   6144             break;
   6145         default:
   6146             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6147             return 0;
   6148         }
   6149         result[sourceIndex++] = 0;
   6150 
   6151         return sourceIndex;
   6152     } else {
   6153         return sourceIndex+boundType+1;
   6154     }
   6155 }
   6156 
   6157 /****************************************************************************/
   6158 /* Following are the functions that deal with the properties of a collator  */
   6159 /* there are new APIs and some compatibility APIs                           */
   6160 /****************************************************************************/
   6161 
   6162 static inline void
   6163 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
   6164                     int32_t *primShift, int32_t *secShift, int32_t *terShift)
   6165 {
   6166     uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
   6167     UBool reverseSecondary = FALSE;
   6168     UBool continuation = isContinuation(CE);
   6169     if(!continuation) {
   6170         tertiary = (uint8_t)((CE & coll->tertiaryMask));
   6171         tertiary ^= coll->caseSwitch;
   6172         reverseSecondary = TRUE;
   6173     } else {
   6174         tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   6175         tertiary &= UCOL_REMOVE_CASE;
   6176         reverseSecondary = FALSE;
   6177     }
   6178 
   6179     secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6180     primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6181     primary1 = (uint8_t)(CE >> 8);
   6182 
   6183     if(primary1 != 0) {
   6184         if (coll->leadBytePermutationTable != NULL && !continuation) {
   6185             primary1 = coll->leadBytePermutationTable[primary1];
   6186         }
   6187 
   6188         coll->latinOneCEs[ch] |= (primary1 << *primShift);
   6189         *primShift -= 8;
   6190     }
   6191     if(primary2 != 0) {
   6192         if(*primShift < 0) {
   6193             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6194             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6195             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6196             return;
   6197         }
   6198         coll->latinOneCEs[ch] |= (primary2 << *primShift);
   6199         *primShift -= 8;
   6200     }
   6201     if(secondary != 0) {
   6202         if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
   6203             coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
   6204             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
   6205         } else { // normal case
   6206             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
   6207         }
   6208         *secShift -= 8;
   6209     }
   6210     if(tertiary != 0) {
   6211         coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
   6212         *terShift -= 8;
   6213     }
   6214 }
   6215 
   6216 static inline UBool
   6217 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
   6218     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
   6219     if(newTable == NULL) {
   6220       *status = U_MEMORY_ALLOCATION_ERROR;
   6221       coll->latinOneFailed = TRUE;
   6222       return FALSE;
   6223     }
   6224     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
   6225     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
   6226     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
   6227     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
   6228     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
   6229     coll->latinOneTableLen = size;
   6230     uprv_free(coll->latinOneCEs);
   6231     coll->latinOneCEs = newTable;
   6232     return TRUE;
   6233 }
   6234 
   6235 static UBool
   6236 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
   6237     UBool result = TRUE;
   6238     if(coll->latinOneCEs == NULL) {
   6239         coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
   6240         if(coll->latinOneCEs == NULL) {
   6241             *status = U_MEMORY_ALLOCATION_ERROR;
   6242             return FALSE;
   6243         }
   6244         coll->latinOneTableLen = UCOL_LATINONETABLELEN;
   6245     }
   6246     UChar ch = 0;
   6247     UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
   6248     // Check for null pointer
   6249     if (U_FAILURE(*status)) {
   6250         return FALSE;
   6251     }
   6252     uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
   6253 
   6254     int32_t primShift = 24, secShift = 24, terShift = 24;
   6255     uint32_t CE = 0;
   6256     int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
   6257 
   6258     // TODO: make safe if you get more than you wanted...
   6259     for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
   6260         primShift = 24; secShift = 24; terShift = 24;
   6261         if(ch < 0x100) {
   6262             CE = coll->latinOneMapping[ch];
   6263         } else {
   6264             CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   6265             if(CE == UCOL_NOT_FOUND && coll->UCA) {
   6266                 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   6267             }
   6268         }
   6269         if(CE < UCOL_NOT_FOUND) {
   6270             ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6271         } else {
   6272             switch (getCETag(CE)) {
   6273             case EXPANSION_TAG:
   6274             case DIGIT_TAG:
   6275                 ucol_setText(it, &ch, 1, status);
   6276                 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
   6277                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6278                         coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6279                         coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6280                         coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6281                         break;
   6282                     }
   6283                     ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6284                 }
   6285                 break;
   6286             case CONTRACTION_TAG:
   6287                 // here is the trick
   6288                 // F2 is contraction. We do something very similar to contractions
   6289                 // but have two indices, one in the real contraction table and the
   6290                 // other to where we stuffed things. This hopes that we don't have
   6291                 // many contractions (this should work for latin-1 tables).
   6292                 {
   6293                     if((CE & 0x00FFF000) != 0) {
   6294                         *status = U_UNSUPPORTED_ERROR;
   6295                         goto cleanup_after_failure;
   6296                     }
   6297 
   6298                     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   6299 
   6300                     CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
   6301 
   6302                     coll->latinOneCEs[ch] = CE;
   6303                     coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
   6304                     coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
   6305 
   6306                     // We're going to jump into contraction table, pick the elements
   6307                     // and use them
   6308                     do {
   6309                         CE = *(coll->contractionCEs +
   6310                             (UCharOffset - coll->contractionIndex));
   6311                         if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
   6312                             uint32_t size;
   6313                             uint32_t i;    /* general counter */
   6314                             uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   6315                             size = getExpansionCount(CE);
   6316                             //CE = *CEOffset++;
   6317                             if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   6318                                 for(i = 0; i<size; i++) {
   6319                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6320                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6321                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6322                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6323                                         break;
   6324                                     }
   6325                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
   6326                                 }
   6327                             } else { /* else, we do */
   6328                                 while(*CEOffset != 0) {
   6329                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6330                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6331                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6332                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6333                                         break;
   6334                                     }
   6335                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
   6336                                 }
   6337                             }
   6338                             contractionOffset++;
   6339                         } else if(CE < UCOL_NOT_FOUND) {
   6340                             ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
   6341                         } else {
   6342                             coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6343                             coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6344                             coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6345                             contractionOffset++;
   6346                         }
   6347                         UCharOffset++;
   6348                         primShift = 24; secShift = 24; terShift = 24;
   6349                         if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
   6350                             if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
   6351                                 goto cleanup_after_failure;
   6352                             }
   6353                         }
   6354                     } while(*UCharOffset != 0xFFFF);
   6355                 }
   6356                 break;;
   6357             case SPEC_PROC_TAG:
   6358                 {
   6359                     // 0xB7 is a precontext character defined in UCA5.1, a special
   6360                     // handle is implemeted in order to save LatinOne table for
   6361                     // most locales.
   6362                     if (ch==0xb7) {
   6363                         ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6364                     }
   6365                     else {
   6366                         goto cleanup_after_failure;
   6367                     }
   6368                 }
   6369                 break;
   6370             default:
   6371                 goto cleanup_after_failure;
   6372             }
   6373         }
   6374     }
   6375     // compact table
   6376     if(contractionOffset < coll->latinOneTableLen) {
   6377         if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
   6378             goto cleanup_after_failure;
   6379         }
   6380     }
   6381     ucol_closeElements(it);
   6382     return result;
   6383 
   6384 cleanup_after_failure:
   6385     // status should already be set before arriving here.
   6386     coll->latinOneFailed = TRUE;
   6387     ucol_closeElements(it);
   6388     return FALSE;
   6389 }
   6390 
   6391 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
   6392     if(U_SUCCESS(*status)) {
   6393         if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6394             coll->caseSwitch = UCOL_CASE_SWITCH;
   6395         } else {
   6396             coll->caseSwitch = UCOL_NO_CASE_SWITCH;
   6397         }
   6398 
   6399         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
   6400             coll->tertiaryMask = UCOL_REMOVE_CASE;
   6401             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6402             coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
   6403             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
   6404             coll->tertiaryBottom = UCOL_COMMON_BOT3;
   6405         } else {
   6406             coll->tertiaryMask = UCOL_KEEP_CASE;
   6407             coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
   6408             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6409                 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
   6410                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
   6411                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
   6412             } else {
   6413                 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6414                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
   6415                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
   6416             }
   6417         }
   6418 
   6419         /* Set the compression values */
   6420         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
   6421         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
   6422         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
   6423 
   6424         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
   6425             && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
   6426         {
   6427             coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
   6428         } else {
   6429             coll->sortKeyGen = ucol_calcSortKey;
   6430         }
   6431         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
   6432             && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
   6433         {
   6434             if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
   6435                 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
   6436                     //fprintf(stderr, "F");
   6437                     coll->latinOneUse = TRUE;
   6438                 } else {
   6439                     coll->latinOneUse = FALSE;
   6440                 }
   6441                 if(*status == U_UNSUPPORTED_ERROR) {
   6442                     *status = U_ZERO_ERROR;
   6443                 }
   6444             } else { // latin1Table exists and it doesn't need to be regenerated, just use it
   6445                 coll->latinOneUse = TRUE;
   6446             }
   6447         } else {
   6448             coll->latinOneUse = FALSE;
   6449         }
   6450     }
   6451 }
   6452 
   6453 U_CAPI uint32_t  U_EXPORT2
   6454 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
   6455     if(U_FAILURE(*status) || coll == NULL) {
   6456         return 0;
   6457     }
   6458     if(len == -1) {
   6459         len = u_strlen(varTop);
   6460     }
   6461     if(len == 0) {
   6462         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6463         return 0;
   6464     }
   6465 
   6466     collIterate s;
   6467     IInit_collIterate(coll, varTop, len, &s, status);
   6468     if(U_FAILURE(*status)) {
   6469         return 0;
   6470     }
   6471 
   6472     uint32_t CE = ucol_IGetNextCE(coll, &s, status);
   6473 
   6474     /* here we check if we have consumed all characters */
   6475     /* you can put in either one character or a contraction */
   6476     /* you shouldn't put more... */
   6477     if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
   6478         *status = U_CE_NOT_FOUND_ERROR;
   6479         return 0;
   6480     }
   6481 
   6482     uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
   6483 
   6484     if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
   6485         *status = U_PRIMARY_TOO_LONG_ERROR;
   6486         return 0;
   6487     }
   6488     if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
   6489         coll->variableTopValueisDefault = FALSE;
   6490         coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
   6491     }
   6492 
   6493     /* To avoid memory leak, free the offset buffer if necessary. */
   6494     ucol_freeOffsetBuffer(&s);
   6495 
   6496     return CE & UCOL_PRIMARYMASK;
   6497 }
   6498 
   6499 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
   6500     if(U_FAILURE(*status) || coll == NULL) {
   6501         return 0;
   6502     }
   6503     return coll->variableTopValue<<16;
   6504 }
   6505 
   6506 U_CAPI void  U_EXPORT2
   6507 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
   6508     if(U_FAILURE(*status) || coll == NULL) {
   6509         return;
   6510     }
   6511 
   6512     if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
   6513         coll->variableTopValueisDefault = FALSE;
   6514         coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
   6515     }
   6516 }
   6517 /* Attribute setter API */
   6518 U_CAPI void  U_EXPORT2
   6519 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
   6520     if(U_FAILURE(*status) || coll == NULL) {
   6521       return;
   6522     }
   6523 
   6524     UColAttributeValue oldFrench = coll->frenchCollation;
   6525     UColAttributeValue oldCaseFirst = coll->caseFirst;
   6526     switch(attr) {
   6527     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
   6528         if(value == UCOL_ON) {
   6529             coll->numericCollation = UCOL_ON;
   6530             coll->numericCollationisDefault = FALSE;
   6531         } else if (value == UCOL_OFF) {
   6532             coll->numericCollation = UCOL_OFF;
   6533             coll->numericCollationisDefault = FALSE;
   6534         } else if (value == UCOL_DEFAULT) {
   6535             coll->numericCollationisDefault = TRUE;
   6536             coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
   6537         } else {
   6538             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6539         }
   6540         break;
   6541     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
   6542         if(value == UCOL_ON) {
   6543             coll->hiraganaQ = UCOL_ON;
   6544             coll->hiraganaQisDefault = FALSE;
   6545         } else if (value == UCOL_OFF) {
   6546             coll->hiraganaQ = UCOL_OFF;
   6547             coll->hiraganaQisDefault = FALSE;
   6548         } else if (value == UCOL_DEFAULT) {
   6549             coll->hiraganaQisDefault = TRUE;
   6550             coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
   6551         } else {
   6552             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6553         }
   6554         break;
   6555     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   6556         if(value == UCOL_ON) {
   6557             coll->frenchCollation = UCOL_ON;
   6558             coll->frenchCollationisDefault = FALSE;
   6559         } else if (value == UCOL_OFF) {
   6560             coll->frenchCollation = UCOL_OFF;
   6561             coll->frenchCollationisDefault = FALSE;
   6562         } else if (value == UCOL_DEFAULT) {
   6563             coll->frenchCollationisDefault = TRUE;
   6564             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
   6565         } else {
   6566             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6567         }
   6568         break;
   6569     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   6570         if(value == UCOL_SHIFTED) {
   6571             coll->alternateHandling = UCOL_SHIFTED;
   6572             coll->alternateHandlingisDefault = FALSE;
   6573         } else if (value == UCOL_NON_IGNORABLE) {
   6574             coll->alternateHandling = UCOL_NON_IGNORABLE;
   6575             coll->alternateHandlingisDefault = FALSE;
   6576         } else if (value == UCOL_DEFAULT) {
   6577             coll->alternateHandlingisDefault = TRUE;
   6578             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
   6579         } else {
   6580             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6581         }
   6582         break;
   6583     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   6584         if(value == UCOL_LOWER_FIRST) {
   6585             coll->caseFirst = UCOL_LOWER_FIRST;
   6586             coll->caseFirstisDefault = FALSE;
   6587         } else if (value == UCOL_UPPER_FIRST) {
   6588             coll->caseFirst = UCOL_UPPER_FIRST;
   6589             coll->caseFirstisDefault = FALSE;
   6590         } else if (value == UCOL_OFF) {
   6591             coll->caseFirst = UCOL_OFF;
   6592             coll->caseFirstisDefault = FALSE;
   6593         } else if (value == UCOL_DEFAULT) {
   6594             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
   6595             coll->caseFirstisDefault = TRUE;
   6596         } else {
   6597             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6598         }
   6599         break;
   6600     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   6601         if(value == UCOL_ON) {
   6602             coll->caseLevel = UCOL_ON;
   6603             coll->caseLevelisDefault = FALSE;
   6604         } else if (value == UCOL_OFF) {
   6605             coll->caseLevel = UCOL_OFF;
   6606             coll->caseLevelisDefault = FALSE;
   6607         } else if (value == UCOL_DEFAULT) {
   6608             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
   6609             coll->caseLevelisDefault = TRUE;
   6610         } else {
   6611             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6612         }
   6613         break;
   6614     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   6615         if(value == UCOL_ON) {
   6616             coll->normalizationMode = UCOL_ON;
   6617             coll->normalizationModeisDefault = FALSE;
   6618             initializeFCD(status);
   6619         } else if (value == UCOL_OFF) {
   6620             coll->normalizationMode = UCOL_OFF;
   6621             coll->normalizationModeisDefault = FALSE;
   6622         } else if (value == UCOL_DEFAULT) {
   6623             coll->normalizationModeisDefault = TRUE;
   6624             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
   6625             if(coll->normalizationMode == UCOL_ON) {
   6626                 initializeFCD(status);
   6627             }
   6628         } else {
   6629             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6630         }
   6631         break;
   6632     case UCOL_STRENGTH:         /* attribute for strength */
   6633         if (value == UCOL_DEFAULT) {
   6634             coll->strengthisDefault = TRUE;
   6635             coll->strength = (UColAttributeValue)coll->options->strength;
   6636         } else if (value <= UCOL_IDENTICAL) {
   6637             coll->strengthisDefault = FALSE;
   6638             coll->strength = value;
   6639         } else {
   6640             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6641         }
   6642         break;
   6643     case UCOL_ATTRIBUTE_COUNT:
   6644     default:
   6645         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6646         break;
   6647     }
   6648     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
   6649         coll->latinOneRegenTable = TRUE;
   6650     } else {
   6651         coll->latinOneRegenTable = FALSE;
   6652     }
   6653     ucol_updateInternalState(coll, status);
   6654 }
   6655 
   6656 U_CAPI UColAttributeValue  U_EXPORT2
   6657 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
   6658     if(U_FAILURE(*status) || coll == NULL) {
   6659       return UCOL_DEFAULT;
   6660     }
   6661     switch(attr) {
   6662     case UCOL_NUMERIC_COLLATION:
   6663       return coll->numericCollation;
   6664     case UCOL_HIRAGANA_QUATERNARY_MODE:
   6665       return coll->hiraganaQ;
   6666     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   6667         return coll->frenchCollation;
   6668     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   6669         return coll->alternateHandling;
   6670     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   6671         return coll->caseFirst;
   6672     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   6673         return coll->caseLevel;
   6674     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   6675         return coll->normalizationMode;
   6676     case UCOL_STRENGTH:         /* attribute for strength */
   6677         return coll->strength;
   6678     case UCOL_ATTRIBUTE_COUNT:
   6679     default:
   6680         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6681         break;
   6682     }
   6683     return UCOL_DEFAULT;
   6684 }
   6685 
   6686 U_CAPI void U_EXPORT2
   6687 ucol_setStrength(    UCollator                *coll,
   6688             UCollationStrength        strength)
   6689 {
   6690     UErrorCode status = U_ZERO_ERROR;
   6691     ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
   6692 }
   6693 
   6694 U_CAPI UCollationStrength U_EXPORT2
   6695 ucol_getStrength(const UCollator *coll)
   6696 {
   6697     UErrorCode status = U_ZERO_ERROR;
   6698     return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
   6699 }
   6700 
   6701 U_DRAFT int32_t U_EXPORT2
   6702 ucol_getReorderCodes(const UCollator *coll,
   6703                     int32_t *dest,
   6704                     int32_t destCapacity,
   6705                     UErrorCode *status) {
   6706     if (U_FAILURE(*status)) {
   6707         return 0;
   6708     }
   6709 
   6710     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
   6711         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6712         return 0;
   6713     }
   6714 
   6715 #ifdef UCOL_DEBUG
   6716     printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
   6717     printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
   6718 #endif
   6719 
   6720     if (coll->reorderCodesLength > destCapacity) {
   6721         *status = U_BUFFER_OVERFLOW_ERROR;
   6722         return coll->reorderCodesLength;
   6723     }
   6724     for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
   6725         dest[i] = coll->reorderCodes[i];
   6726     }
   6727     return coll->reorderCodesLength;
   6728 }
   6729 
   6730 U_DRAFT void U_EXPORT2
   6731 ucol_setReorderCodes(UCollator* coll,
   6732                     const int32_t* reorderCodes,
   6733                     int32_t reorderCodesLength,
   6734                     UErrorCode *status) {
   6735     if (U_FAILURE(*status)) {
   6736         return;
   6737     }
   6738 
   6739     if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
   6740         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6741         return;
   6742     }
   6743 
   6744     if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
   6745         uprv_free(coll->reorderCodes);
   6746     }
   6747     coll->reorderCodes = NULL;
   6748     coll->reorderCodesLength = 0;
   6749     if (reorderCodesLength == 0) {
   6750         if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
   6751             uprv_free(coll->leadBytePermutationTable);
   6752         }
   6753         coll->leadBytePermutationTable = NULL;
   6754         return;
   6755     }
   6756     coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
   6757     if (coll->reorderCodes == NULL) {
   6758         *status = U_MEMORY_ALLOCATION_ERROR;
   6759         return;
   6760     }
   6761     coll->freeReorderCodesOnClose = TRUE;
   6762     for (int32_t i = 0; i < reorderCodesLength; i++) {
   6763         coll->reorderCodes[i] = reorderCodes[i];
   6764     }
   6765     coll->reorderCodesLength = reorderCodesLength;
   6766     ucol_buildPermutationTable(coll, status);
   6767 }
   6768 
   6769 U_DRAFT int32_t U_EXPORT2
   6770 ucol_getEquivalentReorderCodes(int32_t reorderCode,
   6771                     int32_t* dest,
   6772                     int32_t destCapacity,
   6773                     UErrorCode *pErrorCode) {
   6774     bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
   6775     uint16_t leadBytes[256];
   6776     int leadBytesCount;
   6777     int leadByteIndex;
   6778     int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
   6779     int reorderCodesForLeadByteCount;
   6780     int reorderCodeIndex;
   6781 
   6782     int32_t equivalentCodesCount = 0;
   6783     int setIndex;
   6784 
   6785     if (U_FAILURE(*pErrorCode)) {
   6786         return 0;
   6787     }
   6788 
   6789     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
   6790         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   6791         return 0;
   6792     }
   6793 
   6794     uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
   6795 
   6796     const UCollator* uca = ucol_initUCA(pErrorCode);
   6797     if (U_FAILURE(*pErrorCode)) {
   6798 	return 0;
   6799     }
   6800     leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
   6801     for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
   6802         reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
   6803             uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
   6804         for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
   6805             equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
   6806         }
   6807     }
   6808 
   6809     for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
   6810         if (equivalentCodesSet[setIndex] == true) {
   6811             equivalentCodesCount++;
   6812         }
   6813     }
   6814 
   6815     if (destCapacity == 0) {
   6816         return equivalentCodesCount;
   6817     }
   6818 
   6819     equivalentCodesCount = 0;
   6820     for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
   6821         if (equivalentCodesSet[setIndex] == true) {
   6822             dest[equivalentCodesCount++] = setIndex;
   6823             if (equivalentCodesCount >= destCapacity) {
   6824                 break;
   6825             }
   6826         }
   6827     }
   6828     return equivalentCodesCount;
   6829 }
   6830 
   6831 
   6832 /****************************************************************************/
   6833 /* Following are misc functions                                             */
   6834 /* there are new APIs and some compatibility APIs                           */
   6835 /****************************************************************************/
   6836 
   6837 U_CAPI void U_EXPORT2
   6838 ucol_getVersion(const UCollator* coll,
   6839                 UVersionInfo versionInfo)
   6840 {
   6841     /* RunTime version  */
   6842     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
   6843     /* Builder version*/
   6844     uint8_t bdVersion = coll->image->version[0];
   6845 
   6846     /* Charset Version. Need to get the version from cnv files
   6847      * makeconv should populate cnv files with version and
   6848      * an api has to be provided in ucnv.h to obtain this version
   6849      */
   6850     uint8_t csVersion = 0;
   6851 
   6852     /* combine the version info */
   6853     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
   6854 
   6855     /* Tailoring rules */
   6856     versionInfo[0] = (uint8_t)(cmbVersion>>8);
   6857     versionInfo[1] = (uint8_t)cmbVersion;
   6858     versionInfo[2] = coll->image->version[1];
   6859     if(coll->UCA) {
   6860         /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
   6861         versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
   6862     } else {
   6863         versionInfo[3] = 0;
   6864     }
   6865 }
   6866 
   6867 
   6868 /* This internal API checks whether a character is tailored or not */
   6869 U_CAPI UBool  U_EXPORT2
   6870 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
   6871     if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
   6872         return FALSE;
   6873     }
   6874 
   6875     uint32_t CE = UCOL_NOT_FOUND;
   6876     const UChar *ContractionStart = NULL;
   6877     if(u < 0x100) { /* latin-1 */
   6878         CE = coll->latinOneMapping[u];
   6879         if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
   6880             return FALSE;
   6881         }
   6882     } else { /* regular */
   6883         CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
   6884     }
   6885 
   6886     if(isContraction(CE)) {
   6887         ContractionStart = (UChar *)coll->image+getContractOffset(CE);
   6888         CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
   6889     }
   6890 
   6891     return (UBool)(CE != UCOL_NOT_FOUND);
   6892 }
   6893 
   6894 
   6895 /****************************************************************************/
   6896 /* Following are the string compare functions                               */
   6897 /*                                                                          */
   6898 /****************************************************************************/
   6899 
   6900 
   6901 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
   6902 /*                     Used by strcoll if strength == identical and strings  */
   6903 /*                     are otherwise equal.                                  */
   6904 /*                                                                           */
   6905 /*                     Comparison must be done on NFD normalized strings.    */
   6906 /*                     FCD is not good enough.                               */
   6907 
   6908 static
   6909 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
   6910 {
   6911     // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
   6912     // of same type, but that doesn't really mean that it will stay that way.
   6913     int32_t            comparison;
   6914 
   6915     if (sColl->flags & UCOL_USE_ITERATOR) {
   6916         // The division for the array length may truncate the array size to
   6917         // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   6918         // for all platforms anyway.
   6919         UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   6920         UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   6921         UNormIterator *sNIt = NULL, *tNIt = NULL;
   6922         sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   6923         tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   6924         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   6925         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   6926         UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
   6927         UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
   6928         comparison = u_strCompareIter(sIt, tIt, TRUE);
   6929         unorm_closeIter(sNIt);
   6930         unorm_closeIter(tNIt);
   6931     } else {
   6932         int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
   6933         const UChar *sBuf = sColl->string;
   6934         int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
   6935         const UChar *tBuf = tColl->string;
   6936 
   6937         if (normalize) {
   6938             *status = U_ZERO_ERROR;
   6939             // Note: We could use Normalizer::compare() or similar, but for short strings
   6940             // which may not be in FCD it might be faster to just NFD them.
   6941             // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
   6942             // NFD'ing immediately might be faster for long strings,
   6943             // but string comparison is usually done on relatively short strings.
   6944             sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
   6945                                   sColl->writableBuffer,
   6946                                   *status);
   6947             tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
   6948                                   tColl->writableBuffer,
   6949                                   *status);
   6950             if(U_FAILURE(*status)) {
   6951                 return UCOL_LESS;
   6952             }
   6953             comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
   6954         } else {
   6955             comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
   6956         }
   6957     }
   6958 
   6959     if (comparison < 0) {
   6960         return UCOL_LESS;
   6961     } else if (comparison == 0) {
   6962         return UCOL_EQUAL;
   6963     } else /* comparison > 0 */ {
   6964         return UCOL_GREATER;
   6965     }
   6966 }
   6967 
   6968 /*  CEBuf - A struct and some inline functions to handle the saving    */
   6969 /*          of CEs in a buffer within ucol_strcoll                     */
   6970 
   6971 #define UCOL_CEBUF_SIZE 512
   6972 typedef struct ucol_CEBuf {
   6973     uint32_t    *buf;
   6974     uint32_t    *endp;
   6975     uint32_t    *pos;
   6976     uint32_t     localArray[UCOL_CEBUF_SIZE];
   6977 } ucol_CEBuf;
   6978 
   6979 
   6980 static
   6981 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
   6982     (b)->buf = (b)->pos = (b)->localArray;
   6983     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
   6984 }
   6985 
   6986 static
   6987 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
   6988     uint32_t  oldSize;
   6989     uint32_t  newSize;
   6990     uint32_t  *newBuf;
   6991 
   6992     ci->flags |= UCOL_ITER_ALLOCATED;
   6993     oldSize = (uint32_t)(b->pos - b->buf);
   6994     newSize = oldSize * 2;
   6995     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
   6996     if(newBuf == NULL) {
   6997         *status = U_MEMORY_ALLOCATION_ERROR;
   6998     }
   6999     else {
   7000         uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
   7001         if (b->buf != b->localArray) {
   7002             uprv_free(b->buf);
   7003         }
   7004         b->buf = newBuf;
   7005         b->endp = b->buf + newSize;
   7006         b->pos  = b->buf + oldSize;
   7007     }
   7008 }
   7009 
   7010 static
   7011 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
   7012     if (b->pos == b->endp) {
   7013         ucol_CEBuf_Expand(b, ci, status);
   7014     }
   7015     if (U_SUCCESS(*status)) {
   7016         *(b)->pos++ = ce;
   7017     }
   7018 }
   7019 
   7020 /* This is a trick string compare function that goes in and uses sortkeys to compare */
   7021 /* It is used when compare gets in trouble and needs to bail out                     */
   7022 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
   7023                                                   collIterate *tColl,
   7024                                                   UErrorCode *status)
   7025 {
   7026     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
   7027     uint8_t *sourceKeyP = sourceKey;
   7028     uint8_t *targetKeyP = targetKey;
   7029     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
   7030     const UCollator *coll = sColl->coll;
   7031     const UChar *source = NULL;
   7032     const UChar *target = NULL;
   7033     int32_t result = UCOL_EQUAL;
   7034     UnicodeString sourceString, targetString;
   7035     int32_t sourceLength;
   7036     int32_t targetLength;
   7037 
   7038     if(sColl->flags & UCOL_USE_ITERATOR) {
   7039         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   7040         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   7041         UChar32 c;
   7042         while((c=sColl->iterator->next(sColl->iterator))>=0) {
   7043             sourceString.append((UChar)c);
   7044         }
   7045         while((c=tColl->iterator->next(tColl->iterator))>=0) {
   7046             targetString.append((UChar)c);
   7047         }
   7048         source = sourceString.getBuffer();
   7049         sourceLength = sourceString.length();
   7050         target = targetString.getBuffer();
   7051         targetLength = targetString.length();
   7052     } else { // no iterators
   7053         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
   7054         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
   7055         source = sColl->string;
   7056         target = tColl->string;
   7057     }
   7058 
   7059 
   7060 
   7061     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7062     if(sourceKeyLen > UCOL_MAX_BUFFER) {
   7063         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
   7064         if(sourceKeyP == NULL) {
   7065             *status = U_MEMORY_ALLOCATION_ERROR;
   7066             goto cleanup_and_do_compare;
   7067         }
   7068         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7069     }
   7070 
   7071     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7072     if(targetKeyLen > UCOL_MAX_BUFFER) {
   7073         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
   7074         if(targetKeyP == NULL) {
   7075             *status = U_MEMORY_ALLOCATION_ERROR;
   7076             goto cleanup_and_do_compare;
   7077         }
   7078         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7079     }
   7080 
   7081     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
   7082 
   7083 cleanup_and_do_compare:
   7084     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
   7085         uprv_free(sourceKeyP);
   7086     }
   7087 
   7088     if(targetKeyP != NULL && targetKeyP != targetKey) {
   7089         uprv_free(targetKeyP);
   7090     }
   7091 
   7092     if(result<0) {
   7093         return UCOL_LESS;
   7094     } else if(result>0) {
   7095         return UCOL_GREATER;
   7096     } else {
   7097         return UCOL_EQUAL;
   7098     }
   7099 }
   7100 
   7101 
   7102 static UCollationResult
   7103 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
   7104 {
   7105     U_ALIGN_CODE(16);
   7106 
   7107     const UCollator *coll = sColl->coll;
   7108 
   7109 
   7110     // setting up the collator parameters
   7111     UColAttributeValue strength = coll->strength;
   7112     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
   7113 
   7114     UBool checkSecTer = initialCheckSecTer;
   7115     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
   7116     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
   7117     UBool checkIdent = (strength == UCOL_IDENTICAL);
   7118     UBool checkCase = (coll->caseLevel == UCOL_ON);
   7119     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
   7120     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
   7121     UBool qShifted = shifted && checkQuad;
   7122     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
   7123 
   7124     if(doHiragana && shifted) {
   7125         return (ucol_compareUsingSortKeys(sColl, tColl, status));
   7126     }
   7127     uint8_t caseSwitch = coll->caseSwitch;
   7128     uint8_t tertiaryMask = coll->tertiaryMask;
   7129 
   7130     // This is the lowest primary value that will not be ignored if shifted
   7131     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
   7132 
   7133     UCollationResult result = UCOL_EQUAL;
   7134     UCollationResult hirResult = UCOL_EQUAL;
   7135 
   7136     // Preparing the CE buffers. They will be filled during the primary phase
   7137     ucol_CEBuf   sCEs;
   7138     ucol_CEBuf   tCEs;
   7139     UCOL_INIT_CEBUF(&sCEs);
   7140     UCOL_INIT_CEBUF(&tCEs);
   7141 
   7142     uint32_t secS = 0, secT = 0;
   7143     uint32_t sOrder=0, tOrder=0;
   7144 
   7145     // Non shifted primary processing is quite simple
   7146     if(!shifted) {
   7147         for(;;) {
   7148 
   7149             // We fetch CEs until we hit a non ignorable primary or end.
   7150             do {
   7151                 // We get the next CE
   7152                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7153                 // Stuff it in the buffer
   7154                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7155                 // And keep just the primary part.
   7156                 sOrder &= UCOL_PRIMARYMASK;
   7157             } while(sOrder == 0);
   7158 
   7159             // see the comments on the above block
   7160             do {
   7161                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7162                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7163                 tOrder &= UCOL_PRIMARYMASK;
   7164             } while(tOrder == 0);
   7165 
   7166             // if both primaries are the same
   7167             if(sOrder == tOrder) {
   7168                 // and there are no more CEs, we advance to the next level
   7169                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7170                     break;
   7171                 }
   7172                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7173                     if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
   7174                         hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
   7175                             ? UCOL_LESS:UCOL_GREATER;
   7176                     }
   7177                 }
   7178             } else {
   7179                 // only need to check one for continuation
   7180                 // if one is then the other must be or the preceding CE would be a prefix of the other
   7181                 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
   7182                     sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
   7183                     tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
   7184                 }
   7185                 // if two primaries are different, we are done
   7186                 result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
   7187                 goto commonReturn;
   7188             }
   7189         } // no primary difference... do the rest from the buffers
   7190     } else { // shifted - do a slightly more complicated processing :)
   7191         for(;;) {
   7192             UBool sInShifted = FALSE;
   7193             UBool tInShifted = FALSE;
   7194             // This version of code can be refactored. However, it seems easier to understand this way.
   7195             // Source loop. Sam as the target loop.
   7196             for(;;) {
   7197                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7198                 if(sOrder == UCOL_NO_MORE_CES) {
   7199                     UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7200                     break;
   7201                 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
   7202                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7203                     continue;
   7204                 } else if(isContinuation(sOrder)) {
   7205                     if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7206                         if(sInShifted) {
   7207                             sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7208                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7209                             continue;
   7210                         } else {
   7211                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7212                             break;
   7213                         }
   7214                     } else { /* Just lower level values */
   7215                         if(sInShifted) {
   7216                             continue;
   7217                         } else {
   7218                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7219                             continue;
   7220                         }
   7221                     }
   7222                 } else { /* regular */
   7223                     if(coll->leadBytePermutationTable != NULL){
   7224                         sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
   7225                     }
   7226                     if((sOrder & UCOL_PRIMARYMASK) > LVT) {
   7227                         UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7228                         break;
   7229                     } else {
   7230                         if((sOrder & UCOL_PRIMARYMASK) > 0) {
   7231                             sInShifted = TRUE;
   7232                             sOrder &= UCOL_PRIMARYMASK;
   7233                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7234                             continue;
   7235                         } else {
   7236                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7237                             sInShifted = FALSE;
   7238                             continue;
   7239                         }
   7240                     }
   7241                 }
   7242             }
   7243             sOrder &= UCOL_PRIMARYMASK;
   7244             sInShifted = FALSE;
   7245 
   7246             for(;;) {
   7247                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7248                 if(tOrder == UCOL_NO_MORE_CES) {
   7249                     UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7250                     break;
   7251                 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
   7252                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7253                     continue;
   7254                 } else if(isContinuation(tOrder)) {
   7255                     if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7256                         if(tInShifted) {
   7257                             tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7258                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7259                             continue;
   7260                         } else {
   7261                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7262                             break;
   7263                         }
   7264                     } else { /* Just lower level values */
   7265                         if(tInShifted) {
   7266                             continue;
   7267                         } else {
   7268                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7269                             continue;
   7270                         }
   7271                     }
   7272                 } else { /* regular */
   7273                     if(coll->leadBytePermutationTable != NULL){
   7274                         tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
   7275                     }
   7276                     if((tOrder & UCOL_PRIMARYMASK) > LVT) {
   7277                         UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7278                         break;
   7279                     } else {
   7280                         if((tOrder & UCOL_PRIMARYMASK) > 0) {
   7281                             tInShifted = TRUE;
   7282                             tOrder &= UCOL_PRIMARYMASK;
   7283                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7284                             continue;
   7285                         } else {
   7286                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7287                             tInShifted = FALSE;
   7288                             continue;
   7289                         }
   7290                     }
   7291                 }
   7292             }
   7293             tOrder &= UCOL_PRIMARYMASK;
   7294             tInShifted = FALSE;
   7295 
   7296             if(sOrder == tOrder) {
   7297                 /*
   7298                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7299                 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
   7300                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
   7301                 ? UCOL_LESS:UCOL_GREATER;
   7302                 }
   7303                 }
   7304                 */
   7305                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7306                     break;
   7307                 } else {
   7308                     sOrder = 0;
   7309                     tOrder = 0;
   7310                     continue;
   7311                 }
   7312             } else {
   7313                 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
   7314                 goto commonReturn;
   7315             }
   7316         } /* no primary difference... do the rest from the buffers */
   7317     }
   7318 
   7319     /* now, we're gonna reexamine collected CEs */
   7320     uint32_t    *sCE;
   7321     uint32_t    *tCE;
   7322 
   7323     /* This is the secondary level of comparison */
   7324     if(checkSecTer) {
   7325         if(!isFrenchSec) { /* normal */
   7326             sCE = sCEs.buf;
   7327             tCE = tCEs.buf;
   7328             for(;;) {
   7329                 while (secS == 0) {
   7330                     secS = *(sCE++) & UCOL_SECONDARYMASK;
   7331                 }
   7332 
   7333                 while(secT == 0) {
   7334                     secT = *(tCE++) & UCOL_SECONDARYMASK;
   7335                 }
   7336 
   7337                 if(secS == secT) {
   7338                     if(secS == UCOL_NO_MORE_CES_SECONDARY) {
   7339                         break;
   7340                     } else {
   7341                         secS = 0; secT = 0;
   7342                         continue;
   7343                     }
   7344                 } else {
   7345                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7346                     goto commonReturn;
   7347                 }
   7348             }
   7349         } else { /* do the French */
   7350             uint32_t *sCESave = NULL;
   7351             uint32_t *tCESave = NULL;
   7352             sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
   7353             tCE = tCEs.pos-2;
   7354             for(;;) {
   7355                 while (secS == 0 && sCE >= sCEs.buf) {
   7356                     if(sCESave == NULL) {
   7357                         secS = *(sCE--);
   7358                         if(isContinuation(secS)) {
   7359                             while(isContinuation(secS = *(sCE--)))
   7360                                 ;
   7361                             /* after this, secS has the start of continuation, and sCEs points before that */
   7362                             sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7363                             sCE+=2;  /* need to point to the first continuation CP */
   7364                             /* However, now you can just continue doing stuff */
   7365                         }
   7366                     } else {
   7367                         secS = *(sCE++);
   7368                         if(!isContinuation(secS)) { /* This means we have finished with this cont */
   7369                             sCE = sCESave;            /* reset the pointer to before continuation */
   7370                             sCESave = NULL;
   7371                             secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
   7372                             continue;
   7373                         }
   7374                     }
   7375                     secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7376                 }
   7377 
   7378                 while(secT == 0 && tCE >= tCEs.buf) {
   7379                     if(tCESave == NULL) {
   7380                         secT = *(tCE--);
   7381                         if(isContinuation(secT)) {
   7382                             while(isContinuation(secT = *(tCE--)))
   7383                                 ;
   7384                             /* after this, secS has the start of continuation, and sCEs points before that */
   7385                             tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7386                             tCE+=2;  /* need to point to the first continuation CP */
   7387                             /* However, now you can just continue doing stuff */
   7388                         }
   7389                     } else {
   7390                         secT = *(tCE++);
   7391                         if(!isContinuation(secT)) { /* This means we have finished with this cont */
   7392                             tCE = tCESave;          /* reset the pointer to before continuation */
   7393                             tCESave = NULL;
   7394                             secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
   7395                             continue;
   7396                         }
   7397                     }
   7398                     secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7399                 }
   7400 
   7401                 if(secS == secT) {
   7402                     if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
   7403                         break;
   7404                     } else {
   7405                         secS = 0; secT = 0;
   7406                         continue;
   7407                     }
   7408                 } else {
   7409                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7410                     goto commonReturn;
   7411                 }
   7412             }
   7413         }
   7414     }
   7415 
   7416     /* doing the case bit */
   7417     if(checkCase) {
   7418         sCE = sCEs.buf;
   7419         tCE = tCEs.buf;
   7420         for(;;) {
   7421             while((secS & UCOL_REMOVE_CASE) == 0) {
   7422                 if(!isContinuation(*sCE++)) {
   7423                     secS =*(sCE-1);
   7424                     if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7425                         // primary ignorables should not be considered on the case level when the strength is primary
   7426                         // otherwise, the CEs stop being well-formed
   7427                         secS &= UCOL_TERT_CASE_MASK;
   7428                         secS ^= caseSwitch;
   7429                     } else {
   7430                         secS = 0;
   7431                     }
   7432                 } else {
   7433                     secS = 0;
   7434                 }
   7435             }
   7436 
   7437             while((secT & UCOL_REMOVE_CASE) == 0) {
   7438                 if(!isContinuation(*tCE++)) {
   7439                     secT = *(tCE-1);
   7440                     if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7441                         // primary ignorables should not be considered on the case level when the strength is primary
   7442                         // otherwise, the CEs stop being well-formed
   7443                         secT &= UCOL_TERT_CASE_MASK;
   7444                         secT ^= caseSwitch;
   7445                     } else {
   7446                         secT = 0;
   7447                     }
   7448                 } else {
   7449                     secT = 0;
   7450                 }
   7451             }
   7452 
   7453             if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
   7454                 result = UCOL_LESS;
   7455                 goto commonReturn;
   7456             } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
   7457                 result = UCOL_GREATER;
   7458                 goto commonReturn;
   7459             }
   7460 
   7461             if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
   7462                 break;
   7463             } else {
   7464                 secS = 0;
   7465                 secT = 0;
   7466             }
   7467         }
   7468     }
   7469 
   7470     /* Tertiary level */
   7471     if(checkTertiary) {
   7472         secS = 0;
   7473         secT = 0;
   7474         sCE = sCEs.buf;
   7475         tCE = tCEs.buf;
   7476         for(;;) {
   7477             while((secS & UCOL_REMOVE_CASE) == 0) {
   7478                 secS = *(sCE++) & tertiaryMask;
   7479                 if(!isContinuation(secS)) {
   7480                     secS ^= caseSwitch;
   7481                 } else {
   7482                     secS &= UCOL_REMOVE_CASE;
   7483                 }
   7484             }
   7485 
   7486             while((secT & UCOL_REMOVE_CASE)  == 0) {
   7487                 secT = *(tCE++) & tertiaryMask;
   7488                 if(!isContinuation(secT)) {
   7489                     secT ^= caseSwitch;
   7490                 } else {
   7491                     secT &= UCOL_REMOVE_CASE;
   7492                 }
   7493             }
   7494 
   7495             if(secS == secT) {
   7496                 if((secS & UCOL_REMOVE_CASE) == 1) {
   7497                     break;
   7498                 } else {
   7499                     secS = 0; secT = 0;
   7500                     continue;
   7501                 }
   7502             } else {
   7503                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7504                 goto commonReturn;
   7505             }
   7506         }
   7507     }
   7508 
   7509 
   7510     if(qShifted /*checkQuad*/) {
   7511         UBool sInShifted = TRUE;
   7512         UBool tInShifted = TRUE;
   7513         secS = 0;
   7514         secT = 0;
   7515         sCE = sCEs.buf;
   7516         tCE = tCEs.buf;
   7517         for(;;) {
   7518             while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
   7519                 secS = *(sCE++);
   7520                 if(isContinuation(secS)) {
   7521                     if(!sInShifted) {
   7522                         continue;
   7523                     }
   7524                 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
   7525                     secS = UCOL_PRIMARYMASK;
   7526                     sInShifted = FALSE;
   7527                 } else {
   7528                     sInShifted = TRUE;
   7529                 }
   7530             }
   7531             secS &= UCOL_PRIMARYMASK;
   7532 
   7533 
   7534             while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
   7535                 secT = *(tCE++);
   7536                 if(isContinuation(secT)) {
   7537                     if(!tInShifted) {
   7538                         continue;
   7539                     }
   7540                 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
   7541                     secT = UCOL_PRIMARYMASK;
   7542                     tInShifted = FALSE;
   7543                 } else {
   7544                     tInShifted = TRUE;
   7545                 }
   7546             }
   7547             secT &= UCOL_PRIMARYMASK;
   7548 
   7549             if(secS == secT) {
   7550                 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
   7551                     break;
   7552                 } else {
   7553                     secS = 0; secT = 0;
   7554                     continue;
   7555                 }
   7556             } else {
   7557                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7558                 goto commonReturn;
   7559             }
   7560         }
   7561     } else if(doHiragana && hirResult != UCOL_EQUAL) {
   7562         // If we're fine on quaternaries, we might be different
   7563         // on Hiragana. This, however, might fail us in shifted.
   7564         result = hirResult;
   7565         goto commonReturn;
   7566     }
   7567 
   7568     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
   7569     /*  as a tiebreaker if all else is equal.                                */
   7570     /*  Getting here  should be quite rare - strings are not identical -     */
   7571     /*     that is checked first, but compared == through all other checks.  */
   7572     if(checkIdent)
   7573     {
   7574         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
   7575         result = ucol_checkIdent(sColl, tColl, TRUE, status);
   7576     }
   7577 
   7578 commonReturn:
   7579     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
   7580         if (sCEs.buf != sCEs.localArray ) {
   7581             uprv_free(sCEs.buf);
   7582         }
   7583         if (tCEs.buf != tCEs.localArray ) {
   7584             uprv_free(tCEs.buf);
   7585         }
   7586     }
   7587 
   7588     return result;
   7589 }
   7590 
   7591 static UCollationResult
   7592 ucol_strcollRegular(const UCollator *coll,
   7593                     const UChar *source, int32_t sourceLength,
   7594                     const UChar *target, int32_t targetLength,
   7595                     UErrorCode *status) {
   7596     /* String-based entry point: wraps each UChar string in a collIterate and
   7597      * delegates to the iterator-based overload; yields UCOL_LESS if setup failed. */
   7598     collIterate sourceIter;
   7599     collIterate targetIter;
   7600     IInit_collIterate(coll, source, sourceLength, &sourceIter, status);
   7601     IInit_collIterate(coll, target, targetLength, &targetIter, status);
   7602     return U_FAILURE(*status) ? UCOL_LESS
   7603                               : ucol_strcollRegular(&sourceIter, &targetIter, status);
   7604 }
   7605 
   7606 static inline uint32_t
   7607 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
   7608                           uint32_t CE, const UChar *s, int32_t *index, int32_t len)
   7609 {
   7610     /*
   7611      * Resolves a contraction special CE for the Latin-1 fast path.
   7612      * The low 12 bits of CE (through getContractOffset) locate the list of
   7613      * candidate following code units in coll->image; bits 12..23 give the base
   7614      * slot in coll->latinOneCEs, offset by strength*coll->latinOneTableLen.
   7615      * At most one code unit of s is consumed as the contraction suffix, and
   7616      * *index is advanced past everything used, including skipped code units
   7617      * whose trie value is 0. len == -1 marks s as NUL-terminated.
   7618      * Returns UCOL_BAIL_OUT_CE when the next code unit completes no contraction
   7619      * and lies above 0xFF.
   7620      */
   7621     const UChar *suffixes = (UChar *)coll->image+getContractOffset(CE&0xFFF);
   7622     const int32_t baseSlot = (CE & 0x00FFF000) >> 12;
   7623     const uint32_t *ces = coll->latinOneCEs + (strength*coll->latinOneTableLen+baseSlot);
   7624     int32_t suffixPos = 1;   // kept across iterations; the list is expected to be ordered
   7625 
   7626     for(;;) {
   7627         // Out of input: use the CE stored for the starting character on its own.
   7628         if((len == -1) ? (s[*index] == 0) : (*index == len)) {
   7629             return ces[0];
   7630         }
   7631         const UChar unit = s[*index];
   7632 
   7633         // Skip listed code units that are smaller than the one in the text.
   7634         UChar candidate = suffixes[suffixPos];
   7635         while(unit > candidate) {
   7636             candidate = suffixes[++suffixPos];
   7637         }
   7638 
   7639         if(unit == candidate) {   // contraction found: consume its second code unit
   7640             (*index)++;
   7641             return ces[suffixPos];
   7642         }
   7643 
   7644         if(unit & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
   7645             return UCOL_BAIL_OUT_CE;   // not Latin-1
   7646         }
   7647         if(UTRIE_GET32_FROM_LEAD(&coll->mapping, unit) != 0) {
   7648             return ces[0];   // something else follows, so no contraction applies
   7649         }
   7650         // Trie value 0 means completely ignorable: step over it and try again.
   7651         (*index)++;
   7652     }
   7653 }
   7654 
   7655 
   7656 /**
   7657  * Fast strcoll for text that is entirely Latin-1 (code units <= 0xFF).
   7658  * Works level by level (primary, then secondary, then tertiary, as far as
   7659  * coll->strength requires) using the per-level tables in coll->latinOneCEs,
   7660  * which are built by setupLatin1Table. Handles contractions of size two,
   7661  * French secondaries and case switching; not shifted or case level.
   7662  * Anything it does not understand (non-Latin-1 input, unexpected specials)
   7663  * is handed to ucol_strcollRegular. sLen/tLen of -1 mean NUL-terminated.
   7664  */
   7665 static UCollationResult
   7666 ucol_strcollUseLatin1( const UCollator    *coll,
   7667               const UChar        *source,
   7668               int32_t            sLen,
   7669               const UChar        *target,
   7670               int32_t            tLen,
   7671               UErrorCode *status)
   7672 {
   7673     U_ALIGN_CODE(16);
   7674     int32_t strength = coll->strength;
   7675 
   7676     int32_t sIndex = 0, tIndex = 0;
   7677     UChar sChar = 0, tChar = 0;
   7678     uint32_t sOrder=0, tOrder=0;
   7679 
   7680     UBool endOfSource = FALSE;
   7681 
   7682     uint32_t *elements = coll->latinOneCEs; // table for the current level; advanced by latinOneTableLen per level below
   7683 
   7684     UBool haveContractions = FALSE; // if we have contractions in our string
   7685                                     // we cannot do French secondary
   7686 
   7687     // Do the primary level
   7688     for(;;) {
   7689         while(sOrder==0) { // this loop skips primary ignorables
   7690             // sOrder=getNextlatinOneCE(source);
   7691             if(sLen==-1) {   // handling zero terminated strings
   7692                 sChar=source[sIndex++];
   7693                 if(sChar==0) {
   7694                     endOfSource = TRUE;
   7695                     break;
   7696                 }
   7697             } else {        // handling strings with known length
   7698                 if(sIndex==sLen) {
   7699                     endOfSource = TRUE;
   7700                     break;
   7701                 }
   7702                 sChar=source[sIndex++];
   7703             }
   7704             if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   7705                 //fprintf(stderr, "R");
   7706                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7707             }
   7708             sOrder = elements[sChar];
   7709             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
   7710                 // specials can basically be either contractions or bail-out signs. If we get anything
   7711                 // else, we'll bail out anyway
   7712                 if(getCETag(sOrder) == CONTRACTION_TAG) {
   7713                     sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
   7714                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
   7715                     // However, if there are contractions in the table, but we always use just one char,
   7716                     // we might be able to do French. This should be checked out.
   7717                 }
   7718                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   7719                     //fprintf(stderr, "S");
   7720                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7721                 }
   7722             }
   7723         }
   7724 
   7725         while(tOrder==0) {  // this loop skips primary ignorables
   7726             // tOrder=getNextlatinOneCE(target);
   7727             if(tLen==-1) {    // handling zero terminated strings
   7728                 tChar=target[tIndex++];
   7729                 if(tChar==0) {
   7730                     if(endOfSource) { // this is different than source loop,
   7731                         // as we already know that source loop is done here,
   7732                         // so we can either finish the primary loop if both
   7733                         // strings are done or announce the result if only
   7734                         // target is done. Same below.
   7735                         goto endOfPrimLoop;
   7736                     } else {
   7737                         return UCOL_GREATER;
   7738                     }
   7739                 }
   7740             } else {          // handling strings with known length
   7741                 if(tIndex==tLen) {
   7742                     if(endOfSource) {
   7743                         goto endOfPrimLoop;
   7744                     } else {
   7745                         return UCOL_GREATER;
   7746                     }
   7747                 }
   7748                 tChar=target[tIndex++];
   7749             }
   7750             if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   7751                 //fprintf(stderr, "R");
   7752                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7753             }
   7754             tOrder = elements[tChar];
   7755             if(tOrder >= UCOL_NOT_FOUND) {
   7756                 // Handling specials, see the comments for source
   7757                 if(getCETag(tOrder) == CONTRACTION_TAG) {
   7758                     tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
   7759                     haveContractions = TRUE;
   7760                 }
   7761                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   7762                     //fprintf(stderr, "S");
   7763                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7764                 }
   7765             }
   7766         }
   7767         if(endOfSource) { // source is finished, but target is not, say the result.
   7768             return UCOL_LESS;
   7769         }
   7770 
   7771         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
   7772             sOrder = 0; tOrder = 0;
   7773             continue;
   7774         } else {
   7775             // compare current top bytes
   7776             if(((sOrder^tOrder)&0xFF000000)!=0) {
   7777                 // top bytes differ, return difference
   7778                 if(sOrder < tOrder) {
   7779                     return UCOL_LESS;
   7780                 } else if(sOrder > tOrder) {
   7781                     return UCOL_GREATER;
   7782                 }
   7783                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
   7784                 // since we must return enum value
   7785             }
   7786 
   7787             // top bytes match, continue with following bytes
   7788             sOrder<<=8;
   7789             tOrder<<=8;
   7790         }
   7791     }
   7792 
   7793 endOfPrimLoop:
   7794     // after the primary loop we know the lengths of both strings, so we record them and use
   7795     // simpler loops below (NOTE(review): for NUL-terminated input the terminator is counted too - confirm)
   7796     sLen = sIndex; tLen = tIndex;
   7797     if(strength >= UCOL_SECONDARY) {
   7798         // move on to the secondary-level table
   7799         elements += coll->latinOneTableLen;
   7800         endOfSource = FALSE;
   7801 
   7802         if(coll->frenchCollation == UCOL_OFF) { // non French
   7803             // This loop is a simplified copy of primary loop
   7804             // at this point we know that whole strings are latin-1, so we don't
   7805             // check for that. We also know that we only have contractions as
   7806             // specials.
   7807             sIndex = 0; tIndex = 0;
   7808             for(;;) {
   7809                 while(sOrder==0) {
   7810                     if(sIndex==sLen) {
   7811                         endOfSource = TRUE;
   7812                         break;
   7813                     }
   7814                     sChar=source[sIndex++];
   7815                     sOrder = elements[sChar];
   7816                     if(sOrder > UCOL_NOT_FOUND) {
   7817                         sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
   7818                     }
   7819                 }
   7820 
   7821                 while(tOrder==0) {
   7822                     if(tIndex==tLen) {
   7823                         if(endOfSource) {
   7824                             goto endOfSecLoop;
   7825                         } else {
   7826                             return UCOL_GREATER;
   7827                         }
   7828                     }
   7829                     tChar=target[tIndex++];
   7830                     tOrder = elements[tChar];
   7831                     if(tOrder > UCOL_NOT_FOUND) {
   7832                         tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
   7833                     }
   7834                 }
   7835                 if(endOfSource) {
   7836                     return UCOL_LESS;
   7837                 }
   7838 
   7839                 if(sOrder == tOrder) {
   7840                     sOrder = 0; tOrder = 0;
   7841                     continue;
   7842                 } else {
   7843                     // see primary loop for comments on this
   7844                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   7845                         if(sOrder < tOrder) {
   7846                             return UCOL_LESS;
   7847                         } else if(sOrder > tOrder) {
   7848                             return UCOL_GREATER;
   7849                         }
   7850                     }
   7851                     sOrder<<=8;
   7852                     tOrder<<=8;
   7853                 }
   7854             }
   7855         } else { // French
   7856             if(haveContractions) { // if we have contractions, we have to bail out
   7857                 // since we don't really know how to handle them here
   7858                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7859             }
   7860             // For French, we go backwards
   7861             sIndex = sLen; tIndex = tLen;
   7862             for(;;) {
   7863                 while(sOrder==0) {
   7864                     if(sIndex==0) {
   7865                         endOfSource = TRUE;
   7866                         break;
   7867                     }
   7868                     sChar=source[--sIndex];
   7869                     sOrder = elements[sChar];
   7870                     // don't even look for contractions
   7871                 }
   7872 
   7873                 while(tOrder==0) {
   7874                     if(tIndex==0) {
   7875                         if(endOfSource) {
   7876                             goto endOfSecLoop;
   7877                         } else {
   7878                             return UCOL_GREATER;
   7879                         }
   7880                     }
   7881                     tChar=target[--tIndex];
   7882                     tOrder = elements[tChar];
   7883                     // don't even look for contractions
   7884                 }
   7885                 if(endOfSource) {
   7886                     return UCOL_LESS;
   7887                 }
   7888 
   7889                 if(sOrder == tOrder) {
   7890                     sOrder = 0; tOrder = 0;
   7891                     continue;
   7892                 } else {
   7893                     // see the primary loop for comments
   7894                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   7895                         if(sOrder < tOrder) {
   7896                             return UCOL_LESS;
   7897                         } else if(sOrder > tOrder) {
   7898                             return UCOL_GREATER;
   7899                         }
   7900                     }
   7901                     sOrder<<=8;
   7902                     tOrder<<=8;
   7903                 }
   7904             }
   7905         }
   7906     }
   7907 
   7908 endOfSecLoop:
   7909     if(strength >= UCOL_TERTIARY) {
   7910         // tertiary loop is the same as secondary (except no French)
   7911         elements += coll->latinOneTableLen; // move on to the tertiary-level table
   7912         sIndex = 0; tIndex = 0;
   7913         endOfSource = FALSE;
   7914         for(;;) {
   7915             while(sOrder==0) {
   7916                 if(sIndex==sLen) {
   7917                     endOfSource = TRUE;
   7918                     break;
   7919                 }
   7920                 sChar=source[sIndex++];
   7921                 sOrder = elements[sChar];
   7922                 if(sOrder > UCOL_NOT_FOUND) {
   7923                     sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
   7924                 }
   7925             }
   7926             while(tOrder==0) {
   7927                 if(tIndex==tLen) {
   7928                     if(endOfSource) {
   7929                         return UCOL_EQUAL; // if both strings are at the end, they are equal
   7930                     } else {
   7931                         return UCOL_GREATER;
   7932                     }
   7933                 }
   7934                 tChar=target[tIndex++];
   7935                 tOrder = elements[tChar];
   7936                 if(tOrder > UCOL_NOT_FOUND) {
   7937                     tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
   7938                 }
   7939             }
   7940             if(endOfSource) {
   7941                 return UCOL_LESS;
   7942             }
   7943             if(sOrder == tOrder) {
   7944                 sOrder = 0; tOrder = 0;
   7945                 continue;
   7946             } else {
   7947                 if(((sOrder^tOrder)&0xff000000)!=0) {
   7948                     if(sOrder < tOrder) {
   7949                         return UCOL_LESS;
   7950                     } else if(sOrder > tOrder) {
   7951                         return UCOL_GREATER;
   7952                     }
   7953                 }
   7954                 sOrder<<=8;
   7955                 tOrder<<=8;
   7956             }
   7957         }
   7958     }
   7959     return UCOL_EQUAL; // reached only when strength is below tertiary; the tertiary loop returns on its own
   7960 }
   7961 
   7962 
   7963 U_CAPI UCollationResult U_EXPORT2
   7964 ucol_strcollIter( const UCollator    *coll,
   7965                  UCharIterator *sIter,
   7966                  UCharIterator *tIter,
   7967                  UErrorCode         *status)
   7968 {
   7969     if(!status || U_FAILURE(*status)) {
   7970         return UCOL_EQUAL;
   7971     }
   7972 
   7973     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
   7974     UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
   7975 
   7976     if (sIter == tIter) {
   7977         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   7978         return UCOL_EQUAL;
   7979     }
   7980     if(sIter == NULL || tIter == NULL || coll == NULL) {
   7981         *status = U_ILLEGAL_ARGUMENT_ERROR;
   7982         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   7983         return UCOL_EQUAL;
   7984     }
   7985 
   7986     UCollationResult result = UCOL_EQUAL;
   7987 
   7988     // Preparing the context objects for iterating over strings
   7989     collIterate sColl, tColl;
   7990     IInit_collIterate(coll, NULL, -1, &sColl, status);
   7991     IInit_collIterate(coll, NULL, -1, &tColl, status);
   7992     if(U_FAILURE(*status)) {
   7993         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   7994         return UCOL_EQUAL;
   7995     }
   7996     // The division for the array length may truncate the array size to
   7997     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   7998     // for all platforms anyway.
   7999     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8000     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8001     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
   8002 
   8003     sColl.iterator = sIter;
   8004     sColl.flags |= UCOL_USE_ITERATOR;
   8005     tColl.flags |= UCOL_USE_ITERATOR;
   8006     tColl.iterator = tIter;
   8007 
   8008     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
   8009         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   8010         sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
   8011         sColl.flags &= ~UCOL_ITER_NORM;
   8012 
   8013         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   8014         tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
   8015         tColl.flags &= ~UCOL_ITER_NORM;
   8016     }
   8017 
   8018     UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
   8019 
   8020     while((sChar = sColl.iterator->next(sColl.iterator)) ==
   8021         (tChar = tColl.iterator->next(tColl.iterator))) {
   8022             if(sChar == U_SENTINEL) {
   8023                 result = UCOL_EQUAL;
   8024                 goto end_compare;
   8025             }
   8026     }
   8027 
   8028     if(sChar == U_SENTINEL) {
   8029         tChar = tColl.iterator->previous(tColl.iterator);
   8030     }
   8031 
   8032     if(tChar == U_SENTINEL) {
   8033         sChar = sColl.iterator->previous(sColl.iterator);
   8034     }
   8035 
   8036     sChar = sColl.iterator->previous(sColl.iterator);
   8037     tChar = tColl.iterator->previous(tColl.iterator);
   8038 
   8039     if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
   8040     {
   8041         // We are stopped in the middle of a contraction.
   8042         // Scan backwards through the == part of the string looking for the start of the contraction.
   8043         //   It doesn't matter which string we scan, since they are the same in this region.
   8044         do
   8045         {
   8046             sChar = sColl.iterator->previous(sColl.iterator);
   8047             tChar = tColl.iterator->previous(tColl.iterator);
   8048         }
   8049         while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
   8050     }
   8051 
   8052 
   8053     if(U_SUCCESS(*status)) {
   8054         result = ucol_strcollRegular(&sColl, &tColl, status);
   8055     }
   8056 
   8057 end_compare:
   8058     if(sNormIter || tNormIter) {
   8059         unorm_closeIter(sNormIter);
   8060         unorm_closeIter(tNormIter);
   8061     }
   8062 
   8063     UTRACE_EXIT_VALUE_STATUS(result, *status)
   8064     return result;
   8065 }
   8066 
   8067 
   8068 /*                                                                      */
   8069 /* ucol_strcoll     Main public API string comparison function          */
   8070 /*                                                                      */
   8071 U_CAPI UCollationResult U_EXPORT2
   8072 ucol_strcoll( const UCollator    *coll,
   8073               const UChar        *source,
   8074               int32_t            sourceLength,
   8075               const UChar        *target,
   8076               int32_t            targetLength)
   8077 {
   8078     U_ALIGN_CODE(16);
   8079 
   8080     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
   8081     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   8082         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
   8083         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
   8084         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
   8085     }
   8086 
   8087     if(source == NULL || target == NULL) {
   8088         // do not crash, but return. Should have
   8089         // status argument to return error.
   8090         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8091         return UCOL_EQUAL;
   8092     }
   8093 
   8094     /* Quick check if source and target are same strings. */
   8095     /* They should either both be NULL terminated or the explicit length should be set on both. */
   8096     if (source==target && sourceLength==targetLength) {
   8097         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8098         return UCOL_EQUAL;
   8099     }
   8100 
   8101     /* Scan the strings.  Find:                                                             */
   8102     /*    The length of any leading portion that is equal                                   */
   8103     /*    Whether they are exactly equal.  (in which case we just return)                   */
   8104     const UChar    *pSrc    = source;
   8105     const UChar    *pTarg   = target;
   8106     int32_t        equalLength;
   8107 
   8108     if (sourceLength == -1 && targetLength == -1) {
   8109         // Both strings are null terminated.
   8110         //    Scan through any leading equal portion.
   8111         while (*pSrc == *pTarg && *pSrc != 0) {
   8112             pSrc++;
   8113             pTarg++;
   8114         }
   8115         if (*pSrc == 0 && *pTarg == 0) {
   8116             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8117             return UCOL_EQUAL;
   8118         }
   8119         equalLength = (int32_t)(pSrc - source);
   8120     }
   8121     else
   8122     {
   8123         // One or both strings has an explicit length.
   8124         const UChar    *pSrcEnd = source + sourceLength;
   8125         const UChar    *pTargEnd = target + targetLength;
   8126 
   8127         // Scan while the strings are bitwise ==, or until one is exhausted.
   8128         for (;;) {
   8129             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
   8130                 break;
   8131             }
   8132             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
   8133                 break;
   8134             }
   8135             if (*pSrc != *pTarg) {
   8136                 break;
   8137             }
   8138             pSrc++;
   8139             pTarg++;
   8140         }
   8141         equalLength = (int32_t)(pSrc - source);
   8142 
   8143         // If we made it all the way through both strings, we are done.  They are ==
   8144         if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
   8145             (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
   8146         {
   8147             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8148             return UCOL_EQUAL;
   8149         }
   8150     }
   8151     if (equalLength > 0) {
   8152         /* There is an identical portion at the beginning of the two strings.        */
   8153         /*   If the identical portion ends within a contraction or a comibining      */
   8154         /*   character sequence, back up to the start of that sequence.              */
   8155 
   8156         // These values should already be set by the code above.
   8157         //pSrc  = source + equalLength;        /* point to the first differing chars   */
   8158         //pTarg = target + equalLength;
   8159         if ((pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
   8160             (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
   8161         {
   8162             // We are stopped in the middle of a contraction.
   8163             // Scan backwards through the == part of the string looking for the start of the contraction.
   8164             //   It doesn't matter which string we scan, since they are the same in this region.
   8165             do
   8166             {
   8167                 equalLength--;
   8168                 pSrc--;
   8169             }
   8170             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
   8171         }
   8172 
   8173         source += equalLength;
   8174         target += equalLength;
   8175         if (sourceLength > 0) {
   8176             sourceLength -= equalLength;
   8177         }
   8178         if (targetLength > 0) {
   8179             targetLength -= equalLength;
   8180         }
   8181     }
   8182 
   8183     UErrorCode status = U_ZERO_ERROR;
   8184     UCollationResult returnVal;
   8185     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
   8186         returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
   8187     } else {
   8188         returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
   8189     }
   8190     UTRACE_EXIT_VALUE(returnVal);
   8191     return returnVal;
   8192 }
   8193 
   8194 /* convenience function for comparing strings */
   8195 U_CAPI UBool U_EXPORT2
   8196 ucol_greater(    const    UCollator        *coll,
   8197         const    UChar            *source,
   8198         int32_t            sourceLength,
   8199         const    UChar            *target,
   8200         int32_t            targetLength)
   8201 {
   8202     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8203         == UCOL_GREATER);
   8204 }
   8205 
   8206 /* convenience function for comparing strings */
   8207 U_CAPI UBool U_EXPORT2
   8208 ucol_greaterOrEqual(    const    UCollator    *coll,
   8209             const    UChar        *source,
   8210             int32_t        sourceLength,
   8211             const    UChar        *target,
   8212             int32_t        targetLength)
   8213 {
   8214     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8215         != UCOL_LESS);
   8216 }
   8217 
   8218 /* convenience function for comparing strings */
   8219 U_CAPI UBool U_EXPORT2
   8220 ucol_equal(        const    UCollator        *coll,
   8221             const    UChar            *source,
   8222             int32_t            sourceLength,
   8223             const    UChar            *target,
   8224             int32_t            targetLength)
   8225 {
   8226     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8227         == UCOL_EQUAL);
   8228 }
   8229 
   8230 U_CAPI void U_EXPORT2
   8231 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
   8232     if(coll && coll->UCA) {
   8233         uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
   8234     }
   8235 }
   8236 
   8237 #endif /* #if !UCONFIG_NO_COLLATION */
   8238