Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 1996-2013, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ucol.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 * Modification history
     12 * Date        Name      Comments
     13 * 1996-1999   various members of ICU team maintained C API for collation framework
     14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
     15 * 03/01/2001  synwee    Added maxexpansion functionality.
     16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_COLLATION
     22 
     23 #include "unicode/bytestream.h"
     24 #include "unicode/coleitr.h"
     25 #include "unicode/unorm.h"
     26 #include "unicode/udata.h"
     27 #include "unicode/ustring.h"
     28 #include "unicode/utf8.h"
     29 
     30 #include "ucol_imp.h"
     31 #include "bocsu.h"
     32 
     33 #include "normalizer2impl.h"
     34 #include "unorm_it.h"
     35 #include "umutex.h"
     36 #include "cmemory.h"
     37 #include "ucln_in.h"
     38 #include "cstring.h"
     39 #include "utracimp.h"
     40 #include "putilimp.h"
     41 #include "uassert.h"
     42 #include "unicode/coll.h"
     43 
     44 #ifdef UCOL_DEBUG
     45 #include <stdio.h>
     46 #endif
     47 
     48 U_NAMESPACE_USE
     49 
/* Number of elements of a true array (not a pointer), converted to int32_t. */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

/* Mask that keeps only the lowest 8 bits of a value. */
#define LAST_BYTE_MASK_           0xFF
/* Bit width of one byte; presumably the shift that reaches the second-lowest
   byte of a value -- its users are outside this section, TODO confirm. */
#define SECOND_LAST_BYTE_SHIFT_   8

/* NOTE(review): presumably the limit below which code units are assumed to have
   combining class zero; its users are outside this section -- confirm. */
#define ZERO_CC_LIMIT_            0xC0
     56 
// These are static pointers to the NFC/NFD implementation instance.
// Each of them is always the same between calls to u_cleanup
// and therefore writing to it is not synchronized.
// They are reset to NULL in ucol_cleanup and filled in on demand by
// initializeNFD() and initializeFCD().
static const Normalizer2 *g_nfd = NULL;
static const Normalizer2Impl *g_nfcImpl = NULL;

// These are values from UCA required for
// implicit generation and suppressing sort key compression.
// They should regularly be in the UCA, but if one
// is running without UCA, it could be a problem.
static const int32_t maxRegularPrimary  = 0x7A;
static const int32_t minImplicitPrimary = 0xE0;
static const int32_t maxImplicitPrimary = 0xE4;
     71 
     72 U_CDECL_BEGIN
     73 static UBool U_CALLCONV
     74 ucol_cleanup(void)
     75 {
     76     g_nfd = NULL;
     77     g_nfcImpl = NULL;
     78     return TRUE;
     79 }
     80 
     81 static int32_t U_CALLCONV
     82 _getFoldingOffset(uint32_t data) {
     83     return (int32_t)(data&0xFFFFFF);
     84 }
     85 
     86 U_CDECL_END
     87 
     88 static inline
     89 UBool initializeNFD(UErrorCode *status) {
     90     if (g_nfd != NULL) {
     91         return TRUE;
     92     } else {
     93         // The result is constant, until the library is reloaded.
     94         g_nfd = Normalizer2Factory::getNFDInstance(*status);
     95         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
     96         return U_SUCCESS(*status);
     97     }
     98 }
     99 
    100 // init FCD data
    101 static inline
    102 UBool initializeFCD(UErrorCode *status) {
    103     if (g_nfcImpl != NULL) {
    104         return TRUE;
    105     } else {
    106         // The result is constant, until the library is reloaded.
    107         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
    108         // Note: Alternatively, we could also store this pointer in each collIterate struct,
    109         // same as Normalizer2Factory::getImpl(collIterate->nfd).
    110         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
    111         return U_SUCCESS(*status);
    112     }
    113 }
    114 
    115 static
    116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
    117                               int32_t sourceLen, collIterate *s,
    118                               UErrorCode *status)
    119 {
    120     (s)->string = (s)->pos = sourceString;
    121     (s)->origFlags = 0;
    122     (s)->flags = 0;
    123     if (sourceLen >= 0) {
    124         s->flags |= UCOL_ITER_HASLEN;
    125         (s)->endp = (UChar *)sourceString+sourceLen;
    126     }
    127     else {
    128         /* change to enable easier checking for end of string for fcdpositon */
    129         (s)->endp = NULL;
    130     }
    131     (s)->extendCEs = NULL;
    132     (s)->extendCEsSize = 0;
    133     (s)->CEpos = (s)->toReturn = (s)->CEs;
    134     (s)->offsetBuffer = NULL;
    135     (s)->offsetBufferSize = 0;
    136     (s)->offsetReturn = (s)->offsetStore = NULL;
    137     (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
    138     (s)->coll = (collator);
    139     if (initializeNFD(status)) {
    140         (s)->nfd = g_nfd;
    141     } else {
    142         return;
    143     }
    144     (s)->fcdPosition = 0;
    145     if(collator->normalizationMode == UCOL_ON) {
    146         (s)->flags |= UCOL_ITER_NORM;
    147     }
    148     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
    149         (s)->flags |= UCOL_HIRAGANA_Q;
    150     }
    151     (s)->iterator = NULL;
    152     //(s)->iteratorIndex = 0;
    153 }
    154 
/**
 * Exported wrapper around the file-local inline IInit_collIterate().
 * All arguments are forwarded unchanged.
 * @param collator     collator that supplies the iteration settings
 * @param sourceString start of the text to iterate over
 * @param sourceLen    length of the text, or negative if NUL-terminated
 * @param s            iterator state to initialize
 * @param status       error code, as set by IInit_collIterate()
 */
U_CAPI void  U_EXPORT2
uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
                             int32_t sourceLen, collIterate *s,
                             UErrorCode *status) {
    /* Out-of-line version for use from other files. */
    IInit_collIterate(collator, sourceString, sourceLen, s, status);
}
    162 
    163 U_CAPI collIterate * U_EXPORT2
    164 uprv_new_collIterate(UErrorCode *status) {
    165     if(U_FAILURE(*status)) {
    166         return NULL;
    167     }
    168     collIterate *s = new collIterate;
    169     if(s == NULL) {
    170         *status = U_MEMORY_ALLOCATION_ERROR;
    171         return NULL;
    172     }
    173     return s;
    174 }
    175 
/**
 * Destroys a collIterate with the C++ delete operator; the counterpart of
 * uprv_new_collIterate().
 * @param s object to delete; deleting a NULL pointer has no effect
 */
U_CAPI void U_EXPORT2
uprv_delete_collIterate(collIterate *s) {
    delete s;
}
    180 
    181 U_CAPI UBool U_EXPORT2
    182 uprv_collIterateAtEnd(collIterate *s) {
    183     return s == NULL || s->pos == s->endp;
    184 }
    185 
    186 /**
    187 * Backup the state of the collIterate struct data
    188 * @param data collIterate to backup
    189 * @param backup storage
    190 */
    191 static
    192 inline void backupState(const collIterate *data, collIterateState *backup)
    193 {
    194     backup->fcdPosition = data->fcdPosition;
    195     backup->flags       = data->flags;
    196     backup->origFlags   = data->origFlags;
    197     backup->pos         = data->pos;
    198     backup->bufferaddress = data->writableBuffer.getBuffer();
    199     backup->buffersize    = data->writableBuffer.length();
    200     backup->iteratorMove = 0;
    201     backup->iteratorIndex = 0;
    202     if(data->iterator != NULL) {
    203         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
    204         backup->iteratorIndex = data->iterator->getState(data->iterator);
    205         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
    206         if(backup->iteratorIndex == UITER_NO_STATE) {
    207             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
    208                 backup->iteratorMove++;
    209                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
    210             }
    211             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    212         }
    213     }
    214 }
    215 
    216 /**
    217 * Loads the state into the collIterate struct data
    218 * @param data collIterate to backup
    219 * @param backup storage
    220 * @param forwards boolean to indicate if forwards iteration is used,
    221 *        false indicates backwards iteration
    222 */
    223 static
    224 inline void loadState(collIterate *data, const collIterateState *backup,
    225                       UBool        forwards)
    226 {
    227     UErrorCode status = U_ZERO_ERROR;
    228     data->flags       = backup->flags;
    229     data->origFlags   = backup->origFlags;
    230     if(data->iterator != NULL) {
    231         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
    232         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
    233         if(backup->iteratorMove != 0) {
    234             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
    235         }
    236     }
    237     data->pos         = backup->pos;
    238 
    239     if ((data->flags & UCOL_ITER_INNORMBUF) &&
    240         data->writableBuffer.getBuffer() != backup->bufferaddress) {
    241         /*
    242         this is when a new buffer has been reallocated and we'll have to
    243         calculate the new position.
    244         note the new buffer has to contain the contents of the old buffer.
    245         */
    246         if (forwards) {
    247             data->pos = data->writableBuffer.getTerminatedBuffer() +
    248                                          (data->pos - backup->bufferaddress);
    249         }
    250         else {
    251             /* backwards direction */
    252             int32_t temp = backup->buffersize -
    253                                   (int32_t)(data->pos - backup->bufferaddress);
    254             data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
    255         }
    256     }
    257     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
    258         /*
    259         this is alittle tricky.
    260         if we are initially not in the normalization buffer, even if we
    261         normalize in the later stage, the data in the buffer will be
    262         ignored, since we skip back up to the data string.
    263         however if we are already in the normalization buffer, any
    264         further normalization will pull data into the normalization
    265         buffer and modify the fcdPosition.
    266         since we are keeping the data in the buffer for use, the
    267         fcdPosition can not be reverted back.
    268         arrgghh....
    269         */
    270         data->fcdPosition = backup->fcdPosition;
    271     }
    272 }
    273 
    274 static UBool
    275 reallocCEs(collIterate *data, int32_t newCapacity) {
    276     uint32_t *oldCEs = data->extendCEs;
    277     if(oldCEs == NULL) {
    278         oldCEs = data->CEs;
    279     }
    280     int32_t length = data->CEpos - oldCEs;
    281     uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
    282     if(newCEs == NULL) {
    283         return FALSE;
    284     }
    285     uprv_memcpy(newCEs, oldCEs, length * 4);
    286     uprv_free(data->extendCEs);
    287     data->extendCEs = newCEs;
    288     data->extendCEsSize = newCapacity;
    289     data->CEpos = newCEs + length;
    290     return TRUE;
    291 }
    292 
    293 static UBool
    294 increaseCEsCapacity(collIterate *data) {
    295     int32_t oldCapacity;
    296     if(data->extendCEs != NULL) {
    297         oldCapacity = data->extendCEsSize;
    298     } else {
    299         oldCapacity = LENGTHOF(data->CEs);
    300     }
    301     return reallocCEs(data, 2 * oldCapacity);
    302 }
    303 
    304 static UBool
    305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
    306     int32_t oldCapacity;
    307     if(data->extendCEs != NULL) {
    308         oldCapacity = data->extendCEsSize;
    309     } else {
    310         oldCapacity = LENGTHOF(data->CEs);
    311     }
    312     if(minCapacity <= oldCapacity) {
    313         return TRUE;
    314     }
    315     oldCapacity *= 2;
    316     return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
    317 }
    318 
    319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
    320     if(U_FAILURE(errorCode)) {
    321         return;
    322     }
    323     int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
    324     U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);
    325     if(length >= offsetBufferSize) {
    326         int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
    327         int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4));
    328         if(newBuffer == NULL) {
    329             errorCode = U_MEMORY_ALLOCATION_ERROR;
    330             return;
    331         }
    332         if(length > 0) {
    333             uprv_memcpy(newBuffer, offsetBuffer, length * 4);
    334         }
    335         uprv_free(offsetBuffer);
    336         offsetBuffer = newBuffer;
    337         offsetStore = offsetBuffer + length;
    338         offsetBufferSize = newCapacity;
    339     }
    340     *offsetStore++ = offset;
    341 }
    342 
    343 /*
    344 * collIter_eos()
    345 *     Checks for a collIterate being positioned at the end of
    346 *     its source string.
    347 *
    348 */
    349 static
    350 inline UBool collIter_eos(collIterate *s) {
    351     if(s->flags & UCOL_USE_ITERATOR) {
    352       return !(s->iterator->hasNext(s->iterator));
    353     }
    354     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
    355         // Null terminated string, but not at null, so not at end.
    356         //   Whether in main or normalization buffer doesn't matter.
    357         return FALSE;
    358     }
    359 
    360     // String with length.  Can't be in normalization buffer, which is always
    361     //  null termintated.
    362     if (s->flags & UCOL_ITER_HASLEN) {
    363         return (s->pos == s->endp);
    364     }
    365 
    366     // We are at a null termination, could be either normalization buffer or main string.
    367     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
    368         // At null at end of main string.
    369         return TRUE;
    370     }
    371 
    372     // At null at end of normalization buffer.  Need to check whether there there are
    373     //   any characters left in the main buffer.
    374     if(s->origFlags & UCOL_USE_ITERATOR) {
    375       return !(s->iterator->hasNext(s->iterator));
    376     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
    377         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
    378         return (*s->fcdPosition == 0);
    379     }
    380     else {
    381         // Main string with an end pointer.
    382         return s->fcdPosition == s->endp;
    383     }
    384 }
    385 
    386 /*
    387 * collIter_bos()
    388 *     Checks for a collIterate being positioned at the start of
    389 *     its source string.
    390 *
    391 */
    392 static
    393 inline UBool collIter_bos(collIterate *source) {
    394   // if we're going backwards, we need to know whether there is more in the
    395   // iterator, even if we are in the side buffer
    396   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    397     return !source->iterator->hasPrevious(source->iterator);
    398   }
    399   if (source->pos <= source->string ||
    400       ((source->flags & UCOL_ITER_INNORMBUF) &&
    401       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
    402     return TRUE;
    403   }
    404   return FALSE;
    405 }
    406 
    407 /*static
    408 inline UBool collIter_SimpleBos(collIterate *source) {
    409   // if we're going backwards, we need to know whether there is more in the
    410   // iterator, even if we are in the side buffer
    411   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
    412     return !source->iterator->hasPrevious(source->iterator);
    413   }
    414   if (source->pos == source->string) {
    415     return TRUE;
    416   }
    417   return FALSE;
    418 }*/
    419     //return (data->pos == data->string) ||
    420 
    421 
    422 /****************************************************************************/
    423 /* Following are the open/close functions                                   */
    424 /*                                                                          */
    425 /****************************************************************************/
    426 
    427 static UCollator*
    428 ucol_initFromBinary(const uint8_t *bin, int32_t length,
    429                 const UCollator *base,
    430                 UCollator *fillIn,
    431                 UErrorCode *status)
    432 {
    433     UCollator *result = fillIn;
    434     if(U_FAILURE(*status)) {
    435         return NULL;
    436     }
    437     /*
    438     if(base == NULL) {
    439         // we don't support null base yet
    440         *status = U_ILLEGAL_ARGUMENT_ERROR;
    441         return NULL;
    442     }
    443     */
    444     // We need these and we could be running without UCA
    445     uprv_uca_initImplicitConstants(status);
    446     UCATableHeader *colData = (UCATableHeader *)bin;
    447     // do we want version check here? We're trying to figure out whether collators are compatible
    448     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
    449         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
    450         colData->version[0] != UCOL_BUILDER_VERSION)
    451     {
    452         *status = U_COLLATOR_VERSION_MISMATCH;
    453         return NULL;
    454     }
    455     else {
    456         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
    457             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
    458             if(U_FAILURE(*status)){
    459                 return NULL;
    460             }
    461             result->hasRealData = TRUE;
    462         }
    463         else {
    464             if(base) {
    465                 result = ucol_initCollator(base->image, result, base, status);
    466                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
    467                 if(U_FAILURE(*status)){
    468                     return NULL;
    469                 }
    470                 result->hasRealData = FALSE;
    471             }
    472             else {
    473                 *status = U_USELESS_COLLATOR_ERROR;
    474                 return NULL;
    475             }
    476         }
    477         result->freeImageOnClose = FALSE;
    478     }
    479     result->actualLocale = NULL;
    480     result->validLocale = NULL;
    481     result->requestedLocale = NULL;
    482     result->rules = NULL;
    483     result->rulesLength = 0;
    484     result->freeRulesOnClose = FALSE;
    485     result->ucaRules = NULL;
    486     return result;
    487 }
    488 
/**
 * Opens a collator from a binary image by forwarding to
 * ucol_initFromBinary() with no fill-in struct (NULL), so the collator
 * struct is obtained by the callee.
 * @param bin    binary collation image
 * @param length size of the image in bytes
 * @param base   base collator that the image is checked against and, for an
 *               options-only image, built on
 * @param status error code, as set by ucol_initFromBinary()
 * @return the new collator, or NULL on failure
 */
U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t *bin, int32_t length,
                const UCollator *base,
                UErrorCode *status)
{
    return ucol_initFromBinary(bin, length, base, NULL, status);
}
    496 
    497 U_CAPI int32_t U_EXPORT2
    498 ucol_cloneBinary(const UCollator *coll,
    499                  uint8_t *buffer, int32_t capacity,
    500                  UErrorCode *status)
    501 {
    502     int32_t length = 0;
    503     if(U_FAILURE(*status)) {
    504         return length;
    505     }
    506     if(capacity < 0) {
    507         *status = U_ILLEGAL_ARGUMENT_ERROR;
    508         return length;
    509     }
    510     if(coll->hasRealData == TRUE) {
    511         length = coll->image->size;
    512         if(length <= capacity) {
    513             uprv_memcpy(buffer, coll->image, length);
    514         } else {
    515             *status = U_BUFFER_OVERFLOW_ERROR;
    516         }
    517     } else {
    518         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
    519         if(length <= capacity) {
    520             /* build the UCATableHeader with minimal entries */
    521             /* do not copy the header from the UCA file because its values are wrong! */
    522             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
    523 
    524             /* reset everything */
    525             uprv_memset(buffer, 0, length);
    526 
    527             /* set the tailoring-specific values */
    528             UCATableHeader *myData = (UCATableHeader *)buffer;
    529             myData->size = length;
    530 
    531             /* offset for the options, the only part of the data that is present after the header */
    532             myData->options = sizeof(UCATableHeader);
    533 
    534             /* need to always set the expansion value for an upper bound of the options */
    535             myData->expansion = myData->options + sizeof(UColOptionSet);
    536 
    537             myData->magic = UCOL_HEADER_MAGIC;
    538             myData->isBigEndian = U_IS_BIG_ENDIAN;
    539             myData->charSetFamily = U_CHARSET_FAMILY;
    540 
    541             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
    542             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
    543 
    544             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
    545             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
    546             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
    547             myData->jamoSpecial = coll->image->jamoSpecial;
    548 
    549             /* copy the collator options */
    550             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
    551         } else {
    552             *status = U_BUFFER_OVERFLOW_ERROR;
    553         }
    554     }
    555     return length;
    556 }
    557 
    558 U_CAPI UCollator* U_EXPORT2
    559 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status)
    560 {
    561     UCollator * localCollator;
    562     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
    563     int32_t imageSize = 0;
    564     int32_t rulesSize = 0;
    565     int32_t rulesPadding = 0;
    566     int32_t defaultReorderCodesSize = 0;
    567     int32_t reorderCodesSize = 0;
    568     uint8_t *image;
    569     UChar *rules;
    570     int32_t* defaultReorderCodes;
    571     int32_t* reorderCodes;
    572     uint8_t* leadBytePermutationTable;
    573     UBool imageAllocated = FALSE;
    574 
    575     if (status == NULL || U_FAILURE(*status)){
    576         return NULL;
    577     }
    578     if (coll == NULL) {
    579        *status = U_ILLEGAL_ARGUMENT_ERROR;
    580         return NULL;
    581     }
    582 
    583     if (coll->rules && coll->freeRulesOnClose) {
    584         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
    585         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
    586         bufferSizeNeeded += rulesSize + rulesPadding;
    587     }
    588     // no padding for alignment needed from here since the next two are 4 byte quantities
    589     if (coll->defaultReorderCodes) {
    590         defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
    591         bufferSizeNeeded += defaultReorderCodesSize;
    592     }
    593     if (coll->reorderCodes) {
    594         reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
    595         bufferSizeNeeded += reorderCodesSize;
    596     }
    597     if (coll->leadBytePermutationTable) {
    598         bufferSizeNeeded += 256 * sizeof(uint8_t);
    599     }
    600 
    601     if (pBufferSize != NULL) {
    602         int32_t inputSize = *pBufferSize;
    603         *pBufferSize = 1;
    604         if (inputSize == 0) {
    605             return NULL;  // preflighting for deprecated functionality
    606         }
    607     }
    608 
    609     char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
    610     // Null pointer check.
    611     if (stackBufferChars == NULL) {
    612         *status = U_MEMORY_ALLOCATION_ERROR;
    613         return NULL;
    614     }
    615     *status = U_SAFECLONE_ALLOCATED_WARNING;
    616 
    617     localCollator = (UCollator *)stackBufferChars;
    618     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
    619     defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
    620     reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
    621     leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
    622 
    623     {
    624         UErrorCode tempStatus = U_ZERO_ERROR;
    625         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
    626     }
    627     if (coll->freeImageOnClose) {
    628         image = (uint8_t *)uprv_malloc(imageSize);
    629         // Null pointer check
    630         if (image == NULL) {
    631             *status = U_MEMORY_ALLOCATION_ERROR;
    632             return NULL;
    633         }
    634         ucol_cloneBinary(coll, image, imageSize, status);
    635         imageAllocated = TRUE;
    636     }
    637     else {
    638         image = (uint8_t *)coll->image;
    639     }
    640     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
    641     if (U_FAILURE(*status)) {
    642         return NULL;
    643     }
    644 
    645     if (coll->rules) {
    646         if (coll->freeRulesOnClose) {
    647             localCollator->rules = u_strcpy(rules, coll->rules);
    648             //bufferEnd += rulesSize;
    649         }
    650         else {
    651             localCollator->rules = coll->rules;
    652         }
    653         localCollator->freeRulesOnClose = FALSE;
    654         localCollator->rulesLength = coll->rulesLength;
    655     }
    656 
    657     // collator reordering
    658     if (coll->defaultReorderCodes) {
    659         localCollator->defaultReorderCodes =
    660             (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
    661         localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
    662         localCollator->freeDefaultReorderCodesOnClose = FALSE;
    663     }
    664     if (coll->reorderCodes) {
    665         localCollator->reorderCodes =
    666             (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
    667         localCollator->reorderCodesLength = coll->reorderCodesLength;
    668         localCollator->freeReorderCodesOnClose = FALSE;
    669     }
    670     if (coll->leadBytePermutationTable) {
    671         localCollator->leadBytePermutationTable =
    672             (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
    673         localCollator->freeLeadBytePermutationTableOnClose = FALSE;
    674     }
    675 
    676     int32_t i;
    677     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
    678         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
    679     }
    680     // zero copies of pointers
    681     localCollator->actualLocale = NULL;
    682     localCollator->validLocale = NULL;
    683     localCollator->requestedLocale = NULL;
    684     localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
    685     localCollator->freeOnClose = TRUE;
    686     localCollator->freeImageOnClose = imageAllocated;
    687     return localCollator;
    688 }
    689 
    690 U_CAPI void U_EXPORT2
    691 ucol_close(UCollator *coll)
    692 {
    693     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
    694     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
    695     if(coll != NULL) {
    696         // these are always owned by each UCollator struct,
    697         // so we always free them
    698         if(coll->validLocale != NULL) {
    699             uprv_free(coll->validLocale);
    700         }
    701         if(coll->actualLocale != NULL) {
    702             uprv_free(coll->actualLocale);
    703         }
    704         if(coll->requestedLocale != NULL) {
    705             uprv_free(coll->requestedLocale);
    706         }
    707         if(coll->latinOneCEs != NULL) {
    708             uprv_free(coll->latinOneCEs);
    709         }
    710         if(coll->options != NULL && coll->freeOptionsOnClose) {
    711             uprv_free(coll->options);
    712         }
    713         if(coll->rules != NULL && coll->freeRulesOnClose) {
    714             uprv_free((UChar *)coll->rules);
    715         }
    716         if(coll->image != NULL && coll->freeImageOnClose) {
    717             uprv_free((UCATableHeader *)coll->image);
    718         }
    719 
    720         if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
    721             uprv_free(coll->leadBytePermutationTable);
    722         }
    723         if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
    724             uprv_free(coll->defaultReorderCodes);
    725         }
    726         if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
    727             uprv_free(coll->reorderCodes);
    728         }
    729 
    730         if(coll->delegate != NULL) {
    731           delete (Collator*)coll->delegate;
    732         }
    733 
    734         /* Here, it would be advisable to close: */
    735         /* - UData for UCA (unless we stuff it in the root resb */
    736         /* Again, do we need additional housekeeping... HMMM! */
    737         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
    738         if(coll->freeOnClose){
    739             /* for safeClone, if freeOnClose is FALSE,
    740             don't free the other instance data */
    741             uprv_free(coll);
    742         }
    743     }
    744     UTRACE_EXIT();
    745 }
    746 
    747 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
    748     if(U_FAILURE(*status)) {
    749         return;
    750     }
    751     result->caseFirst = (UColAttributeValue)opts->caseFirst;
    752     result->caseLevel = (UColAttributeValue)opts->caseLevel;
    753     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
    754     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
    755     if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
    756         return;
    757     }
    758     result->strength = (UColAttributeValue)opts->strength;
    759     result->variableTopValue = opts->variableTopValue;
    760     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
    761     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
    762     result->numericCollation = (UColAttributeValue)opts->numericCollation;
    763     result->caseFirstisDefault = TRUE;
    764     result->caseLevelisDefault = TRUE;
    765     result->frenchCollationisDefault = TRUE;
    766     result->normalizationModeisDefault = TRUE;
    767     result->strengthisDefault = TRUE;
    768     result->variableTopValueisDefault = TRUE;
    769     result->alternateHandlingisDefault = TRUE;
    770     result->hiraganaQisDefault = TRUE;
    771     result->numericCollationisDefault = TRUE;
    772 
    773     ucol_updateInternalState(result, status);
    774 
    775     result->options = opts;
    776 }
    777 
    778 
    779 /**
    780 * Approximate determination if a character is at a contraction end.
    781 * Guaranteed to be TRUE if a character is at the end of a contraction,
    782 * otherwise it is not deterministic.
    783 * @param c character to be determined
    784 * @param coll collator
    785 */
    786 static
    787 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
    788     if (c < coll->minContrEndCP) {
    789         return FALSE;
    790     }
    791 
    792     int32_t  hash = c;
    793     uint8_t  htbyte;
    794     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
    795         if (U16_IS_TRAIL(c)) {
    796             return TRUE;
    797         }
    798         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
    799     }
    800     htbyte = coll->contrEndCP[hash>>3];
    801     return (((htbyte >> (hash & 7)) & 1) == 1);
    802 }
    803 
    804 
    805 
    806 /*
    807 *   i_getCombiningClass()
    808 *        A fast, at least partly inline version of u_getCombiningClass()
    809 *        This is a candidate for further optimization.  Used heavily
    810 *        in contraction processing.
    811 */
    812 static
    813 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
    814     uint8_t sCC = 0;
    815     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
    816         sCC = u_getCombiningClass(c);
    817     }
    818     return sCC;
    819 }
    820 
    821 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    822     UChar c;
    823     UCollator *result = fillIn;
    824     if(U_FAILURE(*status) || image == NULL) {
    825         return NULL;
    826     }
    827 
    828     if(result == NULL) {
    829         result = (UCollator *)uprv_malloc(sizeof(UCollator));
    830         if(result == NULL) {
    831             *status = U_MEMORY_ALLOCATION_ERROR;
    832             return result;
    833         }
    834         result->freeOnClose = TRUE;
    835     } else {
    836         result->freeOnClose = FALSE;
    837     }
    838 
    839     result->delegate = NULL;
    840 
    841     result->image = image;
    842     result->mapping.getFoldingOffset = _getFoldingOffset;
    843     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
    844     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
    845     if(U_FAILURE(*status)) {
    846         if(result->freeOnClose == TRUE) {
    847             uprv_free(result);
    848             result = NULL;
    849         }
    850         return result;
    851     }
    852 
    853     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    854     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
    855     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
    856     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
    857     result->rules = NULL;
    858     result->rulesLength = 0;
    859     result->freeRulesOnClose = FALSE;
    860     result->defaultReorderCodes = NULL;
    861     result->defaultReorderCodesLength = 0;
    862     result->freeDefaultReorderCodesOnClose = FALSE;
    863     result->reorderCodes = NULL;
    864     result->reorderCodesLength = 0;
    865     result->freeReorderCodesOnClose = FALSE;
    866     result->leadBytePermutationTable = NULL;
    867     result->freeLeadBytePermutationTableOnClose = FALSE;
    868 
    869     /* get the version info from UCATableHeader and populate the Collator struct*/
    870     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
    871     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
    872     result->dataVersion[2] = 0;
    873     result->dataVersion[3] = 0;
    874 
    875     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
    876     result->minUnsafeCP = 0;
    877     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
    878         if (ucol_unsafeCP(c, result)) break;
    879     }
    880     result->minUnsafeCP = c;
    881 
    882     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
    883     result->minContrEndCP = 0;
    884     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
    885         if (ucol_contractionEndCP(c, result)) break;
    886     }
    887     result->minContrEndCP = c;
    888 
    889     /* max expansion tables */
    890     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
    891                                          result->image->endExpansionCE);
    892     result->lastEndExpansionCE = result->endExpansionCE +
    893                                  result->image->endExpansionCECount - 1;
    894     result->expansionCESize = (uint8_t*)result->image +
    895                                                result->image->expansionCESize;
    896 
    897 
    898     //result->errorCode = *status;
    899 
    900     result->latinOneCEs = NULL;
    901 
    902     result->latinOneRegenTable = FALSE;
    903     result->latinOneFailed = FALSE;
    904     result->UCA = UCA;
    905 
    906     /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    907     result->ucaRules = NULL;
    908     result->actualLocale = NULL;
    909     result->validLocale = NULL;
    910     result->requestedLocale = NULL;
    911     result->hasRealData = FALSE; // real data lives in .dat file...
    912     result->freeImageOnClose = FALSE;
    913 
    914     /* set attributes */
    915     ucol_setOptionsFromHeader(
    916         result,
    917         (UColOptionSet*)((uint8_t*)result->image+result->image->options),
    918         status);
    919     result->freeOptionsOnClose = FALSE;
    920 
    921     return result;
    922 }
    923 
    924 /* new Mark's code */
    925 
    926 /**
    927  * For generation of Implicit CEs
    928  * @author Davis
    929  *
    930  * Cleaned up so that changes can be made more easily.
    931  * Old values:
    932 # First Implicit: E26A792D
    933 # Last Implicit: E3DC70C0
    934 # First CJK: E0030300
    935 # Last CJK: E0A9DD00
    936 # First CJK_A: E0A9DF00
    937 # Last CJK_A: E0DE3100
    938  */
    939 /* Following is a port of Mark's code for new treatment of implicits.
    940  * It is positioned here, since ucol_initUCA needs to initialize the
    941  * variables below according to the data in the fractional UCA.
    942  */
    943 
    944 /**
    945  * Function used to:
    946  * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
    947  * b) bump any non-CJK characters by 110000 (NON_CJK_OFFSET).
    948  * The relevant blocks are:
    949  * A:    4E00..9FFF; CJK Unified Ideographs
    950  *       F900..FAFF; CJK Compatibility Ideographs
    951  * B:    3400..4DBF; CJK Unified Ideographs Extension A
    952  *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
    953  * As long as
    954  *   no new B characters are allocated between 4E00 and FAFF, and
    955  *   no new A characters are outside of this range,
    956  * (very high probability) this simple code will work.
    957  * The reordered blocks are:
    958  * Block1 is CJK
    959  * Block2 is CJK_COMPAT_USED
    960  * Block3 is CJK_A
    961  * (all contiguous)
    962  * Any other CJK gets its normal code point
    963  * Any non-CJK gets +110000 (NON_CJK_OFFSET)
    964  * When we reorder Block1, we make sure that it is at the very start,
    965  * so that it will use a 3-byte form.
    966  * Warning: we only pick up the compatibility characters that are
    967  * NOT decomposed, so that block is smaller!
    968  */
    969 
    970 // CONSTANTS
    971 static const UChar32
    972     NON_CJK_OFFSET = 0x110000,
    973     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
    974 
    975 /**
    976  * Precomputed by initImplicitConstants()
    977  */
    978 static int32_t
    979     final3Multiplier = 0,
    980     final4Multiplier = 0,
    981     final3Count = 0,
    982     final4Count = 0,
    983     medialCount = 0,
    984     min3Primary = 0,
    985     min4Primary = 0,
    986     max4Primary = 0,
    987     minTrail = 0,
    988     maxTrail = 0,
    989     max3Trail = 0,
    990     max4Trail = 0,
    991     min4Boundary = 0;
    992 
    993 static const UChar32
    994     // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    995     // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;  (Unicode 6.1)
    996     CJK_BASE = 0x4E00,
    997     CJK_LIMIT = 0x9FCC+1,
    998     // Unified CJK ideographs in the compatibility ideographs block.
    999     CJK_COMPAT_USED_BASE = 0xFA0E,
   1000     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
   1001     // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
   1002     // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
   1003     CJK_A_BASE = 0x3400,
   1004     CJK_A_LIMIT = 0x4DB5+1,
   1005     // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
   1006     // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
   1007     CJK_B_BASE = 0x20000,
   1008     CJK_B_LIMIT = 0x2A6D6+1,
   1009     // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
   1010     // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
   1011     CJK_C_BASE = 0x2A700,
   1012     CJK_C_LIMIT = 0x2B734+1,
   1013     // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
   1014     // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
   1015     CJK_D_BASE = 0x2B740,
   1016     CJK_D_LIMIT = 0x2B81D+1;
   1017     // when adding to this list, look for all occurrences (in project)
   1018     // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
   1019 
   1020 static UChar32 swapCJK(UChar32 i) {
   1021     if (i < CJK_A_BASE) {
   1022         // non-CJK
   1023     } else if (i < CJK_A_LIMIT) {
   1024         // Extension A has lower code points than the original Unihan+compat
   1025         // but sorts higher.
   1026         return i - CJK_A_BASE
   1027                 + (CJK_LIMIT - CJK_BASE)
   1028                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1029     } else if (i < CJK_BASE) {
   1030         // non-CJK
   1031     } else if (i < CJK_LIMIT) {
   1032         return i - CJK_BASE;
   1033     } else if (i < CJK_COMPAT_USED_BASE) {
   1034         // non-CJK
   1035     } else if (i < CJK_COMPAT_USED_LIMIT) {
   1036         return i - CJK_COMPAT_USED_BASE
   1037                 + (CJK_LIMIT - CJK_BASE);
   1038     } else if (i < CJK_B_BASE) {
   1039         // non-CJK
   1040     } else if (i < CJK_B_LIMIT) {
   1041         return i; // non-BMP-CJK
   1042     } else if (i < CJK_C_BASE) {
   1043         // non-CJK
   1044     } else if (i < CJK_C_LIMIT) {
   1045         return i; // non-BMP-CJK
   1046     } else if (i < CJK_D_BASE) {
   1047         // non-CJK
   1048     } else if (i < CJK_D_LIMIT) {
   1049         return i; // non-BMP-CJK
   1050     }
   1051     return i + NON_CJK_OFFSET; // non-CJK
   1052 }
   1053 
   1054 U_CAPI UChar32 U_EXPORT2
   1055 uprv_uca_getRawFromCodePoint(UChar32 i) {
   1056     return swapCJK(i)+1;
   1057 }
   1058 
   1059 U_CAPI UChar32 U_EXPORT2
   1060 uprv_uca_getCodePointFromRaw(UChar32 i) {
   1061     i--;
   1062     UChar32 result = 0;
   1063     if(i >= NON_CJK_OFFSET) {
   1064         result = i - NON_CJK_OFFSET;
   1065     } else if(i >= CJK_B_BASE) {
   1066         result = i;
   1067     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
   1068         if(i < CJK_LIMIT - CJK_BASE) {
   1069             result = i + CJK_BASE;
   1070         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
   1071             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
   1072         } else {
   1073             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
   1074         }
   1075     } else {
   1076         result = -1;
   1077     }
   1078     return result;
   1079 }
   1080 
   1081 // GET IMPLICIT PRIMARY WEIGHTS
   1082 // Return value is left justified primary key
   1083 U_CAPI uint32_t U_EXPORT2
   1084 uprv_uca_getImplicitFromRaw(UChar32 cp) {
   1085     /*
   1086     if (cp < 0 || cp > UCOL_MAX_INPUT) {
   1087         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
   1088     }
   1089     */
   1090     int32_t last0 = cp - min4Boundary;
   1091     if (last0 < 0) {
   1092         int32_t last1 = cp / final3Count;
   1093         last0 = cp % final3Count;
   1094 
   1095         int32_t last2 = last1 / medialCount;
   1096         last1 %= medialCount;
   1097 
   1098         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
   1099         last1 = minTrail + last1; // offset
   1100         last2 = min3Primary + last2; // offset
   1101         /*
   1102         if (last2 >= min4Primary) {
   1103             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
   1104         }
   1105         */
   1106         return (last2 << 24) + (last1 << 16) + (last0 << 8);
   1107     } else {
   1108         int32_t last1 = last0 / final4Count;
   1109         last0 %= final4Count;
   1110 
   1111         int32_t last2 = last1 / medialCount;
   1112         last1 %= medialCount;
   1113 
   1114         int32_t last3 = last2 / medialCount;
   1115         last2 %= medialCount;
   1116 
   1117         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
   1118         last1 = minTrail + last1; // offset
   1119         last2 = minTrail + last2; // offset
   1120         last3 = min4Primary + last3; // offset
   1121         /*
   1122         if (last3 > max4Primary) {
   1123             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
   1124         }
   1125         */
   1126         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
   1127     }
   1128 }
   1129 
   1130 static uint32_t U_EXPORT2
   1131 uprv_uca_getImplicitPrimary(UChar32 cp) {
   1132    //fprintf(stdout, "Incoming: %04x\n", cp);
   1133     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
   1134 
   1135     cp = swapCJK(cp);
   1136     cp++;
   1137     // we now have a range of numbers from 0 to 21FFFF.
   1138 
   1139     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
   1140     //fprintf(stdout, "CJK swapped: %04x\n", cp);
   1141 
   1142     return uprv_uca_getImplicitFromRaw(cp);
   1143 }
   1144 
   1145 /**
   1146  * Converts implicit CE into raw integer ("code point")
   1147  * @param implicit
   1148  * @return -1 if illegal format
   1149  */
   1150 U_CAPI UChar32 U_EXPORT2
   1151 uprv_uca_getRawFromImplicit(uint32_t implicit) {
   1152     UChar32 result;
   1153     UChar32 b3 = implicit & 0xFF;
   1154     UChar32 b2 = (implicit >> 8) & 0xFF;
   1155     UChar32 b1 = (implicit >> 16) & 0xFF;
   1156     UChar32 b0 = (implicit >> 24) & 0xFF;
   1157 
   1158     // simple parameter checks
   1159     if (b0 < min3Primary || b0 > max4Primary
   1160         || b1 < minTrail || b1 > maxTrail)
   1161         return -1;
   1162     // normal offsets
   1163     b1 -= minTrail;
   1164 
   1165     // take care of the final values, and compose
   1166     if (b0 < min4Primary) {
   1167         if (b2 < minTrail || b2 > max3Trail || b3 != 0)
   1168             return -1;
   1169         b2 -= minTrail;
   1170         UChar32 remainder = b2 % final3Multiplier;
   1171         if (remainder != 0)
   1172             return -1;
   1173         b0 -= min3Primary;
   1174         b2 /= final3Multiplier;
   1175         result = ((b0 * medialCount) + b1) * final3Count + b2;
   1176     } else {
   1177         if (b2 < minTrail || b2 > maxTrail
   1178             || b3 < minTrail || b3 > max4Trail)
   1179             return -1;
   1180         b2 -= minTrail;
   1181         b3 -= minTrail;
   1182         UChar32 remainder = b3 % final4Multiplier;
   1183         if (remainder != 0)
   1184             return -1;
   1185         b3 /= final4Multiplier;
   1186         b0 -= min4Primary;
   1187         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
   1188     }
   1189     // final check
   1190     if (result < 0 || result > UCOL_MAX_INPUT)
   1191         return -1;
   1192     return result;
   1193 }
   1194 
   1195 
   1196 static inline int32_t divideAndRoundUp(int a, int b) {
   1197     return 1 + (a-1)/b;
   1198 }
   1199 
   1200 /* this function is either called from initUCA or from genUCA before
   1201  * doing canonical closure for the UCA.
   1202  */
   1203 
   1204 /**
   1205  * Set up to generate implicits.
   1206  * Maintenance Note:  this function may end up being called more than once, due
   1207  *                    to threading races during initialization.  Make sure that
   1208  *                    none of the Constants is ever transiently assigned an
   1209  *                    incorrect value.
   1210  * @param minPrimary
   1211  * @param maxPrimary
   1212  * @param minTrail final byte
   1213  * @param maxTrail final byte
   1214  * @param gap3 the gap we leave for tailoring for 3-byte forms
   1215  * @param gap4 the gap we leave for tailoring for 4-byte forms
   1216  */
   1217 static void initImplicitConstants(int minPrimary, int maxPrimary,
   1218                                     int minTrailIn, int maxTrailIn,
   1219                                     int gap3, int primaries3count,
   1220                                     UErrorCode *status) {
   1221     // some simple parameter checks
   1222     if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
   1223         || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
   1224         || (primaries3count < 1))
   1225     {
   1226         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1227         return;
   1228     };
   1229 
   1230     minTrail = minTrailIn;
   1231     maxTrail = maxTrailIn;
   1232 
   1233     min3Primary = minPrimary;
   1234     max4Primary = maxPrimary;
   1235     // compute constants for use later.
   1236     // number of values we can use in trailing bytes
   1237     // leave room for empty values between AND above, e.g. if gap = 2
   1238     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
   1239     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
   1240     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
   1241     final3Multiplier = gap3 + 1;
   1242     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
   1243     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
   1244 
   1245     // medials can use full range
   1246     medialCount = (maxTrail - minTrail + 1);
   1247     // find out how many values fit in each form
   1248     int32_t threeByteCount = medialCount * final3Count;
   1249     // now determine where the 3/4 boundary is.
   1250     // we use 3 bytes below the boundary, and 4 above
   1251     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
   1252     int32_t primaries4count = primariesAvailable - primaries3count;
   1253 
   1254 
   1255     int32_t min3ByteCoverage = primaries3count * threeByteCount;
   1256     min4Primary = minPrimary + primaries3count;
   1257     min4Boundary = min3ByteCoverage;
   1258     // Now expand out the multiplier for the 4 bytes, and redo.
   1259 
   1260     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
   1261     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
   1262     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
   1263     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
   1264     if (gap4 < 1) {
   1265         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1266         return;
   1267     }
   1268     final4Multiplier = gap4 + 1;
   1269     final4Count = neededPerFinalByte;
   1270     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
   1271 }
   1272 
   1273     /**
   1274      * Supply parameters for generating implicit CEs
   1275      */
   1276 U_CAPI void U_EXPORT2
   1277 uprv_uca_initImplicitConstants(UErrorCode *status) {
   1278     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
   1279     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
   1280     initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
   1281 }
   1282 
   1283 
   1284 /*    collIterNormalize     Incremental Normalization happens here.                       */
   1285 /*                          pick up the range of chars identifed by FCD,                  */
   1286 /*                          normalize it into the collIterate's writable buffer,          */
   1287 /*                          switch the collIterate's state to use the writable buffer.    */
   1288 /*                                                                                        */
   1289 static
   1290 void collIterNormalize(collIterate *collationSource)
   1291 {
   1292     UErrorCode  status = U_ZERO_ERROR;
   1293     const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
   1294     const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
   1295 
   1296     collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
   1297                                     collationSource->writableBuffer,
   1298                                     status);
   1299     if (U_FAILURE(status)) {
   1300 #ifdef UCOL_DEBUG
   1301         fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
   1302 #endif
   1303         return;
   1304     }
   1305 
   1306     collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
   1307     collationSource->origFlags  = collationSource->flags;
   1308     collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1309     collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1310 }
   1311 
   1312 
   1313 // This function takes the iterator and extracts normalized stuff up to the next boundary
   1314 // It is similar in the end results to the collIterNormalize, but for the cases when we
   1315 // use an iterator
   1316 /*static
   1317 inline void normalizeIterator(collIterate *collationSource) {
   1318   UErrorCode status = U_ZERO_ERROR;
   1319   UBool wasNormalized = FALSE;
   1320   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
   1321   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
   1322   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1323     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1324   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
   1325     // reallocate and terminate
   1326     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
   1327                                &collationSource->writableBuffer,
   1328                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
   1329                                0)
   1330     ) {
   1331     #ifdef UCOL_DEBUG
   1332         fprintf(stderr, "normalizeIterator(), out of memory\n");
   1333     #endif
   1334         return;
   1335     }
   1336     status = U_ZERO_ERROR;
   1337     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
   1338     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
   1339     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
   1340     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
   1341   }
   1342   // Terminate the buffer - we already checked that it is big enough
   1343   collationSource->writableBuffer[normLen] = 0;
   1344   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
   1345       collationSource->flags |= UCOL_ITER_ALLOCATED;
   1346   }
   1347   collationSource->pos        = collationSource->writableBuffer;
   1348   collationSource->origFlags  = collationSource->flags;
   1349   collationSource->flags     |= UCOL_ITER_INNORMBUF;
   1350   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   1351 }*/
   1352 
   1353 
   1354 /* Incremental FCD check and normalize                                                    */
   1355 /*   Called from getNextCE when normalization state is suspect.                           */
   1356 /*   When entering, the state is known to be this:                                        */
   1357 /*      o   We are working in the main buffer of the collIterate, not the side            */
   1358 /*          writable buffer.  When in the side buffer, normalization mode is always off,  */
   1359 /*          so we won't get here.                                                         */
   1360 /*      o   The leading combining class from the current character is 0 or                */
   1361 /*          the trailing combining class of the previous char was zero.                   */
   1362 /*          True because the previous call to this function will have always exited       */
   1363 /*          that way, and we get called for every char where cc might be non-zero.        */
   1364 static
   1365 inline UBool collIterFCD(collIterate *collationSource) {
   1366     const UChar *srcP, *endP;
   1367     uint8_t     leadingCC;
   1368     uint8_t     prevTrailingCC = 0;
   1369     uint16_t    fcd;
   1370     UBool       needNormalize = FALSE;
   1371 
   1372     srcP = collationSource->pos-1;
   1373 
   1374     if (collationSource->flags & UCOL_ITER_HASLEN) {
   1375         endP = collationSource->endp;
   1376     } else {
   1377         endP = NULL;
   1378     }
   1379 
   1380     // Get the trailing combining class of the current character. If it's zero, we are OK.
   1381     fcd = g_nfcImpl->nextFCD16(srcP, endP);
   1382     if (fcd != 0) {
   1383         prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1384 
   1385         if (prevTrailingCC != 0) {
   1386             // The current char has a non-zero trailing CC.  Scan forward until we find
   1387             //   a char with a leading cc of zero.
   1388             while (endP == NULL || srcP != endP)
   1389             {
   1390                 const UChar *savedSrcP = srcP;
   1391 
   1392                 fcd = g_nfcImpl->nextFCD16(srcP, endP);
   1393                 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1394                 if (leadingCC == 0) {
   1395                     srcP = savedSrcP;      // Hit char that is not part of combining sequence.
   1396                                            //   back up over it.  (Could be surrogate pair!)
   1397                     break;
   1398                 }
   1399 
   1400                 if (leadingCC < prevTrailingCC) {
   1401                     needNormalize = TRUE;
   1402                 }
   1403 
   1404                 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1405             }
   1406         }
   1407     }
   1408 
   1409     collationSource->fcdPosition = (UChar *)srcP;
   1410 
   1411     return needNormalize;
   1412 }
   1413 
   1414 /****************************************************************************/
   1415 /* Following are the CE retrieval functions                                 */
   1416 /*                                                                          */
   1417 /****************************************************************************/
   1418 
// Forward declarations; these two helpers are declared here so that code
// appearing earlier in the file than their definitions can call them.
static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
   1421 
   1422 /* there should be a macro version of this function in the header file */
   1423 /* This is the first function that tries to fetch a collation element  */
   1424 /* If it's not successful or it encounters a more difficult situation */
   1425 /* some more sophisticated and slower functions are invoked            */
   1426 static
   1427 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1428     uint32_t order = 0;
   1429     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
   1430         order = *(collationSource->toReturn++);                         /* if so, return them */
   1431         if(collationSource->CEpos == collationSource->toReturn) {
   1432             collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
   1433         }
   1434         return order;
   1435     }
   1436 
   1437     UChar ch = 0;
   1438     collationSource->offsetReturn = NULL;
   1439 
   1440     do {
   1441         for (;;)                           /* Loop handles case when incremental normalize switches   */
   1442         {                                  /*   to or from the side buffer / original string, and we  */
   1443             /*   need to start again to get the next character.        */
   1444 
   1445             if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
   1446             {
   1447                 // The source string is null terminated and we're not working from the side buffer,
   1448                 //   and we're not normalizing.  This is the fast path.
   1449                 //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
   1450                 ch = *collationSource->pos++;
   1451                 if (ch != 0) {
   1452                     break;
   1453                 }
   1454                 else {
   1455                     return UCOL_NO_MORE_CES;
   1456                 }
   1457             }
   1458 
   1459             if (collationSource->flags & UCOL_ITER_HASLEN) {
   1460                 // Normal path for strings when length is specified.
   1461                 //   (We can't be in side buffer because it is always null terminated.)
   1462                 if (collationSource->pos >= collationSource->endp) {
   1463                     // Ran off of the end of the main source string.  We're done.
   1464                     return UCOL_NO_MORE_CES;
   1465                 }
   1466                 ch = *collationSource->pos++;
   1467             }
   1468             else if(collationSource->flags & UCOL_USE_ITERATOR) {
   1469                 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
   1470                 if(iterCh == U_SENTINEL) {
   1471                     return UCOL_NO_MORE_CES;
   1472                 }
   1473                 ch = (UChar)iterCh;
   1474             }
   1475             else
   1476             {
   1477                 // Null terminated string.
   1478                 ch = *collationSource->pos++;
   1479                 if (ch == 0) {
   1480                     // Ran off end of buffer.
   1481                     if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1482                         // Ran off end of main string. backing up one character.
   1483                         collationSource->pos--;
   1484                         return UCOL_NO_MORE_CES;
   1485                     }
   1486                     else
   1487                     {
   1488                         // Hit null in the normalize side buffer.
   1489                         // Usually this means the end of the normalized data,
   1490                         // except for one odd case: a null followed by combining chars,
   1491                         //   which is the case if we are at the start of the buffer.
   1492                         if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
   1493                             break;
   1494                         }
   1495 
   1496                         //  Null marked end of side buffer.
   1497                         //   Revert to the main string and
   1498                         //   loop back to top to try again to get a character.
   1499                         collationSource->pos   = collationSource->fcdPosition;
   1500                         collationSource->flags = collationSource->origFlags;
   1501                         continue;
   1502                     }
   1503                 }
   1504             }
   1505 
   1506             if(collationSource->flags&UCOL_HIRAGANA_Q) {
   1507                 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
   1508                  * based on whether the previous codepoint was Hiragana or Katakana.
   1509                  */
   1510                 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
   1511                         ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
   1512                     collationSource->flags |= UCOL_WAS_HIRAGANA;
   1513                 } else {
   1514                     collationSource->flags &= ~UCOL_WAS_HIRAGANA;
   1515                 }
   1516             }
   1517 
   1518             // We've got a character.  See if there's any fcd and/or normalization stuff to do.
   1519             //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
   1520             if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
   1521                 break;
   1522             }
   1523 
   1524             if (collationSource->fcdPosition >= collationSource->pos) {
   1525                 // An earlier FCD check has already covered the current character.
   1526                 // We can go ahead and process this char.
   1527                 break;
   1528             }
   1529 
   1530             if (ch < ZERO_CC_LIMIT_ ) {
   1531                 // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
   1532                 break;
   1533             }
   1534 
   1535             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1536                 // We need to peek at the next character in order to tell if we are FCD
   1537                 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
   1538                     // We are at the last char of source string.
   1539                     //  It is always OK for FCD check.
   1540                     break;
   1541                 }
   1542 
   1543                 // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
   1544                 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
   1545                     break;
   1546                 }
   1547             }
   1548 
   1549 
   1550             // Need a more complete FCD check and possible normalization.
   1551             if (collIterFCD(collationSource)) {
   1552                 collIterNormalize(collationSource);
   1553             }
   1554             if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
   1555                 //  No normalization was needed.  Go ahead and process the char we already had.
   1556                 break;
   1557             }
   1558 
   1559             // Some normalization happened.  Next loop iteration will pick up a char
   1560             //   from the normalization buffer.
   1561 
   1562         }   // end for (;;)
   1563 
   1564 
   1565         if (ch <= 0xFF) {
   1566             /*  For latin-1 characters we never need to fall back to the UCA table        */
   1567             /*    because all of the UCA data is replicated in the latinOneMapping array  */
   1568             order = coll->latinOneMapping[ch];
   1569             if (order > UCOL_NOT_FOUND) {
   1570                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
   1571             }
   1572         }
   1573         else
   1574         {
   1575             // Always use UCA for Han, Hangul
   1576             // (Han extension A is before main Han block)
   1577             // **** Han compatibility chars ?? ****
   1578             if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   1579                 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
   1580                 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
   1581                     // between the two target ranges; do normal lookup
   1582                     // **** this range is YI, Modifier tone letters, ****
   1583                     // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
   1584                     // **** Latin-D might be tailored, so we need to ****
   1585                     // **** do the normal lookup for these guys.     ****
   1586                     order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1587                 } else {
   1588                     // in one of the target ranges; use UCA
   1589                     order = UCOL_NOT_FOUND;
   1590                 }
   1591             } else {
   1592                 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   1593             }
   1594 
   1595             if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
   1596                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
   1597             }
   1598 
   1599             if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
   1600                 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
   1601                 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   1602 
   1603                 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
   1604                     order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
   1605                 }
   1606             }
   1607         }
   1608     } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
   1609 
   1610     if(order == UCOL_NOT_FOUND) {
   1611         order = getImplicit(ch, collationSource);
   1612     }
   1613     return order; /* return the CE */
   1614 }
   1615 
   1616 /* ucol_getNextCE, out-of-line version for use from other files: forwards its arguments unchanged to the static inline ucol_IGetNextCE and returns its result. */
   1617 U_CAPI uint32_t  U_EXPORT2
   1618 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
   1619     return ucol_IGetNextCE(coll, collationSource, status);
   1620 }
   1621 
   1622 
   1623 /**
   1624 * Incremental previous normalization happens here. Pick up the range of chars
   1625 * identified by FCD, normalize it into the collIterate's writable buffer,
   1626 * switch the collIterate's state to use the writable buffer.
   1627 * @param data collation iterator data
   1628 */
   1629 static
   1630 void collPrevIterNormalize(collIterate *data)
   1631 {
   1632     UErrorCode status  = U_ZERO_ERROR;
   1633     const UChar *pEnd   = data->pos;  /* Last code unit to normalize (the length computed below includes it) */
   1634     const UChar *pStart;
   1635 
   1636     /* Start normalize: right after fcdPosition, or at the start of the string when there is none */
   1637     if (data->fcdPosition == NULL) {
   1638         pStart = data->string;
   1639     }
   1640     else {
   1641         pStart = data->fcdPosition + 1;
   1642     }
   1643 
   1644     int32_t normLen =
   1645         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
   1646                              data->writableBuffer,
   1647                              status).
   1648         length();
   1649     if(U_FAILURE(status)) { /* give up without switching pos/flags over to the side buffer */
   1650         return;
   1651     }
   1652     /*
   1653     this puts the null termination in front of the normalized string instead
   1654     of at the end
   1655     */
   1656     data->writableBuffer.insert(0, (UChar)0);
   1657 
   1658     /*
   1659      * The usual case at this point is that we've got a base
   1660      * character followed by marks that were normalized. If
   1661      * fcdPosition is NULL, that means that we backed up to
   1662      * the beginning of the string and there's no base character.
   1663      *
   1664      * Forward processing will usually normalize when it sees
   1665      * the first mark, so that mark will get its natural offset
   1666      * and the rest will get the offset of the character following
   1667      * the marks. The base character will also get its natural offset.
   1668      *
   1669      * We write the offset of the base character, if there is one,
   1670      * followed by the offset of the first mark and then the offsets
   1671      * of the rest of the marks.
   1672      */
   1673     int32_t firstMarkOffset = 0;
   1674     int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
   1675     int32_t trailCount      = normLen - 1;
   1676 
   1677     if (data->fcdPosition != NULL) {
   1678         int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
   1679         UChar   baseChar   = *data->fcdPosition;
   1680 
   1681         firstMarkOffset = baseOffset + 1;
   1682 
   1683         /*
   1684          * If the base character is the start of a contraction, forward processing
   1685          * will normalize the marks while checking for the contraction, which means
   1686          * that the offset of the first mark will be the same as the other marks.
   1687          *
   1688          * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
   1689          */
   1690         if (baseChar >= 0x100) {
   1691             uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
   1692 
   1693             if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
   1694                 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
   1695             }
   1696 
   1697             if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
   1698                 firstMarkOffset = trailOffset;
   1699             }
   1700         }
   1701 
   1702         data->appendOffset(baseOffset, status);
   1703     }
   1704 
   1705     data->appendOffset(firstMarkOffset, status);
   1706 
   1707     for (int32_t i = 0; i < trailCount; i += 1) {
   1708         data->appendOffset(trailOffset, status);
   1709     }
   1710 
   1711     data->offsetRepeatValue = trailOffset;
   1712 
   1713     data->offsetReturn = data->offsetStore - 1;
   1714     if (data->offsetReturn == data->offsetBuffer) {
   1715         data->offsetStore = data->offsetBuffer;
   1716     }
   1717 
   1718     data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; /* one past the last normalized unit; the +1 skips the null inserted at the front */
   1719     data->origFlags  = data->flags;                              /* saved so the flags can be restored when leaving the side buffer */
   1720     data->flags     |= UCOL_ITER_INNORMBUF;
   1721     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);    /* side buffer: no further normalization, and it is null-delimited rather than length-delimited */
   1722 }
   1723 
   1724 
   1725 /**
   1726 * Incremental FCD check for previous iteration and normalize. Called from
   1727 * getPrevCE when normalization state is suspect.
   1728 * When entering, the state is known to be this:
   1729 * o  We are working in the main buffer of the collIterate, not the side
   1730 *    writable buffer. When in the side buffer, normalization mode is always
   1731 *    off, so we won't get here.
   1732 * o  The leading combining class from the current character is 0 or the
   1733 *    trailing combining class of the previous char was zero.
   1734 *    True because the previous call to this function will have always exited
   1735 *    that way, and we get called for every char where cc might be non-zero.
   1736 * @param data collation iterate struct; its fcdPosition is updated (NULL if the scan reached the start of the string)
   1737 * @return normalization status, TRUE for normalization to be done, FALSE
   1738 *         otherwise
   1739 */
   1740 static
   1741 inline UBool collPrevIterFCD(collIterate *data)
   1742 {
   1743     const UChar *src, *start;
   1744     uint8_t     leadingCC;
   1745     uint8_t     trailingCC = 0;
   1746     uint16_t    fcd;
   1747     UBool       result = FALSE;
   1748 
   1749     start = data->string;
   1750     src = data->pos + 1;
   1751 
   1752     /* Get the FCD value of the current character; its high byte is the leading combining class. */
   1753     fcd = g_nfcImpl->previousFCD16(start, src); /* NOTE(review): src is evidently moved back by this call (the loop below relies on it) - confirm in normalizer2impl.h */
   1754 
   1755     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1756 
   1757     if (leadingCC != 0) {
   1758         /*
   1759         The current char has a non-zero leading combining class.
   1760         Scan backward until we find a char with a trailing cc of zero.
   1761         */
   1762         for (;;)
   1763         {
   1764             if (start == src) { /* reached the start of the string without finding such a char */
   1765                 data->fcdPosition = NULL;
   1766                 return result;
   1767             }
   1768 
   1769             fcd = g_nfcImpl->previousFCD16(start, src);
   1770 
   1771             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
   1772 
   1773             if (trailingCC == 0) {
   1774                 break;
   1775             }
   1776 
   1777             if (leadingCC < trailingCC) { /* combining classes out of order: normalization is needed */
   1778                 result = TRUE;
   1779             }
   1780 
   1781             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
   1782         }
   1783     }
   1784 
   1785     data->fcdPosition = (UChar *)src; /* remember how far back this FCD scan reached */
   1786 
   1787     return result;
   1788 }
   1789 
   1790 /** Peeks at the code unit that is offset units away from the current position.
   1791  *  Reads through source->pos when it is set, else through source->iterator
   1792  *  (moving it there and back again); yields 0xfffd when neither is available.
   1793  *  No bounds or error checking - caller beware! */
   1794 static inline
   1795 UChar peekCodeUnit(collIterate *source, int32_t offset) {
   1796     if(source->pos != NULL) {
   1797         return source->pos[offset];
   1798     }
   1799     if(source->iterator == NULL) {
   1800         return 0xfffd;
   1801     }
   1802     UChar32 unit;
   1803     if(offset == 0) {
   1804         unit = source->iterator->current(source->iterator);
   1805     } else {
   1806         source->iterator->move(source->iterator, offset, UITER_CURRENT);
   1807         unit = source->iterator->next(source->iterator);
   1808         source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
   1809     }
   1810     return unit < 0 ? 0xfffd : (UChar)unit;  // With a well-behaved caller unit is never negative.
   1811 }
   1812 
   1813 // Code point version. Treats the offset as a _code point_ delta.
   1814 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
   1815 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
   1816 static inline
   1817 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
   1818     UChar32 c;
   1819     if(source->pos != NULL) {
   1820         const UChar *p = source->pos;
   1821         if(offset >= 0) {
   1822             // Skip forward over offset code points (a lead followed by a trail counts as one).
   1823             while(--offset >= 0) {
   1824                 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
   1825                     ++p;
   1826                 }
   1827             }
   1828             // Read the code point there.
   1829             c = *p++;
   1830             UChar trail;
   1831             if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
   1832                 c = U16_GET_SUPPLEMENTARY(c, trail);
   1833             }
   1834         } else /* offset<0 */ {
   1835             // Skip backward over (-offset-1) code points.
   1836             while(++offset < 0) {
   1837                 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
   1838                     --p;
   1839                 }
   1840             }
   1841             // Read the code point before that.
   1842             c = *--p;
   1843             UChar lead;
   1844             if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
   1845                 c = U16_GET_SUPPLEMENTARY(lead, c);
   1846             }
   1847         }
   1848     } else if(source->iterator != NULL) {
   1849         if(offset >= 0) {
   1850             // Skip forward over offset code points.
   1851             int32_t fwd = offset;
   1852             while(fwd-- > 0) {
   1853                 uiter_next32(source->iterator);
   1854             }
   1855             // Read the code point there.
   1856             c = uiter_current32(source->iterator);
   1857             // Return to the starting point, skipping backward over offset code points.
   1858             while(offset-- > 0) {
   1859                 uiter_previous32(source->iterator);
   1860             }
   1861         } else /* offset<0 */ {
   1862             // Read backward, reading -offset code points, remember only the last-read one.
   1863             int32_t back = offset;
   1864             do {
   1865                 c = uiter_previous32(source->iterator);
   1866             } while(++back < 0);
   1867             // Return to the starting position, skipping forward over -offset code points.
   1868             do {
   1869                 uiter_next32(source->iterator);
   1870             } while(++offset < 0);
   1871         }
   1872     } else {
   1873         c = U_SENTINEL; // neither a raw pointer nor an iterator to read from
   1874     }
   1875     return c;
   1876 }
   1877 
   1878 /** Tells whether the backwards collation iterator has reached the start of its data.
   1879  *  Inside the normalization side buffer the start is the leading 0 unit, and only
   1880  *  when there is no fcdPosition to go back to.
   1881  *  @param data collation iterator    @return TRUE if we are at the start */
   1882 static
   1883 inline UBool isAtStartPrevIterate(collIterate *data) {
   1884     if(data->pos == NULL && data->iterator != NULL) {
   1885         return !data->iterator->hasPrevious(data->iterator);
   1886     }
   1887     if(data->pos == data->string) {
   1888         return TRUE;
   1889     }
   1890     if((data->flags & UCOL_ITER_INNORMBUF) == 0 || data->pos == NULL) {
   1891         return FALSE;
   1892     }
   1893     return data->pos[-1] == 0 && data->fcdPosition == NULL;
   1894 }
   1894 
   1895 /**
   1896 * Moves the collation iterator back by one code unit.
   1897 * The character iterator is stepped back only while the UCOL_USE_ITERATOR
   1898 * flag is set; the raw position pointer is decremented whenever it is
   1899 * non-NULL. No check is made against the start of the data.
   1900 * @param data collation iterator struct
   1901 */
   1902 static
   1903 inline void goBackOne(collIterate *data) {
   1904     // Keep the character iterator in step, but only when it is the active input.
   1905     if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
   1906         data->iterator->previous(data->iterator);
   1907     }
   1908     // The raw pointer, when present, always moves.
   1909     if(data->pos) {
   1910         data->pos --;
   1911     }
   1912     // (The compiled-out "#if 0" variant, which stepped the iterator regardless of
   1913 }   //  UCOL_USE_ITERATOR, was dead code and is no longer carried here.)
   1914 
   1915 /**
   1916 * Inline function that gets the previous CE (backward iteration).
   1917 * It first checks the expansion buffer. If the expansion buffer is not
   1918 * empty, i.e. the return pointer (toReturn) is past the start of the CE buffer
   1919 * in use (extendCEs if set, otherwise CEs), the return pointer is decremented
   1920 * and the collation element it then points at is returned.
   1921 * For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE.
   1922 * @param coll collator data
   1923 * @param data collation iterator struct
   1924 * @param status error status
   1925 * @return the previous CE, or UCOL_NO_MORE_CES once the start of the source is reached */
   1926 static
   1927 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
   1928                                UErrorCode *status)
   1929 {
   1930     uint32_t result = (uint32_t)UCOL_NULLORDER;
   1931 
   1932     if (data->offsetReturn != NULL) {
   1933         if (data->offsetRepeatCount > 0) {
   1934                 data->offsetRepeatCount -= 1;
   1935         } else {
   1936             if (data->offsetReturn == data->offsetBuffer) { /* back at the first stored offset: empty the offset buffer */
   1937                 data->offsetReturn = NULL;
   1938                 data->offsetStore  = data->offsetBuffer;
   1939             } else {
   1940                 data->offsetReturn -= 1;
   1941             }
   1942         }
   1943     }
   1944 
   1945     if ((data->extendCEs && data->toReturn > data->extendCEs) ||
   1946             (!data->extendCEs && data->toReturn > data->CEs))
   1947     {
   1948         data->toReturn -= 1;
   1949         result = *(data->toReturn);
   1950         if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) { /* that was the first buffered CE: mark the buffer empty */
   1951             data->CEpos = data->toReturn;
   1952         }
   1953     }
   1954     else {
   1955         UChar ch = 0;
   1956 
   1957         do {
   1958             /*
   1959             Loop handles case when incremental normalize switches to or from the
   1960             side buffer / original string, and we need to start again to get the
   1961             next character.
   1962             */
   1963             for (;;) {
   1964                 if (data->flags & UCOL_ITER_HASLEN) {
   1965                     /*
   1966                     Normal path for strings when length is specified.
   1967                     Not in side buffer because it is always null terminated.
   1968                     */
   1969                     if (data->pos <= data->string) {
   1970                         /* Reached the start of the main source string */
   1971                         return UCOL_NO_MORE_CES;
   1972                     }
   1973                     data->pos --;
   1974                     ch = *data->pos;
   1975                 }
   1976                 // we are using an iterator to go back. Pray for us!
   1977                 else if (data->flags & UCOL_USE_ITERATOR) {
   1978                   UChar32 iterCh = data->iterator->previous(data->iterator);
   1979                   if(iterCh == U_SENTINEL) {
   1980                     return UCOL_NO_MORE_CES;
   1981                   } else {
   1982                     ch = (UChar)iterCh;
   1983                   }
   1984                 }
   1985                 else {
   1986                     data->pos --;
   1987                     ch = *data->pos;
   1988                     /* a 0 unit here is treated as the front terminator of the side buffer. */
   1989                     if (ch == 0) {
   1990                         /*
   1991                         At the start of the normalize side buffer.
   1992                         Go back to string.
   1993                         Because pointer points to the last accessed character,
   1994                         hence we have to increment it by one here.
   1995                         */
   1996                         data->flags = data->origFlags;
   1997                         data->offsetRepeatValue = 0;
   1998 
   1999                          if (data->fcdPosition == NULL) {
   2000                             data->pos = data->string;
   2001                             return UCOL_NO_MORE_CES;
   2002                         }
   2003                         else {
   2004                             data->pos   = data->fcdPosition + 1;
   2005                         }
   2006 
   2007                        continue;
   2008                     }
   2009                 }
   2010 
   2011                 if(data->flags&UCOL_HIRAGANA_Q) {
   2012                   if(ch>=0x3040 && ch<=0x309f) {
   2013                     data->flags |= UCOL_WAS_HIRAGANA;
   2014                   } else {
   2015                     data->flags &= ~UCOL_WAS_HIRAGANA;
   2016                   }
   2017                 }
   2018 
   2019                 /*
   2020                 * Got a character; decide whether there is any fcd and/or normalization
   2021                 * stuff to do. None is needed (we break out) when ch < ZERO_CC_LIMIT_,
   2022                 * when normalization is off, when fcdPosition is set and not after pos,
   2023                 * or when the current character is at the start of the string.
   2024                 * (ch < ZERO_CC_LIMIT_: Trailing combining class == 0.)
   2025                 * Note if pos is in the writablebuffer, norm is always 0
   2026                 */
   2027                 if (ch < ZERO_CC_LIMIT_ ||
   2028                   // this should propel us out of the loop in the iterator case
   2029                     (data->flags & UCOL_ITER_NORM) == 0 ||
   2030                     (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
   2031                     || data->string == data->pos) {
   2032                     break;
   2033                 }
   2034 
   2035                 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
   2036                     /* check whether the preceding character lets us skip the full FCD check */
   2037                     if (data->pos == data->string) {
   2038                         /* First char of string is always OK for FCD check */
   2039                         break;
   2040                     }
   2041 
   2042                     /* Not first char of string, do the FCD fast test */
   2043                     if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
   2044                         break;
   2045                     }
   2046                 }
   2047 
   2048                 /* Need a more complete FCD check and possible normalization. */
   2049                 if (collPrevIterFCD(data)) {
   2050                     collPrevIterNormalize(data);
   2051                 }
   2052 
   2053                 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
   2054                     /*  No normalization. Go ahead and process the char. */
   2055                     break;
   2056                 }
   2057 
   2058                 /*
   2059                 Some normalization happened.
   2060                 Next loop picks up a char from the normalization buffer.
   2061                 */
   2062             }
   2063 
   2064             /* attempt to handle contractions, after removal of the backwards
   2065             contraction
   2066             */
   2067             if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
   2068                 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
   2069             } else {
   2070                 if (ch <= 0xFF) {
   2071                     result = coll->latinOneMapping[ch];
   2072                 }
   2073                 else {
   2074                     // Always use UCA for [3400..9FFF], [AC00..D7AF]
   2075                     // **** [FA0E..FA2F] ?? ****
   2076                     if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
   2077                         (ch >= 0x3400 && ch <= 0xD7AF)) {
   2078                         if (ch > 0x9FFF && ch < 0xAC00) {
   2079                             // between the two target ranges; do normal lookup
   2080                             // **** this range is YI, Modifier tone letters, ****
   2081                             // **** Latin-D, Syloti Nagari, Phags-pa.        ****
   2082                             // **** Latin-D might be tailored, so we need to ****
   2083                             // **** do the normal lookup for these guys.     ****
   2084                              result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   2085                         } else {
   2086                             result = UCOL_NOT_FOUND;
   2087                         }
   2088                     } else {
   2089                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   2090                     }
   2091                 }
   2092                 if (result > UCOL_NOT_FOUND) {
   2093                     result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
   2094                 }
   2095                 if (result == UCOL_NOT_FOUND) { // Not found in master list
   2096                     if (!isAtStartPrevIterate(data) &&
   2097                         ucol_contractionEndCP(ch, data->coll))
   2098                     {
   2099                         result = UCOL_CONTRACTION;
   2100                     } else {
   2101                         if(coll->UCA) {
   2102                             result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   2103                         }
   2104                     }
   2105 
   2106                     if (result > UCOL_NOT_FOUND) {
   2107                         if(coll->UCA) {
   2108                             result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
   2109                         }
   2110                     }
   2111                 }
   2112             }
   2113         } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL ); /* fetch again when a code unit in the Hangul range came back as UCOL_IGNORABLE */
   2114 
   2115         if(result == UCOL_NOT_FOUND) { /* still no CE after all lookups: let getPrevImplicit supply one */
   2116             result = getPrevImplicit(ch, data);
   2117         }
   2118     }
   2119 
   2120     return result;
   2121 }
   2122 
   2123 
   2124 /*   ucol_getPrevCE, out-of-line version for use from other files: forwards its arguments unchanged to the static inline ucol_IGetPrevCE and returns its result.  */
   2125 U_CFUNC uint32_t  U_EXPORT2
   2126 ucol_getPrevCE(const UCollator *coll, collIterate *data,
   2127                         UErrorCode *status) {
   2128     return ucol_IGetPrevCE(coll, data, status);
   2129 }
   2130 
   2131 
   2132 /* Returns the first CE that forward iteration yields for the single code unit u,
   2133  * or 0 when *status reports a failure once the one-unit iterator has been set up.
   2134  * (this should be connected to special Jamo handling) */
   2135 U_CFUNC uint32_t  U_EXPORT2
   2136 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
   2137     collIterate oneUnit;
   2138     IInit_collIterate(coll, &u, 1, &oneUnit, status);
   2139     const UBool initFailed = U_FAILURE(*status);
   2140     return initFailed ? 0 : ucol_IGetNextCE(coll, &oneUnit, status);
   2141 }
   2142 
   2143 /**
   2144 * Appends one code unit to the end of the writable buffer; the terminating
   2145 * null is pushed back behind it.
   2146 * @param data collIterate struct data
   2147 * @param ch character to be appended
   2148 * @return the position of the new addition */
   2149 static
   2150 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
   2151 {
   2152     const int32_t insertAt = data->writableBuffer.length();
   2153     data->writableBuffer.append(ch);
   2154     return data->writableBuffer.getTerminatedBuffer() + insertAt;
   2155 }
   2156 
   2157 /**
   2158 * Inserts the argument string into the end of the buffer pushing back the
   2159 * null terminator.
   2160 * @param data collIterate struct data
   2161 * @param string to be appended
   2162 * @param length of the string to be appended
   2163 * @return the position of the new addition
   2164 */
   2165 static
   2166 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
   2167 {
   2168     int32_t oldLength = data->writableBuffer.length();
   2169     return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
   2170 }
   2171 
   2172 /**
   2173 * Special normalization function for contraction in the forwards iterator.
   2174 * This normalization sequence will place the current character at source->pos
   2175 * and its following normalized sequence into the buffer.
   2176 * The fcd position, pos will be changed.
   2177 * pos will now point to positions in the buffer.
   2178 * Flags will be changed accordingly.
   2179 * @param data collation iterator data
   2180 */
   2181 static
   2182 inline void normalizeNextContraction(collIterate *data)
   2183 {
   2184     int32_t     strsize;
   2185     UErrorCode  status     = U_ZERO_ERROR;
   2186     /* because the pointer points to the next character */
   2187     const UChar *pStart    = data->pos - 1;
   2188     const UChar *pEnd;
   2189 
   2190     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
   2191         data->writableBuffer.setTo(*(pStart - 1));
   2192         strsize               = 1;
   2193     }
   2194     else {
   2195         strsize = data->writableBuffer.length();
   2196     }
   2197 
   2198     pEnd = data->fcdPosition;
   2199 
   2200     data->writableBuffer.append(
   2201         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
   2202     if(U_FAILURE(status)) {
   2203         return;
   2204     }
   2205 
   2206     data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
   2207     data->origFlags  = data->flags;
   2208     data->flags     |= UCOL_ITER_INNORMBUF;
   2209     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2210 }
   2211 
   2212 /**
   2213 * Contraction character management function that returns the next character
   2214 * for the forwards iterator.
   2215 * Does nothing if the next character is in buffer and not the first character
   2216 * in it.
   2217 * Else it checks next character in data string to see if it is normalizable.
   2218 * If it is not, the character is simply copied into the buffer, else
   2219 * the whole normalized substring is copied into the buffer, including the
   2220 * current character.
   2221 * @param data collation element iterator data
   2222 * @return next character
   2223 */
   2224 static
   2225 inline UChar getNextNormalizedChar(collIterate *data)
   2226 {
   2227     UChar  nextch;
   2228     UChar  ch;
   2229     // Here we need to add the iterator code. One problem is the way
   2230     // end of string is handled. If we just return next char, it could
   2231     // be the sentinel. Most of the cases already check for this, but we
   2232     // need to be sure.
   2233     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
   2234          /* if no normalization and not in buffer. */
   2235       if(data->flags & UCOL_USE_ITERATOR) {
   2236          return (UChar)data->iterator->next(data->iterator);
   2237       } else {
   2238          return *(data->pos ++);
   2239       }
   2240     }
   2241 
   2242     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
   2243       //normalizeIterator(data);
   2244     //}
   2245 
   2246     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
   2247     if ((innormbuf && *data->pos != 0) ||
   2248         (data->fcdPosition != NULL && !innormbuf &&
   2249         data->pos < data->fcdPosition)) {
   2250         /*
   2251         if next character is in normalized buffer, no further normalization
   2252         is required
   2253         */
   2254         return *(data->pos ++);
   2255     }
   2256 
   2257     if (data->flags & UCOL_ITER_HASLEN) {
   2258         /* in data string */
   2259         if (data->pos + 1 == data->endp) {
   2260             return *(data->pos ++);
   2261         }
   2262     }
   2263     else {
   2264         if (innormbuf) {
   2265           // inside the normalization buffer, but at the end
   2266           // (since we encountered zero). This means, in the
   2267           // case we're using char iterator, that we need to
   2268           // do another round of normalization.
   2269           //if(data->origFlags & UCOL_USE_ITERATOR) {
   2270             // we need to restore original flags,
   2271             // otherwise, we'll lose them
   2272             //data->flags = data->origFlags;
   2273             //normalizeIterator(data);
   2274             //return *(data->pos++);
   2275           //} else {
   2276             /*
   2277             in writable buffer, at this point fcdPosition can not be
   2278             pointing to the end of the data string. see contracting tag.
   2279             */
   2280           if(data->fcdPosition) {
   2281             if (*(data->fcdPosition + 1) == 0 ||
   2282                 data->fcdPosition + 1 == data->endp) {
   2283                 /* at the end of the string, dump it into the normalizer */
   2284                 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
   2285                 // Check if data->pos received a null pointer
   2286                 if (data->pos == NULL) {
   2287                     return (UChar)-1; // Return to indicate error.
   2288                 }
   2289                 return *(data->fcdPosition ++);
   2290             }
   2291             data->pos = data->fcdPosition;
   2292           } else if(data->origFlags & UCOL_USE_ITERATOR) {
   2293             // if we are here, we're using a normalizing iterator.
   2294             // we should just continue further.
   2295             data->flags = data->origFlags;
   2296             data->pos = NULL;
   2297             return (UChar)data->iterator->next(data->iterator);
   2298           }
   2299           //}
   2300         }
   2301         else {
   2302             if (*(data->pos + 1) == 0) {
   2303                 return *(data->pos ++);
   2304             }
   2305         }
   2306     }
   2307 
   2308     ch = *data->pos ++;
   2309     nextch = *data->pos;
   2310 
   2311     /*
   2312     * if the current character is not fcd.
   2313     * Trailing combining class == 0.
   2314     */
   2315     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
   2316         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
   2317          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
   2318             /*
   2319             Need a more complete FCD check and possible normalization.
   2320             normalize substring will be appended to buffer
   2321             */
   2322         if (collIterFCD(data)) {
   2323             normalizeNextContraction(data);
   2324             return *(data->pos ++);
   2325         }
   2326         else if (innormbuf) {
   2327             /* fcdposition shifted even when there's no normalization, if we
   2328             don't input the rest into this, we'll get the wrong position when
   2329             we reach the end of the writableBuffer */
   2330             int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
   2331             data->pos = insertBufferEnd(data, data->pos - 1, length);
   2332             // Check if data->pos received a null pointer
   2333             if (data->pos == NULL) {
   2334                 return (UChar)-1; // Return to indicate error.
   2335             }
   2336             return *(data->pos ++);
   2337         }
   2338     }
   2339 
   2340     if (innormbuf) {
   2341         /*
   2342         no normalization is to be done hence only one character will be
   2343         appended to the buffer.
   2344         */
   2345         data->pos = insertBufferEnd(data, ch) + 1;
   2346         // Check if data->pos received a null pointer
   2347         if (data->pos == NULL) {
   2348             return (UChar)-1; // Return to indicate error.
   2349         }
   2350     }
   2351 
   2352     /* points back to the pos in string */
   2353     return ch;
   2354 }
   2355 
   2356 
   2357 
   2358 /**
   2359 * Function to copy the buffer into writableBuffer and sets the fcd position to
   2360 * the correct position
   2361 * @param source data string source
   2362 * @param buffer character buffer
   2363 */
   2364 static
   2365 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
   2366 {
   2367     /* okay confusing part here. to ensure that the skipped characters are
   2368     considered later, we need to place it in the appropriate position in the
   2369     normalization buffer and reassign the pos pointer. simple case if pos
   2370     reside in string, simply copy to normalization buffer and
   2371     fcdposition = pos, pos = start of normalization buffer. if pos in
   2372     normalization buffer, we'll insert the copy infront of pos and point pos
   2373     to the start of the normalization buffer. why am i doing these copies?
   2374     well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
   2375     not require any changes, which be really painful. */
   2376     if (source->flags & UCOL_ITER_INNORMBUF) {
   2377         int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
   2378         source->writableBuffer.replace(0, replaceLength, buffer);
   2379     }
   2380     else {
   2381         source->fcdPosition  = source->pos;
   2382         source->origFlags    = source->flags;
   2383         source->flags       |= UCOL_ITER_INNORMBUF;
   2384         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
   2385         source->writableBuffer = buffer;
   2386     }
   2387 
   2388     source->pos = source->writableBuffer.getTerminatedBuffer();
   2389 }
   2390 
   2391 /**
   2392 * Function to get the discontiguous collation element within the source.
   2393 * Note this function will set the position to the appropriate places.
        *
        * Outline of what the code below does:
        *  - the iterator state is backed up and `buffer` is seeded with the code
        *    point just before the current position;
        *  - each loop round reads one more character with getNextNormalizedChar()
        *    and looks it up in the contraction table starting at tempconstart;
        *  - characters that do not match, or that match but have the same
        *    combining class as the code point at offset -2, are collected in
        *    `buffer` and the scan continues;
        *  - a match whose CE is itself a contraction moves tempconstart to the
        *    nested table; any other match pushes `buffer` back in front of the
        *    iterator via setDiscontiguosAttribute() and returns that CE;
        *  - when the scan stops without such a match (end of text, a character
        *    with combining class 0, or a UCOL_NOT_FOUND entry) the saved state is
        *    restored, the iterator is moved back by one, and the CE stored at
        *    constart is returned. The exception is a stop after a nested
        *    contraction was entered (multicontraction), which returns the CE at
        *    tempconstart instead.
   2394 * @param coll current collator used
   2395 * @param source data string source
   2396 * @param constart index to the start character in the contraction table
   2397 * @return discontiguous collation element offset
   2398 */
   2399 static
   2400 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
   2401                                 const UChar *constart)
   2402 {
   2403     /* source->pos currently points to the second combining character after
   2404        the start character */
   2405           const UChar *temppos      = source->pos;
   2406           UnicodeString buffer;
   2407     const UChar   *tempconstart = constart;
   2408           uint8_t  tempflags    = source->flags;
   2409           UBool    multicontraction = FALSE;
   2410           collIterateState discState;
   2411 
   2412           backupState(source, &discState);
   2413 
            /* buffer collects the characters that were read but not consumed by
               the contraction, starting with the code point before the current
               position */
   2414     buffer.setTo(peekCodePoint(source, -1));
   2415     for (;;) {
   2416         UChar    *UCharOffset;
   2417         UChar     schar,
   2418                   tchar;
   2419         uint32_t  result;
   2420 
                /* stop condition: end of a length-delimited string, or a 0 code
                   unit that is a real end (see the fcdPosition tests), or the next
                   code point has combining class 0 */
   2421         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
   2422             || (peekCodeUnit(source, 0) == 0  &&
   2423             //|| (*source->pos == 0  &&
   2424                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
   2425                  source->fcdPosition == NULL ||
   2426                  source->fcdPosition == source->endp ||
   2427                  *(source->fcdPosition) == 0 ||
   2428                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
   2429                  /* end of string in null terminated string or stopped by a
   2430                  null character, note fcd does not always point to a base
   2431                  character after the discontiguous change */
   2432                  u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
   2433                  //u_getCombiningClass(*(source->pos)) == 0) {
   2434             //constart = (UChar *)coll->image + getContractOffset(CE);
   2435             if (multicontraction) {
                        /* a nested contraction table was entered earlier: resume
                           from the position recorded then and return the CE stored
                           at the head of that nested table */
   2436                 source->pos    = temppos - 1;
   2437                 setDiscontiguosAttribute(source, buffer);
   2438                 return *(coll->contractionCEs +
   2439                                     (tempconstart - coll->contractionIndex));
   2440             }
   2441             constart = tempconstart;
   2442             break;
   2443         }
   2444 
   2445         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
   2446         schar = getNextNormalizedChar(source);
   2447 
                /* advance through the table while its entries are smaller than
                   schar */
   2448         while (schar > (tchar = *UCharOffset)) {
   2449             UCharOffset++;
   2450         }
   2451 
   2452         if (schar != tchar) {
   2453             /* not the correct codepoint. we stuff the current codepoint into
   2454             the discontiguous buffer and try the next character */
   2455             buffer.append(schar);
   2456             continue;
   2457         }
   2458         else {
                    /* table match, but the character has the same combining class
                       as the code point at offset -2: it is not used for the
                       contraction and is kept as a skipped character */
   2459             if (u_getCombiningClass(schar) ==
   2460                 u_getCombiningClass(peekCodePoint(source, -2))) {
   2461                 buffer.append(schar);
   2462                 continue;
   2463             }
   2464             result = *(coll->contractionCEs +
   2465                                       (UCharOffset - coll->contractionIndex));
   2466         }
   2467 
   2468         if (result == UCOL_NOT_FOUND) {
   2469           break;
   2470         } else if (isContraction(result)) {
   2471             /* this is a multi-contraction*/
   2472             tempconstart = (UChar *)coll->image + getContractOffset(result);
                    /* note: the test below reads the CE stored at constart (the
                       table passed in), not at the new tempconstart */
   2473             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
   2474                 != UCOL_NOT_FOUND) {
   2475                 multicontraction = TRUE;
   2476                 temppos       = source->pos + 1;
   2477             }
   2478         } else {
                    /* complete match: put the skipped characters back in front of
                       the iterator and hand out the CE */
   2479             setDiscontiguosAttribute(source, buffer);
   2480             return result;
   2481         }
   2482     }
   2483 
   2484     /* no problems simply reverting just like that,
   2485     if we are in string before getting into this function, points back to
   2486     string hence no problem.
   2487     if we are in normalization buffer before getting into this function,
   2488     since we'll never use another normalization within this function, we
   2489     know that fcdposition points to a base character. the normalization buffer
   2490     never change, hence this revert works. */
   2491     loadState(source, &discState, TRUE);
   2492     goBackOne(source);
   2493 
   2494     //source->pos   = temppos - 1;
   2495     source->flags = tempflags;
   2496     return *(coll->contractionCEs + (constart - coll->contractionIndex));
   2497 }
   2498 
   2499 /* now uses Mark's getImplicitPrimary code */
   2500 static
   2501 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
   2502     uint32_t r = uprv_uca_getImplicitPrimary(cp);
   2503     *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
   2504     collationSource->offsetRepeatCount += 1;
   2505     return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
   2506 }
   2507 
   2508 /**
   2509 * Inserts the argument character into the front of the buffer replacing the
   2510 * front null terminator.
   2511 * @param data collation element iterator data
   2512 * @param ch character to be appended
   2513 */
   2514 static
   2515 inline void insertBufferFront(collIterate *data, UChar ch)
   2516 {
   2517     data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
   2518 }
   2519 
   2520 /**
   2521 * Special normalization function for contraction in the previous iterator.
   2522 * This normalization sequence will place the current character at source->pos
   2523 * and its following normalized sequence into the buffer.
   2524 * The fcd position, pos will be changed.
   2525 * pos will now point to positions in the buffer.
   2526 * Flags will be changed accordingly.
   2527 * @param data collation iterator data
   2528 */
   2529 static
   2530 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
   2531 {
   2532     const UChar *pEnd = data->pos + 1;         /* End normalize + 1 */
   2533     const UChar *pStart;
   2534 
   2535     UnicodeString endOfBuffer;
   2536     if (data->flags & UCOL_ITER_HASLEN) {
   2537         /*
   2538         normalization buffer not used yet, we'll pull down the next
   2539         character into the end of the buffer
   2540         */
   2541         endOfBuffer.setTo(*pEnd);
   2542     }
   2543     else {
   2544         endOfBuffer.setTo(data->writableBuffer, 1);  // after the leading NUL
   2545     }
   2546 
   2547     if (data->fcdPosition == NULL) {
   2548         pStart = data->string;
   2549     }
   2550     else {
   2551         pStart = data->fcdPosition + 1;
   2552     }
   2553     int32_t normLen =
   2554         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
   2555                              data->writableBuffer,
   2556                              *status).
   2557         length();
   2558     if(U_FAILURE(*status)) {
   2559         return;
   2560     }
   2561     /*
   2562     this puts the null termination infront of the normalized string instead
   2563     of the end
   2564     */
   2565     data->pos =
   2566         data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
   2567         1 + normLen;
   2568     data->origFlags  = data->flags;
   2569     data->flags     |= UCOL_ITER_INNORMBUF;
   2570     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   2571 }
   2572 
   2573 /**
   2574 * Contraction character management function that returns the previous character
   2575 * for the backwards iterator.
   2576 * Does nothing if the previous character is in buffer and not the first
   2577 * character in it.
   2578 * Else it checks previous character in data string to see if it is
   2579 * normalizable.
   2580 * If it is not, the character is simply copied into the buffer, else
   2581 * the whole normalized substring is copied into the buffer, including the
   2582 * current character.
        *
        * On the fast path (no normalization requested and not in the buffer, or
        * in the buffer with a non-zero unit before pos) the unit before data->pos
        * is returned without changing data->pos; with UCOL_USE_ITERATOR the
        * character iterator is moved back by one and read with next() instead.
   2583 * @param data collation element iterator data
        * @param status error code; only passed on to normalizePrevContraction
   2584 * @return previous character
   2585 */
   2586 static
   2587 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
   2588 {
   2589     UChar  prevch;
   2590     UChar  ch;
   2591     const UChar *start;
   2592     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
   2593     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
   2594         (innormbuf && *(data->pos - 1) != 0)) {
   2595         /*
   2596         if no normalization.
   2597         if previous character is in normalized buffer, no further normalization
   2598         is required
   2599         */
   2600       if(data->flags & UCOL_USE_ITERATOR) {
   2601         data->iterator->move(data->iterator, -1, UITER_CURRENT);
   2602         return (UChar)data->iterator->next(data->iterator);
   2603       } else {
   2604         return *(data->pos - 1);
   2605       }
   2606     }
   2607 
            /* Slow path: find the character to examine (ch) and the one before it
               (prevch), either in the data string or at fcdPosition. */
   2608     start = data->pos;
   2609     if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
   2610         /* in data string */
   2611         if ((start - 1) == data->string) {
                    /* first unit of the string: nothing in front of it to check */
   2612             return *(start - 1);
   2613         }
   2614         start --;
   2615         ch     = *start;
   2616         prevch = *(start - 1);
   2617     }
   2618     else {
   2619         /*
   2620         in writable buffer, at this point fcdPosition can not be NULL.
   2621         see contracting tag.
   2622         */
   2623         if (data->fcdPosition == data->string) {
   2624             /* at the start of the string, just dump it into the normalizer */
   2625             insertBufferFront(data, *(data->fcdPosition));
   2626             data->fcdPosition = NULL;
   2627             return *(data->pos - 1);
   2628         }
   2629         start  = data->fcdPosition;
   2630         ch     = *start;
   2631         prevch = *(start - 1);
   2632     }
   2633     /*
   2634     * if the current character is not fcd.
   2635     * Trailing combining class == 0.
   2636     */
   2637     if (data->fcdPosition > start &&
   2638        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
   2639     {
   2640         /*
   2641         Need a more complete FCD check and possible normalization.
   2642         normalize substring will be appended to buffer
   2643         */
   2644         const UChar *backuppos = data->pos;
                /* collPrevIterFCD is called with pos temporarily set to start */
   2645         data->pos = start;
   2646         if (collPrevIterFCD(data)) {
   2647             normalizePrevContraction(data, status);
   2648             return *(data->pos - 1);
   2649         }
                /* no normalization needed: restore pos and advance fcdPosition by
                   one */
   2650         data->pos = backuppos;
   2651         data->fcdPosition ++;
   2652     }
   2653 
   2654     if (innormbuf) {
   2655     /*
   2656     no normalization is to be done hence only one character will be
   2657     appended to the buffer.
   2658     */
   2659         insertBufferFront(data, ch);
   2660         data->fcdPosition --;
   2661     }
   2662 
   2663     return ch;
   2664 }
   2665 
   2666 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
   2667 /* It is called by getNextCE */
   2668 
   2669 /* The following should be even */
   2670 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
   2671 
   2672 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
   2673     collIterateState entryState;
   2674     backupState(source, &entryState);
   2675     UChar32 cp = ch;
   2676 
   2677     for (;;) {
   2678         // This loop will repeat only in the case of contractions, and only when a contraction
   2679         //   is found and the first CE resulting from that contraction is itself a special
   2680         //   (an expansion, for example.)  All other special CE types are fully handled the
   2681         //   first time through, and the loop exits.
   2682 
   2683         const uint32_t *CEOffset = NULL;
   2684         switch(getCETag(CE)) {
   2685         case NOT_FOUND_TAG:
   2686             /* This one is not found, and we'll let somebody else bother about it... no more games */
   2687             return CE;
   2688         case SPEC_PROC_TAG:
   2689             {
   2690                 // Special processing is getting a CE that is preceded by a certain prefix
   2691                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   2692                 // When we encouter a special processing tag, we go backwards and try to see if
   2693                 // we have a match.
   2694                 // Contraction tables are used - so the whole process is not unlike contraction.
   2695                 // prefix data is stored backwards in the table.
   2696                 const UChar *UCharOffset;
   2697                 UChar schar, tchar;
   2698                 collIterateState prefixState;
   2699                 backupState(source, &prefixState);
   2700                 loadState(source, &entryState, TRUE);
   2701                 goBackOne(source); // We want to look at the point where we entered - actually one
   2702                 // before that...
   2703 
   2704                 for(;;) {
   2705                     // This loop will run once per source string character, for as long as we
   2706                     //  are matching a potential contraction sequence
   2707 
   2708                     // First we position ourselves at the begining of contraction sequence
   2709                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2710                     if (collIter_bos(source)) {
   2711                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2712                         break;
   2713                     }
   2714                     schar = getPrevNormalizedChar(source, status);
   2715                     goBackOne(source);
   2716 
   2717                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2718                         UCharOffset++;
   2719                     }
   2720 
   2721                     if (schar == tchar) {
   2722                         // Found the source string char in the table.
   2723                         //  Pick up the corresponding CE from the table.
   2724                         CE = *(coll->contractionCEs +
   2725                             (UCharOffset - coll->contractionIndex));
   2726                     }
   2727                     else
   2728                     {
   2729                         // Source string char was not in the table.
   2730                         //   We have not found the prefix.
   2731                         CE = *(coll->contractionCEs +
   2732                             (ContractionStart - coll->contractionIndex));
   2733                     }
   2734 
   2735                     if(!isPrefix(CE)) {
   2736                         // The source string char was in the contraction table, and the corresponding
   2737                         //   CE is not a prefix CE.  We found the prefix, break
   2738                         //   out of loop, this CE will end up being returned.  This is the normal
   2739                         //   way out of prefix handling when the source actually contained
   2740                         //   the prefix.
   2741                         break;
   2742                     }
   2743                 }
   2744                 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
   2745                     loadState(source, &prefixState, TRUE);
   2746                     if(source->origFlags & UCOL_USE_ITERATOR) {
   2747                         source->flags = source->origFlags;
   2748                     }
   2749                 } else { // prefix search was a failure, we have to backup all the way to the start
   2750                     loadState(source, &entryState, TRUE);
   2751                 }
   2752                 break;
   2753             }
   2754         case CONTRACTION_TAG:
   2755             {
   2756                 /* This should handle contractions */
   2757                 collIterateState state;
   2758                 backupState(source, &state);
   2759                 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
   2760                 const UChar *UCharOffset;
   2761                 UChar schar, tchar;
   2762 
   2763                 for (;;) {
   2764                     /* This loop will run once per source string character, for as long as we     */
   2765                     /*  are matching a potential contraction sequence                  */
   2766 
   2767                     /* First we position ourselves at the begining of contraction sequence */
   2768                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   2769 
   2770                     if (collIter_eos(source)) {
   2771                         // Ran off the end of the source string.
   2772                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   2773                         // So we'll pick whatever we have at the point...
   2774                         if (CE == UCOL_NOT_FOUND) {
   2775                             // back up the source over all the chars we scanned going into this contraction.
   2776                             CE = firstCE;
   2777                             loadState(source, &state, TRUE);
   2778                             if(source->origFlags & UCOL_USE_ITERATOR) {
   2779                                 source->flags = source->origFlags;
   2780                             }
   2781                         }
   2782                         break;
   2783                     }
   2784 
   2785                     uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
   2786                     uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
   2787 
   2788                     schar = getNextNormalizedChar(source);
   2789                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   2790                         UCharOffset++;
   2791                     }
   2792 
   2793                     if (schar == tchar) {
   2794                         // Found the source string char in the contraction table.
   2795                         //  Pick up the corresponding CE from the table.
   2796                         CE = *(coll->contractionCEs +
   2797                             (UCharOffset - coll->contractionIndex));
   2798                     }
   2799                     else
   2800                     {
   2801                         // Source string char was not in contraction table.
   2802                         //   Unless we have a discontiguous contraction, we have finished
   2803                         //   with this contraction.
   2804                         // in order to do the proper detection, we
   2805                         // need to see if we're dealing with a supplementary
   2806                         /* We test whether the next two char are surrogate pairs.
   2807                         * This test is done if the iterator is not NULL.
   2808                         * If there is no surrogate pair, the iterator
   2809                         * goes back one if needed. */
   2810                         UChar32 miss = schar;
   2811                         if (source->iterator) {
   2812                             UChar32 surrNextChar; /* the next char in the iteration to test */
   2813                             int32_t prevPos; /* holds the previous position before move forward of the source iterator */
   2814                             if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
   2815                                 prevPos = source->iterator->index;
   2816                                 surrNextChar = getNextNormalizedChar(source);
   2817                                 if (U16_IS_TRAIL(surrNextChar)) {
   2818                                     miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
   2819                                 } else if (prevPos < source->iterator->index){
   2820                                     goBackOne(source);
   2821                                 }
   2822                             }
   2823                         } else if (U16_IS_LEAD(schar)) {
   2824                             miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
   2825                         }
   2826 
   2827                         uint8_t sCC;
   2828                         if (miss < 0x300 ||
   2829                             maxCC == 0 ||
   2830                             (sCC = i_getCombiningClass(miss, coll)) == 0 ||
   2831                             sCC>maxCC ||
   2832                             (allSame != 0 && sCC == maxCC) ||
   2833                             collIter_eos(source))
   2834                         {
   2835                             //  Contraction can not be discontiguous.
   2836                             goBackOne(source);  // back up the source string by one,
   2837                             //  because  the character we just looked at was
   2838                             //  not part of the contraction.   */
   2839                             if(U_IS_SUPPLEMENTARY(miss)) {
   2840                                 goBackOne(source);
   2841                             }
   2842                             CE = *(coll->contractionCEs +
   2843                                 (ContractionStart - coll->contractionIndex));
   2844                         } else {
   2845                             //
   2846                             // Contraction is possibly discontiguous.
   2847                             //   Scan more of source string looking for a match
   2848                             //
   2849                             UChar tempchar;
   2850                             /* find the next character if schar is not a base character
   2851                             and we are not yet at the end of the string */
   2852                             tempchar = getNextNormalizedChar(source);
   2853                             // probably need another supplementary thingie here
   2854                             goBackOne(source);
   2855                             if (i_getCombiningClass(tempchar, coll) == 0) {
   2856                                 goBackOne(source);
   2857                                 if(U_IS_SUPPLEMENTARY(miss)) {
   2858                                     goBackOne(source);
   2859                                 }
   2860                                 /* Spit out the last char of the string, wasn't tasty enough */
   2861                                 CE = *(coll->contractionCEs +
   2862                                     (ContractionStart - coll->contractionIndex));
   2863                             } else {
   2864                                 CE = getDiscontiguous(coll, source, ContractionStart);
   2865                             }
   2866                         }
   2867                     } // else after if(schar == tchar)
   2868 
   2869                     if(CE == UCOL_NOT_FOUND) {
   2870                         /* The Source string did not match the contraction that we were checking.  */
   2871                         /*  Back up the source position to undo the effects of having partially    */
   2872                         /*   scanned through what ultimately proved to not be a contraction.       */
   2873                         loadState(source, &state, TRUE);
   2874                         CE = firstCE;
   2875                         break;
   2876                     }
   2877 
   2878                     if(!isContraction(CE)) {
   2879                         // The source string char was in the contraction table, and the corresponding
   2880                         //   CE is not a contraction CE.  We completed the contraction, break
   2881                         //   out of loop, this CE will end up being returned.  This is the normal
   2882                         //   way out of contraction handling when the source actually contained
   2883                         //   the contraction.
   2884                         break;
   2885                     }
   2886 
   2887 
   2888                     // The source string char was in the contraction table, and the corresponding
   2889                     //   CE is IS  a contraction CE.  We will continue looping to check the source
   2890                     //   string for the remaining chars in the contraction.
   2891                     uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
   2892                     if(tempCE != UCOL_NOT_FOUND) {
   2893                         // We have scanned a a section of source string for which there is a
   2894                         //  CE from the contraction table.  Remember the CE and scan position, so
   2895                         //  that we can return to this point if further scanning fails to
   2896                         //  match a longer contraction sequence.
   2897                         firstCE = tempCE;
   2898 
   2899                         goBackOne(source);
   2900                         backupState(source, &state);
   2901                         getNextNormalizedChar(source);
   2902 
   2903                         // Another way to do this is:
   2904                         //collIterateState tempState;
   2905                         //backupState(source, &tempState);
   2906                         //goBackOne(source);
   2907                         //backupState(source, &state);
   2908                         //loadState(source, &tempState, TRUE);
   2909 
   2910                         // The problem is that for incomplete contractions we have to remember the previous
   2911                         // position. Before, the only thing I needed to do was state.pos--;
   2912                         // After iterator introduction and especially after introduction of normalizing
   2913                         // iterators, it became much more difficult to decrease the saved state.
   2914                         // I'm not yet sure which of the two methods above is faster.
   2915                     }
   2916                 } // for(;;)
   2917                 break;
   2918             } // case CONTRACTION_TAG:
   2919         case LONG_PRIMARY_TAG:
   2920             {
   2921                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   2922                 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   2923                 source->offsetRepeatCount += 1;
   2924                 return CE;
   2925             }
   2926         case EXPANSION_TAG:
   2927             {
   2928                 /* This should handle expansion. */
   2929                 /* NOTE: we can encounter both continuations and expansions in an expansion! */
   2930                 /* I have to decide where continuations are going to be dealt with */
   2931                 uint32_t size;
   2932                 uint32_t i;    /* general counter */
   2933 
   2934                 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   2935                 size = getExpansionCount(CE);
   2936                 CE = *CEOffset++;
   2937               //source->offsetRepeatCount = -1;
   2938 
   2939                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   2940                     for(i = 1; i<size; i++) {
   2941                         *(source->CEpos++) = *CEOffset++;
   2942                         source->offsetRepeatCount += 1;
   2943                     }
   2944                 } else { /* else, we do */
   2945                     while(*CEOffset != 0) {
   2946                         *(source->CEpos++) = *CEOffset++;
   2947                         source->offsetRepeatCount += 1;
   2948                     }
   2949                 }
   2950 
   2951                 return CE;
   2952             }
   2953         case DIGIT_TAG:
   2954             {
   2955                 /*
   2956                 We do a check to see if we want to collate digits as numbers; if so we generate
   2957                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   2958                 */
   2959                 //uint32_t size;
   2960                 uint32_t i;    /* general counter */
   2961 
   2962                 if (source->coll->numericCollation == UCOL_ON){
   2963                     collIterateState digitState = {0,0,0,0,0,0,0,0,0};
   2964                     UChar32 char32 = 0;
   2965                     int32_t digVal = 0;
   2966 
   2967                     uint32_t digIndx = 0;
   2968                     uint32_t endIndex = 0;
   2969                     uint32_t trailingZeroIndex = 0;
   2970 
   2971                     uint8_t collateVal = 0;
   2972 
   2973                     UBool nonZeroValReached = FALSE;
   2974 
   2975                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
   2976                     /*
   2977                          We parse the source string until we hit a char that's NOT a digit.
   2978                         Use this u_charDigitValue. This might be slow because we have to
   2979                         handle surrogates...
   2980                     */
   2981             /*
   2982                     if (U16_IS_LEAD(ch)){
   2983                       if (!collIter_eos(source)) {
   2984                         backupState(source, &digitState);
   2985                         UChar trail = getNextNormalizedChar(source);
   2986                         if(U16_IS_TRAIL(trail)) {
   2987                           char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   2988                         } else {
   2989                           loadState(source, &digitState, TRUE);
   2990                           char32 = ch;
   2991                         }
   2992                       } else {
   2993                         char32 = ch;
   2994                       }
   2995                     } else {
   2996                       char32 = ch;
   2997                     }
   2998                     digVal = u_charDigitValue(char32);
   2999             */
   3000                     digVal = u_charDigitValue(cp); // if we have arrived here, we have
   3001                     // already processed possible supplementaries that trigered the digit tag -
   3002                     // all supplementaries are marked in the UCA.
   3003                     /*
   3004                         We  pad a zero in front of the first element anyways. This takes
   3005                         care of the (probably) most common case where people are sorting things followed
   3006                         by a single digit
   3007                     */
   3008                     digIndx++;
   3009                     for(;;){
   3010                         // Make sure we have enough space. No longer needed;
   3011                         // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
   3012                         // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
   3013                         // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
   3014 
   3015                         // Skipping over leading zeroes.
   3016                         if (digVal != 0) {
   3017                             nonZeroValReached = TRUE;
   3018                         }
   3019                         if (nonZeroValReached) {
   3020                             /*
   3021                             We parse the digit string into base 100 numbers (this fits into a byte).
   3022                             We only add to the buffer in twos, thus if we are parsing an odd character,
   3023                             that serves as the 'tens' digit while the if we are parsing an even one, that
   3024                             is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
   3025                             a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
   3026                             overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3027                             than all the other bytes.
   3028                             */
   3029 
   3030                             if (digIndx % 2 == 1){
   3031                                 collateVal += (uint8_t)digVal;
   3032 
   3033                                 // We don't enter the low-order-digit case unless we've already seen
   3034                                 // the high order, or for the first digit, which is always non-zero.
   3035                                 if (collateVal != 0)
   3036                                     trailingZeroIndex = 0;
   3037 
   3038                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3039                                 collateVal = 0;
   3040                             }
   3041                             else{
   3042                                 // We drop the collation value into the buffer so if we need to do
   3043                                 // a "front patch" we don't have to check to see if we're hitting the
   3044                                 // last element.
   3045                                 collateVal = (uint8_t)(digVal * 10);
   3046 
   3047                                 // Check for trailing zeroes.
   3048                                 if (collateVal == 0)
   3049                                 {
   3050                                     if (!trailingZeroIndex)
   3051                                         trailingZeroIndex = (digIndx/2) + 2;
   3052                                 }
   3053                                 else
   3054                                     trailingZeroIndex = 0;
   3055 
   3056                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3057                             }
   3058                             digIndx++;
   3059                         }
   3060 
   3061                         // Get next character.
   3062                         if (!collIter_eos(source)){
   3063                             ch = getNextNormalizedChar(source);
   3064                             if (U16_IS_LEAD(ch)){
   3065                                 if (!collIter_eos(source)) {
   3066                                     backupState(source, &digitState);
   3067                                     UChar trail = getNextNormalizedChar(source);
   3068                                     if(U16_IS_TRAIL(trail)) {
   3069                                         char32 = U16_GET_SUPPLEMENTARY(ch, trail);
   3070                                     } else {
   3071                                         loadState(source, &digitState, TRUE);
   3072                                         char32 = ch;
   3073                                     }
   3074                                 }
   3075                             } else {
   3076                                 char32 = ch;
   3077                             }
   3078 
   3079                             if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
   3080                                 // Resetting position to point to the next unprocessed char. We
   3081                                 // overshot it when doing our test/set for numbers.
   3082                                 if (char32 > 0xFFFF) { // For surrogates.
   3083                                     loadState(source, &digitState, TRUE);
   3084                                     //goBackOne(source);
   3085                                 }
   3086                                 goBackOne(source);
   3087                                 break;
   3088                             }
   3089                         } else {
   3090                             break;
   3091                         }
   3092                     }
   3093 
   3094                     if (nonZeroValReached == FALSE){
   3095                         digIndx = 2;
   3096                         numTempBuf[2] = 6;
   3097                     }
   3098 
   3099                     endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
   3100                     if (digIndx % 2 != 0){
   3101                         /*
   3102                         We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
   3103                         we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
   3104                         Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
   3105                         single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
   3106                         */
   3107 
   3108                         for(i = 2; i < endIndex; i++){
   3109                             numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
   3110                                 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
   3111                         }
   3112                         --digIndx;
   3113                     }
   3114 
   3115                     // Subtract one off of the last byte.
   3116                     numTempBuf[endIndex-1] -= 1;
   3117 
   3118                     /*
   3119                     We want to skip over the first two slots in the buffer. The first slot
   3120                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3121                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3122                     */
   3123                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3124                     numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
   3125 
   3126                     // Now transfer the collation key to our collIterate struct.
   3127                     // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
   3128                     //size = ((endIndex+1) & ~1)/2;
   3129                     CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3130                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3131                         UCOL_BYTE_COMMON; // Tertiary weight.
   3132                     i = 2; // Reset the index into the buffer.
   3133                     while(i < endIndex)
   3134                     {
   3135                         uint32_t primWeight = numTempBuf[i++] << 8;
   3136                         if ( i < endIndex)
   3137                             primWeight |= numTempBuf[i++];
   3138                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3139                     }
   3140 
   3141                 } else {
   3142                     // no numeric mode, we'll just switch to whatever we stashed and continue
   3143                     CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   3144                     CE = *CEOffset++;
   3145                     break;
   3146                 }
   3147                 return CE;
   3148             }
   3149             /* various implicits optimization */
   3150         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   3151             /* UCA is filled with these. Tailorings are NOT_FOUND */
   3152             return getImplicit(cp, source);
   3153         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   3154             // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
   3155             return getImplicit(cp, source);
   3156         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3157             {
   3158                 static const uint32_t
   3159                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3160                 //const uint32_t LCount = 19;
   3161                 static const uint32_t VCount = 21;
   3162                 static const uint32_t TCount = 28;
   3163                 //const uint32_t NCount = VCount * TCount;   // 588
   3164                 //const uint32_t SCount = LCount * NCount;   // 11172
   3165                 uint32_t L = ch - SBase;
   3166 
   3167                 // divide into pieces
   3168 
   3169                 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
   3170                 L /= TCount;
   3171                 uint32_t V = L % VCount;
   3172                 L /= VCount;
   3173 
   3174                 // offset them
   3175 
   3176                 L += LBase;
   3177                 V += VBase;
   3178                 T += TBase;
   3179 
   3180                 // return the first CE, but first put the rest into the expansion buffer
   3181                 if (!source->coll->image->jamoSpecial) { // FAST PATH
   3182 
   3183                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   3184                     if (T != TBase) {
   3185                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   3186                     }
   3187 
   3188                     return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   3189 
   3190                 } else { // Jamo is Special
   3191                     // Since Hanguls pass the FCD check, it is
   3192                     // guaranteed that we won't be in
   3193                     // the normalization buffer if something like this happens
   3194 
   3195                     // However, if we are using a uchar iterator and normalization
   3196                     // is ON, the Hangul that lead us here is going to be in that
   3197                     // normalization buffer. Here we want to restore the uchar
   3198                     // iterator state and pull out of the normalization buffer
   3199                     if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
   3200                         source->flags = source->origFlags; // restore the iterator
   3201                         source->pos = NULL;
   3202                     }
   3203 
   3204                     // Move Jamos into normalization buffer
   3205                     UChar *buffer = source->writableBuffer.getBuffer(4);
   3206                     int32_t bufferLength;
   3207                     buffer[0] = (UChar)L;
   3208                     buffer[1] = (UChar)V;
   3209                     if (T != TBase) {
   3210                         buffer[2] = (UChar)T;
   3211                         bufferLength = 3;
   3212                     } else {
   3213                         bufferLength = 2;
   3214                     }
   3215                     source->writableBuffer.releaseBuffer(bufferLength);
   3216 
   3217                     // Indicate where to continue in main input string after exhausting the writableBuffer
   3218                     source->fcdPosition       = source->pos;
   3219 
   3220                     source->pos   = source->writableBuffer.getTerminatedBuffer();
   3221                     source->origFlags   = source->flags;
   3222                     source->flags       |= UCOL_ITER_INNORMBUF;
   3223                     source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   3224 
   3225                     return(UCOL_IGNORABLE);
   3226                 }
   3227             }
   3228         case SURROGATE_TAG:
   3229             /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
   3230             /* two things can happen here: next code point can be a trailing surrogate - we will use it */
   3231             /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
   3232             /* we treat it like an unassigned code point. */
   3233             {
   3234                 UChar trail;
   3235                 collIterateState state;
   3236                 backupState(source, &state);
   3237                 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
   3238                     // we chould have stepped one char forward and it might have turned that it
   3239                     // was not a trail surrogate. In that case, we have to backup.
   3240                     loadState(source, &state, TRUE);
   3241                     return UCOL_NOT_FOUND;
   3242                 } else {
   3243                     /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
   3244                     CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
   3245                     if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
   3246                         // We need to backup
   3247                         loadState(source, &state, TRUE);
   3248                         return CE;
   3249                     }
   3250                     // calculate the supplementary code point value, if surrogate was not tailored
   3251                     cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   3252                 }
   3253             }
   3254             break;
   3255         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   3256             UChar nextChar;
   3257             if( source->flags & UCOL_USE_ITERATOR) {
   3258                 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
   3259                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3260                     source->iterator->next(source->iterator);
   3261                     return getImplicit(cp, source);
   3262                 }
   3263             } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
   3264                       U_IS_TRAIL((nextChar=*source->pos))) {
   3265                 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
   3266                 source->pos++;
   3267                 return getImplicit(cp, source);
   3268             }
   3269             return UCOL_NOT_FOUND;
   3270         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   3271             return UCOL_NOT_FOUND; /* broken surrogate sequence */
   3272         case CHARSET_TAG:
   3273             /* not yet implemented */
   3274             /* probably after 1.8 */
   3275             return UCOL_NOT_FOUND;
   3276         default:
   3277             *status = U_INTERNAL_PROGRAM_ERROR;
   3278             CE=0;
   3279             break;
   3280     }
   3281     if (CE <= UCOL_NOT_FOUND) break;
   3282   }
   3283   return CE;
   3284 }
   3285 
   3286 
   3287 /* now uses Mark's getImplicitPrimary code */
   3288 static
   3289 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
   3290     uint32_t r = uprv_uca_getImplicitPrimary(cp);
   3291 
   3292     *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
   3293     collationSource->toReturn = collationSource->CEpos;
   3294 
   3295     // **** doesn't work if using iterator ****
   3296     if (collationSource->flags & UCOL_ITER_INNORMBUF) {
   3297         collationSource->offsetRepeatCount = 1;
   3298     } else {
   3299         int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
   3300 
   3301         UErrorCode errorCode = U_ZERO_ERROR;
   3302         collationSource->appendOffset(firstOffset, errorCode);
   3303         collationSource->appendOffset(firstOffset + 1, errorCode);
   3304 
   3305         collationSource->offsetReturn = collationSource->offsetStore - 1;
   3306         *(collationSource->offsetBuffer) = firstOffset;
   3307         if (collationSource->offsetReturn == collationSource->offsetBuffer) {
   3308             collationSource->offsetStore = collationSource->offsetBuffer;
   3309         }
   3310     }
   3311 
   3312     return ((r & 0x0000FFFF)<<16) | 0x000000C0;
   3313 }
   3314 
   3315 /**
   3316  * This function handles the special CEs like contractions, expansions,
   3317  * surrogates, Thai.
 * It is called by getPrevCE.
   3319  */
   3320 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
   3321                           collIterate *source,
   3322                           UErrorCode *status)
   3323 {
   3324     const uint32_t *CEOffset    = NULL;
   3325           UChar    *UCharOffset = NULL;
   3326           UChar    schar;
   3327     const UChar    *constart    = NULL;
   3328           uint32_t size;
   3329           UChar    buffer[UCOL_MAX_BUFFER];
   3330           uint32_t *endCEBuffer;
   3331           UChar   *strbuffer;
   3332           int32_t noChars = 0;
   3333           int32_t CECount = 0;
   3334 
   3335     for(;;)
   3336     {
   3337         /* the only ces that loops are thai and contractions */
   3338         switch (getCETag(CE))
   3339         {
   3340         case NOT_FOUND_TAG:  /* this tag always returns */
   3341             return CE;
   3342 
   3343         case SPEC_PROC_TAG:
   3344             {
   3345                 // Special processing is getting a CE that is preceded by a certain prefix
   3346                 // Currently this is only needed for optimizing Japanese length and iteration marks.
   3347                 // When we encouter a special processing tag, we go backwards and try to see if
   3348                 // we have a match.
   3349                 // Contraction tables are used - so the whole process is not unlike contraction.
   3350                 // prefix data is stored backwards in the table.
   3351                 const UChar *UCharOffset;
   3352                 UChar schar, tchar;
   3353                 collIterateState prefixState;
   3354                 backupState(source, &prefixState);
   3355                 for(;;) {
   3356                     // This loop will run once per source string character, for as long as we
   3357                     //  are matching a potential contraction sequence
   3358 
   3359                     // First we position ourselves at the begining of contraction sequence
   3360                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   3361 
   3362                     if (collIter_bos(source)) {
   3363                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
   3364                         break;
   3365                     }
   3366                     schar = getPrevNormalizedChar(source, status);
   3367                     goBackOne(source);
   3368 
   3369                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   3370                         UCharOffset++;
   3371                     }
   3372 
   3373                     if (schar == tchar) {
   3374                         // Found the source string char in the table.
   3375                         //  Pick up the corresponding CE from the table.
   3376                         CE = *(coll->contractionCEs +
   3377                             (UCharOffset - coll->contractionIndex));
   3378                     }
   3379                     else
   3380                     {
   3381                         // if there is a completely ignorable code point in the middle of
   3382                         // a prefix, we need to act as if it's not there
   3383                         // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
   3384                         // lone surrogates cannot be set to zero as it would break other processing
   3385                         uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   3386                         // it's easy for BMP code points
   3387                         if(isZeroCE == 0) {
   3388                             continue;
   3389                         } else if(U16_IS_SURROGATE(schar)) {
   3390                             // for supplementary code points, we have to check the next one
   3391                             // situations where we are going to ignore
   3392                             // 1. beginning of the string: schar is a lone surrogate
   3393                             // 2. schar is a lone surrogate
   3394                             // 3. schar is a trail surrogate in a valid surrogate sequence
   3395                             //    that is explicitly set to zero.
   3396                             if (!collIter_bos(source)) {
   3397                                 UChar lead;
   3398                                 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
   3399                                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
   3400                                     if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
   3401                                         uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
   3402                                         if(finalCE == 0) {
   3403                                             // this is a real, assigned completely ignorable code point
   3404                                             goBackOne(source);
   3405                                             continue;
   3406                                         }
   3407                                     }
   3408                                 } else {
   3409                                     // lone surrogate, treat like unassigned
   3410                                     return UCOL_NOT_FOUND;
   3411                                 }
   3412                             } else {
   3413                                 // lone surrogate at the beginning, treat like unassigned
   3414                                 return UCOL_NOT_FOUND;
   3415                             }
   3416                         }
   3417                         // Source string char was not in the table.
   3418                         //   We have not found the prefix.
   3419                         CE = *(coll->contractionCEs +
   3420                             (ContractionStart - coll->contractionIndex));
   3421                     }
   3422 
   3423                     if(!isPrefix(CE)) {
   3424                         // The source string char was in the contraction table, and the corresponding
   3425                         //   CE is not a prefix CE.  We found the prefix, break
   3426                         //   out of loop, this CE will end up being returned.  This is the normal
   3427                         //   way out of prefix handling when the source actually contained
   3428                         //   the prefix.
   3429                         break;
   3430                     }
   3431                 }
   3432                 loadState(source, &prefixState, TRUE);
   3433                 break;
   3434             }
   3435 
   3436         case CONTRACTION_TAG: {
   3437             /* to ensure that the backwards and forwards iteration matches, we
   3438             take the current region of most possible match and pass it through
   3439             the forward iteration. this will ensure that the obstinate problem of
   3440             overlapping contractions will not occur.
   3441             */
   3442             schar = peekCodeUnit(source, 0);
   3443             constart = (UChar *)coll->image + getContractOffset(CE);
   3444             if (isAtStartPrevIterate(source)
   3445                 /* commented away contraction end checks after adding the checks
   3446                 in getPrevCE  */) {
   3447                     /* start of string or this is not the end of any contraction */
   3448                     CE = *(coll->contractionCEs +
   3449                         (constart - coll->contractionIndex));
   3450                     break;
   3451             }
   3452             strbuffer = buffer;
   3453             UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
   3454             *(UCharOffset --) = 0;
   3455             noChars = 0;
   3456             // have to swap thai characters
   3457             while (ucol_unsafeCP(schar, coll)) {
   3458                 *(UCharOffset) = schar;
   3459                 noChars++;
   3460                 UCharOffset --;
   3461                 schar = getPrevNormalizedChar(source, status);
   3462                 goBackOne(source);
   3463                 // TODO: when we exhaust the contraction buffer,
   3464                 // it needs to get reallocated. The problem is
   3465                 // that the size depends on the string which is
   3466                 // not iterated over. However, since we're travelling
   3467                 // backwards, we already had to set the iterator at
   3468                 // the end - so we might as well know where we are?
   3469                 if (UCharOffset + 1 == buffer) {
   3470                     /* we have exhausted the buffer */
   3471                     int32_t newsize = 0;
   3472                     if(source->pos) { // actually dealing with a position
   3473                         newsize = (int32_t)(source->pos - source->string + 1);
   3474                     } else { // iterator
   3475                         newsize = 4 * UCOL_MAX_BUFFER;
   3476                     }
   3477                     strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
   3478                         (newsize + UCOL_MAX_BUFFER));
   3479                     /* test for NULL */
   3480                     if (strbuffer == NULL) {
   3481                         *status = U_MEMORY_ALLOCATION_ERROR;
   3482                         return UCOL_NO_MORE_CES;
   3483                     }
   3484                     UCharOffset = strbuffer + newsize;
   3485                     uprv_memcpy(UCharOffset, buffer,
   3486                         UCOL_MAX_BUFFER * sizeof(UChar));
   3487                     UCharOffset --;
   3488                 }
   3489                 if ((source->pos && (source->pos == source->string ||
   3490                     ((source->flags & UCOL_ITER_INNORMBUF) &&
   3491                     *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
   3492                     || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
   3493                         break;
   3494                 }
   3495             }
   3496             /* adds the initial base character to the string */
   3497             *(UCharOffset) = schar;
   3498             noChars++;
   3499 
   3500             int32_t offsetBias;
   3501 
   3502             // **** doesn't work if using iterator ****
   3503             if (source->flags & UCOL_ITER_INNORMBUF) {
   3504                 offsetBias = -1;
   3505             } else {
   3506                 offsetBias = (int32_t)(source->pos - source->string);
   3507             }
   3508 
   3509             /* a new collIterate is used to simplify things, since using the current
   3510             collIterate will mean that the forward and backwards iteration will
   3511             share and change the same buffers. we don't want to get into that. */
   3512             collIterate temp;
   3513             int32_t rawOffset;
   3514 
   3515             IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
   3516             if(U_FAILURE(*status)) {
   3517                 return (uint32_t)UCOL_NULLORDER;
   3518             }
   3519             temp.flags &= ~UCOL_ITER_NORM;
   3520             temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
   3521 
   3522             rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
   3523             CE = ucol_IGetNextCE(coll, &temp, status);
   3524 
   3525             if (source->extendCEs) {
   3526                 endCEBuffer = source->extendCEs + source->extendCEsSize;
   3527                 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
   3528             } else {
   3529                 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
   3530                 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
   3531             }
   3532 
   3533             while (CE != UCOL_NO_MORE_CES) {
   3534                 *(source->CEpos ++) = CE;
   3535 
   3536                 if (offsetBias >= 0) {
   3537                     source->appendOffset(rawOffset + offsetBias, *status);
   3538                 }
   3539 
   3540                 CECount++;
   3541                 if (source->CEpos == endCEBuffer) {
   3542                     /* ran out of CE space, reallocate to new buffer.
   3543                     If reallocation fails, reset pointers and bail out,
   3544                     there's no guarantee of the right character position after
   3545                     this bail*/
   3546                     if (!increaseCEsCapacity(source)) {
   3547                         *status = U_MEMORY_ALLOCATION_ERROR;
   3548                         break;
   3549                     }
   3550 
   3551                     endCEBuffer = source->extendCEs + source->extendCEsSize;
   3552                 }
   3553 
   3554                 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
   3555                     rawOffset = (int32_t)(temp.fcdPosition - temp.string);
   3556                 } else {
   3557                     rawOffset = (int32_t)(temp.pos - temp.string);
   3558                 }
   3559 
   3560                 CE = ucol_IGetNextCE(coll, &temp, status);
   3561             }
   3562 
   3563             if (strbuffer != buffer) {
   3564                 uprv_free(strbuffer);
   3565             }
   3566             if (U_FAILURE(*status)) {
   3567                 return (uint32_t)UCOL_NULLORDER;
   3568             }
   3569 
   3570             if (source->offsetRepeatValue != 0) {
   3571                 if (CECount > noChars) {
   3572                     source->offsetRepeatCount += temp.offsetRepeatCount;
   3573                 } else {
   3574                     // **** does this really skip the right offsets? ****
   3575                     source->offsetReturn -= (noChars - CECount);
   3576                 }
   3577             }
   3578 
   3579             if (offsetBias >= 0) {
   3580                 source->offsetReturn = source->offsetStore - 1;
   3581                 if (source->offsetReturn == source->offsetBuffer) {
   3582                     source->offsetStore = source->offsetBuffer;
   3583                 }
   3584             }
   3585 
   3586             source->toReturn = source->CEpos - 1;
   3587             if (source->toReturn == source->CEs) {
   3588                 source->CEpos = source->CEs;
   3589             }
   3590 
   3591             return *(source->toReturn);
   3592         }
   3593         case LONG_PRIMARY_TAG:
   3594             {
   3595                 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
   3596                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
   3597                 source->toReturn = source->CEpos - 1;
   3598 
   3599                 if (source->flags & UCOL_ITER_INNORMBUF) {
   3600                     source->offsetRepeatCount = 1;
   3601                 } else {
   3602                     int32_t firstOffset = (int32_t)(source->pos - source->string);
   3603 
   3604                     source->appendOffset(firstOffset, *status);
   3605                     source->appendOffset(firstOffset + 1, *status);
   3606 
   3607                     source->offsetReturn = source->offsetStore - 1;
   3608                     *(source->offsetBuffer) = firstOffset;
   3609                     if (source->offsetReturn == source->offsetBuffer) {
   3610                         source->offsetStore = source->offsetBuffer;
   3611                     }
   3612                 }
   3613 
   3614 
   3615                 return *(source->toReturn);
   3616             }
   3617 
   3618         case EXPANSION_TAG: /* this tag always returns */
   3619             {
   3620             /*
   3621             This should handle expansion.
   3622             NOTE: we can encounter both continuations and expansions in an expansion!
   3623             I have to decide where continuations are going to be dealt with
   3624             */
   3625             int32_t firstOffset = (int32_t)(source->pos - source->string);
   3626 
   3627             // **** doesn't work if using iterator ****
   3628             if (source->offsetReturn != NULL) {
   3629                 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
   3630                     source->offsetStore = source->offsetBuffer;
   3631                 }else {
   3632                   firstOffset = -1;
   3633                 }
   3634             }
   3635 
   3636             /* find the offset to expansion table */
   3637             CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3638             size     = getExpansionCount(CE);
   3639             if (size != 0) {
   3640                 /*
   3641                 if there are less than 16 elements in expansion, we don't terminate
   3642                 */
   3643                 uint32_t count;
   3644 
   3645                 for (count = 0; count < size; count++) {
   3646                     *(source->CEpos ++) = *CEOffset++;
   3647 
   3648                     if (firstOffset >= 0) {
   3649                         source->appendOffset(firstOffset + 1, *status);
   3650                     }
   3651                 }
   3652             } else {
   3653                 /* else, we do */
   3654                 while (*CEOffset != 0) {
   3655                     *(source->CEpos ++) = *CEOffset ++;
   3656 
   3657                     if (firstOffset >= 0) {
   3658                         source->appendOffset(firstOffset + 1, *status);
   3659                     }
   3660                 }
   3661             }
   3662 
   3663             if (firstOffset >= 0) {
   3664                 source->offsetReturn = source->offsetStore - 1;
   3665                 *(source->offsetBuffer) = firstOffset;
   3666                 if (source->offsetReturn == source->offsetBuffer) {
   3667                     source->offsetStore = source->offsetBuffer;
   3668                 }
   3669             } else {
   3670                 source->offsetRepeatCount += size - 1;
   3671             }
   3672 
   3673             source->toReturn = source->CEpos - 1;
   3674             // in case of one element expansion, we
   3675             // want to immediately return CEpos
   3676             if(source->toReturn == source->CEs) {
   3677                 source->CEpos = source->CEs;
   3678             }
   3679 
   3680             return *(source->toReturn);
   3681             }
   3682 
   3683         case DIGIT_TAG:
   3684             {
   3685                 /*
   3686                 We do a check to see if we want to collate digits as numbers; if so we generate
   3687                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
   3688                 */
   3689                 uint32_t i;    /* general counter */
   3690 
   3691                 if (source->coll->numericCollation == UCOL_ON){
   3692                     uint32_t digIndx = 0;
   3693                     uint32_t endIndex = 0;
   3694                     uint32_t leadingZeroIndex = 0;
   3695                     uint32_t trailingZeroCount = 0;
   3696 
   3697                     uint8_t collateVal = 0;
   3698 
   3699                     UBool nonZeroValReached = FALSE;
   3700 
   3701                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
   3702                     /*
   3703                     We parse the source string until we hit a char that's NOT a digit.
   3704                     Use this u_charDigitValue. This might be slow because we have to
   3705                     handle surrogates...
   3706                     */
   3707                     /*
   3708                     We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
   3709                     with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
   3710                     element we process when going backward. To determine how long that chunk might be, we may need to make
   3711                     two passes through the loop that collects digits - one to see how long the string is (and how much is
   3712                     leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
   3713                     more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
   3714                     element chunk after resetting the state to the initialState at the right side of the digit string.
   3715                     */
   3716                     uint32_t ceLimit = 0;
   3717                     UChar initial_ch = ch;
   3718                     collIterateState initialState = {0,0,0,0,0,0,0,0,0};
   3719                     backupState(source, &initialState);
   3720 
   3721                     for(;;) {
   3722                         collIterateState state = {0,0,0,0,0,0,0,0,0};
   3723                         UChar32 char32 = 0;
   3724                         int32_t digVal = 0;
   3725 
   3726                         if (U16_IS_TRAIL (ch)) {
   3727                             if (!collIter_bos(source)){
   3728                                 UChar lead = getPrevNormalizedChar(source, status);
   3729                                 if(U16_IS_LEAD(lead)) {
   3730                                     char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3731                                     goBackOne(source);
   3732                                 } else {
   3733                                     char32 = ch;
   3734                                 }
   3735                             } else {
   3736                                 char32 = ch;
   3737                             }
   3738                         } else {
   3739                             char32 = ch;
   3740                         }
   3741                         digVal = u_charDigitValue(char32);
   3742 
   3743                         for(;;) {
   3744                             // Make sure we have enough space. No longer needed;
   3745                             // at this point the largest value of digIndx when we need to save data in numTempBuf
   3746                             // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
   3747                             // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
   3748 
   3749                             // Skip over trailing zeroes, and keep a count of them.
   3750                             if (digVal != 0)
   3751                                 nonZeroValReached = TRUE;
   3752 
   3753                             if (nonZeroValReached) {
   3754                                 /*
   3755                                 We parse the digit string into base 100 numbers (this fits into a byte).
   3756                                 We only add to the buffer in twos, thus if we are parsing an odd character,
   3757                                 that serves as the 'tens' digit, while if we are parsing an even one, that
   3758                                 is the 'ones' digit. We dump the parsed base 100 value (collateVal) into
   3759                                 a buffer. We multiply each collateVal by 2 (to give us room) and add 6 (to avoid
   3760                                 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
   3761                                 than all the other bytes.
   3762 
   3763                                 Since we're doing this in reverse we want to put the first digit encountered into the
   3764                                 ones place and the second digit encountered into the tens place.
   3765                                 */
   3766 
   3767                                 if ((digIndx + trailingZeroCount) % 2 == 1) {
   3768                                     // High-order digit case (tens place)
   3769                                     collateVal += (uint8_t)(digVal * 10);
   3770 
   3771                                     // We cannot set leadingZeroIndex unless it has been set for the
   3772                                     // low-order digit. Therefore, all we can do for the high-order
   3773                                     // digit is turn it off, never on.
   3774                                     // The only time we will have a high digit without a low is for
   3775                                     // the very first non-zero digit, so no zero check is necessary.
   3776                                     if (collateVal != 0)
   3777                                         leadingZeroIndex = 0;
   3778 
   3779                                     // The first pass through, digIndx may exceed the limit, but in that case
   3780                                     // we no longer care about numTempBuf contents since they will be discarded
   3781                                     if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
   3782                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
   3783                                     }
   3784                                     collateVal = 0;
   3785                                 } else {
   3786                                     // Low-order digit case (ones place)
   3787                                     collateVal = (uint8_t)digVal;
   3788 
   3789                                     // Check for leading zeroes.
   3790                                     if (collateVal == 0) {
   3791                                         if (!leadingZeroIndex)
   3792                                             leadingZeroIndex = (digIndx/2) + 2;
   3793                                     } else
   3794                                         leadingZeroIndex = 0;
   3795 
   3796                                     // No need to write to buffer; the case of a last odd digit
   3797                                     // is handled below.
   3798                                 }
   3799                                 ++digIndx;
   3800                             } else
   3801                                 ++trailingZeroCount;
   3802 
   3803                             if (!collIter_bos(source)) {
   3804                                 ch = getPrevNormalizedChar(source, status);
   3805                                 //goBackOne(source);
   3806                                 if (U16_IS_TRAIL(ch)) {
   3807                                     backupState(source, &state);
   3808                                     if (!collIter_bos(source)) {
   3809                                         goBackOne(source);
   3810                                         UChar lead = getPrevNormalizedChar(source, status);
   3811 
   3812                                         if(U16_IS_LEAD(lead)) {
   3813                                             char32 = U16_GET_SUPPLEMENTARY(lead,ch);
   3814                                         } else {
   3815                                             loadState(source, &state, FALSE);
   3816                                             char32 = ch;
   3817                                         }
   3818                                     }
   3819                                 } else
   3820                                     char32 = ch;
   3821 
   3822                                 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
   3823                                     if (char32 > 0xFFFF) {// For surrogates.
   3824                                         loadState(source, &state, FALSE);
   3825                                     }
   3826                                     // Don't need to "reverse" the goBackOne call,
   3827                                     // as this points to the next position to process..
   3828                                     //if (char32 > 0xFFFF) // For surrogates.
   3829                                     //getNextNormalizedChar(source);
   3830                                     break;
   3831                                 }
   3832 
   3833                                 goBackOne(source);
   3834                             }else
   3835                                 break;
   3836                         }
   3837 
   3838                         if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
   3839                             // our collation element is not too big, go ahead and finish with it
   3840                             break;
   3841                         }
   3842                         // our digit string is too long for a collation element;
   3843                         // set the limit for it, reset the state and begin again
   3844                         ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
   3845                         if ( ceLimit == 0 ) {
   3846                             ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
   3847                         }
   3848                         ch = initial_ch;
   3849                         loadState(source, &initialState, FALSE);
   3850                         digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
   3851                         collateVal = 0;
   3852                         nonZeroValReached = FALSE;
   3853                     }
   3854 
   3855                     if (! nonZeroValReached) {
   3856                         digIndx = 2;
   3857                         trailingZeroCount = 0;
   3858                         numTempBuf[2] = 6;
   3859                     }
   3860 
   3861                     if ((digIndx + trailingZeroCount) % 2 != 0) {
   3862                         numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
   3863                         digIndx += 1;       // The implicit leading zero
   3864                     }
   3865                     if (trailingZeroCount % 2 != 0) {
   3866                         // We had to consume one trailing zero for the low digit
   3867                         // of the least significant byte
   3868                         digIndx += 1;       // The trailing zero not in the exponent
   3869                         trailingZeroCount -= 1;
   3870                     }
   3871 
   3872                     endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
   3873 
   3874                     // Subtract one off of the last byte. Really the first byte here, but it's reversed...
   3875                     numTempBuf[2] -= 1;
   3876 
   3877                     /*
   3878                     We want to skip over the first two slots in the buffer. The first slot
   3879                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
   3880                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
   3881                     The exponent must be adjusted by the number of leading zeroes, and the number of
   3882                     trailing zeroes.
   3883                     */
   3884                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
   3885                     uint32_t exponent = (digIndx+trailingZeroCount)/2;
   3886                     if (leadingZeroIndex)
   3887                         exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
   3888                     numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
   3889 
   3890                     // Now transfer the collation key to our collIterate struct.
   3891                     // The total size for our collation key is half of endIndex, rounded up.
   3892                     int32_t size = (endIndex+1)/2;
   3893                     if(!ensureCEsCapacity(source, size)) {
   3894                         return (uint32_t)UCOL_NULLORDER;
   3895                     }
   3896                     *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
   3897                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
   3898                         UCOL_BYTE_COMMON; // Tertiary weight.
   3899                     i = endIndex - 1; // Reset the index into the buffer.
   3900                     while(i >= 2) {
   3901                         uint32_t primWeight = numTempBuf[i--] << 8;
   3902                         if ( i >= 2)
   3903                             primWeight |= numTempBuf[i--];
   3904                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
   3905                     }
   3906 
   3907                     source->toReturn = source->CEpos -1;
   3908                     return *(source->toReturn);
   3909                 } else {
   3910                     CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
   3911                     CE = *(CEOffset++);
   3912                     break;
   3913                 }
   3914             }
   3915 
   3916         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
   3917             {
   3918                 static const uint32_t
   3919                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
   3920                 //const uint32_t LCount = 19;
   3921                 static const uint32_t VCount = 21;
   3922                 static const uint32_t TCount = 28;
   3923                 //const uint32_t NCount = VCount * TCount;   /* 588 */
   3924                 //const uint32_t SCount = LCount * NCount;   /* 11172 */
   3925 
   3926                 uint32_t L = ch - SBase;
   3927                 /*
   3928                 divide into pieces.
   3929                 we do it in this order since some compilers can do % and / in one
   3930                 operation
   3931                 */
   3932                 uint32_t T = L % TCount;
   3933                 L /= TCount;
   3934                 uint32_t V = L % VCount;
   3935                 L /= VCount;
   3936 
   3937                 /* offset them */
   3938                 L += LBase;
   3939                 V += VBase;
   3940                 T += TBase;
   3941 
   3942                 int32_t firstOffset = (int32_t)(source->pos - source->string);
   3943                 source->appendOffset(firstOffset, *status);
   3944 
   3945                 /*
   3946                  * return the first CE, but first put the rest into the expansion buffer
   3947                  */
   3948                 if (!source->coll->image->jamoSpecial) {
   3949                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
   3950                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
   3951                     source->appendOffset(firstOffset + 1, *status);
   3952 
   3953                     if (T != TBase) {
   3954                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
   3955                         source->appendOffset(firstOffset + 1, *status);
   3956                     }
   3957 
   3958                     source->toReturn = source->CEpos - 1;
   3959 
   3960                     source->offsetReturn = source->offsetStore - 1;
   3961                     if (source->offsetReturn == source->offsetBuffer) {
   3962                         source->offsetStore = source->offsetBuffer;
   3963                     }
   3964 
   3965                     return *(source->toReturn);
   3966                 } else {
   3967                     // Since Hanguls pass the FCD check, it is
   3968                     // guaranteed that we won't be in
   3969                     // the normalization buffer if something like this happens
   3970 
   3971                     // Move Jamos into normalization buffer
   3972                     UChar *tempbuffer = source->writableBuffer.getBuffer(5);
   3973                     int32_t tempbufferLength, jamoOffset;
   3974                     tempbuffer[0] = 0;
   3975                     tempbuffer[1] = (UChar)L;
   3976                     tempbuffer[2] = (UChar)V;
   3977                     if (T != TBase) {
   3978                         tempbuffer[3] = (UChar)T;
   3979                         tempbufferLength = 4;
   3980                     } else {
   3981                         tempbufferLength = 3;
   3982                     }
   3983                     source->writableBuffer.releaseBuffer(tempbufferLength);
   3984 
   3985                     // Indicate where to continue in main input string after exhausting the writableBuffer
   3986                     if (source->pos  == source->string) {
   3987                         jamoOffset = 0;
   3988                         source->fcdPosition = NULL;
   3989                     } else {
   3990                         jamoOffset = source->pos - source->string;
   3991                         source->fcdPosition       = source->pos-1;
   3992                     }
   3993 
   3994                     // Append offsets for the additional chars
   3995                     // (not the 0, and not the L whose offsets match the original Hangul)
   3996                     int32_t jamoRemaining = tempbufferLength - 2;
   3997                     jamoOffset++; // appended offsets should match end of original Hangul
   3998                     while (jamoRemaining-- > 0) {
   3999                         source->appendOffset(jamoOffset, *status);
   4000                     }
   4001 
   4002                     source->offsetRepeatValue = jamoOffset;
   4003 
   4004                     source->offsetReturn = source->offsetStore - 1;
   4005                     if (source->offsetReturn == source->offsetBuffer) {
   4006                         source->offsetStore = source->offsetBuffer;
   4007                     }
   4008 
   4009                     source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
   4010                     source->origFlags         = source->flags;
   4011                     source->flags            |= UCOL_ITER_INNORMBUF;
   4012                     source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
   4013 
   4014                     return(UCOL_IGNORABLE);
   4015                 }
   4016             }
   4017 
   4018         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
   4019             return getPrevImplicit(ch, source);
   4020 
   4021             // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
   4022         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
   4023             return getPrevImplicit(ch, source);
   4024 
   4025         case SURROGATE_TAG:  /* This is a surrogate pair */
   4026             /* essentially an engaged lead surrogate. */
   4027             /* if you have encountered it here, it means that a */
   4028             /* broken sequence was encountered and this is an error */
   4029             return UCOL_NOT_FOUND;
   4030 
   4031         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
   4032             return UCOL_NOT_FOUND; /* broken surrogate sequence */
   4033 
   4034         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
   4035             {
   4036                 UChar32 cp = 0;
   4037                 UChar  prevChar;
   4038                 const UChar *prev;
   4039                 if (isAtStartPrevIterate(source)) {
   4040                     /* we are at the start of the string, wrong place to be at */
   4041                     return UCOL_NOT_FOUND;
   4042                 }
   4043                 if (source->pos != source->writableBuffer.getBuffer()) {
   4044                     prev     = source->pos - 1;
   4045                 } else {
   4046                     prev     = source->fcdPosition;
   4047                 }
   4048                 prevChar = *prev;
   4049 
   4050                 /* Handles Han and Supplementary characters here.*/
   4051                 if (U16_IS_LEAD(prevChar)) {
   4052                     cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
   4053                     source->pos = prev;
   4054                 } else {
   4055                     return UCOL_NOT_FOUND; /* like unassigned */
   4056                 }
   4057 
   4058                 return getPrevImplicit(cp, source);
   4059             }
   4060 
   4061             /* UCA is filled with these. Tailorings are NOT_FOUND */
   4062             /* not yet implemented */
   4063         case CHARSET_TAG:  /* this tag always returns */
   4064             /* probably after 1.8 */
   4065             return UCOL_NOT_FOUND;
   4066 
   4067         default:           /* this tag always returns */
   4068             *status = U_INTERNAL_PROGRAM_ERROR;
   4069             CE=0;
   4070             break;
   4071         }
   4072 
   4073         if (CE <= UCOL_NOT_FOUND) {
   4074             break;
   4075         }
   4076     }
   4077 
   4078     return CE;
   4079 }
   4080 
   4081 /* This should really be a macro                                                                      */
   4082 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
   4083 /* secondaries in French                                                                              */
   4084 /*
   4085 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
   4086   uint8_t temp;
   4087   while(start<end) {
   4088     temp = *start;
   4089     *start++ = *end;
   4090     *end-- = temp;
   4091   }
   4092 }
   4093 */
   4094 
   /*
    * Reverses, in place, the run of TYPE elements delimited by the pointers
    * `start` and `end` (both ends inclusive) by swapping values pairwise from
    * the outside towards the middle.
    * Both `start` and `end` are modified by the macro, so they must be
    * modifiable lvalues; the loop stops as soon as `start` is no longer
    * below `end`.
    */
   #define uprv_ucol_reverse_buffer(TYPE, start, end) { \
       TYPE tempA; \
       for(; (start)<(end); ++(start), --(end)) { \
           tempA = *(start); \
           *(start) = *(end); \
           *(end) = tempA; \
       } \
   }
   4103 
   4104 /****************************************************************************/
   4105 /* Following are the sortkey generation functions                           */
   4106 /*                                                                          */
   4107 /****************************************************************************/
   4108 
   4109 U_CAPI int32_t U_EXPORT2
   4110 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
   4111                    const uint8_t *src2, int32_t src2Length,
   4112                    uint8_t *dest, int32_t destCapacity) {
   4113     /* check arguments */
   4114     if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
   4115         src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
   4116         destCapacity<0 || (destCapacity>0 && dest==NULL)
   4117     ) {
   4118         /* error, attempt to write a zero byte and return 0 */
   4119         if(dest!=NULL && destCapacity>0) {
   4120             *dest=0;
   4121         }
   4122         return 0;
   4123     }
   4124 
   4125     /* check lengths and capacity */
   4126     if(src1Length<0) {
   4127         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
   4128     }
   4129     if(src2Length<0) {
   4130         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
   4131     }
   4132 
   4133     int32_t destLength=src1Length+src2Length;
   4134     if(destLength>destCapacity) {
   4135         /* the merged sort key does not fit into the destination */
   4136         return destLength;
   4137     }
   4138 
   4139     /* merge the sort keys with the same number of levels */
   4140     uint8_t *p=dest;
   4141     for(;;) {
   4142         /* copy level from src1 not including 00 or 01 */
   4143         uint8_t b;
   4144         while((b=*src1)>=2) {
   4145             ++src1;
   4146             *p++=b;
   4147         }
   4148 
   4149         /* add a 02 merge separator */
   4150         *p++=2;
   4151 
   4152         /* copy level from src2 not including 00 or 01 */
   4153         while((b=*src2)>=2) {
   4154             ++src2;
   4155             *p++=b;
   4156         }
   4157 
   4158         /* if both sort keys have another level, then add a 01 level separator and continue */
   4159         if(*src1==1 && *src2==1) {
   4160             ++src1;
   4161             ++src2;
   4162             *p++=1;
   4163         } else {
   4164             break;
   4165         }
   4166     }
   4167 
   4168     /*
   4169      * here, at least one sort key is finished now, but the other one
   4170      * might have some contents left from containing more levels;
   4171      * that contents is just appended to the result
   4172      */
   4173     if(*src1!=0) {
   4174         /* src1 is not finished, therefore *src2==0, and src1 is appended */
   4175         src2=src1;
   4176     }
   4177     /* append src2, "the other, unfinished sort key" */
   4178     while((*p++=*src2++)!=0) {}
   4179 
   4180     /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */
   4181     return (int32_t)(p-dest);
   4182 }
   4183 
   4184 U_NAMESPACE_BEGIN
   4185 
   4186 class SortKeyByteSink : public ByteSink {
   4187 public:
   4188     SortKeyByteSink(char *dest, int32_t destCapacity)
   4189             : buffer_(dest), capacity_(destCapacity),
   4190               appended_(0) {
   4191         if (buffer_ == NULL) {
   4192             capacity_ = 0;
   4193         } else if(capacity_ < 0) {
   4194             buffer_ = NULL;
   4195             capacity_ = 0;
   4196         }
   4197     }
   4198     virtual ~SortKeyByteSink();
   4199 
   4200     virtual void Append(const char *bytes, int32_t n);
   4201     void Append(uint32_t b) {
   4202         if (appended_ < capacity_ || Resize(1, appended_)) {
   4203             buffer_[appended_] = (char)b;
   4204         }
   4205         ++appended_;
   4206     }
   4207     void Append(uint32_t b1, uint32_t b2) {
   4208         int32_t a2 = appended_ + 2;
   4209         if (a2 <= capacity_ || Resize(2, appended_)) {
   4210             buffer_[appended_] = (char)b1;
   4211             buffer_[appended_ + 1] = (char)b2;
   4212         } else if(appended_ < capacity_) {
   4213             buffer_[appended_] = (char)b1;
   4214         }
   4215         appended_ = a2;
   4216     }
   4217     virtual char *GetAppendBuffer(int32_t min_capacity,
   4218                                   int32_t desired_capacity_hint,
   4219                                   char *scratch, int32_t scratch_capacity,
   4220                                   int32_t *result_capacity);
   4221     int32_t NumberOfBytesAppended() const { return appended_; }
   4222     /** @return FALSE if memory allocation failed */
   4223     UBool IsOk() const { return buffer_ != NULL; }
   4224 
   4225 protected:
   4226     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0;
   4227     virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;
   4228 
   4229     void SetNotOk() {
   4230         buffer_ = NULL;
   4231         capacity_ = 0;
   4232     }
   4233 
   4234     char *buffer_;
   4235     int32_t capacity_;
   4236     int32_t appended_;
   4237 
   4238 private:
   4239     SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
   4240     SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
   4241 };
   4242 
   4243 SortKeyByteSink::~SortKeyByteSink() {}
   4244 
   4245 void
   4246 SortKeyByteSink::Append(const char *bytes, int32_t n) {
   4247     if (n <= 0 || bytes == NULL) {
   4248         return;
   4249     }
   4250     int32_t length = appended_;
   4251     appended_ += n;
   4252     if ((buffer_ + length) == bytes) {
   4253         return;  // the caller used GetAppendBuffer() and wrote the bytes already
   4254     }
   4255     int32_t available = capacity_ - length;
   4256     if (n <= available) {
   4257         uprv_memcpy(buffer_ + length, bytes, n);
   4258     } else {
   4259         AppendBeyondCapacity(bytes, n, length);
   4260     }
   4261 }
   4262 
   4263 char *
   4264 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
   4265                                  int32_t desired_capacity_hint,
   4266                                  char *scratch,
   4267                                  int32_t scratch_capacity,
   4268                                  int32_t *result_capacity) {
   4269     if (min_capacity < 1 || scratch_capacity < min_capacity) {
   4270         *result_capacity = 0;
   4271         return NULL;
   4272     }
   4273     int32_t available = capacity_ - appended_;
   4274     if (available >= min_capacity) {
   4275         *result_capacity = available;
   4276         return buffer_ + appended_;
   4277     } else if (Resize(desired_capacity_hint, appended_)) {
   4278         *result_capacity = capacity_ - appended_;
   4279         return buffer_ + appended_;
   4280     } else {
   4281         *result_capacity = scratch_capacity;
   4282         return scratch;
   4283     }
   4284 }
   4285 
   4286 class FixedSortKeyByteSink : public SortKeyByteSink {
   4287 public:
   4288     FixedSortKeyByteSink(char *dest, int32_t destCapacity)
   4289             : SortKeyByteSink(dest, destCapacity) {}
   4290     virtual ~FixedSortKeyByteSink();
   4291 
   4292 private:
   4293     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
   4294     virtual UBool Resize(int32_t appendCapacity, int32_t length);
   4295 };
   4296 
   4297 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
   4298 
   4299 void
   4300 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
   4301     // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
   4302     // Fill the buffer completely.
   4303     int32_t available = capacity_ - length;
   4304     if (available > 0) {
   4305         uprv_memcpy(buffer_ + length, bytes, available);
   4306     }
   4307 }
   4308 
   4309 UBool
   4310 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
   4311     return FALSE;
   4312 }
   4313 
   4314 class CollationKeyByteSink : public SortKeyByteSink {
   4315 public:
   4316     CollationKeyByteSink(CollationKey &key)
   4317             : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
   4318               key_(key) {}
   4319     virtual ~CollationKeyByteSink();
   4320 
   4321 private:
   4322     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
   4323     virtual UBool Resize(int32_t appendCapacity, int32_t length);
   4324 
   4325     CollationKey &key_;
   4326 };
   4327 
   4328 CollationKeyByteSink::~CollationKeyByteSink() {}
   4329 
   4330 void
   4331 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
   4332     // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
   4333     if (Resize(n, length)) {
   4334         uprv_memcpy(buffer_ + length, bytes, n);
   4335     }
   4336 }
   4337 
   4338 UBool
   4339 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
   4340     if (buffer_ == NULL) {
   4341         return FALSE;  // allocation failed before already
   4342     }
   4343     int32_t newCapacity = 2 * capacity_;
   4344     int32_t altCapacity = length + 2 * appendCapacity;
   4345     if (newCapacity < altCapacity) {
   4346         newCapacity = altCapacity;
   4347     }
   4348     if (newCapacity < 200) {
   4349         newCapacity = 200;
   4350     }
   4351     uint8_t *newBuffer = key_.reallocate(newCapacity, length);
   4352     if (newBuffer == NULL) {
   4353         SetNotOk();
   4354         return FALSE;
   4355     }
   4356     buffer_ = reinterpret_cast<char *>(newBuffer);
   4357     capacity_ = newCapacity;
   4358     return TRUE;
   4359 }
   4360 
   4361 /**
   4362  * uint8_t byte buffer, similar to CharString but simpler.
   4363  */
   4364 class SortKeyLevel : public UMemory {
   4365 public:
   4366     SortKeyLevel() : len(0), ok(TRUE) {}
   4367     ~SortKeyLevel() {}
   4368 
   4369     /** @return FALSE if memory allocation failed */
   4370     UBool isOk() const { return ok; }
   4371     UBool isEmpty() const { return len == 0; }
   4372     int32_t length() const { return len; }
   4373     const uint8_t *data() const { return buffer.getAlias(); }
   4374     uint8_t operator[](int32_t index) const { return buffer[index]; }
   4375 
   4376     void appendByte(uint32_t b);
   4377 
   4378     void appendTo(ByteSink &sink) const {
   4379         sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len);
   4380     }
   4381 
   4382     uint8_t &lastByte() {
   4383         U_ASSERT(len > 0);
   4384         return buffer[len - 1];
   4385     }
   4386 
   4387     uint8_t *getLastFewBytes(int32_t n) {
   4388         if (ok && len >= n) {
   4389             return buffer.getAlias() + len - n;
   4390         } else {
   4391             return NULL;
   4392         }
   4393     }
   4394 
   4395 private:
   4396     MaybeStackArray<uint8_t, 40> buffer;
   4397     int32_t len;
   4398     UBool ok;
   4399 
   4400     UBool ensureCapacity(int32_t appendCapacity);
   4401 
   4402     SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class
   4403     SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class
   4404 };
   4405 
   4406 void SortKeyLevel::appendByte(uint32_t b) {
   4407     if(len < buffer.getCapacity() || ensureCapacity(1)) {
   4408         buffer[len++] = (uint8_t)b;
   4409     }
   4410 }
   4411 
   4412 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) {
   4413     if(!ok) {
   4414         return FALSE;
   4415     }
   4416     int32_t newCapacity = 2 * buffer.getCapacity();
   4417     int32_t altCapacity = len + 2 * appendCapacity;
   4418     if (newCapacity < altCapacity) {
   4419         newCapacity = altCapacity;
   4420     }
   4421     if (newCapacity < 200) {
   4422         newCapacity = 200;
   4423     }
   4424     if(buffer.resize(newCapacity, len)==NULL) {
   4425         return ok = FALSE;
   4426     }
   4427     return TRUE;
   4428 }
   4429 
   4430 U_NAMESPACE_END
   4431 
   4432 /* sortkey API */
   4433 U_CAPI int32_t U_EXPORT2
   4434 ucol_getSortKey(const    UCollator    *coll,
   4435         const    UChar        *source,
   4436         int32_t        sourceLength,
   4437         uint8_t        *result,
   4438         int32_t        resultLength)
   4439 {
   4440     UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
   4441     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   4442         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
   4443             ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
   4444     }
   4445 
   4446     if(coll->delegate != NULL) {
   4447       return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength);
   4448     }
   4449 
   4450     UErrorCode status = U_ZERO_ERROR;
   4451     int32_t keySize   = 0;
   4452 
   4453     if(source != NULL) {
   4454         // source == NULL is actually an error situation, but we would need to
   4455         // have an error code to return it. Until we introduce a new
   4456         // API, it stays like this
   4457 
   4458         /* this uses the function pointer that is set in updateinternalstate */
   4459         /* currently, there are two funcs: */
   4460         /*ucol_calcSortKey(...);*/
   4461         /*ucol_calcSortKeySimpleTertiary(...);*/
   4462 
   4463         uint8_t noDest[1] = { 0 };
   4464         if(result == NULL) {
   4465             // Distinguish pure preflighting from an allocation error.
   4466             result = noDest;
   4467             resultLength = 0;
   4468         }
   4469         FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength);
   4470         coll->sortKeyGen(coll, source, sourceLength, sink, &status);
   4471         if(U_SUCCESS(status)) {
   4472             keySize = sink.NumberOfBytesAppended();
   4473         }
   4474     }
   4475     UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
   4476     UTRACE_EXIT_STATUS(status);
   4477     return keySize;
   4478 }
   4479 
   4480 U_CFUNC int32_t
   4481 ucol_getCollationKey(const UCollator *coll,
   4482                      const UChar *source, int32_t sourceLength,
   4483                      CollationKey &key,
   4484                      UErrorCode &errorCode) {
   4485     CollationKeyByteSink sink(key);
   4486     coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode);
   4487     return sink.NumberOfBytesAppended();
   4488 }
   4489 
   4490 // Is this primary weight compressible?
   4491 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
   4492 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
   4493 static inline UBool
   4494 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
   4495     return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
   4496 }
   4497 
   4498 static
   4499 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) {
   4500     if (caseShift  == 0) {
   4501         cases.appendByte(UCOL_CASE_BYTE_START);
   4502         caseShift = UCOL_CASE_SHIFT_START;
   4503     }
   4504 }
   4505 
   4506 // Packs the secondary buffer when processing French locale.
   4507 static void
   4508 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
   4509     secondaries += secsize;  // We read the secondary-level bytes back to front.
   4510     uint8_t secondary;
   4511     int32_t count2 = 0;
   4512     int32_t i = 0;
   4513     // we use i here since the key size already accounts for terminators, so we'll discard the increment
   4514     for(i = 0; i<secsize; i++) {
   4515         secondary = *(secondaries-i-1);
   4516         /* This is compression code. */
   4517         if (secondary == UCOL_COMMON2) {
   4518             ++count2;
   4519         } else {
   4520             if (count2 > 0) {
   4521                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4522                     while (count2 > UCOL_TOP_COUNT2) {
   4523                         result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   4524                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4525                     }
   4526                     result.Append(UCOL_COMMON_TOP2 - (count2-1));
   4527                 } else {
   4528                     while (count2 > UCOL_BOT_COUNT2) {
   4529                         result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4530                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4531                     }
   4532                     result.Append(UCOL_COMMON_BOT2 + (count2-1));
   4533                 }
   4534                 count2 = 0;
   4535             }
   4536             result.Append(secondary);
   4537         }
   4538     }
   4539     if (count2 > 0) {
   4540         while (count2 > UCOL_BOT_COUNT2) {
   4541             result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4542             count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4543         }
   4544         result.Append(UCOL_COMMON_BOT2 + (count2-1));
   4545     }
   4546 }
   4547 
   /*
    * NOTE(review): this constant is not referenced anywhere in the portion of
    * the file reviewed here. Judging by its name it is presumably the size
    * reported by ucol_calcSortKey() on error -- confirm against its uses.
    */
   4548 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
   4549 
   4550 /* This is the sortkey work horse function */
   4551 U_CFUNC void U_CALLCONV
   4552 ucol_calcSortKey(const    UCollator    *coll,
   4553         const    UChar        *source,
   4554         int32_t        sourceLength,
   4555         SortKeyByteSink &result,
   4556         UErrorCode *status)
   4557 {
   4558     if(U_FAILURE(*status)) {
   4559         return;
   4560     }
   4561 
   4562     SortKeyByteSink &primaries = result;
   4563     SortKeyLevel secondaries;
   4564     SortKeyLevel tertiaries;
   4565     SortKeyLevel cases;
   4566     SortKeyLevel quads;
   4567 
   4568     UnicodeString normSource;
   4569 
   4570     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
   4571 
   4572     UColAttributeValue strength = coll->strength;
   4573 
   4574     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
   4575     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
   4576     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
   4577     UBool  compareIdent = (strength == UCOL_IDENTICAL);
   4578     UBool  doCase = (coll->caseLevel == UCOL_ON);
   4579     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
   4580     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
   4581     //UBool  qShifted = shifted && (compareQuad == 0);
   4582     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
   4583 
   4584     uint32_t variableTopValue = coll->variableTopValue;
   4585     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
   4586     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
   4587     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
   4588     uint8_t UCOL_HIRAGANA_QUAD = 0;
   4589     if(doHiragana) {
   4590         UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
   4591         /* allocate one more space for hiragana, value for hiragana */
   4592     }
   4593     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
   4594 
   4595     /* support for special features like caselevel and funky secondaries */
   4596     int32_t lastSecondaryLength = 0;
   4597     uint32_t caseShift = 0;
   4598 
   4599     /* If we need to normalize, we'll do it all at once at the beginning! */
   4600     const Normalizer2 *norm2;
   4601     if(compareIdent) {
   4602         norm2 = Normalizer2Factory::getNFDInstance(*status);
   4603     } else if(coll->normalizationMode != UCOL_OFF) {
   4604         norm2 = Normalizer2Factory::getFCDInstance(*status);
   4605     } else {
   4606         norm2 = NULL;
   4607     }
   4608     if(norm2 != NULL) {
   4609         normSource.setTo(FALSE, source, len);
   4610         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   4611         if(qcYesLength != len) {
   4612             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   4613             normSource.truncate(qcYesLength);
   4614             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   4615             source = normSource.getBuffer();
   4616             len = normSource.length();
   4617         }
   4618     }
   4619     collIterate s;
   4620     IInit_collIterate(coll, source, len, &s, status);
   4621     if(U_FAILURE(*status)) {
   4622         return;
   4623     }
   4624     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
   4625 
   4626     uint32_t order = 0;
   4627 
   4628     uint8_t primary1 = 0;
   4629     uint8_t primary2 = 0;
   4630     uint8_t secondary = 0;
   4631     uint8_t tertiary = 0;
   4632     uint8_t caseSwitch = coll->caseSwitch;
   4633     uint8_t tertiaryMask = coll->tertiaryMask;
   4634     int8_t tertiaryAddition = coll->tertiaryAddition;
   4635     uint8_t tertiaryTop = coll->tertiaryTop;
   4636     uint8_t tertiaryBottom = coll->tertiaryBottom;
   4637     uint8_t tertiaryCommon = coll->tertiaryCommon;
   4638     uint8_t caseBits = 0;
   4639 
   4640     UBool wasShifted = FALSE;
   4641     UBool notIsContinuation = FALSE;
   4642 
   4643     uint32_t count2 = 0, count3 = 0, count4 = 0;
   4644     uint8_t leadPrimary = 0;
   4645 
   4646     for(;;) {
   4647         order = ucol_IGetNextCE(coll, &s, status);
   4648         if(order == UCOL_NO_MORE_CES) {
   4649             break;
   4650         }
   4651 
   4652         if(order == 0) {
   4653             continue;
   4654         }
   4655 
   4656         notIsContinuation = !isContinuation(order);
   4657 
   4658         if(notIsContinuation) {
   4659             tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
   4660         } else {
   4661             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   4662         }
   4663 
   4664         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4665         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   4666         primary1 = (uint8_t)(order >> 8);
   4667 
   4668         uint8_t originalPrimary1 = primary1;
   4669         if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
   4670             primary1 = coll->leadBytePermutationTable[primary1];
   4671         }
   4672 
   4673         if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
   4674                         || (!notIsContinuation && wasShifted)))
   4675             || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   4676         {
   4677             /* and other ignorables should be removed if following a shifted code point */
   4678             if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
   4679                 /* we should just completely ignore it */
   4680                 continue;
   4681             }
   4682             if(compareQuad == 0) {
   4683                 if(count4 > 0) {
   4684                     while (count4 > UCOL_BOT_COUNT4) {
   4685                         quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   4686                         count4 -= UCOL_BOT_COUNT4;
   4687                     }
   4688                     quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
   4689                     count4 = 0;
   4690                 }
   4691                 /* We are dealing with a variable and we're treating them as shifted */
   4692                 /* This is a shifted ignorable */
   4693                 if(primary1 != 0) { /* we need to check this since we could be in continuation */
   4694                     quads.appendByte(primary1);
   4695                 }
   4696                 if(primary2 != 0) {
   4697                     quads.appendByte(primary2);
   4698                 }
   4699             }
   4700             wasShifted = TRUE;
   4701         } else {
   4702             wasShifted = FALSE;
   4703             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   4704             /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   4705             /* regular and simple sortkey calc */
   4706             if(primary1 != UCOL_IGNORABLE) {
   4707                 if(notIsContinuation) {
   4708                     if(leadPrimary == primary1) {
   4709                         primaries.Append(primary2);
   4710                     } else {
   4711                         if(leadPrimary != 0) {
   4712                             primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
   4713                         }
   4714                         if(primary2 == UCOL_IGNORABLE) {
   4715                             /* one byter, not compressed */
   4716                             primaries.Append(primary1);
   4717                             leadPrimary = 0;
   4718                         } else if(isCompressible(coll, originalPrimary1)) {
   4719                             /* compress */
   4720                             primaries.Append(leadPrimary = primary1, primary2);
   4721                         } else {
   4722                             leadPrimary = 0;
   4723                             primaries.Append(primary1, primary2);
   4724                         }
   4725                     }
   4726                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   4727                     if(primary2 == UCOL_IGNORABLE) {
   4728                         primaries.Append(primary1);
   4729                     } else {
   4730                         primaries.Append(primary1, primary2);
   4731                     }
   4732                 }
   4733             }
   4734 
   4735             if(secondary > compareSec) {
   4736                 if(!isFrenchSec) {
   4737                     /* This is compression code. */
   4738                     if (secondary == UCOL_COMMON2 && notIsContinuation) {
   4739                         ++count2;
   4740                     } else {
   4741                         if (count2 > 0) {
   4742                             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   4743                                 while (count2 > UCOL_TOP_COUNT2) {
   4744                                     secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   4745                                     count2 -= (uint32_t)UCOL_TOP_COUNT2;
   4746                                 }
   4747                                 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
   4748                             } else {
   4749                                 while (count2 > UCOL_BOT_COUNT2) {
   4750                                     secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4751                                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4752                                 }
   4753                                 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
   4754                             }
   4755                             count2 = 0;
   4756                         }
   4757                         secondaries.appendByte(secondary);
   4758                     }
   4759                 } else {
   4760                     /* Do the special handling for French secondaries */
   4761                     /* We need to get continuation elements and do intermediate restore */
   4762                     /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
   4763                     if(notIsContinuation) {
   4764                         if (lastSecondaryLength > 1) {
   4765                             uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
   4766                             if (frenchStartPtr != NULL) {
   4767                                 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4768                                 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
   4769                                 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4770                             }
   4771                         }
   4772                         lastSecondaryLength = 1;
   4773                     } else {
   4774                         ++lastSecondaryLength;
   4775                     }
   4776                     secondaries.appendByte(secondary);
   4777                 }
   4778             }
   4779 
   4780             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
   4781                 // do the case level if we need to do it. We don't want to calculate
   4782                 // case level for primary ignorables if we have only primary strength and case level
   4783                 // otherwise we would break well formedness of CEs
   4784                 doCaseShift(cases, caseShift);
   4785                 if(notIsContinuation) {
   4786                     caseBits = (uint8_t)(tertiary & 0xC0);
   4787 
   4788                     if(tertiary != 0) {
   4789                         if(coll->caseFirst == UCOL_UPPER_FIRST) {
   4790                             if((caseBits & 0xC0) == 0) {
   4791                                 cases.lastByte() |= 1 << (--caseShift);
   4792                             } else {
   4793                                 cases.lastByte() |= 0 << (--caseShift);
   4794                                 /* second bit */
   4795                                 doCaseShift(cases, caseShift);
   4796                                 cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift);
   4797                             }
   4798                         } else {
   4799                             if((caseBits & 0xC0) == 0) {
   4800                                 cases.lastByte() |= 0 << (--caseShift);
   4801                             } else {
   4802                                 cases.lastByte() |= 1 << (--caseShift);
   4803                                 /* second bit */
   4804                                 doCaseShift(cases, caseShift);
   4805                                 cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift);
   4806                             }
   4807                         }
   4808                     }
   4809                 }
   4810             } else {
   4811                 if(notIsContinuation) {
   4812                     tertiary ^= caseSwitch;
   4813                 }
   4814             }
   4815 
   4816             tertiary &= tertiaryMask;
   4817             if(tertiary > compareTer) {
   4818                 /* This is compression code. */
   4819                 /* sequence size check is included in the if clause */
   4820                 if (tertiary == tertiaryCommon && notIsContinuation) {
   4821                     ++count3;
   4822                 } else {
   4823                     if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   4824                         tertiary += tertiaryAddition;
   4825                     } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   4826                         tertiary -= tertiaryAddition;
   4827                     }
   4828                     if (count3 > 0) {
   4829                         if ((tertiary > tertiaryCommon)) {
   4830                             while (count3 > coll->tertiaryTopCount) {
   4831                                 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
   4832                                 count3 -= (uint32_t)coll->tertiaryTopCount;
   4833                             }
   4834                             tertiaries.appendByte(tertiaryTop - (count3-1));
   4835                         } else {
   4836                             while (count3 > coll->tertiaryBottomCount) {
   4837                                 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
   4838                                 count3 -= (uint32_t)coll->tertiaryBottomCount;
   4839                             }
   4840                             tertiaries.appendByte(tertiaryBottom + (count3-1));
   4841                         }
   4842                         count3 = 0;
   4843                     }
   4844                     tertiaries.appendByte(tertiary);
   4845                 }
   4846             }
   4847 
   4848             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
   4849                 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   4850                     if(count4>0) { // Close this part
   4851                         while (count4 > UCOL_BOT_COUNT4) {
   4852                             quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   4853                             count4 -= UCOL_BOT_COUNT4;
   4854                         }
   4855                         quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
   4856                         count4 = 0;
   4857                     }
   4858                     quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana
   4859                 } else { // This wasn't Hiragana, so we can continue adding stuff
   4860                     count4++;
   4861                 }
   4862             }
   4863         }
   4864     }
   4865 
   4866     /* Here, we are generally done with processing */
   4867     /* bailing out would not be too productive */
   4868 
   4869     UBool ok = TRUE;
   4870     if(U_SUCCESS(*status)) {
   4871         /* we have done all the CE's, now let's put them together to form a key */
   4872         if(compareSec == 0) {
   4873             if (count2 > 0) {
   4874                 while (count2 > UCOL_BOT_COUNT2) {
   4875                     secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   4876                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
   4877                 }
   4878                 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
   4879             }
   4880             result.Append(UCOL_LEVELTERMINATOR);
   4881             if(!secondaries.isOk()) {
   4882                 ok = FALSE;
   4883             } else if(!isFrenchSec) {
   4884                 secondaries.appendTo(result);
   4885             } else {
   4886                 // If there are any unresolved continuation secondaries,
   4887                 // reverse them here so that we can reverse the whole secondary thing.
   4888                 if (lastSecondaryLength > 1) {
   4889                     uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
   4890                     if (frenchStartPtr != NULL) {
   4891                         /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
   4892                         uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
   4893                         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
   4894                     }
   4895                 }
   4896                 packFrench(secondaries.data(), secondaries.length(), result);
   4897             }
   4898         }
   4899 
   4900         if(doCase) {
   4901             ok &= cases.isOk();
   4902             result.Append(UCOL_LEVELTERMINATOR);
   4903             cases.appendTo(result);
   4904         }
   4905 
   4906         if(compareTer == 0) {
   4907             if (count3 > 0) {
   4908                 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
   4909                     while (count3 >= coll->tertiaryTopCount) {
   4910                         tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
   4911                         count3 -= (uint32_t)coll->tertiaryTopCount;
   4912                     }
   4913                     tertiaries.appendByte(tertiaryTop - count3);
   4914                 } else {
   4915                     while (count3 > coll->tertiaryBottomCount) {
   4916                         tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
   4917                         count3 -= (uint32_t)coll->tertiaryBottomCount;
   4918                     }
   4919                     tertiaries.appendByte(tertiaryBottom + (count3-1));
   4920                 }
   4921             }
   4922             ok &= tertiaries.isOk();
   4923             result.Append(UCOL_LEVELTERMINATOR);
   4924             tertiaries.appendTo(result);
   4925 
   4926             if(compareQuad == 0/*qShifted == TRUE*/) {
   4927                 if(count4 > 0) {
   4928                     while (count4 > UCOL_BOT_COUNT4) {
   4929                         quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
   4930                         count4 -= UCOL_BOT_COUNT4;
   4931                     }
   4932                     quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
   4933                 }
   4934                 ok &= quads.isOk();
   4935                 result.Append(UCOL_LEVELTERMINATOR);
   4936                 quads.appendTo(result);
   4937             }
   4938 
   4939             if(compareIdent) {
   4940                 result.Append(UCOL_LEVELTERMINATOR);
   4941                 u_writeIdenticalLevelRun(s.string, len, result);
   4942             }
   4943         }
   4944         result.Append(0);
   4945     }
   4946 
   4947     /* To avoid memory leak, free the offset buffer if necessary. */
   4948     ucol_freeOffsetBuffer(&s);
   4949 
   4950     ok &= result.IsOk();
   4951     if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
   4952 }
   4953 
   4954 
   4955 U_CFUNC void U_CALLCONV
   4956 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
   4957         const    UChar        *source,
   4958         int32_t        sourceLength,
   4959         SortKeyByteSink &result,
   4960         UErrorCode *status)
   4961 {
   4962     U_ALIGN_CODE(16);
   4963 
   4964     if(U_FAILURE(*status)) {
   4965         return;
   4966     }
   4967 
   4968     SortKeyByteSink &primaries = result;
   4969     SortKeyLevel secondaries;
   4970     SortKeyLevel tertiaries;
   4971 
   4972     UnicodeString normSource;
   4973 
   4974     int32_t len =  sourceLength;
   4975 
   4976     /* If we need to normalize, we'll do it all at once at the beginning! */
   4977     if(coll->normalizationMode != UCOL_OFF) {
   4978         normSource.setTo(len < 0, source, len);
   4979         const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
   4980         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
   4981         if(qcYesLength != normSource.length()) {
   4982             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
   4983             normSource.truncate(qcYesLength);
   4984             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
   4985             source = normSource.getBuffer();
   4986             len = normSource.length();
   4987         }
   4988     }
   4989     collIterate s;
   4990     IInit_collIterate(coll, (UChar *)source, len, &s, status);
   4991     if(U_FAILURE(*status)) {
   4992         return;
   4993     }
   4994     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
   4995 
   4996     uint32_t order = 0;
   4997 
   4998     uint8_t primary1 = 0;
   4999     uint8_t primary2 = 0;
   5000     uint8_t secondary = 0;
   5001     uint8_t tertiary = 0;
   5002     uint8_t caseSwitch = coll->caseSwitch;
   5003     uint8_t tertiaryMask = coll->tertiaryMask;
   5004     int8_t tertiaryAddition = coll->tertiaryAddition;
   5005     uint8_t tertiaryTop = coll->tertiaryTop;
   5006     uint8_t tertiaryBottom = coll->tertiaryBottom;
   5007     uint8_t tertiaryCommon = coll->tertiaryCommon;
   5008 
   5009     UBool notIsContinuation = FALSE;
   5010 
   5011     uint32_t count2 = 0, count3 = 0;
   5012     uint8_t leadPrimary = 0;
   5013 
   5014     for(;;) {
   5015         order = ucol_IGetNextCE(coll, &s, status);
   5016 
   5017         if(order == 0) {
   5018             continue;
   5019         }
   5020 
   5021         if(order == UCOL_NO_MORE_CES) {
   5022             break;
   5023         }
   5024 
   5025         notIsContinuation = !isContinuation(order);
   5026 
   5027         if(notIsContinuation) {
   5028             tertiary = (uint8_t)((order & tertiaryMask));
   5029         } else {
   5030             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
   5031         }
   5032 
   5033         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5034         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
   5035         primary1 = (uint8_t)(order >> 8);
   5036 
   5037         uint8_t originalPrimary1 = primary1;
   5038         if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
   5039             primary1 = coll->leadBytePermutationTable[primary1];
   5040         }
   5041 
   5042         /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
   5043         /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
   5044         /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
   5045         /* regular and simple sortkey calc */
   5046         if(primary1 != UCOL_IGNORABLE) {
   5047             if(notIsContinuation) {
   5048                 if(leadPrimary == primary1) {
   5049                     primaries.Append(primary2);
   5050                 } else {
   5051                     if(leadPrimary != 0) {
   5052                         primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
   5053                     }
   5054                     if(primary2 == UCOL_IGNORABLE) {
   5055                         /* one byter, not compressed */
   5056                         primaries.Append(primary1);
   5057                         leadPrimary = 0;
   5058                     } else if(isCompressible(coll, originalPrimary1)) {
   5059                         /* compress */
   5060                         primaries.Append(leadPrimary = primary1, primary2);
   5061                     } else {
   5062                         leadPrimary = 0;
   5063                         primaries.Append(primary1, primary2);
   5064                     }
   5065                 }
   5066             } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
   5067                 if(primary2 == UCOL_IGNORABLE) {
   5068                     primaries.Append(primary1);
   5069                 } else {
   5070                     primaries.Append(primary1, primary2);
   5071                 }
   5072             }
   5073         }
   5074 
   5075         if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
   5076             /* This is compression code. */
   5077             if (secondary == UCOL_COMMON2 && notIsContinuation) {
   5078                 ++count2;
   5079             } else {
   5080                 if (count2 > 0) {
   5081                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
   5082                         while (count2 > UCOL_TOP_COUNT2) {
   5083                             secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
   5084                             count2 -= (uint32_t)UCOL_TOP_COUNT2;
   5085                         }
   5086                         secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
   5087                     } else {
   5088                         while (count2 > UCOL_BOT_COUNT2) {
   5089                             secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5090                             count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5091                         }
   5092                         secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
   5093                     }
   5094                     count2 = 0;
   5095                 }
   5096                 secondaries.appendByte(secondary);
   5097             }
   5098         }
   5099 
   5100         if(notIsContinuation) {
   5101             tertiary ^= caseSwitch;
   5102         }
   5103 
   5104         if(tertiary > 0) {
   5105             /* This is compression code. */
   5106             /* sequence size check is included in the if clause */
   5107             if (tertiary == tertiaryCommon && notIsContinuation) {
   5108                 ++count3;
   5109             } else {
   5110                 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
   5111                     tertiary += tertiaryAddition;
   5112                 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
   5113                     tertiary -= tertiaryAddition;
   5114                 }
   5115                 if (count3 > 0) {
   5116                     if ((tertiary > tertiaryCommon)) {
   5117                         while (count3 > coll->tertiaryTopCount) {
   5118                             tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
   5119                             count3 -= (uint32_t)coll->tertiaryTopCount;
   5120                         }
   5121                         tertiaries.appendByte(tertiaryTop - (count3-1));
   5122                     } else {
   5123                         while (count3 > coll->tertiaryBottomCount) {
   5124                             tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
   5125                             count3 -= (uint32_t)coll->tertiaryBottomCount;
   5126                         }
   5127                         tertiaries.appendByte(tertiaryBottom + (count3-1));
   5128                     }
   5129                     count3 = 0;
   5130                 }
   5131                 tertiaries.appendByte(tertiary);
   5132             }
   5133         }
   5134     }
   5135 
   5136     UBool ok = TRUE;
   5137     if(U_SUCCESS(*status)) {
   5138         /* we have done all the CE's, now let's put them together to form a key */
   5139         if (count2 > 0) {
   5140             while (count2 > UCOL_BOT_COUNT2) {
   5141                 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
   5142                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
   5143             }
   5144             secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
   5145         }
   5146         ok &= secondaries.isOk();
   5147         result.Append(UCOL_LEVELTERMINATOR);
   5148         secondaries.appendTo(result);
   5149 
   5150         if (count3 > 0) {
   5151             if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
   5152                 while (count3 >= coll->tertiaryTopCount) {
   5153                     tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
   5154                     count3 -= (uint32_t)coll->tertiaryTopCount;
   5155                 }
   5156                 tertiaries.appendByte(tertiaryTop - count3);
   5157             } else {
   5158                 while (count3 > coll->tertiaryBottomCount) {
   5159                     tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
   5160                     count3 -= (uint32_t)coll->tertiaryBottomCount;
   5161                 }
   5162                 tertiaries.appendByte(tertiaryBottom + (count3-1));
   5163             }
   5164         }
   5165         ok &= tertiaries.isOk();
   5166         result.Append(UCOL_LEVELTERMINATOR);
   5167         tertiaries.appendTo(result);
   5168 
   5169         result.Append(0);
   5170     }
   5171 
   5172     /* To avoid memory leak, free the offset buffer if necessary. */
   5173     ucol_freeOffsetBuffer(&s);
   5174 
   5175     ok &= result.IsOk();
   5176     if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
   5177 }
   5178 
   5179 static inline
   5180 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
   5181     UBool notIsContinuation = !isContinuation(CE);
   5182     uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
   5183     if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
   5184                || (!notIsContinuation && *wasShifted)))
   5185         || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
   5186     {
   5187         // The stuff below should probably be in the sortkey code... maybe not...
   5188         if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
   5189             /* we should just completely ignore it */
   5190             *wasShifted = TRUE;
   5191             //continue;
   5192         }
   5193         //*wasShifted = TRUE;
   5194         return TRUE;
   5195     } else {
   5196         *wasShifted = FALSE;
   5197         return FALSE;
   5198     }
   5199 }
   5200 static inline
   5201 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
   5202     if(level < maxLevel) {
   5203         dest[i++] = UCOL_LEVELTERMINATOR;
   5204     } else {
   5205         dest[i++] = 0;
   5206     }
   5207 }
   5208 
   /**
    * Level identifiers for partial sort key generation.
    * The values 0..7 fit into three bits.
    */
   enum {
       UCOL_PSK_PRIMARY    = 0,
       UCOL_PSK_SECONDARY  = 1,
       UCOL_PSK_CASE       = 2,
       UCOL_PSK_TERTIARY   = 3,
       UCOL_PSK_QUATERNARY = 4,
       /* This is an extra level, not used - but we have three bits to blow */
       UCOL_PSK_QUIN       = 5,
       UCOL_PSK_IDENTICAL  = 6,
       /* level for the end of sort key. Will just produce zeros */
       UCOL_PSK_NULL       = 7,
       /* one past the last level identifier */
       UCOL_PSK_LIMIT      = 8
   };
   5221 
   /**
    * Bit fields of the collation processing state kept in state[1].
    * For every field, *_SHIFT is the number of bits to shift the state word to
    * the right, and *_MASK is then ANDed with the shifted value to isolate the
    * field.
    */
   enum {
       /* Level identifier (one of the UCOL_PSK_* level values): three bits. */
       UCOL_PSK_LEVEL_SHIFT = 0,
       UCOL_PSK_LEVEL_MASK = 7,

       /* Number of bytes of primary or quaternary already written.  Can be only
        * 0 or 1, since we get up to two bytes from primary or quaternary.
        * This field is also used to denote that the French secondary level is
        * finished. */
       UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
       UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,

       /* Was the last value shifted: 0 or 1 (Boolean). */
       UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
       UCOL_PSK_WAS_SHIFTED_MASK = 1,

       /* How many French bytes have we already written: two-bit field (0..3).
        * When we do French we need to reverse secondary values.  However,
        * continuations need to stay the same.  So if you had abc1c2c3de, you
        * need to have edc1c2c3ba. */
       UCOL_PSK_USED_FRENCH_SHIFT = 5,
       UCOL_PSK_USED_FRENCH_MASK = 3,

       /* Two-bit field (0..3) for the identical-level bocsu bytes. */
       UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
       UCOL_PSK_BOCSU_BYTES_MASK = 3,

       /* Consumed CEs counter; the mask covers 19 bits. */
       UCOL_PSK_CONSUMED_CES_SHIFT = 9,
       UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
   };
   5247 
   // Macro calculating the number of expansion CEs available in (s):
   // the difference (s).CEpos - (s).toReturn.
   // The expansion is fully parenthesized so that the macro can be used as an
   // operand of a larger expression (e.g. "n - uprv_numAvailableExpCEs(s)").
   #define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
   5250 
   5251 
   5252 /** main sortkey part procedure. On the first call,
   5253  *  you should pass in a collator, an iterator, empty state
   5254  *  state[0] == state[1] == 0, a buffer to hold results
   5255  *  number of bytes you need and an error code pointer.
   5256  *  Make sure your buffer is big enough to hold the wanted
   5257  *  number of sortkey bytes. I don't check.
   5258  *  The only meaningful status you can get back is
   5259  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
   5260  *  have been dealt a raw deal and that you probably won't
   5261  *  be able to use partial sortkey generation for this
   5262  *  particular combination of string and collator. This
   5263  *  is highly unlikely, but you should still check the error code.
   5264  *  Any other status means that you're not in a sane situation
   5265  *  anymore. After the first call, preserve state values and
   5266  *  use them on subsequent calls to obtain more bytes of a sortkey.
   5267  *  Use until the number of bytes written is smaller than the requested
   5268  *  number of bytes. Generated sortkey is not compatible with the
   5269  *  one generated by ucol_getSortKey, as we don't do any compression.
   5270  *  However, levels are still terminated by a 1 (one) and the sortkey
   5271  *  is terminated by a 0 (zero). Identical level is the same as in the
   5272  *  regular sortkey - internal bocu-1 implementation is used.
   5273  *  For curious, although you cannot do much about this, here is
   5274  *  the structure of state words.
   5275  *  state[0] - iterator state. Depends on the iterator implementation,
   5276  *             but allows the iterator to continue where it stopped in
   5277  *             the last iteration.
   5278  *  state[1] - collation processing state. Here is the distribution
   5279  *             of the bits:
   5280  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
   5281  *             quaternary, quin (we don't use this one), identical and
   5282  *             null (producing only zeroes - first one to terminate the
   5283  *             sortkey and subsequent to fill the buffer).
   5284  *   3       - byte count. Number of bytes written on the primary level.
   5285  *   4       - was shifted. Whether the previous iteration finished in the
   5286  *             shifted state.
   5287  *   5, 6    - French continuation bytes written. See the comment in the enum
   5288  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
   5289  *             the identical level.
   5290  *   9..31   - CEs consumed. Number of getCE or next32 operations performed
   5291  *             since the last successful update of the iterator state.
   5292  */
   5293 U_CAPI int32_t U_EXPORT2
   5294 ucol_nextSortKeyPart(const UCollator *coll,
   5295                      UCharIterator *iter,
   5296                      uint32_t state[2],
   5297                      uint8_t *dest, int32_t count,
   5298                      UErrorCode *status)
   5299 {
   5300     /* error checking */
   5301     if(status==NULL || U_FAILURE(*status)) {
   5302         return 0;
   5303     }
   5304     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
   5305     if( coll==NULL || iter==NULL ||
   5306         state==NULL ||
   5307         count<0 || (count>0 && dest==NULL)
   5308     ) {
   5309         *status=U_ILLEGAL_ARGUMENT_ERROR;
   5310         UTRACE_EXIT_STATUS(status);
   5311         return 0;
   5312     }
   5313 
   5314     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
   5315                   coll, iter, state[0], state[1], dest, count);
   5316 
   5317     if(count==0) {
   5318         /* nothing to do */
   5319         UTRACE_EXIT_VALUE(0);
   5320         return 0;
   5321     }
   5322     /** Setting up situation according to the state we got from the previous iteration */
   5323     // The state of the iterator from the previous invocation
   5324     uint32_t iterState = state[0];
   5325     // Has the last iteration ended in the shifted state
   5326     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
   5327     // What is the current level of the sortkey?
   5328     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
   5329     // Have we written only one byte from a two byte primary in the previous iteration?
   5330     // Also on secondary level - have we finished with the French secondary?
   5331     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
   5332     // number of bytes in the continuation buffer for French
   5333     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
   5334     // Number of bytes already written from a bocsu sequence. Since
   5335     // the longest bocsu sequence is 4 bytes long, this can be up to 3.
   5336     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
   5337     // Number of elements that need to be consumed in this iteration because
   5338     // the iterator returned UITER_NO_STATE at the end of the last iteration,
   5339     // so we had to save the last valid state.
   5340     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
   5341 
   5342     /** values that depend on the collator attributes */
   5343     // strength of the collator.
   5344     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
   5345     // maximal level of the partial sortkey. Need to take into account whether case level is on
   5346     int32_t maxLevel = 0;
   5347     if(strength < UCOL_TERTIARY) {
   5348         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   5349             maxLevel = UCOL_PSK_CASE;
   5350         } else {
   5351             maxLevel = strength;
   5352         }
   5353     } else {
   5354         if(strength == UCOL_TERTIARY) {
   5355             maxLevel = UCOL_PSK_TERTIARY;
   5356         } else if(strength == UCOL_QUATERNARY) {
   5357             maxLevel = UCOL_PSK_QUATERNARY;
   5358         } else { // identical
   5359             maxLevel = UCOL_IDENTICAL;
   5360         }
   5361     }
   5362     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
   5363     uint8_t UCOL_HIRAGANA_QUAD =
   5364       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
   5365     // Boundary value that decides whether a CE is shifted or not
   5366     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
   5367     // Are we doing French collation?
   5368     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
   5369 
   5370     /** initializing the collation state */
   5371     UBool notIsContinuation = FALSE;
   5372     uint32_t CE = UCOL_NO_MORE_CES;
   5373 
   5374     collIterate s;
   5375     IInit_collIterate(coll, NULL, -1, &s, status);
   5376     if(U_FAILURE(*status)) {
   5377         UTRACE_EXIT_STATUS(*status);
   5378         return 0;
   5379     }
   5380     s.iterator = iter;
   5381     s.flags |= UCOL_USE_ITERATOR;
   5382     // This variable tells us whether we have produced some other levels in this iteration
   5383     // before we moved to the identical level. In that case, we need to switch the
   5384     // type of the iterator.
   5385     UBool doingIdenticalFromStart = FALSE;
   5386     // Normalizing iterator
   5387     // The division for the array length may truncate the array size to
   5388     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   5389     // for all platforms anyway.
   5390     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   5391     UNormIterator *normIter = NULL;
   5392     // If the normalization is turned on for the collator and we are below identical level
   5393     // we will use a FCD normalizing iterator
   5394     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
   5395         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5396         s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
   5397         s.flags &= ~UCOL_ITER_NORM;
   5398         if(U_FAILURE(*status)) {
   5399             UTRACE_EXIT_STATUS(*status);
   5400             return 0;
   5401         }
   5402     } else if(level == UCOL_PSK_IDENTICAL) {
   5403         // for identical level, we need a NFD iterator. We need to instantiate it here, since we
   5404         // will be updating the state - and this cannot be done on an ordinary iterator.
   5405         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5406         s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   5407         s.flags &= ~UCOL_ITER_NORM;
   5408         if(U_FAILURE(*status)) {
   5409             UTRACE_EXIT_STATUS(*status);
   5410             return 0;
   5411         }
   5412         doingIdenticalFromStart = TRUE;
   5413     }
   5414 
   5415     // This is the tentative new state of the iterator. The problem
   5416     // is that the iterator might return an undefined state, in
   5417     // which case we should save the last valid state and increase
   5418     // the iterator skip value.
   5419     uint32_t newState = 0;
   5420 
   5421     // First, we set the iterator to the last valid position
   5422     // from the last iteration. This was saved in state[0].
   5423     if(iterState == 0) {
   5424         /* initial state */
   5425         if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
   5426             s.iterator->move(s.iterator, 0, UITER_LIMIT);
   5427         } else {
   5428             s.iterator->move(s.iterator, 0, UITER_START);
   5429         }
   5430     } else {
   5431         /* reset to previous state */
   5432         s.iterator->setState(s.iterator, iterState, status);
   5433         if(U_FAILURE(*status)) {
   5434             UTRACE_EXIT_STATUS(*status);
   5435             return 0;
   5436         }
   5437     }
   5438 
   5439 
   5440 
   5441     // This variable tells us whether we can attempt to update the state
   5442     // of iterator. Situations where we don't want to update iterator state
   5443     // are the existence of expansion CEs that are not yet processed, and
   5444     // finishing the case level without enough space in the buffer to insert
   5445     // a level terminator.
   5446     UBool canUpdateState = TRUE;
   5447 
   5448     // Consume all the CEs that were consumed at the end of the previous
   5449     // iteration without updating the iterator state. On identical level,
   5450     // consume the code points.
   5451     int32_t counter = cces;
   5452     if(level < UCOL_PSK_IDENTICAL) {
   5453         while(counter-->0) {
   5454             // If we're doing French and we are on the secondary level,
   5455             // we go backwards.
   5456             if(level == UCOL_PSK_SECONDARY && doingFrench) {
   5457                 CE = ucol_IGetPrevCE(coll, &s, status);
   5458             } else {
   5459                 CE = ucol_IGetNextCE(coll, &s, status);
   5460             }
   5461             if(CE==UCOL_NO_MORE_CES) {
   5462                 /* should not happen */
   5463                 *status=U_INTERNAL_PROGRAM_ERROR;
   5464                 UTRACE_EXIT_STATUS(*status);
   5465                 return 0;
   5466             }
   5467             if(uprv_numAvailableExpCEs(s)) {
   5468                 canUpdateState = FALSE;
   5469             }
   5470         }
   5471     } else {
   5472         while(counter-->0) {
   5473             uiter_next32(s.iterator);
   5474         }
   5475     }
   5476 
   5477     // French secondary needs to know whether the iterator state of zero came from previous level OR
   5478     // from a new invocation...
   5479     UBool wasDoingPrimary = FALSE;
   5480     // destination buffer byte counter. When this guy
   5481     // gets to count, we're done with the iteration
   5482     int32_t i = 0;
   5483     // used to count the zero bytes written after we
   5484     // have finished with the sort key
   5485     int32_t j = 0;
   5486 
   5487 
   5488     // Hm.... I think we're ready to plunge in. Basic story is as following:
   5489     // we have a fall through case based on level. This is used for initial
   5490     // positioning on iteration start. Every level processor contains a
   5491     // for(;;) which will be broken when we exhaust all the CEs. Other
   5492     // way to exit is a goto saveState, which happens when we have filled
   5493     // out our buffer.
   5494     switch(level) {
   5495     case UCOL_PSK_PRIMARY:
   5496         wasDoingPrimary = TRUE;
   5497         for(;;) {
   5498             if(i==count) {
   5499                 goto saveState;
   5500             }
   5501             // We should save the state only if we
   5502             // are sure that we are done with the
   5503             // previous iterator state
   5504             if(canUpdateState && byteCountOrFrenchDone == 0) {
   5505                 newState = s.iterator->getState(s.iterator);
   5506                 if(newState != UITER_NO_STATE) {
   5507                     iterState = newState;
   5508                     cces = 0;
   5509                 }
   5510             }
   5511             CE = ucol_IGetNextCE(coll, &s, status);
   5512             cces++;
   5513             if(CE==UCOL_NO_MORE_CES) {
   5514                 // Add the level separator
   5515                 terminatePSKLevel(level, maxLevel, i, dest);
   5516                 byteCountOrFrenchDone=0;
   5517                 // Restart the iteration and move to the
   5518                 // second level
   5519                 s.iterator->move(s.iterator, 0, UITER_START);
   5520                 cces = 0;
   5521                 level = UCOL_PSK_SECONDARY;
   5522                 break;
   5523             }
   5524             if(!isContinuation(CE)){
   5525                 if(coll->leadBytePermutationTable != NULL){
   5526                     CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
   5527                 }
   5528             }
   5529             if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5530                 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
   5531                 if(CE != 0) {
   5532                     if(byteCountOrFrenchDone == 0) {
   5533                         // get the second byte of primary
   5534                         dest[i++]=(uint8_t)(CE >> 8);
   5535                     } else {
   5536                         byteCountOrFrenchDone = 0;
   5537                     }
   5538                     if((CE &=0xff)!=0) {
   5539                         if(i==count) {
   5540                             /* overflow */
   5541                             byteCountOrFrenchDone = 1;
   5542                             cces--;
   5543                             goto saveState;
   5544                         }
   5545                         dest[i++]=(uint8_t)CE;
   5546                     }
   5547                 }
   5548             }
   5549             if(uprv_numAvailableExpCEs(s)) {
   5550                 canUpdateState = FALSE;
   5551             } else {
   5552                 canUpdateState = TRUE;
   5553             }
   5554         }
   5555         /* fall through to next level */
   5556     case UCOL_PSK_SECONDARY:
   5557         if(strength >= UCOL_SECONDARY) {
   5558             if(!doingFrench) {
   5559                 for(;;) {
   5560                     if(i == count) {
   5561                         goto saveState;
   5562                     }
   5563                     // We should save the state only if we
   5564                     // are sure that we are done with the
   5565                     // previous iterator state
   5566                     if(canUpdateState) {
   5567                         newState = s.iterator->getState(s.iterator);
   5568                         if(newState != UITER_NO_STATE) {
   5569                             iterState = newState;
   5570                             cces = 0;
   5571                         }
   5572                     }
   5573                     CE = ucol_IGetNextCE(coll, &s, status);
   5574                     cces++;
   5575                     if(CE==UCOL_NO_MORE_CES) {
   5576                         // Add the level separator
   5577                         terminatePSKLevel(level, maxLevel, i, dest);
   5578                         byteCountOrFrenchDone = 0;
   5579                         // Restart the iteration and move to the
   5580                         // case level
   5581                         s.iterator->move(s.iterator, 0, UITER_START);
   5582                         cces = 0;
   5583                         level = UCOL_PSK_CASE;
   5584                         break;
   5585                     }
   5586                     if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5587                         CE >>= 8; /* get secondary */
   5588                         if(CE != 0) {
   5589                             dest[i++]=(uint8_t)CE;
   5590                         }
   5591                     }
   5592                     if(uprv_numAvailableExpCEs(s)) {
   5593                         canUpdateState = FALSE;
   5594                     } else {
   5595                         canUpdateState = TRUE;
   5596                     }
   5597                 }
   5598             } else { // French secondary processing
   5599                 uint8_t frenchBuff[UCOL_MAX_BUFFER];
   5600                 int32_t frenchIndex = 0;
   5601                 // Here we are going backwards.
   5602                 // If the iterator is at the beginning, it should be
   5603                 // moved to end.
   5604                 if(wasDoingPrimary) {
   5605                     s.iterator->move(s.iterator, 0, UITER_LIMIT);
   5606                     cces = 0;
   5607                 }
   5608                 for(;;) {
   5609                     if(i == count) {
   5610                         goto saveState;
   5611                     }
   5612                     if(canUpdateState) {
   5613                         newState = s.iterator->getState(s.iterator);
   5614                         if(newState != UITER_NO_STATE) {
   5615                             iterState = newState;
   5616                             cces = 0;
   5617                         }
   5618                     }
   5619                     CE = ucol_IGetPrevCE(coll, &s, status);
   5620                     cces++;
   5621                     if(CE==UCOL_NO_MORE_CES) {
   5622                         // Add the level separator
   5623                         terminatePSKLevel(level, maxLevel, i, dest);
   5624                         byteCountOrFrenchDone = 0;
   5625                         // Restart the iteration and move to the next level
   5626                         s.iterator->move(s.iterator, 0, UITER_START);
   5627                         level = UCOL_PSK_CASE;
   5628                         break;
   5629                     }
   5630                     if(isContinuation(CE)) { // if it's a continuation, we want to save it and
   5631                         // reverse when we get a first non-continuation CE.
   5632                         CE >>= 8;
   5633                         frenchBuff[frenchIndex++] = (uint8_t)CE;
   5634                     } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5635                         CE >>= 8; /* get secondary */
   5636                         if(!frenchIndex) {
   5637                             if(CE != 0) {
   5638                                 dest[i++]=(uint8_t)CE;
   5639                             }
   5640                         } else {
   5641                             frenchBuff[frenchIndex++] = (uint8_t)CE;
   5642                             frenchIndex -= usedFrench;
   5643                             usedFrench = 0;
   5644                             while(i < count && frenchIndex) {
   5645                                 dest[i++] = frenchBuff[--frenchIndex];
   5646                                 usedFrench++;
   5647                             }
   5648                         }
   5649                     }
   5650                     if(uprv_numAvailableExpCEs(s)) {
   5651                         canUpdateState = FALSE;
   5652                     } else {
   5653                         canUpdateState = TRUE;
   5654                     }
   5655                 }
   5656             }
   5657         } else {
   5658             level = UCOL_PSK_CASE;
   5659         }
   5660         /* fall through to next level */
   5661     case UCOL_PSK_CASE:
   5662         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
   5663             uint32_t caseShift = UCOL_CASE_SHIFT_START;
   5664             uint8_t caseByte = UCOL_CASE_BYTE_START;
   5665             uint8_t caseBits = 0;
   5666 
   5667             for(;;) {
   5668                 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
   5669                 if(i == count) {
   5670                     goto saveState;
   5671                 }
   5672                 // We should save the state only if we
   5673                 // are sure that we are done with the
   5674                 // previous iterator state
   5675                 if(canUpdateState) {
   5676                     newState = s.iterator->getState(s.iterator);
   5677                     if(newState != UITER_NO_STATE) {
   5678                         iterState = newState;
   5679                         cces = 0;
   5680                     }
   5681                 }
   5682                 CE = ucol_IGetNextCE(coll, &s, status);
   5683                 cces++;
   5684                 if(CE==UCOL_NO_MORE_CES) {
   5685                     // On the case level we might have an unfinished
   5686                     // case byte. Add one if it's started.
   5687                     if(caseShift != UCOL_CASE_SHIFT_START) {
   5688                         dest[i++] = caseByte;
   5689                     }
   5690                     cces = 0;
   5691                     // We have finished processing CEs on this level.
   5692                     // However, we don't know if we have enough space
   5693                     // to add a case level terminator.
   5694                     if(i < count) {
   5695                         // Add the level separator
   5696                         terminatePSKLevel(level, maxLevel, i, dest);
   5697                         // Restart the iteration and move to the
   5698                         // next level
   5699                         s.iterator->move(s.iterator, 0, UITER_START);
   5700                         level = UCOL_PSK_TERTIARY;
   5701                     } else {
   5702                         canUpdateState = FALSE;
   5703                     }
   5704                     break;
   5705                 }
   5706 
   5707                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5708                     if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
   5709                         // do the case level if we need to do it. We don't want to calculate
   5710                         // case level for primary ignorables if we have only primary strength and case level
   5711                         // otherwise we would break well formedness of CEs
   5712                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   5713                         caseBits = (uint8_t)(CE & 0xC0);
   5714                         // this copies the case level logic from the
   5715                         // sort key generation code
   5716                         if(CE != 0) {
   5717                             if (caseShift == 0) {
   5718                                 dest[i++] = caseByte;
   5719                                 caseShift = UCOL_CASE_SHIFT_START;
   5720                                 caseByte = UCOL_CASE_BYTE_START;
   5721                             }
   5722                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   5723                                 if((caseBits & 0xC0) == 0) {
   5724                                     caseByte |= 1 << (--caseShift);
   5725                                 } else {
   5726                                     caseByte |= 0 << (--caseShift);
   5727                                     /* second bit */
   5728                                     if(caseShift == 0) {
   5729                                         dest[i++] = caseByte;
   5730                                         caseShift = UCOL_CASE_SHIFT_START;
   5731                                         caseByte = UCOL_CASE_BYTE_START;
   5732                                     }
   5733                                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
   5734                                 }
   5735                             } else {
   5736                                 if((caseBits & 0xC0) == 0) {
   5737                                     caseByte |= 0 << (--caseShift);
   5738                                 } else {
   5739                                     caseByte |= 1 << (--caseShift);
   5740                                     /* second bit */
   5741                                     if(caseShift == 0) {
   5742                                         dest[i++] = caseByte;
   5743                                         caseShift = UCOL_CASE_SHIFT_START;
   5744                                         caseByte = UCOL_CASE_BYTE_START;
   5745                                     }
   5746                                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
   5747                                 }
   5748                             }
   5749                         }
   5750 
   5751                     }
   5752                 }
   5753                 // Not sure this is correct for the case level - revisit
   5754                 if(uprv_numAvailableExpCEs(s)) {
   5755                     canUpdateState = FALSE;
   5756                 } else {
   5757                     canUpdateState = TRUE;
   5758                 }
   5759             }
   5760         } else {
   5761             level = UCOL_PSK_TERTIARY;
   5762         }
   5763         /* fall through to next level */
   5764     case UCOL_PSK_TERTIARY:
   5765         if(strength >= UCOL_TERTIARY) {
   5766             for(;;) {
   5767                 if(i == count) {
   5768                     goto saveState;
   5769                 }
   5770                 // We should save the state only if we
   5771                 // are sure that we are done with the
   5772                 // previous iterator state
   5773                 if(canUpdateState) {
   5774                     newState = s.iterator->getState(s.iterator);
   5775                     if(newState != UITER_NO_STATE) {
   5776                         iterState = newState;
   5777                         cces = 0;
   5778                     }
   5779                 }
   5780                 CE = ucol_IGetNextCE(coll, &s, status);
   5781                 cces++;
   5782                 if(CE==UCOL_NO_MORE_CES) {
   5783                     // Add the level separator
   5784                     terminatePSKLevel(level, maxLevel, i, dest);
   5785                     byteCountOrFrenchDone = 0;
   5786                     // Restart the iteration and move to the
   5787                     // quaternary level
   5788                     s.iterator->move(s.iterator, 0, UITER_START);
   5789                     cces = 0;
   5790                     level = UCOL_PSK_QUATERNARY;
   5791                     break;
   5792                 }
   5793                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
   5794                     notIsContinuation = !isContinuation(CE);
   5795 
   5796                     if(notIsContinuation) {
   5797                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
   5798                         CE ^= coll->caseSwitch;
   5799                         CE &= coll->tertiaryMask;
   5800                     } else {
   5801                         CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   5802                     }
   5803 
   5804                     if(CE != 0) {
   5805                         dest[i++]=(uint8_t)CE;
   5806                     }
   5807                 }
   5808                 if(uprv_numAvailableExpCEs(s)) {
   5809                     canUpdateState = FALSE;
   5810                 } else {
   5811                     canUpdateState = TRUE;
   5812                 }
   5813             }
   5814         } else {
   5815             // if we're not doing tertiary
   5816             // skip to the end
   5817             level = UCOL_PSK_NULL;
   5818         }
   5819         /* fall through to next level */
   5820     case UCOL_PSK_QUATERNARY:
   5821         if(strength >= UCOL_QUATERNARY) {
   5822             for(;;) {
   5823                 if(i == count) {
   5824                     goto saveState;
   5825                 }
   5826                 // We should save the state only if we
   5827                 // are sure that we are done with the
   5828                 // previous iterator state
   5829                 if(canUpdateState) {
   5830                     newState = s.iterator->getState(s.iterator);
   5831                     if(newState != UITER_NO_STATE) {
   5832                         iterState = newState;
   5833                         cces = 0;
   5834                     }
   5835                 }
   5836                 CE = ucol_IGetNextCE(coll, &s, status);
   5837                 cces++;
   5838                 if(CE==UCOL_NO_MORE_CES) {
   5839                     // Add the level separator
   5840                     terminatePSKLevel(level, maxLevel, i, dest);
   5841                     //dest[i++] = UCOL_LEVELTERMINATOR;
   5842                     byteCountOrFrenchDone = 0;
   5843                     // Restart the iteration and move to the
   5844                     // next level
   5845                     s.iterator->move(s.iterator, 0, UITER_START);
   5846                     cces = 0;
   5847                     level = UCOL_PSK_QUIN;
   5848                     break;
   5849                 }
   5850                 if(CE==0)
   5851                     continue;
   5852                 if(isShiftedCE(CE, LVT, &wasShifted)) {
   5853                     CE >>= 16; /* get primary */
   5854                     if(CE != 0) {
   5855                         if(byteCountOrFrenchDone == 0) {
   5856                             dest[i++]=(uint8_t)(CE >> 8);
   5857                         } else {
   5858                             byteCountOrFrenchDone = 0;
   5859                         }
   5860                         if((CE &=0xff)!=0) {
   5861                             if(i==count) {
   5862                                 /* overflow */
   5863                                 byteCountOrFrenchDone = 1;
   5864                                 goto saveState;
   5865                             }
   5866                             dest[i++]=(uint8_t)CE;
   5867                         }
   5868                     }
   5869                 } else {
   5870                     notIsContinuation = !isContinuation(CE);
   5871                     if(notIsContinuation) {
   5872                         if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
   5873                             dest[i++] = UCOL_HIRAGANA_QUAD;
   5874                         } else {
   5875                             dest[i++] = 0xFF;
   5876                         }
   5877                     }
   5878                 }
   5879                 if(uprv_numAvailableExpCEs(s)) {
   5880                     canUpdateState = FALSE;
   5881                 } else {
   5882                     canUpdateState = TRUE;
   5883                 }
   5884             }
   5885         } else {
   5886             // if we're not doing quaternary
   5887             // skip to the end
   5888             level = UCOL_PSK_NULL;
   5889         }
   5890         /* fall through to next level */
   5891     case UCOL_PSK_QUIN:
   5892         level = UCOL_PSK_IDENTICAL;
   5893         /* fall through to next level */
   5894     case UCOL_PSK_IDENTICAL:
   5895         if(strength >= UCOL_IDENTICAL) {
   5896             UChar32 first, second;
   5897             int32_t bocsuBytesWritten = 0;
   5898             // We always need to do identical on
   5899             // the NFD form of the string.
   5900             if(normIter == NULL) {
   5901                 // we arrived from the level below and
   5902                 // normalization was not turned on.
   5903                 // therefore, we need to make a fresh NFD iterator
   5904                 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
   5905                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   5906             } else if(!doingIdenticalFromStart) {
   5907                 // there is an iterator, but we did some other levels.
   5908                 // therefore, we have a FCD iterator - need to make
   5909                 // a NFD one.
   5910                 // normIter being at the beginning does not guarantee
   5911                 // that the underlying iterator is at the beginning
   5912                 iter->move(iter, 0, UITER_START);
   5913                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
   5914             }
   5915             // At this point we have a NFD iterator that is positioned
   5916             // in the right place
   5917             if(U_FAILURE(*status)) {
   5918                 UTRACE_EXIT_STATUS(*status);
   5919                 return 0;
   5920             }
   5921             first = uiter_previous32(s.iterator);
   5922             // maybe we're at the start of the string
   5923             if(first == U_SENTINEL) {
   5924                 first = 0;
   5925             } else {
   5926                 uiter_next32(s.iterator);
   5927             }
   5928 
   5929             j = 0;
   5930             for(;;) {
   5931                 if(i == count) {
   5932                     if(j+1 < bocsuBytesWritten) {
   5933                         bocsuBytesUsed = j+1;
   5934                     }
   5935                     goto saveState;
   5936                 }
   5937 
   5938                 // On identical level, we will always save
   5939                 // the state if we reach this point, since
   5940                 // we don't depend on getNextCE for content
   5941                 // all the content is in our buffer and we
   5942                 // already either stored the full buffer OR
   5943                 // otherwise we won't arrive here.
   5944                 newState = s.iterator->getState(s.iterator);
   5945                 if(newState != UITER_NO_STATE) {
   5946                     iterState = newState;
   5947                     cces = 0;
   5948                 }
   5949 
   5950                 uint8_t buff[4];
   5951                 second = uiter_next32(s.iterator);
   5952                 cces++;
   5953 
   5954                 // end condition for identical level
   5955                 if(second == U_SENTINEL) {
   5956                     terminatePSKLevel(level, maxLevel, i, dest);
   5957                     level = UCOL_PSK_NULL;
   5958                     break;
   5959                 }
   5960                 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
   5961                 first = second;
   5962 
   5963                 j = 0;
   5964                 if(bocsuBytesUsed != 0) {
   5965                     while(bocsuBytesUsed-->0) {
   5966                         j++;
   5967                     }
   5968                 }
   5969 
   5970                 while(i < count && j < bocsuBytesWritten) {
   5971                     dest[i++] = buff[j++];
   5972                 }
   5973             }
   5974 
   5975         } else {
   5976             level = UCOL_PSK_NULL;
   5977         }
   5978         /* fall through to next level */
   5979     case UCOL_PSK_NULL:
   5980         j = i;
   5981         while(j<count) {
   5982             dest[j++]=0;
   5983         }
   5984         break;
   5985     default:
   5986         *status = U_INTERNAL_PROGRAM_ERROR;
   5987         UTRACE_EXIT_STATUS(*status);
   5988         return 0;
   5989     }
   5990 
   5991 saveState:
   5992     // Now we need to return stuff. First we want to see whether we have
   5993     // done everything for the current state of iterator.
   5994     if(byteCountOrFrenchDone
   5995         || canUpdateState == FALSE
   5996         || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
   5997     {
   5998         // Any of above mean that the previous transaction
   5999         // wasn't finished and that we should store the
   6000         // previous iterator state.
   6001         state[0] = iterState;
   6002     } else {
   6003         // The transaction is complete. We will continue in the next iteration.
   6004         state[0] = s.iterator->getState(s.iterator);
   6005         cces = 0;
   6006     }
   6007     // Store the number of bocsu bytes written.
   6008     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
   6009         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6010     }
   6011     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
   6012 
   6013     // Next we put in the level of comparison
   6014     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
   6015 
   6016     // If we are doing French, we need to store whether we have just finished the French level
   6017     if(level == UCOL_PSK_SECONDARY && doingFrench) {
   6018         state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6019     } else {
   6020         state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
   6021     }
   6022 
   6023     // Was the latest CE shifted
   6024     if(wasShifted) {
   6025         state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
   6026     }
   6027     // Check for cces overflow
   6028     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
   6029         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6030     }
   6031     // Store cces
   6032     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
   6033 
   6034     // Check for French overflow
   6035     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
   6036         *status = U_INDEX_OUTOFBOUNDS_ERROR;
   6037     }
   6038     // Store number of bytes written in the French secondary continuation sequence
   6039     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
   6040 
   6041 
   6042     // If we have used normalizing iterator, get rid of it
   6043     if(normIter != NULL) {
   6044         unorm_closeIter(normIter);
   6045     }
   6046 
   6047     /* To avoid memory leak, free the offset buffer if necessary. */
   6048     ucol_freeOffsetBuffer(&s);
   6049 
   6050     // Return number of meaningful sortkey bytes.
   6051     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
   6052                   dest,i, state[0], state[1]);
   6053     UTRACE_EXIT_VALUE(i);
   6054     return i;
   6055 }
   6056 
   6057 /**
   6058  * Produce a bound for a given sortkey and a number of levels.
   6059  */
   6060 U_CAPI int32_t U_EXPORT2
   6061 ucol_getBound(const uint8_t       *source,
   6062         int32_t             sourceLength,
   6063         UColBoundMode       boundType,
   6064         uint32_t            noOfLevels,
   6065         uint8_t             *result,
   6066         int32_t             resultLength,
   6067         UErrorCode          *status)
   6068 {
   6069     // consistency checks
   6070     if(status == NULL || U_FAILURE(*status)) {
   6071         return 0;
   6072     }
   6073     if(source == NULL) {
   6074         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6075         return 0;
   6076     }
   6077 
   6078     int32_t sourceIndex = 0;
   6079     // Scan the string until we skip enough of the key OR reach the end of the key
   6080     do {
   6081         sourceIndex++;
   6082         if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
   6083             noOfLevels--;
   6084         }
   6085     } while (noOfLevels > 0
   6086         && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
   6087 
   6088     if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
   6089         && noOfLevels > 0) {
   6090             *status = U_SORT_KEY_TOO_SHORT_WARNING;
   6091     }
   6092 
   6093 
   6094     // READ ME: this code assumes that the values for boundType
   6095     // enum will not changes. They are set so that the enum value
   6096     // corresponds to the number of extra bytes each bound type
   6097     // needs.
   6098     if(result != NULL && resultLength >= sourceIndex+boundType) {
   6099         uprv_memcpy(result, source, sourceIndex);
   6100         switch(boundType) {
   6101             // Lower bound just gets terminated. No extra bytes
   6102         case UCOL_BOUND_LOWER: // = 0
   6103             break;
   6104             // Upper bound needs one extra byte
   6105         case UCOL_BOUND_UPPER: // = 1
   6106             result[sourceIndex++] = 2;
   6107             break;
   6108             // Upper long bound needs two extra bytes
   6109         case UCOL_BOUND_UPPER_LONG: // = 2
   6110             result[sourceIndex++] = 0xFF;
   6111             result[sourceIndex++] = 0xFF;
   6112             break;
   6113         default:
   6114             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6115             return 0;
   6116         }
   6117         result[sourceIndex++] = 0;
   6118 
   6119         return sourceIndex;
   6120     } else {
   6121         return sourceIndex+boundType+1;
   6122     }
   6123 }
   6124 
   6125 /****************************************************************************/
   6126 /* Following are the functions that deal with the properties of a collator  */
   6127 /* there are new APIs and some compatibility APIs                           */
   6128 /****************************************************************************/
   6129 
   6130 static inline void
   6131 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
   6132                     int32_t *primShift, int32_t *secShift, int32_t *terShift)
   6133 {
   6134     uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
   6135     UBool reverseSecondary = FALSE;
   6136     UBool continuation = isContinuation(CE);
   6137     if(!continuation) {
   6138         tertiary = (uint8_t)((CE & coll->tertiaryMask));
   6139         tertiary ^= coll->caseSwitch;
   6140         reverseSecondary = TRUE;
   6141     } else {
   6142         tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
   6143         tertiary &= UCOL_REMOVE_CASE;
   6144         reverseSecondary = FALSE;
   6145     }
   6146 
   6147     secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6148     primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
   6149     primary1 = (uint8_t)(CE >> 8);
   6150 
   6151     if(primary1 != 0) {
   6152         if (coll->leadBytePermutationTable != NULL && !continuation) {
   6153             primary1 = coll->leadBytePermutationTable[primary1];
   6154         }
   6155 
   6156         coll->latinOneCEs[ch] |= (primary1 << *primShift);
   6157         *primShift -= 8;
   6158     }
   6159     if(primary2 != 0) {
   6160         if(*primShift < 0) {
   6161             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6162             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6163             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6164             return;
   6165         }
   6166         coll->latinOneCEs[ch] |= (primary2 << *primShift);
   6167         *primShift -= 8;
   6168     }
   6169     if(secondary != 0) {
   6170         if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
   6171             coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
   6172             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
   6173         } else { // normal case
   6174             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
   6175         }
   6176         *secShift -= 8;
   6177     }
   6178     if(tertiary != 0) {
   6179         coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
   6180         *terShift -= 8;
   6181     }
   6182 }
   6183 
   6184 static inline UBool
   6185 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
   6186     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
   6187     if(newTable == NULL) {
   6188       *status = U_MEMORY_ALLOCATION_ERROR;
   6189       coll->latinOneFailed = TRUE;
   6190       return FALSE;
   6191     }
   6192     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
   6193     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
   6194     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
   6195     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
   6196     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
   6197     coll->latinOneTableLen = size;
   6198     uprv_free(coll->latinOneCEs);
   6199     coll->latinOneCEs = newTable;
   6200     return TRUE;
   6201 }
   6202 
   6203 static UBool
   6204 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
   6205     UBool result = TRUE;
   6206     if(coll->latinOneCEs == NULL) {
   6207         coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
   6208         if(coll->latinOneCEs == NULL) {
   6209             *status = U_MEMORY_ALLOCATION_ERROR;
   6210             return FALSE;
   6211         }
   6212         coll->latinOneTableLen = UCOL_LATINONETABLELEN;
   6213     }
   6214     UChar ch = 0;
   6215     UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
   6216     // Check for null pointer
   6217     if (U_FAILURE(*status)) {
   6218         ucol_closeElements(it);
   6219         return FALSE;
   6220     }
   6221     uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
   6222 
   6223     int32_t primShift = 24, secShift = 24, terShift = 24;
   6224     uint32_t CE = 0;
   6225     int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
   6226 
   6227     // TODO: make safe if you get more than you wanted...
   6228     for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
   6229         primShift = 24; secShift = 24; terShift = 24;
   6230         if(ch < 0x100) {
   6231             CE = coll->latinOneMapping[ch];
   6232         } else {
   6233             CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
   6234             if(CE == UCOL_NOT_FOUND && coll->UCA) {
   6235                 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
   6236             }
   6237         }
   6238         if(CE < UCOL_NOT_FOUND) {
   6239             ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6240         } else {
   6241             switch (getCETag(CE)) {
   6242             case EXPANSION_TAG:
   6243             case DIGIT_TAG:
   6244                 ucol_setText(it, &ch, 1, status);
   6245                 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
   6246                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6247                         coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
   6248                         coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6249                         coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
   6250                         break;
   6251                     }
   6252                     ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6253                 }
   6254                 break;
   6255             case CONTRACTION_TAG:
   6256                 // here is the trick
   6257                 // F2 is contraction. We do something very similar to contractions
   6258                 // but have two indices, one in the real contraction table and the
   6259                 // other to where we stuffed things. This hopes that we don't have
   6260                 // many contractions (this should work for latin-1 tables).
   6261                 {
   6262                     if((CE & 0x00FFF000) != 0) {
   6263                         *status = U_UNSUPPORTED_ERROR;
   6264                         goto cleanup_after_failure;
   6265                     }
   6266 
   6267                     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
   6268 
   6269                     CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
   6270 
   6271                     coll->latinOneCEs[ch] = CE;
   6272                     coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
   6273                     coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
   6274 
   6275                     // We're going to jump into contraction table, pick the elements
   6276                     // and use them
   6277                     do {
   6278                         CE = *(coll->contractionCEs +
   6279                             (UCharOffset - coll->contractionIndex));
   6280                         if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
   6281                             uint32_t size;
   6282                             uint32_t i;    /* general counter */
   6283                             uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
   6284                             size = getExpansionCount(CE);
   6285                             //CE = *CEOffset++;
   6286                             if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
   6287                                 for(i = 0; i<size; i++) {
   6288                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6289                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6290                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6291                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6292                                         break;
   6293                                     }
   6294                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
   6295                                 }
   6296                             } else { /* else, we do */
   6297                                 while(*CEOffset != 0) {
   6298                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
   6299                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6300                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6301                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6302                                         break;
   6303                                     }
   6304                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
   6305                                 }
   6306                             }
   6307                             contractionOffset++;
   6308                         } else if(CE < UCOL_NOT_FOUND) {
   6309                             ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
   6310                         } else {
   6311                             coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6312                             coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6313                             coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
   6314                             contractionOffset++;
   6315                         }
   6316                         UCharOffset++;
   6317                         primShift = 24; secShift = 24; terShift = 24;
   6318                         if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
   6319                             if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
   6320                                 goto cleanup_after_failure;
   6321                             }
   6322                         }
   6323                     } while(*UCharOffset != 0xFFFF);
   6324                 }
   6325                 break;;
   6326             case SPEC_PROC_TAG:
   6327                 {
   6328                     // 0xB7 is a precontext character defined in UCA5.1, a special
   6329                     // handle is implemeted in order to save LatinOne table for
   6330                     // most locales.
   6331                     if (ch==0xb7) {
   6332                         ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
   6333                     }
   6334                     else {
   6335                         goto cleanup_after_failure;
   6336                     }
   6337                 }
   6338                 break;
   6339             default:
   6340                 goto cleanup_after_failure;
   6341             }
   6342         }
   6343     }
   6344     // compact table
   6345     if(contractionOffset < coll->latinOneTableLen) {
   6346         if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
   6347             goto cleanup_after_failure;
   6348         }
   6349     }
   6350     ucol_closeElements(it);
   6351     return result;
   6352 
   6353 cleanup_after_failure:
   6354     // status should already be set before arriving here.
   6355     coll->latinOneFailed = TRUE;
   6356     ucol_closeElements(it);
   6357     return FALSE;
   6358 }
   6359 
   6360 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
   6361     if(U_SUCCESS(*status)) {
   6362         if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6363             coll->caseSwitch = UCOL_CASE_SWITCH;
   6364         } else {
   6365             coll->caseSwitch = UCOL_NO_CASE_SWITCH;
   6366         }
   6367 
   6368         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
   6369             coll->tertiaryMask = UCOL_REMOVE_CASE;
   6370             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6371             coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
   6372             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
   6373             coll->tertiaryBottom = UCOL_COMMON_BOT3;
   6374         } else {
   6375             coll->tertiaryMask = UCOL_KEEP_CASE;
   6376             coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
   6377             if(coll->caseFirst == UCOL_UPPER_FIRST) {
   6378                 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
   6379                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
   6380                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
   6381             } else {
   6382                 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
   6383                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
   6384                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
   6385             }
   6386         }
   6387 
   6388         /* Set the compression values */
   6389         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
   6390         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
   6391         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
   6392 
   6393         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
   6394             && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
   6395         {
   6396             coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
   6397         } else {
   6398             coll->sortKeyGen = ucol_calcSortKey;
   6399         }
   6400         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
   6401             && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
   6402         {
   6403             if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
   6404                 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
   6405                     //fprintf(stderr, "F");
   6406                     coll->latinOneUse = TRUE;
   6407                 } else {
   6408                     coll->latinOneUse = FALSE;
   6409                 }
   6410                 if(*status == U_UNSUPPORTED_ERROR) {
   6411                     *status = U_ZERO_ERROR;
   6412                 }
   6413             } else { // latin1Table exists and it doesn't need to be regenerated, just use it
   6414                 coll->latinOneUse = TRUE;
   6415             }
   6416         } else {
   6417             coll->latinOneUse = FALSE;
   6418         }
   6419     }
   6420 }
   6421 
   6422 U_CAPI uint32_t  U_EXPORT2
   6423 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
   6424     if(U_FAILURE(*status) || coll == NULL) {
   6425         return 0;
   6426     }
   6427     if(len == -1) {
   6428         len = u_strlen(varTop);
   6429     }
   6430     if(len == 0) {
   6431         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6432         return 0;
   6433     }
   6434 
   6435     if(coll->delegate!=NULL) {
   6436       return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);
   6437     }
   6438 
   6439 
   6440     collIterate s;
   6441     IInit_collIterate(coll, varTop, len, &s, status);
   6442     if(U_FAILURE(*status)) {
   6443         return 0;
   6444     }
   6445 
   6446     uint32_t CE = ucol_IGetNextCE(coll, &s, status);
   6447 
   6448     /* here we check if we have consumed all characters */
   6449     /* you can put in either one character or a contraction */
   6450     /* you shouldn't put more... */
   6451     if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
   6452         *status = U_CE_NOT_FOUND_ERROR;
   6453         return 0;
   6454     }
   6455 
   6456     uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
   6457 
   6458     if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
   6459         *status = U_PRIMARY_TOO_LONG_ERROR;
   6460         return 0;
   6461     }
   6462     if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
   6463         coll->variableTopValueisDefault = FALSE;
   6464         coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
   6465     }
   6466 
   6467     /* To avoid memory leak, free the offset buffer if necessary. */
   6468     ucol_freeOffsetBuffer(&s);
   6469 
   6470     return CE & UCOL_PRIMARYMASK;
   6471 }
   6472 
   6473 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
   6474     if(U_FAILURE(*status) || coll == NULL) {
   6475         return 0;
   6476     }
   6477     if(coll->delegate!=NULL) {
   6478       return ((const Collator*)coll->delegate)->getVariableTop(*status);
   6479     }
   6480     return coll->variableTopValue<<16;
   6481 }
   6482 
   6483 U_CAPI void  U_EXPORT2
   6484 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
   6485     if(U_FAILURE(*status) || coll == NULL) {
   6486         return;
   6487     }
   6488 
   6489     if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
   6490         coll->variableTopValueisDefault = FALSE;
   6491         coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
   6492     }
   6493 }
   6494 /* Attribute setter API */
   6495 U_CAPI void  U_EXPORT2
   6496 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
   6497     if(U_FAILURE(*status) || coll == NULL) {
   6498       return;
   6499     }
   6500 
   6501     if(coll->delegate != NULL) {
   6502       ((Collator*)coll->delegate)->setAttribute(attr,value,*status);
   6503       return;
   6504     }
   6505 
   6506     UColAttributeValue oldFrench = coll->frenchCollation;
   6507     UColAttributeValue oldCaseFirst = coll->caseFirst;
   6508     switch(attr) {
   6509     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
   6510         if(value == UCOL_ON) {
   6511             coll->numericCollation = UCOL_ON;
   6512             coll->numericCollationisDefault = FALSE;
   6513         } else if (value == UCOL_OFF) {
   6514             coll->numericCollation = UCOL_OFF;
   6515             coll->numericCollationisDefault = FALSE;
   6516         } else if (value == UCOL_DEFAULT) {
   6517             coll->numericCollationisDefault = TRUE;
   6518             coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
   6519         } else {
   6520             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6521         }
   6522         break;
   6523     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
   6524         if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) {
   6525             // This attribute is an implementation detail of the CLDR Japanese tailoring.
   6526             // The implementation might change to use a different mechanism
   6527             // to achieve the same Japanese sort order.
   6528             // Since ICU 50, this attribute is not settable any more via API functions.
   6529         } else {
   6530             *status = U_ILLEGAL_ARGUMENT_ERROR;
   6531         }
   6532         break;
   6533     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   6534         if(value == UCOL_ON) {
   6535             coll->frenchCollation = UCOL_ON;
   6536             coll->frenchCollationisDefault = FALSE;
   6537         } else if (value == UCOL_OFF) {
   6538             coll->frenchCollation = UCOL_OFF;
   6539             coll->frenchCollationisDefault = FALSE;
   6540         } else if (value == UCOL_DEFAULT) {
   6541             coll->frenchCollationisDefault = TRUE;
   6542             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
   6543         } else {
   6544             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6545         }
   6546         break;
   6547     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   6548         if(value == UCOL_SHIFTED) {
   6549             coll->alternateHandling = UCOL_SHIFTED;
   6550             coll->alternateHandlingisDefault = FALSE;
   6551         } else if (value == UCOL_NON_IGNORABLE) {
   6552             coll->alternateHandling = UCOL_NON_IGNORABLE;
   6553             coll->alternateHandlingisDefault = FALSE;
   6554         } else if (value == UCOL_DEFAULT) {
   6555             coll->alternateHandlingisDefault = TRUE;
   6556             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
   6557         } else {
   6558             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6559         }
   6560         break;
   6561     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   6562         if(value == UCOL_LOWER_FIRST) {
   6563             coll->caseFirst = UCOL_LOWER_FIRST;
   6564             coll->caseFirstisDefault = FALSE;
   6565         } else if (value == UCOL_UPPER_FIRST) {
   6566             coll->caseFirst = UCOL_UPPER_FIRST;
   6567             coll->caseFirstisDefault = FALSE;
   6568         } else if (value == UCOL_OFF) {
   6569             coll->caseFirst = UCOL_OFF;
   6570             coll->caseFirstisDefault = FALSE;
   6571         } else if (value == UCOL_DEFAULT) {
   6572             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
   6573             coll->caseFirstisDefault = TRUE;
   6574         } else {
   6575             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6576         }
   6577         break;
   6578     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   6579         if(value == UCOL_ON) {
   6580             coll->caseLevel = UCOL_ON;
   6581             coll->caseLevelisDefault = FALSE;
   6582         } else if (value == UCOL_OFF) {
   6583             coll->caseLevel = UCOL_OFF;
   6584             coll->caseLevelisDefault = FALSE;
   6585         } else if (value == UCOL_DEFAULT) {
   6586             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
   6587             coll->caseLevelisDefault = TRUE;
   6588         } else {
   6589             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6590         }
   6591         break;
   6592     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   6593         if(value == UCOL_ON) {
   6594             coll->normalizationMode = UCOL_ON;
   6595             coll->normalizationModeisDefault = FALSE;
   6596             initializeFCD(status);
   6597         } else if (value == UCOL_OFF) {
   6598             coll->normalizationMode = UCOL_OFF;
   6599             coll->normalizationModeisDefault = FALSE;
   6600         } else if (value == UCOL_DEFAULT) {
   6601             coll->normalizationModeisDefault = TRUE;
   6602             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
   6603             if(coll->normalizationMode == UCOL_ON) {
   6604                 initializeFCD(status);
   6605             }
   6606         } else {
   6607             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6608         }
   6609         break;
   6610     case UCOL_STRENGTH:         /* attribute for strength */
   6611         if (value == UCOL_DEFAULT) {
   6612             coll->strengthisDefault = TRUE;
   6613             coll->strength = (UColAttributeValue)coll->options->strength;
   6614         } else if (value <= UCOL_IDENTICAL) {
   6615             coll->strengthisDefault = FALSE;
   6616             coll->strength = value;
   6617         } else {
   6618             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
   6619         }
   6620         break;
   6621     case UCOL_ATTRIBUTE_COUNT:
   6622     default:
   6623         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6624         break;
   6625     }
   6626     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
   6627         coll->latinOneRegenTable = TRUE;
   6628     } else {
   6629         coll->latinOneRegenTable = FALSE;
   6630     }
   6631     ucol_updateInternalState(coll, status);
   6632 }
   6633 
   6634 U_CAPI UColAttributeValue  U_EXPORT2
   6635 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
   6636     if(U_FAILURE(*status) || coll == NULL) {
   6637       return UCOL_DEFAULT;
   6638     }
   6639 
   6640     if(coll->delegate != NULL) {
   6641       return ((Collator*)coll->delegate)->getAttribute(attr,*status);
   6642     }
   6643 
   6644     switch(attr) {
   6645     case UCOL_NUMERIC_COLLATION:
   6646       return coll->numericCollation;
   6647     case UCOL_HIRAGANA_QUATERNARY_MODE:
   6648       return coll->hiraganaQ;
   6649     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
   6650         return coll->frenchCollation;
   6651     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
   6652         return coll->alternateHandling;
   6653     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
   6654         return coll->caseFirst;
   6655     case UCOL_CASE_LEVEL: /* do we have an extra case level */
   6656         return coll->caseLevel;
   6657     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
   6658         return coll->normalizationMode;
   6659     case UCOL_STRENGTH:         /* attribute for strength */
   6660         return coll->strength;
   6661     case UCOL_ATTRIBUTE_COUNT:
   6662     default:
   6663         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6664         break;
   6665     }
   6666     return UCOL_DEFAULT;
   6667 }
   6668 
   6669 U_CAPI void U_EXPORT2
   6670 ucol_setStrength(    UCollator                *coll,
   6671             UCollationStrength        strength)
   6672 {
   6673     UErrorCode status = U_ZERO_ERROR;
   6674     ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
   6675 }
   6676 
   6677 U_CAPI UCollationStrength U_EXPORT2
   6678 ucol_getStrength(const UCollator *coll)
   6679 {
   6680     UErrorCode status = U_ZERO_ERROR;
   6681     return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
   6682 }
   6683 
   6684 U_CAPI int32_t U_EXPORT2
   6685 ucol_getReorderCodes(const UCollator *coll,
   6686                     int32_t *dest,
   6687                     int32_t destCapacity,
   6688                     UErrorCode *status) {
   6689     if (U_FAILURE(*status)) {
   6690         return 0;
   6691     }
   6692 
   6693     if(coll->delegate!=NULL) {
   6694       return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status);
   6695     }
   6696 
   6697     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
   6698         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6699         return 0;
   6700     }
   6701 
   6702 #ifdef UCOL_DEBUG
   6703     printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
   6704     printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
   6705 #endif
   6706 
   6707     if (coll->reorderCodesLength > destCapacity) {
   6708         *status = U_BUFFER_OVERFLOW_ERROR;
   6709         return coll->reorderCodesLength;
   6710     }
   6711     for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
   6712         dest[i] = coll->reorderCodes[i];
   6713     }
   6714     return coll->reorderCodesLength;
   6715 }
   6716 
   6717 U_CAPI void U_EXPORT2
   6718 ucol_setReorderCodes(UCollator* coll,
   6719                     const int32_t* reorderCodes,
   6720                     int32_t reorderCodesLength,
   6721                     UErrorCode *status) {
   6722     if (U_FAILURE(*status)) {
   6723         return;
   6724     }
   6725 
   6726     if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
   6727         *status = U_ILLEGAL_ARGUMENT_ERROR;
   6728         return;
   6729     }
   6730 
   6731     if(coll->delegate!=NULL) {
   6732       ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
   6733       return;
   6734     }
   6735 
   6736     if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
   6737         uprv_free(coll->reorderCodes);
   6738     }
   6739     coll->reorderCodes = NULL;
   6740     coll->freeReorderCodesOnClose = FALSE;
   6741     coll->reorderCodesLength = 0;
   6742     if (reorderCodesLength == 0) {
   6743         if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
   6744             uprv_free(coll->leadBytePermutationTable);
   6745         }
   6746         coll->leadBytePermutationTable = NULL;
   6747         coll->freeLeadBytePermutationTableOnClose = FALSE;
   6748         return;
   6749     }
   6750     coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
   6751     if (coll->reorderCodes == NULL) {
   6752         *status = U_MEMORY_ALLOCATION_ERROR;
   6753         return;
   6754     }
   6755     coll->freeReorderCodesOnClose = TRUE;
   6756     for (int32_t i = 0; i < reorderCodesLength; i++) {
   6757         coll->reorderCodes[i] = reorderCodes[i];
   6758     }
   6759     coll->reorderCodesLength = reorderCodesLength;
   6760     ucol_buildPermutationTable(coll, status);
   6761 }
   6762 
   6763 U_CAPI int32_t U_EXPORT2
   6764 ucol_getEquivalentReorderCodes(int32_t reorderCode,
   6765                     int32_t* dest,
   6766                     int32_t destCapacity,
   6767                     UErrorCode *pErrorCode) {
   6768     bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
   6769     uint16_t leadBytes[256];
   6770     int leadBytesCount;
   6771     int leadByteIndex;
   6772     int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
   6773     int reorderCodesForLeadByteCount;
   6774     int reorderCodeIndex;
   6775 
   6776     int32_t equivalentCodesCount = 0;
   6777     int setIndex;
   6778 
   6779     if (U_FAILURE(*pErrorCode)) {
   6780         return 0;
   6781     }
   6782 
   6783     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
   6784         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   6785         return 0;
   6786     }
   6787 
   6788     uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
   6789 
   6790     const UCollator* uca = ucol_initUCA(pErrorCode);
   6791     if (U_FAILURE(*pErrorCode)) {
   6792 	return 0;
   6793     }
   6794     leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
   6795     for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
   6796         reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
   6797             uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
   6798         for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
   6799             equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
   6800         }
   6801     }
   6802 
   6803     for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
   6804         if (equivalentCodesSet[setIndex] == true) {
   6805             equivalentCodesCount++;
   6806         }
   6807     }
   6808 
   6809     if (destCapacity == 0) {
   6810         return equivalentCodesCount;
   6811     }
   6812 
   6813     equivalentCodesCount = 0;
   6814     for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
   6815         if (equivalentCodesSet[setIndex] == true) {
   6816             dest[equivalentCodesCount++] = setIndex;
   6817             if (equivalentCodesCount >= destCapacity) {
   6818                 break;
   6819             }
   6820         }
   6821     }
   6822     return equivalentCodesCount;
   6823 }
   6824 
   6825 
   6826 /****************************************************************************/
   6827 /* Following are misc functions                                             */
   6828 /* there are new APIs and some compatibility APIs                           */
   6829 /****************************************************************************/
   6830 
   6831 U_CAPI void U_EXPORT2
   6832 ucol_getVersion(const UCollator* coll,
   6833                 UVersionInfo versionInfo)
   6834 {
   6835     if(coll->delegate!=NULL) {
   6836       ((const Collator*)coll->delegate)->getVersion(versionInfo);
   6837       return;
   6838     }
   6839     /* RunTime version  */
   6840     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
   6841     /* Builder version*/
   6842     uint8_t bdVersion = coll->image->version[0];
   6843 
   6844     /* Charset Version. Need to get the version from cnv files
   6845      * makeconv should populate cnv files with version and
   6846      * an api has to be provided in ucnv.h to obtain this version
   6847      */
   6848     uint8_t csVersion = 0;
   6849 
   6850     /* combine the version info */
   6851     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
   6852 
   6853     /* Tailoring rules */
   6854     versionInfo[0] = (uint8_t)(cmbVersion>>8);
   6855     versionInfo[1] = (uint8_t)cmbVersion;
   6856     versionInfo[2] = coll->image->version[1];
   6857     if(coll->UCA) {
   6858         /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
   6859         versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
   6860     } else {
   6861         versionInfo[3] = 0;
   6862     }
   6863 }
   6864 
   6865 
   6866 /* This internal API checks whether a character is tailored or not */
   6867 U_CAPI UBool  U_EXPORT2
   6868 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
   6869     if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
   6870         return FALSE;
   6871     }
   6872 
   6873     uint32_t CE = UCOL_NOT_FOUND;
   6874     const UChar *ContractionStart = NULL;
   6875     if(u < 0x100) { /* latin-1 */
   6876         CE = coll->latinOneMapping[u];
   6877         if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
   6878             return FALSE;
   6879         }
   6880     } else { /* regular */
   6881         CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
   6882     }
   6883 
   6884     if(isContraction(CE)) {
   6885         ContractionStart = (UChar *)coll->image+getContractOffset(CE);
   6886         CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
   6887     }
   6888 
   6889     return (UBool)(CE != UCOL_NOT_FOUND);
   6890 }
   6891 
   6892 
   6893 /****************************************************************************/
   6894 /* Following are the string compare functions                               */
   6895 /*                                                                          */
   6896 /****************************************************************************/
   6897 
   6898 
   6899 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
   6900 /*                     Used by strcoll if strength == identical and strings  */
   6901 /*                     are otherwise equal.                                  */
   6902 /*                                                                           */
   6903 /*                     Comparison must be done on NFD normalized strings.    */
   6904 /*                     FCD is not good enough.                               */
   6905 
   6906 static
   6907 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
   6908 {
   6909     // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
   6910     // of same type, but that doesn't really mean that it will stay that way.
   6911     int32_t            comparison;
   6912 
   6913     if (sColl->flags & UCOL_USE_ITERATOR) {
   6914         // The division for the array length may truncate the array size to
   6915         // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   6916         // for all platforms anyway.
   6917         UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   6918         UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   6919         UNormIterator *sNIt = NULL, *tNIt = NULL;
   6920         sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   6921         tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   6922         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   6923         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   6924         UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
   6925         UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
   6926         comparison = u_strCompareIter(sIt, tIt, TRUE);
   6927         unorm_closeIter(sNIt);
   6928         unorm_closeIter(tNIt);
   6929     } else {
   6930         int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
   6931         const UChar *sBuf = sColl->string;
   6932         int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
   6933         const UChar *tBuf = tColl->string;
   6934 
   6935         if (normalize) {
   6936             *status = U_ZERO_ERROR;
   6937             // Note: We could use Normalizer::compare() or similar, but for short strings
   6938             // which may not be in FCD it might be faster to just NFD them.
   6939             // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
   6940             // NFD'ing immediately might be faster for long strings,
   6941             // but string comparison is usually done on relatively short strings.
   6942             sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
   6943                                   sColl->writableBuffer,
   6944                                   *status);
   6945             tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
   6946                                   tColl->writableBuffer,
   6947                                   *status);
   6948             if(U_FAILURE(*status)) {
   6949                 return UCOL_LESS;
   6950             }
   6951             comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
   6952         } else {
   6953             comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
   6954         }
   6955     }
   6956 
   6957     if (comparison < 0) {
   6958         return UCOL_LESS;
   6959     } else if (comparison == 0) {
   6960         return UCOL_EQUAL;
   6961     } else /* comparison > 0 */ {
   6962         return UCOL_GREATER;
   6963     }
   6964 }
   6965 
   6966 /*  CEBuf - A struct and some inline functions to handle the saving    */
   6967 /*          of CEs in a buffer within ucol_strcoll                     */
   6968 
   6969 #define UCOL_CEBUF_SIZE 512
   6970 typedef struct ucol_CEBuf {
   6971     uint32_t    *buf;
   6972     uint32_t    *endp;
   6973     uint32_t    *pos;
   6974     uint32_t     localArray[UCOL_CEBUF_SIZE];
   6975 } ucol_CEBuf;
   6976 
   6977 
   6978 static
   6979 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
   6980     (b)->buf = (b)->pos = (b)->localArray;
   6981     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
   6982 }
   6983 
   6984 static
   6985 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
   6986     uint32_t  oldSize;
   6987     uint32_t  newSize;
   6988     uint32_t  *newBuf;
   6989 
   6990     ci->flags |= UCOL_ITER_ALLOCATED;
   6991     oldSize = (uint32_t)(b->pos - b->buf);
   6992     newSize = oldSize * 2;
   6993     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
   6994     if(newBuf == NULL) {
   6995         *status = U_MEMORY_ALLOCATION_ERROR;
   6996     }
   6997     else {
   6998         uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
   6999         if (b->buf != b->localArray) {
   7000             uprv_free(b->buf);
   7001         }
   7002         b->buf = newBuf;
   7003         b->endp = b->buf + newSize;
   7004         b->pos  = b->buf + oldSize;
   7005     }
   7006 }
   7007 
   7008 static
   7009 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
   7010     if (b->pos == b->endp) {
   7011         ucol_CEBuf_Expand(b, ci, status);
   7012     }
   7013     if (U_SUCCESS(*status)) {
   7014         *(b)->pos++ = ce;
   7015     }
   7016 }
   7017 
   7018 /* This is a trick string compare function that goes in and uses sortkeys to compare */
   7019 /* It is used when compare gets in trouble and needs to bail out                     */
   7020 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
   7021                                                   collIterate *tColl,
   7022                                                   UErrorCode *status)
   7023 {
   7024     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
   7025     uint8_t *sourceKeyP = sourceKey;
   7026     uint8_t *targetKeyP = targetKey;
   7027     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
   7028     const UCollator *coll = sColl->coll;
   7029     const UChar *source = NULL;
   7030     const UChar *target = NULL;
   7031     int32_t result = UCOL_EQUAL;
   7032     UnicodeString sourceString, targetString;
   7033     int32_t sourceLength;
   7034     int32_t targetLength;
   7035 
   7036     if(sColl->flags & UCOL_USE_ITERATOR) {
   7037         sColl->iterator->move(sColl->iterator, 0, UITER_START);
   7038         tColl->iterator->move(tColl->iterator, 0, UITER_START);
   7039         UChar32 c;
   7040         while((c=sColl->iterator->next(sColl->iterator))>=0) {
   7041             sourceString.append((UChar)c);
   7042         }
   7043         while((c=tColl->iterator->next(tColl->iterator))>=0) {
   7044             targetString.append((UChar)c);
   7045         }
   7046         source = sourceString.getBuffer();
   7047         sourceLength = sourceString.length();
   7048         target = targetString.getBuffer();
   7049         targetLength = targetString.length();
   7050     } else { // no iterators
   7051         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
   7052         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
   7053         source = sColl->string;
   7054         target = tColl->string;
   7055     }
   7056 
   7057 
   7058 
   7059     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7060     if(sourceKeyLen > UCOL_MAX_BUFFER) {
   7061         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
   7062         if(sourceKeyP == NULL) {
   7063             *status = U_MEMORY_ALLOCATION_ERROR;
   7064             goto cleanup_and_do_compare;
   7065         }
   7066         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
   7067     }
   7068 
   7069     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7070     if(targetKeyLen > UCOL_MAX_BUFFER) {
   7071         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
   7072         if(targetKeyP == NULL) {
   7073             *status = U_MEMORY_ALLOCATION_ERROR;
   7074             goto cleanup_and_do_compare;
   7075         }
   7076         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
   7077     }
   7078 
   7079     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
   7080 
   7081 cleanup_and_do_compare:
   7082     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
   7083         uprv_free(sourceKeyP);
   7084     }
   7085 
   7086     if(targetKeyP != NULL && targetKeyP != targetKey) {
   7087         uprv_free(targetKeyP);
   7088     }
   7089 
   7090     if(result<0) {
   7091         return UCOL_LESS;
   7092     } else if(result>0) {
   7093         return UCOL_GREATER;
   7094     } else {
   7095         return UCOL_EQUAL;
   7096     }
   7097 }
   7098 
   7099 
   7100 static UCollationResult
   7101 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
   7102 {
   7103     U_ALIGN_CODE(16);
   7104 
   7105     const UCollator *coll = sColl->coll;
   7106 
   7107 
   7108     // setting up the collator parameters
   7109     UColAttributeValue strength = coll->strength;
   7110     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
   7111 
   7112     UBool checkSecTer = initialCheckSecTer;
   7113     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
   7114     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
   7115     UBool checkIdent = (strength == UCOL_IDENTICAL);
   7116     UBool checkCase = (coll->caseLevel == UCOL_ON);
   7117     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
   7118     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
   7119     UBool qShifted = shifted && checkQuad;
   7120     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
   7121 
   7122     if(doHiragana && shifted) {
   7123         return (ucol_compareUsingSortKeys(sColl, tColl, status));
   7124     }
   7125     uint8_t caseSwitch = coll->caseSwitch;
   7126     uint8_t tertiaryMask = coll->tertiaryMask;
   7127 
   7128     // This is the lowest primary value that will not be ignored if shifted
   7129     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
   7130 
   7131     UCollationResult result = UCOL_EQUAL;
   7132     UCollationResult hirResult = UCOL_EQUAL;
   7133 
   7134     // Preparing the CE buffers. They will be filled during the primary phase
   7135     ucol_CEBuf   sCEs;
   7136     ucol_CEBuf   tCEs;
   7137     UCOL_INIT_CEBUF(&sCEs);
   7138     UCOL_INIT_CEBUF(&tCEs);
   7139 
   7140     uint32_t secS = 0, secT = 0;
   7141     uint32_t sOrder=0, tOrder=0;
   7142 
   7143     // Non shifted primary processing is quite simple
   7144     if(!shifted) {
   7145         for(;;) {
   7146             // We fetch CEs until we hit a non ignorable primary or end.
   7147             uint32_t sPrimary;
   7148             do {
   7149                 // We get the next CE
   7150                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7151                 // Stuff it in the buffer
   7152                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7153                 // And keep just the primary part.
   7154                 sPrimary = sOrder & UCOL_PRIMARYMASK;
   7155             } while(sPrimary == 0);
   7156 
   7157             // see the comments on the above block
   7158             uint32_t tPrimary;
   7159             do {
   7160                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7161                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7162                 tPrimary = tOrder & UCOL_PRIMARYMASK;
   7163             } while(tPrimary == 0);
   7164 
   7165             // if both primaries are the same
   7166             if(sPrimary == tPrimary) {
   7167                 // and there are no more CEs, we advance to the next level
   7168                 if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) {
   7169                     break;
   7170                 }
   7171                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7172                     if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
   7173                         hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
   7174                             ? UCOL_LESS:UCOL_GREATER;
   7175                     }
   7176                 }
   7177             } else {
   7178                 // only need to check one for continuation
   7179                 // if one is then the other must be or the preceding CE would be a prefix of the other
   7180                 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
   7181                     sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] << 24) | (sPrimary & 0x00FFFFFF);
   7182                     tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] << 24) | (tPrimary & 0x00FFFFFF);
   7183                 }
   7184                 // if two primaries are different, we are done
   7185                 result = (sPrimary < tPrimary) ?  UCOL_LESS: UCOL_GREATER;
   7186                 goto commonReturn;
   7187             }
   7188         } // no primary difference... do the rest from the buffers
   7189     } else { // shifted - do a slightly more complicated processing :)
   7190         for(;;) {
   7191             UBool sInShifted = FALSE;
   7192             UBool tInShifted = FALSE;
   7193             // This version of code can be refactored. However, it seems easier to understand this way.
   7194             // Source loop. Same as the target loop.
   7195             for(;;) {
   7196                 sOrder = ucol_IGetNextCE(coll, sColl, status);
   7197                 if(sOrder == UCOL_NO_MORE_CES) {
   7198                     UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7199                     break;
   7200                 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
   7201                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7202                     continue;
   7203                 } else if(isContinuation(sOrder)) {
   7204                     if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7205                         if(sInShifted) {
   7206                             sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7207                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7208                             continue;
   7209                         } else {
   7210                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7211                             break;
   7212                         }
   7213                     } else { /* Just lower level values */
   7214                         if(sInShifted) {
   7215                             continue;
   7216                         } else {
   7217                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7218                             continue;
   7219                         }
   7220                     }
   7221                 } else { /* regular */
   7222                     if(coll->leadBytePermutationTable != NULL){
   7223                         sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
   7224                     }
   7225                     if((sOrder & UCOL_PRIMARYMASK) > LVT) {
   7226                         UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7227                         break;
   7228                     } else {
   7229                         if((sOrder & UCOL_PRIMARYMASK) > 0) {
   7230                             sInShifted = TRUE;
   7231                             sOrder &= UCOL_PRIMARYMASK;
   7232                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7233                             continue;
   7234                         } else {
   7235                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
   7236                             sInShifted = FALSE;
   7237                             continue;
   7238                         }
   7239                     }
   7240                 }
   7241             }
   7242             sOrder &= UCOL_PRIMARYMASK;
   7243             sInShifted = FALSE;
   7244 
   7245             for(;;) {
   7246                 tOrder = ucol_IGetNextCE(coll, tColl, status);
   7247                 if(tOrder == UCOL_NO_MORE_CES) {
   7248                     UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7249                     break;
   7250                 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
   7251                     /* UCA amendment - ignore ignorables that follow shifted code points */
   7252                     continue;
   7253                 } else if(isContinuation(tOrder)) {
   7254                     if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
   7255                         if(tInShifted) {
   7256                             tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
   7257                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7258                             continue;
   7259                         } else {
   7260                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7261                             break;
   7262                         }
   7263                     } else { /* Just lower level values */
   7264                         if(tInShifted) {
   7265                             continue;
   7266                         } else {
   7267                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7268                             continue;
   7269                         }
   7270                     }
   7271                 } else { /* regular */
   7272                     if(coll->leadBytePermutationTable != NULL){
   7273                         tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
   7274                     }
   7275                     if((tOrder & UCOL_PRIMARYMASK) > LVT) {
   7276                         UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7277                         break;
   7278                     } else {
   7279                         if((tOrder & UCOL_PRIMARYMASK) > 0) {
   7280                             tInShifted = TRUE;
   7281                             tOrder &= UCOL_PRIMARYMASK;
   7282                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7283                             continue;
   7284                         } else {
   7285                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
   7286                             tInShifted = FALSE;
   7287                             continue;
   7288                         }
   7289                     }
   7290                 }
   7291             }
   7292             tOrder &= UCOL_PRIMARYMASK;
   7293             tInShifted = FALSE;
   7294 
   7295             if(sOrder == tOrder) {
   7296                 /*
   7297                 if(doHiragana && hirResult == UCOL_EQUAL) {
   7298                 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
   7299                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
   7300                 ? UCOL_LESS:UCOL_GREATER;
   7301                 }
   7302                 }
   7303                 */
   7304                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
   7305                     break;
   7306                 } else {
   7307                     sOrder = 0;
   7308                     tOrder = 0;
   7309                     continue;
   7310                 }
   7311             } else {
   7312                 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
   7313                 goto commonReturn;
   7314             }
   7315         } /* no primary difference... do the rest from the buffers */
   7316     }
   7317 
   7318     /* now, we're gonna reexamine collected CEs */
   7319     uint32_t    *sCE;
   7320     uint32_t    *tCE;
   7321 
   7322     /* This is the secondary level of comparison */
   7323     if(checkSecTer) {
   7324         if(!isFrenchSec) { /* normal */
   7325             sCE = sCEs.buf;
   7326             tCE = tCEs.buf;
   7327             for(;;) {
   7328                 while (secS == 0) {
   7329                     secS = *(sCE++) & UCOL_SECONDARYMASK;
   7330                 }
   7331 
   7332                 while(secT == 0) {
   7333                     secT = *(tCE++) & UCOL_SECONDARYMASK;
   7334                 }
   7335 
   7336                 if(secS == secT) {
   7337                     if(secS == UCOL_NO_MORE_CES_SECONDARY) {
   7338                         break;
   7339                     } else {
   7340                         secS = 0; secT = 0;
   7341                         continue;
   7342                     }
   7343                 } else {
   7344                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7345                     goto commonReturn;
   7346                 }
   7347             }
   7348         } else { /* do the French */
   7349             uint32_t *sCESave = NULL;
   7350             uint32_t *tCESave = NULL;
   7351             sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
   7352             tCE = tCEs.pos-2;
   7353             for(;;) {
   7354                 while (secS == 0 && sCE >= sCEs.buf) {
   7355                     if(sCESave == NULL) {
   7356                         secS = *(sCE--);
   7357                         if(isContinuation(secS)) {
   7358                             while(isContinuation(secS = *(sCE--)))
   7359                                 ;
   7360                             /* after this, secS has the start of continuation, and sCEs points before that */
   7361                             sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7362                             sCE+=2;  /* need to point to the first continuation CP */
   7363                             /* However, now you can just continue doing stuff */
   7364                         }
   7365                     } else {
   7366                         secS = *(sCE++);
   7367                         if(!isContinuation(secS)) { /* This means we have finished with this cont */
   7368                             sCE = sCESave;            /* reset the pointer to before continuation */
   7369                             sCESave = NULL;
   7370                             secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
   7371                             continue;
   7372                         }
   7373                     }
   7374                     secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7375                 }
   7376 
   7377                 while(secT == 0 && tCE >= tCEs.buf) {
   7378                     if(tCESave == NULL) {
   7379                         secT = *(tCE--);
   7380                         if(isContinuation(secT)) {
   7381                             while(isContinuation(secT = *(tCE--)))
   7382                                 ;
   7383                             /* after this, secS has the start of continuation, and sCEs points before that */
   7384                             tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
   7385                             tCE+=2;  /* need to point to the first continuation CP */
   7386                             /* However, now you can just continue doing stuff */
   7387                         }
   7388                     } else {
   7389                         secT = *(tCE++);
   7390                         if(!isContinuation(secT)) { /* This means we have finished with this cont */
   7391                             tCE = tCESave;          /* reset the pointer to before continuation */
   7392                             tCESave = NULL;
   7393                             secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
   7394                             continue;
   7395                         }
   7396                     }
   7397                     secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
   7398                 }
   7399 
   7400                 if(secS == secT) {
   7401                     if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
   7402                         break;
   7403                     } else {
   7404                         secS = 0; secT = 0;
   7405                         continue;
   7406                     }
   7407                 } else {
   7408                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7409                     goto commonReturn;
   7410                 }
   7411             }
   7412         }
   7413     }
   7414 
   7415     /* doing the case bit */
   7416     if(checkCase) {
   7417         sCE = sCEs.buf;
   7418         tCE = tCEs.buf;
   7419         for(;;) {
   7420             while((secS & UCOL_REMOVE_CASE) == 0) {
   7421                 if(!isContinuation(*sCE++)) {
   7422                     secS =*(sCE-1);
   7423                     if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7424                         // primary ignorables should not be considered on the case level when the strength is primary
   7425                         // otherwise, the CEs stop being well-formed
   7426                         secS &= UCOL_TERT_CASE_MASK;
   7427                         secS ^= caseSwitch;
   7428                     } else {
   7429                         secS = 0;
   7430                     }
   7431                 } else {
   7432                     secS = 0;
   7433                 }
   7434             }
   7435 
   7436             while((secT & UCOL_REMOVE_CASE) == 0) {
   7437                 if(!isContinuation(*tCE++)) {
   7438                     secT = *(tCE-1);
   7439                     if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
   7440                         // primary ignorables should not be considered on the case level when the strength is primary
   7441                         // otherwise, the CEs stop being well-formed
   7442                         secT &= UCOL_TERT_CASE_MASK;
   7443                         secT ^= caseSwitch;
   7444                     } else {
   7445                         secT = 0;
   7446                     }
   7447                 } else {
   7448                     secT = 0;
   7449                 }
   7450             }
   7451 
   7452             if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
   7453                 result = UCOL_LESS;
   7454                 goto commonReturn;
   7455             } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
   7456                 result = UCOL_GREATER;
   7457                 goto commonReturn;
   7458             }
   7459 
   7460             if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
   7461                 break;
   7462             } else {
   7463                 secS = 0;
   7464                 secT = 0;
   7465             }
   7466         }
   7467     }
   7468 
   7469     /* Tertiary level */
   7470     if(checkTertiary) {
   7471         secS = 0;
   7472         secT = 0;
   7473         sCE = sCEs.buf;
   7474         tCE = tCEs.buf;
   7475         for(;;) {
   7476             while((secS & UCOL_REMOVE_CASE) == 0) {
   7477                 sOrder = *sCE++;
   7478                 secS = sOrder & tertiaryMask;
   7479                 if(!isContinuation(sOrder)) {
   7480                     secS ^= caseSwitch;
   7481                 } else {
   7482                     secS &= UCOL_REMOVE_CASE;
   7483                 }
   7484             }
   7485 
   7486             while((secT & UCOL_REMOVE_CASE)  == 0) {
   7487                 tOrder = *tCE++;
   7488                 secT = tOrder & tertiaryMask;
   7489                 if(!isContinuation(tOrder)) {
   7490                     secT ^= caseSwitch;
   7491                 } else {
   7492                     secT &= UCOL_REMOVE_CASE;
   7493                 }
   7494             }
   7495 
   7496             if(secS == secT) {
   7497                 if((secS & UCOL_REMOVE_CASE) == 1) {
   7498                     break;
   7499                 } else {
   7500                     secS = 0; secT = 0;
   7501                     continue;
   7502                 }
   7503             } else {
   7504                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7505                 goto commonReturn;
   7506             }
   7507         }
   7508     }
   7509 
   7510 
   7511     if(qShifted /*checkQuad*/) {
   7512         UBool sInShifted = TRUE;
   7513         UBool tInShifted = TRUE;
   7514         secS = 0;
   7515         secT = 0;
   7516         sCE = sCEs.buf;
   7517         tCE = tCEs.buf;
   7518         for(;;) {
   7519             while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
   7520                 secS = *(sCE++);
   7521                 if(isContinuation(secS)) {
   7522                     if(!sInShifted) {
   7523                         continue;
   7524                     }
   7525                 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
   7526                     secS = UCOL_PRIMARYMASK;
   7527                     sInShifted = FALSE;
   7528                 } else {
   7529                     sInShifted = TRUE;
   7530                 }
   7531             }
   7532             secS &= UCOL_PRIMARYMASK;
   7533 
   7534 
   7535             while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
   7536                 secT = *(tCE++);
   7537                 if(isContinuation(secT)) {
   7538                     if(!tInShifted) {
   7539                         continue;
   7540                     }
   7541                 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
   7542                     secT = UCOL_PRIMARYMASK;
   7543                     tInShifted = FALSE;
   7544                 } else {
   7545                     tInShifted = TRUE;
   7546                 }
   7547             }
   7548             secT &= UCOL_PRIMARYMASK;
   7549 
   7550             if(secS == secT) {
   7551                 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
   7552                     break;
   7553                 } else {
   7554                     secS = 0; secT = 0;
   7555                     continue;
   7556                 }
   7557             } else {
   7558                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
   7559                 goto commonReturn;
   7560             }
   7561         }
   7562     } else if(doHiragana && hirResult != UCOL_EQUAL) {
   7563         // If we're fine on quaternaries, we might be different
   7564         // on Hiragana. This, however, might fail us in shifted.
   7565         result = hirResult;
   7566         goto commonReturn;
   7567     }
   7568 
   7569     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
   7570     /*  as a tiebreaker if all else is equal.                                */
   7571     /*  Getting here  should be quite rare - strings are not identical -     */
   7572     /*     that is checked first, but compared == through all other checks.  */
   7573     if(checkIdent)
   7574     {
   7575         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
   7576         result = ucol_checkIdent(sColl, tColl, TRUE, status);
   7577     }
   7578 
   7579 commonReturn:
   7580     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
   7581         if (sCEs.buf != sCEs.localArray ) {
   7582             uprv_free(sCEs.buf);
   7583         }
   7584         if (tCEs.buf != tCEs.localArray ) {
   7585             uprv_free(tCEs.buf);
   7586         }
   7587     }
   7588 
   7589     return result;
   7590 }
   7591 
   7592 static UCollationResult
   7593 ucol_strcollRegular(const UCollator *coll,
   7594                     const UChar *source, int32_t sourceLength,
   7595                     const UChar *target, int32_t targetLength,
   7596                     UErrorCode *status) {
   7597     collIterate sColl, tColl;
   7598     // Preparing the context objects for iterating over strings
   7599     IInit_collIterate(coll, source, sourceLength, &sColl, status);
   7600     IInit_collIterate(coll, target, targetLength, &tColl, status);
   7601     if(U_FAILURE(*status)) {
   7602         return UCOL_LESS;
   7603     }
   7604     return ucol_strcollRegular(&sColl, &tColl, status);
   7605 }
   7606 
   7607 static inline uint32_t
   7608 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
   7609                           uint32_t CE, const UChar *s, int32_t *index, int32_t len)
   7610 {
   7611     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
   7612     int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
   7613     int32_t offset = 1;
   7614     UChar schar = 0, tchar = 0;
   7615 
   7616     for(;;) {
   7617         if(len == -1) {
   7618             if(s[*index] == 0) { // end of string
   7619                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   7620             } else {
   7621                 schar = s[*index];
   7622             }
   7623         } else {
   7624             if(*index == len) {
   7625                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   7626             } else {
   7627                 schar = s[*index];
   7628             }
   7629         }
   7630 
   7631         while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   7632             offset++;
   7633         }
   7634 
   7635         if (schar == tchar) {
   7636             (*index)++;
   7637             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
   7638         }
   7639         else
   7640         {
   7641             if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
   7642                 return UCOL_BAIL_OUT_CE;
   7643             }
   7644             // skip completely ignorables
   7645             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   7646             if(isZeroCE == 0) { // we have to ignore completely ignorables
   7647                 (*index)++;
   7648                 continue;
   7649             }
   7650 
   7651             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   7652         }
   7653     }
   7654 }
   7655 
   7656 
   7657 /**
   7658  * This is a fast strcoll, geared towards text in Latin-1.
   7659  * It supports contractions of size two, French secondaries
   7660  * and case switching. You can use it with strengths primary
   7661  * to tertiary. It does not support shifted and case level.
   7662  * It relies on the table built by setupLatin1Table. If it
   7663  * doesn't understand something, it will go to the regular
   7664  * strcoll.
         *
         * Each level is compared in its own pass over both strings. The pass for
         * the next level reads the next slice of coll->latinOneCEs, which is
         * coll->latinOneTableLen entries further on.
         *
         * @param coll    collator; strength, frenchCollation, latinOneCEs and
         *                latinOneTableLen are read from it
         * @param source  source text
         * @param sLen    length of source, or -1 if it is NUL-terminated
         * @param target  target text
         * @param tLen    length of target, or -1 if it is NUL-terminated
         * @param status  only forwarded to ucol_strcollRegular when falling back
         * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER, or whatever
         *         ucol_strcollRegular returns when this function falls back to it
   7665  */
   7666 static UCollationResult
   7667 ucol_strcollUseLatin1( const UCollator    *coll,
   7668               const UChar        *source,
   7669               int32_t            sLen,
   7670               const UChar        *target,
   7671               int32_t            tLen,
   7672               UErrorCode *status)
   7673 {
   7674     U_ALIGN_CODE(16);
   7675     int32_t strength = coll->strength;
   7676 
   7677     int32_t sIndex = 0, tIndex = 0;
   7678     UChar sChar = 0, tChar = 0;
   7679     uint32_t sOrder=0, tOrder=0;
   7680 
   7681     UBool endOfSource = FALSE;
   7682 
   7683     uint32_t *elements = coll->latinOneCEs;
   7684 
   7685     UBool haveContractions = FALSE; // if we have contractions in our string
   7686                                     // we cannot do French secondary
   7687 
   7688     // Do the primary level
   7689     for(;;) {
   7690         while(sOrder==0) { // this loop skips primary ignorables
   7691             // sOrder=getNextlatinOneCE(source);
   7692             if(sLen==-1) {   // handling zero terminated strings
   7693                 sChar=source[sIndex++];
   7694                 if(sChar==0) {
   7695                     endOfSource = TRUE;
   7696                     break;
   7697                 }
   7698             } else {        // handling strings with known length
   7699                 if(sIndex==sLen) {
   7700                     endOfSource = TRUE;
   7701                     break;
   7702                 }
   7703                 sChar=source[sIndex++];
   7704             }
   7705             if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   7706                 //fprintf(stderr, "R");
   7707                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7708             }
   7709             sOrder = elements[sChar];
   7710             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
   7711                 // specials can basically be either contractions or bail-out signs. If we get anything
   7712                 // else, we'll bail out anyway
   7713                 if(getCETag(sOrder) == CONTRACTION_TAG) {
   7714                     sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
   7715                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
   7716                     // However, if there are contractions in the table, but we always use just one char,
   7717                     // we might be able to do French. This should be checked out.
   7718                 }
   7719                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   7720                     //fprintf(stderr, "S");
   7721                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7722                 }
   7723             }
   7724         }
   7725 
   7726         while(tOrder==0) {  // this loop skips primary ignorables
   7727             // tOrder=getNextlatinOneCE(target);
   7728             if(tLen==-1) {    // handling zero terminated strings
   7729                 tChar=target[tIndex++];
   7730                 if(tChar==0) {
   7731                     if(endOfSource) { // this is different than source loop,
   7732                         // as we already know that source loop is done here,
   7733                         // so we can either finish the primary loop if both
   7734                         // strings are done or announce the result if only
   7735                         // target is done. Same below.
   7736                         goto endOfPrimLoop;
   7737                     } else {
   7738                         return UCOL_GREATER;
   7739                     }
   7740                 }
   7741             } else {          // handling strings with known length
   7742                 if(tIndex==tLen) {
   7743                     if(endOfSource) {
   7744                         goto endOfPrimLoop;
   7745                     } else {
   7746                         return UCOL_GREATER;
   7747                     }
   7748                 }
   7749                 tChar=target[tIndex++];
   7750             }
   7751             if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   7752                 //fprintf(stderr, "R");
   7753                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7754             }
   7755             tOrder = elements[tChar];
   7756             if(tOrder >= UCOL_NOT_FOUND) {
   7757                 // Handling specials, see the comments for source
   7758                 if(getCETag(tOrder) == CONTRACTION_TAG) {
   7759                     tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
   7760                     haveContractions = TRUE;
   7761                 }
   7762                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   7763                     //fprintf(stderr, "S");
   7764                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7765                 }
   7766             }
   7767         }
   7768         if(endOfSource) { // source is finished, but target is not, say the result.
   7769             return UCOL_LESS;
   7770         }
   7771 
   7772         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
   7773             sOrder = 0; tOrder = 0;
   7774             continue;
   7775         } else {
   7776             // compare current top bytes
   7777             if(((sOrder^tOrder)&0xFF000000)!=0) {
   7778                 // top bytes differ, return difference
   7779                 if(sOrder < tOrder) {
   7780                     return UCOL_LESS;
   7781                 } else if(sOrder > tOrder) {
   7782                     return UCOL_GREATER;
   7783                 }
   7784                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
   7785                 // since we must return enum value
   7786             }
   7787 
   7788             // top bytes match, continue with following bytes
                    // Shifting drops the byte that was just compared. A value that becomes 0
                    // makes the corresponding skip loop above fetch that string's next character,
                    // while a non-zero remainder is compared again on the next pass.
   7789             sOrder<<=8;
   7790             tOrder<<=8;
   7791         }
   7792     }
   7793 
   7794 endOfPrimLoop:
   7795     // after primary loop, we definitely know the sizes of strings,
   7796     // so we set it and use simpler loop for secondaries and tertiaries
            // NOTE(review): for NUL-terminated input the index was already incremented
            // past the terminator (source[sIndex++] is read before the zero check), so the
            // lengths recorded here count the terminator as well - confirm this is intended,
            // in particular for the French bail-out below that forwards these lengths.
   7797     sLen = sIndex; tLen = tIndex;
   7798     if(strength >= UCOL_SECONDARY) {
   7799         // adjust the table beginning
   7800         elements += coll->latinOneTableLen;
   7801         endOfSource = FALSE;
   7802 
   7803         if(coll->frenchCollation == UCOL_OFF) { // non French
   7804             // This loop is a simplified copy of primary loop
   7805             // at this point we know that whole strings are latin-1, so we don't
   7806             // check for that. We also know that we only have contractions as
   7807             // specials.
   7808             sIndex = 0; tIndex = 0;
   7809             for(;;) {
   7810                 while(sOrder==0) {
   7811                     if(sIndex==sLen) {
   7812                         endOfSource = TRUE;
   7813                         break;
   7814                     }
   7815                     sChar=source[sIndex++];
   7816                     sOrder = elements[sChar];
   7817                     if(sOrder > UCOL_NOT_FOUND) {
   7818                         sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
   7819                     }
   7820                 }
   7821 
   7822                 while(tOrder==0) {
   7823                     if(tIndex==tLen) {
   7824                         if(endOfSource) {
   7825                             goto endOfSecLoop;
   7826                         } else {
   7827                             return UCOL_GREATER;
   7828                         }
   7829                     }
   7830                     tChar=target[tIndex++];
   7831                     tOrder = elements[tChar];
   7832                     if(tOrder > UCOL_NOT_FOUND) {
   7833                         tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
   7834                     }
   7835                 }
   7836                 if(endOfSource) {
   7837                     return UCOL_LESS;
   7838                 }
   7839 
   7840                 if(sOrder == tOrder) {
   7841                     sOrder = 0; tOrder = 0;
   7842                     continue;
   7843                 } else {
   7844                     // see primary loop for comments on this
   7845                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   7846                         if(sOrder < tOrder) {
   7847                             return UCOL_LESS;
   7848                         } else if(sOrder > tOrder) {
   7849                             return UCOL_GREATER;
   7850                         }
   7851                     }
   7852                     sOrder<<=8;
   7853                     tOrder<<=8;
   7854                 }
   7855             }
   7856         } else { // French
   7857             if(haveContractions) { // if we have contractions, we have to bail out
   7858                 // since we don't really know how to handle them here
   7859                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
   7860             }
   7861             // For French, we go backwards
   7862             sIndex = sLen; tIndex = tLen;
   7863             for(;;) {
   7864                 while(sOrder==0) {
   7865                     if(sIndex==0) {
   7866                         endOfSource = TRUE;
   7867                         break;
   7868                     }
   7869                     sChar=source[--sIndex];
   7870                     sOrder = elements[sChar];
   7871                     // don't even look for contractions
   7872                 }
   7873 
   7874                 while(tOrder==0) {
   7875                     if(tIndex==0) {
   7876                         if(endOfSource) {
   7877                             goto endOfSecLoop;
   7878                         } else {
   7879                             return UCOL_GREATER;
   7880                         }
   7881                     }
   7882                     tChar=target[--tIndex];
   7883                     tOrder = elements[tChar];
   7884                     // don't even look for contractions
   7885                 }
   7886                 if(endOfSource) {
   7887                     return UCOL_LESS;
   7888                 }
   7889 
   7890                 if(sOrder == tOrder) {
   7891                     sOrder = 0; tOrder = 0;
   7892                     continue;
   7893                 } else {
   7894                     // see the primary loop for comments
   7895                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   7896                         if(sOrder < tOrder) {
   7897                             return UCOL_LESS;
   7898                         } else if(sOrder > tOrder) {
   7899                             return UCOL_GREATER;
   7900                         }
   7901                     }
   7902                     sOrder<<=8;
   7903                     tOrder<<=8;
   7904                 }
   7905             }
   7906         }
   7907     }
   7908 
   7909 endOfSecLoop:
   7910     if(strength >= UCOL_TERTIARY) {
   7911         // tertiary loop is the same as secondary (except no French)
   7912         elements += coll->latinOneTableLen;
   7913         sIndex = 0; tIndex = 0;
   7914         endOfSource = FALSE;
   7915         for(;;) {
   7916             while(sOrder==0) {
   7917                 if(sIndex==sLen) {
   7918                     endOfSource = TRUE;
   7919                     break;
   7920                 }
   7921                 sChar=source[sIndex++];
   7922                 sOrder = elements[sChar];
   7923                 if(sOrder > UCOL_NOT_FOUND) {
   7924                     sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
   7925                 }
   7926             }
   7927             while(tOrder==0) {
   7928                 if(tIndex==tLen) {
   7929                     if(endOfSource) {
   7930                         return UCOL_EQUAL; // if both strings are at the end, they are equal
   7931                     } else {
   7932                         return UCOL_GREATER;
   7933                     }
   7934                 }
   7935                 tChar=target[tIndex++];
   7936                 tOrder = elements[tChar];
   7937                 if(tOrder > UCOL_NOT_FOUND) {
   7938                     tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
   7939                 }
   7940             }
   7941             if(endOfSource) {
   7942                 return UCOL_LESS;
   7943             }
   7944             if(sOrder == tOrder) {
   7945                 sOrder = 0; tOrder = 0;
   7946                 continue;
   7947             } else {
                        // same byte-by-byte comparison as in the primary loop
   7948                 if(((sOrder^tOrder)&0xff000000)!=0) {
   7949                     if(sOrder < tOrder) {
   7950                         return UCOL_LESS;
   7951                     } else if(sOrder > tOrder) {
   7952                         return UCOL_GREATER;
   7953                     }
   7954                 }
   7955                 sOrder<<=8;
   7956                 tOrder<<=8;
   7957             }
   7958         }
   7959     }
            // strength below tertiary and no difference found on the levels that were compared
   7960     return UCOL_EQUAL;
   7961 }
   7962 
   7963 /*
   7964   Note: ucol_strcollUTF8 supports null terminated input. Calculating length of
   7965   null terminated input string takes extra amount of CPU cycles.
   7966 */
   7967 static UCollationResult
   7968 ucol_strcollRegularUTF8(
   7969                     const UCollator *coll,
   7970                     const char      *source,
   7971                     int32_t         sourceLength,
   7972                     const char      *target,
   7973                     int32_t         targetLength,
   7974                     UErrorCode      *status)
   7975 {
   7976     UCharIterator src;
   7977     UCharIterator tgt;
   7978 
   7979     uiter_setUTF8(&src, source, sourceLength);
   7980     uiter_setUTF8(&tgt, target, targetLength);
   7981 
   7982     // Preparing the context objects for iterating over strings
   7983     collIterate sColl, tColl;
   7984     IInit_collIterate(coll, NULL, -1, &sColl, status);
   7985     IInit_collIterate(coll, NULL, -1, &tColl, status);
   7986     if(U_FAILURE(*status)) {
   7987         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   7988         return UCOL_EQUAL;
   7989     }
   7990     // The division for the array length may truncate the array size to
   7991     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   7992     // for all platforms anyway.
   7993     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   7994     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   7995     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
   7996 
   7997     sColl.iterator = &src;
   7998     sColl.flags |= UCOL_USE_ITERATOR;
   7999     tColl.flags |= UCOL_USE_ITERATOR;
   8000     tColl.iterator = &tgt;
   8001 
   8002     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
   8003         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   8004         sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
   8005         sColl.flags &= ~UCOL_ITER_NORM;
   8006 
   8007         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   8008         tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
   8009         tColl.flags &= ~UCOL_ITER_NORM;
   8010     }
   8011 
   8012     return ucol_strcollRegular(&sColl, &tColl, status);
   8013 }
   8014 
   8015 static inline uint32_t
   8016 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
   8017                           uint32_t CE, const char *s, int32_t *index, int32_t len)
   8018 {
   8019     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
   8020     int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
   8021     int32_t offset = 1;
   8022     UChar32 schar = 0, tchar = 0;
   8023 
   8024     for(;;) {
   8025         if (*index == len) {
   8026             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   8027         }
   8028         U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar);
   8029         if (len < 0 && schar == 0) {
   8030             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   8031         }
   8032 
   8033         while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
   8034             offset++;
   8035         }
   8036 
   8037         if (schar == tchar) {
   8038             U8_FWD_1(s, *index, len);
   8039             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
   8040         }
   8041         else
   8042         {
   8043             if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
   8044                 return UCOL_BAIL_OUT_CE;
   8045             }
   8046             // skip completely ignorables
   8047             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
   8048             if(isZeroCE == 0) { // we have to ignore completely ignorables
   8049                 U8_FWD_1(s, *index, len);
   8050                 continue;
   8051             }
   8052 
   8053             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
   8054         }
   8055     }
   8056 }
   8057 
   8058 static inline UCollationResult
   8059 ucol_strcollUseLatin1UTF8(
   8060                 const UCollator *coll,
   8061                 const char      *source,
   8062                 int32_t         sLen,
   8063                 const char      *target,
   8064                 int32_t         tLen,
   8065                 UErrorCode      *status)
   8066 {
            // Fast-path comparison of two UTF-8 strings using the collator's
            // precomputed coll->latinOneCEs table.
            //
            // The strings are compared level by level: primary always, secondary
            // when strength >= UCOL_SECONDARY, tertiary when strength >= UCOL_TERTIARY.
            // For each level the table pointer 'elements' is advanced by
            // coll->latinOneTableLen. Within a level the CEs of both strings are
            // compared one byte at a time, starting with the top byte.
            //
            // A negative sLen/tLen means the corresponding string is NUL-terminated.
            //
            // Whenever something is met that this path cannot handle (a code point
            // above 0xFF, a special CE that is not a resolvable contraction, or
            // French secondary ordering combined with contractions), the whole
            // comparison is handed to ucol_strcollRegularUTF8().
            //
            // Returns UCOL_LESS, UCOL_EQUAL or UCOL_GREATER; status is only passed
            // on to ucol_strcollRegularUTF8().
   8067     U_ALIGN_CODE(16);
   8068     int32_t strength = coll->strength;
   8069
   8070     int32_t sIndex = 0, tIndex = 0;
   8071     UChar32 sChar = 0, tChar = 0;
            // Current CE of each string; 0 means "fetch the next one".
   8072     uint32_t sOrder=0, tOrder=0;
   8073
   8074     UBool endOfSource = FALSE;
   8075
   8076     uint32_t *elements = coll->latinOneCEs;
   8077
   8078     UBool haveContractions = FALSE; // if we have contractions in our string
   8079                                     // we cannot do French secondary
   8080
   8081     // Do the primary level
   8082     for(;;) {
   8083         while(sOrder==0) { // this loop skips primary ignorables
   8084             // sOrder=getNextlatinOneCE(source);
   8085             if (sIndex == sLen) {
   8086                 endOfSource = TRUE;
   8087                 break;
   8088             }
   8089             U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
   8090             if (sLen < 0 && sChar == 0) {
                        // Reached the terminator of a NUL-terminated source; from
                        // now on the length is known.
   8091                 endOfSource = TRUE;
   8092                 sLen = sIndex;
   8093                 break;
   8094             }
   8095             if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   8096                 //fprintf(stderr, "R");
   8097                 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
   8098             }
   8099             sOrder = elements[sChar];
   8100             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
   8101                 // specials can basically be either contractions or bail-out signs. If we get anything
   8102                 // else, we'll bail out anyway
   8103                 if(getCETag(sOrder) == CONTRACTION_TAG) {
   8104                     sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
   8105                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
   8106                     // However, if there are contractions in the table, but we always use just one char,
   8107                     // we might be able to do French. This should be checked out.
   8108                 }
   8109                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   8110                     //fprintf(stderr, "S");
   8111                     return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
   8112                 }
   8113             }
   8114         }
   8115
   8116         while(tOrder==0) {  // this loop skips primary ignorables
   8117             // tOrder=getNextlatinOneCE(target);
   8118             if (tIndex == tLen) {
   8119                 if(endOfSource) {
   8120                     goto endOfPrimLoopU8;
   8121                 } else {
   8122                     return UCOL_GREATER;
   8123                 }
   8124             }
   8125             U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
   8126             if (tLen < 0 && tChar == 0) {
   8127                 if(endOfSource) {
   8128                     tLen = tIndex;
   8129                     goto endOfPrimLoopU8;
   8130                 } else {
   8131                     return UCOL_GREATER;
   8132                 }
   8133             }
   8134             if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
   8135                 //fprintf(stderr, "R");
   8136                 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
   8137             }
   8138             tOrder = elements[tChar];
   8139             if(tOrder >= UCOL_NOT_FOUND) {
   8140                 // Handling specials, see the comments for source
   8141                 if(getCETag(tOrder) == CONTRACTION_TAG) {
   8142                     tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
   8143                     haveContractions = TRUE;
   8144                 }
   8145                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
   8146                     //fprintf(stderr, "S");
   8147                     return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
   8148                 }
   8149             }
   8150         }
   8151         if(endOfSource) { // source is finished, but target is not, say the result.
   8152             return UCOL_LESS;
   8153         }
   8154
   8155         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
   8156             sOrder = 0; tOrder = 0;
   8157             continue;
   8158         } else {
   8159             // compare current top bytes
   8160             if(((sOrder^tOrder)&0xFF000000)!=0) {
   8161                 // top bytes differ, return difference
   8162                 if(sOrder < tOrder) {
   8163                     return UCOL_LESS;
   8164                 } else if(sOrder > tOrder) {
   8165                     return UCOL_GREATER;
   8166                 }
   8167                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
   8168                 // since we must return enum value
   8169             }
   8170
   8171             // top bytes match, continue with following bytes
   8172             sOrder<<=8;
   8173             tOrder<<=8;
   8174         }
   8175     }
   8176
   8177 endOfPrimLoopU8:
   8178     // after primary loop, we definitely know the sizes of strings,
   8179     // so we set it and use simpler loop for secondaries and tertiaries
            // NOTE(review): for NUL-terminated input the index has already been
            // moved past the terminator when we get here, so the stored length
            // appears to include the NUL byte. Presumably harmless if U+0000 maps
            // to CE 0 on every level -- confirm.
   8180     sLen = sIndex; tLen = tIndex;
   8181     if(strength >= UCOL_SECONDARY) {
   8182         // adjust the table beginning
   8183         elements += coll->latinOneTableLen;
   8184         endOfSource = FALSE;
   8185
   8186         if(coll->frenchCollation == UCOL_OFF) { // non French
   8187             // This loop is a simplified copy of primary loop
   8188             // at this point we know that whole strings are latin-1, so we don't
   8189             // check for that. We also know that we only have contractions as
   8190             // specials.
   8191             sIndex = 0; tIndex = 0;
   8192             for(;;) {
   8193                 while(sOrder==0) {
   8194                     if(sIndex==sLen) {
   8195                         endOfSource = TRUE;
   8196                         break;
   8197                     }
   8198                     U_ASSERT(sLen >= 0);
   8199                     U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
   8200                     U_ASSERT(sChar >= 0 && sChar <= 0xFF);
   8201                     sOrder = elements[sChar];
   8202                     if(sOrder > UCOL_NOT_FOUND) {
   8203                         sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
   8204                     }
   8205                 }
   8206
   8207                 while(tOrder==0) {
   8208                     if(tIndex==tLen) {
   8209                         if(endOfSource) {
   8210                             goto endOfSecLoopU8;
   8211                         } else {
   8212                             return UCOL_GREATER;
   8213                         }
   8214                     }
   8215                     U_ASSERT(tLen >= 0);
   8216                     U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
   8217                     U_ASSERT(tChar >= 0 && tChar <= 0xFF);
   8218                     tOrder = elements[tChar];
   8219                     if(tOrder > UCOL_NOT_FOUND) {
   8220                         tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
   8221                     }
   8222                 }
   8223                 if(endOfSource) {
   8224                     return UCOL_LESS;
   8225                 }
   8226
   8227                 if(sOrder == tOrder) {
   8228                     sOrder = 0; tOrder = 0;
   8229                     continue;
   8230                 } else {
   8231                     // see primary loop for comments on this
   8232                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   8233                         if(sOrder < tOrder) {
   8234                             return UCOL_LESS;
   8235                         } else if(sOrder > tOrder) {
   8236                             return UCOL_GREATER;
   8237                         }
   8238                     }
   8239                     sOrder<<=8;
   8240                     tOrder<<=8;
   8241                 }
   8242             }
   8243         } else { // French
   8244             if(haveContractions) { // if we have contractions, we have to bail out
   8245                 // since we don't really know how to handle them here
   8246                 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
   8247             }
   8248             // For French, we go backwards
   8249             sIndex = sLen; tIndex = tLen;
   8250             for(;;) {
   8251                 while(sOrder==0) {
   8252                     if(sIndex==0) {
   8253                         endOfSource = TRUE;
   8254                         break;
   8255                     }
   8256                     U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
   8257                     U_ASSERT(sChar >= 0 && sChar <= 0xFF);
   8258                     sOrder = elements[sChar];
   8259                     // don't even look for contractions
   8260                 }
   8261
   8262                 while(tOrder==0) {
   8263                     if(tIndex==0) {
   8264                         if(endOfSource) {
   8265                             goto endOfSecLoopU8;
   8266                         } else {
   8267                             return UCOL_GREATER;
   8268                         }
   8269                     }
   8270                     U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
   8271                     U_ASSERT(tChar >= 0 && tChar <= 0xFF);
   8272                     tOrder = elements[tChar];
   8273                     // don't even look for contractions
   8274                 }
   8275                 if(endOfSource) {
   8276                     return UCOL_LESS;
   8277                 }
   8278
   8279                 if(sOrder == tOrder) {
   8280                     sOrder = 0; tOrder = 0;
   8281                     continue;
   8282                 } else {
   8283                     // see the primary loop for comments
   8284                     if(((sOrder^tOrder)&0xFF000000)!=0) {
   8285                         if(sOrder < tOrder) {
   8286                             return UCOL_LESS;
   8287                         } else if(sOrder > tOrder) {
   8288                             return UCOL_GREATER;
   8289                         }
   8290                     }
   8291                     sOrder<<=8;
   8292                     tOrder<<=8;
   8293                 }
   8294             }
   8295         }
   8296     }
   8297
   8298 endOfSecLoopU8:
   8299     if(strength >= UCOL_TERTIARY) {
   8300         // tertiary loop is the same as secondary (except no French)
   8301         elements += coll->latinOneTableLen;
   8302         sIndex = 0; tIndex = 0;
   8303         endOfSource = FALSE;
   8304         for(;;) {
   8305             while(sOrder==0) {
   8306                 if(sIndex==sLen) {
   8307                     endOfSource = TRUE;
   8308                     break;
   8309                 }
   8310                 U_ASSERT(sLen >= 0);
   8311                 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
   8312                 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
   8313                 sOrder = elements[sChar];
   8314                 if(sOrder > UCOL_NOT_FOUND) {
   8315                     sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
   8316                 }
   8317             }
   8318             while(tOrder==0) {
   8319                 if(tIndex==tLen) {
   8320                     if(endOfSource) {
   8321                         return UCOL_EQUAL; // if both strings are at the end, they are equal
   8322                     } else {
   8323                         return UCOL_GREATER;
   8324                     }
   8325                 }
   8326                 U_ASSERT(tLen >= 0);
   8327                 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
   8328                 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
   8329                 tOrder = elements[tChar];
   8330                 if(tOrder > UCOL_NOT_FOUND) {
   8331                     tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
   8332                 }
   8333             }
   8334             if(endOfSource) {
   8335                 return UCOL_LESS;
   8336             }
   8337             if(sOrder == tOrder) {
   8338                 sOrder = 0; tOrder = 0;
   8339                 continue;
   8340             } else {
   8341                 if(((sOrder^tOrder)&0xff000000)!=0) {
   8342                     if(sOrder < tOrder) {
   8343                         return UCOL_LESS;
   8344                     } else if(sOrder > tOrder) {
   8345                         return UCOL_GREATER;
   8346                     }
   8347                 }
   8348                 sOrder<<=8;
   8349                 tOrder<<=8;
   8350             }
   8351         }
   8352     }
            // Reached when the strength is below tertiary and no lower level
            // found a difference.
   8353     return UCOL_EQUAL;
   8354 }
   8355 
   8356 U_CAPI UCollationResult U_EXPORT2
   8357 ucol_strcollIter( const UCollator    *coll,
   8358                  UCharIterator *sIter,
   8359                  UCharIterator *tIter,
   8360                  UErrorCode         *status)
   8361 {
   8362     if(!status || U_FAILURE(*status)) {
   8363         return UCOL_EQUAL;
   8364     }
   8365 
   8366     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
   8367     UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
   8368 
   8369     if (sIter == tIter) {
   8370         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8371         return UCOL_EQUAL;
   8372     }
   8373     if(sIter == NULL || tIter == NULL || coll == NULL) {
   8374         *status = U_ILLEGAL_ARGUMENT_ERROR;
   8375         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8376         return UCOL_EQUAL;
   8377     }
   8378 
   8379     UCollationResult result = UCOL_EQUAL;
   8380 
   8381     // Preparing the context objects for iterating over strings
   8382     collIterate sColl, tColl;
   8383     IInit_collIterate(coll, NULL, -1, &sColl, status);
   8384     IInit_collIterate(coll, NULL, -1, &tColl, status);
   8385     if(U_FAILURE(*status)) {
   8386         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
   8387         return UCOL_EQUAL;
   8388     }
   8389     // The division for the array length may truncate the array size to
   8390     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
   8391     // for all platforms anyway.
   8392     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8393     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
   8394     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
   8395 
   8396     sColl.iterator = sIter;
   8397     sColl.flags |= UCOL_USE_ITERATOR;
   8398     tColl.flags |= UCOL_USE_ITERATOR;
   8399     tColl.iterator = tIter;
   8400 
   8401     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
   8402         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
   8403         sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
   8404         sColl.flags &= ~UCOL_ITER_NORM;
   8405 
   8406         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
   8407         tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
   8408         tColl.flags &= ~UCOL_ITER_NORM;
   8409     }
   8410 
   8411     UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
   8412 
   8413     while((sChar = sColl.iterator->next(sColl.iterator)) ==
   8414         (tChar = tColl.iterator->next(tColl.iterator))) {
   8415             if(sChar == U_SENTINEL) {
   8416                 result = UCOL_EQUAL;
   8417                 goto end_compare;
   8418             }
   8419     }
   8420 
   8421     if(sChar == U_SENTINEL) {
   8422         tChar = tColl.iterator->previous(tColl.iterator);
   8423     }
   8424 
   8425     if(tChar == U_SENTINEL) {
   8426         sChar = sColl.iterator->previous(sColl.iterator);
   8427     }
   8428 
   8429     sChar = sColl.iterator->previous(sColl.iterator);
   8430     tChar = tColl.iterator->previous(tColl.iterator);
   8431 
   8432     if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
   8433     {
   8434         // We are stopped in the middle of a contraction.
   8435         // Scan backwards through the == part of the string looking for the start of the contraction.
   8436         //   It doesn't matter which string we scan, since they are the same in this region.
   8437         do
   8438         {
   8439             sChar = sColl.iterator->previous(sColl.iterator);
   8440             tChar = tColl.iterator->previous(tColl.iterator);
   8441         }
   8442         while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
   8443     }
   8444 
   8445 
   8446     if(U_SUCCESS(*status)) {
   8447         result = ucol_strcollRegular(&sColl, &tColl, status);
   8448     }
   8449 
   8450 end_compare:
   8451     if(sNormIter || tNormIter) {
   8452         unorm_closeIter(sNormIter);
   8453         unorm_closeIter(tNormIter);
   8454     }
   8455 
   8456     UTRACE_EXIT_VALUE_STATUS(result, *status)
   8457     return result;
   8458 }
   8459 
   8460 
   8461 /*                                                                      */
   8462 /* ucol_strcoll     Main public API string comparison function          */
   8463 /*                                                                      */
   8464 U_CAPI UCollationResult U_EXPORT2
   8465 ucol_strcoll( const UCollator    *coll,
   8466               const UChar        *source,
   8467               int32_t            sourceLength,
   8468               const UChar        *target,
   8469               int32_t            targetLength)
   8470 {
   8471     U_ALIGN_CODE(16);
   8472 
   8473     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
   8474     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   8475         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
   8476         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
   8477         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
   8478     }
   8479 
   8480     if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) {
   8481         // do not crash, but return. Should have
   8482         // status argument to return error.
   8483         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8484         return UCOL_EQUAL;
   8485     }
   8486 
   8487     /* Quick check if source and target are same strings. */
   8488     /* They should either both be NULL terminated or the explicit length should be set on both. */
   8489     if (source==target && sourceLength==targetLength) {
   8490         UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8491         return UCOL_EQUAL;
   8492     }
   8493 
   8494     if(coll->delegate != NULL) {
   8495       UErrorCode status = U_ZERO_ERROR;
   8496       return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
   8497     }
   8498 
   8499     /* Scan the strings.  Find:                                                             */
   8500     /*    The length of any leading portion that is equal                                   */
   8501     /*    Whether they are exactly equal.  (in which case we just return)                   */
   8502     const UChar    *pSrc    = source;
   8503     const UChar    *pTarg   = target;
   8504     int32_t        equalLength;
   8505 
   8506     if (sourceLength == -1 && targetLength == -1) {
   8507         // Both strings are null terminated.
   8508         //    Scan through any leading equal portion.
   8509         while (*pSrc == *pTarg && *pSrc != 0) {
   8510             pSrc++;
   8511             pTarg++;
   8512         }
   8513         if (*pSrc == 0 && *pTarg == 0) {
   8514             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8515             return UCOL_EQUAL;
   8516         }
   8517         equalLength = (int32_t)(pSrc - source);
   8518     }
   8519     else
   8520     {
   8521         // One or both strings has an explicit length.
   8522         const UChar    *pSrcEnd = source + sourceLength;
   8523         const UChar    *pTargEnd = target + targetLength;
   8524 
   8525         // Scan while the strings are bitwise ==, or until one is exhausted.
   8526         for (;;) {
   8527             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
   8528                 break;
   8529             }
   8530             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
   8531                 break;
   8532             }
   8533             if (*pSrc != *pTarg) {
   8534                 break;
   8535             }
   8536             pSrc++;
   8537             pTarg++;
   8538         }
   8539         equalLength = (int32_t)(pSrc - source);
   8540 
   8541         // If we made it all the way through both strings, we are done.  They are ==
   8542         if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
   8543             (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
   8544         {
   8545             UTRACE_EXIT_VALUE(UCOL_EQUAL);
   8546             return UCOL_EQUAL;
   8547         }
   8548     }
   8549     if (equalLength > 0) {
   8550         /* There is an identical portion at the beginning of the two strings.        */
   8551         /*   If the identical portion ends within a contraction or a comibining      */
   8552         /*   character sequence, back up to the start of that sequence.              */
   8553 
   8554         // These values should already be set by the code above.
   8555         //pSrc  = source + equalLength;        /* point to the first differing chars   */
   8556         //pTarg = target + equalLength;
   8557         if ((pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
   8558             (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
   8559         {
   8560             // We are stopped in the middle of a contraction.
   8561             // Scan backwards through the == part of the string looking for the start of the contraction.
   8562             //   It doesn't matter which string we scan, since they are the same in this region.
   8563             do
   8564             {
   8565                 equalLength--;
   8566                 pSrc--;
   8567             }
   8568             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
   8569         }
   8570 
   8571         source += equalLength;
   8572         target += equalLength;
   8573         if (sourceLength > 0) {
   8574             sourceLength -= equalLength;
   8575         }
   8576         if (targetLength > 0) {
   8577             targetLength -= equalLength;
   8578         }
   8579     }
   8580 
   8581     UErrorCode status = U_ZERO_ERROR;
   8582     UCollationResult returnVal;
   8583     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
   8584         returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
   8585     } else {
   8586         returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
   8587     }
   8588     UTRACE_EXIT_VALUE(returnVal);
   8589     return returnVal;
   8590 }
   8591 
   8592 U_CAPI UCollationResult U_EXPORT2
   8593 ucol_strcollUTF8(
   8594         const UCollator *coll,
   8595         const char      *source,
   8596         int32_t         sourceLength,
   8597         const char      *target,
   8598         int32_t         targetLength,
   8599         UErrorCode      *status)
   8600 {
   8601     U_ALIGN_CODE(16);
   8602 
   8603     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
   8604     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
   8605         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
   8606         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
   8607         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
   8608     }
   8609 
   8610     if (U_FAILURE(*status)) {
   8611         /* do nothing */
   8612         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
   8613         return UCOL_EQUAL;
   8614     }
   8615 
   8616     if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) {
   8617         *status = U_ILLEGAL_ARGUMENT_ERROR;
   8618         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
   8619         return UCOL_EQUAL;
   8620     }
   8621 
   8622     /* Quick check if source and target are same strings. */
   8623     /* They should either both be NULL terminated or the explicit length should be set on both. */
   8624     if (source==target && sourceLength==targetLength) {
   8625         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
   8626         return UCOL_EQUAL;
   8627     }
   8628 
   8629     if(coll->delegate != NULL) {
   8630         return ((const Collator*)coll->delegate)->compareUTF8(
   8631             StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength),
   8632             StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength),
   8633             *status);
   8634     }
   8635 
   8636     /* Scan the strings.  Find:                                                             */
   8637     /*    The length of any leading portion that is equal                                   */
   8638     /*    Whether they are exactly equal.  (in which case we just return)                   */
   8639     const char  *pSrc = source;
   8640     const char  *pTarg = target;
   8641     UBool       bSrcLimit = FALSE;
   8642     UBool       bTargLimit = FALSE;
   8643 
   8644     if (sourceLength == -1 && targetLength == -1) {
   8645         // Both strings are null terminated.
   8646         //    Scan through any leading equal portion.
   8647         while (*pSrc == *pTarg && *pSrc != 0) {
   8648             pSrc++;
   8649             pTarg++;
   8650         }
   8651         if (*pSrc == 0 && *pTarg == 0) {
   8652             UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
   8653             return UCOL_EQUAL;
   8654         }
   8655         bSrcLimit = (*pSrc == 0);
   8656         bTargLimit = (*pTarg == 0);
   8657     }
   8658     else
   8659     {
   8660         // One or both strings has an explicit length.
   8661         const char *pSrcEnd = source + sourceLength;
   8662         const char *pTargEnd = target + targetLength;
   8663 
   8664         // Scan while the strings are bitwise ==, or until one is exhausted.
   8665         for (;;) {
   8666             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
   8667                 break;
   8668             }
   8669             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
   8670                 break;
   8671             }
   8672             if (*pSrc != *pTarg) {
   8673                 break;
   8674             }
   8675             pSrc++;
   8676             pTarg++;
   8677         }
   8678         bSrcLimit = (pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0));
   8679         bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));
   8680 
   8681         // If we made it all the way through both strings, we are done.  They are ==
   8682         if (bSrcLimit &&    /* At end of src string, however it was specified. */
   8683             bTargLimit)     /* and also at end of dest string                  */
   8684         {
   8685             UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
   8686             return UCOL_EQUAL;
   8687         }
   8688     }
   8689 
   8690     U_ASSERT(!(bSrcLimit && bTargLimit));
   8691 
   8692     int32_t    equalLength = pSrc - source;
   8693     UBool       bSawNonLatin1 = FALSE;
   8694 
   8695     if (equalLength > 0) {
   8696         // Align position to the start of UTF-8 code point.
   8697         if (bTargLimit) {
   8698             U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
   8699         } else {
   8700             U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
   8701         }
   8702         pSrc = source + equalLength;
   8703         pTarg = target + equalLength;
   8704     }
   8705 
   8706     if (equalLength > 0) {
   8707         /* There is an identical portion at the beginning of the two strings.        */
   8708         /*   If the identical portion ends within a contraction or a comibining      */
   8709         /*   character sequence, back up to the start of that sequence.              */
   8710         UBool bUnsafeCP = FALSE;
   8711         UChar32 uc32 = -1;
   8712 
   8713         if (!bSrcLimit) {
   8714             U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32);
   8715             if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
   8716                 bUnsafeCP = TRUE;
   8717             }
   8718             bSawNonLatin1 |= (uc32 > 0xff);
   8719         }
   8720         if (!bTargLimit) {
   8721             U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32);
   8722             if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
   8723                 bUnsafeCP = TRUE;
   8724             }
   8725             bSawNonLatin1 |= (uc32 > 0xff);
   8726         }
   8727 
   8728         if (bUnsafeCP) {
   8729             while (equalLength > 0) {
   8730                 // We are stopped in the middle of a contraction.
   8731                 // Scan backwards through the == part of the string looking for the start of the contraction.
   8732                 //   It doesn't matter which string we scan, since they are the same in this region.
   8733                 U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32);
   8734                 bSawNonLatin1 |= (uc32 > 0xff);
   8735                 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
   8736                     break;
   8737                 }
   8738             }
   8739         }
   8740         source += equalLength;
   8741         target += equalLength;
   8742         if (sourceLength > 0) {
   8743             sourceLength -= equalLength;
   8744         }
   8745         if (targetLength > 0) {
   8746             targetLength -= equalLength;
   8747         }
   8748     } else {
   8749         // Lead byte of Latin 1 character is 0x00 - 0xC3
   8750         bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3);
   8751         bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3);
   8752     }
   8753 
   8754     UCollationResult returnVal;
   8755 
   8756     if(!coll->latinOneUse || bSawNonLatin1) {
   8757         returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
   8758     } else {
   8759         returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status);
   8760     }
   8761     UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
   8762     return returnVal;
   8763 }
   8764 
   8765 
   8766 /* convenience function for comparing strings */
   8767 U_CAPI UBool U_EXPORT2
   8768 ucol_greater(    const    UCollator        *coll,
   8769         const    UChar            *source,
   8770         int32_t            sourceLength,
   8771         const    UChar            *target,
   8772         int32_t            targetLength)
   8773 {
   8774     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8775         == UCOL_GREATER);
   8776 }
   8777 
   8778 /* convenience function for comparing strings */
   8779 U_CAPI UBool U_EXPORT2
   8780 ucol_greaterOrEqual(    const    UCollator    *coll,
   8781             const    UChar        *source,
   8782             int32_t        sourceLength,
   8783             const    UChar        *target,
   8784             int32_t        targetLength)
   8785 {
   8786     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8787         != UCOL_LESS);
   8788 }
   8789 
   8790 /* convenience function for comparing strings */
   8791 U_CAPI UBool U_EXPORT2
   8792 ucol_equal(        const    UCollator        *coll,
   8793             const    UChar            *source,
   8794             int32_t            sourceLength,
   8795             const    UChar            *target,
   8796             int32_t            targetLength)
   8797 {
   8798     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
   8799         == UCOL_EQUAL);
   8800 }
   8801 
   8802 U_CAPI void U_EXPORT2
   8803 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
   8804     if(coll && coll->UCA) {
   8805         uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
   8806     }
   8807 }
   8808 
   8809 #endif /* #if !UCONFIG_NO_COLLATION */
   8810