Home | History | Annotate | Download | only in i18n
      1 /*
      2 ******************************************************************************
      3 *   Copyright (C) 2001-2014, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 ******************************************************************************
      6 *
      7 * File ucoleitr.cpp
      8 *
      9 * Modification History:
     10 *
     11 * Date        Name        Description
     12 * 02/15/2001  synwee      Modified all methods to process its own function
     13 *                         instead of calling the equivalent c++ api (coleitr.h)
     14 * 2012-2014   markus      Rewritten in C++ again.
     15 ******************************************************************************/
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/coleitr.h"
     22 #include "unicode/tblcoll.h"
     23 #include "unicode/ucoleitr.h"
     24 #include "unicode/ustring.h"
     25 #include "unicode/sortkey.h"
     26 #include "unicode/uobject.h"
     27 #include "cmemory.h"
     28 #include "usrchimp.h"
     29 
     30 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     31 
     32 U_NAMESPACE_USE
     33 
     34 #define BUFFER_LENGTH             100
     35 
     36 #define DEFAULT_BUFFER_SIZE 16
     37 #define BUFFER_GROW 8
     38 
     39 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
     40 
     41 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
     42 
     43 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
     44 
     45 #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
     46 
     47 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     48 
     49 struct RCEI
     50 {
     51     uint32_t ce;
     52     int32_t  low;
     53     int32_t  high;
     54 };
     55 
     56 U_NAMESPACE_BEGIN
     57 
     58 struct RCEBuffer
     59 {
     60     RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
     61     RCEI   *buffer;
     62     int32_t bufferIndex;
     63     int32_t bufferSize;
     64 
     65     RCEBuffer();
     66     ~RCEBuffer();
     67 
     68     UBool empty() const;
     69     void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
     70     const RCEI *get();
     71 };
     72 
     73 RCEBuffer::RCEBuffer()
     74 {
     75     buffer = defaultBuffer;
     76     bufferIndex = 0;
     77     bufferSize = LENGTHOF(defaultBuffer);
     78 }
     79 
     80 RCEBuffer::~RCEBuffer()
     81 {
     82     if (buffer != defaultBuffer) {
     83         DELETE_ARRAY(buffer);
     84     }
     85 }
     86 
     87 UBool RCEBuffer::empty() const
     88 {
     89     return bufferIndex <= 0;
     90 }
     91 
     92 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
     93 {
     94     if (bufferIndex >= bufferSize) {
     95         RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
     96 
     97         ARRAY_COPY(newBuffer, buffer, bufferSize);
     98 
     99         if (buffer != defaultBuffer) {
    100             DELETE_ARRAY(buffer);
    101         }
    102 
    103         buffer = newBuffer;
    104         bufferSize += BUFFER_GROW;
    105     }
    106 
    107     buffer[bufferIndex].ce   = ce;
    108     buffer[bufferIndex].low  = ixLow;
    109     buffer[bufferIndex].high = ixHigh;
    110 
    111     bufferIndex += 1;
    112 }
    113 
    114 const RCEI *RCEBuffer::get()
    115 {
    116     if (bufferIndex > 0) {
    117      return &buffer[--bufferIndex];
    118     }
    119 
    120     return NULL;
    121 }
    122 
    123 PCEBuffer::PCEBuffer()
    124 {
    125     buffer = defaultBuffer;
    126     bufferIndex = 0;
    127     bufferSize = LENGTHOF(defaultBuffer);
    128 }
    129 
    130 PCEBuffer::~PCEBuffer()
    131 {
    132     if (buffer != defaultBuffer) {
    133         DELETE_ARRAY(buffer);
    134     }
    135 }
    136 
    137 void PCEBuffer::reset()
    138 {
    139     bufferIndex = 0;
    140 }
    141 
    142 UBool PCEBuffer::empty() const
    143 {
    144     return bufferIndex <= 0;
    145 }
    146 
    147 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
    148 {
    149     if (bufferIndex >= bufferSize) {
    150         PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
    151 
    152         ARRAY_COPY(newBuffer, buffer, bufferSize);
    153 
    154         if (buffer != defaultBuffer) {
    155             DELETE_ARRAY(buffer);
    156         }
    157 
    158         buffer = newBuffer;
    159         bufferSize += BUFFER_GROW;
    160     }
    161 
    162     buffer[bufferIndex].ce   = ce;
    163     buffer[bufferIndex].low  = ixLow;
    164     buffer[bufferIndex].high = ixHigh;
    165 
    166     bufferIndex += 1;
    167 }
    168 
    169 const PCEI *PCEBuffer::get()
    170 {
    171     if (bufferIndex > 0) {
    172      return &buffer[--bufferIndex];
    173     }
    174 
    175     return NULL;
    176 }
    177 
    178 UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
    179 
    180 UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
    181 
    182 void UCollationPCE::init(UCollationElements *elems) {
    183     init(CollationElementIterator::fromUCollationElements(elems));
    184 }
    185 
    186 void UCollationPCE::init(CollationElementIterator *iter)
    187 {
    188     cei = iter;
    189     init(*iter->rbc_);
    190 }
    191 
    192 void UCollationPCE::init(const Collator &coll)
    193 {
    194     UErrorCode status = U_ZERO_ERROR;
    195 
    196     strength    = coll.getAttribute(UCOL_STRENGTH, status);
    197     toShift     = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
    198     isShifted   = FALSE;
    199     variableTop = coll.getVariableTop(status);
    200 }
    201 
    202 UCollationPCE::~UCollationPCE()
    203 {
    204     // nothing to do
    205 }
    206 
    207 uint64_t UCollationPCE::processCE(uint32_t ce)
    208 {
    209     uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
    210 
    211     // This is clean, but somewhat slow...
    212     // We could apply the mask to ce and then
    213     // just get all three orders...
    214     switch(strength) {
    215     default:
    216         tertiary = ucol_tertiaryOrder(ce);
    217         /* note fall-through */
    218 
    219     case UCOL_SECONDARY:
    220         secondary = ucol_secondaryOrder(ce);
    221         /* note fall-through */
    222 
    223     case UCOL_PRIMARY:
    224         primary = ucol_primaryOrder(ce);
    225     }
    226 
    227     // **** This should probably handle continuations too.  ****
    228     // **** That means that we need 24 bits for the primary ****
    229     // **** instead of the 16 that we're currently using.   ****
    230     // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
    231     // **** Another complication with continuations is that ****
    232     // **** the *second* CE is marked as a continuation, so ****
    233     // **** we always have to peek ahead to know how long   ****
    234     // **** the primary is...                               ****
    235     if ((toShift && variableTop > ce && primary != 0)
    236                 || (isShifted && primary == 0)) {
    237 
    238         if (primary == 0) {
    239             return UCOL_IGNORABLE;
    240         }
    241 
    242         if (strength >= UCOL_QUATERNARY) {
    243             quaternary = primary;
    244         }
    245 
    246         primary = secondary = tertiary = 0;
    247         isShifted = TRUE;
    248     } else {
    249         if (strength >= UCOL_QUATERNARY) {
    250             quaternary = 0xFFFF;
    251         }
    252 
    253         isShifted = FALSE;
    254     }
    255 
    256     return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
    257 }
    258 
    259 U_NAMESPACE_END
    260 
    261 /* public methods ---------------------------------------------------- */
    262 
    263 U_CAPI UCollationElements* U_EXPORT2
    264 ucol_openElements(const UCollator  *coll,
    265                   const UChar      *text,
    266                         int32_t    textLength,
    267                         UErrorCode *status)
    268 {
    269     if (U_FAILURE(*status)) {
    270         return NULL;
    271     }
    272     if (coll == NULL || (text == NULL && textLength != 0)) {
    273         *status = U_ILLEGAL_ARGUMENT_ERROR;
    274         return NULL;
    275     }
    276     const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
    277     if (rbc == NULL) {
    278         *status = U_UNSUPPORTED_ERROR;  // coll is a Collator but not a RuleBasedCollator
    279         return NULL;
    280     }
    281 
    282     UnicodeString s((UBool)(textLength < 0), text, textLength);
    283     CollationElementIterator *cei = rbc->createCollationElementIterator(s);
    284     if (cei == NULL) {
    285         *status = U_MEMORY_ALLOCATION_ERROR;
    286         return NULL;
    287     }
    288 
    289     return cei->toUCollationElements();
    290 }
    291 
    292 
    293 U_CAPI void U_EXPORT2
    294 ucol_closeElements(UCollationElements *elems)
    295 {
    296     delete CollationElementIterator::fromUCollationElements(elems);
    297 }
    298 
    299 U_CAPI void U_EXPORT2
    300 ucol_reset(UCollationElements *elems)
    301 {
    302     CollationElementIterator::fromUCollationElements(elems)->reset();
    303 }
    304 
    305 U_CAPI int32_t U_EXPORT2
    306 ucol_next(UCollationElements *elems,
    307           UErrorCode         *status)
    308 {
    309     if (U_FAILURE(*status)) {
    310         return UCOL_NULLORDER;
    311     }
    312 
    313     return CollationElementIterator::fromUCollationElements(elems)->next(*status);
    314 }
    315 
    316 U_NAMESPACE_BEGIN
    317 
    318 int64_t
    319 UCollationPCE::nextProcessed(
    320                    int32_t            *ixLow,
    321                    int32_t            *ixHigh,
    322                    UErrorCode         *status)
    323 {
    324     int64_t result = UCOL_IGNORABLE;
    325     uint32_t low = 0, high = 0;
    326 
    327     if (U_FAILURE(*status)) {
    328         return UCOL_PROCESSED_NULLORDER;
    329     }
    330 
    331     pceBuffer.reset();
    332 
    333     do {
    334         low = cei->getOffset();
    335         int32_t ce = cei->next(*status);
    336         high = cei->getOffset();
    337 
    338         if (ce == UCOL_NULLORDER) {
    339              result = UCOL_PROCESSED_NULLORDER;
    340              break;
    341         }
    342 
    343         result = processCE((uint32_t)ce);
    344     } while (result == UCOL_IGNORABLE);
    345 
    346     if (ixLow != NULL) {
    347         *ixLow = low;
    348     }
    349 
    350     if (ixHigh != NULL) {
    351         *ixHigh = high;
    352     }
    353 
    354     return result;
    355 }
    356 
    357 U_NAMESPACE_END
    358 
    359 U_CAPI int32_t U_EXPORT2
    360 ucol_previous(UCollationElements *elems,
    361               UErrorCode         *status)
    362 {
    363     if(U_FAILURE(*status)) {
    364         return UCOL_NULLORDER;
    365     }
    366     return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
    367 }
    368 
    369 U_NAMESPACE_BEGIN
    370 
    371 int64_t
    372 UCollationPCE::previousProcessed(
    373                    int32_t            *ixLow,
    374                    int32_t            *ixHigh,
    375                    UErrorCode         *status)
    376 {
    377     int64_t result = UCOL_IGNORABLE;
    378     int32_t  low = 0, high = 0;
    379 
    380     if (U_FAILURE(*status)) {
    381         return UCOL_PROCESSED_NULLORDER;
    382     }
    383 
    384     // pceBuffer.reset();
    385 
    386     while (pceBuffer.empty()) {
    387         // buffer raw CEs up to non-ignorable primary
    388         RCEBuffer rceb;
    389         int32_t ce;
    390 
    391         // **** do we need to reset rceb, or will it always be empty at this point ****
    392         do {
    393             high = cei->getOffset();
    394             ce   = cei->previous(*status);
    395             low  = cei->getOffset();
    396 
    397             if (ce == UCOL_NULLORDER) {
    398                 if (! rceb.empty()) {
    399                     break;
    400                 }
    401 
    402                 goto finish;
    403             }
    404 
    405             rceb.put((uint32_t)ce, low, high);
    406         } while ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce));
    407 
    408         // process the raw CEs
    409         while (! rceb.empty()) {
    410             const RCEI *rcei = rceb.get();
    411 
    412             result = processCE(rcei->ce);
    413 
    414             if (result != UCOL_IGNORABLE) {
    415                 pceBuffer.put(result, rcei->low, rcei->high);
    416             }
    417         }
    418     }
    419 
    420 finish:
    421     if (pceBuffer.empty()) {
    422         // **** Is -1 the right value for ixLow, ixHigh? ****
    423     	if (ixLow != NULL) {
    424     		*ixLow = -1;
    425     	}
    426 
    427     	if (ixHigh != NULL) {
    428     		*ixHigh = -1
    429     		;
    430     	}
    431         return UCOL_PROCESSED_NULLORDER;
    432     }
    433 
    434     const PCEI *pcei = pceBuffer.get();
    435 
    436     if (ixLow != NULL) {
    437         *ixLow = pcei->low;
    438     }
    439 
    440     if (ixHigh != NULL) {
    441         *ixHigh = pcei->high;
    442     }
    443 
    444     return pcei->ce;
    445 }
    446 
    447 U_NAMESPACE_END
    448 
    449 U_CAPI int32_t U_EXPORT2
    450 ucol_getMaxExpansion(const UCollationElements *elems,
    451                            int32_t            order)
    452 {
    453     return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
    454 
    455     // TODO: The old code masked the order according to strength and then did a binary search.
    456     // However this was probably at least partially broken because of the following comment.
    457     // Still, it might have found a match when this version may not.
    458 
    459     // FIXME: with a masked search, there might be more than one hit,
    460     // so we need to look forward and backward from the match to find all
    461     // of the hits...
    462 }
    463 
    464 U_CAPI void U_EXPORT2
    465 ucol_setText(      UCollationElements *elems,
    466              const UChar              *text,
    467                    int32_t            textLength,
    468                    UErrorCode         *status)
    469 {
    470     if (U_FAILURE(*status)) {
    471         return;
    472     }
    473 
    474     if ((text == NULL && textLength != 0)) {
    475         *status = U_ILLEGAL_ARGUMENT_ERROR;
    476         return;
    477     }
    478     UnicodeString s((UBool)(textLength < 0), text, textLength);
    479     return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
    480 }
    481 
    482 U_CAPI int32_t U_EXPORT2
    483 ucol_getOffset(const UCollationElements *elems)
    484 {
    485     return CollationElementIterator::fromUCollationElements(elems)->getOffset();
    486 }
    487 
    488 U_CAPI void U_EXPORT2
    489 ucol_setOffset(UCollationElements    *elems,
    490                int32_t           offset,
    491                UErrorCode            *status)
    492 {
    493     if (U_FAILURE(*status)) {
    494         return;
    495     }
    496 
    497     CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
    498 }
    499 
    500 U_CAPI int32_t U_EXPORT2
    501 ucol_primaryOrder (int32_t order)
    502 {
    503     return (order >> 16) & 0xffff;
    504 }
    505 
    506 U_CAPI int32_t U_EXPORT2
    507 ucol_secondaryOrder (int32_t order)
    508 {
    509     return (order >> 8) & 0xff;
    510 }
    511 
    512 U_CAPI int32_t U_EXPORT2
    513 ucol_tertiaryOrder (int32_t order)
    514 {
    515     return order & 0xff;
    516 }
    517 
    518 #endif /* #if !UCONFIG_NO_COLLATION */
    519