Home | History | Annotate | Download | only in i18n
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *   Copyright (C) 2001-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 ******************************************************************************
      8 *
      9 * File ucoleitr.cpp
     10 *
     11 * Modification History:
     12 *
     13 * Date        Name        Description
     14 * 02/15/2001  synwee      Modified all methods to process its own function
     15 *                         instead of calling the equivalent c++ api (coleitr.h)
     16 * 2012-2014   markus      Rewritten in C++ again.
     17 ******************************************************************************/
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_COLLATION
     22 
     23 #include "unicode/coleitr.h"
     24 #include "unicode/tblcoll.h"
     25 #include "unicode/ucoleitr.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/sortkey.h"
     28 #include "unicode/uobject.h"
     29 #include "cmemory.h"
     30 #include "usrchimp.h"
     31 
     32 U_NAMESPACE_USE
     33 
// Not referenced anywhere in the visible part of this file.
#define BUFFER_LENGTH             100

// DEFAULT_BUFFER_SIZE: number of entries in the inline defaultBuffer of RCEBuffer.
// BUFFER_GROW: number of entries added each time a full buffer is reallocated in put().
#define DEFAULT_BUFFER_SIZE 16
#define BUFFER_GROW 8

// Copies `count` elements from src to dst; the element size is taken from src[0].
#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])

// Allocates raw storage for `count` objects of `type` through uprv_malloc();
// no constructors are run. Callers in this file check the result against NULL.
#define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))

// Releases storage through uprv_free(); used here for arrays obtained from NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
     44 
// One raw (unprocessed) collation element together with the two iterator
// offsets recorded around it. UCollationPCE::previousProcessed() fills these
// in: `high` is the offset before stepping back over the element and `low`
// is the offset after stepping back.
struct RCEI
{
    uint32_t ce;    // raw 32-bit collation element
    int32_t  low;   // lower text offset of the element
    int32_t  high;  // upper text offset of the element
};
     51 
     52 U_NAMESPACE_BEGIN
     53 
     54 struct RCEBuffer
     55 {
     56     RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
     57     RCEI   *buffer;
     58     int32_t bufferIndex;
     59     int32_t bufferSize;
     60 
     61     RCEBuffer();
     62     ~RCEBuffer();
     63 
     64     UBool isEmpty() const;
     65     void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
     66     const RCEI *get();
     67 };
     68 
     69 RCEBuffer::RCEBuffer()
     70 {
     71     buffer = defaultBuffer;
     72     bufferIndex = 0;
     73     bufferSize = UPRV_LENGTHOF(defaultBuffer);
     74 }
     75 
     76 RCEBuffer::~RCEBuffer()
     77 {
     78     if (buffer != defaultBuffer) {
     79         DELETE_ARRAY(buffer);
     80     }
     81 }
     82 
     83 UBool RCEBuffer::isEmpty() const
     84 {
     85     return bufferIndex <= 0;
     86 }
     87 
     88 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
     89 {
     90     if (U_FAILURE(errorCode)) {
     91         return;
     92     }
     93     if (bufferIndex >= bufferSize) {
     94         RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
     95         if (newBuffer == NULL) {
     96             errorCode = U_MEMORY_ALLOCATION_ERROR;
     97             return;
     98         }
     99 
    100         ARRAY_COPY(newBuffer, buffer, bufferSize);
    101 
    102         if (buffer != defaultBuffer) {
    103             DELETE_ARRAY(buffer);
    104         }
    105 
    106         buffer = newBuffer;
    107         bufferSize += BUFFER_GROW;
    108     }
    109 
    110     buffer[bufferIndex].ce   = ce;
    111     buffer[bufferIndex].low  = ixLow;
    112     buffer[bufferIndex].high = ixHigh;
    113 
    114     bufferIndex += 1;
    115 }
    116 
    117 const RCEI *RCEBuffer::get()
    118 {
    119     if (bufferIndex > 0) {
    120      return &buffer[--bufferIndex];
    121     }
    122 
    123     return NULL;
    124 }
    125 
    126 PCEBuffer::PCEBuffer()
    127 {
    128     buffer = defaultBuffer;
    129     bufferIndex = 0;
    130     bufferSize = UPRV_LENGTHOF(defaultBuffer);
    131 }
    132 
    133 PCEBuffer::~PCEBuffer()
    134 {
    135     if (buffer != defaultBuffer) {
    136         DELETE_ARRAY(buffer);
    137     }
    138 }
    139 
// Discards all stored entries by rewinding the index.
// The current storage (inline or heap) and its capacity are kept as they are.
void PCEBuffer::reset()
{
    bufferIndex = 0;
}
    144 
    145 UBool PCEBuffer::isEmpty() const
    146 {
    147     return bufferIndex <= 0;
    148 }
    149 
    150 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
    151 {
    152     if (U_FAILURE(errorCode)) {
    153         return;
    154     }
    155     if (bufferIndex >= bufferSize) {
    156         PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
    157         if (newBuffer == NULL) {
    158             errorCode = U_MEMORY_ALLOCATION_ERROR;
    159             return;
    160         }
    161 
    162         ARRAY_COPY(newBuffer, buffer, bufferSize);
    163 
    164         if (buffer != defaultBuffer) {
    165             DELETE_ARRAY(buffer);
    166         }
    167 
    168         buffer = newBuffer;
    169         bufferSize += BUFFER_GROW;
    170     }
    171 
    172     buffer[bufferIndex].ce   = ce;
    173     buffer[bufferIndex].low  = ixLow;
    174     buffer[bufferIndex].high = ixHigh;
    175 
    176     bufferIndex += 1;
    177 }
    178 
    179 const PCEI *PCEBuffer::get()
    180 {
    181     if (bufferIndex > 0) {
    182      return &buffer[--bufferIndex];
    183     }
    184 
    185     return NULL;
    186 }
    187 
    188 UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
    189 
    190 UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
    191 
    192 void UCollationPCE::init(UCollationElements *elems) {
    193     init(CollationElementIterator::fromUCollationElements(elems));
    194 }
    195 
    196 void UCollationPCE::init(CollationElementIterator *iter)
    197 {
    198     cei = iter;
    199     init(*iter->rbc_);
    200 }
    201 
    202 void UCollationPCE::init(const Collator &coll)
    203 {
    204     UErrorCode status = U_ZERO_ERROR;
    205 
    206     strength    = coll.getAttribute(UCOL_STRENGTH, status);
    207     toShift     = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
    208     isShifted   = FALSE;
    209     variableTop = coll.getVariableTop(status);
    210 }
    211 
// Destructor. The body performs no explicit cleanup; in particular it does
// not delete the iterator stored in cei.
UCollationPCE::~UCollationPCE()
{
    // nothing to do
}
    216 
    217 uint64_t UCollationPCE::processCE(uint32_t ce)
    218 {
    219     uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
    220 
    221     // This is clean, but somewhat slow...
    222     // We could apply the mask to ce and then
    223     // just get all three orders...
    224     switch(strength) {
    225     default:
    226         tertiary = ucol_tertiaryOrder(ce);
    227         U_FALLTHROUGH;
    228 
    229     case UCOL_SECONDARY:
    230         secondary = ucol_secondaryOrder(ce);
    231         U_FALLTHROUGH;
    232 
    233     case UCOL_PRIMARY:
    234         primary = ucol_primaryOrder(ce);
    235     }
    236 
    237     // **** This should probably handle continuations too.  ****
    238     // **** That means that we need 24 bits for the primary ****
    239     // **** instead of the 16 that we're currently using.   ****
    240     // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
    241     // **** Another complication with continuations is that ****
    242     // **** the *second* CE is marked as a continuation, so ****
    243     // **** we always have to peek ahead to know how long   ****
    244     // **** the primary is...                               ****
    245     if ((toShift && variableTop > ce && primary != 0)
    246                 || (isShifted && primary == 0)) {
    247 
    248         if (primary == 0) {
    249             return UCOL_IGNORABLE;
    250         }
    251 
    252         if (strength >= UCOL_QUATERNARY) {
    253             quaternary = primary;
    254         }
    255 
    256         primary = secondary = tertiary = 0;
    257         isShifted = TRUE;
    258     } else {
    259         if (strength >= UCOL_QUATERNARY) {
    260             quaternary = 0xFFFF;
    261         }
    262 
    263         isShifted = FALSE;
    264     }
    265 
    266     return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
    267 }
    268 
    269 U_NAMESPACE_END
    270 
    271 /* public methods ---------------------------------------------------- */
    272 
    273 U_CAPI UCollationElements* U_EXPORT2
    274 ucol_openElements(const UCollator  *coll,
    275                   const UChar      *text,
    276                         int32_t    textLength,
    277                         UErrorCode *status)
    278 {
    279     if (U_FAILURE(*status)) {
    280         return NULL;
    281     }
    282     if (coll == NULL || (text == NULL && textLength != 0)) {
    283         *status = U_ILLEGAL_ARGUMENT_ERROR;
    284         return NULL;
    285     }
    286     const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
    287     if (rbc == NULL) {
    288         *status = U_UNSUPPORTED_ERROR;  // coll is a Collator but not a RuleBasedCollator
    289         return NULL;
    290     }
    291 
    292     UnicodeString s((UBool)(textLength < 0), text, textLength);
    293     CollationElementIterator *cei = rbc->createCollationElementIterator(s);
    294     if (cei == NULL) {
    295         *status = U_MEMORY_ALLOCATION_ERROR;
    296         return NULL;
    297     }
    298 
    299     return cei->toUCollationElements();
    300 }
    301 
    302 
    303 U_CAPI void U_EXPORT2
    304 ucol_closeElements(UCollationElements *elems)
    305 {
    306     delete CollationElementIterator::fromUCollationElements(elems);
    307 }
    308 
    309 U_CAPI void U_EXPORT2
    310 ucol_reset(UCollationElements *elems)
    311 {
    312     CollationElementIterator::fromUCollationElements(elems)->reset();
    313 }
    314 
    315 U_CAPI int32_t U_EXPORT2
    316 ucol_next(UCollationElements *elems,
    317           UErrorCode         *status)
    318 {
    319     if (U_FAILURE(*status)) {
    320         return UCOL_NULLORDER;
    321     }
    322 
    323     return CollationElementIterator::fromUCollationElements(elems)->next(*status);
    324 }
    325 
    326 U_NAMESPACE_BEGIN
    327 
    328 int64_t
    329 UCollationPCE::nextProcessed(
    330                    int32_t            *ixLow,
    331                    int32_t            *ixHigh,
    332                    UErrorCode         *status)
    333 {
    334     int64_t result = UCOL_IGNORABLE;
    335     uint32_t low = 0, high = 0;
    336 
    337     if (U_FAILURE(*status)) {
    338         return UCOL_PROCESSED_NULLORDER;
    339     }
    340 
    341     pceBuffer.reset();
    342 
    343     do {
    344         low = cei->getOffset();
    345         int32_t ce = cei->next(*status);
    346         high = cei->getOffset();
    347 
    348         if (ce == UCOL_NULLORDER) {
    349              result = UCOL_PROCESSED_NULLORDER;
    350              break;
    351         }
    352 
    353         result = processCE((uint32_t)ce);
    354     } while (result == UCOL_IGNORABLE);
    355 
    356     if (ixLow != NULL) {
    357         *ixLow = low;
    358     }
    359 
    360     if (ixHigh != NULL) {
    361         *ixHigh = high;
    362     }
    363 
    364     return result;
    365 }
    366 
    367 U_NAMESPACE_END
    368 
    369 U_CAPI int32_t U_EXPORT2
    370 ucol_previous(UCollationElements *elems,
    371               UErrorCode         *status)
    372 {
    373     if(U_FAILURE(*status)) {
    374         return UCOL_NULLORDER;
    375     }
    376     return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
    377 }
    378 
    379 U_NAMESPACE_BEGIN
    380 
    381 int64_t
    382 UCollationPCE::previousProcessed(
    383                    int32_t            *ixLow,
    384                    int32_t            *ixHigh,
    385                    UErrorCode         *status)
    386 {
    387     int64_t result = UCOL_IGNORABLE;
    388     int32_t  low = 0, high = 0;
    389 
    390     if (U_FAILURE(*status)) {
    391         return UCOL_PROCESSED_NULLORDER;
    392     }
    393 
    394     // pceBuffer.reset();
    395 
    396     while (pceBuffer.isEmpty()) {
    397         // buffer raw CEs up to non-ignorable primary
    398         RCEBuffer rceb;
    399         int32_t ce;
    400 
    401         // **** do we need to reset rceb, or will it always be empty at this point ****
    402         do {
    403             high = cei->getOffset();
    404             ce   = cei->previous(*status);
    405             low  = cei->getOffset();
    406 
    407             if (ce == UCOL_NULLORDER) {
    408                 if (!rceb.isEmpty()) {
    409                     break;
    410                 }
    411 
    412                 goto finish;
    413             }
    414 
    415             rceb.put((uint32_t)ce, low, high, *status);
    416         } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce)));
    417 
    418         // process the raw CEs
    419         while (U_SUCCESS(*status) && !rceb.isEmpty()) {
    420             const RCEI *rcei = rceb.get();
    421 
    422             result = processCE(rcei->ce);
    423 
    424             if (result != UCOL_IGNORABLE) {
    425                 pceBuffer.put(result, rcei->low, rcei->high, *status);
    426             }
    427         }
    428         if (U_FAILURE(*status)) {
    429             return UCOL_PROCESSED_NULLORDER;
    430         }
    431     }
    432 
    433 finish:
    434     if (pceBuffer.isEmpty()) {
    435         // **** Is -1 the right value for ixLow, ixHigh? ****
    436     	if (ixLow != NULL) {
    437     		*ixLow = -1;
    438     	}
    439 
    440     	if (ixHigh != NULL) {
    441     		*ixHigh = -1
    442     		;
    443     	}
    444         return UCOL_PROCESSED_NULLORDER;
    445     }
    446 
    447     const PCEI *pcei = pceBuffer.get();
    448 
    449     if (ixLow != NULL) {
    450         *ixLow = pcei->low;
    451     }
    452 
    453     if (ixHigh != NULL) {
    454         *ixHigh = pcei->high;
    455     }
    456 
    457     return pcei->ce;
    458 }
    459 
    460 U_NAMESPACE_END
    461 
    462 U_CAPI int32_t U_EXPORT2
    463 ucol_getMaxExpansion(const UCollationElements *elems,
    464                            int32_t            order)
    465 {
    466     return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
    467 
    468     // TODO: The old code masked the order according to strength and then did a binary search.
    469     // However this was probably at least partially broken because of the following comment.
    470     // Still, it might have found a match when this version may not.
    471 
    472     // FIXME: with a masked search, there might be more than one hit,
    473     // so we need to look forward and backward from the match to find all
    474     // of the hits...
    475 }
    476 
    477 U_CAPI void U_EXPORT2
    478 ucol_setText(      UCollationElements *elems,
    479              const UChar              *text,
    480                    int32_t            textLength,
    481                    UErrorCode         *status)
    482 {
    483     if (U_FAILURE(*status)) {
    484         return;
    485     }
    486 
    487     if ((text == NULL && textLength != 0)) {
    488         *status = U_ILLEGAL_ARGUMENT_ERROR;
    489         return;
    490     }
    491     UnicodeString s((UBool)(textLength < 0), text, textLength);
    492     return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
    493 }
    494 
    495 U_CAPI int32_t U_EXPORT2
    496 ucol_getOffset(const UCollationElements *elems)
    497 {
    498     return CollationElementIterator::fromUCollationElements(elems)->getOffset();
    499 }
    500 
    501 U_CAPI void U_EXPORT2
    502 ucol_setOffset(UCollationElements    *elems,
    503                int32_t           offset,
    504                UErrorCode            *status)
    505 {
    506     if (U_FAILURE(*status)) {
    507         return;
    508     }
    509 
    510     CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
    511 }
    512 
    513 U_CAPI int32_t U_EXPORT2
    514 ucol_primaryOrder (int32_t order)
    515 {
    516     return (order >> 16) & 0xffff;
    517 }
    518 
    519 U_CAPI int32_t U_EXPORT2
    520 ucol_secondaryOrder (int32_t order)
    521 {
    522     return (order >> 8) & 0xff;
    523 }
    524 
    525 U_CAPI int32_t U_EXPORT2
    526 ucol_tertiaryOrder (int32_t order)
    527 {
    528     return order & 0xff;
    529 }
    530 
    531 #endif /* #if !UCONFIG_NO_COLLATION */
    532