Home | History | Annotate | Download | only in intltest
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1996-2014, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  ******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_COLLATION
     11 
     12 #include "unicode/unistr.h"
     13 #include "unicode/usearch.h"
     14 
     15 #include "cmemory.h"
     16 #include "unicode/coll.h"
     17 #include "unicode/tblcoll.h"
     18 #include "unicode/coleitr.h"
     19 #include "unicode/ucoleitr.h"
     20 
     21 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     22 
     23 #include "unicode/uniset.h"
     24 #include "unicode/uset.h"
     25 #include "unicode/usetiter.h"
     26 #include "unicode/ustring.h"
     27 #include "hash.h"
     28 #include "normalizer2impl.h"
     29 #include "uhash.h"
     30 #include "usrchimp.h"
     31 #include "uassert.h"
     32 
     33 #include "colldata.h"
     34 
     35 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
     36 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
     37 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     38 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
     39 
     40 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
     41     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
     42 {
     43     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
     44     UCollationStrength strength = ucol_getStrength(coll);
     45     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
     46     uint32_t variableTop = ucol_getVariableTop(coll, &status);
     47     uint32_t strengthMask = 0;
     48     int32_t order;
     49 
     50     if (U_FAILURE(status)) {
     51         return;
     52     }
     53 
     54     // **** only set flag if string has Han(gul) ****
     55     // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
     56 
     57     switch (strength)
     58     {
     59     default:
     60         strengthMask |= UCOL_TERTIARYORDERMASK;
     61         /* fall through */
     62 
     63     case UCOL_SECONDARY:
     64         strengthMask |= UCOL_SECONDARYORDERMASK;
     65         /* fall through */
     66 
     67     case UCOL_PRIMARY:
     68         strengthMask |= UCOL_PRIMARYORDERMASK;
     69     }
     70 
     71     ces = ceBuffer;
     72 
     73     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
     74         UBool cont = isContinuation(order);
     75 
     76         order &= strengthMask;
     77 
     78         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
     79             if (strength >= UCOL_QUATERNARY) {
     80                 order &= UCOL_PRIMARYORDERMASK;
     81             } else {
     82                 order = UCOL_IGNORABLE;
     83             }
     84         }
     85 
     86         if (order == UCOL_IGNORABLE) {
     87             continue;
     88         }
     89 
     90         if (cont) {
     91             order |= UCOL_CONTINUATION_MARKER;
     92         }
     93 
     94         add(order, status);
     95     }
     96 
     97     ucol_closeElements(elems);
     98 }
     99 
    100 CEList::~CEList()
    101 {
    102     if (ces != ceBuffer) {
    103         DELETE_ARRAY(ces);
    104     }
    105 }
    106 
    107 void CEList::add(uint32_t ce, UErrorCode &status)
    108 {
    109     if (U_FAILURE(status)) {
    110         return;
    111     }
    112 
    113     if (listSize >= listMax) {
    114         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
    115         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
    116 
    117         if (newCEs == NULL) {
    118             status = U_MEMORY_ALLOCATION_ERROR;
    119             return;
    120         }
    121 
    122         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
    123 
    124         if (ces != ceBuffer) {
    125             DELETE_ARRAY(ces);
    126         }
    127 
    128         ces = newCEs;
    129         listMax = newMax;
    130     }
    131 
    132     ces[listSize++] = ce;
    133 }
    134 
    135 uint32_t CEList::get(int32_t index) const
    136 {
    137     if (index >= 0 && index < listSize) {
    138         return ces[index];
    139     }
    140 
    141     return (uint32_t)UCOL_NULLORDER;
    142 }
    143 
    144 uint32_t &CEList::operator[](int32_t index) const
    145 {
    146     return ces[index];
    147 }
    148 
    149 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
    150 {
    151     if (other == NULL || listSize - offset < other->size()) {
    152         return FALSE;
    153     }
    154 
    155     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
    156         if (ces[i] != (*other)[j]) {
    157             return FALSE;
    158         }
    159     }
    160 
    161     return TRUE;
    162 }
    163 
    164 int32_t CEList::size() const
    165 {
    166     return listSize;
    167 }
    168 
    169 StringList::StringList(UErrorCode &status)
    170     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
    171 {
    172     if (U_FAILURE(status)) {
    173         return;
    174     }
    175 
    176     strings = new UnicodeString [listMax];
    177 
    178     if (strings == NULL) {
    179         status = U_MEMORY_ALLOCATION_ERROR;
    180         return;
    181     }
    182 }
    183 
    184 StringList::~StringList()
    185 {
    186     delete[] strings;
    187 }
    188 
    189 void StringList::add(const UnicodeString *string, UErrorCode &status)
    190 {
    191     if (U_FAILURE(status)) {
    192         return;
    193     }
    194     if (listSize >= listMax) {
    195         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
    196         UnicodeString *newStrings = new UnicodeString[newMax];
    197         if (newStrings == NULL) {
    198             status = U_MEMORY_ALLOCATION_ERROR;
    199             return;
    200         }
    201         for (int32_t i=0; i<listSize; ++i) {
    202             newStrings[i] = strings[i];
    203         }
    204         delete[] strings;
    205         strings = newStrings;
    206         listMax = newMax;
    207     }
    208 
    209     // The ctor initialized all the strings in
    210     // the array to empty strings, so this
    211     // is the same as copying the source string.
    212     strings[listSize++].append(*string);
    213 }
    214 
    215 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
    216 {
    217     const UnicodeString string(chars, count);
    218 
    219     add(&string, status);
    220 }
    221 
    222 const UnicodeString *StringList::get(int32_t index) const
    223 {
    224     if (index >= 0 && index < listSize) {
    225         return &strings[index];
    226     }
    227 
    228     return NULL;
    229 }
    230 
    231 int32_t StringList::size() const
    232 {
    233     return listSize;
    234 }
    235 
    236 
    237 U_CDECL_BEGIN
    238 static void U_CALLCONV
    239 deleteStringList(void *obj)
    240 {
    241     StringList *strings = (StringList *) obj;
    242 
    243     delete strings;
    244 }
    245 U_CDECL_END
    246 
    247 class CEToStringsMap
    248 {
    249 public:
    250     CEToStringsMap(UErrorCode &status);
    251     ~CEToStringsMap();
    252 
    253     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
    254     StringList *getStringList(uint32_t ce) const;
    255 
    256 private:
    257     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
    258     UHashtable *map;
    259 };
    260 
    261 CEToStringsMap::CEToStringsMap(UErrorCode &status)
    262     : map(NULL)
    263 {
    264     if (U_FAILURE(status)) {
    265         return;
    266     }
    267 
    268     map = uhash_open(uhash_hashLong, uhash_compareLong,
    269                      uhash_compareCaselessUnicodeString,
    270                      &status);
    271 
    272     if (U_FAILURE(status)) {
    273         return;
    274     }
    275 
    276     uhash_setValueDeleter(map, deleteStringList);
    277 }
    278 
    279 CEToStringsMap::~CEToStringsMap()
    280 {
    281     uhash_close(map);
    282 }
    283 
    284 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
    285 {
    286     StringList *strings = getStringList(ce);
    287 
    288     if (strings == NULL) {
    289         strings = new StringList(status);
    290 
    291         if (strings == NULL || U_FAILURE(status)) {
    292             status = U_MEMORY_ALLOCATION_ERROR;
    293             return;
    294         }
    295 
    296         putStringList(ce, strings, status);
    297     }
    298 
    299     strings->add(string, status);
    300 }
    301 
    302 StringList *CEToStringsMap::getStringList(uint32_t ce) const
    303 {
    304     return (StringList *) uhash_iget(map, ce);
    305 }
    306 
    307 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
    308 {
    309     uhash_iput(map, ce, (void *) stringList, &status);
    310 }
    311 
    312 #define CLONE_COLLATOR
    313 
    314 CollData::CollData(UCollator *collator, UErrorCode &status)
    315     : coll(NULL), ceToCharsStartingWith(NULL)
    316 {
    317     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    318     // i.e. other, control, private use, format, surrogate
    319     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    320     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    321     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
    322 
    323     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    324     // i.e. all the characers we handle implicitly
    325     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    326     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    327     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
    328 
    329     if (U_FAILURE(status)) {
    330         return;
    331     }
    332 
    333     USet *expansions   = uset_openEmpty();
    334     USet *contractions = uset_openEmpty();
    335     int32_t itemCount;
    336 
    337     ceToCharsStartingWith = new CEToStringsMap(status);
    338 
    339     if (U_FAILURE(status)) {
    340         goto bail;
    341     }
    342 
    343 #ifdef CLONE_COLLATOR
    344     coll = ucol_safeClone(collator, NULL, NULL, &status);
    345 
    346     if (U_FAILURE(status)) {
    347         goto bail;
    348     }
    349 #else
    350     coll = collator;
    351 #endif
    352 
    353     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
    354 
    355     uset_addAll(charsToTest, contractions);
    356     uset_addAll(charsToTest, expansions);
    357     uset_removeAll(charsToTest, charsToRemove);
    358 
    359     itemCount = uset_getItemCount(charsToTest);
    360     for(int32_t item = 0; item < itemCount; item += 1) {
    361         UChar32 start = 0, end = 0;
    362         UChar buffer[16];
    363         int32_t len = uset_getItem(charsToTest, item, &start, &end,
    364                                    buffer, 16, &status);
    365 
    366         if (len == 0) {
    367             for (UChar32 ch = start; ch <= end; ch += 1) {
    368                 UnicodeString *st = new UnicodeString(ch);
    369 
    370                 if (st == NULL) {
    371                     status = U_MEMORY_ALLOCATION_ERROR;
    372                     break;
    373                 }
    374 
    375                 CEList *ceList = new CEList(coll, *st, status);
    376 
    377                 ceToCharsStartingWith->put(ceList->get(0), st, status);
    378 
    379                 delete ceList;
    380                 delete st;
    381             }
    382         } else if (len > 0) {
    383             UnicodeString *st = new UnicodeString(buffer, len);
    384 
    385             if (st == NULL) {
    386                 status = U_MEMORY_ALLOCATION_ERROR;
    387                 break;
    388             }
    389 
    390             CEList *ceList = new CEList(coll, *st, status);
    391 
    392             ceToCharsStartingWith->put(ceList->get(0), st, status);
    393 
    394             delete ceList;
    395             delete st;
    396         } else {
    397             // shouldn't happen...
    398         }
    399 
    400         if (U_FAILURE(status)) {
    401              break;
    402         }
    403     }
    404 
    405 bail:
    406     uset_close(contractions);
    407     uset_close(expansions);
    408     uset_close(charsToRemove);
    409     uset_close(charsToTest);
    410 
    411     if (U_FAILURE(status)) {
    412         return;
    413     }
    414 
    415     UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
    416     if (U_FAILURE(status)) {
    417         return;
    418     }
    419     UnicodeSetIterator hanIter(hanRanges);
    420     UnicodeString hanString;
    421     while(hanIter.nextRange()) {
    422         hanString.append(hanIter.getCodepoint());
    423         hanString.append(hanIter.getCodepointEnd());
    424     }
    425     // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
    426     // but as of Unicode 6.3 the 11xx block is filled,
    427     // and there are also more Jamo T at U+D7CB..U+D7FB.
    428     // Maybe use [:HST=T:] and look for the end of the last range?
    429     // Maybe use script boundary mappings instead of this code??
    430     UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
    431      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
    432      CEList hanList(coll, hanString, status);
    433      CEList jamoList(coll, jamoString, status);
    434      int32_t j = 0;
    435 
    436      if (U_FAILURE(status)) {
    437          return;
    438      }
    439 
    440      for (int32_t c = 0; c < jamoList.size(); c += 1) {
    441          uint32_t jce = jamoList[c];
    442 
    443          if (! isContinuation(jce)) {
    444              jamoLimits[j++] = jce;
    445          }
    446      }
    447 
    448      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
    449 
    450      minHan = 0xFFFFFFFF;
    451      maxHan = 0;
    452 
    453      for(int32_t h = 0; h < hanList.size(); h += 2) {
    454          uint32_t han = (uint32_t) hanList[h];
    455 
    456          if (han < minHan) {
    457              minHan = han;
    458          }
    459 
    460          if (han > maxHan) {
    461              maxHan = han;
    462          }
    463      }
    464 
    465      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
    466 }
    467 
    468 CollData::~CollData()
    469 {
    470 #ifdef CLONE_COLLATOR
    471    ucol_close(coll);
    472 #endif
    473 
    474    delete ceToCharsStartingWith;
    475 }
    476 
    477 UCollator *CollData::getCollator() const
    478 {
    479     return coll;
    480 }
    481 
    482 const StringList *CollData::getStringList(int32_t ce) const
    483 {
    484     return ceToCharsStartingWith->getStringList(ce);
    485 }
    486 
    487 const CEList *CollData::getCEList(const UnicodeString *string) const
    488 {
    489     UErrorCode status = U_ZERO_ERROR;
    490     const CEList *list = new CEList(coll, *string, status);
    491 
    492     if (U_FAILURE(status)) {
    493         delete list;
    494         list = NULL;
    495     }
    496 
    497     return list;
    498 }
    499 
    500 void CollData::freeCEList(const CEList *list)
    501 {
    502     delete list;
    503 }
    504 
    505 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
    506 {
    507     // find out shortest string for the longest sequence of ces.
    508     // this can probably be folded with the minLengthCache...
    509 
    510     if (history[offset] >= 0) {
    511         return history[offset];
    512     }
    513 
    514     uint32_t ce = ceList->get(offset);
    515     int32_t maxOffset = ceList->size();
    516     int32_t shortestLength = INT32_MAX;
    517     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
    518 
    519     if (strings != NULL) {
    520         int32_t stringCount = strings->size();
    521 
    522         for (int32_t s = 0; s < stringCount; s += 1) {
    523             const UnicodeString *string = strings->get(s);
    524             UErrorCode status = U_ZERO_ERROR;
    525             const CEList *ceList2 = new CEList(coll, *string, status);
    526 
    527             if (U_FAILURE(status)) {
    528                 delete ceList2;
    529                 ceList2 = NULL;
    530             }
    531 
    532             if (ceList->matchesAt(offset, ceList2)) {
    533                 U_ASSERT(ceList2 != NULL);
    534                 int32_t clength = ceList2->size();
    535                 int32_t slength = string->length();
    536                 int32_t roffset = offset + clength;
    537                 int32_t rlength = 0;
    538 
    539                 if (roffset < maxOffset) {
    540                     rlength = minLengthInChars(ceList, roffset, history);
    541 
    542                     if (rlength <= 0) {
    543                     // delete before continue to avoid memory leak.
    544                         delete ceList2;
    545 
    546                         // ignore any dead ends
    547                         continue;
    548                     }
    549                 }
    550 
    551                 if (shortestLength > slength + rlength) {
    552                     shortestLength = slength + rlength;
    553                 }
    554             }
    555 
    556             delete ceList2;
    557         }
    558     }
    559 
    560     if (shortestLength == INT32_MAX) {
    561         // No matching strings at this offset. See if
    562         // the CE is in a range we can handle manually.
    563         if (ce >= minHan && ce < maxHan) {
    564             // all han have implicit orders which
    565             // generate two CEs.
    566             int32_t roffset = offset + 2;
    567             int32_t rlength = 0;
    568 
    569           //history[roffset++] = -1;
    570           //history[roffset++] = 1;
    571 
    572             if (roffset < maxOffset) {
    573                 rlength = minLengthInChars(ceList, roffset, history);
    574             }
    575 
    576             if (rlength < 0) {
    577                 return -1;
    578             }
    579 
    580             shortestLength = 1 + rlength;
    581             goto have_shortest;
    582         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
    583             int32_t roffset = offset;
    584             int32_t rlength = 0;
    585 
    586             // **** this loop may not handle archaic Hangul correctly ****
    587             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
    588                 uint32_t jce = ceList->get(roffset);
    589 
    590                 // Some Jamo have 24-bit primary order; skip the
    591                 // 2nd CE. This should always be OK because if
    592                 // we're still in the loop all we've seen are
    593                 // a series of Jamo in LVT order.
    594                 if (isContinuation(jce)) {
    595                     continue;
    596                 }
    597 
    598                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
    599                     break;
    600                 }
    601             }
    602 
    603             if (roffset == offset) {
    604                 // we started with a non-L Jamo...
    605                 // just say it comes from a single character
    606                 roffset += 1;
    607 
    608                 // See if the single Jamo has a 24-bit order.
    609                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
    610                     roffset += 1;
    611                 }
    612             }
    613 
    614             if (roffset < maxOffset) {
    615                 rlength = minLengthInChars(ceList, roffset, history);
    616             }
    617 
    618             if (rlength < 0) {
    619                 return -1;
    620             }
    621 
    622             shortestLength = 1 + rlength;
    623             goto have_shortest;
    624         }
    625 
    626         // Can't handle it manually either. Just move on.
    627         return -1;
    628     }
    629 
    630 have_shortest:
    631     history[offset] = shortestLength;
    632 
    633     return shortestLength;
    634 }
    635 
    636 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
    637 {
    638     int32_t clength = ceList->size();
    639     int32_t *history = NEW_ARRAY(int32_t, clength);
    640 
    641     for (int32_t i = 0; i < clength; i += 1) {
    642         history[i] = -1;
    643     }
    644 
    645     int32_t minLength = minLengthInChars(ceList, offset, history);
    646 
    647     DELETE_ARRAY(history);
    648 
    649     return minLength;
    650 }
    651 
    652 #endif // #if !UCONFIG_NO_COLLATION
    653