Home | History | Annotate | Download | only in intltest
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1996-2012, International Business Machines                 *
      4  *   Corporation and others.  All Rights Reserved.                            *
      5  ******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_COLLATION
     11 
     12 #include "unicode/unistr.h"
     13 #include "unicode/usearch.h"
     14 
     15 #include "cmemory.h"
     16 #include "unicode/coll.h"
     17 #include "unicode/tblcoll.h"
     18 #include "unicode/coleitr.h"
     19 #include "unicode/ucoleitr.h"
     20 
     21 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     22 
     23 #include "unicode/uniset.h"
     24 #include "unicode/uset.h"
     25 #include "unicode/ustring.h"
     26 #include "hash.h"
     27 #include "uhash.h"
     28 #include "ucol_imp.h"
     29 #include "uassert.h"
     30 
     31 #include "colldata.h"
     32 
     33 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
     34 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
     35 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     36 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
     37 
     38 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
     39     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
     40 {
     41     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
     42     UCollationStrength strength = ucol_getStrength(coll);
     43     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
     44     uint32_t variableTop = ucol_getVariableTop(coll, &status);
     45     uint32_t strengthMask = 0;
     46     int32_t order;
     47 
     48     if (U_FAILURE(status)) {
     49         return;
     50     }
     51 
     52     // **** only set flag if string has Han(gul) ****
     53     ucol_forceHanImplicit(elems, &status);
     54 
     55     switch (strength)
     56     {
     57     default:
     58         strengthMask |= UCOL_TERTIARYORDERMASK;
     59         /* fall through */
     60 
     61     case UCOL_SECONDARY:
     62         strengthMask |= UCOL_SECONDARYORDERMASK;
     63         /* fall through */
     64 
     65     case UCOL_PRIMARY:
     66         strengthMask |= UCOL_PRIMARYORDERMASK;
     67     }
     68 
     69     ces = ceBuffer;
     70 
     71     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
     72         UBool cont = isContinuation(order);
     73 
     74         order &= strengthMask;
     75 
     76         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
     77             if (strength >= UCOL_QUATERNARY) {
     78                 order &= UCOL_PRIMARYORDERMASK;
     79             } else {
     80                 order = UCOL_IGNORABLE;
     81             }
     82         }
     83 
     84         if (order == UCOL_IGNORABLE) {
     85             continue;
     86         }
     87 
     88         if (cont) {
     89             order |= UCOL_CONTINUATION_MARKER;
     90         }
     91 
     92         add(order, status);
     93     }
     94 
     95     ucol_closeElements(elems);
     96 }
     97 
     98 CEList::~CEList()
     99 {
    100     if (ces != ceBuffer) {
    101         DELETE_ARRAY(ces);
    102     }
    103 }
    104 
    105 void CEList::add(uint32_t ce, UErrorCode &status)
    106 {
    107     if (U_FAILURE(status)) {
    108         return;
    109     }
    110 
    111     if (listSize >= listMax) {
    112         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
    113         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
    114 
    115         if (newCEs == NULL) {
    116             status = U_MEMORY_ALLOCATION_ERROR;
    117             return;
    118         }
    119 
    120         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
    121 
    122         if (ces != ceBuffer) {
    123             DELETE_ARRAY(ces);
    124         }
    125 
    126         ces = newCEs;
    127         listMax = newMax;
    128     }
    129 
    130     ces[listSize++] = ce;
    131 }
    132 
    133 uint32_t CEList::get(int32_t index) const
    134 {
    135     if (index >= 0 && index < listSize) {
    136         return ces[index];
    137     }
    138 
    139     return (uint32_t)UCOL_NULLORDER;
    140 }
    141 
    142 uint32_t &CEList::operator[](int32_t index) const
    143 {
    144     return ces[index];
    145 }
    146 
    147 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
    148 {
    149     if (other == NULL || listSize - offset < other->size()) {
    150         return FALSE;
    151     }
    152 
    153     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
    154         if (ces[i] != (*other)[j]) {
    155             return FALSE;
    156         }
    157     }
    158 
    159     return TRUE;
    160 }
    161 
    162 int32_t CEList::size() const
    163 {
    164     return listSize;
    165 }
    166 
    167 StringList::StringList(UErrorCode &status)
    168     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
    169 {
    170     if (U_FAILURE(status)) {
    171         return;
    172     }
    173 
    174     strings = new UnicodeString [listMax];
    175 
    176     if (strings == NULL) {
    177         status = U_MEMORY_ALLOCATION_ERROR;
    178         return;
    179     }
    180 }
    181 
    182 StringList::~StringList()
    183 {
    184     delete[] strings;
    185 }
    186 
    187 void StringList::add(const UnicodeString *string, UErrorCode &status)
    188 {
    189     if (U_FAILURE(status)) {
    190         return;
    191     }
    192     if (listSize >= listMax) {
    193         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
    194         UnicodeString *newStrings = new UnicodeString[newMax];
    195         if (newStrings == NULL) {
    196             status = U_MEMORY_ALLOCATION_ERROR;
    197             return;
    198         }
    199         for (int32_t i=0; i<listSize; ++i) {
    200             newStrings[i] = strings[i];
    201         }
    202         delete[] strings;
    203         strings = newStrings;
    204         listMax = newMax;
    205     }
    206 
    207     // The ctor initialized all the strings in
    208     // the array to empty strings, so this
    209     // is the same as copying the source string.
    210     strings[listSize++].append(*string);
    211 }
    212 
    213 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
    214 {
    215     const UnicodeString string(chars, count);
    216 
    217     add(&string, status);
    218 }
    219 
    220 const UnicodeString *StringList::get(int32_t index) const
    221 {
    222     if (index >= 0 && index < listSize) {
    223         return &strings[index];
    224     }
    225 
    226     return NULL;
    227 }
    228 
    229 int32_t StringList::size() const
    230 {
    231     return listSize;
    232 }
    233 
    234 
    235 U_CDECL_BEGIN
    236 static void U_CALLCONV
    237 deleteStringList(void *obj)
    238 {
    239     StringList *strings = (StringList *) obj;
    240 
    241     delete strings;
    242 }
    243 U_CDECL_END
    244 
    245 class CEToStringsMap
    246 {
    247 public:
    248     CEToStringsMap(UErrorCode &status);
    249     ~CEToStringsMap();
    250 
    251     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
    252     StringList *getStringList(uint32_t ce) const;
    253 
    254 private:
    255     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
    256     UHashtable *map;
    257 };
    258 
    259 CEToStringsMap::CEToStringsMap(UErrorCode &status)
    260     : map(NULL)
    261 {
    262     if (U_FAILURE(status)) {
    263         return;
    264     }
    265 
    266     map = uhash_open(uhash_hashLong, uhash_compareLong,
    267                      uhash_compareCaselessUnicodeString,
    268                      &status);
    269 
    270     if (U_FAILURE(status)) {
    271         return;
    272     }
    273 
    274     uhash_setValueDeleter(map, deleteStringList);
    275 }
    276 
    277 CEToStringsMap::~CEToStringsMap()
    278 {
    279     uhash_close(map);
    280 }
    281 
    282 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
    283 {
    284     StringList *strings = getStringList(ce);
    285 
    286     if (strings == NULL) {
    287         strings = new StringList(status);
    288 
    289         if (strings == NULL || U_FAILURE(status)) {
    290             status = U_MEMORY_ALLOCATION_ERROR;
    291             return;
    292         }
    293 
    294         putStringList(ce, strings, status);
    295     }
    296 
    297     strings->add(string, status);
    298 }
    299 
    300 StringList *CEToStringsMap::getStringList(uint32_t ce) const
    301 {
    302     return (StringList *) uhash_iget(map, ce);
    303 }
    304 
    305 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
    306 {
    307     uhash_iput(map, ce, (void *) stringList, &status);
    308 }
    309 
    310 #define CLONE_COLLATOR
    311 
    312 CollData::CollData(UCollator *collator, UErrorCode &status)
    313     : coll(NULL), ceToCharsStartingWith(NULL)
    314 {
    315     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    316     // i.e. other, control, private use, format, surrogate
    317     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    318     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    319     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
    320 
    321     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    322     // i.e. all the characers we handle implicitly
    323     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    324     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    325     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
    326 
    327     if (U_FAILURE(status)) {
    328         return;
    329     }
    330 
    331     USet *expansions   = uset_openEmpty();
    332     USet *contractions = uset_openEmpty();
    333     int32_t itemCount;
    334 
    335     ceToCharsStartingWith = new CEToStringsMap(status);
    336 
    337     if (U_FAILURE(status)) {
    338         goto bail;
    339     }
    340 
    341 #ifdef CLONE_COLLATOR
    342     coll = ucol_safeClone(collator, NULL, NULL, &status);
    343 
    344     if (U_FAILURE(status)) {
    345         goto bail;
    346     }
    347 #else
    348     coll = collator;
    349 #endif
    350 
    351     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
    352 
    353     uset_addAll(charsToTest, contractions);
    354     uset_addAll(charsToTest, expansions);
    355     uset_removeAll(charsToTest, charsToRemove);
    356 
    357     itemCount = uset_getItemCount(charsToTest);
    358     for(int32_t item = 0; item < itemCount; item += 1) {
    359         UChar32 start = 0, end = 0;
    360         UChar buffer[16];
    361         int32_t len = uset_getItem(charsToTest, item, &start, &end,
    362                                    buffer, 16, &status);
    363 
    364         if (len == 0) {
    365             for (UChar32 ch = start; ch <= end; ch += 1) {
    366                 UnicodeString *st = new UnicodeString(ch);
    367 
    368                 if (st == NULL) {
    369                     status = U_MEMORY_ALLOCATION_ERROR;
    370                     break;
    371                 }
    372 
    373                 CEList *ceList = new CEList(coll, *st, status);
    374 
    375                 ceToCharsStartingWith->put(ceList->get(0), st, status);
    376 
    377                 delete ceList;
    378                 delete st;
    379             }
    380         } else if (len > 0) {
    381             UnicodeString *st = new UnicodeString(buffer, len);
    382 
    383             if (st == NULL) {
    384                 status = U_MEMORY_ALLOCATION_ERROR;
    385                 break;
    386             }
    387 
    388             CEList *ceList = new CEList(coll, *st, status);
    389 
    390             ceToCharsStartingWith->put(ceList->get(0), st, status);
    391 
    392             delete ceList;
    393             delete st;
    394         } else {
    395             // shouldn't happen...
    396         }
    397 
    398         if (U_FAILURE(status)) {
    399              break;
    400         }
    401     }
    402 
    403 bail:
    404     uset_close(contractions);
    405     uset_close(expansions);
    406     uset_close(charsToRemove);
    407     uset_close(charsToTest);
    408 
    409     if (U_FAILURE(status)) {
    410         return;
    411     }
    412 
    413      UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
    414                             UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
    415      UChar  jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
    416      UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
    417      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
    418      CEList hanList(coll, hanString, status);
    419      CEList jamoList(coll, jamoString, status);
    420      int32_t j = 0;
    421 
    422      if (U_FAILURE(status)) {
    423          return;
    424      }
    425 
    426      for (int32_t c = 0; c < jamoList.size(); c += 1) {
    427          uint32_t jce = jamoList[c];
    428 
    429          if (! isContinuation(jce)) {
    430              jamoLimits[j++] = jce;
    431          }
    432      }
    433 
    434      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
    435 
    436      minHan = 0xFFFFFFFF;
    437      maxHan = 0;
    438 
    439      for(int32_t h = 0; h < hanList.size(); h += 2) {
    440          uint32_t han = (uint32_t) hanList[h];
    441 
    442          if (han < minHan) {
    443              minHan = han;
    444          }
    445 
    446          if (han > maxHan) {
    447              maxHan = han;
    448          }
    449      }
    450 
    451      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
    452 }
    453 
    454 CollData::~CollData()
    455 {
    456 #ifdef CLONE_COLLATOR
    457    ucol_close(coll);
    458 #endif
    459 
    460    delete ceToCharsStartingWith;
    461 }
    462 
    463 UCollator *CollData::getCollator() const
    464 {
    465     return coll;
    466 }
    467 
    468 const StringList *CollData::getStringList(int32_t ce) const
    469 {
    470     return ceToCharsStartingWith->getStringList(ce);
    471 }
    472 
    473 const CEList *CollData::getCEList(const UnicodeString *string) const
    474 {
    475     UErrorCode status = U_ZERO_ERROR;
    476     const CEList *list = new CEList(coll, *string, status);
    477 
    478     if (U_FAILURE(status)) {
    479         delete list;
    480         list = NULL;
    481     }
    482 
    483     return list;
    484 }
    485 
    486 void CollData::freeCEList(const CEList *list)
    487 {
    488     delete list;
    489 }
    490 
    491 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
    492 {
    493     // find out shortest string for the longest sequence of ces.
    494     // this can probably be folded with the minLengthCache...
    495 
    496     if (history[offset] >= 0) {
    497         return history[offset];
    498     }
    499 
    500     uint32_t ce = ceList->get(offset);
    501     int32_t maxOffset = ceList->size();
    502     int32_t shortestLength = INT32_MAX;
    503     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
    504 
    505     if (strings != NULL) {
    506         int32_t stringCount = strings->size();
    507 
    508         for (int32_t s = 0; s < stringCount; s += 1) {
    509             const UnicodeString *string = strings->get(s);
    510             UErrorCode status = U_ZERO_ERROR;
    511             const CEList *ceList2 = new CEList(coll, *string, status);
    512 
    513             if (U_FAILURE(status)) {
    514                 delete ceList2;
    515                 ceList2 = NULL;
    516             }
    517 
    518             if (ceList->matchesAt(offset, ceList2)) {
    519                 U_ASSERT(ceList2 != NULL);
    520                 int32_t clength = ceList2->size();
    521                 int32_t slength = string->length();
    522                 int32_t roffset = offset + clength;
    523                 int32_t rlength = 0;
    524 
    525                 if (roffset < maxOffset) {
    526                     rlength = minLengthInChars(ceList, roffset, history);
    527 
    528                     if (rlength <= 0) {
    529                     // delete before continue to avoid memory leak.
    530                         delete ceList2;
    531 
    532                         // ignore any dead ends
    533                         continue;
    534                     }
    535                 }
    536 
    537                 if (shortestLength > slength + rlength) {
    538                     shortestLength = slength + rlength;
    539                 }
    540             }
    541 
    542             delete ceList2;
    543         }
    544     }
    545 
    546     if (shortestLength == INT32_MAX) {
    547         // No matching strings at this offset. See if
    548         // the CE is in a range we can handle manually.
    549         if (ce >= minHan && ce < maxHan) {
    550             // all han have implicit orders which
    551             // generate two CEs.
    552             int32_t roffset = offset + 2;
    553             int32_t rlength = 0;
    554 
    555           //history[roffset++] = -1;
    556           //history[roffset++] = 1;
    557 
    558             if (roffset < maxOffset) {
    559                 rlength = minLengthInChars(ceList, roffset, history);
    560             }
    561 
    562             if (rlength < 0) {
    563                 return -1;
    564             }
    565 
    566             shortestLength = 1 + rlength;
    567             goto have_shortest;
    568         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
    569             int32_t roffset = offset;
    570             int32_t rlength = 0;
    571 
    572             // **** this loop may not handle archaic Hangul correctly ****
    573             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
    574                 uint32_t jce = ceList->get(roffset);
    575 
    576                 // Some Jamo have 24-bit primary order; skip the
    577                 // 2nd CE. This should always be OK because if
    578                 // we're still in the loop all we've seen are
    579                 // a series of Jamo in LVT order.
    580                 if (isContinuation(jce)) {
    581                     continue;
    582                 }
    583 
    584                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
    585                     break;
    586                 }
    587             }
    588 
    589             if (roffset == offset) {
    590                 // we started with a non-L Jamo...
    591                 // just say it comes from a single character
    592                 roffset += 1;
    593 
    594                 // See if the single Jamo has a 24-bit order.
    595                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
    596                     roffset += 1;
    597                 }
    598             }
    599 
    600             if (roffset < maxOffset) {
    601                 rlength = minLengthInChars(ceList, roffset, history);
    602             }
    603 
    604             if (rlength < 0) {
    605                 return -1;
    606             }
    607 
    608             shortestLength = 1 + rlength;
    609             goto have_shortest;
    610         }
    611 
    612         // Can't handle it manually either. Just move on.
    613         return -1;
    614     }
    615 
    616 have_shortest:
    617     history[offset] = shortestLength;
    618 
    619     return shortestLength;
    620 }
    621 
    622 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
    623 {
    624     int32_t clength = ceList->size();
    625     int32_t *history = NEW_ARRAY(int32_t, clength);
    626 
    627     for (int32_t i = 0; i < clength; i += 1) {
    628         history[i] = -1;
    629     }
    630 
    631     int32_t minLength = minLengthInChars(ceList, offset, history);
    632 
    633     DELETE_ARRAY(history);
    634 
    635     return minLength;
    636 }
    637 
    638 #endif // #if !UCONFIG_NO_COLLATION
    639