Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2012-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * collationdata.cpp
      7 *
      8 * created on: 2012jul28
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_COLLATION
     15 
     16 #include "unicode/ucol.h"
     17 #include "unicode/udata.h"
     18 #include "unicode/uscript.h"
     19 #include "cmemory.h"
     20 #include "collation.h"
     21 #include "collationdata.h"
     22 #include "uassert.h"
     23 #include "utrie2.h"
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 uint32_t
     28 CollationData::getIndirectCE32(uint32_t ce32) const {
     29     U_ASSERT(Collation::isSpecialCE32(ce32));
     30     int32_t tag = Collation::tagFromCE32(ce32);
     31     if(tag == Collation::DIGIT_TAG) {
     32         // Fetch the non-numeric-collation CE32.
     33         ce32 = ce32s[Collation::indexFromCE32(ce32)];
     34     } else if(tag == Collation::LEAD_SURROGATE_TAG) {
     35         ce32 = Collation::UNASSIGNED_CE32;
     36     } else if(tag == Collation::U0000_TAG) {
     37         // Fetch the normal ce32 for U+0000.
     38         ce32 = ce32s[0];
     39     }
     40     return ce32;
     41 }
     42 
     43 uint32_t
     44 CollationData::getFinalCE32(uint32_t ce32) const {
     45     if(Collation::isSpecialCE32(ce32)) {
     46         ce32 = getIndirectCE32(ce32);
     47     }
     48     return ce32;
     49 }
     50 
     51 uint32_t
     52 CollationData::getFirstPrimaryForGroup(int32_t script) const {
     53     int32_t index = findScript(script);
     54     if(index < 0) {
     55         return 0;
     56     }
     57     uint32_t head = scripts[index];
     58     return (head & 0xff00) << 16;
     59 }
     60 
     61 uint32_t
     62 CollationData::getLastPrimaryForGroup(int32_t script) const {
     63     int32_t index = findScript(script);
     64     if(index < 0) {
     65         return 0;
     66     }
     67     uint32_t head = scripts[index];
     68     uint32_t lastByte = head & 0xff;
     69     return ((lastByte + 1) << 24) - 1;
     70 }
     71 
     72 int32_t
     73 CollationData::getGroupForPrimary(uint32_t p) const {
     74     p >>= 24;  // Reordering groups are distinguished by primary lead bytes.
     75     for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) {
     76         uint32_t lastByte = scripts[i] & 0xff;
     77         if(p <= lastByte) {
     78             return scripts[i + 2];
     79         }
     80     }
     81     return -1;
     82 }
     83 
     84 int32_t
     85 CollationData::findScript(int32_t script) const {
     86     if(script < 0 || 0xffff < script) { return -1; }
     87     for(int32_t i = 0; i < scriptsLength;) {
     88         int32_t limit = i + 2 + scripts[i + 1];
     89         for(int32_t j = i + 2; j < limit; ++j) {
     90             if(script == scripts[j]) { return i; }
     91         }
     92         i = limit;
     93     }
     94     return -1;
     95 }
     96 
     97 int32_t
     98 CollationData::getEquivalentScripts(int32_t script,
     99                                     int32_t dest[], int32_t capacity,
    100                                     UErrorCode &errorCode) const {
    101     if(U_FAILURE(errorCode)) { return 0; }
    102     int32_t i = findScript(script);
    103     if(i < 0) { return 0; }
    104     int32_t length = scripts[i + 1];
    105     U_ASSERT(length != 0);
    106     if(length > capacity) {
    107         errorCode = U_BUFFER_OVERFLOW_ERROR;
    108         return length;
    109     }
    110     i += 2;
    111     dest[0] = scripts[i++];
    112     for(int32_t j = 1; j < length; ++j) {
    113         script = scripts[i++];
    114         // Sorted insertion.
    115         for(int32_t k = j;; --k) {
    116             // Invariant: dest[k] is free to receive either script or dest[k - 1].
    117             if(k > 0 && script < dest[k - 1]) {
    118                 dest[k] = dest[k - 1];
    119             } else {
    120                 dest[k] = script;
    121                 break;
    122             }
    123         }
    124     }
    125     return length;
    126 }
    127 
    128 void
    129 CollationData::makeReorderTable(const int32_t *reorder, int32_t length,
    130                                 uint8_t table[256], UErrorCode &errorCode) const {
    131     if(U_FAILURE(errorCode)) { return; }
    132 
    133     // Initialize the table.
    134     // Never reorder special low and high primary lead bytes.
    135     int32_t lowByte;
    136     for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) {
    137         table[lowByte] = lowByte;
    138     }
    139     // lowByte == 03
    140 
    141     int32_t highByte;
    142     for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) {
    143         table[highByte] = highByte;
    144     }
    145     // highByte == FE
    146 
    147     // Set intermediate bytes to 0 to indicate that they have not been set yet.
    148     for(int32_t i = lowByte; i <= highByte; ++i) {
    149         table[i] = 0;
    150     }
    151 
    152     // Get the set of special reorder codes in the input list.
    153     // This supports up to 32 special reorder codes;
    154     // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
    155     uint32_t specials = 0;
    156     for(int32_t i = 0; i < length; ++i) {
    157         int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
    158         if(0 <= reorderCode && reorderCode <= 31) {
    159             specials |= (uint32_t)1 << reorderCode;
    160         }
    161     }
    162 
    163     // Start the reordering with the special low reorder codes that do not occur in the input.
    164     for(int32_t i = 0;; i += 3) {
    165         if(scripts[i + 1] != 1) { break; }  // Went beyond special single-code reorder codes.
    166         int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST;
    167         if(reorderCode < 0) { break; }  // Went beyond special reorder codes.
    168         if((specials & ((uint32_t)1 << reorderCode)) == 0) {
    169             int32_t head = scripts[i];
    170             int32_t firstByte = head >> 8;
    171             int32_t lastByte = head & 0xff;
    172             do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
    173         }
    174     }
    175 
    176     // Reorder according to the input scripts, continuing from the bottom of the bytes range.
    177     for(int32_t i = 0; i < length;) {
    178         int32_t script = reorder[i++];
    179         if(script == USCRIPT_UNKNOWN) {
    180             // Put the remaining scripts at the top.
    181             while(i < length) {
    182                 script = reorder[--length];
    183                 if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
    184                         script == UCOL_REORDER_CODE_DEFAULT) {
    185                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    186                     return;
    187                 }
    188                 int32_t index = findScript(script);
    189                 if(index < 0) { continue; }
    190                 int32_t head = scripts[index];
    191                 int32_t firstByte = head >> 8;
    192                 int32_t lastByte = head & 0xff;
    193                 if(table[firstByte] != 0) {  // Duplicate or equivalent script.
    194                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    195                     return;
    196                 }
    197                 do { table[lastByte--] = highByte--; } while(firstByte <= lastByte);
    198             }
    199             break;
    200         }
    201         if(script == UCOL_REORDER_CODE_DEFAULT) {
    202             // The default code must be the only one in the list, and that is handled by the caller.
    203             // Otherwise it must not be used.
    204             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    205             return;
    206         }
    207         int32_t index = findScript(script);
    208         if(index < 0) { continue; }
    209         int32_t head = scripts[index];
    210         int32_t firstByte = head >> 8;
    211         int32_t lastByte = head & 0xff;
    212         if(table[firstByte] != 0) {  // Duplicate or equivalent script.
    213             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    214             return;
    215         }
    216         do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
    217     }
    218 
    219     // Put all remaining scripts into the middle.
    220     // Avoid table[0] which must remain 0.
    221     for(int32_t i = 1; i <= 0xff; ++i) {
    222         if(table[i] == 0) { table[i] = lowByte++; }
    223     }
    224     U_ASSERT(lowByte == highByte + 1);
    225 }
    226 
    227 U_NAMESPACE_END
    228 
    229 #endif  // !UCONFIG_NO_COLLATION
    230