1 /* 2 ******************************************************************************* 3 * Copyright (C) 2012-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationdata.cpp 7 * 8 * created on: 2012jul28 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/ucol.h" 17 #include "unicode/udata.h" 18 #include "unicode/uscript.h" 19 #include "cmemory.h" 20 #include "collation.h" 21 #include "collationdata.h" 22 #include "uassert.h" 23 #include "utrie2.h" 24 25 U_NAMESPACE_BEGIN 26 27 uint32_t 28 CollationData::getIndirectCE32(uint32_t ce32) const { 29 U_ASSERT(Collation::isSpecialCE32(ce32)); 30 int32_t tag = Collation::tagFromCE32(ce32); 31 if(tag == Collation::DIGIT_TAG) { 32 // Fetch the non-numeric-collation CE32. 33 ce32 = ce32s[Collation::indexFromCE32(ce32)]; 34 } else if(tag == Collation::LEAD_SURROGATE_TAG) { 35 ce32 = Collation::UNASSIGNED_CE32; 36 } else if(tag == Collation::U0000_TAG) { 37 // Fetch the normal ce32 for U+0000. 38 ce32 = ce32s[0]; 39 } 40 return ce32; 41 } 42 43 uint32_t 44 CollationData::getFinalCE32(uint32_t ce32) const { 45 if(Collation::isSpecialCE32(ce32)) { 46 ce32 = getIndirectCE32(ce32); 47 } 48 return ce32; 49 } 50 51 uint32_t 52 CollationData::getFirstPrimaryForGroup(int32_t script) const { 53 int32_t index = findScript(script); 54 if(index < 0) { 55 return 0; 56 } 57 uint32_t head = scripts[index]; 58 return (head & 0xff00) << 16; 59 } 60 61 uint32_t 62 CollationData::getLastPrimaryForGroup(int32_t script) const { 63 int32_t index = findScript(script); 64 if(index < 0) { 65 return 0; 66 } 67 uint32_t head = scripts[index]; 68 uint32_t lastByte = head & 0xff; 69 return ((lastByte + 1) << 24) - 1; 70 } 71 72 int32_t 73 CollationData::getGroupForPrimary(uint32_t p) const { 74 p >>= 24; // Reordering groups are distinguished by primary lead bytes. 75 for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) { 76 uint32_t lastByte = scripts[i] & 0xff; 77 if(p <= lastByte) { 78 return scripts[i + 2]; 79 } 80 } 81 return -1; 82 } 83 84 int32_t 85 CollationData::findScript(int32_t script) const { 86 if(script < 0 || 0xffff < script) { return -1; } 87 for(int32_t i = 0; i < scriptsLength;) { 88 int32_t limit = i + 2 + scripts[i + 1]; 89 for(int32_t j = i + 2; j < limit; ++j) { 90 if(script == scripts[j]) { return i; } 91 } 92 i = limit; 93 } 94 return -1; 95 } 96 97 int32_t 98 CollationData::getEquivalentScripts(int32_t script, 99 int32_t dest[], int32_t capacity, 100 UErrorCode &errorCode) const { 101 if(U_FAILURE(errorCode)) { return 0; } 102 int32_t i = findScript(script); 103 if(i < 0) { return 0; } 104 int32_t length = scripts[i + 1]; 105 U_ASSERT(length != 0); 106 if(length > capacity) { 107 errorCode = U_BUFFER_OVERFLOW_ERROR; 108 return length; 109 } 110 i += 2; 111 dest[0] = scripts[i++]; 112 for(int32_t j = 1; j < length; ++j) { 113 script = scripts[i++]; 114 // Sorted insertion. 115 for(int32_t k = j;; --k) { 116 // Invariant: dest[k] is free to receive either script or dest[k - 1]. 117 if(k > 0 && script < dest[k - 1]) { 118 dest[k] = dest[k - 1]; 119 } else { 120 dest[k] = script; 121 break; 122 } 123 } 124 } 125 return length; 126 } 127 128 void 129 CollationData::makeReorderTable(const int32_t *reorder, int32_t length, 130 uint8_t table[256], UErrorCode &errorCode) const { 131 if(U_FAILURE(errorCode)) { return; } 132 133 // Initialize the table. 134 // Never reorder special low and high primary lead bytes. 135 int32_t lowByte; 136 for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) { 137 table[lowByte] = lowByte; 138 } 139 // lowByte == 03 140 141 int32_t highByte; 142 for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) { 143 table[highByte] = highByte; 144 } 145 // highByte == FE 146 147 // Set intermediate bytes to 0 to indicate that they have not been set yet. 148 for(int32_t i = lowByte; i <= highByte; ++i) { 149 table[i] = 0; 150 } 151 152 // Get the set of special reorder codes in the input list. 153 // This supports up to 32 special reorder codes; 154 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. 155 uint32_t specials = 0; 156 for(int32_t i = 0; i < length; ++i) { 157 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; 158 if(0 <= reorderCode && reorderCode <= 31) { 159 specials |= (uint32_t)1 << reorderCode; 160 } 161 } 162 163 // Start the reordering with the special low reorder codes that do not occur in the input. 164 for(int32_t i = 0;; i += 3) { 165 if(scripts[i + 1] != 1) { break; } // Went beyond special single-code reorder codes. 166 int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST; 167 if(reorderCode < 0) { break; } // Went beyond special reorder codes. 168 if((specials & ((uint32_t)1 << reorderCode)) == 0) { 169 int32_t head = scripts[i]; 170 int32_t firstByte = head >> 8; 171 int32_t lastByte = head & 0xff; 172 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); 173 } 174 } 175 176 // Reorder according to the input scripts, continuing from the bottom of the bytes range. 177 for(int32_t i = 0; i < length;) { 178 int32_t script = reorder[i++]; 179 if(script == USCRIPT_UNKNOWN) { 180 // Put the remaining scripts at the top. 181 while(i < length) { 182 script = reorder[--length]; 183 if(script == USCRIPT_UNKNOWN || // Must occur at most once. 184 script == UCOL_REORDER_CODE_DEFAULT) { 185 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 186 return; 187 } 188 int32_t index = findScript(script); 189 if(index < 0) { continue; } 190 int32_t head = scripts[index]; 191 int32_t firstByte = head >> 8; 192 int32_t lastByte = head & 0xff; 193 if(table[firstByte] != 0) { // Duplicate or equivalent script. 194 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 195 return; 196 } 197 do { table[lastByte--] = highByte--; } while(firstByte <= lastByte); 198 } 199 break; 200 } 201 if(script == UCOL_REORDER_CODE_DEFAULT) { 202 // The default code must be the only one in the list, and that is handled by the caller. 203 // Otherwise it must not be used. 204 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 205 return; 206 } 207 int32_t index = findScript(script); 208 if(index < 0) { continue; } 209 int32_t head = scripts[index]; 210 int32_t firstByte = head >> 8; 211 int32_t lastByte = head & 0xff; 212 if(table[firstByte] != 0) { // Duplicate or equivalent script. 213 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 214 return; 215 } 216 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); 217 } 218 219 // Put all remaining scripts into the middle. 220 // Avoid table[0] which must remain 0. 221 for(int32_t i = 1; i <= 0xff; ++i) { 222 if(table[i] == 0) { table[i] = lowByte++; } 223 } 224 U_ASSERT(lowByte == highByte + 1); 225 } 226 227 U_NAMESPACE_END 228 229 #endif // !UCONFIG_NO_COLLATION 230