1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationcompare.cpp 7 * 8 * created on: 2012feb14 with new and old collation code 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/ucol.h" 17 #include "cmemory.h" 18 #include "collation.h" 19 #include "collationcompare.h" 20 #include "collationiterator.h" 21 #include "collationsettings.h" 22 #include "uassert.h" 23 24 U_NAMESPACE_BEGIN 25 26 UCollationResult 27 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right, 28 const CollationSettings &settings, 29 UErrorCode &errorCode) { 30 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 31 32 int32_t options = settings.options; 33 uint32_t variableTop; 34 if((options & CollationSettings::ALTERNATE_MASK) == 0) { 35 variableTop = 0; 36 } else { 37 // +1 so that we can use "<" and primary ignorables test out early. 38 variableTop = settings.variableTop + 1; 39 } 40 UBool anyVariable = FALSE; 41 42 // Fetch CEs, compare primaries, store secondary & tertiary weights. 43 U_ALIGN_CODE(16); 44 for(;;) { 45 // We fetch CEs until we get a non-ignorable primary or reach the end. 46 uint32_t leftPrimary; 47 do { 48 int64_t ce = left.nextCE(errorCode); 49 leftPrimary = (uint32_t)(ce >> 32); 50 if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { 51 // Variable CE, shift it to quaternary level. 52 // Ignore all following primary ignorables, and shift further variable CEs. 53 anyVariable = TRUE; 54 do { 55 // Store only the primary of the variable CE. 56 left.setCurrentCE(ce & INT64_C(0xffffffff00000000)); 57 for(;;) { 58 ce = left.nextCE(errorCode); 59 leftPrimary = (uint32_t)(ce >> 32); 60 if(leftPrimary == 0) { 61 left.setCurrentCE(0); 62 } else { 63 break; 64 } 65 } 66 } while(leftPrimary < variableTop && 67 leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY); 68 } 69 } while(leftPrimary == 0); 70 71 uint32_t rightPrimary; 72 do { 73 int64_t ce = right.nextCE(errorCode); 74 rightPrimary = (uint32_t)(ce >> 32); 75 if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { 76 // Variable CE, shift it to quaternary level. 77 // Ignore all following primary ignorables, and shift further variable CEs. 78 anyVariable = TRUE; 79 do { 80 // Store only the primary of the variable CE. 81 right.setCurrentCE(ce & INT64_C(0xffffffff00000000)); 82 for(;;) { 83 ce = right.nextCE(errorCode); 84 rightPrimary = (uint32_t)(ce >> 32); 85 if(rightPrimary == 0) { 86 right.setCurrentCE(0); 87 } else { 88 break; 89 } 90 } 91 } while(rightPrimary < variableTop && 92 rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY); 93 } 94 } while(rightPrimary == 0); 95 96 if(leftPrimary != rightPrimary) { 97 // Return the primary difference, with script reordering. 98 if(settings.hasReordering()) { 99 leftPrimary = settings.reorder(leftPrimary); 100 rightPrimary = settings.reorder(rightPrimary); 101 } 102 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 103 } 104 if(leftPrimary == Collation::NO_CE_PRIMARY) { break; } 105 } 106 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 107 108 // Compare the buffered secondary & tertiary weights. 109 // We might skip the secondary level but continue with the case level 110 // which is turned on separately. 111 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 112 if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { 113 int32_t leftIndex = 0; 114 int32_t rightIndex = 0; 115 for(;;) { 116 uint32_t leftSecondary; 117 do { 118 leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16; 119 } while(leftSecondary == 0); 120 121 uint32_t rightSecondary; 122 do { 123 rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16; 124 } while(rightSecondary == 0); 125 126 if(leftSecondary != rightSecondary) { 127 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 128 } 129 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; } 130 } 131 } else { 132 // The backwards secondary level compares secondary weights backwards 133 // within segments separated by the merge separator (U+FFFE, weight 02). 134 int32_t leftStart = 0; 135 int32_t rightStart = 0; 136 for(;;) { 137 // Find the merge separator or the NO_CE terminator. 138 uint32_t p; 139 int32_t leftLimit = leftStart; 140 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) > 141 Collation::MERGE_SEPARATOR_PRIMARY || 142 p == 0) { 143 ++leftLimit; 144 } 145 int32_t rightLimit = rightStart; 146 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) > 147 Collation::MERGE_SEPARATOR_PRIMARY || 148 p == 0) { 149 ++rightLimit; 150 } 151 152 // Compare the segments. 153 int32_t leftIndex = leftLimit; 154 int32_t rightIndex = rightLimit; 155 for(;;) { 156 int32_t leftSecondary = 0; 157 while(leftSecondary == 0 && leftIndex > leftStart) { 158 leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16; 159 } 160 161 int32_t rightSecondary = 0; 162 while(rightSecondary == 0 && rightIndex > rightStart) { 163 rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16; 164 } 165 166 if(leftSecondary != rightSecondary) { 167 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 168 } 169 if(leftSecondary == 0) { break; } 170 } 171 172 // Did we reach the end of either string? 173 // Both strings have the same number of merge separators, 174 // or else there would have been a primary-level difference. 175 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit)); 176 if(p == Collation::NO_CE_PRIMARY) { break; } 177 // Skip both merge separators and continue. 178 leftStart = leftLimit + 1; 179 rightStart = rightLimit + 1; 180 } 181 } 182 } 183 184 if((options & CollationSettings::CASE_LEVEL) != 0) { 185 int32_t strength = CollationSettings::getStrength(options); 186 int32_t leftIndex = 0; 187 int32_t rightIndex = 0; 188 for(;;) { 189 uint32_t leftCase, leftLower32, rightCase; 190 if(strength == UCOL_PRIMARY) { 191 // Primary+caseLevel: Ignore case level weights of primary ignorables. 192 // Otherwise we would get a-umlaut > a 193 // which is not desirable for accent-insensitive sorting. 194 // Check for (lower 32 bits) == 0 as well because variable CEs are stored 195 // with only primary weights. 196 int64_t ce; 197 do { 198 ce = left.getCE(leftIndex++); 199 leftCase = (uint32_t)ce; 200 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0); 201 leftLower32 = leftCase; 202 leftCase &= 0xc000; 203 204 do { 205 ce = right.getCE(rightIndex++); 206 rightCase = (uint32_t)ce; 207 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0); 208 rightCase &= 0xc000; 209 } else { 210 // Secondary+caseLevel: By analogy with the above, 211 // ignore case level weights of secondary ignorables. 212 // 213 // Note: A tertiary CE has uppercase case bits (0.0.ut) 214 // to keep tertiary+caseFirst well-formed. 215 // 216 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables. 217 // Otherwise a tertiary CE's uppercase would be no greater than 218 // a primary/secondary CE's uppercase. 219 // (See UCA well-formedness condition 2.) 220 // We could construct a special case weight higher than uppercase, 221 // but it's simpler to always ignore case weights of secondary ignorables, 222 // turning 0.0.ut into 0.0.0.t. 223 // (See LDML Collation, Case Parameters.) 224 do { 225 leftCase = (uint32_t)left.getCE(leftIndex++); 226 } while(leftCase <= 0xffff); 227 leftLower32 = leftCase; 228 leftCase &= 0xc000; 229 230 do { 231 rightCase = (uint32_t)right.getCE(rightIndex++); 232 } while(rightCase <= 0xffff); 233 rightCase &= 0xc000; 234 } 235 236 // No need to handle NO_CE and MERGE_SEPARATOR specially: 237 // There is one case weight for each previous-level weight, 238 // so level length differences were handled there. 239 if(leftCase != rightCase) { 240 if((options & CollationSettings::UPPER_FIRST) == 0) { 241 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 242 } else { 243 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 244 } 245 } 246 if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; } 247 } 248 } 249 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 250 251 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); 252 253 int32_t leftIndex = 0; 254 int32_t rightIndex = 0; 255 uint32_t anyQuaternaries = 0; 256 for(;;) { 257 uint32_t leftLower32, leftTertiary; 258 do { 259 leftLower32 = (uint32_t)left.getCE(leftIndex++); 260 anyQuaternaries |= leftLower32; 261 U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || 262 (leftLower32 & 0xc0c0) == 0); 263 leftTertiary = leftLower32 & tertiaryMask; 264 } while(leftTertiary == 0); 265 266 uint32_t rightLower32, rightTertiary; 267 do { 268 rightLower32 = (uint32_t)right.getCE(rightIndex++); 269 anyQuaternaries |= rightLower32; 270 U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || 271 (rightLower32 & 0xc0c0) == 0); 272 rightTertiary = rightLower32 & tertiaryMask; 273 } while(rightTertiary == 0); 274 275 if(leftTertiary != rightTertiary) { 276 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 277 // Pass through NO_CE and keep real tertiary weights larger than that. 278 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), 279 // to keep tertiary CEs well-formed. 280 // Their case+tertiary weights must be greater than those of 281 // primary and secondary CEs. 282 if(leftTertiary > Collation::NO_CE_WEIGHT16) { 283 if(leftLower32 > 0xffff) { 284 leftTertiary ^= 0xc000; 285 } else { 286 leftTertiary += 0x4000; 287 } 288 } 289 if(rightTertiary > Collation::NO_CE_WEIGHT16) { 290 if(rightLower32 > 0xffff) { 291 rightTertiary ^= 0xc000; 292 } else { 293 rightTertiary += 0x4000; 294 } 295 } 296 } 297 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 298 } 299 if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; } 300 } 301 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 302 303 if(!anyVariable && (anyQuaternaries & 0xc0) == 0) { 304 // If there are no "variable" CEs and no non-zero quaternary weights, 305 // then there are no quaternary differences. 306 return UCOL_EQUAL; 307 } 308 309 leftIndex = 0; 310 rightIndex = 0; 311 for(;;) { 312 uint32_t leftQuaternary; 313 do { 314 int64_t ce = left.getCE(leftIndex++); 315 leftQuaternary = (uint32_t)ce & 0xffff; 316 if(leftQuaternary <= Collation::NO_CE_WEIGHT16) { 317 // Variable primary or completely ignorable or NO_CE. 318 leftQuaternary = (uint32_t)(ce >> 32); 319 } else { 320 // Regular CE, not tertiary ignorable. 321 // Preserve the quaternary weight in bits 7..6. 322 leftQuaternary |= 0xffffff3f; 323 } 324 } while(leftQuaternary == 0); 325 326 uint32_t rightQuaternary; 327 do { 328 int64_t ce = right.getCE(rightIndex++); 329 rightQuaternary = (uint32_t)ce & 0xffff; 330 if(rightQuaternary <= Collation::NO_CE_WEIGHT16) { 331 // Variable primary or completely ignorable or NO_CE. 332 rightQuaternary = (uint32_t)(ce >> 32); 333 } else { 334 // Regular CE, not tertiary ignorable. 335 // Preserve the quaternary weight in bits 7..6. 336 rightQuaternary |= 0xffffff3f; 337 } 338 } while(rightQuaternary == 0); 339 340 if(leftQuaternary != rightQuaternary) { 341 // Return the difference, with script reordering. 342 if(settings.hasReordering()) { 343 leftQuaternary = settings.reorder(leftQuaternary); 344 rightQuaternary = settings.reorder(rightQuaternary); 345 } 346 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 347 } 348 if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; } 349 } 350 return UCOL_EQUAL; 351 } 352 353 U_NAMESPACE_END 354 355 #endif // !UCONFIG_NO_COLLATION 356