1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationcompare.cpp 7 * 8 * created on: 2012feb14 with new and old collation code 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/ucol.h" 17 #include "cmemory.h" 18 #include "collation.h" 19 #include "collationcompare.h" 20 #include "collationiterator.h" 21 #include "collationsettings.h" 22 #include "uassert.h" 23 24 U_NAMESPACE_BEGIN 25 26 UCollationResult 27 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right, 28 const CollationSettings &settings, 29 UErrorCode &errorCode) { 30 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 31 32 int32_t options = settings.options; 33 uint32_t variableTop; 34 if((options & CollationSettings::ALTERNATE_MASK) == 0) { 35 variableTop = 0; 36 } else { 37 // +1 so that we can use "<" and primary ignorables test out early. 38 variableTop = settings.variableTop + 1; 39 } 40 UBool anyVariable = FALSE; 41 42 // Fetch CEs, compare primaries, store secondary & tertiary weights. 43 for(;;) { 44 // We fetch CEs until we get a non-ignorable primary or reach the end. 45 uint32_t leftPrimary; 46 do { 47 int64_t ce = left.nextCE(errorCode); 48 leftPrimary = (uint32_t)(ce >> 32); 49 if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { 50 // Variable CE, shift it to quaternary level. 51 // Ignore all following primary ignorables, and shift further variable CEs. 52 anyVariable = TRUE; 53 do { 54 // Store only the primary of the variable CE. 55 left.setCurrentCE(ce & INT64_C(0xffffffff00000000)); 56 for(;;) { 57 ce = left.nextCE(errorCode); 58 leftPrimary = (uint32_t)(ce >> 32); 59 if(leftPrimary == 0) { 60 left.setCurrentCE(0); 61 } else { 62 break; 63 } 64 } 65 } while(leftPrimary < variableTop && 66 leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY); 67 } 68 } while(leftPrimary == 0); 69 70 uint32_t rightPrimary; 71 do { 72 int64_t ce = right.nextCE(errorCode); 73 rightPrimary = (uint32_t)(ce >> 32); 74 if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { 75 // Variable CE, shift it to quaternary level. 76 // Ignore all following primary ignorables, and shift further variable CEs. 77 anyVariable = TRUE; 78 do { 79 // Store only the primary of the variable CE. 80 right.setCurrentCE(ce & INT64_C(0xffffffff00000000)); 81 for(;;) { 82 ce = right.nextCE(errorCode); 83 rightPrimary = (uint32_t)(ce >> 32); 84 if(rightPrimary == 0) { 85 right.setCurrentCE(0); 86 } else { 87 break; 88 } 89 } 90 } while(rightPrimary < variableTop && 91 rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY); 92 } 93 } while(rightPrimary == 0); 94 95 if(leftPrimary != rightPrimary) { 96 // Return the primary difference, with script reordering. 97 if(settings.hasReordering()) { 98 leftPrimary = settings.reorder(leftPrimary); 99 rightPrimary = settings.reorder(rightPrimary); 100 } 101 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 102 } 103 if(leftPrimary == Collation::NO_CE_PRIMARY) { break; } 104 } 105 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 106 107 // Compare the buffered secondary & tertiary weights. 108 // We might skip the secondary level but continue with the case level 109 // which is turned on separately. 110 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 111 if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { 112 int32_t leftIndex = 0; 113 int32_t rightIndex = 0; 114 for(;;) { 115 uint32_t leftSecondary; 116 do { 117 leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16; 118 } while(leftSecondary == 0); 119 120 uint32_t rightSecondary; 121 do { 122 rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16; 123 } while(rightSecondary == 0); 124 125 if(leftSecondary != rightSecondary) { 126 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 127 } 128 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; } 129 } 130 } else { 131 // The backwards secondary level compares secondary weights backwards 132 // within segments separated by the merge separator (U+FFFE, weight 02). 133 int32_t leftStart = 0; 134 int32_t rightStart = 0; 135 for(;;) { 136 // Find the merge separator or the NO_CE terminator. 137 uint32_t p; 138 int32_t leftLimit = leftStart; 139 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) > 140 Collation::MERGE_SEPARATOR_PRIMARY || 141 p == 0) { 142 ++leftLimit; 143 } 144 int32_t rightLimit = rightStart; 145 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) > 146 Collation::MERGE_SEPARATOR_PRIMARY || 147 p == 0) { 148 ++rightLimit; 149 } 150 151 // Compare the segments. 152 int32_t leftIndex = leftLimit; 153 int32_t rightIndex = rightLimit; 154 for(;;) { 155 int32_t leftSecondary = 0; 156 while(leftSecondary == 0 && leftIndex > leftStart) { 157 leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16; 158 } 159 160 int32_t rightSecondary = 0; 161 while(rightSecondary == 0 && rightIndex > rightStart) { 162 rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16; 163 } 164 165 if(leftSecondary != rightSecondary) { 166 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 167 } 168 if(leftSecondary == 0) { break; } 169 } 170 171 // Did we reach the end of either string? 172 // Both strings have the same number of merge separators, 173 // or else there would have been a primary-level difference. 174 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit)); 175 if(p == Collation::NO_CE_PRIMARY) { break; } 176 // Skip both merge separators and continue. 177 leftStart = leftLimit + 1; 178 rightStart = rightLimit + 1; 179 } 180 } 181 } 182 183 if((options & CollationSettings::CASE_LEVEL) != 0) { 184 int32_t strength = CollationSettings::getStrength(options); 185 int32_t leftIndex = 0; 186 int32_t rightIndex = 0; 187 for(;;) { 188 uint32_t leftCase, leftLower32, rightCase; 189 if(strength == UCOL_PRIMARY) { 190 // Primary+caseLevel: Ignore case level weights of primary ignorables. 191 // Otherwise we would get a-umlaut > a 192 // which is not desirable for accent-insensitive sorting. 193 // Check for (lower 32 bits) == 0 as well because variable CEs are stored 194 // with only primary weights. 195 int64_t ce; 196 do { 197 ce = left.getCE(leftIndex++); 198 leftCase = (uint32_t)ce; 199 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0); 200 leftLower32 = leftCase; 201 leftCase &= 0xc000; 202 203 do { 204 ce = right.getCE(rightIndex++); 205 rightCase = (uint32_t)ce; 206 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0); 207 rightCase &= 0xc000; 208 } else { 209 // Secondary+caseLevel: By analogy with the above, 210 // ignore case level weights of secondary ignorables. 211 // 212 // Note: A tertiary CE has uppercase case bits (0.0.ut) 213 // to keep tertiary+caseFirst well-formed. 214 // 215 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables. 216 // Otherwise a tertiary CE's uppercase would be no greater than 217 // a primary/secondary CE's uppercase. 218 // (See UCA well-formedness condition 2.) 219 // We could construct a special case weight higher than uppercase, 220 // but it's simpler to always ignore case weights of secondary ignorables, 221 // turning 0.0.ut into 0.0.0.t. 222 // (See LDML Collation, Case Parameters.) 223 do { 224 leftCase = (uint32_t)left.getCE(leftIndex++); 225 } while(leftCase <= 0xffff); 226 leftLower32 = leftCase; 227 leftCase &= 0xc000; 228 229 do { 230 rightCase = (uint32_t)right.getCE(rightIndex++); 231 } while(rightCase <= 0xffff); 232 rightCase &= 0xc000; 233 } 234 235 // No need to handle NO_CE and MERGE_SEPARATOR specially: 236 // There is one case weight for each previous-level weight, 237 // so level length differences were handled there. 238 if(leftCase != rightCase) { 239 if((options & CollationSettings::UPPER_FIRST) == 0) { 240 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 241 } else { 242 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 243 } 244 } 245 if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; } 246 } 247 } 248 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 249 250 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); 251 252 int32_t leftIndex = 0; 253 int32_t rightIndex = 0; 254 uint32_t anyQuaternaries = 0; 255 for(;;) { 256 uint32_t leftLower32, leftTertiary; 257 do { 258 leftLower32 = (uint32_t)left.getCE(leftIndex++); 259 anyQuaternaries |= leftLower32; 260 U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || 261 (leftLower32 & 0xc0c0) == 0); 262 leftTertiary = leftLower32 & tertiaryMask; 263 } while(leftTertiary == 0); 264 265 uint32_t rightLower32, rightTertiary; 266 do { 267 rightLower32 = (uint32_t)right.getCE(rightIndex++); 268 anyQuaternaries |= rightLower32; 269 U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || 270 (rightLower32 & 0xc0c0) == 0); 271 rightTertiary = rightLower32 & tertiaryMask; 272 } while(rightTertiary == 0); 273 274 if(leftTertiary != rightTertiary) { 275 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 276 // Pass through NO_CE and keep real tertiary weights larger than that. 277 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), 278 // to keep tertiary CEs well-formed. 279 // Their case+tertiary weights must be greater than those of 280 // primary and secondary CEs. 281 if(leftTertiary > Collation::NO_CE_WEIGHT16) { 282 if(leftLower32 > 0xffff) { 283 leftTertiary ^= 0xc000; 284 } else { 285 leftTertiary += 0x4000; 286 } 287 } 288 if(rightTertiary > Collation::NO_CE_WEIGHT16) { 289 if(rightLower32 > 0xffff) { 290 rightTertiary ^= 0xc000; 291 } else { 292 rightTertiary += 0x4000; 293 } 294 } 295 } 296 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 297 } 298 if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; } 299 } 300 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 301 302 if(!anyVariable && (anyQuaternaries & 0xc0) == 0) { 303 // If there are no "variable" CEs and no non-zero quaternary weights, 304 // then there are no quaternary differences. 305 return UCOL_EQUAL; 306 } 307 308 leftIndex = 0; 309 rightIndex = 0; 310 for(;;) { 311 uint32_t leftQuaternary; 312 do { 313 int64_t ce = left.getCE(leftIndex++); 314 leftQuaternary = (uint32_t)ce & 0xffff; 315 if(leftQuaternary <= Collation::NO_CE_WEIGHT16) { 316 // Variable primary or completely ignorable or NO_CE. 317 leftQuaternary = (uint32_t)(ce >> 32); 318 } else { 319 // Regular CE, not tertiary ignorable. 320 // Preserve the quaternary weight in bits 7..6. 321 leftQuaternary |= 0xffffff3f; 322 } 323 } while(leftQuaternary == 0); 324 325 uint32_t rightQuaternary; 326 do { 327 int64_t ce = right.getCE(rightIndex++); 328 rightQuaternary = (uint32_t)ce & 0xffff; 329 if(rightQuaternary <= Collation::NO_CE_WEIGHT16) { 330 // Variable primary or completely ignorable or NO_CE. 331 rightQuaternary = (uint32_t)(ce >> 32); 332 } else { 333 // Regular CE, not tertiary ignorable. 334 // Preserve the quaternary weight in bits 7..6. 335 rightQuaternary |= 0xffffff3f; 336 } 337 } while(rightQuaternary == 0); 338 339 if(leftQuaternary != rightQuaternary) { 340 // Return the difference, with script reordering. 341 if(settings.hasReordering()) { 342 leftQuaternary = settings.reorder(leftQuaternary); 343 rightQuaternary = settings.reorder(rightQuaternary); 344 } 345 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 346 } 347 if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; } 348 } 349 return UCOL_EQUAL; 350 } 351 352 U_NAMESPACE_END 353 354 #endif // !UCONFIG_NO_COLLATION 355