1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 2001-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ****************************************************************************** 8 * 9 * File ucoleitr.cpp 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 02/15/2001 synwee Modified all methods to process its own function 15 * instead of calling the equivalent c++ api (coleitr.h) 16 * 2012-2014 markus Rewritten in C++ again. 17 ******************************************************************************/ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_COLLATION 22 23 #include "unicode/coleitr.h" 24 #include "unicode/tblcoll.h" 25 #include "unicode/ucoleitr.h" 26 #include "unicode/ustring.h" 27 #include "unicode/sortkey.h" 28 #include "unicode/uobject.h" 29 #include "cmemory.h" 30 #include "usrchimp.h" 31 32 U_NAMESPACE_USE 33 34 #define BUFFER_LENGTH 100 35 36 #define DEFAULT_BUFFER_SIZE 16 37 #define BUFFER_GROW 8 38 39 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0]) 40 41 #define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type)) 42 43 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 44 45 struct RCEI 46 { 47 uint32_t ce; 48 int32_t low; 49 int32_t high; 50 }; 51 52 U_NAMESPACE_BEGIN 53 54 struct RCEBuffer 55 { 56 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; 57 RCEI *buffer; 58 int32_t bufferIndex; 59 int32_t bufferSize; 60 61 RCEBuffer(); 62 ~RCEBuffer(); 63 64 UBool isEmpty() const; 65 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); 66 const RCEI *get(); 67 }; 68 69 RCEBuffer::RCEBuffer() 70 { 71 buffer = defaultBuffer; 72 bufferIndex = 0; 73 bufferSize = UPRV_LENGTHOF(defaultBuffer); 74 } 75 76 RCEBuffer::~RCEBuffer() 77 { 78 if (buffer != defaultBuffer) { 79 DELETE_ARRAY(buffer); 80 } 81 } 82 83 UBool RCEBuffer::isEmpty() const 84 { 85 return bufferIndex <= 0; 86 } 87 88 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode) 89 { 90 if (U_FAILURE(errorCode)) { 91 return; 92 } 93 if (bufferIndex >= bufferSize) { 94 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); 95 if (newBuffer == NULL) { 96 errorCode = U_MEMORY_ALLOCATION_ERROR; 97 return; 98 } 99 100 ARRAY_COPY(newBuffer, buffer, bufferSize); 101 102 if (buffer != defaultBuffer) { 103 DELETE_ARRAY(buffer); 104 } 105 106 buffer = newBuffer; 107 bufferSize += BUFFER_GROW; 108 } 109 110 buffer[bufferIndex].ce = ce; 111 buffer[bufferIndex].low = ixLow; 112 buffer[bufferIndex].high = ixHigh; 113 114 bufferIndex += 1; 115 } 116 117 const RCEI *RCEBuffer::get() 118 { 119 if (bufferIndex > 0) { 120 return &buffer[--bufferIndex]; 121 } 122 123 return NULL; 124 } 125 126 PCEBuffer::PCEBuffer() 127 { 128 buffer = defaultBuffer; 129 bufferIndex = 0; 130 bufferSize = UPRV_LENGTHOF(defaultBuffer); 131 } 132 133 PCEBuffer::~PCEBuffer() 134 { 135 if (buffer != defaultBuffer) { 136 DELETE_ARRAY(buffer); 137 } 138 } 139 140 void PCEBuffer::reset() 141 { 142 bufferIndex = 0; 143 } 144 145 UBool PCEBuffer::isEmpty() const 146 { 147 return bufferIndex <= 0; 148 } 149 150 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode) 151 { 152 if (U_FAILURE(errorCode)) { 153 return; 154 } 155 if (bufferIndex >= bufferSize) { 156 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); 157 if (newBuffer == NULL) { 158 errorCode = U_MEMORY_ALLOCATION_ERROR; 159 return; 160 } 161 162 ARRAY_COPY(newBuffer, buffer, bufferSize); 163 164 if (buffer != defaultBuffer) { 165 DELETE_ARRAY(buffer); 166 } 167 168 buffer = newBuffer; 169 bufferSize += BUFFER_GROW; 170 } 171 172 buffer[bufferIndex].ce = ce; 173 buffer[bufferIndex].low = ixLow; 174 buffer[bufferIndex].high = ixHigh; 175 176 bufferIndex += 1; 177 } 178 179 const PCEI *PCEBuffer::get() 180 { 181 if (bufferIndex > 0) { 182 return &buffer[--bufferIndex]; 183 } 184 185 return NULL; 186 } 187 188 UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); } 189 190 UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); } 191 192 void UCollationPCE::init(UCollationElements *elems) { 193 init(CollationElementIterator::fromUCollationElements(elems)); 194 } 195 196 void UCollationPCE::init(CollationElementIterator *iter) 197 { 198 cei = iter; 199 init(*iter->rbc_); 200 } 201 202 void UCollationPCE::init(const Collator &coll) 203 { 204 UErrorCode status = U_ZERO_ERROR; 205 206 strength = coll.getAttribute(UCOL_STRENGTH, status); 207 toShift = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; 208 isShifted = FALSE; 209 variableTop = coll.getVariableTop(status); 210 } 211 212 UCollationPCE::~UCollationPCE() 213 { 214 // nothing to do 215 } 216 217 uint64_t UCollationPCE::processCE(uint32_t ce) 218 { 219 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 220 221 // This is clean, but somewhat slow... 222 // We could apply the mask to ce and then 223 // just get all three orders... 224 switch(strength) { 225 default: 226 tertiary = ucol_tertiaryOrder(ce); 227 U_FALLTHROUGH; 228 229 case UCOL_SECONDARY: 230 secondary = ucol_secondaryOrder(ce); 231 U_FALLTHROUGH; 232 233 case UCOL_PRIMARY: 234 primary = ucol_primaryOrder(ce); 235 } 236 237 // **** This should probably handle continuations too. **** 238 // **** That means that we need 24 bits for the primary **** 239 // **** instead of the 16 that we're currently using. **** 240 // **** So we can lay out the 64 bits as: 24.12.12.16. **** 241 // **** Another complication with continuations is that **** 242 // **** the *second* CE is marked as a continuation, so **** 243 // **** we always have to peek ahead to know how long **** 244 // **** the primary is... **** 245 if ((toShift && variableTop > ce && primary != 0) 246 || (isShifted && primary == 0)) { 247 248 if (primary == 0) { 249 return UCOL_IGNORABLE; 250 } 251 252 if (strength >= UCOL_QUATERNARY) { 253 quaternary = primary; 254 } 255 256 primary = secondary = tertiary = 0; 257 isShifted = TRUE; 258 } else { 259 if (strength >= UCOL_QUATERNARY) { 260 quaternary = 0xFFFF; 261 } 262 263 isShifted = FALSE; 264 } 265 266 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; 267 } 268 269 U_NAMESPACE_END 270 271 /* public methods ---------------------------------------------------- */ 272 273 U_CAPI UCollationElements* U_EXPORT2 274 ucol_openElements(const UCollator *coll, 275 const UChar *text, 276 int32_t textLength, 277 UErrorCode *status) 278 { 279 if (U_FAILURE(*status)) { 280 return NULL; 281 } 282 if (coll == NULL || (text == NULL && textLength != 0)) { 283 *status = U_ILLEGAL_ARGUMENT_ERROR; 284 return NULL; 285 } 286 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); 287 if (rbc == NULL) { 288 *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator 289 return NULL; 290 } 291 292 UnicodeString s((UBool)(textLength < 0), text, textLength); 293 CollationElementIterator *cei = rbc->createCollationElementIterator(s); 294 if (cei == NULL) { 295 *status = U_MEMORY_ALLOCATION_ERROR; 296 return NULL; 297 } 298 299 return cei->toUCollationElements(); 300 } 301 302 303 U_CAPI void U_EXPORT2 304 ucol_closeElements(UCollationElements *elems) 305 { 306 delete CollationElementIterator::fromUCollationElements(elems); 307 } 308 309 U_CAPI void U_EXPORT2 310 ucol_reset(UCollationElements *elems) 311 { 312 CollationElementIterator::fromUCollationElements(elems)->reset(); 313 } 314 315 U_CAPI int32_t U_EXPORT2 316 ucol_next(UCollationElements *elems, 317 UErrorCode *status) 318 { 319 if (U_FAILURE(*status)) { 320 return UCOL_NULLORDER; 321 } 322 323 return CollationElementIterator::fromUCollationElements(elems)->next(*status); 324 } 325 326 U_NAMESPACE_BEGIN 327 328 int64_t 329 UCollationPCE::nextProcessed( 330 int32_t *ixLow, 331 int32_t *ixHigh, 332 UErrorCode *status) 333 { 334 int64_t result = UCOL_IGNORABLE; 335 uint32_t low = 0, high = 0; 336 337 if (U_FAILURE(*status)) { 338 return UCOL_PROCESSED_NULLORDER; 339 } 340 341 pceBuffer.reset(); 342 343 do { 344 low = cei->getOffset(); 345 int32_t ce = cei->next(*status); 346 high = cei->getOffset(); 347 348 if (ce == UCOL_NULLORDER) { 349 result = UCOL_PROCESSED_NULLORDER; 350 break; 351 } 352 353 result = processCE((uint32_t)ce); 354 } while (result == UCOL_IGNORABLE); 355 356 if (ixLow != NULL) { 357 *ixLow = low; 358 } 359 360 if (ixHigh != NULL) { 361 *ixHigh = high; 362 } 363 364 return result; 365 } 366 367 U_NAMESPACE_END 368 369 U_CAPI int32_t U_EXPORT2 370 ucol_previous(UCollationElements *elems, 371 UErrorCode *status) 372 { 373 if(U_FAILURE(*status)) { 374 return UCOL_NULLORDER; 375 } 376 return CollationElementIterator::fromUCollationElements(elems)->previous(*status); 377 } 378 379 U_NAMESPACE_BEGIN 380 381 int64_t 382 UCollationPCE::previousProcessed( 383 int32_t *ixLow, 384 int32_t *ixHigh, 385 UErrorCode *status) 386 { 387 int64_t result = UCOL_IGNORABLE; 388 int32_t low = 0, high = 0; 389 390 if (U_FAILURE(*status)) { 391 return UCOL_PROCESSED_NULLORDER; 392 } 393 394 // pceBuffer.reset(); 395 396 while (pceBuffer.isEmpty()) { 397 // buffer raw CEs up to non-ignorable primary 398 RCEBuffer rceb; 399 int32_t ce; 400 401 // **** do we need to reset rceb, or will it always be empty at this point **** 402 do { 403 high = cei->getOffset(); 404 ce = cei->previous(*status); 405 low = cei->getOffset(); 406 407 if (ce == UCOL_NULLORDER) { 408 if (!rceb.isEmpty()) { 409 break; 410 } 411 412 goto finish; 413 } 414 415 rceb.put((uint32_t)ce, low, high, *status); 416 } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce))); 417 418 // process the raw CEs 419 while (U_SUCCESS(*status) && !rceb.isEmpty()) { 420 const RCEI *rcei = rceb.get(); 421 422 result = processCE(rcei->ce); 423 424 if (result != UCOL_IGNORABLE) { 425 pceBuffer.put(result, rcei->low, rcei->high, *status); 426 } 427 } 428 if (U_FAILURE(*status)) { 429 return UCOL_PROCESSED_NULLORDER; 430 } 431 } 432 433 finish: 434 if (pceBuffer.isEmpty()) { 435 // **** Is -1 the right value for ixLow, ixHigh? **** 436 if (ixLow != NULL) { 437 *ixLow = -1; 438 } 439 440 if (ixHigh != NULL) { 441 *ixHigh = -1 442 ; 443 } 444 return UCOL_PROCESSED_NULLORDER; 445 } 446 447 const PCEI *pcei = pceBuffer.get(); 448 449 if (ixLow != NULL) { 450 *ixLow = pcei->low; 451 } 452 453 if (ixHigh != NULL) { 454 *ixHigh = pcei->high; 455 } 456 457 return pcei->ce; 458 } 459 460 U_NAMESPACE_END 461 462 U_CAPI int32_t U_EXPORT2 463 ucol_getMaxExpansion(const UCollationElements *elems, 464 int32_t order) 465 { 466 return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order); 467 468 // TODO: The old code masked the order according to strength and then did a binary search. 469 // However this was probably at least partially broken because of the following comment. 470 // Still, it might have found a match when this version may not. 471 472 // FIXME: with a masked search, there might be more than one hit, 473 // so we need to look forward and backward from the match to find all 474 // of the hits... 475 } 476 477 U_CAPI void U_EXPORT2 478 ucol_setText( UCollationElements *elems, 479 const UChar *text, 480 int32_t textLength, 481 UErrorCode *status) 482 { 483 if (U_FAILURE(*status)) { 484 return; 485 } 486 487 if ((text == NULL && textLength != 0)) { 488 *status = U_ILLEGAL_ARGUMENT_ERROR; 489 return; 490 } 491 UnicodeString s((UBool)(textLength < 0), text, textLength); 492 return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status); 493 } 494 495 U_CAPI int32_t U_EXPORT2 496 ucol_getOffset(const UCollationElements *elems) 497 { 498 return CollationElementIterator::fromUCollationElements(elems)->getOffset(); 499 } 500 501 U_CAPI void U_EXPORT2 502 ucol_setOffset(UCollationElements *elems, 503 int32_t offset, 504 UErrorCode *status) 505 { 506 if (U_FAILURE(*status)) { 507 return; 508 } 509 510 CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status); 511 } 512 513 U_CAPI int32_t U_EXPORT2 514 ucol_primaryOrder (int32_t order) 515 { 516 return (order >> 16) & 0xffff; 517 } 518 519 U_CAPI int32_t U_EXPORT2 520 ucol_secondaryOrder (int32_t order) 521 { 522 return (order >> 8) & 0xff; 523 } 524 525 U_CAPI int32_t U_EXPORT2 526 ucol_tertiaryOrder (int32_t order) 527 { 528 return order & 0xff; 529 } 530 531 #endif /* #if !UCONFIG_NO_COLLATION */ 532