1 /* 2 ****************************************************************************** 3 * Copyright (C) 2001-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ****************************************************************************** 6 * 7 * File ucoleitr.cpp 8 * 9 * Modification History: 10 * 11 * Date Name Description 12 * 02/15/2001 synwee Modified all methods to process its own function 13 * instead of calling the equivalent c++ api (coleitr.h) 14 ******************************************************************************/ 15 16 #include "unicode/utypes.h" 17 18 #if !UCONFIG_NO_COLLATION 19 20 #include "unicode/ucoleitr.h" 21 #include "unicode/ustring.h" 22 #include "unicode/sortkey.h" 23 #include "unicode/uobject.h" 24 #include "ucol_imp.h" 25 #include "cmemory.h" 26 27 U_NAMESPACE_USE 28 29 #define BUFFER_LENGTH 100 30 31 #define DEFAULT_BUFFER_SIZE 16 32 #define BUFFER_GROW 8 33 34 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 35 36 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) 37 38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 39 40 #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0]) 41 42 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 43 44 typedef struct icu::collIterate collIterator; 45 46 struct RCEI 47 { 48 uint32_t ce; 49 int32_t low; 50 int32_t high; 51 }; 52 53 U_NAMESPACE_BEGIN 54 55 struct RCEBuffer 56 { 57 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; 58 RCEI *buffer; 59 int32_t bufferIndex; 60 int32_t bufferSize; 61 62 RCEBuffer(); 63 ~RCEBuffer(); 64 65 UBool empty() const; 66 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh); 67 const RCEI *get(); 68 }; 69 70 RCEBuffer::RCEBuffer() 71 { 72 buffer = defaultBuffer; 73 bufferIndex = 0; 74 bufferSize = DEFAULT_BUFFER_SIZE; 75 } 76 77 RCEBuffer::~RCEBuffer() 78 { 79 if (buffer != defaultBuffer) { 80 DELETE_ARRAY(buffer); 81 } 82 } 83 84 UBool RCEBuffer::empty() const 85 { 86 return bufferIndex <= 0; 87 } 88 89 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh) 90 { 91 if (bufferIndex >= bufferSize) { 92 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); 93 94 ARRAY_COPY(newBuffer, buffer, bufferSize); 95 96 if (buffer != defaultBuffer) { 97 DELETE_ARRAY(buffer); 98 } 99 100 buffer = newBuffer; 101 bufferSize += BUFFER_GROW; 102 } 103 104 buffer[bufferIndex].ce = ce; 105 buffer[bufferIndex].low = ixLow; 106 buffer[bufferIndex].high = ixHigh; 107 108 bufferIndex += 1; 109 } 110 111 const RCEI *RCEBuffer::get() 112 { 113 if (bufferIndex > 0) { 114 return &buffer[--bufferIndex]; 115 } 116 117 return NULL; 118 } 119 120 struct PCEI 121 { 122 uint64_t ce; 123 int32_t low; 124 int32_t high; 125 }; 126 127 struct PCEBuffer 128 { 129 PCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; 130 PCEI *buffer; 131 int32_t bufferIndex; 132 int32_t bufferSize; 133 134 PCEBuffer(); 135 ~PCEBuffer(); 136 137 void reset(); 138 UBool empty() const; 139 void put(uint64_t ce, int32_t ixLow, int32_t ixHigh); 140 const PCEI *get(); 141 }; 142 143 PCEBuffer::PCEBuffer() 144 { 145 buffer = defaultBuffer; 146 bufferIndex = 0; 147 bufferSize = DEFAULT_BUFFER_SIZE; 148 } 149 150 PCEBuffer::~PCEBuffer() 151 { 152 if (buffer != defaultBuffer) { 153 DELETE_ARRAY(buffer); 154 } 155 } 156 157 void PCEBuffer::reset() 158 { 159 bufferIndex = 0; 160 } 161 162 UBool PCEBuffer::empty() const 163 { 164 return bufferIndex <= 0; 165 } 166 167 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh) 168 { 169 if (bufferIndex >= bufferSize) { 170 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); 171 172 ARRAY_COPY(newBuffer, buffer, bufferSize); 173 174 if (buffer != defaultBuffer) { 175 DELETE_ARRAY(buffer); 176 } 177 178 buffer = newBuffer; 179 bufferSize += BUFFER_GROW; 180 } 181 182 buffer[bufferIndex].ce = ce; 183 buffer[bufferIndex].low = ixLow; 184 buffer[bufferIndex].high = ixHigh; 185 186 bufferIndex += 1; 187 } 188 189 const PCEI *PCEBuffer::get() 190 { 191 if (bufferIndex > 0) { 192 return &buffer[--bufferIndex]; 193 } 194 195 return NULL; 196 } 197 198 /* 199 * This inherits from UObject so that 200 * it can be allocated by new and the 201 * constructor for PCEBuffer is called. 202 */ 203 struct UCollationPCE : public UObject 204 { 205 PCEBuffer pceBuffer; 206 UCollationStrength strength; 207 UBool toShift; 208 UBool isShifted; 209 uint32_t variableTop; 210 211 UCollationPCE(UCollationElements *elems); 212 ~UCollationPCE(); 213 214 void init(const UCollator *coll); 215 216 virtual UClassID getDynamicClassID() const; 217 static UClassID getStaticClassID(); 218 }; 219 220 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE) 221 222 UCollationPCE::UCollationPCE(UCollationElements *elems) 223 { 224 init(elems->iteratordata_.coll); 225 } 226 227 void UCollationPCE::init(const UCollator *coll) 228 { 229 UErrorCode status = U_ZERO_ERROR; 230 231 strength = ucol_getStrength(coll); 232 toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; 233 isShifted = FALSE; 234 variableTop = coll->variableTopValue << 16; 235 } 236 237 UCollationPCE::~UCollationPCE() 238 { 239 // nothing to do 240 } 241 242 243 U_NAMESPACE_END 244 245 246 inline uint64_t processCE(UCollationElements *elems, uint32_t ce) 247 { 248 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 249 250 // This is clean, but somewhat slow... 251 // We could apply the mask to ce and then 252 // just get all three orders... 253 switch(elems->pce->strength) { 254 default: 255 tertiary = ucol_tertiaryOrder(ce); 256 /* note fall-through */ 257 258 case UCOL_SECONDARY: 259 secondary = ucol_secondaryOrder(ce); 260 /* note fall-through */ 261 262 case UCOL_PRIMARY: 263 primary = ucol_primaryOrder(ce); 264 } 265 266 // **** This should probably handle continuations too. **** 267 // **** That means that we need 24 bits for the primary **** 268 // **** instead of the 16 that we're currently using. **** 269 // **** So we can lay out the 64 bits as: 24.12.12.16. **** 270 // **** Another complication with continuations is that **** 271 // **** the *second* CE is marked as a continuation, so **** 272 // **** we always have to peek ahead to know how long **** 273 // **** the primary is... **** 274 if ((elems->pce->toShift && elems->pce->variableTop > ce && primary != 0) 275 || (elems->pce->isShifted && primary == 0)) { 276 277 if (primary == 0) { 278 return UCOL_IGNORABLE; 279 } 280 281 if (elems->pce->strength >= UCOL_QUATERNARY) { 282 quaternary = primary; 283 } 284 285 primary = secondary = tertiary = 0; 286 elems->pce->isShifted = TRUE; 287 } else { 288 if (elems->pce->strength >= UCOL_QUATERNARY) { 289 quaternary = 0xFFFF; 290 } 291 292 elems->pce->isShifted = FALSE; 293 } 294 295 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; 296 } 297 298 U_CAPI void U_EXPORT2 299 uprv_init_pce(const UCollationElements *elems) 300 { 301 if (elems->pce != NULL) { 302 elems->pce->init(elems->iteratordata_.coll); 303 } 304 } 305 306 307 308 /* public methods ---------------------------------------------------- */ 309 310 U_CAPI UCollationElements* U_EXPORT2 311 ucol_openElements(const UCollator *coll, 312 const UChar *text, 313 int32_t textLength, 314 UErrorCode *status) 315 { 316 if (U_FAILURE(*status)) { 317 return NULL; 318 } 319 320 UCollationElements *result = new UCollationElements; 321 if (result == NULL) { 322 *status = U_MEMORY_ALLOCATION_ERROR; 323 return NULL; 324 } 325 326 result->reset_ = TRUE; 327 result->isWritable = FALSE; 328 result->pce = NULL; 329 330 if (text == NULL) { 331 textLength = 0; 332 } 333 uprv_init_collIterate(coll, text, textLength, &result->iteratordata_, status); 334 335 return result; 336 } 337 338 339 U_CAPI void U_EXPORT2 340 ucol_closeElements(UCollationElements *elems) 341 { 342 if (elems != NULL) { 343 collIterate *ci = &elems->iteratordata_; 344 345 if (ci->extendCEs) { 346 uprv_free(ci->extendCEs); 347 } 348 349 if (ci->offsetBuffer) { 350 uprv_free(ci->offsetBuffer); 351 } 352 353 if (elems->isWritable && elems->iteratordata_.string != NULL) 354 { 355 uprv_free((UChar *)elems->iteratordata_.string); 356 } 357 358 if (elems->pce != NULL) { 359 delete elems->pce; 360 } 361 362 delete elems; 363 } 364 } 365 366 U_CAPI void U_EXPORT2 367 ucol_reset(UCollationElements *elems) 368 { 369 collIterate *ci = &(elems->iteratordata_); 370 elems->reset_ = TRUE; 371 ci->pos = ci->string; 372 if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) { 373 ci->endp = ci->string + u_strlen(ci->string); 374 } 375 ci->CEpos = ci->toReturn = ci->CEs; 376 ci->flags = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN; 377 if (ci->coll->normalizationMode == UCOL_ON) { 378 ci->flags |= UCOL_ITER_NORM; 379 } 380 381 ci->writableBuffer.remove(); 382 ci->fcdPosition = NULL; 383 384 //ci->offsetReturn = ci->offsetStore = NULL; 385 ci->offsetRepeatCount = ci->offsetRepeatValue = 0; 386 } 387 388 U_CAPI void U_EXPORT2 389 ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status) 390 { 391 if (U_FAILURE(*status)) { 392 return; 393 } 394 395 if (elems == NULL) { 396 *status = U_ILLEGAL_ARGUMENT_ERROR; 397 return; 398 } 399 400 elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT; 401 } 402 403 U_CAPI int32_t U_EXPORT2 404 ucol_next(UCollationElements *elems, 405 UErrorCode *status) 406 { 407 int32_t result; 408 if (U_FAILURE(*status)) { 409 return UCOL_NULLORDER; 410 } 411 412 elems->reset_ = FALSE; 413 414 result = (int32_t)ucol_getNextCE(elems->iteratordata_.coll, 415 &elems->iteratordata_, 416 status); 417 418 if (result == UCOL_NO_MORE_CES) { 419 result = UCOL_NULLORDER; 420 } 421 return result; 422 } 423 424 U_CAPI int64_t U_EXPORT2 425 ucol_nextProcessed(UCollationElements *elems, 426 int32_t *ixLow, 427 int32_t *ixHigh, 428 UErrorCode *status) 429 { 430 const UCollator *coll = elems->iteratordata_.coll; 431 int64_t result = UCOL_IGNORABLE; 432 uint32_t low = 0, high = 0; 433 434 if (U_FAILURE(*status)) { 435 return UCOL_PROCESSED_NULLORDER; 436 } 437 438 if (elems->pce == NULL) { 439 elems->pce = new UCollationPCE(elems); 440 } else { 441 elems->pce->pceBuffer.reset(); 442 } 443 444 elems->reset_ = FALSE; 445 446 do { 447 low = ucol_getOffset(elems); 448 uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status); 449 high = ucol_getOffset(elems); 450 451 if (ce == UCOL_NO_MORE_CES) { 452 result = UCOL_PROCESSED_NULLORDER; 453 break; 454 } 455 456 result = processCE(elems, ce); 457 } while (result == UCOL_IGNORABLE); 458 459 if (ixLow != NULL) { 460 *ixLow = low; 461 } 462 463 if (ixHigh != NULL) { 464 *ixHigh = high; 465 } 466 467 return result; 468 } 469 470 U_CAPI int32_t U_EXPORT2 471 ucol_previous(UCollationElements *elems, 472 UErrorCode *status) 473 { 474 if(U_FAILURE(*status)) { 475 return UCOL_NULLORDER; 476 } 477 else 478 { 479 int32_t result; 480 481 if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) { 482 if (elems->iteratordata_.endp == NULL) { 483 elems->iteratordata_.endp = elems->iteratordata_.string + 484 u_strlen(elems->iteratordata_.string); 485 elems->iteratordata_.flags |= UCOL_ITER_HASLEN; 486 } 487 elems->iteratordata_.pos = elems->iteratordata_.endp; 488 elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; 489 } 490 491 elems->reset_ = FALSE; 492 493 result = (int32_t)ucol_getPrevCE(elems->iteratordata_.coll, 494 &(elems->iteratordata_), 495 status); 496 497 if (result == UCOL_NO_MORE_CES) { 498 result = UCOL_NULLORDER; 499 } 500 501 return result; 502 } 503 } 504 505 U_CAPI int64_t U_EXPORT2 506 ucol_previousProcessed(UCollationElements *elems, 507 int32_t *ixLow, 508 int32_t *ixHigh, 509 UErrorCode *status) 510 { 511 const UCollator *coll = elems->iteratordata_.coll; 512 int64_t result = UCOL_IGNORABLE; 513 // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 514 // UCollationStrength strength = ucol_getStrength(coll); 515 // UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; 516 // uint32_t variableTop = coll->variableTopValue; 517 int32_t low = 0, high = 0; 518 519 if (U_FAILURE(*status)) { 520 return UCOL_PROCESSED_NULLORDER; 521 } 522 523 if (elems->reset_ && 524 (elems->iteratordata_.pos == elems->iteratordata_.string)) { 525 if (elems->iteratordata_.endp == NULL) { 526 elems->iteratordata_.endp = elems->iteratordata_.string + 527 u_strlen(elems->iteratordata_.string); 528 elems->iteratordata_.flags |= UCOL_ITER_HASLEN; 529 } 530 531 elems->iteratordata_.pos = elems->iteratordata_.endp; 532 elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; 533 } 534 535 if (elems->pce == NULL) { 536 elems->pce = new UCollationPCE(elems); 537 } else { 538 //elems->pce->pceBuffer.reset(); 539 } 540 541 elems->reset_ = FALSE; 542 543 while (elems->pce->pceBuffer.empty()) { 544 // buffer raw CEs up to non-ignorable primary 545 RCEBuffer rceb; 546 uint32_t ce; 547 548 // **** do we need to reset rceb, or will it always be empty at this point **** 549 do { 550 high = ucol_getOffset(elems); 551 ce = ucol_getPrevCE(coll, &elems->iteratordata_, status); 552 low = ucol_getOffset(elems); 553 554 if (ce == UCOL_NO_MORE_CES) { 555 if (! rceb.empty()) { 556 break; 557 } 558 559 goto finish; 560 } 561 562 rceb.put(ce, low, high); 563 } while ((ce & UCOL_PRIMARYMASK) == 0); 564 565 // process the raw CEs 566 while (! rceb.empty()) { 567 const RCEI *rcei = rceb.get(); 568 569 result = processCE(elems, rcei->ce); 570 571 if (result != UCOL_IGNORABLE) { 572 elems->pce->pceBuffer.put(result, rcei->low, rcei->high); 573 } 574 } 575 } 576 577 finish: 578 if (elems->pce->pceBuffer.empty()) { 579 // **** Is -1 the right value for ixLow, ixHigh? **** 580 if (ixLow != NULL) { 581 *ixLow = -1; 582 } 583 584 if (ixHigh != NULL) { 585 *ixHigh = -1 586 ; 587 } 588 return UCOL_PROCESSED_NULLORDER; 589 } 590 591 const PCEI *pcei = elems->pce->pceBuffer.get(); 592 593 if (ixLow != NULL) { 594 *ixLow = pcei->low; 595 } 596 597 if (ixHigh != NULL) { 598 *ixHigh = pcei->high; 599 } 600 601 return pcei->ce; 602 } 603 604 U_CAPI int32_t U_EXPORT2 605 ucol_getMaxExpansion(const UCollationElements *elems, 606 int32_t order) 607 { 608 uint8_t result; 609 610 #if 0 611 UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result); 612 #else 613 const UCollator *coll = elems->iteratordata_.coll; 614 const uint32_t *start; 615 const uint32_t *limit; 616 const uint32_t *mid; 617 uint32_t strengthMask = 0; 618 uint32_t mOrder = (uint32_t) order; 619 620 switch (coll->strength) 621 { 622 default: 623 strengthMask |= UCOL_TERTIARYORDERMASK; 624 /* fall through */ 625 626 case UCOL_SECONDARY: 627 strengthMask |= UCOL_SECONDARYORDERMASK; 628 /* fall through */ 629 630 case UCOL_PRIMARY: 631 strengthMask |= UCOL_PRIMARYORDERMASK; 632 } 633 634 mOrder &= strengthMask; 635 start = (coll)->endExpansionCE; 636 limit = (coll)->lastEndExpansionCE; 637 638 while (start < limit - 1) { 639 mid = start + ((limit - start) >> 1); 640 if (mOrder <= (*mid & strengthMask)) { 641 limit = mid; 642 } else { 643 start = mid; 644 } 645 } 646 647 // FIXME: with a masked search, there might be more than one hit, 648 // so we need to look forward and backward from the match to find all 649 // of the hits... 650 if ((*start & strengthMask) == mOrder) { 651 result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE)); 652 } else if ((*limit & strengthMask) == mOrder) { 653 result = *(coll->expansionCESize + (limit - coll->endExpansionCE)); 654 } else if ((mOrder & 0xFFFF) == 0x00C0) { 655 result = 2; 656 } else { 657 result = 1; 658 } 659 #endif 660 661 return result; 662 } 663 664 U_CAPI void U_EXPORT2 665 ucol_setText( UCollationElements *elems, 666 const UChar *text, 667 int32_t textLength, 668 UErrorCode *status) 669 { 670 if (U_FAILURE(*status)) { 671 return; 672 } 673 674 if (elems->isWritable && elems->iteratordata_.string != NULL) 675 { 676 uprv_free((UChar *)elems->iteratordata_.string); 677 } 678 679 if (text == NULL) { 680 textLength = 0; 681 } 682 683 elems->isWritable = FALSE; 684 685 /* free offset buffer to avoid memory leak before initializing. */ 686 ucol_freeOffsetBuffer(&(elems->iteratordata_)); 687 /* Ensure that previously allocated extendCEs is freed before setting to NULL. */ 688 if (elems->iteratordata_.extendCEs != NULL) { 689 uprv_free(elems->iteratordata_.extendCEs); 690 } 691 uprv_init_collIterate(elems->iteratordata_.coll, text, textLength, 692 &elems->iteratordata_, status); 693 694 elems->reset_ = TRUE; 695 } 696 697 U_CAPI int32_t U_EXPORT2 698 ucol_getOffset(const UCollationElements *elems) 699 { 700 const collIterate *ci = &(elems->iteratordata_); 701 702 if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) { 703 return ci->offsetRepeatValue; 704 } 705 706 if (ci->offsetReturn != NULL) { 707 return *ci->offsetReturn; 708 } 709 710 // while processing characters in normalization buffer getOffset will 711 // return the next non-normalized character. 712 // should be inline with the old implementation since the old codes uses 713 // nextDecomp in normalizer which also decomposes the string till the 714 // first base character is found. 715 if (ci->flags & UCOL_ITER_INNORMBUF) { 716 if (ci->fcdPosition == NULL) { 717 return 0; 718 } 719 return (int32_t)(ci->fcdPosition - ci->string); 720 } 721 else { 722 return (int32_t)(ci->pos - ci->string); 723 } 724 } 725 726 U_CAPI void U_EXPORT2 727 ucol_setOffset(UCollationElements *elems, 728 int32_t offset, 729 UErrorCode *status) 730 { 731 if (U_FAILURE(*status)) { 732 return; 733 } 734 735 // this methods will clean up any use of the writable buffer and points to 736 // the original string 737 collIterate *ci = &(elems->iteratordata_); 738 ci->pos = ci->string + offset; 739 ci->CEpos = ci->toReturn = ci->CEs; 740 if (ci->flags & UCOL_ITER_INNORMBUF) { 741 ci->flags = ci->origFlags; 742 } 743 if ((ci->flags & UCOL_ITER_HASLEN) == 0) { 744 ci->endp = ci->string + u_strlen(ci->string); 745 ci->flags |= UCOL_ITER_HASLEN; 746 } 747 ci->fcdPosition = NULL; 748 elems->reset_ = FALSE; 749 750 ci->offsetReturn = NULL; 751 ci->offsetStore = ci->offsetBuffer; 752 ci->offsetRepeatCount = ci->offsetRepeatValue = 0; 753 } 754 755 U_CAPI int32_t U_EXPORT2 756 ucol_primaryOrder (int32_t order) 757 { 758 order &= UCOL_PRIMARYMASK; 759 return (order >> UCOL_PRIMARYORDERSHIFT); 760 } 761 762 U_CAPI int32_t U_EXPORT2 763 ucol_secondaryOrder (int32_t order) 764 { 765 order &= UCOL_SECONDARYMASK; 766 return (order >> UCOL_SECONDARYORDERSHIFT); 767 } 768 769 U_CAPI int32_t U_EXPORT2 770 ucol_tertiaryOrder (int32_t order) 771 { 772 return (order & UCOL_TERTIARYMASK); 773 } 774 775 776 void ucol_freeOffsetBuffer(collIterate *s) { 777 if (s != NULL && s->offsetBuffer != NULL) { 778 uprv_free(s->offsetBuffer); 779 s->offsetBuffer = NULL; 780 s->offsetBufferSize = 0; 781 } 782 } 783 784 #endif /* #if !UCONFIG_NO_COLLATION */ 785