1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2013, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucol.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * Modification history 12 * Date Name Comments 13 * 1996-1999 various members of ICU team maintained C API for collation framework 14 * 02/16/2001 synwee Added internal method getPrevSpecialCE 15 * 03/01/2001 synwee Added maxexpansion functionality. 16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_COLLATION 22 23 #include "unicode/bytestream.h" 24 #include "unicode/coleitr.h" 25 #include "unicode/unorm.h" 26 #include "unicode/udata.h" 27 #include "unicode/ustring.h" 28 #include "unicode/utf8.h" 29 30 #include "ucol_imp.h" 31 #include "bocsu.h" 32 33 #include "normalizer2impl.h" 34 #include "unorm_it.h" 35 #include "umutex.h" 36 #include "cmemory.h" 37 #include "ucln_in.h" 38 #include "cstring.h" 39 #include "utracimp.h" 40 #include "putilimp.h" 41 #include "uassert.h" 42 #include "unicode/coll.h" 43 44 #ifdef UCOL_DEBUG 45 #include <stdio.h> 46 #endif 47 48 U_NAMESPACE_USE 49 50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 51 52 #define LAST_BYTE_MASK_ 0xFF 53 #define SECOND_LAST_BYTE_SHIFT_ 8 54 55 #define ZERO_CC_LIMIT_ 0xC0 56 57 // These are static pointers to the NFC/NFD implementation instance. 58 // Each of them is always the same between calls to u_cleanup 59 // and therefore writing to it is not synchronized. 
60 // They are cleaned in ucol_cleanup 61 static const Normalizer2 *g_nfd = NULL; 62 static const Normalizer2Impl *g_nfcImpl = NULL; 63 64 // These are values from UCA required for 65 // implicit generation and supressing sort key compression 66 // they should regularly be in the UCA, but if one 67 // is running without UCA, it could be a problem 68 static const int32_t maxRegularPrimary = 0x7A; 69 static const int32_t minImplicitPrimary = 0xE0; 70 static const int32_t maxImplicitPrimary = 0xE4; 71 72 U_CDECL_BEGIN 73 static UBool U_CALLCONV 74 ucol_cleanup(void) 75 { 76 g_nfd = NULL; 77 g_nfcImpl = NULL; 78 return TRUE; 79 } 80 81 static int32_t U_CALLCONV 82 _getFoldingOffset(uint32_t data) { 83 return (int32_t)(data&0xFFFFFF); 84 } 85 86 U_CDECL_END 87 88 static inline 89 UBool initializeNFD(UErrorCode *status) { 90 if (g_nfd != NULL) { 91 return TRUE; 92 } else { 93 // The result is constant, until the library is reloaded. 94 g_nfd = Normalizer2Factory::getNFDInstance(*status); 95 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 96 return U_SUCCESS(*status); 97 } 98 } 99 100 // init FCD data 101 static inline 102 UBool initializeFCD(UErrorCode *status) { 103 if (g_nfcImpl != NULL) { 104 return TRUE; 105 } else { 106 // The result is constant, until the library is reloaded. 107 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); 108 // Note: Alternatively, we could also store this pointer in each collIterate struct, 109 // same as Normalizer2Factory::getImpl(collIterate->nfd). 
110 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 111 return U_SUCCESS(*status); 112 } 113 } 114 115 static 116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, 117 int32_t sourceLen, collIterate *s, 118 UErrorCode *status) 119 { 120 (s)->string = (s)->pos = sourceString; 121 (s)->origFlags = 0; 122 (s)->flags = 0; 123 if (sourceLen >= 0) { 124 s->flags |= UCOL_ITER_HASLEN; 125 (s)->endp = (UChar *)sourceString+sourceLen; 126 } 127 else { 128 /* change to enable easier checking for end of string for fcdpositon */ 129 (s)->endp = NULL; 130 } 131 (s)->extendCEs = NULL; 132 (s)->extendCEsSize = 0; 133 (s)->CEpos = (s)->toReturn = (s)->CEs; 134 (s)->offsetBuffer = NULL; 135 (s)->offsetBufferSize = 0; 136 (s)->offsetReturn = (s)->offsetStore = NULL; 137 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; 138 (s)->coll = (collator); 139 if (initializeNFD(status)) { 140 (s)->nfd = g_nfd; 141 } else { 142 return; 143 } 144 (s)->fcdPosition = 0; 145 if(collator->normalizationMode == UCOL_ON) { 146 (s)->flags |= UCOL_ITER_NORM; 147 } 148 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { 149 (s)->flags |= UCOL_HIRAGANA_Q; 150 } 151 (s)->iterator = NULL; 152 //(s)->iteratorIndex = 0; 153 } 154 155 U_CAPI void U_EXPORT2 156 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, 157 int32_t sourceLen, collIterate *s, 158 UErrorCode *status) { 159 /* Out-of-line version for use from other files. 
*/ 160 IInit_collIterate(collator, sourceString, sourceLen, s, status); 161 } 162 163 U_CAPI collIterate * U_EXPORT2 164 uprv_new_collIterate(UErrorCode *status) { 165 if(U_FAILURE(*status)) { 166 return NULL; 167 } 168 collIterate *s = new collIterate; 169 if(s == NULL) { 170 *status = U_MEMORY_ALLOCATION_ERROR; 171 return NULL; 172 } 173 return s; 174 } 175 176 U_CAPI void U_EXPORT2 177 uprv_delete_collIterate(collIterate *s) { 178 delete s; 179 } 180 181 U_CAPI UBool U_EXPORT2 182 uprv_collIterateAtEnd(collIterate *s) { 183 return s == NULL || s->pos == s->endp; 184 } 185 186 /** 187 * Backup the state of the collIterate struct data 188 * @param data collIterate to backup 189 * @param backup storage 190 */ 191 static 192 inline void backupState(const collIterate *data, collIterateState *backup) 193 { 194 backup->fcdPosition = data->fcdPosition; 195 backup->flags = data->flags; 196 backup->origFlags = data->origFlags; 197 backup->pos = data->pos; 198 backup->bufferaddress = data->writableBuffer.getBuffer(); 199 backup->buffersize = data->writableBuffer.length(); 200 backup->iteratorMove = 0; 201 backup->iteratorIndex = 0; 202 if(data->iterator != NULL) { 203 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); 204 backup->iteratorIndex = data->iterator->getState(data->iterator); 205 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE 206 if(backup->iteratorIndex == UITER_NO_STATE) { 207 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { 208 backup->iteratorMove++; 209 data->iterator->move(data->iterator, -1, UITER_CURRENT); 210 } 211 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 212 } 213 } 214 } 215 216 /** 217 * Loads the state into the collIterate struct data 218 * @param data collIterate to backup 219 * @param backup storage 220 * @param forwards boolean to indicate if forwards iteration is used, 221 * false indicates 
backwards iteration 222 */ 223 static 224 inline void loadState(collIterate *data, const collIterateState *backup, 225 UBool forwards) 226 { 227 UErrorCode status = U_ZERO_ERROR; 228 data->flags = backup->flags; 229 data->origFlags = backup->origFlags; 230 if(data->iterator != NULL) { 231 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); 232 data->iterator->setState(data->iterator, backup->iteratorIndex, &status); 233 if(backup->iteratorMove != 0) { 234 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 235 } 236 } 237 data->pos = backup->pos; 238 239 if ((data->flags & UCOL_ITER_INNORMBUF) && 240 data->writableBuffer.getBuffer() != backup->bufferaddress) { 241 /* 242 this is when a new buffer has been reallocated and we'll have to 243 calculate the new position. 244 note the new buffer has to contain the contents of the old buffer. 245 */ 246 if (forwards) { 247 data->pos = data->writableBuffer.getTerminatedBuffer() + 248 (data->pos - backup->bufferaddress); 249 } 250 else { 251 /* backwards direction */ 252 int32_t temp = backup->buffersize - 253 (int32_t)(data->pos - backup->bufferaddress); 254 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp); 255 } 256 } 257 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 258 /* 259 this is alittle tricky. 260 if we are initially not in the normalization buffer, even if we 261 normalize in the later stage, the data in the buffer will be 262 ignored, since we skip back up to the data string. 263 however if we are already in the normalization buffer, any 264 further normalization will pull data into the normalization 265 buffer and modify the fcdPosition. 266 since we are keeping the data in the buffer for use, the 267 fcdPosition can not be reverted back. 268 arrgghh.... 
269 */ 270 data->fcdPosition = backup->fcdPosition; 271 } 272 } 273 274 static UBool 275 reallocCEs(collIterate *data, int32_t newCapacity) { 276 uint32_t *oldCEs = data->extendCEs; 277 if(oldCEs == NULL) { 278 oldCEs = data->CEs; 279 } 280 int32_t length = data->CEpos - oldCEs; 281 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); 282 if(newCEs == NULL) { 283 return FALSE; 284 } 285 uprv_memcpy(newCEs, oldCEs, length * 4); 286 uprv_free(data->extendCEs); 287 data->extendCEs = newCEs; 288 data->extendCEsSize = newCapacity; 289 data->CEpos = newCEs + length; 290 return TRUE; 291 } 292 293 static UBool 294 increaseCEsCapacity(collIterate *data) { 295 int32_t oldCapacity; 296 if(data->extendCEs != NULL) { 297 oldCapacity = data->extendCEsSize; 298 } else { 299 oldCapacity = LENGTHOF(data->CEs); 300 } 301 return reallocCEs(data, 2 * oldCapacity); 302 } 303 304 static UBool 305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) { 306 int32_t oldCapacity; 307 if(data->extendCEs != NULL) { 308 oldCapacity = data->extendCEsSize; 309 } else { 310 oldCapacity = LENGTHOF(data->CEs); 311 } 312 if(minCapacity <= oldCapacity) { 313 return TRUE; 314 } 315 oldCapacity *= 2; 316 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity); 317 } 318 319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) { 320 if(U_FAILURE(errorCode)) { 321 return; 322 } 323 int32_t length = offsetStore == NULL ? 
0 : (int32_t)(offsetStore - offsetBuffer); 324 U_ASSERT(length >= offsetBufferSize || offsetStore != NULL); 325 if(length >= offsetBufferSize) { 326 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE; 327 int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4)); 328 if(newBuffer == NULL) { 329 errorCode = U_MEMORY_ALLOCATION_ERROR; 330 return; 331 } 332 if(length > 0) { 333 uprv_memcpy(newBuffer, offsetBuffer, length * 4); 334 } 335 uprv_free(offsetBuffer); 336 offsetBuffer = newBuffer; 337 offsetStore = offsetBuffer + length; 338 offsetBufferSize = newCapacity; 339 } 340 *offsetStore++ = offset; 341 } 342 343 /* 344 * collIter_eos() 345 * Checks for a collIterate being positioned at the end of 346 * its source string. 347 * 348 */ 349 static 350 inline UBool collIter_eos(collIterate *s) { 351 if(s->flags & UCOL_USE_ITERATOR) { 352 return !(s->iterator->hasNext(s->iterator)); 353 } 354 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { 355 // Null terminated string, but not at null, so not at end. 356 // Whether in main or normalization buffer doesn't matter. 357 return FALSE; 358 } 359 360 // String with length. Can't be in normalization buffer, which is always 361 // null termintated. 362 if (s->flags & UCOL_ITER_HASLEN) { 363 return (s->pos == s->endp); 364 } 365 366 // We are at a null termination, could be either normalization buffer or main string. 367 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { 368 // At null at end of main string. 369 return TRUE; 370 } 371 372 // At null at end of normalization buffer. Need to check whether there there are 373 // any characters left in the main buffer. 374 if(s->origFlags & UCOL_USE_ITERATOR) { 375 return !(s->iterator->hasNext(s->iterator)); 376 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { 377 // Null terminated main string. fcdPosition is the 'return' position into main buf. 378 return (*s->fcdPosition == 0); 379 } 380 else { 381 // Main string with an end pointer. 
382 return s->fcdPosition == s->endp; 383 } 384 } 385 386 /* 387 * collIter_bos() 388 * Checks for a collIterate being positioned at the start of 389 * its source string. 390 * 391 */ 392 static 393 inline UBool collIter_bos(collIterate *source) { 394 // if we're going backwards, we need to know whether there is more in the 395 // iterator, even if we are in the side buffer 396 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 397 return !source->iterator->hasPrevious(source->iterator); 398 } 399 if (source->pos <= source->string || 400 ((source->flags & UCOL_ITER_INNORMBUF) && 401 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { 402 return TRUE; 403 } 404 return FALSE; 405 } 406 407 /*static 408 inline UBool collIter_SimpleBos(collIterate *source) { 409 // if we're going backwards, we need to know whether there is more in the 410 // iterator, even if we are in the side buffer 411 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 412 return !source->iterator->hasPrevious(source->iterator); 413 } 414 if (source->pos == source->string) { 415 return TRUE; 416 } 417 return FALSE; 418 }*/ 419 //return (data->pos == data->string) || 420 421 422 /****************************************************************************/ 423 /* Following are the open/close functions */ 424 /* */ 425 /****************************************************************************/ 426 427 static UCollator* 428 ucol_initFromBinary(const uint8_t *bin, int32_t length, 429 const UCollator *base, 430 UCollator *fillIn, 431 UErrorCode *status) 432 { 433 UCollator *result = fillIn; 434 if(U_FAILURE(*status)) { 435 return NULL; 436 } 437 /* 438 if(base == NULL) { 439 // we don't support null base yet 440 *status = U_ILLEGAL_ARGUMENT_ERROR; 441 return NULL; 442 } 443 */ 444 // We need these and we could be running without UCA 445 uprv_uca_initImplicitConstants(status); 446 UCATableHeader *colData = (UCATableHeader *)bin; 447 // 
do we want version check here? We're trying to figure out whether collators are compatible 448 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || 449 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || 450 colData->version[0] != UCOL_BUILDER_VERSION) 451 { 452 *status = U_COLLATOR_VERSION_MISMATCH; 453 return NULL; 454 } 455 else { 456 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { 457 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); 458 if(U_FAILURE(*status)){ 459 return NULL; 460 } 461 result->hasRealData = TRUE; 462 } 463 else { 464 if(base) { 465 result = ucol_initCollator(base->image, result, base, status); 466 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); 467 if(U_FAILURE(*status)){ 468 return NULL; 469 } 470 result->hasRealData = FALSE; 471 } 472 else { 473 *status = U_USELESS_COLLATOR_ERROR; 474 return NULL; 475 } 476 } 477 result->freeImageOnClose = FALSE; 478 } 479 result->actualLocale = NULL; 480 result->validLocale = NULL; 481 result->requestedLocale = NULL; 482 result->rules = NULL; 483 result->rulesLength = 0; 484 result->freeRulesOnClose = FALSE; 485 result->ucaRules = NULL; 486 return result; 487 } 488 489 U_CAPI UCollator* U_EXPORT2 490 ucol_openBinary(const uint8_t *bin, int32_t length, 491 const UCollator *base, 492 UErrorCode *status) 493 { 494 return ucol_initFromBinary(bin, length, base, NULL, status); 495 } 496 497 U_CAPI int32_t U_EXPORT2 498 ucol_cloneBinary(const UCollator *coll, 499 uint8_t *buffer, int32_t capacity, 500 UErrorCode *status) 501 { 502 int32_t length = 0; 503 if(U_FAILURE(*status)) { 504 return length; 505 } 506 if(capacity < 0) { 507 *status = U_ILLEGAL_ARGUMENT_ERROR; 508 return length; 509 } 510 if(coll->hasRealData == TRUE) { 511 length = coll->image->size; 512 if(length <= capacity) { 513 
uprv_memcpy(buffer, coll->image, length); 514 } else { 515 *status = U_BUFFER_OVERFLOW_ERROR; 516 } 517 } else { 518 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 519 if(length <= capacity) { 520 /* build the UCATableHeader with minimal entries */ 521 /* do not copy the header from the UCA file because its values are wrong! */ 522 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 523 524 /* reset everything */ 525 uprv_memset(buffer, 0, length); 526 527 /* set the tailoring-specific values */ 528 UCATableHeader *myData = (UCATableHeader *)buffer; 529 myData->size = length; 530 531 /* offset for the options, the only part of the data that is present after the header */ 532 myData->options = sizeof(UCATableHeader); 533 534 /* need to always set the expansion value for an upper bound of the options */ 535 myData->expansion = myData->options + sizeof(UColOptionSet); 536 537 myData->magic = UCOL_HEADER_MAGIC; 538 myData->isBigEndian = U_IS_BIG_ENDIAN; 539 myData->charSetFamily = U_CHARSET_FAMILY; 540 541 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 542 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 543 544 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 545 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 546 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); 547 myData->jamoSpecial = coll->image->jamoSpecial; 548 549 /* copy the collator options */ 550 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 551 } else { 552 *status = U_BUFFER_OVERFLOW_ERROR; 553 } 554 } 555 return length; 556 } 557 558 U_CAPI UCollator* U_EXPORT2 559 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status) 560 { 561 UCollator * localCollator; 562 int32_t bufferSizeNeeded = 
(int32_t)sizeof(UCollator); 563 int32_t imageSize = 0; 564 int32_t rulesSize = 0; 565 int32_t rulesPadding = 0; 566 int32_t defaultReorderCodesSize = 0; 567 int32_t reorderCodesSize = 0; 568 uint8_t *image; 569 UChar *rules; 570 int32_t* defaultReorderCodes; 571 int32_t* reorderCodes; 572 uint8_t* leadBytePermutationTable; 573 UBool imageAllocated = FALSE; 574 575 if (status == NULL || U_FAILURE(*status)){ 576 return NULL; 577 } 578 if (coll == NULL) { 579 *status = U_ILLEGAL_ARGUMENT_ERROR; 580 return NULL; 581 } 582 583 if (coll->rules && coll->freeRulesOnClose) { 584 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); 585 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); 586 bufferSizeNeeded += rulesSize + rulesPadding; 587 } 588 // no padding for alignment needed from here since the next two are 4 byte quantities 589 if (coll->defaultReorderCodes) { 590 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t); 591 bufferSizeNeeded += defaultReorderCodesSize; 592 } 593 if (coll->reorderCodes) { 594 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t); 595 bufferSizeNeeded += reorderCodesSize; 596 } 597 if (coll->leadBytePermutationTable) { 598 bufferSizeNeeded += 256 * sizeof(uint8_t); 599 } 600 601 if (pBufferSize != NULL) { 602 int32_t inputSize = *pBufferSize; 603 *pBufferSize = 1; 604 if (inputSize == 0) { 605 return NULL; // preflighting for deprecated functionality 606 } 607 } 608 609 char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); 610 // Null pointer check. 
611 if (stackBufferChars == NULL) { 612 *status = U_MEMORY_ALLOCATION_ERROR; 613 return NULL; 614 } 615 *status = U_SAFECLONE_ALLOCATED_WARNING; 616 617 localCollator = (UCollator *)stackBufferChars; 618 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); 619 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize); 620 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize); 621 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize; 622 623 { 624 UErrorCode tempStatus = U_ZERO_ERROR; 625 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); 626 } 627 if (coll->freeImageOnClose) { 628 image = (uint8_t *)uprv_malloc(imageSize); 629 // Null pointer check 630 if (image == NULL) { 631 *status = U_MEMORY_ALLOCATION_ERROR; 632 return NULL; 633 } 634 ucol_cloneBinary(coll, image, imageSize, status); 635 imageAllocated = TRUE; 636 } 637 else { 638 image = (uint8_t *)coll->image; 639 } 640 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); 641 if (U_FAILURE(*status)) { 642 return NULL; 643 } 644 645 if (coll->rules) { 646 if (coll->freeRulesOnClose) { 647 localCollator->rules = u_strcpy(rules, coll->rules); 648 //bufferEnd += rulesSize; 649 } 650 else { 651 localCollator->rules = coll->rules; 652 } 653 localCollator->freeRulesOnClose = FALSE; 654 localCollator->rulesLength = coll->rulesLength; 655 } 656 657 // collator reordering 658 if (coll->defaultReorderCodes) { 659 localCollator->defaultReorderCodes = 660 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t)); 661 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength; 662 localCollator->freeDefaultReorderCodesOnClose = FALSE; 663 } 664 if (coll->reorderCodes) { 665 localCollator->reorderCodes = 666 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t)); 667 
localCollator->reorderCodesLength = coll->reorderCodesLength; 668 localCollator->freeReorderCodesOnClose = FALSE; 669 } 670 if (coll->leadBytePermutationTable) { 671 localCollator->leadBytePermutationTable = 672 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256); 673 localCollator->freeLeadBytePermutationTableOnClose = FALSE; 674 } 675 676 int32_t i; 677 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 678 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); 679 } 680 // zero copies of pointers 681 localCollator->actualLocale = NULL; 682 localCollator->validLocale = NULL; 683 localCollator->requestedLocale = NULL; 684 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. 685 localCollator->freeOnClose = TRUE; 686 localCollator->freeImageOnClose = imageAllocated; 687 return localCollator; 688 } 689 690 U_CAPI void U_EXPORT2 691 ucol_close(UCollator *coll) 692 { 693 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 694 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 695 if(coll != NULL) { 696 // these are always owned by each UCollator struct, 697 // so we always free them 698 if(coll->validLocale != NULL) { 699 uprv_free(coll->validLocale); 700 } 701 if(coll->actualLocale != NULL) { 702 uprv_free(coll->actualLocale); 703 } 704 if(coll->requestedLocale != NULL) { 705 uprv_free(coll->requestedLocale); 706 } 707 if(coll->latinOneCEs != NULL) { 708 uprv_free(coll->latinOneCEs); 709 } 710 if(coll->options != NULL && coll->freeOptionsOnClose) { 711 uprv_free(coll->options); 712 } 713 if(coll->rules != NULL && coll->freeRulesOnClose) { 714 uprv_free((UChar *)coll->rules); 715 } 716 if(coll->image != NULL && coll->freeImageOnClose) { 717 uprv_free((UCATableHeader *)coll->image); 718 } 719 720 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) { 721 uprv_free(coll->leadBytePermutationTable); 722 } 723 if(coll->defaultReorderCodes != 
NULL && coll->freeDefaultReorderCodesOnClose == TRUE) { 724 uprv_free(coll->defaultReorderCodes); 725 } 726 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { 727 uprv_free(coll->reorderCodes); 728 } 729 730 if(coll->delegate != NULL) { 731 delete (Collator*)coll->delegate; 732 } 733 734 /* Here, it would be advisable to close: */ 735 /* - UData for UCA (unless we stuff it in the root resb */ 736 /* Again, do we need additional housekeeping... HMMM! */ 737 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); 738 if(coll->freeOnClose){ 739 /* for safeClone, if freeOnClose is FALSE, 740 don't free the other instance data */ 741 uprv_free(coll); 742 } 743 } 744 UTRACE_EXIT(); 745 } 746 747 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { 748 if(U_FAILURE(*status)) { 749 return; 750 } 751 result->caseFirst = (UColAttributeValue)opts->caseFirst; 752 result->caseLevel = (UColAttributeValue)opts->caseLevel; 753 result->frenchCollation = (UColAttributeValue)opts->frenchCollation; 754 result->normalizationMode = (UColAttributeValue)opts->normalizationMode; 755 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) { 756 return; 757 } 758 result->strength = (UColAttributeValue)opts->strength; 759 result->variableTopValue = opts->variableTopValue; 760 result->alternateHandling = (UColAttributeValue)opts->alternateHandling; 761 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; 762 result->numericCollation = (UColAttributeValue)opts->numericCollation; 763 result->caseFirstisDefault = TRUE; 764 result->caseLevelisDefault = TRUE; 765 result->frenchCollationisDefault = TRUE; 766 result->normalizationModeisDefault = TRUE; 767 result->strengthisDefault = TRUE; 768 result->variableTopValueisDefault = TRUE; 769 result->alternateHandlingisDefault = TRUE; 770 result->hiraganaQisDefault = TRUE; 771 result->numericCollationisDefault = TRUE; 772 773 ucol_updateInternalState(result, 
status); 774 775 result->options = opts; 776 } 777 778 779 /** 780 * Approximate determination if a character is at a contraction end. 781 * Guaranteed to be TRUE if a character is at the end of a contraction, 782 * otherwise it is not deterministic. 783 * @param c character to be determined 784 * @param coll collator 785 */ 786 static 787 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { 788 if (c < coll->minContrEndCP) { 789 return FALSE; 790 } 791 792 int32_t hash = c; 793 uint8_t htbyte; 794 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 795 if (U16_IS_TRAIL(c)) { 796 return TRUE; 797 } 798 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 799 } 800 htbyte = coll->contrEndCP[hash>>3]; 801 return (((htbyte >> (hash & 7)) & 1) == 1); 802 } 803 804 805 806 /* 807 * i_getCombiningClass() 808 * A fast, at least partly inline version of u_getCombiningClass() 809 * This is a candidate for further optimization. Used heavily 810 * in contraction processing. 811 */ 812 static 813 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { 814 uint8_t sCC = 0; 815 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { 816 sCC = u_getCombiningClass(c); 817 } 818 return sCC; 819 } 820 821 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { 822 UChar c; 823 UCollator *result = fillIn; 824 if(U_FAILURE(*status) || image == NULL) { 825 return NULL; 826 } 827 828 if(result == NULL) { 829 result = (UCollator *)uprv_malloc(sizeof(UCollator)); 830 if(result == NULL) { 831 *status = U_MEMORY_ALLOCATION_ERROR; 832 return result; 833 } 834 result->freeOnClose = TRUE; 835 } else { 836 result->freeOnClose = FALSE; 837 } 838 839 result->delegate = NULL; 840 841 result->image = image; 842 result->mapping.getFoldingOffset = _getFoldingOffset; 843 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; 844 utrie_unserialize(&result->mapping, mapping, 
result->image->endExpansionCE - result->image->mappingPosition, status); 845 if(U_FAILURE(*status)) { 846 if(result->freeOnClose == TRUE) { 847 uprv_free(result); 848 result = NULL; 849 } 850 return result; 851 } 852 853 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); 854 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); 855 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); 856 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); 857 result->rules = NULL; 858 result->rulesLength = 0; 859 result->freeRulesOnClose = FALSE; 860 result->defaultReorderCodes = NULL; 861 result->defaultReorderCodesLength = 0; 862 result->freeDefaultReorderCodesOnClose = FALSE; 863 result->reorderCodes = NULL; 864 result->reorderCodesLength = 0; 865 result->freeReorderCodesOnClose = FALSE; 866 result->leadBytePermutationTable = NULL; 867 result->freeLeadBytePermutationTableOnClose = FALSE; 868 869 /* get the version info from UCATableHeader and populate the Collator struct*/ 870 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ 871 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ 872 result->dataVersion[2] = 0; 873 result->dataVersion[3] = 0; 874 875 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; 876 result->minUnsafeCP = 0; 877 for (c=0; c<0x300; c++) { // Find the smallest unsafe char. 878 if (ucol_unsafeCP(c, result)) break; 879 } 880 result->minUnsafeCP = c; 881 882 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; 883 result->minContrEndCP = 0; 884 for (c=0; c<0x300; c++) { // Find the Contraction-ending char. 
885 if (ucol_contractionEndCP(c, result)) break; 886 } 887 result->minContrEndCP = c; 888 889 /* max expansion tables */ 890 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + 891 result->image->endExpansionCE); 892 result->lastEndExpansionCE = result->endExpansionCE + 893 result->image->endExpansionCECount - 1; 894 result->expansionCESize = (uint8_t*)result->image + 895 result->image->expansionCESize; 896 897 898 //result->errorCode = *status; 899 900 result->latinOneCEs = NULL; 901 902 result->latinOneRegenTable = FALSE; 903 result->latinOneFailed = FALSE; 904 result->UCA = UCA; 905 906 /* Normally these will be set correctly later. This is the default if you use UCA or the default. */ 907 result->ucaRules = NULL; 908 result->actualLocale = NULL; 909 result->validLocale = NULL; 910 result->requestedLocale = NULL; 911 result->hasRealData = FALSE; // real data lives in .dat file... 912 result->freeImageOnClose = FALSE; 913 914 /* set attributes */ 915 ucol_setOptionsFromHeader( 916 result, 917 (UColOptionSet*)((uint8_t*)result->image+result->image->options), 918 status); 919 result->freeOptionsOnClose = FALSE; 920 921 return result; 922 } 923 924 /* new Mark's code */ 925 926 /** 927 * For generation of Implicit CEs 928 * @author Davis 929 * 930 * Cleaned up so that changes can be made more easily. 931 * Old values: 932 # First Implicit: E26A792D 933 # Last Implicit: E3DC70C0 934 # First CJK: E0030300 935 # Last CJK: E0A9DD00 936 # First CJK_A: E0A9DF00 937 # Last CJK_A: E0DE3100 938 */ 939 /* Following is a port of Mark's code for new treatment of implicits. 940 * It is positioned here, since ucol_initUCA need to initialize the 941 * variables below according to the data in the fractional UCA. 942 */ 943 944 /** 945 * Function used to: 946 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and 947 * b) bump any non-CJK characters by 10FFFF. 
948 * The relevant blocks are: 949 * A: 4E00..9FFF; CJK Unified Ideographs 950 * F900..FAFF; CJK Compatibility Ideographs 951 * B: 3400..4DBF; CJK Unified Ideographs Extension A 952 * 20000..XX; CJK Unified Ideographs Extension B (and others later on) 953 * As long as 954 * no new B characters are allocated between 4E00 and FAFF, and 955 * no new A characters are outside of this range, 956 * (very high probability) this simple code will work. 957 * The reordered blocks are: 958 * Block1 is CJK 959 * Block2 is CJK_COMPAT_USED 960 * Block3 is CJK_A 961 * (all contiguous) 962 * Any other CJK gets its normal code point 963 * Any non-CJK gets +10FFFF 964 * When we reorder Block1, we make sure that it is at the very start, 965 * so that it will use a 3-byte form. 966 * Warning: the we only pick up the compatibility characters that are 967 * NOT decomposed, so that block is smaller! 968 */ 969 970 // CONSTANTS 971 static const UChar32 972 NON_CJK_OFFSET = 0x110000, 973 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 974 975 /** 976 * Precomputed by initImplicitConstants() 977 */ 978 static int32_t 979 final3Multiplier = 0, 980 final4Multiplier = 0, 981 final3Count = 0, 982 final4Count = 0, 983 medialCount = 0, 984 min3Primary = 0, 985 min4Primary = 0, 986 max4Primary = 0, 987 minTrail = 0, 988 maxTrail = 0, 989 max3Trail = 0, 990 max4Trail = 0, 991 min4Boundary = 0; 992 993 static const UChar32 994 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 995 // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1) 996 CJK_BASE = 0x4E00, 997 CJK_LIMIT = 0x9FCC+1, 998 // Unified CJK ideographs in the compatibility ideographs block. 
999 CJK_COMPAT_USED_BASE = 0xFA0E, 1000 CJK_COMPAT_USED_LIMIT = 0xFA2F+1, 1001 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 1002 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 1003 CJK_A_BASE = 0x3400, 1004 CJK_A_LIMIT = 0x4DB5+1, 1005 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; 1006 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; 1007 CJK_B_BASE = 0x20000, 1008 CJK_B_LIMIT = 0x2A6D6+1, 1009 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; 1010 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; 1011 CJK_C_BASE = 0x2A700, 1012 CJK_C_LIMIT = 0x2B734+1, 1013 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; 1014 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; 1015 CJK_D_BASE = 0x2B740, 1016 CJK_D_LIMIT = 0x2B81D+1; 1017 // when adding to this list, look for all occurrences (in project) 1018 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!! 1019 1020 static UChar32 swapCJK(UChar32 i) { 1021 if (i < CJK_A_BASE) { 1022 // non-CJK 1023 } else if (i < CJK_A_LIMIT) { 1024 // Extension A has lower code points than the original Unihan+compat 1025 // but sorts higher. 
1026 return i - CJK_A_BASE 1027 + (CJK_LIMIT - CJK_BASE) 1028 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1029 } else if (i < CJK_BASE) { 1030 // non-CJK 1031 } else if (i < CJK_LIMIT) { 1032 return i - CJK_BASE; 1033 } else if (i < CJK_COMPAT_USED_BASE) { 1034 // non-CJK 1035 } else if (i < CJK_COMPAT_USED_LIMIT) { 1036 return i - CJK_COMPAT_USED_BASE 1037 + (CJK_LIMIT - CJK_BASE); 1038 } else if (i < CJK_B_BASE) { 1039 // non-CJK 1040 } else if (i < CJK_B_LIMIT) { 1041 return i; // non-BMP-CJK 1042 } else if (i < CJK_C_BASE) { 1043 // non-CJK 1044 } else if (i < CJK_C_LIMIT) { 1045 return i; // non-BMP-CJK 1046 } else if (i < CJK_D_BASE) { 1047 // non-CJK 1048 } else if (i < CJK_D_LIMIT) { 1049 return i; // non-BMP-CJK 1050 } 1051 return i + NON_CJK_OFFSET; // non-CJK 1052 } 1053 1054 U_CAPI UChar32 U_EXPORT2 1055 uprv_uca_getRawFromCodePoint(UChar32 i) { 1056 return swapCJK(i)+1; 1057 } 1058 1059 U_CAPI UChar32 U_EXPORT2 1060 uprv_uca_getCodePointFromRaw(UChar32 i) { 1061 i--; 1062 UChar32 result = 0; 1063 if(i >= NON_CJK_OFFSET) { 1064 result = i - NON_CJK_OFFSET; 1065 } else if(i >= CJK_B_BASE) { 1066 result = i; 1067 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted 1068 if(i < CJK_LIMIT - CJK_BASE) { 1069 result = i + CJK_BASE; 1070 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { 1071 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); 1072 } else { 1073 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1074 } 1075 } else { 1076 result = -1; 1077 } 1078 return result; 1079 } 1080 1081 // GET IMPLICIT PRIMARY WEIGHTS 1082 // Return value is left justified primary key 1083 U_CAPI uint32_t U_EXPORT2 1084 uprv_uca_getImplicitFromRaw(UChar32 cp) { 1085 /* 1086 if (cp < 0 || cp > UCOL_MAX_INPUT) { 1087 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); 
1088 } 1089 */ 1090 int32_t last0 = cp - min4Boundary; 1091 if (last0 < 0) { 1092 int32_t last1 = cp / final3Count; 1093 last0 = cp % final3Count; 1094 1095 int32_t last2 = last1 / medialCount; 1096 last1 %= medialCount; 1097 1098 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start 1099 last1 = minTrail + last1; // offset 1100 last2 = min3Primary + last2; // offset 1101 /* 1102 if (last2 >= min4Primary) { 1103 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); 1104 } 1105 */ 1106 return (last2 << 24) + (last1 << 16) + (last0 << 8); 1107 } else { 1108 int32_t last1 = last0 / final4Count; 1109 last0 %= final4Count; 1110 1111 int32_t last2 = last1 / medialCount; 1112 last1 %= medialCount; 1113 1114 int32_t last3 = last2 / medialCount; 1115 last2 %= medialCount; 1116 1117 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start 1118 last1 = minTrail + last1; // offset 1119 last2 = minTrail + last2; // offset 1120 last3 = min4Primary + last3; // offset 1121 /* 1122 if (last3 > max4Primary) { 1123 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); 1124 } 1125 */ 1126 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; 1127 } 1128 } 1129 1130 static uint32_t U_EXPORT2 1131 uprv_uca_getImplicitPrimary(UChar32 cp) { 1132 //fprintf(stdout, "Incoming: %04x\n", cp); 1133 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); 1134 1135 cp = swapCJK(cp); 1136 cp++; 1137 // we now have a range of numbers from 0 to 21FFFF. 
1138 1139 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); 1140 //fprintf(stdout, "CJK swapped: %04x\n", cp); 1141 1142 return uprv_uca_getImplicitFromRaw(cp); 1143 } 1144 1145 /** 1146 * Converts implicit CE into raw integer ("code point") 1147 * @param implicit 1148 * @return -1 if illegal format 1149 */ 1150 U_CAPI UChar32 U_EXPORT2 1151 uprv_uca_getRawFromImplicit(uint32_t implicit) { 1152 UChar32 result; 1153 UChar32 b3 = implicit & 0xFF; 1154 UChar32 b2 = (implicit >> 8) & 0xFF; 1155 UChar32 b1 = (implicit >> 16) & 0xFF; 1156 UChar32 b0 = (implicit >> 24) & 0xFF; 1157 1158 // simple parameter checks 1159 if (b0 < min3Primary || b0 > max4Primary 1160 || b1 < minTrail || b1 > maxTrail) 1161 return -1; 1162 // normal offsets 1163 b1 -= minTrail; 1164 1165 // take care of the final values, and compose 1166 if (b0 < min4Primary) { 1167 if (b2 < minTrail || b2 > max3Trail || b3 != 0) 1168 return -1; 1169 b2 -= minTrail; 1170 UChar32 remainder = b2 % final3Multiplier; 1171 if (remainder != 0) 1172 return -1; 1173 b0 -= min3Primary; 1174 b2 /= final3Multiplier; 1175 result = ((b0 * medialCount) + b1) * final3Count + b2; 1176 } else { 1177 if (b2 < minTrail || b2 > maxTrail 1178 || b3 < minTrail || b3 > max4Trail) 1179 return -1; 1180 b2 -= minTrail; 1181 b3 -= minTrail; 1182 UChar32 remainder = b3 % final4Multiplier; 1183 if (remainder != 0) 1184 return -1; 1185 b3 /= final4Multiplier; 1186 b0 -= min4Primary; 1187 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; 1188 } 1189 // final check 1190 if (result < 0 || result > UCOL_MAX_INPUT) 1191 return -1; 1192 return result; 1193 } 1194 1195 1196 static inline int32_t divideAndRoundUp(int a, int b) { 1197 return 1 + (a-1)/b; 1198 } 1199 1200 /* this function is either called from initUCA or from genUCA before 1201 * doing canonical closure for the UCA. 1202 */ 1203 1204 /** 1205 * Set up to generate implicits. 
1206 * Maintenance Note: this function may end up being called more than once, due 1207 * to threading races during initialization. Make sure that 1208 * none of the Constants is ever transiently assigned an 1209 * incorrect value. 1210 * @param minPrimary 1211 * @param maxPrimary 1212 * @param minTrail final byte 1213 * @param maxTrail final byte 1214 * @param gap3 the gap we leave for tailoring for 3-byte forms 1215 * @param gap4 the gap we leave for tailoring for 4-byte forms 1216 */ 1217 static void initImplicitConstants(int minPrimary, int maxPrimary, 1218 int minTrailIn, int maxTrailIn, 1219 int gap3, int primaries3count, 1220 UErrorCode *status) { 1221 // some simple parameter checks 1222 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) 1223 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) 1224 || (primaries3count < 1)) 1225 { 1226 *status = U_ILLEGAL_ARGUMENT_ERROR; 1227 return; 1228 }; 1229 1230 minTrail = minTrailIn; 1231 maxTrail = maxTrailIn; 1232 1233 min3Primary = minPrimary; 1234 max4Primary = maxPrimary; 1235 // compute constants for use later. 1236 // number of values we can use in trailing bytes 1237 // leave room for empty values between AND above, e.g. if gap = 2 1238 // range 3..7 => +3 -4 -5 -6 -7: so 1 value 1239 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values 1240 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values 1241 final3Multiplier = gap3 + 1; 1242 final3Count = (maxTrail - minTrail + 1) / final3Multiplier; 1243 max3Trail = minTrail + (final3Count - 1) * final3Multiplier; 1244 1245 // medials can use full range 1246 medialCount = (maxTrail - minTrail + 1); 1247 // find out how many values fit in each form 1248 int32_t threeByteCount = medialCount * final3Count; 1249 // now determine where the 3/4 boundary is. 
1250 // we use 3 bytes below the boundary, and 4 above 1251 int32_t primariesAvailable = maxPrimary - minPrimary + 1; 1252 int32_t primaries4count = primariesAvailable - primaries3count; 1253 1254 1255 int32_t min3ByteCoverage = primaries3count * threeByteCount; 1256 min4Primary = minPrimary + primaries3count; 1257 min4Boundary = min3ByteCoverage; 1258 // Now expand out the multiplier for the 4 bytes, and redo. 1259 1260 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; 1261 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); 1262 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); 1263 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; 1264 if (gap4 < 1) { 1265 *status = U_ILLEGAL_ARGUMENT_ERROR; 1266 return; 1267 } 1268 final4Multiplier = gap4 + 1; 1269 final4Count = neededPerFinalByte; 1270 max4Trail = minTrail + (final4Count - 1) * final4Multiplier; 1271 } 1272 1273 /** 1274 * Supply parameters for generating implicit CEs 1275 */ 1276 U_CAPI void U_EXPORT2 1277 uprv_uca_initImplicitConstants(UErrorCode *status) { 1278 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. 1279 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); 1280 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); 1281 } 1282 1283 1284 /* collIterNormalize Incremental Normalization happens here. */ 1285 /* pick up the range of chars identifed by FCD, */ 1286 /* normalize it into the collIterate's writable buffer, */ 1287 /* switch the collIterate's state to use the writable buffer. 
*/ 1288 /* */ 1289 static 1290 void collIterNormalize(collIterate *collationSource) 1291 { 1292 UErrorCode status = U_ZERO_ERROR; 1293 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ 1294 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ 1295 1296 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)), 1297 collationSource->writableBuffer, 1298 status); 1299 if (U_FAILURE(status)) { 1300 #ifdef UCOL_DEBUG 1301 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status)); 1302 #endif 1303 return; 1304 } 1305 1306 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer(); 1307 collationSource->origFlags = collationSource->flags; 1308 collationSource->flags |= UCOL_ITER_INNORMBUF; 1309 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1310 } 1311 1312 1313 // This function takes the iterator and extracts normalized stuff up to the next boundary 1314 // It is similar in the end results to the collIterNormalize, but for the cases when we 1315 // use an iterator 1316 /*static 1317 inline void normalizeIterator(collIterate *collationSource) { 1318 UErrorCode status = U_ZERO_ERROR; 1319 UBool wasNormalized = FALSE; 1320 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); 1321 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); 1322 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1323 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1324 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { 1325 // reallocate and terminate 1326 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, 1327 &collationSource->writableBuffer, 1328 (int32_t *)&collationSource->writableBufSize, normLen + 1, 1329 0) 
1330 ) { 1331 #ifdef UCOL_DEBUG 1332 fprintf(stderr, "normalizeIterator(), out of memory\n"); 1333 #endif 1334 return; 1335 } 1336 status = U_ZERO_ERROR; 1337 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); 1338 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); 1339 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1340 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1341 } 1342 // Terminate the buffer - we already checked that it is big enough 1343 collationSource->writableBuffer[normLen] = 0; 1344 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { 1345 collationSource->flags |= UCOL_ITER_ALLOCATED; 1346 } 1347 collationSource->pos = collationSource->writableBuffer; 1348 collationSource->origFlags = collationSource->flags; 1349 collationSource->flags |= UCOL_ITER_INNORMBUF; 1350 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1351 }*/ 1352 1353 1354 /* Incremental FCD check and normalize */ 1355 /* Called from getNextCE when normalization state is suspect. */ 1356 /* When entering, the state is known to be this: */ 1357 /* o We are working in the main buffer of the collIterate, not the side */ 1358 /* writable buffer. When in the side buffer, normalization mode is always off, */ 1359 /* so we won't get here. */ 1360 /* o The leading combining class from the current character is 0 or */ 1361 /* the trailing combining class of the previous char was zero. */ 1362 /* True because the previous call to this function will have always exited */ 1363 /* that way, and we get called for every char where cc might be non-zero. 
*/
/*   Scans forward from the character just read (pos-1) over the following code units     */
/*   whose leading combining class is non-zero, and stores the end of that run in         */
/*   collationSource->fcdPosition.                                                        */
/*   Returns TRUE if some character in the run has a leading combining class lower than   */
/*   the trailing combining class of the character before it, FALSE otherwise.            */
static
inline UBool collIterFCD(collIterate *collationSource) {
    const UChar *srcP, *endP;
    uint8_t     leadingCC;
    uint8_t     prevTrailingCC = 0;
    uint16_t    fcd;
    UBool       needNormalize = FALSE;

    // pos has already been advanced past the current character.
    srcP = collationSource->pos-1;

    // A NULL limit is used for NUL-terminated input.
    if (collationSource->flags & UCOL_ITER_HASLEN) {
        endP = collationSource->endp;
    } else {
        endP = NULL;
    }

    // Get the trailing combining class of the current character.  If it's zero, we are OK.
    // (nextFCD16 advances srcP; the low byte of its result is used as the trailing cc,
    //  the high byte as the leading cc.)
    fcd = g_nfcImpl->nextFCD16(srcP, endP);
    if (fcd != 0) {
        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

        if (prevTrailingCC != 0) {
            // The current char has a non-zero trailing CC.  Scan forward until we find
            //   a char with a leading cc of zero.
            while (endP == NULL || srcP != endP)
            {
                const UChar *savedSrcP = srcP;

                fcd = g_nfcImpl->nextFCD16(srcP, endP);
                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
                if (leadingCC == 0) {
                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
                                           //   back up over it.  (Could be surrogate pair!)
                    break;
                }

                // Combining classes out of order: the run is not FCD.
                if (leadingCC < prevTrailingCC) {
                    needNormalize = TRUE;
                }

                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
            }
        }
    }

    collationSource->fcdPosition = (UChar *)srcP;

    return needNormalize;
}

/****************************************************************************/
/* Following are the CE retrieval functions                                 */
/*                                                                          */
/****************************************************************************/

static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);

/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
/*                                                                     */
/* Order of work, as implemented below:                                */
/*   1. return a CE still buffered from a previous expansion, if any;  */
/*   2. fetch the next code unit (NUL-terminated string, string with   */
/*      length, UCharIterator, or normalization side buffer);          */
/*   3. when UCOL_ITER_NORM is set, run the FCD check and possibly     */
/*      switch to the normalized side buffer;                          */
/*   4. look the code unit up in the tailoring, then in coll->UCA,     */
/*      and finally fall back to getImplicit().                        */
/* Returns the CE, or UCOL_NO_MORE_CES at the end of the input.        */
static
inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    uint32_t order = 0;
    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
        order = *(collationSource->toReturn++);                         /* if so, return them */
        if(collationSource->CEpos == collationSource->toReturn) {
            /* Buffer drained: reset both pointers to the start of the CE buffer in use. */
            collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
        }
        return order;
    }

    UChar ch = 0;
    collationSource->offsetReturn = NULL;

    do {
        for (;;)                           /* Loop handles case when incremental normalize switches   */
        {                                  /*   to or from the side buffer / original string, and we  */
            /*   need to start again to get the next character.        */

            if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
            {
                // The source string is null terminated and we're not working from the side buffer,
                //   and we're not normalizing.  This is the fast path.
                //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
                ch = *collationSource->pos++;
                if (ch != 0) {
                    break;
                }
                else {
                    return UCOL_NO_MORE_CES;
                }
            }

            if (collationSource->flags & UCOL_ITER_HASLEN) {
                // Normal path for strings when length is specified.
                //   (We can't be in side buffer because it is always null terminated.)
                if (collationSource->pos >= collationSource->endp) {
                    // Ran off of the end of the main source string.  We're done.
                    return UCOL_NO_MORE_CES;
                }
                ch = *collationSource->pos++;
            }
            else if(collationSource->flags & UCOL_USE_ITERATOR) {
                UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
                if(iterCh == U_SENTINEL) {
                    return UCOL_NO_MORE_CES;
                }
                ch = (UChar)iterCh;
            }
            else
            {
                // Null terminated string.
                ch = *collationSource->pos++;
                if (ch == 0) {
                    // Ran off end of buffer.
                    if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                        // Ran off end of main string. backing up one character.
                        collationSource->pos--;
                        return UCOL_NO_MORE_CES;
                    }
                    else
                    {
                        // Hit null in the normalize side buffer.
                        // Usually this means the end of the normalized data,
                        // except for one odd case: a null followed by combining chars,
                        //   which is the case if we are at the start of the buffer.
                        if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
                            break;
                        }

                        //  Null marked end of side buffer.
                        //   Revert to the main string and
                        //   loop back to top to try again to get a character.
                        collationSource->pos   = collationSource->fcdPosition;
                        collationSource->flags = collationSource->origFlags;
                        continue;
                    }
                }
            }

            if(collationSource->flags&UCOL_HIRAGANA_Q) {
                /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
                 * based on whether the previous codepoint was Hiragana or Katakana.
                 */
                if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
                        ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
                    collationSource->flags |= UCOL_WAS_HIRAGANA;
                } else {
                    collationSource->flags &= ~UCOL_WAS_HIRAGANA;
                }
            }

            // We've got a character.  See if there's any fcd and/or normalization stuff to do.
            //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
            if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
                break;
            }

            if (collationSource->fcdPosition >= collationSource->pos) {
                // An earlier FCD check has already covered the current character.
                // We can go ahead and process this char.
                break;
            }

            if (ch < ZERO_CC_LIMIT_ ) {
                // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
                break;
            }

            // NOTE(review): NFC_ZERO_CC_BLOCK_LIMIT_ is defined outside this chunk; presumably
            //   code units below it have a leading combining class of zero - confirm.
            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                // We need to peek at the next character in order to tell if we are FCD
                if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
                    // We are at the last char of source string.
                    //  It is always OK for FCD check.
                    break;
                }

                // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
                if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }


            // Need a more complete FCD check and possible normalization.
            if (collIterFCD(collationSource)) {
                collIterNormalize(collationSource);
            }
            if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                //  No normalization was needed.  Go ahead and process the char we already had.
                break;
            }

            // Some normalization happened.  Next loop iteration will pick up a char
            //   from the normalization buffer.

        }   // end for (;;)


        if (ch <= 0xFF) {
            /*  For latin-1 characters we never need to fall back to the UCA table        */
            /*    because all of the UCA data is replicated in the latinOneMapping array  */
            order = coll->latinOneMapping[ch];
            if (order > UCOL_NOT_FOUND) {
                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
            }
        }
        else
        {
            // Always use UCA for Han, Hangul
            // (Han extension A is before main Han block)
            // **** Han compatibility chars ?? ****
            if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
                if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
                    // between the two target ranges; do normal lookup
                    // **** this range is YI, Modifier tone letters, ****
                    // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                    // **** Latin-D might be tailored, so we need to ****
                    // **** do the normal lookup for these guys.     ****
                    order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                } else {
                    // in one of the target ranges; use UCA
                    order = UCOL_NOT_FOUND;
                }
            } else {
                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            }

            if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
            }

            if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
                /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
                order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

                if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
                    order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
                }
            }
        }
    /* An ignorable result for a code unit in the Hangul range is not returned; fetch the next character instead. */
    } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

    if(order == UCOL_NOT_FOUND) {
        /* Neither the tailoring nor the UCA had a CE: fall back to getImplicit(). */
        order = getImplicit(ch, collationSource);
    }
    return order; /* return the CE */
}

/* ucol_getNextCE, out-of-line version for use from other files.   */
U_CAPI uint32_t  U_EXPORT2
ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    return ucol_IGetNextCE(coll, collationSource, status);
}


/**
* Incremental previous normalization happens here. Pick up the range of chars
* identified by FCD, normalize it into the collIterate's writable buffer,
* switch the collIterate's state to use the writable buffer.
1627 * @param data collation iterator data 1628 */ 1629 static 1630 void collPrevIterNormalize(collIterate *data) 1631 { 1632 UErrorCode status = U_ZERO_ERROR; 1633 const UChar *pEnd = data->pos; /* End normalize + 1 */ 1634 const UChar *pStart; 1635 1636 /* Start normalize */ 1637 if (data->fcdPosition == NULL) { 1638 pStart = data->string; 1639 } 1640 else { 1641 pStart = data->fcdPosition + 1; 1642 } 1643 1644 int32_t normLen = 1645 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)), 1646 data->writableBuffer, 1647 status). 1648 length(); 1649 if(U_FAILURE(status)) { 1650 return; 1651 } 1652 /* 1653 this puts the null termination infront of the normalized string instead 1654 of the end 1655 */ 1656 data->writableBuffer.insert(0, (UChar)0); 1657 1658 /* 1659 * The usual case at this point is that we've got a base 1660 * character followed by marks that were normalized. If 1661 * fcdPosition is NULL, that means that we backed up to 1662 * the beginning of the string and there's no base character. 1663 * 1664 * Forward processing will usually normalize when it sees 1665 * the first mark, so that mark will get it's natural offset 1666 * and the rest will get the offset of the character following 1667 * the marks. The base character will also get its natural offset. 1668 * 1669 * We write the offset of the base character, if there is one, 1670 * followed by the offset of the first mark and then the offsets 1671 * of the rest of the marks. 
1672 */ 1673 int32_t firstMarkOffset = 0; 1674 int32_t trailOffset = (int32_t)(data->pos - data->string + 1); 1675 int32_t trailCount = normLen - 1; 1676 1677 if (data->fcdPosition != NULL) { 1678 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string); 1679 UChar baseChar = *data->fcdPosition; 1680 1681 firstMarkOffset = baseOffset + 1; 1682 1683 /* 1684 * If the base character is the start of a contraction, forward processing 1685 * will normalize the marks while checking for the contraction, which means 1686 * that the offset of the first mark will the same as the other marks. 1687 * 1688 * **** THIS IS PROBABLY NOT A COMPLETE TEST **** 1689 */ 1690 if (baseChar >= 0x100) { 1691 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar); 1692 1693 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { 1694 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar); 1695 } 1696 1697 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) { 1698 firstMarkOffset = trailOffset; 1699 } 1700 } 1701 1702 data->appendOffset(baseOffset, status); 1703 } 1704 1705 data->appendOffset(firstMarkOffset, status); 1706 1707 for (int32_t i = 0; i < trailCount; i += 1) { 1708 data->appendOffset(trailOffset, status); 1709 } 1710 1711 data->offsetRepeatValue = trailOffset; 1712 1713 data->offsetReturn = data->offsetStore - 1; 1714 if (data->offsetReturn == data->offsetBuffer) { 1715 data->offsetStore = data->offsetBuffer; 1716 } 1717 1718 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; 1719 data->origFlags = data->flags; 1720 data->flags |= UCOL_ITER_INNORMBUF; 1721 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 1722 } 1723 1724 1725 /** 1726 * Incremental FCD check for previous iteration and normalize. Called from 1727 * getPrevCE when normalization state is suspect. 
1728 * When entering, the state is known to be this: 1729 * o We are working in the main buffer of the collIterate, not the side 1730 * writable buffer. When in the side buffer, normalization mode is always 1731 * off, so we won't get here. 1732 * o The leading combining class from the current character is 0 or the 1733 * trailing combining class of the previous char was zero. 1734 * True because the previous call to this function will have always exited 1735 * that way, and we get called for every char where cc might be non-zero. 1736 * @param data collation iterate struct 1737 * @return normalization status, TRUE for normalization to be done, FALSE 1738 * otherwise 1739 */ 1740 static 1741 inline UBool collPrevIterFCD(collIterate *data) 1742 { 1743 const UChar *src, *start; 1744 uint8_t leadingCC; 1745 uint8_t trailingCC = 0; 1746 uint16_t fcd; 1747 UBool result = FALSE; 1748 1749 start = data->string; 1750 src = data->pos + 1; 1751 1752 /* Get the trailing combining class of the current character. */ 1753 fcd = g_nfcImpl->previousFCD16(start, src); 1754 1755 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1756 1757 if (leadingCC != 0) { 1758 /* 1759 The current char has a non-zero leading combining class. 1760 Scan backward until we find a char with a trailing cc of zero. 1761 */ 1762 for (;;) 1763 { 1764 if (start == src) { 1765 data->fcdPosition = NULL; 1766 return result; 1767 } 1768 1769 fcd = g_nfcImpl->previousFCD16(start, src); 1770 1771 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1772 1773 if (trailingCC == 0) { 1774 break; 1775 } 1776 1777 if (leadingCC < trailingCC) { 1778 result = TRUE; 1779 } 1780 1781 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1782 } 1783 } 1784 1785 data->fcdPosition = (UChar *)src; 1786 1787 return result; 1788 } 1789 1790 /** gets a code unit from the string at a given offset 1791 * Handles both normal and iterative cases. 1792 * No error checking - caller beware! 
1793 */ 1794 static inline 1795 UChar peekCodeUnit(collIterate *source, int32_t offset) { 1796 if(source->pos != NULL) { 1797 return *(source->pos + offset); 1798 } else if(source->iterator != NULL) { 1799 UChar32 c; 1800 if(offset != 0) { 1801 source->iterator->move(source->iterator, offset, UITER_CURRENT); 1802 c = source->iterator->next(source->iterator); 1803 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); 1804 } else { 1805 c = source->iterator->current(source->iterator); 1806 } 1807 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0. 1808 } else { 1809 return 0xfffd; 1810 } 1811 } 1812 1813 // Code point version. Treats the offset as a _code point_ delta. 1814 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16. 1815 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer. 1816 static inline 1817 UChar32 peekCodePoint(collIterate *source, int32_t offset) { 1818 UChar32 c; 1819 if(source->pos != NULL) { 1820 const UChar *p = source->pos; 1821 if(offset >= 0) { 1822 // Skip forward over (offset-1) code points. 1823 while(--offset >= 0) { 1824 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) { 1825 ++p; 1826 } 1827 } 1828 // Read the code point there. 1829 c = *p++; 1830 UChar trail; 1831 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) { 1832 c = U16_GET_SUPPLEMENTARY(c, trail); 1833 } 1834 } else /* offset<0 */ { 1835 // Skip backward over (offset-1) code points. 1836 while(++offset < 0) { 1837 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) { 1838 --p; 1839 } 1840 } 1841 // Read the code point before that. 1842 c = *--p; 1843 UChar lead; 1844 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) { 1845 c = U16_GET_SUPPLEMENTARY(lead, c); 1846 } 1847 } 1848 } else if(source->iterator != NULL) { 1849 if(offset >= 0) { 1850 // Skip forward over (offset-1) code points. 
1851 int32_t fwd = offset; 1852 while(fwd-- > 0) { 1853 uiter_next32(source->iterator); 1854 } 1855 // Read the code point there. 1856 c = uiter_current32(source->iterator); 1857 // Return to the starting point, skipping backward over (offset-1) code points. 1858 while(offset-- > 0) { 1859 uiter_previous32(source->iterator); 1860 } 1861 } else /* offset<0 */ { 1862 // Read backward, reading offset code points, remember only the last-read one. 1863 int32_t back = offset; 1864 do { 1865 c = uiter_previous32(source->iterator); 1866 } while(++back < 0); 1867 // Return to the starting position, skipping forward over offset code points. 1868 do { 1869 uiter_next32(source->iterator); 1870 } while(++offset < 0); 1871 } 1872 } else { 1873 c = U_SENTINEL; 1874 } 1875 return c; 1876 } 1877 1878 /** 1879 * Determines if we are at the start of the data string in the backwards 1880 * collation iterator 1881 * @param data collation iterator 1882 * @return TRUE if we are at the start 1883 */ 1884 static 1885 inline UBool isAtStartPrevIterate(collIterate *data) { 1886 if(data->pos == NULL && data->iterator != NULL) { 1887 return !data->iterator->hasPrevious(data->iterator); 1888 } 1889 //return (collIter_bos(data)) || 1890 return (data->pos == data->string) || 1891 ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) && 1892 *(data->pos - 1) == 0 && data->fcdPosition == NULL); 1893 } 1894 1895 static 1896 inline void goBackOne(collIterate *data) { 1897 # if 0 1898 // somehow, it looks like we need to keep iterator synced up 1899 // at all times, as above. 1900 if(data->pos) { 1901 data->pos--; 1902 } 1903 if(data->iterator) { 1904 data->iterator->previous(data->iterator); 1905 } 1906 #endif 1907 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { 1908 data->iterator->previous(data->iterator); 1909 } 1910 if(data->pos) { 1911 data->pos --; 1912 } 1913 } 1914 1915 /** 1916 * Inline function that gets a simple CE. 
* So what it does is that it will first check the expansion buffer. If the
 * expansion buffer is not empty, ie toReturn is still above the start of the
 * CE buffer (extendCEs when it is set, CEs otherwise), we decrement the
 * return pointer and return the collation element it then points to.
 * Otherwise the previous code unit is fetched (from the string, the iterator
 * or the normalization side buffer), an FCD check and normalization are done
 * if needed, and the CE is looked up. Special CEs are resolved through
 * ucol_prv_getSpecialPrevCE, and code units that are still not found go
 * through getPrevImplicit.
 * @param coll collator data
 * @param data collation iterator struct
 * @param status error status
 * @return the previous CE, or UCOL_NO_MORE_CES when the start of the text
 *         has been reached
 */
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /*
    Step the offset bookkeeping back: consume one pending repeat, or else move
    offsetReturn back one slot; once it is at the start of offsetBuffer, clear
    offsetReturn and reset offsetStore.
    */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
            data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                data->offsetReturn = NULL;
                data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* CEs are still buffered: hand out the one just below toReturn. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        /* Reached the start of a CE buffer: reset CEpos to that start. */
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        do {
            /*
            Loop handles case when incremental normalize switches to or from the
            side buffer / original string, and we need to start again to get the
            next character.
            */
            for (;;) {
                if (data->flags & UCOL_ITER_HASLEN) {
                    /*
                    Normal path for strings when length is specified.
                    Not in side buffer because it is always null terminated.
                    */
                    if (data->pos <= data->string) {
                        /* End of the main source string */
                        return UCOL_NO_MORE_CES;
                    }
                    data->pos --;
                    ch = *data->pos;
                }
                // we are using an iterator to go back. Pray for us!
                else if (data->flags & UCOL_USE_ITERATOR) {
                    UChar32 iterCh = data->iterator->previous(data->iterator);
                    if(iterCh == U_SENTINEL) {
                        return UCOL_NO_MORE_CES;
                    } else {
                        ch = (UChar)iterCh;
                    }
                }
                else {
                    data->pos --;
                    ch = *data->pos;
                    /* we are in the side buffer. */
                    if (ch == 0) {
                        /*
                        At the start of the normalize side buffer.
                        Go back to string.
                        Because pointer points to the last accessed character,
                        hence we have to increment it by one here.
                        */
                        data->flags = data->origFlags;
                        data->offsetRepeatValue = 0;

                        if (data->fcdPosition == NULL) {
                            /* Nothing to go back to: this is the start of the text. */
                            data->pos = data->string;
                            return UCOL_NO_MORE_CES;
                        }
                        else {
                            data->pos   = data->fcdPosition + 1;
                        }

                        /* Re-read a code unit, now from the original string. */
                        continue;
                    }
                }

                /* Record whether this code unit lies in 0x3040..0x309f. */
                if(data->flags&UCOL_HIRAGANA_Q) {
                    if(ch>=0x3040 && ch<=0x309f) {
                        data->flags |= UCOL_WAS_HIRAGANA;
                    } else {
                        data->flags &= ~UCOL_WAS_HIRAGANA;
                    }
                }

                /*
                * got a character to determine if there's fcd and/or normalization
                * stuff to do.
                * if the current character is not fcd.
                * if current character is at the start of the string
                * Trailing combining class == 0.
                * Note if pos is in the writablebuffer, norm is always 0
                */
                if (ch < ZERO_CC_LIMIT_ ||
                    // this should propel us out of the loop in the iterator case
                    (data->flags & UCOL_ITER_NORM) == 0 ||
                    (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                    || data->string == data->pos) {
                    break;
                }

                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    /* if next character is FCD */
                    if (data->pos == data->string) {
                        /* First char of string is always OK for FCD check */
                        break;
                    }

                    /* Not first char of string, do the FCD fast test */
                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                        break;
                    }
                }

                /* Need a more complete FCD check and possible normalization. */
                if (collPrevIterFCD(data)) {
                    collPrevIterNormalize(data);
                }

                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                    /*  No normalization. Go ahead and process the char. */
                    break;
                }

                /*
                Some normalization happened.
                Next loop picks up a char from the normalization buffer.
                */
            }

            /* attempt to handle contractions, after removal of the backwards
            contraction
            */
            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
            } else {
                if (ch <= 0xFF) {
                    /* Code units up to 0xFF are looked up in the latinOneMapping table. */
                    result = coll->latinOneMapping[ch];
                }
                else {
                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
                    // **** [FA0E..FA2F] ?? ****
                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                        (ch >= 0x3400 && ch <= 0xD7AF)) {
                        if (ch > 0x9FFF && ch < 0xAC00) {
                            // between the two target ranges; do normal lookup
                            // **** this range is YI, Modifier tone letters, ****
                            // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                            // **** Latin-D might be tailored, so we need to ****
                            // **** do the normal lookup for these guys.     ****
                            result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                        } else {
                            result = UCOL_NOT_FOUND;
                        }
                    } else {
                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    }
                }
                /* Values above UCOL_NOT_FOUND are handed to ucol_prv_getSpecialPrevCE. */
                if (result > UCOL_NOT_FOUND) {
                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
                }
                if (result == UCOL_NOT_FOUND) { // Not found in master list
                    if (!isAtStartPrevIterate(data) &&
                        ucol_contractionEndCP(ch, data->coll))
                    {
                        result = UCOL_CONTRACTION;
                    } else {
                        /* Retry the lookup in the UCA mapping, if a UCA is attached. */
                        if(coll->UCA) {
                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                        }
                    }

                    if (result > UCOL_NOT_FOUND) {
                        if(coll->UCA) {
                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                        }
                    }
                }
            }
        /* Fetch another code unit while an ignorable result came from the Hangul range. */
        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

        /* Still not found anywhere: let getPrevImplicit produce the CE. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}


/* ucol_getPrevCE, out-of-line version for use from other files. */
/* Simply forwards to the inline ucol_IGetPrevCE with the same arguments. */
U_CFUNC uint32_t  U_EXPORT2
ucol_getPrevCE(const UCollator *coll, collIterate *data,
               UErrorCode *status) {
    return ucol_IGetPrevCE(coll, data, status);
}


/* this should be connected to special Jamo handling */
/*
 * Returns the first CE produced for the single code unit u: a temporary
 * collIterate is initialized over u with length 1 and ucol_IGetNextCE is
 * called once on it. Returns 0 if the iterator initialization reports a
 * failure in *status.
 */
U_CFUNC uint32_t  U_EXPORT2
ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
    collIterate colIt;
    IInit_collIterate(coll, &u, 1, &colIt, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    return ucol_IGetNextCE(coll, &colIt, status);
}

/**
 * Inserts the argument character into the end of the buffer pushing back the
 * null terminator.
2146 * @param data collIterate struct data 2147 * @param ch character to be appended 2148 * @return the position of the new addition 2149 */ 2150 static 2151 inline const UChar * insertBufferEnd(collIterate *data, UChar ch) 2152 { 2153 int32_t oldLength = data->writableBuffer.length(); 2154 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; 2155 } 2156 2157 /** 2158 * Inserts the argument string into the end of the buffer pushing back the 2159 * null terminator. 2160 * @param data collIterate struct data 2161 * @param string to be appended 2162 * @param length of the string to be appended 2163 * @return the position of the new addition 2164 */ 2165 static 2166 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length) 2167 { 2168 int32_t oldLength = data->writableBuffer.length(); 2169 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength; 2170 } 2171 2172 /** 2173 * Special normalization function for contraction in the forwards iterator. 2174 * This normalization sequence will place the current character at source->pos 2175 * and its following normalized sequence into the buffer. 2176 * The fcd position, pos will be changed. 2177 * pos will now point to positions in the buffer. 2178 * Flags will be changed accordingly. 
2179 * @param data collation iterator data 2180 */ 2181 static 2182 inline void normalizeNextContraction(collIterate *data) 2183 { 2184 int32_t strsize; 2185 UErrorCode status = U_ZERO_ERROR; 2186 /* because the pointer points to the next character */ 2187 const UChar *pStart = data->pos - 1; 2188 const UChar *pEnd; 2189 2190 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 2191 data->writableBuffer.setTo(*(pStart - 1)); 2192 strsize = 1; 2193 } 2194 else { 2195 strsize = data->writableBuffer.length(); 2196 } 2197 2198 pEnd = data->fcdPosition; 2199 2200 data->writableBuffer.append( 2201 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status)); 2202 if(U_FAILURE(status)) { 2203 return; 2204 } 2205 2206 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize; 2207 data->origFlags = data->flags; 2208 data->flags |= UCOL_ITER_INNORMBUF; 2209 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2210 } 2211 2212 /** 2213 * Contraction character management function that returns the next character 2214 * for the forwards iterator. 2215 * Does nothing if the next character is in buffer and not the first character 2216 * in it. 2217 * Else it checks next character in data string to see if it is normalizable. 2218 * If it is not, the character is simply copied into the buffer, else 2219 * the whole normalized substring is copied into the buffer, including the 2220 * current character. 2221 * @param data collation element iterator data 2222 * @return next character 2223 */ 2224 static 2225 inline UChar getNextNormalizedChar(collIterate *data) 2226 { 2227 UChar nextch; 2228 UChar ch; 2229 // Here we need to add the iterator code. One problem is the way 2230 // end of string is handled. If we just return next char, it could 2231 // be the sentinel. Most of the cases already check for this, but we 2232 // need to be sure. 2233 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { 2234 /* if no normalization and not in buffer. 
*/ 2235 if(data->flags & UCOL_USE_ITERATOR) { 2236 return (UChar)data->iterator->next(data->iterator); 2237 } else { 2238 return *(data->pos ++); 2239 } 2240 } 2241 2242 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { 2243 //normalizeIterator(data); 2244 //} 2245 2246 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); 2247 if ((innormbuf && *data->pos != 0) || 2248 (data->fcdPosition != NULL && !innormbuf && 2249 data->pos < data->fcdPosition)) { 2250 /* 2251 if next character is in normalized buffer, no further normalization 2252 is required 2253 */ 2254 return *(data->pos ++); 2255 } 2256 2257 if (data->flags & UCOL_ITER_HASLEN) { 2258 /* in data string */ 2259 if (data->pos + 1 == data->endp) { 2260 return *(data->pos ++); 2261 } 2262 } 2263 else { 2264 if (innormbuf) { 2265 // inside the normalization buffer, but at the end 2266 // (since we encountered zero). This means, in the 2267 // case we're using char iterator, that we need to 2268 // do another round of normalization. 2269 //if(data->origFlags & UCOL_USE_ITERATOR) { 2270 // we need to restore original flags, 2271 // otherwise, we'll lose them 2272 //data->flags = data->origFlags; 2273 //normalizeIterator(data); 2274 //return *(data->pos++); 2275 //} else { 2276 /* 2277 in writable buffer, at this point fcdPosition can not be 2278 pointing to the end of the data string. see contracting tag. 2279 */ 2280 if(data->fcdPosition) { 2281 if (*(data->fcdPosition + 1) == 0 || 2282 data->fcdPosition + 1 == data->endp) { 2283 /* at the end of the string, dump it into the normalizer */ 2284 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1; 2285 // Check if data->pos received a null pointer 2286 if (data->pos == NULL) { 2287 return (UChar)-1; // Return to indicate error. 2288 } 2289 return *(data->fcdPosition ++); 2290 } 2291 data->pos = data->fcdPosition; 2292 } else if(data->origFlags & UCOL_USE_ITERATOR) { 2293 // if we are here, we're using a normalizing iterator. 
2294 // we should just continue further. 2295 data->flags = data->origFlags; 2296 data->pos = NULL; 2297 return (UChar)data->iterator->next(data->iterator); 2298 } 2299 //} 2300 } 2301 else { 2302 if (*(data->pos + 1) == 0) { 2303 return *(data->pos ++); 2304 } 2305 } 2306 } 2307 2308 ch = *data->pos ++; 2309 nextch = *data->pos; 2310 2311 /* 2312 * if the current character is not fcd. 2313 * Trailing combining class == 0. 2314 */ 2315 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && 2316 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || 2317 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { 2318 /* 2319 Need a more complete FCD check and possible normalization. 2320 normalize substring will be appended to buffer 2321 */ 2322 if (collIterFCD(data)) { 2323 normalizeNextContraction(data); 2324 return *(data->pos ++); 2325 } 2326 else if (innormbuf) { 2327 /* fcdposition shifted even when there's no normalization, if we 2328 don't input the rest into this, we'll get the wrong position when 2329 we reach the end of the writableBuffer */ 2330 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1); 2331 data->pos = insertBufferEnd(data, data->pos - 1, length); 2332 // Check if data->pos received a null pointer 2333 if (data->pos == NULL) { 2334 return (UChar)-1; // Return to indicate error. 2335 } 2336 return *(data->pos ++); 2337 } 2338 } 2339 2340 if (innormbuf) { 2341 /* 2342 no normalization is to be done hence only one character will be 2343 appended to the buffer. 2344 */ 2345 data->pos = insertBufferEnd(data, ch) + 1; 2346 // Check if data->pos received a null pointer 2347 if (data->pos == NULL) { 2348 return (UChar)-1; // Return to indicate error. 
2349 } 2350 } 2351 2352 /* points back to the pos in string */ 2353 return ch; 2354 } 2355 2356 2357 2358 /** 2359 * Function to copy the buffer into writableBuffer and sets the fcd position to 2360 * the correct position 2361 * @param source data string source 2362 * @param buffer character buffer 2363 */ 2364 static 2365 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer) 2366 { 2367 /* okay confusing part here. to ensure that the skipped characters are 2368 considered later, we need to place it in the appropriate position in the 2369 normalization buffer and reassign the pos pointer. simple case if pos 2370 reside in string, simply copy to normalization buffer and 2371 fcdposition = pos, pos = start of normalization buffer. if pos in 2372 normalization buffer, we'll insert the copy infront of pos and point pos 2373 to the start of the normalization buffer. why am i doing these copies? 2374 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does 2375 not require any changes, which be really painful. */ 2376 if (source->flags & UCOL_ITER_INNORMBUF) { 2377 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer(); 2378 source->writableBuffer.replace(0, replaceLength, buffer); 2379 } 2380 else { 2381 source->fcdPosition = source->pos; 2382 source->origFlags = source->flags; 2383 source->flags |= UCOL_ITER_INNORMBUF; 2384 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 2385 source->writableBuffer = buffer; 2386 } 2387 2388 source->pos = source->writableBuffer.getTerminatedBuffer(); 2389 } 2390 2391 /** 2392 * Function to get the discontiguos collation element within the source. 2393 * Note this function will set the position to the appropriate places. 
* @param coll current collator used
 * @param source data string source
 * @param constart index to the start character in the contraction table
 * @return discontiguos collation element offset
 */
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
    const UChar *temppos      = source->pos;
    /* Collects the code units that were skipped while looking for a match. */
    UnicodeString buffer;
    const UChar   *tempconstart = constart;
    uint8_t  tempflags    = source->flags;
    UBool    multicontraction = FALSE;
    collIterateState discState;

    /* Saved so that the position can be restored if no match is found. */
    backupState(source, &discState);

    /* Start the skipped-character buffer with the code point before the current position. */
    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
                 source->fcdPosition == NULL ||
                 source->fcdPosition == source->endp ||
                 *(source->fcdPosition) == 0 ||
                 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
                 /* end of string in null terminated string or stopped by a
                 null character, note fcd does not always point to a base
                 character after the discontiguos change */
                 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
                 //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* Return to the position remembered at the last multi-contraction
                   step and use the CE stored for tempconstart. */
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* Advance to the first table entry that is not smaller than schar.
           NOTE(review): presumably the table is sorted and terminated by an
           entry no smaller than any schar; confirm against the table format. */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguos buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* Same combining class as the code point two positions back:
               treat schar as skipped as well. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                                      (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
            break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            /* Complete match: queue the skipped characters and return its CE. */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}

/* now uses Mark's getImplicitPrimary code */
/*
 * Produces the CEs for code point cp from uprv_uca_getImplicitPrimary.
 * A second CE, built from the low 16 bits of that value, is stored at CEpos in
 * the iterator's CE buffer (advancing CEpos and offsetRepeatCount); the first
 * CE, built from the bits selected by UCOL_PRIMARYMASK, is returned.
 */
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
    uint32_t r = uprv_uca_getImplicitPrimary(cp);
    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
    collationSource->offsetRepeatCount += 1;
    return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
}

/**
 * Inserts the argument character into the front of the buffer replacing the
 * front null terminator.
 * Element 0 of writableBuffer is overwritten with ch and a new NUL is then
 * inserted in front of it; pos is set two code units into the terminated
 * buffer, i.e. just past the inserted character.
 * @param data collation element iterator data
 * @param ch character to be appended
 */
static
inline void insertBufferFront(collIterate *data, UChar ch)
{
    data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
}

/**
 * Special normalization function for contraction in the previous iterator.
 * This normalization sequence will place the current character at source->pos
 * and its following normalized sequence into the buffer.
 * The fcd position, pos will be changed.
 * pos will now point to positions in the buffer.
 * Flags will be changed accordingly.
2527 * @param data collation iterator data 2528 */ 2529 static 2530 inline void normalizePrevContraction(collIterate *data, UErrorCode *status) 2531 { 2532 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ 2533 const UChar *pStart; 2534 2535 UnicodeString endOfBuffer; 2536 if (data->flags & UCOL_ITER_HASLEN) { 2537 /* 2538 normalization buffer not used yet, we'll pull down the next 2539 character into the end of the buffer 2540 */ 2541 endOfBuffer.setTo(*pEnd); 2542 } 2543 else { 2544 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL 2545 } 2546 2547 if (data->fcdPosition == NULL) { 2548 pStart = data->string; 2549 } 2550 else { 2551 pStart = data->fcdPosition + 1; 2552 } 2553 int32_t normLen = 2554 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), 2555 data->writableBuffer, 2556 *status). 2557 length(); 2558 if(U_FAILURE(*status)) { 2559 return; 2560 } 2561 /* 2562 this puts the null termination infront of the normalized string instead 2563 of the end 2564 */ 2565 data->pos = 2566 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() + 2567 1 + normLen; 2568 data->origFlags = data->flags; 2569 data->flags |= UCOL_ITER_INNORMBUF; 2570 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2571 } 2572 2573 /** 2574 * Contraction character management function that returns the previous character 2575 * for the backwards iterator. 2576 * Does nothing if the previous character is in buffer and not the first 2577 * character in it. 2578 * Else it checks previous character in data string to see if it is 2579 * normalizable. 2580 * If it is not, the character is simply copied into the buffer, else 2581 * the whole normalized substring is copied into the buffer, including the 2582 * current character. 
* @param data collation element iterator data
 * @param status error status, passed on to normalizePrevContraction
 * @return previous character
 */
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
    UChar  prevch;
    UChar  ch;
    const UChar *start;
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
        (innormbuf && *(data->pos - 1) != 0)) {
        /*
        if no normalization.
        if previous character is in normalized buffer, no further normalization
        is required
        */
        if(data->flags & UCOL_USE_ITERATOR) {
            /* Step back one unit and read it with next(). */
            data->iterator->move(data->iterator, -1, UITER_CURRENT);
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos - 1);
        }
    }

    start = data->pos;
    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
        /* in data string */
        if ((start - 1) == data->string) {
            /* The previous character is the first one of the string. */
            return *(start - 1);
        }
        start --;
        ch     = *start;
        prevch = *(start - 1);
    }
    else {
        /*
        in writable buffer, at this point fcdPosition can not be NULL.
        see contracting tag.
        */
        if (data->fcdPosition == data->string) {
            /* at the start of the string, just dump it into the normalizer */
            insertBufferFront(data, *(data->fcdPosition));
            data->fcdPosition = NULL;
            return *(data->pos - 1);
        }
        start  = data->fcdPosition;
        ch     = *start;
        prevch = *(start - 1);
    }
    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if (data->fcdPosition > start &&
       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
    {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        const UChar *backuppos = data->pos;
        data->pos = start;
        if (collPrevIterFCD(data)) {
            normalizePrevContraction(data, status);
            return *(data->pos - 1);
        }
        /* No normalization needed: restore pos and move fcdPosition forward by one. */
        data->pos = backuppos;
        data->fcdPosition ++;
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        insertBufferFront(data, ch);
        data->fcdPosition --;
    }

    return ch;
}

/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
/* It is called by getNextCE */

/* The following should be even */
#define UCOL_MAX_DIGITS_FOR_NUMBER 254

uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
    collIterateState entryState;
    backupState(source, &entryState);
    UChar32 cp = ch;

    for (;;) {
        // This loop will repeat only in the case of contractions, and only when a contraction
        //   is found and the first CE resulting from that contraction is itself a special
        //   (an expansion, for example.)  All other special CE types are fully handled the
        //   first time through, and the loop exits.

        const uint32_t *CEOffset = NULL;
        switch(getCETag(CE)) {
        case NOT_FOUND_TAG:
            /* This one is not found, and we'll let somebody else bother about it... no more games */
            return CE;
        case SPEC_PROC_TAG:
            {
                // Special processing is getting a CE that is preceded by a certain prefix
                // Currently this is only needed for optimizing Japanese length and iteration marks.
                // When we encouter a special processing tag, we go backwards and try to see if
                // we have a match.
                // Contraction tables are used - so the whole process is not unlike contraction.
                // prefix data is stored backwards in the table.
2696 const UChar *UCharOffset; 2697 UChar schar, tchar; 2698 collIterateState prefixState; 2699 backupState(source, &prefixState); 2700 loadState(source, &entryState, TRUE); 2701 goBackOne(source); // We want to look at the point where we entered - actually one 2702 // before that... 2703 2704 for(;;) { 2705 // This loop will run once per source string character, for as long as we 2706 // are matching a potential contraction sequence 2707 2708 // First we position ourselves at the begining of contraction sequence 2709 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2710 if (collIter_bos(source)) { 2711 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2712 break; 2713 } 2714 schar = getPrevNormalizedChar(source, status); 2715 goBackOne(source); 2716 2717 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2718 UCharOffset++; 2719 } 2720 2721 if (schar == tchar) { 2722 // Found the source string char in the table. 2723 // Pick up the corresponding CE from the table. 2724 CE = *(coll->contractionCEs + 2725 (UCharOffset - coll->contractionIndex)); 2726 } 2727 else 2728 { 2729 // Source string char was not in the table. 2730 // We have not found the prefix. 2731 CE = *(coll->contractionCEs + 2732 (ContractionStart - coll->contractionIndex)); 2733 } 2734 2735 if(!isPrefix(CE)) { 2736 // The source string char was in the contraction table, and the corresponding 2737 // CE is not a prefix CE. We found the prefix, break 2738 // out of loop, this CE will end up being returned. This is the normal 2739 // way out of prefix handling when the source actually contained 2740 // the prefix. 
2741 break; 2742 } 2743 } 2744 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue 2745 loadState(source, &prefixState, TRUE); 2746 if(source->origFlags & UCOL_USE_ITERATOR) { 2747 source->flags = source->origFlags; 2748 } 2749 } else { // prefix search was a failure, we have to backup all the way to the start 2750 loadState(source, &entryState, TRUE); 2751 } 2752 break; 2753 } 2754 case CONTRACTION_TAG: 2755 { 2756 /* This should handle contractions */ 2757 collIterateState state; 2758 backupState(source, &state); 2759 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; 2760 const UChar *UCharOffset; 2761 UChar schar, tchar; 2762 2763 for (;;) { 2764 /* This loop will run once per source string character, for as long as we */ 2765 /* are matching a potential contraction sequence */ 2766 2767 /* First we position ourselves at the begining of contraction sequence */ 2768 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2769 2770 if (collIter_eos(source)) { 2771 // Ran off the end of the source string. 2772 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2773 // So we'll pick whatever we have at the point... 2774 if (CE == UCOL_NOT_FOUND) { 2775 // back up the source over all the chars we scanned going into this contraction. 
2776 CE = firstCE; 2777 loadState(source, &state, TRUE); 2778 if(source->origFlags & UCOL_USE_ITERATOR) { 2779 source->flags = source->origFlags; 2780 } 2781 } 2782 break; 2783 } 2784 2785 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ 2786 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); 2787 2788 schar = getNextNormalizedChar(source); 2789 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2790 UCharOffset++; 2791 } 2792 2793 if (schar == tchar) { 2794 // Found the source string char in the contraction table. 2795 // Pick up the corresponding CE from the table. 2796 CE = *(coll->contractionCEs + 2797 (UCharOffset - coll->contractionIndex)); 2798 } 2799 else 2800 { 2801 // Source string char was not in contraction table. 2802 // Unless we have a discontiguous contraction, we have finished 2803 // with this contraction. 2804 // in order to do the proper detection, we 2805 // need to see if we're dealing with a supplementary 2806 /* We test whether the next two char are surrogate pairs. 2807 * This test is done if the iterator is not NULL. 2808 * If there is no surrogate pair, the iterator 2809 * goes back one if needed. 
*/ 2810 UChar32 miss = schar; 2811 if (source->iterator) { 2812 UChar32 surrNextChar; /* the next char in the iteration to test */ 2813 int32_t prevPos; /* holds the previous position before move forward of the source iterator */ 2814 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { 2815 prevPos = source->iterator->index; 2816 surrNextChar = getNextNormalizedChar(source); 2817 if (U16_IS_TRAIL(surrNextChar)) { 2818 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); 2819 } else if (prevPos < source->iterator->index){ 2820 goBackOne(source); 2821 } 2822 } 2823 } else if (U16_IS_LEAD(schar)) { 2824 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); 2825 } 2826 2827 uint8_t sCC; 2828 if (miss < 0x300 || 2829 maxCC == 0 || 2830 (sCC = i_getCombiningClass(miss, coll)) == 0 || 2831 sCC>maxCC || 2832 (allSame != 0 && sCC == maxCC) || 2833 collIter_eos(source)) 2834 { 2835 // Contraction can not be discontiguous. 2836 goBackOne(source); // back up the source string by one, 2837 // because the character we just looked at was 2838 // not part of the contraction. */ 2839 if(U_IS_SUPPLEMENTARY(miss)) { 2840 goBackOne(source); 2841 } 2842 CE = *(coll->contractionCEs + 2843 (ContractionStart - coll->contractionIndex)); 2844 } else { 2845 // 2846 // Contraction is possibly discontiguous. 
2847 // Scan more of source string looking for a match 2848 // 2849 UChar tempchar; 2850 /* find the next character if schar is not a base character 2851 and we are not yet at the end of the string */ 2852 tempchar = getNextNormalizedChar(source); 2853 // probably need another supplementary thingie here 2854 goBackOne(source); 2855 if (i_getCombiningClass(tempchar, coll) == 0) { 2856 goBackOne(source); 2857 if(U_IS_SUPPLEMENTARY(miss)) { 2858 goBackOne(source); 2859 } 2860 /* Spit out the last char of the string, wasn't tasty enough */ 2861 CE = *(coll->contractionCEs + 2862 (ContractionStart - coll->contractionIndex)); 2863 } else { 2864 CE = getDiscontiguous(coll, source, ContractionStart); 2865 } 2866 } 2867 } // else after if(schar == tchar) 2868 2869 if(CE == UCOL_NOT_FOUND) { 2870 /* The Source string did not match the contraction that we were checking. */ 2871 /* Back up the source position to undo the effects of having partially */ 2872 /* scanned through what ultimately proved to not be a contraction. */ 2873 loadState(source, &state, TRUE); 2874 CE = firstCE; 2875 break; 2876 } 2877 2878 if(!isContraction(CE)) { 2879 // The source string char was in the contraction table, and the corresponding 2880 // CE is not a contraction CE. We completed the contraction, break 2881 // out of loop, this CE will end up being returned. This is the normal 2882 // way out of contraction handling when the source actually contained 2883 // the contraction. 2884 break; 2885 } 2886 2887 2888 // The source string char was in the contraction table, and the corresponding 2889 // CE is IS a contraction CE. We will continue looping to check the source 2890 // string for the remaining chars in the contraction. 2891 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); 2892 if(tempCE != UCOL_NOT_FOUND) { 2893 // We have scanned a a section of source string for which there is a 2894 // CE from the contraction table. 
Remember the CE and scan position, so 2895 // that we can return to this point if further scanning fails to 2896 // match a longer contraction sequence. 2897 firstCE = tempCE; 2898 2899 goBackOne(source); 2900 backupState(source, &state); 2901 getNextNormalizedChar(source); 2902 2903 // Another way to do this is: 2904 //collIterateState tempState; 2905 //backupState(source, &tempState); 2906 //goBackOne(source); 2907 //backupState(source, &state); 2908 //loadState(source, &tempState, TRUE); 2909 2910 // The problem is that for incomplete contractions we have to remember the previous 2911 // position. Before, the only thing I needed to do was state.pos--; 2912 // After iterator introduction and especially after introduction of normalizing 2913 // iterators, it became much more difficult to decrease the saved state. 2914 // I'm not yet sure which of the two methods above is faster. 2915 } 2916 } // for(;;) 2917 break; 2918 } // case CONTRACTION_TAG: 2919 case LONG_PRIMARY_TAG: 2920 { 2921 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 2922 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 2923 source->offsetRepeatCount += 1; 2924 return CE; 2925 } 2926 case EXPANSION_TAG: 2927 { 2928 /* This should handle expansion. */ 2929 /* NOTE: we can encounter both continuations and expansions in an expansion! 
*/ 2930 /* I have to decide where continuations are going to be dealt with */ 2931 uint32_t size; 2932 uint32_t i; /* general counter */ 2933 2934 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 2935 size = getExpansionCount(CE); 2936 CE = *CEOffset++; 2937 //source->offsetRepeatCount = -1; 2938 2939 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 2940 for(i = 1; i<size; i++) { 2941 *(source->CEpos++) = *CEOffset++; 2942 source->offsetRepeatCount += 1; 2943 } 2944 } else { /* else, we do */ 2945 while(*CEOffset != 0) { 2946 *(source->CEpos++) = *CEOffset++; 2947 source->offsetRepeatCount += 1; 2948 } 2949 } 2950 2951 return CE; 2952 } 2953 case DIGIT_TAG: 2954 { 2955 /* 2956 We do a check to see if we want to collate digits as numbers; if so we generate 2957 a custom collation key. Otherwise we pull out the value stored in the expansion table. 2958 */ 2959 //uint32_t size; 2960 uint32_t i; /* general counter */ 2961 2962 if (source->coll->numericCollation == UCOL_ON){ 2963 collIterateState digitState = {0,0,0,0,0,0,0,0,0}; 2964 UChar32 char32 = 0; 2965 int32_t digVal = 0; 2966 2967 uint32_t digIndx = 0; 2968 uint32_t endIndex = 0; 2969 uint32_t trailingZeroIndex = 0; 2970 2971 uint8_t collateVal = 0; 2972 2973 UBool nonZeroValReached = FALSE; 2974 2975 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. 2976 /* 2977 We parse the source string until we hit a char that's NOT a digit. 2978 Use this u_charDigitValue. This might be slow because we have to 2979 handle surrogates... 
2980 */ 2981 /* 2982 if (U16_IS_LEAD(ch)){ 2983 if (!collIter_eos(source)) { 2984 backupState(source, &digitState); 2985 UChar trail = getNextNormalizedChar(source); 2986 if(U16_IS_TRAIL(trail)) { 2987 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 2988 } else { 2989 loadState(source, &digitState, TRUE); 2990 char32 = ch; 2991 } 2992 } else { 2993 char32 = ch; 2994 } 2995 } else { 2996 char32 = ch; 2997 } 2998 digVal = u_charDigitValue(char32); 2999 */ 3000 digVal = u_charDigitValue(cp); // if we have arrived here, we have 3001 // already processed possible supplementaries that trigered the digit tag - 3002 // all supplementaries are marked in the UCA. 3003 /* 3004 We pad a zero in front of the first element anyways. This takes 3005 care of the (probably) most common case where people are sorting things followed 3006 by a single digit 3007 */ 3008 digIndx++; 3009 for(;;){ 3010 // Make sure we have enough space. No longer needed; 3011 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER 3012 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough 3013 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). 3014 3015 // Skipping over leading zeroes. 3016 if (digVal != 0) { 3017 nonZeroValReached = TRUE; 3018 } 3019 if (nonZeroValReached) { 3020 /* 3021 We parse the digit string into base 100 numbers (this fits into a byte). 3022 We only add to the buffer in twos, thus if we are parsing an odd character, 3023 that serves as the 'tens' digit while the if we are parsing an even one, that 3024 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3025 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3026 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3027 than all the other bytes. 
3028 */ 3029 3030 if (digIndx % 2 == 1){ 3031 collateVal += (uint8_t)digVal; 3032 3033 // We don't enter the low-order-digit case unless we've already seen 3034 // the high order, or for the first digit, which is always non-zero. 3035 if (collateVal != 0) 3036 trailingZeroIndex = 0; 3037 3038 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3039 collateVal = 0; 3040 } 3041 else{ 3042 // We drop the collation value into the buffer so if we need to do 3043 // a "front patch" we don't have to check to see if we're hitting the 3044 // last element. 3045 collateVal = (uint8_t)(digVal * 10); 3046 3047 // Check for trailing zeroes. 3048 if (collateVal == 0) 3049 { 3050 if (!trailingZeroIndex) 3051 trailingZeroIndex = (digIndx/2) + 2; 3052 } 3053 else 3054 trailingZeroIndex = 0; 3055 3056 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3057 } 3058 digIndx++; 3059 } 3060 3061 // Get next character. 3062 if (!collIter_eos(source)){ 3063 ch = getNextNormalizedChar(source); 3064 if (U16_IS_LEAD(ch)){ 3065 if (!collIter_eos(source)) { 3066 backupState(source, &digitState); 3067 UChar trail = getNextNormalizedChar(source); 3068 if(U16_IS_TRAIL(trail)) { 3069 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 3070 } else { 3071 loadState(source, &digitState, TRUE); 3072 char32 = ch; 3073 } 3074 } 3075 } else { 3076 char32 = ch; 3077 } 3078 3079 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){ 3080 // Resetting position to point to the next unprocessed char. We 3081 // overshot it when doing our test/set for numbers. 3082 if (char32 > 0xFFFF) { // For surrogates. 3083 loadState(source, &digitState, TRUE); 3084 //goBackOne(source); 3085 } 3086 goBackOne(source); 3087 break; 3088 } 3089 } else { 3090 break; 3091 } 3092 } 3093 3094 if (nonZeroValReached == FALSE){ 3095 digIndx = 2; 3096 numTempBuf[2] = 6; 3097 } 3098 3099 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; 3100 if (digIndx % 2 != 0){ 3101 /* 3102 We missed a value. 
Since digIndx isn't even, stuck too many values into the buffer (this is what 3103 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. 3104 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a 3105 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. 3106 */ 3107 3108 for(i = 2; i < endIndex; i++){ 3109 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + 3110 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; 3111 } 3112 --digIndx; 3113 } 3114 3115 // Subtract one off of the last byte. 3116 numTempBuf[endIndex-1] -= 1; 3117 3118 /* 3119 We want to skip over the first two slots in the buffer. The first slot 3120 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3121 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 3122 */ 3123 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3124 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); 3125 3126 // Now transfer the collation key to our collIterate struct. 3127 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. 3128 //size = ((endIndex+1) & ~1)/2; 3129 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3130 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3131 UCOL_BYTE_COMMON; // Tertiary weight. 3132 i = 2; // Reset the index into the buffer. 
3133 while(i < endIndex) 3134 { 3135 uint32_t primWeight = numTempBuf[i++] << 8; 3136 if ( i < endIndex) 3137 primWeight |= numTempBuf[i++]; 3138 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3139 } 3140 3141 } else { 3142 // no numeric mode, we'll just switch to whatever we stashed and continue 3143 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 3144 CE = *CEOffset++; 3145 break; 3146 } 3147 return CE; 3148 } 3149 /* various implicits optimization */ 3150 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 3151 /* UCA is filled with these. Tailorings are NOT_FOUND */ 3152 return getImplicit(cp, source); 3153 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 3154 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit 3155 return getImplicit(cp, source); 3156 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3157 { 3158 static const uint32_t 3159 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3160 //const uint32_t LCount = 19; 3161 static const uint32_t VCount = 21; 3162 static const uint32_t TCount = 28; 3163 //const uint32_t NCount = VCount * TCount; // 588 3164 //const uint32_t SCount = LCount * NCount; // 11172 3165 uint32_t L = ch - SBase; 3166 3167 // divide into pieces 3168 3169 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation 3170 L /= TCount; 3171 uint32_t V = L % VCount; 3172 L /= VCount; 3173 3174 // offset them 3175 3176 L += LBase; 3177 V += VBase; 3178 T += TBase; 3179 3180 // return the first CE, but first put the rest into the expansion buffer 3181 if (!source->coll->image->jamoSpecial) { // FAST PATH 3182 3183 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3184 if (T != TBase) { 3185 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3186 } 3187 3188 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3189 3190 } 
else { // Jamo is Special 3191 // Since Hanguls pass the FCD check, it is 3192 // guaranteed that we won't be in 3193 // the normalization buffer if something like this happens 3194 3195 // However, if we are using a uchar iterator and normalization 3196 // is ON, the Hangul that lead us here is going to be in that 3197 // normalization buffer. Here we want to restore the uchar 3198 // iterator state and pull out of the normalization buffer 3199 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { 3200 source->flags = source->origFlags; // restore the iterator 3201 source->pos = NULL; 3202 } 3203 3204 // Move Jamos into normalization buffer 3205 UChar *buffer = source->writableBuffer.getBuffer(4); 3206 int32_t bufferLength; 3207 buffer[0] = (UChar)L; 3208 buffer[1] = (UChar)V; 3209 if (T != TBase) { 3210 buffer[2] = (UChar)T; 3211 bufferLength = 3; 3212 } else { 3213 bufferLength = 2; 3214 } 3215 source->writableBuffer.releaseBuffer(bufferLength); 3216 3217 // Indicate where to continue in main input string after exhausting the writableBuffer 3218 source->fcdPosition = source->pos; 3219 3220 source->pos = source->writableBuffer.getTerminatedBuffer(); 3221 source->origFlags = source->flags; 3222 source->flags |= UCOL_ITER_INNORMBUF; 3223 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 3224 3225 return(UCOL_IGNORABLE); 3226 } 3227 } 3228 case SURROGATE_TAG: 3229 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ 3230 /* two things can happen here: next code point can be a trailing surrogate - we will use it */ 3231 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ 3232 /* we treat it like an unassigned code point. 
*/ 3233 { 3234 UChar trail; 3235 collIterateState state; 3236 backupState(source, &state); 3237 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { 3238 // we chould have stepped one char forward and it might have turned that it 3239 // was not a trail surrogate. In that case, we have to backup. 3240 loadState(source, &state, TRUE); 3241 return UCOL_NOT_FOUND; 3242 } else { 3243 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ 3244 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail); 3245 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. 3246 // We need to backup 3247 loadState(source, &state, TRUE); 3248 return CE; 3249 } 3250 // calculate the supplementary code point value, if surrogate was not tailored 3251 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 3252 } 3253 } 3254 break; 3255 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 3256 UChar nextChar; 3257 if( source->flags & UCOL_USE_ITERATOR) { 3258 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { 3259 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3260 source->iterator->next(source->iterator); 3261 return getImplicit(cp, source); 3262 } 3263 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) && 3264 U_IS_TRAIL((nextChar=*source->pos))) { 3265 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3266 source->pos++; 3267 return getImplicit(cp, source); 3268 } 3269 return UCOL_NOT_FOUND; 3270 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 3271 return UCOL_NOT_FOUND; /* broken surrogate sequence */ 3272 case CHARSET_TAG: 3273 /* not yet implemented */ 3274 /* probably after 1.8 */ 3275 return UCOL_NOT_FOUND; 3276 default: 3277 *status = U_INTERNAL_PROGRAM_ERROR; 3278 CE=0; 3279 break; 3280 } 3281 if (CE <= UCOL_NOT_FOUND) break; 3282 } 3283 return CE; 3284 } 3285 3286 3287 /* now uses Mark's 
getImplicitPrimary code */ 3288 static 3289 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { 3290 uint32_t r = uprv_uca_getImplicitPrimary(cp); 3291 3292 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; 3293 collationSource->toReturn = collationSource->CEpos; 3294 3295 // **** doesn't work if using iterator **** 3296 if (collationSource->flags & UCOL_ITER_INNORMBUF) { 3297 collationSource->offsetRepeatCount = 1; 3298 } else { 3299 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string); 3300 3301 UErrorCode errorCode = U_ZERO_ERROR; 3302 collationSource->appendOffset(firstOffset, errorCode); 3303 collationSource->appendOffset(firstOffset + 1, errorCode); 3304 3305 collationSource->offsetReturn = collationSource->offsetStore - 1; 3306 *(collationSource->offsetBuffer) = firstOffset; 3307 if (collationSource->offsetReturn == collationSource->offsetBuffer) { 3308 collationSource->offsetStore = collationSource->offsetBuffer; 3309 } 3310 } 3311 3312 return ((r & 0x0000FFFF)<<16) | 0x000000C0; 3313 } 3314 3315 /** 3316 * This function handles the special CEs like contractions, expansions, 3317 * surrogates, Thai. 
3318 * It is called by both getPrevCE 3319 */ 3320 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, 3321 collIterate *source, 3322 UErrorCode *status) 3323 { 3324 const uint32_t *CEOffset = NULL; 3325 UChar *UCharOffset = NULL; 3326 UChar schar; 3327 const UChar *constart = NULL; 3328 uint32_t size; 3329 UChar buffer[UCOL_MAX_BUFFER]; 3330 uint32_t *endCEBuffer; 3331 UChar *strbuffer; 3332 int32_t noChars = 0; 3333 int32_t CECount = 0; 3334 3335 for(;;) 3336 { 3337 /* the only ces that loops are thai and contractions */ 3338 switch (getCETag(CE)) 3339 { 3340 case NOT_FOUND_TAG: /* this tag always returns */ 3341 return CE; 3342 3343 case SPEC_PROC_TAG: 3344 { 3345 // Special processing is getting a CE that is preceded by a certain prefix 3346 // Currently this is only needed for optimizing Japanese length and iteration marks. 3347 // When we encouter a special processing tag, we go backwards and try to see if 3348 // we have a match. 3349 // Contraction tables are used - so the whole process is not unlike contraction. 3350 // prefix data is stored backwards in the table. 
3351 const UChar *UCharOffset; 3352 UChar schar, tchar; 3353 collIterateState prefixState; 3354 backupState(source, &prefixState); 3355 for(;;) { 3356 // This loop will run once per source string character, for as long as we 3357 // are matching a potential contraction sequence 3358 3359 // First we position ourselves at the begining of contraction sequence 3360 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 3361 3362 if (collIter_bos(source)) { 3363 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 3364 break; 3365 } 3366 schar = getPrevNormalizedChar(source, status); 3367 goBackOne(source); 3368 3369 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 3370 UCharOffset++; 3371 } 3372 3373 if (schar == tchar) { 3374 // Found the source string char in the table. 3375 // Pick up the corresponding CE from the table. 3376 CE = *(coll->contractionCEs + 3377 (UCharOffset - coll->contractionIndex)); 3378 } 3379 else 3380 { 3381 // if there is a completely ignorable code point in the middle of 3382 // a prefix, we need to act as if it's not there 3383 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) 3384 // lone surrogates cannot be set to zero as it would break other processing 3385 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 3386 // it's easy for BMP code points 3387 if(isZeroCE == 0) { 3388 continue; 3389 } else if(U16_IS_SURROGATE(schar)) { 3390 // for supplementary code points, we have to check the next one 3391 // situations where we are going to ignore 3392 // 1. beginning of the string: schar is a lone surrogate 3393 // 2. schar is a lone surrogate 3394 // 3. schar is a trail surrogate in a valid surrogate sequence 3395 // that is explicitly set to zero. 
3396 if (!collIter_bos(source)) { 3397 UChar lead; 3398 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) { 3399 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead); 3400 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) { 3401 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); 3402 if(finalCE == 0) { 3403 // this is a real, assigned completely ignorable code point 3404 goBackOne(source); 3405 continue; 3406 } 3407 } 3408 } else { 3409 // lone surrogate, treat like unassigned 3410 return UCOL_NOT_FOUND; 3411 } 3412 } else { 3413 // lone surrogate at the beggining, treat like unassigned 3414 return UCOL_NOT_FOUND; 3415 } 3416 } 3417 // Source string char was not in the table. 3418 // We have not found the prefix. 3419 CE = *(coll->contractionCEs + 3420 (ContractionStart - coll->contractionIndex)); 3421 } 3422 3423 if(!isPrefix(CE)) { 3424 // The source string char was in the contraction table, and the corresponding 3425 // CE is not a prefix CE. We found the prefix, break 3426 // out of loop, this CE will end up being returned. This is the normal 3427 // way out of prefix handling when the source actually contained 3428 // the prefix. 3429 break; 3430 } 3431 } 3432 loadState(source, &prefixState, TRUE); 3433 break; 3434 } 3435 3436 case CONTRACTION_TAG: { 3437 /* to ensure that the backwards and forwards iteration matches, we 3438 take the current region of most possible match and pass it through 3439 the forward iteration. this will ensure that the obstinate problem of 3440 overlapping contractions will not occur. 
3441 */ 3442 schar = peekCodeUnit(source, 0); 3443 constart = (UChar *)coll->image + getContractOffset(CE); 3444 if (isAtStartPrevIterate(source) 3445 /* commented away contraction end checks after adding the checks 3446 in getPrevCE */) { 3447 /* start of string or this is not the end of any contraction */ 3448 CE = *(coll->contractionCEs + 3449 (constart - coll->contractionIndex)); 3450 break; 3451 } 3452 strbuffer = buffer; 3453 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); 3454 *(UCharOffset --) = 0; 3455 noChars = 0; 3456 // have to swap thai characters 3457 while (ucol_unsafeCP(schar, coll)) { 3458 *(UCharOffset) = schar; 3459 noChars++; 3460 UCharOffset --; 3461 schar = getPrevNormalizedChar(source, status); 3462 goBackOne(source); 3463 // TODO: when we exhaust the contraction buffer, 3464 // it needs to get reallocated. The problem is 3465 // that the size depends on the string which is 3466 // not iterated over. However, since we're travelling 3467 // backwards, we already had to set the iterator at 3468 // the end - so we might as well know where we are? 
3469 if (UCharOffset + 1 == buffer) { 3470 /* we have exhausted the buffer */ 3471 int32_t newsize = 0; 3472 if(source->pos) { // actually dealing with a position 3473 newsize = (int32_t)(source->pos - source->string + 1); 3474 } else { // iterator 3475 newsize = 4 * UCOL_MAX_BUFFER; 3476 } 3477 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * 3478 (newsize + UCOL_MAX_BUFFER)); 3479 /* test for NULL */ 3480 if (strbuffer == NULL) { 3481 *status = U_MEMORY_ALLOCATION_ERROR; 3482 return UCOL_NO_MORE_CES; 3483 } 3484 UCharOffset = strbuffer + newsize; 3485 uprv_memcpy(UCharOffset, buffer, 3486 UCOL_MAX_BUFFER * sizeof(UChar)); 3487 UCharOffset --; 3488 } 3489 if ((source->pos && (source->pos == source->string || 3490 ((source->flags & UCOL_ITER_INNORMBUF) && 3491 *(source->pos - 1) == 0 && source->fcdPosition == NULL))) 3492 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { 3493 break; 3494 } 3495 } 3496 /* adds the initial base character to the string */ 3497 *(UCharOffset) = schar; 3498 noChars++; 3499 3500 int32_t offsetBias; 3501 3502 // **** doesn't work if using iterator **** 3503 if (source->flags & UCOL_ITER_INNORMBUF) { 3504 offsetBias = -1; 3505 } else { 3506 offsetBias = (int32_t)(source->pos - source->string); 3507 } 3508 3509 /* a new collIterate is used to simplify things, since using the current 3510 collIterate will mean that the forward and backwards iteration will 3511 share and change the same buffers. we don't want to get into that. */ 3512 collIterate temp; 3513 int32_t rawOffset; 3514 3515 IInit_collIterate(coll, UCharOffset, noChars, &temp, status); 3516 if(U_FAILURE(*status)) { 3517 return (uint32_t)UCOL_NULLORDER; 3518 } 3519 temp.flags &= ~UCOL_ITER_NORM; 3520 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; 3521 3522 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero? 
3523 CE = ucol_IGetNextCE(coll, &temp, status); 3524 3525 if (source->extendCEs) { 3526 endCEBuffer = source->extendCEs + source->extendCEsSize; 3527 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t)); 3528 } else { 3529 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; 3530 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t)); 3531 } 3532 3533 while (CE != UCOL_NO_MORE_CES) { 3534 *(source->CEpos ++) = CE; 3535 3536 if (offsetBias >= 0) { 3537 source->appendOffset(rawOffset + offsetBias, *status); 3538 } 3539 3540 CECount++; 3541 if (source->CEpos == endCEBuffer) { 3542 /* ran out of CE space, reallocate to new buffer. 3543 If reallocation fails, reset pointers and bail out, 3544 there's no guarantee of the right character position after 3545 this bail*/ 3546 if (!increaseCEsCapacity(source)) { 3547 *status = U_MEMORY_ALLOCATION_ERROR; 3548 break; 3549 } 3550 3551 endCEBuffer = source->extendCEs + source->extendCEsSize; 3552 } 3553 3554 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { 3555 rawOffset = (int32_t)(temp.fcdPosition - temp.string); 3556 } else { 3557 rawOffset = (int32_t)(temp.pos - temp.string); 3558 } 3559 3560 CE = ucol_IGetNextCE(coll, &temp, status); 3561 } 3562 3563 if (strbuffer != buffer) { 3564 uprv_free(strbuffer); 3565 } 3566 if (U_FAILURE(*status)) { 3567 return (uint32_t)UCOL_NULLORDER; 3568 } 3569 3570 if (source->offsetRepeatValue != 0) { 3571 if (CECount > noChars) { 3572 source->offsetRepeatCount += temp.offsetRepeatCount; 3573 } else { 3574 // **** does this really skip the right offsets? 
**** 3575 source->offsetReturn -= (noChars - CECount); 3576 } 3577 } 3578 3579 if (offsetBias >= 0) { 3580 source->offsetReturn = source->offsetStore - 1; 3581 if (source->offsetReturn == source->offsetBuffer) { 3582 source->offsetStore = source->offsetBuffer; 3583 } 3584 } 3585 3586 source->toReturn = source->CEpos - 1; 3587 if (source->toReturn == source->CEs) { 3588 source->CEpos = source->CEs; 3589 } 3590 3591 return *(source->toReturn); 3592 } 3593 case LONG_PRIMARY_TAG: 3594 { 3595 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 3596 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 3597 source->toReturn = source->CEpos - 1; 3598 3599 if (source->flags & UCOL_ITER_INNORMBUF) { 3600 source->offsetRepeatCount = 1; 3601 } else { 3602 int32_t firstOffset = (int32_t)(source->pos - source->string); 3603 3604 source->appendOffset(firstOffset, *status); 3605 source->appendOffset(firstOffset + 1, *status); 3606 3607 source->offsetReturn = source->offsetStore - 1; 3608 *(source->offsetBuffer) = firstOffset; 3609 if (source->offsetReturn == source->offsetBuffer) { 3610 source->offsetStore = source->offsetBuffer; 3611 } 3612 } 3613 3614 3615 return *(source->toReturn); 3616 } 3617 3618 case EXPANSION_TAG: /* this tag always returns */ 3619 { 3620 /* 3621 This should handle expansion. 3622 NOTE: we can encounter both continuations and expansions in an expansion! 3623 I have to decide where continuations are going to be dealt with 3624 */ 3625 int32_t firstOffset = (int32_t)(source->pos - source->string); 3626 3627 // **** doesn't work if using iterator **** 3628 if (source->offsetReturn != NULL) { 3629 if (! 
(source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) { 3630 source->offsetStore = source->offsetBuffer; 3631 }else { 3632 firstOffset = -1; 3633 } 3634 } 3635 3636 /* find the offset to expansion table */ 3637 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3638 size = getExpansionCount(CE); 3639 if (size != 0) { 3640 /* 3641 if there are less than 16 elements in expansion, we don't terminate 3642 */ 3643 uint32_t count; 3644 3645 for (count = 0; count < size; count++) { 3646 *(source->CEpos ++) = *CEOffset++; 3647 3648 if (firstOffset >= 0) { 3649 source->appendOffset(firstOffset + 1, *status); 3650 } 3651 } 3652 } else { 3653 /* else, we do */ 3654 while (*CEOffset != 0) { 3655 *(source->CEpos ++) = *CEOffset ++; 3656 3657 if (firstOffset >= 0) { 3658 source->appendOffset(firstOffset + 1, *status); 3659 } 3660 } 3661 } 3662 3663 if (firstOffset >= 0) { 3664 source->offsetReturn = source->offsetStore - 1; 3665 *(source->offsetBuffer) = firstOffset; 3666 if (source->offsetReturn == source->offsetBuffer) { 3667 source->offsetStore = source->offsetBuffer; 3668 } 3669 } else { 3670 source->offsetRepeatCount += size - 1; 3671 } 3672 3673 source->toReturn = source->CEpos - 1; 3674 // in case of one element expansion, we 3675 // want to immediately return CEpos 3676 if(source->toReturn == source->CEs) { 3677 source->CEpos = source->CEs; 3678 } 3679 3680 return *(source->toReturn); 3681 } 3682 3683 case DIGIT_TAG: 3684 { 3685 /* 3686 We do a check to see if we want to collate digits as numbers; if so we generate 3687 a custom collation key. Otherwise we pull out the value stored in the expansion table. 
3688 */ 3689 uint32_t i; /* general counter */ 3690 3691 if (source->coll->numericCollation == UCOL_ON){ 3692 uint32_t digIndx = 0; 3693 uint32_t endIndex = 0; 3694 uint32_t leadingZeroIndex = 0; 3695 uint32_t trailingZeroCount = 0; 3696 3697 uint8_t collateVal = 0; 3698 3699 UBool nonZeroValReached = FALSE; 3700 3701 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs. 3702 /* 3703 We parse the source string until we hit a char that's NOT a digit. 3704 Use this u_charDigitValue. This might be slow because we have to 3705 handle surrogates... 3706 */ 3707 /* 3708 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less, 3709 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation 3710 element we process when going backward. To determine how long that chunk might be, we may need to make 3711 two passes through the loop that collects digits - one to see how long the string is (and how much is 3712 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has 3713 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation 3714 element chunk after resetting the state to the initialState at the right side of the digit string. 
3715 */ 3716 uint32_t ceLimit = 0; 3717 UChar initial_ch = ch; 3718 collIterateState initialState = {0,0,0,0,0,0,0,0,0}; 3719 backupState(source, &initialState); 3720 3721 for(;;) { 3722 collIterateState state = {0,0,0,0,0,0,0,0,0}; 3723 UChar32 char32 = 0; 3724 int32_t digVal = 0; 3725 3726 if (U16_IS_TRAIL (ch)) { 3727 if (!collIter_bos(source)){ 3728 UChar lead = getPrevNormalizedChar(source, status); 3729 if(U16_IS_LEAD(lead)) { 3730 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3731 goBackOne(source); 3732 } else { 3733 char32 = ch; 3734 } 3735 } else { 3736 char32 = ch; 3737 } 3738 } else { 3739 char32 = ch; 3740 } 3741 digVal = u_charDigitValue(char32); 3742 3743 for(;;) { 3744 // Make sure we have enough space. No longer needed; 3745 // at this point the largest value of digIndx when we need to save data in numTempBuf 3746 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure 3747 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2). 3748 3749 // Skip over trailing zeroes, and keep a count of them. 3750 if (digVal != 0) 3751 nonZeroValReached = TRUE; 3752 3753 if (nonZeroValReached) { 3754 /* 3755 We parse the digit string into base 100 numbers (this fits into a byte). 3756 We only add to the buffer in twos, thus if we are parsing an odd character, 3757 that serves as the 'tens' digit while the if we are parsing an even one, that 3758 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3759 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3760 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3761 than all the other bytes. 3762 3763 Since we're doing in this reverse we want to put the first digit encountered into the 3764 ones place and the second digit encountered into the tens place. 
3765 */ 3766 3767 if ((digIndx + trailingZeroCount) % 2 == 1) { 3768 // High-order digit case (tens place) 3769 collateVal += (uint8_t)(digVal * 10); 3770 3771 // We cannot set leadingZeroIndex unless it has been set for the 3772 // low-order digit. Therefore, all we can do for the high-order 3773 // digit is turn it off, never on. 3774 // The only time we will have a high digit without a low is for 3775 // the very first non-zero digit, so no zero check is necessary. 3776 if (collateVal != 0) 3777 leadingZeroIndex = 0; 3778 3779 // The first pass through, digIndx may exceed the limit, but in that case 3780 // we no longer care about numTempBuf contents since they will be discarded 3781 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) { 3782 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3783 } 3784 collateVal = 0; 3785 } else { 3786 // Low-order digit case (ones place) 3787 collateVal = (uint8_t)digVal; 3788 3789 // Check for leading zeroes. 3790 if (collateVal == 0) { 3791 if (!leadingZeroIndex) 3792 leadingZeroIndex = (digIndx/2) + 2; 3793 } else 3794 leadingZeroIndex = 0; 3795 3796 // No need to write to buffer; the case of a last odd digit 3797 // is handled below. 3798 } 3799 ++digIndx; 3800 } else 3801 ++trailingZeroCount; 3802 3803 if (!collIter_bos(source)) { 3804 ch = getPrevNormalizedChar(source, status); 3805 //goBackOne(source); 3806 if (U16_IS_TRAIL(ch)) { 3807 backupState(source, &state); 3808 if (!collIter_bos(source)) { 3809 goBackOne(source); 3810 UChar lead = getPrevNormalizedChar(source, status); 3811 3812 if(U16_IS_LEAD(lead)) { 3813 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3814 } else { 3815 loadState(source, &state, FALSE); 3816 char32 = ch; 3817 } 3818 } 3819 } else 3820 char32 = ch; 3821 3822 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { 3823 if (char32 > 0xFFFF) {// For surrogates. 
3824 loadState(source, &state, FALSE); 3825 } 3826 // Don't need to "reverse" the goBackOne call, 3827 // as this points to the next position to process.. 3828 //if (char32 > 0xFFFF) // For surrogates. 3829 //getNextNormalizedChar(source); 3830 break; 3831 } 3832 3833 goBackOne(source); 3834 }else 3835 break; 3836 } 3837 3838 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) { 3839 // our collation element is not too big, go ahead and finish with it 3840 break; 3841 } 3842 // our digit string is too long for a collation element; 3843 // set the limit for it, reset the state and begin again 3844 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER; 3845 if ( ceLimit == 0 ) { 3846 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; 3847 } 3848 ch = initial_ch; 3849 loadState(source, &initialState, FALSE); 3850 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0; 3851 collateVal = 0; 3852 nonZeroValReached = FALSE; 3853 } 3854 3855 if (! nonZeroValReached) { 3856 digIndx = 2; 3857 trailingZeroCount = 0; 3858 numTempBuf[2] = 6; 3859 } 3860 3861 if ((digIndx + trailingZeroCount) % 2 != 0) { 3862 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; 3863 digIndx += 1; // The implicit leading zero 3864 } 3865 if (trailingZeroCount % 2 != 0) { 3866 // We had to consume one trailing zero for the low digit 3867 // of the least significant byte 3868 digIndx += 1; // The trailing zero not in the exponent 3869 trailingZeroCount -= 1; 3870 } 3871 3872 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; 3873 3874 // Subtract one off of the last byte. Really the first byte here, but it's reversed... 3875 numTempBuf[2] -= 1; 3876 3877 /* 3878 We want to skip over the first two slots in the buffer. The first slot 3879 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3880 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 
3881 The exponent must be adjusted by the number of leading zeroes, and the number of 3882 trailing zeroes. 3883 */ 3884 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3885 uint32_t exponent = (digIndx+trailingZeroCount)/2; 3886 if (leadingZeroIndex) 3887 exponent -= ((digIndx/2) + 2 - leadingZeroIndex); 3888 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); 3889 3890 // Now transfer the collation key to our collIterate struct. 3891 // The total size for our collation key is half of endIndex, rounded up. 3892 int32_t size = (endIndex+1)/2; 3893 if(!ensureCEsCapacity(source, size)) { 3894 return (uint32_t)UCOL_NULLORDER; 3895 } 3896 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3897 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3898 UCOL_BYTE_COMMON; // Tertiary weight. 3899 i = endIndex - 1; // Reset the index into the buffer. 3900 while(i >= 2) { 3901 uint32_t primWeight = numTempBuf[i--] << 8; 3902 if ( i >= 2) 3903 primWeight |= numTempBuf[i--]; 3904 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3905 } 3906 3907 source->toReturn = source->CEpos -1; 3908 return *(source->toReturn); 3909 } else { 3910 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3911 CE = *(CEOffset++); 3912 break; 3913 } 3914 } 3915 3916 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3917 { 3918 static const uint32_t 3919 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3920 //const uint32_t LCount = 19; 3921 static const uint32_t VCount = 21; 3922 static const uint32_t TCount = 28; 3923 //const uint32_t NCount = VCount * TCount; /* 588 */ 3924 //const uint32_t SCount = LCount * NCount; /* 11172 */ 3925 3926 uint32_t L = ch - SBase; 3927 /* 3928 divide into pieces. 
3929 we do it in this order since some compilers can do % and / in one 3930 operation 3931 */ 3932 uint32_t T = L % TCount; 3933 L /= TCount; 3934 uint32_t V = L % VCount; 3935 L /= VCount; 3936 3937 /* offset them */ 3938 L += LBase; 3939 V += VBase; 3940 T += TBase; 3941 3942 int32_t firstOffset = (int32_t)(source->pos - source->string); 3943 source->appendOffset(firstOffset, *status); 3944 3945 /* 3946 * return the first CE, but first put the rest into the expansion buffer 3947 */ 3948 if (!source->coll->image->jamoSpecial) { 3949 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3950 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3951 source->appendOffset(firstOffset + 1, *status); 3952 3953 if (T != TBase) { 3954 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3955 source->appendOffset(firstOffset + 1, *status); 3956 } 3957 3958 source->toReturn = source->CEpos - 1; 3959 3960 source->offsetReturn = source->offsetStore - 1; 3961 if (source->offsetReturn == source->offsetBuffer) { 3962 source->offsetStore = source->offsetBuffer; 3963 } 3964 3965 return *(source->toReturn); 3966 } else { 3967 // Since Hanguls pass the FCD check, it is 3968 // guaranteed that we won't be in 3969 // the normalization buffer if something like this happens 3970 3971 // Move Jamos into normalization buffer 3972 UChar *tempbuffer = source->writableBuffer.getBuffer(5); 3973 int32_t tempbufferLength, jamoOffset; 3974 tempbuffer[0] = 0; 3975 tempbuffer[1] = (UChar)L; 3976 tempbuffer[2] = (UChar)V; 3977 if (T != TBase) { 3978 tempbuffer[3] = (UChar)T; 3979 tempbufferLength = 4; 3980 } else { 3981 tempbufferLength = 3; 3982 } 3983 source->writableBuffer.releaseBuffer(tempbufferLength); 3984 3985 // Indicate where to continue in main input string after exhausting the writableBuffer 3986 if (source->pos == source->string) { 3987 jamoOffset = 0; 3988 source->fcdPosition = NULL; 3989 } else { 3990 jamoOffset = source->pos - source->string; 3991 
source->fcdPosition = source->pos-1; 3992 } 3993 3994 // Append offsets for the additional chars 3995 // (not the 0, and not the L whose offsets match the original Hangul) 3996 int32_t jamoRemaining = tempbufferLength - 2; 3997 jamoOffset++; // appended offsets should match end of original Hangul 3998 while (jamoRemaining-- > 0) { 3999 source->appendOffset(jamoOffset, *status); 4000 } 4001 4002 source->offsetRepeatValue = jamoOffset; 4003 4004 source->offsetReturn = source->offsetStore - 1; 4005 if (source->offsetReturn == source->offsetBuffer) { 4006 source->offsetStore = source->offsetBuffer; 4007 } 4008 4009 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength; 4010 source->origFlags = source->flags; 4011 source->flags |= UCOL_ITER_INNORMBUF; 4012 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 4013 4014 return(UCOL_IGNORABLE); 4015 } 4016 } 4017 4018 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 4019 return getPrevImplicit(ch, source); 4020 4021 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function 4022 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 4023 return getPrevImplicit(ch, source); 4024 4025 case SURROGATE_TAG: /* This is a surrogate pair */ 4026 /* essentially an engaged lead surrogate. 
*/ 4027 /* if you have encountered it here, it means that a */ 4028 /* broken sequence was encountered and this is an error */ 4029 return UCOL_NOT_FOUND; 4030 4031 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 4032 return UCOL_NOT_FOUND; /* broken surrogate sequence */ 4033 4034 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 4035 { 4036 UChar32 cp = 0; 4037 UChar prevChar; 4038 const UChar *prev; 4039 if (isAtStartPrevIterate(source)) { 4040 /* we are at the start of the string, wrong place to be at */ 4041 return UCOL_NOT_FOUND; 4042 } 4043 if (source->pos != source->writableBuffer.getBuffer()) { 4044 prev = source->pos - 1; 4045 } else { 4046 prev = source->fcdPosition; 4047 } 4048 prevChar = *prev; 4049 4050 /* Handles Han and Supplementary characters here.*/ 4051 if (U16_IS_LEAD(prevChar)) { 4052 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 4053 source->pos = prev; 4054 } else { 4055 return UCOL_NOT_FOUND; /* like unassigned */ 4056 } 4057 4058 return getPrevImplicit(cp, source); 4059 } 4060 4061 /* UCA is filled with these. Tailorings are NOT_FOUND */ 4062 /* not yet implemented */ 4063 case CHARSET_TAG: /* this tag always returns */ 4064 /* probably after 1.8 */ 4065 return UCOL_NOT_FOUND; 4066 4067 default: /* this tag always returns */ 4068 *status = U_INTERNAL_PROGRAM_ERROR; 4069 CE=0; 4070 break; 4071 } 4072 4073 if (CE <= UCOL_NOT_FOUND) { 4074 break; 4075 } 4076 } 4077 4078 return CE; 4079 } 4080 4081 /* This should really be a macro */ 4082 /* This function is used to reverse parts of a buffer. 
We need this operation when doing continuation */ 4083 /* secondaries in French */ 4084 /* 4085 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { 4086 uint8_t temp; 4087 while(start<end) { 4088 temp = *start; 4089 *start++ = *end; 4090 *end-- = temp; 4091 } 4092 } 4093 */ 4094 4095 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \ 4096 TYPE tempA; \ 4097 while((start)<(end)) { \ 4098 tempA = *(start); \ 4099 *(start)++ = *(end); \ 4100 *(end)-- = tempA; \ 4101 } \ 4102 } 4103 4104 /****************************************************************************/ 4105 /* Following are the sortkey generation functions */ 4106 /* */ 4107 /****************************************************************************/ 4108 4109 U_CAPI int32_t U_EXPORT2 4110 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 4111 const uint8_t *src2, int32_t src2Length, 4112 uint8_t *dest, int32_t destCapacity) { 4113 /* check arguments */ 4114 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 4115 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 4116 destCapacity<0 || (destCapacity>0 && dest==NULL) 4117 ) { 4118 /* error, attempt to write a zero byte and return 0 */ 4119 if(dest!=NULL && destCapacity>0) { 4120 *dest=0; 4121 } 4122 return 0; 4123 } 4124 4125 /* check lengths and capacity */ 4126 if(src1Length<0) { 4127 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 4128 } 4129 if(src2Length<0) { 4130 src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 4131 } 4132 4133 int32_t destLength=src1Length+src2Length; 4134 if(destLength>destCapacity) { 4135 /* the merged sort key does not fit into the destination */ 4136 return destLength; 4137 } 4138 4139 /* merge the sort keys with the same number of levels */ 4140 uint8_t *p=dest; 4141 for(;;) { 4142 /* copy level from src1 not including 00 or 01 */ 4143 uint8_t b; 4144 while((b=*src1)>=2) { 4145 ++src1; 4146 *p++=b; 4147 } 
4148 4149 /* add a 02 merge separator */ 4150 *p++=2; 4151 4152 /* copy level from src2 not including 00 or 01 */ 4153 while((b=*src2)>=2) { 4154 ++src2; 4155 *p++=b; 4156 } 4157 4158 /* if both sort keys have another level, then add a 01 level separator and continue */ 4159 if(*src1==1 && *src2==1) { 4160 ++src1; 4161 ++src2; 4162 *p++=1; 4163 } else { 4164 break; 4165 } 4166 } 4167 4168 /* 4169 * here, at least one sort key is finished now, but the other one 4170 * might have some contents left from containing more levels; 4171 * that contents is just appended to the result 4172 */ 4173 if(*src1!=0) { 4174 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 4175 src2=src1; 4176 } 4177 /* append src2, "the other, unfinished sort key" */ 4178 while((*p++=*src2++)!=0) {} 4179 4180 /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */ 4181 return (int32_t)(p-dest); 4182 } 4183 4184 U_NAMESPACE_BEGIN 4185 4186 class SortKeyByteSink : public ByteSink { 4187 public: 4188 SortKeyByteSink(char *dest, int32_t destCapacity) 4189 : buffer_(dest), capacity_(destCapacity), 4190 appended_(0) { 4191 if (buffer_ == NULL) { 4192 capacity_ = 0; 4193 } else if(capacity_ < 0) { 4194 buffer_ = NULL; 4195 capacity_ = 0; 4196 } 4197 } 4198 virtual ~SortKeyByteSink(); 4199 4200 virtual void Append(const char *bytes, int32_t n); 4201 void Append(uint32_t b) { 4202 if (appended_ < capacity_ || Resize(1, appended_)) { 4203 buffer_[appended_] = (char)b; 4204 } 4205 ++appended_; 4206 } 4207 void Append(uint32_t b1, uint32_t b2) { 4208 int32_t a2 = appended_ + 2; 4209 if (a2 <= capacity_ || Resize(2, appended_)) { 4210 buffer_[appended_] = (char)b1; 4211 buffer_[appended_ + 1] = (char)b2; 4212 } else if(appended_ < capacity_) { 4213 buffer_[appended_] = (char)b1; 4214 } 4215 appended_ = a2; 4216 } 4217 virtual char *GetAppendBuffer(int32_t min_capacity, 4218 int32_t desired_capacity_hint, 4219 char *scratch, int32_t 
scratch_capacity, 4220 int32_t *result_capacity); 4221 int32_t NumberOfBytesAppended() const { return appended_; } 4222 /** @return FALSE if memory allocation failed */ 4223 UBool IsOk() const { return buffer_ != NULL; } 4224 4225 protected: 4226 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0; 4227 virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0; 4228 4229 void SetNotOk() { 4230 buffer_ = NULL; 4231 capacity_ = 0; 4232 } 4233 4234 char *buffer_; 4235 int32_t capacity_; 4236 int32_t appended_; 4237 4238 private: 4239 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented 4240 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented 4241 }; 4242 4243 SortKeyByteSink::~SortKeyByteSink() {} 4244 4245 void 4246 SortKeyByteSink::Append(const char *bytes, int32_t n) { 4247 if (n <= 0 || bytes == NULL) { 4248 return; 4249 } 4250 int32_t length = appended_; 4251 appended_ += n; 4252 if ((buffer_ + length) == bytes) { 4253 return; // the caller used GetAppendBuffer() and wrote the bytes already 4254 } 4255 int32_t available = capacity_ - length; 4256 if (n <= available) { 4257 uprv_memcpy(buffer_ + length, bytes, n); 4258 } else { 4259 AppendBeyondCapacity(bytes, n, length); 4260 } 4261 } 4262 4263 char * 4264 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity, 4265 int32_t desired_capacity_hint, 4266 char *scratch, 4267 int32_t scratch_capacity, 4268 int32_t *result_capacity) { 4269 if (min_capacity < 1 || scratch_capacity < min_capacity) { 4270 *result_capacity = 0; 4271 return NULL; 4272 } 4273 int32_t available = capacity_ - appended_; 4274 if (available >= min_capacity) { 4275 *result_capacity = available; 4276 return buffer_ + appended_; 4277 } else if (Resize(desired_capacity_hint, appended_)) { 4278 *result_capacity = capacity_ - appended_; 4279 return buffer_ + appended_; 4280 } else { 4281 *result_capacity = scratch_capacity; 4282 return scratch; 4283 
} 4284 } 4285 4286 class FixedSortKeyByteSink : public SortKeyByteSink { 4287 public: 4288 FixedSortKeyByteSink(char *dest, int32_t destCapacity) 4289 : SortKeyByteSink(dest, destCapacity) {} 4290 virtual ~FixedSortKeyByteSink(); 4291 4292 private: 4293 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); 4294 virtual UBool Resize(int32_t appendCapacity, int32_t length); 4295 }; 4296 4297 FixedSortKeyByteSink::~FixedSortKeyByteSink() {} 4298 4299 void 4300 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) { 4301 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ 4302 // Fill the buffer completely. 4303 int32_t available = capacity_ - length; 4304 if (available > 0) { 4305 uprv_memcpy(buffer_ + length, bytes, available); 4306 } 4307 } 4308 4309 UBool 4310 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) { 4311 return FALSE; 4312 } 4313 4314 class CollationKeyByteSink : public SortKeyByteSink { 4315 public: 4316 CollationKeyByteSink(CollationKey &key) 4317 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()), 4318 key_(key) {} 4319 virtual ~CollationKeyByteSink(); 4320 4321 private: 4322 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); 4323 virtual UBool Resize(int32_t appendCapacity, int32_t length); 4324 4325 CollationKey &key_; 4326 }; 4327 4328 CollationKeyByteSink::~CollationKeyByteSink() {} 4329 4330 void 4331 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) { 4332 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ 4333 if (Resize(n, length)) { 4334 uprv_memcpy(buffer_ + length, bytes, n); 4335 } 4336 } 4337 4338 UBool 4339 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) { 4340 if (buffer_ == NULL) { 4341 return FALSE; // allocation failed before already 4342 } 4343 int32_t newCapacity = 2 * capacity_; 4344 
int32_t altCapacity = length + 2 * appendCapacity; 4345 if (newCapacity < altCapacity) { 4346 newCapacity = altCapacity; 4347 } 4348 if (newCapacity < 200) { 4349 newCapacity = 200; 4350 } 4351 uint8_t *newBuffer = key_.reallocate(newCapacity, length); 4352 if (newBuffer == NULL) { 4353 SetNotOk(); 4354 return FALSE; 4355 } 4356 buffer_ = reinterpret_cast<char *>(newBuffer); 4357 capacity_ = newCapacity; 4358 return TRUE; 4359 } 4360 4361 /** 4362 * uint8_t byte buffer, similar to CharString but simpler. 4363 */ 4364 class SortKeyLevel : public UMemory { 4365 public: 4366 SortKeyLevel() : len(0), ok(TRUE) {} 4367 ~SortKeyLevel() {} 4368 4369 /** @return FALSE if memory allocation failed */ 4370 UBool isOk() const { return ok; } 4371 UBool isEmpty() const { return len == 0; } 4372 int32_t length() const { return len; } 4373 const uint8_t *data() const { return buffer.getAlias(); } 4374 uint8_t operator[](int32_t index) const { return buffer[index]; } 4375 4376 void appendByte(uint32_t b); 4377 4378 void appendTo(ByteSink &sink) const { 4379 sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len); 4380 } 4381 4382 uint8_t &lastByte() { 4383 U_ASSERT(len > 0); 4384 return buffer[len - 1]; 4385 } 4386 4387 uint8_t *getLastFewBytes(int32_t n) { 4388 if (ok && len >= n) { 4389 return buffer.getAlias() + len - n; 4390 } else { 4391 return NULL; 4392 } 4393 } 4394 4395 private: 4396 MaybeStackArray<uint8_t, 40> buffer; 4397 int32_t len; 4398 UBool ok; 4399 4400 UBool ensureCapacity(int32_t appendCapacity); 4401 4402 SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class 4403 SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class 4404 }; 4405 4406 void SortKeyLevel::appendByte(uint32_t b) { 4407 if(len < buffer.getCapacity() || ensureCapacity(1)) { 4408 buffer[len++] = (uint8_t)b; 4409 } 4410 } 4411 4412 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) { 4413 if(!ok) { 4414 return FALSE; 4415 } 4416 
int32_t newCapacity = 2 * buffer.getCapacity(); 4417 int32_t altCapacity = len + 2 * appendCapacity; 4418 if (newCapacity < altCapacity) { 4419 newCapacity = altCapacity; 4420 } 4421 if (newCapacity < 200) { 4422 newCapacity = 200; 4423 } 4424 if(buffer.resize(newCapacity, len)==NULL) { 4425 return ok = FALSE; 4426 } 4427 return TRUE; 4428 } 4429 4430 U_NAMESPACE_END 4431 4432 /* sortkey API */ 4433 U_CAPI int32_t U_EXPORT2 4434 ucol_getSortKey(const UCollator *coll, 4435 const UChar *source, 4436 int32_t sourceLength, 4437 uint8_t *result, 4438 int32_t resultLength) 4439 { 4440 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 4441 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 4442 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 4443 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); 4444 } 4445 4446 if(coll->delegate != NULL) { 4447 return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength); 4448 } 4449 4450 UErrorCode status = U_ZERO_ERROR; 4451 int32_t keySize = 0; 4452 4453 if(source != NULL) { 4454 // source == NULL is actually an error situation, but we would need to 4455 // have an error code to return it. Until we introduce a new 4456 // API, it stays like this 4457 4458 /* this uses the function pointer that is set in updateinternalstate */ 4459 /* currently, there are two funcs: */ 4460 /*ucol_calcSortKey(...);*/ 4461 /*ucol_calcSortKeySimpleTertiary(...);*/ 4462 4463 uint8_t noDest[1] = { 0 }; 4464 if(result == NULL) { 4465 // Distinguish pure preflighting from an allocation error. 
4466 result = noDest; 4467 resultLength = 0; 4468 } 4469 FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength); 4470 coll->sortKeyGen(coll, source, sourceLength, sink, &status); 4471 if(U_SUCCESS(status)) { 4472 keySize = sink.NumberOfBytesAppended(); 4473 } 4474 } 4475 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 4476 UTRACE_EXIT_STATUS(status); 4477 return keySize; 4478 } 4479 4480 U_CFUNC int32_t 4481 ucol_getCollationKey(const UCollator *coll, 4482 const UChar *source, int32_t sourceLength, 4483 CollationKey &key, 4484 UErrorCode &errorCode) { 4485 CollationKeyByteSink sink(key); 4486 coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode); 4487 return sink.NumberOfBytesAppended(); 4488 } 4489 4490 // Is this primary weight compressible? 4491 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). 4492 // TODO: This should use per-lead-byte flags from FractionalUCA.txt. 4493 static inline UBool 4494 isCompressible(const UCollator * /*coll*/, uint8_t primary1) { 4495 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary; 4496 } 4497 4498 static 4499 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) { 4500 if (caseShift == 0) { 4501 cases.appendByte(UCOL_CASE_BYTE_START); 4502 caseShift = UCOL_CASE_SHIFT_START; 4503 } 4504 } 4505 4506 // Packs the secondary buffer when processing French locale. 4507 static void 4508 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) { 4509 secondaries += secsize; // We read the secondary-level bytes back to front. 4510 uint8_t secondary; 4511 int32_t count2 = 0; 4512 int32_t i = 0; 4513 // we use i here since the key size already accounts for terminators, so we'll discard the increment 4514 for(i = 0; i<secsize; i++) { 4515 secondary = *(secondaries-i-1); 4516 /* This is compression code. 
*/ 4517 if (secondary == UCOL_COMMON2) { 4518 ++count2; 4519 } else { 4520 if (count2 > 0) { 4521 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4522 while (count2 > UCOL_TOP_COUNT2) { 4523 result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 4524 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4525 } 4526 result.Append(UCOL_COMMON_TOP2 - (count2-1)); 4527 } else { 4528 while (count2 > UCOL_BOT_COUNT2) { 4529 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 4530 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4531 } 4532 result.Append(UCOL_COMMON_BOT2 + (count2-1)); 4533 } 4534 count2 = 0; 4535 } 4536 result.Append(secondary); 4537 } 4538 } 4539 if (count2 > 0) { 4540 while (count2 > UCOL_BOT_COUNT2) { 4541 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 4542 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4543 } 4544 result.Append(UCOL_COMMON_BOT2 + (count2-1)); 4545 } 4546 } 4547 4548 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 4549 4550 /* This is the sortkey work horse function */ 4551 U_CFUNC void U_CALLCONV 4552 ucol_calcSortKey(const UCollator *coll, 4553 const UChar *source, 4554 int32_t sourceLength, 4555 SortKeyByteSink &result, 4556 UErrorCode *status) 4557 { 4558 if(U_FAILURE(*status)) { 4559 return; 4560 } 4561 4562 SortKeyByteSink &primaries = result; 4563 SortKeyLevel secondaries; 4564 SortKeyLevel tertiaries; 4565 SortKeyLevel cases; 4566 SortKeyLevel quads; 4567 4568 UnicodeString normSource; 4569 4570 int32_t len = (sourceLength == -1 ? 
u_strlen(source) : sourceLength); 4571 4572 UColAttributeValue strength = coll->strength; 4573 4574 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4575 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4576 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4577 UBool compareIdent = (strength == UCOL_IDENTICAL); 4578 UBool doCase = (coll->caseLevel == UCOL_ON); 4579 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4580 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4581 //UBool qShifted = shifted && (compareQuad == 0); 4582 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4583 4584 uint32_t variableTopValue = coll->variableTopValue; 4585 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no 4586 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. 4587 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4588 uint8_t UCOL_HIRAGANA_QUAD = 0; 4589 if(doHiragana) { 4590 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; 4591 /* allocate one more space for hiragana, value for hiragana */ 4592 } 4593 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4594 4595 /* support for special features like caselevel and funky secondaries */ 4596 int32_t lastSecondaryLength = 0; 4597 uint32_t caseShift = 0; 4598 4599 /* If we need to normalize, we'll do it all at once at the beginning! 
*/ 4600 const Normalizer2 *norm2; 4601 if(compareIdent) { 4602 norm2 = Normalizer2Factory::getNFDInstance(*status); 4603 } else if(coll->normalizationMode != UCOL_OFF) { 4604 norm2 = Normalizer2Factory::getFCDInstance(*status); 4605 } else { 4606 norm2 = NULL; 4607 } 4608 if(norm2 != NULL) { 4609 normSource.setTo(FALSE, source, len); 4610 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 4611 if(qcYesLength != len) { 4612 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 4613 normSource.truncate(qcYesLength); 4614 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 4615 source = normSource.getBuffer(); 4616 len = normSource.length(); 4617 } 4618 } 4619 collIterate s; 4620 IInit_collIterate(coll, source, len, &s, status); 4621 if(U_FAILURE(*status)) { 4622 return; 4623 } 4624 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 4625 4626 uint32_t order = 0; 4627 4628 uint8_t primary1 = 0; 4629 uint8_t primary2 = 0; 4630 uint8_t secondary = 0; 4631 uint8_t tertiary = 0; 4632 uint8_t caseSwitch = coll->caseSwitch; 4633 uint8_t tertiaryMask = coll->tertiaryMask; 4634 int8_t tertiaryAddition = coll->tertiaryAddition; 4635 uint8_t tertiaryTop = coll->tertiaryTop; 4636 uint8_t tertiaryBottom = coll->tertiaryBottom; 4637 uint8_t tertiaryCommon = coll->tertiaryCommon; 4638 uint8_t caseBits = 0; 4639 4640 UBool wasShifted = FALSE; 4641 UBool notIsContinuation = FALSE; 4642 4643 uint32_t count2 = 0, count3 = 0, count4 = 0; 4644 uint8_t leadPrimary = 0; 4645 4646 for(;;) { 4647 order = ucol_IGetNextCE(coll, &s, status); 4648 if(order == UCOL_NO_MORE_CES) { 4649 break; 4650 } 4651 4652 if(order == 0) { 4653 continue; 4654 } 4655 4656 notIsContinuation = !isContinuation(order); 4657 4658 if(notIsContinuation) { 4659 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); 4660 } else { 4661 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4662 } 4663 4664 secondary = (uint8_t)((order >>= 8) & 
UCOL_BYTE_SIZE_MASK); 4665 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4666 primary1 = (uint8_t)(order >> 8); 4667 4668 uint8_t originalPrimary1 = primary1; 4669 if(notIsContinuation && coll->leadBytePermutationTable != NULL) { 4670 primary1 = coll->leadBytePermutationTable[primary1]; 4671 } 4672 4673 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4674 || (!notIsContinuation && wasShifted))) 4675 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 4676 { 4677 /* and other ignorables should be removed if following a shifted code point */ 4678 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4679 /* we should just completely ignore it */ 4680 continue; 4681 } 4682 if(compareQuad == 0) { 4683 if(count4 > 0) { 4684 while (count4 > UCOL_BOT_COUNT4) { 4685 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4686 count4 -= UCOL_BOT_COUNT4; 4687 } 4688 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); 4689 count4 = 0; 4690 } 4691 /* We are dealing with a variable and we're treating them as shifted */ 4692 /* This is a shifted ignorable */ 4693 if(primary1 != 0) { /* we need to check this since we could be in continuation */ 4694 quads.appendByte(primary1); 4695 } 4696 if(primary2 != 0) { 4697 quads.appendByte(primary2); 4698 } 4699 } 4700 wasShifted = TRUE; 4701 } else { 4702 wasShifted = FALSE; 4703 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 4704 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 4705 /* regular and simple sortkey calc */ 4706 if(primary1 != UCOL_IGNORABLE) { 4707 if(notIsContinuation) { 4708 if(leadPrimary == primary1) { 4709 primaries.Append(primary2); 4710 } else { 4711 if(leadPrimary != 0) { 4712 primaries.Append((primary1 > leadPrimary) ? 
UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 4713 } 4714 if(primary2 == UCOL_IGNORABLE) { 4715 /* one byter, not compressed */ 4716 primaries.Append(primary1); 4717 leadPrimary = 0; 4718 } else if(isCompressible(coll, originalPrimary1)) { 4719 /* compress */ 4720 primaries.Append(leadPrimary = primary1, primary2); 4721 } else { 4722 leadPrimary = 0; 4723 primaries.Append(primary1, primary2); 4724 } 4725 } 4726 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 4727 if(primary2 == UCOL_IGNORABLE) { 4728 primaries.Append(primary1); 4729 } else { 4730 primaries.Append(primary1, primary2); 4731 } 4732 } 4733 } 4734 4735 if(secondary > compareSec) { 4736 if(!isFrenchSec) { 4737 /* This is compression code. */ 4738 if (secondary == UCOL_COMMON2 && notIsContinuation) { 4739 ++count2; 4740 } else { 4741 if (count2 > 0) { 4742 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4743 while (count2 > UCOL_TOP_COUNT2) { 4744 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 4745 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4746 } 4747 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1)); 4748 } else { 4749 while (count2 > UCOL_BOT_COUNT2) { 4750 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 4751 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4752 } 4753 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 4754 } 4755 count2 = 0; 4756 } 4757 secondaries.appendByte(secondary); 4758 } 4759 } else { 4760 /* Do the special handling for French secondaries */ 4761 /* We need to get continuation elements and do intermediate restore */ 4762 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ 4763 if(notIsContinuation) { 4764 if (lastSecondaryLength > 1) { 4765 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength); 4766 if (frenchStartPtr != NULL) { 4767 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4768 uint8_t *frenchEndPtr = 
frenchStartPtr + lastSecondaryLength - 1; 4769 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4770 } 4771 } 4772 lastSecondaryLength = 1; 4773 } else { 4774 ++lastSecondaryLength; 4775 } 4776 secondaries.appendByte(secondary); 4777 } 4778 } 4779 4780 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 4781 // do the case level if we need to do it. We don't want to calculate 4782 // case level for primary ignorables if we have only primary strength and case level 4783 // otherwise we would break well formedness of CEs 4784 doCaseShift(cases, caseShift); 4785 if(notIsContinuation) { 4786 caseBits = (uint8_t)(tertiary & 0xC0); 4787 4788 if(tertiary != 0) { 4789 if(coll->caseFirst == UCOL_UPPER_FIRST) { 4790 if((caseBits & 0xC0) == 0) { 4791 cases.lastByte() |= 1 << (--caseShift); 4792 } else { 4793 cases.lastByte() |= 0 << (--caseShift); 4794 /* second bit */ 4795 doCaseShift(cases, caseShift); 4796 cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift); 4797 } 4798 } else { 4799 if((caseBits & 0xC0) == 0) { 4800 cases.lastByte() |= 0 << (--caseShift); 4801 } else { 4802 cases.lastByte() |= 1 << (--caseShift); 4803 /* second bit */ 4804 doCaseShift(cases, caseShift); 4805 cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift); 4806 } 4807 } 4808 } 4809 } 4810 } else { 4811 if(notIsContinuation) { 4812 tertiary ^= caseSwitch; 4813 } 4814 } 4815 4816 tertiary &= tertiaryMask; 4817 if(tertiary > compareTer) { 4818 /* This is compression code. 
*/ 4819 /* sequence size check is included in the if clause */ 4820 if (tertiary == tertiaryCommon && notIsContinuation) { 4821 ++count3; 4822 } else { 4823 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 4824 tertiary += tertiaryAddition; 4825 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 4826 tertiary -= tertiaryAddition; 4827 } 4828 if (count3 > 0) { 4829 if ((tertiary > tertiaryCommon)) { 4830 while (count3 > coll->tertiaryTopCount) { 4831 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 4832 count3 -= (uint32_t)coll->tertiaryTopCount; 4833 } 4834 tertiaries.appendByte(tertiaryTop - (count3-1)); 4835 } else { 4836 while (count3 > coll->tertiaryBottomCount) { 4837 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 4838 count3 -= (uint32_t)coll->tertiaryBottomCount; 4839 } 4840 tertiaries.appendByte(tertiaryBottom + (count3-1)); 4841 } 4842 count3 = 0; 4843 } 4844 tertiaries.appendByte(tertiary); 4845 } 4846 } 4847 4848 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 4849 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 4850 if(count4>0) { // Close this part 4851 while (count4 > UCOL_BOT_COUNT4) { 4852 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4853 count4 -= UCOL_BOT_COUNT4; 4854 } 4855 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); 4856 count4 = 0; 4857 } 4858 quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana 4859 } else { // This wasn't Hiragana, so we can continue adding stuff 4860 count4++; 4861 } 4862 } 4863 } 4864 } 4865 4866 /* Here, we are generally done with processing */ 4867 /* bailing out would not be too productive */ 4868 4869 UBool ok = TRUE; 4870 if(U_SUCCESS(*status)) { 4871 /* we have done all the CE's, now let's put them together to form a key */ 4872 if(compareSec == 0) { 4873 if (count2 > 0) { 4874 while (count2 > UCOL_BOT_COUNT2) { 4875 secondaries.appendByte(UCOL_COMMON_BOT2 + 
UCOL_BOT_COUNT2); 4876 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4877 } 4878 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 4879 } 4880 result.Append(UCOL_LEVELTERMINATOR); 4881 if(!secondaries.isOk()) { 4882 ok = FALSE; 4883 } else if(!isFrenchSec) { 4884 secondaries.appendTo(result); 4885 } else { 4886 // If there are any unresolved continuation secondaries, 4887 // reverse them here so that we can reverse the whole secondary thing. 4888 if (lastSecondaryLength > 1) { 4889 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength); 4890 if (frenchStartPtr != NULL) { 4891 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4892 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1; 4893 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4894 } 4895 } 4896 packFrench(secondaries.data(), secondaries.length(), result); 4897 } 4898 } 4899 4900 if(doCase) { 4901 ok &= cases.isOk(); 4902 result.Append(UCOL_LEVELTERMINATOR); 4903 cases.appendTo(result); 4904 } 4905 4906 if(compareTer == 0) { 4907 if (count3 > 0) { 4908 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { 4909 while (count3 >= coll->tertiaryTopCount) { 4910 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 4911 count3 -= (uint32_t)coll->tertiaryTopCount; 4912 } 4913 tertiaries.appendByte(tertiaryTop - count3); 4914 } else { 4915 while (count3 > coll->tertiaryBottomCount) { 4916 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 4917 count3 -= (uint32_t)coll->tertiaryBottomCount; 4918 } 4919 tertiaries.appendByte(tertiaryBottom + (count3-1)); 4920 } 4921 } 4922 ok &= tertiaries.isOk(); 4923 result.Append(UCOL_LEVELTERMINATOR); 4924 tertiaries.appendTo(result); 4925 4926 if(compareQuad == 0/*qShifted == TRUE*/) { 4927 if(count4 > 0) { 4928 while (count4 > UCOL_BOT_COUNT4) { 4929 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4930 count4 -= UCOL_BOT_COUNT4; 4931 } 4932 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); 4933 } 
4934 ok &= quads.isOk(); 4935 result.Append(UCOL_LEVELTERMINATOR); 4936 quads.appendTo(result); 4937 } 4938 4939 if(compareIdent) { 4940 result.Append(UCOL_LEVELTERMINATOR); 4941 u_writeIdenticalLevelRun(s.string, len, result); 4942 } 4943 } 4944 result.Append(0); 4945 } 4946 4947 /* To avoid memory leak, free the offset buffer if necessary. */ 4948 ucol_freeOffsetBuffer(&s); 4949 4950 ok &= result.IsOk(); 4951 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } 4952 } 4953 4954 4955 U_CFUNC void U_CALLCONV 4956 ucol_calcSortKeySimpleTertiary(const UCollator *coll, 4957 const UChar *source, 4958 int32_t sourceLength, 4959 SortKeyByteSink &result, 4960 UErrorCode *status) 4961 { 4962 U_ALIGN_CODE(16); 4963 4964 if(U_FAILURE(*status)) { 4965 return; 4966 } 4967 4968 SortKeyByteSink &primaries = result; 4969 SortKeyLevel secondaries; 4970 SortKeyLevel tertiaries; 4971 4972 UnicodeString normSource; 4973 4974 int32_t len = sourceLength; 4975 4976 /* If we need to normalize, we'll do it all at once at the beginning! */ 4977 if(coll->normalizationMode != UCOL_OFF) { 4978 normSource.setTo(len < 0, source, len); 4979 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); 4980 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 4981 if(qcYesLength != normSource.length()) { 4982 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 4983 normSource.truncate(qcYesLength); 4984 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 4985 source = normSource.getBuffer(); 4986 len = normSource.length(); 4987 } 4988 } 4989 collIterate s; 4990 IInit_collIterate(coll, (UChar *)source, len, &s, status); 4991 if(U_FAILURE(*status)) { 4992 return; 4993 } 4994 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 
4995 4996 uint32_t order = 0; 4997 4998 uint8_t primary1 = 0; 4999 uint8_t primary2 = 0; 5000 uint8_t secondary = 0; 5001 uint8_t tertiary = 0; 5002 uint8_t caseSwitch = coll->caseSwitch; 5003 uint8_t tertiaryMask = coll->tertiaryMask; 5004 int8_t tertiaryAddition = coll->tertiaryAddition; 5005 uint8_t tertiaryTop = coll->tertiaryTop; 5006 uint8_t tertiaryBottom = coll->tertiaryBottom; 5007 uint8_t tertiaryCommon = coll->tertiaryCommon; 5008 5009 UBool notIsContinuation = FALSE; 5010 5011 uint32_t count2 = 0, count3 = 0; 5012 uint8_t leadPrimary = 0; 5013 5014 for(;;) { 5015 order = ucol_IGetNextCE(coll, &s, status); 5016 5017 if(order == 0) { 5018 continue; 5019 } 5020 5021 if(order == UCOL_NO_MORE_CES) { 5022 break; 5023 } 5024 5025 notIsContinuation = !isContinuation(order); 5026 5027 if(notIsContinuation) { 5028 tertiary = (uint8_t)((order & tertiaryMask)); 5029 } else { 5030 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 5031 } 5032 5033 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5034 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5035 primary1 = (uint8_t)(order >> 8); 5036 5037 uint8_t originalPrimary1 = primary1; 5038 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { 5039 primary1 = coll->leadBytePermutationTable[primary1]; 5040 } 5041 5042 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 5043 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 5044 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ 5045 /* regular and simple sortkey calc */ 5046 if(primary1 != UCOL_IGNORABLE) { 5047 if(notIsContinuation) { 5048 if(leadPrimary == primary1) { 5049 primaries.Append(primary2); 5050 } else { 5051 if(leadPrimary != 0) { 5052 primaries.Append((primary1 > leadPrimary) ? 
UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 5053 } 5054 if(primary2 == UCOL_IGNORABLE) { 5055 /* one byter, not compressed */ 5056 primaries.Append(primary1); 5057 leadPrimary = 0; 5058 } else if(isCompressible(coll, originalPrimary1)) { 5059 /* compress */ 5060 primaries.Append(leadPrimary = primary1, primary2); 5061 } else { 5062 leadPrimary = 0; 5063 primaries.Append(primary1, primary2); 5064 } 5065 } 5066 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 5067 if(primary2 == UCOL_IGNORABLE) { 5068 primaries.Append(primary1); 5069 } else { 5070 primaries.Append(primary1, primary2); 5071 } 5072 } 5073 } 5074 5075 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ 5076 /* This is compression code. */ 5077 if (secondary == UCOL_COMMON2 && notIsContinuation) { 5078 ++count2; 5079 } else { 5080 if (count2 > 0) { 5081 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 5082 while (count2 > UCOL_TOP_COUNT2) { 5083 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 5084 count2 -= (uint32_t)UCOL_TOP_COUNT2; 5085 } 5086 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1)); 5087 } else { 5088 while (count2 > UCOL_BOT_COUNT2) { 5089 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5090 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5091 } 5092 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 5093 } 5094 count2 = 0; 5095 } 5096 secondaries.appendByte(secondary); 5097 } 5098 } 5099 5100 if(notIsContinuation) { 5101 tertiary ^= caseSwitch; 5102 } 5103 5104 if(tertiary > 0) { 5105 /* This is compression code. 
*/ 5106 /* sequence size check is included in the if clause */ 5107 if (tertiary == tertiaryCommon && notIsContinuation) { 5108 ++count3; 5109 } else { 5110 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 5111 tertiary += tertiaryAddition; 5112 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 5113 tertiary -= tertiaryAddition; 5114 } 5115 if (count3 > 0) { 5116 if ((tertiary > tertiaryCommon)) { 5117 while (count3 > coll->tertiaryTopCount) { 5118 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 5119 count3 -= (uint32_t)coll->tertiaryTopCount; 5120 } 5121 tertiaries.appendByte(tertiaryTop - (count3-1)); 5122 } else { 5123 while (count3 > coll->tertiaryBottomCount) { 5124 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 5125 count3 -= (uint32_t)coll->tertiaryBottomCount; 5126 } 5127 tertiaries.appendByte(tertiaryBottom + (count3-1)); 5128 } 5129 count3 = 0; 5130 } 5131 tertiaries.appendByte(tertiary); 5132 } 5133 } 5134 } 5135 5136 UBool ok = TRUE; 5137 if(U_SUCCESS(*status)) { 5138 /* we have done all the CE's, now let's put them together to form a key */ 5139 if (count2 > 0) { 5140 while (count2 > UCOL_BOT_COUNT2) { 5141 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5142 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5143 } 5144 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 5145 } 5146 ok &= secondaries.isOk(); 5147 result.Append(UCOL_LEVELTERMINATOR); 5148 secondaries.appendTo(result); 5149 5150 if (count3 > 0) { 5151 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { 5152 while (count3 >= coll->tertiaryTopCount) { 5153 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 5154 count3 -= (uint32_t)coll->tertiaryTopCount; 5155 } 5156 tertiaries.appendByte(tertiaryTop - count3); 5157 } else { 5158 while (count3 > coll->tertiaryBottomCount) { 5159 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 5160 count3 -= 
(uint32_t)coll->tertiaryBottomCount; 5161 } 5162 tertiaries.appendByte(tertiaryBottom + (count3-1)); 5163 } 5164 } 5165 ok &= tertiaries.isOk(); 5166 result.Append(UCOL_LEVELTERMINATOR); 5167 tertiaries.appendTo(result); 5168 5169 result.Append(0); 5170 } 5171 5172 /* To avoid memory leak, free the offset buffer if necessary. */ 5173 ucol_freeOffsetBuffer(&s); 5174 5175 ok &= result.IsOk(); 5176 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } 5177 } 5178 5179 static inline 5180 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { 5181 UBool notIsContinuation = !isContinuation(CE); 5182 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); 5183 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) 5184 || (!notIsContinuation && *wasShifted))) 5185 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 5186 { 5187 // The stuff below should probably be in the sortkey code... maybe not... 5188 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ 5189 /* we should just completely ignore it */ 5190 *wasShifted = TRUE; 5191 //continue; 5192 } 5193 //*wasShifted = TRUE; 5194 return TRUE; 5195 } else { 5196 *wasShifted = FALSE; 5197 return FALSE; 5198 } 5199 } 5200 static inline 5201 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { 5202 if(level < maxLevel) { 5203 dest[i++] = UCOL_LEVELTERMINATOR; 5204 } else { 5205 dest[i++] = 0; 5206 } 5207 } 5208 5209 /** enumeration of level identifiers for partial sort key generation */ 5210 enum { 5211 UCOL_PSK_PRIMARY = 0, 5212 UCOL_PSK_SECONDARY = 1, 5213 UCOL_PSK_CASE = 2, 5214 UCOL_PSK_TERTIARY = 3, 5215 UCOL_PSK_QUATERNARY = 4, 5216 UCOL_PSK_QUIN = 5, /** This is an extra level, not used - but we have three bits to blow */ 5217 UCOL_PSK_IDENTICAL = 6, 5218 UCOL_PSK_NULL = 7, /** level for the end of sort key. 
Will just produce zeros */ 5219 UCOL_PSK_LIMIT 5220 }; 5221 5222 /** collation state enum. *_SHIFT value is how much to shift right 5223 * to get the state piece to the right. *_MASK value should be 5224 * ANDed with the shifted state. This data is stored in state[1] 5225 * field. 5226 */ 5227 enum { 5228 UCOL_PSK_LEVEL_SHIFT = 0, /** level identificator. stores an enum value from above */ 5229 UCOL_PSK_LEVEL_MASK = 7, /** three bits */ 5230 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */ 5231 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1, 5232 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary 5233 * This field is also used to denote that the French secondary level is finished 5234 */ 5235 UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */ 5236 UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */ 5237 UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */ 5238 UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */ 5239 /** When we do French we need to reverse secondary values. However, continuations 5240 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba 5241 */ 5242 UCOL_PSK_BOCSU_BYTES_SHIFT = 7, 5243 UCOL_PSK_BOCSU_BYTES_MASK = 3, 5244 UCOL_PSK_CONSUMED_CES_SHIFT = 9, 5245 UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF 5246 }; 5247 5248 // macro calculating the number of expansion CEs available 5249 #define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn 5250 5251 5252 /** main sortkey part procedure. On the first call, 5253 * you should pass in a collator, an iterator, empty state 5254 * state[0] == state[1] == 0, a buffer to hold results 5255 * number of bytes you need and an error code pointer. 5256 * Make sure your buffer is big enough to hold the wanted 5257 * number of sortkey bytes. I don't check. 
 *  The only meaningful status you can get back is
 *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
 *  have been dealt a raw deal and that you probably won't
 *  be able to use partial sortkey generation for this
 *  particular combination of string and collator. This
 *  is highly unlikely, but you should still check the error code.
 *  Any other status means that you're not in a sane situation
 *  anymore. After the first call, preserve state values and
 *  use them on subsequent calls to obtain more bytes of a sortkey.
 *  Use until the number of bytes written is smaller than the requested
 *  number of bytes. Generated sortkey is not compatible with the
 *  one generated by ucol_getSortKey, as we don't do any compression.
 *  However, levels are still terminated by a 1 (one) and the sortkey
 *  is terminated by a 0 (zero). Identical level is the same as in the
 *  regular sortkey - internal bocu-1 implementation is used.
 *  For the curious, although you cannot do much about this, here is
 *  the structure of state words.
 *  state[0] - iterator state. Depends on the iterator implementation,
 *             but allows the iterator to continue where it stopped in
 *             the last iteration.
 *  state[1] - collation processing state. Here is the distribution
 *             of the bits:
 *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
 *             quaternary, quin (we don't use this one), identical and
 *             null (producing only zeroes - first one to terminate the
 *             sortkey and subsequent to fill the buffer).
 *   3       - byte count. Number of bytes written on the primary level.
 *   4       - was shifted. Whether the previous iteration finished in the
 *             shifted state.
 *   5, 6    - French continuation bytes written. See the comment in the enum
 *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
 *             the identical level.
 *   9..31   - CEs consumed.
Number of getCE or next32 operations performed 5291 * since thes last successful update of the iterator state. 5292 */ 5293 U_CAPI int32_t U_EXPORT2 5294 ucol_nextSortKeyPart(const UCollator *coll, 5295 UCharIterator *iter, 5296 uint32_t state[2], 5297 uint8_t *dest, int32_t count, 5298 UErrorCode *status) 5299 { 5300 /* error checking */ 5301 if(status==NULL || U_FAILURE(*status)) { 5302 return 0; 5303 } 5304 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 5305 if( coll==NULL || iter==NULL || 5306 state==NULL || 5307 count<0 || (count>0 && dest==NULL) 5308 ) { 5309 *status=U_ILLEGAL_ARGUMENT_ERROR; 5310 UTRACE_EXIT_STATUS(status); 5311 return 0; 5312 } 5313 5314 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 5315 coll, iter, state[0], state[1], dest, count); 5316 5317 if(count==0) { 5318 /* nothing to do */ 5319 UTRACE_EXIT_VALUE(0); 5320 return 0; 5321 } 5322 /** Setting up situation according to the state we got from the previous iteration */ 5323 // The state of the iterator from the previous invocation 5324 uint32_t iterState = state[0]; 5325 // Has the last iteration ended in the shifted state 5326 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE; 5327 // What is the current level of the sortkey? 5328 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; 5329 // Have we written only one byte from a two byte primary in the previous iteration? 5330 // Also on secondary level - have we finished with the French secondary? 5331 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; 5332 // number of bytes in the continuation buffer for French 5333 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; 5334 // Number of bytes already written from a bocsu sequence. Since 5335 // the longes bocsu sequence is 4 long, this can be up to 3. 
5336 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK; 5337 // Number of elements that need to be consumed in this iteration because 5338 // the iterator returned UITER_NO_STATE at the end of the last iteration, 5339 // so we had to save the last valid state. 5340 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK; 5341 5342 /** values that depend on the collator attributes */ 5343 // strength of the collator. 5344 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); 5345 // maximal level of the partial sortkey. Need to take whether case level is done 5346 int32_t maxLevel = 0; 5347 if(strength < UCOL_TERTIARY) { 5348 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 5349 maxLevel = UCOL_PSK_CASE; 5350 } else { 5351 maxLevel = strength; 5352 } 5353 } else { 5354 if(strength == UCOL_TERTIARY) { 5355 maxLevel = UCOL_PSK_TERTIARY; 5356 } else if(strength == UCOL_QUATERNARY) { 5357 maxLevel = UCOL_PSK_QUATERNARY; 5358 } else { // identical 5359 maxLevel = UCOL_IDENTICAL; 5360 } 5361 } 5362 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation 5363 uint8_t UCOL_HIRAGANA_QUAD = 5364 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF; 5365 // Boundary value that decides whether a CE is shifted or not 5366 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0; 5367 // Are we doing French collation? 
5368 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON); 5369 5370 /** initializing the collation state */ 5371 UBool notIsContinuation = FALSE; 5372 uint32_t CE = UCOL_NO_MORE_CES; 5373 5374 collIterate s; 5375 IInit_collIterate(coll, NULL, -1, &s, status); 5376 if(U_FAILURE(*status)) { 5377 UTRACE_EXIT_STATUS(*status); 5378 return 0; 5379 } 5380 s.iterator = iter; 5381 s.flags |= UCOL_USE_ITERATOR; 5382 // This variable tells us whether we have produced some other levels in this iteration 5383 // before we moved to the identical level. In that case, we need to switch the 5384 // type of the iterator. 5385 UBool doingIdenticalFromStart = FALSE; 5386 // Normalizing iterator 5387 // The division for the array length may truncate the array size to 5388 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 5389 // for all platforms anyway. 5390 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 5391 UNormIterator *normIter = NULL; 5392 // If the normalization is turned on for the collator and we are below identical level 5393 // we will use a FCD normalizing iterator 5394 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { 5395 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5396 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); 5397 s.flags &= ~UCOL_ITER_NORM; 5398 if(U_FAILURE(*status)) { 5399 UTRACE_EXIT_STATUS(*status); 5400 return 0; 5401 } 5402 } else if(level == UCOL_PSK_IDENTICAL) { 5403 // for identical level, we need a NFD iterator. We need to instantiate it here, since we 5404 // will be updating the state - and this cannot be done on an ordinary iterator. 
5405 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5406 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5407 s.flags &= ~UCOL_ITER_NORM; 5408 if(U_FAILURE(*status)) { 5409 UTRACE_EXIT_STATUS(*status); 5410 return 0; 5411 } 5412 doingIdenticalFromStart = TRUE; 5413 } 5414 5415 // This is the tentative new state of the iterator. The problem 5416 // is that the iterator might return an undefined state, in 5417 // which case we should save the last valid state and increase 5418 // the iterator skip value. 5419 uint32_t newState = 0; 5420 5421 // First, we set the iterator to the last valid position 5422 // from the last iteration. This was saved in state[0]. 5423 if(iterState == 0) { 5424 /* initial state */ 5425 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { 5426 s.iterator->move(s.iterator, 0, UITER_LIMIT); 5427 } else { 5428 s.iterator->move(s.iterator, 0, UITER_START); 5429 } 5430 } else { 5431 /* reset to previous state */ 5432 s.iterator->setState(s.iterator, iterState, status); 5433 if(U_FAILURE(*status)) { 5434 UTRACE_EXIT_STATUS(*status); 5435 return 0; 5436 } 5437 } 5438 5439 5440 5441 // This variable tells us whether we can attempt to update the state 5442 // of iterator. Situations where we don't want to update iterator state 5443 // are the existence of expansion CEs that are not yet processed, and 5444 // finishing the case level without enough space in the buffer to insert 5445 // a level terminator. 5446 UBool canUpdateState = TRUE; 5447 5448 // Consume all the CEs that were consumed at the end of the previous 5449 // iteration without updating the iterator state. On identical level, 5450 // consume the code points. 5451 int32_t counter = cces; 5452 if(level < UCOL_PSK_IDENTICAL) { 5453 while(counter-->0) { 5454 // If we're doing French and we are on the secondary level, 5455 // we go backwards. 
5456 if(level == UCOL_PSK_SECONDARY && doingFrench) { 5457 CE = ucol_IGetPrevCE(coll, &s, status); 5458 } else { 5459 CE = ucol_IGetNextCE(coll, &s, status); 5460 } 5461 if(CE==UCOL_NO_MORE_CES) { 5462 /* should not happen */ 5463 *status=U_INTERNAL_PROGRAM_ERROR; 5464 UTRACE_EXIT_STATUS(*status); 5465 return 0; 5466 } 5467 if(uprv_numAvailableExpCEs(s)) { 5468 canUpdateState = FALSE; 5469 } 5470 } 5471 } else { 5472 while(counter-->0) { 5473 uiter_next32(s.iterator); 5474 } 5475 } 5476 5477 // French secondary needs to know whether the iterator state of zero came from previous level OR 5478 // from a new invocation... 5479 UBool wasDoingPrimary = FALSE; 5480 // destination buffer byte counter. When this guy 5481 // gets to count, we're done with the iteration 5482 int32_t i = 0; 5483 // used to count the zero bytes written after we 5484 // have finished with the sort key 5485 int32_t j = 0; 5486 5487 5488 // Hm.... I think we're ready to plunge in. Basic story is as following: 5489 // we have a fall through case based on level. This is used for initial 5490 // positioning on iteration start. Every level processor contains a 5491 // for(;;) which will be broken when we exhaust all the CEs. Other 5492 // way to exit is a goto saveState, which happens when we have filled 5493 // out our buffer. 
5494 switch(level) { 5495 case UCOL_PSK_PRIMARY: 5496 wasDoingPrimary = TRUE; 5497 for(;;) { 5498 if(i==count) { 5499 goto saveState; 5500 } 5501 // We should save the state only if we 5502 // are sure that we are done with the 5503 // previous iterator state 5504 if(canUpdateState && byteCountOrFrenchDone == 0) { 5505 newState = s.iterator->getState(s.iterator); 5506 if(newState != UITER_NO_STATE) { 5507 iterState = newState; 5508 cces = 0; 5509 } 5510 } 5511 CE = ucol_IGetNextCE(coll, &s, status); 5512 cces++; 5513 if(CE==UCOL_NO_MORE_CES) { 5514 // Add the level separator 5515 terminatePSKLevel(level, maxLevel, i, dest); 5516 byteCountOrFrenchDone=0; 5517 // Restart the iteration an move to the 5518 // second level 5519 s.iterator->move(s.iterator, 0, UITER_START); 5520 cces = 0; 5521 level = UCOL_PSK_SECONDARY; 5522 break; 5523 } 5524 if(!isContinuation(CE)){ 5525 if(coll->leadBytePermutationTable != NULL){ 5526 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF); 5527 } 5528 } 5529 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5530 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ 5531 if(CE != 0) { 5532 if(byteCountOrFrenchDone == 0) { 5533 // get the second byte of primary 5534 dest[i++]=(uint8_t)(CE >> 8); 5535 } else { 5536 byteCountOrFrenchDone = 0; 5537 } 5538 if((CE &=0xff)!=0) { 5539 if(i==count) { 5540 /* overflow */ 5541 byteCountOrFrenchDone = 1; 5542 cces--; 5543 goto saveState; 5544 } 5545 dest[i++]=(uint8_t)CE; 5546 } 5547 } 5548 } 5549 if(uprv_numAvailableExpCEs(s)) { 5550 canUpdateState = FALSE; 5551 } else { 5552 canUpdateState = TRUE; 5553 } 5554 } 5555 /* fall through to next level */ 5556 case UCOL_PSK_SECONDARY: 5557 if(strength >= UCOL_SECONDARY) { 5558 if(!doingFrench) { 5559 for(;;) { 5560 if(i == count) { 5561 goto saveState; 5562 } 5563 // We should save the state only if we 5564 // are sure that we are done with the 5565 // previous iterator state 5566 if(canUpdateState) { 5567 newState = 
s.iterator->getState(s.iterator); 5568 if(newState != UITER_NO_STATE) { 5569 iterState = newState; 5570 cces = 0; 5571 } 5572 } 5573 CE = ucol_IGetNextCE(coll, &s, status); 5574 cces++; 5575 if(CE==UCOL_NO_MORE_CES) { 5576 // Add the level separator 5577 terminatePSKLevel(level, maxLevel, i, dest); 5578 byteCountOrFrenchDone = 0; 5579 // Restart the iteration an move to the 5580 // second level 5581 s.iterator->move(s.iterator, 0, UITER_START); 5582 cces = 0; 5583 level = UCOL_PSK_CASE; 5584 break; 5585 } 5586 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5587 CE >>= 8; /* get secondary */ 5588 if(CE != 0) { 5589 dest[i++]=(uint8_t)CE; 5590 } 5591 } 5592 if(uprv_numAvailableExpCEs(s)) { 5593 canUpdateState = FALSE; 5594 } else { 5595 canUpdateState = TRUE; 5596 } 5597 } 5598 } else { // French secondary processing 5599 uint8_t frenchBuff[UCOL_MAX_BUFFER]; 5600 int32_t frenchIndex = 0; 5601 // Here we are going backwards. 5602 // If the iterator is at the beggining, it should be 5603 // moved to end. 5604 if(wasDoingPrimary) { 5605 s.iterator->move(s.iterator, 0, UITER_LIMIT); 5606 cces = 0; 5607 } 5608 for(;;) { 5609 if(i == count) { 5610 goto saveState; 5611 } 5612 if(canUpdateState) { 5613 newState = s.iterator->getState(s.iterator); 5614 if(newState != UITER_NO_STATE) { 5615 iterState = newState; 5616 cces = 0; 5617 } 5618 } 5619 CE = ucol_IGetPrevCE(coll, &s, status); 5620 cces++; 5621 if(CE==UCOL_NO_MORE_CES) { 5622 // Add the level separator 5623 terminatePSKLevel(level, maxLevel, i, dest); 5624 byteCountOrFrenchDone = 0; 5625 // Restart the iteration an move to the next level 5626 s.iterator->move(s.iterator, 0, UITER_START); 5627 level = UCOL_PSK_CASE; 5628 break; 5629 } 5630 if(isContinuation(CE)) { // if it's a continuation, we want to save it and 5631 // reverse when we get a first non-continuation CE. 
5632 CE >>= 8; 5633 frenchBuff[frenchIndex++] = (uint8_t)CE; 5634 } else if(!isShiftedCE(CE, LVT, &wasShifted)) { 5635 CE >>= 8; /* get secondary */ 5636 if(!frenchIndex) { 5637 if(CE != 0) { 5638 dest[i++]=(uint8_t)CE; 5639 } 5640 } else { 5641 frenchBuff[frenchIndex++] = (uint8_t)CE; 5642 frenchIndex -= usedFrench; 5643 usedFrench = 0; 5644 while(i < count && frenchIndex) { 5645 dest[i++] = frenchBuff[--frenchIndex]; 5646 usedFrench++; 5647 } 5648 } 5649 } 5650 if(uprv_numAvailableExpCEs(s)) { 5651 canUpdateState = FALSE; 5652 } else { 5653 canUpdateState = TRUE; 5654 } 5655 } 5656 } 5657 } else { 5658 level = UCOL_PSK_CASE; 5659 } 5660 /* fall through to next level */ 5661 case UCOL_PSK_CASE: 5662 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 5663 uint32_t caseShift = UCOL_CASE_SHIFT_START; 5664 uint8_t caseByte = UCOL_CASE_BYTE_START; 5665 uint8_t caseBits = 0; 5666 5667 for(;;) { 5668 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); 5669 if(i == count) { 5670 goto saveState; 5671 } 5672 // We should save the state only if we 5673 // are sure that we are done with the 5674 // previous iterator state 5675 if(canUpdateState) { 5676 newState = s.iterator->getState(s.iterator); 5677 if(newState != UITER_NO_STATE) { 5678 iterState = newState; 5679 cces = 0; 5680 } 5681 } 5682 CE = ucol_IGetNextCE(coll, &s, status); 5683 cces++; 5684 if(CE==UCOL_NO_MORE_CES) { 5685 // On the case level we might have an unfinished 5686 // case byte. Add one if it's started. 5687 if(caseShift != UCOL_CASE_SHIFT_START) { 5688 dest[i++] = caseByte; 5689 } 5690 cces = 0; 5691 // We have finished processing CEs on this level. 5692 // However, we don't know if we have enough space 5693 // to add a case level terminator. 
5694 if(i < count) { 5695 // Add the level separator 5696 terminatePSKLevel(level, maxLevel, i, dest); 5697 // Restart the iteration and move to the 5698 // next level 5699 s.iterator->move(s.iterator, 0, UITER_START); 5700 level = UCOL_PSK_TERTIARY; 5701 } else { 5702 canUpdateState = FALSE; 5703 } 5704 break; 5705 } 5706 5707 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5708 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) { 5709 // do the case level if we need to do it. We don't want to calculate 5710 // case level for primary ignorables if we have only primary strength and case level 5711 // otherwise we would break well formedness of CEs 5712 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 5713 caseBits = (uint8_t)(CE & 0xC0); 5714 // this copies the case level logic from the 5715 // sort key generation code 5716 if(CE != 0) { 5717 if (caseShift == 0) { 5718 dest[i++] = caseByte; 5719 caseShift = UCOL_CASE_SHIFT_START; 5720 caseByte = UCOL_CASE_BYTE_START; 5721 } 5722 if(coll->caseFirst == UCOL_UPPER_FIRST) { 5723 if((caseBits & 0xC0) == 0) { 5724 caseByte |= 1 << (--caseShift); 5725 } else { 5726 caseByte |= 0 << (--caseShift); 5727 /* second bit */ 5728 if(caseShift == 0) { 5729 dest[i++] = caseByte; 5730 caseShift = UCOL_CASE_SHIFT_START; 5731 caseByte = UCOL_CASE_BYTE_START; 5732 } 5733 caseByte |= ((caseBits>>6)&1) << (--caseShift); 5734 } 5735 } else { 5736 if((caseBits & 0xC0) == 0) { 5737 caseByte |= 0 << (--caseShift); 5738 } else { 5739 caseByte |= 1 << (--caseShift); 5740 /* second bit */ 5741 if(caseShift == 0) { 5742 dest[i++] = caseByte; 5743 caseShift = UCOL_CASE_SHIFT_START; 5744 caseByte = UCOL_CASE_BYTE_START; 5745 } 5746 caseByte |= ((caseBits>>7)&1) << (--caseShift); 5747 } 5748 } 5749 } 5750 5751 } 5752 } 5753 // Not sure this is correct for the case level - revisit 5754 if(uprv_numAvailableExpCEs(s)) { 5755 canUpdateState = FALSE; 5756 } else { 5757 canUpdateState = TRUE; 5758 } 5759 } 5760 } else { 5761 level = 
UCOL_PSK_TERTIARY; 5762 } 5763 /* fall through to next level */ 5764 case UCOL_PSK_TERTIARY: 5765 if(strength >= UCOL_TERTIARY) { 5766 for(;;) { 5767 if(i == count) { 5768 goto saveState; 5769 } 5770 // We should save the state only if we 5771 // are sure that we are done with the 5772 // previous iterator state 5773 if(canUpdateState) { 5774 newState = s.iterator->getState(s.iterator); 5775 if(newState != UITER_NO_STATE) { 5776 iterState = newState; 5777 cces = 0; 5778 } 5779 } 5780 CE = ucol_IGetNextCE(coll, &s, status); 5781 cces++; 5782 if(CE==UCOL_NO_MORE_CES) { 5783 // Add the level separator 5784 terminatePSKLevel(level, maxLevel, i, dest); 5785 byteCountOrFrenchDone = 0; 5786 // Restart the iteration an move to the 5787 // second level 5788 s.iterator->move(s.iterator, 0, UITER_START); 5789 cces = 0; 5790 level = UCOL_PSK_QUATERNARY; 5791 break; 5792 } 5793 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5794 notIsContinuation = !isContinuation(CE); 5795 5796 if(notIsContinuation) { 5797 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 5798 CE ^= coll->caseSwitch; 5799 CE &= coll->tertiaryMask; 5800 } else { 5801 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 5802 } 5803 5804 if(CE != 0) { 5805 dest[i++]=(uint8_t)CE; 5806 } 5807 } 5808 if(uprv_numAvailableExpCEs(s)) { 5809 canUpdateState = FALSE; 5810 } else { 5811 canUpdateState = TRUE; 5812 } 5813 } 5814 } else { 5815 // if we're not doing tertiary 5816 // skip to the end 5817 level = UCOL_PSK_NULL; 5818 } 5819 /* fall through to next level */ 5820 case UCOL_PSK_QUATERNARY: 5821 if(strength >= UCOL_QUATERNARY) { 5822 for(;;) { 5823 if(i == count) { 5824 goto saveState; 5825 } 5826 // We should save the state only if we 5827 // are sure that we are done with the 5828 // previous iterator state 5829 if(canUpdateState) { 5830 newState = s.iterator->getState(s.iterator); 5831 if(newState != UITER_NO_STATE) { 5832 iterState = newState; 5833 cces = 0; 5834 } 5835 } 5836 CE = ucol_IGetNextCE(coll, &s, status); 5837 cces++; 
5838 if(CE==UCOL_NO_MORE_CES) { 5839 // Add the level separator 5840 terminatePSKLevel(level, maxLevel, i, dest); 5841 //dest[i++] = UCOL_LEVELTERMINATOR; 5842 byteCountOrFrenchDone = 0; 5843 // Restart the iteration an move to the 5844 // second level 5845 s.iterator->move(s.iterator, 0, UITER_START); 5846 cces = 0; 5847 level = UCOL_PSK_QUIN; 5848 break; 5849 } 5850 if(CE==0) 5851 continue; 5852 if(isShiftedCE(CE, LVT, &wasShifted)) { 5853 CE >>= 16; /* get primary */ 5854 if(CE != 0) { 5855 if(byteCountOrFrenchDone == 0) { 5856 dest[i++]=(uint8_t)(CE >> 8); 5857 } else { 5858 byteCountOrFrenchDone = 0; 5859 } 5860 if((CE &=0xff)!=0) { 5861 if(i==count) { 5862 /* overflow */ 5863 byteCountOrFrenchDone = 1; 5864 goto saveState; 5865 } 5866 dest[i++]=(uint8_t)CE; 5867 } 5868 } 5869 } else { 5870 notIsContinuation = !isContinuation(CE); 5871 if(notIsContinuation) { 5872 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 5873 dest[i++] = UCOL_HIRAGANA_QUAD; 5874 } else { 5875 dest[i++] = 0xFF; 5876 } 5877 } 5878 } 5879 if(uprv_numAvailableExpCEs(s)) { 5880 canUpdateState = FALSE; 5881 } else { 5882 canUpdateState = TRUE; 5883 } 5884 } 5885 } else { 5886 // if we're not doing quaternary 5887 // skip to the end 5888 level = UCOL_PSK_NULL; 5889 } 5890 /* fall through to next level */ 5891 case UCOL_PSK_QUIN: 5892 level = UCOL_PSK_IDENTICAL; 5893 /* fall through to next level */ 5894 case UCOL_PSK_IDENTICAL: 5895 if(strength >= UCOL_IDENTICAL) { 5896 UChar32 first, second; 5897 int32_t bocsuBytesWritten = 0; 5898 // We always need to do identical on 5899 // the NFD form of the string. 5900 if(normIter == NULL) { 5901 // we arrived from the level below and 5902 // normalization was not turned on. 
5903 // therefore, we need to make a fresh NFD iterator 5904 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5905 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5906 } else if(!doingIdenticalFromStart) { 5907 // there is an iterator, but we did some other levels. 5908 // therefore, we have a FCD iterator - need to make 5909 // a NFD one. 5910 // normIter being at the beginning does not guarantee 5911 // that the underlying iterator is at the beginning 5912 iter->move(iter, 0, UITER_START); 5913 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5914 } 5915 // At this point we have a NFD iterator that is positioned 5916 // in the right place 5917 if(U_FAILURE(*status)) { 5918 UTRACE_EXIT_STATUS(*status); 5919 return 0; 5920 } 5921 first = uiter_previous32(s.iterator); 5922 // maybe we're at the start of the string 5923 if(first == U_SENTINEL) { 5924 first = 0; 5925 } else { 5926 uiter_next32(s.iterator); 5927 } 5928 5929 j = 0; 5930 for(;;) { 5931 if(i == count) { 5932 if(j+1 < bocsuBytesWritten) { 5933 bocsuBytesUsed = j+1; 5934 } 5935 goto saveState; 5936 } 5937 5938 // On identical level, we will always save 5939 // the state if we reach this point, since 5940 // we don't depend on getNextCE for content 5941 // all the content is in our buffer and we 5942 // already either stored the full buffer OR 5943 // otherwise we won't arrive here. 
5944 newState = s.iterator->getState(s.iterator); 5945 if(newState != UITER_NO_STATE) { 5946 iterState = newState; 5947 cces = 0; 5948 } 5949 5950 uint8_t buff[4]; 5951 second = uiter_next32(s.iterator); 5952 cces++; 5953 5954 // end condition for identical level 5955 if(second == U_SENTINEL) { 5956 terminatePSKLevel(level, maxLevel, i, dest); 5957 level = UCOL_PSK_NULL; 5958 break; 5959 } 5960 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); 5961 first = second; 5962 5963 j = 0; 5964 if(bocsuBytesUsed != 0) { 5965 while(bocsuBytesUsed-->0) { 5966 j++; 5967 } 5968 } 5969 5970 while(i < count && j < bocsuBytesWritten) { 5971 dest[i++] = buff[j++]; 5972 } 5973 } 5974 5975 } else { 5976 level = UCOL_PSK_NULL; 5977 } 5978 /* fall through to next level */ 5979 case UCOL_PSK_NULL: 5980 j = i; 5981 while(j<count) { 5982 dest[j++]=0; 5983 } 5984 break; 5985 default: 5986 *status = U_INTERNAL_PROGRAM_ERROR; 5987 UTRACE_EXIT_STATUS(*status); 5988 return 0; 5989 } 5990 5991 saveState: 5992 // Now we need to return stuff. First we want to see whether we have 5993 // done everything for the current state of iterator. 5994 if(byteCountOrFrenchDone 5995 || canUpdateState == FALSE 5996 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) 5997 { 5998 // Any of above mean that the previous transaction 5999 // wasn't finished and that we should store the 6000 // previous iterator state. 6001 state[0] = iterState; 6002 } else { 6003 // The transaction is complete. We will continue in the next iteration. 6004 state[0] = s.iterator->getState(s.iterator); 6005 cces = 0; 6006 } 6007 // Store the number of bocsu bytes written. 
6008 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { 6009 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6010 } 6011 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT; 6012 6013 // Next we put in the level of comparison 6014 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); 6015 6016 // If we are doing French, we need to store whether we have just finished the French level 6017 if(level == UCOL_PSK_SECONDARY && doingFrench) { 6018 state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6019 } else { 6020 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6021 } 6022 6023 // Was the latest CE shifted 6024 if(wasShifted) { 6025 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; 6026 } 6027 // Check for cces overflow 6028 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { 6029 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6030 } 6031 // Store cces 6032 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT); 6033 6034 // Check for French overflow 6035 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { 6036 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6037 } 6038 // Store number of bytes written in the French secondary continuation sequence 6039 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); 6040 6041 6042 // If we have used normalizing iterator, get rid of it 6043 if(normIter != NULL) { 6044 unorm_closeIter(normIter); 6045 } 6046 6047 /* To avoid memory leak, free the offset buffer if necessary. */ 6048 ucol_freeOffsetBuffer(&s); 6049 6050 // Return number of meaningful sortkey bytes. 6051 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 6052 dest,i, state[0], state[1]); 6053 UTRACE_EXIT_VALUE(i); 6054 return i; 6055 } 6056 6057 /** 6058 * Produce a bound for a given sortkey and a number of levels. 
6059 */ 6060 U_CAPI int32_t U_EXPORT2 6061 ucol_getBound(const uint8_t *source, 6062 int32_t sourceLength, 6063 UColBoundMode boundType, 6064 uint32_t noOfLevels, 6065 uint8_t *result, 6066 int32_t resultLength, 6067 UErrorCode *status) 6068 { 6069 // consistency checks 6070 if(status == NULL || U_FAILURE(*status)) { 6071 return 0; 6072 } 6073 if(source == NULL) { 6074 *status = U_ILLEGAL_ARGUMENT_ERROR; 6075 return 0; 6076 } 6077 6078 int32_t sourceIndex = 0; 6079 // Scan the string until we skip enough of the key OR reach the end of the key 6080 do { 6081 sourceIndex++; 6082 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { 6083 noOfLevels--; 6084 } 6085 } while (noOfLevels > 0 6086 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 6087 6088 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 6089 && noOfLevels > 0) { 6090 *status = U_SORT_KEY_TOO_SHORT_WARNING; 6091 } 6092 6093 6094 // READ ME: this code assumes that the values for boundType 6095 // enum will not changes. They are set so that the enum value 6096 // corresponds to the number of extra bytes each bound type 6097 // needs. 6098 if(result != NULL && resultLength >= sourceIndex+boundType) { 6099 uprv_memcpy(result, source, sourceIndex); 6100 switch(boundType) { 6101 // Lower bound just gets terminated. 
No extra bytes 6102 case UCOL_BOUND_LOWER: // = 0 6103 break; 6104 // Upper bound needs one extra byte 6105 case UCOL_BOUND_UPPER: // = 1 6106 result[sourceIndex++] = 2; 6107 break; 6108 // Upper long bound needs two extra bytes 6109 case UCOL_BOUND_UPPER_LONG: // = 2 6110 result[sourceIndex++] = 0xFF; 6111 result[sourceIndex++] = 0xFF; 6112 break; 6113 default: 6114 *status = U_ILLEGAL_ARGUMENT_ERROR; 6115 return 0; 6116 } 6117 result[sourceIndex++] = 0; 6118 6119 return sourceIndex; 6120 } else { 6121 return sourceIndex+boundType+1; 6122 } 6123 } 6124 6125 /****************************************************************************/ 6126 /* Following are the functions that deal with the properties of a collator */ 6127 /* there are new APIs and some compatibility APIs */ 6128 /****************************************************************************/ 6129 6130 static inline void 6131 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, 6132 int32_t *primShift, int32_t *secShift, int32_t *terShift) 6133 { 6134 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; 6135 UBool reverseSecondary = FALSE; 6136 UBool continuation = isContinuation(CE); 6137 if(!continuation) { 6138 tertiary = (uint8_t)((CE & coll->tertiaryMask)); 6139 tertiary ^= coll->caseSwitch; 6140 reverseSecondary = TRUE; 6141 } else { 6142 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6143 tertiary &= UCOL_REMOVE_CASE; 6144 reverseSecondary = FALSE; 6145 } 6146 6147 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6148 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6149 primary1 = (uint8_t)(CE >> 8); 6150 6151 if(primary1 != 0) { 6152 if (coll->leadBytePermutationTable != NULL && !continuation) { 6153 primary1 = coll->leadBytePermutationTable[primary1]; 6154 } 6155 6156 coll->latinOneCEs[ch] |= (primary1 << *primShift); 6157 *primShift -= 8; 6158 } 6159 if(primary2 != 0) { 6160 if(*primShift < 0) { 6161 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6162 
coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6163 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6164 return; 6165 } 6166 coll->latinOneCEs[ch] |= (primary2 << *primShift); 6167 *primShift -= 8; 6168 } 6169 if(secondary != 0) { 6170 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary 6171 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary 6172 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); 6173 } else { // normal case 6174 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift); 6175 } 6176 *secShift -= 8; 6177 } 6178 if(tertiary != 0) { 6179 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift); 6180 *terShift -= 8; 6181 } 6182 } 6183 6184 static inline UBool 6185 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { 6186 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); 6187 if(newTable == NULL) { 6188 *status = U_MEMORY_ALLOCATION_ERROR; 6189 coll->latinOneFailed = TRUE; 6190 return FALSE; 6191 } 6192 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t); 6193 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); 6194 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); 6195 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy); 6196 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy); 6197 coll->latinOneTableLen = size; 6198 uprv_free(coll->latinOneCEs); 6199 coll->latinOneCEs = newTable; 6200 return TRUE; 6201 } 6202 6203 static UBool 6204 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { 6205 UBool result = TRUE; 6206 if(coll->latinOneCEs == NULL) { 6207 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3); 6208 if(coll->latinOneCEs == NULL) { 6209 *status = U_MEMORY_ALLOCATION_ERROR; 6210 return FALSE; 6211 } 6212 
coll->latinOneTableLen = UCOL_LATINONETABLELEN; 6213 } 6214 UChar ch = 0; 6215 UCollationElements *it = ucol_openElements(coll, &ch, 1, status); 6216 // Check for null pointer 6217 if (U_FAILURE(*status)) { 6218 ucol_closeElements(it); 6219 return FALSE; 6220 } 6221 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3); 6222 6223 int32_t primShift = 24, secShift = 24, terShift = 24; 6224 uint32_t CE = 0; 6225 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; 6226 6227 // TODO: make safe if you get more than you wanted... 6228 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { 6229 primShift = 24; secShift = 24; terShift = 24; 6230 if(ch < 0x100) { 6231 CE = coll->latinOneMapping[ch]; 6232 } else { 6233 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 6234 if(CE == UCOL_NOT_FOUND && coll->UCA) { 6235 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); 6236 } 6237 } 6238 if(CE < UCOL_NOT_FOUND) { 6239 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6240 } else { 6241 switch (getCETag(CE)) { 6242 case EXPANSION_TAG: 6243 case DIGIT_TAG: 6244 ucol_setText(it, &ch, 1, status); 6245 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { 6246 if(primShift < 0 || secShift < 0 || terShift < 0) { 6247 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6248 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6249 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6250 break; 6251 } 6252 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6253 } 6254 break; 6255 case CONTRACTION_TAG: 6256 // here is the trick 6257 // F2 is contraction. We do something very similar to contractions 6258 // but have two indices, one in the real contraction table and the 6259 // other to where we stuffed things. This hopes that we don't have 6260 // many contractions (this should work for latin-1 tables). 
6261 { 6262 if((CE & 0x00FFF000) != 0) { 6263 *status = U_UNSUPPORTED_ERROR; 6264 goto cleanup_after_failure; 6265 } 6266 6267 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); 6268 6269 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table 6270 6271 coll->latinOneCEs[ch] = CE; 6272 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; 6273 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; 6274 6275 // We're going to jump into contraction table, pick the elements 6276 // and use them 6277 do { 6278 CE = *(coll->contractionCEs + 6279 (UCharOffset - coll->contractionIndex)); 6280 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) { 6281 uint32_t size; 6282 uint32_t i; /* general counter */ 6283 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 6284 size = getExpansionCount(CE); 6285 //CE = *CEOffset++; 6286 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 6287 for(i = 0; i<size; i++) { 6288 if(primShift < 0 || secShift < 0 || terShift < 0) { 6289 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6290 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6291 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6292 break; 6293 } 6294 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 6295 } 6296 } else { /* else, we do */ 6297 while(*CEOffset != 0) { 6298 if(primShift < 0 || secShift < 0 || terShift < 0) { 6299 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6300 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6301 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6302 break; 6303 } 6304 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 6305 
} 6306 } 6307 contractionOffset++; 6308 } else if(CE < UCOL_NOT_FOUND) { 6309 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift); 6310 } else { 6311 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6312 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6313 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6314 contractionOffset++; 6315 } 6316 UCharOffset++; 6317 primShift = 24; secShift = 24; terShift = 24; 6318 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate 6319 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) { 6320 goto cleanup_after_failure; 6321 } 6322 } 6323 } while(*UCharOffset != 0xFFFF); 6324 } 6325 break;; 6326 case SPEC_PROC_TAG: 6327 { 6328 // 0xB7 is a precontext character defined in UCA5.1, a special 6329 // handle is implemeted in order to save LatinOne table for 6330 // most locales. 6331 if (ch==0xb7) { 6332 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6333 } 6334 else { 6335 goto cleanup_after_failure; 6336 } 6337 } 6338 break; 6339 default: 6340 goto cleanup_after_failure; 6341 } 6342 } 6343 } 6344 // compact table 6345 if(contractionOffset < coll->latinOneTableLen) { 6346 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { 6347 goto cleanup_after_failure; 6348 } 6349 } 6350 ucol_closeElements(it); 6351 return result; 6352 6353 cleanup_after_failure: 6354 // status should already be set before arriving here. 
6355 coll->latinOneFailed = TRUE; 6356 ucol_closeElements(it); 6357 return FALSE; 6358 } 6359 6360 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { 6361 if(U_SUCCESS(*status)) { 6362 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6363 coll->caseSwitch = UCOL_CASE_SWITCH; 6364 } else { 6365 coll->caseSwitch = UCOL_NO_CASE_SWITCH; 6366 } 6367 6368 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { 6369 coll->tertiaryMask = UCOL_REMOVE_CASE; 6370 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6371 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */ 6372 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; 6373 coll->tertiaryBottom = UCOL_COMMON_BOT3; 6374 } else { 6375 coll->tertiaryMask = UCOL_KEEP_CASE; 6376 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; 6377 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6378 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; 6379 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; 6380 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; 6381 } else { 6382 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6383 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; 6384 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; 6385 } 6386 } 6387 6388 /* Set the compression values */ 6389 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1); 6390 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */ 6391 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount); 6392 6393 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY 6394 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) 6395 { 6396 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; 6397 } else { 6398 coll->sortKeyGen = ucol_calcSortKey; 6399 } 6400 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF 6401 && 
coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) 6402 { 6403 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { 6404 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it 6405 //fprintf(stderr, "F"); 6406 coll->latinOneUse = TRUE; 6407 } else { 6408 coll->latinOneUse = FALSE; 6409 } 6410 if(*status == U_UNSUPPORTED_ERROR) { 6411 *status = U_ZERO_ERROR; 6412 } 6413 } else { // latin1Table exists and it doesn't need to be regenerated, just use it 6414 coll->latinOneUse = TRUE; 6415 } 6416 } else { 6417 coll->latinOneUse = FALSE; 6418 } 6419 } 6420 } 6421 6422 U_CAPI uint32_t U_EXPORT2 6423 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 6424 if(U_FAILURE(*status) || coll == NULL) { 6425 return 0; 6426 } 6427 if(len == -1) { 6428 len = u_strlen(varTop); 6429 } 6430 if(len == 0) { 6431 *status = U_ILLEGAL_ARGUMENT_ERROR; 6432 return 0; 6433 } 6434 6435 if(coll->delegate!=NULL) { 6436 return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status); 6437 } 6438 6439 6440 collIterate s; 6441 IInit_collIterate(coll, varTop, len, &s, status); 6442 if(U_FAILURE(*status)) { 6443 return 0; 6444 } 6445 6446 uint32_t CE = ucol_IGetNextCE(coll, &s, status); 6447 6448 /* here we check if we have consumed all characters */ 6449 /* you can put in either one character or a contraction */ 6450 /* you shouldn't put more... 
*/ 6451 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { 6452 *status = U_CE_NOT_FOUND_ERROR; 6453 return 0; 6454 } 6455 6456 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); 6457 6458 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { 6459 *status = U_PRIMARY_TOO_LONG_ERROR; 6460 return 0; 6461 } 6462 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { 6463 coll->variableTopValueisDefault = FALSE; 6464 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; 6465 } 6466 6467 /* To avoid memory leak, free the offset buffer if necessary. */ 6468 ucol_freeOffsetBuffer(&s); 6469 6470 return CE & UCOL_PRIMARYMASK; 6471 } 6472 6473 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 6474 if(U_FAILURE(*status) || coll == NULL) { 6475 return 0; 6476 } 6477 if(coll->delegate!=NULL) { 6478 return ((const Collator*)coll->delegate)->getVariableTop(*status); 6479 } 6480 return coll->variableTopValue<<16; 6481 } 6482 6483 U_CAPI void U_EXPORT2 6484 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 6485 if(U_FAILURE(*status) || coll == NULL) { 6486 return; 6487 } 6488 6489 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { 6490 coll->variableTopValueisDefault = FALSE; 6491 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; 6492 } 6493 } 6494 /* Attribute setter API */ 6495 U_CAPI void U_EXPORT2 6496 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 6497 if(U_FAILURE(*status) || coll == NULL) { 6498 return; 6499 } 6500 6501 if(coll->delegate != NULL) { 6502 ((Collator*)coll->delegate)->setAttribute(attr,value,*status); 6503 return; 6504 } 6505 6506 UColAttributeValue oldFrench = coll->frenchCollation; 6507 UColAttributeValue oldCaseFirst = coll->caseFirst; 6508 switch(attr) { 6509 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ 6510 if(value == UCOL_ON) { 6511 coll->numericCollation = 
UCOL_ON; 6512 coll->numericCollationisDefault = FALSE; 6513 } else if (value == UCOL_OFF) { 6514 coll->numericCollation = UCOL_OFF; 6515 coll->numericCollationisDefault = FALSE; 6516 } else if (value == UCOL_DEFAULT) { 6517 coll->numericCollationisDefault = TRUE; 6518 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; 6519 } else { 6520 *status = U_ILLEGAL_ARGUMENT_ERROR; 6521 } 6522 break; 6523 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ 6524 if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) { 6525 // This attribute is an implementation detail of the CLDR Japanese tailoring. 6526 // The implementation might change to use a different mechanism 6527 // to achieve the same Japanese sort order. 6528 // Since ICU 50, this attribute is not settable any more via API functions. 6529 } else { 6530 *status = U_ILLEGAL_ARGUMENT_ERROR; 6531 } 6532 break; 6533 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 6534 if(value == UCOL_ON) { 6535 coll->frenchCollation = UCOL_ON; 6536 coll->frenchCollationisDefault = FALSE; 6537 } else if (value == UCOL_OFF) { 6538 coll->frenchCollation = UCOL_OFF; 6539 coll->frenchCollationisDefault = FALSE; 6540 } else if (value == UCOL_DEFAULT) { 6541 coll->frenchCollationisDefault = TRUE; 6542 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation; 6543 } else { 6544 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6545 } 6546 break; 6547 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 6548 if(value == UCOL_SHIFTED) { 6549 coll->alternateHandling = UCOL_SHIFTED; 6550 coll->alternateHandlingisDefault = FALSE; 6551 } else if (value == UCOL_NON_IGNORABLE) { 6552 coll->alternateHandling = UCOL_NON_IGNORABLE; 6553 coll->alternateHandlingisDefault = FALSE; 6554 } else if (value == UCOL_DEFAULT) { 6555 coll->alternateHandlingisDefault = TRUE; 6556 coll->alternateHandling = 
(UColAttributeValue)coll->options->alternateHandling ; 6557 } else { 6558 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6559 } 6560 break; 6561 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 6562 if(value == UCOL_LOWER_FIRST) { 6563 coll->caseFirst = UCOL_LOWER_FIRST; 6564 coll->caseFirstisDefault = FALSE; 6565 } else if (value == UCOL_UPPER_FIRST) { 6566 coll->caseFirst = UCOL_UPPER_FIRST; 6567 coll->caseFirstisDefault = FALSE; 6568 } else if (value == UCOL_OFF) { 6569 coll->caseFirst = UCOL_OFF; 6570 coll->caseFirstisDefault = FALSE; 6571 } else if (value == UCOL_DEFAULT) { 6572 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; 6573 coll->caseFirstisDefault = TRUE; 6574 } else { 6575 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6576 } 6577 break; 6578 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 6579 if(value == UCOL_ON) { 6580 coll->caseLevel = UCOL_ON; 6581 coll->caseLevelisDefault = FALSE; 6582 } else if (value == UCOL_OFF) { 6583 coll->caseLevel = UCOL_OFF; 6584 coll->caseLevelisDefault = FALSE; 6585 } else if (value == UCOL_DEFAULT) { 6586 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; 6587 coll->caseLevelisDefault = TRUE; 6588 } else { 6589 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6590 } 6591 break; 6592 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 6593 if(value == UCOL_ON) { 6594 coll->normalizationMode = UCOL_ON; 6595 coll->normalizationModeisDefault = FALSE; 6596 initializeFCD(status); 6597 } else if (value == UCOL_OFF) { 6598 coll->normalizationMode = UCOL_OFF; 6599 coll->normalizationModeisDefault = FALSE; 6600 } else if (value == UCOL_DEFAULT) { 6601 coll->normalizationModeisDefault = TRUE; 6602 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode; 6603 if(coll->normalizationMode == UCOL_ON) { 6604 initializeFCD(status); 6605 } 6606 } else { 6607 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6608 } 6609 break; 6610 case UCOL_STRENGTH: /* attribute for strength */ 6611 if 
(value == UCOL_DEFAULT) { 6612 coll->strengthisDefault = TRUE; 6613 coll->strength = (UColAttributeValue)coll->options->strength; 6614 } else if (value <= UCOL_IDENTICAL) { 6615 coll->strengthisDefault = FALSE; 6616 coll->strength = value; 6617 } else { 6618 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6619 } 6620 break; 6621 case UCOL_ATTRIBUTE_COUNT: 6622 default: 6623 *status = U_ILLEGAL_ARGUMENT_ERROR; 6624 break; 6625 } 6626 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { 6627 coll->latinOneRegenTable = TRUE; 6628 } else { 6629 coll->latinOneRegenTable = FALSE; 6630 } 6631 ucol_updateInternalState(coll, status); 6632 } 6633 6634 U_CAPI UColAttributeValue U_EXPORT2 6635 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { 6636 if(U_FAILURE(*status) || coll == NULL) { 6637 return UCOL_DEFAULT; 6638 } 6639 6640 if(coll->delegate != NULL) { 6641 return ((Collator*)coll->delegate)->getAttribute(attr,*status); 6642 } 6643 6644 switch(attr) { 6645 case UCOL_NUMERIC_COLLATION: 6646 return coll->numericCollation; 6647 case UCOL_HIRAGANA_QUATERNARY_MODE: 6648 return coll->hiraganaQ; 6649 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 6650 return coll->frenchCollation; 6651 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 6652 return coll->alternateHandling; 6653 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 6654 return coll->caseFirst; 6655 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 6656 return coll->caseLevel; 6657 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 6658 return coll->normalizationMode; 6659 case UCOL_STRENGTH: /* attribute for strength */ 6660 return coll->strength; 6661 case UCOL_ATTRIBUTE_COUNT: 6662 default: 6663 *status = U_ILLEGAL_ARGUMENT_ERROR; 6664 break; 6665 } 6666 return UCOL_DEFAULT; 6667 } 6668 6669 U_CAPI void U_EXPORT2 6670 ucol_setStrength( UCollator *coll, 6671 UCollationStrength 
strength) 6672 { 6673 UErrorCode status = U_ZERO_ERROR; 6674 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); 6675 } 6676 6677 U_CAPI UCollationStrength U_EXPORT2 6678 ucol_getStrength(const UCollator *coll) 6679 { 6680 UErrorCode status = U_ZERO_ERROR; 6681 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); 6682 } 6683 6684 U_CAPI int32_t U_EXPORT2 6685 ucol_getReorderCodes(const UCollator *coll, 6686 int32_t *dest, 6687 int32_t destCapacity, 6688 UErrorCode *status) { 6689 if (U_FAILURE(*status)) { 6690 return 0; 6691 } 6692 6693 if(coll->delegate!=NULL) { 6694 return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status); 6695 } 6696 6697 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 6698 *status = U_ILLEGAL_ARGUMENT_ERROR; 6699 return 0; 6700 } 6701 6702 #ifdef UCOL_DEBUG 6703 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength); 6704 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength); 6705 #endif 6706 6707 if (coll->reorderCodesLength > destCapacity) { 6708 *status = U_BUFFER_OVERFLOW_ERROR; 6709 return coll->reorderCodesLength; 6710 } 6711 for (int32_t i = 0; i < coll->reorderCodesLength; i++) { 6712 dest[i] = coll->reorderCodes[i]; 6713 } 6714 return coll->reorderCodesLength; 6715 } 6716 6717 U_CAPI void U_EXPORT2 6718 ucol_setReorderCodes(UCollator* coll, 6719 const int32_t* reorderCodes, 6720 int32_t reorderCodesLength, 6721 UErrorCode *status) { 6722 if (U_FAILURE(*status)) { 6723 return; 6724 } 6725 6726 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) { 6727 *status = U_ILLEGAL_ARGUMENT_ERROR; 6728 return; 6729 } 6730 6731 if(coll->delegate!=NULL) { 6732 ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status); 6733 return; 6734 } 6735 6736 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { 6737 uprv_free(coll->reorderCodes); 6738 } 6739 coll->reorderCodes = NULL; 6740 
coll->freeReorderCodesOnClose = FALSE; 6741 coll->reorderCodesLength = 0; 6742 if (reorderCodesLength == 0) { 6743 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) { 6744 uprv_free(coll->leadBytePermutationTable); 6745 } 6746 coll->leadBytePermutationTable = NULL; 6747 coll->freeLeadBytePermutationTableOnClose = FALSE; 6748 return; 6749 } 6750 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t)); 6751 if (coll->reorderCodes == NULL) { 6752 *status = U_MEMORY_ALLOCATION_ERROR; 6753 return; 6754 } 6755 coll->freeReorderCodesOnClose = TRUE; 6756 for (int32_t i = 0; i < reorderCodesLength; i++) { 6757 coll->reorderCodes[i] = reorderCodes[i]; 6758 } 6759 coll->reorderCodesLength = reorderCodesLength; 6760 ucol_buildPermutationTable(coll, status); 6761 } 6762 6763 U_CAPI int32_t U_EXPORT2 6764 ucol_getEquivalentReorderCodes(int32_t reorderCode, 6765 int32_t* dest, 6766 int32_t destCapacity, 6767 UErrorCode *pErrorCode) { 6768 bool equivalentCodesSet[USCRIPT_CODE_LIMIT]; 6769 uint16_t leadBytes[256]; 6770 int leadBytesCount; 6771 int leadByteIndex; 6772 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT]; 6773 int reorderCodesForLeadByteCount; 6774 int reorderCodeIndex; 6775 6776 int32_t equivalentCodesCount = 0; 6777 int setIndex; 6778 6779 if (U_FAILURE(*pErrorCode)) { 6780 return 0; 6781 } 6782 6783 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 6784 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 6785 return 0; 6786 } 6787 6788 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool)); 6789 6790 const UCollator* uca = ucol_initUCA(pErrorCode); 6791 if (U_FAILURE(*pErrorCode)) { 6792 return 0; 6793 } 6794 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256); 6795 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) { 6796 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte( 6797 uca, leadBytes[leadByteIndex], 
reorderCodesForLeadByte, USCRIPT_CODE_LIMIT); 6798 for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) { 6799 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true; 6800 } 6801 } 6802 6803 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { 6804 if (equivalentCodesSet[setIndex] == true) { 6805 equivalentCodesCount++; 6806 } 6807 } 6808 6809 if (destCapacity == 0) { 6810 return equivalentCodesCount; 6811 } 6812 6813 equivalentCodesCount = 0; 6814 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { 6815 if (equivalentCodesSet[setIndex] == true) { 6816 dest[equivalentCodesCount++] = setIndex; 6817 if (equivalentCodesCount >= destCapacity) { 6818 break; 6819 } 6820 } 6821 } 6822 return equivalentCodesCount; 6823 } 6824 6825 6826 /****************************************************************************/ 6827 /* Following are misc functions */ 6828 /* there are new APIs and some compatibility APIs */ 6829 /****************************************************************************/ 6830 6831 U_CAPI void U_EXPORT2 6832 ucol_getVersion(const UCollator* coll, 6833 UVersionInfo versionInfo) 6834 { 6835 if(coll->delegate!=NULL) { 6836 ((const Collator*)coll->delegate)->getVersion(versionInfo); 6837 return; 6838 } 6839 /* RunTime version */ 6840 uint8_t rtVersion = UCOL_RUNTIME_VERSION; 6841 /* Builder version*/ 6842 uint8_t bdVersion = coll->image->version[0]; 6843 6844 /* Charset Version. 
Need to get the version from cnv files 6845 * makeconv should populate cnv files with version and 6846 * an api has to be provided in ucnv.h to obtain this version 6847 */ 6848 uint8_t csVersion = 0; 6849 6850 /* combine the version info */ 6851 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion)); 6852 6853 /* Tailoring rules */ 6854 versionInfo[0] = (uint8_t)(cmbVersion>>8); 6855 versionInfo[1] = (uint8_t)cmbVersion; 6856 versionInfo[2] = coll->image->version[1]; 6857 if(coll->UCA) { 6858 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */ 6859 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07); 6860 } else { 6861 versionInfo[3] = 0; 6862 } 6863 } 6864 6865 6866 /* This internal API checks whether a character is tailored or not */ 6867 U_CAPI UBool U_EXPORT2 6868 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { 6869 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { 6870 return FALSE; 6871 } 6872 6873 uint32_t CE = UCOL_NOT_FOUND; 6874 const UChar *ContractionStart = NULL; 6875 if(u < 0x100) { /* latin-1 */ 6876 CE = coll->latinOneMapping[u]; 6877 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { 6878 return FALSE; 6879 } 6880 } else { /* regular */ 6881 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); 6882 } 6883 6884 if(isContraction(CE)) { 6885 ContractionStart = (UChar *)coll->image+getContractOffset(CE); 6886 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); 6887 } 6888 6889 return (UBool)(CE != UCOL_NOT_FOUND); 6890 } 6891 6892 6893 /****************************************************************************/ 6894 /* Following are the string compare functions */ 6895 /* */ 6896 /****************************************************************************/ 6897 6898 6899 /* ucol_checkIdent internal function. Does byte level string compare. 
*/ 6900 /* Used by strcoll if strength == identical and strings */ 6901 /* are otherwise equal. */ 6902 /* */ 6903 /* Comparison must be done on NFD normalized strings. */ 6904 /* FCD is not good enough. */ 6905 6906 static 6907 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status) 6908 { 6909 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both 6910 // of same type, but that doesn't really mean that it will stay that way. 6911 int32_t comparison; 6912 6913 if (sColl->flags & UCOL_USE_ITERATOR) { 6914 // The division for the array length may truncate the array size to 6915 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 6916 // for all platforms anyway. 6917 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 6918 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 6919 UNormIterator *sNIt = NULL, *tNIt = NULL; 6920 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 6921 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 6922 sColl->iterator->move(sColl->iterator, 0, UITER_START); 6923 tColl->iterator->move(tColl->iterator, 0, UITER_START); 6924 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); 6925 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); 6926 comparison = u_strCompareIter(sIt, tIt, TRUE); 6927 unorm_closeIter(sNIt); 6928 unorm_closeIter(tNIt); 6929 } else { 6930 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1; 6931 const UChar *sBuf = sColl->string; 6932 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? 
(int32_t)(tColl->endp - tColl->string) : -1; 6933 const UChar *tBuf = tColl->string; 6934 6935 if (normalize) { 6936 *status = U_ZERO_ERROR; 6937 // Note: We could use Normalizer::compare() or similar, but for short strings 6938 // which may not be in FCD it might be faster to just NFD them. 6939 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than 6940 // NFD'ing immediately might be faster for long strings, 6941 // but string comparison is usually done on relatively short strings. 6942 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen), 6943 sColl->writableBuffer, 6944 *status); 6945 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen), 6946 tColl->writableBuffer, 6947 *status); 6948 if(U_FAILURE(*status)) { 6949 return UCOL_LESS; 6950 } 6951 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer); 6952 } else { 6953 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE); 6954 } 6955 } 6956 6957 if (comparison < 0) { 6958 return UCOL_LESS; 6959 } else if (comparison == 0) { 6960 return UCOL_EQUAL; 6961 } else /* comparison > 0 */ { 6962 return UCOL_GREATER; 6963 } 6964 } 6965 6966 /* CEBuf - A struct and some inline functions to handle the saving */ 6967 /* of CEs in a buffer within ucol_strcoll */ 6968 6969 #define UCOL_CEBUF_SIZE 512 6970 typedef struct ucol_CEBuf { 6971 uint32_t *buf; 6972 uint32_t *endp; 6973 uint32_t *pos; 6974 uint32_t localArray[UCOL_CEBUF_SIZE]; 6975 } ucol_CEBuf; 6976 6977 6978 static 6979 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { 6980 (b)->buf = (b)->pos = (b)->localArray; 6981 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; 6982 } 6983 6984 static 6985 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { 6986 uint32_t oldSize; 6987 uint32_t newSize; 6988 uint32_t *newBuf; 6989 6990 ci->flags |= UCOL_ITER_ALLOCATED; 6991 oldSize = (uint32_t)(b->pos - b->buf); 6992 newSize = oldSize * 2; 6993 newBuf = 
(uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); 6994 if(newBuf == NULL) { 6995 *status = U_MEMORY_ALLOCATION_ERROR; 6996 } 6997 else { 6998 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); 6999 if (b->buf != b->localArray) { 7000 uprv_free(b->buf); 7001 } 7002 b->buf = newBuf; 7003 b->endp = b->buf + newSize; 7004 b->pos = b->buf + oldSize; 7005 } 7006 } 7007 7008 static 7009 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) { 7010 if (b->pos == b->endp) { 7011 ucol_CEBuf_Expand(b, ci, status); 7012 } 7013 if (U_SUCCESS(*status)) { 7014 *(b)->pos++ = ce; 7015 } 7016 } 7017 7018 /* This is a trick string compare function that goes in and uses sortkeys to compare */ 7019 /* It is used when compare gets in trouble and needs to bail out */ 7020 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, 7021 collIterate *tColl, 7022 UErrorCode *status) 7023 { 7024 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; 7025 uint8_t *sourceKeyP = sourceKey; 7026 uint8_t *targetKeyP = targetKey; 7027 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; 7028 const UCollator *coll = sColl->coll; 7029 const UChar *source = NULL; 7030 const UChar *target = NULL; 7031 int32_t result = UCOL_EQUAL; 7032 UnicodeString sourceString, targetString; 7033 int32_t sourceLength; 7034 int32_t targetLength; 7035 7036 if(sColl->flags & UCOL_USE_ITERATOR) { 7037 sColl->iterator->move(sColl->iterator, 0, UITER_START); 7038 tColl->iterator->move(tColl->iterator, 0, UITER_START); 7039 UChar32 c; 7040 while((c=sColl->iterator->next(sColl->iterator))>=0) { 7041 sourceString.append((UChar)c); 7042 } 7043 while((c=tColl->iterator->next(tColl->iterator))>=0) { 7044 targetString.append((UChar)c); 7045 } 7046 source = sourceString.getBuffer(); 7047 sourceLength = sourceString.length(); 7048 target = targetString.getBuffer(); 7049 targetLength = targetString.length(); 7050 } else { // no iterators 7051 
sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1; 7052 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1; 7053 source = sColl->string; 7054 target = tColl->string; 7055 } 7056 7057 7058 7059 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 7060 if(sourceKeyLen > UCOL_MAX_BUFFER) { 7061 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); 7062 if(sourceKeyP == NULL) { 7063 *status = U_MEMORY_ALLOCATION_ERROR; 7064 goto cleanup_and_do_compare; 7065 } 7066 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 7067 } 7068 7069 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 7070 if(targetKeyLen > UCOL_MAX_BUFFER) { 7071 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); 7072 if(targetKeyP == NULL) { 7073 *status = U_MEMORY_ALLOCATION_ERROR; 7074 goto cleanup_and_do_compare; 7075 } 7076 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 7077 } 7078 7079 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); 7080 7081 cleanup_and_do_compare: 7082 if(sourceKeyP != NULL && sourceKeyP != sourceKey) { 7083 uprv_free(sourceKeyP); 7084 } 7085 7086 if(targetKeyP != NULL && targetKeyP != targetKey) { 7087 uprv_free(targetKeyP); 7088 } 7089 7090 if(result<0) { 7091 return UCOL_LESS; 7092 } else if(result>0) { 7093 return UCOL_GREATER; 7094 } else { 7095 return UCOL_EQUAL; 7096 } 7097 } 7098 7099 7100 static UCollationResult 7101 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) 7102 { 7103 U_ALIGN_CODE(16); 7104 7105 const UCollator *coll = sColl->coll; 7106 7107 7108 // setting up the collator parameters 7109 UColAttributeValue strength = coll->strength; 7110 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); 7111 7112 UBool checkSecTer = initialCheckSecTer; 7113 UBool checkTertiary = 
(strength >= UCOL_TERTIARY);
    UBool checkQuad = (strength >= UCOL_QUATERNARY);
    UBool checkIdent = (strength == UCOL_IDENTICAL);
    UBool checkCase = (coll->caseLevel == UCOL_ON);
    UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
    UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
    UBool qShifted = shifted && checkQuad;
    UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;

    // Hiragana quaternary combined with shifted alternates is handed off
    // to the sort-key based comparison instead of the loops below.
    if(doHiragana && shifted) {
        return (ucol_compareUsingSortKeys(sColl, tColl, status));
    }
    uint8_t caseSwitch = coll->caseSwitch;
    uint8_t tertiaryMask = coll->tertiaryMask;

    // This is the lowest primary value that will not be ignored if shifted
    uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;

    UCollationResult result = UCOL_EQUAL;
    UCollationResult hirResult = UCOL_EQUAL;

    // Preparing the CE buffers. They will be filled during the primary phase
    ucol_CEBuf   sCEs;
    ucol_CEBuf   tCEs;
    UCOL_INIT_CEBUF(&sCEs);
    UCOL_INIT_CEBUF(&tCEs);

    uint32_t secS = 0, secT = 0;
    uint32_t sOrder=0, tOrder=0;

    // ---- Primary level: fetch CEs from both sides, buffer them, compare primaries ----
    // NOTE(review): *status is not inspected inside the fetch loops; UCOL_CEBUF_PUT
    // stores nothing once *status is a failure - confirm the later passes over the
    // buffers cannot run past the stored CEs in that case.

    // Non shifted primary processing is quite simple
    if(!shifted) {
        for(;;) {
            // We fetch CEs until we hit a non ignorable primary or end.
            uint32_t sPrimary;
            do {
                // We get the next CE
                sOrder = ucol_IGetNextCE(coll, sColl, status);
                // Stuff it in the buffer
                UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                // And keep just the primary part.
                sPrimary = sOrder & UCOL_PRIMARYMASK;
            } while(sPrimary == 0);

            // see the comments on the above block
            uint32_t tPrimary;
            do {
                tOrder = ucol_IGetNextCE(coll, tColl, status);
                UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                tPrimary = tOrder & UCOL_PRIMARYMASK;
            } while(tPrimary == 0);

            // if both primaries are the same
            if(sPrimary == tPrimary) {
                // and there are no more CEs, we advance to the next level
                if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                }
                // Remember the first Hiragana mismatch; it is only used after the
                // other levels have compared equal.
                if(doHiragana && hirResult == UCOL_EQUAL) {
                    if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
                        hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
                            ? UCOL_LESS:UCOL_GREATER;
                    }
                }
            } else {
                // only need to check one for continuation
                // if one is then the other must be or the preceding CE would be a prefix of the other
                if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
                    sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] << 24) | (sPrimary & 0x00FFFFFF);
                    tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] << 24) | (tPrimary & 0x00FFFFFF);
                }
                // if two primaries are different, we are done
                result = (sPrimary < tPrimary) ? UCOL_LESS: UCOL_GREATER;
                goto commonReturn;
            }
        } // no primary difference... do the rest from the buffers
    } else { // shifted - do a slightly more complicated processing :)
        for(;;) {
            UBool sInShifted = FALSE;
            UBool tInShifted = FALSE;
            // This version of code can be refactored. However, it seems easier to understand this way.
            // Source loop. Same as the target loop.
            for(;;) {
                sOrder = ucol_IGetNextCE(coll, sColl, status);
                if(sOrder == UCOL_NO_MORE_CES) {
                    UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                    break;
                } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
                    /* UCA amendment - ignore ignorables that follow shifted code points */
                    continue;
                } else if(isContinuation(sOrder)) {
                    if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
                        if(sInShifted) {
                            sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            break;
                        }
                    } else { /* Just lower level values */
                        if(sInShifted) {
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        }
                    }
                } else { /* regular */
                    if(coll->leadBytePermutationTable != NULL){
                        sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
                    }
                    if((sOrder & UCOL_PRIMARYMASK) > LVT) {
                        /* primary above the variable-top threshold: a real primary, stop here */
                        UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                        break;
                    } else {
                        if((sOrder & UCOL_PRIMARYMASK) > 0) {
                            /* shifted CE: keep only its primary bits in the buffer */
                            sInShifted = TRUE;
                            sOrder &= UCOL_PRIMARYMASK;
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            sInShifted = FALSE;
                            continue;
                        }
                    }
                }
            }
            sOrder &= UCOL_PRIMARYMASK;
            sInShifted = FALSE;

            /* Target loop: mirrors the source loop above. */
            for(;;) {
                tOrder = ucol_IGetNextCE(coll, tColl, status);
                if(tOrder == UCOL_NO_MORE_CES) {
                    UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                    break;
                } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
                    /* UCA amendment - ignore ignorables that follow shifted code points */
                    continue;
                } else if(isContinuation(tOrder)) {
                    if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
                        if(tInShifted) {
                            tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            break;
                        }
                    } else { /* Just lower level values */
                        if(tInShifted) {
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        }
                    }
                } else { /* regular */
                    if(coll->leadBytePermutationTable != NULL){
                        tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
                    }
                    if((tOrder & UCOL_PRIMARYMASK) > LVT) {
                        UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                        break;
                    } else {
                        if((tOrder & UCOL_PRIMARYMASK) > 0) {
                            tInShifted = TRUE;
                            tOrder &= UCOL_PRIMARYMASK;
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            tInShifted = FALSE;
                            continue;
                        }
                    }
                }
            }
            tOrder &= UCOL_PRIMARYMASK;
            tInShifted = FALSE;

            if(sOrder == tOrder) {
                /*
                if(doHiragana && hirResult == UCOL_EQUAL) {
                if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
                hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
                ? UCOL_LESS:UCOL_GREATER;
                }
                }
                */
                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                } else {
                    sOrder = 0;
                    tOrder = 0;
                    continue;
                }
            } else {
                result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        } /* no primary difference... do the rest from the buffers */
    }

    /* now, we're gonna reexamine collected CEs */
    uint32_t    *sCE;
    uint32_t    *tCE;

    /* This is the secondary level of comparison */
    if(checkSecTer) {
        if(!isFrenchSec) { /* normal */
            /* walk both buffers forwards, skipping CEs whose secondary weight is 0 */
            sCE = sCEs.buf;
            tCE = tCEs.buf;
            for(;;) {
                while (secS == 0) {
                    secS = *(sCE++) & UCOL_SECONDARYMASK;
                }

                while(secT == 0) {
                    secT = *(tCE++) & UCOL_SECONDARYMASK;
                }

                if(secS == secT) {
                    if(secS == UCOL_NO_MORE_CES_SECONDARY) {
                        break;
                    } else {
                        secS = 0; secT = 0;
                        continue;
                    }
                } else {
                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                    goto commonReturn;
                }
            }
        } else { /* do the French */
            /* walk both buffers backwards; a continuation run is located going backwards */
            /* and then read forwards, with xCESave marking where to resume afterwards    */
            uint32_t *sCESave = NULL;
            uint32_t *tCESave = NULL;
            sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
            tCE = tCEs.pos-2;
            for(;;) {
                while (secS == 0 && sCE >= sCEs.buf) {
                    if(sCESave == NULL) {
                        secS = *(sCE--);
                        if(isContinuation(secS)) {
                            while(isContinuation(secS = *(sCE--)))
                                ;
                            /* after this, secS has the start of continuation, and sCEs points before that */
                            sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
                            sCE+=2;  /* need to point to the first continuation CP */
                            /* However, now you can just continue doing stuff */
                        }
                    } else {
                        secS = *(sCE++);
                        if(!isContinuation(secS)) { /* This means we have finished with this cont */
                            sCE = sCESave;            /* reset the pointer to before continuation */
                            sCESave = NULL;
                            secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
                            continue;
                        }
                    }
                    secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
                }

                while(secT == 0 && tCE >= tCEs.buf) {
                    if(tCESave == NULL) {
                        secT = *(tCE--);
                        if(isContinuation(secT)) {
                            while(isContinuation(secT = *(tCE--)))
                                ;
                            /* after this, secT has the start of continuation, and tCE points before that */
                            tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
                            tCE+=2;  /* need to point to the first continuation CP */
                            /* However, now you can just continue doing stuff */
                        }
                    } else {
                        secT = *(tCE++);
                        if(!isContinuation(secT)) { /* This means we have finished with this cont */
                            tCE = tCESave;          /* reset the pointer to before continuation */
                            tCESave = NULL;
                            secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
                            continue;
                        }
                    }
                    secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
                }

                if(secS == secT) {
                    if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
                        break;
                    } else {
                        secS = 0; secT = 0;
                        continue;
                    }
                } else {
                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                    goto commonReturn;
                }
            }
        }
    }

    /* doing the case bit */
    /* secS/secT are reused here as they were left by the code above */
    if(checkCase) {
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            while((secS & UCOL_REMOVE_CASE) == 0) {
                if(!isContinuation(*sCE++)) {
                    secS =*(sCE-1);
                    if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
                        // primary ignorables should not be considered on the case level when the strength is primary
                        // otherwise, the CEs stop being well-formed
                        secS &= UCOL_TERT_CASE_MASK;
                        secS ^= caseSwitch;
                    } else {
                        secS = 0;
                    }
                } else {
                    secS = 0;
                }
            }

            while((secT & UCOL_REMOVE_CASE) == 0) {
                if(!isContinuation(*tCE++)) {
                    secT = *(tCE-1);
                    if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
                        // primary ignorables should not be considered on the case level when the strength is primary
                        // otherwise, the CEs stop being well-formed
                        secT &= UCOL_TERT_CASE_MASK;
                        secT ^= caseSwitch;
                    } else {
                        secT = 0;
                    }
                } else {
                    secT = 0;
                }
            }

            if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
                result = UCOL_LESS;
                goto commonReturn;
            } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
                result = UCOL_GREATER;
                goto commonReturn;
            }

            if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
                break;
            } else {
                secS = 0;
                secT = 0;
            }
        }
    }

    /* Tertiary level */
    if(checkTertiary) {
        secS = 0;
        secT = 0;
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            while((secS & UCOL_REMOVE_CASE) == 0) {
                sOrder = *sCE++;
                secS = sOrder & tertiaryMask;
                if(!isContinuation(sOrder)) {
                    secS ^= caseSwitch;
                } else {
                    secS &= UCOL_REMOVE_CASE;
                }
            }

            while((secT & UCOL_REMOVE_CASE)  == 0) {
                tOrder = *tCE++;
                secT = tOrder & tertiaryMask;
                if(!isContinuation(tOrder)) {
                    secT ^= caseSwitch;
                } else {
                    secT &= UCOL_REMOVE_CASE;
                }
            }

            if(secS == secT) {
                if((secS & UCOL_REMOVE_CASE) == 1) {
                    break;
                } else {
                    secS = 0; secT = 0;
                    continue;
                }
            } else {
                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        }
    }


    /* Quaternary level: only evaluated when alternates are shifted */
    if(qShifted /*checkQuad*/) {
        UBool sInShifted = TRUE;
        UBool tInShifted = TRUE;
        secS = 0;
        secT = 0;
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
                secS = *(sCE++);
                if(isContinuation(secS)) {
                    if(!sInShifted) {
                        continue;
                    }
                } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
                    secS = UCOL_PRIMARYMASK;
                    sInShifted = FALSE;
                } else {
                    sInShifted = TRUE;
                }
            }
            secS &= UCOL_PRIMARYMASK;


            while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
                secT = *(tCE++);
                if(isContinuation(secT)) {
                    if(!tInShifted) {
                        continue;
                    }
                } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
                    secT = UCOL_PRIMARYMASK;
                    tInShifted = FALSE;
                } else {
                    tInShifted = TRUE;
                }
            }
            secT &= UCOL_PRIMARYMASK;

            if(secS == secT) {
                if(secS == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                } else {
                    secS = 0; secT = 0;
                    continue;
                }
            } else {
                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        }
    } else if(doHiragana && hirResult != UCOL_EQUAL) {
        // If we're fine on quaternaries, we might be different
        // on Hiragana. This, however, might fail us in shifted.
7565 result = hirResult; 7566 goto commonReturn; 7567 } 7568 7569 /* For IDENTICAL comparisons, we use a bitwise character comparison */ 7570 /* as a tiebreaker if all else is equal. */ 7571 /* Getting here should be quite rare - strings are not identical - */ 7572 /* that is checked first, but compared == through all other checks. */ 7573 if(checkIdent) 7574 { 7575 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON); 7576 result = ucol_checkIdent(sColl, tColl, TRUE, status); 7577 } 7578 7579 commonReturn: 7580 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { 7581 if (sCEs.buf != sCEs.localArray ) { 7582 uprv_free(sCEs.buf); 7583 } 7584 if (tCEs.buf != tCEs.localArray ) { 7585 uprv_free(tCEs.buf); 7586 } 7587 } 7588 7589 return result; 7590 } 7591 7592 static UCollationResult 7593 ucol_strcollRegular(const UCollator *coll, 7594 const UChar *source, int32_t sourceLength, 7595 const UChar *target, int32_t targetLength, 7596 UErrorCode *status) { 7597 collIterate sColl, tColl; 7598 // Preparing the context objects for iterating over strings 7599 IInit_collIterate(coll, source, sourceLength, &sColl, status); 7600 IInit_collIterate(coll, target, targetLength, &tColl, status); 7601 if(U_FAILURE(*status)) { 7602 return UCOL_LESS; 7603 } 7604 return ucol_strcollRegular(&sColl, &tColl, status); 7605 } 7606 7607 static inline uint32_t 7608 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, 7609 uint32_t CE, const UChar *s, int32_t *index, int32_t len) 7610 { 7611 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); 7612 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; 7613 int32_t offset = 1; 7614 UChar schar = 0, tchar = 0; 7615 7616 for(;;) { 7617 if(len == -1) { 7618 if(s[*index] == 0) { // end of string 7619 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7620 } else { 7621 schar = s[*index]; 7622 } 7623 } else { 7624 if(*index == len) { 7625 
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7626 } else { 7627 schar = s[*index]; 7628 } 7629 } 7630 7631 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 7632 offset++; 7633 } 7634 7635 if (schar == tchar) { 7636 (*index)++; 7637 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 7638 } 7639 else 7640 { 7641 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 7642 return UCOL_BAIL_OUT_CE; 7643 } 7644 // skip completely ignorables 7645 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 7646 if(isZeroCE == 0) { // we have to ignore completely ignorables 7647 (*index)++; 7648 continue; 7649 } 7650 7651 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7652 } 7653 } 7654 } 7655 7656 7657 /** 7658 * This is a fast strcoll, geared towards text in Latin-1. 7659 * It supports contractions of size two, French secondaries 7660 * and case switching. You can use it with strengths primary 7661 * to tertiary. It does not support shifted and case level. 7662 * It relies on the table build by setupLatin1Table. If it 7663 * doesn't understand something, it will go to the regular 7664 * strcoll. 
*/
/*
 * ucol_strcollUseLatin1 - Latin-1 fast-path comparison of two UTF-16 strings.
 *
 * Compares level by level (primary, then secondary, then tertiary, as far as
 * coll->strength asks for) using the per-level tables stored back to back in
 * coll->latinOneCEs; each level's table is coll->latinOneTableLen entries long.
 * A table value of 0 is skipped; a non-zero value is compared one byte at a
 * time, most significant byte first.
 *
 * @param coll   collator; this function reads strength, frenchCollation,
 *               latinOneCEs and latinOneTableLen from it
 * @param source first string
 * @param sLen   length of source in UChars, or -1 if source is NUL-terminated
 * @param target second string
 * @param tLen   length of target in UChars, or -1 if target is NUL-terminated
 * @param status passed through to ucol_strcollRegular when falling back
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER; when the fast path gives up
 *         (code unit above 0xFF, a special CE that is not a resolvable
 *         contraction, or French secondary with contractions seen) the result
 *         of ucol_strcollRegular is returned instead.
 */
static UCollationResult
ucol_strcollUseLatin1( const UCollator    *coll,
                       const UChar        *source,
                       int32_t            sLen,
                       const UChar        *target,
                       int32_t            tLen,
                       UErrorCode         *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar sChar = 0, tChar = 0;
    // Current CE of each side; 0 means "fetch the next one".
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    // Start of the table for the level currently being compared.
    uint32_t *elements = coll->latinOneCEs;

    UBool haveContractions = FALSE; // if we have contractions in our string
                                    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if(sLen==-1) {   // handling zero terminated strings
                sChar=source[sIndex++];
                if(sChar==0) {
                    endOfSource = TRUE;
                    break;
                }
            } else {        // handling strings with known length
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
            }
            if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if(tLen==-1) {    // handling zero terminated strings
                tChar=target[tIndex++];
                if(tChar==0) {
                    if(endOfSource) { // this is different than source loop,
                        // as we already know that source loop is done here,
                        // so we can either finish the primary loop if both
                        // strings are done or announce the result if only
                        // target is done. Same below.
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
            } else {          // handling strings with known length
                if(tIndex==tLen) {
                    if(endOfSource) {
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
            }
            if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            // (a side whose remaining bytes shift out to 0 fetches its next CE
            // on the next pass, while the other side keeps its remainder)
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoop:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    // (both sOrder and tOrder are 0 whenever this label is reached)
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[sIndex++];
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[tIndex++];
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[--sIndex];
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[--tIndex];
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoop:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    // Reached only when the requested strength is below tertiary and every
    // level that was compared matched.
    return UCOL_EQUAL;
}

/*
Note: ucol_strcollUTF8 supports null terminated input. Calculating length of
null terminated input string takes extra amount of CPU cycles.
7966 */ 7967 static UCollationResult 7968 ucol_strcollRegularUTF8( 7969 const UCollator *coll, 7970 const char *source, 7971 int32_t sourceLength, 7972 const char *target, 7973 int32_t targetLength, 7974 UErrorCode *status) 7975 { 7976 UCharIterator src; 7977 UCharIterator tgt; 7978 7979 uiter_setUTF8(&src, source, sourceLength); 7980 uiter_setUTF8(&tgt, target, targetLength); 7981 7982 // Preparing the context objects for iterating over strings 7983 collIterate sColl, tColl; 7984 IInit_collIterate(coll, NULL, -1, &sColl, status); 7985 IInit_collIterate(coll, NULL, -1, &tColl, status); 7986 if(U_FAILURE(*status)) { 7987 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 7988 return UCOL_EQUAL; 7989 } 7990 // The division for the array length may truncate the array size to 7991 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 7992 // for all platforms anyway. 7993 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 7994 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 7995 UNormIterator *sNormIter = NULL, *tNormIter = NULL; 7996 7997 sColl.iterator = &src; 7998 sColl.flags |= UCOL_USE_ITERATOR; 7999 tColl.flags |= UCOL_USE_ITERATOR; 8000 tColl.iterator = &tgt; 8001 8002 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 8003 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 8004 sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status); 8005 sColl.flags &= ~UCOL_ITER_NORM; 8006 8007 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 8008 tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status); 8009 tColl.flags &= ~UCOL_ITER_NORM; 8010 } 8011 8012 return ucol_strcollRegular(&sColl, &tColl, status); 8013 } 8014 8015 static inline uint32_t 8016 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength, 8017 uint32_t CE, const char *s, int32_t *index, int32_t len) 8018 { 8019 const UChar *UCharOffset 
= (UChar *)coll->image+getContractOffset(CE&0xFFF); 8020 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; 8021 int32_t offset = 1; 8022 UChar32 schar = 0, tchar = 0; 8023 8024 for(;;) { 8025 if (*index == len) { 8026 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8027 } 8028 U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar); 8029 if (len < 0 && schar == 0) { 8030 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8031 } 8032 8033 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 8034 offset++; 8035 } 8036 8037 if (schar == tchar) { 8038 U8_FWD_1(s, *index, len); 8039 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 8040 } 8041 else 8042 { 8043 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 8044 return UCOL_BAIL_OUT_CE; 8045 } 8046 // skip completely ignorables 8047 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 8048 if(isZeroCE == 0) { // we have to ignore completely ignorables 8049 U8_FWD_1(s, *index, len); 8050 continue; 8051 } 8052 8053 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8054 } 8055 } 8056 } 8057 8058 static inline UCollationResult 8059 ucol_strcollUseLatin1UTF8( 8060 const UCollator *coll, 8061 const char *source, 8062 int32_t sLen, 8063 const char *target, 8064 int32_t tLen, 8065 UErrorCode *status) 8066 { 8067 U_ALIGN_CODE(16); 8068 int32_t strength = coll->strength; 8069 8070 int32_t sIndex = 0, tIndex = 0; 8071 UChar32 sChar = 0, tChar = 0; 8072 uint32_t sOrder=0, tOrder=0; 8073 8074 UBool endOfSource = FALSE; 8075 8076 uint32_t *elements = coll->latinOneCEs; 8077 8078 UBool haveContractions = FALSE; // if we have contractions in our string 8079 // we cannot do French secondary 8080 8081 // Do the primary level 8082 for(;;) { 8083 while(sOrder==0) { // this loop skips primary ignorables 8084 // 
sOrder=getNextlatinOneCE(source); 8085 if (sIndex == sLen) { 8086 endOfSource = TRUE; 8087 break; 8088 } 8089 U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar); 8090 if (sLen < 0 && sChar == 0) { 8091 endOfSource = TRUE; 8092 sLen = sIndex; 8093 break; 8094 } 8095 if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 8096 //fprintf(stderr, "R"); 8097 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); 8098 } 8099 sOrder = elements[sChar]; 8100 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special 8101 // specials can basically be either contractions or bail-out signs. If we get anything 8102 // else, we'll bail out anywasy 8103 if(getCETag(sOrder) == CONTRACTION_TAG) { 8104 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); 8105 haveContractions = TRUE; // if there are contractions, we cannot do French secondary 8106 // However, if there are contractions in the table, but we always use just one char, 8107 // we might be able to do French. This should be checked out. 
8108 } 8109 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 8110 //fprintf(stderr, "S"); 8111 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); 8112 } 8113 } 8114 } 8115 8116 while(tOrder==0) { // this loop skips primary ignorables 8117 // tOrder=getNextlatinOneCE(target); 8118 if (tIndex == tLen) { 8119 if(endOfSource) { 8120 goto endOfPrimLoopU8; 8121 } else { 8122 return UCOL_GREATER; 8123 } 8124 } 8125 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); 8126 if (tLen < 0 && tChar == 0) { 8127 if(endOfSource) { 8128 tLen = tIndex; 8129 goto endOfPrimLoopU8; 8130 } else { 8131 return UCOL_GREATER; 8132 } 8133 } 8134 if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 8135 //fprintf(stderr, "R"); 8136 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); 8137 } 8138 tOrder = elements[tChar]; 8139 if(tOrder >= UCOL_NOT_FOUND) { 8140 // Handling specials, see the comments for source 8141 if(getCETag(tOrder) == CONTRACTION_TAG) { 8142 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); 8143 haveContractions = TRUE; 8144 } 8145 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 8146 //fprintf(stderr, "S"); 8147 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); 8148 } 8149 } 8150 } 8151 if(endOfSource) { // source is finished, but target is not, say the result. 
8152 return UCOL_LESS; 8153 } 8154 8155 if(sOrder == tOrder) { // if we have same CEs, we continue the loop 8156 sOrder = 0; tOrder = 0; 8157 continue; 8158 } else { 8159 // compare current top bytes 8160 if(((sOrder^tOrder)&0xFF000000)!=0) { 8161 // top bytes differ, return difference 8162 if(sOrder < tOrder) { 8163 return UCOL_LESS; 8164 } else if(sOrder > tOrder) { 8165 return UCOL_GREATER; 8166 } 8167 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); 8168 // since we must return enum value 8169 } 8170 8171 // top bytes match, continue with following bytes 8172 sOrder<<=8; 8173 tOrder<<=8; 8174 } 8175 } 8176 8177 endOfPrimLoopU8: 8178 // after primary loop, we definitely know the sizes of strings, 8179 // so we set it and use simpler loop for secondaries and tertiaries 8180 sLen = sIndex; tLen = tIndex; 8181 if(strength >= UCOL_SECONDARY) { 8182 // adjust the table beggining 8183 elements += coll->latinOneTableLen; 8184 endOfSource = FALSE; 8185 8186 if(coll->frenchCollation == UCOL_OFF) { // non French 8187 // This loop is a simplified copy of primary loop 8188 // at this point we know that whole strings are latin-1, so we don't 8189 // check for that. We also know that we only have contractions as 8190 // specials. 
8191 sIndex = 0; tIndex = 0; 8192 for(;;) { 8193 while(sOrder==0) { 8194 if(sIndex==sLen) { 8195 endOfSource = TRUE; 8196 break; 8197 } 8198 U_ASSERT(sLen >= 0); 8199 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar); 8200 U_ASSERT(sChar >= 0 && sChar <= 0xFF); 8201 sOrder = elements[sChar]; 8202 if(sOrder > UCOL_NOT_FOUND) { 8203 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); 8204 } 8205 } 8206 8207 while(tOrder==0) { 8208 if(tIndex==tLen) { 8209 if(endOfSource) { 8210 goto endOfSecLoopU8; 8211 } else { 8212 return UCOL_GREATER; 8213 } 8214 } 8215 U_ASSERT(tLen >= 0); 8216 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); 8217 U_ASSERT(tChar >= 0 && tChar <= 0xFF); 8218 tOrder = elements[tChar]; 8219 if(tOrder > UCOL_NOT_FOUND) { 8220 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); 8221 } 8222 } 8223 if(endOfSource) { 8224 return UCOL_LESS; 8225 } 8226 8227 if(sOrder == tOrder) { 8228 sOrder = 0; tOrder = 0; 8229 continue; 8230 } else { 8231 // see primary loop for comments on this 8232 if(((sOrder^tOrder)&0xFF000000)!=0) { 8233 if(sOrder < tOrder) { 8234 return UCOL_LESS; 8235 } else if(sOrder > tOrder) { 8236 return UCOL_GREATER; 8237 } 8238 } 8239 sOrder<<=8; 8240 tOrder<<=8; 8241 } 8242 } 8243 } else { // French 8244 if(haveContractions) { // if we have contractions, we have to bail out 8245 // since we don't really know how to handle them here 8246 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status); 8247 } 8248 // For French, we go backwards 8249 sIndex = sLen; tIndex = tLen; 8250 for(;;) { 8251 while(sOrder==0) { 8252 if(sIndex==0) { 8253 endOfSource = TRUE; 8254 break; 8255 } 8256 U8_PREV_OR_FFFD(source, 0, sIndex, sChar); 8257 U_ASSERT(sChar >= 0 && sChar <= 0xFF); 8258 sOrder = elements[sChar]; 8259 // don't even look for contractions 8260 } 8261 8262 while(tOrder==0) { 8263 if(tIndex==0) { 8264 if(endOfSource) { 8265 goto endOfSecLoopU8; 8266 } 
else { 8267 return UCOL_GREATER; 8268 } 8269 } 8270 U8_PREV_OR_FFFD(target, 0, tIndex, tChar); 8271 U_ASSERT(tChar >= 0 && tChar <= 0xFF); 8272 tOrder = elements[tChar]; 8273 // don't even look for contractions 8274 } 8275 if(endOfSource) { 8276 return UCOL_LESS; 8277 } 8278 8279 if(sOrder == tOrder) { 8280 sOrder = 0; tOrder = 0; 8281 continue; 8282 } else { 8283 // see the primary loop for comments 8284 if(((sOrder^tOrder)&0xFF000000)!=0) { 8285 if(sOrder < tOrder) { 8286 return UCOL_LESS; 8287 } else if(sOrder > tOrder) { 8288 return UCOL_GREATER; 8289 } 8290 } 8291 sOrder<<=8; 8292 tOrder<<=8; 8293 } 8294 } 8295 } 8296 } 8297 8298 endOfSecLoopU8: 8299 if(strength >= UCOL_TERTIARY) { 8300 // tertiary loop is the same as secondary (except no French) 8301 elements += coll->latinOneTableLen; 8302 sIndex = 0; tIndex = 0; 8303 endOfSource = FALSE; 8304 for(;;) { 8305 while(sOrder==0) { 8306 if(sIndex==sLen) { 8307 endOfSource = TRUE; 8308 break; 8309 } 8310 U_ASSERT(sLen >= 0); 8311 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar); 8312 U_ASSERT(sChar >= 0 && sChar <= 0xFF); 8313 sOrder = elements[sChar]; 8314 if(sOrder > UCOL_NOT_FOUND) { 8315 sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); 8316 } 8317 } 8318 while(tOrder==0) { 8319 if(tIndex==tLen) { 8320 if(endOfSource) { 8321 return UCOL_EQUAL; // if both strings are at the end, they are equal 8322 } else { 8323 return UCOL_GREATER; 8324 } 8325 } 8326 U_ASSERT(tLen >= 0); 8327 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar); 8328 U_ASSERT(tChar >= 0 && tChar <= 0xFF); 8329 tOrder = elements[tChar]; 8330 if(tOrder > UCOL_NOT_FOUND) { 8331 tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); 8332 } 8333 } 8334 if(endOfSource) { 8335 return UCOL_LESS; 8336 } 8337 if(sOrder == tOrder) { 8338 sOrder = 0; tOrder = 0; 8339 continue; 8340 } else { 8341 if(((sOrder^tOrder)&0xff000000)!=0) { 8342 if(sOrder < tOrder) { 8343 return UCOL_LESS; 
8344 } else if(sOrder > tOrder) { 8345 return UCOL_GREATER; 8346 } 8347 } 8348 sOrder<<=8; 8349 tOrder<<=8; 8350 } 8351 } 8352 } 8353 return UCOL_EQUAL; 8354 } 8355 8356 U_CAPI UCollationResult U_EXPORT2 8357 ucol_strcollIter( const UCollator *coll, 8358 UCharIterator *sIter, 8359 UCharIterator *tIter, 8360 UErrorCode *status) 8361 { 8362 if(!status || U_FAILURE(*status)) { 8363 return UCOL_EQUAL; 8364 } 8365 8366 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); 8367 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter); 8368 8369 if (sIter == tIter) { 8370 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8371 return UCOL_EQUAL; 8372 } 8373 if(sIter == NULL || tIter == NULL || coll == NULL) { 8374 *status = U_ILLEGAL_ARGUMENT_ERROR; 8375 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8376 return UCOL_EQUAL; 8377 } 8378 8379 UCollationResult result = UCOL_EQUAL; 8380 8381 // Preparing the context objects for iterating over strings 8382 collIterate sColl, tColl; 8383 IInit_collIterate(coll, NULL, -1, &sColl, status); 8384 IInit_collIterate(coll, NULL, -1, &tColl, status); 8385 if(U_FAILURE(*status)) { 8386 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8387 return UCOL_EQUAL; 8388 } 8389 // The division for the array length may truncate the array size to 8390 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 8391 // for all platforms anyway. 
8392 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8393 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8394 UNormIterator *sNormIter = NULL, *tNormIter = NULL; 8395 8396 sColl.iterator = sIter; 8397 sColl.flags |= UCOL_USE_ITERATOR; 8398 tColl.flags |= UCOL_USE_ITERATOR; 8399 tColl.iterator = tIter; 8400 8401 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 8402 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 8403 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); 8404 sColl.flags &= ~UCOL_ITER_NORM; 8405 8406 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 8407 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); 8408 tColl.flags &= ~UCOL_ITER_NORM; 8409 } 8410 8411 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; 8412 8413 while((sChar = sColl.iterator->next(sColl.iterator)) == 8414 (tChar = tColl.iterator->next(tColl.iterator))) { 8415 if(sChar == U_SENTINEL) { 8416 result = UCOL_EQUAL; 8417 goto end_compare; 8418 } 8419 } 8420 8421 if(sChar == U_SENTINEL) { 8422 tChar = tColl.iterator->previous(tColl.iterator); 8423 } 8424 8425 if(tChar == U_SENTINEL) { 8426 sChar = sColl.iterator->previous(sColl.iterator); 8427 } 8428 8429 sChar = sColl.iterator->previous(sColl.iterator); 8430 tChar = tColl.iterator->previous(tColl.iterator); 8431 8432 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) 8433 { 8434 // We are stopped in the middle of a contraction. 8435 // Scan backwards through the == part of the string looking for the start of the contraction. 8436 // It doesn't matter which string we scan, since they are the same in this region. 
8437 do 8438 { 8439 sChar = sColl.iterator->previous(sColl.iterator); 8440 tChar = tColl.iterator->previous(tColl.iterator); 8441 } 8442 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); 8443 } 8444 8445 8446 if(U_SUCCESS(*status)) { 8447 result = ucol_strcollRegular(&sColl, &tColl, status); 8448 } 8449 8450 end_compare: 8451 if(sNormIter || tNormIter) { 8452 unorm_closeIter(sNormIter); 8453 unorm_closeIter(tNormIter); 8454 } 8455 8456 UTRACE_EXIT_VALUE_STATUS(result, *status) 8457 return result; 8458 } 8459 8460 8461 /* */ 8462 /* ucol_strcoll Main public API string comparison function */ 8463 /* */ 8464 U_CAPI UCollationResult U_EXPORT2 8465 ucol_strcoll( const UCollator *coll, 8466 const UChar *source, 8467 int32_t sourceLength, 8468 const UChar *target, 8469 int32_t targetLength) 8470 { 8471 U_ALIGN_CODE(16); 8472 8473 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); 8474 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 8475 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 8476 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); 8477 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); 8478 } 8479 8480 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) { 8481 // do not crash, but return. Should have 8482 // status argument to return error. 8483 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8484 return UCOL_EQUAL; 8485 } 8486 8487 /* Quick check if source and target are same strings. */ 8488 /* They should either both be NULL terminated or the explicit length should be set on both. */ 8489 if (source==target && sourceLength==targetLength) { 8490 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8491 return UCOL_EQUAL; 8492 } 8493 8494 if(coll->delegate != NULL) { 8495 UErrorCode status = U_ZERO_ERROR; 8496 return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status); 8497 } 8498 8499 /* Scan the strings. 
Find: */ 8500 /* The length of any leading portion that is equal */ 8501 /* Whether they are exactly equal. (in which case we just return) */ 8502 const UChar *pSrc = source; 8503 const UChar *pTarg = target; 8504 int32_t equalLength; 8505 8506 if (sourceLength == -1 && targetLength == -1) { 8507 // Both strings are null terminated. 8508 // Scan through any leading equal portion. 8509 while (*pSrc == *pTarg && *pSrc != 0) { 8510 pSrc++; 8511 pTarg++; 8512 } 8513 if (*pSrc == 0 && *pTarg == 0) { 8514 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8515 return UCOL_EQUAL; 8516 } 8517 equalLength = (int32_t)(pSrc - source); 8518 } 8519 else 8520 { 8521 // One or both strings has an explicit length. 8522 const UChar *pSrcEnd = source + sourceLength; 8523 const UChar *pTargEnd = target + targetLength; 8524 8525 // Scan while the strings are bitwise ==, or until one is exhausted. 8526 for (;;) { 8527 if (pSrc == pSrcEnd || pTarg == pTargEnd) { 8528 break; 8529 } 8530 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 8531 break; 8532 } 8533 if (*pSrc != *pTarg) { 8534 break; 8535 } 8536 pSrc++; 8537 pTarg++; 8538 } 8539 equalLength = (int32_t)(pSrc - source); 8540 8541 // If we made it all the way through both strings, we are done. They are == 8542 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */ 8543 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */ 8544 { 8545 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8546 return UCOL_EQUAL; 8547 } 8548 } 8549 if (equalLength > 0) { 8550 /* There is an identical portion at the beginning of the two strings. */ 8551 /* If the identical portion ends within a contraction or a comibining */ 8552 /* character sequence, back up to the start of that sequence. */ 8553 8554 // These values should already be set by the code above. 
8555 //pSrc = source + equalLength; /* point to the first differing chars */ 8556 //pTarg = target + equalLength; 8557 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) || 8558 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))) 8559 { 8560 // We are stopped in the middle of a contraction. 8561 // Scan backwards through the == part of the string looking for the start of the contraction. 8562 // It doesn't matter which string we scan, since they are the same in this region. 8563 do 8564 { 8565 equalLength--; 8566 pSrc--; 8567 } 8568 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); 8569 } 8570 8571 source += equalLength; 8572 target += equalLength; 8573 if (sourceLength > 0) { 8574 sourceLength -= equalLength; 8575 } 8576 if (targetLength > 0) { 8577 targetLength -= equalLength; 8578 } 8579 } 8580 8581 UErrorCode status = U_ZERO_ERROR; 8582 UCollationResult returnVal; 8583 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { 8584 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status); 8585 } else { 8586 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); 8587 } 8588 UTRACE_EXIT_VALUE(returnVal); 8589 return returnVal; 8590 } 8591 8592 U_CAPI UCollationResult U_EXPORT2 8593 ucol_strcollUTF8( 8594 const UCollator *coll, 8595 const char *source, 8596 int32_t sourceLength, 8597 const char *target, 8598 int32_t targetLength, 8599 UErrorCode *status) 8600 { 8601 U_ALIGN_CODE(16); 8602 8603 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8); 8604 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 8605 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 8606 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength); 8607 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength); 8608 } 8609 8610 if (U_FAILURE(*status)) { 8611 /* do nothing */ 8612 
UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 8613 return UCOL_EQUAL; 8614 } 8615 8616 if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) { 8617 *status = U_ILLEGAL_ARGUMENT_ERROR; 8618 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 8619 return UCOL_EQUAL; 8620 } 8621 8622 /* Quick check if source and target are same strings. */ 8623 /* They should either both be NULL terminated or the explicit length should be set on both. */ 8624 if (source==target && sourceLength==targetLength) { 8625 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 8626 return UCOL_EQUAL; 8627 } 8628 8629 if(coll->delegate != NULL) { 8630 return ((const Collator*)coll->delegate)->compareUTF8( 8631 StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength), 8632 StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength), 8633 *status); 8634 } 8635 8636 /* Scan the strings. Find: */ 8637 /* The length of any leading portion that is equal */ 8638 /* Whether they are exactly equal. (in which case we just return) */ 8639 const char *pSrc = source; 8640 const char *pTarg = target; 8641 UBool bSrcLimit = FALSE; 8642 UBool bTargLimit = FALSE; 8643 8644 if (sourceLength == -1 && targetLength == -1) { 8645 // Both strings are null terminated. 8646 // Scan through any leading equal portion. 8647 while (*pSrc == *pTarg && *pSrc != 0) { 8648 pSrc++; 8649 pTarg++; 8650 } 8651 if (*pSrc == 0 && *pTarg == 0) { 8652 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 8653 return UCOL_EQUAL; 8654 } 8655 bSrcLimit = (*pSrc == 0); 8656 bTargLimit = (*pTarg == 0); 8657 } 8658 else 8659 { 8660 // One or both strings has an explicit length. 8661 const char *pSrcEnd = source + sourceLength; 8662 const char *pTargEnd = target + targetLength; 8663 8664 // Scan while the strings are bitwise ==, or until one is exhausted. 
8665 for (;;) { 8666 if (pSrc == pSrcEnd || pTarg == pTargEnd) { 8667 break; 8668 } 8669 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 8670 break; 8671 } 8672 if (*pSrc != *pTarg) { 8673 break; 8674 } 8675 pSrc++; 8676 pTarg++; 8677 } 8678 bSrcLimit = (pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)); 8679 bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)); 8680 8681 // If we made it all the way through both strings, we are done. They are == 8682 if (bSrcLimit && /* At end of src string, however it was specified. */ 8683 bTargLimit) /* and also at end of dest string */ 8684 { 8685 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 8686 return UCOL_EQUAL; 8687 } 8688 } 8689 8690 U_ASSERT(!(bSrcLimit && bTargLimit)); 8691 8692 int32_t equalLength = pSrc - source; 8693 UBool bSawNonLatin1 = FALSE; 8694 8695 if (equalLength > 0) { 8696 // Align position to the start of UTF-8 code point. 8697 if (bTargLimit) { 8698 U8_SET_CP_START((const uint8_t*)source, 0, equalLength); 8699 } else { 8700 U8_SET_CP_START((const uint8_t*)target, 0, equalLength); 8701 } 8702 pSrc = source + equalLength; 8703 pTarg = target + equalLength; 8704 } 8705 8706 if (equalLength > 0) { 8707 /* There is an identical portion at the beginning of the two strings. */ 8708 /* If the identical portion ends within a contraction or a comibining */ 8709 /* character sequence, back up to the start of that sequence. 
*/ 8710 UBool bUnsafeCP = FALSE; 8711 UChar32 uc32 = -1; 8712 8713 if (!bSrcLimit) { 8714 U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32); 8715 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { 8716 bUnsafeCP = TRUE; 8717 } 8718 bSawNonLatin1 |= (uc32 > 0xff); 8719 } 8720 if (!bTargLimit) { 8721 U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32); 8722 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { 8723 bUnsafeCP = TRUE; 8724 } 8725 bSawNonLatin1 |= (uc32 > 0xff); 8726 } 8727 8728 if (bUnsafeCP) { 8729 while (equalLength > 0) { 8730 // We are stopped in the middle of a contraction. 8731 // Scan backwards through the == part of the string looking for the start of the contraction. 8732 // It doesn't matter which string we scan, since they are the same in this region. 8733 U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32); 8734 bSawNonLatin1 |= (uc32 > 0xff); 8735 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) { 8736 break; 8737 } 8738 } 8739 } 8740 source += equalLength; 8741 target += equalLength; 8742 if (sourceLength > 0) { 8743 sourceLength -= equalLength; 8744 } 8745 if (targetLength > 0) { 8746 targetLength -= equalLength; 8747 } 8748 } else { 8749 // Lead byte of Latin 1 character is 0x00 - 0xC3 8750 bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3); 8751 bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3); 8752 } 8753 8754 UCollationResult returnVal; 8755 8756 if(!coll->latinOneUse || bSawNonLatin1) { 8757 returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status); 8758 } else { 8759 returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status); 8760 } 8761 UTRACE_EXIT_VALUE_STATUS(returnVal, *status); 8762 return returnVal; 8763 } 8764 8765 8766 /* convenience function for comparing strings */ 8767 U_CAPI UBool U_EXPORT2 8768 ucol_greater( const UCollator 
*coll, 8769 const UChar *source, 8770 int32_t sourceLength, 8771 const UChar *target, 8772 int32_t targetLength) 8773 { 8774 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8775 == UCOL_GREATER); 8776 } 8777 8778 /* convenience function for comparing strings */ 8779 U_CAPI UBool U_EXPORT2 8780 ucol_greaterOrEqual( const UCollator *coll, 8781 const UChar *source, 8782 int32_t sourceLength, 8783 const UChar *target, 8784 int32_t targetLength) 8785 { 8786 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8787 != UCOL_LESS); 8788 } 8789 8790 /* convenience function for comparing strings */ 8791 U_CAPI UBool U_EXPORT2 8792 ucol_equal( const UCollator *coll, 8793 const UChar *source, 8794 int32_t sourceLength, 8795 const UChar *target, 8796 int32_t targetLength) 8797 { 8798 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8799 == UCOL_EQUAL); 8800 } 8801 8802 U_CAPI void U_EXPORT2 8803 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { 8804 if(coll && coll->UCA) { 8805 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); 8806 } 8807 } 8808 8809 #endif /* #if !UCONFIG_NO_COLLATION */ 8810