/*
*******************************************************************************
*   Copyright (C) 1996-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  ucol.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
* Modification history
* Date        Name      Comments
* 1996-1999   various members of ICU team maintained C API for collation framework
* 02/16/2001  synwee    Added internal method getPrevSpecialCE
* 03/01/2001  synwee    Added maxexpansion functionality.
* 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
*/

#include "unicode/utypes.h"

// Everything below is compiled only when collation support is enabled.
#if !UCONFIG_NO_COLLATION

#include "unicode/bytestream.h"
#include "unicode/coleitr.h"
#include "unicode/unorm.h"
#include "unicode/udata.h"
#include "unicode/ustring.h"
#include "unicode/utf8.h"

#include "ucol_imp.h"
#include "bocsu.h"

#include "normalizer2impl.h"
#include "unorm_it.h"
#include "umutex.h"
#include "cmemory.h"
#include "ucln_in.h"
#include "cstring.h"
#include "utracimp.h"
#include "putilimp.h"
#include "uassert.h"
#include "unicode/coll.h"

#ifdef UCOL_DEBUG
#include <stdio.h>
#endif

U_NAMESPACE_USE

// Element count of a true array, as int32_t. Not valid for pointers.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

#define LAST_BYTE_MASK_ 0xFF
#define SECOND_LAST_BYTE_SHIFT_ 8

#define ZERO_CC_LIMIT_            0xC0

// These are static pointers to the NFC/NFD implementation instance.
// Each of them is always the same between calls to u_cleanup
// and therefore writing to it is not synchronized.
60 // They are cleaned in ucol_cleanup 61 static const Normalizer2 *g_nfd = NULL; 62 static const Normalizer2Impl *g_nfcImpl = NULL; 63 64 // These are values from UCA required for 65 // implicit generation and supressing sort key compression 66 // they should regularly be in the UCA, but if one 67 // is running without UCA, it could be a problem 68 static const int32_t maxRegularPrimary = 0x7A; 69 static const int32_t minImplicitPrimary = 0xE0; 70 static const int32_t maxImplicitPrimary = 0xE4; 71 72 U_CDECL_BEGIN 73 static UBool U_CALLCONV 74 ucol_cleanup(void) 75 { 76 g_nfd = NULL; 77 g_nfcImpl = NULL; 78 return TRUE; 79 } 80 81 static int32_t U_CALLCONV 82 _getFoldingOffset(uint32_t data) { 83 return (int32_t)(data&0xFFFFFF); 84 } 85 86 U_CDECL_END 87 88 static inline 89 UBool initializeNFD(UErrorCode *status) { 90 if (g_nfd != NULL) { 91 return TRUE; 92 } else { 93 // The result is constant, until the library is reloaded. 94 g_nfd = Normalizer2Factory::getNFDInstance(*status); 95 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 96 return U_SUCCESS(*status); 97 } 98 } 99 100 // init FCD data 101 static inline 102 UBool initializeFCD(UErrorCode *status) { 103 if (g_nfcImpl != NULL) { 104 return TRUE; 105 } else { 106 // The result is constant, until the library is reloaded. 107 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); 108 // Note: Alternatively, we could also store this pointer in each collIterate struct, 109 // same as Normalizer2Factory::getImpl(collIterate->nfd). 
110 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 111 return U_SUCCESS(*status); 112 } 113 } 114 115 static 116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, 117 int32_t sourceLen, collIterate *s, 118 UErrorCode *status) 119 { 120 (s)->string = (s)->pos = sourceString; 121 (s)->origFlags = 0; 122 (s)->flags = 0; 123 if (sourceLen >= 0) { 124 s->flags |= UCOL_ITER_HASLEN; 125 (s)->endp = (UChar *)sourceString+sourceLen; 126 } 127 else { 128 /* change to enable easier checking for end of string for fcdpositon */ 129 (s)->endp = NULL; 130 } 131 (s)->extendCEs = NULL; 132 (s)->extendCEsSize = 0; 133 (s)->CEpos = (s)->toReturn = (s)->CEs; 134 (s)->offsetBuffer = NULL; 135 (s)->offsetBufferSize = 0; 136 (s)->offsetReturn = (s)->offsetStore = NULL; 137 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; 138 (s)->coll = (collator); 139 if (initializeNFD(status)) { 140 (s)->nfd = g_nfd; 141 } else { 142 return; 143 } 144 (s)->fcdPosition = 0; 145 if(collator->normalizationMode == UCOL_ON) { 146 (s)->flags |= UCOL_ITER_NORM; 147 } 148 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { 149 (s)->flags |= UCOL_HIRAGANA_Q; 150 } 151 (s)->iterator = NULL; 152 //(s)->iteratorIndex = 0; 153 } 154 155 U_CAPI void U_EXPORT2 156 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, 157 int32_t sourceLen, collIterate *s, 158 UErrorCode *status) { 159 /* Out-of-line version for use from other files. 
*/ 160 IInit_collIterate(collator, sourceString, sourceLen, s, status); 161 } 162 163 U_CAPI collIterate * U_EXPORT2 164 uprv_new_collIterate(UErrorCode *status) { 165 if(U_FAILURE(*status)) { 166 return NULL; 167 } 168 collIterate *s = new collIterate; 169 if(s == NULL) { 170 *status = U_MEMORY_ALLOCATION_ERROR; 171 return NULL; 172 } 173 return s; 174 } 175 176 U_CAPI void U_EXPORT2 177 uprv_delete_collIterate(collIterate *s) { 178 delete s; 179 } 180 181 U_CAPI UBool U_EXPORT2 182 uprv_collIterateAtEnd(collIterate *s) { 183 return s == NULL || s->pos == s->endp; 184 } 185 186 /** 187 * Backup the state of the collIterate struct data 188 * @param data collIterate to backup 189 * @param backup storage 190 */ 191 static 192 inline void backupState(const collIterate *data, collIterateState *backup) 193 { 194 backup->fcdPosition = data->fcdPosition; 195 backup->flags = data->flags; 196 backup->origFlags = data->origFlags; 197 backup->pos = data->pos; 198 backup->bufferaddress = data->writableBuffer.getBuffer(); 199 backup->buffersize = data->writableBuffer.length(); 200 backup->iteratorMove = 0; 201 backup->iteratorIndex = 0; 202 if(data->iterator != NULL) { 203 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); 204 backup->iteratorIndex = data->iterator->getState(data->iterator); 205 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE 206 if(backup->iteratorIndex == UITER_NO_STATE) { 207 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { 208 backup->iteratorMove++; 209 data->iterator->move(data->iterator, -1, UITER_CURRENT); 210 } 211 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 212 } 213 } 214 } 215 216 /** 217 * Loads the state into the collIterate struct data 218 * @param data collIterate to backup 219 * @param backup storage 220 * @param forwards boolean to indicate if forwards iteration is used, 221 * false indicates 
backwards iteration 222 */ 223 static 224 inline void loadState(collIterate *data, const collIterateState *backup, 225 UBool forwards) 226 { 227 UErrorCode status = U_ZERO_ERROR; 228 data->flags = backup->flags; 229 data->origFlags = backup->origFlags; 230 if(data->iterator != NULL) { 231 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); 232 data->iterator->setState(data->iterator, backup->iteratorIndex, &status); 233 if(backup->iteratorMove != 0) { 234 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 235 } 236 } 237 data->pos = backup->pos; 238 239 if ((data->flags & UCOL_ITER_INNORMBUF) && 240 data->writableBuffer.getBuffer() != backup->bufferaddress) { 241 /* 242 this is when a new buffer has been reallocated and we'll have to 243 calculate the new position. 244 note the new buffer has to contain the contents of the old buffer. 245 */ 246 if (forwards) { 247 data->pos = data->writableBuffer.getTerminatedBuffer() + 248 (data->pos - backup->bufferaddress); 249 } 250 else { 251 /* backwards direction */ 252 int32_t temp = backup->buffersize - 253 (int32_t)(data->pos - backup->bufferaddress); 254 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp); 255 } 256 } 257 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 258 /* 259 this is alittle tricky. 260 if we are initially not in the normalization buffer, even if we 261 normalize in the later stage, the data in the buffer will be 262 ignored, since we skip back up to the data string. 263 however if we are already in the normalization buffer, any 264 further normalization will pull data into the normalization 265 buffer and modify the fcdPosition. 266 since we are keeping the data in the buffer for use, the 267 fcdPosition can not be reverted back. 268 arrgghh.... 
269 */ 270 data->fcdPosition = backup->fcdPosition; 271 } 272 } 273 274 static UBool 275 reallocCEs(collIterate *data, int32_t newCapacity) { 276 uint32_t *oldCEs = data->extendCEs; 277 if(oldCEs == NULL) { 278 oldCEs = data->CEs; 279 } 280 int32_t length = data->CEpos - oldCEs; 281 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); 282 if(newCEs == NULL) { 283 return FALSE; 284 } 285 uprv_memcpy(newCEs, oldCEs, length * 4); 286 uprv_free(data->extendCEs); 287 data->extendCEs = newCEs; 288 data->extendCEsSize = newCapacity; 289 data->CEpos = newCEs + length; 290 return TRUE; 291 } 292 293 static UBool 294 increaseCEsCapacity(collIterate *data) { 295 int32_t oldCapacity; 296 if(data->extendCEs != NULL) { 297 oldCapacity = data->extendCEsSize; 298 } else { 299 oldCapacity = LENGTHOF(data->CEs); 300 } 301 return reallocCEs(data, 2 * oldCapacity); 302 } 303 304 static UBool 305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) { 306 int32_t oldCapacity; 307 if(data->extendCEs != NULL) { 308 oldCapacity = data->extendCEsSize; 309 } else { 310 oldCapacity = LENGTHOF(data->CEs); 311 } 312 if(minCapacity <= oldCapacity) { 313 return TRUE; 314 } 315 oldCapacity *= 2; 316 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity); 317 } 318 319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) { 320 if(U_FAILURE(errorCode)) { 321 return; 322 } 323 int32_t length = offsetStore == NULL ? 
0 : (int32_t)(offsetStore - offsetBuffer); 324 U_ASSERT(length >= offsetBufferSize || offsetStore != NULL); 325 if(length >= offsetBufferSize) { 326 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE; 327 int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4)); 328 if(newBuffer == NULL) { 329 errorCode = U_MEMORY_ALLOCATION_ERROR; 330 return; 331 } 332 if(length > 0) { 333 uprv_memcpy(newBuffer, offsetBuffer, length * 4); 334 } 335 uprv_free(offsetBuffer); 336 offsetBuffer = newBuffer; 337 offsetStore = offsetBuffer + length; 338 offsetBufferSize = newCapacity; 339 } 340 *offsetStore++ = offset; 341 } 342 343 /* 344 * collIter_eos() 345 * Checks for a collIterate being positioned at the end of 346 * its source string. 347 * 348 */ 349 static 350 inline UBool collIter_eos(collIterate *s) { 351 if(s->flags & UCOL_USE_ITERATOR) { 352 return !(s->iterator->hasNext(s->iterator)); 353 } 354 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { 355 // Null terminated string, but not at null, so not at end. 356 // Whether in main or normalization buffer doesn't matter. 357 return FALSE; 358 } 359 360 // String with length. Can't be in normalization buffer, which is always 361 // null termintated. 362 if (s->flags & UCOL_ITER_HASLEN) { 363 return (s->pos == s->endp); 364 } 365 366 // We are at a null termination, could be either normalization buffer or main string. 367 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { 368 // At null at end of main string. 369 return TRUE; 370 } 371 372 // At null at end of normalization buffer. Need to check whether there there are 373 // any characters left in the main buffer. 374 if(s->origFlags & UCOL_USE_ITERATOR) { 375 return !(s->iterator->hasNext(s->iterator)); 376 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { 377 // Null terminated main string. fcdPosition is the 'return' position into main buf. 378 return (*s->fcdPosition == 0); 379 } 380 else { 381 // Main string with an end pointer. 
382 return s->fcdPosition == s->endp; 383 } 384 } 385 386 /* 387 * collIter_bos() 388 * Checks for a collIterate being positioned at the start of 389 * its source string. 390 * 391 */ 392 static 393 inline UBool collIter_bos(collIterate *source) { 394 // if we're going backwards, we need to know whether there is more in the 395 // iterator, even if we are in the side buffer 396 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 397 return !source->iterator->hasPrevious(source->iterator); 398 } 399 if (source->pos <= source->string || 400 ((source->flags & UCOL_ITER_INNORMBUF) && 401 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { 402 return TRUE; 403 } 404 return FALSE; 405 } 406 407 /*static 408 inline UBool collIter_SimpleBos(collIterate *source) { 409 // if we're going backwards, we need to know whether there is more in the 410 // iterator, even if we are in the side buffer 411 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 412 return !source->iterator->hasPrevious(source->iterator); 413 } 414 if (source->pos == source->string) { 415 return TRUE; 416 } 417 return FALSE; 418 }*/ 419 //return (data->pos == data->string) || 420 421 422 /****************************************************************************/ 423 /* Following are the open/close functions */ 424 /* */ 425 /****************************************************************************/ 426 427 static UCollator* 428 ucol_initFromBinary(const uint8_t *bin, int32_t length, 429 const UCollator *base, 430 UCollator *fillIn, 431 UErrorCode *status) 432 { 433 UCollator *result = fillIn; 434 if(U_FAILURE(*status)) { 435 return NULL; 436 } 437 /* 438 if(base == NULL) { 439 // we don't support null base yet 440 *status = U_ILLEGAL_ARGUMENT_ERROR; 441 return NULL; 442 } 443 */ 444 // We need these and we could be running without UCA 445 uprv_uca_initImplicitConstants(status); 446 UCATableHeader *colData = (UCATableHeader *)bin; 447 // 
do we want version check here? We're trying to figure out whether collators are compatible 448 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || 449 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || 450 colData->version[0] != UCOL_BUILDER_VERSION) 451 { 452 *status = U_COLLATOR_VERSION_MISMATCH; 453 return NULL; 454 } 455 else { 456 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { 457 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); 458 if(U_FAILURE(*status)){ 459 return NULL; 460 } 461 result->hasRealData = TRUE; 462 } 463 else { 464 if(base) { 465 result = ucol_initCollator(base->image, result, base, status); 466 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); 467 if(U_FAILURE(*status)){ 468 return NULL; 469 } 470 result->hasRealData = FALSE; 471 } 472 else { 473 *status = U_USELESS_COLLATOR_ERROR; 474 return NULL; 475 } 476 } 477 result->freeImageOnClose = FALSE; 478 } 479 result->actualLocale = NULL; 480 result->validLocale = NULL; 481 result->requestedLocale = NULL; 482 result->rules = NULL; 483 result->rulesLength = 0; 484 result->freeRulesOnClose = FALSE; 485 result->ucaRules = NULL; 486 return result; 487 } 488 489 U_CAPI UCollator* U_EXPORT2 490 ucol_openBinary(const uint8_t *bin, int32_t length, 491 const UCollator *base, 492 UErrorCode *status) 493 { 494 return ucol_initFromBinary(bin, length, base, NULL, status); 495 } 496 497 U_CAPI int32_t U_EXPORT2 498 ucol_cloneBinary(const UCollator *coll, 499 uint8_t *buffer, int32_t capacity, 500 UErrorCode *status) 501 { 502 int32_t length = 0; 503 if(U_FAILURE(*status)) { 504 return length; 505 } 506 if(capacity < 0) { 507 *status = U_ILLEGAL_ARGUMENT_ERROR; 508 return length; 509 } 510 if(coll->hasRealData == TRUE) { 511 length = coll->image->size; 512 if(length <= capacity) { 513 
uprv_memcpy(buffer, coll->image, length); 514 } else { 515 *status = U_BUFFER_OVERFLOW_ERROR; 516 } 517 } else { 518 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 519 if(length <= capacity) { 520 /* build the UCATableHeader with minimal entries */ 521 /* do not copy the header from the UCA file because its values are wrong! */ 522 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 523 524 /* reset everything */ 525 uprv_memset(buffer, 0, length); 526 527 /* set the tailoring-specific values */ 528 UCATableHeader *myData = (UCATableHeader *)buffer; 529 myData->size = length; 530 531 /* offset for the options, the only part of the data that is present after the header */ 532 myData->options = sizeof(UCATableHeader); 533 534 /* need to always set the expansion value for an upper bound of the options */ 535 myData->expansion = myData->options + sizeof(UColOptionSet); 536 537 myData->magic = UCOL_HEADER_MAGIC; 538 myData->isBigEndian = U_IS_BIG_ENDIAN; 539 myData->charSetFamily = U_CHARSET_FAMILY; 540 541 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 542 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 543 544 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 545 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 546 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); 547 myData->jamoSpecial = coll->image->jamoSpecial; 548 549 /* copy the collator options */ 550 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 551 } else { 552 *status = U_BUFFER_OVERFLOW_ERROR; 553 } 554 } 555 return length; 556 } 557 558 U_CAPI UCollator* U_EXPORT2 559 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) 560 { 561 UCollator * localCollator; 562 int32_t bufferSizeNeeded = 
(int32_t)sizeof(UCollator); 563 char *stackBufferChars = (char *)stackBuffer; 564 int32_t imageSize = 0; 565 int32_t rulesSize = 0; 566 int32_t rulesPadding = 0; 567 int32_t defaultReorderCodesSize = 0; 568 int32_t reorderCodesSize = 0; 569 uint8_t *image; 570 UChar *rules; 571 int32_t* defaultReorderCodes; 572 int32_t* reorderCodes; 573 uint8_t* leadBytePermutationTable; 574 UBool colAllocated = FALSE; 575 UBool imageAllocated = FALSE; 576 577 if (status == NULL || U_FAILURE(*status)){ 578 return 0; 579 } 580 if ((stackBuffer && !pBufferSize) || !coll){ 581 *status = U_ILLEGAL_ARGUMENT_ERROR; 582 return 0; 583 } 584 585 if (coll->rules && coll->freeRulesOnClose) { 586 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); 587 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); 588 bufferSizeNeeded += rulesSize + rulesPadding; 589 } 590 // no padding for alignment needed from here since the next two are 4 byte quantities 591 if (coll->defaultReorderCodes) { 592 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t); 593 bufferSizeNeeded += defaultReorderCodesSize; 594 } 595 if (coll->reorderCodes) { 596 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t); 597 bufferSizeNeeded += reorderCodesSize; 598 } 599 if (coll->leadBytePermutationTable) { 600 bufferSizeNeeded += 256 * sizeof(uint8_t); 601 } 602 603 if (stackBuffer && *pBufferSize <= 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 604 *pBufferSize = bufferSizeNeeded; 605 return 0; 606 } 607 608 /* Pointers on 64-bit platforms need to be aligned 609 * on a 64-bit boundry in memory. 
610 */ 611 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 612 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); 613 if (*pBufferSize > offsetUp) { 614 *pBufferSize -= offsetUp; 615 stackBufferChars += offsetUp; 616 } 617 else { 618 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */ 619 *pBufferSize = 1; 620 } 621 } 622 stackBuffer = (void *)stackBufferChars; 623 624 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) { 625 /* allocate one here...*/ 626 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); 627 // Null pointer check. 628 if (stackBufferChars == NULL) { 629 *status = U_MEMORY_ALLOCATION_ERROR; 630 return NULL; 631 } 632 colAllocated = TRUE; 633 if (U_SUCCESS(*status)) { 634 *status = U_SAFECLONE_ALLOCATED_WARNING; 635 } 636 } 637 localCollator = (UCollator *)stackBufferChars; 638 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); 639 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize); 640 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize); 641 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize; 642 643 { 644 UErrorCode tempStatus = U_ZERO_ERROR; 645 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); 646 } 647 if (coll->freeImageOnClose) { 648 image = (uint8_t *)uprv_malloc(imageSize); 649 // Null pointer check 650 if (image == NULL) { 651 *status = U_MEMORY_ALLOCATION_ERROR; 652 return NULL; 653 } 654 ucol_cloneBinary(coll, image, imageSize, status); 655 imageAllocated = TRUE; 656 } 657 else { 658 image = (uint8_t *)coll->image; 659 } 660 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); 661 if (U_FAILURE(*status)) { 662 return NULL; 663 } 664 665 if (coll->rules) { 666 if (coll->freeRulesOnClose) { 667 localCollator->rules = u_strcpy(rules, coll->rules); 668 //bufferEnd += rulesSize; 669 } 670 else { 671 localCollator->rules = coll->rules; 672 } 673 
localCollator->freeRulesOnClose = FALSE; 674 localCollator->rulesLength = coll->rulesLength; 675 } 676 677 // collator reordering 678 if (coll->defaultReorderCodes) { 679 localCollator->defaultReorderCodes = 680 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t)); 681 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength; 682 localCollator->freeDefaultReorderCodesOnClose = FALSE; 683 } 684 if (coll->reorderCodes) { 685 localCollator->reorderCodes = 686 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t)); 687 localCollator->reorderCodesLength = coll->reorderCodesLength; 688 localCollator->freeReorderCodesOnClose = FALSE; 689 } 690 if (coll->leadBytePermutationTable) { 691 localCollator->leadBytePermutationTable = 692 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256); 693 localCollator->freeLeadBytePermutationTableOnClose = FALSE; 694 } 695 696 int32_t i; 697 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 698 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); 699 } 700 // zero copies of pointers 701 localCollator->actualLocale = NULL; 702 localCollator->validLocale = NULL; 703 localCollator->requestedLocale = NULL; 704 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. 
705 localCollator->freeOnClose = colAllocated; 706 localCollator->freeImageOnClose = imageAllocated; 707 return localCollator; 708 } 709 710 U_CAPI void U_EXPORT2 711 ucol_close(UCollator *coll) 712 { 713 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 714 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 715 if(coll != NULL) { 716 // these are always owned by each UCollator struct, 717 // so we always free them 718 if(coll->validLocale != NULL) { 719 uprv_free(coll->validLocale); 720 } 721 if(coll->actualLocale != NULL) { 722 uprv_free(coll->actualLocale); 723 } 724 if(coll->requestedLocale != NULL) { 725 uprv_free(coll->requestedLocale); 726 } 727 if(coll->latinOneCEs != NULL) { 728 uprv_free(coll->latinOneCEs); 729 } 730 if(coll->options != NULL && coll->freeOptionsOnClose) { 731 uprv_free(coll->options); 732 } 733 if(coll->rules != NULL && coll->freeRulesOnClose) { 734 uprv_free((UChar *)coll->rules); 735 } 736 if(coll->image != NULL && coll->freeImageOnClose) { 737 uprv_free((UCATableHeader *)coll->image); 738 } 739 740 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) { 741 uprv_free(coll->leadBytePermutationTable); 742 } 743 if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) { 744 uprv_free(coll->defaultReorderCodes); 745 } 746 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { 747 uprv_free(coll->reorderCodes); 748 } 749 750 if(coll->delegate != NULL) { 751 delete (Collator*)coll->delegate; 752 } 753 754 /* Here, it would be advisable to close: */ 755 /* - UData for UCA (unless we stuff it in the root resb */ 756 /* Again, do we need additional housekeeping... HMMM! */ 757 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); 758 if(coll->freeOnClose){ 759 /* for safeClone, if freeOnClose is FALSE, 760 don't free the other instance data */ 761 uprv_free(coll); 762 } 763 } 764 UTRACE_EXIT(); 765 } 766 767 /* This one is currently used by genrb & tests. 
After constructing from rules (tailoring),*/ 768 /* you should be able to get the binary chunk to write out... Doesn't look very full now */ 769 U_CFUNC uint8_t* U_EXPORT2 770 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) 771 { 772 uint8_t *result = NULL; 773 if(U_FAILURE(*status)) { 774 return NULL; 775 } 776 if(coll->hasRealData == TRUE) { 777 *length = coll->image->size; 778 result = (uint8_t *)uprv_malloc(*length); 779 /* test for NULL */ 780 if (result == NULL) { 781 *status = U_MEMORY_ALLOCATION_ERROR; 782 return NULL; 783 } 784 uprv_memcpy(result, coll->image, *length); 785 } else { 786 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 787 result = (uint8_t *)uprv_malloc(*length); 788 /* test for NULL */ 789 if (result == NULL) { 790 *status = U_MEMORY_ALLOCATION_ERROR; 791 return NULL; 792 } 793 794 /* build the UCATableHeader with minimal entries */ 795 /* do not copy the header from the UCA file because its values are wrong! 
*/ 796 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 797 798 /* reset everything */ 799 uprv_memset(result, 0, *length); 800 801 /* set the tailoring-specific values */ 802 UCATableHeader *myData = (UCATableHeader *)result; 803 myData->size = *length; 804 805 /* offset for the options, the only part of the data that is present after the header */ 806 myData->options = sizeof(UCATableHeader); 807 808 /* need to always set the expansion value for an upper bound of the options */ 809 myData->expansion = myData->options + sizeof(UColOptionSet); 810 811 myData->magic = UCOL_HEADER_MAGIC; 812 myData->isBigEndian = U_IS_BIG_ENDIAN; 813 myData->charSetFamily = U_CHARSET_FAMILY; 814 815 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 816 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 817 818 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 819 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 820 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); 821 myData->jamoSpecial = coll->image->jamoSpecial; 822 823 /* copy the collator options */ 824 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 825 } 826 return result; 827 } 828 829 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { 830 if(U_FAILURE(*status)) { 831 return; 832 } 833 result->caseFirst = (UColAttributeValue)opts->caseFirst; 834 result->caseLevel = (UColAttributeValue)opts->caseLevel; 835 result->frenchCollation = (UColAttributeValue)opts->frenchCollation; 836 result->normalizationMode = (UColAttributeValue)opts->normalizationMode; 837 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) { 838 return; 839 } 840 result->strength = (UColAttributeValue)opts->strength; 841 result->variableTopValue = opts->variableTopValue; 842 
result->alternateHandling = (UColAttributeValue)opts->alternateHandling; 843 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; 844 result->numericCollation = (UColAttributeValue)opts->numericCollation; 845 result->caseFirstisDefault = TRUE; 846 result->caseLevelisDefault = TRUE; 847 result->frenchCollationisDefault = TRUE; 848 result->normalizationModeisDefault = TRUE; 849 result->strengthisDefault = TRUE; 850 result->variableTopValueisDefault = TRUE; 851 result->alternateHandlingisDefault = TRUE; 852 result->hiraganaQisDefault = TRUE; 853 result->numericCollationisDefault = TRUE; 854 855 ucol_updateInternalState(result, status); 856 857 result->options = opts; 858 } 859 860 861 /** 862 * Approximate determination if a character is at a contraction end. 863 * Guaranteed to be TRUE if a character is at the end of a contraction, 864 * otherwise it is not deterministic. 865 * @param c character to be determined 866 * @param coll collator 867 */ 868 static 869 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { 870 if (c < coll->minContrEndCP) { 871 return FALSE; 872 } 873 874 int32_t hash = c; 875 uint8_t htbyte; 876 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 877 if (U16_IS_TRAIL(c)) { 878 return TRUE; 879 } 880 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 881 } 882 htbyte = coll->contrEndCP[hash>>3]; 883 return (((htbyte >> (hash & 7)) & 1) == 1); 884 } 885 886 887 888 /* 889 * i_getCombiningClass() 890 * A fast, at least partly inline version of u_getCombiningClass() 891 * This is a candidate for further optimization. Used heavily 892 * in contraction processing. 
893 */ 894 static 895 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { 896 uint8_t sCC = 0; 897 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { 898 sCC = u_getCombiningClass(c); 899 } 900 return sCC; 901 } 902 903 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { 904 UChar c; 905 UCollator *result = fillIn; 906 if(U_FAILURE(*status) || image == NULL) { 907 return NULL; 908 } 909 910 if(result == NULL) { 911 result = (UCollator *)uprv_malloc(sizeof(UCollator)); 912 if(result == NULL) { 913 *status = U_MEMORY_ALLOCATION_ERROR; 914 return result; 915 } 916 result->freeOnClose = TRUE; 917 } else { 918 result->freeOnClose = FALSE; 919 } 920 921 result->delegate = NULL; 922 923 result->image = image; 924 result->mapping.getFoldingOffset = _getFoldingOffset; 925 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; 926 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); 927 if(U_FAILURE(*status)) { 928 if(result->freeOnClose == TRUE) { 929 uprv_free(result); 930 result = NULL; 931 } 932 return result; 933 } 934 935 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); 936 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); 937 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); 938 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); 939 result->rules = NULL; 940 result->rulesLength = 0; 941 result->freeRulesOnClose = FALSE; 942 result->defaultReorderCodes = NULL; 943 result->defaultReorderCodesLength = 0; 944 result->freeDefaultReorderCodesOnClose = FALSE; 945 result->reorderCodes = NULL; 946 result->reorderCodesLength = 0; 947 result->freeReorderCodesOnClose = FALSE; 948 result->leadBytePermutationTable = NULL; 949 
result->freeLeadBytePermutationTableOnClose = FALSE; 950 951 /* get the version info from UCATableHeader and populate the Collator struct*/ 952 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ 953 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ 954 result->dataVersion[2] = 0; 955 result->dataVersion[3] = 0; 956 957 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; 958 result->minUnsafeCP = 0; 959 for (c=0; c<0x300; c++) { // Find the smallest unsafe char. 960 if (ucol_unsafeCP(c, result)) break; 961 } 962 result->minUnsafeCP = c; 963 964 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; 965 result->minContrEndCP = 0; 966 for (c=0; c<0x300; c++) { // Find the Contraction-ending char. 967 if (ucol_contractionEndCP(c, result)) break; 968 } 969 result->minContrEndCP = c; 970 971 /* max expansion tables */ 972 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + 973 result->image->endExpansionCE); 974 result->lastEndExpansionCE = result->endExpansionCE + 975 result->image->endExpansionCECount - 1; 976 result->expansionCESize = (uint8_t*)result->image + 977 result->image->expansionCESize; 978 979 980 //result->errorCode = *status; 981 982 result->latinOneCEs = NULL; 983 984 result->latinOneRegenTable = FALSE; 985 result->latinOneFailed = FALSE; 986 result->UCA = UCA; 987 988 /* Normally these will be set correctly later. This is the default if you use UCA or the default. */ 989 result->ucaRules = NULL; 990 result->actualLocale = NULL; 991 result->validLocale = NULL; 992 result->requestedLocale = NULL; 993 result->hasRealData = FALSE; // real data lives in .dat file... 
994 result->freeImageOnClose = FALSE; 995 996 /* set attributes */ 997 ucol_setOptionsFromHeader( 998 result, 999 (UColOptionSet*)((uint8_t*)result->image+result->image->options), 1000 status); 1001 result->freeOptionsOnClose = FALSE; 1002 1003 return result; 1004 } 1005 1006 /* new Mark's code */ 1007 1008 /** 1009 * For generation of Implicit CEs 1010 * @author Davis 1011 * 1012 * Cleaned up so that changes can be made more easily. 1013 * Old values: 1014 # First Implicit: E26A792D 1015 # Last Implicit: E3DC70C0 1016 # First CJK: E0030300 1017 # Last CJK: E0A9DD00 1018 # First CJK_A: E0A9DF00 1019 # Last CJK_A: E0DE3100 1020 */ 1021 /* Following is a port of Mark's code for new treatment of implicits. 1022 * It is positioned here, since ucol_initUCA need to initialize the 1023 * variables below according to the data in the fractional UCA. 1024 */ 1025 1026 /** 1027 * Function used to: 1028 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and 1029 * b) bump any non-CJK characters by 10FFFF. 1030 * The relevant blocks are: 1031 * A: 4E00..9FFF; CJK Unified Ideographs 1032 * F900..FAFF; CJK Compatibility Ideographs 1033 * B: 3400..4DBF; CJK Unified Ideographs Extension A 1034 * 20000..XX; CJK Unified Ideographs Extension B (and others later on) 1035 * As long as 1036 * no new B characters are allocated between 4E00 and FAFF, and 1037 * no new A characters are outside of this range, 1038 * (very high probability) this simple code will work. 1039 * The reordered blocks are: 1040 * Block1 is CJK 1041 * Block2 is CJK_COMPAT_USED 1042 * Block3 is CJK_A 1043 * (all contiguous) 1044 * Any other CJK gets its normal code point 1045 * Any non-CJK gets +10FFFF 1046 * When we reorder Block1, we make sure that it is at the very start, 1047 * so that it will use a 3-byte form. 1048 * Warning: the we only pick up the compatibility characters that are 1049 * NOT decomposed, so that block is smaller! 
1050 */ 1051 1052 // CONSTANTS 1053 static const UChar32 1054 NON_CJK_OFFSET = 0x110000, 1055 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 1056 1057 /** 1058 * Precomputed by initImplicitConstants() 1059 */ 1060 static int32_t 1061 final3Multiplier = 0, 1062 final4Multiplier = 0, 1063 final3Count = 0, 1064 final4Count = 0, 1065 medialCount = 0, 1066 min3Primary = 0, 1067 min4Primary = 0, 1068 max4Primary = 0, 1069 minTrail = 0, 1070 maxTrail = 0, 1071 max3Trail = 0, 1072 max4Trail = 0, 1073 min4Boundary = 0; 1074 1075 static const UChar32 1076 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 1077 // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1) 1078 CJK_BASE = 0x4E00, 1079 CJK_LIMIT = 0x9FCC+1, 1080 // Unified CJK ideographs in the compatibility ideographs block. 1081 CJK_COMPAT_USED_BASE = 0xFA0E, 1082 CJK_COMPAT_USED_LIMIT = 0xFA2F+1, 1083 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 1084 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 1085 CJK_A_BASE = 0x3400, 1086 CJK_A_LIMIT = 0x4DB5+1, 1087 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; 1088 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; 1089 CJK_B_BASE = 0x20000, 1090 CJK_B_LIMIT = 0x2A6D6+1, 1091 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; 1092 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; 1093 CJK_C_BASE = 0x2A700, 1094 CJK_C_LIMIT = 0x2B734+1, 1095 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; 1096 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; 1097 CJK_D_BASE = 0x2B740, 1098 CJK_D_LIMIT = 0x2B81D+1; 1099 // when adding to this list, look for all occurrences (in project) 1100 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!! 
1101 1102 static UChar32 swapCJK(UChar32 i) { 1103 if (i < CJK_A_BASE) { 1104 // non-CJK 1105 } else if (i < CJK_A_LIMIT) { 1106 // Extension A has lower code points than the original Unihan+compat 1107 // but sorts higher. 1108 return i - CJK_A_BASE 1109 + (CJK_LIMIT - CJK_BASE) 1110 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1111 } else if (i < CJK_BASE) { 1112 // non-CJK 1113 } else if (i < CJK_LIMIT) { 1114 return i - CJK_BASE; 1115 } else if (i < CJK_COMPAT_USED_BASE) { 1116 // non-CJK 1117 } else if (i < CJK_COMPAT_USED_LIMIT) { 1118 return i - CJK_COMPAT_USED_BASE 1119 + (CJK_LIMIT - CJK_BASE); 1120 } else if (i < CJK_B_BASE) { 1121 // non-CJK 1122 } else if (i < CJK_B_LIMIT) { 1123 return i; // non-BMP-CJK 1124 } else if (i < CJK_C_BASE) { 1125 // non-CJK 1126 } else if (i < CJK_C_LIMIT) { 1127 return i; // non-BMP-CJK 1128 } else if (i < CJK_D_BASE) { 1129 // non-CJK 1130 } else if (i < CJK_D_LIMIT) { 1131 return i; // non-BMP-CJK 1132 } 1133 return i + NON_CJK_OFFSET; // non-CJK 1134 } 1135 1136 U_CAPI UChar32 U_EXPORT2 1137 uprv_uca_getRawFromCodePoint(UChar32 i) { 1138 return swapCJK(i)+1; 1139 } 1140 1141 U_CAPI UChar32 U_EXPORT2 1142 uprv_uca_getCodePointFromRaw(UChar32 i) { 1143 i--; 1144 UChar32 result = 0; 1145 if(i >= NON_CJK_OFFSET) { 1146 result = i - NON_CJK_OFFSET; 1147 } else if(i >= CJK_B_BASE) { 1148 result = i; 1149 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted 1150 if(i < CJK_LIMIT - CJK_BASE) { 1151 result = i + CJK_BASE; 1152 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { 1153 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); 1154 } else { 1155 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1156 } 1157 } else { 1158 result = -1; 1159 } 1160 return result; 1161 } 1162 1163 // GET IMPLICIT PRIMARY WEIGHTS 1164 // Return value is left justified 
primary key 1165 U_CAPI uint32_t U_EXPORT2 1166 uprv_uca_getImplicitFromRaw(UChar32 cp) { 1167 /* 1168 if (cp < 0 || cp > UCOL_MAX_INPUT) { 1169 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); 1170 } 1171 */ 1172 int32_t last0 = cp - min4Boundary; 1173 if (last0 < 0) { 1174 int32_t last1 = cp / final3Count; 1175 last0 = cp % final3Count; 1176 1177 int32_t last2 = last1 / medialCount; 1178 last1 %= medialCount; 1179 1180 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start 1181 last1 = minTrail + last1; // offset 1182 last2 = min3Primary + last2; // offset 1183 /* 1184 if (last2 >= min4Primary) { 1185 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); 1186 } 1187 */ 1188 return (last2 << 24) + (last1 << 16) + (last0 << 8); 1189 } else { 1190 int32_t last1 = last0 / final4Count; 1191 last0 %= final4Count; 1192 1193 int32_t last2 = last1 / medialCount; 1194 last1 %= medialCount; 1195 1196 int32_t last3 = last2 / medialCount; 1197 last2 %= medialCount; 1198 1199 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start 1200 last1 = minTrail + last1; // offset 1201 last2 = minTrail + last2; // offset 1202 last3 = min4Primary + last3; // offset 1203 /* 1204 if (last3 > max4Primary) { 1205 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); 1206 } 1207 */ 1208 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; 1209 } 1210 } 1211 1212 static uint32_t U_EXPORT2 1213 uprv_uca_getImplicitPrimary(UChar32 cp) { 1214 //fprintf(stdout, "Incoming: %04x\n", cp); 1215 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); 1216 1217 cp = swapCJK(cp); 1218 cp++; 1219 // we now have a range of numbers from 0 to 21FFFF. 
1220 1221 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); 1222 //fprintf(stdout, "CJK swapped: %04x\n", cp); 1223 1224 return uprv_uca_getImplicitFromRaw(cp); 1225 } 1226 1227 /** 1228 * Converts implicit CE into raw integer ("code point") 1229 * @param implicit 1230 * @return -1 if illegal format 1231 */ 1232 U_CAPI UChar32 U_EXPORT2 1233 uprv_uca_getRawFromImplicit(uint32_t implicit) { 1234 UChar32 result; 1235 UChar32 b3 = implicit & 0xFF; 1236 UChar32 b2 = (implicit >> 8) & 0xFF; 1237 UChar32 b1 = (implicit >> 16) & 0xFF; 1238 UChar32 b0 = (implicit >> 24) & 0xFF; 1239 1240 // simple parameter checks 1241 if (b0 < min3Primary || b0 > max4Primary 1242 || b1 < minTrail || b1 > maxTrail) 1243 return -1; 1244 // normal offsets 1245 b1 -= minTrail; 1246 1247 // take care of the final values, and compose 1248 if (b0 < min4Primary) { 1249 if (b2 < minTrail || b2 > max3Trail || b3 != 0) 1250 return -1; 1251 b2 -= minTrail; 1252 UChar32 remainder = b2 % final3Multiplier; 1253 if (remainder != 0) 1254 return -1; 1255 b0 -= min3Primary; 1256 b2 /= final3Multiplier; 1257 result = ((b0 * medialCount) + b1) * final3Count + b2; 1258 } else { 1259 if (b2 < minTrail || b2 > maxTrail 1260 || b3 < minTrail || b3 > max4Trail) 1261 return -1; 1262 b2 -= minTrail; 1263 b3 -= minTrail; 1264 UChar32 remainder = b3 % final4Multiplier; 1265 if (remainder != 0) 1266 return -1; 1267 b3 /= final4Multiplier; 1268 b0 -= min4Primary; 1269 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; 1270 } 1271 // final check 1272 if (result < 0 || result > UCOL_MAX_INPUT) 1273 return -1; 1274 return result; 1275 } 1276 1277 1278 static inline int32_t divideAndRoundUp(int a, int b) { 1279 return 1 + (a-1)/b; 1280 } 1281 1282 /* this function is either called from initUCA or from genUCA before 1283 * doing canonical closure for the UCA. 1284 */ 1285 1286 /** 1287 * Set up to generate implicits. 
1288 * Maintenance Note: this function may end up being called more than once, due 1289 * to threading races during initialization. Make sure that 1290 * none of the Constants is ever transiently assigned an 1291 * incorrect value. 1292 * @param minPrimary 1293 * @param maxPrimary 1294 * @param minTrail final byte 1295 * @param maxTrail final byte 1296 * @param gap3 the gap we leave for tailoring for 3-byte forms 1297 * @param gap4 the gap we leave for tailoring for 4-byte forms 1298 */ 1299 static void initImplicitConstants(int minPrimary, int maxPrimary, 1300 int minTrailIn, int maxTrailIn, 1301 int gap3, int primaries3count, 1302 UErrorCode *status) { 1303 // some simple parameter checks 1304 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) 1305 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) 1306 || (primaries3count < 1)) 1307 { 1308 *status = U_ILLEGAL_ARGUMENT_ERROR; 1309 return; 1310 }; 1311 1312 minTrail = minTrailIn; 1313 maxTrail = maxTrailIn; 1314 1315 min3Primary = minPrimary; 1316 max4Primary = maxPrimary; 1317 // compute constants for use later. 1318 // number of values we can use in trailing bytes 1319 // leave room for empty values between AND above, e.g. if gap = 2 1320 // range 3..7 => +3 -4 -5 -6 -7: so 1 value 1321 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values 1322 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values 1323 final3Multiplier = gap3 + 1; 1324 final3Count = (maxTrail - minTrail + 1) / final3Multiplier; 1325 max3Trail = minTrail + (final3Count - 1) * final3Multiplier; 1326 1327 // medials can use full range 1328 medialCount = (maxTrail - minTrail + 1); 1329 // find out how many values fit in each form 1330 int32_t threeByteCount = medialCount * final3Count; 1331 // now determine where the 3/4 boundary is. 
1332 // we use 3 bytes below the boundary, and 4 above 1333 int32_t primariesAvailable = maxPrimary - minPrimary + 1; 1334 int32_t primaries4count = primariesAvailable - primaries3count; 1335 1336 1337 int32_t min3ByteCoverage = primaries3count * threeByteCount; 1338 min4Primary = minPrimary + primaries3count; 1339 min4Boundary = min3ByteCoverage; 1340 // Now expand out the multiplier for the 4 bytes, and redo. 1341 1342 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; 1343 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); 1344 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); 1345 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; 1346 if (gap4 < 1) { 1347 *status = U_ILLEGAL_ARGUMENT_ERROR; 1348 return; 1349 } 1350 final4Multiplier = gap4 + 1; 1351 final4Count = neededPerFinalByte; 1352 max4Trail = minTrail + (final4Count - 1) * final4Multiplier; 1353 } 1354 1355 /** 1356 * Supply parameters for generating implicit CEs 1357 */ 1358 U_CAPI void U_EXPORT2 1359 uprv_uca_initImplicitConstants(UErrorCode *status) { 1360 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. 1361 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); 1362 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); 1363 } 1364 1365 1366 /* collIterNormalize Incremental Normalization happens here. */ 1367 /* pick up the range of chars identifed by FCD, */ 1368 /* normalize it into the collIterate's writable buffer, */ 1369 /* switch the collIterate's state to use the writable buffer. 
*/ 1370 /* */ 1371 static 1372 void collIterNormalize(collIterate *collationSource) 1373 { 1374 UErrorCode status = U_ZERO_ERROR; 1375 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ 1376 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ 1377 1378 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)), 1379 collationSource->writableBuffer, 1380 status); 1381 if (U_FAILURE(status)) { 1382 #ifdef UCOL_DEBUG 1383 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status)); 1384 #endif 1385 return; 1386 } 1387 1388 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer(); 1389 collationSource->origFlags = collationSource->flags; 1390 collationSource->flags |= UCOL_ITER_INNORMBUF; 1391 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1392 } 1393 1394 1395 // This function takes the iterator and extracts normalized stuff up to the next boundary 1396 // It is similar in the end results to the collIterNormalize, but for the cases when we 1397 // use an iterator 1398 /*static 1399 inline void normalizeIterator(collIterate *collationSource) { 1400 UErrorCode status = U_ZERO_ERROR; 1401 UBool wasNormalized = FALSE; 1402 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); 1403 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); 1404 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1405 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1406 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { 1407 // reallocate and terminate 1408 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, 1409 &collationSource->writableBuffer, 1410 (int32_t *)&collationSource->writableBufSize, normLen + 1, 1411 0) 
1412 ) { 1413 #ifdef UCOL_DEBUG 1414 fprintf(stderr, "normalizeIterator(), out of memory\n"); 1415 #endif 1416 return; 1417 } 1418 status = U_ZERO_ERROR; 1419 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); 1420 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); 1421 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1422 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1423 } 1424 // Terminate the buffer - we already checked that it is big enough 1425 collationSource->writableBuffer[normLen] = 0; 1426 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { 1427 collationSource->flags |= UCOL_ITER_ALLOCATED; 1428 } 1429 collationSource->pos = collationSource->writableBuffer; 1430 collationSource->origFlags = collationSource->flags; 1431 collationSource->flags |= UCOL_ITER_INNORMBUF; 1432 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1433 }*/ 1434 1435 1436 /* Incremental FCD check and normalize */ 1437 /* Called from getNextCE when normalization state is suspect. */ 1438 /* When entering, the state is known to be this: */ 1439 /* o We are working in the main buffer of the collIterate, not the side */ 1440 /* writable buffer. When in the side buffer, normalization mode is always off, */ 1441 /* so we won't get here. */ 1442 /* o The leading combining class from the current character is 0 or */ 1443 /* the trailing combining class of the previous char was zero. */ 1444 /* True because the previous call to this function will have always exited */ 1445 /* that way, and we get called for every char where cc might be non-zero. 
*/ 1446 static 1447 inline UBool collIterFCD(collIterate *collationSource) { 1448 const UChar *srcP, *endP; 1449 uint8_t leadingCC; 1450 uint8_t prevTrailingCC = 0; 1451 uint16_t fcd; 1452 UBool needNormalize = FALSE; 1453 1454 srcP = collationSource->pos-1; 1455 1456 if (collationSource->flags & UCOL_ITER_HASLEN) { 1457 endP = collationSource->endp; 1458 } else { 1459 endP = NULL; 1460 } 1461 1462 // Get the trailing combining class of the current character. If it's zero, we are OK. 1463 fcd = g_nfcImpl->nextFCD16(srcP, endP); 1464 if (fcd != 0) { 1465 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1466 1467 if (prevTrailingCC != 0) { 1468 // The current char has a non-zero trailing CC. Scan forward until we find 1469 // a char with a leading cc of zero. 1470 while (endP == NULL || srcP != endP) 1471 { 1472 const UChar *savedSrcP = srcP; 1473 1474 fcd = g_nfcImpl->nextFCD16(srcP, endP); 1475 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1476 if (leadingCC == 0) { 1477 srcP = savedSrcP; // Hit char that is not part of combining sequence. 1478 // back up over it. (Could be surrogate pair!) 
1479 break; 1480 } 1481 1482 if (leadingCC < prevTrailingCC) { 1483 needNormalize = TRUE; 1484 } 1485 1486 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1487 } 1488 } 1489 } 1490 1491 collationSource->fcdPosition = (UChar *)srcP; 1492 1493 return needNormalize; 1494 } 1495 1496 /****************************************************************************/ 1497 /* Following are the CE retrieval functions */ 1498 /* */ 1499 /****************************************************************************/ 1500 1501 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource); 1502 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource); 1503 1504 /* there should be a macro version of this function in the header file */ 1505 /* This is the first function that tries to fetch a collation element */ 1506 /* If it's not succesfull or it encounters a more difficult situation */ 1507 /* some more sofisticated and slower functions are invoked */ 1508 static 1509 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { 1510 uint32_t order = 0; 1511 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */ 1512 order = *(collationSource->toReturn++); /* if so, return them */ 1513 if(collationSource->CEpos == collationSource->toReturn) { 1514 collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs; 1515 } 1516 return order; 1517 } 1518 1519 UChar ch = 0; 1520 collationSource->offsetReturn = NULL; 1521 1522 do { 1523 for (;;) /* Loop handles case when incremental normalize switches */ 1524 { /* to or from the side buffer / original string, and we */ 1525 /* need to start again to get the next character. 
*/ 1526 1527 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) 1528 { 1529 // The source string is null terminated and we're not working from the side buffer, 1530 // and we're not normalizing. This is the fast path. 1531 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.) 1532 ch = *collationSource->pos++; 1533 if (ch != 0) { 1534 break; 1535 } 1536 else { 1537 return UCOL_NO_MORE_CES; 1538 } 1539 } 1540 1541 if (collationSource->flags & UCOL_ITER_HASLEN) { 1542 // Normal path for strings when length is specified. 1543 // (We can't be in side buffer because it is always null terminated.) 1544 if (collationSource->pos >= collationSource->endp) { 1545 // Ran off of the end of the main source string. We're done. 1546 return UCOL_NO_MORE_CES; 1547 } 1548 ch = *collationSource->pos++; 1549 } 1550 else if(collationSource->flags & UCOL_USE_ITERATOR) { 1551 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator); 1552 if(iterCh == U_SENTINEL) { 1553 return UCOL_NO_MORE_CES; 1554 } 1555 ch = (UChar)iterCh; 1556 } 1557 else 1558 { 1559 // Null terminated string. 1560 ch = *collationSource->pos++; 1561 if (ch == 0) { 1562 // Ran off end of buffer. 1563 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { 1564 // Ran off end of main string. backing up one character. 1565 collationSource->pos--; 1566 return UCOL_NO_MORE_CES; 1567 } 1568 else 1569 { 1570 // Hit null in the normalize side buffer. 1571 // Usually this means the end of the normalized data, 1572 // except for one odd case: a null followed by combining chars, 1573 // which is the case if we are at the start of the buffer. 1574 if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) { 1575 break; 1576 } 1577 1578 // Null marked end of side buffer. 1579 // Revert to the main string and 1580 // loop back to top to try again to get a character. 
1581 collationSource->pos = collationSource->fcdPosition; 1582 collationSource->flags = collationSource->origFlags; 1583 continue; 1584 } 1585 } 1586 } 1587 1588 if(collationSource->flags&UCOL_HIRAGANA_Q) { 1589 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag 1590 * based on whether the previous codepoint was Hiragana or Katakana. 1591 */ 1592 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) || 1593 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) { 1594 collationSource->flags |= UCOL_WAS_HIRAGANA; 1595 } else { 1596 collationSource->flags &= ~UCOL_WAS_HIRAGANA; 1597 } 1598 } 1599 1600 // We've got a character. See if there's any fcd and/or normalization stuff to do. 1601 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer. 1602 if ((collationSource->flags & UCOL_ITER_NORM) == 0) { 1603 break; 1604 } 1605 1606 if (collationSource->fcdPosition >= collationSource->pos) { 1607 // An earlier FCD check has already covered the current character. 1608 // We can go ahead and process this char. 1609 break; 1610 } 1611 1612 if (ch < ZERO_CC_LIMIT_ ) { 1613 // Fast fcd safe path. Trailing combining class == 0. This char is OK. 1614 break; 1615 } 1616 1617 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { 1618 // We need to peek at the next character in order to tell if we are FCD 1619 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) { 1620 // We are at the last char of source string. 1621 // It is always OK for FCD check. 1622 break; 1623 } 1624 1625 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test 1626 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { 1627 break; 1628 } 1629 } 1630 1631 1632 // Need a more complete FCD check and possible normalization. 
1633 if (collIterFCD(collationSource)) { 1634 collIterNormalize(collationSource); 1635 } 1636 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { 1637 // No normalization was needed. Go ahead and process the char we already had. 1638 break; 1639 } 1640 1641 // Some normalization happened. Next loop iteration will pick up a char 1642 // from the normalization buffer. 1643 1644 } // end for (;;) 1645 1646 1647 if (ch <= 0xFF) { 1648 /* For latin-1 characters we never need to fall back to the UCA table */ 1649 /* because all of the UCA data is replicated in the latinOneMapping array */ 1650 order = coll->latinOneMapping[ch]; 1651 if (order > UCOL_NOT_FOUND) { 1652 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); 1653 } 1654 } 1655 else 1656 { 1657 // Always use UCA for Han, Hangul 1658 // (Han extension A is before main Han block) 1659 // **** Han compatibility chars ?? **** 1660 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && 1661 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { 1662 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { 1663 // between the two target ranges; do normal lookup 1664 // **** this range is YI, Modifier tone letters, **** 1665 // **** Latin-D, Syloti Nagari, Phagas-pa. **** 1666 // **** Latin-D might be tailored, so we need to **** 1667 // **** do the normal lookup for these guys. 
**** 1668 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 1669 } else { 1670 // in one of the target ranges; use UCA 1671 order = UCOL_NOT_FOUND; 1672 } 1673 } else { 1674 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 1675 } 1676 1677 if(order > UCOL_NOT_FOUND) { /* if a CE is special */ 1678 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */ 1679 } 1680 1681 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */ 1682 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */ 1683 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); 1684 1685 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ 1686 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status); 1687 } 1688 } 1689 } 1690 } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL ); 1691 1692 if(order == UCOL_NOT_FOUND) { 1693 order = getImplicit(ch, collationSource); 1694 } 1695 return order; /* return the CE */ 1696 } 1697 1698 /* ucol_getNextCE, out-of-line version for use from other files. */ 1699 U_CAPI uint32_t U_EXPORT2 1700 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { 1701 return ucol_IGetNextCE(coll, collationSource, status); 1702 } 1703 1704 1705 /** 1706 * Incremental previous normalization happens here. Pick up the range of chars 1707 * identifed by FCD, normalize it into the collIterate's writable buffer, 1708 * switch the collIterate's state to use the writable buffer. 
1709 * @param data collation iterator data 1710 */ 1711 static 1712 void collPrevIterNormalize(collIterate *data) 1713 { 1714 UErrorCode status = U_ZERO_ERROR; 1715 const UChar *pEnd = data->pos; /* End normalize + 1 */ 1716 const UChar *pStart; 1717 1718 /* Start normalize */ 1719 if (data->fcdPosition == NULL) { 1720 pStart = data->string; 1721 } 1722 else { 1723 pStart = data->fcdPosition + 1; 1724 } 1725 1726 int32_t normLen = 1727 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)), 1728 data->writableBuffer, 1729 status). 1730 length(); 1731 if(U_FAILURE(status)) { 1732 return; 1733 } 1734 /* 1735 this puts the null termination infront of the normalized string instead 1736 of the end 1737 */ 1738 data->writableBuffer.insert(0, (UChar)0); 1739 1740 /* 1741 * The usual case at this point is that we've got a base 1742 * character followed by marks that were normalized. If 1743 * fcdPosition is NULL, that means that we backed up to 1744 * the beginning of the string and there's no base character. 1745 * 1746 * Forward processing will usually normalize when it sees 1747 * the first mark, so that mark will get it's natural offset 1748 * and the rest will get the offset of the character following 1749 * the marks. The base character will also get its natural offset. 1750 * 1751 * We write the offset of the base character, if there is one, 1752 * followed by the offset of the first mark and then the offsets 1753 * of the rest of the marks. 
1754 */ 1755 int32_t firstMarkOffset = 0; 1756 int32_t trailOffset = (int32_t)(data->pos - data->string + 1); 1757 int32_t trailCount = normLen - 1; 1758 1759 if (data->fcdPosition != NULL) { 1760 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string); 1761 UChar baseChar = *data->fcdPosition; 1762 1763 firstMarkOffset = baseOffset + 1; 1764 1765 /* 1766 * If the base character is the start of a contraction, forward processing 1767 * will normalize the marks while checking for the contraction, which means 1768 * that the offset of the first mark will the same as the other marks. 1769 * 1770 * **** THIS IS PROBABLY NOT A COMPLETE TEST **** 1771 */ 1772 if (baseChar >= 0x100) { 1773 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar); 1774 1775 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { 1776 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar); 1777 } 1778 1779 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) { 1780 firstMarkOffset = trailOffset; 1781 } 1782 } 1783 1784 data->appendOffset(baseOffset, status); 1785 } 1786 1787 data->appendOffset(firstMarkOffset, status); 1788 1789 for (int32_t i = 0; i < trailCount; i += 1) { 1790 data->appendOffset(trailOffset, status); 1791 } 1792 1793 data->offsetRepeatValue = trailOffset; 1794 1795 data->offsetReturn = data->offsetStore - 1; 1796 if (data->offsetReturn == data->offsetBuffer) { 1797 data->offsetStore = data->offsetBuffer; 1798 } 1799 1800 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; 1801 data->origFlags = data->flags; 1802 data->flags |= UCOL_ITER_INNORMBUF; 1803 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 1804 } 1805 1806 1807 /** 1808 * Incremental FCD check for previous iteration and normalize. Called from 1809 * getPrevCE when normalization state is suspect. 
1810 * When entering, the state is known to be this: 1811 * o We are working in the main buffer of the collIterate, not the side 1812 * writable buffer. When in the side buffer, normalization mode is always 1813 * off, so we won't get here. 1814 * o The leading combining class from the current character is 0 or the 1815 * trailing combining class of the previous char was zero. 1816 * True because the previous call to this function will have always exited 1817 * that way, and we get called for every char where cc might be non-zero. 1818 * @param data collation iterate struct 1819 * @return normalization status, TRUE for normalization to be done, FALSE 1820 * otherwise 1821 */ 1822 static 1823 inline UBool collPrevIterFCD(collIterate *data) 1824 { 1825 const UChar *src, *start; 1826 uint8_t leadingCC; 1827 uint8_t trailingCC = 0; 1828 uint16_t fcd; 1829 UBool result = FALSE; 1830 1831 start = data->string; 1832 src = data->pos + 1; 1833 1834 /* Get the trailing combining class of the current character. */ 1835 fcd = g_nfcImpl->previousFCD16(start, src); 1836 1837 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1838 1839 if (leadingCC != 0) { 1840 /* 1841 The current char has a non-zero leading combining class. 1842 Scan backward until we find a char with a trailing cc of zero. 1843 */ 1844 for (;;) 1845 { 1846 if (start == src) { 1847 data->fcdPosition = NULL; 1848 return result; 1849 } 1850 1851 fcd = g_nfcImpl->previousFCD16(start, src); 1852 1853 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1854 1855 if (trailingCC == 0) { 1856 break; 1857 } 1858 1859 if (leadingCC < trailingCC) { 1860 result = TRUE; 1861 } 1862 1863 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1864 } 1865 } 1866 1867 data->fcdPosition = (UChar *)src; 1868 1869 return result; 1870 } 1871 1872 /** gets a code unit from the string at a given offset 1873 * Handles both normal and iterative cases. 1874 * No error checking - caller beware! 
1875 */ 1876 static inline 1877 UChar peekCodeUnit(collIterate *source, int32_t offset) { 1878 if(source->pos != NULL) { 1879 return *(source->pos + offset); 1880 } else if(source->iterator != NULL) { 1881 UChar32 c; 1882 if(offset != 0) { 1883 source->iterator->move(source->iterator, offset, UITER_CURRENT); 1884 c = source->iterator->next(source->iterator); 1885 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); 1886 } else { 1887 c = source->iterator->current(source->iterator); 1888 } 1889 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0. 1890 } else { 1891 return 0xfffd; 1892 } 1893 } 1894 1895 // Code point version. Treats the offset as a _code point_ delta. 1896 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16. 1897 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer. 1898 static inline 1899 UChar32 peekCodePoint(collIterate *source, int32_t offset) { 1900 UChar32 c; 1901 if(source->pos != NULL) { 1902 const UChar *p = source->pos; 1903 if(offset >= 0) { 1904 // Skip forward over (offset-1) code points. 1905 while(--offset >= 0) { 1906 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) { 1907 ++p; 1908 } 1909 } 1910 // Read the code point there. 1911 c = *p++; 1912 UChar trail; 1913 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) { 1914 c = U16_GET_SUPPLEMENTARY(c, trail); 1915 } 1916 } else /* offset<0 */ { 1917 // Skip backward over (offset-1) code points. 1918 while(++offset < 0) { 1919 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) { 1920 --p; 1921 } 1922 } 1923 // Read the code point before that. 1924 c = *--p; 1925 UChar lead; 1926 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) { 1927 c = U16_GET_SUPPLEMENTARY(lead, c); 1928 } 1929 } 1930 } else if(source->iterator != NULL) { 1931 if(offset >= 0) { 1932 // Skip forward over (offset-1) code points. 
1933 int32_t fwd = offset; 1934 while(fwd-- > 0) { 1935 uiter_next32(source->iterator); 1936 } 1937 // Read the code point there. 1938 c = uiter_current32(source->iterator); 1939 // Return to the starting point, skipping backward over (offset-1) code points. 1940 while(offset-- > 0) { 1941 uiter_previous32(source->iterator); 1942 } 1943 } else /* offset<0 */ { 1944 // Read backward, reading offset code points, remember only the last-read one. 1945 int32_t back = offset; 1946 do { 1947 c = uiter_previous32(source->iterator); 1948 } while(++back < 0); 1949 // Return to the starting position, skipping forward over offset code points. 1950 do { 1951 uiter_next32(source->iterator); 1952 } while(++offset < 0); 1953 } 1954 } else { 1955 c = U_SENTINEL; 1956 } 1957 return c; 1958 } 1959 1960 /** 1961 * Determines if we are at the start of the data string in the backwards 1962 * collation iterator 1963 * @param data collation iterator 1964 * @return TRUE if we are at the start 1965 */ 1966 static 1967 inline UBool isAtStartPrevIterate(collIterate *data) { 1968 if(data->pos == NULL && data->iterator != NULL) { 1969 return !data->iterator->hasPrevious(data->iterator); 1970 } 1971 //return (collIter_bos(data)) || 1972 return (data->pos == data->string) || 1973 ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) && 1974 *(data->pos - 1) == 0 && data->fcdPosition == NULL); 1975 } 1976 1977 static 1978 inline void goBackOne(collIterate *data) { 1979 # if 0 1980 // somehow, it looks like we need to keep iterator synced up 1981 // at all times, as above. 1982 if(data->pos) { 1983 data->pos--; 1984 } 1985 if(data->iterator) { 1986 data->iterator->previous(data->iterator); 1987 } 1988 #endif 1989 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { 1990 data->iterator->previous(data->iterator); 1991 } 1992 if(data->pos) { 1993 data->pos --; 1994 } 1995 } 1996 1997 /** 1998 * Inline function that gets a simple CE. 
* So what it does is that it will first check the expansion buffer. If the
* expansion buffer is not empty, ie the end pointer to the expansion buffer
* is different from the string pointer, we return the collation element at the
* return pointer and decrement it.
* For more complicated CEs it resorts to getComplicatedCE.
* Returns UCOL_NO_MORE_CES once the start of the text has been reached.
* @param coll collator data
* @param data collation iterator struct
* @param status error status
*/
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /* Step the offset bookkeeping back by one CE: either consume one pending
       repeat, or move offsetReturn back (resetting it once it reaches the
       start of offsetBuffer). */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
            data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                data->offsetReturn = NULL;
                data->offsetStore = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* CEs are still buffered (in extendCEs if it exists, otherwise in CEs):
       hand out the one before toReturn. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        /* Buffer drained: reset the write position as well. */
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        /* The outer loop repeats while the fetched code unit is in the Hangul
           range and mapped to UCOL_IGNORABLE (see the loop condition). */
        do {
            /*
            Loop handles case when incremental normalize switches to or from the
            side buffer / original string, and we need to start again to get the
            next character.
            */
            for (;;) {
                if (data->flags & UCOL_ITER_HASLEN) {
                    /*
                    Normal path for strings when length is specified.
                    Not in side buffer because it is always null terminated.
                    */
                    if (data->pos <= data->string) {
                        /* End of the main source string */
                        return UCOL_NO_MORE_CES;
                    }
                    data->pos --;
                    ch = *data->pos;
                }
                // we are using an iterator to go back. Pray for us!
                else if (data->flags & UCOL_USE_ITERATOR) {
                    UChar32 iterCh = data->iterator->previous(data->iterator);
                    if(iterCh == U_SENTINEL) {
                        return UCOL_NO_MORE_CES;
                    } else {
                        ch = (UChar)iterCh;
                    }
                }
                else {
                    data->pos --;
                    ch = *data->pos;
                    /* we are in the side buffer. */
                    if (ch == 0) {
                        /*
                        At the start of the normalize side buffer.
                        Go back to string.
                        Because pointer points to the last accessed character,
                        hence we have to increment it by one here.
                        */
                        data->flags = data->origFlags;
                        data->offsetRepeatValue = 0;

                        if (data->fcdPosition == NULL) {
                            /* Nothing of the original string is left before
                               the normalized segment. */
                            data->pos = data->string;
                            return UCOL_NO_MORE_CES;
                        }
                        else {
                            data->pos = data->fcdPosition + 1;
                        }

                        continue;
                    }
                }

                /* Track whether the code unit just read lies in U+3040..U+309F. */
                if(data->flags&UCOL_HIRAGANA_Q) {
                    if(ch>=0x3040 && ch<=0x309f) {
                        data->flags |= UCOL_WAS_HIRAGANA;
                    } else {
                        data->flags &= ~UCOL_WAS_HIRAGANA;
                    }
                }

                /*
                * got a character to determine if there's fcd and/or normalization
                * stuff to do.
                * if the current character is not fcd.
                * if current character is at the start of the string
                * Trailing combining class == 0.
                * Note if pos is in the writablebuffer, norm is always 0
                */
                if (ch < ZERO_CC_LIMIT_ ||
                        // this should propel us out of the loop in the iterator case
                        (data->flags & UCOL_ITER_NORM) == 0 ||
                        (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                        || data->string == data->pos) {
                    break;
                }

                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    /* if next character is FCD */
                    if (data->pos == data->string) {
                        /* First char of string is always OK for FCD check */
                        break;
                    }

                    /* Not first char of string, do the FCD fast test */
                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                        break;
                    }
                }

                /* Need a more complete FCD check and possible normalization. */
                if (collPrevIterFCD(data)) {
                    collPrevIterNormalize(data);
                }

                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                    /* No normalization. Go ahead and process the char. */
                    break;
                }

                /*
                Some normalization happened.
                Next loop picks up a char from the normalization buffer.
                */
            }

            /* attempt to handle contractions, after removal of the backwards
            contraction
            */
            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
            } else {
                if (ch <= 0xFF) {
                    /* Code units up to 0xFF are looked up in latinOneMapping. */
                    result = coll->latinOneMapping[ch];
                }
                else {
                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
                    // **** [FA0E..FA2F] ?? ****
                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                            (ch >= 0x3400 && ch <= 0xD7AF)) {
                        if (ch > 0x9FFF && ch < 0xAC00) {
                            // between the two target ranges; do normal lookup
                            // **** this range is YI, Modifier tone letters, ****
                            // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                            // **** Latin-D might be tailored, so we need to ****
                            // **** do the normal lookup for these guys.     ****
                            result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                        } else {
                            result = UCOL_NOT_FOUND;
                        }
                    } else {
                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    }
                }
                /* Values above UCOL_NOT_FOUND are handed to the special-CE handler. */
                if (result > UCOL_NOT_FOUND) {
                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
                }
                if (result == UCOL_NOT_FOUND) { // Not found in master list
                    if (!isAtStartPrevIterate(data) &&
                            ucol_contractionEndCP(ch, data->coll))
                    {
                        result = UCOL_CONTRACTION;
                    } else {
                        /* Fall back to the UCA mapping when one is attached. */
                        if(coll->UCA) {
                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                        }
                    }

                    if (result > UCOL_NOT_FOUND) {
                        if(coll->UCA) {
                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                        }
                    }
                }
            }
        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

        /* Still unmapped after all lookups: generate an implicit CE. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}


/* ucol_getPrevCE, out-of-line version for use from other files. */
U_CFUNC uint32_t U_EXPORT2
ucol_getPrevCE(const UCollator *coll, collIterate *data,
               UErrorCode *status) {
    return ucol_IGetPrevCE(coll, data, status);
}


/* this should be connected to special Jamo handling */
/* Sets up a temporary collIterate over the single code unit u and returns the
   first CE produced for it by ucol_IGetNextCE; returns 0 if the iterator
   initialization reports a failure in *status. */
U_CFUNC uint32_t U_EXPORT2
ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
    collIterate colIt;
    IInit_collIterate(coll, &u, 1, &colIt, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    return ucol_IGetNextCE(coll, &colIt, status);
}

/**
 * Inserts the argument character into the end of the buffer pushing back the
 * null terminator.
2228 * @param data collIterate struct data 2229 * @param ch character to be appended 2230 * @return the position of the new addition 2231 */ 2232 static 2233 inline const UChar * insertBufferEnd(collIterate *data, UChar ch) 2234 { 2235 int32_t oldLength = data->writableBuffer.length(); 2236 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; 2237 } 2238 2239 /** 2240 * Inserts the argument string into the end of the buffer pushing back the 2241 * null terminator. 2242 * @param data collIterate struct data 2243 * @param string to be appended 2244 * @param length of the string to be appended 2245 * @return the position of the new addition 2246 */ 2247 static 2248 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length) 2249 { 2250 int32_t oldLength = data->writableBuffer.length(); 2251 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength; 2252 } 2253 2254 /** 2255 * Special normalization function for contraction in the forwards iterator. 2256 * This normalization sequence will place the current character at source->pos 2257 * and its following normalized sequence into the buffer. 2258 * The fcd position, pos will be changed. 2259 * pos will now point to positions in the buffer. 2260 * Flags will be changed accordingly. 
2261 * @param data collation iterator data 2262 */ 2263 static 2264 inline void normalizeNextContraction(collIterate *data) 2265 { 2266 int32_t strsize; 2267 UErrorCode status = U_ZERO_ERROR; 2268 /* because the pointer points to the next character */ 2269 const UChar *pStart = data->pos - 1; 2270 const UChar *pEnd; 2271 2272 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 2273 data->writableBuffer.setTo(*(pStart - 1)); 2274 strsize = 1; 2275 } 2276 else { 2277 strsize = data->writableBuffer.length(); 2278 } 2279 2280 pEnd = data->fcdPosition; 2281 2282 data->writableBuffer.append( 2283 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status)); 2284 if(U_FAILURE(status)) { 2285 return; 2286 } 2287 2288 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize; 2289 data->origFlags = data->flags; 2290 data->flags |= UCOL_ITER_INNORMBUF; 2291 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2292 } 2293 2294 /** 2295 * Contraction character management function that returns the next character 2296 * for the forwards iterator. 2297 * Does nothing if the next character is in buffer and not the first character 2298 * in it. 2299 * Else it checks next character in data string to see if it is normalizable. 2300 * If it is not, the character is simply copied into the buffer, else 2301 * the whole normalized substring is copied into the buffer, including the 2302 * current character. 2303 * @param data collation element iterator data 2304 * @return next character 2305 */ 2306 static 2307 inline UChar getNextNormalizedChar(collIterate *data) 2308 { 2309 UChar nextch; 2310 UChar ch; 2311 // Here we need to add the iterator code. One problem is the way 2312 // end of string is handled. If we just return next char, it could 2313 // be the sentinel. Most of the cases already check for this, but we 2314 // need to be sure. 2315 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { 2316 /* if no normalization and not in buffer. 
*/ 2317 if(data->flags & UCOL_USE_ITERATOR) { 2318 return (UChar)data->iterator->next(data->iterator); 2319 } else { 2320 return *(data->pos ++); 2321 } 2322 } 2323 2324 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { 2325 //normalizeIterator(data); 2326 //} 2327 2328 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); 2329 if ((innormbuf && *data->pos != 0) || 2330 (data->fcdPosition != NULL && !innormbuf && 2331 data->pos < data->fcdPosition)) { 2332 /* 2333 if next character is in normalized buffer, no further normalization 2334 is required 2335 */ 2336 return *(data->pos ++); 2337 } 2338 2339 if (data->flags & UCOL_ITER_HASLEN) { 2340 /* in data string */ 2341 if (data->pos + 1 == data->endp) { 2342 return *(data->pos ++); 2343 } 2344 } 2345 else { 2346 if (innormbuf) { 2347 // inside the normalization buffer, but at the end 2348 // (since we encountered zero). This means, in the 2349 // case we're using char iterator, that we need to 2350 // do another round of normalization. 2351 //if(data->origFlags & UCOL_USE_ITERATOR) { 2352 // we need to restore original flags, 2353 // otherwise, we'll lose them 2354 //data->flags = data->origFlags; 2355 //normalizeIterator(data); 2356 //return *(data->pos++); 2357 //} else { 2358 /* 2359 in writable buffer, at this point fcdPosition can not be 2360 pointing to the end of the data string. see contracting tag. 2361 */ 2362 if(data->fcdPosition) { 2363 if (*(data->fcdPosition + 1) == 0 || 2364 data->fcdPosition + 1 == data->endp) { 2365 /* at the end of the string, dump it into the normalizer */ 2366 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1; 2367 // Check if data->pos received a null pointer 2368 if (data->pos == NULL) { 2369 return (UChar)-1; // Return to indicate error. 2370 } 2371 return *(data->fcdPosition ++); 2372 } 2373 data->pos = data->fcdPosition; 2374 } else if(data->origFlags & UCOL_USE_ITERATOR) { 2375 // if we are here, we're using a normalizing iterator. 
2376 // we should just continue further. 2377 data->flags = data->origFlags; 2378 data->pos = NULL; 2379 return (UChar)data->iterator->next(data->iterator); 2380 } 2381 //} 2382 } 2383 else { 2384 if (*(data->pos + 1) == 0) { 2385 return *(data->pos ++); 2386 } 2387 } 2388 } 2389 2390 ch = *data->pos ++; 2391 nextch = *data->pos; 2392 2393 /* 2394 * if the current character is not fcd. 2395 * Trailing combining class == 0. 2396 */ 2397 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && 2398 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || 2399 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { 2400 /* 2401 Need a more complete FCD check and possible normalization. 2402 normalize substring will be appended to buffer 2403 */ 2404 if (collIterFCD(data)) { 2405 normalizeNextContraction(data); 2406 return *(data->pos ++); 2407 } 2408 else if (innormbuf) { 2409 /* fcdposition shifted even when there's no normalization, if we 2410 don't input the rest into this, we'll get the wrong position when 2411 we reach the end of the writableBuffer */ 2412 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1); 2413 data->pos = insertBufferEnd(data, data->pos - 1, length); 2414 // Check if data->pos received a null pointer 2415 if (data->pos == NULL) { 2416 return (UChar)-1; // Return to indicate error. 2417 } 2418 return *(data->pos ++); 2419 } 2420 } 2421 2422 if (innormbuf) { 2423 /* 2424 no normalization is to be done hence only one character will be 2425 appended to the buffer. 2426 */ 2427 data->pos = insertBufferEnd(data, ch) + 1; 2428 // Check if data->pos received a null pointer 2429 if (data->pos == NULL) { 2430 return (UChar)-1; // Return to indicate error. 
2431 } 2432 } 2433 2434 /* points back to the pos in string */ 2435 return ch; 2436 } 2437 2438 2439 2440 /** 2441 * Function to copy the buffer into writableBuffer and sets the fcd position to 2442 * the correct position 2443 * @param source data string source 2444 * @param buffer character buffer 2445 */ 2446 static 2447 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer) 2448 { 2449 /* okay confusing part here. to ensure that the skipped characters are 2450 considered later, we need to place it in the appropriate position in the 2451 normalization buffer and reassign the pos pointer. simple case if pos 2452 reside in string, simply copy to normalization buffer and 2453 fcdposition = pos, pos = start of normalization buffer. if pos in 2454 normalization buffer, we'll insert the copy infront of pos and point pos 2455 to the start of the normalization buffer. why am i doing these copies? 2456 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does 2457 not require any changes, which be really painful. */ 2458 if (source->flags & UCOL_ITER_INNORMBUF) { 2459 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer(); 2460 source->writableBuffer.replace(0, replaceLength, buffer); 2461 } 2462 else { 2463 source->fcdPosition = source->pos; 2464 source->origFlags = source->flags; 2465 source->flags |= UCOL_ITER_INNORMBUF; 2466 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 2467 source->writableBuffer = buffer; 2468 } 2469 2470 source->pos = source->writableBuffer.getTerminatedBuffer(); 2471 } 2472 2473 /** 2474 * Function to get the discontiguos collation element within the source. 2475 * Note this function will set the position to the appropriate places. 
* @param coll current collator used
* @param source data string source
* @param constart index to the start character in the contraction table
* @return discontiguos collation element offset
*/
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                          const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
    const UChar *temppos = source->pos;
    /* Collects the characters that were skipped over (did not take part in
       the contraction) so they can be re-read later. */
    UnicodeString buffer;
    const UChar *tempconstart = constart;
    uint8_t tempflags = source->flags;
    UBool multicontraction = FALSE;
    collIterateState discState;

    /* Saved so that a failed search can be undone at the end. */
    backupState(source, &discState);

    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar *UCharOffset;
        UChar schar,
              tchar;
        uint32_t result;

        /* Stop scanning at the end of the text, at a terminating NUL, or when
           the upcoming code point has combining class 0. */
        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0 &&
            //|| (*source->pos == 0 &&
                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
                 source->fcdPosition == NULL ||
                 source->fcdPosition == source->endp ||
                 *(source->fcdPosition) == 0 ||
                 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
                 /* end of string in null terminated string or stopped by a
                 null character, note fcd does not always point to a base
                 character after the discontiguos change */
                 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
                 //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* A partial multi-contraction was matched earlier: return its
                   CE and queue the skipped characters. */
                source->pos = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                         (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* Linear search of the table entries; relies on them being sorted and
           the list being terminated by a value >= schar. */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguos buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* A match is rejected when the code point two back has the same
               combining class as schar; schar is then treated as skipped. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                       (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
            break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos = source->pos + 1;
            }
        } else {
            /* Complete match: queue the skipped characters and return the CE. */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}

/* now uses Mark's getImplicitPrimary code */
/* Builds the implicit CE pair for cp: the second CE (low 16 bits of the
   implicit primary shifted into the primary field, with 0xC0 in the low byte)
   is stored at CEpos for a later call, offsetRepeatCount is incremented, and
   the first CE (primary bits of r with 0x0505) is returned. */
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
    uint32_t r = uprv_uca_getImplicitPrimary(cp);
    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
    collationSource->offsetRepeatCount += 1;
    return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
}

/**
 * Inserts the argument character into the front of the buffer replacing the
 * front null terminator.
 * A new NUL is inserted in front of the character, and data->pos is set to
 * the position just after the inserted character.
 * @param data collation element iterator data
 * @param ch character to be appended
 */
static
inline void insertBufferFront(collIterate *data, UChar ch)
{
    data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
}

/**
 * Special normalization function for contraction in the previous iterator.
 * This normalization sequence will place the current character at source->pos
 * and its following normalized sequence into the buffer.
 * The fcd position, pos will be changed.
 * pos will now point to positions in the buffer.
 * Flags will be changed accordingly.
2609 * @param data collation iterator data 2610 */ 2611 static 2612 inline void normalizePrevContraction(collIterate *data, UErrorCode *status) 2613 { 2614 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ 2615 const UChar *pStart; 2616 2617 UnicodeString endOfBuffer; 2618 if (data->flags & UCOL_ITER_HASLEN) { 2619 /* 2620 normalization buffer not used yet, we'll pull down the next 2621 character into the end of the buffer 2622 */ 2623 endOfBuffer.setTo(*pEnd); 2624 } 2625 else { 2626 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL 2627 } 2628 2629 if (data->fcdPosition == NULL) { 2630 pStart = data->string; 2631 } 2632 else { 2633 pStart = data->fcdPosition + 1; 2634 } 2635 int32_t normLen = 2636 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), 2637 data->writableBuffer, 2638 *status). 2639 length(); 2640 if(U_FAILURE(*status)) { 2641 return; 2642 } 2643 /* 2644 this puts the null termination infront of the normalized string instead 2645 of the end 2646 */ 2647 data->pos = 2648 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() + 2649 1 + normLen; 2650 data->origFlags = data->flags; 2651 data->flags |= UCOL_ITER_INNORMBUF; 2652 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2653 } 2654 2655 /** 2656 * Contraction character management function that returns the previous character 2657 * for the backwards iterator. 2658 * Does nothing if the previous character is in buffer and not the first 2659 * character in it. 2660 * Else it checks previous character in data string to see if it is 2661 * normalizable. 2662 * If it is not, the character is simply copied into the buffer, else 2663 * the whole normalized substring is copied into the buffer, including the 2664 * current character. 
* @param data collation element iterator data
* @param status error code, passed on to normalizePrevContraction
* @return previous character
*/
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
    UChar  prevch;
    UChar  ch;
    const UChar *start;
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
        (innormbuf && *(data->pos - 1) != 0)) {
        /*
        if no normalization.
        if previous character is in normalized buffer, no further normalization
        is required
        */
        if(data->flags & UCOL_USE_ITERATOR) {
            /* Step back one unit and read it again with next(), which leaves
               the iterator where it started. */
            data->iterator->move(data->iterator, -1, UITER_CURRENT);
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos - 1);
        }
    }

    start = data->pos;
    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
        /* in data string */
        if ((start - 1) == data->string) {
            /* The previous character is the first one of the string. */
            return *(start - 1);
        }
        start --;
        ch     = *start;
        prevch = *(start - 1);
    }
    else {
        /*
        in writable buffer, at this point fcdPosition can not be NULL.
        see contracting tag.
        */
        if (data->fcdPosition == data->string) {
            /* at the start of the string, just dump it into the normalizer */
            insertBufferFront(data, *(data->fcdPosition));
            data->fcdPosition = NULL;
            return *(data->pos - 1);
        }
        start  = data->fcdPosition;
        ch     = *start;
        prevch = *(start - 1);
    }
    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if (data->fcdPosition > start &&
       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
    {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        /* pos is moved to start only for the duration of the FCD check and is
           restored if no normalization is needed. */
        const UChar *backuppos = data->pos;
        data->pos = start;
        if (collPrevIterFCD(data)) {
            normalizePrevContraction(data, status);
            return *(data->pos - 1);
        }
        data->pos = backuppos;
        data->fcdPosition ++;
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        insertBufferFront(data, ch);
        data->fcdPosition --;
    }

    return ch;
}

/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
/* It is called by getNextCE */

/* The following should be even */
#define UCOL_MAX_DIGITS_FOR_NUMBER 254

uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
    collIterateState entryState;
    backupState(source, &entryState);
    UChar32 cp = ch;

    for (;;) {
        // This loop will repeat only in the case of contractions, and only when a contraction
        //   is found and the first CE resulting from that contraction is itself a special
        //   (an expansion, for example.)  All other special CE types are fully handled the
        //   first time through, and the loop exits.

        const uint32_t *CEOffset = NULL;
        switch(getCETag(CE)) {
        case NOT_FOUND_TAG:
            /* This one is not found, and we'll let somebody else bother about it... no more games */
            return CE;
        case SPEC_PROC_TAG:
            {
                // Special processing is getting a CE that is preceded by a certain prefix
                // Currently this is only needed for optimizing Japanese length and iteration marks.
                // When we encouter a special processing tag, we go backwards and try to see if
                // we have a match.
                // Contraction tables are used - so the whole process is not unlike contraction.
                // prefix data is stored backwards in the table.
2778 const UChar *UCharOffset; 2779 UChar schar, tchar; 2780 collIterateState prefixState; 2781 backupState(source, &prefixState); 2782 loadState(source, &entryState, TRUE); 2783 goBackOne(source); // We want to look at the point where we entered - actually one 2784 // before that... 2785 2786 for(;;) { 2787 // This loop will run once per source string character, for as long as we 2788 // are matching a potential contraction sequence 2789 2790 // First we position ourselves at the begining of contraction sequence 2791 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2792 if (collIter_bos(source)) { 2793 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2794 break; 2795 } 2796 schar = getPrevNormalizedChar(source, status); 2797 goBackOne(source); 2798 2799 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2800 UCharOffset++; 2801 } 2802 2803 if (schar == tchar) { 2804 // Found the source string char in the table. 2805 // Pick up the corresponding CE from the table. 2806 CE = *(coll->contractionCEs + 2807 (UCharOffset - coll->contractionIndex)); 2808 } 2809 else 2810 { 2811 // Source string char was not in the table. 2812 // We have not found the prefix. 2813 CE = *(coll->contractionCEs + 2814 (ContractionStart - coll->contractionIndex)); 2815 } 2816 2817 if(!isPrefix(CE)) { 2818 // The source string char was in the contraction table, and the corresponding 2819 // CE is not a prefix CE. We found the prefix, break 2820 // out of loop, this CE will end up being returned. This is the normal 2821 // way out of prefix handling when the source actually contained 2822 // the prefix. 
2823 break; 2824 } 2825 } 2826 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue 2827 loadState(source, &prefixState, TRUE); 2828 if(source->origFlags & UCOL_USE_ITERATOR) { 2829 source->flags = source->origFlags; 2830 } 2831 } else { // prefix search was a failure, we have to backup all the way to the start 2832 loadState(source, &entryState, TRUE); 2833 } 2834 break; 2835 } 2836 case CONTRACTION_TAG: 2837 { 2838 /* This should handle contractions */ 2839 collIterateState state; 2840 backupState(source, &state); 2841 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; 2842 const UChar *UCharOffset; 2843 UChar schar, tchar; 2844 2845 for (;;) { 2846 /* This loop will run once per source string character, for as long as we */ 2847 /* are matching a potential contraction sequence */ 2848 2849 /* First we position ourselves at the begining of contraction sequence */ 2850 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2851 2852 if (collIter_eos(source)) { 2853 // Ran off the end of the source string. 2854 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2855 // So we'll pick whatever we have at the point... 2856 if (CE == UCOL_NOT_FOUND) { 2857 // back up the source over all the chars we scanned going into this contraction. 
2858 CE = firstCE; 2859 loadState(source, &state, TRUE); 2860 if(source->origFlags & UCOL_USE_ITERATOR) { 2861 source->flags = source->origFlags; 2862 } 2863 } 2864 break; 2865 } 2866 2867 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ 2868 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); 2869 2870 schar = getNextNormalizedChar(source); 2871 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2872 UCharOffset++; 2873 } 2874 2875 if (schar == tchar) { 2876 // Found the source string char in the contraction table. 2877 // Pick up the corresponding CE from the table. 2878 CE = *(coll->contractionCEs + 2879 (UCharOffset - coll->contractionIndex)); 2880 } 2881 else 2882 { 2883 // Source string char was not in contraction table. 2884 // Unless we have a discontiguous contraction, we have finished 2885 // with this contraction. 2886 // in order to do the proper detection, we 2887 // need to see if we're dealing with a supplementary 2888 /* We test whether the next two char are surrogate pairs. 2889 * This test is done if the iterator is not NULL. 2890 * If there is no surrogate pair, the iterator 2891 * goes back one if needed. 
*/ 2892 UChar32 miss = schar; 2893 if (source->iterator) { 2894 UChar32 surrNextChar; /* the next char in the iteration to test */ 2895 int32_t prevPos; /* holds the previous position before move forward of the source iterator */ 2896 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { 2897 prevPos = source->iterator->index; 2898 surrNextChar = getNextNormalizedChar(source); 2899 if (U16_IS_TRAIL(surrNextChar)) { 2900 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); 2901 } else if (prevPos < source->iterator->index){ 2902 goBackOne(source); 2903 } 2904 } 2905 } else if (U16_IS_LEAD(schar)) { 2906 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); 2907 } 2908 2909 uint8_t sCC; 2910 if (miss < 0x300 || 2911 maxCC == 0 || 2912 (sCC = i_getCombiningClass(miss, coll)) == 0 || 2913 sCC>maxCC || 2914 (allSame != 0 && sCC == maxCC) || 2915 collIter_eos(source)) 2916 { 2917 // Contraction can not be discontiguous. 2918 goBackOne(source); // back up the source string by one, 2919 // because the character we just looked at was 2920 // not part of the contraction. */ 2921 if(U_IS_SUPPLEMENTARY(miss)) { 2922 goBackOne(source); 2923 } 2924 CE = *(coll->contractionCEs + 2925 (ContractionStart - coll->contractionIndex)); 2926 } else { 2927 // 2928 // Contraction is possibly discontiguous. 
2929 // Scan more of source string looking for a match 2930 // 2931 UChar tempchar; 2932 /* find the next character if schar is not a base character 2933 and we are not yet at the end of the string */ 2934 tempchar = getNextNormalizedChar(source); 2935 // probably need another supplementary thingie here 2936 goBackOne(source); 2937 if (i_getCombiningClass(tempchar, coll) == 0) { 2938 goBackOne(source); 2939 if(U_IS_SUPPLEMENTARY(miss)) { 2940 goBackOne(source); 2941 } 2942 /* Spit out the last char of the string, wasn't tasty enough */ 2943 CE = *(coll->contractionCEs + 2944 (ContractionStart - coll->contractionIndex)); 2945 } else { 2946 CE = getDiscontiguous(coll, source, ContractionStart); 2947 } 2948 } 2949 } // else after if(schar == tchar) 2950 2951 if(CE == UCOL_NOT_FOUND) { 2952 /* The Source string did not match the contraction that we were checking. */ 2953 /* Back up the source position to undo the effects of having partially */ 2954 /* scanned through what ultimately proved to not be a contraction. */ 2955 loadState(source, &state, TRUE); 2956 CE = firstCE; 2957 break; 2958 } 2959 2960 if(!isContraction(CE)) { 2961 // The source string char was in the contraction table, and the corresponding 2962 // CE is not a contraction CE. We completed the contraction, break 2963 // out of loop, this CE will end up being returned. This is the normal 2964 // way out of contraction handling when the source actually contained 2965 // the contraction. 2966 break; 2967 } 2968 2969 2970 // The source string char was in the contraction table, and the corresponding 2971 // CE is IS a contraction CE. We will continue looping to check the source 2972 // string for the remaining chars in the contraction. 2973 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); 2974 if(tempCE != UCOL_NOT_FOUND) { 2975 // We have scanned a a section of source string for which there is a 2976 // CE from the contraction table. 
Remember the CE and scan position, so 2977 // that we can return to this point if further scanning fails to 2978 // match a longer contraction sequence. 2979 firstCE = tempCE; 2980 2981 goBackOne(source); 2982 backupState(source, &state); 2983 getNextNormalizedChar(source); 2984 2985 // Another way to do this is: 2986 //collIterateState tempState; 2987 //backupState(source, &tempState); 2988 //goBackOne(source); 2989 //backupState(source, &state); 2990 //loadState(source, &tempState, TRUE); 2991 2992 // The problem is that for incomplete contractions we have to remember the previous 2993 // position. Before, the only thing I needed to do was state.pos--; 2994 // After iterator introduction and especially after introduction of normalizing 2995 // iterators, it became much more difficult to decrease the saved state. 2996 // I'm not yet sure which of the two methods above is faster. 2997 } 2998 } // for(;;) 2999 break; 3000 } // case CONTRACTION_TAG: 3001 case LONG_PRIMARY_TAG: 3002 { 3003 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 3004 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 3005 source->offsetRepeatCount += 1; 3006 return CE; 3007 } 3008 case EXPANSION_TAG: 3009 { 3010 /* This should handle expansion. */ 3011 /* NOTE: we can encounter both continuations and expansions in an expansion! 
*/ 3012 /* I have to decide where continuations are going to be dealt with */ 3013 uint32_t size; 3014 uint32_t i; /* general counter */ 3015 3016 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 3017 size = getExpansionCount(CE); 3018 CE = *CEOffset++; 3019 //source->offsetRepeatCount = -1; 3020 3021 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 3022 for(i = 1; i<size; i++) { 3023 *(source->CEpos++) = *CEOffset++; 3024 source->offsetRepeatCount += 1; 3025 } 3026 } else { /* else, we do */ 3027 while(*CEOffset != 0) { 3028 *(source->CEpos++) = *CEOffset++; 3029 source->offsetRepeatCount += 1; 3030 } 3031 } 3032 3033 return CE; 3034 } 3035 case DIGIT_TAG: 3036 { 3037 /* 3038 We do a check to see if we want to collate digits as numbers; if so we generate 3039 a custom collation key. Otherwise we pull out the value stored in the expansion table. 3040 */ 3041 //uint32_t size; 3042 uint32_t i; /* general counter */ 3043 3044 if (source->coll->numericCollation == UCOL_ON){ 3045 collIterateState digitState = {0,0,0,0,0,0,0,0,0}; 3046 UChar32 char32 = 0; 3047 int32_t digVal = 0; 3048 3049 uint32_t digIndx = 0; 3050 uint32_t endIndex = 0; 3051 uint32_t trailingZeroIndex = 0; 3052 3053 uint8_t collateVal = 0; 3054 3055 UBool nonZeroValReached = FALSE; 3056 3057 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. 3058 /* 3059 We parse the source string until we hit a char that's NOT a digit. 3060 Use this u_charDigitValue. This might be slow because we have to 3061 handle surrogates... 
3062 */ 3063 /* 3064 if (U16_IS_LEAD(ch)){ 3065 if (!collIter_eos(source)) { 3066 backupState(source, &digitState); 3067 UChar trail = getNextNormalizedChar(source); 3068 if(U16_IS_TRAIL(trail)) { 3069 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 3070 } else { 3071 loadState(source, &digitState, TRUE); 3072 char32 = ch; 3073 } 3074 } else { 3075 char32 = ch; 3076 } 3077 } else { 3078 char32 = ch; 3079 } 3080 digVal = u_charDigitValue(char32); 3081 */ 3082 digVal = u_charDigitValue(cp); // if we have arrived here, we have 3083 // already processed possible supplementaries that trigered the digit tag - 3084 // all supplementaries are marked in the UCA. 3085 /* 3086 We pad a zero in front of the first element anyways. This takes 3087 care of the (probably) most common case where people are sorting things followed 3088 by a single digit 3089 */ 3090 digIndx++; 3091 for(;;){ 3092 // Make sure we have enough space. No longer needed; 3093 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER 3094 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough 3095 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). 3096 3097 // Skipping over leading zeroes. 3098 if (digVal != 0) { 3099 nonZeroValReached = TRUE; 3100 } 3101 if (nonZeroValReached) { 3102 /* 3103 We parse the digit string into base 100 numbers (this fits into a byte). 3104 We only add to the buffer in twos, thus if we are parsing an odd character, 3105 that serves as the 'tens' digit while the if we are parsing an even one, that 3106 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3107 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3108 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3109 than all the other bytes. 
3110 */ 3111 3112 if (digIndx % 2 == 1){ 3113 collateVal += (uint8_t)digVal; 3114 3115 // We don't enter the low-order-digit case unless we've already seen 3116 // the high order, or for the first digit, which is always non-zero. 3117 if (collateVal != 0) 3118 trailingZeroIndex = 0; 3119 3120 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3121 collateVal = 0; 3122 } 3123 else{ 3124 // We drop the collation value into the buffer so if we need to do 3125 // a "front patch" we don't have to check to see if we're hitting the 3126 // last element. 3127 collateVal = (uint8_t)(digVal * 10); 3128 3129 // Check for trailing zeroes. 3130 if (collateVal == 0) 3131 { 3132 if (!trailingZeroIndex) 3133 trailingZeroIndex = (digIndx/2) + 2; 3134 } 3135 else 3136 trailingZeroIndex = 0; 3137 3138 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3139 } 3140 digIndx++; 3141 } 3142 3143 // Get next character. 3144 if (!collIter_eos(source)){ 3145 ch = getNextNormalizedChar(source); 3146 if (U16_IS_LEAD(ch)){ 3147 if (!collIter_eos(source)) { 3148 backupState(source, &digitState); 3149 UChar trail = getNextNormalizedChar(source); 3150 if(U16_IS_TRAIL(trail)) { 3151 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 3152 } else { 3153 loadState(source, &digitState, TRUE); 3154 char32 = ch; 3155 } 3156 } 3157 } else { 3158 char32 = ch; 3159 } 3160 3161 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){ 3162 // Resetting position to point to the next unprocessed char. We 3163 // overshot it when doing our test/set for numbers. 3164 if (char32 > 0xFFFF) { // For surrogates. 3165 loadState(source, &digitState, TRUE); 3166 //goBackOne(source); 3167 } 3168 goBackOne(source); 3169 break; 3170 } 3171 } else { 3172 break; 3173 } 3174 } 3175 3176 if (nonZeroValReached == FALSE){ 3177 digIndx = 2; 3178 numTempBuf[2] = 6; 3179 } 3180 3181 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; 3182 if (digIndx % 2 != 0){ 3183 /* 3184 We missed a value. 
Since digIndx isn't even, stuck too many values into the buffer (this is what 3185 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. 3186 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a 3187 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. 3188 */ 3189 3190 for(i = 2; i < endIndex; i++){ 3191 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + 3192 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; 3193 } 3194 --digIndx; 3195 } 3196 3197 // Subtract one off of the last byte. 3198 numTempBuf[endIndex-1] -= 1; 3199 3200 /* 3201 We want to skip over the first two slots in the buffer. The first slot 3202 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3203 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 3204 */ 3205 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3206 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); 3207 3208 // Now transfer the collation key to our collIterate struct. 3209 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. 3210 //size = ((endIndex+1) & ~1)/2; 3211 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3212 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3213 UCOL_BYTE_COMMON; // Tertiary weight. 3214 i = 2; // Reset the index into the buffer. 
3215 while(i < endIndex) 3216 { 3217 uint32_t primWeight = numTempBuf[i++] << 8; 3218 if ( i < endIndex) 3219 primWeight |= numTempBuf[i++]; 3220 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3221 } 3222 3223 } else { 3224 // no numeric mode, we'll just switch to whatever we stashed and continue 3225 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 3226 CE = *CEOffset++; 3227 break; 3228 } 3229 return CE; 3230 } 3231 /* various implicits optimization */ 3232 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 3233 /* UCA is filled with these. Tailorings are NOT_FOUND */ 3234 return getImplicit(cp, source); 3235 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 3236 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit 3237 return getImplicit(cp, source); 3238 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3239 { 3240 static const uint32_t 3241 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3242 //const uint32_t LCount = 19; 3243 static const uint32_t VCount = 21; 3244 static const uint32_t TCount = 28; 3245 //const uint32_t NCount = VCount * TCount; // 588 3246 //const uint32_t SCount = LCount * NCount; // 11172 3247 uint32_t L = ch - SBase; 3248 3249 // divide into pieces 3250 3251 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation 3252 L /= TCount; 3253 uint32_t V = L % VCount; 3254 L /= VCount; 3255 3256 // offset them 3257 3258 L += LBase; 3259 V += VBase; 3260 T += TBase; 3261 3262 // return the first CE, but first put the rest into the expansion buffer 3263 if (!source->coll->image->jamoSpecial) { // FAST PATH 3264 3265 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3266 if (T != TBase) { 3267 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3268 } 3269 3270 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3271 3272 } 
else { // Jamo is Special 3273 // Since Hanguls pass the FCD check, it is 3274 // guaranteed that we won't be in 3275 // the normalization buffer if something like this happens 3276 3277 // However, if we are using a uchar iterator and normalization 3278 // is ON, the Hangul that lead us here is going to be in that 3279 // normalization buffer. Here we want to restore the uchar 3280 // iterator state and pull out of the normalization buffer 3281 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { 3282 source->flags = source->origFlags; // restore the iterator 3283 source->pos = NULL; 3284 } 3285 3286 // Move Jamos into normalization buffer 3287 UChar *buffer = source->writableBuffer.getBuffer(4); 3288 int32_t bufferLength; 3289 buffer[0] = (UChar)L; 3290 buffer[1] = (UChar)V; 3291 if (T != TBase) { 3292 buffer[2] = (UChar)T; 3293 bufferLength = 3; 3294 } else { 3295 bufferLength = 2; 3296 } 3297 source->writableBuffer.releaseBuffer(bufferLength); 3298 3299 // Indicate where to continue in main input string after exhausting the writableBuffer 3300 source->fcdPosition = source->pos; 3301 3302 source->pos = source->writableBuffer.getTerminatedBuffer(); 3303 source->origFlags = source->flags; 3304 source->flags |= UCOL_ITER_INNORMBUF; 3305 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 3306 3307 return(UCOL_IGNORABLE); 3308 } 3309 } 3310 case SURROGATE_TAG: 3311 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ 3312 /* two things can happen here: next code point can be a trailing surrogate - we will use it */ 3313 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ 3314 /* we treat it like an unassigned code point. 
*/ 3315 { 3316 UChar trail; 3317 collIterateState state; 3318 backupState(source, &state); 3319 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { 3320 // we chould have stepped one char forward and it might have turned that it 3321 // was not a trail surrogate. In that case, we have to backup. 3322 loadState(source, &state, TRUE); 3323 return UCOL_NOT_FOUND; 3324 } else { 3325 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ 3326 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail); 3327 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. 3328 // We need to backup 3329 loadState(source, &state, TRUE); 3330 return CE; 3331 } 3332 // calculate the supplementary code point value, if surrogate was not tailored 3333 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 3334 } 3335 } 3336 break; 3337 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 3338 UChar nextChar; 3339 if( source->flags & UCOL_USE_ITERATOR) { 3340 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { 3341 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3342 source->iterator->next(source->iterator); 3343 return getImplicit(cp, source); 3344 } 3345 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) && 3346 U_IS_TRAIL((nextChar=*source->pos))) { 3347 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3348 source->pos++; 3349 return getImplicit(cp, source); 3350 } 3351 return UCOL_NOT_FOUND; 3352 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 3353 return UCOL_NOT_FOUND; /* broken surrogate sequence */ 3354 case CHARSET_TAG: 3355 /* not yet implemented */ 3356 /* probably after 1.8 */ 3357 return UCOL_NOT_FOUND; 3358 default: 3359 *status = U_INTERNAL_PROGRAM_ERROR; 3360 CE=0; 3361 break; 3362 } 3363 if (CE <= UCOL_NOT_FOUND) break; 3364 } 3365 return CE; 3366 } 3367 3368 3369 /* now uses Mark's 
getImplicitPrimary code */ 3370 static 3371 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { 3372 uint32_t r = uprv_uca_getImplicitPrimary(cp); 3373 3374 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; 3375 collationSource->toReturn = collationSource->CEpos; 3376 3377 // **** doesn't work if using iterator **** 3378 if (collationSource->flags & UCOL_ITER_INNORMBUF) { 3379 collationSource->offsetRepeatCount = 1; 3380 } else { 3381 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string); 3382 3383 UErrorCode errorCode = U_ZERO_ERROR; 3384 collationSource->appendOffset(firstOffset, errorCode); 3385 collationSource->appendOffset(firstOffset + 1, errorCode); 3386 3387 collationSource->offsetReturn = collationSource->offsetStore - 1; 3388 *(collationSource->offsetBuffer) = firstOffset; 3389 if (collationSource->offsetReturn == collationSource->offsetBuffer) { 3390 collationSource->offsetStore = collationSource->offsetBuffer; 3391 } 3392 } 3393 3394 return ((r & 0x0000FFFF)<<16) | 0x000000C0; 3395 } 3396 3397 /** 3398 * This function handles the special CEs like contractions, expansions, 3399 * surrogates, Thai. 
3400 * It is called by both getPrevCE 3401 */ 3402 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, 3403 collIterate *source, 3404 UErrorCode *status) 3405 { 3406 const uint32_t *CEOffset = NULL; 3407 UChar *UCharOffset = NULL; 3408 UChar schar; 3409 const UChar *constart = NULL; 3410 uint32_t size; 3411 UChar buffer[UCOL_MAX_BUFFER]; 3412 uint32_t *endCEBuffer; 3413 UChar *strbuffer; 3414 int32_t noChars = 0; 3415 int32_t CECount = 0; 3416 3417 for(;;) 3418 { 3419 /* the only ces that loops are thai and contractions */ 3420 switch (getCETag(CE)) 3421 { 3422 case NOT_FOUND_TAG: /* this tag always returns */ 3423 return CE; 3424 3425 case SPEC_PROC_TAG: 3426 { 3427 // Special processing is getting a CE that is preceded by a certain prefix 3428 // Currently this is only needed for optimizing Japanese length and iteration marks. 3429 // When we encouter a special processing tag, we go backwards and try to see if 3430 // we have a match. 3431 // Contraction tables are used - so the whole process is not unlike contraction. 3432 // prefix data is stored backwards in the table. 
3433 const UChar *UCharOffset; 3434 UChar schar, tchar; 3435 collIterateState prefixState; 3436 backupState(source, &prefixState); 3437 for(;;) { 3438 // This loop will run once per source string character, for as long as we 3439 // are matching a potential contraction sequence 3440 3441 // First we position ourselves at the begining of contraction sequence 3442 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 3443 3444 if (collIter_bos(source)) { 3445 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 3446 break; 3447 } 3448 schar = getPrevNormalizedChar(source, status); 3449 goBackOne(source); 3450 3451 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 3452 UCharOffset++; 3453 } 3454 3455 if (schar == tchar) { 3456 // Found the source string char in the table. 3457 // Pick up the corresponding CE from the table. 3458 CE = *(coll->contractionCEs + 3459 (UCharOffset - coll->contractionIndex)); 3460 } 3461 else 3462 { 3463 // if there is a completely ignorable code point in the middle of 3464 // a prefix, we need to act as if it's not there 3465 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) 3466 // lone surrogates cannot be set to zero as it would break other processing 3467 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 3468 // it's easy for BMP code points 3469 if(isZeroCE == 0) { 3470 continue; 3471 } else if(U16_IS_SURROGATE(schar)) { 3472 // for supplementary code points, we have to check the next one 3473 // situations where we are going to ignore 3474 // 1. beginning of the string: schar is a lone surrogate 3475 // 2. schar is a lone surrogate 3476 // 3. schar is a trail surrogate in a valid surrogate sequence 3477 // that is explicitly set to zero. 
3478 if (!collIter_bos(source)) { 3479 UChar lead; 3480 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) { 3481 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead); 3482 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) { 3483 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); 3484 if(finalCE == 0) { 3485 // this is a real, assigned completely ignorable code point 3486 goBackOne(source); 3487 continue; 3488 } 3489 } 3490 } else { 3491 // lone surrogate, treat like unassigned 3492 return UCOL_NOT_FOUND; 3493 } 3494 } else { 3495 // lone surrogate at the beggining, treat like unassigned 3496 return UCOL_NOT_FOUND; 3497 } 3498 } 3499 // Source string char was not in the table. 3500 // We have not found the prefix. 3501 CE = *(coll->contractionCEs + 3502 (ContractionStart - coll->contractionIndex)); 3503 } 3504 3505 if(!isPrefix(CE)) { 3506 // The source string char was in the contraction table, and the corresponding 3507 // CE is not a prefix CE. We found the prefix, break 3508 // out of loop, this CE will end up being returned. This is the normal 3509 // way out of prefix handling when the source actually contained 3510 // the prefix. 3511 break; 3512 } 3513 } 3514 loadState(source, &prefixState, TRUE); 3515 break; 3516 } 3517 3518 case CONTRACTION_TAG: { 3519 /* to ensure that the backwards and forwards iteration matches, we 3520 take the current region of most possible match and pass it through 3521 the forward iteration. this will ensure that the obstinate problem of 3522 overlapping contractions will not occur. 
3523 */ 3524 schar = peekCodeUnit(source, 0); 3525 constart = (UChar *)coll->image + getContractOffset(CE); 3526 if (isAtStartPrevIterate(source) 3527 /* commented away contraction end checks after adding the checks 3528 in getPrevCE */) { 3529 /* start of string or this is not the end of any contraction */ 3530 CE = *(coll->contractionCEs + 3531 (constart - coll->contractionIndex)); 3532 break; 3533 } 3534 strbuffer = buffer; 3535 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); 3536 *(UCharOffset --) = 0; 3537 noChars = 0; 3538 // have to swap thai characters 3539 while (ucol_unsafeCP(schar, coll)) { 3540 *(UCharOffset) = schar; 3541 noChars++; 3542 UCharOffset --; 3543 schar = getPrevNormalizedChar(source, status); 3544 goBackOne(source); 3545 // TODO: when we exhaust the contraction buffer, 3546 // it needs to get reallocated. The problem is 3547 // that the size depends on the string which is 3548 // not iterated over. However, since we're travelling 3549 // backwards, we already had to set the iterator at 3550 // the end - so we might as well know where we are? 
3551 if (UCharOffset + 1 == buffer) { 3552 /* we have exhausted the buffer */ 3553 int32_t newsize = 0; 3554 if(source->pos) { // actually dealing with a position 3555 newsize = (int32_t)(source->pos - source->string + 1); 3556 } else { // iterator 3557 newsize = 4 * UCOL_MAX_BUFFER; 3558 } 3559 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * 3560 (newsize + UCOL_MAX_BUFFER)); 3561 /* test for NULL */ 3562 if (strbuffer == NULL) { 3563 *status = U_MEMORY_ALLOCATION_ERROR; 3564 return UCOL_NO_MORE_CES; 3565 } 3566 UCharOffset = strbuffer + newsize; 3567 uprv_memcpy(UCharOffset, buffer, 3568 UCOL_MAX_BUFFER * sizeof(UChar)); 3569 UCharOffset --; 3570 } 3571 if ((source->pos && (source->pos == source->string || 3572 ((source->flags & UCOL_ITER_INNORMBUF) && 3573 *(source->pos - 1) == 0 && source->fcdPosition == NULL))) 3574 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { 3575 break; 3576 } 3577 } 3578 /* adds the initial base character to the string */ 3579 *(UCharOffset) = schar; 3580 noChars++; 3581 3582 int32_t offsetBias; 3583 3584 // **** doesn't work if using iterator **** 3585 if (source->flags & UCOL_ITER_INNORMBUF) { 3586 offsetBias = -1; 3587 } else { 3588 offsetBias = (int32_t)(source->pos - source->string); 3589 } 3590 3591 /* a new collIterate is used to simplify things, since using the current 3592 collIterate will mean that the forward and backwards iteration will 3593 share and change the same buffers. we don't want to get into that. */ 3594 collIterate temp; 3595 int32_t rawOffset; 3596 3597 IInit_collIterate(coll, UCharOffset, noChars, &temp, status); 3598 if(U_FAILURE(*status)) { 3599 return (uint32_t)UCOL_NULLORDER; 3600 } 3601 temp.flags &= ~UCOL_ITER_NORM; 3602 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; 3603 3604 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero? 
3605 CE = ucol_IGetNextCE(coll, &temp, status); 3606 3607 if (source->extendCEs) { 3608 endCEBuffer = source->extendCEs + source->extendCEsSize; 3609 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t)); 3610 } else { 3611 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; 3612 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t)); 3613 } 3614 3615 while (CE != UCOL_NO_MORE_CES) { 3616 *(source->CEpos ++) = CE; 3617 3618 if (offsetBias >= 0) { 3619 source->appendOffset(rawOffset + offsetBias, *status); 3620 } 3621 3622 CECount++; 3623 if (source->CEpos == endCEBuffer) { 3624 /* ran out of CE space, reallocate to new buffer. 3625 If reallocation fails, reset pointers and bail out, 3626 there's no guarantee of the right character position after 3627 this bail*/ 3628 if (!increaseCEsCapacity(source)) { 3629 *status = U_MEMORY_ALLOCATION_ERROR; 3630 break; 3631 } 3632 3633 endCEBuffer = source->extendCEs + source->extendCEsSize; 3634 } 3635 3636 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { 3637 rawOffset = (int32_t)(temp.fcdPosition - temp.string); 3638 } else { 3639 rawOffset = (int32_t)(temp.pos - temp.string); 3640 } 3641 3642 CE = ucol_IGetNextCE(coll, &temp, status); 3643 } 3644 3645 if (strbuffer != buffer) { 3646 uprv_free(strbuffer); 3647 } 3648 if (U_FAILURE(*status)) { 3649 return (uint32_t)UCOL_NULLORDER; 3650 } 3651 3652 if (source->offsetRepeatValue != 0) { 3653 if (CECount > noChars) { 3654 source->offsetRepeatCount += temp.offsetRepeatCount; 3655 } else { 3656 // **** does this really skip the right offsets? 
**** 3657 source->offsetReturn -= (noChars - CECount); 3658 } 3659 } 3660 3661 if (offsetBias >= 0) { 3662 source->offsetReturn = source->offsetStore - 1; 3663 if (source->offsetReturn == source->offsetBuffer) { 3664 source->offsetStore = source->offsetBuffer; 3665 } 3666 } 3667 3668 source->toReturn = source->CEpos - 1; 3669 if (source->toReturn == source->CEs) { 3670 source->CEpos = source->CEs; 3671 } 3672 3673 return *(source->toReturn); 3674 } 3675 case LONG_PRIMARY_TAG: 3676 { 3677 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 3678 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 3679 source->toReturn = source->CEpos - 1; 3680 3681 if (source->flags & UCOL_ITER_INNORMBUF) { 3682 source->offsetRepeatCount = 1; 3683 } else { 3684 int32_t firstOffset = (int32_t)(source->pos - source->string); 3685 3686 source->appendOffset(firstOffset, *status); 3687 source->appendOffset(firstOffset + 1, *status); 3688 3689 source->offsetReturn = source->offsetStore - 1; 3690 *(source->offsetBuffer) = firstOffset; 3691 if (source->offsetReturn == source->offsetBuffer) { 3692 source->offsetStore = source->offsetBuffer; 3693 } 3694 } 3695 3696 3697 return *(source->toReturn); 3698 } 3699 3700 case EXPANSION_TAG: /* this tag always returns */ 3701 { 3702 /* 3703 This should handle expansion. 3704 NOTE: we can encounter both continuations and expansions in an expansion! 3705 I have to decide where continuations are going to be dealt with 3706 */ 3707 int32_t firstOffset = (int32_t)(source->pos - source->string); 3708 3709 // **** doesn't work if using iterator **** 3710 if (source->offsetReturn != NULL) { 3711 if (! 
(source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) { 3712 source->offsetStore = source->offsetBuffer; 3713 }else { 3714 firstOffset = -1; 3715 } 3716 } 3717 3718 /* find the offset to expansion table */ 3719 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3720 size = getExpansionCount(CE); 3721 if (size != 0) { 3722 /* 3723 if there are less than 16 elements in expansion, we don't terminate 3724 */ 3725 uint32_t count; 3726 3727 for (count = 0; count < size; count++) { 3728 *(source->CEpos ++) = *CEOffset++; 3729 3730 if (firstOffset >= 0) { 3731 source->appendOffset(firstOffset + 1, *status); 3732 } 3733 } 3734 } else { 3735 /* else, we do */ 3736 while (*CEOffset != 0) { 3737 *(source->CEpos ++) = *CEOffset ++; 3738 3739 if (firstOffset >= 0) { 3740 source->appendOffset(firstOffset + 1, *status); 3741 } 3742 } 3743 } 3744 3745 if (firstOffset >= 0) { 3746 source->offsetReturn = source->offsetStore - 1; 3747 *(source->offsetBuffer) = firstOffset; 3748 if (source->offsetReturn == source->offsetBuffer) { 3749 source->offsetStore = source->offsetBuffer; 3750 } 3751 } else { 3752 source->offsetRepeatCount += size - 1; 3753 } 3754 3755 source->toReturn = source->CEpos - 1; 3756 // in case of one element expansion, we 3757 // want to immediately return CEpos 3758 if(source->toReturn == source->CEs) { 3759 source->CEpos = source->CEs; 3760 } 3761 3762 return *(source->toReturn); 3763 } 3764 3765 case DIGIT_TAG: 3766 { 3767 /* 3768 We do a check to see if we want to collate digits as numbers; if so we generate 3769 a custom collation key. Otherwise we pull out the value stored in the expansion table. 
3770 */ 3771 uint32_t i; /* general counter */ 3772 3773 if (source->coll->numericCollation == UCOL_ON){ 3774 uint32_t digIndx = 0; 3775 uint32_t endIndex = 0; 3776 uint32_t leadingZeroIndex = 0; 3777 uint32_t trailingZeroCount = 0; 3778 3779 uint8_t collateVal = 0; 3780 3781 UBool nonZeroValReached = FALSE; 3782 3783 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs. 3784 /* 3785 We parse the source string until we hit a char that's NOT a digit. 3786 Use this u_charDigitValue. This might be slow because we have to 3787 handle surrogates... 3788 */ 3789 /* 3790 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less, 3791 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation 3792 element we process when going backward. To determine how long that chunk might be, we may need to make 3793 two passes through the loop that collects digits - one to see how long the string is (and how much is 3794 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has 3795 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation 3796 element chunk after resetting the state to the initialState at the right side of the digit string. 
3797 */ 3798 uint32_t ceLimit = 0; 3799 UChar initial_ch = ch; 3800 collIterateState initialState = {0,0,0,0,0,0,0,0,0}; 3801 backupState(source, &initialState); 3802 3803 for(;;) { 3804 collIterateState state = {0,0,0,0,0,0,0,0,0}; 3805 UChar32 char32 = 0; 3806 int32_t digVal = 0; 3807 3808 if (U16_IS_TRAIL (ch)) { 3809 if (!collIter_bos(source)){ 3810 UChar lead = getPrevNormalizedChar(source, status); 3811 if(U16_IS_LEAD(lead)) { 3812 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3813 goBackOne(source); 3814 } else { 3815 char32 = ch; 3816 } 3817 } else { 3818 char32 = ch; 3819 } 3820 } else { 3821 char32 = ch; 3822 } 3823 digVal = u_charDigitValue(char32); 3824 3825 for(;;) { 3826 // Make sure we have enough space. No longer needed; 3827 // at this point the largest value of digIndx when we need to save data in numTempBuf 3828 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure 3829 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2). 3830 3831 // Skip over trailing zeroes, and keep a count of them. 3832 if (digVal != 0) 3833 nonZeroValReached = TRUE; 3834 3835 if (nonZeroValReached) { 3836 /* 3837 We parse the digit string into base 100 numbers (this fits into a byte). 3838 We only add to the buffer in twos, thus if we are parsing an odd character, 3839 that serves as the 'tens' digit while the if we are parsing an even one, that 3840 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3841 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3842 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3843 than all the other bytes. 3844 3845 Since we're doing in this reverse we want to put the first digit encountered into the 3846 ones place and the second digit encountered into the tens place. 
3847 */ 3848 3849 if ((digIndx + trailingZeroCount) % 2 == 1) { 3850 // High-order digit case (tens place) 3851 collateVal += (uint8_t)(digVal * 10); 3852 3853 // We cannot set leadingZeroIndex unless it has been set for the 3854 // low-order digit. Therefore, all we can do for the high-order 3855 // digit is turn it off, never on. 3856 // The only time we will have a high digit without a low is for 3857 // the very first non-zero digit, so no zero check is necessary. 3858 if (collateVal != 0) 3859 leadingZeroIndex = 0; 3860 3861 // The first pass through, digIndx may exceed the limit, but in that case 3862 // we no longer care about numTempBuf contents since they will be discarded 3863 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) { 3864 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3865 } 3866 collateVal = 0; 3867 } else { 3868 // Low-order digit case (ones place) 3869 collateVal = (uint8_t)digVal; 3870 3871 // Check for leading zeroes. 3872 if (collateVal == 0) { 3873 if (!leadingZeroIndex) 3874 leadingZeroIndex = (digIndx/2) + 2; 3875 } else 3876 leadingZeroIndex = 0; 3877 3878 // No need to write to buffer; the case of a last odd digit 3879 // is handled below. 3880 } 3881 ++digIndx; 3882 } else 3883 ++trailingZeroCount; 3884 3885 if (!collIter_bos(source)) { 3886 ch = getPrevNormalizedChar(source, status); 3887 //goBackOne(source); 3888 if (U16_IS_TRAIL(ch)) { 3889 backupState(source, &state); 3890 if (!collIter_bos(source)) { 3891 goBackOne(source); 3892 UChar lead = getPrevNormalizedChar(source, status); 3893 3894 if(U16_IS_LEAD(lead)) { 3895 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3896 } else { 3897 loadState(source, &state, FALSE); 3898 char32 = ch; 3899 } 3900 } 3901 } else 3902 char32 = ch; 3903 3904 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { 3905 if (char32 > 0xFFFF) {// For surrogates. 
3906 loadState(source, &state, FALSE); 3907 } 3908 // Don't need to "reverse" the goBackOne call, 3909 // as this points to the next position to process.. 3910 //if (char32 > 0xFFFF) // For surrogates. 3911 //getNextNormalizedChar(source); 3912 break; 3913 } 3914 3915 goBackOne(source); 3916 }else 3917 break; 3918 } 3919 3920 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) { 3921 // our collation element is not too big, go ahead and finish with it 3922 break; 3923 } 3924 // our digit string is too long for a collation element; 3925 // set the limit for it, reset the state and begin again 3926 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER; 3927 if ( ceLimit == 0 ) { 3928 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; 3929 } 3930 ch = initial_ch; 3931 loadState(source, &initialState, FALSE); 3932 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0; 3933 collateVal = 0; 3934 nonZeroValReached = FALSE; 3935 } 3936 3937 if (! nonZeroValReached) { 3938 digIndx = 2; 3939 trailingZeroCount = 0; 3940 numTempBuf[2] = 6; 3941 } 3942 3943 if ((digIndx + trailingZeroCount) % 2 != 0) { 3944 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; 3945 digIndx += 1; // The implicit leading zero 3946 } 3947 if (trailingZeroCount % 2 != 0) { 3948 // We had to consume one trailing zero for the low digit 3949 // of the least significant byte 3950 digIndx += 1; // The trailing zero not in the exponent 3951 trailingZeroCount -= 1; 3952 } 3953 3954 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; 3955 3956 // Subtract one off of the last byte. Really the first byte here, but it's reversed... 3957 numTempBuf[2] -= 1; 3958 3959 /* 3960 We want to skip over the first two slots in the buffer. The first slot 3961 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3962 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 
3963 The exponent must be adjusted by the number of leading zeroes, and the number of 3964 trailing zeroes. 3965 */ 3966 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3967 uint32_t exponent = (digIndx+trailingZeroCount)/2; 3968 if (leadingZeroIndex) 3969 exponent -= ((digIndx/2) + 2 - leadingZeroIndex); 3970 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); 3971 3972 // Now transfer the collation key to our collIterate struct. 3973 // The total size for our collation key is half of endIndex, rounded up. 3974 int32_t size = (endIndex+1)/2; 3975 if(!ensureCEsCapacity(source, size)) { 3976 return (uint32_t)UCOL_NULLORDER; 3977 } 3978 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3979 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3980 UCOL_BYTE_COMMON; // Tertiary weight. 3981 i = endIndex - 1; // Reset the index into the buffer. 3982 while(i >= 2) { 3983 uint32_t primWeight = numTempBuf[i--] << 8; 3984 if ( i >= 2) 3985 primWeight |= numTempBuf[i--]; 3986 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3987 } 3988 3989 source->toReturn = source->CEpos -1; 3990 return *(source->toReturn); 3991 } else { 3992 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3993 CE = *(CEOffset++); 3994 break; 3995 } 3996 } 3997 3998 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3999 { 4000 static const uint32_t 4001 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 4002 //const uint32_t LCount = 19; 4003 static const uint32_t VCount = 21; 4004 static const uint32_t TCount = 28; 4005 //const uint32_t NCount = VCount * TCount; /* 588 */ 4006 //const uint32_t SCount = LCount * NCount; /* 11172 */ 4007 4008 uint32_t L = ch - SBase; 4009 /* 4010 divide into pieces. 
4011 we do it in this order since some compilers can do % and / in one 4012 operation 4013 */ 4014 uint32_t T = L % TCount; 4015 L /= TCount; 4016 uint32_t V = L % VCount; 4017 L /= VCount; 4018 4019 /* offset them */ 4020 L += LBase; 4021 V += VBase; 4022 T += TBase; 4023 4024 int32_t firstOffset = (int32_t)(source->pos - source->string); 4025 source->appendOffset(firstOffset, *status); 4026 4027 /* 4028 * return the first CE, but first put the rest into the expansion buffer 4029 */ 4030 if (!source->coll->image->jamoSpecial) { 4031 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 4032 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 4033 source->appendOffset(firstOffset + 1, *status); 4034 4035 if (T != TBase) { 4036 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 4037 source->appendOffset(firstOffset + 1, *status); 4038 } 4039 4040 source->toReturn = source->CEpos - 1; 4041 4042 source->offsetReturn = source->offsetStore - 1; 4043 if (source->offsetReturn == source->offsetBuffer) { 4044 source->offsetStore = source->offsetBuffer; 4045 } 4046 4047 return *(source->toReturn); 4048 } else { 4049 // Since Hanguls pass the FCD check, it is 4050 // guaranteed that we won't be in 4051 // the normalization buffer if something like this happens 4052 4053 // Move Jamos into normalization buffer 4054 UChar *tempbuffer = source->writableBuffer.getBuffer(5); 4055 int32_t tempbufferLength, jamoOffset; 4056 tempbuffer[0] = 0; 4057 tempbuffer[1] = (UChar)L; 4058 tempbuffer[2] = (UChar)V; 4059 if (T != TBase) { 4060 tempbuffer[3] = (UChar)T; 4061 tempbufferLength = 4; 4062 } else { 4063 tempbufferLength = 3; 4064 } 4065 source->writableBuffer.releaseBuffer(tempbufferLength); 4066 4067 // Indicate where to continue in main input string after exhausting the writableBuffer 4068 if (source->pos == source->string) { 4069 jamoOffset = 0; 4070 source->fcdPosition = NULL; 4071 } else { 4072 jamoOffset = source->pos - source->string; 4073 
source->fcdPosition = source->pos-1; 4074 } 4075 4076 // Append offsets for the additional chars 4077 // (not the 0, and not the L whose offsets match the original Hangul) 4078 int32_t jamoRemaining = tempbufferLength - 2; 4079 jamoOffset++; // appended offsets should match end of original Hangul 4080 while (jamoRemaining-- > 0) { 4081 source->appendOffset(jamoOffset, *status); 4082 } 4083 4084 source->offsetRepeatValue = jamoOffset; 4085 4086 source->offsetReturn = source->offsetStore - 1; 4087 if (source->offsetReturn == source->offsetBuffer) { 4088 source->offsetStore = source->offsetBuffer; 4089 } 4090 4091 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength; 4092 source->origFlags = source->flags; 4093 source->flags |= UCOL_ITER_INNORMBUF; 4094 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 4095 4096 return(UCOL_IGNORABLE); 4097 } 4098 } 4099 4100 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 4101 return getPrevImplicit(ch, source); 4102 4103 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function 4104 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 4105 return getPrevImplicit(ch, source); 4106 4107 case SURROGATE_TAG: /* This is a surrogate pair */ 4108 /* essentially an engaged lead surrogate. 
*/ 4109 /* if you have encountered it here, it means that a */ 4110 /* broken sequence was encountered and this is an error */ 4111 return UCOL_NOT_FOUND; 4112 4113 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 4114 return UCOL_NOT_FOUND; /* broken surrogate sequence */ 4115 4116 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 4117 { 4118 UChar32 cp = 0; 4119 UChar prevChar; 4120 const UChar *prev; 4121 if (isAtStartPrevIterate(source)) { 4122 /* we are at the start of the string, wrong place to be at */ 4123 return UCOL_NOT_FOUND; 4124 } 4125 if (source->pos != source->writableBuffer.getBuffer()) { 4126 prev = source->pos - 1; 4127 } else { 4128 prev = source->fcdPosition; 4129 } 4130 prevChar = *prev; 4131 4132 /* Handles Han and Supplementary characters here.*/ 4133 if (U16_IS_LEAD(prevChar)) { 4134 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 4135 source->pos = prev; 4136 } else { 4137 return UCOL_NOT_FOUND; /* like unassigned */ 4138 } 4139 4140 return getPrevImplicit(cp, source); 4141 } 4142 4143 /* UCA is filled with these. Tailorings are NOT_FOUND */ 4144 /* not yet implemented */ 4145 case CHARSET_TAG: /* this tag always returns */ 4146 /* probably after 1.8 */ 4147 return UCOL_NOT_FOUND; 4148 4149 default: /* this tag always returns */ 4150 *status = U_INTERNAL_PROGRAM_ERROR; 4151 CE=0; 4152 break; 4153 } 4154 4155 if (CE <= UCOL_NOT_FOUND) { 4156 break; 4157 } 4158 } 4159 4160 return CE; 4161 } 4162 4163 /* This should really be a macro */ 4164 /* This function is used to reverse parts of a buffer. 
We need this operation when doing continuation */ 4165 /* secondaries in French */ 4166 /* 4167 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { 4168 uint8_t temp; 4169 while(start<end) { 4170 temp = *start; 4171 *start++ = *end; 4172 *end-- = temp; 4173 } 4174 } 4175 */ 4176 4177 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \ 4178 TYPE tempA; \ 4179 while((start)<(end)) { \ 4180 tempA = *(start); \ 4181 *(start)++ = *(end); \ 4182 *(end)-- = tempA; \ 4183 } \ 4184 } 4185 4186 /****************************************************************************/ 4187 /* Following are the sortkey generation functions */ 4188 /* */ 4189 /****************************************************************************/ 4190 4191 U_CAPI int32_t U_EXPORT2 4192 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 4193 const uint8_t *src2, int32_t src2Length, 4194 uint8_t *dest, int32_t destCapacity) { 4195 /* check arguments */ 4196 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 4197 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 4198 destCapacity<0 || (destCapacity>0 && dest==NULL) 4199 ) { 4200 /* error, attempt to write a zero byte and return 0 */ 4201 if(dest!=NULL && destCapacity>0) { 4202 *dest=0; 4203 } 4204 return 0; 4205 } 4206 4207 /* check lengths and capacity */ 4208 if(src1Length<0) { 4209 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 4210 } 4211 if(src2Length<0) { 4212 src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 4213 } 4214 4215 int32_t destLength=src1Length+src2Length; 4216 if(destLength>destCapacity) { 4217 /* the merged sort key does not fit into the destination */ 4218 return destLength; 4219 } 4220 4221 /* merge the sort keys with the same number of levels */ 4222 uint8_t *p=dest; 4223 for(;;) { 4224 /* copy level from src1 not including 00 or 01 */ 4225 uint8_t b; 4226 while((b=*src1)>=2) { 4227 ++src1; 4228 *p++=b; 4229 } 
4230 4231 /* add a 02 merge separator */ 4232 *p++=2; 4233 4234 /* copy level from src2 not including 00 or 01 */ 4235 while((b=*src2)>=2) { 4236 ++src2; 4237 *p++=b; 4238 } 4239 4240 /* if both sort keys have another level, then add a 01 level separator and continue */ 4241 if(*src1==1 && *src2==1) { 4242 ++src1; 4243 ++src2; 4244 *p++=1; 4245 } else { 4246 break; 4247 } 4248 } 4249 4250 /* 4251 * here, at least one sort key is finished now, but the other one 4252 * might have some contents left from containing more levels; 4253 * that contents is just appended to the result 4254 */ 4255 if(*src1!=0) { 4256 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 4257 src2=src1; 4258 } 4259 /* append src2, "the other, unfinished sort key" */ 4260 while((*p++=*src2++)!=0) {} 4261 4262 /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */ 4263 return (int32_t)(p-dest); 4264 } 4265 4266 U_NAMESPACE_BEGIN 4267 4268 class SortKeyByteSink : public ByteSink { 4269 public: 4270 SortKeyByteSink(char *dest, int32_t destCapacity) 4271 : buffer_(dest), capacity_(destCapacity), 4272 appended_(0) { 4273 if (buffer_ == NULL) { 4274 capacity_ = 0; 4275 } else if(capacity_ < 0) { 4276 buffer_ = NULL; 4277 capacity_ = 0; 4278 } 4279 } 4280 virtual ~SortKeyByteSink(); 4281 4282 virtual void Append(const char *bytes, int32_t n); 4283 void Append(uint32_t b) { 4284 if (appended_ < capacity_ || Resize(1, appended_)) { 4285 buffer_[appended_] = (char)b; 4286 } 4287 ++appended_; 4288 } 4289 void Append(uint32_t b1, uint32_t b2) { 4290 int32_t a2 = appended_ + 2; 4291 if (a2 <= capacity_ || Resize(2, appended_)) { 4292 buffer_[appended_] = (char)b1; 4293 buffer_[appended_ + 1] = (char)b2; 4294 } else if(appended_ < capacity_) { 4295 buffer_[appended_] = (char)b1; 4296 } 4297 appended_ = a2; 4298 } 4299 virtual char *GetAppendBuffer(int32_t min_capacity, 4300 int32_t desired_capacity_hint, 4301 char *scratch, int32_t 
scratch_capacity, 4302 int32_t *result_capacity); 4303 int32_t NumberOfBytesAppended() const { return appended_; } 4304 /** @return FALSE if memory allocation failed */ 4305 UBool IsOk() const { return buffer_ != NULL; } 4306 4307 protected: 4308 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0; 4309 virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0; 4310 4311 void SetNotOk() { 4312 buffer_ = NULL; 4313 capacity_ = 0; 4314 } 4315 4316 char *buffer_; 4317 int32_t capacity_; 4318 int32_t appended_; 4319 4320 private: 4321 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented 4322 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented 4323 }; 4324 4325 SortKeyByteSink::~SortKeyByteSink() {} 4326 4327 void 4328 SortKeyByteSink::Append(const char *bytes, int32_t n) { 4329 if (n <= 0 || bytes == NULL) { 4330 return; 4331 } 4332 int32_t length = appended_; 4333 appended_ += n; 4334 if ((buffer_ + length) == bytes) { 4335 return; // the caller used GetAppendBuffer() and wrote the bytes already 4336 } 4337 int32_t available = capacity_ - length; 4338 if (n <= available) { 4339 uprv_memcpy(buffer_ + length, bytes, n); 4340 } else { 4341 AppendBeyondCapacity(bytes, n, length); 4342 } 4343 } 4344 4345 char * 4346 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity, 4347 int32_t desired_capacity_hint, 4348 char *scratch, 4349 int32_t scratch_capacity, 4350 int32_t *result_capacity) { 4351 if (min_capacity < 1 || scratch_capacity < min_capacity) { 4352 *result_capacity = 0; 4353 return NULL; 4354 } 4355 int32_t available = capacity_ - appended_; 4356 if (available >= min_capacity) { 4357 *result_capacity = available; 4358 return buffer_ + appended_; 4359 } else if (Resize(desired_capacity_hint, appended_)) { 4360 *result_capacity = capacity_ - appended_; 4361 return buffer_ + appended_; 4362 } else { 4363 *result_capacity = scratch_capacity; 4364 return scratch; 4365 
} 4366 } 4367 4368 class FixedSortKeyByteSink : public SortKeyByteSink { 4369 public: 4370 FixedSortKeyByteSink(char *dest, int32_t destCapacity) 4371 : SortKeyByteSink(dest, destCapacity) {} 4372 virtual ~FixedSortKeyByteSink(); 4373 4374 private: 4375 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); 4376 virtual UBool Resize(int32_t appendCapacity, int32_t length); 4377 }; 4378 4379 FixedSortKeyByteSink::~FixedSortKeyByteSink() {} 4380 4381 void 4382 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) { 4383 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ 4384 // Fill the buffer completely. 4385 int32_t available = capacity_ - length; 4386 if (available > 0) { 4387 uprv_memcpy(buffer_ + length, bytes, available); 4388 } 4389 } 4390 4391 UBool 4392 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) { 4393 return FALSE; 4394 } 4395 4396 class CollationKeyByteSink : public SortKeyByteSink { 4397 public: 4398 CollationKeyByteSink(CollationKey &key) 4399 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()), 4400 key_(key) {} 4401 virtual ~CollationKeyByteSink(); 4402 4403 private: 4404 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); 4405 virtual UBool Resize(int32_t appendCapacity, int32_t length); 4406 4407 CollationKey &key_; 4408 }; 4409 4410 CollationKeyByteSink::~CollationKeyByteSink() {} 4411 4412 void 4413 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) { 4414 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ 4415 if (Resize(n, length)) { 4416 uprv_memcpy(buffer_ + length, bytes, n); 4417 } 4418 } 4419 4420 UBool 4421 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) { 4422 if (buffer_ == NULL) { 4423 return FALSE; // allocation failed before already 4424 } 4425 int32_t newCapacity = 2 * capacity_; 4426 
int32_t altCapacity = length + 2 * appendCapacity; 4427 if (newCapacity < altCapacity) { 4428 newCapacity = altCapacity; 4429 } 4430 if (newCapacity < 200) { 4431 newCapacity = 200; 4432 } 4433 uint8_t *newBuffer = key_.reallocate(newCapacity, length); 4434 if (newBuffer == NULL) { 4435 SetNotOk(); 4436 return FALSE; 4437 } 4438 buffer_ = reinterpret_cast<char *>(newBuffer); 4439 capacity_ = newCapacity; 4440 return TRUE; 4441 } 4442 4443 /** 4444 * uint8_t byte buffer, similar to CharString but simpler. 4445 */ 4446 class SortKeyLevel : public UMemory { 4447 public: 4448 SortKeyLevel() : len(0), ok(TRUE) {} 4449 ~SortKeyLevel() {} 4450 4451 /** @return FALSE if memory allocation failed */ 4452 UBool isOk() const { return ok; } 4453 UBool isEmpty() const { return len == 0; } 4454 int32_t length() const { return len; } 4455 const uint8_t *data() const { return buffer.getAlias(); } 4456 uint8_t operator[](int32_t index) const { return buffer[index]; } 4457 4458 void appendByte(uint32_t b); 4459 4460 void appendTo(ByteSink &sink) const { 4461 sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len); 4462 } 4463 4464 uint8_t &lastByte() { 4465 U_ASSERT(len > 0); 4466 return buffer[len - 1]; 4467 } 4468 4469 uint8_t *getLastFewBytes(int32_t n) { 4470 if (ok && len >= n) { 4471 return buffer.getAlias() + len - n; 4472 } else { 4473 return NULL; 4474 } 4475 } 4476 4477 private: 4478 MaybeStackArray<uint8_t, 40> buffer; 4479 int32_t len; 4480 UBool ok; 4481 4482 UBool ensureCapacity(int32_t appendCapacity); 4483 4484 SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class 4485 SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class 4486 }; 4487 4488 void SortKeyLevel::appendByte(uint32_t b) { 4489 if(len < buffer.getCapacity() || ensureCapacity(1)) { 4490 buffer[len++] = (uint8_t)b; 4491 } 4492 } 4493 4494 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) { 4495 if(!ok) { 4496 return FALSE; 4497 } 4498 
int32_t newCapacity = 2 * buffer.getCapacity(); 4499 int32_t altCapacity = len + 2 * appendCapacity; 4500 if (newCapacity < altCapacity) { 4501 newCapacity = altCapacity; 4502 } 4503 if (newCapacity < 200) { 4504 newCapacity = 200; 4505 } 4506 if(buffer.resize(newCapacity, len)==NULL) { 4507 return ok = FALSE; 4508 } 4509 return TRUE; 4510 } 4511 4512 U_NAMESPACE_END 4513 4514 /* sortkey API */ 4515 U_CAPI int32_t U_EXPORT2 4516 ucol_getSortKey(const UCollator *coll, 4517 const UChar *source, 4518 int32_t sourceLength, 4519 uint8_t *result, 4520 int32_t resultLength) 4521 { 4522 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 4523 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 4524 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 4525 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); 4526 } 4527 4528 if(coll->delegate != NULL) { 4529 return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength); 4530 } 4531 4532 UErrorCode status = U_ZERO_ERROR; 4533 int32_t keySize = 0; 4534 4535 if(source != NULL) { 4536 // source == NULL is actually an error situation, but we would need to 4537 // have an error code to return it. Until we introduce a new 4538 // API, it stays like this 4539 4540 /* this uses the function pointer that is set in updateinternalstate */ 4541 /* currently, there are two funcs: */ 4542 /*ucol_calcSortKey(...);*/ 4543 /*ucol_calcSortKeySimpleTertiary(...);*/ 4544 4545 uint8_t noDest[1] = { 0 }; 4546 if(result == NULL) { 4547 // Distinguish pure preflighting from an allocation error. 
4548 result = noDest; 4549 resultLength = 0; 4550 } 4551 FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength); 4552 coll->sortKeyGen(coll, source, sourceLength, sink, &status); 4553 if(U_SUCCESS(status)) { 4554 keySize = sink.NumberOfBytesAppended(); 4555 } 4556 } 4557 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 4558 UTRACE_EXIT_STATUS(status); 4559 return keySize; 4560 } 4561 4562 U_CFUNC int32_t 4563 ucol_getCollationKey(const UCollator *coll, 4564 const UChar *source, int32_t sourceLength, 4565 CollationKey &key, 4566 UErrorCode &errorCode) { 4567 CollationKeyByteSink sink(key); 4568 coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode); 4569 return sink.NumberOfBytesAppended(); 4570 } 4571 4572 // Is this primary weight compressible? 4573 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). 4574 // TODO: This should use per-lead-byte flags from FractionalUCA.txt. 4575 static inline UBool 4576 isCompressible(const UCollator * /*coll*/, uint8_t primary1) { 4577 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary; 4578 } 4579 4580 static 4581 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) { 4582 if (caseShift == 0) { 4583 cases.appendByte(UCOL_CASE_BYTE_START); 4584 caseShift = UCOL_CASE_SHIFT_START; 4585 } 4586 } 4587 4588 // Packs the secondary buffer when processing French locale. 4589 static void 4590 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) { 4591 secondaries += secsize; // We read the secondary-level bytes back to front. 4592 uint8_t secondary; 4593 int32_t count2 = 0; 4594 int32_t i = 0; 4595 // we use i here since the key size already accounts for terminators, so we'll discard the increment 4596 for(i = 0; i<secsize; i++) { 4597 secondary = *(secondaries-i-1); 4598 /* This is compression code. 
*/ 4599 if (secondary == UCOL_COMMON2) { 4600 ++count2; 4601 } else { 4602 if (count2 > 0) { 4603 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4604 while (count2 > UCOL_TOP_COUNT2) { 4605 result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 4606 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4607 } 4608 result.Append(UCOL_COMMON_TOP2 - (count2-1)); 4609 } else { 4610 while (count2 > UCOL_BOT_COUNT2) { 4611 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 4612 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4613 } 4614 result.Append(UCOL_COMMON_BOT2 + (count2-1)); 4615 } 4616 count2 = 0; 4617 } 4618 result.Append(secondary); 4619 } 4620 } 4621 if (count2 > 0) { 4622 while (count2 > UCOL_BOT_COUNT2) { 4623 result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 4624 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4625 } 4626 result.Append(UCOL_COMMON_BOT2 + (count2-1)); 4627 } 4628 } 4629 4630 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 4631 4632 /* This is the sortkey work horse function */ 4633 U_CFUNC void U_CALLCONV 4634 ucol_calcSortKey(const UCollator *coll, 4635 const UChar *source, 4636 int32_t sourceLength, 4637 SortKeyByteSink &result, 4638 UErrorCode *status) 4639 { 4640 if(U_FAILURE(*status)) { 4641 return; 4642 } 4643 4644 SortKeyByteSink &primaries = result; 4645 SortKeyLevel secondaries; 4646 SortKeyLevel tertiaries; 4647 SortKeyLevel cases; 4648 SortKeyLevel quads; 4649 4650 UnicodeString normSource; 4651 4652 int32_t len = (sourceLength == -1 ? 
u_strlen(source) : sourceLength); 4653 4654 UColAttributeValue strength = coll->strength; 4655 4656 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4657 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4658 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4659 UBool compareIdent = (strength == UCOL_IDENTICAL); 4660 UBool doCase = (coll->caseLevel == UCOL_ON); 4661 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4662 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4663 //UBool qShifted = shifted && (compareQuad == 0); 4664 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4665 4666 uint32_t variableTopValue = coll->variableTopValue; 4667 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no 4668 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. 4669 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4670 uint8_t UCOL_HIRAGANA_QUAD = 0; 4671 if(doHiragana) { 4672 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; 4673 /* allocate one more space for hiragana, value for hiragana */ 4674 } 4675 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4676 4677 /* support for special features like caselevel and funky secondaries */ 4678 int32_t lastSecondaryLength = 0; 4679 uint32_t caseShift = 0; 4680 4681 /* If we need to normalize, we'll do it all at once at the beginning! 
*/ 4682 const Normalizer2 *norm2; 4683 if(compareIdent) { 4684 norm2 = Normalizer2Factory::getNFDInstance(*status); 4685 } else if(coll->normalizationMode != UCOL_OFF) { 4686 norm2 = Normalizer2Factory::getFCDInstance(*status); 4687 } else { 4688 norm2 = NULL; 4689 } 4690 if(norm2 != NULL) { 4691 normSource.setTo(FALSE, source, len); 4692 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 4693 if(qcYesLength != len) { 4694 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 4695 normSource.truncate(qcYesLength); 4696 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 4697 source = normSource.getBuffer(); 4698 len = normSource.length(); 4699 } 4700 } 4701 collIterate s; 4702 IInit_collIterate(coll, source, len, &s, status); 4703 if(U_FAILURE(*status)) { 4704 return; 4705 } 4706 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 4707 4708 uint32_t order = 0; 4709 4710 uint8_t primary1 = 0; 4711 uint8_t primary2 = 0; 4712 uint8_t secondary = 0; 4713 uint8_t tertiary = 0; 4714 uint8_t caseSwitch = coll->caseSwitch; 4715 uint8_t tertiaryMask = coll->tertiaryMask; 4716 int8_t tertiaryAddition = coll->tertiaryAddition; 4717 uint8_t tertiaryTop = coll->tertiaryTop; 4718 uint8_t tertiaryBottom = coll->tertiaryBottom; 4719 uint8_t tertiaryCommon = coll->tertiaryCommon; 4720 uint8_t caseBits = 0; 4721 4722 UBool wasShifted = FALSE; 4723 UBool notIsContinuation = FALSE; 4724 4725 uint32_t count2 = 0, count3 = 0, count4 = 0; 4726 uint8_t leadPrimary = 0; 4727 4728 for(;;) { 4729 order = ucol_IGetNextCE(coll, &s, status); 4730 if(order == UCOL_NO_MORE_CES) { 4731 break; 4732 } 4733 4734 if(order == 0) { 4735 continue; 4736 } 4737 4738 notIsContinuation = !isContinuation(order); 4739 4740 if(notIsContinuation) { 4741 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); 4742 } else { 4743 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4744 } 4745 4746 secondary = (uint8_t)((order >>= 8) & 
UCOL_BYTE_SIZE_MASK); 4747 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4748 primary1 = (uint8_t)(order >> 8); 4749 4750 uint8_t originalPrimary1 = primary1; 4751 if(notIsContinuation && coll->leadBytePermutationTable != NULL) { 4752 primary1 = coll->leadBytePermutationTable[primary1]; 4753 } 4754 4755 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4756 || (!notIsContinuation && wasShifted))) 4757 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 4758 { 4759 /* and other ignorables should be removed if following a shifted code point */ 4760 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4761 /* we should just completely ignore it */ 4762 continue; 4763 } 4764 if(compareQuad == 0) { 4765 if(count4 > 0) { 4766 while (count4 > UCOL_BOT_COUNT4) { 4767 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4768 count4 -= UCOL_BOT_COUNT4; 4769 } 4770 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); 4771 count4 = 0; 4772 } 4773 /* We are dealing with a variable and we're treating them as shifted */ 4774 /* This is a shifted ignorable */ 4775 if(primary1 != 0) { /* we need to check this since we could be in continuation */ 4776 quads.appendByte(primary1); 4777 } 4778 if(primary2 != 0) { 4779 quads.appendByte(primary2); 4780 } 4781 } 4782 wasShifted = TRUE; 4783 } else { 4784 wasShifted = FALSE; 4785 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 4786 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 4787 /* regular and simple sortkey calc */ 4788 if(primary1 != UCOL_IGNORABLE) { 4789 if(notIsContinuation) { 4790 if(leadPrimary == primary1) { 4791 primaries.Append(primary2); 4792 } else { 4793 if(leadPrimary != 0) { 4794 primaries.Append((primary1 > leadPrimary) ? 
UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 4795 } 4796 if(primary2 == UCOL_IGNORABLE) { 4797 /* one byter, not compressed */ 4798 primaries.Append(primary1); 4799 leadPrimary = 0; 4800 } else if(isCompressible(coll, originalPrimary1)) { 4801 /* compress */ 4802 primaries.Append(leadPrimary = primary1, primary2); 4803 } else { 4804 leadPrimary = 0; 4805 primaries.Append(primary1, primary2); 4806 } 4807 } 4808 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 4809 if(primary2 == UCOL_IGNORABLE) { 4810 primaries.Append(primary1); 4811 } else { 4812 primaries.Append(primary1, primary2); 4813 } 4814 } 4815 } 4816 4817 if(secondary > compareSec) { 4818 if(!isFrenchSec) { 4819 /* This is compression code. */ 4820 if (secondary == UCOL_COMMON2 && notIsContinuation) { 4821 ++count2; 4822 } else { 4823 if (count2 > 0) { 4824 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4825 while (count2 > UCOL_TOP_COUNT2) { 4826 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 4827 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4828 } 4829 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1)); 4830 } else { 4831 while (count2 > UCOL_BOT_COUNT2) { 4832 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 4833 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4834 } 4835 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 4836 } 4837 count2 = 0; 4838 } 4839 secondaries.appendByte(secondary); 4840 } 4841 } else { 4842 /* Do the special handling for French secondaries */ 4843 /* We need to get continuation elements and do intermediate restore */ 4844 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ 4845 if(notIsContinuation) { 4846 if (lastSecondaryLength > 1) { 4847 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength); 4848 if (frenchStartPtr != NULL) { 4849 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4850 uint8_t *frenchEndPtr = 
frenchStartPtr + lastSecondaryLength - 1; 4851 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4852 } 4853 } 4854 lastSecondaryLength = 1; 4855 } else { 4856 ++lastSecondaryLength; 4857 } 4858 secondaries.appendByte(secondary); 4859 } 4860 } 4861 4862 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 4863 // do the case level if we need to do it. We don't want to calculate 4864 // case level for primary ignorables if we have only primary strength and case level 4865 // otherwise we would break well formedness of CEs 4866 doCaseShift(cases, caseShift); 4867 if(notIsContinuation) { 4868 caseBits = (uint8_t)(tertiary & 0xC0); 4869 4870 if(tertiary != 0) { 4871 if(coll->caseFirst == UCOL_UPPER_FIRST) { 4872 if((caseBits & 0xC0) == 0) { 4873 cases.lastByte() |= 1 << (--caseShift); 4874 } else { 4875 cases.lastByte() |= 0 << (--caseShift); 4876 /* second bit */ 4877 doCaseShift(cases, caseShift); 4878 cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift); 4879 } 4880 } else { 4881 if((caseBits & 0xC0) == 0) { 4882 cases.lastByte() |= 0 << (--caseShift); 4883 } else { 4884 cases.lastByte() |= 1 << (--caseShift); 4885 /* second bit */ 4886 doCaseShift(cases, caseShift); 4887 cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift); 4888 } 4889 } 4890 } 4891 } 4892 } else { 4893 if(notIsContinuation) { 4894 tertiary ^= caseSwitch; 4895 } 4896 } 4897 4898 tertiary &= tertiaryMask; 4899 if(tertiary > compareTer) { 4900 /* This is compression code. 
*/ 4901 /* sequence size check is included in the if clause */ 4902 if (tertiary == tertiaryCommon && notIsContinuation) { 4903 ++count3; 4904 } else { 4905 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 4906 tertiary += tertiaryAddition; 4907 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 4908 tertiary -= tertiaryAddition; 4909 } 4910 if (count3 > 0) { 4911 if ((tertiary > tertiaryCommon)) { 4912 while (count3 > coll->tertiaryTopCount) { 4913 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 4914 count3 -= (uint32_t)coll->tertiaryTopCount; 4915 } 4916 tertiaries.appendByte(tertiaryTop - (count3-1)); 4917 } else { 4918 while (count3 > coll->tertiaryBottomCount) { 4919 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 4920 count3 -= (uint32_t)coll->tertiaryBottomCount; 4921 } 4922 tertiaries.appendByte(tertiaryBottom + (count3-1)); 4923 } 4924 count3 = 0; 4925 } 4926 tertiaries.appendByte(tertiary); 4927 } 4928 } 4929 4930 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 4931 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 4932 if(count4>0) { // Close this part 4933 while (count4 > UCOL_BOT_COUNT4) { 4934 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4935 count4 -= UCOL_BOT_COUNT4; 4936 } 4937 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); 4938 count4 = 0; 4939 } 4940 quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana 4941 } else { // This wasn't Hiragana, so we can continue adding stuff 4942 count4++; 4943 } 4944 } 4945 } 4946 } 4947 4948 /* Here, we are generally done with processing */ 4949 /* bailing out would not be too productive */ 4950 4951 UBool ok = TRUE; 4952 if(U_SUCCESS(*status)) { 4953 /* we have done all the CE's, now let's put them together to form a key */ 4954 if(compareSec == 0) { 4955 if (count2 > 0) { 4956 while (count2 > UCOL_BOT_COUNT2) { 4957 secondaries.appendByte(UCOL_COMMON_BOT2 + 
UCOL_BOT_COUNT2); 4958 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4959 } 4960 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 4961 } 4962 result.Append(UCOL_LEVELTERMINATOR); 4963 if(!secondaries.isOk()) { 4964 ok = FALSE; 4965 } else if(!isFrenchSec) { 4966 secondaries.appendTo(result); 4967 } else { 4968 // If there are any unresolved continuation secondaries, 4969 // reverse them here so that we can reverse the whole secondary thing. 4970 if (lastSecondaryLength > 1) { 4971 uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength); 4972 if (frenchStartPtr != NULL) { 4973 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4974 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1; 4975 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4976 } 4977 } 4978 packFrench(secondaries.data(), secondaries.length(), result); 4979 } 4980 } 4981 4982 if(doCase) { 4983 ok &= cases.isOk(); 4984 result.Append(UCOL_LEVELTERMINATOR); 4985 cases.appendTo(result); 4986 } 4987 4988 if(compareTer == 0) { 4989 if (count3 > 0) { 4990 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { 4991 while (count3 >= coll->tertiaryTopCount) { 4992 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 4993 count3 -= (uint32_t)coll->tertiaryTopCount; 4994 } 4995 tertiaries.appendByte(tertiaryTop - count3); 4996 } else { 4997 while (count3 > coll->tertiaryBottomCount) { 4998 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 4999 count3 -= (uint32_t)coll->tertiaryBottomCount; 5000 } 5001 tertiaries.appendByte(tertiaryBottom + (count3-1)); 5002 } 5003 } 5004 ok &= tertiaries.isOk(); 5005 result.Append(UCOL_LEVELTERMINATOR); 5006 tertiaries.appendTo(result); 5007 5008 if(compareQuad == 0/*qShifted == TRUE*/) { 5009 if(count4 > 0) { 5010 while (count4 > UCOL_BOT_COUNT4) { 5011 quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 5012 count4 -= UCOL_BOT_COUNT4; 5013 } 5014 quads.appendByte(UCOL_COMMON_BOT4 + (count4-1)); 5015 } 
5016 ok &= quads.isOk(); 5017 result.Append(UCOL_LEVELTERMINATOR); 5018 quads.appendTo(result); 5019 } 5020 5021 if(compareIdent) { 5022 result.Append(UCOL_LEVELTERMINATOR); 5023 u_writeIdenticalLevelRun(s.string, len, result); 5024 } 5025 } 5026 result.Append(0); 5027 } 5028 5029 /* To avoid memory leak, free the offset buffer if necessary. */ 5030 ucol_freeOffsetBuffer(&s); 5031 5032 ok &= result.IsOk(); 5033 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } 5034 } 5035 5036 5037 U_CFUNC void U_CALLCONV 5038 ucol_calcSortKeySimpleTertiary(const UCollator *coll, 5039 const UChar *source, 5040 int32_t sourceLength, 5041 SortKeyByteSink &result, 5042 UErrorCode *status) 5043 { 5044 U_ALIGN_CODE(16); 5045 5046 if(U_FAILURE(*status)) { 5047 return; 5048 } 5049 5050 SortKeyByteSink &primaries = result; 5051 SortKeyLevel secondaries; 5052 SortKeyLevel tertiaries; 5053 5054 UnicodeString normSource; 5055 5056 int32_t len = sourceLength; 5057 5058 /* If we need to normalize, we'll do it all at once at the beginning! */ 5059 if(coll->normalizationMode != UCOL_OFF) { 5060 normSource.setTo(len < 0, source, len); 5061 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); 5062 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 5063 if(qcYesLength != normSource.length()) { 5064 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 5065 normSource.truncate(qcYesLength); 5066 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 5067 source = normSource.getBuffer(); 5068 len = normSource.length(); 5069 } 5070 } 5071 collIterate s; 5072 IInit_collIterate(coll, (UChar *)source, len, &s, status); 5073 if(U_FAILURE(*status)) { 5074 return; 5075 } 5076 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 
5077 5078 uint32_t order = 0; 5079 5080 uint8_t primary1 = 0; 5081 uint8_t primary2 = 0; 5082 uint8_t secondary = 0; 5083 uint8_t tertiary = 0; 5084 uint8_t caseSwitch = coll->caseSwitch; 5085 uint8_t tertiaryMask = coll->tertiaryMask; 5086 int8_t tertiaryAddition = coll->tertiaryAddition; 5087 uint8_t tertiaryTop = coll->tertiaryTop; 5088 uint8_t tertiaryBottom = coll->tertiaryBottom; 5089 uint8_t tertiaryCommon = coll->tertiaryCommon; 5090 5091 UBool notIsContinuation = FALSE; 5092 5093 uint32_t count2 = 0, count3 = 0; 5094 uint8_t leadPrimary = 0; 5095 5096 for(;;) { 5097 order = ucol_IGetNextCE(coll, &s, status); 5098 5099 if(order == 0) { 5100 continue; 5101 } 5102 5103 if(order == UCOL_NO_MORE_CES) { 5104 break; 5105 } 5106 5107 notIsContinuation = !isContinuation(order); 5108 5109 if(notIsContinuation) { 5110 tertiary = (uint8_t)((order & tertiaryMask)); 5111 } else { 5112 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 5113 } 5114 5115 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5116 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5117 primary1 = (uint8_t)(order >> 8); 5118 5119 uint8_t originalPrimary1 = primary1; 5120 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { 5121 primary1 = coll->leadBytePermutationTable[primary1]; 5122 } 5123 5124 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 5125 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 5126 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ 5127 /* regular and simple sortkey calc */ 5128 if(primary1 != UCOL_IGNORABLE) { 5129 if(notIsContinuation) { 5130 if(leadPrimary == primary1) { 5131 primaries.Append(primary2); 5132 } else { 5133 if(leadPrimary != 0) { 5134 primaries.Append((primary1 > leadPrimary) ? 
UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 5135 } 5136 if(primary2 == UCOL_IGNORABLE) { 5137 /* one byter, not compressed */ 5138 primaries.Append(primary1); 5139 leadPrimary = 0; 5140 } else if(isCompressible(coll, originalPrimary1)) { 5141 /* compress */ 5142 primaries.Append(leadPrimary = primary1, primary2); 5143 } else { 5144 leadPrimary = 0; 5145 primaries.Append(primary1, primary2); 5146 } 5147 } 5148 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 5149 if(primary2 == UCOL_IGNORABLE) { 5150 primaries.Append(primary1); 5151 } else { 5152 primaries.Append(primary1, primary2); 5153 } 5154 } 5155 } 5156 5157 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ 5158 /* This is compression code. */ 5159 if (secondary == UCOL_COMMON2 && notIsContinuation) { 5160 ++count2; 5161 } else { 5162 if (count2 > 0) { 5163 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 5164 while (count2 > UCOL_TOP_COUNT2) { 5165 secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 5166 count2 -= (uint32_t)UCOL_TOP_COUNT2; 5167 } 5168 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1)); 5169 } else { 5170 while (count2 > UCOL_BOT_COUNT2) { 5171 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5172 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5173 } 5174 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 5175 } 5176 count2 = 0; 5177 } 5178 secondaries.appendByte(secondary); 5179 } 5180 } 5181 5182 if(notIsContinuation) { 5183 tertiary ^= caseSwitch; 5184 } 5185 5186 if(tertiary > 0) { 5187 /* This is compression code. 
*/ 5188 /* sequence size check is included in the if clause */ 5189 if (tertiary == tertiaryCommon && notIsContinuation) { 5190 ++count3; 5191 } else { 5192 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 5193 tertiary += tertiaryAddition; 5194 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 5195 tertiary -= tertiaryAddition; 5196 } 5197 if (count3 > 0) { 5198 if ((tertiary > tertiaryCommon)) { 5199 while (count3 > coll->tertiaryTopCount) { 5200 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 5201 count3 -= (uint32_t)coll->tertiaryTopCount; 5202 } 5203 tertiaries.appendByte(tertiaryTop - (count3-1)); 5204 } else { 5205 while (count3 > coll->tertiaryBottomCount) { 5206 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 5207 count3 -= (uint32_t)coll->tertiaryBottomCount; 5208 } 5209 tertiaries.appendByte(tertiaryBottom + (count3-1)); 5210 } 5211 count3 = 0; 5212 } 5213 tertiaries.appendByte(tertiary); 5214 } 5215 } 5216 } 5217 5218 UBool ok = TRUE; 5219 if(U_SUCCESS(*status)) { 5220 /* we have done all the CE's, now let's put them together to form a key */ 5221 if (count2 > 0) { 5222 while (count2 > UCOL_BOT_COUNT2) { 5223 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5224 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5225 } 5226 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1)); 5227 } 5228 ok &= secondaries.isOk(); 5229 result.Append(UCOL_LEVELTERMINATOR); 5230 secondaries.appendTo(result); 5231 5232 if (count3 > 0) { 5233 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { 5234 while (count3 >= coll->tertiaryTopCount) { 5235 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount); 5236 count3 -= (uint32_t)coll->tertiaryTopCount; 5237 } 5238 tertiaries.appendByte(tertiaryTop - count3); 5239 } else { 5240 while (count3 > coll->tertiaryBottomCount) { 5241 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount); 5242 count3 -= 
(uint32_t)coll->tertiaryBottomCount; 5243 } 5244 tertiaries.appendByte(tertiaryBottom + (count3-1)); 5245 } 5246 } 5247 ok &= tertiaries.isOk(); 5248 result.Append(UCOL_LEVELTERMINATOR); 5249 tertiaries.appendTo(result); 5250 5251 result.Append(0); 5252 } 5253 5254 /* To avoid memory leak, free the offset buffer if necessary. */ 5255 ucol_freeOffsetBuffer(&s); 5256 5257 ok &= result.IsOk(); 5258 if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; } 5259 } 5260 5261 static inline 5262 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { 5263 UBool notIsContinuation = !isContinuation(CE); 5264 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); 5265 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) 5266 || (!notIsContinuation && *wasShifted))) 5267 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 5268 { 5269 // The stuff below should probably be in the sortkey code... maybe not... 5270 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ 5271 /* we should just completely ignore it */ 5272 *wasShifted = TRUE; 5273 //continue; 5274 } 5275 //*wasShifted = TRUE; 5276 return TRUE; 5277 } else { 5278 *wasShifted = FALSE; 5279 return FALSE; 5280 } 5281 } 5282 static inline 5283 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { 5284 if(level < maxLevel) { 5285 dest[i++] = UCOL_LEVELTERMINATOR; 5286 } else { 5287 dest[i++] = 0; 5288 } 5289 } 5290 5291 /** enumeration of level identifiers for partial sort key generation */ 5292 enum { 5293 UCOL_PSK_PRIMARY = 0, 5294 UCOL_PSK_SECONDARY = 1, 5295 UCOL_PSK_CASE = 2, 5296 UCOL_PSK_TERTIARY = 3, 5297 UCOL_PSK_QUATERNARY = 4, 5298 UCOL_PSK_QUIN = 5, /** This is an extra level, not used - but we have three bits to blow */ 5299 UCOL_PSK_IDENTICAL = 6, 5300 UCOL_PSK_NULL = 7, /** level for the end of sort key. 
Will just produce zeros */ 5301 UCOL_PSK_LIMIT 5302 }; 5303 5304 /** collation state enum. *_SHIFT value is how much to shift right 5305 * to get the state piece to the right. *_MASK value should be 5306 * ANDed with the shifted state. This data is stored in state[1] 5307 * field. 5308 */ 5309 enum { 5310 UCOL_PSK_LEVEL_SHIFT = 0, /** level identificator. stores an enum value from above */ 5311 UCOL_PSK_LEVEL_MASK = 7, /** three bits */ 5312 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */ 5313 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1, 5314 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary 5315 * This field is also used to denote that the French secondary level is finished 5316 */ 5317 UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */ 5318 UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */ 5319 UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */ 5320 UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */ 5321 /** When we do French we need to reverse secondary values. However, continuations 5322 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba 5323 */ 5324 UCOL_PSK_BOCSU_BYTES_SHIFT = 7, 5325 UCOL_PSK_BOCSU_BYTES_MASK = 3, 5326 UCOL_PSK_CONSUMED_CES_SHIFT = 9, 5327 UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF 5328 }; 5329 5330 // macro calculating the number of expansion CEs available 5331 #define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn 5332 5333 5334 /** main sortkey part procedure. On the first call, 5335 * you should pass in a collator, an iterator, empty state 5336 * state[0] == state[1] == 0, a buffer to hold results 5337 * number of bytes you need and an error code pointer. 5338 * Make sure your buffer is big enough to hold the wanted 5339 * number of sortkey bytes. I don't check. 
 * The only meaningful status you can get back is
 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
 * have been dealt a raw deal and that you probably won't
 * be able to use partial sortkey generation for this
 * particular combination of string and collator. This
 * is highly unlikely, but you should still check the error code.
 * Any other status means that you're not in a sane situation
 * anymore. After the first call, preserve state values and
 * use them on subsequent calls to obtain more bytes of a sortkey.
 * Use until the number of bytes written is smaller than the requested
 * number of bytes. Generated sortkey is not compatible with the
 * one generated by ucol_getSortKey, as we don't do any compression.
 * However, levels are still terminated by a 1 (one) and the sortkey
 * is terminated by a 0 (zero). Identical level is the same as in the
 * regular sortkey - internal bocu-1 implementation is used.
 * For the curious, although you cannot do much about this, here is
 * the structure of state words.
 * state[0] - iterator state. Depends on the iterator implementation,
 *            but allows the iterator to continue where it stopped in
 *            the last iteration.
 * state[1] - collation processing state. Here is the distribution
 *            of the bits:
 *      0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
 *                quaternary, quin (we don't use this one), identical and
 *                null (producing only zeroes - first one to terminate the
 *                sortkey and subsequent to fill the buffer).
 *      3       - byte count. Number of bytes written on the primary level.
 *      4       - was shifted. Whether the previous iteration finished in the
 *                shifted state.
 *      5, 6    - French continuation bytes written. See the comment in the enum
 *      7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
 *                the identical level.
 *      9..31   - CEs consumed.
Number of getCE or next32 operations performed 5373 * since thes last successful update of the iterator state. 5374 */ 5375 U_CAPI int32_t U_EXPORT2 5376 ucol_nextSortKeyPart(const UCollator *coll, 5377 UCharIterator *iter, 5378 uint32_t state[2], 5379 uint8_t *dest, int32_t count, 5380 UErrorCode *status) 5381 { 5382 /* error checking */ 5383 if(status==NULL || U_FAILURE(*status)) { 5384 return 0; 5385 } 5386 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 5387 if( coll==NULL || iter==NULL || 5388 state==NULL || 5389 count<0 || (count>0 && dest==NULL) 5390 ) { 5391 *status=U_ILLEGAL_ARGUMENT_ERROR; 5392 UTRACE_EXIT_STATUS(status); 5393 return 0; 5394 } 5395 5396 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 5397 coll, iter, state[0], state[1], dest, count); 5398 5399 if(count==0) { 5400 /* nothing to do */ 5401 UTRACE_EXIT_VALUE(0); 5402 return 0; 5403 } 5404 /** Setting up situation according to the state we got from the previous iteration */ 5405 // The state of the iterator from the previous invocation 5406 uint32_t iterState = state[0]; 5407 // Has the last iteration ended in the shifted state 5408 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE; 5409 // What is the current level of the sortkey? 5410 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; 5411 // Have we written only one byte from a two byte primary in the previous iteration? 5412 // Also on secondary level - have we finished with the French secondary? 5413 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; 5414 // number of bytes in the continuation buffer for French 5415 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; 5416 // Number of bytes already written from a bocsu sequence. Since 5417 // the longes bocsu sequence is 4 long, this can be up to 3. 
5418 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK; 5419 // Number of elements that need to be consumed in this iteration because 5420 // the iterator returned UITER_NO_STATE at the end of the last iteration, 5421 // so we had to save the last valid state. 5422 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK; 5423 5424 /** values that depend on the collator attributes */ 5425 // strength of the collator. 5426 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); 5427 // maximal level of the partial sortkey. Need to take whether case level is done 5428 int32_t maxLevel = 0; 5429 if(strength < UCOL_TERTIARY) { 5430 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 5431 maxLevel = UCOL_PSK_CASE; 5432 } else { 5433 maxLevel = strength; 5434 } 5435 } else { 5436 if(strength == UCOL_TERTIARY) { 5437 maxLevel = UCOL_PSK_TERTIARY; 5438 } else if(strength == UCOL_QUATERNARY) { 5439 maxLevel = UCOL_PSK_QUATERNARY; 5440 } else { // identical 5441 maxLevel = UCOL_IDENTICAL; 5442 } 5443 } 5444 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation 5445 uint8_t UCOL_HIRAGANA_QUAD = 5446 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF; 5447 // Boundary value that decides whether a CE is shifted or not 5448 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0; 5449 // Are we doing French collation? 
5450 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON); 5451 5452 /** initializing the collation state */ 5453 UBool notIsContinuation = FALSE; 5454 uint32_t CE = UCOL_NO_MORE_CES; 5455 5456 collIterate s; 5457 IInit_collIterate(coll, NULL, -1, &s, status); 5458 if(U_FAILURE(*status)) { 5459 UTRACE_EXIT_STATUS(*status); 5460 return 0; 5461 } 5462 s.iterator = iter; 5463 s.flags |= UCOL_USE_ITERATOR; 5464 // This variable tells us whether we have produced some other levels in this iteration 5465 // before we moved to the identical level. In that case, we need to switch the 5466 // type of the iterator. 5467 UBool doingIdenticalFromStart = FALSE; 5468 // Normalizing iterator 5469 // The division for the array length may truncate the array size to 5470 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 5471 // for all platforms anyway. 5472 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 5473 UNormIterator *normIter = NULL; 5474 // If the normalization is turned on for the collator and we are below identical level 5475 // we will use a FCD normalizing iterator 5476 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { 5477 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5478 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); 5479 s.flags &= ~UCOL_ITER_NORM; 5480 if(U_FAILURE(*status)) { 5481 UTRACE_EXIT_STATUS(*status); 5482 return 0; 5483 } 5484 } else if(level == UCOL_PSK_IDENTICAL) { 5485 // for identical level, we need a NFD iterator. We need to instantiate it here, since we 5486 // will be updating the state - and this cannot be done on an ordinary iterator. 
5487 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5488 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5489 s.flags &= ~UCOL_ITER_NORM; 5490 if(U_FAILURE(*status)) { 5491 UTRACE_EXIT_STATUS(*status); 5492 return 0; 5493 } 5494 doingIdenticalFromStart = TRUE; 5495 } 5496 5497 // This is the tentative new state of the iterator. The problem 5498 // is that the iterator might return an undefined state, in 5499 // which case we should save the last valid state and increase 5500 // the iterator skip value. 5501 uint32_t newState = 0; 5502 5503 // First, we set the iterator to the last valid position 5504 // from the last iteration. This was saved in state[0]. 5505 if(iterState == 0) { 5506 /* initial state */ 5507 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { 5508 s.iterator->move(s.iterator, 0, UITER_LIMIT); 5509 } else { 5510 s.iterator->move(s.iterator, 0, UITER_START); 5511 } 5512 } else { 5513 /* reset to previous state */ 5514 s.iterator->setState(s.iterator, iterState, status); 5515 if(U_FAILURE(*status)) { 5516 UTRACE_EXIT_STATUS(*status); 5517 return 0; 5518 } 5519 } 5520 5521 5522 5523 // This variable tells us whether we can attempt to update the state 5524 // of iterator. Situations where we don't want to update iterator state 5525 // are the existence of expansion CEs that are not yet processed, and 5526 // finishing the case level without enough space in the buffer to insert 5527 // a level terminator. 5528 UBool canUpdateState = TRUE; 5529 5530 // Consume all the CEs that were consumed at the end of the previous 5531 // iteration without updating the iterator state. On identical level, 5532 // consume the code points. 5533 int32_t counter = cces; 5534 if(level < UCOL_PSK_IDENTICAL) { 5535 while(counter-->0) { 5536 // If we're doing French and we are on the secondary level, 5537 // we go backwards. 
5538 if(level == UCOL_PSK_SECONDARY && doingFrench) { 5539 CE = ucol_IGetPrevCE(coll, &s, status); 5540 } else { 5541 CE = ucol_IGetNextCE(coll, &s, status); 5542 } 5543 if(CE==UCOL_NO_MORE_CES) { 5544 /* should not happen */ 5545 *status=U_INTERNAL_PROGRAM_ERROR; 5546 UTRACE_EXIT_STATUS(*status); 5547 return 0; 5548 } 5549 if(uprv_numAvailableExpCEs(s)) { 5550 canUpdateState = FALSE; 5551 } 5552 } 5553 } else { 5554 while(counter-->0) { 5555 uiter_next32(s.iterator); 5556 } 5557 } 5558 5559 // French secondary needs to know whether the iterator state of zero came from previous level OR 5560 // from a new invocation... 5561 UBool wasDoingPrimary = FALSE; 5562 // destination buffer byte counter. When this guy 5563 // gets to count, we're done with the iteration 5564 int32_t i = 0; 5565 // used to count the zero bytes written after we 5566 // have finished with the sort key 5567 int32_t j = 0; 5568 5569 5570 // Hm.... I think we're ready to plunge in. Basic story is as following: 5571 // we have a fall through case based on level. This is used for initial 5572 // positioning on iteration start. Every level processor contains a 5573 // for(;;) which will be broken when we exhaust all the CEs. Other 5574 // way to exit is a goto saveState, which happens when we have filled 5575 // out our buffer. 
5576 switch(level) { 5577 case UCOL_PSK_PRIMARY: 5578 wasDoingPrimary = TRUE; 5579 for(;;) { 5580 if(i==count) { 5581 goto saveState; 5582 } 5583 // We should save the state only if we 5584 // are sure that we are done with the 5585 // previous iterator state 5586 if(canUpdateState && byteCountOrFrenchDone == 0) { 5587 newState = s.iterator->getState(s.iterator); 5588 if(newState != UITER_NO_STATE) { 5589 iterState = newState; 5590 cces = 0; 5591 } 5592 } 5593 CE = ucol_IGetNextCE(coll, &s, status); 5594 cces++; 5595 if(CE==UCOL_NO_MORE_CES) { 5596 // Add the level separator 5597 terminatePSKLevel(level, maxLevel, i, dest); 5598 byteCountOrFrenchDone=0; 5599 // Restart the iteration an move to the 5600 // second level 5601 s.iterator->move(s.iterator, 0, UITER_START); 5602 cces = 0; 5603 level = UCOL_PSK_SECONDARY; 5604 break; 5605 } 5606 if(!isContinuation(CE)){ 5607 if(coll->leadBytePermutationTable != NULL){ 5608 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF); 5609 } 5610 } 5611 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5612 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ 5613 if(CE != 0) { 5614 if(byteCountOrFrenchDone == 0) { 5615 // get the second byte of primary 5616 dest[i++]=(uint8_t)(CE >> 8); 5617 } else { 5618 byteCountOrFrenchDone = 0; 5619 } 5620 if((CE &=0xff)!=0) { 5621 if(i==count) { 5622 /* overflow */ 5623 byteCountOrFrenchDone = 1; 5624 cces--; 5625 goto saveState; 5626 } 5627 dest[i++]=(uint8_t)CE; 5628 } 5629 } 5630 } 5631 if(uprv_numAvailableExpCEs(s)) { 5632 canUpdateState = FALSE; 5633 } else { 5634 canUpdateState = TRUE; 5635 } 5636 } 5637 /* fall through to next level */ 5638 case UCOL_PSK_SECONDARY: 5639 if(strength >= UCOL_SECONDARY) { 5640 if(!doingFrench) { 5641 for(;;) { 5642 if(i == count) { 5643 goto saveState; 5644 } 5645 // We should save the state only if we 5646 // are sure that we are done with the 5647 // previous iterator state 5648 if(canUpdateState) { 5649 newState = 
s.iterator->getState(s.iterator); 5650 if(newState != UITER_NO_STATE) { 5651 iterState = newState; 5652 cces = 0; 5653 } 5654 } 5655 CE = ucol_IGetNextCE(coll, &s, status); 5656 cces++; 5657 if(CE==UCOL_NO_MORE_CES) { 5658 // Add the level separator 5659 terminatePSKLevel(level, maxLevel, i, dest); 5660 byteCountOrFrenchDone = 0; 5661 // Restart the iteration an move to the 5662 // second level 5663 s.iterator->move(s.iterator, 0, UITER_START); 5664 cces = 0; 5665 level = UCOL_PSK_CASE; 5666 break; 5667 } 5668 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5669 CE >>= 8; /* get secondary */ 5670 if(CE != 0) { 5671 dest[i++]=(uint8_t)CE; 5672 } 5673 } 5674 if(uprv_numAvailableExpCEs(s)) { 5675 canUpdateState = FALSE; 5676 } else { 5677 canUpdateState = TRUE; 5678 } 5679 } 5680 } else { // French secondary processing 5681 uint8_t frenchBuff[UCOL_MAX_BUFFER]; 5682 int32_t frenchIndex = 0; 5683 // Here we are going backwards. 5684 // If the iterator is at the beggining, it should be 5685 // moved to end. 5686 if(wasDoingPrimary) { 5687 s.iterator->move(s.iterator, 0, UITER_LIMIT); 5688 cces = 0; 5689 } 5690 for(;;) { 5691 if(i == count) { 5692 goto saveState; 5693 } 5694 if(canUpdateState) { 5695 newState = s.iterator->getState(s.iterator); 5696 if(newState != UITER_NO_STATE) { 5697 iterState = newState; 5698 cces = 0; 5699 } 5700 } 5701 CE = ucol_IGetPrevCE(coll, &s, status); 5702 cces++; 5703 if(CE==UCOL_NO_MORE_CES) { 5704 // Add the level separator 5705 terminatePSKLevel(level, maxLevel, i, dest); 5706 byteCountOrFrenchDone = 0; 5707 // Restart the iteration an move to the next level 5708 s.iterator->move(s.iterator, 0, UITER_START); 5709 level = UCOL_PSK_CASE; 5710 break; 5711 } 5712 if(isContinuation(CE)) { // if it's a continuation, we want to save it and 5713 // reverse when we get a first non-continuation CE. 
5714 CE >>= 8; 5715 frenchBuff[frenchIndex++] = (uint8_t)CE; 5716 } else if(!isShiftedCE(CE, LVT, &wasShifted)) { 5717 CE >>= 8; /* get secondary */ 5718 if(!frenchIndex) { 5719 if(CE != 0) { 5720 dest[i++]=(uint8_t)CE; 5721 } 5722 } else { 5723 frenchBuff[frenchIndex++] = (uint8_t)CE; 5724 frenchIndex -= usedFrench; 5725 usedFrench = 0; 5726 while(i < count && frenchIndex) { 5727 dest[i++] = frenchBuff[--frenchIndex]; 5728 usedFrench++; 5729 } 5730 } 5731 } 5732 if(uprv_numAvailableExpCEs(s)) { 5733 canUpdateState = FALSE; 5734 } else { 5735 canUpdateState = TRUE; 5736 } 5737 } 5738 } 5739 } else { 5740 level = UCOL_PSK_CASE; 5741 } 5742 /* fall through to next level */ 5743 case UCOL_PSK_CASE: 5744 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 5745 uint32_t caseShift = UCOL_CASE_SHIFT_START; 5746 uint8_t caseByte = UCOL_CASE_BYTE_START; 5747 uint8_t caseBits = 0; 5748 5749 for(;;) { 5750 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); 5751 if(i == count) { 5752 goto saveState; 5753 } 5754 // We should save the state only if we 5755 // are sure that we are done with the 5756 // previous iterator state 5757 if(canUpdateState) { 5758 newState = s.iterator->getState(s.iterator); 5759 if(newState != UITER_NO_STATE) { 5760 iterState = newState; 5761 cces = 0; 5762 } 5763 } 5764 CE = ucol_IGetNextCE(coll, &s, status); 5765 cces++; 5766 if(CE==UCOL_NO_MORE_CES) { 5767 // On the case level we might have an unfinished 5768 // case byte. Add one if it's started. 5769 if(caseShift != UCOL_CASE_SHIFT_START) { 5770 dest[i++] = caseByte; 5771 } 5772 cces = 0; 5773 // We have finished processing CEs on this level. 5774 // However, we don't know if we have enough space 5775 // to add a case level terminator. 
5776 if(i < count) { 5777 // Add the level separator 5778 terminatePSKLevel(level, maxLevel, i, dest); 5779 // Restart the iteration and move to the 5780 // next level 5781 s.iterator->move(s.iterator, 0, UITER_START); 5782 level = UCOL_PSK_TERTIARY; 5783 } else { 5784 canUpdateState = FALSE; 5785 } 5786 break; 5787 } 5788 5789 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5790 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) { 5791 // do the case level if we need to do it. We don't want to calculate 5792 // case level for primary ignorables if we have only primary strength and case level 5793 // otherwise we would break well formedness of CEs 5794 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 5795 caseBits = (uint8_t)(CE & 0xC0); 5796 // this copies the case level logic from the 5797 // sort key generation code 5798 if(CE != 0) { 5799 if (caseShift == 0) { 5800 dest[i++] = caseByte; 5801 caseShift = UCOL_CASE_SHIFT_START; 5802 caseByte = UCOL_CASE_BYTE_START; 5803 } 5804 if(coll->caseFirst == UCOL_UPPER_FIRST) { 5805 if((caseBits & 0xC0) == 0) { 5806 caseByte |= 1 << (--caseShift); 5807 } else { 5808 caseByte |= 0 << (--caseShift); 5809 /* second bit */ 5810 if(caseShift == 0) { 5811 dest[i++] = caseByte; 5812 caseShift = UCOL_CASE_SHIFT_START; 5813 caseByte = UCOL_CASE_BYTE_START; 5814 } 5815 caseByte |= ((caseBits>>6)&1) << (--caseShift); 5816 } 5817 } else { 5818 if((caseBits & 0xC0) == 0) { 5819 caseByte |= 0 << (--caseShift); 5820 } else { 5821 caseByte |= 1 << (--caseShift); 5822 /* second bit */ 5823 if(caseShift == 0) { 5824 dest[i++] = caseByte; 5825 caseShift = UCOL_CASE_SHIFT_START; 5826 caseByte = UCOL_CASE_BYTE_START; 5827 } 5828 caseByte |= ((caseBits>>7)&1) << (--caseShift); 5829 } 5830 } 5831 } 5832 5833 } 5834 } 5835 // Not sure this is correct for the case level - revisit 5836 if(uprv_numAvailableExpCEs(s)) { 5837 canUpdateState = FALSE; 5838 } else { 5839 canUpdateState = TRUE; 5840 } 5841 } 5842 } else { 5843 level = 
UCOL_PSK_TERTIARY; 5844 } 5845 /* fall through to next level */ 5846 case UCOL_PSK_TERTIARY: 5847 if(strength >= UCOL_TERTIARY) { 5848 for(;;) { 5849 if(i == count) { 5850 goto saveState; 5851 } 5852 // We should save the state only if we 5853 // are sure that we are done with the 5854 // previous iterator state 5855 if(canUpdateState) { 5856 newState = s.iterator->getState(s.iterator); 5857 if(newState != UITER_NO_STATE) { 5858 iterState = newState; 5859 cces = 0; 5860 } 5861 } 5862 CE = ucol_IGetNextCE(coll, &s, status); 5863 cces++; 5864 if(CE==UCOL_NO_MORE_CES) { 5865 // Add the level separator 5866 terminatePSKLevel(level, maxLevel, i, dest); 5867 byteCountOrFrenchDone = 0; 5868 // Restart the iteration an move to the 5869 // second level 5870 s.iterator->move(s.iterator, 0, UITER_START); 5871 cces = 0; 5872 level = UCOL_PSK_QUATERNARY; 5873 break; 5874 } 5875 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5876 notIsContinuation = !isContinuation(CE); 5877 5878 if(notIsContinuation) { 5879 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 5880 CE ^= coll->caseSwitch; 5881 CE &= coll->tertiaryMask; 5882 } else { 5883 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 5884 } 5885 5886 if(CE != 0) { 5887 dest[i++]=(uint8_t)CE; 5888 } 5889 } 5890 if(uprv_numAvailableExpCEs(s)) { 5891 canUpdateState = FALSE; 5892 } else { 5893 canUpdateState = TRUE; 5894 } 5895 } 5896 } else { 5897 // if we're not doing tertiary 5898 // skip to the end 5899 level = UCOL_PSK_NULL; 5900 } 5901 /* fall through to next level */ 5902 case UCOL_PSK_QUATERNARY: 5903 if(strength >= UCOL_QUATERNARY) { 5904 for(;;) { 5905 if(i == count) { 5906 goto saveState; 5907 } 5908 // We should save the state only if we 5909 // are sure that we are done with the 5910 // previous iterator state 5911 if(canUpdateState) { 5912 newState = s.iterator->getState(s.iterator); 5913 if(newState != UITER_NO_STATE) { 5914 iterState = newState; 5915 cces = 0; 5916 } 5917 } 5918 CE = ucol_IGetNextCE(coll, &s, status); 5919 cces++; 
5920 if(CE==UCOL_NO_MORE_CES) { 5921 // Add the level separator 5922 terminatePSKLevel(level, maxLevel, i, dest); 5923 //dest[i++] = UCOL_LEVELTERMINATOR; 5924 byteCountOrFrenchDone = 0; 5925 // Restart the iteration an move to the 5926 // second level 5927 s.iterator->move(s.iterator, 0, UITER_START); 5928 cces = 0; 5929 level = UCOL_PSK_QUIN; 5930 break; 5931 } 5932 if(CE==0) 5933 continue; 5934 if(isShiftedCE(CE, LVT, &wasShifted)) { 5935 CE >>= 16; /* get primary */ 5936 if(CE != 0) { 5937 if(byteCountOrFrenchDone == 0) { 5938 dest[i++]=(uint8_t)(CE >> 8); 5939 } else { 5940 byteCountOrFrenchDone = 0; 5941 } 5942 if((CE &=0xff)!=0) { 5943 if(i==count) { 5944 /* overflow */ 5945 byteCountOrFrenchDone = 1; 5946 goto saveState; 5947 } 5948 dest[i++]=(uint8_t)CE; 5949 } 5950 } 5951 } else { 5952 notIsContinuation = !isContinuation(CE); 5953 if(notIsContinuation) { 5954 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 5955 dest[i++] = UCOL_HIRAGANA_QUAD; 5956 } else { 5957 dest[i++] = 0xFF; 5958 } 5959 } 5960 } 5961 if(uprv_numAvailableExpCEs(s)) { 5962 canUpdateState = FALSE; 5963 } else { 5964 canUpdateState = TRUE; 5965 } 5966 } 5967 } else { 5968 // if we're not doing quaternary 5969 // skip to the end 5970 level = UCOL_PSK_NULL; 5971 } 5972 /* fall through to next level */ 5973 case UCOL_PSK_QUIN: 5974 level = UCOL_PSK_IDENTICAL; 5975 /* fall through to next level */ 5976 case UCOL_PSK_IDENTICAL: 5977 if(strength >= UCOL_IDENTICAL) { 5978 UChar32 first, second; 5979 int32_t bocsuBytesWritten = 0; 5980 // We always need to do identical on 5981 // the NFD form of the string. 5982 if(normIter == NULL) { 5983 // we arrived from the level below and 5984 // normalization was not turned on. 
5985 // therefore, we need to make a fresh NFD iterator 5986 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5987 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5988 } else if(!doingIdenticalFromStart) { 5989 // there is an iterator, but we did some other levels. 5990 // therefore, we have a FCD iterator - need to make 5991 // a NFD one. 5992 // normIter being at the beginning does not guarantee 5993 // that the underlying iterator is at the beginning 5994 iter->move(iter, 0, UITER_START); 5995 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5996 } 5997 // At this point we have a NFD iterator that is positioned 5998 // in the right place 5999 if(U_FAILURE(*status)) { 6000 UTRACE_EXIT_STATUS(*status); 6001 return 0; 6002 } 6003 first = uiter_previous32(s.iterator); 6004 // maybe we're at the start of the string 6005 if(first == U_SENTINEL) { 6006 first = 0; 6007 } else { 6008 uiter_next32(s.iterator); 6009 } 6010 6011 j = 0; 6012 for(;;) { 6013 if(i == count) { 6014 if(j+1 < bocsuBytesWritten) { 6015 bocsuBytesUsed = j+1; 6016 } 6017 goto saveState; 6018 } 6019 6020 // On identical level, we will always save 6021 // the state if we reach this point, since 6022 // we don't depend on getNextCE for content 6023 // all the content is in our buffer and we 6024 // already either stored the full buffer OR 6025 // otherwise we won't arrive here. 
6026 newState = s.iterator->getState(s.iterator); 6027 if(newState != UITER_NO_STATE) { 6028 iterState = newState; 6029 cces = 0; 6030 } 6031 6032 uint8_t buff[4]; 6033 second = uiter_next32(s.iterator); 6034 cces++; 6035 6036 // end condition for identical level 6037 if(second == U_SENTINEL) { 6038 terminatePSKLevel(level, maxLevel, i, dest); 6039 level = UCOL_PSK_NULL; 6040 break; 6041 } 6042 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); 6043 first = second; 6044 6045 j = 0; 6046 if(bocsuBytesUsed != 0) { 6047 while(bocsuBytesUsed-->0) { 6048 j++; 6049 } 6050 } 6051 6052 while(i < count && j < bocsuBytesWritten) { 6053 dest[i++] = buff[j++]; 6054 } 6055 } 6056 6057 } else { 6058 level = UCOL_PSK_NULL; 6059 } 6060 /* fall through to next level */ 6061 case UCOL_PSK_NULL: 6062 j = i; 6063 while(j<count) { 6064 dest[j++]=0; 6065 } 6066 break; 6067 default: 6068 *status = U_INTERNAL_PROGRAM_ERROR; 6069 UTRACE_EXIT_STATUS(*status); 6070 return 0; 6071 } 6072 6073 saveState: 6074 // Now we need to return stuff. First we want to see whether we have 6075 // done everything for the current state of iterator. 6076 if(byteCountOrFrenchDone 6077 || canUpdateState == FALSE 6078 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) 6079 { 6080 // Any of above mean that the previous transaction 6081 // wasn't finished and that we should store the 6082 // previous iterator state. 6083 state[0] = iterState; 6084 } else { 6085 // The transaction is complete. We will continue in the next iteration. 6086 state[0] = s.iterator->getState(s.iterator); 6087 cces = 0; 6088 } 6089 // Store the number of bocsu bytes written. 
6090 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { 6091 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6092 } 6093 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT; 6094 6095 // Next we put in the level of comparison 6096 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); 6097 6098 // If we are doing French, we need to store whether we have just finished the French level 6099 if(level == UCOL_PSK_SECONDARY && doingFrench) { 6100 state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6101 } else { 6102 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6103 } 6104 6105 // Was the latest CE shifted 6106 if(wasShifted) { 6107 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; 6108 } 6109 // Check for cces overflow 6110 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { 6111 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6112 } 6113 // Store cces 6114 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT); 6115 6116 // Check for French overflow 6117 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { 6118 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6119 } 6120 // Store number of bytes written in the French secondary continuation sequence 6121 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); 6122 6123 6124 // If we have used normalizing iterator, get rid of it 6125 if(normIter != NULL) { 6126 unorm_closeIter(normIter); 6127 } 6128 6129 /* To avoid memory leak, free the offset buffer if necessary. */ 6130 ucol_freeOffsetBuffer(&s); 6131 6132 // Return number of meaningful sortkey bytes. 6133 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 6134 dest,i, state[0], state[1]); 6135 UTRACE_EXIT_VALUE(i); 6136 return i; 6137 } 6138 6139 /** 6140 * Produce a bound for a given sortkey and a number of levels. 
6141 */ 6142 U_CAPI int32_t U_EXPORT2 6143 ucol_getBound(const uint8_t *source, 6144 int32_t sourceLength, 6145 UColBoundMode boundType, 6146 uint32_t noOfLevels, 6147 uint8_t *result, 6148 int32_t resultLength, 6149 UErrorCode *status) 6150 { 6151 // consistency checks 6152 if(status == NULL || U_FAILURE(*status)) { 6153 return 0; 6154 } 6155 if(source == NULL) { 6156 *status = U_ILLEGAL_ARGUMENT_ERROR; 6157 return 0; 6158 } 6159 6160 int32_t sourceIndex = 0; 6161 // Scan the string until we skip enough of the key OR reach the end of the key 6162 do { 6163 sourceIndex++; 6164 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { 6165 noOfLevels--; 6166 } 6167 } while (noOfLevels > 0 6168 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 6169 6170 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 6171 && noOfLevels > 0) { 6172 *status = U_SORT_KEY_TOO_SHORT_WARNING; 6173 } 6174 6175 6176 // READ ME: this code assumes that the values for boundType 6177 // enum will not changes. They are set so that the enum value 6178 // corresponds to the number of extra bytes each bound type 6179 // needs. 6180 if(result != NULL && resultLength >= sourceIndex+boundType) { 6181 uprv_memcpy(result, source, sourceIndex); 6182 switch(boundType) { 6183 // Lower bound just gets terminated. 
No extra bytes 6184 case UCOL_BOUND_LOWER: // = 0 6185 break; 6186 // Upper bound needs one extra byte 6187 case UCOL_BOUND_UPPER: // = 1 6188 result[sourceIndex++] = 2; 6189 break; 6190 // Upper long bound needs two extra bytes 6191 case UCOL_BOUND_UPPER_LONG: // = 2 6192 result[sourceIndex++] = 0xFF; 6193 result[sourceIndex++] = 0xFF; 6194 break; 6195 default: 6196 *status = U_ILLEGAL_ARGUMENT_ERROR; 6197 return 0; 6198 } 6199 result[sourceIndex++] = 0; 6200 6201 return sourceIndex; 6202 } else { 6203 return sourceIndex+boundType+1; 6204 } 6205 } 6206 6207 /****************************************************************************/ 6208 /* Following are the functions that deal with the properties of a collator */ 6209 /* there are new APIs and some compatibility APIs */ 6210 /****************************************************************************/ 6211 6212 static inline void 6213 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, 6214 int32_t *primShift, int32_t *secShift, int32_t *terShift) 6215 { 6216 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; 6217 UBool reverseSecondary = FALSE; 6218 UBool continuation = isContinuation(CE); 6219 if(!continuation) { 6220 tertiary = (uint8_t)((CE & coll->tertiaryMask)); 6221 tertiary ^= coll->caseSwitch; 6222 reverseSecondary = TRUE; 6223 } else { 6224 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6225 tertiary &= UCOL_REMOVE_CASE; 6226 reverseSecondary = FALSE; 6227 } 6228 6229 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6230 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6231 primary1 = (uint8_t)(CE >> 8); 6232 6233 if(primary1 != 0) { 6234 if (coll->leadBytePermutationTable != NULL && !continuation) { 6235 primary1 = coll->leadBytePermutationTable[primary1]; 6236 } 6237 6238 coll->latinOneCEs[ch] |= (primary1 << *primShift); 6239 *primShift -= 8; 6240 } 6241 if(primary2 != 0) { 6242 if(*primShift < 0) { 6243 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6244 
coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
            coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
            return;
        }
        coll->latinOneCEs[ch] |= (primary2 << *primShift);
        *primShift -= 8;
    }
    if(secondary != 0) {
        if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
            // French ordering: the newest secondary goes into the top byte and
            // the ones stored earlier slide towards the low end.
            coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
        } else { // normal case
            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
        }
        *secShift -= 8;
    }
    if(tertiary != 0) {
        coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
        *terShift -= 8;
    }
}

/*
 * Reallocates the Latin-1 fast-path table so that each of its three parallel
 * sections (primary, secondary, tertiary) holds `size` entries.
 * The first min(size, old length) entries of every section are carried over,
 * the remainder is zero-filled, and the old table is freed.
 *
 * On allocation failure *status becomes U_MEMORY_ALLOCATION_ERROR,
 * coll->latinOneFailed is set, the old table is left in place and FALSE is
 * returned. Returns TRUE otherwise.
 */
static inline UBool
ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
    uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
    if(newTable == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        coll->latinOneFailed = TRUE;
        return FALSE;
    }
    // Bytes to copy per section: the smaller of the old and the new length.
    int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
    uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
    uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
    uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
    uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
    coll->latinOneTableLen = size;
    uprv_free(coll->latinOneCEs);
    coll->latinOneCEs = newTable;
    return TRUE;
}

/*
 * Builds (or rebuilds) coll->latinOneCEs, the table used by the Latin-1 fast
 * path, for the code points 0..UCOL_ENDOFLATINONERANGE.
 *
 * Plain CEs are packed directly. Expansions and digits are packed by running
 * a collation element iterator over the single character. Contractions store
 * a modified contraction CE for the character and pack the CEs of the
 * contraction's entries into extra slots that start right after
 * UCOL_ENDOFLATINONERANGE; the table is doubled when those slots run out and
 * trimmed to the used size at the end.
 *
 * Returns TRUE on success. Returns FALSE when the table cannot be built; on
 * the paths that go through cleanup_after_failure coll->latinOneFailed is set
 * as well.
 */
static UBool
ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
    UBool result = TRUE;
    if(coll->latinOneCEs == NULL) {
        coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
        if(coll->latinOneCEs == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return FALSE;
        }
        coll->latinOneTableLen = UCOL_LATINONETABLELEN;
    }
    UChar ch = 0;
    UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
    // Bail out if the iterator could not be opened
    if (U_FAILURE(*status)) {
        ucol_closeElements(it);
        return FALSE;
    }
    uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);

    int32_t primShift = 24, secShift = 24, terShift = 24;
    uint32_t CE = 0;
    // Next free slot for CEs that belong to contraction entries.
    int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;

    // TODO: make safe if you get more than you wanted...
    for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
        // Every character starts packing at the top byte of each slot.
        primShift = 24; secShift = 24; terShift = 24;
        if(ch < 0x100) {
            CE = coll->latinOneMapping[ch];
        } else {
            CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            if(CE == UCOL_NOT_FOUND && coll->UCA) {
                CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
            }
        }
        if(CE < UCOL_NOT_FOUND) {
            // Ordinary CE: pack it directly.
            ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
        } else {
            switch (getCETag(CE)) {
            case EXPANSION_TAG:
            case DIGIT_TAG:
                // Let the iterator resolve the special CE into real CEs.
                ucol_setText(it, &ch, 1, status);
                while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                        // A slot overflowed: this character cannot use the fast path.
                        coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        break;
                    }
                    ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                }
                break;
            case CONTRACTION_TAG:
                // here is the trick
                // F2 is contraction. We do something very similar to contractions
                // but have two indices, one in the real contraction table and the
                // other to where we stuffed things. This hopes that we don't have
                // many contractions (this should work for latin-1 tables).
                {
                    // Bits 12..23 are about to receive the Latin-1 table offset;
                    // a contraction CE that already uses them is not supported.
                    if((CE & 0x00FFF000) != 0) {
                        *status = U_UNSUPPORTED_ERROR;
                        goto cleanup_after_failure;
                    }

                    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);

                    CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table

                    coll->latinOneCEs[ch] = CE;
                    coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
                    coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;

                    // We're going to jump into contraction table, pick the elements
                    // and use them
                    do {
                        CE = *(coll->contractionCEs +
                            (UCharOffset - coll->contractionIndex));
                        if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
                            uint32_t size;
                            uint32_t i;    /* general counter */
                            uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
                            size = getExpansionCount(CE);
                            if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
                                for(i = 0; i<size; i++) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            } else { /* else, we do */
                                // Zero-terminated expansion: walk until the 0 CE.
                                while(*CEOffset != 0) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            }
                            contractionOffset++;
                        } else if(CE < UCOL_NOT_FOUND) {
                            ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
                        } else {
                            // Any other special CE inside a contraction is not handled here.
                            coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            contractionOffset++;
                        }
                        UCharOffset++;
                        primShift = 24; secShift = 24; terShift = 24;
                        if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
                            if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
                                goto cleanup_after_failure;
                            }
                        }
                    } while(*UCharOffset != 0xFFFF); // 0xFFFF ends the contraction's character list
                }
                break;;
            case SPEC_PROC_TAG:
                {
                    // 0xB7 is a precontext character defined in UCA5.1, a special
                    // handling is implemented in order to save LatinOne table for
                    // most locales.
                    if (ch==0xb7) {
                        ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                    }
                    else {
                        goto cleanup_after_failure;
                    }
                }
                break;
            default:
                goto cleanup_after_failure;
            }
        }
    }
    // compact table
    if(contractionOffset < coll->latinOneTableLen) {
        if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
            goto cleanup_after_failure;
        }
    }
    ucol_closeElements(it);
    return result;

cleanup_after_failure:
    // *status is set on the unsupported-contraction and allocation-failure
    // paths; the SPEC_PROC_TAG and default bail-outs arrive here with *status
    // unchanged and only report failure through the return value.
6437 coll->latinOneFailed = TRUE; 6438 ucol_closeElements(it); 6439 return FALSE; 6440 } 6441 6442 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { 6443 if(U_SUCCESS(*status)) { 6444 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6445 coll->caseSwitch = UCOL_CASE_SWITCH; 6446 } else { 6447 coll->caseSwitch = UCOL_NO_CASE_SWITCH; 6448 } 6449 6450 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { 6451 coll->tertiaryMask = UCOL_REMOVE_CASE; 6452 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6453 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */ 6454 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; 6455 coll->tertiaryBottom = UCOL_COMMON_BOT3; 6456 } else { 6457 coll->tertiaryMask = UCOL_KEEP_CASE; 6458 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; 6459 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6460 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; 6461 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; 6462 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; 6463 } else { 6464 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6465 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; 6466 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; 6467 } 6468 } 6469 6470 /* Set the compression values */ 6471 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1); 6472 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */ 6473 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount); 6474 6475 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY 6476 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) 6477 { 6478 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; 6479 } else { 6480 coll->sortKeyGen = ucol_calcSortKey; 6481 } 6482 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF 6483 && 
coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) 6484 { 6485 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { 6486 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it 6487 //fprintf(stderr, "F"); 6488 coll->latinOneUse = TRUE; 6489 } else { 6490 coll->latinOneUse = FALSE; 6491 } 6492 if(*status == U_UNSUPPORTED_ERROR) { 6493 *status = U_ZERO_ERROR; 6494 } 6495 } else { // latin1Table exists and it doesn't need to be regenerated, just use it 6496 coll->latinOneUse = TRUE; 6497 } 6498 } else { 6499 coll->latinOneUse = FALSE; 6500 } 6501 } 6502 } 6503 6504 U_CAPI uint32_t U_EXPORT2 6505 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 6506 if(U_FAILURE(*status) || coll == NULL) { 6507 return 0; 6508 } 6509 if(len == -1) { 6510 len = u_strlen(varTop); 6511 } 6512 if(len == 0) { 6513 *status = U_ILLEGAL_ARGUMENT_ERROR; 6514 return 0; 6515 } 6516 6517 if(coll->delegate!=NULL) { 6518 return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status); 6519 } 6520 6521 6522 collIterate s; 6523 IInit_collIterate(coll, varTop, len, &s, status); 6524 if(U_FAILURE(*status)) { 6525 return 0; 6526 } 6527 6528 uint32_t CE = ucol_IGetNextCE(coll, &s, status); 6529 6530 /* here we check if we have consumed all characters */ 6531 /* you can put in either one character or a contraction */ 6532 /* you shouldn't put more... 
*/ 6533 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { 6534 *status = U_CE_NOT_FOUND_ERROR; 6535 return 0; 6536 } 6537 6538 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); 6539 6540 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { 6541 *status = U_PRIMARY_TOO_LONG_ERROR; 6542 return 0; 6543 } 6544 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { 6545 coll->variableTopValueisDefault = FALSE; 6546 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; 6547 } 6548 6549 /* To avoid memory leak, free the offset buffer if necessary. */ 6550 ucol_freeOffsetBuffer(&s); 6551 6552 return CE & UCOL_PRIMARYMASK; 6553 } 6554 6555 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 6556 if(U_FAILURE(*status) || coll == NULL) { 6557 return 0; 6558 } 6559 if(coll->delegate!=NULL) { 6560 return ((const Collator*)coll->delegate)->getVariableTop(*status); 6561 } 6562 return coll->variableTopValue<<16; 6563 } 6564 6565 U_CAPI void U_EXPORT2 6566 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 6567 if(U_FAILURE(*status) || coll == NULL) { 6568 return; 6569 } 6570 6571 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { 6572 coll->variableTopValueisDefault = FALSE; 6573 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; 6574 } 6575 } 6576 /* Attribute setter API */ 6577 U_CAPI void U_EXPORT2 6578 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 6579 if(U_FAILURE(*status) || coll == NULL) { 6580 return; 6581 } 6582 6583 if(coll->delegate != NULL) { 6584 ((Collator*)coll->delegate)->setAttribute(attr,value,*status); 6585 return; 6586 } 6587 6588 UColAttributeValue oldFrench = coll->frenchCollation; 6589 UColAttributeValue oldCaseFirst = coll->caseFirst; 6590 switch(attr) { 6591 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ 6592 if(value == UCOL_ON) { 6593 coll->numericCollation = 
UCOL_ON; 6594 coll->numericCollationisDefault = FALSE; 6595 } else if (value == UCOL_OFF) { 6596 coll->numericCollation = UCOL_OFF; 6597 coll->numericCollationisDefault = FALSE; 6598 } else if (value == UCOL_DEFAULT) { 6599 coll->numericCollationisDefault = TRUE; 6600 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; 6601 } else { 6602 *status = U_ILLEGAL_ARGUMENT_ERROR; 6603 } 6604 break; 6605 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ 6606 if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) { 6607 // This attribute is an implementation detail of the CLDR Japanese tailoring. 6608 // The implementation might change to use a different mechanism 6609 // to achieve the same Japanese sort order. 6610 // Since ICU 50, this attribute is not settable any more via API functions. 6611 } else { 6612 *status = U_ILLEGAL_ARGUMENT_ERROR; 6613 } 6614 break; 6615 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 6616 if(value == UCOL_ON) { 6617 coll->frenchCollation = UCOL_ON; 6618 coll->frenchCollationisDefault = FALSE; 6619 } else if (value == UCOL_OFF) { 6620 coll->frenchCollation = UCOL_OFF; 6621 coll->frenchCollationisDefault = FALSE; 6622 } else if (value == UCOL_DEFAULT) { 6623 coll->frenchCollationisDefault = TRUE; 6624 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation; 6625 } else { 6626 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6627 } 6628 break; 6629 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 6630 if(value == UCOL_SHIFTED) { 6631 coll->alternateHandling = UCOL_SHIFTED; 6632 coll->alternateHandlingisDefault = FALSE; 6633 } else if (value == UCOL_NON_IGNORABLE) { 6634 coll->alternateHandling = UCOL_NON_IGNORABLE; 6635 coll->alternateHandlingisDefault = FALSE; 6636 } else if (value == UCOL_DEFAULT) { 6637 coll->alternateHandlingisDefault = TRUE; 6638 coll->alternateHandling = 
(UColAttributeValue)coll->options->alternateHandling ; 6639 } else { 6640 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6641 } 6642 break; 6643 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 6644 if(value == UCOL_LOWER_FIRST) { 6645 coll->caseFirst = UCOL_LOWER_FIRST; 6646 coll->caseFirstisDefault = FALSE; 6647 } else if (value == UCOL_UPPER_FIRST) { 6648 coll->caseFirst = UCOL_UPPER_FIRST; 6649 coll->caseFirstisDefault = FALSE; 6650 } else if (value == UCOL_OFF) { 6651 coll->caseFirst = UCOL_OFF; 6652 coll->caseFirstisDefault = FALSE; 6653 } else if (value == UCOL_DEFAULT) { 6654 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; 6655 coll->caseFirstisDefault = TRUE; 6656 } else { 6657 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6658 } 6659 break; 6660 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 6661 if(value == UCOL_ON) { 6662 coll->caseLevel = UCOL_ON; 6663 coll->caseLevelisDefault = FALSE; 6664 } else if (value == UCOL_OFF) { 6665 coll->caseLevel = UCOL_OFF; 6666 coll->caseLevelisDefault = FALSE; 6667 } else if (value == UCOL_DEFAULT) { 6668 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; 6669 coll->caseLevelisDefault = TRUE; 6670 } else { 6671 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6672 } 6673 break; 6674 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 6675 if(value == UCOL_ON) { 6676 coll->normalizationMode = UCOL_ON; 6677 coll->normalizationModeisDefault = FALSE; 6678 initializeFCD(status); 6679 } else if (value == UCOL_OFF) { 6680 coll->normalizationMode = UCOL_OFF; 6681 coll->normalizationModeisDefault = FALSE; 6682 } else if (value == UCOL_DEFAULT) { 6683 coll->normalizationModeisDefault = TRUE; 6684 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode; 6685 if(coll->normalizationMode == UCOL_ON) { 6686 initializeFCD(status); 6687 } 6688 } else { 6689 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6690 } 6691 break; 6692 case UCOL_STRENGTH: /* attribute for strength */ 6693 if 
(value == UCOL_DEFAULT) { 6694 coll->strengthisDefault = TRUE; 6695 coll->strength = (UColAttributeValue)coll->options->strength; 6696 } else if (value <= UCOL_IDENTICAL) { 6697 coll->strengthisDefault = FALSE; 6698 coll->strength = value; 6699 } else { 6700 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6701 } 6702 break; 6703 case UCOL_ATTRIBUTE_COUNT: 6704 default: 6705 *status = U_ILLEGAL_ARGUMENT_ERROR; 6706 break; 6707 } 6708 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { 6709 coll->latinOneRegenTable = TRUE; 6710 } else { 6711 coll->latinOneRegenTable = FALSE; 6712 } 6713 ucol_updateInternalState(coll, status); 6714 } 6715 6716 U_CAPI UColAttributeValue U_EXPORT2 6717 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { 6718 if(U_FAILURE(*status) || coll == NULL) { 6719 return UCOL_DEFAULT; 6720 } 6721 6722 if(coll->delegate != NULL) { 6723 return ((Collator*)coll->delegate)->getAttribute(attr,*status); 6724 } 6725 6726 switch(attr) { 6727 case UCOL_NUMERIC_COLLATION: 6728 return coll->numericCollation; 6729 case UCOL_HIRAGANA_QUATERNARY_MODE: 6730 return coll->hiraganaQ; 6731 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 6732 return coll->frenchCollation; 6733 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 6734 return coll->alternateHandling; 6735 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 6736 return coll->caseFirst; 6737 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 6738 return coll->caseLevel; 6739 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 6740 return coll->normalizationMode; 6741 case UCOL_STRENGTH: /* attribute for strength */ 6742 return coll->strength; 6743 case UCOL_ATTRIBUTE_COUNT: 6744 default: 6745 *status = U_ILLEGAL_ARGUMENT_ERROR; 6746 break; 6747 } 6748 return UCOL_DEFAULT; 6749 } 6750 6751 U_CAPI void U_EXPORT2 6752 ucol_setStrength( UCollator *coll, 6753 UCollationStrength 
strength) 6754 { 6755 UErrorCode status = U_ZERO_ERROR; 6756 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); 6757 } 6758 6759 U_CAPI UCollationStrength U_EXPORT2 6760 ucol_getStrength(const UCollator *coll) 6761 { 6762 UErrorCode status = U_ZERO_ERROR; 6763 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); 6764 } 6765 6766 U_CAPI int32_t U_EXPORT2 6767 ucol_getReorderCodes(const UCollator *coll, 6768 int32_t *dest, 6769 int32_t destCapacity, 6770 UErrorCode *status) { 6771 if (U_FAILURE(*status)) { 6772 return 0; 6773 } 6774 6775 if(coll->delegate!=NULL) { 6776 return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status); 6777 } 6778 6779 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 6780 *status = U_ILLEGAL_ARGUMENT_ERROR; 6781 return 0; 6782 } 6783 6784 #ifdef UCOL_DEBUG 6785 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength); 6786 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength); 6787 #endif 6788 6789 if (coll->reorderCodesLength > destCapacity) { 6790 *status = U_BUFFER_OVERFLOW_ERROR; 6791 return coll->reorderCodesLength; 6792 } 6793 for (int32_t i = 0; i < coll->reorderCodesLength; i++) { 6794 dest[i] = coll->reorderCodes[i]; 6795 } 6796 return coll->reorderCodesLength; 6797 } 6798 6799 U_CAPI void U_EXPORT2 6800 ucol_setReorderCodes(UCollator* coll, 6801 const int32_t* reorderCodes, 6802 int32_t reorderCodesLength, 6803 UErrorCode *status) { 6804 if (U_FAILURE(*status)) { 6805 return; 6806 } 6807 6808 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) { 6809 *status = U_ILLEGAL_ARGUMENT_ERROR; 6810 return; 6811 } 6812 6813 if(coll->delegate!=NULL) { 6814 ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status); 6815 return; 6816 } 6817 6818 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { 6819 uprv_free(coll->reorderCodes); 6820 } 6821 coll->reorderCodes = NULL; 6822 
coll->reorderCodesLength = 0; 6823 if (reorderCodesLength == 0) { 6824 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) { 6825 uprv_free(coll->leadBytePermutationTable); 6826 } 6827 coll->leadBytePermutationTable = NULL; 6828 return; 6829 } 6830 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t)); 6831 if (coll->reorderCodes == NULL) { 6832 *status = U_MEMORY_ALLOCATION_ERROR; 6833 return; 6834 } 6835 coll->freeReorderCodesOnClose = TRUE; 6836 for (int32_t i = 0; i < reorderCodesLength; i++) { 6837 coll->reorderCodes[i] = reorderCodes[i]; 6838 } 6839 coll->reorderCodesLength = reorderCodesLength; 6840 ucol_buildPermutationTable(coll, status); 6841 } 6842 6843 U_CAPI int32_t U_EXPORT2 6844 ucol_getEquivalentReorderCodes(int32_t reorderCode, 6845 int32_t* dest, 6846 int32_t destCapacity, 6847 UErrorCode *pErrorCode) { 6848 bool equivalentCodesSet[USCRIPT_CODE_LIMIT]; 6849 uint16_t leadBytes[256]; 6850 int leadBytesCount; 6851 int leadByteIndex; 6852 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT]; 6853 int reorderCodesForLeadByteCount; 6854 int reorderCodeIndex; 6855 6856 int32_t equivalentCodesCount = 0; 6857 int setIndex; 6858 6859 if (U_FAILURE(*pErrorCode)) { 6860 return 0; 6861 } 6862 6863 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 6864 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 6865 return 0; 6866 } 6867 6868 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool)); 6869 6870 const UCollator* uca = ucol_initUCA(pErrorCode); 6871 if (U_FAILURE(*pErrorCode)) { 6872 return 0; 6873 } 6874 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256); 6875 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) { 6876 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte( 6877 uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT); 6878 for (reorderCodeIndex = 0; reorderCodeIndex < 
reorderCodesForLeadByteCount; reorderCodeIndex++) { 6879 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true; 6880 } 6881 } 6882 6883 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { 6884 if (equivalentCodesSet[setIndex] == true) { 6885 equivalentCodesCount++; 6886 } 6887 } 6888 6889 if (destCapacity == 0) { 6890 return equivalentCodesCount; 6891 } 6892 6893 equivalentCodesCount = 0; 6894 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { 6895 if (equivalentCodesSet[setIndex] == true) { 6896 dest[equivalentCodesCount++] = setIndex; 6897 if (equivalentCodesCount >= destCapacity) { 6898 break; 6899 } 6900 } 6901 } 6902 return equivalentCodesCount; 6903 } 6904 6905 6906 /****************************************************************************/ 6907 /* Following are misc functions */ 6908 /* there are new APIs and some compatibility APIs */ 6909 /****************************************************************************/ 6910 6911 U_CAPI void U_EXPORT2 6912 ucol_getVersion(const UCollator* coll, 6913 UVersionInfo versionInfo) 6914 { 6915 if(coll->delegate!=NULL) { 6916 ((const Collator*)coll->delegate)->getVersion(versionInfo); 6917 return; 6918 } 6919 /* RunTime version */ 6920 uint8_t rtVersion = UCOL_RUNTIME_VERSION; 6921 /* Builder version*/ 6922 uint8_t bdVersion = coll->image->version[0]; 6923 6924 /* Charset Version. Need to get the version from cnv files 6925 * makeconv should populate cnv files with version and 6926 * an api has to be provided in ucnv.h to obtain this version 6927 */ 6928 uint8_t csVersion = 0; 6929 6930 /* combine the version info */ 6931 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion)); 6932 6933 /* Tailoring rules */ 6934 versionInfo[0] = (uint8_t)(cmbVersion>>8); 6935 versionInfo[1] = (uint8_t)cmbVersion; 6936 versionInfo[2] = coll->image->version[1]; 6937 if(coll->UCA) { 6938 /* Include the minor number when getting the UCA version. 
(major & 1f) << 3 | (minor & 7) */ 6939 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07); 6940 } else { 6941 versionInfo[3] = 0; 6942 } 6943 } 6944 6945 6946 /* This internal API checks whether a character is tailored or not */ 6947 U_CAPI UBool U_EXPORT2 6948 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { 6949 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { 6950 return FALSE; 6951 } 6952 6953 uint32_t CE = UCOL_NOT_FOUND; 6954 const UChar *ContractionStart = NULL; 6955 if(u < 0x100) { /* latin-1 */ 6956 CE = coll->latinOneMapping[u]; 6957 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { 6958 return FALSE; 6959 } 6960 } else { /* regular */ 6961 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); 6962 } 6963 6964 if(isContraction(CE)) { 6965 ContractionStart = (UChar *)coll->image+getContractOffset(CE); 6966 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); 6967 } 6968 6969 return (UBool)(CE != UCOL_NOT_FOUND); 6970 } 6971 6972 6973 /****************************************************************************/ 6974 /* Following are the string compare functions */ 6975 /* */ 6976 /****************************************************************************/ 6977 6978 6979 /* ucol_checkIdent internal function. Does byte level string compare. */ 6980 /* Used by strcoll if strength == identical and strings */ 6981 /* are otherwise equal. */ 6982 /* */ 6983 /* Comparison must be done on NFD normalized strings. */ 6984 /* FCD is not good enough. */ 6985 6986 static 6987 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status) 6988 { 6989 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both 6990 // of same type, but that doesn't really mean that it will stay that way. 
6991 int32_t comparison; 6992 6993 if (sColl->flags & UCOL_USE_ITERATOR) { 6994 // The division for the array length may truncate the array size to 6995 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 6996 // for all platforms anyway. 6997 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 6998 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 6999 UNormIterator *sNIt = NULL, *tNIt = NULL; 7000 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 7001 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 7002 sColl->iterator->move(sColl->iterator, 0, UITER_START); 7003 tColl->iterator->move(tColl->iterator, 0, UITER_START); 7004 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); 7005 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); 7006 comparison = u_strCompareIter(sIt, tIt, TRUE); 7007 unorm_closeIter(sNIt); 7008 unorm_closeIter(tNIt); 7009 } else { 7010 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1; 7011 const UChar *sBuf = sColl->string; 7012 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1; 7013 const UChar *tBuf = tColl->string; 7014 7015 if (normalize) { 7016 *status = U_ZERO_ERROR; 7017 // Note: We could use Normalizer::compare() or similar, but for short strings 7018 // which may not be in FCD it might be faster to just NFD them. 7019 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than 7020 // NFD'ing immediately might be faster for long strings, 7021 // but string comparison is usually done on relatively short strings. 
7022 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen), 7023 sColl->writableBuffer, 7024 *status); 7025 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen), 7026 tColl->writableBuffer, 7027 *status); 7028 if(U_FAILURE(*status)) { 7029 return UCOL_LESS; 7030 } 7031 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer); 7032 } else { 7033 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE); 7034 } 7035 } 7036 7037 if (comparison < 0) { 7038 return UCOL_LESS; 7039 } else if (comparison == 0) { 7040 return UCOL_EQUAL; 7041 } else /* comparison > 0 */ { 7042 return UCOL_GREATER; 7043 } 7044 } 7045 7046 /* CEBuf - A struct and some inline functions to handle the saving */ 7047 /* of CEs in a buffer within ucol_strcoll */ 7048 7049 #define UCOL_CEBUF_SIZE 512 7050 typedef struct ucol_CEBuf { 7051 uint32_t *buf; 7052 uint32_t *endp; 7053 uint32_t *pos; 7054 uint32_t localArray[UCOL_CEBUF_SIZE]; 7055 } ucol_CEBuf; 7056 7057 7058 static 7059 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { 7060 (b)->buf = (b)->pos = (b)->localArray; 7061 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; 7062 } 7063 7064 static 7065 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { 7066 uint32_t oldSize; 7067 uint32_t newSize; 7068 uint32_t *newBuf; 7069 7070 ci->flags |= UCOL_ITER_ALLOCATED; 7071 oldSize = (uint32_t)(b->pos - b->buf); 7072 newSize = oldSize * 2; 7073 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); 7074 if(newBuf == NULL) { 7075 *status = U_MEMORY_ALLOCATION_ERROR; 7076 } 7077 else { 7078 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); 7079 if (b->buf != b->localArray) { 7080 uprv_free(b->buf); 7081 } 7082 b->buf = newBuf; 7083 b->endp = b->buf + newSize; 7084 b->pos = b->buf + oldSize; 7085 } 7086 } 7087 7088 static 7089 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) { 7090 if (b->pos == 
b->endp) { 7091 ucol_CEBuf_Expand(b, ci, status); 7092 } 7093 if (U_SUCCESS(*status)) { 7094 *(b)->pos++ = ce; 7095 } 7096 } 7097 7098 /* This is a trick string compare function that goes in and uses sortkeys to compare */ 7099 /* It is used when compare gets in trouble and needs to bail out */ 7100 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, 7101 collIterate *tColl, 7102 UErrorCode *status) 7103 { 7104 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; 7105 uint8_t *sourceKeyP = sourceKey; 7106 uint8_t *targetKeyP = targetKey; 7107 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; 7108 const UCollator *coll = sColl->coll; 7109 const UChar *source = NULL; 7110 const UChar *target = NULL; 7111 int32_t result = UCOL_EQUAL; 7112 UnicodeString sourceString, targetString; 7113 int32_t sourceLength; 7114 int32_t targetLength; 7115 7116 if(sColl->flags & UCOL_USE_ITERATOR) { 7117 sColl->iterator->move(sColl->iterator, 0, UITER_START); 7118 tColl->iterator->move(tColl->iterator, 0, UITER_START); 7119 UChar32 c; 7120 while((c=sColl->iterator->next(sColl->iterator))>=0) { 7121 sourceString.append((UChar)c); 7122 } 7123 while((c=tColl->iterator->next(tColl->iterator))>=0) { 7124 targetString.append((UChar)c); 7125 } 7126 source = sourceString.getBuffer(); 7127 sourceLength = sourceString.length(); 7128 target = targetString.getBuffer(); 7129 targetLength = targetString.length(); 7130 } else { // no iterators 7131 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1; 7132 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1; 7133 source = sColl->string; 7134 target = tColl->string; 7135 } 7136 7137 7138 7139 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 7140 if(sourceKeyLen > UCOL_MAX_BUFFER) { 7141 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); 7142 if(sourceKeyP == NULL) { 7143 *status = 
U_MEMORY_ALLOCATION_ERROR; 7144 goto cleanup_and_do_compare; 7145 } 7146 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 7147 } 7148 7149 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 7150 if(targetKeyLen > UCOL_MAX_BUFFER) { 7151 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); 7152 if(targetKeyP == NULL) { 7153 *status = U_MEMORY_ALLOCATION_ERROR; 7154 goto cleanup_and_do_compare; 7155 } 7156 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 7157 } 7158 7159 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); 7160 7161 cleanup_and_do_compare: 7162 if(sourceKeyP != NULL && sourceKeyP != sourceKey) { 7163 uprv_free(sourceKeyP); 7164 } 7165 7166 if(targetKeyP != NULL && targetKeyP != targetKey) { 7167 uprv_free(targetKeyP); 7168 } 7169 7170 if(result<0) { 7171 return UCOL_LESS; 7172 } else if(result>0) { 7173 return UCOL_GREATER; 7174 } else { 7175 return UCOL_EQUAL; 7176 } 7177 } 7178 7179 7180 static UCollationResult 7181 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) 7182 { 7183 U_ALIGN_CODE(16); 7184 7185 const UCollator *coll = sColl->coll; 7186 7187 7188 // setting up the collator parameters 7189 UColAttributeValue strength = coll->strength; 7190 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); 7191 7192 UBool checkSecTer = initialCheckSecTer; 7193 UBool checkTertiary = (strength >= UCOL_TERTIARY); 7194 UBool checkQuad = (strength >= UCOL_QUATERNARY); 7195 UBool checkIdent = (strength == UCOL_IDENTICAL); 7196 UBool checkCase = (coll->caseLevel == UCOL_ON); 7197 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; 7198 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 7199 UBool qShifted = shifted && checkQuad; 7200 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; 7201 7202 if(doHiragana && shifted) { 7203 return 
(ucol_compareUsingSortKeys(sColl, tColl, status)); 7204 } 7205 uint8_t caseSwitch = coll->caseSwitch; 7206 uint8_t tertiaryMask = coll->tertiaryMask; 7207 7208 // This is the lowest primary value that will not be ignored if shifted 7209 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; 7210 7211 UCollationResult result = UCOL_EQUAL; 7212 UCollationResult hirResult = UCOL_EQUAL; 7213 7214 // Preparing the CE buffers. They will be filled during the primary phase 7215 ucol_CEBuf sCEs; 7216 ucol_CEBuf tCEs; 7217 UCOL_INIT_CEBUF(&sCEs); 7218 UCOL_INIT_CEBUF(&tCEs); 7219 7220 uint32_t secS = 0, secT = 0; 7221 uint32_t sOrder=0, tOrder=0; 7222 7223 // Non shifted primary processing is quite simple 7224 if(!shifted) { 7225 for(;;) { 7226 7227 // We fetch CEs until we hit a non ignorable primary or end. 7228 do { 7229 // We get the next CE 7230 sOrder = ucol_IGetNextCE(coll, sColl, status); 7231 // Stuff it in the buffer 7232 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7233 // And keep just the primary part. 7234 sOrder &= UCOL_PRIMARYMASK; 7235 } while(sOrder == 0); 7236 7237 // see the comments on the above block 7238 do { 7239 tOrder = ucol_IGetNextCE(coll, tColl, status); 7240 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7241 tOrder &= UCOL_PRIMARYMASK; 7242 } while(tOrder == 0); 7243 7244 // if both primaries are the same 7245 if(sOrder == tOrder) { 7246 // and there are no more CEs, we advance to the next level 7247 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 7248 break; 7249 } 7250 if(doHiragana && hirResult == UCOL_EQUAL) { 7251 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) { 7252 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA)) 7253 ? 
UCOL_LESS:UCOL_GREATER; 7254 } 7255 } 7256 } else { 7257 // only need to check one for continuation 7258 // if one is then the other must be or the preceding CE would be a prefix of the other 7259 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) { 7260 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF); 7261 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); 7262 } 7263 // if two primaries are different, we are done 7264 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER; 7265 goto commonReturn; 7266 } 7267 } // no primary difference... do the rest from the buffers 7268 } else { // shifted - do a slightly more complicated processing :) 7269 for(;;) { 7270 UBool sInShifted = FALSE; 7271 UBool tInShifted = FALSE; 7272 // This version of code can be refactored. However, it seems easier to understand this way. 7273 // Source loop. Sam as the target loop. 7274 for(;;) { 7275 sOrder = ucol_IGetNextCE(coll, sColl, status); 7276 if(sOrder == UCOL_NO_MORE_CES) { 7277 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7278 break; 7279 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) { 7280 /* UCA amendment - ignore ignorables that follow shifted code points */ 7281 continue; 7282 } else if(isContinuation(sOrder)) { 7283 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ 7284 if(sInShifted) { 7285 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 7286 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7287 continue; 7288 } else { 7289 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7290 break; 7291 } 7292 } else { /* Just lower level values */ 7293 if(sInShifted) { 7294 continue; 7295 } else { 7296 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7297 continue; 7298 } 7299 } 7300 } else { /* regular */ 7301 if(coll->leadBytePermutationTable != NULL){ 7302 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 
0x00FFFFFF); 7303 } 7304 if((sOrder & UCOL_PRIMARYMASK) > LVT) { 7305 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7306 break; 7307 } else { 7308 if((sOrder & UCOL_PRIMARYMASK) > 0) { 7309 sInShifted = TRUE; 7310 sOrder &= UCOL_PRIMARYMASK; 7311 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7312 continue; 7313 } else { 7314 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7315 sInShifted = FALSE; 7316 continue; 7317 } 7318 } 7319 } 7320 } 7321 sOrder &= UCOL_PRIMARYMASK; 7322 sInShifted = FALSE; 7323 7324 for(;;) { 7325 tOrder = ucol_IGetNextCE(coll, tColl, status); 7326 if(tOrder == UCOL_NO_MORE_CES) { 7327 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7328 break; 7329 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) { 7330 /* UCA amendment - ignore ignorables that follow shifted code points */ 7331 continue; 7332 } else if(isContinuation(tOrder)) { 7333 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ 7334 if(tInShifted) { 7335 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 7336 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7337 continue; 7338 } else { 7339 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7340 break; 7341 } 7342 } else { /* Just lower level values */ 7343 if(tInShifted) { 7344 continue; 7345 } else { 7346 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7347 continue; 7348 } 7349 } 7350 } else { /* regular */ 7351 if(coll->leadBytePermutationTable != NULL){ 7352 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); 7353 } 7354 if((tOrder & UCOL_PRIMARYMASK) > LVT) { 7355 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7356 break; 7357 } else { 7358 if((tOrder & UCOL_PRIMARYMASK) > 0) { 7359 tInShifted = TRUE; 7360 tOrder &= UCOL_PRIMARYMASK; 7361 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7362 continue; 7363 } else { 7364 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7365 tInShifted = FALSE; 7366 continue; 7367 } 7368 } 7369 } 7370 } 7371 
tOrder &= UCOL_PRIMARYMASK; 7372 tInShifted = FALSE; 7373 7374 if(sOrder == tOrder) { 7375 /* 7376 if(doHiragana && hirResult == UCOL_EQUAL) { 7377 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) { 7378 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA)) 7379 ? UCOL_LESS:UCOL_GREATER; 7380 } 7381 } 7382 */ 7383 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 7384 break; 7385 } else { 7386 sOrder = 0; 7387 tOrder = 0; 7388 continue; 7389 } 7390 } else { 7391 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; 7392 goto commonReturn; 7393 } 7394 } /* no primary difference... do the rest from the buffers */ 7395 } 7396 7397 /* now, we're gonna reexamine collected CEs */ 7398 uint32_t *sCE; 7399 uint32_t *tCE; 7400 7401 /* This is the secondary level of comparison */ 7402 if(checkSecTer) { 7403 if(!isFrenchSec) { /* normal */ 7404 sCE = sCEs.buf; 7405 tCE = tCEs.buf; 7406 for(;;) { 7407 while (secS == 0) { 7408 secS = *(sCE++) & UCOL_SECONDARYMASK; 7409 } 7410 7411 while(secT == 0) { 7412 secT = *(tCE++) & UCOL_SECONDARYMASK; 7413 } 7414 7415 if(secS == secT) { 7416 if(secS == UCOL_NO_MORE_CES_SECONDARY) { 7417 break; 7418 } else { 7419 secS = 0; secT = 0; 7420 continue; 7421 } 7422 } else { 7423 result = (secS < secT) ? 
UCOL_LESS : UCOL_GREATER; 7424 goto commonReturn; 7425 } 7426 } 7427 } else { /* do the French */ 7428 uint32_t *sCESave = NULL; 7429 uint32_t *tCESave = NULL; 7430 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */ 7431 tCE = tCEs.pos-2; 7432 for(;;) { 7433 while (secS == 0 && sCE >= sCEs.buf) { 7434 if(sCESave == NULL) { 7435 secS = *(sCE--); 7436 if(isContinuation(secS)) { 7437 while(isContinuation(secS = *(sCE--))) 7438 ; 7439 /* after this, secS has the start of continuation, and sCEs points before that */ 7440 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */ 7441 sCE+=2; /* need to point to the first continuation CP */ 7442 /* However, now you can just continue doing stuff */ 7443 } 7444 } else { 7445 secS = *(sCE++); 7446 if(!isContinuation(secS)) { /* This means we have finished with this cont */ 7447 sCE = sCESave; /* reset the pointer to before continuation */ 7448 sCESave = NULL; 7449 secS = 0; /* Fetch a fresh CE before the continuation sequence. */ 7450 continue; 7451 } 7452 } 7453 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 7454 } 7455 7456 while(secT == 0 && tCE >= tCEs.buf) { 7457 if(tCESave == NULL) { 7458 secT = *(tCE--); 7459 if(isContinuation(secT)) { 7460 while(isContinuation(secT = *(tCE--))) 7461 ; 7462 /* after this, secS has the start of continuation, and sCEs points before that */ 7463 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */ 7464 tCE+=2; /* need to point to the first continuation CP */ 7465 /* However, now you can just continue doing stuff */ 7466 } 7467 } else { 7468 secT = *(tCE++); 7469 if(!isContinuation(secT)) { /* This means we have finished with this cont */ 7470 tCE = tCESave; /* reset the pointer to before continuation */ 7471 tCESave = NULL; 7472 secT = 0; /* Fetch a fresh CE before the continuation sequence. 
*/ 7473 continue; 7474 } 7475 } 7476 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 7477 } 7478 7479 if(secS == secT) { 7480 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) { 7481 break; 7482 } else { 7483 secS = 0; secT = 0; 7484 continue; 7485 } 7486 } else { 7487 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7488 goto commonReturn; 7489 } 7490 } 7491 } 7492 } 7493 7494 /* doing the case bit */ 7495 if(checkCase) { 7496 sCE = sCEs.buf; 7497 tCE = tCEs.buf; 7498 for(;;) { 7499 while((secS & UCOL_REMOVE_CASE) == 0) { 7500 if(!isContinuation(*sCE++)) { 7501 secS =*(sCE-1); 7502 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 7503 // primary ignorables should not be considered on the case level when the strength is primary 7504 // otherwise, the CEs stop being well-formed 7505 secS &= UCOL_TERT_CASE_MASK; 7506 secS ^= caseSwitch; 7507 } else { 7508 secS = 0; 7509 } 7510 } else { 7511 secS = 0; 7512 } 7513 } 7514 7515 while((secT & UCOL_REMOVE_CASE) == 0) { 7516 if(!isContinuation(*tCE++)) { 7517 secT = *(tCE-1); 7518 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 7519 // primary ignorables should not be considered on the case level when the strength is primary 7520 // otherwise, the CEs stop being well-formed 7521 secT &= UCOL_TERT_CASE_MASK; 7522 secT ^= caseSwitch; 7523 } else { 7524 secT = 0; 7525 } 7526 } else { 7527 secT = 0; 7528 } 7529 } 7530 7531 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { 7532 result = UCOL_LESS; 7533 goto commonReturn; 7534 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) { 7535 result = UCOL_GREATER; 7536 goto commonReturn; 7537 } 7538 7539 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { 7540 break; 7541 } else { 7542 secS = 0; 7543 secT = 0; 7544 } 7545 } 7546 } 7547 7548 /* Tertiary level */ 7549 if(checkTertiary) { 7550 secS = 0; 7551 secT = 
0; 7552 sCE = sCEs.buf; 7553 tCE = tCEs.buf; 7554 for(;;) { 7555 while((secS & UCOL_REMOVE_CASE) == 0) { 7556 secS = *(sCE++) & tertiaryMask; 7557 if(!isContinuation(secS)) { 7558 secS ^= caseSwitch; 7559 } else { 7560 secS &= UCOL_REMOVE_CASE; 7561 } 7562 } 7563 7564 while((secT & UCOL_REMOVE_CASE) == 0) { 7565 secT = *(tCE++) & tertiaryMask; 7566 if(!isContinuation(secT)) { 7567 secT ^= caseSwitch; 7568 } else { 7569 secT &= UCOL_REMOVE_CASE; 7570 } 7571 } 7572 7573 if(secS == secT) { 7574 if((secS & UCOL_REMOVE_CASE) == 1) { 7575 break; 7576 } else { 7577 secS = 0; secT = 0; 7578 continue; 7579 } 7580 } else { 7581 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7582 goto commonReturn; 7583 } 7584 } 7585 } 7586 7587 7588 if(qShifted /*checkQuad*/) { 7589 UBool sInShifted = TRUE; 7590 UBool tInShifted = TRUE; 7591 secS = 0; 7592 secT = 0; 7593 sCE = sCEs.buf; 7594 tCE = tCEs.buf; 7595 for(;;) { 7596 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) { 7597 secS = *(sCE++); 7598 if(isContinuation(secS)) { 7599 if(!sInShifted) { 7600 continue; 7601 } 7602 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */ 7603 secS = UCOL_PRIMARYMASK; 7604 sInShifted = FALSE; 7605 } else { 7606 sInShifted = TRUE; 7607 } 7608 } 7609 secS &= UCOL_PRIMARYMASK; 7610 7611 7612 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) { 7613 secT = *(tCE++); 7614 if(isContinuation(secT)) { 7615 if(!tInShifted) { 7616 continue; 7617 } 7618 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { 7619 secT = UCOL_PRIMARYMASK; 7620 tInShifted = FALSE; 7621 } else { 7622 tInShifted = TRUE; 7623 } 7624 } 7625 secT &= UCOL_PRIMARYMASK; 7626 7627 if(secS == secT) { 7628 if(secS == UCOL_NO_MORE_CES_PRIMARY) { 7629 break; 7630 } else { 7631 secS = 0; secT = 0; 7632 continue; 7633 } 7634 } else { 7635 result = (secS < secT) ? 
UCOL_LESS : UCOL_GREATER; 7636 goto commonReturn; 7637 } 7638 } 7639 } else if(doHiragana && hirResult != UCOL_EQUAL) { 7640 // If we're fine on quaternaries, we might be different 7641 // on Hiragana. This, however, might fail us in shifted. 7642 result = hirResult; 7643 goto commonReturn; 7644 } 7645 7646 /* For IDENTICAL comparisons, we use a bitwise character comparison */ 7647 /* as a tiebreaker if all else is equal. */ 7648 /* Getting here should be quite rare - strings are not identical - */ 7649 /* that is checked first, but compared == through all other checks. */ 7650 if(checkIdent) 7651 { 7652 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON); 7653 result = ucol_checkIdent(sColl, tColl, TRUE, status); 7654 } 7655 7656 commonReturn: 7657 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { 7658 if (sCEs.buf != sCEs.localArray ) { 7659 uprv_free(sCEs.buf); 7660 } 7661 if (tCEs.buf != tCEs.localArray ) { 7662 uprv_free(tCEs.buf); 7663 } 7664 } 7665 7666 return result; 7667 } 7668 7669 static UCollationResult 7670 ucol_strcollRegular(const UCollator *coll, 7671 const UChar *source, int32_t sourceLength, 7672 const UChar *target, int32_t targetLength, 7673 UErrorCode *status) { 7674 collIterate sColl, tColl; 7675 // Preparing the context objects for iterating over strings 7676 IInit_collIterate(coll, source, sourceLength, &sColl, status); 7677 IInit_collIterate(coll, target, targetLength, &tColl, status); 7678 if(U_FAILURE(*status)) { 7679 return UCOL_LESS; 7680 } 7681 return ucol_strcollRegular(&sColl, &tColl, status); 7682 } 7683 7684 static inline uint32_t 7685 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, 7686 uint32_t CE, const UChar *s, int32_t *index, int32_t len) 7687 { 7688 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); 7689 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; 7690 int32_t offset = 1; 7691 UChar schar = 0, tchar = 0; 7692 7693 for(;;) { 7694 if(len 
== -1) { 7695 if(s[*index] == 0) { // end of string 7696 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7697 } else { 7698 schar = s[*index]; 7699 } 7700 } else { 7701 if(*index == len) { 7702 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7703 } else { 7704 schar = s[*index]; 7705 } 7706 } 7707 7708 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 7709 offset++; 7710 } 7711 7712 if (schar == tchar) { 7713 (*index)++; 7714 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 7715 } 7716 else 7717 { 7718 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 7719 return UCOL_BAIL_OUT_CE; 7720 } 7721 // skip completely ignorables 7722 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 7723 if(isZeroCE == 0) { // we have to ignore completely ignorables 7724 (*index)++; 7725 continue; 7726 } 7727 7728 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7729 } 7730 } 7731 } 7732 7733 7734 /** 7735 * This is a fast strcoll, geared towards text in Latin-1. 7736 * It supports contractions of size two, French secondaries 7737 * and case switching. You can use it with strengths primary 7738 * to tertiary. It does not support shifted and case level. 7739 * It relies on the table build by setupLatin1Table. If it 7740 * doesn't understand something, it will go to the regular 7741 * strcoll. 
/**
 * This is a fast strcoll, geared towards text in Latin-1.
 * It supports contractions of size two, French secondaries
 * and case switching. You can use it with strengths primary
 * to tertiary. It does not support shifted and case level.
 * It relies on the table built by setupLatin1Table. If it
 * doesn't understand something, it will go to the regular
 * strcoll.
 *
 * The comparison runs one level at a time (primary, then secondary, then
 * tertiary, as far as coll->strength asks for). Each level reads its CEs
 * from a consecutive slice of coll->latinOneCEs that is
 * coll->latinOneTableLen entries long.
 *
 * @param coll    collator; its latinOneCEs table must be set up
 * @param source  source string
 * @param sLen    length of source, or -1 if NUL-terminated
 * @param target  target string
 * @param tLen    length of target, or -1 if NUL-terminated
 * @param status  in/out error code, only passed on to ucol_strcollRegular
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static UCollationResult
ucol_strcollUseLatin1( const UCollator    *coll,
              const UChar        *source,
              int32_t            sLen,
              const UChar        *target,
              int32_t            tLen,
              UErrorCode *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar sChar = 0, tChar = 0;
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    // Start of the slice of the Latin-1 table for the current level.
    uint32_t *elements = coll->latinOneCEs;

    UBool haveContractions = FALSE; // if we have contractions in our string
                                    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if(sLen==-1) {   // handling zero terminated strings
                sChar=source[sIndex++];
                if(sChar==0) {
                    endOfSource = TRUE;
                    break;
                }
            } else {        // handling strings with known length
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
            }
            if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if(tLen==-1) {    // handling zero terminated strings
                tChar=target[tIndex++];
                if(tChar==0) {
                    if(endOfSource) { // this is different than source loop,
                        // as we already know that source loop is done here,
                        // so we can either finish the primary loop if both
                        // strings are done or announce the result if only
                        // target is done. Same below.
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
            } else {          // handling strings with known length
                if(tIndex==tLen) {
                    if(endOfSource) {
                        goto endOfPrimLoop;
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
            }
            if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoop:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    // NOTE(review): for NUL-terminated input the index has already stepped
    // past the terminator, so these lengths include it and the loops below
    // also look up elements[0] for it. Presumably harmless if U+0000 maps
    // to a zero CE on every level - confirm against setupLatin1Table.
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[sIndex++];
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[tIndex++];
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    sChar=source[--sIndex];
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoop;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    tChar=target[--tIndex];
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoop:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                sChar=source[sIndex++];
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                tChar=target[tIndex++];
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    return UCOL_EQUAL;
}
8043 */ 8044 static UCollationResult 8045 ucol_strcollRegularUTF8( 8046 const UCollator *coll, 8047 const char *source, 8048 int32_t sourceLength, 8049 const char *target, 8050 int32_t targetLength, 8051 UErrorCode *status) 8052 { 8053 UCharIterator src; 8054 UCharIterator tgt; 8055 8056 uiter_setUTF8(&src, source, sourceLength); 8057 uiter_setUTF8(&tgt, target, targetLength); 8058 8059 // Preparing the context objects for iterating over strings 8060 collIterate sColl, tColl; 8061 IInit_collIterate(coll, NULL, -1, &sColl, status); 8062 IInit_collIterate(coll, NULL, -1, &tColl, status); 8063 if(U_FAILURE(*status)) { 8064 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8065 return UCOL_EQUAL; 8066 } 8067 // The division for the array length may truncate the array size to 8068 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 8069 // for all platforms anyway. 8070 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8071 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8072 UNormIterator *sNormIter = NULL, *tNormIter = NULL; 8073 8074 sColl.iterator = &src; 8075 sColl.flags |= UCOL_USE_ITERATOR; 8076 tColl.flags |= UCOL_USE_ITERATOR; 8077 tColl.iterator = &tgt; 8078 8079 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 8080 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 8081 sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status); 8082 sColl.flags &= ~UCOL_ITER_NORM; 8083 8084 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 8085 tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status); 8086 tColl.flags &= ~UCOL_ITER_NORM; 8087 } 8088 8089 return ucol_strcollRegular(&sColl, &tColl, status); 8090 } 8091 8092 static inline uint32_t 8093 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength, 8094 uint32_t CE, const char *s, int32_t *index, int32_t len) 8095 { 8096 const UChar *UCharOffset 
= (UChar *)coll->image+getContractOffset(CE&0xFFF); 8097 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; 8098 int32_t offset = 1; 8099 UChar32 schar = 0, tchar = 0; 8100 8101 for(;;) { 8102 if (*index == len) { 8103 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8104 } 8105 U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar); 8106 if (len < 0 && schar == 0) { 8107 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8108 } 8109 8110 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 8111 offset++; 8112 } 8113 8114 if (schar == tchar) { 8115 U8_FWD_1(s, *index, len); 8116 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 8117 } 8118 else 8119 { 8120 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 8121 return UCOL_BAIL_OUT_CE; 8122 } 8123 // skip completely ignorables 8124 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 8125 if(isZeroCE == 0) { // we have to ignore completely ignorables 8126 U8_FWD_1(s, *index, len); 8127 continue; 8128 } 8129 8130 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8131 } 8132 } 8133 } 8134 8135 static inline UCollationResult 8136 ucol_strcollUseLatin1UTF8( 8137 const UCollator *coll, 8138 const char *source, 8139 int32_t sLen, 8140 const char *target, 8141 int32_t tLen, 8142 UErrorCode *status) 8143 { 8144 U_ALIGN_CODE(16); 8145 int32_t strength = coll->strength; 8146 8147 int32_t sIndex = 0, tIndex = 0; 8148 UChar32 sChar = 0, tChar = 0; 8149 uint32_t sOrder=0, tOrder=0; 8150 8151 UBool endOfSource = FALSE; 8152 8153 uint32_t *elements = coll->latinOneCEs; 8154 8155 UBool haveContractions = FALSE; // if we have contractions in our string 8156 // we cannot do French secondary 8157 8158 // Do the primary level 8159 for(;;) { 8160 while(sOrder==0) { // this loop skips primary ignorables 8161 // 
/**
 * UTF-8 counterpart of ucol_strcollUseLatin1: a fast comparison for input
 * whose code points all fit in Latin-1.
 *
 * The comparison runs one level at a time (primary, then secondary, then
 * tertiary, as far as coll->strength asks for). Each level reads its CEs
 * from a consecutive slice of coll->latinOneCEs that is
 * coll->latinOneTableLen entries long. As soon as the primary pass meets
 * a code point above 0xFF or a special CE it cannot resolve, the whole
 * comparison is handed to ucol_strcollRegularUTF8.
 *
 * @param coll    collator; its latinOneCEs table must be set up
 * @param source  UTF-8 source string
 * @param sLen    length of source in bytes, or negative if NUL-terminated
 * @param target  UTF-8 target string
 * @param tLen    length of target in bytes, or negative if NUL-terminated
 * @param status  in/out error code, only passed on to ucol_strcollRegularUTF8
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static inline UCollationResult
ucol_strcollUseLatin1UTF8(
                const UCollator *coll,
                const char      *source,
                int32_t         sLen,
                const char      *target,
                int32_t         tLen,
                UErrorCode      *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar32 sChar = 0, tChar = 0;
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    // Start of the slice of the Latin-1 table for the current level.
    uint32_t *elements = coll->latinOneCEs;

    UBool haveContractions = FALSE; // if we have contractions in our string
                                    // we cannot do French secondary

    // Do the primary level
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            // sOrder=getNextlatinOneCE(source);
            if (sIndex == sLen) {
                endOfSource = TRUE;
                break;
            }
            U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
            if (sLen < 0 && sChar == 0) {
                endOfSource = TRUE;
                // remember where the string ended so that the check at the
                // top of this loop fires from now on
                sLen = sIndex;
                break;
            }
            if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                }
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            // tOrder=getNextlatinOneCE(target);
            if (tIndex == tLen) {
                // source loop is already done here, so either both strings
                // are finished or only the target is
                if(endOfSource) {
                    goto endOfPrimLoopU8;
                } else {
                    return UCOL_GREATER;
                }
            }
            U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
            if (tLen < 0 && tChar == 0) {
                if(endOfSource) {
                    tLen = tIndex;
                    goto endOfPrimLoopU8;
                } else {
                    return UCOL_GREATER;
                }
            }
            if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
                //fprintf(stderr, "R");
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    //fprintf(stderr, "S");
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
            sOrder = 0; tOrder = 0;
            continue;
        } else {
            // compare current top bytes
            if(((sOrder^tOrder)&0xFF000000)!=0) {
                // top bytes differ, return difference
                if(sOrder < tOrder) {
                    return UCOL_LESS;
                } else if(sOrder > tOrder) {
                    return UCOL_GREATER;
                }
                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
                // since we must return enum value
            }

            // top bytes match, continue with following bytes
            sOrder<<=8;
            tOrder<<=8;
        }
    }

endOfPrimLoopU8:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    // NOTE(review): for NUL-terminated input the index has already stepped
    // past the terminator, so these lengths include it and the loops below
    // also look up elements[0] for it. Presumably harmless if U+0000 maps
    // to a zero CE on every level - confirm against setupLatin1Table.
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    U_ASSERT(sLen >= 0);
                    U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(endOfSource) {
                            goto endOfSecLoopU8;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    U_ASSERT(tLen >= 0);
                    U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see primary loop for comments on this
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(endOfSource) {
                            goto endOfSecLoopU8;
                        } else {
                            return UCOL_GREATER;
                        }
                    }
                    U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = 0; tOrder = 0;
                    continue;
                } else {
                    // see the primary loop for comments
                    if(((sOrder^tOrder)&0xFF000000)!=0) {
                        if(sOrder < tOrder) {
                            return UCOL_LESS;
                        } else if(sOrder > tOrder) {
                            return UCOL_GREATER;
                        }
                    }
                    sOrder<<=8;
                    tOrder<<=8;
                }
            }
        }
    }

endOfSecLoopU8:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                U_ASSERT(sLen >= 0);
                U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
                U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    if(endOfSource) {
                        return UCOL_EQUAL; // if both strings are at the end, they are equal
                    } else {
                        return UCOL_GREATER;
                    }
                }
                U_ASSERT(tLen >= 0);
                U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
                U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = 0; tOrder = 0;
                continue;
            } else {
                if(((sOrder^tOrder)&0xff000000)!=0) {
                    if(sOrder < tOrder) {
                        return UCOL_LESS;
                    } else if(sOrder > tOrder) {
                        return UCOL_GREATER;
                    }
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }
    return UCOL_EQUAL;
}
8421 } else if(sOrder > tOrder) { 8422 return UCOL_GREATER; 8423 } 8424 } 8425 sOrder<<=8; 8426 tOrder<<=8; 8427 } 8428 } 8429 } 8430 return UCOL_EQUAL; 8431 } 8432 8433 U_CAPI UCollationResult U_EXPORT2 8434 ucol_strcollIter( const UCollator *coll, 8435 UCharIterator *sIter, 8436 UCharIterator *tIter, 8437 UErrorCode *status) 8438 { 8439 if(!status || U_FAILURE(*status)) { 8440 return UCOL_EQUAL; 8441 } 8442 8443 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); 8444 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter); 8445 8446 if (sIter == tIter) { 8447 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8448 return UCOL_EQUAL; 8449 } 8450 if(sIter == NULL || tIter == NULL || coll == NULL) { 8451 *status = U_ILLEGAL_ARGUMENT_ERROR; 8452 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8453 return UCOL_EQUAL; 8454 } 8455 8456 UCollationResult result = UCOL_EQUAL; 8457 8458 // Preparing the context objects for iterating over strings 8459 collIterate sColl, tColl; 8460 IInit_collIterate(coll, NULL, -1, &sColl, status); 8461 IInit_collIterate(coll, NULL, -1, &tColl, status); 8462 if(U_FAILURE(*status)) { 8463 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8464 return UCOL_EQUAL; 8465 } 8466 // The division for the array length may truncate the array size to 8467 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 8468 // for all platforms anyway. 
8469 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8470 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8471 UNormIterator *sNormIter = NULL, *tNormIter = NULL; 8472 8473 sColl.iterator = sIter; 8474 sColl.flags |= UCOL_USE_ITERATOR; 8475 tColl.flags |= UCOL_USE_ITERATOR; 8476 tColl.iterator = tIter; 8477 8478 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 8479 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 8480 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); 8481 sColl.flags &= ~UCOL_ITER_NORM; 8482 8483 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 8484 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); 8485 tColl.flags &= ~UCOL_ITER_NORM; 8486 } 8487 8488 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; 8489 8490 while((sChar = sColl.iterator->next(sColl.iterator)) == 8491 (tChar = tColl.iterator->next(tColl.iterator))) { 8492 if(sChar == U_SENTINEL) { 8493 result = UCOL_EQUAL; 8494 goto end_compare; 8495 } 8496 } 8497 8498 if(sChar == U_SENTINEL) { 8499 tChar = tColl.iterator->previous(tColl.iterator); 8500 } 8501 8502 if(tChar == U_SENTINEL) { 8503 sChar = sColl.iterator->previous(sColl.iterator); 8504 } 8505 8506 sChar = sColl.iterator->previous(sColl.iterator); 8507 tChar = tColl.iterator->previous(tColl.iterator); 8508 8509 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) 8510 { 8511 // We are stopped in the middle of a contraction. 8512 // Scan backwards through the == part of the string looking for the start of the contraction. 8513 // It doesn't matter which string we scan, since they are the same in this region. 
8514 do 8515 { 8516 sChar = sColl.iterator->previous(sColl.iterator); 8517 tChar = tColl.iterator->previous(tColl.iterator); 8518 } 8519 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); 8520 } 8521 8522 8523 if(U_SUCCESS(*status)) { 8524 result = ucol_strcollRegular(&sColl, &tColl, status); 8525 } 8526 8527 end_compare: 8528 if(sNormIter || tNormIter) { 8529 unorm_closeIter(sNormIter); 8530 unorm_closeIter(tNormIter); 8531 } 8532 8533 UTRACE_EXIT_VALUE_STATUS(result, *status) 8534 return result; 8535 } 8536 8537 8538 /* */ 8539 /* ucol_strcoll Main public API string comparison function */ 8540 /* */ 8541 U_CAPI UCollationResult U_EXPORT2 8542 ucol_strcoll( const UCollator *coll, 8543 const UChar *source, 8544 int32_t sourceLength, 8545 const UChar *target, 8546 int32_t targetLength) 8547 { 8548 U_ALIGN_CODE(16); 8549 8550 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); 8551 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 8552 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 8553 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); 8554 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); 8555 } 8556 8557 if(source == NULL || target == NULL) { 8558 // do not crash, but return. Should have 8559 // status argument to return error. 8560 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8561 return UCOL_EQUAL; 8562 } 8563 8564 /* Quick check if source and target are same strings. */ 8565 /* They should either both be NULL terminated or the explicit length should be set on both. */ 8566 if (source==target && sourceLength==targetLength) { 8567 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8568 return UCOL_EQUAL; 8569 } 8570 8571 if(coll->delegate != NULL) { 8572 UErrorCode status = U_ZERO_ERROR; 8573 return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status); 8574 } 8575 8576 /* Scan the strings. 
Find: */ 8577 /* The length of any leading portion that is equal */ 8578 /* Whether they are exactly equal. (in which case we just return) */ 8579 const UChar *pSrc = source; 8580 const UChar *pTarg = target; 8581 int32_t equalLength; 8582 8583 if (sourceLength == -1 && targetLength == -1) { 8584 // Both strings are null terminated. 8585 // Scan through any leading equal portion. 8586 while (*pSrc == *pTarg && *pSrc != 0) { 8587 pSrc++; 8588 pTarg++; 8589 } 8590 if (*pSrc == 0 && *pTarg == 0) { 8591 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8592 return UCOL_EQUAL; 8593 } 8594 equalLength = (int32_t)(pSrc - source); 8595 } 8596 else 8597 { 8598 // One or both strings has an explicit length. 8599 const UChar *pSrcEnd = source + sourceLength; 8600 const UChar *pTargEnd = target + targetLength; 8601 8602 // Scan while the strings are bitwise ==, or until one is exhausted. 8603 for (;;) { 8604 if (pSrc == pSrcEnd || pTarg == pTargEnd) { 8605 break; 8606 } 8607 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 8608 break; 8609 } 8610 if (*pSrc != *pTarg) { 8611 break; 8612 } 8613 pSrc++; 8614 pTarg++; 8615 } 8616 equalLength = (int32_t)(pSrc - source); 8617 8618 // If we made it all the way through both strings, we are done. They are == 8619 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */ 8620 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */ 8621 { 8622 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8623 return UCOL_EQUAL; 8624 } 8625 } 8626 if (equalLength > 0) { 8627 /* There is an identical portion at the beginning of the two strings. */ 8628 /* If the identical portion ends within a contraction or a comibining */ 8629 /* character sequence, back up to the start of that sequence. */ 8630 8631 // These values should already be set by the code above. 
8632 //pSrc = source + equalLength; /* point to the first differing chars */ 8633 //pTarg = target + equalLength; 8634 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) || 8635 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))) 8636 { 8637 // We are stopped in the middle of a contraction. 8638 // Scan backwards through the == part of the string looking for the start of the contraction. 8639 // It doesn't matter which string we scan, since they are the same in this region. 8640 do 8641 { 8642 equalLength--; 8643 pSrc--; 8644 } 8645 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); 8646 } 8647 8648 source += equalLength; 8649 target += equalLength; 8650 if (sourceLength > 0) { 8651 sourceLength -= equalLength; 8652 } 8653 if (targetLength > 0) { 8654 targetLength -= equalLength; 8655 } 8656 } 8657 8658 UErrorCode status = U_ZERO_ERROR; 8659 UCollationResult returnVal; 8660 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { 8661 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status); 8662 } else { 8663 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); 8664 } 8665 UTRACE_EXIT_VALUE(returnVal); 8666 return returnVal; 8667 } 8668 8669 U_CAPI UCollationResult U_EXPORT2 8670 ucol_strcollUTF8( 8671 const UCollator *coll, 8672 const char *source, 8673 int32_t sourceLength, 8674 const char *target, 8675 int32_t targetLength, 8676 UErrorCode *status) 8677 { 8678 U_ALIGN_CODE(16); 8679 8680 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8); 8681 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 8682 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 8683 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength); 8684 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength); 8685 } 8686 8687 if (U_FAILURE(*status)) { 8688 /* do nothing */ 8689 
UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 8690 return UCOL_EQUAL; 8691 } 8692 8693 if(source == NULL || target == NULL) { 8694 *status = U_ILLEGAL_ARGUMENT_ERROR; 8695 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 8696 return UCOL_EQUAL; 8697 } 8698 8699 /* Quick check if source and target are same strings. */ 8700 /* They should either both be NULL terminated or the explicit length should be set on both. */ 8701 if (source==target && sourceLength==targetLength) { 8702 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 8703 return UCOL_EQUAL; 8704 } 8705 8706 if(coll->delegate != NULL) { 8707 return ((const Collator*)coll->delegate)->compareUTF8( 8708 StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength), 8709 StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength), 8710 *status); 8711 } 8712 8713 /* Scan the strings. Find: */ 8714 /* The length of any leading portion that is equal */ 8715 /* Whether they are exactly equal. (in which case we just return) */ 8716 const char *pSrc = source; 8717 const char *pTarg = target; 8718 UBool bSrcLimit = FALSE; 8719 UBool bTargLimit = FALSE; 8720 8721 if (sourceLength == -1 && targetLength == -1) { 8722 // Both strings are null terminated. 8723 // Scan through any leading equal portion. 8724 while (*pSrc == *pTarg && *pSrc != 0) { 8725 pSrc++; 8726 pTarg++; 8727 } 8728 if (*pSrc == 0 && *pTarg == 0) { 8729 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 8730 return UCOL_EQUAL; 8731 } 8732 bSrcLimit = (*pSrc == 0); 8733 bTargLimit = (*pTarg == 0); 8734 } 8735 else 8736 { 8737 // One or both strings has an explicit length. 8738 const char *pSrcEnd = source + sourceLength; 8739 const char *pTargEnd = target + targetLength; 8740 8741 // Scan while the strings are bitwise ==, or until one is exhausted. 
8742 for (;;) { 8743 if (pSrc == pSrcEnd || pTarg == pTargEnd) { 8744 break; 8745 } 8746 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 8747 break; 8748 } 8749 if (*pSrc != *pTarg) { 8750 break; 8751 } 8752 pSrc++; 8753 pTarg++; 8754 } 8755 bSrcLimit = (pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)); 8756 bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)); 8757 8758 // If we made it all the way through both strings, we are done. They are == 8759 if (bSrcLimit && /* At end of src string, however it was specified. */ 8760 bTargLimit) /* and also at end of dest string */ 8761 { 8762 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 8763 return UCOL_EQUAL; 8764 } 8765 } 8766 8767 U_ASSERT(!(bSrcLimit && bTargLimit)); 8768 8769 int32_t equalLength = pSrc - source; 8770 UBool bSawNonLatin1 = FALSE; 8771 8772 if (equalLength > 0) { 8773 // Align position to the start of UTF-8 code point. 8774 if (bTargLimit) { 8775 U8_SET_CP_START((const uint8_t*)source, 0, equalLength); 8776 } else { 8777 U8_SET_CP_START((const uint8_t*)target, 0, equalLength); 8778 } 8779 pSrc = source + equalLength; 8780 pTarg = target + equalLength; 8781 } 8782 8783 if (equalLength > 0) { 8784 /* There is an identical portion at the beginning of the two strings. */ 8785 /* If the identical portion ends within a contraction or a comibining */ 8786 /* character sequence, back up to the start of that sequence. 
*/ 8787 UBool bUnsafeCP = FALSE; 8788 UChar32 uc32 = -1; 8789 8790 if (!bSrcLimit) { 8791 U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32); 8792 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { 8793 bUnsafeCP = TRUE; 8794 } 8795 bSawNonLatin1 |= (uc32 > 0xff); 8796 } 8797 if (!bTargLimit) { 8798 U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32); 8799 if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) { 8800 bUnsafeCP = TRUE; 8801 } 8802 bSawNonLatin1 |= (uc32 > 0xff); 8803 } 8804 8805 if (bUnsafeCP) { 8806 while (equalLength > 0) { 8807 // We are stopped in the middle of a contraction. 8808 // Scan backwards through the == part of the string looking for the start of the contraction. 8809 // It doesn't matter which string we scan, since they are the same in this region. 8810 U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32); 8811 bSawNonLatin1 |= (uc32 > 0xff); 8812 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) { 8813 break; 8814 } 8815 } 8816 } 8817 source += equalLength; 8818 target += equalLength; 8819 if (sourceLength > 0) { 8820 sourceLength -= equalLength; 8821 } 8822 if (targetLength > 0) { 8823 targetLength -= equalLength; 8824 } 8825 } else { 8826 // Lead byte of Latin 1 character is 0x00 - 0xC3 8827 bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3); 8828 bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3); 8829 } 8830 8831 UCollationResult returnVal; 8832 8833 if(!coll->latinOneUse || bSawNonLatin1) { 8834 returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status); 8835 } else { 8836 returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status); 8837 } 8838 UTRACE_EXIT_VALUE_STATUS(returnVal, *status); 8839 return returnVal; 8840 } 8841 8842 8843 /* convenience function for comparing strings */ 8844 U_CAPI UBool U_EXPORT2 8845 ucol_greater( const UCollator 
*coll, 8846 const UChar *source, 8847 int32_t sourceLength, 8848 const UChar *target, 8849 int32_t targetLength) 8850 { 8851 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8852 == UCOL_GREATER); 8853 } 8854 8855 /* convenience function for comparing strings */ 8856 U_CAPI UBool U_EXPORT2 8857 ucol_greaterOrEqual( const UCollator *coll, 8858 const UChar *source, 8859 int32_t sourceLength, 8860 const UChar *target, 8861 int32_t targetLength) 8862 { 8863 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8864 != UCOL_LESS); 8865 } 8866 8867 /* convenience function for comparing strings */ 8868 U_CAPI UBool U_EXPORT2 8869 ucol_equal( const UCollator *coll, 8870 const UChar *source, 8871 int32_t sourceLength, 8872 const UChar *target, 8873 int32_t targetLength) 8874 { 8875 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8876 == UCOL_EQUAL); 8877 } 8878 8879 U_CAPI void U_EXPORT2 8880 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { 8881 if(coll && coll->UCA) { 8882 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); 8883 } 8884 } 8885 8886 #endif /* #if !UCONFIG_NO_COLLATION */ 8887