1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucol.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * Modification history 12 * Date Name Comments 13 * 1996-1999 various members of ICU team maintained C API for collation framework 14 * 02/16/2001 synwee Added internal method getPrevSpecialCE 15 * 03/01/2001 synwee Added maxexpansion functionality. 16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_COLLATION 22 23 #include "unicode/coleitr.h" 24 #include "unicode/unorm.h" 25 #include "unicode/udata.h" 26 #include "unicode/ustring.h" 27 28 #include "ucol_imp.h" 29 #include "bocsu.h" 30 31 #include "normalizer2impl.h" 32 #include "unorm_it.h" 33 #include "umutex.h" 34 #include "cmemory.h" 35 #include "ucln_in.h" 36 #include "cstring.h" 37 #include "utracimp.h" 38 #include "putilimp.h" 39 #include "uassert.h" 40 41 #ifdef UCOL_DEBUG 42 #include <stdio.h> 43 #endif 44 45 U_NAMESPACE_USE 46 47 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 48 49 #define LAST_BYTE_MASK_ 0xFF 50 #define SECOND_LAST_BYTE_SHIFT_ 8 51 52 #define ZERO_CC_LIMIT_ 0xC0 53 54 // this is static pointer to the normalizer fcdTrieIndex 55 // it is always the same between calls to u_cleanup 56 // and therefore writing to it is not synchronized. 57 // It is cleaned in ucol_cleanup 58 static const uint16_t *fcdTrieIndex=NULL; 59 // Code points at fcdHighStart and above have a zero FCD value. 
60 static UChar32 fcdHighStart = 0; 61 62 // These are values from UCA required for 63 // implicit generation and supressing sort key compression 64 // they should regularly be in the UCA, but if one 65 // is running without UCA, it could be a problem 66 static const int32_t maxRegularPrimary = 0x7A; 67 static const int32_t minImplicitPrimary = 0xE0; 68 static const int32_t maxImplicitPrimary = 0xE4; 69 70 U_CDECL_BEGIN 71 static UBool U_CALLCONV 72 ucol_cleanup(void) 73 { 74 fcdTrieIndex = NULL; 75 return TRUE; 76 } 77 78 static int32_t U_CALLCONV 79 _getFoldingOffset(uint32_t data) { 80 return (int32_t)(data&0xFFFFFF); 81 } 82 83 U_CDECL_END 84 85 // init FCD data 86 static inline 87 UBool initializeFCD(UErrorCode *status) { 88 if (fcdTrieIndex != NULL) { 89 return TRUE; 90 } else { 91 // The result is constant, until the library is reloaded. 92 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 93 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 94 return U_SUCCESS(*status); 95 } 96 } 97 98 static 99 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, 100 int32_t sourceLen, collIterate *s, 101 UErrorCode *status) 102 { 103 (s)->string = (s)->pos = sourceString; 104 (s)->origFlags = 0; 105 (s)->flags = 0; 106 if (sourceLen >= 0) { 107 s->flags |= UCOL_ITER_HASLEN; 108 (s)->endp = (UChar *)sourceString+sourceLen; 109 } 110 else { 111 /* change to enable easier checking for end of string for fcdpositon */ 112 (s)->endp = NULL; 113 } 114 (s)->extendCEs = NULL; 115 (s)->extendCEsSize = 0; 116 (s)->CEpos = (s)->toReturn = (s)->CEs; 117 (s)->offsetBuffer = NULL; 118 (s)->offsetBufferSize = 0; 119 (s)->offsetReturn = (s)->offsetStore = NULL; 120 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; 121 (s)->coll = (collator); 122 (s)->nfd = Normalizer2Factory::getNFDInstance(*status); 123 (s)->fcdPosition = 0; 124 if(collator->normalizationMode == UCOL_ON) { 125 (s)->flags |= UCOL_ITER_NORM; 126 } 127 
if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { 128 (s)->flags |= UCOL_HIRAGANA_Q; 129 } 130 (s)->iterator = NULL; 131 //(s)->iteratorIndex = 0; 132 } 133 134 U_CAPI void U_EXPORT2 135 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, 136 int32_t sourceLen, collIterate *s, 137 UErrorCode *status) { 138 /* Out-of-line version for use from other files. */ 139 IInit_collIterate(collator, sourceString, sourceLen, s, status); 140 } 141 142 U_CAPI collIterate * U_EXPORT2 143 uprv_new_collIterate(UErrorCode *status) { 144 if(U_FAILURE(*status)) { 145 return NULL; 146 } 147 collIterate *s = new collIterate; 148 if(s == NULL) { 149 *status = U_MEMORY_ALLOCATION_ERROR; 150 return NULL; 151 } 152 return s; 153 } 154 155 U_CAPI void U_EXPORT2 156 uprv_delete_collIterate(collIterate *s) { 157 delete s; 158 } 159 160 U_CAPI UBool U_EXPORT2 161 uprv_collIterateAtEnd(collIterate *s) { 162 return s == NULL || s->pos == s->endp; 163 } 164 165 /** 166 * Backup the state of the collIterate struct data 167 * @param data collIterate to backup 168 * @param backup storage 169 */ 170 static 171 inline void backupState(const collIterate *data, collIterateState *backup) 172 { 173 backup->fcdPosition = data->fcdPosition; 174 backup->flags = data->flags; 175 backup->origFlags = data->origFlags; 176 backup->pos = data->pos; 177 backup->bufferaddress = data->writableBuffer.getBuffer(); 178 backup->buffersize = data->writableBuffer.length(); 179 backup->iteratorMove = 0; 180 backup->iteratorIndex = 0; 181 if(data->iterator != NULL) { 182 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); 183 backup->iteratorIndex = data->iterator->getState(data->iterator); 184 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE 185 if(backup->iteratorIndex == UITER_NO_STATE) { 186 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { 187 
backup->iteratorMove++; 188 data->iterator->move(data->iterator, -1, UITER_CURRENT); 189 } 190 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 191 } 192 } 193 } 194 195 /** 196 * Loads the state into the collIterate struct data 197 * @param data collIterate to backup 198 * @param backup storage 199 * @param forwards boolean to indicate if forwards iteration is used, 200 * false indicates backwards iteration 201 */ 202 static 203 inline void loadState(collIterate *data, const collIterateState *backup, 204 UBool forwards) 205 { 206 UErrorCode status = U_ZERO_ERROR; 207 data->flags = backup->flags; 208 data->origFlags = backup->origFlags; 209 if(data->iterator != NULL) { 210 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); 211 data->iterator->setState(data->iterator, backup->iteratorIndex, &status); 212 if(backup->iteratorMove != 0) { 213 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 214 } 215 } 216 data->pos = backup->pos; 217 218 if ((data->flags & UCOL_ITER_INNORMBUF) && 219 data->writableBuffer.getBuffer() != backup->bufferaddress) { 220 /* 221 this is when a new buffer has been reallocated and we'll have to 222 calculate the new position. 223 note the new buffer has to contain the contents of the old buffer. 224 */ 225 if (forwards) { 226 data->pos = data->writableBuffer.getTerminatedBuffer() + 227 (data->pos - backup->bufferaddress); 228 } 229 else { 230 /* backwards direction */ 231 int32_t temp = backup->buffersize - 232 (int32_t)(data->pos - backup->bufferaddress); 233 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp); 234 } 235 } 236 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 237 /* 238 this is alittle tricky. 239 if we are initially not in the normalization buffer, even if we 240 normalize in the later stage, the data in the buffer will be 241 ignored, since we skip back up to the data string. 
242 however if we are already in the normalization buffer, any 243 further normalization will pull data into the normalization 244 buffer and modify the fcdPosition. 245 since we are keeping the data in the buffer for use, the 246 fcdPosition can not be reverted back. 247 arrgghh.... 248 */ 249 data->fcdPosition = backup->fcdPosition; 250 } 251 } 252 253 static UBool 254 reallocCEs(collIterate *data, int32_t newCapacity) { 255 uint32_t *oldCEs = data->extendCEs; 256 if(oldCEs == NULL) { 257 oldCEs = data->CEs; 258 } 259 int32_t length = data->CEpos - oldCEs; 260 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); 261 if(newCEs == NULL) { 262 return FALSE; 263 } 264 uprv_memcpy(newCEs, oldCEs, length * 4); 265 uprv_free(data->extendCEs); 266 data->extendCEs = newCEs; 267 data->extendCEsSize = newCapacity; 268 data->CEpos = newCEs + length; 269 return TRUE; 270 } 271 272 static UBool 273 increaseCEsCapacity(collIterate *data) { 274 int32_t oldCapacity; 275 if(data->extendCEs != NULL) { 276 oldCapacity = data->extendCEsSize; 277 } else { 278 oldCapacity = LENGTHOF(data->CEs); 279 } 280 return reallocCEs(data, 2 * oldCapacity); 281 } 282 283 static UBool 284 ensureCEsCapacity(collIterate *data, int32_t minCapacity) { 285 int32_t oldCapacity; 286 if(data->extendCEs != NULL) { 287 oldCapacity = data->extendCEsSize; 288 } else { 289 oldCapacity = LENGTHOF(data->CEs); 290 } 291 if(minCapacity <= oldCapacity) { 292 return TRUE; 293 } 294 oldCapacity *= 2; 295 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity); 296 } 297 298 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) { 299 if(U_FAILURE(errorCode)) { 300 return; 301 } 302 int32_t length = offsetStore == NULL ? 
0 : (int32_t)(offsetStore - offsetBuffer); 303 if(length >= offsetBufferSize) { 304 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE; 305 int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4)); 306 if(newBuffer == NULL) { 307 errorCode = U_MEMORY_ALLOCATION_ERROR; 308 return; 309 } 310 if(length > 0) { 311 uprv_memcpy(newBuffer, offsetBuffer, length * 4); 312 } 313 uprv_free(offsetBuffer); 314 offsetBuffer = newBuffer; 315 offsetStore = offsetBuffer + length; 316 offsetBufferSize = newCapacity; 317 } 318 *offsetStore++ = offset; 319 } 320 321 /* 322 * collIter_eos() 323 * Checks for a collIterate being positioned at the end of 324 * its source string. 325 * 326 */ 327 static 328 inline UBool collIter_eos(collIterate *s) { 329 if(s->flags & UCOL_USE_ITERATOR) { 330 return !(s->iterator->hasNext(s->iterator)); 331 } 332 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { 333 // Null terminated string, but not at null, so not at end. 334 // Whether in main or normalization buffer doesn't matter. 335 return FALSE; 336 } 337 338 // String with length. Can't be in normalization buffer, which is always 339 // null termintated. 340 if (s->flags & UCOL_ITER_HASLEN) { 341 return (s->pos == s->endp); 342 } 343 344 // We are at a null termination, could be either normalization buffer or main string. 345 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { 346 // At null at end of main string. 347 return TRUE; 348 } 349 350 // At null at end of normalization buffer. Need to check whether there there are 351 // any characters left in the main buffer. 352 if(s->origFlags & UCOL_USE_ITERATOR) { 353 return !(s->iterator->hasNext(s->iterator)); 354 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { 355 // Null terminated main string. fcdPosition is the 'return' position into main buf. 356 return (*s->fcdPosition == 0); 357 } 358 else { 359 // Main string with an end pointer. 
360 return s->fcdPosition == s->endp; 361 } 362 } 363 364 /* 365 * collIter_bos() 366 * Checks for a collIterate being positioned at the start of 367 * its source string. 368 * 369 */ 370 static 371 inline UBool collIter_bos(collIterate *source) { 372 // if we're going backwards, we need to know whether there is more in the 373 // iterator, even if we are in the side buffer 374 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 375 return !source->iterator->hasPrevious(source->iterator); 376 } 377 if (source->pos <= source->string || 378 ((source->flags & UCOL_ITER_INNORMBUF) && 379 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { 380 return TRUE; 381 } 382 return FALSE; 383 } 384 385 /*static 386 inline UBool collIter_SimpleBos(collIterate *source) { 387 // if we're going backwards, we need to know whether there is more in the 388 // iterator, even if we are in the side buffer 389 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 390 return !source->iterator->hasPrevious(source->iterator); 391 } 392 if (source->pos == source->string) { 393 return TRUE; 394 } 395 return FALSE; 396 }*/ 397 //return (data->pos == data->string) || 398 399 400 /****************************************************************************/ 401 /* Following are the open/close functions */ 402 /* */ 403 /****************************************************************************/ 404 405 static UCollator* 406 ucol_initFromBinary(const uint8_t *bin, int32_t length, 407 const UCollator *base, 408 UCollator *fillIn, 409 UErrorCode *status) 410 { 411 UCollator *result = fillIn; 412 if(U_FAILURE(*status)) { 413 return NULL; 414 } 415 /* 416 if(base == NULL) { 417 // we don't support null base yet 418 *status = U_ILLEGAL_ARGUMENT_ERROR; 419 return NULL; 420 } 421 */ 422 // We need these and we could be running without UCA 423 uprv_uca_initImplicitConstants(status); 424 UCATableHeader *colData = (UCATableHeader *)bin; 425 // 
do we want version check here? We're trying to figure out whether collators are compatible 426 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || 427 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || 428 colData->version[0] != UCOL_BUILDER_VERSION) 429 { 430 *status = U_COLLATOR_VERSION_MISMATCH; 431 return NULL; 432 } 433 else { 434 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { 435 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); 436 if(U_FAILURE(*status)){ 437 return NULL; 438 } 439 result->hasRealData = TRUE; 440 } 441 else { 442 if(base) { 443 result = ucol_initCollator(base->image, result, base, status); 444 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); 445 if(U_FAILURE(*status)){ 446 return NULL; 447 } 448 result->hasRealData = FALSE; 449 } 450 else { 451 *status = U_USELESS_COLLATOR_ERROR; 452 return NULL; 453 } 454 } 455 result->freeImageOnClose = FALSE; 456 } 457 result->actualLocale = NULL; 458 result->validLocale = NULL; 459 result->requestedLocale = NULL; 460 result->rules = NULL; 461 result->rulesLength = 0; 462 result->freeRulesOnClose = FALSE; 463 result->ucaRules = NULL; 464 return result; 465 } 466 467 U_CAPI UCollator* U_EXPORT2 468 ucol_openBinary(const uint8_t *bin, int32_t length, 469 const UCollator *base, 470 UErrorCode *status) 471 { 472 return ucol_initFromBinary(bin, length, base, NULL, status); 473 } 474 475 U_CAPI int32_t U_EXPORT2 476 ucol_cloneBinary(const UCollator *coll, 477 uint8_t *buffer, int32_t capacity, 478 UErrorCode *status) 479 { 480 int32_t length = 0; 481 if(U_FAILURE(*status)) { 482 return length; 483 } 484 if(capacity < 0) { 485 *status = U_ILLEGAL_ARGUMENT_ERROR; 486 return length; 487 } 488 if(coll->hasRealData == TRUE) { 489 length = coll->image->size; 490 if(length <= capacity) { 491 
uprv_memcpy(buffer, coll->image, length); 492 } else { 493 *status = U_BUFFER_OVERFLOW_ERROR; 494 } 495 } else { 496 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 497 if(length <= capacity) { 498 /* build the UCATableHeader with minimal entries */ 499 /* do not copy the header from the UCA file because its values are wrong! */ 500 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 501 502 /* reset everything */ 503 uprv_memset(buffer, 0, length); 504 505 /* set the tailoring-specific values */ 506 UCATableHeader *myData = (UCATableHeader *)buffer; 507 myData->size = length; 508 509 /* offset for the options, the only part of the data that is present after the header */ 510 myData->options = sizeof(UCATableHeader); 511 512 /* need to always set the expansion value for an upper bound of the options */ 513 myData->expansion = myData->options + sizeof(UColOptionSet); 514 515 myData->magic = UCOL_HEADER_MAGIC; 516 myData->isBigEndian = U_IS_BIG_ENDIAN; 517 myData->charSetFamily = U_CHARSET_FAMILY; 518 519 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 520 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 521 522 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 523 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 524 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); 525 myData->jamoSpecial = coll->image->jamoSpecial; 526 527 /* copy the collator options */ 528 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 529 } else { 530 *status = U_BUFFER_OVERFLOW_ERROR; 531 } 532 } 533 return length; 534 } 535 536 U_CAPI UCollator* U_EXPORT2 537 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) 538 { 539 UCollator * localCollator; 540 int32_t bufferSizeNeeded = 
(int32_t)sizeof(UCollator); 541 char *stackBufferChars = (char *)stackBuffer; 542 int32_t imageSize = 0; 543 int32_t rulesSize = 0; 544 int32_t rulesPadding = 0; 545 uint8_t *image; 546 UChar *rules; 547 UBool colAllocated = FALSE; 548 UBool imageAllocated = FALSE; 549 550 if (status == NULL || U_FAILURE(*status)){ 551 return 0; 552 } 553 if ((stackBuffer && !pBufferSize) || !coll){ 554 *status = U_ILLEGAL_ARGUMENT_ERROR; 555 return 0; 556 } 557 if (coll->rules && coll->freeRulesOnClose) { 558 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); 559 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); 560 bufferSizeNeeded += rulesSize + rulesPadding; 561 } 562 563 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ 564 *pBufferSize = bufferSizeNeeded; 565 return 0; 566 } 567 568 /* Pointers on 64-bit platforms need to be aligned 569 * on a 64-bit boundry in memory. 570 */ 571 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 572 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); 573 if (*pBufferSize > offsetUp) { 574 *pBufferSize -= offsetUp; 575 stackBufferChars += offsetUp; 576 } 577 else { 578 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */ 579 *pBufferSize = 1; 580 } 581 } 582 stackBuffer = (void *)stackBufferChars; 583 584 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) { 585 /* allocate one here...*/ 586 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); 587 // Null pointer check. 
588 if (stackBufferChars == NULL) { 589 *status = U_MEMORY_ALLOCATION_ERROR; 590 return NULL; 591 } 592 colAllocated = TRUE; 593 if (U_SUCCESS(*status)) { 594 *status = U_SAFECLONE_ALLOCATED_WARNING; 595 } 596 } 597 localCollator = (UCollator *)stackBufferChars; 598 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); 599 { 600 UErrorCode tempStatus = U_ZERO_ERROR; 601 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); 602 } 603 if (coll->freeImageOnClose) { 604 image = (uint8_t *)uprv_malloc(imageSize); 605 // Null pointer check 606 if (image == NULL) { 607 *status = U_MEMORY_ALLOCATION_ERROR; 608 return NULL; 609 } 610 ucol_cloneBinary(coll, image, imageSize, status); 611 imageAllocated = TRUE; 612 } 613 else { 614 image = (uint8_t *)coll->image; 615 } 616 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); 617 if (U_FAILURE(*status)) { 618 return NULL; 619 } 620 621 if (coll->rules) { 622 if (coll->freeRulesOnClose) { 623 localCollator->rules = u_strcpy(rules, coll->rules); 624 //bufferEnd += rulesSize; 625 } 626 else { 627 localCollator->rules = coll->rules; 628 } 629 localCollator->freeRulesOnClose = FALSE; 630 localCollator->rulesLength = coll->rulesLength; 631 } 632 633 int32_t i; 634 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 635 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); 636 } 637 // zero copies of pointers 638 localCollator->actualLocale = NULL; 639 localCollator->validLocale = NULL; 640 localCollator->requestedLocale = NULL; 641 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. 
642 localCollator->freeOnClose = colAllocated; 643 localCollator->freeImageOnClose = imageAllocated; 644 return localCollator; 645 } 646 647 U_CAPI void U_EXPORT2 648 ucol_close(UCollator *coll) 649 { 650 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 651 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 652 if(coll != NULL) { 653 // these are always owned by each UCollator struct, 654 // so we always free them 655 if(coll->validLocale != NULL) { 656 uprv_free(coll->validLocale); 657 } 658 if(coll->actualLocale != NULL) { 659 uprv_free(coll->actualLocale); 660 } 661 if(coll->requestedLocale != NULL) { 662 uprv_free(coll->requestedLocale); 663 } 664 if(coll->latinOneCEs != NULL) { 665 uprv_free(coll->latinOneCEs); 666 } 667 if(coll->options != NULL && coll->freeOptionsOnClose) { 668 uprv_free(coll->options); 669 } 670 if(coll->rules != NULL && coll->freeRulesOnClose) { 671 uprv_free((UChar *)coll->rules); 672 } 673 if(coll->image != NULL && coll->freeImageOnClose) { 674 uprv_free((UCATableHeader *)coll->image); 675 } 676 if(coll->leadBytePermutationTable != NULL) { 677 uprv_free(coll->leadBytePermutationTable); 678 } 679 if(coll->reorderCodes != NULL) { 680 uprv_free(coll->reorderCodes); 681 } 682 683 /* Here, it would be advisable to close: */ 684 /* - UData for UCA (unless we stuff it in the root resb */ 685 /* Again, do we need additional housekeeping... HMMM! */ 686 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); 687 if(coll->freeOnClose){ 688 /* for safeClone, if freeOnClose is FALSE, 689 don't free the other instance data */ 690 uprv_free(coll); 691 } 692 } 693 UTRACE_EXIT(); 694 } 695 696 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/ 697 /* you should be able to get the binary chunk to write out... 
Doesn't look very full now */ 698 U_CFUNC uint8_t* U_EXPORT2 699 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) 700 { 701 uint8_t *result = NULL; 702 if(U_FAILURE(*status)) { 703 return NULL; 704 } 705 if(coll->hasRealData == TRUE) { 706 *length = coll->image->size; 707 result = (uint8_t *)uprv_malloc(*length); 708 /* test for NULL */ 709 if (result == NULL) { 710 *status = U_MEMORY_ALLOCATION_ERROR; 711 return NULL; 712 } 713 uprv_memcpy(result, coll->image, *length); 714 } else { 715 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 716 result = (uint8_t *)uprv_malloc(*length); 717 /* test for NULL */ 718 if (result == NULL) { 719 *status = U_MEMORY_ALLOCATION_ERROR; 720 return NULL; 721 } 722 723 /* build the UCATableHeader with minimal entries */ 724 /* do not copy the header from the UCA file because its values are wrong! */ 725 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 726 727 /* reset everything */ 728 uprv_memset(result, 0, *length); 729 730 /* set the tailoring-specific values */ 731 UCATableHeader *myData = (UCATableHeader *)result; 732 myData->size = *length; 733 734 /* offset for the options, the only part of the data that is present after the header */ 735 myData->options = sizeof(UCATableHeader); 736 737 /* need to always set the expansion value for an upper bound of the options */ 738 myData->expansion = myData->options + sizeof(UColOptionSet); 739 740 myData->magic = UCOL_HEADER_MAGIC; 741 myData->isBigEndian = U_IS_BIG_ENDIAN; 742 myData->charSetFamily = U_CHARSET_FAMILY; 743 744 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 745 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 746 747 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 748 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 749 uprv_memcpy(myData->formatVersion, 
coll->image->formatVersion, sizeof(UVersionInfo)); 750 myData->jamoSpecial = coll->image->jamoSpecial; 751 752 /* copy the collator options */ 753 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 754 } 755 return result; 756 } 757 758 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { 759 if(U_FAILURE(*status)) { 760 return; 761 } 762 result->caseFirst = (UColAttributeValue)opts->caseFirst; 763 result->caseLevel = (UColAttributeValue)opts->caseLevel; 764 result->frenchCollation = (UColAttributeValue)opts->frenchCollation; 765 result->normalizationMode = (UColAttributeValue)opts->normalizationMode; 766 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) { 767 return; 768 } 769 result->strength = (UColAttributeValue)opts->strength; 770 result->variableTopValue = opts->variableTopValue; 771 result->alternateHandling = (UColAttributeValue)opts->alternateHandling; 772 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; 773 result->numericCollation = (UColAttributeValue)opts->numericCollation; 774 result->caseFirstisDefault = TRUE; 775 result->caseLevelisDefault = TRUE; 776 result->frenchCollationisDefault = TRUE; 777 result->normalizationModeisDefault = TRUE; 778 result->strengthisDefault = TRUE; 779 result->variableTopValueisDefault = TRUE; 780 result->alternateHandlingisDefault = TRUE; 781 result->hiraganaQisDefault = TRUE; 782 result->numericCollationisDefault = TRUE; 783 784 ucol_updateInternalState(result, status); 785 786 result->options = opts; 787 } 788 789 790 /** 791 * Approximate determination if a character is at a contraction end. 792 * Guaranteed to be TRUE if a character is at the end of a contraction, 793 * otherwise it is not deterministic. 
794 * @param c character to be determined 795 * @param coll collator 796 */ 797 static 798 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { 799 if (c < coll->minContrEndCP) { 800 return FALSE; 801 } 802 803 int32_t hash = c; 804 uint8_t htbyte; 805 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 806 if (U16_IS_TRAIL(c)) { 807 return TRUE; 808 } 809 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 810 } 811 htbyte = coll->contrEndCP[hash>>3]; 812 return (((htbyte >> (hash & 7)) & 1) == 1); 813 } 814 815 816 817 /* 818 * i_getCombiningClass() 819 * A fast, at least partly inline version of u_getCombiningClass() 820 * This is a candidate for further optimization. Used heavily 821 * in contraction processing. 822 */ 823 static 824 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { 825 uint8_t sCC = 0; 826 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { 827 sCC = u_getCombiningClass(c); 828 } 829 return sCC; 830 } 831 832 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { 833 UChar c; 834 UCollator *result = fillIn; 835 if(U_FAILURE(*status) || image == NULL) { 836 return NULL; 837 } 838 839 if(result == NULL) { 840 result = (UCollator *)uprv_malloc(sizeof(UCollator)); 841 if(result == NULL) { 842 *status = U_MEMORY_ALLOCATION_ERROR; 843 return result; 844 } 845 result->freeOnClose = TRUE; 846 } else { 847 result->freeOnClose = FALSE; 848 } 849 850 result->image = image; 851 result->mapping.getFoldingOffset = _getFoldingOffset; 852 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; 853 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); 854 if(U_FAILURE(*status)) { 855 if(result->freeOnClose == TRUE) { 856 uprv_free(result); 857 result = NULL; 858 } 859 return result; 860 } 861 862 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); 863 
result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); 864 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); 865 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); 866 result->rules = NULL; 867 result->rulesLength = 0; 868 result->freeRulesOnClose = FALSE; 869 result->reorderCodes = NULL; 870 result->reorderCodesLength = 0; 871 result->leadBytePermutationTable = NULL; 872 873 /* get the version info from UCATableHeader and populate the Collator struct*/ 874 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ 875 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ 876 result->dataVersion[2] = 0; 877 result->dataVersion[3] = 0; 878 879 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; 880 result->minUnsafeCP = 0; 881 for (c=0; c<0x300; c++) { // Find the smallest unsafe char. 882 if (ucol_unsafeCP(c, result)) break; 883 } 884 result->minUnsafeCP = c; 885 886 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; 887 result->minContrEndCP = 0; 888 for (c=0; c<0x300; c++) { // Find the Contraction-ending char. 889 if (ucol_contractionEndCP(c, result)) break; 890 } 891 result->minContrEndCP = c; 892 893 /* max expansion tables */ 894 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + 895 result->image->endExpansionCE); 896 result->lastEndExpansionCE = result->endExpansionCE + 897 result->image->endExpansionCECount - 1; 898 result->expansionCESize = (uint8_t*)result->image + 899 result->image->expansionCESize; 900 901 902 //result->errorCode = *status; 903 904 result->latinOneCEs = NULL; 905 906 result->latinOneRegenTable = FALSE; 907 result->latinOneFailed = FALSE; 908 result->UCA = UCA; 909 910 /* Normally these will be set correctly later. This is the default if you use UCA or the default. 
*/ 911 result->ucaRules = NULL; 912 result->actualLocale = NULL; 913 result->validLocale = NULL; 914 result->requestedLocale = NULL; 915 result->hasRealData = FALSE; // real data lives in .dat file... 916 result->freeImageOnClose = FALSE; 917 918 /* set attributes */ 919 ucol_setOptionsFromHeader( 920 result, 921 (UColOptionSet*)((uint8_t*)result->image+result->image->options), 922 status); 923 result->freeOptionsOnClose = FALSE; 924 925 return result; 926 } 927 928 /* new Mark's code */ 929 930 /** 931 * For generation of Implicit CEs 932 * @author Davis 933 * 934 * Cleaned up so that changes can be made more easily. 935 * Old values: 936 # First Implicit: E26A792D 937 # Last Implicit: E3DC70C0 938 # First CJK: E0030300 939 # Last CJK: E0A9DD00 940 # First CJK_A: E0A9DF00 941 # Last CJK_A: E0DE3100 942 */ 943 /* Following is a port of Mark's code for new treatment of implicits. 944 * It is positioned here, since ucol_initUCA need to initialize the 945 * variables below according to the data in the fractional UCA. 946 */ 947 948 /** 949 * Function used to: 950 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and 951 * b) bump any non-CJK characters by 10FFFF. 952 * The relevant blocks are: 953 * A: 4E00..9FFF; CJK Unified Ideographs 954 * F900..FAFF; CJK Compatibility Ideographs 955 * B: 3400..4DBF; CJK Unified Ideographs Extension A 956 * 20000..XX; CJK Unified Ideographs Extension B (and others later on) 957 * As long as 958 * no new B characters are allocated between 4E00 and FAFF, and 959 * no new A characters are outside of this range, 960 * (very high probability) this simple code will work. 961 * The reordered blocks are: 962 * Block1 is CJK 963 * Block2 is CJK_COMPAT_USED 964 * Block3 is CJK_A 965 * (all contiguous) 966 * Any other CJK gets its normal code point 967 * Any non-CJK gets +10FFFF 968 * When we reorder Block1, we make sure that it is at the very start, 969 * so that it will use a 3-byte form. 
970 * Warning: the we only pick up the compatibility characters that are 971 * NOT decomposed, so that block is smaller! 972 */ 973 974 // CONSTANTS 975 static const UChar32 976 NON_CJK_OFFSET = 0x110000, 977 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 978 979 /** 980 * Precomputed by initImplicitConstants() 981 */ 982 static int32_t 983 final3Multiplier = 0, 984 final4Multiplier = 0, 985 final3Count = 0, 986 final4Count = 0, 987 medialCount = 0, 988 min3Primary = 0, 989 min4Primary = 0, 990 max4Primary = 0, 991 minTrail = 0, 992 maxTrail = 0, 993 max3Trail = 0, 994 max4Trail = 0, 995 min4Boundary = 0; 996 997 static const UChar32 998 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 999 // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; 1000 CJK_BASE = 0x4E00, 1001 CJK_LIMIT = 0x9FCB+1, 1002 // Unified CJK ideographs in the compatibility ideographs block. 1003 CJK_COMPAT_USED_BASE = 0xFA0E, 1004 CJK_COMPAT_USED_LIMIT = 0xFA2F+1, 1005 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 1006 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 1007 CJK_A_BASE = 0x3400, 1008 CJK_A_LIMIT = 0x4DB5+1, 1009 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; 1010 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; 1011 CJK_B_BASE = 0x20000, 1012 CJK_B_LIMIT = 0x2A6D6+1, 1013 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; 1014 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; 1015 CJK_C_BASE = 0x2A700, 1016 CJK_C_LIMIT = 0x2B734+1, 1017 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; 1018 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; 1019 CJK_D_BASE = 0x2B740, 1020 CJK_D_LIMIT = 0x2B81D+1; 1021 // when adding to this list, look for all occurrences (in project) 1022 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!! 
1023 1024 static UChar32 swapCJK(UChar32 i) { 1025 if (i < CJK_A_BASE) { 1026 // non-CJK 1027 } else if (i < CJK_A_LIMIT) { 1028 // Extension A has lower code points than the original Unihan+compat 1029 // but sorts higher. 1030 return i - CJK_A_BASE 1031 + (CJK_LIMIT - CJK_BASE) 1032 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1033 } else if (i < CJK_BASE) { 1034 // non-CJK 1035 } else if (i < CJK_LIMIT) { 1036 return i - CJK_BASE; 1037 } else if (i < CJK_COMPAT_USED_BASE) { 1038 // non-CJK 1039 } else if (i < CJK_COMPAT_USED_LIMIT) { 1040 return i - CJK_COMPAT_USED_BASE 1041 + (CJK_LIMIT - CJK_BASE); 1042 } else if (i < CJK_B_BASE) { 1043 // non-CJK 1044 } else if (i < CJK_B_LIMIT) { 1045 return i; // non-BMP-CJK 1046 } else if (i < CJK_C_BASE) { 1047 // non-CJK 1048 } else if (i < CJK_C_LIMIT) { 1049 return i; // non-BMP-CJK 1050 } else if (i < CJK_D_BASE) { 1051 // non-CJK 1052 } else if (i < CJK_D_LIMIT) { 1053 return i; // non-BMP-CJK 1054 } 1055 return i + NON_CJK_OFFSET; // non-CJK 1056 } 1057 1058 U_CAPI UChar32 U_EXPORT2 1059 uprv_uca_getRawFromCodePoint(UChar32 i) { 1060 return swapCJK(i)+1; 1061 } 1062 1063 U_CAPI UChar32 U_EXPORT2 1064 uprv_uca_getCodePointFromRaw(UChar32 i) { 1065 i--; 1066 UChar32 result = 0; 1067 if(i >= NON_CJK_OFFSET) { 1068 result = i - NON_CJK_OFFSET; 1069 } else if(i >= CJK_B_BASE) { 1070 result = i; 1071 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted 1072 if(i < CJK_LIMIT - CJK_BASE) { 1073 result = i + CJK_BASE; 1074 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { 1075 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); 1076 } else { 1077 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1078 } 1079 } else { 1080 result = -1; 1081 } 1082 return result; 1083 } 1084 1085 // GET IMPLICIT PRIMARY WEIGHTS 1086 // Return value is left justified 
primary key 1087 U_CAPI uint32_t U_EXPORT2 1088 uprv_uca_getImplicitFromRaw(UChar32 cp) { 1089 /* 1090 if (cp < 0 || cp > UCOL_MAX_INPUT) { 1091 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); 1092 } 1093 */ 1094 int32_t last0 = cp - min4Boundary; 1095 if (last0 < 0) { 1096 int32_t last1 = cp / final3Count; 1097 last0 = cp % final3Count; 1098 1099 int32_t last2 = last1 / medialCount; 1100 last1 %= medialCount; 1101 1102 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start 1103 last1 = minTrail + last1; // offset 1104 last2 = min3Primary + last2; // offset 1105 /* 1106 if (last2 >= min4Primary) { 1107 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); 1108 } 1109 */ 1110 return (last2 << 24) + (last1 << 16) + (last0 << 8); 1111 } else { 1112 int32_t last1 = last0 / final4Count; 1113 last0 %= final4Count; 1114 1115 int32_t last2 = last1 / medialCount; 1116 last1 %= medialCount; 1117 1118 int32_t last3 = last2 / medialCount; 1119 last2 %= medialCount; 1120 1121 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start 1122 last1 = minTrail + last1; // offset 1123 last2 = minTrail + last2; // offset 1124 last3 = min4Primary + last3; // offset 1125 /* 1126 if (last3 > max4Primary) { 1127 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); 1128 } 1129 */ 1130 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; 1131 } 1132 } 1133 1134 static uint32_t U_EXPORT2 1135 uprv_uca_getImplicitPrimary(UChar32 cp) { 1136 //fprintf(stdout, "Incoming: %04x\n", cp); 1137 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); 1138 1139 cp = swapCJK(cp); 1140 cp++; 1141 // we now have a range of numbers from 0 to 21FFFF. 
1142 1143 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); 1144 //fprintf(stdout, "CJK swapped: %04x\n", cp); 1145 1146 return uprv_uca_getImplicitFromRaw(cp); 1147 } 1148 1149 /** 1150 * Converts implicit CE into raw integer ("code point") 1151 * @param implicit 1152 * @return -1 if illegal format 1153 */ 1154 U_CAPI UChar32 U_EXPORT2 1155 uprv_uca_getRawFromImplicit(uint32_t implicit) { 1156 UChar32 result; 1157 UChar32 b3 = implicit & 0xFF; 1158 UChar32 b2 = (implicit >> 8) & 0xFF; 1159 UChar32 b1 = (implicit >> 16) & 0xFF; 1160 UChar32 b0 = (implicit >> 24) & 0xFF; 1161 1162 // simple parameter checks 1163 if (b0 < min3Primary || b0 > max4Primary 1164 || b1 < minTrail || b1 > maxTrail) 1165 return -1; 1166 // normal offsets 1167 b1 -= minTrail; 1168 1169 // take care of the final values, and compose 1170 if (b0 < min4Primary) { 1171 if (b2 < minTrail || b2 > max3Trail || b3 != 0) 1172 return -1; 1173 b2 -= minTrail; 1174 UChar32 remainder = b2 % final3Multiplier; 1175 if (remainder != 0) 1176 return -1; 1177 b0 -= min3Primary; 1178 b2 /= final3Multiplier; 1179 result = ((b0 * medialCount) + b1) * final3Count + b2; 1180 } else { 1181 if (b2 < minTrail || b2 > maxTrail 1182 || b3 < minTrail || b3 > max4Trail) 1183 return -1; 1184 b2 -= minTrail; 1185 b3 -= minTrail; 1186 UChar32 remainder = b3 % final4Multiplier; 1187 if (remainder != 0) 1188 return -1; 1189 b3 /= final4Multiplier; 1190 b0 -= min4Primary; 1191 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; 1192 } 1193 // final check 1194 if (result < 0 || result > UCOL_MAX_INPUT) 1195 return -1; 1196 return result; 1197 } 1198 1199 1200 static inline int32_t divideAndRoundUp(int a, int b) { 1201 return 1 + (a-1)/b; 1202 } 1203 1204 /* this function is either called from initUCA or from genUCA before 1205 * doing canonical closure for the UCA. 1206 */ 1207 1208 /** 1209 * Set up to generate implicits. 
1210 * Maintenance Note: this function may end up being called more than once, due 1211 * to threading races during initialization. Make sure that 1212 * none of the Constants is ever transiently assigned an 1213 * incorrect value. 1214 * @param minPrimary 1215 * @param maxPrimary 1216 * @param minTrail final byte 1217 * @param maxTrail final byte 1218 * @param gap3 the gap we leave for tailoring for 3-byte forms 1219 * @param gap4 the gap we leave for tailoring for 4-byte forms 1220 */ 1221 static void initImplicitConstants(int minPrimary, int maxPrimary, 1222 int minTrailIn, int maxTrailIn, 1223 int gap3, int primaries3count, 1224 UErrorCode *status) { 1225 // some simple parameter checks 1226 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) 1227 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) 1228 || (primaries3count < 1)) 1229 { 1230 *status = U_ILLEGAL_ARGUMENT_ERROR; 1231 return; 1232 }; 1233 1234 minTrail = minTrailIn; 1235 maxTrail = maxTrailIn; 1236 1237 min3Primary = minPrimary; 1238 max4Primary = maxPrimary; 1239 // compute constants for use later. 1240 // number of values we can use in trailing bytes 1241 // leave room for empty values between AND above, e.g. if gap = 2 1242 // range 3..7 => +3 -4 -5 -6 -7: so 1 value 1243 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values 1244 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values 1245 final3Multiplier = gap3 + 1; 1246 final3Count = (maxTrail - minTrail + 1) / final3Multiplier; 1247 max3Trail = minTrail + (final3Count - 1) * final3Multiplier; 1248 1249 // medials can use full range 1250 medialCount = (maxTrail - minTrail + 1); 1251 // find out how many values fit in each form 1252 int32_t threeByteCount = medialCount * final3Count; 1253 // now determine where the 3/4 boundary is. 
1254 // we use 3 bytes below the boundary, and 4 above 1255 int32_t primariesAvailable = maxPrimary - minPrimary + 1; 1256 int32_t primaries4count = primariesAvailable - primaries3count; 1257 1258 1259 int32_t min3ByteCoverage = primaries3count * threeByteCount; 1260 min4Primary = minPrimary + primaries3count; 1261 min4Boundary = min3ByteCoverage; 1262 // Now expand out the multiplier for the 4 bytes, and redo. 1263 1264 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; 1265 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); 1266 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); 1267 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; 1268 if (gap4 < 1) { 1269 *status = U_ILLEGAL_ARGUMENT_ERROR; 1270 return; 1271 } 1272 final4Multiplier = gap4 + 1; 1273 final4Count = neededPerFinalByte; 1274 max4Trail = minTrail + (final4Count - 1) * final4Multiplier; 1275 } 1276 1277 /** 1278 * Supply parameters for generating implicit CEs 1279 */ 1280 U_CAPI void U_EXPORT2 1281 uprv_uca_initImplicitConstants(UErrorCode *status) { 1282 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. 1283 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); 1284 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); 1285 } 1286 1287 1288 /* collIterNormalize Incremental Normalization happens here. */ 1289 /* pick up the range of chars identifed by FCD, */ 1290 /* normalize it into the collIterate's writable buffer, */ 1291 /* switch the collIterate's state to use the writable buffer. 
*/ 1292 /* */ 1293 static 1294 void collIterNormalize(collIterate *collationSource) 1295 { 1296 UErrorCode status = U_ZERO_ERROR; 1297 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ 1298 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ 1299 1300 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)), 1301 collationSource->writableBuffer, 1302 status); 1303 if (U_FAILURE(status)) { 1304 #ifdef UCOL_DEBUG 1305 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status)); 1306 #endif 1307 return; 1308 } 1309 1310 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer(); 1311 collationSource->origFlags = collationSource->flags; 1312 collationSource->flags |= UCOL_ITER_INNORMBUF; 1313 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1314 } 1315 1316 1317 // This function takes the iterator and extracts normalized stuff up to the next boundary 1318 // It is similar in the end results to the collIterNormalize, but for the cases when we 1319 // use an iterator 1320 /*static 1321 inline void normalizeIterator(collIterate *collationSource) { 1322 UErrorCode status = U_ZERO_ERROR; 1323 UBool wasNormalized = FALSE; 1324 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); 1325 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); 1326 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1327 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1328 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { 1329 // reallocate and terminate 1330 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, 1331 &collationSource->writableBuffer, 1332 (int32_t *)&collationSource->writableBufSize, normLen + 1, 1333 0) 
1334 ) { 1335 #ifdef UCOL_DEBUG 1336 fprintf(stderr, "normalizeIterator(), out of memory\n"); 1337 #endif 1338 return; 1339 } 1340 status = U_ZERO_ERROR; 1341 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); 1342 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); 1343 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1344 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1345 } 1346 // Terminate the buffer - we already checked that it is big enough 1347 collationSource->writableBuffer[normLen] = 0; 1348 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { 1349 collationSource->flags |= UCOL_ITER_ALLOCATED; 1350 } 1351 collationSource->pos = collationSource->writableBuffer; 1352 collationSource->origFlags = collationSource->flags; 1353 collationSource->flags |= UCOL_ITER_INNORMBUF; 1354 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1355 }*/ 1356 1357 1358 /* Incremental FCD check and normalize */ 1359 /* Called from getNextCE when normalization state is suspect. */ 1360 /* When entering, the state is known to be this: */ 1361 /* o We are working in the main buffer of the collIterate, not the side */ 1362 /* writable buffer. When in the side buffer, normalization mode is always off, */ 1363 /* so we won't get here. */ 1364 /* o The leading combining class from the current character is 0 or */ 1365 /* the trailing combining class of the previous char was zero. */ 1366 /* True because the previous call to this function will have always exited */ 1367 /* that way, and we get called for every char where cc might be non-zero. 
*/
/* Reads FCD data starting at the character at pos-1 and stores the position   */
/* where the scan stopped in fcdPosition. Returns TRUE if some character in    */
/* the scanned run has a leading combining class lower than the trailing       */
/* combining class of the character before it, i.e. the run needs normalizing. */
static
inline UBool collIterFCD(collIterate *collationSource) {
    const UChar *srcP, *endP;
    uint8_t     leadingCC;
    uint8_t     prevTrailingCC = 0;
    uint16_t    fcd;
    UBool       needNormalize = FALSE;

    srcP = collationSource->pos-1;

    // endP stays NULL for a string without an explicit length.
    if (collationSource->flags & UCOL_ITER_HASLEN) {
        endP = collationSource->endp;
    } else {
        endP = NULL;
    }

    // Get the trailing combining class of the current character. If it's zero,
    //   we are OK.
    /* trie access */
    fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
    if (fcd != 0) {
        // The low byte of the FCD value is the trailing combining class.
        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

        if (prevTrailingCC != 0) {
            // The current char has a non-zero trailing CC. Scan forward until we find
            //   a char with a leading cc of zero.
            while (endP == NULL || srcP != endP)
            {
                const UChar *savedSrcP = srcP;

                /* trie access */
                fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
                // The high byte of the FCD value is the leading combining class.
                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
                if (leadingCC == 0) {
                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
                                           //   back up over it.  (Could be surrogate pair!)
                    break;
                }

                if (leadingCC < prevTrailingCC) {
                    // Combining classes out of order: not FCD.
                    needNormalize = TRUE;
                }

                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
            }
        }
    }

    collationSource->fcdPosition = (UChar *)srcP;

    return needNormalize;
}

/****************************************************************************/
/* Following are the CE retrieval functions                                 */
/*                                                                          */
/****************************************************************************/

static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);

/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
/*                                                                     */
/* Returns the next CE, or UCOL_NO_MORE_CES once the source text is    */
/* exhausted. CEs still buffered from a previous expansion are         */
/* returned before any new character is read.                          */
static
inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    uint32_t order = 0;
    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
        order = *(collationSource->toReturn++);                         /* if so, return them */
        if(collationSource->CEpos == collationSource->toReturn) {
            /* Buffer drained: reset both pointers to the start of the CE buffer in use. */
            collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
        }
        return order;
    }

    UChar ch = 0;
    collationSource->offsetReturn = NULL;

    /* The outer loop repeats while a code unit in the Hangul range yields UCOL_IGNORABLE. */
    do {
        for (;;)                           /* Loop handles case when incremental normalize switches   */
        {                                  /*   to or from the side buffer / original string, and we  */
                                           /*   need to start again to get the next character.        */

            if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
            {
                // The source string is null terminated and we're not working from the side buffer,
                //   and we're not normalizing.  This is the fast path.
                //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
                ch = *collationSource->pos++;
                if (ch != 0) {
                    break;
                }
                else {
                    return UCOL_NO_MORE_CES;
                }
            }

            if (collationSource->flags & UCOL_ITER_HASLEN) {
                // Normal path for strings when length is specified.
                //   (We can't be in side buffer because it is always null terminated.)
                if (collationSource->pos >= collationSource->endp) {
                    // Ran off of the end of the main source string.  We're done.
                    return UCOL_NO_MORE_CES;
                }
                ch = *collationSource->pos++;
            }
            else if(collationSource->flags & UCOL_USE_ITERATOR) {
                UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
                if(iterCh == U_SENTINEL) {
                    return UCOL_NO_MORE_CES;
                }
                ch = (UChar)iterCh;
            }
            else
            {
                // Null terminated string.
                ch = *collationSource->pos++;
                if (ch == 0) {
                    // Ran off end of buffer.
                    if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                        // Ran off end of main string. backing up one character.
                        collationSource->pos--;
                        return UCOL_NO_MORE_CES;
                    }
                    else
                    {
                        // Hit null in the normalize side buffer.
                        // Usually this means the end of the normalized data,
                        // except for one odd case: a null followed by combining chars,
                        //   which is the case if we are at the start of the buffer.
                        if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
                            break;
                        }

                        //  Null marked end of side buffer.
                        //   Revert to the main string and
                        //   loop back to top to try again to get a character.
                        collationSource->pos   = collationSource->fcdPosition;
                        collationSource->flags = collationSource->origFlags;
                        continue;
                    }
                }
            }

            if(collationSource->flags&UCOL_HIRAGANA_Q) {
                /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
                 * based on whether the previous codepoint was Hiragana or Katakana.
                 */
                if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
                        ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
                    collationSource->flags |= UCOL_WAS_HIRAGANA;
                } else {
                    collationSource->flags &= ~UCOL_WAS_HIRAGANA;
                }
            }

            // We've got a character.  See if there's any fcd and/or normalization stuff to do.
            //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
            if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
                break;
            }

            if (collationSource->fcdPosition >= collationSource->pos) {
                // An earlier FCD check has already covered the current character.
                // We can go ahead and process this char.
                break;
            }

            if (ch < ZERO_CC_LIMIT_ ) {
                // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                // We need to peek at the next character in order to tell if we are FCD
                if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
                    // We are at the last char of source string.
                    //  It is always OK for FCD check.
                    break;
                }

                // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
                if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }


            // Need a more complete FCD check and possible normalization.
            if (collIterFCD(collationSource)) {
                collIterNormalize(collationSource);
            }
            if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                //  No normalization was needed.  Go ahead and process the char we already had.
                break;
            }

            // Some normalization happened.  Next loop iteration will pick up a char
            //   from the normalization buffer.

        }   // end for (;;)


        if (ch <= 0xFF) {
            /*  For latin-1 characters we never need to fall back to the UCA table        */
            /*    because all of the UCA data is replicated in the latinOneMapping array  */
            order = coll->latinOneMapping[ch];
            if (order > UCOL_NOT_FOUND) {
                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
            }
        }
        else
        {
            // Always use UCA for Han, Hangul
            // (Han extension A is before main Han block)
            // **** Han compatibility chars ?? ****
            if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
                if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
                    // between the two target ranges; do normal lookup
                    // **** this range is YI, Modifier tone letters, ****
                    // **** Latin-D, Syloti Nagari, Phags-pa.         ****
                    // **** Latin-D might be tailored, so we need to  ****
                    // **** do the normal lookup for these guys.      ****
                    order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                } else {
                    // in one of the target ranges; use UCA
                    order = UCOL_NOT_FOUND;
                }
            } else {
                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            }

            if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
            }

            if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
                /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
                order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

                if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
                    order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
                }
            }
        }
    } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

    if(order == UCOL_NOT_FOUND) {
        /* Neither lookup produced a CE: fall back to an implicit one. */
        order = getImplicit(ch, collationSource);
    }
    return order; /* return the CE */
}

/* ucol_getNextCE, out-of-line version for use from other files.   */
U_CAPI uint32_t U_EXPORT2
ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    return ucol_IGetNextCE(coll, collationSource, status);
}


/**
* Incremental previous normalization happens here. Pick up the range of chars
* identified by FCD, normalize it into the collIterate's writable buffer,
* switch the collIterate's state to use the writable buffer.
* The range normalized starts at the beginning of the string when fcdPosition
* is NULL, otherwise just after fcdPosition, and it includes the character at
* pos. If normalization fails the function returns without changing
* the iteration position or flags.
* @param data collation iterator data
*/
static
void collPrevIterNormalize(collIterate *data)
{
    UErrorCode status  = U_ZERO_ERROR;
    const UChar *pEnd   = data->pos;  /* End normalize + 1 */
    const UChar *pStart;

    /* Start normalize */
    if (data->fcdPosition == NULL) {
        pStart = data->string;
    }
    else {
        pStart = data->fcdPosition + 1;
    }

    /* The + 1 makes the range include the character at pEnd itself. */
    int32_t normLen =
        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
                             data->writableBuffer,
                             status).
        length();
    if(U_FAILURE(status)) {
        return;
    }
    /*
    this puts the null termination in front of the normalized string instead
    of the end
    */
    data->writableBuffer.insert(0, (UChar)0);

    /*
     * The usual case at this point is that we've got a base
     * character followed by marks that were normalized. If
     * fcdPosition is NULL, that means that we backed up to
     * the beginning of the string and there's no base character.
     *
     * Forward processing will usually normalize when it sees
     * the first mark, so that mark will get its natural offset
     * and the rest will get the offset of the character following
     * the marks. The base character will also get its natural offset.
     *
     * We write the offset of the base character, if there is one,
     * followed by the offset of the first mark and then the offsets
     * of the rest of the marks.
     */
    int32_t firstMarkOffset = 0;
    int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
    int32_t trailCount      = normLen - 1;

    if (data->fcdPosition != NULL) {
        int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
        UChar   baseChar   = *data->fcdPosition;

        firstMarkOffset = baseOffset + 1;

        /*
         * If the base character is the start of a contraction, forward processing
         * will normalize the marks while checking for the contraction, which means
         * that the offset of the first mark will be the same as the other marks.
         *
         * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
         */
        if (baseChar >= 0x100) {
            uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);

            if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
                baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
            }

            if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
                firstMarkOffset = trailOffset;
            }
        }

        data->appendOffset(baseOffset, status);
    }

    data->appendOffset(firstMarkOffset, status);

    for (int32_t i = 0; i < trailCount; i += 1) {
        data->appendOffset(trailOffset, status);
    }

    /* NOTE(review): status from the appendOffset() calls is never checked here. */
    data->offsetRepeatValue = trailOffset;

    /* Offsets are handed back starting from the one stored last. */
    data->offsetReturn = data->offsetStore - 1;
    if (data->offsetReturn == data->offsetBuffer) {
        data->offsetStore = data->offsetBuffer;
    }

    /* Position just past the normalized text; index 0 holds the inserted NUL. */
    data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
    data->origFlags  = data->flags;
    data->flags     |= UCOL_ITER_INNORMBUF;
    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
}


/**
 * Incremental FCD check for previous iteration and normalize. Called from
 * getPrevCE when normalization state is suspect.
1735 * When entering, the state is known to be this: 1736 * o We are working in the main buffer of the collIterate, not the side 1737 * writable buffer. When in the side buffer, normalization mode is always 1738 * off, so we won't get here. 1739 * o The leading combining class from the current character is 0 or the 1740 * trailing combining class of the previous char was zero. 1741 * True because the previous call to this function will have always exited 1742 * that way, and we get called for every char where cc might be non-zero. 1743 * @param data collation iterate struct 1744 * @return normalization status, TRUE for normalization to be done, FALSE 1745 * otherwise 1746 */ 1747 static 1748 inline UBool collPrevIterFCD(collIterate *data) 1749 { 1750 const UChar *src, *start; 1751 uint8_t leadingCC; 1752 uint8_t trailingCC = 0; 1753 uint16_t fcd; 1754 UBool result = FALSE; 1755 1756 start = data->string; 1757 src = data->pos + 1; 1758 1759 /* Get the trailing combining class of the current character. */ 1760 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); 1761 1762 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1763 1764 if (leadingCC != 0) { 1765 /* 1766 The current char has a non-zero leading combining class. 1767 Scan backward until we find a char with a trailing cc of zero. 1768 */ 1769 for (;;) 1770 { 1771 if (start == src) { 1772 data->fcdPosition = NULL; 1773 return result; 1774 } 1775 1776 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); 1777 1778 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1779 1780 if (trailingCC == 0) { 1781 break; 1782 } 1783 1784 if (leadingCC < trailingCC) { 1785 result = TRUE; 1786 } 1787 1788 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1789 } 1790 } 1791 1792 data->fcdPosition = (UChar *)src; 1793 1794 return result; 1795 } 1796 1797 /** gets a code unit from the string at a given offset 1798 * Handles both normal and iterative cases. 1799 * No error checking - caller beware! 
1800 */ 1801 static inline 1802 UChar peekCodeUnit(collIterate *source, int32_t offset) { 1803 if(source->pos != NULL) { 1804 return *(source->pos + offset); 1805 } else if(source->iterator != NULL) { 1806 UChar32 c; 1807 if(offset != 0) { 1808 source->iterator->move(source->iterator, offset, UITER_CURRENT); 1809 c = source->iterator->next(source->iterator); 1810 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); 1811 } else { 1812 c = source->iterator->current(source->iterator); 1813 } 1814 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0. 1815 } else { 1816 return 0xfffd; 1817 } 1818 } 1819 1820 // Code point version. Treats the offset as a _code point_ delta. 1821 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16. 1822 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer. 1823 static inline 1824 UChar32 peekCodePoint(collIterate *source, int32_t offset) { 1825 UChar32 c; 1826 if(source->pos != NULL) { 1827 const UChar *p = source->pos; 1828 if(offset >= 0) { 1829 // Skip forward over (offset-1) code points. 1830 while(--offset >= 0) { 1831 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) { 1832 ++p; 1833 } 1834 } 1835 // Read the code point there. 1836 c = *p++; 1837 UChar trail; 1838 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) { 1839 c = U16_GET_SUPPLEMENTARY(c, trail); 1840 } 1841 } else /* offset<0 */ { 1842 // Skip backward over (offset-1) code points. 1843 while(++offset < 0) { 1844 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) { 1845 --p; 1846 } 1847 } 1848 // Read the code point before that. 1849 c = *--p; 1850 UChar lead; 1851 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) { 1852 c = U16_GET_SUPPLEMENTARY(lead, c); 1853 } 1854 } 1855 } else if(source->iterator != NULL) { 1856 if(offset >= 0) { 1857 // Skip forward over (offset-1) code points. 
1858 int32_t fwd = offset; 1859 while(fwd-- > 0) { 1860 uiter_next32(source->iterator); 1861 } 1862 // Read the code point there. 1863 c = uiter_current32(source->iterator); 1864 // Return to the starting point, skipping backward over (offset-1) code points. 1865 while(offset-- > 0) { 1866 uiter_previous32(source->iterator); 1867 } 1868 } else /* offset<0 */ { 1869 // Read backward, reading offset code points, remember only the last-read one. 1870 int32_t back = offset; 1871 do { 1872 c = uiter_previous32(source->iterator); 1873 } while(++back < 0); 1874 // Return to the starting position, skipping forward over offset code points. 1875 do { 1876 uiter_next32(source->iterator); 1877 } while(++offset < 0); 1878 } 1879 } else { 1880 c = U_SENTINEL; 1881 } 1882 return c; 1883 } 1884 1885 /** 1886 * Determines if we are at the start of the data string in the backwards 1887 * collation iterator 1888 * @param data collation iterator 1889 * @return TRUE if we are at the start 1890 */ 1891 static 1892 inline UBool isAtStartPrevIterate(collIterate *data) { 1893 if(data->pos == NULL && data->iterator != NULL) { 1894 return !data->iterator->hasPrevious(data->iterator); 1895 } 1896 //return (collIter_bos(data)) || 1897 return (data->pos == data->string) || 1898 ((data->flags & UCOL_ITER_INNORMBUF) && 1899 *(data->pos - 1) == 0 && data->fcdPosition == NULL); 1900 } 1901 1902 static 1903 inline void goBackOne(collIterate *data) { 1904 # if 0 1905 // somehow, it looks like we need to keep iterator synced up 1906 // at all times, as above. 1907 if(data->pos) { 1908 data->pos--; 1909 } 1910 if(data->iterator) { 1911 data->iterator->previous(data->iterator); 1912 } 1913 #endif 1914 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { 1915 data->iterator->previous(data->iterator); 1916 } 1917 if(data->pos) { 1918 data->pos --; 1919 } 1920 } 1921 1922 /** 1923 * Inline function that gets a simple CE. 1924 * So what it does is that it will first check the expansion buffer. 
If the
* expansion buffer is not empty, ie the end pointer to the expansion buffer
* is different from the string pointer, we return the collation element at the
* return pointer and decrement it.
* For more complicated CEs it resorts to getComplicatedCE.
* @param coll collator data
* @param data collation iterator struct
* @param status error status
* @return the previous collation element, or UCOL_NO_MORE_CES once the start
*         of the source has been reached
*/
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /* Step the offset bookkeeping back by one entry: either consume one
       pending repeat, or move/reset the offsetReturn pointer. */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
            data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                data->offsetReturn = NULL;
                data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* CEs are still buffered (in extendCEs when it exists, otherwise in CEs):
       hand back the one before toReturn. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        /* Buffer drained: reset the write position to its start. */
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        /* The outer loop repeats only while a Hangul-range code unit maps to
           UCOL_IGNORABLE (see the while condition at the bottom). */
        do {
            /*
            Loop handles case when incremental normalize switches to or from the
            side buffer / original string, and we need to start again to get the
            next character.
            */
            for (;;) {
                if (data->flags & UCOL_ITER_HASLEN) {
                    /*
                    Normal path for strings when length is specified.
                    Not in side buffer because it is always null terminated.
                    */
                    if (data->pos <= data->string) {
                        /* End of the main source string */
                        return UCOL_NO_MORE_CES;
                    }
                    data->pos --;
                    ch = *data->pos;
                }
                // we are using an iterator to go back. Pray for us!
                else if (data->flags & UCOL_USE_ITERATOR) {
                    UChar32 iterCh = data->iterator->previous(data->iterator);
                    if(iterCh == U_SENTINEL) {
                        return UCOL_NO_MORE_CES;
                    } else {
                        ch = (UChar)iterCh;
                    }
                }
                else {
                    data->pos --;
                    ch = *data->pos;
                    /* we are in the side buffer. */
                    if (ch == 0) {
                        /*
                        At the start of the normalize side buffer.
                        Go back to string.
                        Because pointer points to the last accessed character,
                        hence we have to increment it by one here.
                        */
                        data->flags = data->origFlags;
                        data->offsetRepeatValue = 0;

                        if (data->fcdPosition == NULL) {
                            /* No string position to resume from: we are done. */
                            data->pos = data->string;
                            return UCOL_NO_MORE_CES;
                        }
                        else {
                            data->pos   = data->fcdPosition + 1;
                        }

                        /* Re-read a character, now from the original string. */
                        continue;
                    }
                }

                /* Track whether the code unit just read lies in U+3040..U+309F. */
                if(data->flags&UCOL_HIRAGANA_Q) {
                    if(ch>=0x3040 && ch<=0x309f) {
                        data->flags |= UCOL_WAS_HIRAGANA;
                    } else {
                        data->flags &= ~UCOL_WAS_HIRAGANA;
                    }
                }

                /*
                * got a character to determine if there's fcd and/or normalization
                * stuff to do.
                * if the current character is not fcd.
                * if current character is at the start of the string
                * Trailing combining class == 0.
                * Note if pos is in the writablebuffer, norm is always 0
                */
                if (ch < ZERO_CC_LIMIT_ ||
                  // this should propel us out of the loop in the iterator case
                    (data->flags & UCOL_ITER_NORM) == 0 ||
                    (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                    || data->string == data->pos) {
                    break;
                }

                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    /* if next character is FCD */
                    /* NOTE(review): the condition above already breaks when
                       data->string == data->pos, so this test looks unreachable. */
                    if (data->pos == data->string) {
                        /* First char of string is always OK for FCD check */
                        break;
                    }

                    /* Not first char of string, do the FCD fast test */
                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                        break;
                    }
                }

                /* Need a more complete FCD check and possible normalization. */
                if (collPrevIterFCD(data)) {
                    collPrevIterNormalize(data);
                }

                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                    /*  No normalization. Go ahead and process the char. */
                    break;
                }

                /*
                Some normalization happened.
                Next loop picks up a char from the normalization buffer.
                */
            }

            /* attempt to handle contractions, after removal of the backwards
            contraction
            */
            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
            } else {
                if (ch <= 0xFF) {
                    /* Code units up to 0xFF are looked up in latinOneMapping. */
                    result = coll->latinOneMapping[ch];
                }
                else {
                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
                    // **** [FA0E..FA2F] ?? ****
                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                        (ch >= 0x3400 && ch <= 0xD7AF)) {
                        if (ch > 0x9FFF && ch < 0xAC00) {
                            // between the two target ranges; do normal lookup
                            // **** this range is YI, Modifier tone letters, ****
                            // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                            // **** Latin-D might be tailored, so we need to ****
                            // **** do the normal lookup for these guys.     ****
                            result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                        } else {
                            result = UCOL_NOT_FOUND;
                        }
                    } else {
                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    }
                }
                /* Values above UCOL_NOT_FOUND are handed to the special-CE handler. */
                if (result > UCOL_NOT_FOUND) {
                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
                }
                if (result == UCOL_NOT_FOUND) { // Not found in master list
                    if (!isAtStartPrevIterate(data) &&
                        ucol_contractionEndCP(ch, data->coll))
                    {
                        result = UCOL_CONTRACTION;
                    } else {
                        /* Fall back to the UCA table when this collator has one. */
                        if(coll->UCA) {
                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                        }
                    }

                    if (result > UCOL_NOT_FOUND) {
                        if(coll->UCA) {
                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                        }
                    }
                }
            }
        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

        /* Still nothing from any table: synthesize an implicit CE. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}


/* ucol_getPrevCE, out-of-line version for use from other files. */
/* Simply forwards to the inline ucol_IGetPrevCE(). */
U_CFUNC uint32_t  U_EXPORT2
ucol_getPrevCE(const UCollator *coll, collIterate *data,
               UErrorCode *status) {
    return ucol_IGetPrevCE(coll, data, status);
}


/* this should be connected to special Jamo handling */
/* Sets up a temporary iterator over the single code unit u and returns the
   first CE produced for it; returns 0 if *status is a failure after the
   iterator has been initialized. */
U_CFUNC uint32_t  U_EXPORT2
ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
    collIterate colIt;
    IInit_collIterate(coll, &u, 1, &colIt, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    return ucol_IGetNextCE(coll, &colIt, status);
}

/**
* Inserts the argument character into the end of the buffer pushing back the
* null terminator.
2153 * @param data collIterate struct data 2154 * @param ch character to be appended 2155 * @return the position of the new addition 2156 */ 2157 static 2158 inline const UChar * insertBufferEnd(collIterate *data, UChar ch) 2159 { 2160 int32_t oldLength = data->writableBuffer.length(); 2161 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; 2162 } 2163 2164 /** 2165 * Inserts the argument string into the end of the buffer pushing back the 2166 * null terminator. 2167 * @param data collIterate struct data 2168 * @param string to be appended 2169 * @param length of the string to be appended 2170 * @return the position of the new addition 2171 */ 2172 static 2173 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length) 2174 { 2175 int32_t oldLength = data->writableBuffer.length(); 2176 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength; 2177 } 2178 2179 /** 2180 * Special normalization function for contraction in the forwards iterator. 2181 * This normalization sequence will place the current character at source->pos 2182 * and its following normalized sequence into the buffer. 2183 * The fcd position, pos will be changed. 2184 * pos will now point to positions in the buffer. 2185 * Flags will be changed accordingly. 
* @param data collation iterator data
*/
static
inline void normalizeNextContraction(collIterate *data)
{
    int32_t     strsize;
    UErrorCode  status     = U_ZERO_ERROR;
    /* because the pointer points to the next character */
    const UChar *pStart    = data->pos - 1;
    const UChar *pEnd;

    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
        /* Not in the side buffer yet: seed it with the code unit before
           pStart, so the normalized text starts at index 1. */
        data->writableBuffer.setTo(*(pStart - 1));
        strsize               = 1;
    }
    else {
        /* Already in the side buffer: the normalized text goes after its
           current content. */
        strsize = data->writableBuffer.length();
    }

    pEnd = data->fcdPosition;

    /* Append the NFD form of [pStart, pEnd) to the side buffer. */
    data->writableBuffer.append(
        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
    if(U_FAILURE(status)) {
        /* NOTE(review): the failure is local; pos and flags are left
           untouched and the caller is not told. */
        return;
    }

    /* Continue reading at the first newly appended unit, now in side-buffer mode. */
    data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
    data->origFlags  = data->flags;
    data->flags     |= UCOL_ITER_INNORMBUF;
    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
}

/**
* Contraction character management function that returns the next character
* for the forwards iterator.
* Does nothing if the next character is in buffer and not the first character
* in it.
* Else it checks next character in data string to see if it is normalizable.
* If it is not, the character is simply copied into the buffer, else
* the whole normalized substring is copied into the buffer, including the
* current character.
* @param data collation element iterator data
* @return next character
*/
static
inline UChar getNextNormalizedChar(collIterate *data)
{
    UChar  nextch;
    UChar  ch;
    // Here we need to add the iterator code. One problem is the way
    // end of string is handled. If we just return next char, it could
    // be the sentinel. Most of the cases already check for this, but we
    // need to be sure.
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
        /* if no normalization and not in buffer. */
        if(data->flags & UCOL_USE_ITERATOR) {
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos ++);
        }
    }

    //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
    //normalizeIterator(data);
    //}

    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((innormbuf && *data->pos != 0) ||
        (data->fcdPosition != NULL && !innormbuf &&
        data->pos < data->fcdPosition)) {
        /*
        if next character is in normalized buffer, no further normalization
        is required
        */
        return *(data->pos ++);
    }

    if (data->flags & UCOL_ITER_HASLEN) {
        /* in data string */
        /* Last code unit of a length-delimited string: no following unit to inspect. */
        if (data->pos + 1 == data->endp) {
            return *(data->pos ++);
        }
    }
    else {
        if (innormbuf) {
            // inside the normalization buffer, but at the end
            // (since we encountered zero). This means, in the
            // case we're using char iterator, that we need to
            // do another round of normalization.
            //if(data->origFlags & UCOL_USE_ITERATOR) {
            // we need to restore original flags,
            // otherwise, we'll lose them
            //data->flags = data->origFlags;
            //normalizeIterator(data);
            //return *(data->pos++);
            //} else {
            /*
            in writable buffer, at this point fcdPosition can not be
            pointing to the end of the data string. see contracting tag.
            */
            if(data->fcdPosition) {
                if (*(data->fcdPosition + 1) == 0 ||
                    data->fcdPosition + 1 == data->endp) {
                    /* at the end of the string, dump it into the normalizer */
                    data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
                    // Check if data->pos received a null pointer
                    // NOTE(review): pos was just computed as a returned pointer
                    // plus one, so this comparison looks ineffective - confirm.
                    if (data->pos == NULL) {
                        return (UChar)-1; // Return to indicate error.
                    }
                    return *(data->fcdPosition ++);
                }
                /* Otherwise resume reading the original string at fcdPosition. */
                data->pos = data->fcdPosition;
            } else if(data->origFlags & UCOL_USE_ITERATOR) {
                // if we are here, we're using a normalizing iterator.
                // we should just continue further.
                data->flags = data->origFlags;
                data->pos = NULL;
                return (UChar)data->iterator->next(data->iterator);
            }
            //}
        }
        else {
            /* Last code unit of a NUL-terminated string. */
            if (*(data->pos + 1) == 0) {
                return *(data->pos ++);
            }
        }
    }

    ch = *data->pos ++;
    nextch = *data->pos;

    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
         ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        if (collIterFCD(data)) {
            normalizeNextContraction(data);
            return *(data->pos ++);
        }
        else if (innormbuf) {
            /* fcdposition shifted even when there's no normalization, if we
            don't input the rest into this, we'll get the wrong position when
            we reach the end of the writableBuffer */
            int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
            data->pos = insertBufferEnd(data, data->pos - 1, length);
            // Check if data->pos received a null pointer
            if (data->pos == NULL) {
                return (UChar)-1; // Return to indicate error.
            }
            return *(data->pos ++);
        }
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        data->pos = insertBufferEnd(data, ch) + 1;
        // Check if data->pos received a null pointer
        // NOTE(review): same pattern as above (pointer plus one compared to NULL).
        if (data->pos == NULL) {
            return (UChar)-1; // Return to indicate error.
        }
    }

    /* points back to the pos in string */
    return ch;
}



/**
* Function to copy the buffer into writableBuffer and sets the fcd position to
* the correct position
* @param source data string source
* @param buffer character buffer
*/
static
inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
{
    /* okay confusing part here. to ensure that the skipped characters are
    considered later, we need to place it in the appropriate position in the
    normalization buffer and reassign the pos pointer. simple case if pos
    reside in string, simply copy to normalization buffer and
    fcdposition = pos, pos = start of normalization buffer. if pos in
    normalization buffer, we'll insert the copy infront of pos and point pos
    to the start of the normalization buffer. why am i doing these copies?
    well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
    not require any changes, which be really painful. */
    if (source->flags & UCOL_ITER_INNORMBUF) {
        /* Replace everything in the side buffer that lies before pos with
           the skipped characters. */
        int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
        source->writableBuffer.replace(0, replaceLength, buffer);
    }
    else {
        /* Switch from the string to the side buffer, remembering the string
           position and the flags in effect. */
        source->fcdPosition  = source->pos;
        source->origFlags    = source->flags;
        source->flags       |= UCOL_ITER_INNORMBUF;
        source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
        source->writableBuffer = buffer;
    }

    /* Either way, reading resumes at the start of the side buffer. */
    source->pos = source->writableBuffer.getTerminatedBuffer();
}

/**
* Function to get the discontiguos collation element within the source.
* Note this function will set the position to the appropriate places.
* @param coll current collator used
* @param source data string source
* @param constart index to the start character in the contraction table
* @return discontiguos collation element offset
*/
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
          const UChar *temppos      = source->pos;
          UnicodeString buffer;
    const UChar   *tempconstart = constart;
          uint8_t  tempflags    = source->flags;
          UBool    multicontraction = FALSE;
          collIterateState discState;

          /* Snapshot the iterator so the "no match" exit at the bottom can
             restore it. */
          backupState(source, &discState);

    /* The buffer of skipped characters starts with the code point just
       before the current position. */
    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        /* Stop scanning at the end of the text or at a code point whose
           combining class is 0. */
        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
            ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
            source->fcdPosition == NULL ||
            source->fcdPosition == source->endp ||
            *(source->fcdPosition) == 0 ||
            u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
            /* end of string in null terminated string or stopped by a
            null character, note fcd does not always point to a base
            character after the discontiguos change */
            u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
            //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* A shorter contraction had already matched: rewind to just
                   after it and return its CE. */
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* Advance through the table entries while they are smaller than schar. */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguos buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* Same combining class as the code point two positions back:
               treat schar as skipped as well. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                                      (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
            break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            /* Complete match: queue the skipped characters and return the CE. */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}

/* now uses Mark's getImplicitPrimary code */
/* Builds the implicit CE pair for cp: the second CE is appended at CEpos,
   offsetRepeatCount is bumped by one, and the first CE is returned. */
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
    uint32_t r = uprv_uca_getImplicitPrimary(cp);
    /* Low 16 bits of r become the high half of the stored CE, OR-ed with 0xC0. */
    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
    collationSource->offsetRepeatCount += 1;
    return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
}

/**
* Inserts the argument character into the front of the buffer replacing the
* front null terminator.
* @param data collation element iterator data
* @param ch   character to be appended
*/
static
inline void insertBufferFront(collIterate *data, UChar ch)
{
    /* Overwrite index 0 with ch, put a new NUL in front of it, and leave pos
       at index 2, i.e. just after the inserted character. */
    data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
}

/**
* Special normalization function for contraction in the previous iterator.
* This normalization sequence will place the current character at source->pos
* and its following normalized sequence into the buffer.
* The fcd position, pos will be changed.
* pos will now point to positions in the buffer.
* Flags will be changed accordingly.
2534 * @param data collation iterator data 2535 */ 2536 static 2537 inline void normalizePrevContraction(collIterate *data, UErrorCode *status) 2538 { 2539 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ 2540 const UChar *pStart; 2541 2542 UnicodeString endOfBuffer; 2543 if (data->flags & UCOL_ITER_HASLEN) { 2544 /* 2545 normalization buffer not used yet, we'll pull down the next 2546 character into the end of the buffer 2547 */ 2548 endOfBuffer.setTo(*pEnd); 2549 } 2550 else { 2551 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL 2552 } 2553 2554 if (data->fcdPosition == NULL) { 2555 pStart = data->string; 2556 } 2557 else { 2558 pStart = data->fcdPosition + 1; 2559 } 2560 int32_t normLen = 2561 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), 2562 data->writableBuffer, 2563 *status). 2564 length(); 2565 if(U_FAILURE(*status)) { 2566 return; 2567 } 2568 /* 2569 this puts the null termination infront of the normalized string instead 2570 of the end 2571 */ 2572 data->pos = 2573 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() + 2574 1 + normLen; 2575 data->origFlags = data->flags; 2576 data->flags |= UCOL_ITER_INNORMBUF; 2577 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2578 } 2579 2580 /** 2581 * Contraction character management function that returns the previous character 2582 * for the backwards iterator. 2583 * Does nothing if the previous character is in buffer and not the first 2584 * character in it. 2585 * Else it checks previous character in data string to see if it is 2586 * normalizable. 2587 * If it is not, the character is simply copied into the buffer, else 2588 * the whole normalized substring is copied into the buffer, including the 2589 * current character. 
* @param data collation element iterator data
* @param status error status, handed on to normalizePrevContraction()
* @return previous character
*/
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
    UChar  prevch;
    UChar  ch;
    const UChar *start;
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
        (innormbuf && *(data->pos - 1) != 0)) {
        /*
        if no normalization.
        if previous character is in normalized buffer, no further normalization
        is required
        */
        if(data->flags & UCOL_USE_ITERATOR) {
            /* Step back one unit and read it again, which leaves the
               iterator where it was. */
            data->iterator->move(data->iterator, -1, UITER_CURRENT);
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos - 1);
        }
    }

    start = data->pos;
    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
        /* in data string */
        if ((start - 1) == data->string) {
            /* The previous unit is the first of the string: nothing before it to check. */
            return *(start - 1);
        }
        start --;
        ch     = *start;
        prevch = *(start - 1);
    }
    else {
        /*
        in writable buffer, at this point fcdPosition can not be NULL.
        see contracting tag.
        */
        if (data->fcdPosition == data->string) {
            /* at the start of the string, just dump it into the normalizer */
            insertBufferFront(data, *(data->fcdPosition));
            data->fcdPosition = NULL;
            return *(data->pos - 1);
        }
        start  = data->fcdPosition;
        ch     = *start;
        prevch = *(start - 1);
    }
    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if (data->fcdPosition > start &&
       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
    {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        const UChar *backuppos = data->pos;
        /* collPrevIterFCD() works from data->pos, so point it at start for the check. */
        data->pos = start;
        if (collPrevIterFCD(data)) {
            normalizePrevContraction(data, status);
            return *(data->pos - 1);
        }
        /* No normalization needed: restore pos and advance fcdPosition by one. */
        data->pos = backuppos;
        data->fcdPosition ++;
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        insertBufferFront(data, ch);
        data->fcdPosition --;
    }

    return ch;
}

/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
/* It is called by getNextCE */

/* The following should be even */
#define UCOL_MAX_DIGITS_FOR_NUMBER 254

uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
    collIterateState entryState;
    backupState(source, &entryState);
    UChar32 cp = ch;

    for (;;) {
        // This loop will repeat only in the case of contractions, and only when a contraction
        //   is found and the first CE resulting from that contraction is itself a special
        //   (an expansion, for example.)  All other special CE types are fully handled the
        //   first time through, and the loop exits.

        const uint32_t *CEOffset = NULL;
        switch(getCETag(CE)) {
        case NOT_FOUND_TAG:
            /* This one is not found, and we'll let somebody else bother about it... no more games */
            return CE;
        case SPEC_PROC_TAG:
            {
                // Special processing is getting a CE that is preceded by a certain prefix
                // Currently this is only needed for optimizing Japanese length and iteration marks.
                // When we encouter a special processing tag, we go backwards and try to see if
                // we have a match.
                // Contraction tables are used - so the whole process is not unlike contraction.
                // prefix data is stored backwards in the table.
2703 const UChar *UCharOffset; 2704 UChar schar, tchar; 2705 collIterateState prefixState; 2706 backupState(source, &prefixState); 2707 loadState(source, &entryState, TRUE); 2708 goBackOne(source); // We want to look at the point where we entered - actually one 2709 // before that... 2710 2711 for(;;) { 2712 // This loop will run once per source string character, for as long as we 2713 // are matching a potential contraction sequence 2714 2715 // First we position ourselves at the begining of contraction sequence 2716 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2717 if (collIter_bos(source)) { 2718 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2719 break; 2720 } 2721 schar = getPrevNormalizedChar(source, status); 2722 goBackOne(source); 2723 2724 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2725 UCharOffset++; 2726 } 2727 2728 if (schar == tchar) { 2729 // Found the source string char in the table. 2730 // Pick up the corresponding CE from the table. 2731 CE = *(coll->contractionCEs + 2732 (UCharOffset - coll->contractionIndex)); 2733 } 2734 else 2735 { 2736 // Source string char was not in the table. 2737 // We have not found the prefix. 2738 CE = *(coll->contractionCEs + 2739 (ContractionStart - coll->contractionIndex)); 2740 } 2741 2742 if(!isPrefix(CE)) { 2743 // The source string char was in the contraction table, and the corresponding 2744 // CE is not a prefix CE. We found the prefix, break 2745 // out of loop, this CE will end up being returned. This is the normal 2746 // way out of prefix handling when the source actually contained 2747 // the prefix. 
2748 break; 2749 } 2750 } 2751 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue 2752 loadState(source, &prefixState, TRUE); 2753 if(source->origFlags & UCOL_USE_ITERATOR) { 2754 source->flags = source->origFlags; 2755 } 2756 } else { // prefix search was a failure, we have to backup all the way to the start 2757 loadState(source, &entryState, TRUE); 2758 } 2759 break; 2760 } 2761 case CONTRACTION_TAG: 2762 { 2763 /* This should handle contractions */ 2764 collIterateState state; 2765 backupState(source, &state); 2766 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; 2767 const UChar *UCharOffset; 2768 UChar schar, tchar; 2769 2770 for (;;) { 2771 /* This loop will run once per source string character, for as long as we */ 2772 /* are matching a potential contraction sequence */ 2773 2774 /* First we position ourselves at the begining of contraction sequence */ 2775 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2776 2777 if (collIter_eos(source)) { 2778 // Ran off the end of the source string. 2779 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2780 // So we'll pick whatever we have at the point... 2781 if (CE == UCOL_NOT_FOUND) { 2782 // back up the source over all the chars we scanned going into this contraction. 
2783 CE = firstCE; 2784 loadState(source, &state, TRUE); 2785 if(source->origFlags & UCOL_USE_ITERATOR) { 2786 source->flags = source->origFlags; 2787 } 2788 } 2789 break; 2790 } 2791 2792 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ 2793 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); 2794 2795 schar = getNextNormalizedChar(source); 2796 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2797 UCharOffset++; 2798 } 2799 2800 if (schar == tchar) { 2801 // Found the source string char in the contraction table. 2802 // Pick up the corresponding CE from the table. 2803 CE = *(coll->contractionCEs + 2804 (UCharOffset - coll->contractionIndex)); 2805 } 2806 else 2807 { 2808 // Source string char was not in contraction table. 2809 // Unless we have a discontiguous contraction, we have finished 2810 // with this contraction. 2811 // in order to do the proper detection, we 2812 // need to see if we're dealing with a supplementary 2813 /* We test whether the next two char are surrogate pairs. 2814 * This test is done if the iterator is not NULL. 2815 * If there is no surrogate pair, the iterator 2816 * goes back one if needed. 
*/ 2817 UChar32 miss = schar; 2818 if (source->iterator) { 2819 UChar32 surrNextChar; /* the next char in the iteration to test */ 2820 int32_t prevPos; /* holds the previous position before move forward of the source iterator */ 2821 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { 2822 prevPos = source->iterator->index; 2823 surrNextChar = getNextNormalizedChar(source); 2824 if (U16_IS_TRAIL(surrNextChar)) { 2825 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); 2826 } else if (prevPos < source->iterator->index){ 2827 goBackOne(source); 2828 } 2829 } 2830 } else if (U16_IS_LEAD(schar)) { 2831 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); 2832 } 2833 2834 uint8_t sCC; 2835 if (miss < 0x300 || 2836 maxCC == 0 || 2837 (sCC = i_getCombiningClass(miss, coll)) == 0 || 2838 sCC>maxCC || 2839 (allSame != 0 && sCC == maxCC) || 2840 collIter_eos(source)) 2841 { 2842 // Contraction can not be discontiguous. 2843 goBackOne(source); // back up the source string by one, 2844 // because the character we just looked at was 2845 // not part of the contraction. */ 2846 if(U_IS_SUPPLEMENTARY(miss)) { 2847 goBackOne(source); 2848 } 2849 CE = *(coll->contractionCEs + 2850 (ContractionStart - coll->contractionIndex)); 2851 } else { 2852 // 2853 // Contraction is possibly discontiguous. 
2854 // Scan more of source string looking for a match 2855 // 2856 UChar tempchar; 2857 /* find the next character if schar is not a base character 2858 and we are not yet at the end of the string */ 2859 tempchar = getNextNormalizedChar(source); 2860 // probably need another supplementary thingie here 2861 goBackOne(source); 2862 if (i_getCombiningClass(tempchar, coll) == 0) { 2863 goBackOne(source); 2864 if(U_IS_SUPPLEMENTARY(miss)) { 2865 goBackOne(source); 2866 } 2867 /* Spit out the last char of the string, wasn't tasty enough */ 2868 CE = *(coll->contractionCEs + 2869 (ContractionStart - coll->contractionIndex)); 2870 } else { 2871 CE = getDiscontiguous(coll, source, ContractionStart); 2872 } 2873 } 2874 } // else after if(schar == tchar) 2875 2876 if(CE == UCOL_NOT_FOUND) { 2877 /* The Source string did not match the contraction that we were checking. */ 2878 /* Back up the source position to undo the effects of having partially */ 2879 /* scanned through what ultimately proved to not be a contraction. */ 2880 loadState(source, &state, TRUE); 2881 CE = firstCE; 2882 break; 2883 } 2884 2885 if(!isContraction(CE)) { 2886 // The source string char was in the contraction table, and the corresponding 2887 // CE is not a contraction CE. We completed the contraction, break 2888 // out of loop, this CE will end up being returned. This is the normal 2889 // way out of contraction handling when the source actually contained 2890 // the contraction. 2891 break; 2892 } 2893 2894 2895 // The source string char was in the contraction table, and the corresponding 2896 // CE is IS a contraction CE. We will continue looping to check the source 2897 // string for the remaining chars in the contraction. 2898 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); 2899 if(tempCE != UCOL_NOT_FOUND) { 2900 // We have scanned a a section of source string for which there is a 2901 // CE from the contraction table. 
Remember the CE and scan position, so 2902 // that we can return to this point if further scanning fails to 2903 // match a longer contraction sequence. 2904 firstCE = tempCE; 2905 2906 goBackOne(source); 2907 backupState(source, &state); 2908 getNextNormalizedChar(source); 2909 2910 // Another way to do this is: 2911 //collIterateState tempState; 2912 //backupState(source, &tempState); 2913 //goBackOne(source); 2914 //backupState(source, &state); 2915 //loadState(source, &tempState, TRUE); 2916 2917 // The problem is that for incomplete contractions we have to remember the previous 2918 // position. Before, the only thing I needed to do was state.pos--; 2919 // After iterator introduction and especially after introduction of normalizing 2920 // iterators, it became much more difficult to decrease the saved state. 2921 // I'm not yet sure which of the two methods above is faster. 2922 } 2923 } // for(;;) 2924 break; 2925 } // case CONTRACTION_TAG: 2926 case LONG_PRIMARY_TAG: 2927 { 2928 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 2929 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 2930 source->offsetRepeatCount += 1; 2931 return CE; 2932 } 2933 case EXPANSION_TAG: 2934 { 2935 /* This should handle expansion. */ 2936 /* NOTE: we can encounter both continuations and expansions in an expansion! 
*/ 2937 /* I have to decide where continuations are going to be dealt with */ 2938 uint32_t size; 2939 uint32_t i; /* general counter */ 2940 2941 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 2942 size = getExpansionCount(CE); 2943 CE = *CEOffset++; 2944 //source->offsetRepeatCount = -1; 2945 2946 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 2947 for(i = 1; i<size; i++) { 2948 *(source->CEpos++) = *CEOffset++; 2949 source->offsetRepeatCount += 1; 2950 } 2951 } else { /* else, we do */ 2952 while(*CEOffset != 0) { 2953 *(source->CEpos++) = *CEOffset++; 2954 source->offsetRepeatCount += 1; 2955 } 2956 } 2957 2958 return CE; 2959 } 2960 case DIGIT_TAG: 2961 { 2962 /* 2963 We do a check to see if we want to collate digits as numbers; if so we generate 2964 a custom collation key. Otherwise we pull out the value stored in the expansion table. 2965 */ 2966 //uint32_t size; 2967 uint32_t i; /* general counter */ 2968 2969 if (source->coll->numericCollation == UCOL_ON){ 2970 collIterateState digitState = {0,0,0,0,0,0,0,0,0}; 2971 UChar32 char32 = 0; 2972 int32_t digVal = 0; 2973 2974 uint32_t digIndx = 0; 2975 uint32_t endIndex = 0; 2976 uint32_t trailingZeroIndex = 0; 2977 2978 uint8_t collateVal = 0; 2979 2980 UBool nonZeroValReached = FALSE; 2981 2982 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. 2983 /* 2984 We parse the source string until we hit a char that's NOT a digit. 2985 Use this u_charDigitValue. This might be slow because we have to 2986 handle surrogates... 
2987 */ 2988 /* 2989 if (U16_IS_LEAD(ch)){ 2990 if (!collIter_eos(source)) { 2991 backupState(source, &digitState); 2992 UChar trail = getNextNormalizedChar(source); 2993 if(U16_IS_TRAIL(trail)) { 2994 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 2995 } else { 2996 loadState(source, &digitState, TRUE); 2997 char32 = ch; 2998 } 2999 } else { 3000 char32 = ch; 3001 } 3002 } else { 3003 char32 = ch; 3004 } 3005 digVal = u_charDigitValue(char32); 3006 */ 3007 digVal = u_charDigitValue(cp); // if we have arrived here, we have 3008 // already processed possible supplementaries that trigered the digit tag - 3009 // all supplementaries are marked in the UCA. 3010 /* 3011 We pad a zero in front of the first element anyways. This takes 3012 care of the (probably) most common case where people are sorting things followed 3013 by a single digit 3014 */ 3015 digIndx++; 3016 for(;;){ 3017 // Make sure we have enough space. No longer needed; 3018 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER 3019 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough 3020 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). 3021 3022 // Skipping over leading zeroes. 3023 if (digVal != 0) { 3024 nonZeroValReached = TRUE; 3025 } 3026 if (nonZeroValReached) { 3027 /* 3028 We parse the digit string into base 100 numbers (this fits into a byte). 3029 We only add to the buffer in twos, thus if we are parsing an odd character, 3030 that serves as the 'tens' digit while the if we are parsing an even one, that 3031 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3032 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3033 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3034 than all the other bytes. 
3035 */ 3036 3037 if (digIndx % 2 == 1){ 3038 collateVal += (uint8_t)digVal; 3039 3040 // We don't enter the low-order-digit case unless we've already seen 3041 // the high order, or for the first digit, which is always non-zero. 3042 if (collateVal != 0) 3043 trailingZeroIndex = 0; 3044 3045 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3046 collateVal = 0; 3047 } 3048 else{ 3049 // We drop the collation value into the buffer so if we need to do 3050 // a "front patch" we don't have to check to see if we're hitting the 3051 // last element. 3052 collateVal = (uint8_t)(digVal * 10); 3053 3054 // Check for trailing zeroes. 3055 if (collateVal == 0) 3056 { 3057 if (!trailingZeroIndex) 3058 trailingZeroIndex = (digIndx/2) + 2; 3059 } 3060 else 3061 trailingZeroIndex = 0; 3062 3063 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3064 } 3065 digIndx++; 3066 } 3067 3068 // Get next character. 3069 if (!collIter_eos(source)){ 3070 ch = getNextNormalizedChar(source); 3071 if (U16_IS_LEAD(ch)){ 3072 if (!collIter_eos(source)) { 3073 backupState(source, &digitState); 3074 UChar trail = getNextNormalizedChar(source); 3075 if(U16_IS_TRAIL(trail)) { 3076 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 3077 } else { 3078 loadState(source, &digitState, TRUE); 3079 char32 = ch; 3080 } 3081 } 3082 } else { 3083 char32 = ch; 3084 } 3085 3086 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){ 3087 // Resetting position to point to the next unprocessed char. We 3088 // overshot it when doing our test/set for numbers. 3089 if (char32 > 0xFFFF) { // For surrogates. 3090 loadState(source, &digitState, TRUE); 3091 //goBackOne(source); 3092 } 3093 goBackOne(source); 3094 break; 3095 } 3096 } else { 3097 break; 3098 } 3099 } 3100 3101 if (nonZeroValReached == FALSE){ 3102 digIndx = 2; 3103 numTempBuf[2] = 6; 3104 } 3105 3106 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; 3107 if (digIndx % 2 != 0){ 3108 /* 3109 We missed a value. 
Since digIndx isn't even, stuck too many values into the buffer (this is what 3110 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. 3111 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a 3112 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. 3113 */ 3114 3115 for(i = 2; i < endIndex; i++){ 3116 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + 3117 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; 3118 } 3119 --digIndx; 3120 } 3121 3122 // Subtract one off of the last byte. 3123 numTempBuf[endIndex-1] -= 1; 3124 3125 /* 3126 We want to skip over the first two slots in the buffer. The first slot 3127 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3128 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 3129 */ 3130 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3131 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); 3132 3133 // Now transfer the collation key to our collIterate struct. 3134 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. 3135 //size = ((endIndex+1) & ~1)/2; 3136 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3137 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3138 UCOL_BYTE_COMMON; // Tertiary weight. 3139 i = 2; // Reset the index into the buffer. 
3140 while(i < endIndex) 3141 { 3142 uint32_t primWeight = numTempBuf[i++] << 8; 3143 if ( i < endIndex) 3144 primWeight |= numTempBuf[i++]; 3145 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3146 } 3147 3148 } else { 3149 // no numeric mode, we'll just switch to whatever we stashed and continue 3150 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 3151 CE = *CEOffset++; 3152 break; 3153 } 3154 return CE; 3155 } 3156 /* various implicits optimization */ 3157 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 3158 /* UCA is filled with these. Tailorings are NOT_FOUND */ 3159 return getImplicit(cp, source); 3160 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 3161 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit 3162 return getImplicit(cp, source); 3163 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3164 { 3165 static const uint32_t 3166 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3167 //const uint32_t LCount = 19; 3168 static const uint32_t VCount = 21; 3169 static const uint32_t TCount = 28; 3170 //const uint32_t NCount = VCount * TCount; // 588 3171 //const uint32_t SCount = LCount * NCount; // 11172 3172 uint32_t L = ch - SBase; 3173 3174 // divide into pieces 3175 3176 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation 3177 L /= TCount; 3178 uint32_t V = L % VCount; 3179 L /= VCount; 3180 3181 // offset them 3182 3183 L += LBase; 3184 V += VBase; 3185 T += TBase; 3186 3187 // return the first CE, but first put the rest into the expansion buffer 3188 if (!source->coll->image->jamoSpecial) { // FAST PATH 3189 3190 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3191 if (T != TBase) { 3192 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3193 } 3194 3195 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3196 3197 } 
else { // Jamo is Special 3198 // Since Hanguls pass the FCD check, it is 3199 // guaranteed that we won't be in 3200 // the normalization buffer if something like this happens 3201 3202 // However, if we are using a uchar iterator and normalization 3203 // is ON, the Hangul that lead us here is going to be in that 3204 // normalization buffer. Here we want to restore the uchar 3205 // iterator state and pull out of the normalization buffer 3206 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { 3207 source->flags = source->origFlags; // restore the iterator 3208 source->pos = NULL; 3209 } 3210 3211 // Move Jamos into normalization buffer 3212 UChar *buffer = source->writableBuffer.getBuffer(4); 3213 int32_t bufferLength; 3214 buffer[0] = (UChar)L; 3215 buffer[1] = (UChar)V; 3216 if (T != TBase) { 3217 buffer[2] = (UChar)T; 3218 bufferLength = 3; 3219 } else { 3220 bufferLength = 2; 3221 } 3222 source->writableBuffer.releaseBuffer(bufferLength); 3223 3224 // Indicate where to continue in main input string after exhausting the writableBuffer 3225 source->fcdPosition = source->pos; 3226 3227 source->pos = source->writableBuffer.getTerminatedBuffer(); 3228 source->origFlags = source->flags; 3229 source->flags |= UCOL_ITER_INNORMBUF; 3230 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 3231 3232 return(UCOL_IGNORABLE); 3233 } 3234 } 3235 case SURROGATE_TAG: 3236 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ 3237 /* two things can happen here: next code point can be a trailing surrogate - we will use it */ 3238 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ 3239 /* we treat it like an unassigned code point. 
*/ 3240 { 3241 UChar trail; 3242 collIterateState state; 3243 backupState(source, &state); 3244 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { 3245 // we chould have stepped one char forward and it might have turned that it 3246 // was not a trail surrogate. In that case, we have to backup. 3247 loadState(source, &state, TRUE); 3248 return UCOL_NOT_FOUND; 3249 } else { 3250 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ 3251 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail); 3252 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. 3253 // We need to backup 3254 loadState(source, &state, TRUE); 3255 return CE; 3256 } 3257 // calculate the supplementary code point value, if surrogate was not tailored 3258 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 3259 } 3260 } 3261 break; 3262 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 3263 UChar nextChar; 3264 if( source->flags & UCOL_USE_ITERATOR) { 3265 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { 3266 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3267 source->iterator->next(source->iterator); 3268 return getImplicit(cp, source); 3269 } 3270 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) && 3271 U_IS_TRAIL((nextChar=*source->pos))) { 3272 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3273 source->pos++; 3274 return getImplicit(cp, source); 3275 } 3276 return UCOL_NOT_FOUND; 3277 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 3278 return UCOL_NOT_FOUND; /* broken surrogate sequence */ 3279 case CHARSET_TAG: 3280 /* not yet implemented */ 3281 /* probably after 1.8 */ 3282 return UCOL_NOT_FOUND; 3283 default: 3284 *status = U_INTERNAL_PROGRAM_ERROR; 3285 CE=0; 3286 break; 3287 } 3288 if (CE <= UCOL_NOT_FOUND) break; 3289 } 3290 return CE; 3291 } 3292 3293 3294 /* now uses Mark's 
getImplicitPrimary code */ 3295 static 3296 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { 3297 uint32_t r = uprv_uca_getImplicitPrimary(cp); 3298 3299 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; 3300 collationSource->toReturn = collationSource->CEpos; 3301 3302 // **** doesn't work if using iterator **** 3303 if (collationSource->flags & UCOL_ITER_INNORMBUF) { 3304 collationSource->offsetRepeatCount = 1; 3305 } else { 3306 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string); 3307 3308 UErrorCode errorCode = U_ZERO_ERROR; 3309 collationSource->appendOffset(firstOffset, errorCode); 3310 collationSource->appendOffset(firstOffset + 1, errorCode); 3311 3312 collationSource->offsetReturn = collationSource->offsetStore - 1; 3313 *(collationSource->offsetBuffer) = firstOffset; 3314 if (collationSource->offsetReturn == collationSource->offsetBuffer) { 3315 collationSource->offsetStore = collationSource->offsetBuffer; 3316 } 3317 } 3318 3319 return ((r & 0x0000FFFF)<<16) | 0x000000C0; 3320 } 3321 3322 /** 3323 * This function handles the special CEs like contractions, expansions, 3324 * surrogates, Thai. 
3325 * It is called by both getPrevCE 3326 */ 3327 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, 3328 collIterate *source, 3329 UErrorCode *status) 3330 { 3331 const uint32_t *CEOffset = NULL; 3332 UChar *UCharOffset = NULL; 3333 UChar schar; 3334 const UChar *constart = NULL; 3335 uint32_t size; 3336 UChar buffer[UCOL_MAX_BUFFER]; 3337 uint32_t *endCEBuffer; 3338 UChar *strbuffer; 3339 int32_t noChars = 0; 3340 int32_t CECount = 0; 3341 3342 for(;;) 3343 { 3344 /* the only ces that loops are thai and contractions */ 3345 switch (getCETag(CE)) 3346 { 3347 case NOT_FOUND_TAG: /* this tag always returns */ 3348 return CE; 3349 3350 case SPEC_PROC_TAG: 3351 { 3352 // Special processing is getting a CE that is preceded by a certain prefix 3353 // Currently this is only needed for optimizing Japanese length and iteration marks. 3354 // When we encouter a special processing tag, we go backwards and try to see if 3355 // we have a match. 3356 // Contraction tables are used - so the whole process is not unlike contraction. 3357 // prefix data is stored backwards in the table. 
3358 const UChar *UCharOffset; 3359 UChar schar, tchar; 3360 collIterateState prefixState; 3361 backupState(source, &prefixState); 3362 for(;;) { 3363 // This loop will run once per source string character, for as long as we 3364 // are matching a potential contraction sequence 3365 3366 // First we position ourselves at the begining of contraction sequence 3367 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 3368 3369 if (collIter_bos(source)) { 3370 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 3371 break; 3372 } 3373 schar = getPrevNormalizedChar(source, status); 3374 goBackOne(source); 3375 3376 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 3377 UCharOffset++; 3378 } 3379 3380 if (schar == tchar) { 3381 // Found the source string char in the table. 3382 // Pick up the corresponding CE from the table. 3383 CE = *(coll->contractionCEs + 3384 (UCharOffset - coll->contractionIndex)); 3385 } 3386 else 3387 { 3388 // if there is a completely ignorable code point in the middle of 3389 // a prefix, we need to act as if it's not there 3390 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) 3391 // lone surrogates cannot be set to zero as it would break other processing 3392 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 3393 // it's easy for BMP code points 3394 if(isZeroCE == 0) { 3395 continue; 3396 } else if(U16_IS_SURROGATE(schar)) { 3397 // for supplementary code points, we have to check the next one 3398 // situations where we are going to ignore 3399 // 1. beginning of the string: schar is a lone surrogate 3400 // 2. schar is a lone surrogate 3401 // 3. schar is a trail surrogate in a valid surrogate sequence 3402 // that is explicitly set to zero. 
3403 if (!collIter_bos(source)) { 3404 UChar lead; 3405 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) { 3406 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead); 3407 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) { 3408 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); 3409 if(finalCE == 0) { 3410 // this is a real, assigned completely ignorable code point 3411 goBackOne(source); 3412 continue; 3413 } 3414 } 3415 } else { 3416 // lone surrogate, treat like unassigned 3417 return UCOL_NOT_FOUND; 3418 } 3419 } else { 3420 // lone surrogate at the beggining, treat like unassigned 3421 return UCOL_NOT_FOUND; 3422 } 3423 } 3424 // Source string char was not in the table. 3425 // We have not found the prefix. 3426 CE = *(coll->contractionCEs + 3427 (ContractionStart - coll->contractionIndex)); 3428 } 3429 3430 if(!isPrefix(CE)) { 3431 // The source string char was in the contraction table, and the corresponding 3432 // CE is not a prefix CE. We found the prefix, break 3433 // out of loop, this CE will end up being returned. This is the normal 3434 // way out of prefix handling when the source actually contained 3435 // the prefix. 3436 break; 3437 } 3438 } 3439 loadState(source, &prefixState, TRUE); 3440 break; 3441 } 3442 3443 case CONTRACTION_TAG: { 3444 /* to ensure that the backwards and forwards iteration matches, we 3445 take the current region of most possible match and pass it through 3446 the forward iteration. this will ensure that the obstinate problem of 3447 overlapping contractions will not occur. 
3448 */ 3449 schar = peekCodeUnit(source, 0); 3450 constart = (UChar *)coll->image + getContractOffset(CE); 3451 if (isAtStartPrevIterate(source) 3452 /* commented away contraction end checks after adding the checks 3453 in getPrevCE */) { 3454 /* start of string or this is not the end of any contraction */ 3455 CE = *(coll->contractionCEs + 3456 (constart - coll->contractionIndex)); 3457 break; 3458 } 3459 strbuffer = buffer; 3460 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); 3461 *(UCharOffset --) = 0; 3462 noChars = 0; 3463 // have to swap thai characters 3464 while (ucol_unsafeCP(schar, coll)) { 3465 *(UCharOffset) = schar; 3466 noChars++; 3467 UCharOffset --; 3468 schar = getPrevNormalizedChar(source, status); 3469 goBackOne(source); 3470 // TODO: when we exhaust the contraction buffer, 3471 // it needs to get reallocated. The problem is 3472 // that the size depends on the string which is 3473 // not iterated over. However, since we're travelling 3474 // backwards, we already had to set the iterator at 3475 // the end - so we might as well know where we are? 
3476 if (UCharOffset + 1 == buffer) { 3477 /* we have exhausted the buffer */ 3478 int32_t newsize = 0; 3479 if(source->pos) { // actually dealing with a position 3480 newsize = (int32_t)(source->pos - source->string + 1); 3481 } else { // iterator 3482 newsize = 4 * UCOL_MAX_BUFFER; 3483 } 3484 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * 3485 (newsize + UCOL_MAX_BUFFER)); 3486 /* test for NULL */ 3487 if (strbuffer == NULL) { 3488 *status = U_MEMORY_ALLOCATION_ERROR; 3489 return UCOL_NO_MORE_CES; 3490 } 3491 UCharOffset = strbuffer + newsize; 3492 uprv_memcpy(UCharOffset, buffer, 3493 UCOL_MAX_BUFFER * sizeof(UChar)); 3494 UCharOffset --; 3495 } 3496 if ((source->pos && (source->pos == source->string || 3497 ((source->flags & UCOL_ITER_INNORMBUF) && 3498 *(source->pos - 1) == 0 && source->fcdPosition == NULL))) 3499 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { 3500 break; 3501 } 3502 } 3503 /* adds the initial base character to the string */ 3504 *(UCharOffset) = schar; 3505 noChars++; 3506 3507 int32_t offsetBias; 3508 3509 // **** doesn't work if using iterator **** 3510 if (source->flags & UCOL_ITER_INNORMBUF) { 3511 offsetBias = -1; 3512 } else { 3513 offsetBias = (int32_t)(source->pos - source->string); 3514 } 3515 3516 /* a new collIterate is used to simplify things, since using the current 3517 collIterate will mean that the forward and backwards iteration will 3518 share and change the same buffers. we don't want to get into that. */ 3519 collIterate temp; 3520 int32_t rawOffset; 3521 3522 IInit_collIterate(coll, UCharOffset, noChars, &temp, status); 3523 if(U_FAILURE(*status)) { 3524 return UCOL_NULLORDER; 3525 } 3526 temp.flags &= ~UCOL_ITER_NORM; 3527 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; 3528 3529 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero? 
3530 CE = ucol_IGetNextCE(coll, &temp, status); 3531 3532 if (source->extendCEs) { 3533 endCEBuffer = source->extendCEs + source->extendCEsSize; 3534 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t)); 3535 } else { 3536 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; 3537 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t)); 3538 } 3539 3540 while (CE != UCOL_NO_MORE_CES) { 3541 *(source->CEpos ++) = CE; 3542 3543 if (offsetBias >= 0) { 3544 source->appendOffset(rawOffset + offsetBias, *status); 3545 } 3546 3547 CECount++; 3548 if (source->CEpos == endCEBuffer) { 3549 /* ran out of CE space, reallocate to new buffer. 3550 If reallocation fails, reset pointers and bail out, 3551 there's no guarantee of the right character position after 3552 this bail*/ 3553 if (!increaseCEsCapacity(source)) { 3554 *status = U_MEMORY_ALLOCATION_ERROR; 3555 break; 3556 } 3557 3558 endCEBuffer = source->extendCEs + source->extendCEsSize; 3559 } 3560 3561 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { 3562 rawOffset = (int32_t)(temp.fcdPosition - temp.string); 3563 } else { 3564 rawOffset = (int32_t)(temp.pos - temp.string); 3565 } 3566 3567 CE = ucol_IGetNextCE(coll, &temp, status); 3568 } 3569 3570 if (strbuffer != buffer) { 3571 uprv_free(strbuffer); 3572 } 3573 if (U_FAILURE(*status)) { 3574 return (uint32_t)UCOL_NULLORDER; 3575 } 3576 3577 if (source->offsetRepeatValue != 0) { 3578 if (CECount > noChars) { 3579 source->offsetRepeatCount += temp.offsetRepeatCount; 3580 } else { 3581 // **** does this really skip the right offsets? 
**** 3582 source->offsetReturn -= (noChars - CECount); 3583 } 3584 } 3585 3586 if (offsetBias >= 0) { 3587 source->offsetReturn = source->offsetStore - 1; 3588 if (source->offsetReturn == source->offsetBuffer) { 3589 source->offsetStore = source->offsetBuffer; 3590 } 3591 } 3592 3593 source->toReturn = source->CEpos - 1; 3594 if (source->toReturn == source->CEs) { 3595 source->CEpos = source->CEs; 3596 } 3597 3598 return *(source->toReturn); 3599 } 3600 case LONG_PRIMARY_TAG: 3601 { 3602 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 3603 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 3604 source->toReturn = source->CEpos - 1; 3605 3606 if (source->flags & UCOL_ITER_INNORMBUF) { 3607 source->offsetRepeatCount = 1; 3608 } else { 3609 int32_t firstOffset = (int32_t)(source->pos - source->string); 3610 3611 source->appendOffset(firstOffset, *status); 3612 source->appendOffset(firstOffset + 1, *status); 3613 3614 source->offsetReturn = source->offsetStore - 1; 3615 *(source->offsetBuffer) = firstOffset; 3616 if (source->offsetReturn == source->offsetBuffer) { 3617 source->offsetStore = source->offsetBuffer; 3618 } 3619 } 3620 3621 3622 return *(source->toReturn); 3623 } 3624 3625 case EXPANSION_TAG: /* this tag always returns */ 3626 { 3627 /* 3628 This should handle expansion. 3629 NOTE: we can encounter both continuations and expansions in an expansion! 3630 I have to decide where continuations are going to be dealt with 3631 */ 3632 int32_t firstOffset = (int32_t)(source->pos - source->string); 3633 3634 // **** doesn't work if using iterator **** 3635 if (source->offsetReturn != NULL) { 3636 if (! 
(source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) { 3637 source->offsetStore = source->offsetBuffer; 3638 }else { 3639 firstOffset = -1; 3640 } 3641 } 3642 3643 /* find the offset to expansion table */ 3644 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3645 size = getExpansionCount(CE); 3646 if (size != 0) { 3647 /* 3648 if there are less than 16 elements in expansion, we don't terminate 3649 */ 3650 uint32_t count; 3651 3652 for (count = 0; count < size; count++) { 3653 *(source->CEpos ++) = *CEOffset++; 3654 3655 if (firstOffset >= 0) { 3656 source->appendOffset(firstOffset + 1, *status); 3657 } 3658 } 3659 } else { 3660 /* else, we do */ 3661 while (*CEOffset != 0) { 3662 *(source->CEpos ++) = *CEOffset ++; 3663 3664 if (firstOffset >= 0) { 3665 source->appendOffset(firstOffset + 1, *status); 3666 } 3667 } 3668 } 3669 3670 if (firstOffset >= 0) { 3671 source->offsetReturn = source->offsetStore - 1; 3672 *(source->offsetBuffer) = firstOffset; 3673 if (source->offsetReturn == source->offsetBuffer) { 3674 source->offsetStore = source->offsetBuffer; 3675 } 3676 } else { 3677 source->offsetRepeatCount += size - 1; 3678 } 3679 3680 source->toReturn = source->CEpos - 1; 3681 // in case of one element expansion, we 3682 // want to immediately return CEpos 3683 if(source->toReturn == source->CEs) { 3684 source->CEpos = source->CEs; 3685 } 3686 3687 return *(source->toReturn); 3688 } 3689 3690 case DIGIT_TAG: 3691 { 3692 /* 3693 We do a check to see if we want to collate digits as numbers; if so we generate 3694 a custom collation key. Otherwise we pull out the value stored in the expansion table. 
3695 */ 3696 uint32_t i; /* general counter */ 3697 3698 if (source->coll->numericCollation == UCOL_ON){ 3699 uint32_t digIndx = 0; 3700 uint32_t endIndex = 0; 3701 uint32_t leadingZeroIndex = 0; 3702 uint32_t trailingZeroCount = 0; 3703 3704 uint8_t collateVal = 0; 3705 3706 UBool nonZeroValReached = FALSE; 3707 3708 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs. 3709 /* 3710 We parse the source string until we hit a char that's NOT a digit. 3711 Use this u_charDigitValue. This might be slow because we have to 3712 handle surrogates... 3713 */ 3714 /* 3715 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less, 3716 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation 3717 element we process when going backward. To determine how long that chunk might be, we may need to make 3718 two passes through the loop that collects digits - one to see how long the string is (and how much is 3719 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has 3720 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation 3721 element chunk after resetting the state to the initialState at the right side of the digit string. 
3722 */ 3723 uint32_t ceLimit = 0; 3724 UChar initial_ch = ch; 3725 collIterateState initialState = {0,0,0,0,0,0,0,0,0}; 3726 backupState(source, &initialState); 3727 3728 for(;;) { 3729 collIterateState state = {0,0,0,0,0,0,0,0,0}; 3730 UChar32 char32 = 0; 3731 int32_t digVal = 0; 3732 3733 if (U16_IS_TRAIL (ch)) { 3734 if (!collIter_bos(source)){ 3735 UChar lead = getPrevNormalizedChar(source, status); 3736 if(U16_IS_LEAD(lead)) { 3737 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3738 goBackOne(source); 3739 } else { 3740 char32 = ch; 3741 } 3742 } else { 3743 char32 = ch; 3744 } 3745 } else { 3746 char32 = ch; 3747 } 3748 digVal = u_charDigitValue(char32); 3749 3750 for(;;) { 3751 // Make sure we have enough space. No longer needed; 3752 // at this point the largest value of digIndx when we need to save data in numTempBuf 3753 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure 3754 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2). 3755 3756 // Skip over trailing zeroes, and keep a count of them. 3757 if (digVal != 0) 3758 nonZeroValReached = TRUE; 3759 3760 if (nonZeroValReached) { 3761 /* 3762 We parse the digit string into base 100 numbers (this fits into a byte). 3763 We only add to the buffer in twos, thus if we are parsing an odd character, 3764 that serves as the 'tens' digit while the if we are parsing an even one, that 3765 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3766 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3767 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3768 than all the other bytes. 3769 3770 Since we're doing in this reverse we want to put the first digit encountered into the 3771 ones place and the second digit encountered into the tens place. 
3772 */ 3773 3774 if ((digIndx + trailingZeroCount) % 2 == 1) { 3775 // High-order digit case (tens place) 3776 collateVal += (uint8_t)(digVal * 10); 3777 3778 // We cannot set leadingZeroIndex unless it has been set for the 3779 // low-order digit. Therefore, all we can do for the high-order 3780 // digit is turn it off, never on. 3781 // The only time we will have a high digit without a low is for 3782 // the very first non-zero digit, so no zero check is necessary. 3783 if (collateVal != 0) 3784 leadingZeroIndex = 0; 3785 3786 // The first pass through, digIndx may exceed the limit, but in that case 3787 // we no longer care about numTempBuf contents since they will be discarded 3788 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) { 3789 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3790 } 3791 collateVal = 0; 3792 } else { 3793 // Low-order digit case (ones place) 3794 collateVal = (uint8_t)digVal; 3795 3796 // Check for leading zeroes. 3797 if (collateVal == 0) { 3798 if (!leadingZeroIndex) 3799 leadingZeroIndex = (digIndx/2) + 2; 3800 } else 3801 leadingZeroIndex = 0; 3802 3803 // No need to write to buffer; the case of a last odd digit 3804 // is handled below. 3805 } 3806 ++digIndx; 3807 } else 3808 ++trailingZeroCount; 3809 3810 if (!collIter_bos(source)) { 3811 ch = getPrevNormalizedChar(source, status); 3812 //goBackOne(source); 3813 if (U16_IS_TRAIL(ch)) { 3814 backupState(source, &state); 3815 if (!collIter_bos(source)) { 3816 goBackOne(source); 3817 UChar lead = getPrevNormalizedChar(source, status); 3818 3819 if(U16_IS_LEAD(lead)) { 3820 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3821 } else { 3822 loadState(source, &state, FALSE); 3823 char32 = ch; 3824 } 3825 } 3826 } else 3827 char32 = ch; 3828 3829 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { 3830 if (char32 > 0xFFFF) {// For surrogates. 
3831 loadState(source, &state, FALSE); 3832 } 3833 // Don't need to "reverse" the goBackOne call, 3834 // as this points to the next position to process.. 3835 //if (char32 > 0xFFFF) // For surrogates. 3836 //getNextNormalizedChar(source); 3837 break; 3838 } 3839 3840 goBackOne(source); 3841 }else 3842 break; 3843 } 3844 3845 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) { 3846 // our collation element is not too big, go ahead and finish with it 3847 break; 3848 } 3849 // our digit string is too long for a collation element; 3850 // set the limit for it, reset the state and begin again 3851 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER; 3852 if ( ceLimit == 0 ) { 3853 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; 3854 } 3855 ch = initial_ch; 3856 loadState(source, &initialState, FALSE); 3857 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0; 3858 collateVal = 0; 3859 nonZeroValReached = FALSE; 3860 } 3861 3862 if (! nonZeroValReached) { 3863 digIndx = 2; 3864 trailingZeroCount = 0; 3865 numTempBuf[2] = 6; 3866 } 3867 3868 if ((digIndx + trailingZeroCount) % 2 != 0) { 3869 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; 3870 digIndx += 1; // The implicit leading zero 3871 } 3872 if (trailingZeroCount % 2 != 0) { 3873 // We had to consume one trailing zero for the low digit 3874 // of the least significant byte 3875 digIndx += 1; // The trailing zero not in the exponent 3876 trailingZeroCount -= 1; 3877 } 3878 3879 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; 3880 3881 // Subtract one off of the last byte. Really the first byte here, but it's reversed... 3882 numTempBuf[2] -= 1; 3883 3884 /* 3885 We want to skip over the first two slots in the buffer. The first slot 3886 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3887 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 
3888 The exponent must be adjusted by the number of leading zeroes, and the number of 3889 trailing zeroes. 3890 */ 3891 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3892 uint32_t exponent = (digIndx+trailingZeroCount)/2; 3893 if (leadingZeroIndex) 3894 exponent -= ((digIndx/2) + 2 - leadingZeroIndex); 3895 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); 3896 3897 // Now transfer the collation key to our collIterate struct. 3898 // The total size for our collation key is half of endIndex, rounded up. 3899 int32_t size = (endIndex+1)/2; 3900 if(!ensureCEsCapacity(source, size)) { 3901 return UCOL_NULLORDER; 3902 } 3903 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3904 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3905 UCOL_BYTE_COMMON; // Tertiary weight. 3906 i = endIndex - 1; // Reset the index into the buffer. 3907 while(i >= 2) { 3908 uint32_t primWeight = numTempBuf[i--] << 8; 3909 if ( i >= 2) 3910 primWeight |= numTempBuf[i--]; 3911 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3912 } 3913 3914 source->toReturn = source->CEpos -1; 3915 return *(source->toReturn); 3916 } else { 3917 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3918 CE = *(CEOffset++); 3919 break; 3920 } 3921 } 3922 3923 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3924 { 3925 static const uint32_t 3926 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3927 //const uint32_t LCount = 19; 3928 static const uint32_t VCount = 21; 3929 static const uint32_t TCount = 28; 3930 //const uint32_t NCount = VCount * TCount; /* 588 */ 3931 //const uint32_t SCount = LCount * NCount; /* 11172 */ 3932 3933 uint32_t L = ch - SBase; 3934 /* 3935 divide into pieces. 
3936 we do it in this order since some compilers can do % and / in one 3937 operation 3938 */ 3939 uint32_t T = L % TCount; 3940 L /= TCount; 3941 uint32_t V = L % VCount; 3942 L /= VCount; 3943 3944 /* offset them */ 3945 L += LBase; 3946 V += VBase; 3947 T += TBase; 3948 3949 int32_t firstOffset = (int32_t)(source->pos - source->string); 3950 source->appendOffset(firstOffset, *status); 3951 3952 /* 3953 * return the first CE, but first put the rest into the expansion buffer 3954 */ 3955 if (!source->coll->image->jamoSpecial) { 3956 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3957 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3958 source->appendOffset(firstOffset + 1, *status); 3959 3960 if (T != TBase) { 3961 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3962 source->appendOffset(firstOffset + 1, *status); 3963 } 3964 3965 source->toReturn = source->CEpos - 1; 3966 3967 source->offsetReturn = source->offsetStore - 1; 3968 if (source->offsetReturn == source->offsetBuffer) { 3969 source->offsetStore = source->offsetBuffer; 3970 } 3971 3972 return *(source->toReturn); 3973 } else { 3974 // Since Hanguls pass the FCD check, it is 3975 // guaranteed that we won't be in 3976 // the normalization buffer if something like this happens 3977 3978 // Move Jamos into normalization buffer 3979 UChar *tempbuffer = source->writableBuffer.getBuffer(5); 3980 int32_t tempbufferLength, jamoOffset; 3981 tempbuffer[0] = 0; 3982 tempbuffer[1] = (UChar)L; 3983 tempbuffer[2] = (UChar)V; 3984 if (T != TBase) { 3985 tempbuffer[3] = (UChar)T; 3986 tempbufferLength = 4; 3987 } else { 3988 tempbufferLength = 3; 3989 } 3990 source->writableBuffer.releaseBuffer(tempbufferLength); 3991 3992 // Indicate where to continue in main input string after exhausting the writableBuffer 3993 if (source->pos == source->string) { 3994 jamoOffset = 0; 3995 source->fcdPosition = NULL; 3996 } else { 3997 jamoOffset = source->pos - source->string; 3998 
source->fcdPosition = source->pos-1; 3999 } 4000 4001 // Append offsets for the additional chars 4002 // (not the 0, and not the L whose offsets match the original Hangul) 4003 int32_t jamoRemaining = tempbufferLength - 2; 4004 jamoOffset++; // appended offsets should match end of original Hangul 4005 while (jamoRemaining-- > 0) { 4006 source->appendOffset(jamoOffset, *status); 4007 } 4008 4009 source->offsetRepeatValue = jamoOffset; 4010 4011 source->offsetReturn = source->offsetStore - 1; 4012 if (source->offsetReturn == source->offsetBuffer) { 4013 source->offsetStore = source->offsetBuffer; 4014 } 4015 4016 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength; 4017 source->origFlags = source->flags; 4018 source->flags |= UCOL_ITER_INNORMBUF; 4019 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 4020 4021 return(UCOL_IGNORABLE); 4022 } 4023 } 4024 4025 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 4026 return getPrevImplicit(ch, source); 4027 4028 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function 4029 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 4030 return getPrevImplicit(ch, source); 4031 4032 case SURROGATE_TAG: /* This is a surrogate pair */ 4033 /* essentially an engaged lead surrogate. 
*/ 4034 /* if you have encountered it here, it means that a */ 4035 /* broken sequence was encountered and this is an error */ 4036 return UCOL_NOT_FOUND; 4037 4038 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 4039 return UCOL_NOT_FOUND; /* broken surrogate sequence */ 4040 4041 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 4042 { 4043 UChar32 cp = 0; 4044 UChar prevChar; 4045 const UChar *prev; 4046 if (isAtStartPrevIterate(source)) { 4047 /* we are at the start of the string, wrong place to be at */ 4048 return UCOL_NOT_FOUND; 4049 } 4050 if (source->pos != source->writableBuffer.getBuffer()) { 4051 prev = source->pos - 1; 4052 } else { 4053 prev = source->fcdPosition; 4054 } 4055 prevChar = *prev; 4056 4057 /* Handles Han and Supplementary characters here.*/ 4058 if (U16_IS_LEAD(prevChar)) { 4059 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 4060 source->pos = prev; 4061 } else { 4062 return UCOL_NOT_FOUND; /* like unassigned */ 4063 } 4064 4065 return getPrevImplicit(cp, source); 4066 } 4067 4068 /* UCA is filled with these. 
Tailorings are NOT_FOUND */ 4069 /* not yet implemented */ 4070 case CHARSET_TAG: /* this tag always returns */ 4071 /* probably after 1.8 */ 4072 return UCOL_NOT_FOUND; 4073 4074 default: /* this tag always returns */ 4075 *status = U_INTERNAL_PROGRAM_ERROR; 4076 CE=0; 4077 break; 4078 } 4079 4080 if (CE <= UCOL_NOT_FOUND) { 4081 break; 4082 } 4083 } 4084 4085 return CE; 4086 } 4087 4088 /* This should really be a macro */ 4089 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */ 4090 /* anyway */ 4091 static 4092 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) { 4093 #ifdef UCOL_DEBUG 4094 fprintf(stderr, "."); 4095 #endif 4096 uint8_t *newStart = NULL; 4097 uint32_t offset = (uint32_t)(*secondaries-secStart); 4098 4099 if(secStart==second) { 4100 newStart=(uint8_t*)uprv_malloc(newSize); 4101 if(newStart==NULL) { 4102 *status = U_MEMORY_ALLOCATION_ERROR; 4103 return NULL; 4104 } 4105 uprv_memcpy(newStart, secStart, *secondaries-secStart); 4106 } else { 4107 newStart=(uint8_t*)uprv_realloc(secStart, newSize); 4108 if(newStart==NULL) { 4109 *status = U_MEMORY_ALLOCATION_ERROR; 4110 /* Since we're reallocating, return original reference so we don't loose it. */ 4111 return secStart; 4112 } 4113 } 4114 *secondaries=newStart+offset; 4115 *secSize=newSize; 4116 return newStart; 4117 } 4118 4119 4120 /* This should really be a macro */ 4121 /* This function is used to reverse parts of a buffer. 
We need this operation when doing continuation */ 4122 /* secondaries in French */ 4123 /* 4124 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { 4125 uint8_t temp; 4126 while(start<end) { 4127 temp = *start; 4128 *start++ = *end; 4129 *end-- = temp; 4130 } 4131 } 4132 */ 4133 4134 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \ 4135 TYPE tempA; \ 4136 while((start)<(end)) { \ 4137 tempA = *(start); \ 4138 *(start)++ = *(end); \ 4139 *(end)-- = tempA; \ 4140 } \ 4141 } 4142 4143 /****************************************************************************/ 4144 /* Following are the sortkey generation functions */ 4145 /* */ 4146 /****************************************************************************/ 4147 4148 /** 4149 * Merge two sort keys. 4150 * This is useful, for example, to combine sort keys from first and last names 4151 * to sort such pairs. 4152 * Merged sort keys consider on each collation level the first part first entirely, 4153 * then the second one. 4154 * It is possible to merge multiple sort keys by consecutively merging 4155 * another one with the intermediate result. 4156 * 4157 * The length of the merge result is the sum of the lengths of the input sort keys 4158 * minus 1. 
4159 * 4160 * @param src1 the first sort key 4161 * @param src1Length the length of the first sort key, including the zero byte at the end; 4162 * can be -1 if the function is to find the length 4163 * @param src2 the second sort key 4164 * @param src2Length the length of the second sort key, including the zero byte at the end; 4165 * can be -1 if the function is to find the length 4166 * @param dest the buffer where the merged sort key is written, 4167 * can be NULL if destCapacity==0 4168 * @param destCapacity the number of bytes in the dest buffer 4169 * @return the length of the merged sort key, src1Length+src2Length-1; 4170 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments), 4171 * in which cases the contents of dest is undefined 4172 * 4173 * @draft 4174 */ 4175 U_CAPI int32_t U_EXPORT2 4176 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 4177 const uint8_t *src2, int32_t src2Length, 4178 uint8_t *dest, int32_t destCapacity) { 4179 int32_t destLength; 4180 uint8_t b; 4181 4182 /* check arguments */ 4183 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 4184 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 4185 destCapacity<0 || (destCapacity>0 && dest==NULL) 4186 ) { 4187 /* error, attempt to write a zero byte and return 0 */ 4188 if(dest!=NULL && destCapacity>0) { 4189 *dest=0; 4190 } 4191 return 0; 4192 } 4193 4194 /* check lengths and capacity */ 4195 if(src1Length<0) { 4196 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 4197 } 4198 if(src2Length<0) { 4199 src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 4200 } 4201 4202 destLength=src1Length+src2Length-1; 4203 if(destLength>destCapacity) { 4204 /* the merged sort key does not fit into the destination */ 4205 return destLength; 4206 } 4207 4208 /* merge the sort keys with the same number of levels */ 4209 while(*src1!=0 && *src2!=0) { /* while both have 
another level */ 4210 /* copy level from src1 not including 00 or 01 */ 4211 while((b=*src1)>=2) { 4212 ++src1; 4213 *dest++=b; 4214 } 4215 4216 /* add a 02 merge separator */ 4217 *dest++=2; 4218 4219 /* copy level from src2 not including 00 or 01 */ 4220 while((b=*src2)>=2) { 4221 ++src2; 4222 *dest++=b; 4223 } 4224 4225 /* if both sort keys have another level, then add a 01 level separator and continue */ 4226 if(*src1==1 && *src2==1) { 4227 ++src1; 4228 ++src2; 4229 *dest++=1; 4230 } 4231 } 4232 4233 /* 4234 * here, at least one sort key is finished now, but the other one 4235 * might have some contents left from containing more levels; 4236 * that contents is just appended to the result 4237 */ 4238 if(*src1!=0) { 4239 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 4240 src2=src1; 4241 } 4242 /* append src2, "the other, unfinished sort key" */ 4243 uprv_strcpy((char *)dest, (const char *)src2); 4244 4245 /* trust that neither sort key contained illegally embedded zero bytes */ 4246 return destLength; 4247 } 4248 4249 /* sortkey API */ 4250 U_CAPI int32_t U_EXPORT2 4251 ucol_getSortKey(const UCollator *coll, 4252 const UChar *source, 4253 int32_t sourceLength, 4254 uint8_t *result, 4255 int32_t resultLength) 4256 { 4257 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 4258 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 4259 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 4260 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); 4261 } 4262 4263 UErrorCode status = U_ZERO_ERROR; 4264 int32_t keySize = 0; 4265 4266 if(source != NULL) { 4267 // source == NULL is actually an error situation, but we would need to 4268 // have an error code to return it. 
Until we introduce a new 4269 // API, it stays like this 4270 4271 /* this uses the function pointer that is set in updateinternalstate */ 4272 /* currently, there are two funcs: */ 4273 /*ucol_calcSortKey(...);*/ 4274 /*ucol_calcSortKeySimpleTertiary(...);*/ 4275 4276 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status); 4277 //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) { 4278 // That's not good. Something unusual happened. 4279 // We don't know how much we initialized before we failed. 4280 // NULL terminate for safety. 4281 // We have no way say that we have generated a partial sort key. 4282 //result[0] = 0; 4283 //keySize = 0; 4284 //} 4285 } 4286 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 4287 UTRACE_EXIT_STATUS(status); 4288 return keySize; 4289 } 4290 4291 /* this function is called by the C++ API for sortkey generation */ 4292 U_CFUNC int32_t 4293 ucol_getSortKeyWithAllocation(const UCollator *coll, 4294 const UChar *source, int32_t sourceLength, 4295 uint8_t **pResult, 4296 UErrorCode *pErrorCode) { 4297 *pResult = 0; 4298 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode); 4299 } 4300 4301 #define UCOL_FSEC_BUF_SIZE 256 4302 4303 // Is this primary weight compressible? 4304 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). 4305 // TODO: This should use per-lead-byte flags from FractionalUCA.txt. 4306 static inline UBool 4307 isCompressible(const UCollator * /*coll*/, uint8_t primary1) { 4308 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary; 4309 } 4310 4311 /* This function tries to get the size of a sortkey. 
It will be invoked if the size of resulting buffer is 0 */ 4312 /* or if we run out of space while making a sortkey and want to return ASAP */ 4313 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) { 4314 UErrorCode status = U_ZERO_ERROR; 4315 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 4316 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4317 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4318 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4319 UBool compareIdent = (strength == UCOL_IDENTICAL); 4320 UBool doCase = (coll->caseLevel == UCOL_ON); 4321 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4322 //UBool qShifted = shifted && (compareQuad == 0); 4323 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4324 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4325 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE]; 4326 uint8_t *fSecs = fSecsBuff; 4327 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE; 4328 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL; 4329 4330 uint32_t variableTopValue = coll->variableTopValue; 4331 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4332 if(doHiragana) { 4333 UCOL_COMMON_BOT4++; 4334 /* allocate one more space for hiragana */ 4335 } 4336 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4337 4338 uint32_t order = UCOL_NO_MORE_CES; 4339 uint8_t primary1 = 0; 4340 uint8_t primary2 = 0; 4341 uint8_t secondary = 0; 4342 uint8_t tertiary = 0; 4343 int32_t caseShift = 0; 4344 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */ 4345 4346 uint8_t caseSwitch = coll->caseSwitch; 4347 uint8_t tertiaryMask = coll->tertiaryMask; 4348 uint8_t tertiaryCommon = coll->tertiaryCommon; 4349 4350 UBool wasShifted = FALSE; 4351 UBool 
notIsContinuation = FALSE; 4352 uint8_t leadPrimary = 0; 4353 4354 4355 for(;;) { 4356 order = ucol_IGetNextCE(coll, s, &status); 4357 if(order == UCOL_NO_MORE_CES) { 4358 break; 4359 } 4360 4361 if(order == 0) { 4362 continue; 4363 } 4364 4365 notIsContinuation = !isContinuation(order); 4366 4367 4368 if(notIsContinuation) { 4369 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK)); 4370 } else { 4371 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4372 } 4373 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4374 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4375 primary1 = (uint8_t)(order >> 8); 4376 4377 /* no need to permute since the actual code values don't matter 4378 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { 4379 primary1 = coll->leadBytePermutationTable[primary1]; 4380 } 4381 */ 4382 4383 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4384 || (!notIsContinuation && wasShifted))) 4385 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */ 4386 /* and other ignorables should be removed if following a shifted code point */ 4387 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4388 /* we should just completely ignore it */ 4389 continue; 4390 } 4391 if(compareQuad == 0) { 4392 if(c4 > 0) { 4393 currentSize += (c2/UCOL_BOT_COUNT4)+1; 4394 c4 = 0; 4395 } 4396 currentSize++; 4397 if(primary2 != 0) { 4398 currentSize++; 4399 } 4400 } 4401 wasShifted = TRUE; 4402 } else { 4403 wasShifted = FALSE; 4404 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. 
*/ 4405 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 4406 /* calculate sortkey size */ 4407 if(primary1 != UCOL_IGNORABLE) { 4408 if(notIsContinuation) { 4409 if(leadPrimary == primary1) { 4410 currentSize++; 4411 } else { 4412 if(leadPrimary != 0) { 4413 currentSize++; 4414 } 4415 if(primary2 == UCOL_IGNORABLE) { 4416 /* one byter, not compressed */ 4417 currentSize++; 4418 leadPrimary = 0; 4419 } else if(isCompressible(coll, primary1)) { 4420 /* compress */ 4421 leadPrimary = primary1; 4422 currentSize+=2; 4423 } else { 4424 leadPrimary = 0; 4425 currentSize+=2; 4426 } 4427 } 4428 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 4429 currentSize++; 4430 if(primary2 != UCOL_IGNORABLE) { 4431 currentSize++; 4432 } 4433 } 4434 } 4435 4436 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */ 4437 if(!isFrenchSec){ 4438 if (secondary == UCOL_COMMON2 && notIsContinuation) { 4439 c2++; 4440 } else { 4441 if(c2 > 0) { 4442 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
4443 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1; 4444 } else { 4445 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1; 4446 } 4447 c2 = 0; 4448 } 4449 currentSize++; 4450 } 4451 } else { 4452 fSecs[fSecsLen++] = secondary; 4453 if(fSecsLen == fSecsMaxLen) { 4454 uint8_t *fSecsTemp; 4455 if(fSecs == fSecsBuff) { 4456 fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen); 4457 } else { 4458 fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen); 4459 } 4460 if(fSecsTemp == NULL) { 4461 status = U_MEMORY_ALLOCATION_ERROR; 4462 return 0; 4463 } 4464 fSecs = fSecsTemp; 4465 fSecsMaxLen *= 2; 4466 } 4467 if(notIsContinuation) { 4468 if (frenchStartPtr != NULL) { 4469 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4470 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4471 frenchStartPtr = NULL; 4472 } 4473 } else { 4474 if (frenchStartPtr == NULL) { 4475 frenchStartPtr = fSecs+fSecsLen-2; 4476 } 4477 frenchEndPtr = fSecs+fSecsLen-1; 4478 } 4479 } 4480 } 4481 4482 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 4483 // do the case level if we need to do it. 
We don't want to calculate 4484 // case level for primary ignorables if we have only primary strength and case level 4485 // otherwise we would break well formedness of CEs 4486 if (caseShift == 0) { 4487 currentSize++; 4488 caseShift = UCOL_CASE_SHIFT_START; 4489 } 4490 if((tertiary&0x3F) > 0 && notIsContinuation) { 4491 caseShift--; 4492 if((tertiary &0xC0) != 0) { 4493 if (caseShift == 0) { 4494 currentSize++; 4495 caseShift = UCOL_CASE_SHIFT_START; 4496 } 4497 caseShift--; 4498 } 4499 } 4500 } else { 4501 if(notIsContinuation) { 4502 tertiary ^= caseSwitch; 4503 } 4504 } 4505 4506 tertiary &= tertiaryMask; 4507 if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */ 4508 if (tertiary == tertiaryCommon && notIsContinuation) { 4509 c3++; 4510 } else { 4511 if(c3 > 0) { 4512 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) 4513 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) { 4514 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1; 4515 } else { 4516 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1; 4517 } 4518 c3 = 0; 4519 } 4520 currentSize++; 4521 } 4522 } 4523 4524 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 4525 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 4526 if(c4>0) { // Close this part 4527 currentSize += (c4/UCOL_BOT_COUNT4)+1; 4528 c4 = 0; 4529 } 4530 currentSize++; // Add the Hiragana 4531 } else { // This wasn't Hiragana, so we can continue adding stuff 4532 c4++; 4533 } 4534 } 4535 } 4536 } 4537 4538 if(!isFrenchSec){ 4539 if(c2 > 0) { 4540 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4541 } 4542 } else { 4543 uint32_t i = 0; 4544 if(frenchStartPtr != NULL) { 4545 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4546 } 4547 for(i = 0; i<fSecsLen; i++) { 4548 secondary = *(fSecs+fSecsLen-i-1); 4549 /* This is compression code. 
*/ 4550 if (secondary == UCOL_COMMON2) { 4551 ++c2; 4552 } else { 4553 if(c2 > 0) { 4554 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4555 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0); 4556 } else { 4557 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4558 } 4559 c2 = 0; 4560 } 4561 currentSize++; 4562 } 4563 } 4564 if(c2 > 0) { 4565 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4566 } 4567 if(fSecs != fSecsBuff) { 4568 uprv_free(fSecs); 4569 } 4570 } 4571 4572 if(c3 > 0) { 4573 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0); 4574 } 4575 4576 if(c4 > 0 && compareQuad == 0) { 4577 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0); 4578 } 4579 4580 if(compareIdent) { 4581 currentSize += u_lengthOfIdenticalLevelRun(s->string, len); 4582 } 4583 return currentSize; 4584 } 4585 4586 static 4587 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) { 4588 if (caseShift == 0) { 4589 *(*cases)++ = UCOL_CASE_BYTE_START; 4590 caseShift = UCOL_CASE_SHIFT_START; 4591 } 4592 } 4593 4594 // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we 4595 // know how many values we wanted to add, even if we didn't add them all 4596 static 4597 inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) { 4598 size++; 4599 if(primaries < limit) { 4600 *(primaries)++ = value; 4601 } 4602 } 4603 4604 // Packs the secondary buffer when processing French locale. Adds the terminator. 
4605 static 4606 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) { 4607 uint8_t secondary; 4608 int32_t count2 = 0; 4609 uint32_t i = 0, size = 0; 4610 // we use i here since the key size already accounts for terminators, so we'll discard the increment 4611 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR); 4612 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */ 4613 if(frenchStartPtr != NULL) { 4614 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4615 } 4616 for(i = 0; i<*secsize; i++) { 4617 secondary = *(secondaries-i-1); 4618 /* This is compression code. */ 4619 if (secondary == UCOL_COMMON2) { 4620 ++count2; 4621 } else { 4622 if (count2 > 0) { 4623 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4624 while (count2 > UCOL_TOP_COUNT2) { 4625 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2)); 4626 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4627 } 4628 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1))); 4629 } else { 4630 while (count2 > UCOL_BOT_COUNT2) { 4631 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4632 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4633 } 4634 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4635 } 4636 count2 = 0; 4637 } 4638 addWithIncrement(primaries, primEnd, size, secondary); 4639 } 4640 } 4641 if (count2 > 0) { 4642 while (count2 > UCOL_BOT_COUNT2) { 4643 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4644 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4645 } 4646 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4647 } 4648 *secsize = size; 4649 return primaries; 4650 } 4651 4652 #define 
DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 4653 4654 /* This is the sortkey work horse function */ 4655 U_CFUNC int32_t U_CALLCONV 4656 ucol_calcSortKey(const UCollator *coll, 4657 const UChar *source, 4658 int32_t sourceLength, 4659 uint8_t **result, 4660 uint32_t resultLength, 4661 UBool allocateSKBuffer, 4662 UErrorCode *status) 4663 { 4664 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 4665 4666 uint32_t i = 0; /* general purpose counter */ 4667 4668 /* Stack allocated buffers for buffers we use */ 4669 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER]; 4670 4671 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad; 4672 4673 if(U_FAILURE(*status)) { 4674 return 0; 4675 } 4676 4677 if(primaries == NULL && allocateSKBuffer == TRUE) { 4678 primaries = *result = prim; 4679 resultLength = UCOL_PRIMARY_MAX_BUFFER; 4680 } 4681 4682 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER, 4683 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER; 4684 4685 uint32_t sortKeySize = 1; /* it is always \0 terminated */ 4686 4687 UnicodeString normSource; 4688 4689 int32_t len = (sourceLength == -1 ? 
u_strlen(source) : sourceLength); 4690 4691 UColAttributeValue strength = coll->strength; 4692 4693 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4694 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4695 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4696 UBool compareIdent = (strength == UCOL_IDENTICAL); 4697 UBool doCase = (coll->caseLevel == UCOL_ON); 4698 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4699 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4700 //UBool qShifted = shifted && (compareQuad == 0); 4701 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4702 4703 uint32_t variableTopValue = coll->variableTopValue; 4704 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no 4705 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. 4706 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4707 uint8_t UCOL_HIRAGANA_QUAD = 0; 4708 if(doHiragana) { 4709 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; 4710 /* allocate one more space for hiragana, value for hiragana */ 4711 } 4712 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4713 4714 /* support for special features like caselevel and funky secondaries */ 4715 uint8_t *frenchStartPtr = NULL; 4716 uint8_t *frenchEndPtr = NULL; 4717 uint32_t caseShift = 0; 4718 4719 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0)); 4720 4721 /* If we need to normalize, we'll do it all at once at the beginning! 
*/ 4722 const Normalizer2 *norm2; 4723 if(compareIdent) { 4724 norm2 = Normalizer2Factory::getNFDInstance(*status); 4725 } else if(coll->normalizationMode != UCOL_OFF) { 4726 norm2 = Normalizer2Factory::getFCDInstance(*status); 4727 } else { 4728 norm2 = NULL; 4729 } 4730 if(norm2 != NULL) { 4731 normSource.setTo(FALSE, source, len); 4732 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 4733 if(qcYesLength != len) { 4734 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 4735 normSource.truncate(qcYesLength); 4736 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 4737 source = normSource.getBuffer(); 4738 len = normSource.length(); 4739 } 4740 } 4741 collIterate s; 4742 IInit_collIterate(coll, source, len, &s, status); 4743 if(U_FAILURE(*status)) { 4744 return 0; 4745 } 4746 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 4747 4748 if(resultLength == 0 || primaries == NULL) { 4749 return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); 4750 } 4751 uint8_t *primarySafeEnd = primaries + resultLength - 1; 4752 if(strength > UCOL_PRIMARY) { 4753 primarySafeEnd--; 4754 } 4755 4756 uint32_t minBufferSize = UCOL_MAX_BUFFER; 4757 4758 uint8_t *primStart = primaries; 4759 uint8_t *secStart = secondaries; 4760 uint8_t *terStart = tertiaries; 4761 uint8_t *caseStart = cases; 4762 uint8_t *quadStart = quads; 4763 4764 uint32_t order = 0; 4765 4766 uint8_t primary1 = 0; 4767 uint8_t primary2 = 0; 4768 uint8_t secondary = 0; 4769 uint8_t tertiary = 0; 4770 uint8_t caseSwitch = coll->caseSwitch; 4771 uint8_t tertiaryMask = coll->tertiaryMask; 4772 int8_t tertiaryAddition = coll->tertiaryAddition; 4773 uint8_t tertiaryTop = coll->tertiaryTop; 4774 uint8_t tertiaryBottom = coll->tertiaryBottom; 4775 uint8_t tertiaryCommon = coll->tertiaryCommon; 4776 uint8_t caseBits = 0; 4777 4778 UBool finished = FALSE; 4779 UBool wasShifted = FALSE; 4780 UBool notIsContinuation = FALSE; 4781 4782 
uint32_t prevBuffSize = 0; 4783 4784 uint32_t count2 = 0, count3 = 0, count4 = 0; 4785 uint8_t leadPrimary = 0; 4786 4787 for(;;) { 4788 for(i=prevBuffSize; i<minBufferSize; ++i) { 4789 4790 order = ucol_IGetNextCE(coll, &s, status); 4791 if(order == UCOL_NO_MORE_CES) { 4792 finished = TRUE; 4793 break; 4794 } 4795 4796 if(order == 0) { 4797 continue; 4798 } 4799 4800 notIsContinuation = !isContinuation(order); 4801 4802 if(notIsContinuation) { 4803 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); 4804 } else { 4805 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4806 } 4807 4808 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4809 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4810 primary1 = (uint8_t)(order >> 8); 4811 4812 uint8_t originalPrimary1 = primary1; 4813 if(notIsContinuation && coll->leadBytePermutationTable != NULL) { 4814 primary1 = coll->leadBytePermutationTable[primary1]; 4815 } 4816 4817 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4818 || (!notIsContinuation && wasShifted))) 4819 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 4820 { 4821 /* and other ignorables should be removed if following a shifted code point */ 4822 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4823 /* we should just completely ignore it */ 4824 continue; 4825 } 4826 if(compareQuad == 0) { 4827 if(count4 > 0) { 4828 while (count4 > UCOL_BOT_COUNT4) { 4829 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4830 count4 -= UCOL_BOT_COUNT4; 4831 } 4832 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 4833 count4 = 0; 4834 } 4835 /* We are dealing with a variable and we're treating them as shifted */ 4836 /* This is a shifted ignorable */ 4837 if(primary1 != 0) { /* we need to check this since we could be in continuation */ 4838 *quads++ = primary1; 4839 } 4840 if(primary2 != 0) { 4841 *quads++ = primary2; 4842 } 4843 } 
4844 wasShifted = TRUE; 4845 } else { 4846 wasShifted = FALSE; 4847 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 4848 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 4849 /* regular and simple sortkey calc */ 4850 if(primary1 != UCOL_IGNORABLE) { 4851 if(notIsContinuation) { 4852 if(leadPrimary == primary1) { 4853 *primaries++ = primary2; 4854 } else { 4855 if(leadPrimary != 0) { 4856 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 4857 } 4858 if(primary2 == UCOL_IGNORABLE) { 4859 /* one byter, not compressed */ 4860 *primaries++ = primary1; 4861 leadPrimary = 0; 4862 } else if(isCompressible(coll, originalPrimary1)) { 4863 /* compress */ 4864 *primaries++ = leadPrimary = primary1; 4865 if(primaries <= primarySafeEnd) { 4866 *primaries++ = primary2; 4867 } 4868 } else { 4869 leadPrimary = 0; 4870 *primaries++ = primary1; 4871 if(primaries <= primarySafeEnd) { 4872 *primaries++ = primary2; 4873 } 4874 } 4875 } 4876 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 4877 *primaries++ = primary1; 4878 if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) { 4879 *primaries++ = primary2; /* second part */ 4880 } 4881 } 4882 } 4883 4884 if(secondary > compareSec) { 4885 if(!isFrenchSec) { 4886 /* This is compression code. */ 4887 if (secondary == UCOL_COMMON2 && notIsContinuation) { 4888 ++count2; 4889 } else { 4890 if (count2 > 0) { 4891 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
4892 while (count2 > UCOL_TOP_COUNT2) { 4893 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 4894 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4895 } 4896 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); 4897 } else { 4898 while (count2 > UCOL_BOT_COUNT2) { 4899 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 4900 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4901 } 4902 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 4903 } 4904 count2 = 0; 4905 } 4906 *secondaries++ = secondary; 4907 } 4908 } else { 4909 *secondaries++ = secondary; 4910 /* Do the special handling for French secondaries */ 4911 /* We need to get continuation elements and do intermediate restore */ 4912 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ 4913 if(notIsContinuation) { 4914 if (frenchStartPtr != NULL) { 4915 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4916 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4917 frenchStartPtr = NULL; 4918 } 4919 } else { 4920 if (frenchStartPtr == NULL) { 4921 frenchStartPtr = secondaries - 2; 4922 } 4923 frenchEndPtr = secondaries-1; 4924 } 4925 } 4926 } 4927 4928 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 4929 // do the case level if we need to do it. 
We don't want to calculate 4930 // case level for primary ignorables if we have only primary strength and case level 4931 // otherwise we would break well formedness of CEs 4932 doCaseShift(&cases, caseShift); 4933 if(notIsContinuation) { 4934 caseBits = (uint8_t)(tertiary & 0xC0); 4935 4936 if(tertiary != 0) { 4937 if(coll->caseFirst == UCOL_UPPER_FIRST) { 4938 if((caseBits & 0xC0) == 0) { 4939 *(cases-1) |= 1 << (--caseShift); 4940 } else { 4941 *(cases-1) |= 0 << (--caseShift); 4942 /* second bit */ 4943 doCaseShift(&cases, caseShift); 4944 *(cases-1) |= ((caseBits>>6)&1) << (--caseShift); 4945 } 4946 } else { 4947 if((caseBits & 0xC0) == 0) { 4948 *(cases-1) |= 0 << (--caseShift); 4949 } else { 4950 *(cases-1) |= 1 << (--caseShift); 4951 /* second bit */ 4952 doCaseShift(&cases, caseShift); 4953 *(cases-1) |= ((caseBits>>7)&1) << (--caseShift); 4954 } 4955 } 4956 } 4957 4958 } 4959 } else { 4960 if(notIsContinuation) { 4961 tertiary ^= caseSwitch; 4962 } 4963 } 4964 4965 tertiary &= tertiaryMask; 4966 if(tertiary > compareTer) { 4967 /* This is compression code. 
*/ 4968 /* sequence size check is included in the if clause */ 4969 if (tertiary == tertiaryCommon && notIsContinuation) { 4970 ++count3; 4971 } else { 4972 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 4973 tertiary += tertiaryAddition; 4974 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 4975 tertiary -= tertiaryAddition; 4976 } 4977 if (count3 > 0) { 4978 if ((tertiary > tertiaryCommon)) { 4979 while (count3 > coll->tertiaryTopCount) { 4980 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 4981 count3 -= (uint32_t)coll->tertiaryTopCount; 4982 } 4983 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); 4984 } else { 4985 while (count3 > coll->tertiaryBottomCount) { 4986 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 4987 count3 -= (uint32_t)coll->tertiaryBottomCount; 4988 } 4989 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 4990 } 4991 count3 = 0; 4992 } 4993 *tertiaries++ = tertiary; 4994 } 4995 } 4996 4997 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 4998 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 4999 if(count4>0) { // Close this part 5000 while (count4 > UCOL_BOT_COUNT4) { 5001 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 5002 count4 -= UCOL_BOT_COUNT4; 5003 } 5004 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 5005 count4 = 0; 5006 } 5007 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana 5008 } else { // This wasn't Hiragana, so we can continue adding stuff 5009 count4++; 5010 } 5011 } 5012 } 5013 5014 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ 5015 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ 5016 IInit_collIterate(coll, (UChar *)source, len, &s, status); 5017 if(U_FAILURE(*status)) { 5018 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5019 finished = TRUE; 5020 break; 5021 } 5022 s.flags &= 
~UCOL_ITER_NORM; 5023 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); 5024 *status = U_BUFFER_OVERFLOW_ERROR; 5025 finished = TRUE; 5026 break; 5027 } else { /* It's much nicer if we can actually reallocate */ 5028 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart)); 5029 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); 5030 if(U_SUCCESS(*status)) { 5031 *result = primStart; 5032 primarySafeEnd = primStart + resultLength - 1; 5033 if(strength > UCOL_PRIMARY) { 5034 primarySafeEnd--; 5035 } 5036 } else { 5037 /* We ran out of memory!? We can't recover. */ 5038 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5039 finished = TRUE; 5040 break; 5041 } 5042 } 5043 } 5044 } 5045 if(finished) { 5046 break; 5047 } else { 5048 prevBuffSize = minBufferSize; 5049 5050 uint32_t frenchStartOffset = 0, frenchEndOffset = 0; 5051 if (frenchStartPtr != NULL) { 5052 frenchStartOffset = (uint32_t)(frenchStartPtr - secStart); 5053 frenchEndOffset = (uint32_t)(frenchEndPtr - secStart); 5054 } 5055 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); 5056 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); 5057 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status); 5058 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status); 5059 if(U_FAILURE(*status)) { 5060 /* We ran out of memory!? We can't recover. 
*/ 5061 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5062 break; 5063 } 5064 if (frenchStartPtr != NULL) { 5065 frenchStartPtr = secStart + frenchStartOffset; 5066 frenchEndPtr = secStart + frenchEndOffset; 5067 } 5068 minBufferSize *= 2; 5069 } 5070 } 5071 5072 /* Here, we are generally done with processing */ 5073 /* bailing out would not be too productive */ 5074 5075 if(U_SUCCESS(*status)) { 5076 sortKeySize += (uint32_t)(primaries - primStart); 5077 /* we have done all the CE's, now let's put them together to form a key */ 5078 if(compareSec == 0) { 5079 if (count2 > 0) { 5080 while (count2 > UCOL_BOT_COUNT2) { 5081 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5082 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5083 } 5084 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5085 } 5086 uint32_t secsize = (uint32_t)(secondaries-secStart); 5087 if(!isFrenchSec) { // Regular situation, we know the length of secondaries 5088 sortKeySize += secsize; 5089 if(sortKeySize <= resultLength) { 5090 *(primaries++) = UCOL_LEVELTERMINATOR; 5091 uprv_memcpy(primaries, secStart, secsize); 5092 primaries += secsize; 5093 } else { 5094 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ 5095 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5096 if(U_SUCCESS(*status)) { 5097 *result = primStart; 5098 *(primaries++) = UCOL_LEVELTERMINATOR; 5099 uprv_memcpy(primaries, secStart, secsize); 5100 primaries += secsize; 5101 } 5102 else { 5103 /* We ran out of memory!? We can't recover. */ 5104 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5105 goto cleanup; 5106 } 5107 } else { 5108 *status = U_BUFFER_OVERFLOW_ERROR; 5109 } 5110 } 5111 } else { // French secondary is on. We will need to pack French. 
packFrench will add the level terminator 5112 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); 5113 sortKeySize += secsize; 5114 if(sortKeySize <= resultLength) { // if we managed to pack fine 5115 primaries = newPrim; // update the primary pointer 5116 } else { // overflow, need to reallocate and redo 5117 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ 5118 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5119 if(U_SUCCESS(*status)) { 5120 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); 5121 } 5122 else { 5123 /* We ran out of memory!? We can't recover. */ 5124 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5125 goto cleanup; 5126 } 5127 } else { 5128 *status = U_BUFFER_OVERFLOW_ERROR; 5129 } 5130 } 5131 } 5132 } 5133 5134 if(doCase) { 5135 uint32_t casesize = (uint32_t)(cases - caseStart); 5136 sortKeySize += casesize; 5137 if(sortKeySize <= resultLength) { 5138 *(primaries++) = UCOL_LEVELTERMINATOR; 5139 uprv_memcpy(primaries, caseStart, casesize); 5140 primaries += casesize; 5141 } else { 5142 if(allocateSKBuffer == TRUE) { 5143 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5144 if(U_SUCCESS(*status)) { 5145 *result = primStart; 5146 *(primaries++) = UCOL_LEVELTERMINATOR; 5147 uprv_memcpy(primaries, caseStart, casesize); 5148 } 5149 else { 5150 /* We ran out of memory!? We can't recover. 
*/ 5151 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5152 goto cleanup; 5153 } 5154 } else { 5155 *status = U_BUFFER_OVERFLOW_ERROR; 5156 } 5157 } 5158 } 5159 5160 if(compareTer == 0) { 5161 if (count3 > 0) { 5162 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { 5163 while (count3 >= coll->tertiaryTopCount) { 5164 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5165 count3 -= (uint32_t)coll->tertiaryTopCount; 5166 } 5167 *tertiaries++ = (uint8_t)(tertiaryTop - count3); 5168 } else { 5169 while (count3 > coll->tertiaryBottomCount) { 5170 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5171 count3 -= (uint32_t)coll->tertiaryBottomCount; 5172 } 5173 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5174 } 5175 } 5176 uint32_t tersize = (uint32_t)(tertiaries - terStart); 5177 sortKeySize += tersize; 5178 if(sortKeySize <= resultLength) { 5179 *(primaries++) = UCOL_LEVELTERMINATOR; 5180 uprv_memcpy(primaries, terStart, tersize); 5181 primaries += tersize; 5182 } else { 5183 if(allocateSKBuffer == TRUE) { 5184 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5185 if(U_SUCCESS(*status)) { 5186 *result = primStart; 5187 *(primaries++) = UCOL_LEVELTERMINATOR; 5188 uprv_memcpy(primaries, terStart, tersize); 5189 } 5190 else { 5191 /* We ran out of memory!? We can't recover. 
*/ 5192 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5193 goto cleanup; 5194 } 5195 } else { 5196 *status = U_BUFFER_OVERFLOW_ERROR; 5197 } 5198 } 5199 5200 if(compareQuad == 0/*qShifted == TRUE*/) { 5201 if(count4 > 0) { 5202 while (count4 > UCOL_BOT_COUNT4) { 5203 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 5204 count4 -= UCOL_BOT_COUNT4; 5205 } 5206 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 5207 } 5208 uint32_t quadsize = (uint32_t)(quads - quadStart); 5209 sortKeySize += quadsize; 5210 if(sortKeySize <= resultLength) { 5211 *(primaries++) = UCOL_LEVELTERMINATOR; 5212 uprv_memcpy(primaries, quadStart, quadsize); 5213 primaries += quadsize; 5214 } else { 5215 if(allocateSKBuffer == TRUE) { 5216 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5217 if(U_SUCCESS(*status)) { 5218 *result = primStart; 5219 *(primaries++) = UCOL_LEVELTERMINATOR; 5220 uprv_memcpy(primaries, quadStart, quadsize); 5221 } 5222 else { 5223 /* We ran out of memory!? We can't recover. */ 5224 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5225 goto cleanup; 5226 } 5227 } else { 5228 *status = U_BUFFER_OVERFLOW_ERROR; 5229 } 5230 } 5231 } 5232 5233 if(compareIdent) { 5234 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len); 5235 if(sortKeySize <= resultLength) { 5236 *(primaries++) = UCOL_LEVELTERMINATOR; 5237 primaries += u_writeIdenticalLevelRun(s.string, len, primaries); 5238 } else { 5239 if(allocateSKBuffer == TRUE) { 5240 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status); 5241 if(U_SUCCESS(*status)) { 5242 *result = primStart; 5243 *(primaries++) = UCOL_LEVELTERMINATOR; 5244 u_writeIdenticalLevelRun(s.string, len, primaries); 5245 } 5246 else { 5247 /* We ran out of memory!? We can't recover. 
*/ 5248 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5249 goto cleanup; 5250 } 5251 } else { 5252 *status = U_BUFFER_OVERFLOW_ERROR; 5253 } 5254 } 5255 } 5256 } 5257 *(primaries++) = '\0'; 5258 } 5259 5260 if(allocateSKBuffer == TRUE) { 5261 *result = (uint8_t*)uprv_malloc(sortKeySize); 5262 /* test for NULL */ 5263 if (*result == NULL) { 5264 *status = U_MEMORY_ALLOCATION_ERROR; 5265 goto cleanup; 5266 } 5267 uprv_memcpy(*result, primStart, sortKeySize); 5268 if(primStart != prim) { 5269 uprv_free(primStart); 5270 } 5271 } 5272 5273 cleanup: 5274 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { 5275 /* NULL terminate for safety */ 5276 **result = 0; 5277 } 5278 if(terStart != tert) { 5279 uprv_free(terStart); 5280 uprv_free(secStart); 5281 uprv_free(caseStart); 5282 uprv_free(quadStart); 5283 } 5284 5285 /* To avoid memory leak, free the offset buffer if necessary. */ 5286 ucol_freeOffsetBuffer(&s); 5287 5288 return sortKeySize; 5289 } 5290 5291 5292 U_CFUNC int32_t U_CALLCONV 5293 ucol_calcSortKeySimpleTertiary(const UCollator *coll, 5294 const UChar *source, 5295 int32_t sourceLength, 5296 uint8_t **result, 5297 uint32_t resultLength, 5298 UBool allocateSKBuffer, 5299 UErrorCode *status) 5300 { 5301 U_ALIGN_CODE(16); 5302 5303 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 5304 uint32_t i = 0; /* general purpose counter */ 5305 5306 /* Stack allocated buffers for buffers we use */ 5307 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER]; 5308 5309 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert; 5310 5311 if(U_FAILURE(*status)) { 5312 return 0; 5313 } 5314 5315 if(primaries == NULL && allocateSKBuffer == TRUE) { 5316 primaries = *result = prim; 5317 resultLength = UCOL_PRIMARY_MAX_BUFFER; 5318 } 5319 5320 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = 
UCOL_TERTIARY_MAX_BUFFER; 5321 5322 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */ 5323 5324 UnicodeString normSource; 5325 5326 int32_t len = sourceLength; 5327 5328 /* If we need to normalize, we'll do it all at once at the beginning! */ 5329 if(coll->normalizationMode != UCOL_OFF) { 5330 normSource.setTo(len < 0, source, len); 5331 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); 5332 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 5333 if(qcYesLength != normSource.length()) { 5334 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 5335 normSource.truncate(qcYesLength); 5336 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 5337 source = normSource.getBuffer(); 5338 len = normSource.length(); 5339 } 5340 } 5341 collIterate s; 5342 IInit_collIterate(coll, (UChar *)source, len, &s, status); 5343 if(U_FAILURE(*status)) { 5344 return 0; 5345 } 5346 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 
5347 5348 if(resultLength == 0 || primaries == NULL) { 5349 return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); 5350 } 5351 5352 uint8_t *primarySafeEnd = primaries + resultLength - 2; 5353 5354 uint32_t minBufferSize = UCOL_MAX_BUFFER; 5355 5356 uint8_t *primStart = primaries; 5357 uint8_t *secStart = secondaries; 5358 uint8_t *terStart = tertiaries; 5359 5360 uint32_t order = 0; 5361 5362 uint8_t primary1 = 0; 5363 uint8_t primary2 = 0; 5364 uint8_t secondary = 0; 5365 uint8_t tertiary = 0; 5366 uint8_t caseSwitch = coll->caseSwitch; 5367 uint8_t tertiaryMask = coll->tertiaryMask; 5368 int8_t tertiaryAddition = coll->tertiaryAddition; 5369 uint8_t tertiaryTop = coll->tertiaryTop; 5370 uint8_t tertiaryBottom = coll->tertiaryBottom; 5371 uint8_t tertiaryCommon = coll->tertiaryCommon; 5372 5373 uint32_t prevBuffSize = 0; 5374 5375 UBool finished = FALSE; 5376 UBool notIsContinuation = FALSE; 5377 5378 uint32_t count2 = 0, count3 = 0; 5379 uint8_t leadPrimary = 0; 5380 5381 for(;;) { 5382 for(i=prevBuffSize; i<minBufferSize; ++i) { 5383 5384 order = ucol_IGetNextCE(coll, &s, status); 5385 5386 if(order == 0) { 5387 continue; 5388 } 5389 5390 if(order == UCOL_NO_MORE_CES) { 5391 finished = TRUE; 5392 break; 5393 } 5394 5395 notIsContinuation = !isContinuation(order); 5396 5397 if(notIsContinuation) { 5398 tertiary = (uint8_t)((order & tertiaryMask)); 5399 } else { 5400 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 5401 } 5402 5403 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5404 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5405 primary1 = (uint8_t)(order >> 8); 5406 5407 uint8_t originalPrimary1 = primary1; 5408 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { 5409 primary1 = coll->leadBytePermutationTable[primary1]; 5410 } 5411 5412 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. 
*/ 5413 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 5414 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ 5415 /* regular and simple sortkey calc */ 5416 if(primary1 != UCOL_IGNORABLE) { 5417 if(notIsContinuation) { 5418 if(leadPrimary == primary1) { 5419 *primaries++ = primary2; 5420 } else { 5421 if(leadPrimary != 0) { 5422 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 5423 } 5424 if(primary2 == UCOL_IGNORABLE) { 5425 /* one byter, not compressed */ 5426 *primaries++ = primary1; 5427 leadPrimary = 0; 5428 } else if(isCompressible(coll, originalPrimary1)) { 5429 /* compress */ 5430 *primaries++ = leadPrimary = primary1; 5431 *primaries++ = primary2; 5432 } else { 5433 leadPrimary = 0; 5434 *primaries++ = primary1; 5435 *primaries++ = primary2; 5436 } 5437 } 5438 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 5439 *primaries++ = primary1; 5440 if(primary2 != UCOL_IGNORABLE) { 5441 *primaries++ = primary2; /* second part */ 5442 } 5443 } 5444 } 5445 5446 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ 5447 /* This is compression code. */ 5448 if (secondary == UCOL_COMMON2 && notIsContinuation) { 5449 ++count2; 5450 } else { 5451 if (count2 > 0) { 5452 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
5453 while (count2 > UCOL_TOP_COUNT2) { 5454 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 5455 count2 -= (uint32_t)UCOL_TOP_COUNT2; 5456 } 5457 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); 5458 } else { 5459 while (count2 > UCOL_BOT_COUNT2) { 5460 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5461 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5462 } 5463 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5464 } 5465 count2 = 0; 5466 } 5467 *secondaries++ = secondary; 5468 } 5469 } 5470 5471 if(notIsContinuation) { 5472 tertiary ^= caseSwitch; 5473 } 5474 5475 if(tertiary > 0) { 5476 /* This is compression code. */ 5477 /* sequence size check is included in the if clause */ 5478 if (tertiary == tertiaryCommon && notIsContinuation) { 5479 ++count3; 5480 } else { 5481 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 5482 tertiary += tertiaryAddition; 5483 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 5484 tertiary -= tertiaryAddition; 5485 } 5486 if (count3 > 0) { 5487 if ((tertiary > tertiaryCommon)) { 5488 while (count3 > coll->tertiaryTopCount) { 5489 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5490 count3 -= (uint32_t)coll->tertiaryTopCount; 5491 } 5492 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); 5493 } else { 5494 while (count3 > coll->tertiaryBottomCount) { 5495 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5496 count3 -= (uint32_t)coll->tertiaryBottomCount; 5497 } 5498 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5499 } 5500 count3 = 0; 5501 } 5502 *tertiaries++ = tertiary; 5503 } 5504 } 5505 5506 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ 5507 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ 5508 IInit_collIterate(coll, (UChar *)source, len, &s, status); 5509 if(U_FAILURE(*status)) { 5510 sortKeySize = 
DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5511 finished = TRUE; 5512 break; 5513 } 5514 s.flags &= ~UCOL_ITER_NORM; 5515 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); 5516 *status = U_BUFFER_OVERFLOW_ERROR; 5517 finished = TRUE; 5518 break; 5519 } else { /* It's much nicer if we can actually reallocate */ 5520 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)); 5521 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); 5522 if(U_SUCCESS(*status)) { 5523 *result = primStart; 5524 primarySafeEnd = primStart + resultLength - 2; 5525 } else { 5526 /* We ran out of memory!? We can't recover. */ 5527 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5528 finished = TRUE; 5529 break; 5530 } 5531 } 5532 } 5533 } 5534 if(finished) { 5535 break; 5536 } else { 5537 prevBuffSize = minBufferSize; 5538 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); 5539 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); 5540 minBufferSize *= 2; 5541 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size 5542 /* We ran out of memory!? We can't recover. 
*/ 5543 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5544 break; 5545 } 5546 } 5547 } 5548 5549 if(U_SUCCESS(*status)) { 5550 sortKeySize += (uint32_t)(primaries - primStart); 5551 /* we have done all the CE's, now let's put them together to form a key */ 5552 if (count2 > 0) { 5553 while (count2 > UCOL_BOT_COUNT2) { 5554 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5555 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5556 } 5557 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5558 } 5559 uint32_t secsize = (uint32_t)(secondaries-secStart); 5560 sortKeySize += secsize; 5561 if(sortKeySize <= resultLength) { 5562 *(primaries++) = UCOL_LEVELTERMINATOR; 5563 uprv_memcpy(primaries, secStart, secsize); 5564 primaries += secsize; 5565 } else { 5566 if(allocateSKBuffer == TRUE) { 5567 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5568 if(U_SUCCESS(*status)) { 5569 *(primaries++) = UCOL_LEVELTERMINATOR; 5570 *result = primStart; 5571 uprv_memcpy(primaries, secStart, secsize); 5572 } 5573 else { 5574 /* We ran out of memory!? We can't recover. 
*/ 5575 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5576 goto cleanup; 5577 } 5578 } else { 5579 *status = U_BUFFER_OVERFLOW_ERROR; 5580 } 5581 } 5582 5583 if (count3 > 0) { 5584 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { 5585 while (count3 >= coll->tertiaryTopCount) { 5586 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5587 count3 -= (uint32_t)coll->tertiaryTopCount; 5588 } 5589 *tertiaries++ = (uint8_t)(tertiaryTop - count3); 5590 } else { 5591 while (count3 > coll->tertiaryBottomCount) { 5592 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5593 count3 -= (uint32_t)coll->tertiaryBottomCount; 5594 } 5595 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5596 } 5597 } 5598 uint32_t tersize = (uint32_t)(tertiaries - terStart); 5599 sortKeySize += tersize; 5600 if(sortKeySize <= resultLength) { 5601 *(primaries++) = UCOL_LEVELTERMINATOR; 5602 uprv_memcpy(primaries, terStart, tersize); 5603 primaries += tersize; 5604 } else { 5605 if(allocateSKBuffer == TRUE) { 5606 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5607 if(U_SUCCESS(*status)) { 5608 *result = primStart; 5609 *(primaries++) = UCOL_LEVELTERMINATOR; 5610 uprv_memcpy(primaries, terStart, tersize); 5611 } 5612 else { 5613 /* We ran out of memory!? We can't recover. 
*/ 5614 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5615 goto cleanup; 5616 } 5617 } else { 5618 *status = U_BUFFER_OVERFLOW_ERROR; 5619 } 5620 } 5621 5622 *(primaries++) = '\0'; 5623 } 5624 5625 if(allocateSKBuffer == TRUE) { 5626 *result = (uint8_t*)uprv_malloc(sortKeySize); 5627 /* test for NULL */ 5628 if (*result == NULL) { 5629 *status = U_MEMORY_ALLOCATION_ERROR; 5630 goto cleanup; 5631 } 5632 uprv_memcpy(*result, primStart, sortKeySize); 5633 if(primStart != prim) { 5634 uprv_free(primStart); 5635 } 5636 } 5637 5638 cleanup: 5639 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { 5640 /* NULL terminate for safety */ 5641 **result = 0; 5642 } 5643 if(terStart != tert) { 5644 uprv_free(terStart); 5645 uprv_free(secStart); 5646 } 5647 5648 /* To avoid memory leak, free the offset buffer if necessary. */ 5649 ucol_freeOffsetBuffer(&s); 5650 5651 return sortKeySize; 5652 } 5653 5654 static inline 5655 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { 5656 UBool notIsContinuation = !isContinuation(CE); 5657 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); 5658 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) 5659 || (!notIsContinuation && *wasShifted))) 5660 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 5661 { 5662 // The stuff below should probably be in the sortkey code... maybe not... 
5663 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ 5664 /* we should just completely ignore it */ 5665 *wasShifted = TRUE; 5666 //continue; 5667 } 5668 //*wasShifted = TRUE; 5669 return TRUE; 5670 } else { 5671 *wasShifted = FALSE; 5672 return FALSE; 5673 } 5674 } 5675 static inline 5676 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { 5677 if(level < maxLevel) { 5678 dest[i++] = UCOL_LEVELTERMINATOR; 5679 } else { 5680 dest[i++] = 0; 5681 } 5682 } 5683 5684 /** enumeration of level identifiers for partial sort key generation */ 5685 enum { 5686 UCOL_PSK_PRIMARY = 0, 5687 UCOL_PSK_SECONDARY = 1, 5688 UCOL_PSK_CASE = 2, 5689 UCOL_PSK_TERTIARY = 3, 5690 UCOL_PSK_QUATERNARY = 4, 5691 UCOL_PSK_QUIN = 5, /** This is an extra level, not used - but we have three bits to blow */ 5692 UCOL_PSK_IDENTICAL = 6, 5693 UCOL_PSK_NULL = 7, /** level for the end of sort key. Will just produce zeros */ 5694 UCOL_PSK_LIMIT 5695 }; 5696 5697 /** collation state enum. *_SHIFT value is how much to shift right 5698 * to get the state piece to the right. *_MASK value should be 5699 * ANDed with the shifted state. This data is stored in state[1] 5700 * field. 5701 */ 5702 enum { 5703 UCOL_PSK_LEVEL_SHIFT = 0, /** level identificator. 
stores an enum value from above */ 5704 UCOL_PSK_LEVEL_MASK = 7, /** three bits */ 5705 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */ 5706 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1, 5707 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary 5708 * This field is also used to denote that the French secondary level is finished 5709 */ 5710 UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */ 5711 UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */ 5712 UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */ 5713 UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */ 5714 /** When we do French we need to reverse secondary values. However, continuations 5715 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba 5716 */ 5717 UCOL_PSK_BOCSU_BYTES_SHIFT = 7, 5718 UCOL_PSK_BOCSU_BYTES_MASK = 3, 5719 UCOL_PSK_CONSUMED_CES_SHIFT = 9, 5720 UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF 5721 }; 5722 5723 // macro calculating the number of expansion CEs available 5724 #define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn 5725 5726 5727 /** main sortkey part procedure. On the first call, 5728 * you should pass in a collator, an iterator, empty state 5729 * state[0] == state[1] == 0, a buffer to hold results 5730 * number of bytes you need and an error code pointer. 5731 * Make sure your buffer is big enough to hold the wanted 5732 * number of sortkey bytes. I don't check. 5733 * The only meaningful status you can get back is 5734 * U_BUFFER_OVERFLOW_ERROR, which basically means that you 5735 * have been dealt a raw deal and that you probably won't 5736 * be able to use partial sortkey generation for this 5737 * particular combination of string and collator. This 5738 * is highly unlikely, but you should still check the error code. 
 * Any other status means that you're not in a sane situation
 * anymore. After the first call, preserve state values and
 * use them on subsequent calls to obtain more bytes of a sortkey.
 * Use until the number of bytes written is smaller than the requested
 * number of bytes. Generated sortkey is not compatible with the
 * one generated by ucol_getSortKey, as we don't do any compression.
 * However, levels are still terminated by a 1 (one) and the sortkey
 * is terminated by a 0 (zero). Identical level is the same as in the
 * regular sortkey - internal bocu-1 implementation is used.
 * For the curious, although you cannot do much about this, here is
 * the structure of the state words.
 * state[0] - iterator state. Depends on the iterator implementation,
 *            but allows the iterator to continue where it stopped in
 *            the last iteration.
 * state[1] - collation processing state. Here is the distribution
 *            of the bits:
 *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary,
 *             quaternary, quin (we don't use this one), identical and
 *             null (producing only zeros - the first one to terminate the
 *             sortkey and the subsequent ones to fill the buffer).
 *   3       - byte count. Number of bytes already written from a primary
 *             or quaternary weight; on the secondary level the same bit
 *             records that the French secondary is finished.
 *   4       - was shifted. Whether the previous iteration finished in the
 *             shifted state.
 *   5, 6    - French continuation bytes written. See the comment in the enum.
 *   7, 8    - Bocsu bytes used. Number of bytes from a bocu sequence on
 *             the identical level.
 *   9..27   - CEs consumed (19 bits, see UCOL_PSK_CONSUMED_CES_MASK). Number
 *             of getCE or next32 operations performed since the last
 *             successful update of the iterator state.
5767 */ 5768 U_CAPI int32_t U_EXPORT2 5769 ucol_nextSortKeyPart(const UCollator *coll, 5770 UCharIterator *iter, 5771 uint32_t state[2], 5772 uint8_t *dest, int32_t count, 5773 UErrorCode *status) 5774 { 5775 /* error checking */ 5776 if(status==NULL || U_FAILURE(*status)) { 5777 return 0; 5778 } 5779 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 5780 if( coll==NULL || iter==NULL || 5781 state==NULL || 5782 count<0 || (count>0 && dest==NULL) 5783 ) { 5784 *status=U_ILLEGAL_ARGUMENT_ERROR; 5785 UTRACE_EXIT_STATUS(status); 5786 return 0; 5787 } 5788 5789 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 5790 coll, iter, state[0], state[1], dest, count); 5791 5792 if(count==0) { 5793 /* nothing to do */ 5794 UTRACE_EXIT_VALUE(0); 5795 return 0; 5796 } 5797 /** Setting up situation according to the state we got from the previous iteration */ 5798 // The state of the iterator from the previous invocation 5799 uint32_t iterState = state[0]; 5800 // Has the last iteration ended in the shifted state 5801 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE; 5802 // What is the current level of the sortkey? 5803 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; 5804 // Have we written only one byte from a two byte primary in the previous iteration? 5805 // Also on secondary level - have we finished with the French secondary? 5806 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; 5807 // number of bytes in the continuation buffer for French 5808 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; 5809 // Number of bytes already written from a bocsu sequence. Since 5810 // the longes bocsu sequence is 4 long, this can be up to 3. 
5811 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK; 5812 // Number of elements that need to be consumed in this iteration because 5813 // the iterator returned UITER_NO_STATE at the end of the last iteration, 5814 // so we had to save the last valid state. 5815 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK; 5816 5817 /** values that depend on the collator attributes */ 5818 // strength of the collator. 5819 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); 5820 // maximal level of the partial sortkey. Need to take whether case level is done 5821 int32_t maxLevel = 0; 5822 if(strength < UCOL_TERTIARY) { 5823 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 5824 maxLevel = UCOL_PSK_CASE; 5825 } else { 5826 maxLevel = strength; 5827 } 5828 } else { 5829 if(strength == UCOL_TERTIARY) { 5830 maxLevel = UCOL_PSK_TERTIARY; 5831 } else if(strength == UCOL_QUATERNARY) { 5832 maxLevel = UCOL_PSK_QUATERNARY; 5833 } else { // identical 5834 maxLevel = UCOL_IDENTICAL; 5835 } 5836 } 5837 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation 5838 uint8_t UCOL_HIRAGANA_QUAD = 5839 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF; 5840 // Boundary value that decides whether a CE is shifted or not 5841 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0; 5842 // Are we doing French collation? 
5843 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON); 5844 5845 /** initializing the collation state */ 5846 UBool notIsContinuation = FALSE; 5847 uint32_t CE = UCOL_NO_MORE_CES; 5848 5849 collIterate s; 5850 IInit_collIterate(coll, NULL, -1, &s, status); 5851 if(U_FAILURE(*status)) { 5852 UTRACE_EXIT_STATUS(*status); 5853 return 0; 5854 } 5855 s.iterator = iter; 5856 s.flags |= UCOL_USE_ITERATOR; 5857 // This variable tells us whether we have produced some other levels in this iteration 5858 // before we moved to the identical level. In that case, we need to switch the 5859 // type of the iterator. 5860 UBool doingIdenticalFromStart = FALSE; 5861 // Normalizing iterator 5862 // The division for the array length may truncate the array size to 5863 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 5864 // for all platforms anyway. 5865 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 5866 UNormIterator *normIter = NULL; 5867 // If the normalization is turned on for the collator and we are below identical level 5868 // we will use a FCD normalizing iterator 5869 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { 5870 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5871 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); 5872 s.flags &= ~UCOL_ITER_NORM; 5873 if(U_FAILURE(*status)) { 5874 UTRACE_EXIT_STATUS(*status); 5875 return 0; 5876 } 5877 } else if(level == UCOL_PSK_IDENTICAL) { 5878 // for identical level, we need a NFD iterator. We need to instantiate it here, since we 5879 // will be updating the state - and this cannot be done on an ordinary iterator. 
5880 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5881 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5882 s.flags &= ~UCOL_ITER_NORM; 5883 if(U_FAILURE(*status)) { 5884 UTRACE_EXIT_STATUS(*status); 5885 return 0; 5886 } 5887 doingIdenticalFromStart = TRUE; 5888 } 5889 5890 // This is the tentative new state of the iterator. The problem 5891 // is that the iterator might return an undefined state, in 5892 // which case we should save the last valid state and increase 5893 // the iterator skip value. 5894 uint32_t newState = 0; 5895 5896 // First, we set the iterator to the last valid position 5897 // from the last iteration. This was saved in state[0]. 5898 if(iterState == 0) { 5899 /* initial state */ 5900 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { 5901 s.iterator->move(s.iterator, 0, UITER_LIMIT); 5902 } else { 5903 s.iterator->move(s.iterator, 0, UITER_START); 5904 } 5905 } else { 5906 /* reset to previous state */ 5907 s.iterator->setState(s.iterator, iterState, status); 5908 if(U_FAILURE(*status)) { 5909 UTRACE_EXIT_STATUS(*status); 5910 return 0; 5911 } 5912 } 5913 5914 5915 5916 // This variable tells us whether we can attempt to update the state 5917 // of iterator. Situations where we don't want to update iterator state 5918 // are the existence of expansion CEs that are not yet processed, and 5919 // finishing the case level without enough space in the buffer to insert 5920 // a level terminator. 5921 UBool canUpdateState = TRUE; 5922 5923 // Consume all the CEs that were consumed at the end of the previous 5924 // iteration without updating the iterator state. On identical level, 5925 // consume the code points. 5926 int32_t counter = cces; 5927 if(level < UCOL_PSK_IDENTICAL) { 5928 while(counter-->0) { 5929 // If we're doing French and we are on the secondary level, 5930 // we go backwards. 
5931 if(level == UCOL_PSK_SECONDARY && doingFrench) { 5932 CE = ucol_IGetPrevCE(coll, &s, status); 5933 } else { 5934 CE = ucol_IGetNextCE(coll, &s, status); 5935 } 5936 if(CE==UCOL_NO_MORE_CES) { 5937 /* should not happen */ 5938 *status=U_INTERNAL_PROGRAM_ERROR; 5939 UTRACE_EXIT_STATUS(*status); 5940 return 0; 5941 } 5942 if(uprv_numAvailableExpCEs(s)) { 5943 canUpdateState = FALSE; 5944 } 5945 } 5946 } else { 5947 while(counter-->0) { 5948 uiter_next32(s.iterator); 5949 } 5950 } 5951 5952 // French secondary needs to know whether the iterator state of zero came from previous level OR 5953 // from a new invocation... 5954 UBool wasDoingPrimary = FALSE; 5955 // destination buffer byte counter. When this guy 5956 // gets to count, we're done with the iteration 5957 int32_t i = 0; 5958 // used to count the zero bytes written after we 5959 // have finished with the sort key 5960 int32_t j = 0; 5961 5962 5963 // Hm.... I think we're ready to plunge in. Basic story is as following: 5964 // we have a fall through case based on level. This is used for initial 5965 // positioning on iteration start. Every level processor contains a 5966 // for(;;) which will be broken when we exhaust all the CEs. Other 5967 // way to exit is a goto saveState, which happens when we have filled 5968 // out our buffer. 
5969 switch(level) { 5970 case UCOL_PSK_PRIMARY: 5971 wasDoingPrimary = TRUE; 5972 for(;;) { 5973 if(i==count) { 5974 goto saveState; 5975 } 5976 // We should save the state only if we 5977 // are sure that we are done with the 5978 // previous iterator state 5979 if(canUpdateState && byteCountOrFrenchDone == 0) { 5980 newState = s.iterator->getState(s.iterator); 5981 if(newState != UITER_NO_STATE) { 5982 iterState = newState; 5983 cces = 0; 5984 } 5985 } 5986 CE = ucol_IGetNextCE(coll, &s, status); 5987 cces++; 5988 if(CE==UCOL_NO_MORE_CES) { 5989 // Add the level separator 5990 terminatePSKLevel(level, maxLevel, i, dest); 5991 byteCountOrFrenchDone=0; 5992 // Restart the iteration an move to the 5993 // second level 5994 s.iterator->move(s.iterator, 0, UITER_START); 5995 cces = 0; 5996 level = UCOL_PSK_SECONDARY; 5997 break; 5998 } 5999 if(!isContinuation(CE)){ 6000 if(coll->leadBytePermutationTable != NULL){ 6001 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF); 6002 } 6003 } 6004 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6005 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ 6006 if(CE != 0) { 6007 if(byteCountOrFrenchDone == 0) { 6008 // get the second byte of primary 6009 dest[i++]=(uint8_t)(CE >> 8); 6010 } else { 6011 byteCountOrFrenchDone = 0; 6012 } 6013 if((CE &=0xff)!=0) { 6014 if(i==count) { 6015 /* overflow */ 6016 byteCountOrFrenchDone = 1; 6017 cces--; 6018 goto saveState; 6019 } 6020 dest[i++]=(uint8_t)CE; 6021 } 6022 } 6023 } 6024 if(uprv_numAvailableExpCEs(s)) { 6025 canUpdateState = FALSE; 6026 } else { 6027 canUpdateState = TRUE; 6028 } 6029 } 6030 /* fall through to next level */ 6031 case UCOL_PSK_SECONDARY: 6032 if(strength >= UCOL_SECONDARY) { 6033 if(!doingFrench) { 6034 for(;;) { 6035 if(i == count) { 6036 goto saveState; 6037 } 6038 // We should save the state only if we 6039 // are sure that we are done with the 6040 // previous iterator state 6041 if(canUpdateState) { 6042 newState = 
s.iterator->getState(s.iterator); 6043 if(newState != UITER_NO_STATE) { 6044 iterState = newState; 6045 cces = 0; 6046 } 6047 } 6048 CE = ucol_IGetNextCE(coll, &s, status); 6049 cces++; 6050 if(CE==UCOL_NO_MORE_CES) { 6051 // Add the level separator 6052 terminatePSKLevel(level, maxLevel, i, dest); 6053 byteCountOrFrenchDone = 0; 6054 // Restart the iteration an move to the 6055 // second level 6056 s.iterator->move(s.iterator, 0, UITER_START); 6057 cces = 0; 6058 level = UCOL_PSK_CASE; 6059 break; 6060 } 6061 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6062 CE >>= 8; /* get secondary */ 6063 if(CE != 0) { 6064 dest[i++]=(uint8_t)CE; 6065 } 6066 } 6067 if(uprv_numAvailableExpCEs(s)) { 6068 canUpdateState = FALSE; 6069 } else { 6070 canUpdateState = TRUE; 6071 } 6072 } 6073 } else { // French secondary processing 6074 uint8_t frenchBuff[UCOL_MAX_BUFFER]; 6075 int32_t frenchIndex = 0; 6076 // Here we are going backwards. 6077 // If the iterator is at the beggining, it should be 6078 // moved to end. 6079 if(wasDoingPrimary) { 6080 s.iterator->move(s.iterator, 0, UITER_LIMIT); 6081 cces = 0; 6082 } 6083 for(;;) { 6084 if(i == count) { 6085 goto saveState; 6086 } 6087 if(canUpdateState) { 6088 newState = s.iterator->getState(s.iterator); 6089 if(newState != UITER_NO_STATE) { 6090 iterState = newState; 6091 cces = 0; 6092 } 6093 } 6094 CE = ucol_IGetPrevCE(coll, &s, status); 6095 cces++; 6096 if(CE==UCOL_NO_MORE_CES) { 6097 // Add the level separator 6098 terminatePSKLevel(level, maxLevel, i, dest); 6099 byteCountOrFrenchDone = 0; 6100 // Restart the iteration an move to the next level 6101 s.iterator->move(s.iterator, 0, UITER_START); 6102 level = UCOL_PSK_CASE; 6103 break; 6104 } 6105 if(isContinuation(CE)) { // if it's a continuation, we want to save it and 6106 // reverse when we get a first non-continuation CE. 
6107 CE >>= 8; 6108 frenchBuff[frenchIndex++] = (uint8_t)CE; 6109 } else if(!isShiftedCE(CE, LVT, &wasShifted)) { 6110 CE >>= 8; /* get secondary */ 6111 if(!frenchIndex) { 6112 if(CE != 0) { 6113 dest[i++]=(uint8_t)CE; 6114 } 6115 } else { 6116 frenchBuff[frenchIndex++] = (uint8_t)CE; 6117 frenchIndex -= usedFrench; 6118 usedFrench = 0; 6119 while(i < count && frenchIndex) { 6120 dest[i++] = frenchBuff[--frenchIndex]; 6121 usedFrench++; 6122 } 6123 } 6124 } 6125 if(uprv_numAvailableExpCEs(s)) { 6126 canUpdateState = FALSE; 6127 } else { 6128 canUpdateState = TRUE; 6129 } 6130 } 6131 } 6132 } else { 6133 level = UCOL_PSK_CASE; 6134 } 6135 /* fall through to next level */ 6136 case UCOL_PSK_CASE: 6137 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 6138 uint32_t caseShift = UCOL_CASE_SHIFT_START; 6139 uint8_t caseByte = UCOL_CASE_BYTE_START; 6140 uint8_t caseBits = 0; 6141 6142 for(;;) { 6143 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); 6144 if(i == count) { 6145 goto saveState; 6146 } 6147 // We should save the state only if we 6148 // are sure that we are done with the 6149 // previous iterator state 6150 if(canUpdateState) { 6151 newState = s.iterator->getState(s.iterator); 6152 if(newState != UITER_NO_STATE) { 6153 iterState = newState; 6154 cces = 0; 6155 } 6156 } 6157 CE = ucol_IGetNextCE(coll, &s, status); 6158 cces++; 6159 if(CE==UCOL_NO_MORE_CES) { 6160 // On the case level we might have an unfinished 6161 // case byte. Add one if it's started. 6162 if(caseShift != UCOL_CASE_SHIFT_START) { 6163 dest[i++] = caseByte; 6164 } 6165 cces = 0; 6166 // We have finished processing CEs on this level. 6167 // However, we don't know if we have enough space 6168 // to add a case level terminator. 
6169 if(i < count) { 6170 // Add the level separator 6171 terminatePSKLevel(level, maxLevel, i, dest); 6172 // Restart the iteration and move to the 6173 // next level 6174 s.iterator->move(s.iterator, 0, UITER_START); 6175 level = UCOL_PSK_TERTIARY; 6176 } else { 6177 canUpdateState = FALSE; 6178 } 6179 break; 6180 } 6181 6182 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6183 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) { 6184 // do the case level if we need to do it. We don't want to calculate 6185 // case level for primary ignorables if we have only primary strength and case level 6186 // otherwise we would break well formedness of CEs 6187 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 6188 caseBits = (uint8_t)(CE & 0xC0); 6189 // this copies the case level logic from the 6190 // sort key generation code 6191 if(CE != 0) { 6192 if (caseShift == 0) { 6193 dest[i++] = caseByte; 6194 caseShift = UCOL_CASE_SHIFT_START; 6195 caseByte = UCOL_CASE_BYTE_START; 6196 } 6197 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6198 if((caseBits & 0xC0) == 0) { 6199 caseByte |= 1 << (--caseShift); 6200 } else { 6201 caseByte |= 0 << (--caseShift); 6202 /* second bit */ 6203 if(caseShift == 0) { 6204 dest[i++] = caseByte; 6205 caseShift = UCOL_CASE_SHIFT_START; 6206 caseByte = UCOL_CASE_BYTE_START; 6207 } 6208 caseByte |= ((caseBits>>6)&1) << (--caseShift); 6209 } 6210 } else { 6211 if((caseBits & 0xC0) == 0) { 6212 caseByte |= 0 << (--caseShift); 6213 } else { 6214 caseByte |= 1 << (--caseShift); 6215 /* second bit */ 6216 if(caseShift == 0) { 6217 dest[i++] = caseByte; 6218 caseShift = UCOL_CASE_SHIFT_START; 6219 caseByte = UCOL_CASE_BYTE_START; 6220 } 6221 caseByte |= ((caseBits>>7)&1) << (--caseShift); 6222 } 6223 } 6224 } 6225 6226 } 6227 } 6228 // Not sure this is correct for the case level - revisit 6229 if(uprv_numAvailableExpCEs(s)) { 6230 canUpdateState = FALSE; 6231 } else { 6232 canUpdateState = TRUE; 6233 } 6234 } 6235 } else { 6236 level = 
UCOL_PSK_TERTIARY; 6237 } 6238 /* fall through to next level */ 6239 case UCOL_PSK_TERTIARY: 6240 if(strength >= UCOL_TERTIARY) { 6241 for(;;) { 6242 if(i == count) { 6243 goto saveState; 6244 } 6245 // We should save the state only if we 6246 // are sure that we are done with the 6247 // previous iterator state 6248 if(canUpdateState) { 6249 newState = s.iterator->getState(s.iterator); 6250 if(newState != UITER_NO_STATE) { 6251 iterState = newState; 6252 cces = 0; 6253 } 6254 } 6255 CE = ucol_IGetNextCE(coll, &s, status); 6256 cces++; 6257 if(CE==UCOL_NO_MORE_CES) { 6258 // Add the level separator 6259 terminatePSKLevel(level, maxLevel, i, dest); 6260 byteCountOrFrenchDone = 0; 6261 // Restart the iteration an move to the 6262 // second level 6263 s.iterator->move(s.iterator, 0, UITER_START); 6264 cces = 0; 6265 level = UCOL_PSK_QUATERNARY; 6266 break; 6267 } 6268 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6269 notIsContinuation = !isContinuation(CE); 6270 6271 if(notIsContinuation) { 6272 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 6273 CE ^= coll->caseSwitch; 6274 CE &= coll->tertiaryMask; 6275 } else { 6276 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6277 } 6278 6279 if(CE != 0) { 6280 dest[i++]=(uint8_t)CE; 6281 } 6282 } 6283 if(uprv_numAvailableExpCEs(s)) { 6284 canUpdateState = FALSE; 6285 } else { 6286 canUpdateState = TRUE; 6287 } 6288 } 6289 } else { 6290 // if we're not doing tertiary 6291 // skip to the end 6292 level = UCOL_PSK_NULL; 6293 } 6294 /* fall through to next level */ 6295 case UCOL_PSK_QUATERNARY: 6296 if(strength >= UCOL_QUATERNARY) { 6297 for(;;) { 6298 if(i == count) { 6299 goto saveState; 6300 } 6301 // We should save the state only if we 6302 // are sure that we are done with the 6303 // previous iterator state 6304 if(canUpdateState) { 6305 newState = s.iterator->getState(s.iterator); 6306 if(newState != UITER_NO_STATE) { 6307 iterState = newState; 6308 cces = 0; 6309 } 6310 } 6311 CE = ucol_IGetNextCE(coll, &s, status); 6312 cces++; 
6313 if(CE==UCOL_NO_MORE_CES) { 6314 // Add the level separator 6315 terminatePSKLevel(level, maxLevel, i, dest); 6316 //dest[i++] = UCOL_LEVELTERMINATOR; 6317 byteCountOrFrenchDone = 0; 6318 // Restart the iteration an move to the 6319 // second level 6320 s.iterator->move(s.iterator, 0, UITER_START); 6321 cces = 0; 6322 level = UCOL_PSK_QUIN; 6323 break; 6324 } 6325 if(CE==0) 6326 continue; 6327 if(isShiftedCE(CE, LVT, &wasShifted)) { 6328 CE >>= 16; /* get primary */ 6329 if(CE != 0) { 6330 if(byteCountOrFrenchDone == 0) { 6331 dest[i++]=(uint8_t)(CE >> 8); 6332 } else { 6333 byteCountOrFrenchDone = 0; 6334 } 6335 if((CE &=0xff)!=0) { 6336 if(i==count) { 6337 /* overflow */ 6338 byteCountOrFrenchDone = 1; 6339 goto saveState; 6340 } 6341 dest[i++]=(uint8_t)CE; 6342 } 6343 } 6344 } else { 6345 notIsContinuation = !isContinuation(CE); 6346 if(notIsContinuation) { 6347 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 6348 dest[i++] = UCOL_HIRAGANA_QUAD; 6349 } else { 6350 dest[i++] = 0xFF; 6351 } 6352 } 6353 } 6354 if(uprv_numAvailableExpCEs(s)) { 6355 canUpdateState = FALSE; 6356 } else { 6357 canUpdateState = TRUE; 6358 } 6359 } 6360 } else { 6361 // if we're not doing quaternary 6362 // skip to the end 6363 level = UCOL_PSK_NULL; 6364 } 6365 /* fall through to next level */ 6366 case UCOL_PSK_QUIN: 6367 level = UCOL_PSK_IDENTICAL; 6368 /* fall through to next level */ 6369 case UCOL_PSK_IDENTICAL: 6370 if(strength >= UCOL_IDENTICAL) { 6371 UChar32 first, second; 6372 int32_t bocsuBytesWritten = 0; 6373 // We always need to do identical on 6374 // the NFD form of the string. 6375 if(normIter == NULL) { 6376 // we arrived from the level below and 6377 // normalization was not turned on. 
6378 // therefore, we need to make a fresh NFD iterator 6379 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 6380 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 6381 } else if(!doingIdenticalFromStart) { 6382 // there is an iterator, but we did some other levels. 6383 // therefore, we have a FCD iterator - need to make 6384 // a NFD one. 6385 // normIter being at the beginning does not guarantee 6386 // that the underlying iterator is at the beginning 6387 iter->move(iter, 0, UITER_START); 6388 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 6389 } 6390 // At this point we have a NFD iterator that is positioned 6391 // in the right place 6392 if(U_FAILURE(*status)) { 6393 UTRACE_EXIT_STATUS(*status); 6394 return 0; 6395 } 6396 first = uiter_previous32(s.iterator); 6397 // maybe we're at the start of the string 6398 if(first == U_SENTINEL) { 6399 first = 0; 6400 } else { 6401 uiter_next32(s.iterator); 6402 } 6403 6404 j = 0; 6405 for(;;) { 6406 if(i == count) { 6407 if(j+1 < bocsuBytesWritten) { 6408 bocsuBytesUsed = j+1; 6409 } 6410 goto saveState; 6411 } 6412 6413 // On identical level, we will always save 6414 // the state if we reach this point, since 6415 // we don't depend on getNextCE for content 6416 // all the content is in our buffer and we 6417 // already either stored the full buffer OR 6418 // otherwise we won't arrive here. 
6419 newState = s.iterator->getState(s.iterator); 6420 if(newState != UITER_NO_STATE) { 6421 iterState = newState; 6422 cces = 0; 6423 } 6424 6425 uint8_t buff[4]; 6426 second = uiter_next32(s.iterator); 6427 cces++; 6428 6429 // end condition for identical level 6430 if(second == U_SENTINEL) { 6431 terminatePSKLevel(level, maxLevel, i, dest); 6432 level = UCOL_PSK_NULL; 6433 break; 6434 } 6435 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); 6436 first = second; 6437 6438 j = 0; 6439 if(bocsuBytesUsed != 0) { 6440 while(bocsuBytesUsed-->0) { 6441 j++; 6442 } 6443 } 6444 6445 while(i < count && j < bocsuBytesWritten) { 6446 dest[i++] = buff[j++]; 6447 } 6448 } 6449 6450 } else { 6451 level = UCOL_PSK_NULL; 6452 } 6453 /* fall through to next level */ 6454 case UCOL_PSK_NULL: 6455 j = i; 6456 while(j<count) { 6457 dest[j++]=0; 6458 } 6459 break; 6460 default: 6461 *status = U_INTERNAL_PROGRAM_ERROR; 6462 UTRACE_EXIT_STATUS(*status); 6463 return 0; 6464 } 6465 6466 saveState: 6467 // Now we need to return stuff. First we want to see whether we have 6468 // done everything for the current state of iterator. 6469 if(byteCountOrFrenchDone 6470 || canUpdateState == FALSE 6471 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) 6472 { 6473 // Any of above mean that the previous transaction 6474 // wasn't finished and that we should store the 6475 // previous iterator state. 6476 state[0] = iterState; 6477 } else { 6478 // The transaction is complete. We will continue in the next iteration. 6479 state[0] = s.iterator->getState(s.iterator); 6480 cces = 0; 6481 } 6482 // Store the number of bocsu bytes written. 
6483 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { 6484 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6485 } 6486 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT; 6487 6488 // Next we put in the level of comparison 6489 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); 6490 6491 // If we are doing French, we need to store whether we have just finished the French level 6492 if(level == UCOL_PSK_SECONDARY && doingFrench) { 6493 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6494 } else { 6495 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6496 } 6497 6498 // Was the latest CE shifted 6499 if(wasShifted) { 6500 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; 6501 } 6502 // Check for cces overflow 6503 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { 6504 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6505 } 6506 // Store cces 6507 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT); 6508 6509 // Check for French overflow 6510 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { 6511 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6512 } 6513 // Store number of bytes written in the French secondary continuation sequence 6514 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); 6515 6516 6517 // If we have used normalizing iterator, get rid of it 6518 if(normIter != NULL) { 6519 unorm_closeIter(normIter); 6520 } 6521 6522 /* To avoid memory leak, free the offset buffer if necessary. */ 6523 ucol_freeOffsetBuffer(&s); 6524 6525 // Return number of meaningful sortkey bytes. 6526 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 6527 dest,i, state[0], state[1]); 6528 UTRACE_EXIT_VALUE(i); 6529 return i; 6530 } 6531 6532 /** 6533 * Produce a bound for a given sortkey and a number of levels. 
6534 */ 6535 U_CAPI int32_t U_EXPORT2 6536 ucol_getBound(const uint8_t *source, 6537 int32_t sourceLength, 6538 UColBoundMode boundType, 6539 uint32_t noOfLevels, 6540 uint8_t *result, 6541 int32_t resultLength, 6542 UErrorCode *status) 6543 { 6544 // consistency checks 6545 if(status == NULL || U_FAILURE(*status)) { 6546 return 0; 6547 } 6548 if(source == NULL) { 6549 *status = U_ILLEGAL_ARGUMENT_ERROR; 6550 return 0; 6551 } 6552 6553 int32_t sourceIndex = 0; 6554 // Scan the string until we skip enough of the key OR reach the end of the key 6555 do { 6556 sourceIndex++; 6557 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { 6558 noOfLevels--; 6559 } 6560 } while (noOfLevels > 0 6561 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 6562 6563 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 6564 && noOfLevels > 0) { 6565 *status = U_SORT_KEY_TOO_SHORT_WARNING; 6566 } 6567 6568 6569 // READ ME: this code assumes that the values for boundType 6570 // enum will not changes. They are set so that the enum value 6571 // corresponds to the number of extra bytes each bound type 6572 // needs. 6573 if(result != NULL && resultLength >= sourceIndex+boundType) { 6574 uprv_memcpy(result, source, sourceIndex); 6575 switch(boundType) { 6576 // Lower bound just gets terminated. 
No extra bytes 6577 case UCOL_BOUND_LOWER: // = 0 6578 break; 6579 // Upper bound needs one extra byte 6580 case UCOL_BOUND_UPPER: // = 1 6581 result[sourceIndex++] = 2; 6582 break; 6583 // Upper long bound needs two extra bytes 6584 case UCOL_BOUND_UPPER_LONG: // = 2 6585 result[sourceIndex++] = 0xFF; 6586 result[sourceIndex++] = 0xFF; 6587 break; 6588 default: 6589 *status = U_ILLEGAL_ARGUMENT_ERROR; 6590 return 0; 6591 } 6592 result[sourceIndex++] = 0; 6593 6594 return sourceIndex; 6595 } else { 6596 return sourceIndex+boundType+1; 6597 } 6598 } 6599 6600 /****************************************************************************/ 6601 /* Following are the functions that deal with the properties of a collator */ 6602 /* there are new APIs and some compatibility APIs */ 6603 /****************************************************************************/ 6604 6605 static inline void 6606 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, 6607 int32_t *primShift, int32_t *secShift, int32_t *terShift) 6608 { 6609 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; 6610 UBool reverseSecondary = FALSE; 6611 UBool continuation = isContinuation(CE); 6612 if(!continuation) { 6613 tertiary = (uint8_t)((CE & coll->tertiaryMask)); 6614 tertiary ^= coll->caseSwitch; 6615 reverseSecondary = TRUE; 6616 } else { 6617 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6618 tertiary &= UCOL_REMOVE_CASE; 6619 reverseSecondary = FALSE; 6620 } 6621 6622 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6623 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6624 primary1 = (uint8_t)(CE >> 8); 6625 6626 if(primary1 != 0) { 6627 if (coll->leadBytePermutationTable != NULL && !continuation) { 6628 primary1 = coll->leadBytePermutationTable[primary1]; 6629 } 6630 6631 coll->latinOneCEs[ch] |= (primary1 << *primShift); 6632 *primShift -= 8; 6633 } 6634 if(primary2 != 0) { 6635 if(*primShift < 0) { 6636 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6637 
coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6638 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6639 return; 6640 } 6641 coll->latinOneCEs[ch] |= (primary2 << *primShift); 6642 *primShift -= 8; 6643 } 6644 if(secondary != 0) { 6645 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary 6646 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary 6647 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); 6648 } else { // normal case 6649 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift); 6650 } 6651 *secShift -= 8; 6652 } 6653 if(tertiary != 0) { 6654 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift); 6655 *terShift -= 8; 6656 } 6657 } 6658 6659 static inline UBool 6660 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { 6661 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); 6662 if(newTable == NULL) { 6663 *status = U_MEMORY_ALLOCATION_ERROR; 6664 coll->latinOneFailed = TRUE; 6665 return FALSE; 6666 } 6667 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t); 6668 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); 6669 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); 6670 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy); 6671 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy); 6672 coll->latinOneTableLen = size; 6673 uprv_free(coll->latinOneCEs); 6674 coll->latinOneCEs = newTable; 6675 return TRUE; 6676 } 6677 6678 static UBool 6679 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { 6680 UBool result = TRUE; 6681 if(coll->latinOneCEs == NULL) { 6682 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3); 6683 if(coll->latinOneCEs == NULL) { 6684 *status = U_MEMORY_ALLOCATION_ERROR; 6685 return FALSE; 6686 } 6687 
coll->latinOneTableLen = UCOL_LATINONETABLELEN; 6688 } 6689 UChar ch = 0; 6690 UCollationElements *it = ucol_openElements(coll, &ch, 1, status); 6691 // Check for null pointer 6692 if (U_FAILURE(*status)) { 6693 return FALSE; 6694 } 6695 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3); 6696 6697 int32_t primShift = 24, secShift = 24, terShift = 24; 6698 uint32_t CE = 0; 6699 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; 6700 6701 // TODO: make safe if you get more than you wanted... 6702 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { 6703 primShift = 24; secShift = 24; terShift = 24; 6704 if(ch < 0x100) { 6705 CE = coll->latinOneMapping[ch]; 6706 } else { 6707 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 6708 if(CE == UCOL_NOT_FOUND && coll->UCA) { 6709 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); 6710 } 6711 } 6712 if(CE < UCOL_NOT_FOUND) { 6713 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6714 } else { 6715 switch (getCETag(CE)) { 6716 case EXPANSION_TAG: 6717 case DIGIT_TAG: 6718 ucol_setText(it, &ch, 1, status); 6719 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { 6720 if(primShift < 0 || secShift < 0 || terShift < 0) { 6721 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6722 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6723 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6724 break; 6725 } 6726 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6727 } 6728 break; 6729 case CONTRACTION_TAG: 6730 // here is the trick 6731 // F2 is contraction. We do something very similar to contractions 6732 // but have two indices, one in the real contraction table and the 6733 // other to where we stuffed things. This hopes that we don't have 6734 // many contractions (this should work for latin-1 tables). 
6735 { 6736 if((CE & 0x00FFF000) != 0) { 6737 *status = U_UNSUPPORTED_ERROR; 6738 goto cleanup_after_failure; 6739 } 6740 6741 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); 6742 6743 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table 6744 6745 coll->latinOneCEs[ch] = CE; 6746 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; 6747 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; 6748 6749 // We're going to jump into contraction table, pick the elements 6750 // and use them 6751 do { 6752 CE = *(coll->contractionCEs + 6753 (UCharOffset - coll->contractionIndex)); 6754 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) { 6755 uint32_t size; 6756 uint32_t i; /* general counter */ 6757 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 6758 size = getExpansionCount(CE); 6759 //CE = *CEOffset++; 6760 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 6761 for(i = 0; i<size; i++) { 6762 if(primShift < 0 || secShift < 0 || terShift < 0) { 6763 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6764 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6765 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6766 break; 6767 } 6768 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 6769 } 6770 } else { /* else, we do */ 6771 while(*CEOffset != 0) { 6772 if(primShift < 0 || secShift < 0 || terShift < 0) { 6773 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6774 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6775 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6776 break; 6777 } 6778 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 6779 
} 6780 } 6781 contractionOffset++; 6782 } else if(CE < UCOL_NOT_FOUND) { 6783 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift); 6784 } else { 6785 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6786 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6787 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6788 contractionOffset++; 6789 } 6790 UCharOffset++; 6791 primShift = 24; secShift = 24; terShift = 24; 6792 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate 6793 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) { 6794 goto cleanup_after_failure; 6795 } 6796 } 6797 } while(*UCharOffset != 0xFFFF); 6798 } 6799 break;; 6800 case SPEC_PROC_TAG: 6801 { 6802 // 0xB7 is a precontext character defined in UCA5.1, a special 6803 // handle is implemeted in order to save LatinOne table for 6804 // most locales. 6805 if (ch==0xb7) { 6806 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6807 } 6808 else { 6809 goto cleanup_after_failure; 6810 } 6811 } 6812 break; 6813 default: 6814 goto cleanup_after_failure; 6815 } 6816 } 6817 } 6818 // compact table 6819 if(contractionOffset < coll->latinOneTableLen) { 6820 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { 6821 goto cleanup_after_failure; 6822 } 6823 } 6824 ucol_closeElements(it); 6825 return result; 6826 6827 cleanup_after_failure: 6828 // status should already be set before arriving here. 
6829 coll->latinOneFailed = TRUE; 6830 ucol_closeElements(it); 6831 return FALSE; 6832 } 6833 6834 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { 6835 if(U_SUCCESS(*status)) { 6836 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6837 coll->caseSwitch = UCOL_CASE_SWITCH; 6838 } else { 6839 coll->caseSwitch = UCOL_NO_CASE_SWITCH; 6840 } 6841 6842 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { 6843 coll->tertiaryMask = UCOL_REMOVE_CASE; 6844 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6845 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */ 6846 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; 6847 coll->tertiaryBottom = UCOL_COMMON_BOT3; 6848 } else { 6849 coll->tertiaryMask = UCOL_KEEP_CASE; 6850 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; 6851 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6852 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; 6853 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; 6854 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; 6855 } else { 6856 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6857 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; 6858 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; 6859 } 6860 } 6861 6862 /* Set the compression values */ 6863 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1); 6864 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */ 6865 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount); 6866 6867 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY 6868 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) 6869 { 6870 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; 6871 } else { 6872 coll->sortKeyGen = ucol_calcSortKey; 6873 } 6874 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF 6875 && 
coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) 6876 { 6877 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { 6878 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it 6879 //fprintf(stderr, "F"); 6880 coll->latinOneUse = TRUE; 6881 } else { 6882 coll->latinOneUse = FALSE; 6883 } 6884 if(*status == U_UNSUPPORTED_ERROR) { 6885 *status = U_ZERO_ERROR; 6886 } 6887 } else { // latin1Table exists and it doesn't need to be regenerated, just use it 6888 coll->latinOneUse = TRUE; 6889 } 6890 } else { 6891 coll->latinOneUse = FALSE; 6892 } 6893 } 6894 } 6895 6896 U_CAPI uint32_t U_EXPORT2 6897 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 6898 if(U_FAILURE(*status) || coll == NULL) { 6899 return 0; 6900 } 6901 if(len == -1) { 6902 len = u_strlen(varTop); 6903 } 6904 if(len == 0) { 6905 *status = U_ILLEGAL_ARGUMENT_ERROR; 6906 return 0; 6907 } 6908 6909 collIterate s; 6910 IInit_collIterate(coll, varTop, len, &s, status); 6911 if(U_FAILURE(*status)) { 6912 return 0; 6913 } 6914 6915 uint32_t CE = ucol_IGetNextCE(coll, &s, status); 6916 6917 /* here we check if we have consumed all characters */ 6918 /* you can put in either one character or a contraction */ 6919 /* you shouldn't put more... */ 6920 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { 6921 *status = U_CE_NOT_FOUND_ERROR; 6922 return 0; 6923 } 6924 6925 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); 6926 6927 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { 6928 *status = U_PRIMARY_TOO_LONG_ERROR; 6929 return 0; 6930 } 6931 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { 6932 coll->variableTopValueisDefault = FALSE; 6933 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; 6934 } 6935 6936 /* To avoid memory leak, free the offset buffer if necessary. 
*/ 6937 ucol_freeOffsetBuffer(&s); 6938 6939 return CE & UCOL_PRIMARYMASK; 6940 } 6941 6942 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 6943 if(U_FAILURE(*status) || coll == NULL) { 6944 return 0; 6945 } 6946 return coll->variableTopValue<<16; 6947 } 6948 6949 U_CAPI void U_EXPORT2 6950 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 6951 if(U_FAILURE(*status) || coll == NULL) { 6952 return; 6953 } 6954 6955 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { 6956 coll->variableTopValueisDefault = FALSE; 6957 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; 6958 } 6959 } 6960 /* Attribute setter API */ 6961 U_CAPI void U_EXPORT2 6962 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 6963 if(U_FAILURE(*status) || coll == NULL) { 6964 return; 6965 } 6966 UColAttributeValue oldFrench = coll->frenchCollation; 6967 UColAttributeValue oldCaseFirst = coll->caseFirst; 6968 switch(attr) { 6969 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ 6970 if(value == UCOL_ON) { 6971 coll->numericCollation = UCOL_ON; 6972 coll->numericCollationisDefault = FALSE; 6973 } else if (value == UCOL_OFF) { 6974 coll->numericCollation = UCOL_OFF; 6975 coll->numericCollationisDefault = FALSE; 6976 } else if (value == UCOL_DEFAULT) { 6977 coll->numericCollationisDefault = TRUE; 6978 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; 6979 } else { 6980 *status = U_ILLEGAL_ARGUMENT_ERROR; 6981 } 6982 break; 6983 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ 6984 if(value == UCOL_ON) { 6985 coll->hiraganaQ = UCOL_ON; 6986 coll->hiraganaQisDefault = FALSE; 6987 } else if (value == UCOL_OFF) { 6988 coll->hiraganaQ = UCOL_OFF; 6989 coll->hiraganaQisDefault = FALSE; 6990 } else if (value == UCOL_DEFAULT) { 6991 coll->hiraganaQisDefault = TRUE; 6992 
coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ; 6993 } else { 6994 *status = U_ILLEGAL_ARGUMENT_ERROR; 6995 } 6996 break; 6997 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 6998 if(value == UCOL_ON) { 6999 coll->frenchCollation = UCOL_ON; 7000 coll->frenchCollationisDefault = FALSE; 7001 } else if (value == UCOL_OFF) { 7002 coll->frenchCollation = UCOL_OFF; 7003 coll->frenchCollationisDefault = FALSE; 7004 } else if (value == UCOL_DEFAULT) { 7005 coll->frenchCollationisDefault = TRUE; 7006 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation; 7007 } else { 7008 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7009 } 7010 break; 7011 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 7012 if(value == UCOL_SHIFTED) { 7013 coll->alternateHandling = UCOL_SHIFTED; 7014 coll->alternateHandlingisDefault = FALSE; 7015 } else if (value == UCOL_NON_IGNORABLE) { 7016 coll->alternateHandling = UCOL_NON_IGNORABLE; 7017 coll->alternateHandlingisDefault = FALSE; 7018 } else if (value == UCOL_DEFAULT) { 7019 coll->alternateHandlingisDefault = TRUE; 7020 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ; 7021 } else { 7022 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7023 } 7024 break; 7025 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 7026 if(value == UCOL_LOWER_FIRST) { 7027 coll->caseFirst = UCOL_LOWER_FIRST; 7028 coll->caseFirstisDefault = FALSE; 7029 } else if (value == UCOL_UPPER_FIRST) { 7030 coll->caseFirst = UCOL_UPPER_FIRST; 7031 coll->caseFirstisDefault = FALSE; 7032 } else if (value == UCOL_OFF) { 7033 coll->caseFirst = UCOL_OFF; 7034 coll->caseFirstisDefault = FALSE; 7035 } else if (value == UCOL_DEFAULT) { 7036 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; 7037 coll->caseFirstisDefault = TRUE; 7038 } else { 7039 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7040 } 7041 break; 7042 case UCOL_CASE_LEVEL: /* do we have an extra case 
level */ 7043 if(value == UCOL_ON) { 7044 coll->caseLevel = UCOL_ON; 7045 coll->caseLevelisDefault = FALSE; 7046 } else if (value == UCOL_OFF) { 7047 coll->caseLevel = UCOL_OFF; 7048 coll->caseLevelisDefault = FALSE; 7049 } else if (value == UCOL_DEFAULT) { 7050 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; 7051 coll->caseLevelisDefault = TRUE; 7052 } else { 7053 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7054 } 7055 break; 7056 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 7057 if(value == UCOL_ON) { 7058 coll->normalizationMode = UCOL_ON; 7059 coll->normalizationModeisDefault = FALSE; 7060 initializeFCD(status); 7061 } else if (value == UCOL_OFF) { 7062 coll->normalizationMode = UCOL_OFF; 7063 coll->normalizationModeisDefault = FALSE; 7064 } else if (value == UCOL_DEFAULT) { 7065 coll->normalizationModeisDefault = TRUE; 7066 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode; 7067 if(coll->normalizationMode == UCOL_ON) { 7068 initializeFCD(status); 7069 } 7070 } else { 7071 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7072 } 7073 break; 7074 case UCOL_STRENGTH: /* attribute for strength */ 7075 if (value == UCOL_DEFAULT) { 7076 coll->strengthisDefault = TRUE; 7077 coll->strength = (UColAttributeValue)coll->options->strength; 7078 } else if (value <= UCOL_IDENTICAL) { 7079 coll->strengthisDefault = FALSE; 7080 coll->strength = value; 7081 } else { 7082 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7083 } 7084 break; 7085 case UCOL_ATTRIBUTE_COUNT: 7086 default: 7087 *status = U_ILLEGAL_ARGUMENT_ERROR; 7088 break; 7089 } 7090 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { 7091 coll->latinOneRegenTable = TRUE; 7092 } else { 7093 coll->latinOneRegenTable = FALSE; 7094 } 7095 ucol_updateInternalState(coll, status); 7096 } 7097 7098 U_CAPI UColAttributeValue U_EXPORT2 7099 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { 7100 if(U_FAILURE(*status) || coll == NULL) { 
7101 return UCOL_DEFAULT; 7102 } 7103 switch(attr) { 7104 case UCOL_NUMERIC_COLLATION: 7105 return coll->numericCollation; 7106 case UCOL_HIRAGANA_QUATERNARY_MODE: 7107 return coll->hiraganaQ; 7108 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 7109 return coll->frenchCollation; 7110 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 7111 return coll->alternateHandling; 7112 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 7113 return coll->caseFirst; 7114 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 7115 return coll->caseLevel; 7116 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 7117 return coll->normalizationMode; 7118 case UCOL_STRENGTH: /* attribute for strength */ 7119 return coll->strength; 7120 case UCOL_ATTRIBUTE_COUNT: 7121 default: 7122 *status = U_ILLEGAL_ARGUMENT_ERROR; 7123 break; 7124 } 7125 return UCOL_DEFAULT; 7126 } 7127 7128 U_CAPI void U_EXPORT2 7129 ucol_setStrength( UCollator *coll, 7130 UCollationStrength strength) 7131 { 7132 UErrorCode status = U_ZERO_ERROR; 7133 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); 7134 } 7135 7136 U_CAPI UCollationStrength U_EXPORT2 7137 ucol_getStrength(const UCollator *coll) 7138 { 7139 UErrorCode status = U_ZERO_ERROR; 7140 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); 7141 } 7142 7143 U_INTERNAL int32_t U_EXPORT2 7144 ucol_getReorderCodes(const UCollator *coll, 7145 int32_t *dest, 7146 int32_t destCapacity, 7147 UErrorCode *pErrorCode) { 7148 if (U_FAILURE(*pErrorCode)) { 7149 return 0; 7150 } 7151 7152 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 7153 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 7154 return 0; 7155 } 7156 7157 if (coll->reorderCodesLength > destCapacity) { 7158 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; 7159 return coll->reorderCodesLength; 7160 } 7161 for (int32_t i = 0; i < coll->reorderCodesLength; i++) { 7162 dest[i] = coll->reorderCodes[i]; 7163 } 7164 
return coll->reorderCodesLength; 7165 } 7166 7167 U_INTERNAL void U_EXPORT2 7168 ucol_setReorderCodes(UCollator *coll, 7169 const int32_t *reorderCodes, 7170 int32_t reorderCodesLength, 7171 UErrorCode *pErrorCode) { 7172 if (U_FAILURE(*pErrorCode)) { 7173 return; 7174 } 7175 7176 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) { 7177 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 7178 return; 7179 } 7180 7181 uprv_free(coll->reorderCodes); 7182 coll->reorderCodes = NULL; 7183 coll->reorderCodesLength = 0; 7184 if (reorderCodesLength == 0) { 7185 uprv_free(coll->leadBytePermutationTable); 7186 coll->leadBytePermutationTable = NULL; 7187 return; 7188 } 7189 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t)); 7190 if (coll->reorderCodes == NULL) { 7191 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 7192 return; 7193 } 7194 for (int32_t i = 0; i < reorderCodesLength; i++) { 7195 coll->reorderCodes[i] = reorderCodes[i]; 7196 } 7197 coll->reorderCodesLength = reorderCodesLength; 7198 ucol_buildPermutationTable(coll, pErrorCode); 7199 if (U_FAILURE(*pErrorCode)) { 7200 uprv_free(coll->reorderCodes); 7201 coll->reorderCodes = NULL; 7202 coll->reorderCodesLength = 0; 7203 } 7204 } 7205 7206 7207 /****************************************************************************/ 7208 /* Following are misc functions */ 7209 /* there are new APIs and some compatibility APIs */ 7210 /****************************************************************************/ 7211 7212 U_CAPI void U_EXPORT2 7213 ucol_getVersion(const UCollator* coll, 7214 UVersionInfo versionInfo) 7215 { 7216 /* RunTime version */ 7217 uint8_t rtVersion = UCOL_RUNTIME_VERSION; 7218 /* Builder version*/ 7219 uint8_t bdVersion = coll->image->version[0]; 7220 7221 /* Charset Version. 
Need to get the version from cnv files 7222 * makeconv should populate cnv files with version and 7223 * an api has to be provided in ucnv.h to obtain this version 7224 */ 7225 uint8_t csVersion = 0; 7226 7227 /* combine the version info */ 7228 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion)); 7229 7230 /* Tailoring rules */ 7231 versionInfo[0] = (uint8_t)(cmbVersion>>8); 7232 versionInfo[1] = (uint8_t)cmbVersion; 7233 versionInfo[2] = coll->image->version[1]; 7234 if(coll->UCA) { 7235 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */ 7236 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07); 7237 } else { 7238 versionInfo[3] = 0; 7239 } 7240 } 7241 7242 7243 /* This internal API checks whether a character is tailored or not */ 7244 U_CAPI UBool U_EXPORT2 7245 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { 7246 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { 7247 return FALSE; 7248 } 7249 7250 uint32_t CE = UCOL_NOT_FOUND; 7251 const UChar *ContractionStart = NULL; 7252 if(u < 0x100) { /* latin-1 */ 7253 CE = coll->latinOneMapping[u]; 7254 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { 7255 return FALSE; 7256 } 7257 } else { /* regular */ 7258 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); 7259 } 7260 7261 if(isContraction(CE)) { 7262 ContractionStart = (UChar *)coll->image+getContractOffset(CE); 7263 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); 7264 } 7265 7266 return (UBool)(CE != UCOL_NOT_FOUND); 7267 } 7268 7269 7270 /****************************************************************************/ 7271 /* Following are the string compare functions */ 7272 /* */ 7273 /****************************************************************************/ 7274 7275 7276 /* ucol_checkIdent internal function. Does byte level string compare. 
*/ 7277 /* Used by strcoll if strength == identical and strings */ 7278 /* are otherwise equal. */ 7279 /* */ 7280 /* Comparison must be done on NFD normalized strings. */ 7281 /* FCD is not good enough. */ 7282 7283 static 7284 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status) 7285 { 7286 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both 7287 // of same type, but that doesn't really mean that it will stay that way. 7288 int32_t comparison; 7289 7290 if (sColl->flags & UCOL_USE_ITERATOR) { 7291 // The division for the array length may truncate the array size to 7292 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 7293 // for all platforms anyway. 7294 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 7295 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 7296 UNormIterator *sNIt = NULL, *tNIt = NULL; 7297 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 7298 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 7299 sColl->iterator->move(sColl->iterator, 0, UITER_START); 7300 tColl->iterator->move(tColl->iterator, 0, UITER_START); 7301 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); 7302 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); 7303 comparison = u_strCompareIter(sIt, tIt, TRUE); 7304 unorm_closeIter(sNIt); 7305 unorm_closeIter(tNIt); 7306 } else { 7307 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1; 7308 const UChar *sBuf = sColl->string; 7309 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? 
(int32_t)(tColl->endp - tColl->string) : -1; 7310 const UChar *tBuf = tColl->string; 7311 7312 if (normalize) { 7313 *status = U_ZERO_ERROR; 7314 // Note: We could use Normalizer::compare() or similar, but for short strings 7315 // which may not be in FCD it might be faster to just NFD them. 7316 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than 7317 // NFD'ing immediately might be faster for long strings, 7318 // but string comparison is usually done on relatively short strings. 7319 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen), 7320 sColl->writableBuffer, 7321 *status); 7322 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen), 7323 tColl->writableBuffer, 7324 *status); 7325 if(U_FAILURE(*status)) { 7326 return UCOL_LESS; 7327 } 7328 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer); 7329 } else { 7330 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE); 7331 } 7332 } 7333 7334 if (comparison < 0) { 7335 return UCOL_LESS; 7336 } else if (comparison == 0) { 7337 return UCOL_EQUAL; 7338 } else /* comparison > 0 */ { 7339 return UCOL_GREATER; 7340 } 7341 } 7342 7343 /* CEBuf - A struct and some inline functions to handle the saving */ 7344 /* of CEs in a buffer within ucol_strcoll */ 7345 7346 #define UCOL_CEBUF_SIZE 512 7347 typedef struct ucol_CEBuf { 7348 uint32_t *buf; 7349 uint32_t *endp; 7350 uint32_t *pos; 7351 uint32_t localArray[UCOL_CEBUF_SIZE]; 7352 } ucol_CEBuf; 7353 7354 7355 static 7356 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { 7357 (b)->buf = (b)->pos = (b)->localArray; 7358 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; 7359 } 7360 7361 static 7362 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { 7363 uint32_t oldSize; 7364 uint32_t newSize; 7365 uint32_t *newBuf; 7366 7367 ci->flags |= UCOL_ITER_ALLOCATED; 7368 oldSize = (uint32_t)(b->pos - b->buf); 7369 newSize = oldSize * 2; 7370 newBuf = 
(uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); 7371 if(newBuf == NULL) { 7372 *status = U_MEMORY_ALLOCATION_ERROR; 7373 } 7374 else { 7375 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); 7376 if (b->buf != b->localArray) { 7377 uprv_free(b->buf); 7378 } 7379 b->buf = newBuf; 7380 b->endp = b->buf + newSize; 7381 b->pos = b->buf + oldSize; 7382 } 7383 } 7384 7385 static 7386 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) { 7387 if (b->pos == b->endp) { 7388 ucol_CEBuf_Expand(b, ci, status); 7389 } 7390 if (U_SUCCESS(*status)) { 7391 *(b)->pos++ = ce; 7392 } 7393 } 7394 7395 /* This is a trick string compare function that goes in and uses sortkeys to compare */ 7396 /* It is used when compare gets in trouble and needs to bail out */ 7397 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, 7398 collIterate *tColl, 7399 UErrorCode *status) 7400 { 7401 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; 7402 uint8_t *sourceKeyP = sourceKey; 7403 uint8_t *targetKeyP = targetKey; 7404 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; 7405 const UCollator *coll = sColl->coll; 7406 const UChar *source = NULL; 7407 const UChar *target = NULL; 7408 int32_t result = UCOL_EQUAL; 7409 UnicodeString sourceString, targetString; 7410 int32_t sourceLength; 7411 int32_t targetLength; 7412 7413 if(sColl->flags & UCOL_USE_ITERATOR) { 7414 sColl->iterator->move(sColl->iterator, 0, UITER_START); 7415 tColl->iterator->move(tColl->iterator, 0, UITER_START); 7416 UChar32 c; 7417 while((c=sColl->iterator->next(sColl->iterator))>=0) { 7418 sourceString.append((UChar)c); 7419 } 7420 while((c=tColl->iterator->next(tColl->iterator))>=0) { 7421 targetString.append((UChar)c); 7422 } 7423 source = sourceString.getBuffer(); 7424 sourceLength = sourceString.length(); 7425 target = targetString.getBuffer(); 7426 targetLength = targetString.length(); 7427 } else { // no iterators 7428 
sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1; 7429 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1; 7430 source = sColl->string; 7431 target = tColl->string; 7432 } 7433 7434 7435 7436 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 7437 if(sourceKeyLen > UCOL_MAX_BUFFER) { 7438 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); 7439 if(sourceKeyP == NULL) { 7440 *status = U_MEMORY_ALLOCATION_ERROR; 7441 goto cleanup_and_do_compare; 7442 } 7443 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 7444 } 7445 7446 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 7447 if(targetKeyLen > UCOL_MAX_BUFFER) { 7448 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); 7449 if(targetKeyP == NULL) { 7450 *status = U_MEMORY_ALLOCATION_ERROR; 7451 goto cleanup_and_do_compare; 7452 } 7453 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 7454 } 7455 7456 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); 7457 7458 cleanup_and_do_compare: 7459 if(sourceKeyP != NULL && sourceKeyP != sourceKey) { 7460 uprv_free(sourceKeyP); 7461 } 7462 7463 if(targetKeyP != NULL && targetKeyP != targetKey) { 7464 uprv_free(targetKeyP); 7465 } 7466 7467 if(result<0) { 7468 return UCOL_LESS; 7469 } else if(result>0) { 7470 return UCOL_GREATER; 7471 } else { 7472 return UCOL_EQUAL; 7473 } 7474 } 7475 7476 7477 static UCollationResult 7478 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) 7479 { 7480 U_ALIGN_CODE(16); 7481 7482 const UCollator *coll = sColl->coll; 7483 7484 7485 // setting up the collator parameters 7486 UColAttributeValue strength = coll->strength; 7487 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); 7488 7489 UBool checkSecTer = initialCheckSecTer; 7490 UBool checkTertiary = 
(strength >= UCOL_TERTIARY); 7491 UBool checkQuad = (strength >= UCOL_QUATERNARY); 7492 UBool checkIdent = (strength == UCOL_IDENTICAL); 7493 UBool checkCase = (coll->caseLevel == UCOL_ON); 7494 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; 7495 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 7496 UBool qShifted = shifted && checkQuad; 7497 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; 7498 7499 if(doHiragana && shifted) { 7500 return (ucol_compareUsingSortKeys(sColl, tColl, status)); 7501 } 7502 uint8_t caseSwitch = coll->caseSwitch; 7503 uint8_t tertiaryMask = coll->tertiaryMask; 7504 7505 // This is the lowest primary value that will not be ignored if shifted 7506 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; 7507 7508 UCollationResult result = UCOL_EQUAL; 7509 UCollationResult hirResult = UCOL_EQUAL; 7510 7511 // Preparing the CE buffers. They will be filled during the primary phase 7512 ucol_CEBuf sCEs; 7513 ucol_CEBuf tCEs; 7514 UCOL_INIT_CEBUF(&sCEs); 7515 UCOL_INIT_CEBUF(&tCEs); 7516 7517 uint32_t secS = 0, secT = 0; 7518 uint32_t sOrder=0, tOrder=0; 7519 7520 // Non shifted primary processing is quite simple 7521 if(!shifted) { 7522 for(;;) { 7523 7524 // We fetch CEs until we hit a non ignorable primary or end. 7525 do { 7526 // We get the next CE 7527 sOrder = ucol_IGetNextCE(coll, sColl, status); 7528 // Stuff it in the buffer 7529 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7530 // And keep just the primary part. 
7531 sOrder &= UCOL_PRIMARYMASK; 7532 } while(sOrder == 0); 7533 7534 // see the comments on the above block 7535 do { 7536 tOrder = ucol_IGetNextCE(coll, tColl, status); 7537 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7538 tOrder &= UCOL_PRIMARYMASK; 7539 } while(tOrder == 0); 7540 7541 // if both primaries are the same 7542 if(sOrder == tOrder) { 7543 // and there are no more CEs, we advance to the next level 7544 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 7545 break; 7546 } 7547 if(doHiragana && hirResult == UCOL_EQUAL) { 7548 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) { 7549 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA)) 7550 ? UCOL_LESS:UCOL_GREATER; 7551 } 7552 } 7553 } else { 7554 // only need to check one for continuation 7555 // if one is then the other must be or the preceding CE would be a prefix of the other 7556 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) { 7557 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF); 7558 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); 7559 } 7560 // if two primaries are different, we are done 7561 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER; 7562 goto commonReturn; 7563 } 7564 } // no primary difference... do the rest from the buffers 7565 } else { // shifted - do a slightly more complicated processing :) 7566 for(;;) { 7567 UBool sInShifted = FALSE; 7568 UBool tInShifted = FALSE; 7569 // This version of code can be refactored. However, it seems easier to understand this way. 7570 // Source loop. Sam as the target loop. 
7571 for(;;) { 7572 sOrder = ucol_IGetNextCE(coll, sColl, status); 7573 if(sOrder == UCOL_NO_MORE_CES) { 7574 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7575 break; 7576 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) { 7577 /* UCA amendment - ignore ignorables that follow shifted code points */ 7578 continue; 7579 } else if(isContinuation(sOrder)) { 7580 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ 7581 if(sInShifted) { 7582 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 7583 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7584 continue; 7585 } else { 7586 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7587 break; 7588 } 7589 } else { /* Just lower level values */ 7590 if(sInShifted) { 7591 continue; 7592 } else { 7593 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7594 continue; 7595 } 7596 } 7597 } else { /* regular */ 7598 if(coll->leadBytePermutationTable != NULL){ 7599 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF); 7600 } 7601 if((sOrder & UCOL_PRIMARYMASK) > LVT) { 7602 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7603 break; 7604 } else { 7605 if((sOrder & UCOL_PRIMARYMASK) > 0) { 7606 sInShifted = TRUE; 7607 sOrder &= UCOL_PRIMARYMASK; 7608 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7609 continue; 7610 } else { 7611 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7612 sInShifted = FALSE; 7613 continue; 7614 } 7615 } 7616 } 7617 } 7618 sOrder &= UCOL_PRIMARYMASK; 7619 sInShifted = FALSE; 7620 7621 for(;;) { 7622 tOrder = ucol_IGetNextCE(coll, tColl, status); 7623 if(tOrder == UCOL_NO_MORE_CES) { 7624 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7625 break; 7626 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) { 7627 /* UCA amendment - ignore ignorables that follow shifted code points */ 7628 continue; 7629 } else if(isContinuation(tOrder)) { 7630 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary 
value */ 7631 if(tInShifted) { 7632 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 7633 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7634 continue; 7635 } else { 7636 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7637 break; 7638 } 7639 } else { /* Just lower level values */ 7640 if(tInShifted) { 7641 continue; 7642 } else { 7643 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7644 continue; 7645 } 7646 } 7647 } else { /* regular */ 7648 if(coll->leadBytePermutationTable != NULL){ 7649 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); 7650 } 7651 if((tOrder & UCOL_PRIMARYMASK) > LVT) { 7652 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7653 break; 7654 } else { 7655 if((tOrder & UCOL_PRIMARYMASK) > 0) { 7656 tInShifted = TRUE; 7657 tOrder &= UCOL_PRIMARYMASK; 7658 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7659 continue; 7660 } else { 7661 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7662 tInShifted = FALSE; 7663 continue; 7664 } 7665 } 7666 } 7667 } 7668 tOrder &= UCOL_PRIMARYMASK; 7669 tInShifted = FALSE; 7670 7671 if(sOrder == tOrder) { 7672 /* 7673 if(doHiragana && hirResult == UCOL_EQUAL) { 7674 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) { 7675 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA)) 7676 ? UCOL_LESS:UCOL_GREATER; 7677 } 7678 } 7679 */ 7680 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 7681 break; 7682 } else { 7683 sOrder = 0; 7684 tOrder = 0; 7685 continue; 7686 } 7687 } else { 7688 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; 7689 goto commonReturn; 7690 } 7691 } /* no primary difference... 
do the rest from the buffers */ 7692 } 7693 7694 /* now, we're gonna reexamine collected CEs */ 7695 uint32_t *sCE; 7696 uint32_t *tCE; 7697 7698 /* This is the secondary level of comparison */ 7699 if(checkSecTer) { 7700 if(!isFrenchSec) { /* normal */ 7701 sCE = sCEs.buf; 7702 tCE = tCEs.buf; 7703 for(;;) { 7704 while (secS == 0) { 7705 secS = *(sCE++) & UCOL_SECONDARYMASK; 7706 } 7707 7708 while(secT == 0) { 7709 secT = *(tCE++) & UCOL_SECONDARYMASK; 7710 } 7711 7712 if(secS == secT) { 7713 if(secS == UCOL_NO_MORE_CES_SECONDARY) { 7714 break; 7715 } else { 7716 secS = 0; secT = 0; 7717 continue; 7718 } 7719 } else { 7720 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7721 goto commonReturn; 7722 } 7723 } 7724 } else { /* do the French */ 7725 uint32_t *sCESave = NULL; 7726 uint32_t *tCESave = NULL; 7727 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */ 7728 tCE = tCEs.pos-2; 7729 for(;;) { 7730 while (secS == 0 && sCE >= sCEs.buf) { 7731 if(sCESave == NULL) { 7732 secS = *(sCE--); 7733 if(isContinuation(secS)) { 7734 while(isContinuation(secS = *(sCE--))) 7735 ; 7736 /* after this, secS has the start of continuation, and sCEs points before that */ 7737 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */ 7738 sCE+=2; /* need to point to the first continuation CP */ 7739 /* However, now you can just continue doing stuff */ 7740 } 7741 } else { 7742 secS = *(sCE++); 7743 if(!isContinuation(secS)) { /* This means we have finished with this cont */ 7744 sCE = sCESave; /* reset the pointer to before continuation */ 7745 sCESave = NULL; 7746 secS = 0; /* Fetch a fresh CE before the continuation sequence. 
*/ 7747 continue; 7748 } 7749 } 7750 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 7751 } 7752 7753 while(secT == 0 && tCE >= tCEs.buf) { 7754 if(tCESave == NULL) { 7755 secT = *(tCE--); 7756 if(isContinuation(secT)) { 7757 while(isContinuation(secT = *(tCE--))) 7758 ; 7759 /* after this, secS has the start of continuation, and sCEs points before that */ 7760 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */ 7761 tCE+=2; /* need to point to the first continuation CP */ 7762 /* However, now you can just continue doing stuff */ 7763 } 7764 } else { 7765 secT = *(tCE++); 7766 if(!isContinuation(secT)) { /* This means we have finished with this cont */ 7767 tCE = tCESave; /* reset the pointer to before continuation */ 7768 tCESave = NULL; 7769 secT = 0; /* Fetch a fresh CE before the continuation sequence. */ 7770 continue; 7771 } 7772 } 7773 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 7774 } 7775 7776 if(secS == secT) { 7777 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) { 7778 break; 7779 } else { 7780 secS = 0; secT = 0; 7781 continue; 7782 } 7783 } else { 7784 result = (secS < secT) ? 
UCOL_LESS : UCOL_GREATER; 7785 goto commonReturn; 7786 } 7787 } 7788 } 7789 } 7790 7791 /* doing the case bit */ 7792 if(checkCase) { 7793 sCE = sCEs.buf; 7794 tCE = tCEs.buf; 7795 for(;;) { 7796 while((secS & UCOL_REMOVE_CASE) == 0) { 7797 if(!isContinuation(*sCE++)) { 7798 secS =*(sCE-1); 7799 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 7800 // primary ignorables should not be considered on the case level when the strength is primary 7801 // otherwise, the CEs stop being well-formed 7802 secS &= UCOL_TERT_CASE_MASK; 7803 secS ^= caseSwitch; 7804 } else { 7805 secS = 0; 7806 } 7807 } else { 7808 secS = 0; 7809 } 7810 } 7811 7812 while((secT & UCOL_REMOVE_CASE) == 0) { 7813 if(!isContinuation(*tCE++)) { 7814 secT = *(tCE-1); 7815 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 7816 // primary ignorables should not be considered on the case level when the strength is primary 7817 // otherwise, the CEs stop being well-formed 7818 secT &= UCOL_TERT_CASE_MASK; 7819 secT ^= caseSwitch; 7820 } else { 7821 secT = 0; 7822 } 7823 } else { 7824 secT = 0; 7825 } 7826 } 7827 7828 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { 7829 result = UCOL_LESS; 7830 goto commonReturn; 7831 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) { 7832 result = UCOL_GREATER; 7833 goto commonReturn; 7834 } 7835 7836 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { 7837 break; 7838 } else { 7839 secS = 0; 7840 secT = 0; 7841 } 7842 } 7843 } 7844 7845 /* Tertiary level */ 7846 if(checkTertiary) { 7847 secS = 0; 7848 secT = 0; 7849 sCE = sCEs.buf; 7850 tCE = tCEs.buf; 7851 for(;;) { 7852 while((secS & UCOL_REMOVE_CASE) == 0) { 7853 secS = *(sCE++) & tertiaryMask; 7854 if(!isContinuation(secS)) { 7855 secS ^= caseSwitch; 7856 } else { 7857 secS &= UCOL_REMOVE_CASE; 7858 } 7859 } 7860 7861 while((secT & UCOL_REMOVE_CASE) == 0) { 7862 secT = *(tCE++) & 
tertiaryMask; 7863 if(!isContinuation(secT)) { 7864 secT ^= caseSwitch; 7865 } else { 7866 secT &= UCOL_REMOVE_CASE; 7867 } 7868 } 7869 7870 if(secS == secT) { 7871 if((secS & UCOL_REMOVE_CASE) == 1) { 7872 break; 7873 } else { 7874 secS = 0; secT = 0; 7875 continue; 7876 } 7877 } else { 7878 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7879 goto commonReturn; 7880 } 7881 } 7882 } 7883 7884 7885 if(qShifted /*checkQuad*/) { 7886 UBool sInShifted = TRUE; 7887 UBool tInShifted = TRUE; 7888 secS = 0; 7889 secT = 0; 7890 sCE = sCEs.buf; 7891 tCE = tCEs.buf; 7892 for(;;) { 7893 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) { 7894 secS = *(sCE++); 7895 if(isContinuation(secS)) { 7896 if(!sInShifted) { 7897 continue; 7898 } 7899 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */ 7900 secS = UCOL_PRIMARYMASK; 7901 sInShifted = FALSE; 7902 } else { 7903 sInShifted = TRUE; 7904 } 7905 } 7906 secS &= UCOL_PRIMARYMASK; 7907 7908 7909 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) { 7910 secT = *(tCE++); 7911 if(isContinuation(secT)) { 7912 if(!tInShifted) { 7913 continue; 7914 } 7915 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { 7916 secT = UCOL_PRIMARYMASK; 7917 tInShifted = FALSE; 7918 } else { 7919 tInShifted = TRUE; 7920 } 7921 } 7922 secT &= UCOL_PRIMARYMASK; 7923 7924 if(secS == secT) { 7925 if(secS == UCOL_NO_MORE_CES_PRIMARY) { 7926 break; 7927 } else { 7928 secS = 0; secT = 0; 7929 continue; 7930 } 7931 } else { 7932 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7933 goto commonReturn; 7934 } 7935 } 7936 } else if(doHiragana && hirResult != UCOL_EQUAL) { 7937 // If we're fine on quaternaries, we might be different 7938 // on Hiragana. This, however, might fail us in shifted. 
7939 result = hirResult; 7940 goto commonReturn; 7941 } 7942 7943 /* For IDENTICAL comparisons, we use a bitwise character comparison */ 7944 /* as a tiebreaker if all else is equal. */ 7945 /* Getting here should be quite rare - strings are not identical - */ 7946 /* that is checked first, but compared == through all other checks. */ 7947 if(checkIdent) 7948 { 7949 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON); 7950 result = ucol_checkIdent(sColl, tColl, TRUE, status); 7951 } 7952 7953 commonReturn: 7954 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { 7955 if (sCEs.buf != sCEs.localArray ) { 7956 uprv_free(sCEs.buf); 7957 } 7958 if (tCEs.buf != tCEs.localArray ) { 7959 uprv_free(tCEs.buf); 7960 } 7961 } 7962 7963 return result; 7964 } 7965 7966 static UCollationResult 7967 ucol_strcollRegular(const UCollator *coll, 7968 const UChar *source, int32_t sourceLength, 7969 const UChar *target, int32_t targetLength, 7970 UErrorCode *status) { 7971 collIterate sColl, tColl; 7972 // Preparing the context objects for iterating over strings 7973 IInit_collIterate(coll, source, sourceLength, &sColl, status); 7974 IInit_collIterate(coll, target, targetLength, &tColl, status); 7975 if(U_FAILURE(*status)) { 7976 return UCOL_LESS; 7977 } 7978 return ucol_strcollRegular(&sColl, &tColl, status); 7979 } 7980 7981 static inline uint32_t 7982 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, 7983 uint32_t CE, const UChar *s, int32_t *index, int32_t len) 7984 { 7985 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); 7986 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; 7987 int32_t offset = 1; 7988 UChar schar = 0, tchar = 0; 7989 7990 for(;;) { 7991 if(len == -1) { 7992 if(s[*index] == 0) { // end of string 7993 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7994 } else { 7995 schar = s[*index]; 7996 } 7997 } else { 7998 if(*index == len) { 7999 
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8000 } else { 8001 schar = s[*index]; 8002 } 8003 } 8004 8005 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 8006 offset++; 8007 } 8008 8009 if (schar == tchar) { 8010 (*index)++; 8011 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 8012 } 8013 else 8014 { 8015 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 8016 return UCOL_BAIL_OUT_CE; 8017 } 8018 // skip completely ignorables 8019 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 8020 if(isZeroCE == 0) { // we have to ignore completely ignorables 8021 (*index)++; 8022 continue; 8023 } 8024 8025 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8026 } 8027 } 8028 } 8029 8030 8031 /** 8032 * This is a fast strcoll, geared towards text in Latin-1. 8033 * It supports contractions of size two, French secondaries 8034 * and case switching. You can use it with strengths primary 8035 * to tertiary. It does not support shifted and case level. 8036 * It relies on the table build by setupLatin1Table. If it 8037 * doesn't understand something, it will go to the regular 8038 * strcoll. 
8039 */ 8040 static UCollationResult 8041 ucol_strcollUseLatin1( const UCollator *coll, 8042 const UChar *source, 8043 int32_t sLen, 8044 const UChar *target, 8045 int32_t tLen, 8046 UErrorCode *status) 8047 { 8048 U_ALIGN_CODE(16); 8049 int32_t strength = coll->strength; 8050 8051 int32_t sIndex = 0, tIndex = 0; 8052 UChar sChar = 0, tChar = 0; 8053 uint32_t sOrder=0, tOrder=0; 8054 8055 UBool endOfSource = FALSE; 8056 8057 uint32_t *elements = coll->latinOneCEs; 8058 8059 UBool haveContractions = FALSE; // if we have contractions in our string 8060 // we cannot do French secondary 8061 8062 // Do the primary level 8063 for(;;) { 8064 while(sOrder==0) { // this loop skips primary ignorables 8065 // sOrder=getNextlatinOneCE(source); 8066 if(sLen==-1) { // handling zero terminated strings 8067 sChar=source[sIndex++]; 8068 if(sChar==0) { 8069 endOfSource = TRUE; 8070 break; 8071 } 8072 } else { // handling strings with known length 8073 if(sIndex==sLen) { 8074 endOfSource = TRUE; 8075 break; 8076 } 8077 sChar=source[sIndex++]; 8078 } 8079 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 8080 //fprintf(stderr, "R"); 8081 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8082 } 8083 sOrder = elements[sChar]; 8084 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special 8085 // specials can basically be either contractions or bail-out signs. If we get anything 8086 // else, we'll bail out anywasy 8087 if(getCETag(sOrder) == CONTRACTION_TAG) { 8088 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); 8089 haveContractions = TRUE; // if there are contractions, we cannot do French secondary 8090 // However, if there are contractions in the table, but we always use just one char, 8091 // we might be able to do French. This should be checked out. 
8092 } 8093 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 8094 //fprintf(stderr, "S"); 8095 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8096 } 8097 } 8098 } 8099 8100 while(tOrder==0) { // this loop skips primary ignorables 8101 // tOrder=getNextlatinOneCE(target); 8102 if(tLen==-1) { // handling zero terminated strings 8103 tChar=target[tIndex++]; 8104 if(tChar==0) { 8105 if(endOfSource) { // this is different than source loop, 8106 // as we already know that source loop is done here, 8107 // so we can either finish the primary loop if both 8108 // strings are done or anounce the result if only 8109 // target is done. Same below. 8110 goto endOfPrimLoop; 8111 } else { 8112 return UCOL_GREATER; 8113 } 8114 } 8115 } else { // handling strings with known length 8116 if(tIndex==tLen) { 8117 if(endOfSource) { 8118 goto endOfPrimLoop; 8119 } else { 8120 return UCOL_GREATER; 8121 } 8122 } 8123 tChar=target[tIndex++]; 8124 } 8125 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 8126 //fprintf(stderr, "R"); 8127 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8128 } 8129 tOrder = elements[tChar]; 8130 if(tOrder >= UCOL_NOT_FOUND) { 8131 // Handling specials, see the comments for source 8132 if(getCETag(tOrder) == CONTRACTION_TAG) { 8133 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); 8134 haveContractions = TRUE; 8135 } 8136 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 8137 //fprintf(stderr, "S"); 8138 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8139 } 8140 } 8141 } 8142 if(endOfSource) { // source is finished, but target is not, say the result. 
8143 return UCOL_LESS; 8144 } 8145 8146 if(sOrder == tOrder) { // if we have same CEs, we continue the loop 8147 sOrder = 0; tOrder = 0; 8148 continue; 8149 } else { 8150 // compare current top bytes 8151 if(((sOrder^tOrder)&0xFF000000)!=0) { 8152 // top bytes differ, return difference 8153 if(sOrder < tOrder) { 8154 return UCOL_LESS; 8155 } else if(sOrder > tOrder) { 8156 return UCOL_GREATER; 8157 } 8158 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); 8159 // since we must return enum value 8160 } 8161 8162 // top bytes match, continue with following bytes 8163 sOrder<<=8; 8164 tOrder<<=8; 8165 } 8166 } 8167 8168 endOfPrimLoop: 8169 // after primary loop, we definitely know the sizes of strings, 8170 // so we set it and use simpler loop for secondaries and tertiaries 8171 sLen = sIndex; tLen = tIndex; 8172 if(strength >= UCOL_SECONDARY) { 8173 // adjust the table beggining 8174 elements += coll->latinOneTableLen; 8175 endOfSource = FALSE; 8176 8177 if(coll->frenchCollation == UCOL_OFF) { // non French 8178 // This loop is a simplified copy of primary loop 8179 // at this point we know that whole strings are latin-1, so we don't 8180 // check for that. We also know that we only have contractions as 8181 // specials. 
8182 sIndex = 0; tIndex = 0; 8183 for(;;) { 8184 while(sOrder==0) { 8185 if(sIndex==sLen) { 8186 endOfSource = TRUE; 8187 break; 8188 } 8189 sChar=source[sIndex++]; 8190 sOrder = elements[sChar]; 8191 if(sOrder > UCOL_NOT_FOUND) { 8192 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); 8193 } 8194 } 8195 8196 while(tOrder==0) { 8197 if(tIndex==tLen) { 8198 if(endOfSource) { 8199 goto endOfSecLoop; 8200 } else { 8201 return UCOL_GREATER; 8202 } 8203 } 8204 tChar=target[tIndex++]; 8205 tOrder = elements[tChar]; 8206 if(tOrder > UCOL_NOT_FOUND) { 8207 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); 8208 } 8209 } 8210 if(endOfSource) { 8211 return UCOL_LESS; 8212 } 8213 8214 if(sOrder == tOrder) { 8215 sOrder = 0; tOrder = 0; 8216 continue; 8217 } else { 8218 // see primary loop for comments on this 8219 if(((sOrder^tOrder)&0xFF000000)!=0) { 8220 if(sOrder < tOrder) { 8221 return UCOL_LESS; 8222 } else if(sOrder > tOrder) { 8223 return UCOL_GREATER; 8224 } 8225 } 8226 sOrder<<=8; 8227 tOrder<<=8; 8228 } 8229 } 8230 } else { // French 8231 if(haveContractions) { // if we have contractions, we have to bail out 8232 // since we don't really know how to handle them here 8233 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8234 } 8235 // For French, we go backwards 8236 sIndex = sLen; tIndex = tLen; 8237 for(;;) { 8238 while(sOrder==0) { 8239 if(sIndex==0) { 8240 endOfSource = TRUE; 8241 break; 8242 } 8243 sChar=source[--sIndex]; 8244 sOrder = elements[sChar]; 8245 // don't even look for contractions 8246 } 8247 8248 while(tOrder==0) { 8249 if(tIndex==0) { 8250 if(endOfSource) { 8251 goto endOfSecLoop; 8252 } else { 8253 return UCOL_GREATER; 8254 } 8255 } 8256 tChar=target[--tIndex]; 8257 tOrder = elements[tChar]; 8258 // don't even look for contractions 8259 } 8260 if(endOfSource) { 8261 return UCOL_LESS; 8262 } 8263 8264 if(sOrder == tOrder) { 8265 sOrder = 0; tOrder = 
0; 8266 continue; 8267 } else { 8268 // see the primary loop for comments 8269 if(((sOrder^tOrder)&0xFF000000)!=0) { 8270 if(sOrder < tOrder) { 8271 return UCOL_LESS; 8272 } else if(sOrder > tOrder) { 8273 return UCOL_GREATER; 8274 } 8275 } 8276 sOrder<<=8; 8277 tOrder<<=8; 8278 } 8279 } 8280 } 8281 } 8282 8283 endOfSecLoop: 8284 if(strength >= UCOL_TERTIARY) { 8285 // tertiary loop is the same as secondary (except no French) 8286 elements += coll->latinOneTableLen; 8287 sIndex = 0; tIndex = 0; 8288 endOfSource = FALSE; 8289 for(;;) { 8290 while(sOrder==0) { 8291 if(sIndex==sLen) { 8292 endOfSource = TRUE; 8293 break; 8294 } 8295 sChar=source[sIndex++]; 8296 sOrder = elements[sChar]; 8297 if(sOrder > UCOL_NOT_FOUND) { 8298 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); 8299 } 8300 } 8301 while(tOrder==0) { 8302 if(tIndex==tLen) { 8303 if(endOfSource) { 8304 return UCOL_EQUAL; // if both strings are at the end, they are equal 8305 } else { 8306 return UCOL_GREATER; 8307 } 8308 } 8309 tChar=target[tIndex++]; 8310 tOrder = elements[tChar]; 8311 if(tOrder > UCOL_NOT_FOUND) { 8312 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); 8313 } 8314 } 8315 if(endOfSource) { 8316 return UCOL_LESS; 8317 } 8318 if(sOrder == tOrder) { 8319 sOrder = 0; tOrder = 0; 8320 continue; 8321 } else { 8322 if(((sOrder^tOrder)&0xff000000)!=0) { 8323 if(sOrder < tOrder) { 8324 return UCOL_LESS; 8325 } else if(sOrder > tOrder) { 8326 return UCOL_GREATER; 8327 } 8328 } 8329 sOrder<<=8; 8330 tOrder<<=8; 8331 } 8332 } 8333 } 8334 return UCOL_EQUAL; 8335 } 8336 8337 8338 U_CAPI UCollationResult U_EXPORT2 8339 ucol_strcollIter( const UCollator *coll, 8340 UCharIterator *sIter, 8341 UCharIterator *tIter, 8342 UErrorCode *status) 8343 { 8344 if(!status || U_FAILURE(*status)) { 8345 return UCOL_EQUAL; 8346 } 8347 8348 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); 8349 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", 
coll, sIter, tIter); 8350 8351 if (sIter == tIter) { 8352 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8353 return UCOL_EQUAL; 8354 } 8355 if(sIter == NULL || tIter == NULL || coll == NULL) { 8356 *status = U_ILLEGAL_ARGUMENT_ERROR; 8357 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8358 return UCOL_EQUAL; 8359 } 8360 8361 UCollationResult result = UCOL_EQUAL; 8362 8363 // Preparing the context objects for iterating over strings 8364 collIterate sColl, tColl; 8365 IInit_collIterate(coll, NULL, -1, &sColl, status); 8366 IInit_collIterate(coll, NULL, -1, &tColl, status); 8367 if(U_FAILURE(*status)) { 8368 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8369 return UCOL_EQUAL; 8370 } 8371 // The division for the array length may truncate the array size to 8372 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 8373 // for all platforms anyway. 8374 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8375 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8376 UNormIterator *sNormIter = NULL, *tNormIter = NULL; 8377 8378 sColl.iterator = sIter; 8379 sColl.flags |= UCOL_USE_ITERATOR; 8380 tColl.flags |= UCOL_USE_ITERATOR; 8381 tColl.iterator = tIter; 8382 8383 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 8384 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 8385 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); 8386 sColl.flags &= ~UCOL_ITER_NORM; 8387 8388 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 8389 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); 8390 tColl.flags &= ~UCOL_ITER_NORM; 8391 } 8392 8393 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; 8394 8395 while((sChar = sColl.iterator->next(sColl.iterator)) == 8396 (tChar = tColl.iterator->next(tColl.iterator))) { 8397 if(sChar == U_SENTINEL) { 8398 result = UCOL_EQUAL; 8399 goto end_compare; 8400 } 8401 } 8402 8403 if(sChar 
== U_SENTINEL) { 8404 tChar = tColl.iterator->previous(tColl.iterator); 8405 } 8406 8407 if(tChar == U_SENTINEL) { 8408 sChar = sColl.iterator->previous(sColl.iterator); 8409 } 8410 8411 sChar = sColl.iterator->previous(sColl.iterator); 8412 tChar = tColl.iterator->previous(tColl.iterator); 8413 8414 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) 8415 { 8416 // We are stopped in the middle of a contraction. 8417 // Scan backwards through the == part of the string looking for the start of the contraction. 8418 // It doesn't matter which string we scan, since they are the same in this region. 8419 do 8420 { 8421 sChar = sColl.iterator->previous(sColl.iterator); 8422 tChar = tColl.iterator->previous(tColl.iterator); 8423 } 8424 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); 8425 } 8426 8427 8428 if(U_SUCCESS(*status)) { 8429 result = ucol_strcollRegular(&sColl, &tColl, status); 8430 } 8431 8432 end_compare: 8433 if(sNormIter || tNormIter) { 8434 unorm_closeIter(sNormIter); 8435 unorm_closeIter(tNormIter); 8436 } 8437 8438 UTRACE_EXIT_VALUE_STATUS(result, *status) 8439 return result; 8440 } 8441 8442 8443 /* */ 8444 /* ucol_strcoll Main public API string comparison function */ 8445 /* */ 8446 U_CAPI UCollationResult U_EXPORT2 8447 ucol_strcoll( const UCollator *coll, 8448 const UChar *source, 8449 int32_t sourceLength, 8450 const UChar *target, 8451 int32_t targetLength) 8452 { 8453 U_ALIGN_CODE(16); 8454 8455 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); 8456 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 8457 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 8458 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); 8459 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); 8460 } 8461 8462 if(source == NULL || target == NULL) { 8463 // do not crash, but return. Should have 8464 // status argument to return error. 
8465 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8466 return UCOL_EQUAL; 8467 } 8468 8469 /* Quick check if source and target are same strings. */ 8470 /* They should either both be NULL terminated or the explicit length should be set on both. */ 8471 if (source==target && sourceLength==targetLength) { 8472 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8473 return UCOL_EQUAL; 8474 } 8475 8476 /* Scan the strings. Find: */ 8477 /* The length of any leading portion that is equal */ 8478 /* Whether they are exactly equal. (in which case we just return) */ 8479 const UChar *pSrc = source; 8480 const UChar *pTarg = target; 8481 int32_t equalLength; 8482 8483 if (sourceLength == -1 && targetLength == -1) { 8484 // Both strings are null terminated. 8485 // Scan through any leading equal portion. 8486 while (*pSrc == *pTarg && *pSrc != 0) { 8487 pSrc++; 8488 pTarg++; 8489 } 8490 if (*pSrc == 0 && *pTarg == 0) { 8491 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8492 return UCOL_EQUAL; 8493 } 8494 equalLength = (int32_t)(pSrc - source); 8495 } 8496 else 8497 { 8498 // One or both strings has an explicit length. 8499 const UChar *pSrcEnd = source + sourceLength; 8500 const UChar *pTargEnd = target + targetLength; 8501 8502 // Scan while the strings are bitwise ==, or until one is exhausted. 8503 for (;;) { 8504 if (pSrc == pSrcEnd || pTarg == pTargEnd) { 8505 break; 8506 } 8507 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 8508 break; 8509 } 8510 if (*pSrc != *pTarg) { 8511 break; 8512 } 8513 pSrc++; 8514 pTarg++; 8515 } 8516 equalLength = (int32_t)(pSrc - source); 8517 8518 // If we made it all the way through both strings, we are done. They are == 8519 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. 
*/ 8520 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */ 8521 { 8522 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8523 return UCOL_EQUAL; 8524 } 8525 } 8526 if (equalLength > 0) { 8527 /* There is an identical portion at the beginning of the two strings. */ 8528 /* If the identical portion ends within a contraction or a comibining */ 8529 /* character sequence, back up to the start of that sequence. */ 8530 8531 // These values should already be set by the code above. 8532 //pSrc = source + equalLength; /* point to the first differing chars */ 8533 //pTarg = target + equalLength; 8534 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) || 8535 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))) 8536 { 8537 // We are stopped in the middle of a contraction. 8538 // Scan backwards through the == part of the string looking for the start of the contraction. 8539 // It doesn't matter which string we scan, since they are the same in this region. 
8540 do 8541 { 8542 equalLength--; 8543 pSrc--; 8544 } 8545 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); 8546 } 8547 8548 source += equalLength; 8549 target += equalLength; 8550 if (sourceLength > 0) { 8551 sourceLength -= equalLength; 8552 } 8553 if (targetLength > 0) { 8554 targetLength -= equalLength; 8555 } 8556 } 8557 8558 UErrorCode status = U_ZERO_ERROR; 8559 UCollationResult returnVal; 8560 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { 8561 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status); 8562 } else { 8563 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); 8564 } 8565 UTRACE_EXIT_VALUE(returnVal); 8566 return returnVal; 8567 } 8568 8569 /* convenience function for comparing strings */ 8570 U_CAPI UBool U_EXPORT2 8571 ucol_greater( const UCollator *coll, 8572 const UChar *source, 8573 int32_t sourceLength, 8574 const UChar *target, 8575 int32_t targetLength) 8576 { 8577 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8578 == UCOL_GREATER); 8579 } 8580 8581 /* convenience function for comparing strings */ 8582 U_CAPI UBool U_EXPORT2 8583 ucol_greaterOrEqual( const UCollator *coll, 8584 const UChar *source, 8585 int32_t sourceLength, 8586 const UChar *target, 8587 int32_t targetLength) 8588 { 8589 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8590 != UCOL_LESS); 8591 } 8592 8593 /* convenience function for comparing strings */ 8594 U_CAPI UBool U_EXPORT2 8595 ucol_equal( const UCollator *coll, 8596 const UChar *source, 8597 int32_t sourceLength, 8598 const UChar *target, 8599 int32_t targetLength) 8600 { 8601 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8602 == UCOL_EQUAL); 8603 } 8604 8605 U_CAPI void U_EXPORT2 8606 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { 8607 if(coll && coll->UCA) { 8608 uprv_memcpy(info, 
coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); 8609 } 8610 } 8611 8612 #endif /* #if !UCONFIG_NO_COLLATION */ 8613