/*
*******************************************************************************
*   Copyright (C) 1996-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  ucol.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
* Modification history
* Date        Name      Comments
* 1996-1999   various members of ICU team maintained C API for collation framework
* 02/16/2001  synwee    Added internal method getPrevSpecialCE
* 03/01/2001  synwee    Added maxexpansion functionality.
* 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/coleitr.h"
#include "unicode/unorm.h"
#include "unicode/udata.h"
#include "unicode/ustring.h"

#include "ucol_imp.h"
#include "bocsu.h"

#include "normalizer2impl.h"
#include "unorm_it.h"
#include "umutex.h"
#include "cmemory.h"
#include "ucln_in.h"
#include "cstring.h"
#include "utracimp.h"
#include "putilimp.h"
#include "uassert.h"

#ifdef UCOL_DEBUG
#include <stdio.h>
#endif

U_NAMESPACE_USE

// Number of elements in a true array (not a pointer), as an int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

#define LAST_BYTE_MASK_           0xFF
#define SECOND_LAST_BYTE_SHIFT_   8

#define ZERO_CC_LIMIT_            0xC0

// This is a static pointer to the normalizer's fcdTrieIndex.
// It is always the same between calls to u_cleanup
// and therefore writing to it is not synchronized.
// It is reset to NULL in ucol_cleanup.
static const uint16_t *fcdTrieIndex=NULL;
// Code points at fcdHighStart and above have a zero FCD value.
60 static UChar32 fcdHighStart = 0; 61 62 // These are values from UCA required for 63 // implicit generation and supressing sort key compression 64 // they should regularly be in the UCA, but if one 65 // is running without UCA, it could be a problem 66 static const int32_t maxRegularPrimary = 0x7A; 67 static const int32_t minImplicitPrimary = 0xE0; 68 static const int32_t maxImplicitPrimary = 0xE4; 69 70 U_CDECL_BEGIN 71 static UBool U_CALLCONV 72 ucol_cleanup(void) 73 { 74 fcdTrieIndex = NULL; 75 return TRUE; 76 } 77 78 static int32_t U_CALLCONV 79 _getFoldingOffset(uint32_t data) { 80 return (int32_t)(data&0xFFFFFF); 81 } 82 83 U_CDECL_END 84 85 // init FCD data 86 static inline 87 UBool initializeFCD(UErrorCode *status) { 88 if (fcdTrieIndex != NULL) { 89 return TRUE; 90 } else { 91 // The result is constant, until the library is reloaded. 92 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 93 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 94 return U_SUCCESS(*status); 95 } 96 } 97 98 static 99 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, 100 int32_t sourceLen, collIterate *s, 101 UErrorCode *status) 102 { 103 (s)->string = (s)->pos = sourceString; 104 (s)->origFlags = 0; 105 (s)->flags = 0; 106 if (sourceLen >= 0) { 107 s->flags |= UCOL_ITER_HASLEN; 108 (s)->endp = (UChar *)sourceString+sourceLen; 109 } 110 else { 111 /* change to enable easier checking for end of string for fcdpositon */ 112 (s)->endp = NULL; 113 } 114 (s)->extendCEs = NULL; 115 (s)->extendCEsSize = 0; 116 (s)->CEpos = (s)->toReturn = (s)->CEs; 117 (s)->offsetBuffer = NULL; 118 (s)->offsetBufferSize = 0; 119 (s)->offsetReturn = (s)->offsetStore = NULL; 120 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; 121 (s)->coll = (collator); 122 (s)->nfd = Normalizer2Factory::getNFDInstance(*status); 123 (s)->fcdPosition = 0; 124 if(collator->normalizationMode == UCOL_ON) { 125 (s)->flags |= UCOL_ITER_NORM; 126 } 127 
if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { 128 (s)->flags |= UCOL_HIRAGANA_Q; 129 } 130 (s)->iterator = NULL; 131 //(s)->iteratorIndex = 0; 132 } 133 134 U_CAPI void U_EXPORT2 135 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, 136 int32_t sourceLen, collIterate *s, 137 UErrorCode *status) { 138 /* Out-of-line version for use from other files. */ 139 IInit_collIterate(collator, sourceString, sourceLen, s, status); 140 } 141 142 U_CAPI collIterate * U_EXPORT2 143 uprv_new_collIterate(UErrorCode *status) { 144 if(U_FAILURE(*status)) { 145 return NULL; 146 } 147 collIterate *s = new collIterate; 148 if(s == NULL) { 149 *status = U_MEMORY_ALLOCATION_ERROR; 150 return NULL; 151 } 152 return s; 153 } 154 155 U_CAPI void U_EXPORT2 156 uprv_delete_collIterate(collIterate *s) { 157 delete s; 158 } 159 160 U_CAPI UBool U_EXPORT2 161 uprv_collIterateAtEnd(collIterate *s) { 162 return s == NULL || s->pos == s->endp; 163 } 164 165 /** 166 * Backup the state of the collIterate struct data 167 * @param data collIterate to backup 168 * @param backup storage 169 */ 170 static 171 inline void backupState(const collIterate *data, collIterateState *backup) 172 { 173 backup->fcdPosition = data->fcdPosition; 174 backup->flags = data->flags; 175 backup->origFlags = data->origFlags; 176 backup->pos = data->pos; 177 backup->bufferaddress = data->writableBuffer.getBuffer(); 178 backup->buffersize = data->writableBuffer.length(); 179 backup->iteratorMove = 0; 180 backup->iteratorIndex = 0; 181 if(data->iterator != NULL) { 182 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); 183 backup->iteratorIndex = data->iterator->getState(data->iterator); 184 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE 185 if(backup->iteratorIndex == UITER_NO_STATE) { 186 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { 187 
backup->iteratorMove++; 188 data->iterator->move(data->iterator, -1, UITER_CURRENT); 189 } 190 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 191 } 192 } 193 } 194 195 /** 196 * Loads the state into the collIterate struct data 197 * @param data collIterate to backup 198 * @param backup storage 199 * @param forwards boolean to indicate if forwards iteration is used, 200 * false indicates backwards iteration 201 */ 202 static 203 inline void loadState(collIterate *data, const collIterateState *backup, 204 UBool forwards) 205 { 206 UErrorCode status = U_ZERO_ERROR; 207 data->flags = backup->flags; 208 data->origFlags = backup->origFlags; 209 if(data->iterator != NULL) { 210 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); 211 data->iterator->setState(data->iterator, backup->iteratorIndex, &status); 212 if(backup->iteratorMove != 0) { 213 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 214 } 215 } 216 data->pos = backup->pos; 217 218 if ((data->flags & UCOL_ITER_INNORMBUF) && 219 data->writableBuffer.getBuffer() != backup->bufferaddress) { 220 /* 221 this is when a new buffer has been reallocated and we'll have to 222 calculate the new position. 223 note the new buffer has to contain the contents of the old buffer. 224 */ 225 if (forwards) { 226 data->pos = data->writableBuffer.getTerminatedBuffer() + 227 (data->pos - backup->bufferaddress); 228 } 229 else { 230 /* backwards direction */ 231 int32_t temp = backup->buffersize - 232 (int32_t)(data->pos - backup->bufferaddress); 233 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp); 234 } 235 } 236 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 237 /* 238 this is alittle tricky. 239 if we are initially not in the normalization buffer, even if we 240 normalize in the later stage, the data in the buffer will be 241 ignored, since we skip back up to the data string. 
242 however if we are already in the normalization buffer, any 243 further normalization will pull data into the normalization 244 buffer and modify the fcdPosition. 245 since we are keeping the data in the buffer for use, the 246 fcdPosition can not be reverted back. 247 arrgghh.... 248 */ 249 data->fcdPosition = backup->fcdPosition; 250 } 251 } 252 253 static UBool 254 reallocCEs(collIterate *data, int32_t newCapacity) { 255 uint32_t *oldCEs = data->extendCEs; 256 if(oldCEs == NULL) { 257 oldCEs = data->CEs; 258 } 259 int32_t length = data->CEpos - oldCEs; 260 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); 261 if(newCEs == NULL) { 262 return FALSE; 263 } 264 uprv_memcpy(newCEs, oldCEs, length * 4); 265 uprv_free(data->extendCEs); 266 data->extendCEs = newCEs; 267 data->extendCEsSize = newCapacity; 268 data->CEpos = newCEs + length; 269 return TRUE; 270 } 271 272 static UBool 273 increaseCEsCapacity(collIterate *data) { 274 int32_t oldCapacity; 275 if(data->extendCEs != NULL) { 276 oldCapacity = data->extendCEsSize; 277 } else { 278 oldCapacity = LENGTHOF(data->CEs); 279 } 280 return reallocCEs(data, 2 * oldCapacity); 281 } 282 283 static UBool 284 ensureCEsCapacity(collIterate *data, int32_t minCapacity) { 285 int32_t oldCapacity; 286 if(data->extendCEs != NULL) { 287 oldCapacity = data->extendCEsSize; 288 } else { 289 oldCapacity = LENGTHOF(data->CEs); 290 } 291 if(minCapacity <= oldCapacity) { 292 return TRUE; 293 } 294 oldCapacity *= 2; 295 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity); 296 } 297 298 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) { 299 if(U_FAILURE(errorCode)) { 300 return; 301 } 302 int32_t length = offsetStore == NULL ? 
0 : (int32_t)(offsetStore - offsetBuffer); 303 if(length >= offsetBufferSize) { 304 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE; 305 int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4)); 306 if(newBuffer == NULL) { 307 errorCode = U_MEMORY_ALLOCATION_ERROR; 308 return; 309 } 310 if(length > 0) { 311 uprv_memcpy(newBuffer, offsetBuffer, length * 4); 312 } 313 uprv_free(offsetBuffer); 314 offsetBuffer = newBuffer; 315 offsetStore = offsetBuffer + length; 316 offsetBufferSize = newCapacity; 317 } 318 *offsetStore++ = offset; 319 } 320 321 /* 322 * collIter_eos() 323 * Checks for a collIterate being positioned at the end of 324 * its source string. 325 * 326 */ 327 static 328 inline UBool collIter_eos(collIterate *s) { 329 if(s->flags & UCOL_USE_ITERATOR) { 330 return !(s->iterator->hasNext(s->iterator)); 331 } 332 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { 333 // Null terminated string, but not at null, so not at end. 334 // Whether in main or normalization buffer doesn't matter. 335 return FALSE; 336 } 337 338 // String with length. Can't be in normalization buffer, which is always 339 // null termintated. 340 if (s->flags & UCOL_ITER_HASLEN) { 341 return (s->pos == s->endp); 342 } 343 344 // We are at a null termination, could be either normalization buffer or main string. 345 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { 346 // At null at end of main string. 347 return TRUE; 348 } 349 350 // At null at end of normalization buffer. Need to check whether there there are 351 // any characters left in the main buffer. 352 if(s->origFlags & UCOL_USE_ITERATOR) { 353 return !(s->iterator->hasNext(s->iterator)); 354 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { 355 // Null terminated main string. fcdPosition is the 'return' position into main buf. 356 return (*s->fcdPosition == 0); 357 } 358 else { 359 // Main string with an end pointer. 
360 return s->fcdPosition == s->endp; 361 } 362 } 363 364 /* 365 * collIter_bos() 366 * Checks for a collIterate being positioned at the start of 367 * its source string. 368 * 369 */ 370 static 371 inline UBool collIter_bos(collIterate *source) { 372 // if we're going backwards, we need to know whether there is more in the 373 // iterator, even if we are in the side buffer 374 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 375 return !source->iterator->hasPrevious(source->iterator); 376 } 377 if (source->pos <= source->string || 378 ((source->flags & UCOL_ITER_INNORMBUF) && 379 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { 380 return TRUE; 381 } 382 return FALSE; 383 } 384 385 /*static 386 inline UBool collIter_SimpleBos(collIterate *source) { 387 // if we're going backwards, we need to know whether there is more in the 388 // iterator, even if we are in the side buffer 389 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 390 return !source->iterator->hasPrevious(source->iterator); 391 } 392 if (source->pos == source->string) { 393 return TRUE; 394 } 395 return FALSE; 396 }*/ 397 //return (data->pos == data->string) || 398 399 400 /****************************************************************************/ 401 /* Following are the open/close functions */ 402 /* */ 403 /****************************************************************************/ 404 405 static UCollator* 406 ucol_initFromBinary(const uint8_t *bin, int32_t length, 407 const UCollator *base, 408 UCollator *fillIn, 409 UErrorCode *status) 410 { 411 UCollator *result = fillIn; 412 if(U_FAILURE(*status)) { 413 return NULL; 414 } 415 /* 416 if(base == NULL) { 417 // we don't support null base yet 418 *status = U_ILLEGAL_ARGUMENT_ERROR; 419 return NULL; 420 } 421 */ 422 // We need these and we could be running without UCA 423 uprv_uca_initImplicitConstants(status); 424 UCATableHeader *colData = (UCATableHeader *)bin; 425 // 
do we want version check here? We're trying to figure out whether collators are compatible 426 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || 427 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || 428 colData->version[0] != UCOL_BUILDER_VERSION) 429 { 430 *status = U_COLLATOR_VERSION_MISMATCH; 431 return NULL; 432 } 433 else { 434 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { 435 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); 436 if(U_FAILURE(*status)){ 437 return NULL; 438 } 439 result->hasRealData = TRUE; 440 } 441 else { 442 if(base) { 443 result = ucol_initCollator(base->image, result, base, status); 444 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); 445 if(U_FAILURE(*status)){ 446 return NULL; 447 } 448 result->hasRealData = FALSE; 449 } 450 else { 451 *status = U_USELESS_COLLATOR_ERROR; 452 return NULL; 453 } 454 } 455 result->freeImageOnClose = FALSE; 456 } 457 result->actualLocale = NULL; 458 result->validLocale = NULL; 459 result->requestedLocale = NULL; 460 result->rules = NULL; 461 result->rulesLength = 0; 462 result->freeRulesOnClose = FALSE; 463 result->ucaRules = NULL; 464 return result; 465 } 466 467 U_CAPI UCollator* U_EXPORT2 468 ucol_openBinary(const uint8_t *bin, int32_t length, 469 const UCollator *base, 470 UErrorCode *status) 471 { 472 return ucol_initFromBinary(bin, length, base, NULL, status); 473 } 474 475 U_CAPI int32_t U_EXPORT2 476 ucol_cloneBinary(const UCollator *coll, 477 uint8_t *buffer, int32_t capacity, 478 UErrorCode *status) 479 { 480 int32_t length = 0; 481 if(U_FAILURE(*status)) { 482 return length; 483 } 484 if(capacity < 0) { 485 *status = U_ILLEGAL_ARGUMENT_ERROR; 486 return length; 487 } 488 if(coll->hasRealData == TRUE) { 489 length = coll->image->size; 490 if(length <= capacity) { 491 
uprv_memcpy(buffer, coll->image, length); 492 } else { 493 *status = U_BUFFER_OVERFLOW_ERROR; 494 } 495 } else { 496 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 497 if(length <= capacity) { 498 /* build the UCATableHeader with minimal entries */ 499 /* do not copy the header from the UCA file because its values are wrong! */ 500 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 501 502 /* reset everything */ 503 uprv_memset(buffer, 0, length); 504 505 /* set the tailoring-specific values */ 506 UCATableHeader *myData = (UCATableHeader *)buffer; 507 myData->size = length; 508 509 /* offset for the options, the only part of the data that is present after the header */ 510 myData->options = sizeof(UCATableHeader); 511 512 /* need to always set the expansion value for an upper bound of the options */ 513 myData->expansion = myData->options + sizeof(UColOptionSet); 514 515 myData->magic = UCOL_HEADER_MAGIC; 516 myData->isBigEndian = U_IS_BIG_ENDIAN; 517 myData->charSetFamily = U_CHARSET_FAMILY; 518 519 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 520 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 521 522 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 523 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 524 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); 525 myData->jamoSpecial = coll->image->jamoSpecial; 526 527 /* copy the collator options */ 528 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 529 } else { 530 *status = U_BUFFER_OVERFLOW_ERROR; 531 } 532 } 533 return length; 534 } 535 536 U_CAPI UCollator* U_EXPORT2 537 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) 538 { 539 UCollator * localCollator; 540 int32_t bufferSizeNeeded = 
(int32_t)sizeof(UCollator); 541 char *stackBufferChars = (char *)stackBuffer; 542 int32_t imageSize = 0; 543 int32_t rulesSize = 0; 544 int32_t rulesPadding = 0; 545 uint8_t *image; 546 UChar *rules; 547 UBool colAllocated = FALSE; 548 UBool imageAllocated = FALSE; 549 550 if (status == NULL || U_FAILURE(*status)){ 551 return 0; 552 } 553 if ((stackBuffer && !pBufferSize) || !coll){ 554 *status = U_ILLEGAL_ARGUMENT_ERROR; 555 return 0; 556 } 557 if (coll->rules && coll->freeRulesOnClose) { 558 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); 559 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); 560 bufferSizeNeeded += rulesSize + rulesPadding; 561 } 562 563 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ 564 *pBufferSize = bufferSizeNeeded; 565 return 0; 566 } 567 568 /* Pointers on 64-bit platforms need to be aligned 569 * on a 64-bit boundry in memory. 570 */ 571 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 572 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); 573 if (*pBufferSize > offsetUp) { 574 *pBufferSize -= offsetUp; 575 stackBufferChars += offsetUp; 576 } 577 else { 578 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */ 579 *pBufferSize = 1; 580 } 581 } 582 stackBuffer = (void *)stackBufferChars; 583 584 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) { 585 /* allocate one here...*/ 586 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); 587 // Null pointer check. 
588 if (stackBufferChars == NULL) { 589 *status = U_MEMORY_ALLOCATION_ERROR; 590 return NULL; 591 } 592 colAllocated = TRUE; 593 if (U_SUCCESS(*status)) { 594 *status = U_SAFECLONE_ALLOCATED_WARNING; 595 } 596 } 597 localCollator = (UCollator *)stackBufferChars; 598 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); 599 { 600 UErrorCode tempStatus = U_ZERO_ERROR; 601 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); 602 } 603 if (coll->freeImageOnClose) { 604 image = (uint8_t *)uprv_malloc(imageSize); 605 // Null pointer check 606 if (image == NULL) { 607 *status = U_MEMORY_ALLOCATION_ERROR; 608 return NULL; 609 } 610 ucol_cloneBinary(coll, image, imageSize, status); 611 imageAllocated = TRUE; 612 } 613 else { 614 image = (uint8_t *)coll->image; 615 } 616 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); 617 if (U_FAILURE(*status)) { 618 return NULL; 619 } 620 621 if (coll->rules) { 622 if (coll->freeRulesOnClose) { 623 localCollator->rules = u_strcpy(rules, coll->rules); 624 //bufferEnd += rulesSize; 625 } 626 else { 627 localCollator->rules = coll->rules; 628 } 629 localCollator->freeRulesOnClose = FALSE; 630 localCollator->rulesLength = coll->rulesLength; 631 } 632 633 int32_t i; 634 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 635 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); 636 } 637 // zero copies of pointers 638 localCollator->actualLocale = NULL; 639 localCollator->validLocale = NULL; 640 localCollator->requestedLocale = NULL; 641 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. 
642 localCollator->freeOnClose = colAllocated; 643 localCollator->freeImageOnClose = imageAllocated; 644 return localCollator; 645 } 646 647 U_CAPI void U_EXPORT2 648 ucol_close(UCollator *coll) 649 { 650 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 651 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 652 if(coll != NULL) { 653 // these are always owned by each UCollator struct, 654 // so we always free them 655 if(coll->validLocale != NULL) { 656 uprv_free(coll->validLocale); 657 } 658 if(coll->actualLocale != NULL) { 659 uprv_free(coll->actualLocale); 660 } 661 if(coll->requestedLocale != NULL) { 662 uprv_free(coll->requestedLocale); 663 } 664 if(coll->latinOneCEs != NULL) { 665 uprv_free(coll->latinOneCEs); 666 } 667 if(coll->options != NULL && coll->freeOptionsOnClose) { 668 uprv_free(coll->options); 669 } 670 if(coll->rules != NULL && coll->freeRulesOnClose) { 671 uprv_free((UChar *)coll->rules); 672 } 673 if(coll->image != NULL && coll->freeImageOnClose) { 674 uprv_free((UCATableHeader *)coll->image); 675 } 676 if(coll->leadBytePermutationTable != NULL) { 677 uprv_free(coll->leadBytePermutationTable); 678 } 679 if(coll->reorderCodes != NULL) { 680 uprv_free(coll->reorderCodes); 681 } 682 683 /* Here, it would be advisable to close: */ 684 /* - UData for UCA (unless we stuff it in the root resb */ 685 /* Again, do we need additional housekeeping... HMMM! */ 686 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); 687 if(coll->freeOnClose){ 688 /* for safeClone, if freeOnClose is FALSE, 689 don't free the other instance data */ 690 uprv_free(coll); 691 } 692 } 693 UTRACE_EXIT(); 694 } 695 696 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/ 697 /* you should be able to get the binary chunk to write out... 
Doesn't look very full now */ 698 U_CFUNC uint8_t* U_EXPORT2 699 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) 700 { 701 uint8_t *result = NULL; 702 if(U_FAILURE(*status)) { 703 return NULL; 704 } 705 if(coll->hasRealData == TRUE) { 706 *length = coll->image->size; 707 result = (uint8_t *)uprv_malloc(*length); 708 /* test for NULL */ 709 if (result == NULL) { 710 *status = U_MEMORY_ALLOCATION_ERROR; 711 return NULL; 712 } 713 uprv_memcpy(result, coll->image, *length); 714 } else { 715 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 716 result = (uint8_t *)uprv_malloc(*length); 717 /* test for NULL */ 718 if (result == NULL) { 719 *status = U_MEMORY_ALLOCATION_ERROR; 720 return NULL; 721 } 722 723 /* build the UCATableHeader with minimal entries */ 724 /* do not copy the header from the UCA file because its values are wrong! */ 725 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 726 727 /* reset everything */ 728 uprv_memset(result, 0, *length); 729 730 /* set the tailoring-specific values */ 731 UCATableHeader *myData = (UCATableHeader *)result; 732 myData->size = *length; 733 734 /* offset for the options, the only part of the data that is present after the header */ 735 myData->options = sizeof(UCATableHeader); 736 737 /* need to always set the expansion value for an upper bound of the options */ 738 myData->expansion = myData->options + sizeof(UColOptionSet); 739 740 myData->magic = UCOL_HEADER_MAGIC; 741 myData->isBigEndian = U_IS_BIG_ENDIAN; 742 myData->charSetFamily = U_CHARSET_FAMILY; 743 744 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 745 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 746 747 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 748 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 749 uprv_memcpy(myData->formatVersion, 
coll->image->formatVersion, sizeof(UVersionInfo)); 750 myData->jamoSpecial = coll->image->jamoSpecial; 751 752 /* copy the collator options */ 753 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 754 } 755 return result; 756 } 757 758 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { 759 if(U_FAILURE(*status)) { 760 return; 761 } 762 result->caseFirst = (UColAttributeValue)opts->caseFirst; 763 result->caseLevel = (UColAttributeValue)opts->caseLevel; 764 result->frenchCollation = (UColAttributeValue)opts->frenchCollation; 765 result->normalizationMode = (UColAttributeValue)opts->normalizationMode; 766 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) { 767 return; 768 } 769 result->strength = (UColAttributeValue)opts->strength; 770 result->variableTopValue = opts->variableTopValue; 771 result->alternateHandling = (UColAttributeValue)opts->alternateHandling; 772 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; 773 result->numericCollation = (UColAttributeValue)opts->numericCollation; 774 result->caseFirstisDefault = TRUE; 775 result->caseLevelisDefault = TRUE; 776 result->frenchCollationisDefault = TRUE; 777 result->normalizationModeisDefault = TRUE; 778 result->strengthisDefault = TRUE; 779 result->variableTopValueisDefault = TRUE; 780 result->alternateHandlingisDefault = TRUE; 781 result->hiraganaQisDefault = TRUE; 782 result->numericCollationisDefault = TRUE; 783 784 ucol_updateInternalState(result, status); 785 786 result->options = opts; 787 } 788 789 790 /** 791 * Approximate determination if a character is at a contraction end. 792 * Guaranteed to be TRUE if a character is at the end of a contraction, 793 * otherwise it is not deterministic. 
794 * @param c character to be determined 795 * @param coll collator 796 */ 797 static 798 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { 799 if (c < coll->minContrEndCP) { 800 return FALSE; 801 } 802 803 int32_t hash = c; 804 uint8_t htbyte; 805 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 806 if (U16_IS_TRAIL(c)) { 807 return TRUE; 808 } 809 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 810 } 811 htbyte = coll->contrEndCP[hash>>3]; 812 return (((htbyte >> (hash & 7)) & 1) == 1); 813 } 814 815 816 817 /* 818 * i_getCombiningClass() 819 * A fast, at least partly inline version of u_getCombiningClass() 820 * This is a candidate for further optimization. Used heavily 821 * in contraction processing. 822 */ 823 static 824 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { 825 uint8_t sCC = 0; 826 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { 827 sCC = u_getCombiningClass(c); 828 } 829 return sCC; 830 } 831 832 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { 833 UChar c; 834 UCollator *result = fillIn; 835 if(U_FAILURE(*status) || image == NULL) { 836 return NULL; 837 } 838 839 if(result == NULL) { 840 result = (UCollator *)uprv_malloc(sizeof(UCollator)); 841 if(result == NULL) { 842 *status = U_MEMORY_ALLOCATION_ERROR; 843 return result; 844 } 845 result->freeOnClose = TRUE; 846 } else { 847 result->freeOnClose = FALSE; 848 } 849 850 result->image = image; 851 result->mapping.getFoldingOffset = _getFoldingOffset; 852 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; 853 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); 854 if(U_FAILURE(*status)) { 855 if(result->freeOnClose == TRUE) { 856 uprv_free(result); 857 result = NULL; 858 } 859 return result; 860 } 861 862 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); 863 
result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); 864 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); 865 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); 866 result->rules = NULL; 867 result->rulesLength = 0; 868 result->freeRulesOnClose = FALSE; 869 result->reorderCodes = NULL; 870 result->reorderCodesLength = 0; 871 result->leadBytePermutationTable = NULL; 872 873 /* get the version info from UCATableHeader and populate the Collator struct*/ 874 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ 875 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ 876 result->dataVersion[2] = 0; 877 result->dataVersion[3] = 0; 878 879 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; 880 result->minUnsafeCP = 0; 881 for (c=0; c<0x300; c++) { // Find the smallest unsafe char. 882 if (ucol_unsafeCP(c, result)) break; 883 } 884 result->minUnsafeCP = c; 885 886 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; 887 result->minContrEndCP = 0; 888 for (c=0; c<0x300; c++) { // Find the Contraction-ending char. 889 if (ucol_contractionEndCP(c, result)) break; 890 } 891 result->minContrEndCP = c; 892 893 /* max expansion tables */ 894 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + 895 result->image->endExpansionCE); 896 result->lastEndExpansionCE = result->endExpansionCE + 897 result->image->endExpansionCECount - 1; 898 result->expansionCESize = (uint8_t*)result->image + 899 result->image->expansionCESize; 900 901 902 //result->errorCode = *status; 903 904 result->latinOneCEs = NULL; 905 906 result->latinOneRegenTable = FALSE; 907 result->latinOneFailed = FALSE; 908 result->UCA = UCA; 909 910 /* Normally these will be set correctly later. This is the default if you use UCA or the default. 
*/ 911 result->ucaRules = NULL; 912 result->actualLocale = NULL; 913 result->validLocale = NULL; 914 result->requestedLocale = NULL; 915 result->hasRealData = FALSE; // real data lives in .dat file... 916 result->freeImageOnClose = FALSE; 917 918 /* set attributes */ 919 ucol_setOptionsFromHeader( 920 result, 921 (UColOptionSet*)((uint8_t*)result->image+result->image->options), 922 status); 923 result->freeOptionsOnClose = FALSE; 924 925 return result; 926 } 927 928 /* new Mark's code */ 929 930 /** 931 * For generation of Implicit CEs 932 * @author Davis 933 * 934 * Cleaned up so that changes can be made more easily. 935 * Old values: 936 # First Implicit: E26A792D 937 # Last Implicit: E3DC70C0 938 # First CJK: E0030300 939 # Last CJK: E0A9DD00 940 # First CJK_A: E0A9DF00 941 # Last CJK_A: E0DE3100 942 */ 943 /* Following is a port of Mark's code for new treatment of implicits. 944 * It is positioned here, since ucol_initUCA need to initialize the 945 * variables below according to the data in the fractional UCA. 946 */ 947 948 /** 949 * Function used to: 950 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and 951 * b) bump any non-CJK characters by 10FFFF. 952 * The relevant blocks are: 953 * A: 4E00..9FFF; CJK Unified Ideographs 954 * F900..FAFF; CJK Compatibility Ideographs 955 * B: 3400..4DBF; CJK Unified Ideographs Extension A 956 * 20000..XX; CJK Unified Ideographs Extension B (and others later on) 957 * As long as 958 * no new B characters are allocated between 4E00 and FAFF, and 959 * no new A characters are outside of this range, 960 * (very high probability) this simple code will work. 961 * The reordered blocks are: 962 * Block1 is CJK 963 * Block2 is CJK_COMPAT_USED 964 * Block3 is CJK_A 965 * (all contiguous) 966 * Any other CJK gets its normal code point 967 * Any non-CJK gets +10FFFF 968 * When we reorder Block1, we make sure that it is at the very start, 969 * so that it will use a 3-byte form. 
970 * Warning: the we only pick up the compatibility characters that are 971 * NOT decomposed, so that block is smaller! 972 */ 973 974 // CONSTANTS 975 static const UChar32 976 NON_CJK_OFFSET = 0x110000, 977 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 978 979 /** 980 * Precomputed by initImplicitConstants() 981 */ 982 static int32_t 983 final3Multiplier = 0, 984 final4Multiplier = 0, 985 final3Count = 0, 986 final4Count = 0, 987 medialCount = 0, 988 min3Primary = 0, 989 min4Primary = 0, 990 max4Primary = 0, 991 minTrail = 0, 992 maxTrail = 0, 993 max3Trail = 0, 994 max4Trail = 0, 995 min4Boundary = 0; 996 997 static const UChar32 998 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 999 // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; 1000 CJK_BASE = 0x4E00, 1001 CJK_LIMIT = 0x9FCB+1, 1002 // Unified CJK ideographs in the compatibility ideographs block. 1003 CJK_COMPAT_USED_BASE = 0xFA0E, 1004 CJK_COMPAT_USED_LIMIT = 0xFA2F+1, 1005 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 1006 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 1007 CJK_A_BASE = 0x3400, 1008 CJK_A_LIMIT = 0x4DB5+1, 1009 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; 1010 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; 1011 CJK_B_BASE = 0x20000, 1012 CJK_B_LIMIT = 0x2A6D6+1, 1013 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; 1014 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; 1015 CJK_C_BASE = 0x2A700, 1016 CJK_C_LIMIT = 0x2B734+1, 1017 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; 1018 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; 1019 CJK_D_BASE = 0x2B740, 1020 CJK_D_LIMIT = 0x2B81D+1; 1021 // when adding to this list, look for all occurrences (in project) 1022 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!! 
1023 1024 static UChar32 swapCJK(UChar32 i) { 1025 if (i < CJK_A_BASE) { 1026 // non-CJK 1027 } else if (i < CJK_A_LIMIT) { 1028 // Extension A has lower code points than the original Unihan+compat 1029 // but sorts higher. 1030 return i - CJK_A_BASE 1031 + (CJK_LIMIT - CJK_BASE) 1032 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1033 } else if (i < CJK_BASE) { 1034 // non-CJK 1035 } else if (i < CJK_LIMIT) { 1036 return i - CJK_BASE; 1037 } else if (i < CJK_COMPAT_USED_BASE) { 1038 // non-CJK 1039 } else if (i < CJK_COMPAT_USED_LIMIT) { 1040 return i - CJK_COMPAT_USED_BASE 1041 + (CJK_LIMIT - CJK_BASE); 1042 } else if (i < CJK_B_BASE) { 1043 // non-CJK 1044 } else if (i < CJK_B_LIMIT) { 1045 return i; // non-BMP-CJK 1046 } else if (i < CJK_C_BASE) { 1047 // non-CJK 1048 } else if (i < CJK_C_LIMIT) { 1049 return i; // non-BMP-CJK 1050 } else if (i < CJK_D_BASE) { 1051 // non-CJK 1052 } else if (i < CJK_D_LIMIT) { 1053 return i; // non-BMP-CJK 1054 } 1055 return i + NON_CJK_OFFSET; // non-CJK 1056 } 1057 1058 U_CAPI UChar32 U_EXPORT2 1059 uprv_uca_getRawFromCodePoint(UChar32 i) { 1060 return swapCJK(i)+1; 1061 } 1062 1063 U_CAPI UChar32 U_EXPORT2 1064 uprv_uca_getCodePointFromRaw(UChar32 i) { 1065 i--; 1066 UChar32 result = 0; 1067 if(i >= NON_CJK_OFFSET) { 1068 result = i - NON_CJK_OFFSET; 1069 } else if(i >= CJK_B_BASE) { 1070 result = i; 1071 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted 1072 if(i < CJK_LIMIT - CJK_BASE) { 1073 result = i + CJK_BASE; 1074 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { 1075 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); 1076 } else { 1077 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1078 } 1079 } else { 1080 result = -1; 1081 } 1082 return result; 1083 } 1084 1085 // GET IMPLICIT PRIMARY WEIGHTS 1086 // Return value is left justified 
primary key 1087 U_CAPI uint32_t U_EXPORT2 1088 uprv_uca_getImplicitFromRaw(UChar32 cp) { 1089 /* 1090 if (cp < 0 || cp > UCOL_MAX_INPUT) { 1091 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); 1092 } 1093 */ 1094 int32_t last0 = cp - min4Boundary; 1095 if (last0 < 0) { 1096 int32_t last1 = cp / final3Count; 1097 last0 = cp % final3Count; 1098 1099 int32_t last2 = last1 / medialCount; 1100 last1 %= medialCount; 1101 1102 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start 1103 last1 = minTrail + last1; // offset 1104 last2 = min3Primary + last2; // offset 1105 /* 1106 if (last2 >= min4Primary) { 1107 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); 1108 } 1109 */ 1110 return (last2 << 24) + (last1 << 16) + (last0 << 8); 1111 } else { 1112 int32_t last1 = last0 / final4Count; 1113 last0 %= final4Count; 1114 1115 int32_t last2 = last1 / medialCount; 1116 last1 %= medialCount; 1117 1118 int32_t last3 = last2 / medialCount; 1119 last2 %= medialCount; 1120 1121 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start 1122 last1 = minTrail + last1; // offset 1123 last2 = minTrail + last2; // offset 1124 last3 = min4Primary + last3; // offset 1125 /* 1126 if (last3 > max4Primary) { 1127 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); 1128 } 1129 */ 1130 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; 1131 } 1132 } 1133 1134 static uint32_t U_EXPORT2 1135 uprv_uca_getImplicitPrimary(UChar32 cp) { 1136 //fprintf(stdout, "Incoming: %04x\n", cp); 1137 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); 1138 1139 cp = swapCJK(cp); 1140 cp++; 1141 // we now have a range of numbers from 0 to 21FFFF. 
1142 1143 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); 1144 //fprintf(stdout, "CJK swapped: %04x\n", cp); 1145 1146 return uprv_uca_getImplicitFromRaw(cp); 1147 } 1148 1149 /** 1150 * Converts implicit CE into raw integer ("code point") 1151 * @param implicit 1152 * @return -1 if illegal format 1153 */ 1154 U_CAPI UChar32 U_EXPORT2 1155 uprv_uca_getRawFromImplicit(uint32_t implicit) { 1156 UChar32 result; 1157 UChar32 b3 = implicit & 0xFF; 1158 UChar32 b2 = (implicit >> 8) & 0xFF; 1159 UChar32 b1 = (implicit >> 16) & 0xFF; 1160 UChar32 b0 = (implicit >> 24) & 0xFF; 1161 1162 // simple parameter checks 1163 if (b0 < min3Primary || b0 > max4Primary 1164 || b1 < minTrail || b1 > maxTrail) 1165 return -1; 1166 // normal offsets 1167 b1 -= minTrail; 1168 1169 // take care of the final values, and compose 1170 if (b0 < min4Primary) { 1171 if (b2 < minTrail || b2 > max3Trail || b3 != 0) 1172 return -1; 1173 b2 -= minTrail; 1174 UChar32 remainder = b2 % final3Multiplier; 1175 if (remainder != 0) 1176 return -1; 1177 b0 -= min3Primary; 1178 b2 /= final3Multiplier; 1179 result = ((b0 * medialCount) + b1) * final3Count + b2; 1180 } else { 1181 if (b2 < minTrail || b2 > maxTrail 1182 || b3 < minTrail || b3 > max4Trail) 1183 return -1; 1184 b2 -= minTrail; 1185 b3 -= minTrail; 1186 UChar32 remainder = b3 % final4Multiplier; 1187 if (remainder != 0) 1188 return -1; 1189 b3 /= final4Multiplier; 1190 b0 -= min4Primary; 1191 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; 1192 } 1193 // final check 1194 if (result < 0 || result > UCOL_MAX_INPUT) 1195 return -1; 1196 return result; 1197 } 1198 1199 1200 static inline int32_t divideAndRoundUp(int a, int b) { 1201 return 1 + (a-1)/b; 1202 } 1203 1204 /* this function is either called from initUCA or from genUCA before 1205 * doing canonical closure for the UCA. 1206 */ 1207 1208 /** 1209 * Set up to generate implicits. 
1210 * Maintenance Note: this function may end up being called more than once, due 1211 * to threading races during initialization. Make sure that 1212 * none of the Constants is ever transiently assigned an 1213 * incorrect value. 1214 * @param minPrimary 1215 * @param maxPrimary 1216 * @param minTrail final byte 1217 * @param maxTrail final byte 1218 * @param gap3 the gap we leave for tailoring for 3-byte forms 1219 * @param gap4 the gap we leave for tailoring for 4-byte forms 1220 */ 1221 static void initImplicitConstants(int minPrimary, int maxPrimary, 1222 int minTrailIn, int maxTrailIn, 1223 int gap3, int primaries3count, 1224 UErrorCode *status) { 1225 // some simple parameter checks 1226 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) 1227 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) 1228 || (primaries3count < 1)) 1229 { 1230 *status = U_ILLEGAL_ARGUMENT_ERROR; 1231 return; 1232 }; 1233 1234 minTrail = minTrailIn; 1235 maxTrail = maxTrailIn; 1236 1237 min3Primary = minPrimary; 1238 max4Primary = maxPrimary; 1239 // compute constants for use later. 1240 // number of values we can use in trailing bytes 1241 // leave room for empty values between AND above, e.g. if gap = 2 1242 // range 3..7 => +3 -4 -5 -6 -7: so 1 value 1243 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values 1244 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values 1245 final3Multiplier = gap3 + 1; 1246 final3Count = (maxTrail - minTrail + 1) / final3Multiplier; 1247 max3Trail = minTrail + (final3Count - 1) * final3Multiplier; 1248 1249 // medials can use full range 1250 medialCount = (maxTrail - minTrail + 1); 1251 // find out how many values fit in each form 1252 int32_t threeByteCount = medialCount * final3Count; 1253 // now determine where the 3/4 boundary is. 
1254 // we use 3 bytes below the boundary, and 4 above 1255 int32_t primariesAvailable = maxPrimary - minPrimary + 1; 1256 int32_t primaries4count = primariesAvailable - primaries3count; 1257 1258 1259 int32_t min3ByteCoverage = primaries3count * threeByteCount; 1260 min4Primary = minPrimary + primaries3count; 1261 min4Boundary = min3ByteCoverage; 1262 // Now expand out the multiplier for the 4 bytes, and redo. 1263 1264 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; 1265 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); 1266 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); 1267 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; 1268 if (gap4 < 1) { 1269 *status = U_ILLEGAL_ARGUMENT_ERROR; 1270 return; 1271 } 1272 final4Multiplier = gap4 + 1; 1273 final4Count = neededPerFinalByte; 1274 max4Trail = minTrail + (final4Count - 1) * final4Multiplier; 1275 } 1276 1277 /** 1278 * Supply parameters for generating implicit CEs 1279 */ 1280 U_CAPI void U_EXPORT2 1281 uprv_uca_initImplicitConstants(UErrorCode *status) { 1282 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. 1283 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); 1284 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); 1285 } 1286 1287 1288 /* collIterNormalize Incremental Normalization happens here. */ 1289 /* pick up the range of chars identifed by FCD, */ 1290 /* normalize it into the collIterate's writable buffer, */ 1291 /* switch the collIterate's state to use the writable buffer. 
*/ 1292 /* */ 1293 static 1294 void collIterNormalize(collIterate *collationSource) 1295 { 1296 UErrorCode status = U_ZERO_ERROR; 1297 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ 1298 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ 1299 1300 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)), 1301 collationSource->writableBuffer, 1302 status); 1303 if (U_FAILURE(status)) { 1304 #ifdef UCOL_DEBUG 1305 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status)); 1306 #endif 1307 return; 1308 } 1309 1310 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer(); 1311 collationSource->origFlags = collationSource->flags; 1312 collationSource->flags |= UCOL_ITER_INNORMBUF; 1313 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1314 } 1315 1316 1317 // This function takes the iterator and extracts normalized stuff up to the next boundary 1318 // It is similar in the end results to the collIterNormalize, but for the cases when we 1319 // use an iterator 1320 /*static 1321 inline void normalizeIterator(collIterate *collationSource) { 1322 UErrorCode status = U_ZERO_ERROR; 1323 UBool wasNormalized = FALSE; 1324 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); 1325 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); 1326 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1327 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1328 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { 1329 // reallocate and terminate 1330 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, 1331 &collationSource->writableBuffer, 1332 (int32_t *)&collationSource->writableBufSize, normLen + 1, 1333 0) 
1334 ) { 1335 #ifdef UCOL_DEBUG 1336 fprintf(stderr, "normalizeIterator(), out of memory\n"); 1337 #endif 1338 return; 1339 } 1340 status = U_ZERO_ERROR; 1341 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); 1342 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); 1343 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1344 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1345 } 1346 // Terminate the buffer - we already checked that it is big enough 1347 collationSource->writableBuffer[normLen] = 0; 1348 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { 1349 collationSource->flags |= UCOL_ITER_ALLOCATED; 1350 } 1351 collationSource->pos = collationSource->writableBuffer; 1352 collationSource->origFlags = collationSource->flags; 1353 collationSource->flags |= UCOL_ITER_INNORMBUF; 1354 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1355 }*/ 1356 1357 1358 /* Incremental FCD check and normalize */ 1359 /* Called from getNextCE when normalization state is suspect. */ 1360 /* When entering, the state is known to be this: */ 1361 /* o We are working in the main buffer of the collIterate, not the side */ 1362 /* writable buffer. When in the side buffer, normalization mode is always off, */ 1363 /* so we won't get here. */ 1364 /* o The leading combining class from the current character is 0 or */ 1365 /* the trailing combining class of the previous char was zero. */ 1366 /* True because the previous call to this function will have always exited */ 1367 /* that way, and we get called for every char where cc might be non-zero. 
*/
static
inline UBool collIterFCD(collIterate *collationSource) {
    const UChar *srcP, *endP;
    uint8_t     leadingCC;
    uint8_t     prevTrailingCC = 0;
    uint16_t    fcd;
    UBool       needNormalize = FALSE;

    // pos has already been advanced past the current character; step back to it.
    srcP = collationSource->pos-1;

    // endP stays NULL when no length was given for the source.
    if (collationSource->flags & UCOL_ITER_HASLEN) {
        endP = collationSource->endp;
    } else {
        endP = NULL;
    }

    // Get the trailing combining class of the current character.  If it's zero,
    //   we are OK.
    /* trie access */
    fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
    if (fcd != 0) {
        // The low byte of the FCD value is the trailing combining class.
        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

        if (prevTrailingCC != 0) {
            // The current char has a non-zero trailing CC.  Scan forward until we find
            //   a char with a leading cc of zero.
            while (endP == NULL || srcP != endP)
            {
                const UChar *savedSrcP = srcP;

                /* trie access */
                fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
                // The high byte of the FCD value is the leading combining class.
                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
                if (leadingCC == 0) {
                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
                                           //   back up over it.  (Could be surrogate pair!)
                    break;
                }

                // Combining classes out of order: this stretch is not FCD.
                if (leadingCC < prevTrailingCC) {
                    needNormalize = TRUE;
                }

                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
            }
        }
    }

    // Record how far the check got; collIterNormalize() uses this as the end
    // of the region to normalize.
    collationSource->fcdPosition = (UChar *)srcP;

    return needNormalize;
}

/****************************************************************************/
/* Following are the CE retrieval functions                                 */
/*                                                                          */
/****************************************************************************/

static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);

/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
/*                                                                     */
/* Returns the next CE, or UCOL_NO_MORE_CES at the end of the source.  */
static
inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    uint32_t order = 0;
    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
        order = *(collationSource->toReturn++);                         /* if so, return them */
        if(collationSource->CEpos == collationSource->toReturn) {
            /* Buffer drained: reset both pointers to its start. */
            collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
        }
        return order;
    }

    UChar ch = 0;
    collationSource->offsetReturn = NULL;

    for (;;)                           /* Loop handles case when incremental normalize switches   */
    {                                  /*   to or from the side buffer / original string, and we  */
                                       /*   need to start again to get the next character.        */

        if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
        {
            // The source string is null terminated and we're not working from the side buffer,
            //   and we're not normalizing.  This is the fast path.
            //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
            ch = *collationSource->pos++;
            if (ch != 0) {
                break;
            }
            else {
                return UCOL_NO_MORE_CES;
            }
        }

        if (collationSource->flags & UCOL_ITER_HASLEN) {
            // Normal path for strings when length is specified.
            //   (We can't be in side buffer because it is always null terminated.)
            if (collationSource->pos >= collationSource->endp) {
                // Ran off of the end of the main source string.  We're done.
                return UCOL_NO_MORE_CES;
            }
            ch = *collationSource->pos++;
        }
        else if(collationSource->flags & UCOL_USE_ITERATOR) {
            // Input comes from the character iterator.
            UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
            if(iterCh == U_SENTINEL) {
                return UCOL_NO_MORE_CES;
            }
            ch = (UChar)iterCh;
        }
        else
        {
            // Null terminated string.
            ch = *collationSource->pos++;
            if (ch == 0) {
                // Ran off end of buffer.
                if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                    // Ran off end of main string. backing up one character.
                    collationSource->pos--;
                    return UCOL_NO_MORE_CES;
                }
                else
                {
                    // Hit null in the normalize side buffer.
                    // Usually this means the end of the normalized data,
                    // except for one odd case: a null followed by combining chars,
                    //   which is the case if we are at the start of the buffer.
                    if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
                        break;
                    }

                    //  Null marked end of side buffer.
                    //   Revert to the main string and
                    //   loop back to top to try again to get a character.
                    collationSource->pos   = collationSource->fcdPosition;
                    collationSource->flags = collationSource->origFlags;
                    continue;
                }
            }
        }

        if(collationSource->flags&UCOL_HIRAGANA_Q) {
            /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
             * based on whether the previous codepoint was Hiragana or Katakana.
             */
            if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
                    ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
                collationSource->flags |= UCOL_WAS_HIRAGANA;
            } else {
                collationSource->flags &= ~UCOL_WAS_HIRAGANA;
            }
        }

        // We've got a character.  See if there's any fcd and/or normalization stuff to do.
        //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
        if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
            break;
        }

        if (collationSource->fcdPosition >= collationSource->pos) {
            // An earlier FCD check has already covered the current character.
            // We can go ahead and process this char.
            break;
        }

        if (ch < ZERO_CC_LIMIT_ ) {
            // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
            break;
        }

        if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
            // We need to peek at the next character in order to tell if we are FCD
            if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
                // We are at the last char of source string.
                //  It is always OK for FCD check.
                break;
            }

            // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
            if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
                break;
            }
        }


        // Need a more complete FCD check and possible normalization.
        if (collIterFCD(collationSource)) {
            collIterNormalize(collationSource);
        }
        if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
            //  No normalization was needed.  Go ahead and process the char we already had.
            break;
        }

        // Some normalization happened.  Next loop iteration will pick up a char
        //   from the normalization buffer.

    }   // end for (;;)


    // ch now holds the code unit to look up.
    if (ch <= 0xFF) {
        /*  For latin-1 characters we never need to fall back to the UCA table        */
        /*    because all of the UCA data is replicated in the latinOneMapping array  */
        order = coll->latinOneMapping[ch];
        if (order > UCOL_NOT_FOUND) {
            order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
        }
    }
    else
    {
        // Always use UCA for Han, Hangul
        // (Han extension A is before main Han block)
        // **** Han compatibility chars ?? ****
        if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
            (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
            if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
                // between the two target ranges; do normal lookup
                // **** this range is YI, Modifier tone letters, ****
                // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                // **** Latin-D might be tailored, so we need to ****
                // **** do the normal lookup for these guys.     ****
                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            } else {
                // in one of the target ranges; use UCA
                order = UCOL_NOT_FOUND;
            }
        } else {
            order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
        }

        if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
            order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
        }

        if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
            /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
            order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

            if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
                order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
            }
        }
    }
    /* Still nothing: generate an implicit CE for the code unit. */
    if(order == UCOL_NOT_FOUND) {
        order = getImplicit(ch, collationSource);
    }
    return order; /* return the CE */
}

/* ucol_getNextCE, out-of-line version for use from other files.   */
U_CAPI uint32_t  U_EXPORT2
ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    return ucol_IGetNextCE(coll, collationSource, status);
}


/**
* Incremental previous normalization happens here. Pick up the range of chars
* identifed by FCD, normalize it into the collIterate's writable buffer,
* switch the collIterate's state to use the writable buffer.
1631 * @param data collation iterator data 1632 */ 1633 static 1634 void collPrevIterNormalize(collIterate *data) 1635 { 1636 UErrorCode status = U_ZERO_ERROR; 1637 const UChar *pEnd = data->pos; /* End normalize + 1 */ 1638 const UChar *pStart; 1639 1640 /* Start normalize */ 1641 if (data->fcdPosition == NULL) { 1642 pStart = data->string; 1643 } 1644 else { 1645 pStart = data->fcdPosition + 1; 1646 } 1647 1648 int32_t normLen = 1649 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)), 1650 data->writableBuffer, 1651 status). 1652 length(); 1653 if(U_FAILURE(status)) { 1654 return; 1655 } 1656 /* 1657 this puts the null termination infront of the normalized string instead 1658 of the end 1659 */ 1660 data->writableBuffer.insert(0, (UChar)0); 1661 1662 /* 1663 * The usual case at this point is that we've got a base 1664 * character followed by marks that were normalized. If 1665 * fcdPosition is NULL, that means that we backed up to 1666 * the beginning of the string and there's no base character. 1667 * 1668 * Forward processing will usually normalize when it sees 1669 * the first mark, so that mark will get it's natural offset 1670 * and the rest will get the offset of the character following 1671 * the marks. The base character will also get its natural offset. 1672 * 1673 * We write the offset of the base character, if there is one, 1674 * followed by the offset of the first mark and then the offsets 1675 * of the rest of the marks. 
1676 */ 1677 int32_t firstMarkOffset = 0; 1678 int32_t trailOffset = (int32_t)(data->pos - data->string + 1); 1679 int32_t trailCount = normLen - 1; 1680 1681 if (data->fcdPosition != NULL) { 1682 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string); 1683 UChar baseChar = *data->fcdPosition; 1684 1685 firstMarkOffset = baseOffset + 1; 1686 1687 /* 1688 * If the base character is the start of a contraction, forward processing 1689 * will normalize the marks while checking for the contraction, which means 1690 * that the offset of the first mark will the same as the other marks. 1691 * 1692 * **** THIS IS PROBABLY NOT A COMPLETE TEST **** 1693 */ 1694 if (baseChar >= 0x100) { 1695 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar); 1696 1697 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { 1698 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar); 1699 } 1700 1701 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) { 1702 firstMarkOffset = trailOffset; 1703 } 1704 } 1705 1706 data->appendOffset(baseOffset, status); 1707 } 1708 1709 data->appendOffset(firstMarkOffset, status); 1710 1711 for (int32_t i = 0; i < trailCount; i += 1) { 1712 data->appendOffset(trailOffset, status); 1713 } 1714 1715 data->offsetRepeatValue = trailOffset; 1716 1717 data->offsetReturn = data->offsetStore - 1; 1718 if (data->offsetReturn == data->offsetBuffer) { 1719 data->offsetStore = data->offsetBuffer; 1720 } 1721 1722 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; 1723 data->origFlags = data->flags; 1724 data->flags |= UCOL_ITER_INNORMBUF; 1725 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 1726 } 1727 1728 1729 /** 1730 * Incremental FCD check for previous iteration and normalize. Called from 1731 * getPrevCE when normalization state is suspect. 
1732 * When entering, the state is known to be this: 1733 * o We are working in the main buffer of the collIterate, not the side 1734 * writable buffer. When in the side buffer, normalization mode is always 1735 * off, so we won't get here. 1736 * o The leading combining class from the current character is 0 or the 1737 * trailing combining class of the previous char was zero. 1738 * True because the previous call to this function will have always exited 1739 * that way, and we get called for every char where cc might be non-zero. 1740 * @param data collation iterate struct 1741 * @return normalization status, TRUE for normalization to be done, FALSE 1742 * otherwise 1743 */ 1744 static 1745 inline UBool collPrevIterFCD(collIterate *data) 1746 { 1747 const UChar *src, *start; 1748 uint8_t leadingCC; 1749 uint8_t trailingCC = 0; 1750 uint16_t fcd; 1751 UBool result = FALSE; 1752 1753 start = data->string; 1754 src = data->pos + 1; 1755 1756 /* Get the trailing combining class of the current character. */ 1757 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); 1758 1759 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1760 1761 if (leadingCC != 0) { 1762 /* 1763 The current char has a non-zero leading combining class. 1764 Scan backward until we find a char with a trailing cc of zero. 1765 */ 1766 for (;;) 1767 { 1768 if (start == src) { 1769 data->fcdPosition = NULL; 1770 return result; 1771 } 1772 1773 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); 1774 1775 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1776 1777 if (trailingCC == 0) { 1778 break; 1779 } 1780 1781 if (leadingCC < trailingCC) { 1782 result = TRUE; 1783 } 1784 1785 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1786 } 1787 } 1788 1789 data->fcdPosition = (UChar *)src; 1790 1791 return result; 1792 } 1793 1794 /** gets a code unit from the string at a given offset 1795 * Handles both normal and iterative cases. 1796 * No error checking - caller beware! 
1797 */ 1798 static inline 1799 UChar peekCodeUnit(collIterate *source, int32_t offset) { 1800 if(source->pos != NULL) { 1801 return *(source->pos + offset); 1802 } else if(source->iterator != NULL) { 1803 UChar32 c; 1804 if(offset != 0) { 1805 source->iterator->move(source->iterator, offset, UITER_CURRENT); 1806 c = source->iterator->next(source->iterator); 1807 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); 1808 } else { 1809 c = source->iterator->current(source->iterator); 1810 } 1811 return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0. 1812 } else { 1813 return 0xfffd; 1814 } 1815 } 1816 1817 // Code point version. Treats the offset as a _code point_ delta. 1818 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16. 1819 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer. 1820 static inline 1821 UChar32 peekCodePoint(collIterate *source, int32_t offset) { 1822 UChar32 c; 1823 if(source->pos != NULL) { 1824 const UChar *p = source->pos; 1825 if(offset >= 0) { 1826 // Skip forward over (offset-1) code points. 1827 while(--offset >= 0) { 1828 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) { 1829 ++p; 1830 } 1831 } 1832 // Read the code point there. 1833 c = *p++; 1834 UChar trail; 1835 if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) { 1836 c = U16_GET_SUPPLEMENTARY(c, trail); 1837 } 1838 } else /* offset<0 */ { 1839 // Skip backward over (offset-1) code points. 1840 while(++offset < 0) { 1841 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) { 1842 --p; 1843 } 1844 } 1845 // Read the code point before that. 1846 c = *--p; 1847 UChar lead; 1848 if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) { 1849 c = U16_GET_SUPPLEMENTARY(lead, c); 1850 } 1851 } 1852 } else if(source->iterator != NULL) { 1853 if(offset >= 0) { 1854 // Skip forward over (offset-1) code points. 
1855 int32_t fwd = offset; 1856 while(fwd-- > 0) { 1857 uiter_next32(source->iterator); 1858 } 1859 // Read the code point there. 1860 c = uiter_current32(source->iterator); 1861 // Return to the starting point, skipping backward over (offset-1) code points. 1862 while(offset-- > 0) { 1863 uiter_previous32(source->iterator); 1864 } 1865 } else /* offset<0 */ { 1866 // Read backward, reading offset code points, remember only the last-read one. 1867 int32_t back = offset; 1868 do { 1869 c = uiter_previous32(source->iterator); 1870 } while(++back < 0); 1871 // Return to the starting position, skipping forward over offset code points. 1872 do { 1873 uiter_next32(source->iterator); 1874 } while(++offset < 0); 1875 } 1876 } else { 1877 c = U_SENTINEL; 1878 } 1879 return c; 1880 } 1881 1882 /** 1883 * Determines if we are at the start of the data string in the backwards 1884 * collation iterator 1885 * @param data collation iterator 1886 * @return TRUE if we are at the start 1887 */ 1888 static 1889 inline UBool isAtStartPrevIterate(collIterate *data) { 1890 if(data->pos == NULL && data->iterator != NULL) { 1891 return !data->iterator->hasPrevious(data->iterator); 1892 } 1893 //return (collIter_bos(data)) || 1894 return (data->pos == data->string) || 1895 ((data->flags & UCOL_ITER_INNORMBUF) && 1896 *(data->pos - 1) == 0 && data->fcdPosition == NULL); 1897 } 1898 1899 static 1900 inline void goBackOne(collIterate *data) { 1901 # if 0 1902 // somehow, it looks like we need to keep iterator synced up 1903 // at all times, as above. 1904 if(data->pos) { 1905 data->pos--; 1906 } 1907 if(data->iterator) { 1908 data->iterator->previous(data->iterator); 1909 } 1910 #endif 1911 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { 1912 data->iterator->previous(data->iterator); 1913 } 1914 if(data->pos) { 1915 data->pos --; 1916 } 1917 } 1918 1919 /** 1920 * Inline function that gets a simple CE. 1921 * So what it does is that it will first check the expansion buffer. 
 * If the
 * expansion buffer is not empty, ie the return pointer has not yet reached
 * the start of the CE buffer in use (extendCEs when allocated, otherwise
 * CEs), we decrement the return pointer and return the collation element
 * found there.
 * Otherwise the previous code unit is fetched from the source (string with
 * length, UCharIterator, or NUL-delimited string / normalization buffer),
 * FCD-checked and normalized if needed, and mapped to a CE.
 * For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE.
 * @param coll collator data
 * @param data collation iterator struct
 * @param status error status
 * @return the previous collation element, or UCOL_NO_MORE_CES when the start
 *         of the source has been reached
 */
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /*
    Step the offset bookkeeping backwards by one entry: consume a pending
    repeat first, otherwise move offsetReturn back, resetting it (and
    offsetStore) once the start of offsetBuffer is reached.
    */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
            data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                data->offsetReturn = NULL;
                data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* CEs are still buffered: hand out the one before toReturn. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        /* Buffer drained: reset the write position to its start as well. */
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        /*
        Loop handles case when incremental normalize switches to or from the
        side buffer / original string, and we need to start again to get the
        next character.
        */
        for (;;) {
            if (data->flags & UCOL_ITER_HASLEN) {
                /*
                Normal path for strings when length is specified.
                Not in side buffer because it is always null terminated.
                */
                if (data->pos <= data->string) {
                    /* End of the main source string */
                    return UCOL_NO_MORE_CES;
                }
                data->pos --;
                ch = *data->pos;
            }
            // we are using an iterator to go back. Pray for us!
            else if (data->flags & UCOL_USE_ITERATOR) {
                UChar32 iterCh = data->iterator->previous(data->iterator);
                if(iterCh == U_SENTINEL) {
                    return UCOL_NO_MORE_CES;
                } else {
                    ch = (UChar)iterCh;
                }
            }
            else {
                data->pos --;
                ch = *data->pos;
                /* we are in the side buffer. */
                if (ch == 0) {
                    /*
                    At the start of the normalize side buffer.
                    Go back to string.
                    Because pointer points to the last accessed character,
                    hence we have to increment it by one here.
                    */
                    data->flags = data->origFlags;
                    data->offsetRepeatValue = 0;

                    if (data->fcdPosition == NULL) {
                        /* Nothing of the original string is left before the buffer. */
                        data->pos = data->string;
                        return UCOL_NO_MORE_CES;
                    }
                    else {
                        data->pos   = data->fcdPosition + 1;
                    }

                    /* Re-read the previous unit, now from the original string. */
                    continue;
                }
            }

            /* Track whether the unit just read lies in U+3040..U+309F (Hiragana block). */
            if(data->flags&UCOL_HIRAGANA_Q) {
                if(ch>=0x3040 && ch<=0x309f) {
                    data->flags |= UCOL_WAS_HIRAGANA;
                } else {
                    data->flags &= ~UCOL_WAS_HIRAGANA;
                }
            }

            /*
            * got a character to determine if there's fcd and/or normalization
            * stuff to do.
            * if the current character is not fcd.
            * if current character is at the start of the string
            * Trailing combining class == 0.
            * Note if pos is in the writablebuffer, norm is always 0
            */
            if (ch < ZERO_CC_LIMIT_ ||
              // this should propel us out of the loop in the iterator case
                (data->flags & UCOL_ITER_NORM) == 0 ||
                (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                || data->string == data->pos) {
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                /* if next character is FCD */
                if (data->pos == data->string) {
                    /* First char of string is always OK for FCD check */
                    break;
                }

                /* Not first char of string, do the FCD fast test */
                if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }

            /* Need a more complete FCD check and possible normalization. */
            if (collPrevIterFCD(data)) {
                collPrevIterNormalize(data);
            }

            if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /*  No normalization. Go ahead and process the char. */
                break;
            }

            /*
            Some normalization happened.
            Next loop picks up a char from the normalization buffer.
            */
        }

        /* attempt to handle contractions, after removal of the backwards
        contraction
        */
        if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
            result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
        } else {
            if (ch <= 0xFF) {
                /* Direct table lookup for U+0000..U+00FF. */
                result = coll->latinOneMapping[ch];
            }
            else {
                // Always use UCA for [3400..9FFF], [AC00..D7AF]
                // **** [FA0E..FA2F] ?? ****
                if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                    (ch >= 0x3400 && ch <= 0xD7AF)) {
                    if (ch > 0x9FFF && ch < 0xAC00) {
                        // between the two target ranges; do normal lookup
                        // **** this range is YI, Modifier tone letters, ****
                        // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                        // **** Latin-D might be tailored, so we need to ****
                        // **** do the normal lookup for these guys.     ****
                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    } else {
                        /* Inside a forced range: skip the tailoring lookup. */
                        result = UCOL_NOT_FOUND;
                    }
                } else {
                    result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                }
            }
            /* Values above UCOL_NOT_FOUND are special CEs that need further processing. */
            if (result > UCOL_NOT_FOUND) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
            }
            if (result == UCOL_NOT_FOUND) { // Not found in master list
                if (!isAtStartPrevIterate(data) &&
                    ucol_contractionEndCP(ch, data->coll))
                {
                    result = UCOL_CONTRACTION;
                } else {
                    /* Fall back to the UCA mapping when one is attached. */
                    if(coll->UCA) {
                        result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                    }
                }

                if (result > UCOL_NOT_FOUND) {
                    if(coll->UCA) {
                        result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                    }
                }
            }
        }

        /* Still nothing: generate an implicit CE for the code unit. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}


/*   ucol_getPrevCE, out-of-line version for use from other files.
 *   Simply forwards to the inline ucol_IGetPrevCE. */
U_CFUNC uint32_t  U_EXPORT2
ucol_getPrevCE(const UCollator *coll, collIterate *data,
                        UErrorCode *status) {
    return ucol_IGetPrevCE(coll, data, status);
}


/* this should be connected to special Jamo handling */
/* Returns the first CE produced for the single code unit u: a temporary
 * collIterate is initialized over a one-unit string and ucol_IGetNextCE is
 * called once. Returns 0 if initializing the iterator set a failure status. */
U_CFUNC uint32_t  U_EXPORT2
ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
    collIterate colIt;
    IInit_collIterate(coll, &u, 1, &colIt, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    return ucol_IGetNextCE(coll, &colIt, status);
}

/**
 * Inserts the argument character into the end of the buffer pushing back the
 * null terminator.
2148 * @param data collIterate struct data 2149 * @param ch character to be appended 2150 * @return the position of the new addition 2151 */ 2152 static 2153 inline const UChar * insertBufferEnd(collIterate *data, UChar ch) 2154 { 2155 int32_t oldLength = data->writableBuffer.length(); 2156 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; 2157 } 2158 2159 /** 2160 * Inserts the argument string into the end of the buffer pushing back the 2161 * null terminator. 2162 * @param data collIterate struct data 2163 * @param string to be appended 2164 * @param length of the string to be appended 2165 * @return the position of the new addition 2166 */ 2167 static 2168 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length) 2169 { 2170 int32_t oldLength = data->writableBuffer.length(); 2171 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength; 2172 } 2173 2174 /** 2175 * Special normalization function for contraction in the forwards iterator. 2176 * This normalization sequence will place the current character at source->pos 2177 * and its following normalized sequence into the buffer. 2178 * The fcd position, pos will be changed. 2179 * pos will now point to positions in the buffer. 2180 * Flags will be changed accordingly. 
2181 * @param data collation iterator data 2182 */ 2183 static 2184 inline void normalizeNextContraction(collIterate *data) 2185 { 2186 int32_t strsize; 2187 UErrorCode status = U_ZERO_ERROR; 2188 /* because the pointer points to the next character */ 2189 const UChar *pStart = data->pos - 1; 2190 const UChar *pEnd; 2191 2192 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 2193 data->writableBuffer.setTo(*(pStart - 1)); 2194 strsize = 1; 2195 } 2196 else { 2197 strsize = data->writableBuffer.length(); 2198 } 2199 2200 pEnd = data->fcdPosition; 2201 2202 data->writableBuffer.append( 2203 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status)); 2204 if(U_FAILURE(status)) { 2205 return; 2206 } 2207 2208 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize; 2209 data->origFlags = data->flags; 2210 data->flags |= UCOL_ITER_INNORMBUF; 2211 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2212 } 2213 2214 /** 2215 * Contraction character management function that returns the next character 2216 * for the forwards iterator. 2217 * Does nothing if the next character is in buffer and not the first character 2218 * in it. 2219 * Else it checks next character in data string to see if it is normalizable. 2220 * If it is not, the character is simply copied into the buffer, else 2221 * the whole normalized substring is copied into the buffer, including the 2222 * current character. 2223 * @param data collation element iterator data 2224 * @return next character 2225 */ 2226 static 2227 inline UChar getNextNormalizedChar(collIterate *data) 2228 { 2229 UChar nextch; 2230 UChar ch; 2231 // Here we need to add the iterator code. One problem is the way 2232 // end of string is handled. If we just return next char, it could 2233 // be the sentinel. Most of the cases already check for this, but we 2234 // need to be sure. 2235 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { 2236 /* if no normalization and not in buffer. 
*/ 2237 if(data->flags & UCOL_USE_ITERATOR) { 2238 return (UChar)data->iterator->next(data->iterator); 2239 } else { 2240 return *(data->pos ++); 2241 } 2242 } 2243 2244 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { 2245 //normalizeIterator(data); 2246 //} 2247 2248 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); 2249 if ((innormbuf && *data->pos != 0) || 2250 (data->fcdPosition != NULL && !innormbuf && 2251 data->pos < data->fcdPosition)) { 2252 /* 2253 if next character is in normalized buffer, no further normalization 2254 is required 2255 */ 2256 return *(data->pos ++); 2257 } 2258 2259 if (data->flags & UCOL_ITER_HASLEN) { 2260 /* in data string */ 2261 if (data->pos + 1 == data->endp) { 2262 return *(data->pos ++); 2263 } 2264 } 2265 else { 2266 if (innormbuf) { 2267 // inside the normalization buffer, but at the end 2268 // (since we encountered zero). This means, in the 2269 // case we're using char iterator, that we need to 2270 // do another round of normalization. 2271 //if(data->origFlags & UCOL_USE_ITERATOR) { 2272 // we need to restore original flags, 2273 // otherwise, we'll lose them 2274 //data->flags = data->origFlags; 2275 //normalizeIterator(data); 2276 //return *(data->pos++); 2277 //} else { 2278 /* 2279 in writable buffer, at this point fcdPosition can not be 2280 pointing to the end of the data string. see contracting tag. 2281 */ 2282 if(data->fcdPosition) { 2283 if (*(data->fcdPosition + 1) == 0 || 2284 data->fcdPosition + 1 == data->endp) { 2285 /* at the end of the string, dump it into the normalizer */ 2286 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1; 2287 // Check if data->pos received a null pointer 2288 if (data->pos == NULL) { 2289 return (UChar)-1; // Return to indicate error. 2290 } 2291 return *(data->fcdPosition ++); 2292 } 2293 data->pos = data->fcdPosition; 2294 } else if(data->origFlags & UCOL_USE_ITERATOR) { 2295 // if we are here, we're using a normalizing iterator. 
2296 // we should just continue further. 2297 data->flags = data->origFlags; 2298 data->pos = NULL; 2299 return (UChar)data->iterator->next(data->iterator); 2300 } 2301 //} 2302 } 2303 else { 2304 if (*(data->pos + 1) == 0) { 2305 return *(data->pos ++); 2306 } 2307 } 2308 } 2309 2310 ch = *data->pos ++; 2311 nextch = *data->pos; 2312 2313 /* 2314 * if the current character is not fcd. 2315 * Trailing combining class == 0. 2316 */ 2317 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && 2318 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || 2319 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { 2320 /* 2321 Need a more complete FCD check and possible normalization. 2322 normalize substring will be appended to buffer 2323 */ 2324 if (collIterFCD(data)) { 2325 normalizeNextContraction(data); 2326 return *(data->pos ++); 2327 } 2328 else if (innormbuf) { 2329 /* fcdposition shifted even when there's no normalization, if we 2330 don't input the rest into this, we'll get the wrong position when 2331 we reach the end of the writableBuffer */ 2332 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1); 2333 data->pos = insertBufferEnd(data, data->pos - 1, length); 2334 // Check if data->pos received a null pointer 2335 if (data->pos == NULL) { 2336 return (UChar)-1; // Return to indicate error. 2337 } 2338 return *(data->pos ++); 2339 } 2340 } 2341 2342 if (innormbuf) { 2343 /* 2344 no normalization is to be done hence only one character will be 2345 appended to the buffer. 2346 */ 2347 data->pos = insertBufferEnd(data, ch) + 1; 2348 // Check if data->pos received a null pointer 2349 if (data->pos == NULL) { 2350 return (UChar)-1; // Return to indicate error. 
2351 } 2352 } 2353 2354 /* points back to the pos in string */ 2355 return ch; 2356 } 2357 2358 2359 2360 /** 2361 * Function to copy the buffer into writableBuffer and sets the fcd position to 2362 * the correct position 2363 * @param source data string source 2364 * @param buffer character buffer 2365 */ 2366 static 2367 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer) 2368 { 2369 /* okay confusing part here. to ensure that the skipped characters are 2370 considered later, we need to place it in the appropriate position in the 2371 normalization buffer and reassign the pos pointer. simple case if pos 2372 reside in string, simply copy to normalization buffer and 2373 fcdposition = pos, pos = start of normalization buffer. if pos in 2374 normalization buffer, we'll insert the copy infront of pos and point pos 2375 to the start of the normalization buffer. why am i doing these copies? 2376 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does 2377 not require any changes, which be really painful. */ 2378 if (source->flags & UCOL_ITER_INNORMBUF) { 2379 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer(); 2380 source->writableBuffer.replace(0, replaceLength, buffer); 2381 } 2382 else { 2383 source->fcdPosition = source->pos; 2384 source->origFlags = source->flags; 2385 source->flags |= UCOL_ITER_INNORMBUF; 2386 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 2387 source->writableBuffer = buffer; 2388 } 2389 2390 source->pos = source->writableBuffer.getTerminatedBuffer(); 2391 } 2392 2393 /** 2394 * Function to get the discontiguos collation element within the source. 2395 * Note this function will set the position to the appropriate places. 
 * Combining marks that do not take part in the contraction are collected in
 * a side buffer and, when a match is found, handed to
 * setDiscontiguosAttribute so that they are processed afterwards. When no
 * match is found the iterator state saved on entry is restored.
 * @param coll current collator used
 * @param source data string source
 * @param constart index to the start character in the contraction table
 * @return discontiguous collation element offset
 */
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
    const UChar *temppos      = source->pos;
    UnicodeString buffer;
    const UChar   *tempconstart = constart;
    uint8_t  tempflags    = source->flags;
    UBool    multicontraction = FALSE;
    collIterateState discState;

    /* Snapshot the iterator so a failed match can be undone at the end. */
    backupState(source, &discState);

    /* Seed the skipped-characters buffer with the code point just before the cursor. */
    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        /* Stop scanning at the end of the text or at a code point whose
           combining class is 0. */
        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
            ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
            source->fcdPosition == NULL ||
            source->fcdPosition == source->endp ||
            *(source->fcdPosition) == 0 ||
            u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
            /* end of string in null terminated string or stopped by a
            null character, note fcd does not always point to a base
            character after the discontiguous change */
            u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
            //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* A shorter contraction already matched: return its CE and
                   resume from the position remembered for it. */
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* Scan the table entries for the first one that is not smaller than
           schar (presumably the entries are sorted and terminated by a
           maximal value - confirm against the table builder). */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguous buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* A mark with the same combining class as the code point before
               it is treated as skipped, not as part of the contraction. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                                      (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
            break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            /* Complete match: queue the skipped characters and return the CE. */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}

/* now uses Mark's getImplicitPrimary code */
/* Builds the implicit CEs for cp from uprv_uca_getImplicitPrimary(cp):
 * the CE made from the low 16 bits of the primary (ORed with 0xC0, presumably
 * the continuation marker - confirm) is stored in the CE buffer, and the CE
 * made from the primary bits selected by UCOL_PRIMARYMASK (ORed with 0x0505)
 * is returned. */
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
    uint32_t r = uprv_uca_getImplicitPrimary(cp);
    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
    collationSource->offsetRepeatCount += 1;
    return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
}

/**
 * Inserts the argument character into the front of the buffer replacing the
 * front null terminator.
 * A new NUL is then inserted in front of it, and pos is set two units past
 * the start of the buffer, i.e. just after the inserted character.
 * @param data collation element iterator data
 * @param ch character to be appended
 */
static
inline void insertBufferFront(collIterate *data, UChar ch)
{
    data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
}

/**
 * Special normalization function for contraction in the previous iterator.
 * This normalization sequence will place the current character at source->pos
 * and its following normalized sequence into the buffer.
 * The fcd position, pos will be changed.
 * pos will now point to positions in the buffer.
 * Flags will be changed accordingly.
2529 * @param data collation iterator data 2530 */ 2531 static 2532 inline void normalizePrevContraction(collIterate *data, UErrorCode *status) 2533 { 2534 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ 2535 const UChar *pStart; 2536 2537 UnicodeString endOfBuffer; 2538 if (data->flags & UCOL_ITER_HASLEN) { 2539 /* 2540 normalization buffer not used yet, we'll pull down the next 2541 character into the end of the buffer 2542 */ 2543 endOfBuffer.setTo(*pEnd); 2544 } 2545 else { 2546 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL 2547 } 2548 2549 if (data->fcdPosition == NULL) { 2550 pStart = data->string; 2551 } 2552 else { 2553 pStart = data->fcdPosition + 1; 2554 } 2555 int32_t normLen = 2556 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), 2557 data->writableBuffer, 2558 *status). 2559 length(); 2560 if(U_FAILURE(*status)) { 2561 return; 2562 } 2563 /* 2564 this puts the null termination infront of the normalized string instead 2565 of the end 2566 */ 2567 data->pos = 2568 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() + 2569 1 + normLen; 2570 data->origFlags = data->flags; 2571 data->flags |= UCOL_ITER_INNORMBUF; 2572 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2573 } 2574 2575 /** 2576 * Contraction character management function that returns the previous character 2577 * for the backwards iterator. 2578 * Does nothing if the previous character is in buffer and not the first 2579 * character in it. 2580 * Else it checks previous character in data string to see if it is 2581 * normalizable. 2582 * If it is not, the character is simply copied into the buffer, else 2583 * the whole normalized substring is copied into the buffer, including the 2584 * current character. 
2585 * @param data collation element iterator data 2586 * @return previous character 2587 */ 2588 static 2589 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status) 2590 { 2591 UChar prevch; 2592 UChar ch; 2593 const UChar *start; 2594 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); 2595 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 || 2596 (innormbuf && *(data->pos - 1) != 0)) { 2597 /* 2598 if no normalization. 2599 if previous character is in normalized buffer, no further normalization 2600 is required 2601 */ 2602 if(data->flags & UCOL_USE_ITERATOR) { 2603 data->iterator->move(data->iterator, -1, UITER_CURRENT); 2604 return (UChar)data->iterator->next(data->iterator); 2605 } else { 2606 return *(data->pos - 1); 2607 } 2608 } 2609 2610 start = data->pos; 2611 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) { 2612 /* in data string */ 2613 if ((start - 1) == data->string) { 2614 return *(start - 1); 2615 } 2616 start --; 2617 ch = *start; 2618 prevch = *(start - 1); 2619 } 2620 else { 2621 /* 2622 in writable buffer, at this point fcdPosition can not be NULL. 2623 see contracting tag. 2624 */ 2625 if (data->fcdPosition == data->string) { 2626 /* at the start of the string, just dump it into the normalizer */ 2627 insertBufferFront(data, *(data->fcdPosition)); 2628 data->fcdPosition = NULL; 2629 return *(data->pos - 1); 2630 } 2631 start = data->fcdPosition; 2632 ch = *start; 2633 prevch = *(start - 1); 2634 } 2635 /* 2636 * if the current character is not fcd. 2637 * Trailing combining class == 0. 2638 */ 2639 if (data->fcdPosition > start && 2640 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_)) 2641 { 2642 /* 2643 Need a more complete FCD check and possible normalization. 
2644 normalize substring will be appended to buffer 2645 */ 2646 const UChar *backuppos = data->pos; 2647 data->pos = start; 2648 if (collPrevIterFCD(data)) { 2649 normalizePrevContraction(data, status); 2650 return *(data->pos - 1); 2651 } 2652 data->pos = backuppos; 2653 data->fcdPosition ++; 2654 } 2655 2656 if (innormbuf) { 2657 /* 2658 no normalization is to be done hence only one character will be 2659 appended to the buffer. 2660 */ 2661 insertBufferFront(data, ch); 2662 data->fcdPosition --; 2663 } 2664 2665 return ch; 2666 } 2667 2668 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */ 2669 /* It is called by getNextCE */ 2670 2671 /* The following should be even */ 2672 #define UCOL_MAX_DIGITS_FOR_NUMBER 254 2673 2674 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) { 2675 collIterateState entryState; 2676 backupState(source, &entryState); 2677 UChar32 cp = ch; 2678 2679 for (;;) { 2680 // This loop will repeat only in the case of contractions, and only when a contraction 2681 // is found and the first CE resulting from that contraction is itself a special 2682 // (an expansion, for example.) All other special CE types are fully handled the 2683 // first time through, and the loop exits. 2684 2685 const uint32_t *CEOffset = NULL; 2686 switch(getCETag(CE)) { 2687 case NOT_FOUND_TAG: 2688 /* This one is not found, and we'll let somebody else bother about it... no more games */ 2689 return CE; 2690 case SPEC_PROC_TAG: 2691 { 2692 // Special processing is getting a CE that is preceded by a certain prefix 2693 // Currently this is only needed for optimizing Japanese length and iteration marks. 2694 // When we encouter a special processing tag, we go backwards and try to see if 2695 // we have a match. 2696 // Contraction tables are used - so the whole process is not unlike contraction. 2697 // prefix data is stored backwards in the table. 
2698 const UChar *UCharOffset; 2699 UChar schar, tchar; 2700 collIterateState prefixState; 2701 backupState(source, &prefixState); 2702 loadState(source, &entryState, TRUE); 2703 goBackOne(source); // We want to look at the point where we entered - actually one 2704 // before that... 2705 2706 for(;;) { 2707 // This loop will run once per source string character, for as long as we 2708 // are matching a potential contraction sequence 2709 2710 // First we position ourselves at the begining of contraction sequence 2711 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2712 if (collIter_bos(source)) { 2713 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2714 break; 2715 } 2716 schar = getPrevNormalizedChar(source, status); 2717 goBackOne(source); 2718 2719 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2720 UCharOffset++; 2721 } 2722 2723 if (schar == tchar) { 2724 // Found the source string char in the table. 2725 // Pick up the corresponding CE from the table. 2726 CE = *(coll->contractionCEs + 2727 (UCharOffset - coll->contractionIndex)); 2728 } 2729 else 2730 { 2731 // Source string char was not in the table. 2732 // We have not found the prefix. 2733 CE = *(coll->contractionCEs + 2734 (ContractionStart - coll->contractionIndex)); 2735 } 2736 2737 if(!isPrefix(CE)) { 2738 // The source string char was in the contraction table, and the corresponding 2739 // CE is not a prefix CE. We found the prefix, break 2740 // out of loop, this CE will end up being returned. This is the normal 2741 // way out of prefix handling when the source actually contained 2742 // the prefix. 
2743 break; 2744 } 2745 } 2746 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue 2747 loadState(source, &prefixState, TRUE); 2748 if(source->origFlags & UCOL_USE_ITERATOR) { 2749 source->flags = source->origFlags; 2750 } 2751 } else { // prefix search was a failure, we have to backup all the way to the start 2752 loadState(source, &entryState, TRUE); 2753 } 2754 break; 2755 } 2756 case CONTRACTION_TAG: 2757 { 2758 /* This should handle contractions */ 2759 collIterateState state; 2760 backupState(source, &state); 2761 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; 2762 const UChar *UCharOffset; 2763 UChar schar, tchar; 2764 2765 for (;;) { 2766 /* This loop will run once per source string character, for as long as we */ 2767 /* are matching a potential contraction sequence */ 2768 2769 /* First we position ourselves at the begining of contraction sequence */ 2770 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2771 2772 if (collIter_eos(source)) { 2773 // Ran off the end of the source string. 2774 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2775 // So we'll pick whatever we have at the point... 2776 if (CE == UCOL_NOT_FOUND) { 2777 // back up the source over all the chars we scanned going into this contraction. 
2778 CE = firstCE; 2779 loadState(source, &state, TRUE); 2780 if(source->origFlags & UCOL_USE_ITERATOR) { 2781 source->flags = source->origFlags; 2782 } 2783 } 2784 break; 2785 } 2786 2787 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ 2788 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); 2789 2790 schar = getNextNormalizedChar(source); 2791 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2792 UCharOffset++; 2793 } 2794 2795 if (schar == tchar) { 2796 // Found the source string char in the contraction table. 2797 // Pick up the corresponding CE from the table. 2798 CE = *(coll->contractionCEs + 2799 (UCharOffset - coll->contractionIndex)); 2800 } 2801 else 2802 { 2803 // Source string char was not in contraction table. 2804 // Unless we have a discontiguous contraction, we have finished 2805 // with this contraction. 2806 // in order to do the proper detection, we 2807 // need to see if we're dealing with a supplementary 2808 /* We test whether the next two char are surrogate pairs. 2809 * This test is done if the iterator is not NULL. 2810 * If there is no surrogate pair, the iterator 2811 * goes back one if needed. 
*/ 2812 UChar32 miss = schar; 2813 if (source->iterator) { 2814 UChar32 surrNextChar; /* the next char in the iteration to test */ 2815 int32_t prevPos; /* holds the previous position before move forward of the source iterator */ 2816 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { 2817 prevPos = source->iterator->index; 2818 surrNextChar = getNextNormalizedChar(source); 2819 if (U16_IS_TRAIL(surrNextChar)) { 2820 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); 2821 } else if (prevPos < source->iterator->index){ 2822 goBackOne(source); 2823 } 2824 } 2825 } else if (U16_IS_LEAD(schar)) { 2826 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); 2827 } 2828 2829 uint8_t sCC; 2830 if (miss < 0x300 || 2831 maxCC == 0 || 2832 (sCC = i_getCombiningClass(miss, coll)) == 0 || 2833 sCC>maxCC || 2834 (allSame != 0 && sCC == maxCC) || 2835 collIter_eos(source)) 2836 { 2837 // Contraction can not be discontiguous. 2838 goBackOne(source); // back up the source string by one, 2839 // because the character we just looked at was 2840 // not part of the contraction. */ 2841 if(U_IS_SUPPLEMENTARY(miss)) { 2842 goBackOne(source); 2843 } 2844 CE = *(coll->contractionCEs + 2845 (ContractionStart - coll->contractionIndex)); 2846 } else { 2847 // 2848 // Contraction is possibly discontiguous. 
2849 // Scan more of source string looking for a match 2850 // 2851 UChar tempchar; 2852 /* find the next character if schar is not a base character 2853 and we are not yet at the end of the string */ 2854 tempchar = getNextNormalizedChar(source); 2855 // probably need another supplementary thingie here 2856 goBackOne(source); 2857 if (i_getCombiningClass(tempchar, coll) == 0) { 2858 goBackOne(source); 2859 if(U_IS_SUPPLEMENTARY(miss)) { 2860 goBackOne(source); 2861 } 2862 /* Spit out the last char of the string, wasn't tasty enough */ 2863 CE = *(coll->contractionCEs + 2864 (ContractionStart - coll->contractionIndex)); 2865 } else { 2866 CE = getDiscontiguous(coll, source, ContractionStart); 2867 } 2868 } 2869 } // else after if(schar == tchar) 2870 2871 if(CE == UCOL_NOT_FOUND) { 2872 /* The Source string did not match the contraction that we were checking. */ 2873 /* Back up the source position to undo the effects of having partially */ 2874 /* scanned through what ultimately proved to not be a contraction. */ 2875 loadState(source, &state, TRUE); 2876 CE = firstCE; 2877 break; 2878 } 2879 2880 if(!isContraction(CE)) { 2881 // The source string char was in the contraction table, and the corresponding 2882 // CE is not a contraction CE. We completed the contraction, break 2883 // out of loop, this CE will end up being returned. This is the normal 2884 // way out of contraction handling when the source actually contained 2885 // the contraction. 2886 break; 2887 } 2888 2889 2890 // The source string char was in the contraction table, and the corresponding 2891 // CE is IS a contraction CE. We will continue looping to check the source 2892 // string for the remaining chars in the contraction. 2893 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); 2894 if(tempCE != UCOL_NOT_FOUND) { 2895 // We have scanned a a section of source string for which there is a 2896 // CE from the contraction table. 
Remember the CE and scan position, so 2897 // that we can return to this point if further scanning fails to 2898 // match a longer contraction sequence. 2899 firstCE = tempCE; 2900 2901 goBackOne(source); 2902 backupState(source, &state); 2903 getNextNormalizedChar(source); 2904 2905 // Another way to do this is: 2906 //collIterateState tempState; 2907 //backupState(source, &tempState); 2908 //goBackOne(source); 2909 //backupState(source, &state); 2910 //loadState(source, &tempState, TRUE); 2911 2912 // The problem is that for incomplete contractions we have to remember the previous 2913 // position. Before, the only thing I needed to do was state.pos--; 2914 // After iterator introduction and especially after introduction of normalizing 2915 // iterators, it became much more difficult to decrease the saved state. 2916 // I'm not yet sure which of the two methods above is faster. 2917 } 2918 } // for(;;) 2919 break; 2920 } // case CONTRACTION_TAG: 2921 case LONG_PRIMARY_TAG: 2922 { 2923 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 2924 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 2925 source->offsetRepeatCount += 1; 2926 return CE; 2927 } 2928 case EXPANSION_TAG: 2929 { 2930 /* This should handle expansion. */ 2931 /* NOTE: we can encounter both continuations and expansions in an expansion! 
*/ 2932 /* I have to decide where continuations are going to be dealt with */ 2933 uint32_t size; 2934 uint32_t i; /* general counter */ 2935 2936 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 2937 size = getExpansionCount(CE); 2938 CE = *CEOffset++; 2939 //source->offsetRepeatCount = -1; 2940 2941 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 2942 for(i = 1; i<size; i++) { 2943 *(source->CEpos++) = *CEOffset++; 2944 source->offsetRepeatCount += 1; 2945 } 2946 } else { /* else, we do */ 2947 while(*CEOffset != 0) { 2948 *(source->CEpos++) = *CEOffset++; 2949 source->offsetRepeatCount += 1; 2950 } 2951 } 2952 2953 return CE; 2954 } 2955 case DIGIT_TAG: 2956 { 2957 /* 2958 We do a check to see if we want to collate digits as numbers; if so we generate 2959 a custom collation key. Otherwise we pull out the value stored in the expansion table. 2960 */ 2961 //uint32_t size; 2962 uint32_t i; /* general counter */ 2963 2964 if (source->coll->numericCollation == UCOL_ON){ 2965 collIterateState digitState = {0,0,0,0,0,0,0,0,0}; 2966 UChar32 char32 = 0; 2967 int32_t digVal = 0; 2968 2969 uint32_t digIndx = 0; 2970 uint32_t endIndex = 0; 2971 uint32_t trailingZeroIndex = 0; 2972 2973 uint8_t collateVal = 0; 2974 2975 UBool nonZeroValReached = FALSE; 2976 2977 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. 2978 /* 2979 We parse the source string until we hit a char that's NOT a digit. 2980 Use this u_charDigitValue. This might be slow because we have to 2981 handle surrogates... 
2982 */ 2983 /* 2984 if (U16_IS_LEAD(ch)){ 2985 if (!collIter_eos(source)) { 2986 backupState(source, &digitState); 2987 UChar trail = getNextNormalizedChar(source); 2988 if(U16_IS_TRAIL(trail)) { 2989 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 2990 } else { 2991 loadState(source, &digitState, TRUE); 2992 char32 = ch; 2993 } 2994 } else { 2995 char32 = ch; 2996 } 2997 } else { 2998 char32 = ch; 2999 } 3000 digVal = u_charDigitValue(char32); 3001 */ 3002 digVal = u_charDigitValue(cp); // if we have arrived here, we have 3003 // already processed possible supplementaries that trigered the digit tag - 3004 // all supplementaries are marked in the UCA. 3005 /* 3006 We pad a zero in front of the first element anyways. This takes 3007 care of the (probably) most common case where people are sorting things followed 3008 by a single digit 3009 */ 3010 digIndx++; 3011 for(;;){ 3012 // Make sure we have enough space. No longer needed; 3013 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER 3014 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough 3015 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). 3016 3017 // Skipping over leading zeroes. 3018 if (digVal != 0) { 3019 nonZeroValReached = TRUE; 3020 } 3021 if (nonZeroValReached) { 3022 /* 3023 We parse the digit string into base 100 numbers (this fits into a byte). 3024 We only add to the buffer in twos, thus if we are parsing an odd character, 3025 that serves as the 'tens' digit while the if we are parsing an even one, that 3026 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3027 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3028 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3029 than all the other bytes. 
3030 */ 3031 3032 if (digIndx % 2 == 1){ 3033 collateVal += (uint8_t)digVal; 3034 3035 // We don't enter the low-order-digit case unless we've already seen 3036 // the high order, or for the first digit, which is always non-zero. 3037 if (collateVal != 0) 3038 trailingZeroIndex = 0; 3039 3040 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3041 collateVal = 0; 3042 } 3043 else{ 3044 // We drop the collation value into the buffer so if we need to do 3045 // a "front patch" we don't have to check to see if we're hitting the 3046 // last element. 3047 collateVal = (uint8_t)(digVal * 10); 3048 3049 // Check for trailing zeroes. 3050 if (collateVal == 0) 3051 { 3052 if (!trailingZeroIndex) 3053 trailingZeroIndex = (digIndx/2) + 2; 3054 } 3055 else 3056 trailingZeroIndex = 0; 3057 3058 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3059 } 3060 digIndx++; 3061 } 3062 3063 // Get next character. 3064 if (!collIter_eos(source)){ 3065 ch = getNextNormalizedChar(source); 3066 if (U16_IS_LEAD(ch)){ 3067 if (!collIter_eos(source)) { 3068 backupState(source, &digitState); 3069 UChar trail = getNextNormalizedChar(source); 3070 if(U16_IS_TRAIL(trail)) { 3071 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 3072 } else { 3073 loadState(source, &digitState, TRUE); 3074 char32 = ch; 3075 } 3076 } 3077 } else { 3078 char32 = ch; 3079 } 3080 3081 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){ 3082 // Resetting position to point to the next unprocessed char. We 3083 // overshot it when doing our test/set for numbers. 3084 if (char32 > 0xFFFF) { // For surrogates. 3085 loadState(source, &digitState, TRUE); 3086 //goBackOne(source); 3087 } 3088 goBackOne(source); 3089 break; 3090 } 3091 } else { 3092 break; 3093 } 3094 } 3095 3096 if (nonZeroValReached == FALSE){ 3097 digIndx = 2; 3098 numTempBuf[2] = 6; 3099 } 3100 3101 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; 3102 if (digIndx % 2 != 0){ 3103 /* 3104 We missed a value. 
Since digIndx isn't even, stuck too many values into the buffer (this is what 3105 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. 3106 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a 3107 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. 3108 */ 3109 3110 for(i = 2; i < endIndex; i++){ 3111 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + 3112 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; 3113 } 3114 --digIndx; 3115 } 3116 3117 // Subtract one off of the last byte. 3118 numTempBuf[endIndex-1] -= 1; 3119 3120 /* 3121 We want to skip over the first two slots in the buffer. The first slot 3122 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3123 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 3124 */ 3125 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3126 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); 3127 3128 // Now transfer the collation key to our collIterate struct. 3129 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. 3130 //size = ((endIndex+1) & ~1)/2; 3131 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3132 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3133 UCOL_BYTE_COMMON; // Tertiary weight. 3134 i = 2; // Reset the index into the buffer. 
3135 while(i < endIndex) 3136 { 3137 uint32_t primWeight = numTempBuf[i++] << 8; 3138 if ( i < endIndex) 3139 primWeight |= numTempBuf[i++]; 3140 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3141 } 3142 3143 } else { 3144 // no numeric mode, we'll just switch to whatever we stashed and continue 3145 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 3146 CE = *CEOffset++; 3147 break; 3148 } 3149 return CE; 3150 } 3151 /* various implicits optimization */ 3152 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 3153 /* UCA is filled with these. Tailorings are NOT_FOUND */ 3154 return getImplicit(cp, source); 3155 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 3156 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit 3157 return getImplicit(cp, source); 3158 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3159 { 3160 static const uint32_t 3161 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3162 //const uint32_t LCount = 19; 3163 static const uint32_t VCount = 21; 3164 static const uint32_t TCount = 28; 3165 //const uint32_t NCount = VCount * TCount; // 588 3166 //const uint32_t SCount = LCount * NCount; // 11172 3167 uint32_t L = ch - SBase; 3168 3169 // divide into pieces 3170 3171 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation 3172 L /= TCount; 3173 uint32_t V = L % VCount; 3174 L /= VCount; 3175 3176 // offset them 3177 3178 L += LBase; 3179 V += VBase; 3180 T += TBase; 3181 3182 // return the first CE, but first put the rest into the expansion buffer 3183 if (!source->coll->image->jamoSpecial) { // FAST PATH 3184 3185 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3186 if (T != TBase) { 3187 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3188 } 3189 3190 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3191 3192 } 
else { // Jamo is Special 3193 // Since Hanguls pass the FCD check, it is 3194 // guaranteed that we won't be in 3195 // the normalization buffer if something like this happens 3196 // However, if we are using a uchar iterator and normalization 3197 // is ON, the Hangul that lead us here is going to be in that 3198 // normalization buffer. Here we want to restore the uchar 3199 // iterator state and pull out of the normalization buffer 3200 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { 3201 source->flags = source->origFlags; // restore the iterator 3202 source->pos = NULL; 3203 } 3204 // Move Jamos into normalization buffer 3205 UChar *buffer = source->writableBuffer.getBuffer(4); 3206 int32_t bufferLength; 3207 buffer[0] = (UChar)L; 3208 buffer[1] = (UChar)V; 3209 if (T != TBase) { 3210 buffer[2] = (UChar)T; 3211 bufferLength = 3; 3212 } else { 3213 bufferLength = 2; 3214 } 3215 source->writableBuffer.releaseBuffer(bufferLength); 3216 3217 source->fcdPosition = source->pos; // Indicate where to continue in main input string 3218 // after exhausting the writableBuffer 3219 source->pos = source->writableBuffer.getTerminatedBuffer(); 3220 source->origFlags = source->flags; 3221 source->flags |= UCOL_ITER_INNORMBUF; 3222 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 3223 3224 return(UCOL_IGNORABLE); 3225 } 3226 } 3227 case SURROGATE_TAG: 3228 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ 3229 /* two things can happen here: next code point can be a trailing surrogate - we will use it */ 3230 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ 3231 /* we treat it like an unassigned code point. 
*/ 3232 { 3233 UChar trail; 3234 collIterateState state; 3235 backupState(source, &state); 3236 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { 3237 // we chould have stepped one char forward and it might have turned that it 3238 // was not a trail surrogate. In that case, we have to backup. 3239 loadState(source, &state, TRUE); 3240 return UCOL_NOT_FOUND; 3241 } else { 3242 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ 3243 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail); 3244 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. 3245 // We need to backup 3246 loadState(source, &state, TRUE); 3247 return CE; 3248 } 3249 // calculate the supplementary code point value, if surrogate was not tailored 3250 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 3251 } 3252 } 3253 break; 3254 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 3255 UChar nextChar; 3256 if( source->flags & UCOL_USE_ITERATOR) { 3257 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { 3258 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3259 source->iterator->next(source->iterator); 3260 return getImplicit(cp, source); 3261 } 3262 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) && 3263 U_IS_TRAIL((nextChar=*source->pos))) { 3264 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3265 source->pos++; 3266 return getImplicit(cp, source); 3267 } 3268 return UCOL_NOT_FOUND; 3269 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 3270 return UCOL_NOT_FOUND; /* broken surrogate sequence */ 3271 case CHARSET_TAG: 3272 /* not yet implemented */ 3273 /* probably after 1.8 */ 3274 return UCOL_NOT_FOUND; 3275 default: 3276 *status = U_INTERNAL_PROGRAM_ERROR; 3277 CE=0; 3278 break; 3279 } 3280 if (CE <= UCOL_NOT_FOUND) break; 3281 } 3282 return CE; 3283 } 3284 3285 3286 /* now uses Mark's 
getImplicitPrimary code */ 3287 static 3288 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { 3289 uint32_t r = uprv_uca_getImplicitPrimary(cp); 3290 3291 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; 3292 collationSource->toReturn = collationSource->CEpos; 3293 3294 // **** doesn't work if using iterator **** 3295 if (collationSource->flags & UCOL_ITER_INNORMBUF) { 3296 collationSource->offsetRepeatCount = 1; 3297 } else { 3298 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string); 3299 3300 UErrorCode errorCode = U_ZERO_ERROR; 3301 collationSource->appendOffset(firstOffset, errorCode); 3302 collationSource->appendOffset(firstOffset + 1, errorCode); 3303 3304 collationSource->offsetReturn = collationSource->offsetStore - 1; 3305 *(collationSource->offsetBuffer) = firstOffset; 3306 if (collationSource->offsetReturn == collationSource->offsetBuffer) { 3307 collationSource->offsetStore = collationSource->offsetBuffer; 3308 } 3309 } 3310 3311 return ((r & 0x0000FFFF)<<16) | 0x000000C0; 3312 } 3313 3314 /** 3315 * This function handles the special CEs like contractions, expansions, 3316 * surrogates, Thai. 
3317 * It is called by both getPrevCE 3318 */ 3319 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, 3320 collIterate *source, 3321 UErrorCode *status) 3322 { 3323 const uint32_t *CEOffset = NULL; 3324 UChar *UCharOffset = NULL; 3325 UChar schar; 3326 const UChar *constart = NULL; 3327 uint32_t size; 3328 UChar buffer[UCOL_MAX_BUFFER]; 3329 uint32_t *endCEBuffer; 3330 UChar *strbuffer; 3331 int32_t noChars = 0; 3332 int32_t CECount = 0; 3333 3334 for(;;) 3335 { 3336 /* the only ces that loops are thai and contractions */ 3337 switch (getCETag(CE)) 3338 { 3339 case NOT_FOUND_TAG: /* this tag always returns */ 3340 return CE; 3341 3342 case SPEC_PROC_TAG: 3343 { 3344 // Special processing is getting a CE that is preceded by a certain prefix 3345 // Currently this is only needed for optimizing Japanese length and iteration marks. 3346 // When we encouter a special processing tag, we go backwards and try to see if 3347 // we have a match. 3348 // Contraction tables are used - so the whole process is not unlike contraction. 3349 // prefix data is stored backwards in the table. 
3350 const UChar *UCharOffset; 3351 UChar schar, tchar; 3352 collIterateState prefixState; 3353 backupState(source, &prefixState); 3354 for(;;) { 3355 // This loop will run once per source string character, for as long as we 3356 // are matching a potential contraction sequence 3357 3358 // First we position ourselves at the begining of contraction sequence 3359 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 3360 3361 if (collIter_bos(source)) { 3362 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 3363 break; 3364 } 3365 schar = getPrevNormalizedChar(source, status); 3366 goBackOne(source); 3367 3368 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 3369 UCharOffset++; 3370 } 3371 3372 if (schar == tchar) { 3373 // Found the source string char in the table. 3374 // Pick up the corresponding CE from the table. 3375 CE = *(coll->contractionCEs + 3376 (UCharOffset - coll->contractionIndex)); 3377 } 3378 else 3379 { 3380 // if there is a completely ignorable code point in the middle of 3381 // a prefix, we need to act as if it's not there 3382 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) 3383 // lone surrogates cannot be set to zero as it would break other processing 3384 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 3385 // it's easy for BMP code points 3386 if(isZeroCE == 0) { 3387 continue; 3388 } else if(U16_IS_SURROGATE(schar)) { 3389 // for supplementary code points, we have to check the next one 3390 // situations where we are going to ignore 3391 // 1. beginning of the string: schar is a lone surrogate 3392 // 2. schar is a lone surrogate 3393 // 3. schar is a trail surrogate in a valid surrogate sequence 3394 // that is explicitly set to zero. 
3395 if (!collIter_bos(source)) { 3396 UChar lead; 3397 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) { 3398 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead); 3399 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) { 3400 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); 3401 if(finalCE == 0) { 3402 // this is a real, assigned completely ignorable code point 3403 goBackOne(source); 3404 continue; 3405 } 3406 } 3407 } else { 3408 // lone surrogate, treat like unassigned 3409 return UCOL_NOT_FOUND; 3410 } 3411 } else { 3412 // lone surrogate at the beggining, treat like unassigned 3413 return UCOL_NOT_FOUND; 3414 } 3415 } 3416 // Source string char was not in the table. 3417 // We have not found the prefix. 3418 CE = *(coll->contractionCEs + 3419 (ContractionStart - coll->contractionIndex)); 3420 } 3421 3422 if(!isPrefix(CE)) { 3423 // The source string char was in the contraction table, and the corresponding 3424 // CE is not a prefix CE. We found the prefix, break 3425 // out of loop, this CE will end up being returned. This is the normal 3426 // way out of prefix handling when the source actually contained 3427 // the prefix. 3428 break; 3429 } 3430 } 3431 loadState(source, &prefixState, TRUE); 3432 break; 3433 } 3434 3435 case CONTRACTION_TAG: { 3436 /* to ensure that the backwards and forwards iteration matches, we 3437 take the current region of most possible match and pass it through 3438 the forward iteration. this will ensure that the obstinate problem of 3439 overlapping contractions will not occur. 
3440 */ 3441 schar = peekCodeUnit(source, 0); 3442 constart = (UChar *)coll->image + getContractOffset(CE); 3443 if (isAtStartPrevIterate(source) 3444 /* commented away contraction end checks after adding the checks 3445 in getPrevCE */) { 3446 /* start of string or this is not the end of any contraction */ 3447 CE = *(coll->contractionCEs + 3448 (constart - coll->contractionIndex)); 3449 break; 3450 } 3451 strbuffer = buffer; 3452 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); 3453 *(UCharOffset --) = 0; 3454 noChars = 0; 3455 // have to swap thai characters 3456 while (ucol_unsafeCP(schar, coll)) { 3457 *(UCharOffset) = schar; 3458 noChars++; 3459 UCharOffset --; 3460 schar = getPrevNormalizedChar(source, status); 3461 goBackOne(source); 3462 // TODO: when we exhaust the contraction buffer, 3463 // it needs to get reallocated. The problem is 3464 // that the size depends on the string which is 3465 // not iterated over. However, since we're travelling 3466 // backwards, we already had to set the iterator at 3467 // the end - so we might as well know where we are? 
3468 if (UCharOffset + 1 == buffer) { 3469 /* we have exhausted the buffer */ 3470 int32_t newsize = 0; 3471 if(source->pos) { // actually dealing with a position 3472 newsize = (int32_t)(source->pos - source->string + 1); 3473 } else { // iterator 3474 newsize = 4 * UCOL_MAX_BUFFER; 3475 } 3476 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * 3477 (newsize + UCOL_MAX_BUFFER)); 3478 /* test for NULL */ 3479 if (strbuffer == NULL) { 3480 *status = U_MEMORY_ALLOCATION_ERROR; 3481 return UCOL_NO_MORE_CES; 3482 } 3483 UCharOffset = strbuffer + newsize; 3484 uprv_memcpy(UCharOffset, buffer, 3485 UCOL_MAX_BUFFER * sizeof(UChar)); 3486 UCharOffset --; 3487 } 3488 if ((source->pos && (source->pos == source->string || 3489 ((source->flags & UCOL_ITER_INNORMBUF) && 3490 *(source->pos - 1) == 0 && source->fcdPosition == NULL))) 3491 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { 3492 break; 3493 } 3494 } 3495 /* adds the initial base character to the string */ 3496 *(UCharOffset) = schar; 3497 noChars++; 3498 3499 int32_t offsetBias; 3500 3501 // **** doesn't work if using iterator **** 3502 if (source->flags & UCOL_ITER_INNORMBUF) { 3503 offsetBias = -1; 3504 } else { 3505 offsetBias = (int32_t)(source->pos - source->string); 3506 } 3507 3508 /* a new collIterate is used to simplify things, since using the current 3509 collIterate will mean that the forward and backwards iteration will 3510 share and change the same buffers. we don't want to get into that. */ 3511 collIterate temp; 3512 int32_t rawOffset; 3513 3514 IInit_collIterate(coll, UCharOffset, noChars, &temp, status); 3515 if(U_FAILURE(*status)) { 3516 return UCOL_NULLORDER; 3517 } 3518 temp.flags &= ~UCOL_ITER_NORM; 3519 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; 3520 3521 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero? 
3522 CE = ucol_IGetNextCE(coll, &temp, status); 3523 3524 if (source->extendCEs) { 3525 endCEBuffer = source->extendCEs + source->extendCEsSize; 3526 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t)); 3527 } else { 3528 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; 3529 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t)); 3530 } 3531 3532 while (CE != UCOL_NO_MORE_CES) { 3533 *(source->CEpos ++) = CE; 3534 3535 if (offsetBias >= 0) { 3536 source->appendOffset(rawOffset + offsetBias, *status); 3537 } 3538 3539 CECount++; 3540 if (source->CEpos == endCEBuffer) { 3541 /* ran out of CE space, reallocate to new buffer. 3542 If reallocation fails, reset pointers and bail out, 3543 there's no guarantee of the right character position after 3544 this bail*/ 3545 if (!increaseCEsCapacity(source)) { 3546 *status = U_MEMORY_ALLOCATION_ERROR; 3547 break; 3548 } 3549 3550 endCEBuffer = source->extendCEs + source->extendCEsSize; 3551 } 3552 3553 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { 3554 rawOffset = (int32_t)(temp.fcdPosition - temp.string); 3555 } else { 3556 rawOffset = (int32_t)(temp.pos - temp.string); 3557 } 3558 3559 CE = ucol_IGetNextCE(coll, &temp, status); 3560 } 3561 3562 if (strbuffer != buffer) { 3563 uprv_free(strbuffer); 3564 } 3565 if (U_FAILURE(*status)) { 3566 return (uint32_t)UCOL_NULLORDER; 3567 } 3568 3569 if (source->offsetRepeatValue != 0) { 3570 if (CECount > noChars) { 3571 source->offsetRepeatCount += temp.offsetRepeatCount; 3572 } else { 3573 // **** does this really skip the right offsets? 
**** 3574 source->offsetReturn -= (noChars - CECount); 3575 } 3576 } 3577 3578 if (offsetBias >= 0) { 3579 source->offsetReturn = source->offsetStore - 1; 3580 if (source->offsetReturn == source->offsetBuffer) { 3581 source->offsetStore = source->offsetBuffer; 3582 } 3583 } 3584 3585 source->toReturn = source->CEpos - 1; 3586 if (source->toReturn == source->CEs) { 3587 source->CEpos = source->CEs; 3588 } 3589 3590 return *(source->toReturn); 3591 } 3592 case LONG_PRIMARY_TAG: 3593 { 3594 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 3595 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 3596 source->toReturn = source->CEpos - 1; 3597 3598 if (source->flags & UCOL_ITER_INNORMBUF) { 3599 source->offsetRepeatCount = 1; 3600 } else { 3601 int32_t firstOffset = (int32_t)(source->pos - source->string); 3602 3603 source->appendOffset(firstOffset, *status); 3604 source->appendOffset(firstOffset + 1, *status); 3605 3606 source->offsetReturn = source->offsetStore - 1; 3607 *(source->offsetBuffer) = firstOffset; 3608 if (source->offsetReturn == source->offsetBuffer) { 3609 source->offsetStore = source->offsetBuffer; 3610 } 3611 } 3612 3613 3614 return *(source->toReturn); 3615 } 3616 3617 case EXPANSION_TAG: /* this tag always returns */ 3618 { 3619 /* 3620 This should handle expansion. 3621 NOTE: we can encounter both continuations and expansions in an expansion! 3622 I have to decide where continuations are going to be dealt with 3623 */ 3624 int32_t firstOffset = (int32_t)(source->pos - source->string); 3625 3626 // **** doesn't work if using iterator **** 3627 if (source->offsetReturn != NULL) { 3628 if (! 
(source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) { 3629 source->offsetStore = source->offsetBuffer; 3630 }else { 3631 firstOffset = -1; 3632 } 3633 } 3634 3635 /* find the offset to expansion table */ 3636 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3637 size = getExpansionCount(CE); 3638 if (size != 0) { 3639 /* 3640 if there are less than 16 elements in expansion, we don't terminate 3641 */ 3642 uint32_t count; 3643 3644 for (count = 0; count < size; count++) { 3645 *(source->CEpos ++) = *CEOffset++; 3646 3647 if (firstOffset >= 0) { 3648 source->appendOffset(firstOffset + 1, *status); 3649 } 3650 } 3651 } else { 3652 /* else, we do */ 3653 while (*CEOffset != 0) { 3654 *(source->CEpos ++) = *CEOffset ++; 3655 3656 if (firstOffset >= 0) { 3657 source->appendOffset(firstOffset + 1, *status); 3658 } 3659 } 3660 } 3661 3662 if (firstOffset >= 0) { 3663 source->offsetReturn = source->offsetStore - 1; 3664 *(source->offsetBuffer) = firstOffset; 3665 if (source->offsetReturn == source->offsetBuffer) { 3666 source->offsetStore = source->offsetBuffer; 3667 } 3668 } else { 3669 source->offsetRepeatCount += size - 1; 3670 } 3671 3672 source->toReturn = source->CEpos - 1; 3673 // in case of one element expansion, we 3674 // want to immediately return CEpos 3675 if(source->toReturn == source->CEs) { 3676 source->CEpos = source->CEs; 3677 } 3678 3679 return *(source->toReturn); 3680 } 3681 3682 case DIGIT_TAG: 3683 { 3684 /* 3685 We do a check to see if we want to collate digits as numbers; if so we generate 3686 a custom collation key. Otherwise we pull out the value stored in the expansion table. 
3687 */ 3688 uint32_t i; /* general counter */ 3689 3690 if (source->coll->numericCollation == UCOL_ON){ 3691 uint32_t digIndx = 0; 3692 uint32_t endIndex = 0; 3693 uint32_t leadingZeroIndex = 0; 3694 uint32_t trailingZeroCount = 0; 3695 3696 uint8_t collateVal = 0; 3697 3698 UBool nonZeroValReached = FALSE; 3699 3700 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs. 3701 /* 3702 We parse the source string until we hit a char that's NOT a digit. 3703 Use this u_charDigitValue. This might be slow because we have to 3704 handle surrogates... 3705 */ 3706 /* 3707 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less, 3708 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation 3709 element we process when going backward. To determine how long that chunk might be, we may need to make 3710 two passes through the loop that collects digits - one to see how long the string is (and how much is 3711 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has 3712 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation 3713 element chunk after resetting the state to the initialState at the right side of the digit string. 
3714 */ 3715 uint32_t ceLimit = 0; 3716 UChar initial_ch = ch; 3717 collIterateState initialState = {0,0,0,0,0,0,0,0,0}; 3718 backupState(source, &initialState); 3719 3720 for(;;) { 3721 collIterateState state = {0,0,0,0,0,0,0,0,0}; 3722 UChar32 char32 = 0; 3723 int32_t digVal = 0; 3724 3725 if (U16_IS_TRAIL (ch)) { 3726 if (!collIter_bos(source)){ 3727 UChar lead = getPrevNormalizedChar(source, status); 3728 if(U16_IS_LEAD(lead)) { 3729 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3730 goBackOne(source); 3731 } else { 3732 char32 = ch; 3733 } 3734 } else { 3735 char32 = ch; 3736 } 3737 } else { 3738 char32 = ch; 3739 } 3740 digVal = u_charDigitValue(char32); 3741 3742 for(;;) { 3743 // Make sure we have enough space. No longer needed; 3744 // at this point the largest value of digIndx when we need to save data in numTempBuf 3745 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure 3746 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2). 3747 3748 // Skip over trailing zeroes, and keep a count of them. 3749 if (digVal != 0) 3750 nonZeroValReached = TRUE; 3751 3752 if (nonZeroValReached) { 3753 /* 3754 We parse the digit string into base 100 numbers (this fits into a byte). 3755 We only add to the buffer in twos, thus if we are parsing an odd character, 3756 that serves as the 'tens' digit while the if we are parsing an even one, that 3757 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3758 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3759 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3760 than all the other bytes. 3761 3762 Since we're doing in this reverse we want to put the first digit encountered into the 3763 ones place and the second digit encountered into the tens place. 
3764 */ 3765 3766 if ((digIndx + trailingZeroCount) % 2 == 1) { 3767 // High-order digit case (tens place) 3768 collateVal += (uint8_t)(digVal * 10); 3769 3770 // We cannot set leadingZeroIndex unless it has been set for the 3771 // low-order digit. Therefore, all we can do for the high-order 3772 // digit is turn it off, never on. 3773 // The only time we will have a high digit without a low is for 3774 // the very first non-zero digit, so no zero check is necessary. 3775 if (collateVal != 0) 3776 leadingZeroIndex = 0; 3777 3778 // The first pass through, digIndx may exceed the limit, but in that case 3779 // we no longer care about numTempBuf contents since they will be discarded 3780 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) { 3781 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3782 } 3783 collateVal = 0; 3784 } else { 3785 // Low-order digit case (ones place) 3786 collateVal = (uint8_t)digVal; 3787 3788 // Check for leading zeroes. 3789 if (collateVal == 0) { 3790 if (!leadingZeroIndex) 3791 leadingZeroIndex = (digIndx/2) + 2; 3792 } else 3793 leadingZeroIndex = 0; 3794 3795 // No need to write to buffer; the case of a last odd digit 3796 // is handled below. 3797 } 3798 ++digIndx; 3799 } else 3800 ++trailingZeroCount; 3801 3802 if (!collIter_bos(source)) { 3803 ch = getPrevNormalizedChar(source, status); 3804 //goBackOne(source); 3805 if (U16_IS_TRAIL(ch)) { 3806 backupState(source, &state); 3807 if (!collIter_bos(source)) { 3808 goBackOne(source); 3809 UChar lead = getPrevNormalizedChar(source, status); 3810 3811 if(U16_IS_LEAD(lead)) { 3812 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3813 } else { 3814 loadState(source, &state, FALSE); 3815 char32 = ch; 3816 } 3817 } 3818 } else 3819 char32 = ch; 3820 3821 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { 3822 if (char32 > 0xFFFF) {// For surrogates. 
3823 loadState(source, &state, FALSE); 3824 } 3825 // Don't need to "reverse" the goBackOne call, 3826 // as this points to the next position to process.. 3827 //if (char32 > 0xFFFF) // For surrogates. 3828 //getNextNormalizedChar(source); 3829 break; 3830 } 3831 3832 goBackOne(source); 3833 }else 3834 break; 3835 } 3836 3837 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) { 3838 // our collation element is not too big, go ahead and finish with it 3839 break; 3840 } 3841 // our digit string is too long for a collation element; 3842 // set the limit for it, reset the state and begin again 3843 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER; 3844 if ( ceLimit == 0 ) { 3845 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; 3846 } 3847 ch = initial_ch; 3848 loadState(source, &initialState, FALSE); 3849 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0; 3850 collateVal = 0; 3851 nonZeroValReached = FALSE; 3852 } 3853 3854 if (! nonZeroValReached) { 3855 digIndx = 2; 3856 trailingZeroCount = 0; 3857 numTempBuf[2] = 6; 3858 } 3859 3860 if ((digIndx + trailingZeroCount) % 2 != 0) { 3861 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; 3862 digIndx += 1; // The implicit leading zero 3863 } 3864 if (trailingZeroCount % 2 != 0) { 3865 // We had to consume one trailing zero for the low digit 3866 // of the least significant byte 3867 digIndx += 1; // The trailing zero not in the exponent 3868 trailingZeroCount -= 1; 3869 } 3870 3871 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; 3872 3873 // Subtract one off of the last byte. Really the first byte here, but it's reversed... 3874 numTempBuf[2] -= 1; 3875 3876 /* 3877 We want to skip over the first two slots in the buffer. The first slot 3878 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3879 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 
3880 The exponent must be adjusted by the number of leading zeroes, and the number of 3881 trailing zeroes. 3882 */ 3883 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3884 uint32_t exponent = (digIndx+trailingZeroCount)/2; 3885 if (leadingZeroIndex) 3886 exponent -= ((digIndx/2) + 2 - leadingZeroIndex); 3887 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); 3888 3889 // Now transfer the collation key to our collIterate struct. 3890 // The total size for our collation key is half of endIndex, rounded up. 3891 int32_t size = (endIndex+1)/2; 3892 if(!ensureCEsCapacity(source, size)) { 3893 return UCOL_NULLORDER; 3894 } 3895 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3896 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3897 UCOL_BYTE_COMMON; // Tertiary weight. 3898 i = endIndex - 1; // Reset the index into the buffer. 3899 while(i >= 2) { 3900 uint32_t primWeight = numTempBuf[i--] << 8; 3901 if ( i >= 2) 3902 primWeight |= numTempBuf[i--]; 3903 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3904 } 3905 3906 source->toReturn = source->CEpos -1; 3907 return *(source->toReturn); 3908 } else { 3909 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3910 CE = *(CEOffset++); 3911 break; 3912 } 3913 } 3914 3915 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3916 { 3917 static const uint32_t 3918 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3919 //const uint32_t LCount = 19; 3920 static const uint32_t VCount = 21; 3921 static const uint32_t TCount = 28; 3922 //const uint32_t NCount = VCount * TCount; /* 588 */ 3923 //const uint32_t SCount = LCount * NCount; /* 11172 */ 3924 3925 uint32_t L = ch - SBase; 3926 /* 3927 divide into pieces. 
3928 we do it in this order since some compilers can do % and / in one 3929 operation 3930 */ 3931 uint32_t T = L % TCount; 3932 L /= TCount; 3933 uint32_t V = L % VCount; 3934 L /= VCount; 3935 3936 /* offset them */ 3937 L += LBase; 3938 V += VBase; 3939 T += TBase; 3940 3941 int32_t firstOffset = (int32_t)(source->pos - source->string); 3942 source->appendOffset(firstOffset, *status); 3943 3944 /* 3945 * return the first CE, but first put the rest into the expansion buffer 3946 */ 3947 if (!source->coll->image->jamoSpecial) { 3948 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3949 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3950 source->appendOffset(firstOffset + 1, *status); 3951 3952 if (T != TBase) { 3953 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3954 source->appendOffset(firstOffset + 1, *status); 3955 } 3956 3957 source->toReturn = source->CEpos - 1; 3958 3959 source->offsetReturn = source->offsetStore - 1; 3960 if (source->offsetReturn == source->offsetBuffer) { 3961 source->offsetStore = source->offsetBuffer; 3962 } 3963 3964 return *(source->toReturn); 3965 } else { 3966 // Since Hanguls pass the FCD check, it is 3967 // guaranteed that we won't be in 3968 // the normalization buffer if something like this happens 3969 // Move Jamos into normalization buffer 3970 /* 3971 Move the Jamos into the 3972 normalization buffer 3973 */ 3974 UChar *tempbuffer = source->writableBuffer.getBuffer(5); 3975 int32_t tempbufferLength; 3976 tempbuffer[0] = 0; 3977 tempbuffer[1] = (UChar)L; 3978 tempbuffer[2] = (UChar)V; 3979 if (T != TBase) { 3980 tempbuffer[3] = (UChar)T; 3981 tempbufferLength = 4; 3982 } else { 3983 tempbufferLength = 3; 3984 } 3985 source->writableBuffer.releaseBuffer(tempbufferLength); 3986 3987 /* 3988 Indicate where to continue in main input string after exhausting 3989 the writableBuffer 3990 */ 3991 if (source->pos == source->string) { 3992 source->fcdPosition = NULL; 3993 } else { 3994 
source->fcdPosition = source->pos-1; 3995 } 3996 3997 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength; 3998 source->origFlags = source->flags; 3999 source->flags |= UCOL_ITER_INNORMBUF; 4000 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 4001 4002 return(UCOL_IGNORABLE); 4003 } 4004 } 4005 4006 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 4007 return getPrevImplicit(ch, source); 4008 4009 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function 4010 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 4011 return getPrevImplicit(ch, source); 4012 4013 case SURROGATE_TAG: /* This is a surrogate pair */ 4014 /* essentially an engaged lead surrogate. */ 4015 /* if you have encountered it here, it means that a */ 4016 /* broken sequence was encountered and this is an error */ 4017 return UCOL_NOT_FOUND; 4018 4019 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 4020 return UCOL_NOT_FOUND; /* broken surrogate sequence */ 4021 4022 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 4023 { 4024 UChar32 cp = 0; 4025 UChar prevChar; 4026 const UChar *prev; 4027 if (isAtStartPrevIterate(source)) { 4028 /* we are at the start of the string, wrong place to be at */ 4029 return UCOL_NOT_FOUND; 4030 } 4031 if (source->pos != source->writableBuffer.getBuffer()) { 4032 prev = source->pos - 1; 4033 } else { 4034 prev = source->fcdPosition; 4035 } 4036 prevChar = *prev; 4037 4038 /* Handles Han and Supplementary characters here.*/ 4039 if (U16_IS_LEAD(prevChar)) { 4040 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 4041 source->pos = prev; 4042 } else { 4043 return UCOL_NOT_FOUND; /* like unassigned */ 4044 } 4045 4046 return getPrevImplicit(cp, source); 4047 } 4048 4049 /* UCA is filled with these. 
Tailorings are NOT_FOUND */ 4050 /* not yet implemented */ 4051 case CHARSET_TAG: /* this tag always returns */ 4052 /* probably after 1.8 */ 4053 return UCOL_NOT_FOUND; 4054 4055 default: /* this tag always returns */ 4056 *status = U_INTERNAL_PROGRAM_ERROR; 4057 CE=0; 4058 break; 4059 } 4060 4061 if (CE <= UCOL_NOT_FOUND) { 4062 break; 4063 } 4064 } 4065 4066 return CE; 4067 } 4068 4069 /* This should really be a macro */ 4070 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */ 4071 /* anyway */ 4072 static 4073 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) { 4074 #ifdef UCOL_DEBUG 4075 fprintf(stderr, "."); 4076 #endif 4077 uint8_t *newStart = NULL; 4078 uint32_t offset = (uint32_t)(*secondaries-secStart); 4079 4080 if(secStart==second) { 4081 newStart=(uint8_t*)uprv_malloc(newSize); 4082 if(newStart==NULL) { 4083 *status = U_MEMORY_ALLOCATION_ERROR; 4084 return NULL; 4085 } 4086 uprv_memcpy(newStart, secStart, *secondaries-secStart); 4087 } else { 4088 newStart=(uint8_t*)uprv_realloc(secStart, newSize); 4089 if(newStart==NULL) { 4090 *status = U_MEMORY_ALLOCATION_ERROR; 4091 /* Since we're reallocating, return original reference so we don't loose it. */ 4092 return secStart; 4093 } 4094 } 4095 *secondaries=newStart+offset; 4096 *secSize=newSize; 4097 return newStart; 4098 } 4099 4100 4101 /* This should really be a macro */ 4102 /* This function is used to reverse parts of a buffer. 
/* We need this operation when doing continuation */
/* secondaries in French */
/*
void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
  uint8_t temp;
  while(start<end) {
    temp = *start;
    *start++ = *end;
    *end-- = temp;
  }
}
*/

/*
 * Reverses, in place, the elements from *start through *end (both inclusive).
 * TYPE is the element type used for the temporary.
 * Note: the macro itself advances start and moves end backwards until they
 * meet, so both arguments must be modifiable lvalues and they no longer
 * delimit the original range once the macro has run.
 * The expansion is a brace-enclosed block, not a single statement.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
  TYPE tempA; \
  while((start)<(end)) { \
    tempA = *(start); \
    *(start)++ = *(end); \
    *(end)-- = tempA; \
  } \
}

/****************************************************************************/
/* Following are the sortkey generation functions                           */
/*                                                                          */
/****************************************************************************/

/**
 * Merge two sort keys.
 * This is useful, for example, to combine sort keys from first and last names
 * to sort such pairs.
 * Merged sort keys consider on each collation level the first part first entirely,
 * then the second one.
 * It is possible to merge multiple sort keys by consecutively merging
 * another one with the intermediate result.
 *
 * The length of the merge result is the sum of the lengths of the input sort keys
 * minus 1.
4140 * 4141 * @param src1 the first sort key 4142 * @param src1Length the length of the first sort key, including the zero byte at the end; 4143 * can be -1 if the function is to find the length 4144 * @param src2 the second sort key 4145 * @param src2Length the length of the second sort key, including the zero byte at the end; 4146 * can be -1 if the function is to find the length 4147 * @param dest the buffer where the merged sort key is written, 4148 * can be NULL if destCapacity==0 4149 * @param destCapacity the number of bytes in the dest buffer 4150 * @return the length of the merged sort key, src1Length+src2Length-1; 4151 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments), 4152 * in which cases the contents of dest is undefined 4153 * 4154 * @draft 4155 */ 4156 U_CAPI int32_t U_EXPORT2 4157 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 4158 const uint8_t *src2, int32_t src2Length, 4159 uint8_t *dest, int32_t destCapacity) { 4160 int32_t destLength; 4161 uint8_t b; 4162 4163 /* check arguments */ 4164 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 4165 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 4166 destCapacity<0 || (destCapacity>0 && dest==NULL) 4167 ) { 4168 /* error, attempt to write a zero byte and return 0 */ 4169 if(dest!=NULL && destCapacity>0) { 4170 *dest=0; 4171 } 4172 return 0; 4173 } 4174 4175 /* check lengths and capacity */ 4176 if(src1Length<0) { 4177 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 4178 } 4179 if(src2Length<0) { 4180 src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 4181 } 4182 4183 destLength=src1Length+src2Length-1; 4184 if(destLength>destCapacity) { 4185 /* the merged sort key does not fit into the destination */ 4186 return destLength; 4187 } 4188 4189 /* merge the sort keys with the same number of levels */ 4190 while(*src1!=0 && *src2!=0) { /* while both have 
another level */ 4191 /* copy level from src1 not including 00 or 01 */ 4192 while((b=*src1)>=2) { 4193 ++src1; 4194 *dest++=b; 4195 } 4196 4197 /* add a 02 merge separator */ 4198 *dest++=2; 4199 4200 /* copy level from src2 not including 00 or 01 */ 4201 while((b=*src2)>=2) { 4202 ++src2; 4203 *dest++=b; 4204 } 4205 4206 /* if both sort keys have another level, then add a 01 level separator and continue */ 4207 if(*src1==1 && *src2==1) { 4208 ++src1; 4209 ++src2; 4210 *dest++=1; 4211 } 4212 } 4213 4214 /* 4215 * here, at least one sort key is finished now, but the other one 4216 * might have some contents left from containing more levels; 4217 * that contents is just appended to the result 4218 */ 4219 if(*src1!=0) { 4220 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 4221 src2=src1; 4222 } 4223 /* append src2, "the other, unfinished sort key" */ 4224 uprv_strcpy((char *)dest, (const char *)src2); 4225 4226 /* trust that neither sort key contained illegally embedded zero bytes */ 4227 return destLength; 4228 } 4229 4230 /* sortkey API */ 4231 U_CAPI int32_t U_EXPORT2 4232 ucol_getSortKey(const UCollator *coll, 4233 const UChar *source, 4234 int32_t sourceLength, 4235 uint8_t *result, 4236 int32_t resultLength) 4237 { 4238 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 4239 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 4240 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 4241 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); 4242 } 4243 4244 UErrorCode status = U_ZERO_ERROR; 4245 int32_t keySize = 0; 4246 4247 if(source != NULL) { 4248 // source == NULL is actually an error situation, but we would need to 4249 // have an error code to return it. 
Until we introduce a new 4250 // API, it stays like this 4251 4252 /* this uses the function pointer that is set in updateinternalstate */ 4253 /* currently, there are two funcs: */ 4254 /*ucol_calcSortKey(...);*/ 4255 /*ucol_calcSortKeySimpleTertiary(...);*/ 4256 4257 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status); 4258 //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) { 4259 // That's not good. Something unusual happened. 4260 // We don't know how much we initialized before we failed. 4261 // NULL terminate for safety. 4262 // We have no way say that we have generated a partial sort key. 4263 //result[0] = 0; 4264 //keySize = 0; 4265 //} 4266 } 4267 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 4268 UTRACE_EXIT_STATUS(status); 4269 return keySize; 4270 } 4271 4272 /* this function is called by the C++ API for sortkey generation */ 4273 U_CFUNC int32_t 4274 ucol_getSortKeyWithAllocation(const UCollator *coll, 4275 const UChar *source, int32_t sourceLength, 4276 uint8_t **pResult, 4277 UErrorCode *pErrorCode) { 4278 *pResult = 0; 4279 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode); 4280 } 4281 4282 #define UCOL_FSEC_BUF_SIZE 256 4283 4284 // Is this primary weight compressible? 4285 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). 4286 // TODO: This should use per-lead-byte flags from FractionalUCA.txt. 4287 static inline UBool 4288 isCompressible(const UCollator * /*coll*/, uint8_t primary1) { 4289 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary; 4290 } 4291 4292 /* This function tries to get the size of a sortkey. 
It will be invoked if the size of resulting buffer is 0 */ 4293 /* or if we run out of space while making a sortkey and want to return ASAP */ 4294 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) { 4295 UErrorCode status = U_ZERO_ERROR; 4296 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 4297 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4298 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4299 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4300 UBool compareIdent = (strength == UCOL_IDENTICAL); 4301 UBool doCase = (coll->caseLevel == UCOL_ON); 4302 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4303 //UBool qShifted = shifted && (compareQuad == 0); 4304 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4305 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4306 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE]; 4307 uint8_t *fSecs = fSecsBuff; 4308 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE; 4309 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL; 4310 4311 uint32_t variableTopValue = coll->variableTopValue; 4312 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4313 if(doHiragana) { 4314 UCOL_COMMON_BOT4++; 4315 /* allocate one more space for hiragana */ 4316 } 4317 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4318 4319 uint32_t order = UCOL_NO_MORE_CES; 4320 uint8_t primary1 = 0; 4321 uint8_t primary2 = 0; 4322 uint8_t secondary = 0; 4323 uint8_t tertiary = 0; 4324 int32_t caseShift = 0; 4325 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */ 4326 4327 uint8_t caseSwitch = coll->caseSwitch; 4328 uint8_t tertiaryMask = coll->tertiaryMask; 4329 uint8_t tertiaryCommon = coll->tertiaryCommon; 4330 4331 UBool wasShifted = FALSE; 4332 UBool 
notIsContinuation = FALSE; 4333 uint8_t leadPrimary = 0; 4334 4335 4336 for(;;) { 4337 order = ucol_IGetNextCE(coll, s, &status); 4338 if(order == UCOL_NO_MORE_CES) { 4339 break; 4340 } 4341 4342 if(order == 0) { 4343 continue; 4344 } 4345 4346 notIsContinuation = !isContinuation(order); 4347 4348 4349 if(notIsContinuation) { 4350 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK)); 4351 } else { 4352 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4353 } 4354 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4355 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4356 primary1 = (uint8_t)(order >> 8); 4357 4358 /* no need to permute since the actual code values don't matter 4359 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { 4360 primary1 = coll->leadBytePermutationTable[primary1]; 4361 } 4362 */ 4363 4364 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4365 || (!notIsContinuation && wasShifted))) 4366 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */ 4367 /* and other ignorables should be removed if following a shifted code point */ 4368 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4369 /* we should just completely ignore it */ 4370 continue; 4371 } 4372 if(compareQuad == 0) { 4373 if(c4 > 0) { 4374 currentSize += (c2/UCOL_BOT_COUNT4)+1; 4375 c4 = 0; 4376 } 4377 currentSize++; 4378 if(primary2 != 0) { 4379 currentSize++; 4380 } 4381 } 4382 wasShifted = TRUE; 4383 } else { 4384 wasShifted = FALSE; 4385 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. 
*/ 4386 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 4387 /* calculate sortkey size */ 4388 if(primary1 != UCOL_IGNORABLE) { 4389 if(notIsContinuation) { 4390 if(leadPrimary == primary1) { 4391 currentSize++; 4392 } else { 4393 if(leadPrimary != 0) { 4394 currentSize++; 4395 } 4396 if(primary2 == UCOL_IGNORABLE) { 4397 /* one byter, not compressed */ 4398 currentSize++; 4399 leadPrimary = 0; 4400 } else if(isCompressible(coll, primary1)) { 4401 /* compress */ 4402 leadPrimary = primary1; 4403 currentSize+=2; 4404 } else { 4405 leadPrimary = 0; 4406 currentSize+=2; 4407 } 4408 } 4409 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 4410 currentSize++; 4411 if(primary2 != UCOL_IGNORABLE) { 4412 currentSize++; 4413 } 4414 } 4415 } 4416 4417 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */ 4418 if(!isFrenchSec){ 4419 if (secondary == UCOL_COMMON2 && notIsContinuation) { 4420 c2++; 4421 } else { 4422 if(c2 > 0) { 4423 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
4424 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1; 4425 } else { 4426 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1; 4427 } 4428 c2 = 0; 4429 } 4430 currentSize++; 4431 } 4432 } else { 4433 fSecs[fSecsLen++] = secondary; 4434 if(fSecsLen == fSecsMaxLen) { 4435 uint8_t *fSecsTemp; 4436 if(fSecs == fSecsBuff) { 4437 fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen); 4438 } else { 4439 fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen); 4440 } 4441 if(fSecsTemp == NULL) { 4442 status = U_MEMORY_ALLOCATION_ERROR; 4443 return 0; 4444 } 4445 fSecs = fSecsTemp; 4446 fSecsMaxLen *= 2; 4447 } 4448 if(notIsContinuation) { 4449 if (frenchStartPtr != NULL) { 4450 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4451 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4452 frenchStartPtr = NULL; 4453 } 4454 } else { 4455 if (frenchStartPtr == NULL) { 4456 frenchStartPtr = fSecs+fSecsLen-2; 4457 } 4458 frenchEndPtr = fSecs+fSecsLen-1; 4459 } 4460 } 4461 } 4462 4463 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 4464 // do the case level if we need to do it. 
We don't want to calculate 4465 // case level for primary ignorables if we have only primary strength and case level 4466 // otherwise we would break well formedness of CEs 4467 if (caseShift == 0) { 4468 currentSize++; 4469 caseShift = UCOL_CASE_SHIFT_START; 4470 } 4471 if((tertiary&0x3F) > 0 && notIsContinuation) { 4472 caseShift--; 4473 if((tertiary &0xC0) != 0) { 4474 if (caseShift == 0) { 4475 currentSize++; 4476 caseShift = UCOL_CASE_SHIFT_START; 4477 } 4478 caseShift--; 4479 } 4480 } 4481 } else { 4482 if(notIsContinuation) { 4483 tertiary ^= caseSwitch; 4484 } 4485 } 4486 4487 tertiary &= tertiaryMask; 4488 if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */ 4489 if (tertiary == tertiaryCommon && notIsContinuation) { 4490 c3++; 4491 } else { 4492 if(c3 > 0) { 4493 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) 4494 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) { 4495 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1; 4496 } else { 4497 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1; 4498 } 4499 c3 = 0; 4500 } 4501 currentSize++; 4502 } 4503 } 4504 4505 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 4506 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 4507 if(c4>0) { // Close this part 4508 currentSize += (c4/UCOL_BOT_COUNT4)+1; 4509 c4 = 0; 4510 } 4511 currentSize++; // Add the Hiragana 4512 } else { // This wasn't Hiragana, so we can continue adding stuff 4513 c4++; 4514 } 4515 } 4516 } 4517 } 4518 4519 if(!isFrenchSec){ 4520 if(c2 > 0) { 4521 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4522 } 4523 } else { 4524 uint32_t i = 0; 4525 if(frenchStartPtr != NULL) { 4526 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4527 } 4528 for(i = 0; i<fSecsLen; i++) { 4529 secondary = *(fSecs+fSecsLen-i-1); 4530 /* This is compression code. 
*/ 4531 if (secondary == UCOL_COMMON2) { 4532 ++c2; 4533 } else { 4534 if(c2 > 0) { 4535 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4536 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0); 4537 } else { 4538 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4539 } 4540 c2 = 0; 4541 } 4542 currentSize++; 4543 } 4544 } 4545 if(c2 > 0) { 4546 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4547 } 4548 if(fSecs != fSecsBuff) { 4549 uprv_free(fSecs); 4550 } 4551 } 4552 4553 if(c3 > 0) { 4554 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0); 4555 } 4556 4557 if(c4 > 0 && compareQuad == 0) { 4558 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0); 4559 } 4560 4561 if(compareIdent) { 4562 currentSize += u_lengthOfIdenticalLevelRun(s->string, len); 4563 } 4564 return currentSize; 4565 } 4566 4567 static 4568 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) { 4569 if (caseShift == 0) { 4570 *(*cases)++ = UCOL_CASE_BYTE_START; 4571 caseShift = UCOL_CASE_SHIFT_START; 4572 } 4573 } 4574 4575 // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we 4576 // know how many values we wanted to add, even if we didn't add them all 4577 static 4578 inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) { 4579 size++; 4580 if(primaries < limit) { 4581 *(primaries)++ = value; 4582 } 4583 } 4584 4585 // Packs the secondary buffer when processing French locale. Adds the terminator. 
4586 static 4587 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) { 4588 uint8_t secondary; 4589 int32_t count2 = 0; 4590 uint32_t i = 0, size = 0; 4591 // we use i here since the key size already accounts for terminators, so we'll discard the increment 4592 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR); 4593 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */ 4594 if(frenchStartPtr != NULL) { 4595 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4596 } 4597 for(i = 0; i<*secsize; i++) { 4598 secondary = *(secondaries-i-1); 4599 /* This is compression code. */ 4600 if (secondary == UCOL_COMMON2) { 4601 ++count2; 4602 } else { 4603 if (count2 > 0) { 4604 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4605 while (count2 > UCOL_TOP_COUNT2) { 4606 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2)); 4607 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4608 } 4609 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1))); 4610 } else { 4611 while (count2 > UCOL_BOT_COUNT2) { 4612 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4613 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4614 } 4615 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4616 } 4617 count2 = 0; 4618 } 4619 addWithIncrement(primaries, primEnd, size, secondary); 4620 } 4621 } 4622 if (count2 > 0) { 4623 while (count2 > UCOL_BOT_COUNT2) { 4624 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4625 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4626 } 4627 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4628 } 4629 *secsize = size; 4630 return primaries; 4631 } 4632 4633 #define 
DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 4634 4635 /* This is the sortkey work horse function */ 4636 U_CFUNC int32_t U_CALLCONV 4637 ucol_calcSortKey(const UCollator *coll, 4638 const UChar *source, 4639 int32_t sourceLength, 4640 uint8_t **result, 4641 uint32_t resultLength, 4642 UBool allocateSKBuffer, 4643 UErrorCode *status) 4644 { 4645 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 4646 4647 uint32_t i = 0; /* general purpose counter */ 4648 4649 /* Stack allocated buffers for buffers we use */ 4650 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER]; 4651 4652 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad; 4653 4654 if(U_FAILURE(*status)) { 4655 return 0; 4656 } 4657 4658 if(primaries == NULL && allocateSKBuffer == TRUE) { 4659 primaries = *result = prim; 4660 resultLength = UCOL_PRIMARY_MAX_BUFFER; 4661 } 4662 4663 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER, 4664 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER; 4665 4666 uint32_t sortKeySize = 1; /* it is always \0 terminated */ 4667 4668 UnicodeString normSource; 4669 4670 int32_t len = (sourceLength == -1 ? 
u_strlen(source) : sourceLength); 4671 4672 UColAttributeValue strength = coll->strength; 4673 4674 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4675 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4676 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4677 UBool compareIdent = (strength == UCOL_IDENTICAL); 4678 UBool doCase = (coll->caseLevel == UCOL_ON); 4679 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4680 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4681 //UBool qShifted = shifted && (compareQuad == 0); 4682 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4683 4684 uint32_t variableTopValue = coll->variableTopValue; 4685 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no 4686 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. 4687 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4688 uint8_t UCOL_HIRAGANA_QUAD = 0; 4689 if(doHiragana) { 4690 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; 4691 /* allocate one more space for hiragana, value for hiragana */ 4692 } 4693 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4694 4695 /* support for special features like caselevel and funky secondaries */ 4696 uint8_t *frenchStartPtr = NULL; 4697 uint8_t *frenchEndPtr = NULL; 4698 uint32_t caseShift = 0; 4699 4700 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0)); 4701 4702 /* If we need to normalize, we'll do it all at once at the beginning! 
*/ 4703 const Normalizer2 *norm2; 4704 if(compareIdent) { 4705 norm2 = Normalizer2Factory::getNFDInstance(*status); 4706 } else if(coll->normalizationMode != UCOL_OFF) { 4707 norm2 = Normalizer2Factory::getFCDInstance(*status); 4708 } else { 4709 norm2 = NULL; 4710 } 4711 if(norm2 != NULL) { 4712 normSource.setTo(FALSE, source, len); 4713 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 4714 if(qcYesLength != len) { 4715 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 4716 normSource.truncate(qcYesLength); 4717 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 4718 source = normSource.getBuffer(); 4719 len = normSource.length(); 4720 } 4721 } 4722 collIterate s; 4723 IInit_collIterate(coll, source, len, &s, status); 4724 if(U_FAILURE(*status)) { 4725 return 0; 4726 } 4727 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 4728 4729 if(resultLength == 0 || primaries == NULL) { 4730 return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); 4731 } 4732 uint8_t *primarySafeEnd = primaries + resultLength - 1; 4733 if(strength > UCOL_PRIMARY) { 4734 primarySafeEnd--; 4735 } 4736 4737 uint32_t minBufferSize = UCOL_MAX_BUFFER; 4738 4739 uint8_t *primStart = primaries; 4740 uint8_t *secStart = secondaries; 4741 uint8_t *terStart = tertiaries; 4742 uint8_t *caseStart = cases; 4743 uint8_t *quadStart = quads; 4744 4745 uint32_t order = 0; 4746 4747 uint8_t primary1 = 0; 4748 uint8_t primary2 = 0; 4749 uint8_t secondary = 0; 4750 uint8_t tertiary = 0; 4751 uint8_t caseSwitch = coll->caseSwitch; 4752 uint8_t tertiaryMask = coll->tertiaryMask; 4753 int8_t tertiaryAddition = coll->tertiaryAddition; 4754 uint8_t tertiaryTop = coll->tertiaryTop; 4755 uint8_t tertiaryBottom = coll->tertiaryBottom; 4756 uint8_t tertiaryCommon = coll->tertiaryCommon; 4757 uint8_t caseBits = 0; 4758 4759 UBool finished = FALSE; 4760 UBool wasShifted = FALSE; 4761 UBool notIsContinuation = FALSE; 4762 4763 
uint32_t prevBuffSize = 0; 4764 4765 uint32_t count2 = 0, count3 = 0, count4 = 0; 4766 uint8_t leadPrimary = 0; 4767 4768 for(;;) { 4769 for(i=prevBuffSize; i<minBufferSize; ++i) { 4770 4771 order = ucol_IGetNextCE(coll, &s, status); 4772 if(order == UCOL_NO_MORE_CES) { 4773 finished = TRUE; 4774 break; 4775 } 4776 4777 if(order == 0) { 4778 continue; 4779 } 4780 4781 notIsContinuation = !isContinuation(order); 4782 4783 if(notIsContinuation) { 4784 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); 4785 } else { 4786 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4787 } 4788 4789 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4790 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4791 primary1 = (uint8_t)(order >> 8); 4792 4793 uint8_t originalPrimary1 = primary1; 4794 if(notIsContinuation && coll->leadBytePermutationTable != NULL) { 4795 primary1 = coll->leadBytePermutationTable[primary1]; 4796 } 4797 4798 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4799 || (!notIsContinuation && wasShifted))) 4800 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 4801 { 4802 /* and other ignorables should be removed if following a shifted code point */ 4803 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4804 /* we should just completely ignore it */ 4805 continue; 4806 } 4807 if(compareQuad == 0) { 4808 if(count4 > 0) { 4809 while (count4 > UCOL_BOT_COUNT4) { 4810 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4811 count4 -= UCOL_BOT_COUNT4; 4812 } 4813 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 4814 count4 = 0; 4815 } 4816 /* We are dealing with a variable and we're treating them as shifted */ 4817 /* This is a shifted ignorable */ 4818 if(primary1 != 0) { /* we need to check this since we could be in continuation */ 4819 *quads++ = primary1; 4820 } 4821 if(primary2 != 0) { 4822 *quads++ = primary2; 4823 } 4824 } 
4825 wasShifted = TRUE; 4826 } else { 4827 wasShifted = FALSE; 4828 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 4829 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 4830 /* regular and simple sortkey calc */ 4831 if(primary1 != UCOL_IGNORABLE) { 4832 if(notIsContinuation) { 4833 if(leadPrimary == primary1) { 4834 *primaries++ = primary2; 4835 } else { 4836 if(leadPrimary != 0) { 4837 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 4838 } 4839 if(primary2 == UCOL_IGNORABLE) { 4840 /* one byter, not compressed */ 4841 *primaries++ = primary1; 4842 leadPrimary = 0; 4843 } else if(isCompressible(coll, originalPrimary1)) { 4844 /* compress */ 4845 *primaries++ = leadPrimary = primary1; 4846 if(primaries <= primarySafeEnd) { 4847 *primaries++ = primary2; 4848 } 4849 } else { 4850 leadPrimary = 0; 4851 *primaries++ = primary1; 4852 if(primaries <= primarySafeEnd) { 4853 *primaries++ = primary2; 4854 } 4855 } 4856 } 4857 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 4858 *primaries++ = primary1; 4859 if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) { 4860 *primaries++ = primary2; /* second part */ 4861 } 4862 } 4863 } 4864 4865 if(secondary > compareSec) { 4866 if(!isFrenchSec) { 4867 /* This is compression code. */ 4868 if (secondary == UCOL_COMMON2 && notIsContinuation) { 4869 ++count2; 4870 } else { 4871 if (count2 > 0) { 4872 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
4873 while (count2 > UCOL_TOP_COUNT2) { 4874 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 4875 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4876 } 4877 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); 4878 } else { 4879 while (count2 > UCOL_BOT_COUNT2) { 4880 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 4881 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4882 } 4883 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 4884 } 4885 count2 = 0; 4886 } 4887 *secondaries++ = secondary; 4888 } 4889 } else { 4890 *secondaries++ = secondary; 4891 /* Do the special handling for French secondaries */ 4892 /* We need to get continuation elements and do intermediate restore */ 4893 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ 4894 if(notIsContinuation) { 4895 if (frenchStartPtr != NULL) { 4896 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4897 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4898 frenchStartPtr = NULL; 4899 } 4900 } else { 4901 if (frenchStartPtr == NULL) { 4902 frenchStartPtr = secondaries - 2; 4903 } 4904 frenchEndPtr = secondaries-1; 4905 } 4906 } 4907 } 4908 4909 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 4910 // do the case level if we need to do it. 
We don't want to calculate 4911 // case level for primary ignorables if we have only primary strength and case level 4912 // otherwise we would break well formedness of CEs 4913 doCaseShift(&cases, caseShift); 4914 if(notIsContinuation) { 4915 caseBits = (uint8_t)(tertiary & 0xC0); 4916 4917 if(tertiary != 0) { 4918 if(coll->caseFirst == UCOL_UPPER_FIRST) { 4919 if((caseBits & 0xC0) == 0) { 4920 *(cases-1) |= 1 << (--caseShift); 4921 } else { 4922 *(cases-1) |= 0 << (--caseShift); 4923 /* second bit */ 4924 doCaseShift(&cases, caseShift); 4925 *(cases-1) |= ((caseBits>>6)&1) << (--caseShift); 4926 } 4927 } else { 4928 if((caseBits & 0xC0) == 0) { 4929 *(cases-1) |= 0 << (--caseShift); 4930 } else { 4931 *(cases-1) |= 1 << (--caseShift); 4932 /* second bit */ 4933 doCaseShift(&cases, caseShift); 4934 *(cases-1) |= ((caseBits>>7)&1) << (--caseShift); 4935 } 4936 } 4937 } 4938 4939 } 4940 } else { 4941 if(notIsContinuation) { 4942 tertiary ^= caseSwitch; 4943 } 4944 } 4945 4946 tertiary &= tertiaryMask; 4947 if(tertiary > compareTer) { 4948 /* This is compression code. 
*/ 4949 /* sequence size check is included in the if clause */ 4950 if (tertiary == tertiaryCommon && notIsContinuation) { 4951 ++count3; 4952 } else { 4953 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 4954 tertiary += tertiaryAddition; 4955 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 4956 tertiary -= tertiaryAddition; 4957 } 4958 if (count3 > 0) { 4959 if ((tertiary > tertiaryCommon)) { 4960 while (count3 > coll->tertiaryTopCount) { 4961 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 4962 count3 -= (uint32_t)coll->tertiaryTopCount; 4963 } 4964 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); 4965 } else { 4966 while (count3 > coll->tertiaryBottomCount) { 4967 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 4968 count3 -= (uint32_t)coll->tertiaryBottomCount; 4969 } 4970 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 4971 } 4972 count3 = 0; 4973 } 4974 *tertiaries++ = tertiary; 4975 } 4976 } 4977 4978 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 4979 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 4980 if(count4>0) { // Close this part 4981 while (count4 > UCOL_BOT_COUNT4) { 4982 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4983 count4 -= UCOL_BOT_COUNT4; 4984 } 4985 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 4986 count4 = 0; 4987 } 4988 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana 4989 } else { // This wasn't Hiragana, so we can continue adding stuff 4990 count4++; 4991 } 4992 } 4993 } 4994 4995 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ 4996 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ 4997 IInit_collIterate(coll, (UChar *)source, len, &s, status); 4998 if(U_FAILURE(*status)) { 4999 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5000 finished = TRUE; 5001 break; 5002 } 5003 s.flags &= 
~UCOL_ITER_NORM; 5004 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); 5005 *status = U_BUFFER_OVERFLOW_ERROR; 5006 finished = TRUE; 5007 break; 5008 } else { /* It's much nicer if we can actually reallocate */ 5009 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart)); 5010 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); 5011 if(U_SUCCESS(*status)) { 5012 *result = primStart; 5013 primarySafeEnd = primStart + resultLength - 1; 5014 if(strength > UCOL_PRIMARY) { 5015 primarySafeEnd--; 5016 } 5017 } else { 5018 /* We ran out of memory!? We can't recover. */ 5019 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5020 finished = TRUE; 5021 break; 5022 } 5023 } 5024 } 5025 } 5026 if(finished) { 5027 break; 5028 } else { 5029 prevBuffSize = minBufferSize; 5030 5031 uint32_t frenchStartOffset = 0, frenchEndOffset = 0; 5032 if (frenchStartPtr != NULL) { 5033 frenchStartOffset = (uint32_t)(frenchStartPtr - secStart); 5034 frenchEndOffset = (uint32_t)(frenchEndPtr - secStart); 5035 } 5036 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); 5037 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); 5038 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status); 5039 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status); 5040 if(U_FAILURE(*status)) { 5041 /* We ran out of memory!? We can't recover. 
*/ 5042 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5043 break; 5044 } 5045 if (frenchStartPtr != NULL) { 5046 frenchStartPtr = secStart + frenchStartOffset; 5047 frenchEndPtr = secStart + frenchEndOffset; 5048 } 5049 minBufferSize *= 2; 5050 } 5051 } 5052 5053 /* Here, we are generally done with processing */ 5054 /* bailing out would not be too productive */ 5055 5056 if(U_SUCCESS(*status)) { 5057 sortKeySize += (uint32_t)(primaries - primStart); 5058 /* we have done all the CE's, now let's put them together to form a key */ 5059 if(compareSec == 0) { 5060 if (count2 > 0) { 5061 while (count2 > UCOL_BOT_COUNT2) { 5062 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5063 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5064 } 5065 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5066 } 5067 uint32_t secsize = (uint32_t)(secondaries-secStart); 5068 if(!isFrenchSec) { // Regular situation, we know the length of secondaries 5069 sortKeySize += secsize; 5070 if(sortKeySize <= resultLength) { 5071 *(primaries++) = UCOL_LEVELTERMINATOR; 5072 uprv_memcpy(primaries, secStart, secsize); 5073 primaries += secsize; 5074 } else { 5075 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ 5076 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5077 if(U_SUCCESS(*status)) { 5078 *result = primStart; 5079 *(primaries++) = UCOL_LEVELTERMINATOR; 5080 uprv_memcpy(primaries, secStart, secsize); 5081 primaries += secsize; 5082 } 5083 else { 5084 /* We ran out of memory!? We can't recover. */ 5085 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5086 goto cleanup; 5087 } 5088 } else { 5089 *status = U_BUFFER_OVERFLOW_ERROR; 5090 } 5091 } 5092 } else { // French secondary is on. We will need to pack French. 
packFrench will add the level terminator 5093 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); 5094 sortKeySize += secsize; 5095 if(sortKeySize <= resultLength) { // if we managed to pack fine 5096 primaries = newPrim; // update the primary pointer 5097 } else { // overflow, need to reallocate and redo 5098 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ 5099 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5100 if(U_SUCCESS(*status)) { 5101 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); 5102 } 5103 else { 5104 /* We ran out of memory!? We can't recover. */ 5105 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5106 goto cleanup; 5107 } 5108 } else { 5109 *status = U_BUFFER_OVERFLOW_ERROR; 5110 } 5111 } 5112 } 5113 } 5114 5115 if(doCase) { 5116 uint32_t casesize = (uint32_t)(cases - caseStart); 5117 sortKeySize += casesize; 5118 if(sortKeySize <= resultLength) { 5119 *(primaries++) = UCOL_LEVELTERMINATOR; 5120 uprv_memcpy(primaries, caseStart, casesize); 5121 primaries += casesize; 5122 } else { 5123 if(allocateSKBuffer == TRUE) { 5124 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5125 if(U_SUCCESS(*status)) { 5126 *result = primStart; 5127 *(primaries++) = UCOL_LEVELTERMINATOR; 5128 uprv_memcpy(primaries, caseStart, casesize); 5129 } 5130 else { 5131 /* We ran out of memory!? We can't recover. 
*/ 5132 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5133 goto cleanup; 5134 } 5135 } else { 5136 *status = U_BUFFER_OVERFLOW_ERROR; 5137 } 5138 } 5139 } 5140 5141 if(compareTer == 0) { 5142 if (count3 > 0) { 5143 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { 5144 while (count3 >= coll->tertiaryTopCount) { 5145 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5146 count3 -= (uint32_t)coll->tertiaryTopCount; 5147 } 5148 *tertiaries++ = (uint8_t)(tertiaryTop - count3); 5149 } else { 5150 while (count3 > coll->tertiaryBottomCount) { 5151 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5152 count3 -= (uint32_t)coll->tertiaryBottomCount; 5153 } 5154 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5155 } 5156 } 5157 uint32_t tersize = (uint32_t)(tertiaries - terStart); 5158 sortKeySize += tersize; 5159 if(sortKeySize <= resultLength) { 5160 *(primaries++) = UCOL_LEVELTERMINATOR; 5161 uprv_memcpy(primaries, terStart, tersize); 5162 primaries += tersize; 5163 } else { 5164 if(allocateSKBuffer == TRUE) { 5165 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5166 if(U_SUCCESS(*status)) { 5167 *result = primStart; 5168 *(primaries++) = UCOL_LEVELTERMINATOR; 5169 uprv_memcpy(primaries, terStart, tersize); 5170 } 5171 else { 5172 /* We ran out of memory!? We can't recover. 
*/ 5173 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5174 goto cleanup; 5175 } 5176 } else { 5177 *status = U_BUFFER_OVERFLOW_ERROR; 5178 } 5179 } 5180 5181 if(compareQuad == 0/*qShifted == TRUE*/) { 5182 if(count4 > 0) { 5183 while (count4 > UCOL_BOT_COUNT4) { 5184 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 5185 count4 -= UCOL_BOT_COUNT4; 5186 } 5187 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 5188 } 5189 uint32_t quadsize = (uint32_t)(quads - quadStart); 5190 sortKeySize += quadsize; 5191 if(sortKeySize <= resultLength) { 5192 *(primaries++) = UCOL_LEVELTERMINATOR; 5193 uprv_memcpy(primaries, quadStart, quadsize); 5194 primaries += quadsize; 5195 } else { 5196 if(allocateSKBuffer == TRUE) { 5197 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5198 if(U_SUCCESS(*status)) { 5199 *result = primStart; 5200 *(primaries++) = UCOL_LEVELTERMINATOR; 5201 uprv_memcpy(primaries, quadStart, quadsize); 5202 } 5203 else { 5204 /* We ran out of memory!? We can't recover. */ 5205 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5206 goto cleanup; 5207 } 5208 } else { 5209 *status = U_BUFFER_OVERFLOW_ERROR; 5210 } 5211 } 5212 } 5213 5214 if(compareIdent) { 5215 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len); 5216 if(sortKeySize <= resultLength) { 5217 *(primaries++) = UCOL_LEVELTERMINATOR; 5218 primaries += u_writeIdenticalLevelRun(s.string, len, primaries); 5219 } else { 5220 if(allocateSKBuffer == TRUE) { 5221 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status); 5222 if(U_SUCCESS(*status)) { 5223 *result = primStart; 5224 *(primaries++) = UCOL_LEVELTERMINATOR; 5225 u_writeIdenticalLevelRun(s.string, len, primaries); 5226 } 5227 else { 5228 /* We ran out of memory!? We can't recover. 
*/ 5229 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5230 goto cleanup; 5231 } 5232 } else { 5233 *status = U_BUFFER_OVERFLOW_ERROR; 5234 } 5235 } 5236 } 5237 } 5238 *(primaries++) = '\0'; 5239 } 5240 5241 if(allocateSKBuffer == TRUE) { 5242 *result = (uint8_t*)uprv_malloc(sortKeySize); 5243 /* test for NULL */ 5244 if (*result == NULL) { 5245 *status = U_MEMORY_ALLOCATION_ERROR; 5246 goto cleanup; 5247 } 5248 uprv_memcpy(*result, primStart, sortKeySize); 5249 if(primStart != prim) { 5250 uprv_free(primStart); 5251 } 5252 } 5253 5254 cleanup: 5255 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { 5256 /* NULL terminate for safety */ 5257 **result = 0; 5258 } 5259 if(terStart != tert) { 5260 uprv_free(terStart); 5261 uprv_free(secStart); 5262 uprv_free(caseStart); 5263 uprv_free(quadStart); 5264 } 5265 5266 /* To avoid memory leak, free the offset buffer if necessary. */ 5267 ucol_freeOffsetBuffer(&s); 5268 5269 return sortKeySize; 5270 } 5271 5272 5273 U_CFUNC int32_t U_CALLCONV 5274 ucol_calcSortKeySimpleTertiary(const UCollator *coll, 5275 const UChar *source, 5276 int32_t sourceLength, 5277 uint8_t **result, 5278 uint32_t resultLength, 5279 UBool allocateSKBuffer, 5280 UErrorCode *status) 5281 { 5282 U_ALIGN_CODE(16); 5283 5284 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 5285 uint32_t i = 0; /* general purpose counter */ 5286 5287 /* Stack allocated buffers for buffers we use */ 5288 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER]; 5289 5290 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert; 5291 5292 if(U_FAILURE(*status)) { 5293 return 0; 5294 } 5295 5296 if(primaries == NULL && allocateSKBuffer == TRUE) { 5297 primaries = *result = prim; 5298 resultLength = UCOL_PRIMARY_MAX_BUFFER; 5299 } 5300 5301 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = 
UCOL_TERTIARY_MAX_BUFFER; 5302 5303 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */ 5304 5305 UnicodeString normSource; 5306 5307 int32_t len = sourceLength; 5308 5309 /* If we need to normalize, we'll do it all at once at the beginning! */ 5310 if(coll->normalizationMode != UCOL_OFF) { 5311 normSource.setTo(len < 0, source, len); 5312 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); 5313 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 5314 if(qcYesLength != normSource.length()) { 5315 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 5316 normSource.truncate(qcYesLength); 5317 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 5318 source = normSource.getBuffer(); 5319 len = normSource.length(); 5320 } 5321 } 5322 collIterate s; 5323 IInit_collIterate(coll, (UChar *)source, len, &s, status); 5324 if(U_FAILURE(*status)) { 5325 return 0; 5326 } 5327 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 
5328 5329 if(resultLength == 0 || primaries == NULL) { 5330 return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); 5331 } 5332 5333 uint8_t *primarySafeEnd = primaries + resultLength - 2; 5334 5335 uint32_t minBufferSize = UCOL_MAX_BUFFER; 5336 5337 uint8_t *primStart = primaries; 5338 uint8_t *secStart = secondaries; 5339 uint8_t *terStart = tertiaries; 5340 5341 uint32_t order = 0; 5342 5343 uint8_t primary1 = 0; 5344 uint8_t primary2 = 0; 5345 uint8_t secondary = 0; 5346 uint8_t tertiary = 0; 5347 uint8_t caseSwitch = coll->caseSwitch; 5348 uint8_t tertiaryMask = coll->tertiaryMask; 5349 int8_t tertiaryAddition = coll->tertiaryAddition; 5350 uint8_t tertiaryTop = coll->tertiaryTop; 5351 uint8_t tertiaryBottom = coll->tertiaryBottom; 5352 uint8_t tertiaryCommon = coll->tertiaryCommon; 5353 5354 uint32_t prevBuffSize = 0; 5355 5356 UBool finished = FALSE; 5357 UBool notIsContinuation = FALSE; 5358 5359 uint32_t count2 = 0, count3 = 0; 5360 uint8_t leadPrimary = 0; 5361 5362 for(;;) { 5363 for(i=prevBuffSize; i<minBufferSize; ++i) { 5364 5365 order = ucol_IGetNextCE(coll, &s, status); 5366 5367 if(order == 0) { 5368 continue; 5369 } 5370 5371 if(order == UCOL_NO_MORE_CES) { 5372 finished = TRUE; 5373 break; 5374 } 5375 5376 notIsContinuation = !isContinuation(order); 5377 5378 if(notIsContinuation) { 5379 tertiary = (uint8_t)((order & tertiaryMask)); 5380 } else { 5381 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 5382 } 5383 5384 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5385 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5386 primary1 = (uint8_t)(order >> 8); 5387 5388 uint8_t originalPrimary1 = primary1; 5389 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { 5390 primary1 = coll->leadBytePermutationTable[primary1]; 5391 } 5392 5393 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. 
*/ 5394 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 5395 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ 5396 /* regular and simple sortkey calc */ 5397 if(primary1 != UCOL_IGNORABLE) { 5398 if(notIsContinuation) { 5399 if(leadPrimary == primary1) { 5400 *primaries++ = primary2; 5401 } else { 5402 if(leadPrimary != 0) { 5403 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 5404 } 5405 if(primary2 == UCOL_IGNORABLE) { 5406 /* one byter, not compressed */ 5407 *primaries++ = primary1; 5408 leadPrimary = 0; 5409 } else if(isCompressible(coll, originalPrimary1)) { 5410 /* compress */ 5411 *primaries++ = leadPrimary = primary1; 5412 *primaries++ = primary2; 5413 } else { 5414 leadPrimary = 0; 5415 *primaries++ = primary1; 5416 *primaries++ = primary2; 5417 } 5418 } 5419 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 5420 *primaries++ = primary1; 5421 if(primary2 != UCOL_IGNORABLE) { 5422 *primaries++ = primary2; /* second part */ 5423 } 5424 } 5425 } 5426 5427 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ 5428 /* This is compression code. */ 5429 if (secondary == UCOL_COMMON2 && notIsContinuation) { 5430 ++count2; 5431 } else { 5432 if (count2 > 0) { 5433 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
5434 while (count2 > UCOL_TOP_COUNT2) { 5435 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 5436 count2 -= (uint32_t)UCOL_TOP_COUNT2; 5437 } 5438 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); 5439 } else { 5440 while (count2 > UCOL_BOT_COUNT2) { 5441 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5442 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5443 } 5444 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5445 } 5446 count2 = 0; 5447 } 5448 *secondaries++ = secondary; 5449 } 5450 } 5451 5452 if(notIsContinuation) { 5453 tertiary ^= caseSwitch; 5454 } 5455 5456 if(tertiary > 0) { 5457 /* This is compression code. */ 5458 /* sequence size check is included in the if clause */ 5459 if (tertiary == tertiaryCommon && notIsContinuation) { 5460 ++count3; 5461 } else { 5462 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 5463 tertiary += tertiaryAddition; 5464 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 5465 tertiary -= tertiaryAddition; 5466 } 5467 if (count3 > 0) { 5468 if ((tertiary > tertiaryCommon)) { 5469 while (count3 > coll->tertiaryTopCount) { 5470 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5471 count3 -= (uint32_t)coll->tertiaryTopCount; 5472 } 5473 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); 5474 } else { 5475 while (count3 > coll->tertiaryBottomCount) { 5476 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5477 count3 -= (uint32_t)coll->tertiaryBottomCount; 5478 } 5479 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5480 } 5481 count3 = 0; 5482 } 5483 *tertiaries++ = tertiary; 5484 } 5485 } 5486 5487 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ 5488 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ 5489 IInit_collIterate(coll, (UChar *)source, len, &s, status); 5490 if(U_FAILURE(*status)) { 5491 sortKeySize = 
DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5492 finished = TRUE; 5493 break; 5494 } 5495 s.flags &= ~UCOL_ITER_NORM; 5496 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); 5497 *status = U_BUFFER_OVERFLOW_ERROR; 5498 finished = TRUE; 5499 break; 5500 } else { /* It's much nicer if we can actually reallocate */ 5501 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)); 5502 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); 5503 if(U_SUCCESS(*status)) { 5504 *result = primStart; 5505 primarySafeEnd = primStart + resultLength - 2; 5506 } else { 5507 /* We ran out of memory!? We can't recover. */ 5508 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5509 finished = TRUE; 5510 break; 5511 } 5512 } 5513 } 5514 } 5515 if(finished) { 5516 break; 5517 } else { 5518 prevBuffSize = minBufferSize; 5519 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); 5520 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); 5521 minBufferSize *= 2; 5522 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size 5523 /* We ran out of memory!? We can't recover. 
*/ 5524 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5525 break; 5526 } 5527 } 5528 } 5529 5530 if(U_SUCCESS(*status)) { 5531 sortKeySize += (uint32_t)(primaries - primStart); 5532 /* we have done all the CE's, now let's put them together to form a key */ 5533 if (count2 > 0) { 5534 while (count2 > UCOL_BOT_COUNT2) { 5535 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5536 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5537 } 5538 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5539 } 5540 uint32_t secsize = (uint32_t)(secondaries-secStart); 5541 sortKeySize += secsize; 5542 if(sortKeySize <= resultLength) { 5543 *(primaries++) = UCOL_LEVELTERMINATOR; 5544 uprv_memcpy(primaries, secStart, secsize); 5545 primaries += secsize; 5546 } else { 5547 if(allocateSKBuffer == TRUE) { 5548 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5549 if(U_SUCCESS(*status)) { 5550 *(primaries++) = UCOL_LEVELTERMINATOR; 5551 *result = primStart; 5552 uprv_memcpy(primaries, secStart, secsize); 5553 } 5554 else { 5555 /* We ran out of memory!? We can't recover. 
*/ 5556 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5557 goto cleanup; 5558 } 5559 } else { 5560 *status = U_BUFFER_OVERFLOW_ERROR; 5561 } 5562 } 5563 5564 if (count3 > 0) { 5565 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { 5566 while (count3 >= coll->tertiaryTopCount) { 5567 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5568 count3 -= (uint32_t)coll->tertiaryTopCount; 5569 } 5570 *tertiaries++ = (uint8_t)(tertiaryTop - count3); 5571 } else { 5572 while (count3 > coll->tertiaryBottomCount) { 5573 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5574 count3 -= (uint32_t)coll->tertiaryBottomCount; 5575 } 5576 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5577 } 5578 } 5579 uint32_t tersize = (uint32_t)(tertiaries - terStart); 5580 sortKeySize += tersize; 5581 if(sortKeySize <= resultLength) { 5582 *(primaries++) = UCOL_LEVELTERMINATOR; 5583 uprv_memcpy(primaries, terStart, tersize); 5584 primaries += tersize; 5585 } else { 5586 if(allocateSKBuffer == TRUE) { 5587 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5588 if(U_SUCCESS(*status)) { 5589 *result = primStart; 5590 *(primaries++) = UCOL_LEVELTERMINATOR; 5591 uprv_memcpy(primaries, terStart, tersize); 5592 } 5593 else { 5594 /* We ran out of memory!? We can't recover. 
*/ 5595 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5596 goto cleanup; 5597 } 5598 } else { 5599 *status = U_BUFFER_OVERFLOW_ERROR; 5600 } 5601 } 5602 5603 *(primaries++) = '\0'; 5604 } 5605 5606 if(allocateSKBuffer == TRUE) { 5607 *result = (uint8_t*)uprv_malloc(sortKeySize); 5608 /* test for NULL */ 5609 if (*result == NULL) { 5610 *status = U_MEMORY_ALLOCATION_ERROR; 5611 goto cleanup; 5612 } 5613 uprv_memcpy(*result, primStart, sortKeySize); 5614 if(primStart != prim) { 5615 uprv_free(primStart); 5616 } 5617 } 5618 5619 cleanup: 5620 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { 5621 /* NULL terminate for safety */ 5622 **result = 0; 5623 } 5624 if(terStart != tert) { 5625 uprv_free(terStart); 5626 uprv_free(secStart); 5627 } 5628 5629 /* To avoid memory leak, free the offset buffer if necessary. */ 5630 ucol_freeOffsetBuffer(&s); 5631 5632 return sortKeySize; 5633 } 5634 5635 static inline 5636 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { 5637 UBool notIsContinuation = !isContinuation(CE); 5638 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); 5639 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) 5640 || (!notIsContinuation && *wasShifted))) 5641 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 5642 { 5643 // The stuff below should probably be in the sortkey code... maybe not... 
5644 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ 5645 /* we should just completely ignore it */ 5646 *wasShifted = TRUE; 5647 //continue; 5648 } 5649 //*wasShifted = TRUE; 5650 return TRUE; 5651 } else { 5652 *wasShifted = FALSE; 5653 return FALSE; 5654 } 5655 } 5656 static inline 5657 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { 5658 if(level < maxLevel) { 5659 dest[i++] = UCOL_LEVELTERMINATOR; 5660 } else { 5661 dest[i++] = 0; 5662 } 5663 } 5664 5665 /** enumeration of level identifiers for partial sort key generation */ 5666 enum { 5667 UCOL_PSK_PRIMARY = 0, 5668 UCOL_PSK_SECONDARY = 1, 5669 UCOL_PSK_CASE = 2, 5670 UCOL_PSK_TERTIARY = 3, 5671 UCOL_PSK_QUATERNARY = 4, 5672 UCOL_PSK_QUIN = 5, /** This is an extra level, not used - but we have three bits to blow */ 5673 UCOL_PSK_IDENTICAL = 6, 5674 UCOL_PSK_NULL = 7, /** level for the end of sort key. Will just produce zeros */ 5675 UCOL_PSK_LIMIT 5676 }; 5677 5678 /** collation state enum. *_SHIFT value is how much to shift right 5679 * to get the state piece to the right. *_MASK value should be 5680 * ANDed with the shifted state. This data is stored in state[1] 5681 * field. 5682 */ 5683 enum { 5684 UCOL_PSK_LEVEL_SHIFT = 0, /** level identificator. 
/**
 * Layout of the collation processing state kept in state[1] during partial
 * sort key generation. Each *_SHIFT value is how far to shift the state word
 * right to bring a field to the low bits; the matching *_MASK value is then
 * ANDed with the shifted word.
 */
enum {
    /** Level identifier; stores one of the UCOL_PSK_* level values. */
    UCOL_PSK_LEVEL_SHIFT = 0,
    /** Three bits. */
    UCOL_PSK_LEVEL_MASK = 7,
    /** Number of bytes of a primary or quaternary already written. */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    /**
     * Can be only 0 or 1, since we get up to two bytes from primary or quaternary.
     * This field is also used to denote that the French secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,
    /** Was the last value shifted? */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    /** Can be 0 or 1 (Boolean). */
    UCOL_PSK_WAS_SHIFTED_MASK = 1,
    /** How many French bytes have we already written. */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    /**
     * Two-bit field. When we do French we need to reverse secondary values.
     * However, continuations need to stay the same. So if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_MASK = 3,
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    /** Two-bit field. */
    UCOL_PSK_BOCSU_BYTES_MASK = 3,
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    /** 19-bit field. */
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};

// Macro calculating the number of expansion CEs available, i.e. the distance
// between the CEpos and toReturn members of the iterator passed in.
// The replacement list is parenthesized so the result is a single operand
// wherever the macro is expanded (e.g. 2 * uprv_numAvailableExpCEs(s)).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5720 * Any other status means that you're not in a sane situation 5721 * anymore. After the first call, preserve state values and 5722 * use them on subsequent calls to obtain more bytes of a sortkey. 5723 * Use until the number of bytes written is smaller than the requested 5724 * number of bytes. Generated sortkey is not compatible with the 5725 * one generated by ucol_getSortKey, as we don't do any compression. 5726 * However, levels are still terminated by a 1 (one) and the sortkey 5727 * is terminated by a 0 (zero). Identical level is the same as in the 5728 * regular sortkey - internal bocu-1 implementation is used. 5729 * For curious, although you cannot do much about this, here is 5730 * the structure of state words. 5731 * state[0] - iterator state. Depends on the iterator implementation, 5732 * but allows the iterator to continue where it stopped in 5733 * the last iteration. 5734 * state[1] - collation processing state. Here is the distribution 5735 * of the bits: 5736 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary 5737 * quaternary, quin (we don't use this one), identical and 5738 * null (producing only zeroes - first one to terminate the 5739 * sortkey and subsequent to fill the buffer). 5740 * 3 - byte count. Number of bytes written on the primary level. 5741 * 4 - was shifted. Whether the previous iteration finished in the 5742 * shifted state. 5743 * 5, 6 - French continuation bytes written. See the comment in the enum 5744 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on 5745 * the identical level. 5746 * 9..31 - CEs consumed. Number of getCE or next32 operations performed 5747 * since thes last successful update of the iterator state. 
5748 */ 5749 U_CAPI int32_t U_EXPORT2 5750 ucol_nextSortKeyPart(const UCollator *coll, 5751 UCharIterator *iter, 5752 uint32_t state[2], 5753 uint8_t *dest, int32_t count, 5754 UErrorCode *status) 5755 { 5756 /* error checking */ 5757 if(status==NULL || U_FAILURE(*status)) { 5758 return 0; 5759 } 5760 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 5761 if( coll==NULL || iter==NULL || 5762 state==NULL || 5763 count<0 || (count>0 && dest==NULL) 5764 ) { 5765 *status=U_ILLEGAL_ARGUMENT_ERROR; 5766 UTRACE_EXIT_STATUS(status); 5767 return 0; 5768 } 5769 5770 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 5771 coll, iter, state[0], state[1], dest, count); 5772 5773 if(count==0) { 5774 /* nothing to do */ 5775 UTRACE_EXIT_VALUE(0); 5776 return 0; 5777 } 5778 /** Setting up situation according to the state we got from the previous iteration */ 5779 // The state of the iterator from the previous invocation 5780 uint32_t iterState = state[0]; 5781 // Has the last iteration ended in the shifted state 5782 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE; 5783 // What is the current level of the sortkey? 5784 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; 5785 // Have we written only one byte from a two byte primary in the previous iteration? 5786 // Also on secondary level - have we finished with the French secondary? 5787 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; 5788 // number of bytes in the continuation buffer for French 5789 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; 5790 // Number of bytes already written from a bocsu sequence. Since 5791 // the longes bocsu sequence is 4 long, this can be up to 3. 
5792 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK; 5793 // Number of elements that need to be consumed in this iteration because 5794 // the iterator returned UITER_NO_STATE at the end of the last iteration, 5795 // so we had to save the last valid state. 5796 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK; 5797 5798 /** values that depend on the collator attributes */ 5799 // strength of the collator. 5800 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); 5801 // maximal level of the partial sortkey. Need to take whether case level is done 5802 int32_t maxLevel = 0; 5803 if(strength < UCOL_TERTIARY) { 5804 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 5805 maxLevel = UCOL_PSK_CASE; 5806 } else { 5807 maxLevel = strength; 5808 } 5809 } else { 5810 if(strength == UCOL_TERTIARY) { 5811 maxLevel = UCOL_PSK_TERTIARY; 5812 } else if(strength == UCOL_QUATERNARY) { 5813 maxLevel = UCOL_PSK_QUATERNARY; 5814 } else { // identical 5815 maxLevel = UCOL_IDENTICAL; 5816 } 5817 } 5818 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation 5819 uint8_t UCOL_HIRAGANA_QUAD = 5820 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF; 5821 // Boundary value that decides whether a CE is shifted or not 5822 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0; 5823 // Are we doing French collation? 
5824 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON); 5825 5826 /** initializing the collation state */ 5827 UBool notIsContinuation = FALSE; 5828 uint32_t CE = UCOL_NO_MORE_CES; 5829 5830 collIterate s; 5831 IInit_collIterate(coll, NULL, -1, &s, status); 5832 if(U_FAILURE(*status)) { 5833 UTRACE_EXIT_STATUS(*status); 5834 return 0; 5835 } 5836 s.iterator = iter; 5837 s.flags |= UCOL_USE_ITERATOR; 5838 // This variable tells us whether we have produced some other levels in this iteration 5839 // before we moved to the identical level. In that case, we need to switch the 5840 // type of the iterator. 5841 UBool doingIdenticalFromStart = FALSE; 5842 // Normalizing iterator 5843 // The division for the array length may truncate the array size to 5844 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 5845 // for all platforms anyway. 5846 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 5847 UNormIterator *normIter = NULL; 5848 // If the normalization is turned on for the collator and we are below identical level 5849 // we will use a FCD normalizing iterator 5850 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { 5851 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5852 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); 5853 s.flags &= ~UCOL_ITER_NORM; 5854 if(U_FAILURE(*status)) { 5855 UTRACE_EXIT_STATUS(*status); 5856 return 0; 5857 } 5858 } else if(level == UCOL_PSK_IDENTICAL) { 5859 // for identical level, we need a NFD iterator. We need to instantiate it here, since we 5860 // will be updating the state - and this cannot be done on an ordinary iterator. 
5861 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5862 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5863 s.flags &= ~UCOL_ITER_NORM; 5864 if(U_FAILURE(*status)) { 5865 UTRACE_EXIT_STATUS(*status); 5866 return 0; 5867 } 5868 doingIdenticalFromStart = TRUE; 5869 } 5870 5871 // This is the tentative new state of the iterator. The problem 5872 // is that the iterator might return an undefined state, in 5873 // which case we should save the last valid state and increase 5874 // the iterator skip value. 5875 uint32_t newState = 0; 5876 5877 // First, we set the iterator to the last valid position 5878 // from the last iteration. This was saved in state[0]. 5879 if(iterState == 0) { 5880 /* initial state */ 5881 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { 5882 s.iterator->move(s.iterator, 0, UITER_LIMIT); 5883 } else { 5884 s.iterator->move(s.iterator, 0, UITER_START); 5885 } 5886 } else { 5887 /* reset to previous state */ 5888 s.iterator->setState(s.iterator, iterState, status); 5889 if(U_FAILURE(*status)) { 5890 UTRACE_EXIT_STATUS(*status); 5891 return 0; 5892 } 5893 } 5894 5895 5896 5897 // This variable tells us whether we can attempt to update the state 5898 // of iterator. Situations where we don't want to update iterator state 5899 // are the existence of expansion CEs that are not yet processed, and 5900 // finishing the case level without enough space in the buffer to insert 5901 // a level terminator. 5902 UBool canUpdateState = TRUE; 5903 5904 // Consume all the CEs that were consumed at the end of the previous 5905 // iteration without updating the iterator state. On identical level, 5906 // consume the code points. 5907 int32_t counter = cces; 5908 if(level < UCOL_PSK_IDENTICAL) { 5909 while(counter-->0) { 5910 // If we're doing French and we are on the secondary level, 5911 // we go backwards. 
5912 if(level == UCOL_PSK_SECONDARY && doingFrench) { 5913 CE = ucol_IGetPrevCE(coll, &s, status); 5914 } else { 5915 CE = ucol_IGetNextCE(coll, &s, status); 5916 } 5917 if(CE==UCOL_NO_MORE_CES) { 5918 /* should not happen */ 5919 *status=U_INTERNAL_PROGRAM_ERROR; 5920 UTRACE_EXIT_STATUS(*status); 5921 return 0; 5922 } 5923 if(uprv_numAvailableExpCEs(s)) { 5924 canUpdateState = FALSE; 5925 } 5926 } 5927 } else { 5928 while(counter-->0) { 5929 uiter_next32(s.iterator); 5930 } 5931 } 5932 5933 // French secondary needs to know whether the iterator state of zero came from previous level OR 5934 // from a new invocation... 5935 UBool wasDoingPrimary = FALSE; 5936 // destination buffer byte counter. When this guy 5937 // gets to count, we're done with the iteration 5938 int32_t i = 0; 5939 // used to count the zero bytes written after we 5940 // have finished with the sort key 5941 int32_t j = 0; 5942 5943 5944 // Hm.... I think we're ready to plunge in. Basic story is as following: 5945 // we have a fall through case based on level. This is used for initial 5946 // positioning on iteration start. Every level processor contains a 5947 // for(;;) which will be broken when we exhaust all the CEs. Other 5948 // way to exit is a goto saveState, which happens when we have filled 5949 // out our buffer. 
5950 switch(level) { 5951 case UCOL_PSK_PRIMARY: 5952 wasDoingPrimary = TRUE; 5953 for(;;) { 5954 if(i==count) { 5955 goto saveState; 5956 } 5957 // We should save the state only if we 5958 // are sure that we are done with the 5959 // previous iterator state 5960 if(canUpdateState && byteCountOrFrenchDone == 0) { 5961 newState = s.iterator->getState(s.iterator); 5962 if(newState != UITER_NO_STATE) { 5963 iterState = newState; 5964 cces = 0; 5965 } 5966 } 5967 CE = ucol_IGetNextCE(coll, &s, status); 5968 cces++; 5969 if(CE==UCOL_NO_MORE_CES) { 5970 // Add the level separator 5971 terminatePSKLevel(level, maxLevel, i, dest); 5972 byteCountOrFrenchDone=0; 5973 // Restart the iteration an move to the 5974 // second level 5975 s.iterator->move(s.iterator, 0, UITER_START); 5976 cces = 0; 5977 level = UCOL_PSK_SECONDARY; 5978 break; 5979 } 5980 if(!isContinuation(CE)){ 5981 if(coll->leadBytePermutationTable != NULL){ 5982 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF); 5983 } 5984 } 5985 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5986 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ 5987 if(CE != 0) { 5988 if(byteCountOrFrenchDone == 0) { 5989 // get the second byte of primary 5990 dest[i++]=(uint8_t)(CE >> 8); 5991 } else { 5992 byteCountOrFrenchDone = 0; 5993 } 5994 if((CE &=0xff)!=0) { 5995 if(i==count) { 5996 /* overflow */ 5997 byteCountOrFrenchDone = 1; 5998 cces--; 5999 goto saveState; 6000 } 6001 dest[i++]=(uint8_t)CE; 6002 } 6003 } 6004 } 6005 if(uprv_numAvailableExpCEs(s)) { 6006 canUpdateState = FALSE; 6007 } else { 6008 canUpdateState = TRUE; 6009 } 6010 } 6011 /* fall through to next level */ 6012 case UCOL_PSK_SECONDARY: 6013 if(strength >= UCOL_SECONDARY) { 6014 if(!doingFrench) { 6015 for(;;) { 6016 if(i == count) { 6017 goto saveState; 6018 } 6019 // We should save the state only if we 6020 // are sure that we are done with the 6021 // previous iterator state 6022 if(canUpdateState) { 6023 newState = 
s.iterator->getState(s.iterator); 6024 if(newState != UITER_NO_STATE) { 6025 iterState = newState; 6026 cces = 0; 6027 } 6028 } 6029 CE = ucol_IGetNextCE(coll, &s, status); 6030 cces++; 6031 if(CE==UCOL_NO_MORE_CES) { 6032 // Add the level separator 6033 terminatePSKLevel(level, maxLevel, i, dest); 6034 byteCountOrFrenchDone = 0; 6035 // Restart the iteration an move to the 6036 // second level 6037 s.iterator->move(s.iterator, 0, UITER_START); 6038 cces = 0; 6039 level = UCOL_PSK_CASE; 6040 break; 6041 } 6042 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6043 CE >>= 8; /* get secondary */ 6044 if(CE != 0) { 6045 dest[i++]=(uint8_t)CE; 6046 } 6047 } 6048 if(uprv_numAvailableExpCEs(s)) { 6049 canUpdateState = FALSE; 6050 } else { 6051 canUpdateState = TRUE; 6052 } 6053 } 6054 } else { // French secondary processing 6055 uint8_t frenchBuff[UCOL_MAX_BUFFER]; 6056 int32_t frenchIndex = 0; 6057 // Here we are going backwards. 6058 // If the iterator is at the beggining, it should be 6059 // moved to end. 6060 if(wasDoingPrimary) { 6061 s.iterator->move(s.iterator, 0, UITER_LIMIT); 6062 cces = 0; 6063 } 6064 for(;;) { 6065 if(i == count) { 6066 goto saveState; 6067 } 6068 if(canUpdateState) { 6069 newState = s.iterator->getState(s.iterator); 6070 if(newState != UITER_NO_STATE) { 6071 iterState = newState; 6072 cces = 0; 6073 } 6074 } 6075 CE = ucol_IGetPrevCE(coll, &s, status); 6076 cces++; 6077 if(CE==UCOL_NO_MORE_CES) { 6078 // Add the level separator 6079 terminatePSKLevel(level, maxLevel, i, dest); 6080 byteCountOrFrenchDone = 0; 6081 // Restart the iteration an move to the next level 6082 s.iterator->move(s.iterator, 0, UITER_START); 6083 level = UCOL_PSK_CASE; 6084 break; 6085 } 6086 if(isContinuation(CE)) { // if it's a continuation, we want to save it and 6087 // reverse when we get a first non-continuation CE. 
6088 CE >>= 8; 6089 frenchBuff[frenchIndex++] = (uint8_t)CE; 6090 } else if(!isShiftedCE(CE, LVT, &wasShifted)) { 6091 CE >>= 8; /* get secondary */ 6092 if(!frenchIndex) { 6093 if(CE != 0) { 6094 dest[i++]=(uint8_t)CE; 6095 } 6096 } else { 6097 frenchBuff[frenchIndex++] = (uint8_t)CE; 6098 frenchIndex -= usedFrench; 6099 usedFrench = 0; 6100 while(i < count && frenchIndex) { 6101 dest[i++] = frenchBuff[--frenchIndex]; 6102 usedFrench++; 6103 } 6104 } 6105 } 6106 if(uprv_numAvailableExpCEs(s)) { 6107 canUpdateState = FALSE; 6108 } else { 6109 canUpdateState = TRUE; 6110 } 6111 } 6112 } 6113 } else { 6114 level = UCOL_PSK_CASE; 6115 } 6116 /* fall through to next level */ 6117 case UCOL_PSK_CASE: 6118 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 6119 uint32_t caseShift = UCOL_CASE_SHIFT_START; 6120 uint8_t caseByte = UCOL_CASE_BYTE_START; 6121 uint8_t caseBits = 0; 6122 6123 for(;;) { 6124 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); 6125 if(i == count) { 6126 goto saveState; 6127 } 6128 // We should save the state only if we 6129 // are sure that we are done with the 6130 // previous iterator state 6131 if(canUpdateState) { 6132 newState = s.iterator->getState(s.iterator); 6133 if(newState != UITER_NO_STATE) { 6134 iterState = newState; 6135 cces = 0; 6136 } 6137 } 6138 CE = ucol_IGetNextCE(coll, &s, status); 6139 cces++; 6140 if(CE==UCOL_NO_MORE_CES) { 6141 // On the case level we might have an unfinished 6142 // case byte. Add one if it's started. 6143 if(caseShift != UCOL_CASE_SHIFT_START) { 6144 dest[i++] = caseByte; 6145 } 6146 cces = 0; 6147 // We have finished processing CEs on this level. 6148 // However, we don't know if we have enough space 6149 // to add a case level terminator. 
6150 if(i < count) { 6151 // Add the level separator 6152 terminatePSKLevel(level, maxLevel, i, dest); 6153 // Restart the iteration and move to the 6154 // next level 6155 s.iterator->move(s.iterator, 0, UITER_START); 6156 level = UCOL_PSK_TERTIARY; 6157 } else { 6158 canUpdateState = FALSE; 6159 } 6160 break; 6161 } 6162 6163 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6164 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) { 6165 // do the case level if we need to do it. We don't want to calculate 6166 // case level for primary ignorables if we have only primary strength and case level 6167 // otherwise we would break well formedness of CEs 6168 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 6169 caseBits = (uint8_t)(CE & 0xC0); 6170 // this copies the case level logic from the 6171 // sort key generation code 6172 if(CE != 0) { 6173 if (caseShift == 0) { 6174 dest[i++] = caseByte; 6175 caseShift = UCOL_CASE_SHIFT_START; 6176 caseByte = UCOL_CASE_BYTE_START; 6177 } 6178 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6179 if((caseBits & 0xC0) == 0) { 6180 caseByte |= 1 << (--caseShift); 6181 } else { 6182 caseByte |= 0 << (--caseShift); 6183 /* second bit */ 6184 if(caseShift == 0) { 6185 dest[i++] = caseByte; 6186 caseShift = UCOL_CASE_SHIFT_START; 6187 caseByte = UCOL_CASE_BYTE_START; 6188 } 6189 caseByte |= ((caseBits>>6)&1) << (--caseShift); 6190 } 6191 } else { 6192 if((caseBits & 0xC0) == 0) { 6193 caseByte |= 0 << (--caseShift); 6194 } else { 6195 caseByte |= 1 << (--caseShift); 6196 /* second bit */ 6197 if(caseShift == 0) { 6198 dest[i++] = caseByte; 6199 caseShift = UCOL_CASE_SHIFT_START; 6200 caseByte = UCOL_CASE_BYTE_START; 6201 } 6202 caseByte |= ((caseBits>>7)&1) << (--caseShift); 6203 } 6204 } 6205 } 6206 6207 } 6208 } 6209 // Not sure this is correct for the case level - revisit 6210 if(uprv_numAvailableExpCEs(s)) { 6211 canUpdateState = FALSE; 6212 } else { 6213 canUpdateState = TRUE; 6214 } 6215 } 6216 } else { 6217 level = 
UCOL_PSK_TERTIARY; 6218 } 6219 /* fall through to next level */ 6220 case UCOL_PSK_TERTIARY: 6221 if(strength >= UCOL_TERTIARY) { 6222 for(;;) { 6223 if(i == count) { 6224 goto saveState; 6225 } 6226 // We should save the state only if we 6227 // are sure that we are done with the 6228 // previous iterator state 6229 if(canUpdateState) { 6230 newState = s.iterator->getState(s.iterator); 6231 if(newState != UITER_NO_STATE) { 6232 iterState = newState; 6233 cces = 0; 6234 } 6235 } 6236 CE = ucol_IGetNextCE(coll, &s, status); 6237 cces++; 6238 if(CE==UCOL_NO_MORE_CES) { 6239 // Add the level separator 6240 terminatePSKLevel(level, maxLevel, i, dest); 6241 byteCountOrFrenchDone = 0; 6242 // Restart the iteration an move to the 6243 // second level 6244 s.iterator->move(s.iterator, 0, UITER_START); 6245 cces = 0; 6246 level = UCOL_PSK_QUATERNARY; 6247 break; 6248 } 6249 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6250 notIsContinuation = !isContinuation(CE); 6251 6252 if(notIsContinuation) { 6253 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 6254 CE ^= coll->caseSwitch; 6255 CE &= coll->tertiaryMask; 6256 } else { 6257 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6258 } 6259 6260 if(CE != 0) { 6261 dest[i++]=(uint8_t)CE; 6262 } 6263 } 6264 if(uprv_numAvailableExpCEs(s)) { 6265 canUpdateState = FALSE; 6266 } else { 6267 canUpdateState = TRUE; 6268 } 6269 } 6270 } else { 6271 // if we're not doing tertiary 6272 // skip to the end 6273 level = UCOL_PSK_NULL; 6274 } 6275 /* fall through to next level */ 6276 case UCOL_PSK_QUATERNARY: 6277 if(strength >= UCOL_QUATERNARY) { 6278 for(;;) { 6279 if(i == count) { 6280 goto saveState; 6281 } 6282 // We should save the state only if we 6283 // are sure that we are done with the 6284 // previous iterator state 6285 if(canUpdateState) { 6286 newState = s.iterator->getState(s.iterator); 6287 if(newState != UITER_NO_STATE) { 6288 iterState = newState; 6289 cces = 0; 6290 } 6291 } 6292 CE = ucol_IGetNextCE(coll, &s, status); 6293 cces++; 
6294 if(CE==UCOL_NO_MORE_CES) { 6295 // Add the level separator 6296 terminatePSKLevel(level, maxLevel, i, dest); 6297 //dest[i++] = UCOL_LEVELTERMINATOR; 6298 byteCountOrFrenchDone = 0; 6299 // Restart the iteration an move to the 6300 // second level 6301 s.iterator->move(s.iterator, 0, UITER_START); 6302 cces = 0; 6303 level = UCOL_PSK_QUIN; 6304 break; 6305 } 6306 if(CE==0) 6307 continue; 6308 if(isShiftedCE(CE, LVT, &wasShifted)) { 6309 CE >>= 16; /* get primary */ 6310 if(CE != 0) { 6311 if(byteCountOrFrenchDone == 0) { 6312 dest[i++]=(uint8_t)(CE >> 8); 6313 } else { 6314 byteCountOrFrenchDone = 0; 6315 } 6316 if((CE &=0xff)!=0) { 6317 if(i==count) { 6318 /* overflow */ 6319 byteCountOrFrenchDone = 1; 6320 goto saveState; 6321 } 6322 dest[i++]=(uint8_t)CE; 6323 } 6324 } 6325 } else { 6326 notIsContinuation = !isContinuation(CE); 6327 if(notIsContinuation) { 6328 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 6329 dest[i++] = UCOL_HIRAGANA_QUAD; 6330 } else { 6331 dest[i++] = 0xFF; 6332 } 6333 } 6334 } 6335 if(uprv_numAvailableExpCEs(s)) { 6336 canUpdateState = FALSE; 6337 } else { 6338 canUpdateState = TRUE; 6339 } 6340 } 6341 } else { 6342 // if we're not doing quaternary 6343 // skip to the end 6344 level = UCOL_PSK_NULL; 6345 } 6346 /* fall through to next level */ 6347 case UCOL_PSK_QUIN: 6348 level = UCOL_PSK_IDENTICAL; 6349 /* fall through to next level */ 6350 case UCOL_PSK_IDENTICAL: 6351 if(strength >= UCOL_IDENTICAL) { 6352 UChar32 first, second; 6353 int32_t bocsuBytesWritten = 0; 6354 // We always need to do identical on 6355 // the NFD form of the string. 6356 if(normIter == NULL) { 6357 // we arrived from the level below and 6358 // normalization was not turned on. 
6359 // therefore, we need to make a fresh NFD iterator 6360 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 6361 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 6362 } else if(!doingIdenticalFromStart) { 6363 // there is an iterator, but we did some other levels. 6364 // therefore, we have a FCD iterator - need to make 6365 // a NFD one. 6366 // normIter being at the beginning does not guarantee 6367 // that the underlying iterator is at the beginning 6368 iter->move(iter, 0, UITER_START); 6369 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 6370 } 6371 // At this point we have a NFD iterator that is positioned 6372 // in the right place 6373 if(U_FAILURE(*status)) { 6374 UTRACE_EXIT_STATUS(*status); 6375 return 0; 6376 } 6377 first = uiter_previous32(s.iterator); 6378 // maybe we're at the start of the string 6379 if(first == U_SENTINEL) { 6380 first = 0; 6381 } else { 6382 uiter_next32(s.iterator); 6383 } 6384 6385 j = 0; 6386 for(;;) { 6387 if(i == count) { 6388 if(j+1 < bocsuBytesWritten) { 6389 bocsuBytesUsed = j+1; 6390 } 6391 goto saveState; 6392 } 6393 6394 // On identical level, we will always save 6395 // the state if we reach this point, since 6396 // we don't depend on getNextCE for content 6397 // all the content is in our buffer and we 6398 // already either stored the full buffer OR 6399 // otherwise we won't arrive here. 
6400 newState = s.iterator->getState(s.iterator); 6401 if(newState != UITER_NO_STATE) { 6402 iterState = newState; 6403 cces = 0; 6404 } 6405 6406 uint8_t buff[4]; 6407 second = uiter_next32(s.iterator); 6408 cces++; 6409 6410 // end condition for identical level 6411 if(second == U_SENTINEL) { 6412 terminatePSKLevel(level, maxLevel, i, dest); 6413 level = UCOL_PSK_NULL; 6414 break; 6415 } 6416 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); 6417 first = second; 6418 6419 j = 0; 6420 if(bocsuBytesUsed != 0) { 6421 while(bocsuBytesUsed-->0) { 6422 j++; 6423 } 6424 } 6425 6426 while(i < count && j < bocsuBytesWritten) { 6427 dest[i++] = buff[j++]; 6428 } 6429 } 6430 6431 } else { 6432 level = UCOL_PSK_NULL; 6433 } 6434 /* fall through to next level */ 6435 case UCOL_PSK_NULL: 6436 j = i; 6437 while(j<count) { 6438 dest[j++]=0; 6439 } 6440 break; 6441 default: 6442 *status = U_INTERNAL_PROGRAM_ERROR; 6443 UTRACE_EXIT_STATUS(*status); 6444 return 0; 6445 } 6446 6447 saveState: 6448 // Now we need to return stuff. First we want to see whether we have 6449 // done everything for the current state of iterator. 6450 if(byteCountOrFrenchDone 6451 || canUpdateState == FALSE 6452 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) 6453 { 6454 // Any of above mean that the previous transaction 6455 // wasn't finished and that we should store the 6456 // previous iterator state. 6457 state[0] = iterState; 6458 } else { 6459 // The transaction is complete. We will continue in the next iteration. 6460 state[0] = s.iterator->getState(s.iterator); 6461 cces = 0; 6462 } 6463 // Store the number of bocsu bytes written. 
6464 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { 6465 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6466 } 6467 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT; 6468 6469 // Next we put in the level of comparison 6470 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); 6471 6472 // If we are doing French, we need to store whether we have just finished the French level 6473 if(level == UCOL_PSK_SECONDARY && doingFrench) { 6474 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6475 } else { 6476 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6477 } 6478 6479 // Was the latest CE shifted 6480 if(wasShifted) { 6481 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; 6482 } 6483 // Check for cces overflow 6484 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { 6485 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6486 } 6487 // Store cces 6488 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT); 6489 6490 // Check for French overflow 6491 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { 6492 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6493 } 6494 // Store number of bytes written in the French secondary continuation sequence 6495 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); 6496 6497 6498 // If we have used normalizing iterator, get rid of it 6499 if(normIter != NULL) { 6500 unorm_closeIter(normIter); 6501 } 6502 6503 /* To avoid memory leak, free the offset buffer if necessary. */ 6504 ucol_freeOffsetBuffer(&s); 6505 6506 // Return number of meaningful sortkey bytes. 6507 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 6508 dest,i, state[0], state[1]); 6509 UTRACE_EXIT_VALUE(i); 6510 return i; 6511 } 6512 6513 /** 6514 * Produce a bound for a given sortkey and a number of levels. 
6515 */ 6516 U_CAPI int32_t U_EXPORT2 6517 ucol_getBound(const uint8_t *source, 6518 int32_t sourceLength, 6519 UColBoundMode boundType, 6520 uint32_t noOfLevels, 6521 uint8_t *result, 6522 int32_t resultLength, 6523 UErrorCode *status) 6524 { 6525 // consistency checks 6526 if(status == NULL || U_FAILURE(*status)) { 6527 return 0; 6528 } 6529 if(source == NULL) { 6530 *status = U_ILLEGAL_ARGUMENT_ERROR; 6531 return 0; 6532 } 6533 6534 int32_t sourceIndex = 0; 6535 // Scan the string until we skip enough of the key OR reach the end of the key 6536 do { 6537 sourceIndex++; 6538 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { 6539 noOfLevels--; 6540 } 6541 } while (noOfLevels > 0 6542 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 6543 6544 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 6545 && noOfLevels > 0) { 6546 *status = U_SORT_KEY_TOO_SHORT_WARNING; 6547 } 6548 6549 6550 // READ ME: this code assumes that the values for boundType 6551 // enum will not changes. They are set so that the enum value 6552 // corresponds to the number of extra bytes each bound type 6553 // needs. 6554 if(result != NULL && resultLength >= sourceIndex+boundType) { 6555 uprv_memcpy(result, source, sourceIndex); 6556 switch(boundType) { 6557 // Lower bound just gets terminated. 
No extra bytes 6558 case UCOL_BOUND_LOWER: // = 0 6559 break; 6560 // Upper bound needs one extra byte 6561 case UCOL_BOUND_UPPER: // = 1 6562 result[sourceIndex++] = 2; 6563 break; 6564 // Upper long bound needs two extra bytes 6565 case UCOL_BOUND_UPPER_LONG: // = 2 6566 result[sourceIndex++] = 0xFF; 6567 result[sourceIndex++] = 0xFF; 6568 break; 6569 default: 6570 *status = U_ILLEGAL_ARGUMENT_ERROR; 6571 return 0; 6572 } 6573 result[sourceIndex++] = 0; 6574 6575 return sourceIndex; 6576 } else { 6577 return sourceIndex+boundType+1; 6578 } 6579 } 6580 6581 /****************************************************************************/ 6582 /* Following are the functions that deal with the properties of a collator */ 6583 /* there are new APIs and some compatibility APIs */ 6584 /****************************************************************************/ 6585 6586 static inline void 6587 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, 6588 int32_t *primShift, int32_t *secShift, int32_t *terShift) 6589 { 6590 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; 6591 UBool reverseSecondary = FALSE; 6592 UBool continuation = isContinuation(CE); 6593 if(!continuation) { 6594 tertiary = (uint8_t)((CE & coll->tertiaryMask)); 6595 tertiary ^= coll->caseSwitch; 6596 reverseSecondary = TRUE; 6597 } else { 6598 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6599 tertiary &= UCOL_REMOVE_CASE; 6600 reverseSecondary = FALSE; 6601 } 6602 6603 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6604 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6605 primary1 = (uint8_t)(CE >> 8); 6606 6607 if(primary1 != 0) { 6608 if (coll->leadBytePermutationTable != NULL && !continuation) { 6609 primary1 = coll->leadBytePermutationTable[primary1]; 6610 } 6611 6612 coll->latinOneCEs[ch] |= (primary1 << *primShift); 6613 *primShift -= 8; 6614 } 6615 if(primary2 != 0) { 6616 if(*primShift < 0) { 6617 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6618 
coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
            coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
            return;
        }
        coll->latinOneCEs[ch] |= (primary2 << *primShift);
        *primShift -= 8;
    }
    if(secondary != 0) {
        if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
            coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
        } else { // normal case
            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
        }
        *secShift -= 8;
    }
    if(tertiary != 0) {
        coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
        *terShift -= 8;
    }
}

/*
 * Reallocates coll->latinOneCEs so that each of its three consecutive
 * sub-tables (at offsets 0, latinOneTableLen and 2*latinOneTableLen) holds
 * `size` entries.
 *
 * The new table is zero-filled, then the first min(size, old length) entries
 * of every sub-table are copied over. The old table is freed and
 * coll->latinOneTableLen is updated.
 *
 * Returns TRUE on success. On allocation failure sets *status to
 * U_MEMORY_ALLOCATION_ERROR, sets coll->latinOneFailed, leaves the old table
 * untouched and returns FALSE.
 */
static inline UBool
ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
    uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
    if(newTable == NULL) {
      *status = U_MEMORY_ALLOCATION_ERROR;
      coll->latinOneFailed = TRUE;
      return FALSE;
    }
    // Number of BYTES to copy per sub-table (works for both growing and shrinking).
    int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
    uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
    uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
    uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
    uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
    coll->latinOneTableLen = size;
    uprv_free(coll->latinOneCEs);
    coll->latinOneCEs = newTable;
    return TRUE;
}

/*
 * Builds (or rebuilds) the collator's Latin-1 table in coll->latinOneCEs.
 *
 * For every code unit in [0, UCOL_ENDOFLATINONERANGE] the CE is looked up and
 * folded into the three sub-tables via ucol_addLatinOneEntry(). Expansions and
 * digits are expanded through a collation element iterator; contractions get
 * extra slots starting at UCOL_ENDOFLATINONERANGE+1 (contractionOffset), and
 * the table is doubled when those slots run out. At the end the table is
 * shrunk to the number of slots actually used.
 *
 * Returns TRUE if the table was built. Returns FALSE on failure; every path
 * through cleanup_after_failure also sets coll->latinOneFailed.
 */
static UBool
ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
    UBool result = TRUE;
    if(coll->latinOneCEs == NULL) {
        coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
        if(coll->latinOneCEs == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return FALSE;
        }
        coll->latinOneTableLen = UCOL_LATINONETABLELEN;
    }
    UChar ch = 0;
    UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
    // Only the status is checked here; `it` itself is not tested for NULL.
    if (U_FAILURE(*status)) {
        return FALSE;
    }
    uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);

    // Shift positions for the next primary/secondary/tertiary weight of the current entry.
    // They are reset to 24 for every new table slot.
    int32_t primShift = 24, secShift = 24, terShift = 24;
    uint32_t CE = 0;
    // Next free slot for contraction results; lives past the regular Latin-1 range.
    int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;

    // TODO: make safe if you get more than you wanted...
    for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
        primShift = 24; secShift = 24; terShift = 24;
        if(ch < 0x100) {
            CE = coll->latinOneMapping[ch];
        } else {
            CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            if(CE == UCOL_NOT_FOUND && coll->UCA) {
                // Not tailored: fall back to the UCA mapping.
                CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
            }
        }
        if(CE < UCOL_NOT_FOUND) {
            // Ordinary (non-special) CE.
            ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
        } else {
            switch (getCETag(CE)) {
            case EXPANSION_TAG:
            case DIGIT_TAG:
                ucol_setText(it, &ch, 1, status);
                while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                        // No room left in the 32-bit slot: mark the entry as "bail out".
                        coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        break;
                    }
                    ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                }
                break;
            case CONTRACTION_TAG:
                // here is the trick
                // F2 is contraction. We do something very similar to contractions
                // but have two indices, one in the real contraction table and the
                // other to where we stuffed things. This hopes that we don't have
                // many contractions (this should work for latin-1 tables).
                {
                    // Bits 12..23 must be free: they are reused below for the latin-1 table offset.
                    if((CE & 0x00FFF000) != 0) {
                        *status = U_UNSUPPORTED_ERROR;
                        goto cleanup_after_failure;
                    }

                    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);

                    // NOTE(review): only the low 12 bits of contractionOffset are stored here.
                    CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table

                    coll->latinOneCEs[ch] = CE;
                    coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
                    coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;

                    // We're going to jump into contraction table, pick the elements
                    // and use them
                    do {
                        CE = *(coll->contractionCEs +
                            (UCharOffset - coll->contractionIndex));
                        if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
                            uint32_t size;
                            uint32_t i;    /* general counter */
                            uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
                            size = getExpansionCount(CE);
                            //CE = *CEOffset++;
                            if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
                                for(i = 0; i<size; i++) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            } else { /* else, we do */
                                while(*CEOffset != 0) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            }
                            contractionOffset++;
                        } else if(CE < UCOL_NOT_FOUND) {
                            ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
                        } else {
                            // Any other special CE inside a contraction: bail out for this slot.
                            coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            contractionOffset++;
                        }
                        UCharOffset++;
                        primShift = 24; secShift = 24; terShift = 24;
                        if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
                            if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
                                goto cleanup_after_failure;
                            }
                        }
                    } while(*UCharOffset != 0xFFFF); // 0xFFFF terminates this contraction's entries
                }
                break;;
            case SPEC_PROC_TAG:
                {
                    // 0xB7 is a precontext character defined in UCA5.1, a special
                    // handle is implemented in order to save LatinOne table for
                    // most locales.
                    if (ch==0xb7) {
                        ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                    }
                    else {
                        // Note: *status is not modified on this path.
                        goto cleanup_after_failure;
                    }
                }
                break;
            default:
                // Note: *status is not modified on this path.
                goto cleanup_after_failure;
            }
        }
    }
    // compact table
    if(contractionOffset < coll->latinOneTableLen) {
        if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
            goto cleanup_after_failure;
        }
    }
    ucol_closeElements(it);
    return result;

cleanup_after_failure:
    // *status has been set by the contraction-format and resize failures above;
    // the SPEC_PROC_TAG and default cases arrive here without touching it.
    coll->latinOneFailed = TRUE;
    ucol_closeElements(it);
    return FALSE;
}

/*
 * Recomputes the collator fields that are derived from its attributes:
 * case switch, tertiary mask/common/top/bottom values, tertiary compression
 * counts, the sort key generator function and whether the Latin-1 table
 * may be used. Does nothing if *status already indicates a failure.
 */
void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
    if(U_SUCCESS(*status)) {
        if(coll->caseFirst == UCOL_UPPER_FIRST) {
            coll->caseSwitch = UCOL_CASE_SWITCH;
        } else {
            coll->caseSwitch = UCOL_NO_CASE_SWITCH;
        }

        if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
            coll->tertiaryMask = UCOL_REMOVE_CASE;
            coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
            coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
            coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
            coll->tertiaryBottom = UCOL_COMMON_BOT3;
        } else {
            coll->tertiaryMask = UCOL_KEEP_CASE;
            coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
            if(coll->caseFirst == UCOL_UPPER_FIRST) {
                coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
                coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
                coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
            } else {
                coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
                coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
                coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
            }
        }

        /* Set the compression values */
        uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
        coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multiply double with int, but need only int */
        coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);

        /* Pick the sort key generator: the simple tertiary one applies only to this attribute combination. */
        if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
            && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
        {
            coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
        } else {
            coll->sortKeyGen = ucol_calcSortKey;
        }
        /* Decide whether the Latin-1 table may be used with the current attributes. */
        if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
            &&
coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) 6857 { 6858 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { 6859 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it 6860 //fprintf(stderr, "F"); 6861 coll->latinOneUse = TRUE; 6862 } else { 6863 coll->latinOneUse = FALSE; 6864 } 6865 if(*status == U_UNSUPPORTED_ERROR) { 6866 *status = U_ZERO_ERROR; 6867 } 6868 } else { // latin1Table exists and it doesn't need to be regenerated, just use it 6869 coll->latinOneUse = TRUE; 6870 } 6871 } else { 6872 coll->latinOneUse = FALSE; 6873 } 6874 } 6875 } 6876 6877 U_CAPI uint32_t U_EXPORT2 6878 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 6879 if(U_FAILURE(*status) || coll == NULL) { 6880 return 0; 6881 } 6882 if(len == -1) { 6883 len = u_strlen(varTop); 6884 } 6885 if(len == 0) { 6886 *status = U_ILLEGAL_ARGUMENT_ERROR; 6887 return 0; 6888 } 6889 6890 collIterate s; 6891 IInit_collIterate(coll, varTop, len, &s, status); 6892 if(U_FAILURE(*status)) { 6893 return 0; 6894 } 6895 6896 uint32_t CE = ucol_IGetNextCE(coll, &s, status); 6897 6898 /* here we check if we have consumed all characters */ 6899 /* you can put in either one character or a contraction */ 6900 /* you shouldn't put more... */ 6901 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { 6902 *status = U_CE_NOT_FOUND_ERROR; 6903 return 0; 6904 } 6905 6906 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); 6907 6908 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { 6909 *status = U_PRIMARY_TOO_LONG_ERROR; 6910 return 0; 6911 } 6912 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { 6913 coll->variableTopValueisDefault = FALSE; 6914 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; 6915 } 6916 6917 /* To avoid memory leak, free the offset buffer if necessary. 
*/ 6918 ucol_freeOffsetBuffer(&s); 6919 6920 return CE & UCOL_PRIMARYMASK; 6921 } 6922 6923 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 6924 if(U_FAILURE(*status) || coll == NULL) { 6925 return 0; 6926 } 6927 return coll->variableTopValue<<16; 6928 } 6929 6930 U_CAPI void U_EXPORT2 6931 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 6932 if(U_FAILURE(*status) || coll == NULL) { 6933 return; 6934 } 6935 6936 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { 6937 coll->variableTopValueisDefault = FALSE; 6938 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; 6939 } 6940 } 6941 /* Attribute setter API */ 6942 U_CAPI void U_EXPORT2 6943 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 6944 if(U_FAILURE(*status) || coll == NULL) { 6945 return; 6946 } 6947 UColAttributeValue oldFrench = coll->frenchCollation; 6948 UColAttributeValue oldCaseFirst = coll->caseFirst; 6949 switch(attr) { 6950 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ 6951 if(value == UCOL_ON) { 6952 coll->numericCollation = UCOL_ON; 6953 coll->numericCollationisDefault = FALSE; 6954 } else if (value == UCOL_OFF) { 6955 coll->numericCollation = UCOL_OFF; 6956 coll->numericCollationisDefault = FALSE; 6957 } else if (value == UCOL_DEFAULT) { 6958 coll->numericCollationisDefault = TRUE; 6959 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; 6960 } else { 6961 *status = U_ILLEGAL_ARGUMENT_ERROR; 6962 } 6963 break; 6964 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ 6965 if(value == UCOL_ON) { 6966 coll->hiraganaQ = UCOL_ON; 6967 coll->hiraganaQisDefault = FALSE; 6968 } else if (value == UCOL_OFF) { 6969 coll->hiraganaQ = UCOL_OFF; 6970 coll->hiraganaQisDefault = FALSE; 6971 } else if (value == UCOL_DEFAULT) { 6972 coll->hiraganaQisDefault = TRUE; 6973 
coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        break;
    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
        if(value == UCOL_ON) {
            coll->frenchCollation = UCOL_ON;
            coll->frenchCollationisDefault = FALSE;
        } else if (value == UCOL_OFF) {
            coll->frenchCollation = UCOL_OFF;
            coll->frenchCollationisDefault = FALSE;
        } else if (value == UCOL_DEFAULT) {
            coll->frenchCollationisDefault = TRUE;
            coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
        }
        break;
    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
        if(value == UCOL_SHIFTED) {
            coll->alternateHandling = UCOL_SHIFTED;
            coll->alternateHandlingisDefault = FALSE;
        } else if (value == UCOL_NON_IGNORABLE) {
            coll->alternateHandling = UCOL_NON_IGNORABLE;
            coll->alternateHandlingisDefault = FALSE;
        } else if (value == UCOL_DEFAULT) {
            coll->alternateHandlingisDefault = TRUE;
            coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
        }
        break;
    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
        if(value == UCOL_LOWER_FIRST) {
            coll->caseFirst = UCOL_LOWER_FIRST;
            coll->caseFirstisDefault = FALSE;
        } else if (value == UCOL_UPPER_FIRST) {
            coll->caseFirst = UCOL_UPPER_FIRST;
            coll->caseFirstisDefault = FALSE;
        } else if (value == UCOL_OFF) {
            coll->caseFirst = UCOL_OFF;
            coll->caseFirstisDefault = FALSE;
        } else if (value == UCOL_DEFAULT) {
            coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
            coll->caseFirstisDefault = TRUE;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
        }
        break;
    case UCOL_CASE_LEVEL: /* do we have an extra case level */
        if(value == UCOL_ON) {
            coll->caseLevel = UCOL_ON;
            coll->caseLevelisDefault = FALSE;
        } else if (value == UCOL_OFF) {
            coll->caseLevel = UCOL_OFF;
            coll->caseLevelisDefault = FALSE;
        } else if (value == UCOL_DEFAULT) {
            coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
            coll->caseLevelisDefault = TRUE;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
        }
        break;
    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
        if(value == UCOL_ON) {
            coll->normalizationMode = UCOL_ON;
            coll->normalizationModeisDefault = FALSE;
            /* normalization needs the FCD data; a failure is reported through status */
            initializeFCD(status);
        } else if (value == UCOL_OFF) {
            coll->normalizationMode = UCOL_OFF;
            coll->normalizationModeisDefault = FALSE;
        } else if (value == UCOL_DEFAULT) {
            coll->normalizationModeisDefault = TRUE;
            coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
            if(coll->normalizationMode == UCOL_ON) {
                initializeFCD(status);
            }
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
        }
        break;
    case UCOL_STRENGTH:         /* attribute for strength */
        if (value == UCOL_DEFAULT) {
            coll->strengthisDefault = TRUE;
            coll->strength = (UColAttributeValue)coll->options->strength;
        } else if (value <= UCOL_IDENTICAL) {
            coll->strengthisDefault = FALSE;
            coll->strength = value;
        } else {
            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
        }
        break;
    case UCOL_ATTRIBUTE_COUNT:
    default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        break;
    }
    /* The Latin-1 table must be rebuilt only if French or case-first handling changed. */
    if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
        coll->latinOneRegenTable = TRUE;
    } else {
        coll->latinOneRegenTable = FALSE;
    }
    ucol_updateInternalState(coll, status);
}

/*
 * Returns the current value of the given collator attribute.
 *
 * Returns UCOL_DEFAULT if *status is already a failure or coll is NULL.
 * For an unknown attribute (including UCOL_ATTRIBUTE_COUNT) sets *status to
 * U_ILLEGAL_ARGUMENT_ERROR and returns UCOL_DEFAULT.
 */
U_CAPI UColAttributeValue  U_EXPORT2
ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
    if(U_FAILURE(*status) || coll == NULL) {
        return UCOL_DEFAULT;
    }
    switch(attr) {
    case UCOL_NUMERIC_COLLATION:
        return coll->numericCollation;
    case UCOL_HIRAGANA_QUATERNARY_MODE:
        return coll->hiraganaQ;
    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
        return coll->frenchCollation;
    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
        return coll->alternateHandling;
    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
        return coll->caseFirst;
    case UCOL_CASE_LEVEL: /* do we have an extra case level */
        return coll->caseLevel;
    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
        return coll->normalizationMode;
    case UCOL_STRENGTH:         /* attribute for strength */
        return coll->strength;
    case UCOL_ATTRIBUTE_COUNT:
    default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        break;
    }
    return UCOL_DEFAULT;
}

/*
 * Convenience wrapper: sets UCOL_STRENGTH through ucol_setAttribute().
 * The error code is local, so any failure is silently discarded.
 */
U_CAPI void U_EXPORT2
ucol_setStrength(    UCollator                *coll,
            UCollationStrength        strength)
{
    UErrorCode status = U_ZERO_ERROR;
    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
}

/*
 * Convenience wrapper: reads UCOL_STRENGTH through ucol_getAttribute().
 * The error code is local, so any failure is silently discarded.
 */
U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator *coll)
{
    UErrorCode status = U_ZERO_ERROR;
    return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
}

/*
 * Copies the collator's reorder codes into dest and returns how many there are.
 *
 * Sets U_ILLEGAL_ARGUMENT_ERROR for a negative capacity or a NULL dest with a
 * positive capacity. If dest is too small, sets U_BUFFER_OVERFLOW_ERROR and
 * returns the required length without copying anything.
 * Note: coll is dereferenced without a NULL check.
 */
U_INTERNAL int32_t U_EXPORT2
ucol_getReorderCodes(const UCollator *coll,
                    int32_t *dest,
                    int32_t destCapacity,
                    UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if (coll->reorderCodesLength > destCapacity) {
        *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
        return coll->reorderCodesLength;
    }
    for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
        dest[i] = coll->reorderCodes[i];
    }
return coll->reorderCodesLength; 7146 } 7147 7148 U_INTERNAL void U_EXPORT2 7149 ucol_setReorderCodes(UCollator *coll, 7150 const int32_t *reorderCodes, 7151 int32_t reorderCodesLength, 7152 UErrorCode *pErrorCode) { 7153 if (U_FAILURE(*pErrorCode)) { 7154 return; 7155 } 7156 7157 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) { 7158 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 7159 return; 7160 } 7161 7162 uprv_free(coll->reorderCodes); 7163 coll->reorderCodes = NULL; 7164 coll->reorderCodesLength = 0; 7165 if (reorderCodesLength == 0) { 7166 uprv_free(coll->leadBytePermutationTable); 7167 coll->leadBytePermutationTable = NULL; 7168 return; 7169 } 7170 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t)); 7171 if (coll->reorderCodes == NULL) { 7172 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 7173 return; 7174 } 7175 for (int32_t i = 0; i < reorderCodesLength; i++) { 7176 coll->reorderCodes[i] = reorderCodes[i]; 7177 } 7178 coll->reorderCodesLength = reorderCodesLength; 7179 ucol_buildPermutationTable(coll, pErrorCode); 7180 if (U_FAILURE(*pErrorCode)) { 7181 uprv_free(coll->reorderCodes); 7182 coll->reorderCodes = NULL; 7183 coll->reorderCodesLength = 0; 7184 } 7185 } 7186 7187 7188 /****************************************************************************/ 7189 /* Following are misc functions */ 7190 /* there are new APIs and some compatibility APIs */ 7191 /****************************************************************************/ 7192 7193 U_CAPI void U_EXPORT2 7194 ucol_getVersion(const UCollator* coll, 7195 UVersionInfo versionInfo) 7196 { 7197 /* RunTime version */ 7198 uint8_t rtVersion = UCOL_RUNTIME_VERSION; 7199 /* Builder version*/ 7200 uint8_t bdVersion = coll->image->version[0]; 7201 7202 /* Charset Version. 
Need to get the version from cnv files 7203 * makeconv should populate cnv files with version and 7204 * an api has to be provided in ucnv.h to obtain this version 7205 */ 7206 uint8_t csVersion = 0; 7207 7208 /* combine the version info */ 7209 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion)); 7210 7211 /* Tailoring rules */ 7212 versionInfo[0] = (uint8_t)(cmbVersion>>8); 7213 versionInfo[1] = (uint8_t)cmbVersion; 7214 versionInfo[2] = coll->image->version[1]; 7215 if(coll->UCA) { 7216 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */ 7217 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07); 7218 } else { 7219 versionInfo[3] = 0; 7220 } 7221 } 7222 7223 7224 /* This internal API checks whether a character is tailored or not */ 7225 U_CAPI UBool U_EXPORT2 7226 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { 7227 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { 7228 return FALSE; 7229 } 7230 7231 uint32_t CE = UCOL_NOT_FOUND; 7232 const UChar *ContractionStart = NULL; 7233 if(u < 0x100) { /* latin-1 */ 7234 CE = coll->latinOneMapping[u]; 7235 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { 7236 return FALSE; 7237 } 7238 } else { /* regular */ 7239 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); 7240 } 7241 7242 if(isContraction(CE)) { 7243 ContractionStart = (UChar *)coll->image+getContractOffset(CE); 7244 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); 7245 } 7246 7247 return (UBool)(CE != UCOL_NOT_FOUND); 7248 } 7249 7250 7251 /****************************************************************************/ 7252 /* Following are the string compare functions */ 7253 /* */ 7254 /****************************************************************************/ 7255 7256 7257 /* ucol_checkIdent internal function. Does byte level string compare. 
*/
/*                   Used by strcoll if strength == identical and strings   */
/*                   are otherwise equal.                                   */
/*                                                                          */
/*              Comparison must be done on NFD normalized strings.          */
/*              FCD is not good enough.                                     */
/*                                                                          */
/*              Returns UCOL_LESS / UCOL_EQUAL / UCOL_GREATER from a code   */
/*              point order comparison. If normalization fails on the       */
/*              string path, UCOL_LESS is returned with *status set.        */

static
UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
{
    // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
    // of same type, but that doesn't really mean that it will stay that way.
    int32_t            comparison;

    if (sColl->flags & UCOL_USE_ITERATOR) {
        // The division for the array length may truncate the array size to
        // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
        // for all platforms anyway.
        UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
        UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
        UNormIterator *sNIt = NULL, *tNIt = NULL;
        // Note: status is not inspected between these calls.
        sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
        tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
        // Rewind both iterators, then wrap them in NFD-normalizing iterators.
        sColl->iterator->move(sColl->iterator, 0, UITER_START);
        tColl->iterator->move(tColl->iterator, 0, UITER_START);
        UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
        UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
        comparison = u_strCompareIter(sIt, tIt, TRUE);
        unorm_closeIter(sNIt);
        unorm_closeIter(tNIt);
    } else {
        // A length of -1 means the string is NUL-terminated.
        int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
        const UChar *sBuf = sColl->string;
        int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
        const UChar *tBuf = tColl->string;

        if (normalize) {
            // Any error code left over from earlier processing is discarded here.
            *status = U_ZERO_ERROR;
            // Note: We could use Normalizer::compare() or similar, but for short strings
            // which may not be in FCD it might be faster to just NFD them.
            // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
            // NFD'ing immediately might be faster for long strings,
            // but string comparison is usually done on relatively short strings.
            sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
                                  sColl->writableBuffer,
                                  *status);
            tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
                                  tColl->writableBuffer,
                                  *status);
            if(U_FAILURE(*status)) {
                return UCOL_LESS;
            }
            comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
        } else {
            comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
        }
    }

    if (comparison < 0) {
        return UCOL_LESS;
    } else if (comparison == 0) {
        return UCOL_EQUAL;
    } else /* comparison > 0 */ {
        return UCOL_GREATER;
    }
}

/*  CEBuf - A struct and some inline functions to handle the saving    */
/*          of CEs in a buffer within ucol_strcoll                     */

#define UCOL_CEBUF_SIZE 512
typedef struct ucol_CEBuf {
    uint32_t    *buf;                         /* start of the storage in use (localArray or heap) */
    uint32_t    *endp;                        /* one past the last usable slot                    */
    uint32_t    *pos;                         /* next slot to write                               */
    uint32_t     localArray[UCOL_CEBUF_SIZE]; /* initial in-struct storage                        */
} ucol_CEBuf;


/* Points the buffer at its in-struct array and marks it empty. */
static
inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
    (b)->buf = (b)->pos = (b)->localArray;
    (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
}

/*
 * Doubles the storage of a CE buffer by moving its contents to a new heap block.
 * The previous block is freed unless it is the in-struct array.
 * On allocation failure *status is set to U_MEMORY_ALLOCATION_ERROR and the
 * buffer is left unchanged. UCOL_ITER_ALLOCATED is set on the iterator in
 * either case.
 */
static
void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
    uint32_t  oldSize;
    uint32_t  newSize;
    uint32_t  *newBuf;

    ci->flags |= UCOL_ITER_ALLOCATED;
    oldSize = (uint32_t)(b->pos - b->buf);
    newSize = oldSize * 2;
    newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
    if(newBuf == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
    }
    else {
        uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
        if (b->buf != b->localArray) {
            uprv_free(b->buf);
        }
        b->buf = newBuf;
        b->endp = b->buf + newSize;
        b->pos  = b->buf + oldSize;
    }
}

/*
 * Appends one CE to the buffer, growing it first when it is full.
 * Nothing is stored if *status indicates a failure (including a failed grow).
 */
static
inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
    if (b->pos == b->endp) {
        ucol_CEBuf_Expand(b, ci, status);
    }
    if (U_SUCCESS(*status)) {
        *(b)->pos++ = ce;
    }
}

/* This is a trick string compare function that goes in and uses sortkeys to compare */
/* It is used when compare gets in trouble and needs to bail out                     */
/* If a sort key buffer cannot be allocated, *status is set to                       */
/* U_MEMORY_ALLOCATION_ERROR and UCOL_EQUAL is returned.                             */
static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
                                                  collIterate *tColl,
                                                  UErrorCode *status)
{
    uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
    uint8_t *sourceKeyP = sourceKey;
    uint8_t *targetKeyP = targetKey;
    int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
    const UCollator *coll = sColl->coll;
    const UChar *source = NULL;
    const UChar *target = NULL;
    int32_t result = UCOL_EQUAL;
    UnicodeString sourceString, targetString;
    int32_t sourceLength;
    int32_t targetLength;

    if(sColl->flags & UCOL_USE_ITERATOR) {
        // Iterator input: rewind and drain both iterators into UnicodeStrings.
        sColl->iterator->move(sColl->iterator, 0, UITER_START);
        tColl->iterator->move(tColl->iterator, 0, UITER_START);
        UChar32 c;
        while((c=sColl->iterator->next(sColl->iterator))>=0) {
            sourceString.append((UChar)c);
        }
        while((c=tColl->iterator->next(tColl->iterator))>=0) {
            targetString.append((UChar)c);
        }
        source = sourceString.getBuffer();
        sourceLength = sourceString.length();
        target = targetString.getBuffer();
        targetLength = targetString.length();
    } else { // no iterators
        sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
        targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
        source = sColl->string;
        target = tColl->string;
    }



    // First try the stack buffer; if the reported length is larger, allocate that size and redo.
    sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
    if(sourceKeyLen > UCOL_MAX_BUFFER) {
        sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
        if(sourceKeyP == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto cleanup_and_do_compare;
        }
        sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
    }

    targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
    if(targetKeyLen > UCOL_MAX_BUFFER) {
        targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
        if(targetKeyP == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto cleanup_and_do_compare;
        }
        targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
    }

    // The keys are compared as C strings.
    result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);

cleanup_and_do_compare:
    // Free only heap buffers; the pointers may still refer to the stack arrays.
    if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
        uprv_free(sourceKeyP);
    }

    if(targetKeyP != NULL && targetKeyP != targetKey) {
        uprv_free(targetKeyP);
    }

    if(result<0) {
        return UCOL_LESS;
    } else if(result>0) {
        return UCOL_GREATER;
    } else {
        return UCOL_EQUAL;
    }
}


static UCollationResult
ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
{
    U_ALIGN_CODE(16);

    const UCollator *coll = sColl->coll;


    // setting up the collator parameters
    UColAttributeValue strength = coll->strength;
    UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);

    UBool checkSecTer = initialCheckSecTer;
    UBool checkTertiary =
(strength >= UCOL_TERTIARY); 7472 UBool checkQuad = (strength >= UCOL_QUATERNARY); 7473 UBool checkIdent = (strength == UCOL_IDENTICAL); 7474 UBool checkCase = (coll->caseLevel == UCOL_ON); 7475 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; 7476 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 7477 UBool qShifted = shifted && checkQuad; 7478 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; 7479 7480 if(doHiragana && shifted) { 7481 return (ucol_compareUsingSortKeys(sColl, tColl, status)); 7482 } 7483 uint8_t caseSwitch = coll->caseSwitch; 7484 uint8_t tertiaryMask = coll->tertiaryMask; 7485 7486 // This is the lowest primary value that will not be ignored if shifted 7487 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; 7488 7489 UCollationResult result = UCOL_EQUAL; 7490 UCollationResult hirResult = UCOL_EQUAL; 7491 7492 // Preparing the CE buffers. They will be filled during the primary phase 7493 ucol_CEBuf sCEs; 7494 ucol_CEBuf tCEs; 7495 UCOL_INIT_CEBUF(&sCEs); 7496 UCOL_INIT_CEBUF(&tCEs); 7497 7498 uint32_t secS = 0, secT = 0; 7499 uint32_t sOrder=0, tOrder=0; 7500 7501 // Non shifted primary processing is quite simple 7502 if(!shifted) { 7503 for(;;) { 7504 7505 // We fetch CEs until we hit a non ignorable primary or end. 7506 do { 7507 // We get the next CE 7508 sOrder = ucol_IGetNextCE(coll, sColl, status); 7509 // Stuff it in the buffer 7510 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7511 // And keep just the primary part. 
7512 sOrder &= UCOL_PRIMARYMASK; 7513 } while(sOrder == 0); 7514 7515 // see the comments on the above block 7516 do { 7517 tOrder = ucol_IGetNextCE(coll, tColl, status); 7518 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7519 tOrder &= UCOL_PRIMARYMASK; 7520 } while(tOrder == 0); 7521 7522 // if both primaries are the same 7523 if(sOrder == tOrder) { 7524 // and there are no more CEs, we advance to the next level 7525 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 7526 break; 7527 } 7528 if(doHiragana && hirResult == UCOL_EQUAL) { 7529 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) { 7530 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA)) 7531 ? UCOL_LESS:UCOL_GREATER; 7532 } 7533 } 7534 } else { 7535 // only need to check one for continuation 7536 // if one is then the other must be or the preceding CE would be a prefix of the other 7537 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) { 7538 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF); 7539 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); 7540 } 7541 // if two primaries are different, we are done 7542 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER; 7543 goto commonReturn; 7544 } 7545 } // no primary difference... do the rest from the buffers 7546 } else { // shifted - do a slightly more complicated processing :) 7547 for(;;) { 7548 UBool sInShifted = FALSE; 7549 UBool tInShifted = FALSE; 7550 // This version of code can be refactored. However, it seems easier to understand this way. 7551 // Source loop. Sam as the target loop. 
7552 for(;;) { 7553 sOrder = ucol_IGetNextCE(coll, sColl, status); 7554 if(sOrder == UCOL_NO_MORE_CES) { 7555 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7556 break; 7557 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) { 7558 /* UCA amendment - ignore ignorables that follow shifted code points */ 7559 continue; 7560 } else if(isContinuation(sOrder)) { 7561 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ 7562 if(sInShifted) { 7563 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 7564 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7565 continue; 7566 } else { 7567 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7568 break; 7569 } 7570 } else { /* Just lower level values */ 7571 if(sInShifted) { 7572 continue; 7573 } else { 7574 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7575 continue; 7576 } 7577 } 7578 } else { /* regular */ 7579 if(coll->leadBytePermutationTable != NULL){ 7580 sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF); 7581 } 7582 if((sOrder & UCOL_PRIMARYMASK) > LVT) { 7583 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7584 break; 7585 } else { 7586 if((sOrder & UCOL_PRIMARYMASK) > 0) { 7587 sInShifted = TRUE; 7588 sOrder &= UCOL_PRIMARYMASK; 7589 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7590 continue; 7591 } else { 7592 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7593 sInShifted = FALSE; 7594 continue; 7595 } 7596 } 7597 } 7598 } 7599 sOrder &= UCOL_PRIMARYMASK; 7600 sInShifted = FALSE; 7601 7602 for(;;) { 7603 tOrder = ucol_IGetNextCE(coll, tColl, status); 7604 if(tOrder == UCOL_NO_MORE_CES) { 7605 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7606 break; 7607 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) { 7608 /* UCA amendment - ignore ignorables that follow shifted code points */ 7609 continue; 7610 } else if(isContinuation(tOrder)) { 7611 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary 
value */ 7612 if(tInShifted) { 7613 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 7614 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7615 continue; 7616 } else { 7617 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7618 break; 7619 } 7620 } else { /* Just lower level values */ 7621 if(tInShifted) { 7622 continue; 7623 } else { 7624 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7625 continue; 7626 } 7627 } 7628 } else { /* regular */ 7629 if(coll->leadBytePermutationTable != NULL){ 7630 tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF); 7631 } 7632 if((tOrder & UCOL_PRIMARYMASK) > LVT) { 7633 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7634 break; 7635 } else { 7636 if((tOrder & UCOL_PRIMARYMASK) > 0) { 7637 tInShifted = TRUE; 7638 tOrder &= UCOL_PRIMARYMASK; 7639 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7640 continue; 7641 } else { 7642 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7643 tInShifted = FALSE; 7644 continue; 7645 } 7646 } 7647 } 7648 } 7649 tOrder &= UCOL_PRIMARYMASK; 7650 tInShifted = FALSE; 7651 7652 if(sOrder == tOrder) { 7653 /* 7654 if(doHiragana && hirResult == UCOL_EQUAL) { 7655 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) { 7656 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA)) 7657 ? UCOL_LESS:UCOL_GREATER; 7658 } 7659 } 7660 */ 7661 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 7662 break; 7663 } else { 7664 sOrder = 0; 7665 tOrder = 0; 7666 continue; 7667 } 7668 } else { 7669 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; 7670 goto commonReturn; 7671 } 7672 } /* no primary difference... 
do the rest from the buffers */ 7673 } 7674 7675 /* now, we're gonna reexamine collected CEs */ 7676 uint32_t *sCE; 7677 uint32_t *tCE; 7678 7679 /* This is the secondary level of comparison */ 7680 if(checkSecTer) { 7681 if(!isFrenchSec) { /* normal */ 7682 sCE = sCEs.buf; 7683 tCE = tCEs.buf; 7684 for(;;) { 7685 while (secS == 0) { 7686 secS = *(sCE++) & UCOL_SECONDARYMASK; 7687 } 7688 7689 while(secT == 0) { 7690 secT = *(tCE++) & UCOL_SECONDARYMASK; 7691 } 7692 7693 if(secS == secT) { 7694 if(secS == UCOL_NO_MORE_CES_SECONDARY) { 7695 break; 7696 } else { 7697 secS = 0; secT = 0; 7698 continue; 7699 } 7700 } else { 7701 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7702 goto commonReturn; 7703 } 7704 } 7705 } else { /* do the French */ 7706 uint32_t *sCESave = NULL; 7707 uint32_t *tCESave = NULL; 7708 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */ 7709 tCE = tCEs.pos-2; 7710 for(;;) { 7711 while (secS == 0 && sCE >= sCEs.buf) { 7712 if(sCESave == NULL) { 7713 secS = *(sCE--); 7714 if(isContinuation(secS)) { 7715 while(isContinuation(secS = *(sCE--))) 7716 ; 7717 /* after this, secS has the start of continuation, and sCEs points before that */ 7718 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */ 7719 sCE+=2; /* need to point to the first continuation CP */ 7720 /* However, now you can just continue doing stuff */ 7721 } 7722 } else { 7723 secS = *(sCE++); 7724 if(!isContinuation(secS)) { /* This means we have finished with this cont */ 7725 sCE = sCESave; /* reset the pointer to before continuation */ 7726 sCESave = NULL; 7727 secS = 0; /* Fetch a fresh CE before the continuation sequence. 
*/ 7728 continue; 7729 } 7730 } 7731 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 7732 } 7733 7734 while(secT == 0 && tCE >= tCEs.buf) { 7735 if(tCESave == NULL) { 7736 secT = *(tCE--); 7737 if(isContinuation(secT)) { 7738 while(isContinuation(secT = *(tCE--))) 7739 ; 7740 /* after this, secS has the start of continuation, and sCEs points before that */ 7741 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */ 7742 tCE+=2; /* need to point to the first continuation CP */ 7743 /* However, now you can just continue doing stuff */ 7744 } 7745 } else { 7746 secT = *(tCE++); 7747 if(!isContinuation(secT)) { /* This means we have finished with this cont */ 7748 tCE = tCESave; /* reset the pointer to before continuation */ 7749 tCESave = NULL; 7750 secT = 0; /* Fetch a fresh CE before the continuation sequence. */ 7751 continue; 7752 } 7753 } 7754 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 7755 } 7756 7757 if(secS == secT) { 7758 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) { 7759 break; 7760 } else { 7761 secS = 0; secT = 0; 7762 continue; 7763 } 7764 } else { 7765 result = (secS < secT) ? 
UCOL_LESS : UCOL_GREATER; 7766 goto commonReturn; 7767 } 7768 } 7769 } 7770 } 7771 7772 /* doing the case bit */ 7773 if(checkCase) { 7774 sCE = sCEs.buf; 7775 tCE = tCEs.buf; 7776 for(;;) { 7777 while((secS & UCOL_REMOVE_CASE) == 0) { 7778 if(!isContinuation(*sCE++)) { 7779 secS =*(sCE-1); 7780 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 7781 // primary ignorables should not be considered on the case level when the strength is primary 7782 // otherwise, the CEs stop being well-formed 7783 secS &= UCOL_TERT_CASE_MASK; 7784 secS ^= caseSwitch; 7785 } else { 7786 secS = 0; 7787 } 7788 } else { 7789 secS = 0; 7790 } 7791 } 7792 7793 while((secT & UCOL_REMOVE_CASE) == 0) { 7794 if(!isContinuation(*tCE++)) { 7795 secT = *(tCE-1); 7796 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 7797 // primary ignorables should not be considered on the case level when the strength is primary 7798 // otherwise, the CEs stop being well-formed 7799 secT &= UCOL_TERT_CASE_MASK; 7800 secT ^= caseSwitch; 7801 } else { 7802 secT = 0; 7803 } 7804 } else { 7805 secT = 0; 7806 } 7807 } 7808 7809 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { 7810 result = UCOL_LESS; 7811 goto commonReturn; 7812 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) { 7813 result = UCOL_GREATER; 7814 goto commonReturn; 7815 } 7816 7817 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { 7818 break; 7819 } else { 7820 secS = 0; 7821 secT = 0; 7822 } 7823 } 7824 } 7825 7826 /* Tertiary level */ 7827 if(checkTertiary) { 7828 secS = 0; 7829 secT = 0; 7830 sCE = sCEs.buf; 7831 tCE = tCEs.buf; 7832 for(;;) { 7833 while((secS & UCOL_REMOVE_CASE) == 0) { 7834 secS = *(sCE++) & tertiaryMask; 7835 if(!isContinuation(secS)) { 7836 secS ^= caseSwitch; 7837 } else { 7838 secS &= UCOL_REMOVE_CASE; 7839 } 7840 } 7841 7842 while((secT & UCOL_REMOVE_CASE) == 0) { 7843 secT = *(tCE++) & 
tertiaryMask; 7844 if(!isContinuation(secT)) { 7845 secT ^= caseSwitch; 7846 } else { 7847 secT &= UCOL_REMOVE_CASE; 7848 } 7849 } 7850 7851 if(secS == secT) { 7852 if((secS & UCOL_REMOVE_CASE) == 1) { 7853 break; 7854 } else { 7855 secS = 0; secT = 0; 7856 continue; 7857 } 7858 } else { 7859 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7860 goto commonReturn; 7861 } 7862 } 7863 } 7864 7865 7866 if(qShifted /*checkQuad*/) { 7867 UBool sInShifted = TRUE; 7868 UBool tInShifted = TRUE; 7869 secS = 0; 7870 secT = 0; 7871 sCE = sCEs.buf; 7872 tCE = tCEs.buf; 7873 for(;;) { 7874 while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) { 7875 secS = *(sCE++); 7876 if(isContinuation(secS)) { 7877 if(!sInShifted) { 7878 continue; 7879 } 7880 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */ 7881 secS = UCOL_PRIMARYMASK; 7882 sInShifted = FALSE; 7883 } else { 7884 sInShifted = TRUE; 7885 } 7886 } 7887 secS &= UCOL_PRIMARYMASK; 7888 7889 7890 while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) { 7891 secT = *(tCE++); 7892 if(isContinuation(secT)) { 7893 if(!tInShifted) { 7894 continue; 7895 } 7896 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { 7897 secT = UCOL_PRIMARYMASK; 7898 tInShifted = FALSE; 7899 } else { 7900 tInShifted = TRUE; 7901 } 7902 } 7903 secT &= UCOL_PRIMARYMASK; 7904 7905 if(secS == secT) { 7906 if(secS == UCOL_NO_MORE_CES_PRIMARY) { 7907 break; 7908 } else { 7909 secS = 0; secT = 0; 7910 continue; 7911 } 7912 } else { 7913 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7914 goto commonReturn; 7915 } 7916 } 7917 } else if(doHiragana && hirResult != UCOL_EQUAL) { 7918 // If we're fine on quaternaries, we might be different 7919 // on Hiragana. This, however, might fail us in shifted. 
7920 result = hirResult; 7921 goto commonReturn; 7922 } 7923 7924 /* For IDENTICAL comparisons, we use a bitwise character comparison */ 7925 /* as a tiebreaker if all else is equal. */ 7926 /* Getting here should be quite rare - strings are not identical - */ 7927 /* that is checked first, but compared == through all other checks. */ 7928 if(checkIdent) 7929 { 7930 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON); 7931 result = ucol_checkIdent(sColl, tColl, TRUE, status); 7932 } 7933 7934 commonReturn: 7935 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { 7936 if (sCEs.buf != sCEs.localArray ) { 7937 uprv_free(sCEs.buf); 7938 } 7939 if (tCEs.buf != tCEs.localArray ) { 7940 uprv_free(tCEs.buf); 7941 } 7942 } 7943 7944 return result; 7945 } 7946 7947 static UCollationResult 7948 ucol_strcollRegular(const UCollator *coll, 7949 const UChar *source, int32_t sourceLength, 7950 const UChar *target, int32_t targetLength, 7951 UErrorCode *status) { 7952 collIterate sColl, tColl; 7953 // Preparing the context objects for iterating over strings 7954 IInit_collIterate(coll, source, sourceLength, &sColl, status); 7955 IInit_collIterate(coll, target, targetLength, &tColl, status); 7956 if(U_FAILURE(*status)) { 7957 return UCOL_LESS; 7958 } 7959 return ucol_strcollRegular(&sColl, &tColl, status); 7960 } 7961 7962 static inline uint32_t 7963 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, 7964 uint32_t CE, const UChar *s, int32_t *index, int32_t len) 7965 { 7966 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); 7967 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; 7968 int32_t offset = 1; 7969 UChar schar = 0, tchar = 0; 7970 7971 for(;;) { 7972 if(len == -1) { 7973 if(s[*index] == 0) { // end of string 7974 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7975 } else { 7976 schar = s[*index]; 7977 } 7978 } else { 7979 if(*index == len) { 7980 
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7981 } else { 7982 schar = s[*index]; 7983 } 7984 } 7985 7986 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 7987 offset++; 7988 } 7989 7990 if (schar == tchar) { 7991 (*index)++; 7992 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 7993 } 7994 else 7995 { 7996 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 7997 return UCOL_BAIL_OUT_CE; 7998 } 7999 // skip completely ignorables 8000 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 8001 if(isZeroCE == 0) { // we have to ignore completely ignorables 8002 (*index)++; 8003 continue; 8004 } 8005 8006 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8007 } 8008 } 8009 } 8010 8011 8012 /** 8013 * This is a fast strcoll, geared towards text in Latin-1. 8014 * It supports contractions of size two, French secondaries 8015 * and case switching. You can use it with strengths primary 8016 * to tertiary. It does not support shifted and case level. 8017 * It relies on the table build by setupLatin1Table. If it 8018 * doesn't understand something, it will go to the regular 8019 * strcoll. 
8020 */ 8021 static UCollationResult 8022 ucol_strcollUseLatin1( const UCollator *coll, 8023 const UChar *source, 8024 int32_t sLen, 8025 const UChar *target, 8026 int32_t tLen, 8027 UErrorCode *status) 8028 { 8029 U_ALIGN_CODE(16); 8030 int32_t strength = coll->strength; 8031 8032 int32_t sIndex = 0, tIndex = 0; 8033 UChar sChar = 0, tChar = 0; 8034 uint32_t sOrder=0, tOrder=0; 8035 8036 UBool endOfSource = FALSE; 8037 8038 uint32_t *elements = coll->latinOneCEs; 8039 8040 UBool haveContractions = FALSE; // if we have contractions in our string 8041 // we cannot do French secondary 8042 8043 // Do the primary level 8044 for(;;) { 8045 while(sOrder==0) { // this loop skips primary ignorables 8046 // sOrder=getNextlatinOneCE(source); 8047 if(sLen==-1) { // handling zero terminated strings 8048 sChar=source[sIndex++]; 8049 if(sChar==0) { 8050 endOfSource = TRUE; 8051 break; 8052 } 8053 } else { // handling strings with known length 8054 if(sIndex==sLen) { 8055 endOfSource = TRUE; 8056 break; 8057 } 8058 sChar=source[sIndex++]; 8059 } 8060 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 8061 //fprintf(stderr, "R"); 8062 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8063 } 8064 sOrder = elements[sChar]; 8065 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special 8066 // specials can basically be either contractions or bail-out signs. If we get anything 8067 // else, we'll bail out anywasy 8068 if(getCETag(sOrder) == CONTRACTION_TAG) { 8069 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); 8070 haveContractions = TRUE; // if there are contractions, we cannot do French secondary 8071 // However, if there are contractions in the table, but we always use just one char, 8072 // we might be able to do French. This should be checked out. 
8073 } 8074 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 8075 //fprintf(stderr, "S"); 8076 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8077 } 8078 } 8079 } 8080 8081 while(tOrder==0) { // this loop skips primary ignorables 8082 // tOrder=getNextlatinOneCE(target); 8083 if(tLen==-1) { // handling zero terminated strings 8084 tChar=target[tIndex++]; 8085 if(tChar==0) { 8086 if(endOfSource) { // this is different than source loop, 8087 // as we already know that source loop is done here, 8088 // so we can either finish the primary loop if both 8089 // strings are done or anounce the result if only 8090 // target is done. Same below. 8091 goto endOfPrimLoop; 8092 } else { 8093 return UCOL_GREATER; 8094 } 8095 } 8096 } else { // handling strings with known length 8097 if(tIndex==tLen) { 8098 if(endOfSource) { 8099 goto endOfPrimLoop; 8100 } else { 8101 return UCOL_GREATER; 8102 } 8103 } 8104 tChar=target[tIndex++]; 8105 } 8106 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 8107 //fprintf(stderr, "R"); 8108 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8109 } 8110 tOrder = elements[tChar]; 8111 if(tOrder >= UCOL_NOT_FOUND) { 8112 // Handling specials, see the comments for source 8113 if(getCETag(tOrder) == CONTRACTION_TAG) { 8114 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); 8115 haveContractions = TRUE; 8116 } 8117 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 8118 //fprintf(stderr, "S"); 8119 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8120 } 8121 } 8122 } 8123 if(endOfSource) { // source is finished, but target is not, say the result. 
8124 return UCOL_LESS; 8125 } 8126 8127 if(sOrder == tOrder) { // if we have same CEs, we continue the loop 8128 sOrder = 0; tOrder = 0; 8129 continue; 8130 } else { 8131 // compare current top bytes 8132 if(((sOrder^tOrder)&0xFF000000)!=0) { 8133 // top bytes differ, return difference 8134 if(sOrder < tOrder) { 8135 return UCOL_LESS; 8136 } else if(sOrder > tOrder) { 8137 return UCOL_GREATER; 8138 } 8139 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); 8140 // since we must return enum value 8141 } 8142 8143 // top bytes match, continue with following bytes 8144 sOrder<<=8; 8145 tOrder<<=8; 8146 } 8147 } 8148 8149 endOfPrimLoop: 8150 // after primary loop, we definitely know the sizes of strings, 8151 // so we set it and use simpler loop for secondaries and tertiaries 8152 sLen = sIndex; tLen = tIndex; 8153 if(strength >= UCOL_SECONDARY) { 8154 // adjust the table beggining 8155 elements += coll->latinOneTableLen; 8156 endOfSource = FALSE; 8157 8158 if(coll->frenchCollation == UCOL_OFF) { // non French 8159 // This loop is a simplified copy of primary loop 8160 // at this point we know that whole strings are latin-1, so we don't 8161 // check for that. We also know that we only have contractions as 8162 // specials. 
8163 sIndex = 0; tIndex = 0; 8164 for(;;) { 8165 while(sOrder==0) { 8166 if(sIndex==sLen) { 8167 endOfSource = TRUE; 8168 break; 8169 } 8170 sChar=source[sIndex++]; 8171 sOrder = elements[sChar]; 8172 if(sOrder > UCOL_NOT_FOUND) { 8173 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); 8174 } 8175 } 8176 8177 while(tOrder==0) { 8178 if(tIndex==tLen) { 8179 if(endOfSource) { 8180 goto endOfSecLoop; 8181 } else { 8182 return UCOL_GREATER; 8183 } 8184 } 8185 tChar=target[tIndex++]; 8186 tOrder = elements[tChar]; 8187 if(tOrder > UCOL_NOT_FOUND) { 8188 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); 8189 } 8190 } 8191 if(endOfSource) { 8192 return UCOL_LESS; 8193 } 8194 8195 if(sOrder == tOrder) { 8196 sOrder = 0; tOrder = 0; 8197 continue; 8198 } else { 8199 // see primary loop for comments on this 8200 if(((sOrder^tOrder)&0xFF000000)!=0) { 8201 if(sOrder < tOrder) { 8202 return UCOL_LESS; 8203 } else if(sOrder > tOrder) { 8204 return UCOL_GREATER; 8205 } 8206 } 8207 sOrder<<=8; 8208 tOrder<<=8; 8209 } 8210 } 8211 } else { // French 8212 if(haveContractions) { // if we have contractions, we have to bail out 8213 // since we don't really know how to handle them here 8214 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8215 } 8216 // For French, we go backwards 8217 sIndex = sLen; tIndex = tLen; 8218 for(;;) { 8219 while(sOrder==0) { 8220 if(sIndex==0) { 8221 endOfSource = TRUE; 8222 break; 8223 } 8224 sChar=source[--sIndex]; 8225 sOrder = elements[sChar]; 8226 // don't even look for contractions 8227 } 8228 8229 while(tOrder==0) { 8230 if(tIndex==0) { 8231 if(endOfSource) { 8232 goto endOfSecLoop; 8233 } else { 8234 return UCOL_GREATER; 8235 } 8236 } 8237 tChar=target[--tIndex]; 8238 tOrder = elements[tChar]; 8239 // don't even look for contractions 8240 } 8241 if(endOfSource) { 8242 return UCOL_LESS; 8243 } 8244 8245 if(sOrder == tOrder) { 8246 sOrder = 0; tOrder = 
0; 8247 continue; 8248 } else { 8249 // see the primary loop for comments 8250 if(((sOrder^tOrder)&0xFF000000)!=0) { 8251 if(sOrder < tOrder) { 8252 return UCOL_LESS; 8253 } else if(sOrder > tOrder) { 8254 return UCOL_GREATER; 8255 } 8256 } 8257 sOrder<<=8; 8258 tOrder<<=8; 8259 } 8260 } 8261 } 8262 } 8263 8264 endOfSecLoop: 8265 if(strength >= UCOL_TERTIARY) { 8266 // tertiary loop is the same as secondary (except no French) 8267 elements += coll->latinOneTableLen; 8268 sIndex = 0; tIndex = 0; 8269 endOfSource = FALSE; 8270 for(;;) { 8271 while(sOrder==0) { 8272 if(sIndex==sLen) { 8273 endOfSource = TRUE; 8274 break; 8275 } 8276 sChar=source[sIndex++]; 8277 sOrder = elements[sChar]; 8278 if(sOrder > UCOL_NOT_FOUND) { 8279 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); 8280 } 8281 } 8282 while(tOrder==0) { 8283 if(tIndex==tLen) { 8284 if(endOfSource) { 8285 return UCOL_EQUAL; // if both strings are at the end, they are equal 8286 } else { 8287 return UCOL_GREATER; 8288 } 8289 } 8290 tChar=target[tIndex++]; 8291 tOrder = elements[tChar]; 8292 if(tOrder > UCOL_NOT_FOUND) { 8293 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); 8294 } 8295 } 8296 if(endOfSource) { 8297 return UCOL_LESS; 8298 } 8299 if(sOrder == tOrder) { 8300 sOrder = 0; tOrder = 0; 8301 continue; 8302 } else { 8303 if(((sOrder^tOrder)&0xff000000)!=0) { 8304 if(sOrder < tOrder) { 8305 return UCOL_LESS; 8306 } else if(sOrder > tOrder) { 8307 return UCOL_GREATER; 8308 } 8309 } 8310 sOrder<<=8; 8311 tOrder<<=8; 8312 } 8313 } 8314 } 8315 return UCOL_EQUAL; 8316 } 8317 8318 8319 U_CAPI UCollationResult U_EXPORT2 8320 ucol_strcollIter( const UCollator *coll, 8321 UCharIterator *sIter, 8322 UCharIterator *tIter, 8323 UErrorCode *status) 8324 { 8325 if(!status || U_FAILURE(*status)) { 8326 return UCOL_EQUAL; 8327 } 8328 8329 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); 8330 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", 
coll, sIter, tIter); 8331 8332 if (sIter == tIter) { 8333 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8334 return UCOL_EQUAL; 8335 } 8336 if(sIter == NULL || tIter == NULL || coll == NULL) { 8337 *status = U_ILLEGAL_ARGUMENT_ERROR; 8338 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8339 return UCOL_EQUAL; 8340 } 8341 8342 UCollationResult result = UCOL_EQUAL; 8343 8344 // Preparing the context objects for iterating over strings 8345 collIterate sColl, tColl; 8346 IInit_collIterate(coll, NULL, -1, &sColl, status); 8347 IInit_collIterate(coll, NULL, -1, &tColl, status); 8348 if(U_FAILURE(*status)) { 8349 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8350 return UCOL_EQUAL; 8351 } 8352 // The division for the array length may truncate the array size to 8353 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 8354 // for all platforms anyway. 8355 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8356 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8357 UNormIterator *sNormIter = NULL, *tNormIter = NULL; 8358 8359 sColl.iterator = sIter; 8360 sColl.flags |= UCOL_USE_ITERATOR; 8361 tColl.flags |= UCOL_USE_ITERATOR; 8362 tColl.iterator = tIter; 8363 8364 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 8365 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 8366 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); 8367 sColl.flags &= ~UCOL_ITER_NORM; 8368 8369 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 8370 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); 8371 tColl.flags &= ~UCOL_ITER_NORM; 8372 } 8373 8374 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; 8375 8376 while((sChar = sColl.iterator->next(sColl.iterator)) == 8377 (tChar = tColl.iterator->next(tColl.iterator))) { 8378 if(sChar == U_SENTINEL) { 8379 result = UCOL_EQUAL; 8380 goto end_compare; 8381 } 8382 } 8383 8384 if(sChar 
== U_SENTINEL) { 8385 tChar = tColl.iterator->previous(tColl.iterator); 8386 } 8387 8388 if(tChar == U_SENTINEL) { 8389 sChar = sColl.iterator->previous(sColl.iterator); 8390 } 8391 8392 sChar = sColl.iterator->previous(sColl.iterator); 8393 tChar = tColl.iterator->previous(tColl.iterator); 8394 8395 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) 8396 { 8397 // We are stopped in the middle of a contraction. 8398 // Scan backwards through the == part of the string looking for the start of the contraction. 8399 // It doesn't matter which string we scan, since they are the same in this region. 8400 do 8401 { 8402 sChar = sColl.iterator->previous(sColl.iterator); 8403 tChar = tColl.iterator->previous(tColl.iterator); 8404 } 8405 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); 8406 } 8407 8408 8409 if(U_SUCCESS(*status)) { 8410 result = ucol_strcollRegular(&sColl, &tColl, status); 8411 } 8412 8413 end_compare: 8414 if(sNormIter || tNormIter) { 8415 unorm_closeIter(sNormIter); 8416 unorm_closeIter(tNormIter); 8417 } 8418 8419 UTRACE_EXIT_VALUE_STATUS(result, *status) 8420 return result; 8421 } 8422 8423 8424 /* */ 8425 /* ucol_strcoll Main public API string comparison function */ 8426 /* */ 8427 U_CAPI UCollationResult U_EXPORT2 8428 ucol_strcoll( const UCollator *coll, 8429 const UChar *source, 8430 int32_t sourceLength, 8431 const UChar *target, 8432 int32_t targetLength) 8433 { 8434 U_ALIGN_CODE(16); 8435 8436 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); 8437 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 8438 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 8439 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); 8440 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); 8441 } 8442 8443 if(source == NULL || target == NULL) { 8444 // do not crash, but return. Should have 8445 // status argument to return error. 
8446 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8447 return UCOL_EQUAL; 8448 } 8449 8450 /* Quick check if source and target are same strings. */ 8451 /* They should either both be NULL terminated or the explicit length should be set on both. */ 8452 if (source==target && sourceLength==targetLength) { 8453 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8454 return UCOL_EQUAL; 8455 } 8456 8457 /* Scan the strings. Find: */ 8458 /* The length of any leading portion that is equal */ 8459 /* Whether they are exactly equal. (in which case we just return) */ 8460 const UChar *pSrc = source; 8461 const UChar *pTarg = target; 8462 int32_t equalLength; 8463 8464 if (sourceLength == -1 && targetLength == -1) { 8465 // Both strings are null terminated. 8466 // Scan through any leading equal portion. 8467 while (*pSrc == *pTarg && *pSrc != 0) { 8468 pSrc++; 8469 pTarg++; 8470 } 8471 if (*pSrc == 0 && *pTarg == 0) { 8472 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8473 return UCOL_EQUAL; 8474 } 8475 equalLength = (int32_t)(pSrc - source); 8476 } 8477 else 8478 { 8479 // One or both strings has an explicit length. 8480 const UChar *pSrcEnd = source + sourceLength; 8481 const UChar *pTargEnd = target + targetLength; 8482 8483 // Scan while the strings are bitwise ==, or until one is exhausted. 8484 for (;;) { 8485 if (pSrc == pSrcEnd || pTarg == pTargEnd) { 8486 break; 8487 } 8488 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 8489 break; 8490 } 8491 if (*pSrc != *pTarg) { 8492 break; 8493 } 8494 pSrc++; 8495 pTarg++; 8496 } 8497 equalLength = (int32_t)(pSrc - source); 8498 8499 // If we made it all the way through both strings, we are done. They are == 8500 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. 
*/ 8501 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */ 8502 { 8503 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8504 return UCOL_EQUAL; 8505 } 8506 } 8507 if (equalLength > 0) { 8508 /* There is an identical portion at the beginning of the two strings. */ 8509 /* If the identical portion ends within a contraction or a comibining */ 8510 /* character sequence, back up to the start of that sequence. */ 8511 8512 // These values should already be set by the code above. 8513 //pSrc = source + equalLength; /* point to the first differing chars */ 8514 //pTarg = target + equalLength; 8515 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) || 8516 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))) 8517 { 8518 // We are stopped in the middle of a contraction. 8519 // Scan backwards through the == part of the string looking for the start of the contraction. 8520 // It doesn't matter which string we scan, since they are the same in this region. 
8521 do 8522 { 8523 equalLength--; 8524 pSrc--; 8525 } 8526 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); 8527 } 8528 8529 source += equalLength; 8530 target += equalLength; 8531 if (sourceLength > 0) { 8532 sourceLength -= equalLength; 8533 } 8534 if (targetLength > 0) { 8535 targetLength -= equalLength; 8536 } 8537 } 8538 8539 UErrorCode status = U_ZERO_ERROR; 8540 UCollationResult returnVal; 8541 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { 8542 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status); 8543 } else { 8544 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); 8545 } 8546 UTRACE_EXIT_VALUE(returnVal); 8547 return returnVal; 8548 } 8549 8550 /* convenience function for comparing strings */ 8551 U_CAPI UBool U_EXPORT2 8552 ucol_greater( const UCollator *coll, 8553 const UChar *source, 8554 int32_t sourceLength, 8555 const UChar *target, 8556 int32_t targetLength) 8557 { 8558 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8559 == UCOL_GREATER); 8560 } 8561 8562 /* convenience function for comparing strings */ 8563 U_CAPI UBool U_EXPORT2 8564 ucol_greaterOrEqual( const UCollator *coll, 8565 const UChar *source, 8566 int32_t sourceLength, 8567 const UChar *target, 8568 int32_t targetLength) 8569 { 8570 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8571 != UCOL_LESS); 8572 } 8573 8574 /* convenience function for comparing strings */ 8575 U_CAPI UBool U_EXPORT2 8576 ucol_equal( const UCollator *coll, 8577 const UChar *source, 8578 int32_t sourceLength, 8579 const UChar *target, 8580 int32_t targetLength) 8581 { 8582 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8583 == UCOL_EQUAL); 8584 } 8585 8586 U_CAPI void U_EXPORT2 8587 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { 8588 if(coll && coll->UCA) { 8589 uprv_memcpy(info, 
coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); 8590 } 8591 } 8592 8593 #endif /* #if !UCONFIG_NO_COLLATION */ 8594