1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucol.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * Modification history 12 * Date Name Comments 13 * 1996-1999 various members of ICU team maintained C API for collation framework 14 * 02/16/2001 synwee Added internal method getPrevSpecialCE 15 * 03/01/2001 synwee Added maxexpansion functionality. 16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_COLLATION 22 23 #include "unicode/coleitr.h" 24 #include "unicode/unorm.h" 25 #include "unicode/udata.h" 26 #include "unicode/ustring.h" 27 28 #include "ucol_imp.h" 29 #include "bocsu.h" 30 31 #include "normalizer2impl.h" 32 #include "unorm_it.h" 33 #include "umutex.h" 34 #include "cmemory.h" 35 #include "ucln_in.h" 36 #include "cstring.h" 37 #include "utracimp.h" 38 #include "putilimp.h" 39 #include "uassert.h" 40 41 #ifdef UCOL_DEBUG 42 #include <stdio.h> 43 #endif 44 45 U_NAMESPACE_USE 46 47 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 48 49 #define LAST_BYTE_MASK_ 0xFF 50 #define SECOND_LAST_BYTE_SHIFT_ 8 51 52 #define ZERO_CC_LIMIT_ 0xC0 53 54 // this is static pointer to the normalizer fcdTrieIndex 55 // it is always the same between calls to u_cleanup 56 // and therefore writing to it is not synchronized. 57 // It is cleaned in ucol_cleanup 58 static const uint16_t *fcdTrieIndex=NULL; 59 // Code points at fcdHighStart and above have a zero FCD value. 
60 static UChar32 fcdHighStart = 0; 61 62 // These are values from UCA required for 63 // implicit generation and supressing sort key compression 64 // they should regularly be in the UCA, but if one 65 // is running without UCA, it could be a problem 66 static const int32_t maxRegularPrimary = 0xA0; 67 static const int32_t minImplicitPrimary = 0xE0; 68 static const int32_t maxImplicitPrimary = 0xE4; 69 70 U_CDECL_BEGIN 71 static UBool U_CALLCONV 72 ucol_cleanup(void) 73 { 74 fcdTrieIndex = NULL; 75 return TRUE; 76 } 77 78 static int32_t U_CALLCONV 79 _getFoldingOffset(uint32_t data) { 80 return (int32_t)(data&0xFFFFFF); 81 } 82 83 U_CDECL_END 84 85 static 86 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, 87 int32_t sourceLen, collIterate *s, 88 UErrorCode *status) 89 { 90 (s)->string = (s)->pos = sourceString; 91 (s)->origFlags = 0; 92 (s)->flags = 0; 93 if (sourceLen >= 0) { 94 s->flags |= UCOL_ITER_HASLEN; 95 (s)->endp = (UChar *)sourceString+sourceLen; 96 } 97 else { 98 /* change to enable easier checking for end of string for fcdpositon */ 99 (s)->endp = NULL; 100 } 101 (s)->extendCEs = NULL; 102 (s)->extendCEsSize = 0; 103 (s)->CEpos = (s)->toReturn = (s)->CEs; 104 (s)->offsetBuffer = NULL; 105 (s)->offsetBufferSize = 0; 106 (s)->offsetReturn = (s)->offsetStore = NULL; 107 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; 108 (s)->coll = (collator); 109 (s)->nfd = Normalizer2Factory::getNFDInstance(*status); 110 (s)->fcdPosition = 0; 111 if(collator->normalizationMode == UCOL_ON) { 112 (s)->flags |= UCOL_ITER_NORM; 113 } 114 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { 115 (s)->flags |= UCOL_HIRAGANA_Q; 116 } 117 (s)->iterator = NULL; 118 //(s)->iteratorIndex = 0; 119 } 120 121 U_CAPI void U_EXPORT2 122 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, 123 int32_t sourceLen, collIterate *s, 124 UErrorCode *status) { 125 /* Out-of-line version for use from other 
files. */ 126 IInit_collIterate(collator, sourceString, sourceLen, s, status); 127 } 128 129 U_CAPI collIterate * U_EXPORT2 130 uprv_new_collIterate(UErrorCode *status) { 131 if(U_FAILURE(*status)) { 132 return NULL; 133 } 134 collIterate *s = new collIterate; 135 if(s == NULL) { 136 *status = U_MEMORY_ALLOCATION_ERROR; 137 return NULL; 138 } 139 return s; 140 } 141 142 U_CAPI void U_EXPORT2 143 uprv_delete_collIterate(collIterate *s) { 144 delete s; 145 } 146 147 U_CAPI UBool U_EXPORT2 148 uprv_collIterateAtEnd(collIterate *s) { 149 return s == NULL || s->pos == s->endp; 150 } 151 152 /** 153 * Backup the state of the collIterate struct data 154 * @param data collIterate to backup 155 * @param backup storage 156 */ 157 static 158 inline void backupState(const collIterate *data, collIterateState *backup) 159 { 160 backup->fcdPosition = data->fcdPosition; 161 backup->flags = data->flags; 162 backup->origFlags = data->origFlags; 163 backup->pos = data->pos; 164 backup->bufferaddress = data->writableBuffer.getBuffer(); 165 backup->buffersize = data->writableBuffer.length(); 166 backup->iteratorMove = 0; 167 backup->iteratorIndex = 0; 168 if(data->iterator != NULL) { 169 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); 170 backup->iteratorIndex = data->iterator->getState(data->iterator); 171 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE 172 if(backup->iteratorIndex == UITER_NO_STATE) { 173 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { 174 backup->iteratorMove++; 175 data->iterator->move(data->iterator, -1, UITER_CURRENT); 176 } 177 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 178 } 179 } 180 } 181 182 /** 183 * Loads the state into the collIterate struct data 184 * @param data collIterate to backup 185 * @param backup storage 186 * @param forwards boolean to indicate if forwards iteration is used, 187 * false indicates 
backwards iteration 188 */ 189 static 190 inline void loadState(collIterate *data, const collIterateState *backup, 191 UBool forwards) 192 { 193 UErrorCode status = U_ZERO_ERROR; 194 data->flags = backup->flags; 195 data->origFlags = backup->origFlags; 196 if(data->iterator != NULL) { 197 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); 198 data->iterator->setState(data->iterator, backup->iteratorIndex, &status); 199 if(backup->iteratorMove != 0) { 200 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 201 } 202 } 203 data->pos = backup->pos; 204 205 if ((data->flags & UCOL_ITER_INNORMBUF) && 206 data->writableBuffer.getBuffer() != backup->bufferaddress) { 207 /* 208 this is when a new buffer has been reallocated and we'll have to 209 calculate the new position. 210 note the new buffer has to contain the contents of the old buffer. 211 */ 212 if (forwards) { 213 data->pos = data->writableBuffer.getTerminatedBuffer() + 214 (data->pos - backup->bufferaddress); 215 } 216 else { 217 /* backwards direction */ 218 int32_t temp = backup->buffersize - 219 (int32_t)(data->pos - backup->bufferaddress); 220 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp); 221 } 222 } 223 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 224 /* 225 this is alittle tricky. 226 if we are initially not in the normalization buffer, even if we 227 normalize in the later stage, the data in the buffer will be 228 ignored, since we skip back up to the data string. 229 however if we are already in the normalization buffer, any 230 further normalization will pull data into the normalization 231 buffer and modify the fcdPosition. 232 since we are keeping the data in the buffer for use, the 233 fcdPosition can not be reverted back. 234 arrgghh.... 
235 */ 236 data->fcdPosition = backup->fcdPosition; 237 } 238 } 239 240 static UBool 241 reallocCEs(collIterate *data, int32_t newCapacity) { 242 uint32_t *oldCEs = data->extendCEs; 243 if(oldCEs == NULL) { 244 oldCEs = data->CEs; 245 } 246 int32_t length = data->CEpos - oldCEs; 247 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); 248 if(newCEs == NULL) { 249 return FALSE; 250 } 251 uprv_memcpy(newCEs, oldCEs, length * 4); 252 uprv_free(data->extendCEs); 253 data->extendCEs = newCEs; 254 data->extendCEsSize = newCapacity; 255 data->CEpos = newCEs + length; 256 return TRUE; 257 } 258 259 static UBool 260 increaseCEsCapacity(collIterate *data) { 261 int32_t oldCapacity; 262 if(data->extendCEs != NULL) { 263 oldCapacity = data->extendCEsSize; 264 } else { 265 oldCapacity = LENGTHOF(data->CEs); 266 } 267 return reallocCEs(data, 2 * oldCapacity); 268 } 269 270 static UBool 271 ensureCEsCapacity(collIterate *data, int32_t minCapacity) { 272 int32_t oldCapacity; 273 if(data->extendCEs != NULL) { 274 oldCapacity = data->extendCEsSize; 275 } else { 276 oldCapacity = LENGTHOF(data->CEs); 277 } 278 if(minCapacity <= oldCapacity) { 279 return TRUE; 280 } 281 oldCapacity *= 2; 282 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity); 283 } 284 285 /* 286 * collIter_eos() 287 * Checks for a collIterate being positioned at the end of 288 * its source string. 289 * 290 */ 291 static 292 inline UBool collIter_eos(collIterate *s) { 293 if(s->flags & UCOL_USE_ITERATOR) { 294 return !(s->iterator->hasNext(s->iterator)); 295 } 296 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { 297 // Null terminated string, but not at null, so not at end. 298 // Whether in main or normalization buffer doesn't matter. 299 return FALSE; 300 } 301 302 // String with length. Can't be in normalization buffer, which is always 303 // null termintated. 
304 if (s->flags & UCOL_ITER_HASLEN) { 305 return (s->pos == s->endp); 306 } 307 308 // We are at a null termination, could be either normalization buffer or main string. 309 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { 310 // At null at end of main string. 311 return TRUE; 312 } 313 314 // At null at end of normalization buffer. Need to check whether there there are 315 // any characters left in the main buffer. 316 if(s->origFlags & UCOL_USE_ITERATOR) { 317 return !(s->iterator->hasNext(s->iterator)); 318 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { 319 // Null terminated main string. fcdPosition is the 'return' position into main buf. 320 return (*s->fcdPosition == 0); 321 } 322 else { 323 // Main string with an end pointer. 324 return s->fcdPosition == s->endp; 325 } 326 } 327 328 /* 329 * collIter_bos() 330 * Checks for a collIterate being positioned at the start of 331 * its source string. 332 * 333 */ 334 static 335 inline UBool collIter_bos(collIterate *source) { 336 // if we're going backwards, we need to know whether there is more in the 337 // iterator, even if we are in the side buffer 338 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 339 return !source->iterator->hasPrevious(source->iterator); 340 } 341 if (source->pos <= source->string || 342 ((source->flags & UCOL_ITER_INNORMBUF) && 343 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { 344 return TRUE; 345 } 346 return FALSE; 347 } 348 349 /*static 350 inline UBool collIter_SimpleBos(collIterate *source) { 351 // if we're going backwards, we need to know whether there is more in the 352 // iterator, even if we are in the side buffer 353 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 354 return !source->iterator->hasPrevious(source->iterator); 355 } 356 if (source->pos == source->string) { 357 return TRUE; 358 } 359 return FALSE; 360 }*/ 361 //return (data->pos == data->string) || 362 363 364 
/****************************************************************************/ 365 /* Following are the open/close functions */ 366 /* */ 367 /****************************************************************************/ 368 369 static UCollator* 370 ucol_initFromBinary(const uint8_t *bin, int32_t length, 371 const UCollator *base, 372 UCollator *fillIn, 373 UErrorCode *status) 374 { 375 UCollator *result = fillIn; 376 if(U_FAILURE(*status)) { 377 return NULL; 378 } 379 /* 380 if(base == NULL) { 381 // we don't support null base yet 382 *status = U_ILLEGAL_ARGUMENT_ERROR; 383 return NULL; 384 } 385 */ 386 // We need these and we could be running without UCA 387 uprv_uca_initImplicitConstants(status); 388 UCATableHeader *colData = (UCATableHeader *)bin; 389 // do we want version check here? We're trying to figure out whether collators are compatible 390 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || 391 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || 392 colData->version[0] != UCOL_BUILDER_VERSION) 393 { 394 *status = U_COLLATOR_VERSION_MISMATCH; 395 return NULL; 396 } 397 else { 398 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { 399 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); 400 if(U_FAILURE(*status)){ 401 return NULL; 402 } 403 result->hasRealData = TRUE; 404 } 405 else { 406 if(base) { 407 result = ucol_initCollator(base->image, result, base, status); 408 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); 409 if(U_FAILURE(*status)){ 410 return NULL; 411 } 412 result->hasRealData = FALSE; 413 } 414 else { 415 *status = U_USELESS_COLLATOR_ERROR; 416 return NULL; 417 } 418 } 419 result->freeImageOnClose = FALSE; 420 } 421 result->actualLocale = NULL; 422 result->validLocale = NULL; 423 result->requestedLocale = NULL; 424 
result->rules = NULL; 425 result->rulesLength = 0; 426 result->freeRulesOnClose = FALSE; 427 result->ucaRules = NULL; 428 return result; 429 } 430 431 U_CAPI UCollator* U_EXPORT2 432 ucol_openBinary(const uint8_t *bin, int32_t length, 433 const UCollator *base, 434 UErrorCode *status) 435 { 436 return ucol_initFromBinary(bin, length, base, NULL, status); 437 } 438 439 U_CAPI int32_t U_EXPORT2 440 ucol_cloneBinary(const UCollator *coll, 441 uint8_t *buffer, int32_t capacity, 442 UErrorCode *status) 443 { 444 int32_t length = 0; 445 if(U_FAILURE(*status)) { 446 return length; 447 } 448 if(capacity < 0) { 449 *status = U_ILLEGAL_ARGUMENT_ERROR; 450 return length; 451 } 452 if(coll->hasRealData == TRUE) { 453 length = coll->image->size; 454 if(length <= capacity) { 455 uprv_memcpy(buffer, coll->image, length); 456 } else { 457 *status = U_BUFFER_OVERFLOW_ERROR; 458 } 459 } else { 460 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 461 if(length <= capacity) { 462 /* build the UCATableHeader with minimal entries */ 463 /* do not copy the header from the UCA file because its values are wrong! 
*/ 464 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 465 466 /* reset everything */ 467 uprv_memset(buffer, 0, length); 468 469 /* set the tailoring-specific values */ 470 UCATableHeader *myData = (UCATableHeader *)buffer; 471 myData->size = length; 472 473 /* offset for the options, the only part of the data that is present after the header */ 474 myData->options = sizeof(UCATableHeader); 475 476 /* need to always set the expansion value for an upper bound of the options */ 477 myData->expansion = myData->options + sizeof(UColOptionSet); 478 479 myData->magic = UCOL_HEADER_MAGIC; 480 myData->isBigEndian = U_IS_BIG_ENDIAN; 481 myData->charSetFamily = U_CHARSET_FAMILY; 482 483 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 484 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 485 486 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 487 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 488 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); 489 myData->jamoSpecial = coll->image->jamoSpecial; 490 491 /* copy the collator options */ 492 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 493 } else { 494 *status = U_BUFFER_OVERFLOW_ERROR; 495 } 496 } 497 return length; 498 } 499 500 U_CAPI UCollator* U_EXPORT2 501 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) 502 { 503 UCollator * localCollator; 504 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); 505 char *stackBufferChars = (char *)stackBuffer; 506 int32_t imageSize = 0; 507 int32_t rulesSize = 0; 508 int32_t rulesPadding = 0; 509 uint8_t *image; 510 UChar *rules; 511 UBool colAllocated = FALSE; 512 UBool imageAllocated = FALSE; 513 514 if (status == NULL || U_FAILURE(*status)){ 515 return 0; 516 } 517 if ((stackBuffer && !pBufferSize) 
|| !coll){ 518 *status = U_ILLEGAL_ARGUMENT_ERROR; 519 return 0; 520 } 521 if (coll->rules && coll->freeRulesOnClose) { 522 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); 523 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); 524 bufferSizeNeeded += rulesSize + rulesPadding; 525 } 526 527 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ 528 *pBufferSize = bufferSizeNeeded; 529 return 0; 530 } 531 532 /* Pointers on 64-bit platforms need to be aligned 533 * on a 64-bit boundry in memory. 534 */ 535 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 536 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); 537 if (*pBufferSize > offsetUp) { 538 *pBufferSize -= offsetUp; 539 stackBufferChars += offsetUp; 540 } 541 else { 542 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */ 543 *pBufferSize = 1; 544 } 545 } 546 stackBuffer = (void *)stackBufferChars; 547 548 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) { 549 /* allocate one here...*/ 550 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); 551 // Null pointer check. 
552 if (stackBufferChars == NULL) { 553 *status = U_MEMORY_ALLOCATION_ERROR; 554 return NULL; 555 } 556 colAllocated = TRUE; 557 if (U_SUCCESS(*status)) { 558 *status = U_SAFECLONE_ALLOCATED_WARNING; 559 } 560 } 561 localCollator = (UCollator *)stackBufferChars; 562 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); 563 { 564 UErrorCode tempStatus = U_ZERO_ERROR; 565 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); 566 } 567 if (coll->freeImageOnClose) { 568 image = (uint8_t *)uprv_malloc(imageSize); 569 // Null pointer check 570 if (image == NULL) { 571 *status = U_MEMORY_ALLOCATION_ERROR; 572 return NULL; 573 } 574 ucol_cloneBinary(coll, image, imageSize, status); 575 imageAllocated = TRUE; 576 } 577 else { 578 image = (uint8_t *)coll->image; 579 } 580 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); 581 if (U_FAILURE(*status)) { 582 return NULL; 583 } 584 585 if (coll->rules) { 586 if (coll->freeRulesOnClose) { 587 localCollator->rules = u_strcpy(rules, coll->rules); 588 //bufferEnd += rulesSize; 589 } 590 else { 591 localCollator->rules = coll->rules; 592 } 593 localCollator->freeRulesOnClose = FALSE; 594 localCollator->rulesLength = coll->rulesLength; 595 } 596 597 int32_t i; 598 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 599 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); 600 } 601 // zero copies of pointers 602 localCollator->actualLocale = NULL; 603 localCollator->validLocale = NULL; 604 localCollator->requestedLocale = NULL; 605 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. 
606 localCollator->freeOnClose = colAllocated; 607 localCollator->freeImageOnClose = imageAllocated; 608 return localCollator; 609 } 610 611 U_CAPI void U_EXPORT2 612 ucol_close(UCollator *coll) 613 { 614 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 615 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 616 if(coll != NULL) { 617 // these are always owned by each UCollator struct, 618 // so we always free them 619 if(coll->validLocale != NULL) { 620 uprv_free(coll->validLocale); 621 } 622 if(coll->actualLocale != NULL) { 623 uprv_free(coll->actualLocale); 624 } 625 if(coll->requestedLocale != NULL) { 626 uprv_free(coll->requestedLocale); 627 } 628 if(coll->latinOneCEs != NULL) { 629 uprv_free(coll->latinOneCEs); 630 } 631 if(coll->options != NULL && coll->freeOptionsOnClose) { 632 uprv_free(coll->options); 633 } 634 if(coll->rules != NULL && coll->freeRulesOnClose) { 635 uprv_free((UChar *)coll->rules); 636 } 637 if(coll->image != NULL && coll->freeImageOnClose) { 638 uprv_free((UCATableHeader *)coll->image); 639 } 640 641 /* Here, it would be advisable to close: */ 642 /* - UData for UCA (unless we stuff it in the root resb */ 643 /* Again, do we need additional housekeeping... HMMM! */ 644 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); 645 if(coll->freeOnClose){ 646 /* for safeClone, if freeOnClose is FALSE, 647 don't free the other instance data */ 648 uprv_free(coll); 649 } 650 } 651 UTRACE_EXIT(); 652 } 653 654 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/ 655 /* you should be able to get the binary chunk to write out... 
Doesn't look very full now */ 656 U_CFUNC uint8_t* U_EXPORT2 657 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) 658 { 659 uint8_t *result = NULL; 660 if(U_FAILURE(*status)) { 661 return NULL; 662 } 663 if(coll->hasRealData == TRUE) { 664 *length = coll->image->size; 665 result = (uint8_t *)uprv_malloc(*length); 666 /* test for NULL */ 667 if (result == NULL) { 668 *status = U_MEMORY_ALLOCATION_ERROR; 669 return NULL; 670 } 671 uprv_memcpy(result, coll->image, *length); 672 } else { 673 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 674 result = (uint8_t *)uprv_malloc(*length); 675 /* test for NULL */ 676 if (result == NULL) { 677 *status = U_MEMORY_ALLOCATION_ERROR; 678 return NULL; 679 } 680 681 /* build the UCATableHeader with minimal entries */ 682 /* do not copy the header from the UCA file because its values are wrong! */ 683 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 684 685 /* reset everything */ 686 uprv_memset(result, 0, *length); 687 688 /* set the tailoring-specific values */ 689 UCATableHeader *myData = (UCATableHeader *)result; 690 myData->size = *length; 691 692 /* offset for the options, the only part of the data that is present after the header */ 693 myData->options = sizeof(UCATableHeader); 694 695 /* need to always set the expansion value for an upper bound of the options */ 696 myData->expansion = myData->options + sizeof(UColOptionSet); 697 698 myData->magic = UCOL_HEADER_MAGIC; 699 myData->isBigEndian = U_IS_BIG_ENDIAN; 700 myData->charSetFamily = U_CHARSET_FAMILY; 701 702 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 703 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 704 705 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 706 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 707 uprv_memcpy(myData->formatVersion, 
coll->image->formatVersion, sizeof(UVersionInfo)); 708 myData->jamoSpecial = coll->image->jamoSpecial; 709 710 /* copy the collator options */ 711 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 712 } 713 return result; 714 } 715 716 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { 717 if(U_FAILURE(*status)) { 718 return; 719 } 720 result->caseFirst = (UColAttributeValue)opts->caseFirst; 721 result->caseLevel = (UColAttributeValue)opts->caseLevel; 722 result->frenchCollation = (UColAttributeValue)opts->frenchCollation; 723 result->normalizationMode = (UColAttributeValue)opts->normalizationMode; 724 result->strength = (UColAttributeValue)opts->strength; 725 result->variableTopValue = opts->variableTopValue; 726 result->alternateHandling = (UColAttributeValue)opts->alternateHandling; 727 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; 728 result->numericCollation = (UColAttributeValue)opts->numericCollation; 729 730 result->caseFirstisDefault = TRUE; 731 result->caseLevelisDefault = TRUE; 732 result->frenchCollationisDefault = TRUE; 733 result->normalizationModeisDefault = TRUE; 734 result->strengthisDefault = TRUE; 735 result->variableTopValueisDefault = TRUE; 736 result->hiraganaQisDefault = TRUE; 737 result->numericCollationisDefault = TRUE; 738 739 ucol_updateInternalState(result, status); 740 741 result->options = opts; 742 } 743 744 745 /** 746 * Approximate determination if a character is at a contraction end. 747 * Guaranteed to be TRUE if a character is at the end of a contraction, 748 * otherwise it is not deterministic. 
749 * @param c character to be determined 750 * @param coll collator 751 */ 752 static 753 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { 754 if (c < coll->minContrEndCP) { 755 return FALSE; 756 } 757 758 int32_t hash = c; 759 uint8_t htbyte; 760 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 761 if (U16_IS_TRAIL(c)) { 762 return TRUE; 763 } 764 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 765 } 766 htbyte = coll->contrEndCP[hash>>3]; 767 return (((htbyte >> (hash & 7)) & 1) == 1); 768 } 769 770 771 772 /* 773 * i_getCombiningClass() 774 * A fast, at least partly inline version of u_getCombiningClass() 775 * This is a candidate for further optimization. Used heavily 776 * in contraction processing. 777 */ 778 static 779 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { 780 uint8_t sCC = 0; 781 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { 782 sCC = u_getCombiningClass(c); 783 } 784 return sCC; 785 } 786 787 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { 788 UChar c; 789 UCollator *result = fillIn; 790 if(U_FAILURE(*status) || image == NULL) { 791 return NULL; 792 } 793 794 if(result == NULL) { 795 result = (UCollator *)uprv_malloc(sizeof(UCollator)); 796 if(result == NULL) { 797 *status = U_MEMORY_ALLOCATION_ERROR; 798 return result; 799 } 800 result->freeOnClose = TRUE; 801 } else { 802 result->freeOnClose = FALSE; 803 } 804 805 // init FCD data 806 if (fcdTrieIndex == NULL) { 807 // The result is constant, until the library is reloaded. 
808 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 809 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 810 } 811 812 result->image = image; 813 result->mapping.getFoldingOffset = _getFoldingOffset; 814 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; 815 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); 816 if(U_FAILURE(*status)) { 817 if(result->freeOnClose == TRUE) { 818 uprv_free(result); 819 result = NULL; 820 } 821 return result; 822 } 823 824 /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/ 825 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); 826 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); 827 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); 828 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); 829 830 result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options); 831 result->freeOptionsOnClose = FALSE; 832 833 /* set attributes */ 834 result->caseFirst = (UColAttributeValue)result->options->caseFirst; 835 result->caseLevel = (UColAttributeValue)result->options->caseLevel; 836 result->frenchCollation = (UColAttributeValue)result->options->frenchCollation; 837 result->normalizationMode = (UColAttributeValue)result->options->normalizationMode; 838 result->strength = (UColAttributeValue)result->options->strength; 839 result->variableTopValue = result->options->variableTopValue; 840 result->alternateHandling = (UColAttributeValue)result->options->alternateHandling; 841 result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ; 842 result->numericCollation = (UColAttributeValue)result->options->numericCollation; 843 844 result->caseFirstisDefault = TRUE; 845 result->caseLevelisDefault = TRUE; 846 
result->frenchCollationisDefault = TRUE; 847 result->normalizationModeisDefault = TRUE; 848 result->strengthisDefault = TRUE; 849 result->variableTopValueisDefault = TRUE; 850 result->alternateHandlingisDefault = TRUE; 851 result->hiraganaQisDefault = TRUE; 852 result->numericCollationisDefault = TRUE; 853 854 /*result->scriptOrder = NULL;*/ 855 856 result->rules = NULL; 857 result->rulesLength = 0; 858 result->freeRulesOnClose = FALSE; 859 860 /* get the version info from UCATableHeader and populate the Collator struct*/ 861 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ 862 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ 863 result->dataVersion[2] = 0; 864 result->dataVersion[3] = 0; 865 866 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; 867 result->minUnsafeCP = 0; 868 for (c=0; c<0x300; c++) { // Find the smallest unsafe char. 869 if (ucol_unsafeCP(c, result)) break; 870 } 871 result->minUnsafeCP = c; 872 873 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; 874 result->minContrEndCP = 0; 875 for (c=0; c<0x300; c++) { // Find the Contraction-ending char. 876 if (ucol_contractionEndCP(c, result)) break; 877 } 878 result->minContrEndCP = c; 879 880 /* max expansion tables */ 881 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + 882 result->image->endExpansionCE); 883 result->lastEndExpansionCE = result->endExpansionCE + 884 result->image->endExpansionCECount - 1; 885 result->expansionCESize = (uint8_t*)result->image + 886 result->image->expansionCESize; 887 888 889 //result->errorCode = *status; 890 891 result->latinOneCEs = NULL; 892 893 result->latinOneRegenTable = FALSE; 894 result->latinOneFailed = FALSE; 895 result->UCA = UCA; 896 897 ucol_updateInternalState(result, status); 898 899 /* Normally these will be set correctly later. This is the default if you use UCA or the default. 
*/ 900 result->ucaRules = NULL; 901 result->actualLocale = NULL; 902 result->validLocale = NULL; 903 result->requestedLocale = NULL; 904 result->hasRealData = FALSE; // real data lives in .dat file... 905 result->freeImageOnClose = FALSE; 906 907 return result; 908 } 909 910 /* new Mark's code */ 911 912 /** 913 * For generation of Implicit CEs 914 * @author Davis 915 * 916 * Cleaned up so that changes can be made more easily. 917 * Old values: 918 # First Implicit: E26A792D 919 # Last Implicit: E3DC70C0 920 # First CJK: E0030300 921 # Last CJK: E0A9DD00 922 # First CJK_A: E0A9DF00 923 # Last CJK_A: E0DE3100 924 */ 925 /* Following is a port of Mark's code for new treatment of implicits. 926 * It is positioned here, since ucol_initUCA need to initialize the 927 * variables below according to the data in the fractional UCA. 928 */ 929 930 /** 931 * Function used to: 932 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and 933 * b) bump any non-CJK characters by 10FFFF. 934 * The relevant blocks are: 935 * A: 4E00..9FFF; CJK Unified Ideographs 936 * F900..FAFF; CJK Compatibility Ideographs 937 * B: 3400..4DBF; CJK Unified Ideographs Extension A 938 * 20000..XX; CJK Unified Ideographs Extension B (and others later on) 939 * As long as 940 * no new B characters are allocated between 4E00 and FAFF, and 941 * no new A characters are outside of this range, 942 * (very high probability) this simple code will work. 943 * The reordered blocks are: 944 * Block1 is CJK 945 * Block2 is CJK_COMPAT_USED 946 * Block3 is CJK_A 947 * (all contiguous) 948 * Any other CJK gets its normal code point 949 * Any non-CJK gets +10FFFF 950 * When we reorder Block1, we make sure that it is at the very start, 951 * so that it will use a 3-byte form. 952 * Warning: the we only pick up the compatibility characters that are 953 * NOT decomposed, so that block is smaller! 
954 */ 955 956 // CONSTANTS 957 static const UChar32 958 NON_CJK_OFFSET = 0x110000, 959 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 960 961 /** 962 * Precomputed by initImplicitConstants() 963 */ 964 static int32_t 965 final3Multiplier = 0, 966 final4Multiplier = 0, 967 final3Count = 0, 968 final4Count = 0, 969 medialCount = 0, 970 min3Primary = 0, 971 min4Primary = 0, 972 max4Primary = 0, 973 minTrail = 0, 974 maxTrail = 0, 975 max3Trail = 0, 976 max4Trail = 0, 977 min4Boundary = 0; 978 979 static const UChar32 980 CJK_BASE = 0x4E00, 981 CJK_LIMIT = 0x9FFF+1, 982 CJK_COMPAT_USED_BASE = 0xFA0E, 983 CJK_COMPAT_USED_LIMIT = 0xFA2F+1, 984 CJK_A_BASE = 0x3400, 985 CJK_A_LIMIT = 0x4DBF+1, 986 CJK_B_BASE = 0x20000, 987 CJK_B_LIMIT = 0x2A6DF+1; 988 989 static UChar32 swapCJK(UChar32 i) { 990 991 if (i >= CJK_BASE) { 992 if (i < CJK_LIMIT) return i - CJK_BASE; 993 994 if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET; 995 996 if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE 997 + (CJK_LIMIT - CJK_BASE); 998 if (i < CJK_B_BASE) return i + NON_CJK_OFFSET; 999 1000 if (i < CJK_B_LIMIT) return i; // non-BMP-CJK 1001 1002 return i + NON_CJK_OFFSET; // non-CJK 1003 } 1004 if (i < CJK_A_BASE) return i + NON_CJK_OFFSET; 1005 1006 if (i < CJK_A_LIMIT) return i - CJK_A_BASE 1007 + (CJK_LIMIT - CJK_BASE) 1008 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1009 return i + NON_CJK_OFFSET; // non-CJK 1010 } 1011 1012 U_CAPI UChar32 U_EXPORT2 1013 uprv_uca_getRawFromCodePoint(UChar32 i) { 1014 return swapCJK(i)+1; 1015 } 1016 1017 U_CAPI UChar32 U_EXPORT2 1018 uprv_uca_getCodePointFromRaw(UChar32 i) { 1019 i--; 1020 UChar32 result = 0; 1021 if(i >= NON_CJK_OFFSET) { 1022 result = i - NON_CJK_OFFSET; 1023 } else if(i >= CJK_B_BASE) { 1024 result = i; 1025 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted 1026 if(i < CJK_LIMIT - CJK_BASE) { 1027 result = i + CJK_BASE; 1028 } else 
if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { 1029 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); 1030 } else { 1031 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1032 } 1033 } else { 1034 result = -1; 1035 } 1036 return result; 1037 } 1038 1039 // GET IMPLICIT PRIMARY WEIGHTS 1040 // Return value is left justified primary key 1041 U_CAPI uint32_t U_EXPORT2 1042 uprv_uca_getImplicitFromRaw(UChar32 cp) { 1043 /* 1044 if (cp < 0 || cp > UCOL_MAX_INPUT) { 1045 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); 1046 } 1047 */ 1048 int32_t last0 = cp - min4Boundary; 1049 if (last0 < 0) { 1050 int32_t last1 = cp / final3Count; 1051 last0 = cp % final3Count; 1052 1053 int32_t last2 = last1 / medialCount; 1054 last1 %= medialCount; 1055 1056 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start 1057 last1 = minTrail + last1; // offset 1058 last2 = min3Primary + last2; // offset 1059 /* 1060 if (last2 >= min4Primary) { 1061 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); 1062 } 1063 */ 1064 return (last2 << 24) + (last1 << 16) + (last0 << 8); 1065 } else { 1066 int32_t last1 = last0 / final4Count; 1067 last0 %= final4Count; 1068 1069 int32_t last2 = last1 / medialCount; 1070 last1 %= medialCount; 1071 1072 int32_t last3 = last2 / medialCount; 1073 last2 %= medialCount; 1074 1075 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start 1076 last1 = minTrail + last1; // offset 1077 last2 = minTrail + last2; // offset 1078 last3 = min4Primary + last3; // offset 1079 /* 1080 if (last3 > max4Primary) { 1081 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); 1082 } 1083 */ 1084 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; 1085 } 1086 } 1087 1088 static uint32_t U_EXPORT2 
1089 uprv_uca_getImplicitPrimary(UChar32 cp) { 1090 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); 1091 1092 cp = swapCJK(cp); 1093 cp++; 1094 // we now have a range of numbers from 0 to 21FFFF. 1095 1096 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); 1097 1098 return uprv_uca_getImplicitFromRaw(cp); 1099 } 1100 1101 /** 1102 * Converts implicit CE into raw integer ("code point") 1103 * @param implicit 1104 * @return -1 if illegal format 1105 */ 1106 U_CAPI UChar32 U_EXPORT2 1107 uprv_uca_getRawFromImplicit(uint32_t implicit) { 1108 UChar32 result; 1109 UChar32 b3 = implicit & 0xFF; 1110 UChar32 b2 = (implicit >> 8) & 0xFF; 1111 UChar32 b1 = (implicit >> 16) & 0xFF; 1112 UChar32 b0 = (implicit >> 24) & 0xFF; 1113 1114 // simple parameter checks 1115 if (b0 < min3Primary || b0 > max4Primary 1116 || b1 < minTrail || b1 > maxTrail) 1117 return -1; 1118 // normal offsets 1119 b1 -= minTrail; 1120 1121 // take care of the final values, and compose 1122 if (b0 < min4Primary) { 1123 if (b2 < minTrail || b2 > max3Trail || b3 != 0) 1124 return -1; 1125 b2 -= minTrail; 1126 UChar32 remainder = b2 % final3Multiplier; 1127 if (remainder != 0) 1128 return -1; 1129 b0 -= min3Primary; 1130 b2 /= final3Multiplier; 1131 result = ((b0 * medialCount) + b1) * final3Count + b2; 1132 } else { 1133 if (b2 < minTrail || b2 > maxTrail 1134 || b3 < minTrail || b3 > max4Trail) 1135 return -1; 1136 b2 -= minTrail; 1137 b3 -= minTrail; 1138 UChar32 remainder = b3 % final4Multiplier; 1139 if (remainder != 0) 1140 return -1; 1141 b3 /= final4Multiplier; 1142 b0 -= min4Primary; 1143 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; 1144 } 1145 // final check 1146 if (result < 0 || result > UCOL_MAX_INPUT) 1147 return -1; 1148 return result; 1149 } 1150 1151 1152 static inline int32_t divideAndRoundUp(int a, int b) { 1153 return 1 + (a-1)/b; 1154 } 1155 1156 /* this function is either called from initUCA or from genUCA 
before 1157 * doing canonical closure for the UCA. 1158 */ 1159 1160 /** 1161 * Set up to generate implicits. 1162 * Maintenance Note: this function may end up being called more than once, due 1163 * to threading races during initialization. Make sure that 1164 * none of the Constants is ever transiently assigned an 1165 * incorrect value. 1166 * @param minPrimary 1167 * @param maxPrimary 1168 * @param minTrail final byte 1169 * @param maxTrail final byte 1170 * @param gap3 the gap we leave for tailoring for 3-byte forms 1171 * @param gap4 the gap we leave for tailoring for 4-byte forms 1172 */ 1173 static void initImplicitConstants(int minPrimary, int maxPrimary, 1174 int minTrailIn, int maxTrailIn, 1175 int gap3, int primaries3count, 1176 UErrorCode *status) { 1177 // some simple parameter checks 1178 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) 1179 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) 1180 || (primaries3count < 1)) 1181 { 1182 *status = U_ILLEGAL_ARGUMENT_ERROR; 1183 return; 1184 }; 1185 1186 minTrail = minTrailIn; 1187 maxTrail = maxTrailIn; 1188 1189 min3Primary = minPrimary; 1190 max4Primary = maxPrimary; 1191 // compute constants for use later. 1192 // number of values we can use in trailing bytes 1193 // leave room for empty values between AND above, e.g. if gap = 2 1194 // range 3..7 => +3 -4 -5 -6 -7: so 1 value 1195 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values 1196 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values 1197 final3Multiplier = gap3 + 1; 1198 final3Count = (maxTrail - minTrail + 1) / final3Multiplier; 1199 max3Trail = minTrail + (final3Count - 1) * final3Multiplier; 1200 1201 // medials can use full range 1202 medialCount = (maxTrail - minTrail + 1); 1203 // find out how many values fit in each form 1204 int32_t threeByteCount = medialCount * final3Count; 1205 // now determine where the 3/4 boundary is. 
1206 // we use 3 bytes below the boundary, and 4 above 1207 int32_t primariesAvailable = maxPrimary - minPrimary + 1; 1208 int32_t primaries4count = primariesAvailable - primaries3count; 1209 1210 1211 int32_t min3ByteCoverage = primaries3count * threeByteCount; 1212 min4Primary = minPrimary + primaries3count; 1213 min4Boundary = min3ByteCoverage; 1214 // Now expand out the multiplier for the 4 bytes, and redo. 1215 1216 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; 1217 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); 1218 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); 1219 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; 1220 if (gap4 < 1) { 1221 *status = U_ILLEGAL_ARGUMENT_ERROR; 1222 return; 1223 } 1224 final4Multiplier = gap4 + 1; 1225 final4Count = neededPerFinalByte; 1226 max4Trail = minTrail + (final4Count - 1) * final4Multiplier; 1227 } 1228 1229 /** 1230 * Supply parameters for generating implicit CEs 1231 */ 1232 U_CAPI void U_EXPORT2 1233 uprv_uca_initImplicitConstants(UErrorCode *status) { 1234 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. 1235 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); 1236 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); 1237 } 1238 1239 1240 /* collIterNormalize Incremental Normalization happens here. */ 1241 /* pick up the range of chars identifed by FCD, */ 1242 /* normalize it into the collIterate's writable buffer, */ 1243 /* switch the collIterate's state to use the writable buffer. 
*/ 1244 /* */ 1245 static 1246 void collIterNormalize(collIterate *collationSource) 1247 { 1248 UErrorCode status = U_ZERO_ERROR; 1249 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ 1250 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ 1251 1252 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)), 1253 collationSource->writableBuffer, 1254 status); 1255 if (U_FAILURE(status)) { 1256 #ifdef UCOL_DEBUG 1257 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status)); 1258 #endif 1259 return; 1260 } 1261 1262 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer(); 1263 collationSource->origFlags = collationSource->flags; 1264 collationSource->flags |= UCOL_ITER_INNORMBUF; 1265 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1266 } 1267 1268 1269 // This function takes the iterator and extracts normalized stuff up to the next boundary 1270 // It is similar in the end results to the collIterNormalize, but for the cases when we 1271 // use an iterator 1272 /*static 1273 inline void normalizeIterator(collIterate *collationSource) { 1274 UErrorCode status = U_ZERO_ERROR; 1275 UBool wasNormalized = FALSE; 1276 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); 1277 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); 1278 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1279 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1280 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { 1281 // reallocate and terminate 1282 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, 1283 &collationSource->writableBuffer, 1284 (int32_t *)&collationSource->writableBufSize, normLen + 1, 1285 0) 
1286 ) { 1287 #ifdef UCOL_DEBUG 1288 fprintf(stderr, "normalizeIterator(), out of memory\n"); 1289 #endif 1290 return; 1291 } 1292 status = U_ZERO_ERROR; 1293 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); 1294 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); 1295 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1296 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1297 } 1298 // Terminate the buffer - we already checked that it is big enough 1299 collationSource->writableBuffer[normLen] = 0; 1300 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { 1301 collationSource->flags |= UCOL_ITER_ALLOCATED; 1302 } 1303 collationSource->pos = collationSource->writableBuffer; 1304 collationSource->origFlags = collationSource->flags; 1305 collationSource->flags |= UCOL_ITER_INNORMBUF; 1306 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1307 }*/ 1308 1309 1310 /* Incremental FCD check and normalize */ 1311 /* Called from getNextCE when normalization state is suspect. */ 1312 /* When entering, the state is known to be this: */ 1313 /* o We are working in the main buffer of the collIterate, not the side */ 1314 /* writable buffer. When in the side buffer, normalization mode is always off, */ 1315 /* so we won't get here. */ 1316 /* o The leading combining class from the current character is 0 or */ 1317 /* the trailing combining class of the previous char was zero. */ 1318 /* True because the previous call to this function will have always exited */ 1319 /* that way, and we get called for every char where cc might be non-zero. 
*/
/*   Scans forward from the character just read (pos - 1).  On return,                    */
/*   fcdPosition points to where the scan stopped.  The result is TRUE if some            */
/*   character in the scanned run has a leading combining class lower than the            */
/*   trailing combining class of the character before it.                                 */
static
inline UBool collIterFCD(collIterate *collationSource) {
    const UChar *srcP, *endP;
    uint8_t     leadingCC;
    uint8_t     prevTrailingCC = 0;
    uint16_t    fcd;
    UBool       needNormalize = FALSE;

    srcP = collationSource->pos-1;

    // A NULL end pointer is used when the string length is not known.
    if (collationSource->flags & UCOL_ITER_HASLEN) {
        endP = collationSource->endp;
    } else {
        endP = NULL;
    }

    // Get the trailing combining class of the current character.  If it's zero,
    //   we are OK.
    /* trie access */
    fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
    if (fcd != 0) {
        // low byte of the FCD value: trailing combining class
        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

        if (prevTrailingCC != 0) {
            // The current char has a non-zero trailing CC.  Scan forward until we find
            //   a char with a leading cc of zero.
            while (endP == NULL || srcP != endP)
            {
                const UChar *savedSrcP = srcP;

                /* trie access */
                fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
                // high byte of the FCD value: leading combining class
                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
                if (leadingCC == 0) {
                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
                                           //   back up over it.  (Could be surrogate pair!)
                    break;
                }

                // Combining classes out of order: the run is not FCD.
                if (leadingCC < prevTrailingCC) {
                    needNormalize = TRUE;
                }

                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
            }
        }
    }

    collationSource->fcdPosition = (UChar *)srcP;

    return needNormalize;
}

/****************************************************************************/
/* Following are the CE retrieval functions                                 */
/*                                                                          */
/****************************************************************************/

static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);

/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element. */
/* If it's not successful or it encounters a more difficult situation, */
/* some more sophisticated and slower functions are invoked.           */
/*                                                                     */
/* Order of work:                                                      */
/*   1. CEs still buffered between toReturn and CEpos are handed out   */
/*      first.                                                         */
/*   2. Otherwise the next UChar is fetched from the string, the       */
/*      iterator or the normalization side buffer; when UCOL_ITER_NORM */
/*      is set, an incremental FCD check / normalization may switch    */
/*      the source to the side buffer first.                           */
/*   3. The UChar is looked up: latinOneMapping for ch <= 0xFF, else   */
/*      the collator's trie, then coll->UCA's trie, and finally        */
/*      getImplicit() if the result is still UCOL_NOT_FOUND.           */
/* Returns UCOL_NO_MORE_CES at the end of the text.                    */
static
inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    uint32_t order = 0;
    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
        order = *(collationSource->toReturn++);                         /* if so, return them */
        if(collationSource->CEpos == collationSource->toReturn) {
            /* buffer drained: reset both pointers to the start of the buffer in use */
            collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
        }
        return order;
    }

    UChar ch = 0;
    collationSource->offsetReturn = NULL;

    for (;;)                           /* Loop handles case when incremental normalize switches   */
    {                                  /*   to or from the side buffer / original string, and we  */
                                       /*   need to start again to get the next character.        */

        if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
        {
            // The source string is null terminated and we're not working from the side buffer,
            //   and we're not normalizing.  This is the fast path.
            //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
            ch = *collationSource->pos++;
            if (ch != 0) {
                break;
            }
            else {
                return UCOL_NO_MORE_CES;
            }
        }

        if (collationSource->flags & UCOL_ITER_HASLEN) {
            // Normal path for strings when length is specified.
            //   (We can't be in side buffer because it is always null terminated.)
            if (collationSource->pos >= collationSource->endp) {
                // Ran off of the end of the main source string.  We're done.
                return UCOL_NO_MORE_CES;
            }
            ch = *collationSource->pos++;
        }
        else if(collationSource->flags & UCOL_USE_ITERATOR) {
            UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
            if(iterCh == U_SENTINEL) {
                return UCOL_NO_MORE_CES;
            }
            ch = (UChar)iterCh;
        }
        else
        {
            // Null terminated string.
            ch = *collationSource->pos++;
            if (ch == 0) {
                // Ran off end of buffer.
                if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                    // Ran off end of main string. backing up one character.
                    collationSource->pos--;
                    return UCOL_NO_MORE_CES;
                }
                else
                {
                    // Hit null in the normalize side buffer.
                    // Usually this means the end of the normalized data,
                    // except for one odd case: a null followed by combining chars,
                    //   which is the case if we are at the start of the buffer.
                    if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
                        break;
                    }

                    //  Null marked end of side buffer.
                    //   Revert to the main string and
                    //   loop back to top to try again to get a character.
                    collationSource->pos   = collationSource->fcdPosition;
                    collationSource->flags = collationSource->origFlags;
                    continue;
                }
            }
        }

        if(collationSource->flags&UCOL_HIRAGANA_Q) {
            /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
             * based on whether the previous codepoint was Hiragana or Katakana.
             */
            if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
                    ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
                collationSource->flags |= UCOL_WAS_HIRAGANA;
            } else {
                collationSource->flags &= ~UCOL_WAS_HIRAGANA;
            }
        }

        // We've got a character.  See if there's any fcd and/or normalization stuff to do.
        //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
        if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
            break;
        }

        if (collationSource->fcdPosition >= collationSource->pos) {
            // An earlier FCD check has already covered the current character.
            // We can go ahead and process this char.
            break;
        }

        if (ch < ZERO_CC_LIMIT_ ) {
            // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
            break;
        }

        if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
            // We need to peek at the next character in order to tell if we are FCD
            if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
                // We are at the last char of source string.
                //  It is always OK for FCD check.
                break;
            }

            // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
            if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
                break;
            }
        }


        // Need a more complete FCD check and possible normalization.
        if (collIterFCD(collationSource)) {
            collIterNormalize(collationSource);
        }
        if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
            //  No normalization was needed.  Go ahead and process the char we already had.
            break;
        }

        // Some normalization happened.  Next loop iteration will pick up a char
        //   from the normalization buffer.

    }   // end for (;;)


    if (ch <= 0xFF) {
        /*  For latin-1 characters we never need to fall back to the UCA table        */
        /*    because all of the UCA data is replicated in the latinOneMapping array  */
        order = coll->latinOneMapping[ch];
        if (order > UCOL_NOT_FOUND) {
            /* values above UCOL_NOT_FOUND are special CEs that need further processing */
            order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
        }
    }
    else
    {
        // Always use UCA for Han, Hangul
        // (Han extension A is before main Han block)
        // **** Han compatibility chars ?? ****
        if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
            (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
            if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
                // between the two target ranges; do normal lookup
                // **** this range is YI, Modifier tone letters, ****
                // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                // **** Latin-D might be tailored, so we need to ****
                // **** do the normal lookup for these guys.     ****
                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            } else {
                // in one of the target ranges; use UCA
                order = UCOL_NOT_FOUND;
            }
        } else {
            order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
        }

        if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
            order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
        }

        if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
            /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
            order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

            if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
                order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
            }
        }
    }
    if(order == UCOL_NOT_FOUND) {
        /* still nothing: fall back to an implicit CE for this code unit */
        order = getImplicit(ch, collationSource);
    }
    return order; /* return the CE */
}

/* ucol_getNextCE, out-of-line version for use from other files.   */
/* Simply forwards to the inline ucol_IGetNextCE().                */
U_CAPI uint32_t  U_EXPORT2
ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    return ucol_IGetNextCE(coll, collationSource, status);
}


/**
* Incremental previous normalization happens here. Pick up the range of chars
* identified by FCD, normalize it into the collIterate's writable buffer,
* switch the collIterate's state to use the writable buffer.
1583 * @param data collation iterator data 1584 */ 1585 static 1586 void collPrevIterNormalize(collIterate *data) 1587 { 1588 UErrorCode status = U_ZERO_ERROR; 1589 const UChar *pEnd = data->pos; /* End normalize + 1 */ 1590 const UChar *pStart; 1591 1592 /* Start normalize */ 1593 if (data->fcdPosition == NULL) { 1594 pStart = data->string; 1595 } 1596 else { 1597 pStart = data->fcdPosition + 1; 1598 } 1599 1600 int32_t normLen = 1601 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)), 1602 data->writableBuffer, 1603 status). 1604 length(); 1605 if(U_FAILURE(status)) { 1606 return; 1607 } 1608 /* 1609 this puts the null termination infront of the normalized string instead 1610 of the end 1611 */ 1612 data->writableBuffer.insert(0, (UChar)0); 1613 1614 if (data->offsetBuffer == NULL) { 1615 int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE; 1616 1617 data->offsetBufferSize = len; 1618 data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len); 1619 data->offsetStore = data->offsetBuffer; 1620 } else if(data->offsetBufferSize < normLen) { 1621 int32_t storeIX = (int32_t)(data->offsetStore - data->offsetBuffer); 1622 int32_t *tob = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1)); 1623 1624 if (tob != NULL) { 1625 data->offsetBuffer = tob; 1626 data->offsetStore = &data->offsetBuffer[storeIX]; 1627 data->offsetBufferSize = normLen + 1; 1628 } 1629 } 1630 1631 /* 1632 * The usual case at this point is that we've got a base 1633 * character followed by marks that were normalized. If 1634 * fcdPosition is NULL, that means that we backed up to 1635 * the beginning of the string and there's no base character. 1636 * 1637 * Forward processing will usually normalize when it sees 1638 * the first mark, so that mark will get it's natural offset 1639 * and the rest will get the offset of the character following 1640 * the marks. 
The base character will also get its natural offset. 1641 * 1642 * We write the offset of the base character, if there is one, 1643 * followed by the offset of the first mark and then the offsets 1644 * of the rest of the marks. 1645 */ 1646 int32_t firstMarkOffset = 0; 1647 int32_t trailOffset = (int32_t)(data->pos - data->string + 1); 1648 int32_t trailCount = normLen - 1; 1649 1650 if (data->fcdPosition != NULL) { 1651 int32_t baseOffset = (int32_t)(data->fcdPosition - data->string); 1652 UChar baseChar = *data->fcdPosition; 1653 1654 firstMarkOffset = baseOffset + 1; 1655 1656 /* 1657 * If the base character is the start of a contraction, forward processing 1658 * will normalize the marks while checking for the contraction, which means 1659 * that the offset of the first mark will the same as the other marks. 1660 * 1661 * **** THIS IS PROBABLY NOT A COMPLETE TEST **** 1662 */ 1663 if (baseChar >= 0x100) { 1664 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar); 1665 1666 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { 1667 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar); 1668 } 1669 1670 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) { 1671 firstMarkOffset = trailOffset; 1672 } 1673 } 1674 1675 *(data->offsetStore++) = baseOffset; 1676 } 1677 1678 *(data->offsetStore++) = firstMarkOffset; 1679 1680 for (int32_t i = 0; i < trailCount; i += 1) { 1681 *(data->offsetStore++) = trailOffset; 1682 } 1683 1684 data->offsetRepeatValue = trailOffset; 1685 1686 data->offsetReturn = data->offsetStore - 1; 1687 if (data->offsetReturn == data->offsetBuffer) { 1688 data->offsetStore = data->offsetBuffer; 1689 } 1690 1691 data->pos = data->writableBuffer.getTerminatedBuffer() + 1 + normLen; 1692 data->origFlags = data->flags; 1693 data->flags |= UCOL_ITER_INNORMBUF; 1694 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 1695 } 1696 1697 1698 /** 1699 * Incremental FCD check for previous 
iteration and normalize. Called from 1700 * getPrevCE when normalization state is suspect. 1701 * When entering, the state is known to be this: 1702 * o We are working in the main buffer of the collIterate, not the side 1703 * writable buffer. When in the side buffer, normalization mode is always 1704 * off, so we won't get here. 1705 * o The leading combining class from the current character is 0 or the 1706 * trailing combining class of the previous char was zero. 1707 * True because the previous call to this function will have always exited 1708 * that way, and we get called for every char where cc might be non-zero. 1709 * @param data collation iterate struct 1710 * @return normalization status, TRUE for normalization to be done, FALSE 1711 * otherwise 1712 */ 1713 static 1714 inline UBool collPrevIterFCD(collIterate *data) 1715 { 1716 const UChar *src, *start; 1717 uint8_t leadingCC; 1718 uint8_t trailingCC = 0; 1719 uint16_t fcd; 1720 UBool result = FALSE; 1721 1722 start = data->string; 1723 src = data->pos + 1; 1724 1725 /* Get the trailing combining class of the current character. */ 1726 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); 1727 1728 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1729 1730 if (leadingCC != 0) { 1731 /* 1732 The current char has a non-zero leading combining class. 1733 Scan backward until we find a char with a trailing cc of zero. 
*/
        for (;;)
        {
            if (start == src) {
                data->fcdPosition = NULL;
                return result;
            }

            fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);

            trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

            if (trailingCC == 0) {
                break;
            }

            if (leadingCC < trailingCC) {
                result = TRUE;
            }

            leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
        }
    }

    data->fcdPosition = (UChar *)src;

    return result;
}

/** Gets a code unit from the source at a given offset from the current position.
 *  Handles both the plain-string case (pos != NULL) and the character-iterator
 *  case (pos == NULL, iterator != NULL).
 *  No error checking - caller beware! Nothing here verifies that pos + offset
 *  stays inside the string.
 *  @param source collation iterator data
 *  @param offset offset, in code units, relative to the current position
 *  @return the code unit found there, or (UChar)U_SENTINEL when neither a
 *          string position nor an iterator is available
 */
inline static
UChar peekCharacter(collIterate *source, int32_t offset) {
    if(source->pos != NULL) {
        return *(source->pos + offset);
    } else if(source->iterator != NULL) {
        if(offset != 0) {
            // Move to the requested place, read one unit with next(), then undo
            // both the move and the single step taken by next().
            source->iterator->move(source->iterator, offset, UITER_CURRENT);
            UChar toReturn = (UChar)source->iterator->next(source->iterator);
            source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
            return toReturn;
        } else {
            return (UChar)source->iterator->current(source->iterator);
        }
    } else {
        return (UChar)U_SENTINEL;
    }
}

/**
* Determines if we are at the start of the data string in the backwards
* collation iterator.
* When iterating through a character iterator (pos == NULL) the answer comes
* from hasPrevious(). Otherwise we are at the start either when pos is at the
* beginning of the string, or when we are in the normalization buffer, sitting
* right after its leading NUL, with no fcdPosition left to return to.
* @param data collation iterator
* @return TRUE if we are at the start
*/
static
inline UBool isAtStartPrevIterate(collIterate *data) {
    if(data->pos == NULL && data->iterator != NULL) {
        return !data->iterator->hasPrevious(data->iterator);
    }
    //return (collIter_bos(data)) ||
    return (data->pos == data->string) ||
              ((data->flags & UCOL_ITER_INNORMBUF) &&
              *(data->pos - 1) == 0 && data->fcdPosition == NULL);
}

/**
* Steps the collation iterator back by one code unit.
* The character iterator is moved only when it exists and UCOL_USE_ITERATOR is
* set; pos is decremented whenever it is non-NULL. No bounds are checked.
* @param data collation iterator
*/
static
inline void goBackOne(collIterate *data) {
# if 0
    // somehow, it looks like we need to keep iterator synced up
    // at all times, as above.
    if(data->pos) {
        data->pos--;
    }
    if(data->iterator) {
        data->iterator->previous(data->iterator);
    }
#endif
    if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
        data->iterator->previous(data->iterator);
    }
    if(data->pos) {
        data->pos --;
    }
}

/**
* Inline function that returns the previous collation element (CE), moving
* backwards through the source.
* It first checks the expansion buffer: if toReturn is still above the start
* of the active CE buffer (extendCEs when set, otherwise CEs), the CE just
* below toReturn is returned and toReturn is decremented.
* Otherwise it fetches the previous code unit (from the length-delimited
* string, the character iterator, or the normalization side buffer), runs the
* backwards FCD check and normalization when needed, and looks the code unit
* up in the collator's tables. Special CEs and code units that may end a
* contraction are handed to ucol_prv_getSpecialPrevCE; anything still not
* found falls back to the UCA table and finally to getPrevImplicit.
* @param coll collator data
* @param data collation iterator struct
* @param status error status
* @return the previous CE, or UCOL_NO_MORE_CES once the start of the source
*         has been reached
*/
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /*
    Keep the offset bookkeeping in step with the CE about to be handed out:
    use up one pending repeat, or step offsetReturn back one slot, clearing it
    once the start of offsetBuffer has been reached.
    */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
            data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                data->offsetReturn = NULL;
                data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* CEs are still buffered: return the one just below toReturn. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        /* Buffer drained: reset the write position to its start as well. */
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        /*
        Loop handles case when incremental normalize switches to or from the
        side buffer / original string, and we need to start again to get the
        next character.
        */
        for (;;) {
            if (data->flags & UCOL_ITER_HASLEN) {
                /*
                Normal path for strings when length is specified.
                Not in side buffer because it is always null terminated.
                */
                if (data->pos <= data->string) {
                    /* End of the main source string */
                    return UCOL_NO_MORE_CES;
                }
                data->pos --;
                ch = *data->pos;
            }
            // we are using an iterator to go back. Pray for us!
            else if (data->flags & UCOL_USE_ITERATOR) {
                UChar32 iterCh = data->iterator->previous(data->iterator);
                if(iterCh == U_SENTINEL) {
                    return UCOL_NO_MORE_CES;
                } else {
                    ch = (UChar)iterCh;
                }
            }
            else {
                data->pos --;
                ch = *data->pos;
                /* we are in the side buffer. */
                if (ch == 0) {
                    /*
                    At the start of the normalize side buffer.
                    Go back to string.
                    Because pointer points to the last accessed character,
                    hence we have to increment it by one here.
                    */
                    data->flags = data->origFlags;
                    data->offsetRepeatValue = 0;

                    if (data->fcdPosition == NULL) {
                        data->pos = data->string;
                        return UCOL_NO_MORE_CES;
                    }
                    else {
                        data->pos   = data->fcdPosition + 1;
                    }

                    continue;
                }
            }

            /* Record whether the code unit just read lies in U+3040..U+309F. */
            if(data->flags&UCOL_HIRAGANA_Q) {
                if(ch>=0x3040 && ch<=0x309f) {
                    data->flags |= UCOL_WAS_HIRAGANA;
                } else {
                    data->flags &= ~UCOL_WAS_HIRAGANA;
                }
            }

            /*
            * We have a character; decide whether FCD checking and/or
            * normalization is needed. No further work is needed when:
            *  - the character is below ZERO_CC_LIMIT_,
            *  - normalization is off for this iterator,
            *  - the position is already covered by fcdPosition, or
            *  - we are at the start of the string.
            * Note if pos is in the writable buffer, norm is always 0
            */
            if (ch < ZERO_CC_LIMIT_ ||
              // this should propel us out of the loop in the iterator case
                (data->flags & UCOL_ITER_NORM) == 0 ||
                (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                || data->string == data->pos) {
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                /* if next character is FCD */
                if (data->pos == data->string) {
                    /* First char of string is always OK for FCD check */
                    break;
                }

                /* Not first char of string, do the FCD fast test */
                if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }

            /* Need a more complete FCD check and possible normalization. */
            if (collPrevIterFCD(data)) {
                collPrevIterNormalize(data);
            }

            if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /*  No normalization. Go ahead and process the char. */
                break;
            }

            /*
            Some normalization happened.
            Next loop picks up a char from the normalization buffer.
            */
        }

        /* attempt to handle contractions, after removal of the backwards
        contraction
        */
        if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
            result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
        } else {
            if (ch <= 0xFF) {
                /* Latin-1 code units use the direct lookup table. */
                result = coll->latinOneMapping[ch];
            }
            else {
                // Always use UCA for [3400..9FFF], [AC00..D7AF]
                // **** [FA0E..FA2F] ?? ****
                if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                    (ch >= 0x3400 && ch <= 0xD7AF)) {
                    if (ch > 0x9FFF && ch < 0xAC00) {
                        // between the two target ranges; do normal lookup
                        // **** this range is YI, Modifier tone letters, ****
                        // **** Latin-D, Syloti Nagari, Phags-pa.        ****
                        // **** Latin-D might be tailored, so we need to ****
                        // **** do the normal lookup for these guys.     ****
                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    } else {
                        result = UCOL_NOT_FOUND;
                    }
                } else {
                    result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                }
            }
            /* Values above UCOL_NOT_FOUND are special CEs that need resolving. */
            if (result > UCOL_NOT_FOUND) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
            }
            if (result == UCOL_NOT_FOUND) { // Not found in master list
                if (!isAtStartPrevIterate(data) &&
                    ucol_contractionEndCP(ch, data->coll))
                {
                    result = UCOL_CONTRACTION;
                } else {
                    /* Fall back to the UCA table when one is attached. */
                    if(coll->UCA) {
                        result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                    }
                }

                if (result > UCOL_NOT_FOUND) {
                    if(coll->UCA) {
                        result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                    }
                }
            }
        }

        /* Still nothing: synthesize an implicit CE for this code unit. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}


/*   ucol_getPrevCE, out-of-line version of ucol_IGetPrevCE for use from other files.  */
U_CFUNC uint32_t  U_EXPORT2
ucol_getPrevCE(const UCollator *coll, collIterate *data,
                        UErrorCode *status) {
    return ucol_IGetPrevCE(coll, data, status);
}


/* this should be connected to special Jamo handling */
/* Returns the first CE produced for the single code unit u: a temporary
   collIterate is initialized over a one-unit string and advanced once.
   Returns 0 if *status reports a failure after initialization. */
U_CFUNC uint32_t  U_EXPORT2
ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
    collIterate colIt;
    IInit_collIterate(coll, &u, 1, &colIt, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    return ucol_IGetNextCE(coll, &colIt, status);
}

/**
* Inserts the argument character into the end of the buffer pushing back the
* null terminator.
2051 * @param data collIterate struct data 2052 * @param ch character to be appended 2053 * @return the position of the new addition 2054 */ 2055 static 2056 inline const UChar * insertBufferEnd(collIterate *data, UChar ch) 2057 { 2058 int32_t oldLength = data->writableBuffer.length(); 2059 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; 2060 } 2061 2062 /** 2063 * Inserts the argument string into the end of the buffer pushing back the 2064 * null terminator. 2065 * @param data collIterate struct data 2066 * @param string to be appended 2067 * @param length of the string to be appended 2068 * @return the position of the new addition 2069 */ 2070 static 2071 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length) 2072 { 2073 int32_t oldLength = data->writableBuffer.length(); 2074 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength; 2075 } 2076 2077 /** 2078 * Special normalization function for contraction in the forwards iterator. 2079 * This normalization sequence will place the current character at source->pos 2080 * and its following normalized sequence into the buffer. 2081 * The fcd position, pos will be changed. 2082 * pos will now point to positions in the buffer. 2083 * Flags will be changed accordingly. 
2084 * @param data collation iterator data 2085 */ 2086 static 2087 inline void normalizeNextContraction(collIterate *data) 2088 { 2089 int32_t strsize; 2090 UErrorCode status = U_ZERO_ERROR; 2091 /* because the pointer points to the next character */ 2092 const UChar *pStart = data->pos - 1; 2093 const UChar *pEnd; 2094 2095 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 2096 data->writableBuffer.setTo(*(pStart - 1)); 2097 strsize = 1; 2098 } 2099 else { 2100 strsize = data->writableBuffer.length(); 2101 } 2102 2103 pEnd = data->fcdPosition; 2104 2105 data->writableBuffer.append( 2106 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status)); 2107 if(U_FAILURE(status)) { 2108 return; 2109 } 2110 2111 data->pos = data->writableBuffer.getTerminatedBuffer() + strsize; 2112 data->origFlags = data->flags; 2113 data->flags |= UCOL_ITER_INNORMBUF; 2114 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2115 } 2116 2117 /** 2118 * Contraction character management function that returns the next character 2119 * for the forwards iterator. 2120 * Does nothing if the next character is in buffer and not the first character 2121 * in it. 2122 * Else it checks next character in data string to see if it is normalizable. 2123 * If it is not, the character is simply copied into the buffer, else 2124 * the whole normalized substring is copied into the buffer, including the 2125 * current character. 2126 * @param data collation element iterator data 2127 * @return next character 2128 */ 2129 static 2130 inline UChar getNextNormalizedChar(collIterate *data) 2131 { 2132 UChar nextch; 2133 UChar ch; 2134 // Here we need to add the iterator code. One problem is the way 2135 // end of string is handled. If we just return next char, it could 2136 // be the sentinel. Most of the cases already check for this, but we 2137 // need to be sure. 2138 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { 2139 /* if no normalization and not in buffer. 
*/ 2140 if(data->flags & UCOL_USE_ITERATOR) { 2141 return (UChar)data->iterator->next(data->iterator); 2142 } else { 2143 return *(data->pos ++); 2144 } 2145 } 2146 2147 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { 2148 //normalizeIterator(data); 2149 //} 2150 2151 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); 2152 if ((innormbuf && *data->pos != 0) || 2153 (data->fcdPosition != NULL && !innormbuf && 2154 data->pos < data->fcdPosition)) { 2155 /* 2156 if next character is in normalized buffer, no further normalization 2157 is required 2158 */ 2159 return *(data->pos ++); 2160 } 2161 2162 if (data->flags & UCOL_ITER_HASLEN) { 2163 /* in data string */ 2164 if (data->pos + 1 == data->endp) { 2165 return *(data->pos ++); 2166 } 2167 } 2168 else { 2169 if (innormbuf) { 2170 // inside the normalization buffer, but at the end 2171 // (since we encountered zero). This means, in the 2172 // case we're using char iterator, that we need to 2173 // do another round of normalization. 2174 //if(data->origFlags & UCOL_USE_ITERATOR) { 2175 // we need to restore original flags, 2176 // otherwise, we'll lose them 2177 //data->flags = data->origFlags; 2178 //normalizeIterator(data); 2179 //return *(data->pos++); 2180 //} else { 2181 /* 2182 in writable buffer, at this point fcdPosition can not be 2183 pointing to the end of the data string. see contracting tag. 2184 */ 2185 if(data->fcdPosition) { 2186 if (*(data->fcdPosition + 1) == 0 || 2187 data->fcdPosition + 1 == data->endp) { 2188 /* at the end of the string, dump it into the normalizer */ 2189 data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1; 2190 // Check if data->pos received a null pointer 2191 if (data->pos == NULL) { 2192 return (UChar)-1; // Return to indicate error. 2193 } 2194 return *(data->fcdPosition ++); 2195 } 2196 data->pos = data->fcdPosition; 2197 } else if(data->origFlags & UCOL_USE_ITERATOR) { 2198 // if we are here, we're using a normalizing iterator. 
2199 // we should just continue further. 2200 data->flags = data->origFlags; 2201 data->pos = NULL; 2202 return (UChar)data->iterator->next(data->iterator); 2203 } 2204 //} 2205 } 2206 else { 2207 if (*(data->pos + 1) == 0) { 2208 return *(data->pos ++); 2209 } 2210 } 2211 } 2212 2213 ch = *data->pos ++; 2214 nextch = *data->pos; 2215 2216 /* 2217 * if the current character is not fcd. 2218 * Trailing combining class == 0. 2219 */ 2220 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && 2221 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || 2222 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { 2223 /* 2224 Need a more complete FCD check and possible normalization. 2225 normalize substring will be appended to buffer 2226 */ 2227 if (collIterFCD(data)) { 2228 normalizeNextContraction(data); 2229 return *(data->pos ++); 2230 } 2231 else if (innormbuf) { 2232 /* fcdposition shifted even when there's no normalization, if we 2233 don't input the rest into this, we'll get the wrong position when 2234 we reach the end of the writableBuffer */ 2235 int32_t length = (int32_t)(data->fcdPosition - data->pos + 1); 2236 data->pos = insertBufferEnd(data, data->pos - 1, length); 2237 // Check if data->pos received a null pointer 2238 if (data->pos == NULL) { 2239 return (UChar)-1; // Return to indicate error. 2240 } 2241 return *(data->pos ++); 2242 } 2243 } 2244 2245 if (innormbuf) { 2246 /* 2247 no normalization is to be done hence only one character will be 2248 appended to the buffer. 2249 */ 2250 data->pos = insertBufferEnd(data, ch) + 1; 2251 // Check if data->pos received a null pointer 2252 if (data->pos == NULL) { 2253 return (UChar)-1; // Return to indicate error. 
2254 } 2255 } 2256 2257 /* points back to the pos in string */ 2258 return ch; 2259 } 2260 2261 2262 2263 /** 2264 * Function to copy the buffer into writableBuffer and sets the fcd position to 2265 * the correct position 2266 * @param source data string source 2267 * @param buffer character buffer 2268 */ 2269 static 2270 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer) 2271 { 2272 /* okay confusing part here. to ensure that the skipped characters are 2273 considered later, we need to place it in the appropriate position in the 2274 normalization buffer and reassign the pos pointer. simple case if pos 2275 reside in string, simply copy to normalization buffer and 2276 fcdposition = pos, pos = start of normalization buffer. if pos in 2277 normalization buffer, we'll insert the copy infront of pos and point pos 2278 to the start of the normalization buffer. why am i doing these copies? 2279 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does 2280 not require any changes, which be really painful. */ 2281 if (source->flags & UCOL_ITER_INNORMBUF) { 2282 int32_t replaceLength = source->pos - source->writableBuffer.getBuffer(); 2283 source->writableBuffer.replace(0, replaceLength, buffer); 2284 } 2285 else { 2286 source->fcdPosition = source->pos; 2287 source->origFlags = source->flags; 2288 source->flags |= UCOL_ITER_INNORMBUF; 2289 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 2290 source->writableBuffer = buffer; 2291 } 2292 2293 source->pos = source->writableBuffer.getTerminatedBuffer(); 2294 } 2295 2296 /** 2297 * Function to get the discontiguos collation element within the source. 2298 * Note this function will set the position to the appropriate places. 
* Characters that are read but do not extend the contraction are collected in
* a local buffer and pushed back through setDiscontiguosAttribute when a
* match is returned; when no match is found the iterator state saved on entry
* is restored.
* @param coll current collator used
* @param source data string source
* @param constart index to the start character in the contraction table
* @return discontiguous collation element offset
*/
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
          const UChar *temppos      = source->pos;
          UnicodeString buffer;
    const UChar   *tempconstart = constart;
          uint8_t  tempflags    = source->flags;
          UBool    multicontraction = FALSE;
          collIterateState discState;

          /* Saved so that a failed search can be undone at the end. */
          backupState(source, &discState);

    /* The skipped-character buffer starts with the character before pos. */
    buffer.setTo(peekCharacter(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        /*
        Stop scanning when the end of the source is reached or the next
        character has combining class 0.
        */
        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCharacter(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
                 source->fcdPosition == NULL ||
                 source->fcdPosition == source->endp ||
                 *(source->fcdPosition) == 0 ||
                 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
                 /* end of string in null terminated string or stopped by a
                 null character, note fcd does not always point to a base
                 character after the discontiguous change */
                 u_getCombiningClass(peekCharacter(source, 0)) == 0) {
                 //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* A shorter match was remembered earlier: return to it. */
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* Advance through the table entries while they are smaller than schar. */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguous buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* A character with the same combining class as the one two
            positions back is skipped as well. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCharacter(source, -2))) {
                //u_getCombiningClass(*(source->pos - 2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                                      (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
          break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}

/**
* Tells whether a code point gets no implicit CE.
* TRUE for code points whose low 16 bits are FFFE or FFFF, for U+FDD0..U+FDEF,
* and also for the surrogate range U+D800..U+DFFF.
* @param cp code point to test
* @return TRUE if cp falls in one of the ranges above
*/
static
inline UBool isNonChar(UChar32 cp) {
    return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF));
}

/* now uses Mark's getImplicitPrimary code */
/**
* Builds the implicit CE pair for a code point.
* The second CE (built from the low 16 bits of the implicit primary, with 0xC0
* in the low byte) is stored at CEpos, which is advanced, and
* offsetRepeatCount is incremented.
* @param cp code point
* @param collationSource collation iterator receiving the second CE
* @return the first CE, or 0 when isNonChar(cp) is TRUE (nothing is stored)
*/
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
    if(isNonChar(cp)) {
        return 0;
    }
    uint32_t r = uprv_uca_getImplicitPrimary(cp);
    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
    collationSource->offsetRepeatCount += 1;
    return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
}

/**
* Inserts the argument character into the front of the buffer replacing the
* front null terminator.
* The character overwrites index 0, a new NUL is inserted in front of it, and
* pos is set two code units into the resulting terminated buffer.
* @param data collation element iterator data
* @param ch character to be inserted
*/
static
inline void insertBufferFront(collIterate *data, UChar ch)
{
    data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
}

/**
* Special normalization function for contraction in the previous iterator.
* This normalization sequence will place the current character at source->pos
* and its following normalized sequence into the buffer.
* The fcd position, pos will be changed.
* pos will now point to positions in the buffer.
* Flags will be changed accordingly.
2441 * @param data collation iterator data 2442 */ 2443 static 2444 inline void normalizePrevContraction(collIterate *data, UErrorCode *status) 2445 { 2446 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ 2447 const UChar *pStart; 2448 2449 UnicodeString endOfBuffer; 2450 if (data->flags & UCOL_ITER_HASLEN) { 2451 /* 2452 normalization buffer not used yet, we'll pull down the next 2453 character into the end of the buffer 2454 */ 2455 endOfBuffer.setTo(*pEnd); 2456 } 2457 else { 2458 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL 2459 } 2460 2461 if (data->fcdPosition == NULL) { 2462 pStart = data->string; 2463 } 2464 else { 2465 pStart = data->fcdPosition + 1; 2466 } 2467 int32_t normLen = 2468 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), 2469 data->writableBuffer, 2470 *status). 2471 length(); 2472 if(U_FAILURE(*status)) { 2473 return; 2474 } 2475 /* 2476 this puts the null termination infront of the normalized string instead 2477 of the end 2478 */ 2479 data->pos = 2480 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() + 2481 1 + normLen; 2482 data->origFlags = data->flags; 2483 data->flags |= UCOL_ITER_INNORMBUF; 2484 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2485 } 2486 2487 /** 2488 * Contraction character management function that returns the previous character 2489 * for the backwards iterator. 2490 * Does nothing if the previous character is in buffer and not the first 2491 * character in it. 2492 * Else it checks previous character in data string to see if it is 2493 * normalizable. 2494 * If it is not, the character is simply copied into the buffer, else 2495 * the whole normalized substring is copied into the buffer, including the 2496 * current character. 
* @param data collation element iterator data
* @param status error status, passed on to normalizePrevContraction
* @return previous character
*/
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
    UChar  prevch;
    UChar  ch;
    const UChar *start;
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
        (innormbuf && *(data->pos - 1) != 0)) {
        /*
        if no normalization.
        if previous character is in normalized buffer, no further normalization
        is required
        */
        if(data->flags & UCOL_USE_ITERATOR) {
            /* step back one unit, then read it with next() */
            data->iterator->move(data->iterator, -1, UITER_CURRENT);
            return (UChar)data->iterator->next(data->iterator);
        } else {
            return *(data->pos - 1);
        }
    }

    start = data->pos;
    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
        /* in data string */
        if ((start - 1) == data->string) {
            /* previous character is the first one of the string */
            return *(start - 1);
        }
        start --;
        ch     = *start;
        prevch = *(start - 1);
    }
    else {
        /*
        in writable buffer, at this point fcdPosition can not be NULL.
        see contracting tag.
        */
        if (data->fcdPosition == data->string) {
            /* at the start of the string, just dump it into the normalizer */
            insertBufferFront(data, *(data->fcdPosition));
            data->fcdPosition = NULL;
            return *(data->pos - 1);
        }
        start  = data->fcdPosition;
        ch     = *start;
        prevch = *(start - 1);
    }
    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if (data->fcdPosition > start &&
       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
    {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        const UChar *backuppos = data->pos;
        data->pos = start;
        if (collPrevIterFCD(data)) {
            normalizePrevContraction(data, status);
            return *(data->pos - 1);
        }
        /* FCD check passed: restore pos and advance fcdPosition by one. */
        data->pos = backuppos;
        data->fcdPosition ++;
    }

    if (innormbuf) {
    /*
    no normalization is to be done hence only one character will be
    appended to the buffer.
    */
        insertBufferFront(data, ch);
        data->fcdPosition --;
    }

    return ch;
}

/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
/* It is called by getNextCE */

/* The following should be even */
#define UCOL_MAX_DIGITS_FOR_NUMBER 254

uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
    collIterateState entryState;
    backupState(source, &entryState);
    UChar32 cp = ch;

    for (;;) {
        // This loop will repeat only in the case of contractions, and only when a contraction
        //   is found and the first CE resulting from that contraction is itself a special
        //   (an expansion, for example.)  All other special CE types are fully handled the
        //   first time through, and the loop exits.

        const uint32_t *CEOffset = NULL;
        switch(getCETag(CE)) {
        case NOT_FOUND_TAG:
            /* This one is not found, and we'll let somebody else bother about it... no more games */
            return CE;
        case SPEC_PROC_TAG:
            {
                // Special processing is getting a CE that is preceded by a certain prefix
                // Currently this is only needed for optimizing Japanese length and iteration marks.
                // When we encounter a special processing tag, we go backwards and try to see if
                // we have a match.
                // Contraction tables are used - so the whole process is not unlike contraction.
                // prefix data is stored backwards in the table.
2610 const UChar *UCharOffset; 2611 UChar schar, tchar; 2612 collIterateState prefixState; 2613 backupState(source, &prefixState); 2614 loadState(source, &entryState, TRUE); 2615 goBackOne(source); // We want to look at the point where we entered - actually one 2616 // before that... 2617 2618 for(;;) { 2619 // This loop will run once per source string character, for as long as we 2620 // are matching a potential contraction sequence 2621 2622 // First we position ourselves at the begining of contraction sequence 2623 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2624 if (collIter_bos(source)) { 2625 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2626 break; 2627 } 2628 schar = getPrevNormalizedChar(source, status); 2629 goBackOne(source); 2630 2631 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2632 UCharOffset++; 2633 } 2634 2635 if (schar == tchar) { 2636 // Found the source string char in the table. 2637 // Pick up the corresponding CE from the table. 2638 CE = *(coll->contractionCEs + 2639 (UCharOffset - coll->contractionIndex)); 2640 } 2641 else 2642 { 2643 // Source string char was not in the table. 2644 // We have not found the prefix. 2645 CE = *(coll->contractionCEs + 2646 (ContractionStart - coll->contractionIndex)); 2647 } 2648 2649 if(!isPrefix(CE)) { 2650 // The source string char was in the contraction table, and the corresponding 2651 // CE is not a prefix CE. We found the prefix, break 2652 // out of loop, this CE will end up being returned. This is the normal 2653 // way out of prefix handling when the source actually contained 2654 // the prefix. 
2655 break; 2656 } 2657 } 2658 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue 2659 loadState(source, &prefixState, TRUE); 2660 if(source->origFlags & UCOL_USE_ITERATOR) { 2661 source->flags = source->origFlags; 2662 } 2663 } else { // prefix search was a failure, we have to backup all the way to the start 2664 loadState(source, &entryState, TRUE); 2665 } 2666 break; 2667 } 2668 case CONTRACTION_TAG: 2669 { 2670 /* This should handle contractions */ 2671 collIterateState state; 2672 backupState(source, &state); 2673 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; 2674 const UChar *UCharOffset; 2675 UChar schar, tchar; 2676 2677 for (;;) { 2678 /* This loop will run once per source string character, for as long as we */ 2679 /* are matching a potential contraction sequence */ 2680 2681 /* First we position ourselves at the begining of contraction sequence */ 2682 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2683 2684 if (collIter_eos(source)) { 2685 // Ran off the end of the source string. 2686 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2687 // So we'll pick whatever we have at the point... 2688 if (CE == UCOL_NOT_FOUND) { 2689 // back up the source over all the chars we scanned going into this contraction. 
2690 CE = firstCE; 2691 loadState(source, &state, TRUE); 2692 if(source->origFlags & UCOL_USE_ITERATOR) { 2693 source->flags = source->origFlags; 2694 } 2695 } 2696 break; 2697 } 2698 2699 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ 2700 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); 2701 2702 schar = getNextNormalizedChar(source); 2703 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2704 UCharOffset++; 2705 } 2706 2707 if (schar == tchar) { 2708 // Found the source string char in the contraction table. 2709 // Pick up the corresponding CE from the table. 2710 CE = *(coll->contractionCEs + 2711 (UCharOffset - coll->contractionIndex)); 2712 } 2713 else 2714 { 2715 // Source string char was not in contraction table. 2716 // Unless we have a discontiguous contraction, we have finished 2717 // with this contraction. 2718 // in order to do the proper detection, we 2719 // need to see if we're dealing with a supplementary 2720 /* We test whether the next two char are surrogate pairs. 2721 * This test is done if the iterator is not NULL. 2722 * If there is no surrogate pair, the iterator 2723 * goes back one if needed. 
*/ 2724 UChar32 miss = schar; 2725 if (source->iterator) { 2726 UChar32 surrNextChar; /* the next char in the iteration to test */ 2727 int32_t prevPos; /* holds the previous position before move forward of the source iterator */ 2728 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { 2729 prevPos = source->iterator->index; 2730 surrNextChar = getNextNormalizedChar(source); 2731 if (U16_IS_TRAIL(surrNextChar)) { 2732 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); 2733 } else if (prevPos < source->iterator->index){ 2734 goBackOne(source); 2735 } 2736 } 2737 } else if (U16_IS_LEAD(schar)) { 2738 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); 2739 } 2740 2741 uint8_t sCC; 2742 if (miss < 0x300 || 2743 maxCC == 0 || 2744 (sCC = i_getCombiningClass(miss, coll)) == 0 || 2745 sCC>maxCC || 2746 (allSame != 0 && sCC == maxCC) || 2747 collIter_eos(source)) 2748 { 2749 // Contraction can not be discontiguous. 2750 goBackOne(source); // back up the source string by one, 2751 // because the character we just looked at was 2752 // not part of the contraction. */ 2753 if(U_IS_SUPPLEMENTARY(miss)) { 2754 goBackOne(source); 2755 } 2756 CE = *(coll->contractionCEs + 2757 (ContractionStart - coll->contractionIndex)); 2758 } else { 2759 // 2760 // Contraction is possibly discontiguous. 
2761 // Scan more of source string looking for a match 2762 // 2763 UChar tempchar; 2764 /* find the next character if schar is not a base character 2765 and we are not yet at the end of the string */ 2766 tempchar = getNextNormalizedChar(source); 2767 // probably need another supplementary thingie here 2768 goBackOne(source); 2769 if (i_getCombiningClass(tempchar, coll) == 0) { 2770 goBackOne(source); 2771 if(U_IS_SUPPLEMENTARY(miss)) { 2772 goBackOne(source); 2773 } 2774 /* Spit out the last char of the string, wasn't tasty enough */ 2775 CE = *(coll->contractionCEs + 2776 (ContractionStart - coll->contractionIndex)); 2777 } else { 2778 CE = getDiscontiguous(coll, source, ContractionStart); 2779 } 2780 } 2781 } // else after if(schar == tchar) 2782 2783 if(CE == UCOL_NOT_FOUND) { 2784 /* The Source string did not match the contraction that we were checking. */ 2785 /* Back up the source position to undo the effects of having partially */ 2786 /* scanned through what ultimately proved to not be a contraction. */ 2787 loadState(source, &state, TRUE); 2788 CE = firstCE; 2789 break; 2790 } 2791 2792 if(!isContraction(CE)) { 2793 // The source string char was in the contraction table, and the corresponding 2794 // CE is not a contraction CE. We completed the contraction, break 2795 // out of loop, this CE will end up being returned. This is the normal 2796 // way out of contraction handling when the source actually contained 2797 // the contraction. 2798 break; 2799 } 2800 2801 2802 // The source string char was in the contraction table, and the corresponding 2803 // CE is IS a contraction CE. We will continue looping to check the source 2804 // string for the remaining chars in the contraction. 2805 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); 2806 if(tempCE != UCOL_NOT_FOUND) { 2807 // We have scanned a a section of source string for which there is a 2808 // CE from the contraction table. 
Remember the CE and scan position, so 2809 // that we can return to this point if further scanning fails to 2810 // match a longer contraction sequence. 2811 firstCE = tempCE; 2812 2813 goBackOne(source); 2814 backupState(source, &state); 2815 getNextNormalizedChar(source); 2816 2817 // Another way to do this is: 2818 //collIterateState tempState; 2819 //backupState(source, &tempState); 2820 //goBackOne(source); 2821 //backupState(source, &state); 2822 //loadState(source, &tempState, TRUE); 2823 2824 // The problem is that for incomplete contractions we have to remember the previous 2825 // position. Before, the only thing I needed to do was state.pos--; 2826 // After iterator introduction and especially after introduction of normalizing 2827 // iterators, it became much more difficult to decrease the saved state. 2828 // I'm not yet sure which of the two methods above is faster. 2829 } 2830 } // for(;;) 2831 break; 2832 } // case CONTRACTION_TAG: 2833 case LONG_PRIMARY_TAG: 2834 { 2835 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 2836 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 2837 source->offsetRepeatCount += 1; 2838 return CE; 2839 } 2840 case EXPANSION_TAG: 2841 { 2842 /* This should handle expansion. */ 2843 /* NOTE: we can encounter both continuations and expansions in an expansion! 
*/ 2844 /* I have to decide where continuations are going to be dealt with */ 2845 uint32_t size; 2846 uint32_t i; /* general counter */ 2847 2848 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 2849 size = getExpansionCount(CE); 2850 CE = *CEOffset++; 2851 //source->offsetRepeatCount = -1; 2852 2853 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 2854 for(i = 1; i<size; i++) { 2855 *(source->CEpos++) = *CEOffset++; 2856 source->offsetRepeatCount += 1; 2857 } 2858 } else { /* else, we do */ 2859 while(*CEOffset != 0) { 2860 *(source->CEpos++) = *CEOffset++; 2861 source->offsetRepeatCount += 1; 2862 } 2863 } 2864 2865 return CE; 2866 } 2867 case DIGIT_TAG: 2868 { 2869 /* 2870 We do a check to see if we want to collate digits as numbers; if so we generate 2871 a custom collation key. Otherwise we pull out the value stored in the expansion table. 2872 */ 2873 //uint32_t size; 2874 uint32_t i; /* general counter */ 2875 2876 if (source->coll->numericCollation == UCOL_ON){ 2877 collIterateState digitState = {0,0,0,0,0,0,0,0,0}; 2878 UChar32 char32 = 0; 2879 int32_t digVal = 0; 2880 2881 uint32_t digIndx = 0; 2882 uint32_t endIndex = 0; 2883 uint32_t trailingZeroIndex = 0; 2884 2885 uint8_t collateVal = 0; 2886 2887 UBool nonZeroValReached = FALSE; 2888 2889 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. 2890 /* 2891 We parse the source string until we hit a char that's NOT a digit. 2892 Use this u_charDigitValue. This might be slow because we have to 2893 handle surrogates... 
2894 */ 2895 /* 2896 if (U16_IS_LEAD(ch)){ 2897 if (!collIter_eos(source)) { 2898 backupState(source, &digitState); 2899 UChar trail = getNextNormalizedChar(source); 2900 if(U16_IS_TRAIL(trail)) { 2901 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 2902 } else { 2903 loadState(source, &digitState, TRUE); 2904 char32 = ch; 2905 } 2906 } else { 2907 char32 = ch; 2908 } 2909 } else { 2910 char32 = ch; 2911 } 2912 digVal = u_charDigitValue(char32); 2913 */ 2914 digVal = u_charDigitValue(cp); // if we have arrived here, we have 2915 // already processed possible supplementaries that trigered the digit tag - 2916 // all supplementaries are marked in the UCA. 2917 /* 2918 We pad a zero in front of the first element anyways. This takes 2919 care of the (probably) most common case where people are sorting things followed 2920 by a single digit 2921 */ 2922 digIndx++; 2923 for(;;){ 2924 // Make sure we have enough space. No longer needed; 2925 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER 2926 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough 2927 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). 2928 2929 // Skipping over leading zeroes. 2930 if (digVal != 0) { 2931 nonZeroValReached = TRUE; 2932 } 2933 if (nonZeroValReached) { 2934 /* 2935 We parse the digit string into base 100 numbers (this fits into a byte). 2936 We only add to the buffer in twos, thus if we are parsing an odd character, 2937 that serves as the 'tens' digit while the if we are parsing an even one, that 2938 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 2939 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 2940 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 2941 than all the other bytes. 
2942 */ 2943 2944 if (digIndx % 2 == 1){ 2945 collateVal += (uint8_t)digVal; 2946 2947 // We don't enter the low-order-digit case unless we've already seen 2948 // the high order, or for the first digit, which is always non-zero. 2949 if (collateVal != 0) 2950 trailingZeroIndex = 0; 2951 2952 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 2953 collateVal = 0; 2954 } 2955 else{ 2956 // We drop the collation value into the buffer so if we need to do 2957 // a "front patch" we don't have to check to see if we're hitting the 2958 // last element. 2959 collateVal = (uint8_t)(digVal * 10); 2960 2961 // Check for trailing zeroes. 2962 if (collateVal == 0) 2963 { 2964 if (!trailingZeroIndex) 2965 trailingZeroIndex = (digIndx/2) + 2; 2966 } 2967 else 2968 trailingZeroIndex = 0; 2969 2970 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 2971 } 2972 digIndx++; 2973 } 2974 2975 // Get next character. 2976 if (!collIter_eos(source)){ 2977 ch = getNextNormalizedChar(source); 2978 if (U16_IS_LEAD(ch)){ 2979 if (!collIter_eos(source)) { 2980 backupState(source, &digitState); 2981 UChar trail = getNextNormalizedChar(source); 2982 if(U16_IS_TRAIL(trail)) { 2983 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 2984 } else { 2985 loadState(source, &digitState, TRUE); 2986 char32 = ch; 2987 } 2988 } 2989 } else { 2990 char32 = ch; 2991 } 2992 2993 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){ 2994 // Resetting position to point to the next unprocessed char. We 2995 // overshot it when doing our test/set for numbers. 2996 if (char32 > 0xFFFF) { // For surrogates. 2997 loadState(source, &digitState, TRUE); 2998 //goBackOne(source); 2999 } 3000 goBackOne(source); 3001 break; 3002 } 3003 } else { 3004 break; 3005 } 3006 } 3007 3008 if (nonZeroValReached == FALSE){ 3009 digIndx = 2; 3010 numTempBuf[2] = 6; 3011 } 3012 3013 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; 3014 if (digIndx % 2 != 0){ 3015 /* 3016 We missed a value. 
Since digIndx isn't even, stuck too many values into the buffer (this is what 3017 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. 3018 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a 3019 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. 3020 */ 3021 3022 for(i = 2; i < endIndex; i++){ 3023 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + 3024 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; 3025 } 3026 --digIndx; 3027 } 3028 3029 // Subtract one off of the last byte. 3030 numTempBuf[endIndex-1] -= 1; 3031 3032 /* 3033 We want to skip over the first two slots in the buffer. The first slot 3034 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3035 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 3036 */ 3037 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3038 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); 3039 3040 // Now transfer the collation key to our collIterate struct. 3041 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. 3042 //size = ((endIndex+1) & ~1)/2; 3043 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3044 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3045 UCOL_BYTE_COMMON; // Tertiary weight. 3046 i = 2; // Reset the index into the buffer. 
3047 while(i < endIndex) 3048 { 3049 uint32_t primWeight = numTempBuf[i++] << 8; 3050 if ( i < endIndex) 3051 primWeight |= numTempBuf[i++]; 3052 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3053 } 3054 3055 } else { 3056 // no numeric mode, we'll just switch to whatever we stashed and continue 3057 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 3058 CE = *CEOffset++; 3059 break; 3060 } 3061 return CE; 3062 } 3063 /* various implicits optimization */ 3064 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 3065 /* UCA is filled with these. Tailorings are NOT_FOUND */ 3066 return getImplicit(cp, source); 3067 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 3068 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit 3069 return getImplicit(cp, source); 3070 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3071 { 3072 static const uint32_t 3073 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3074 //const uint32_t LCount = 19; 3075 static const uint32_t VCount = 21; 3076 static const uint32_t TCount = 28; 3077 //const uint32_t NCount = VCount * TCount; // 588 3078 //const uint32_t SCount = LCount * NCount; // 11172 3079 uint32_t L = ch - SBase; 3080 3081 // divide into pieces 3082 3083 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation 3084 L /= TCount; 3085 uint32_t V = L % VCount; 3086 L /= VCount; 3087 3088 // offset them 3089 3090 L += LBase; 3091 V += VBase; 3092 T += TBase; 3093 3094 // return the first CE, but first put the rest into the expansion buffer 3095 if (!source->coll->image->jamoSpecial) { // FAST PATH 3096 3097 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3098 if (T != TBase) { 3099 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3100 } 3101 3102 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3103 3104 } 
else { // Jamo is Special 3105 // Since Hanguls pass the FCD check, it is 3106 // guaranteed that we won't be in 3107 // the normalization buffer if something like this happens 3108 // However, if we are using a uchar iterator and normalization 3109 // is ON, the Hangul that lead us here is going to be in that 3110 // normalization buffer. Here we want to restore the uchar 3111 // iterator state and pull out of the normalization buffer 3112 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { 3113 source->flags = source->origFlags; // restore the iterator 3114 source->pos = NULL; 3115 } 3116 // Move Jamos into normalization buffer 3117 UChar *buffer = source->writableBuffer.getBuffer(4); 3118 int32_t bufferLength; 3119 buffer[0] = (UChar)L; 3120 buffer[1] = (UChar)V; 3121 if (T != TBase) { 3122 buffer[2] = (UChar)T; 3123 bufferLength = 3; 3124 } else { 3125 bufferLength = 2; 3126 } 3127 source->writableBuffer.releaseBuffer(bufferLength); 3128 3129 source->fcdPosition = source->pos; // Indicate where to continue in main input string 3130 // after exhausting the writableBuffer 3131 source->pos = source->writableBuffer.getTerminatedBuffer(); 3132 source->origFlags = source->flags; 3133 source->flags |= UCOL_ITER_INNORMBUF; 3134 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 3135 3136 return(UCOL_IGNORABLE); 3137 } 3138 } 3139 case SURROGATE_TAG: 3140 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ 3141 /* two things can happen here: next code point can be a trailing surrogate - we will use it */ 3142 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). 
In that case */ 3143 /* we return 0 (completely ignorable - per UCA specification */ 3144 { 3145 UChar trail; 3146 collIterateState state; 3147 backupState(source, &state); 3148 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { 3149 // we chould have stepped one char forward and it might have turned that it 3150 // was not a trail surrogate. In that case, we have to backup. 3151 loadState(source, &state, TRUE); 3152 return 0; 3153 } else { 3154 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ 3155 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail); 3156 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. 3157 // We need to backup 3158 loadState(source, &state, TRUE); 3159 return CE; 3160 } 3161 // calculate the supplementary code point value, if surrogate was not tailored 3162 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 3163 } 3164 } 3165 break; 3166 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 3167 UChar nextChar; 3168 if( source->flags & UCOL_USE_ITERATOR) { 3169 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { 3170 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3171 source->iterator->next(source->iterator); 3172 return getImplicit(cp, source); 3173 } else { 3174 return 0; 3175 } 3176 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) && 3177 U_IS_TRAIL((nextChar=*source->pos))) { 3178 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3179 source->pos++; 3180 return getImplicit(cp, source); 3181 } else { 3182 return 0; /* completely ignorable */ 3183 } 3184 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 3185 return 0; /* broken surrogate sequence */ 3186 case CHARSET_TAG: 3187 /* not yet implemented */ 3188 /* probably after 1.8 */ 3189 return UCOL_NOT_FOUND; 3190 default: 3191 *status = U_INTERNAL_PROGRAM_ERROR; 3192 CE=0; 3193 break; 3194 } 
3195 if (CE <= UCOL_NOT_FOUND) break; 3196 } 3197 return CE; 3198 } 3199 3200 3201 /* now uses Mark's getImplicitPrimary code */ 3202 static 3203 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { 3204 if(isNonChar(cp)) { 3205 return 0; 3206 } 3207 3208 uint32_t r = uprv_uca_getImplicitPrimary(cp); 3209 3210 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; 3211 collationSource->toReturn = collationSource->CEpos; 3212 3213 if (collationSource->offsetBuffer == NULL) { 3214 collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; 3215 collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); 3216 collationSource->offsetStore = collationSource->offsetBuffer; 3217 } 3218 3219 // **** doesn't work if using iterator **** 3220 if (collationSource->flags & UCOL_ITER_INNORMBUF) { 3221 collationSource->offsetRepeatCount = 1; 3222 } else { 3223 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string); 3224 3225 *(collationSource->offsetStore++) = firstOffset; 3226 *(collationSource->offsetStore++) = firstOffset + 1; 3227 3228 collationSource->offsetReturn = collationSource->offsetStore - 1; 3229 *(collationSource->offsetBuffer) = firstOffset; 3230 if (collationSource->offsetReturn == collationSource->offsetBuffer) { 3231 collationSource->offsetStore = collationSource->offsetBuffer; 3232 } 3233 } 3234 3235 return ((r & 0x0000FFFF)<<16) | 0x000000C0; 3236 } 3237 3238 /** 3239 * This function handles the special CEs like contractions, expansions, 3240 * surrogates, Thai. 
3241 * It is called by both getPrevCE 3242 */ 3243 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, 3244 collIterate *source, 3245 UErrorCode *status) 3246 { 3247 const uint32_t *CEOffset = NULL; 3248 UChar *UCharOffset = NULL; 3249 UChar schar; 3250 const UChar *constart = NULL; 3251 uint32_t size; 3252 UChar buffer[UCOL_MAX_BUFFER]; 3253 uint32_t *endCEBuffer; 3254 UChar *strbuffer; 3255 int32_t noChars = 0; 3256 int32_t CECount = 0; 3257 3258 for(;;) 3259 { 3260 /* the only ces that loops are thai and contractions */ 3261 switch (getCETag(CE)) 3262 { 3263 case NOT_FOUND_TAG: /* this tag always returns */ 3264 return CE; 3265 3266 case SPEC_PROC_TAG: 3267 { 3268 // Special processing is getting a CE that is preceded by a certain prefix 3269 // Currently this is only needed for optimizing Japanese length and iteration marks. 3270 // When we encouter a special processing tag, we go backwards and try to see if 3271 // we have a match. 3272 // Contraction tables are used - so the whole process is not unlike contraction. 3273 // prefix data is stored backwards in the table. 
3274 const UChar *UCharOffset; 3275 UChar schar, tchar; 3276 collIterateState prefixState; 3277 backupState(source, &prefixState); 3278 for(;;) { 3279 // This loop will run once per source string character, for as long as we 3280 // are matching a potential contraction sequence 3281 3282 // First we position ourselves at the begining of contraction sequence 3283 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 3284 3285 if (collIter_bos(source)) { 3286 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 3287 break; 3288 } 3289 schar = getPrevNormalizedChar(source, status); 3290 goBackOne(source); 3291 3292 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 3293 UCharOffset++; 3294 } 3295 3296 if (schar == tchar) { 3297 // Found the source string char in the table. 3298 // Pick up the corresponding CE from the table. 3299 CE = *(coll->contractionCEs + 3300 (UCharOffset - coll->contractionIndex)); 3301 } 3302 else 3303 { 3304 // if there is a completely ignorable code point in the middle of 3305 // a prefix, we need to act as if it's not there 3306 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) 3307 // lone surrogates cannot be set to zero as it would break other processing 3308 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 3309 // it's easy for BMP code points 3310 if(isZeroCE == 0) { 3311 continue; 3312 } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) { 3313 // for supplementary code points, we have to check the next one 3314 // situations where we are going to ignore 3315 // 1. beginning of the string: schar is a lone surrogate 3316 // 2. schar is a lone surrogate 3317 // 3. schar is a trail surrogate in a valid surrogate sequence 3318 // that is explicitly set to zero. 
3319 if (!collIter_bos(source)) { 3320 UChar lead; 3321 if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) { 3322 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead); 3323 if(getCETag(isZeroCE) == SURROGATE_TAG) { 3324 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); 3325 if(finalCE == 0) { 3326 // this is a real, assigned completely ignorable code point 3327 goBackOne(source); 3328 continue; 3329 } 3330 } 3331 } else { 3332 // lone surrogate, completely ignorable 3333 continue; 3334 } 3335 } else { 3336 // lone surrogate at the beggining, completely ignorable 3337 continue; 3338 } 3339 } 3340 // Source string char was not in the table. 3341 // We have not found the prefix. 3342 CE = *(coll->contractionCEs + 3343 (ContractionStart - coll->contractionIndex)); 3344 } 3345 3346 if(!isPrefix(CE)) { 3347 // The source string char was in the contraction table, and the corresponding 3348 // CE is not a prefix CE. We found the prefix, break 3349 // out of loop, this CE will end up being returned. This is the normal 3350 // way out of prefix handling when the source actually contained 3351 // the prefix. 3352 break; 3353 } 3354 } 3355 loadState(source, &prefixState, TRUE); 3356 break; 3357 } 3358 3359 case CONTRACTION_TAG: { 3360 /* to ensure that the backwards and forwards iteration matches, we 3361 take the current region of most possible match and pass it through 3362 the forward iteration. this will ensure that the obstinate problem of 3363 overlapping contractions will not occur. 
3364 */ 3365 schar = peekCharacter(source, 0); 3366 constart = (UChar *)coll->image + getContractOffset(CE); 3367 if (isAtStartPrevIterate(source) 3368 /* commented away contraction end checks after adding the checks 3369 in getPrevCE */) { 3370 /* start of string or this is not the end of any contraction */ 3371 CE = *(coll->contractionCEs + 3372 (constart - coll->contractionIndex)); 3373 break; 3374 } 3375 strbuffer = buffer; 3376 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); 3377 *(UCharOffset --) = 0; 3378 noChars = 0; 3379 // have to swap thai characters 3380 while (ucol_unsafeCP(schar, coll)) { 3381 *(UCharOffset) = schar; 3382 noChars++; 3383 UCharOffset --; 3384 schar = getPrevNormalizedChar(source, status); 3385 goBackOne(source); 3386 // TODO: when we exhaust the contraction buffer, 3387 // it needs to get reallocated. The problem is 3388 // that the size depends on the string which is 3389 // not iterated over. However, since we're travelling 3390 // backwards, we already had to set the iterator at 3391 // the end - so we might as well know where we are? 
3392 if (UCharOffset + 1 == buffer) { 3393 /* we have exhausted the buffer */ 3394 int32_t newsize = 0; 3395 if(source->pos) { // actually dealing with a position 3396 newsize = (int32_t)(source->pos - source->string + 1); 3397 } else { // iterator 3398 newsize = 4 * UCOL_MAX_BUFFER; 3399 } 3400 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * 3401 (newsize + UCOL_MAX_BUFFER)); 3402 /* test for NULL */ 3403 if (strbuffer == NULL) { 3404 *status = U_MEMORY_ALLOCATION_ERROR; 3405 return UCOL_NO_MORE_CES; 3406 } 3407 UCharOffset = strbuffer + newsize; 3408 uprv_memcpy(UCharOffset, buffer, 3409 UCOL_MAX_BUFFER * sizeof(UChar)); 3410 UCharOffset --; 3411 } 3412 if ((source->pos && (source->pos == source->string || 3413 ((source->flags & UCOL_ITER_INNORMBUF) && 3414 *(source->pos - 1) == 0 && source->fcdPosition == NULL))) 3415 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { 3416 break; 3417 } 3418 } 3419 /* adds the initial base character to the string */ 3420 *(UCharOffset) = schar; 3421 noChars++; 3422 3423 int32_t offsetBias; 3424 3425 // **** doesn't work if using iterator **** 3426 if (source->flags & UCOL_ITER_INNORMBUF) { 3427 offsetBias = -1; 3428 } else { 3429 offsetBias = (int32_t)(source->pos - source->string); 3430 } 3431 3432 /* a new collIterate is used to simplify things, since using the current 3433 collIterate will mean that the forward and backwards iteration will 3434 share and change the same buffers. we don't want to get into that. */ 3435 collIterate temp; 3436 int32_t rawOffset; 3437 3438 IInit_collIterate(coll, UCharOffset, noChars, &temp, status); 3439 if(U_FAILURE(*status)) { 3440 return UCOL_NULLORDER; 3441 } 3442 temp.flags &= ~UCOL_ITER_NORM; 3443 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; 3444 3445 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero? 
3446 CE = ucol_IGetNextCE(coll, &temp, status); 3447 3448 if (source->extendCEs) { 3449 endCEBuffer = source->extendCEs + source->extendCEsSize; 3450 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t)); 3451 } else { 3452 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; 3453 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t)); 3454 } 3455 3456 if (source->offsetBuffer == NULL) { 3457 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; 3458 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); 3459 source->offsetStore = source->offsetBuffer; 3460 } 3461 3462 while (CE != UCOL_NO_MORE_CES) { 3463 *(source->CEpos ++) = CE; 3464 3465 if (offsetBias >= 0) { 3466 *(source->offsetStore ++) = rawOffset + offsetBias; 3467 } 3468 3469 CECount++; 3470 if (source->CEpos == endCEBuffer) { 3471 /* ran out of CE space, reallocate to new buffer. 3472 If reallocation fails, reset pointers and bail out, 3473 there's no guarantee of the right character position after 3474 this bail*/ 3475 if (!increaseCEsCapacity(source)) { 3476 *status = U_MEMORY_ALLOCATION_ERROR; 3477 if (strbuffer != buffer) { 3478 uprv_free(strbuffer); 3479 } 3480 3481 return (uint32_t)UCOL_NULLORDER; 3482 } 3483 3484 endCEBuffer = source->extendCEs + source->extendCEsSize; 3485 } 3486 3487 if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) { 3488 int32_t storeIX = (int32_t)(source->offsetStore - source->offsetBuffer); 3489 int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer, 3490 sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE)); 3491 3492 if (tob != NULL) { 3493 source->offsetBuffer = tob; 3494 source->offsetStore = &source->offsetBuffer[storeIX]; 3495 source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE; 3496 } else { 3497 // memory error... 
3498 *status = U_MEMORY_ALLOCATION_ERROR; 3499 source->CEpos = source->CEs; 3500 3501 if (strbuffer != buffer) { 3502 uprv_free(strbuffer); 3503 } 3504 3505 return (uint32_t) UCOL_NULLORDER; 3506 } 3507 } 3508 3509 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { 3510 rawOffset = (int32_t)(temp.fcdPosition - temp.string); 3511 } else { 3512 rawOffset = (int32_t)(temp.pos - temp.string); 3513 } 3514 3515 CE = ucol_IGetNextCE(coll, &temp, status); 3516 } 3517 3518 if (source->offsetRepeatValue != 0) { 3519 if (CECount > noChars) { 3520 source->offsetRepeatCount += temp.offsetRepeatCount; 3521 } else { 3522 // **** does this really skip the right offsets? **** 3523 source->offsetReturn -= (noChars - CECount); 3524 } 3525 } 3526 3527 if (strbuffer != buffer) { 3528 uprv_free(strbuffer); 3529 } 3530 3531 if (offsetBias >= 0) { 3532 source->offsetReturn = source->offsetStore - 1; 3533 if (source->offsetReturn == source->offsetBuffer) { 3534 source->offsetStore = source->offsetBuffer; 3535 } 3536 } 3537 3538 source->toReturn = source->CEpos - 1; 3539 if (source->toReturn == source->CEs) { 3540 source->CEpos = source->CEs; 3541 } 3542 3543 return *(source->toReturn); 3544 } 3545 case LONG_PRIMARY_TAG: 3546 { 3547 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 3548 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 3549 source->toReturn = source->CEpos - 1; 3550 3551 if (source->offsetBuffer == NULL) { 3552 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; 3553 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); 3554 source->offsetStore = source->offsetBuffer; 3555 } 3556 3557 if (source->flags & UCOL_ITER_INNORMBUF) { 3558 source->offsetRepeatCount = 1; 3559 } else { 3560 int32_t firstOffset = (int32_t)(source->pos - source->string); 3561 3562 *(source->offsetStore++) = firstOffset; 3563 *(source->offsetStore++) = firstOffset + 1; 3564 3565 source->offsetReturn = 
source->offsetStore - 1; 3566 *(source->offsetBuffer) = firstOffset; 3567 if (source->offsetReturn == source->offsetBuffer) { 3568 source->offsetStore = source->offsetBuffer; 3569 } 3570 } 3571 3572 3573 return *(source->toReturn); 3574 } 3575 3576 case EXPANSION_TAG: /* this tag always returns */ 3577 { 3578 /* 3579 This should handle expansion. 3580 NOTE: we can encounter both continuations and expansions in an expansion! 3581 I have to decide where continuations are going to be dealt with 3582 */ 3583 int32_t firstOffset = (int32_t)(source->pos - source->string); 3584 3585 // **** doesn't work if using iterator **** 3586 if (source->offsetReturn != NULL) { 3587 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) { 3588 source->offsetStore = source->offsetBuffer; 3589 }else { 3590 firstOffset = -1; 3591 } 3592 } 3593 3594 if (source->offsetBuffer == NULL) { 3595 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; 3596 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); 3597 source->offsetStore = source->offsetBuffer; 3598 } 3599 3600 /* find the offset to expansion table */ 3601 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3602 size = getExpansionCount(CE); 3603 if (size != 0) { 3604 /* 3605 if there are less than 16 elements in expansion, we don't terminate 3606 */ 3607 uint32_t count; 3608 3609 for (count = 0; count < size; count++) { 3610 *(source->CEpos ++) = *CEOffset++; 3611 3612 if (firstOffset >= 0) { 3613 *(source->offsetStore ++) = firstOffset + 1; 3614 } 3615 } 3616 } else { 3617 /* else, we do */ 3618 while (*CEOffset != 0) { 3619 *(source->CEpos ++) = *CEOffset ++; 3620 3621 if (firstOffset >= 0) { 3622 *(source->offsetStore ++) = firstOffset + 1; 3623 } 3624 } 3625 } 3626 3627 if (firstOffset >= 0) { 3628 source->offsetReturn = source->offsetStore - 1; 3629 *(source->offsetBuffer) = firstOffset; 3630 if (source->offsetReturn == source->offsetBuffer) 
{ 3631 source->offsetStore = source->offsetBuffer; 3632 } 3633 } else { 3634 source->offsetRepeatCount += size - 1; 3635 } 3636 3637 source->toReturn = source->CEpos - 1; 3638 // in case of one element expansion, we 3639 // want to immediately return CEpos 3640 if(source->toReturn == source->CEs) { 3641 source->CEpos = source->CEs; 3642 } 3643 3644 return *(source->toReturn); 3645 } 3646 3647 case DIGIT_TAG: 3648 { 3649 /* 3650 We do a check to see if we want to collate digits as numbers; if so we generate 3651 a custom collation key. Otherwise we pull out the value stored in the expansion table. 3652 */ 3653 uint32_t i; /* general counter */ 3654 3655 if (source->coll->numericCollation == UCOL_ON){ 3656 uint32_t digIndx = 0; 3657 uint32_t endIndex = 0; 3658 uint32_t leadingZeroIndex = 0; 3659 uint32_t trailingZeroCount = 0; 3660 3661 uint8_t collateVal = 0; 3662 3663 UBool nonZeroValReached = FALSE; 3664 3665 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs. 3666 /* 3667 We parse the source string until we hit a char that's NOT a digit. 3668 Use this u_charDigitValue. This might be slow because we have to 3669 handle surrogates... 3670 */ 3671 /* 3672 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less, 3673 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation 3674 element we process when going backward. To determine how long that chunk might be, we may need to make 3675 two passes through the loop that collects digits - one to see how long the string is (and how much is 3676 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has 3677 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation 3678 element chunk after resetting the state to the initialState at the right side of the digit string. 
3679 */ 3680 uint32_t ceLimit = 0; 3681 UChar initial_ch = ch; 3682 collIterateState initialState = {0,0,0,0,0,0,0,0,0}; 3683 backupState(source, &initialState); 3684 3685 for(;;) { 3686 collIterateState state = {0,0,0,0,0,0,0,0,0}; 3687 UChar32 char32 = 0; 3688 int32_t digVal = 0; 3689 3690 if (U16_IS_TRAIL (ch)) { 3691 if (!collIter_bos(source)){ 3692 UChar lead = getPrevNormalizedChar(source, status); 3693 if(U16_IS_LEAD(lead)) { 3694 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3695 goBackOne(source); 3696 } else { 3697 char32 = ch; 3698 } 3699 } else { 3700 char32 = ch; 3701 } 3702 } else { 3703 char32 = ch; 3704 } 3705 digVal = u_charDigitValue(char32); 3706 3707 for(;;) { 3708 // Make sure we have enough space. No longer needed; 3709 // at this point the largest value of digIndx when we need to save data in numTempBuf 3710 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure 3711 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2). 3712 3713 // Skip over trailing zeroes, and keep a count of them. 3714 if (digVal != 0) 3715 nonZeroValReached = TRUE; 3716 3717 if (nonZeroValReached) { 3718 /* 3719 We parse the digit string into base 100 numbers (this fits into a byte). 3720 We only add to the buffer in twos, thus if we are parsing an odd character, 3721 that serves as the 'tens' digit while the if we are parsing an even one, that 3722 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3723 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3724 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3725 than all the other bytes. 3726 3727 Since we're doing in this reverse we want to put the first digit encountered into the 3728 ones place and the second digit encountered into the tens place. 
3729 */ 3730 3731 if ((digIndx + trailingZeroCount) % 2 == 1) { 3732 // High-order digit case (tens place) 3733 collateVal += (uint8_t)(digVal * 10); 3734 3735 // We cannot set leadingZeroIndex unless it has been set for the 3736 // low-order digit. Therefore, all we can do for the high-order 3737 // digit is turn it off, never on. 3738 // The only time we will have a high digit without a low is for 3739 // the very first non-zero digit, so no zero check is necessary. 3740 if (collateVal != 0) 3741 leadingZeroIndex = 0; 3742 3743 // The first pass through, digIndx may exceed the limit, but in that case 3744 // we no longer care about numTempBuf contents since they will be discarded 3745 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) { 3746 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3747 } 3748 collateVal = 0; 3749 } else { 3750 // Low-order digit case (ones place) 3751 collateVal = (uint8_t)digVal; 3752 3753 // Check for leading zeroes. 3754 if (collateVal == 0) { 3755 if (!leadingZeroIndex) 3756 leadingZeroIndex = (digIndx/2) + 2; 3757 } else 3758 leadingZeroIndex = 0; 3759 3760 // No need to write to buffer; the case of a last odd digit 3761 // is handled below. 3762 } 3763 ++digIndx; 3764 } else 3765 ++trailingZeroCount; 3766 3767 if (!collIter_bos(source)) { 3768 ch = getPrevNormalizedChar(source, status); 3769 //goBackOne(source); 3770 if (U16_IS_TRAIL(ch)) { 3771 backupState(source, &state); 3772 if (!collIter_bos(source)) { 3773 goBackOne(source); 3774 UChar lead = getPrevNormalizedChar(source, status); 3775 3776 if(U16_IS_LEAD(lead)) { 3777 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3778 } else { 3779 loadState(source, &state, FALSE); 3780 char32 = ch; 3781 } 3782 } 3783 } else 3784 char32 = ch; 3785 3786 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { 3787 if (char32 > 0xFFFF) {// For surrogates. 
3788 loadState(source, &state, FALSE); 3789 } 3790 // Don't need to "reverse" the goBackOne call, 3791 // as this points to the next position to process.. 3792 //if (char32 > 0xFFFF) // For surrogates. 3793 //getNextNormalizedChar(source); 3794 break; 3795 } 3796 3797 goBackOne(source); 3798 }else 3799 break; 3800 } 3801 3802 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) { 3803 // our collation element is not too big, go ahead and finish with it 3804 break; 3805 } 3806 // our digit string is too long for a collation element; 3807 // set the limit for it, reset the state and begin again 3808 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER; 3809 if ( ceLimit == 0 ) { 3810 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; 3811 } 3812 ch = initial_ch; 3813 loadState(source, &initialState, FALSE); 3814 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0; 3815 collateVal = 0; 3816 nonZeroValReached = FALSE; 3817 } 3818 3819 if (! nonZeroValReached) { 3820 digIndx = 2; 3821 trailingZeroCount = 0; 3822 numTempBuf[2] = 6; 3823 } 3824 3825 if ((digIndx + trailingZeroCount) % 2 != 0) { 3826 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; 3827 digIndx += 1; // The implicit leading zero 3828 } 3829 if (trailingZeroCount % 2 != 0) { 3830 // We had to consume one trailing zero for the low digit 3831 // of the least significant byte 3832 digIndx += 1; // The trailing zero not in the exponent 3833 trailingZeroCount -= 1; 3834 } 3835 3836 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; 3837 3838 // Subtract one off of the last byte. Really the first byte here, but it's reversed... 3839 numTempBuf[2] -= 1; 3840 3841 /* 3842 We want to skip over the first two slots in the buffer. The first slot 3843 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3844 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 
3845 The exponent must be adjusted by the number of leading zeroes, and the number of 3846 trailing zeroes. 3847 */ 3848 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3849 uint32_t exponent = (digIndx+trailingZeroCount)/2; 3850 if (leadingZeroIndex) 3851 exponent -= ((digIndx/2) + 2 - leadingZeroIndex); 3852 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); 3853 3854 // Now transfer the collation key to our collIterate struct. 3855 // The total size for our collation key is half of endIndex, rounded up. 3856 int32_t size = (endIndex+1)/2; 3857 if(!ensureCEsCapacity(source, size)) { 3858 return UCOL_NULLORDER; 3859 } 3860 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3861 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3862 UCOL_BYTE_COMMON; // Tertiary weight. 3863 i = endIndex - 1; // Reset the index into the buffer. 3864 while(i >= 2) { 3865 uint32_t primWeight = numTempBuf[i--] << 8; 3866 if ( i >= 2) 3867 primWeight |= numTempBuf[i--]; 3868 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3869 } 3870 3871 source->toReturn = source->CEpos -1; 3872 return *(source->toReturn); 3873 } else { 3874 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3875 CE = *(CEOffset++); 3876 break; 3877 } 3878 } 3879 3880 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3881 { 3882 static const uint32_t 3883 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3884 //const uint32_t LCount = 19; 3885 static const uint32_t VCount = 21; 3886 static const uint32_t TCount = 28; 3887 //const uint32_t NCount = VCount * TCount; /* 588 */ 3888 //const uint32_t SCount = LCount * NCount; /* 11172 */ 3889 3890 uint32_t L = ch - SBase; 3891 /* 3892 divide into pieces. 
3893 we do it in this order since some compilers can do % and / in one 3894 operation 3895 */ 3896 uint32_t T = L % TCount; 3897 L /= TCount; 3898 uint32_t V = L % VCount; 3899 L /= VCount; 3900 3901 /* offset them */ 3902 L += LBase; 3903 V += VBase; 3904 T += TBase; 3905 3906 if (source->offsetBuffer == NULL) { 3907 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; 3908 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); 3909 source->offsetStore = source->offsetBuffer; 3910 } 3911 3912 int32_t firstOffset = (int32_t)(source->pos - source->string); 3913 3914 *(source->offsetStore++) = firstOffset; 3915 3916 /* 3917 * return the first CE, but first put the rest into the expansion buffer 3918 */ 3919 if (!source->coll->image->jamoSpecial) { 3920 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3921 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3922 *(source->offsetStore++) = firstOffset + 1; 3923 3924 if (T != TBase) { 3925 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3926 *(source->offsetStore++) = firstOffset + 1; 3927 } 3928 3929 source->toReturn = source->CEpos - 1; 3930 3931 source->offsetReturn = source->offsetStore - 1; 3932 if (source->offsetReturn == source->offsetBuffer) { 3933 source->offsetStore = source->offsetBuffer; 3934 } 3935 3936 return *(source->toReturn); 3937 } else { 3938 // Since Hanguls pass the FCD check, it is 3939 // guaranteed that we won't be in 3940 // the normalization buffer if something like this happens 3941 // Move Jamos into normalization buffer 3942 /* 3943 Move the Jamos into the 3944 normalization buffer 3945 */ 3946 UChar *tempbuffer = source->writableBuffer.getBuffer(5); 3947 int32_t tempbufferLength; 3948 tempbuffer[0] = 0; 3949 tempbuffer[1] = (UChar)L; 3950 tempbuffer[2] = (UChar)V; 3951 if (T != TBase) { 3952 tempbuffer[3] = (UChar)T; 3953 tempbufferLength = 4; 3954 } else { 3955 tempbufferLength = 3; 3956 } 3957 
source->writableBuffer.releaseBuffer(tempbufferLength); 3958 3959 /* 3960 Indicate where to continue in main input string after exhausting 3961 the writableBuffer 3962 */ 3963 if (source->pos == source->string) { 3964 source->fcdPosition = NULL; 3965 } else { 3966 source->fcdPosition = source->pos-1; 3967 } 3968 3969 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength; 3970 source->origFlags = source->flags; 3971 source->flags |= UCOL_ITER_INNORMBUF; 3972 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 3973 3974 return(UCOL_IGNORABLE); 3975 } 3976 } 3977 3978 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 3979 return getPrevImplicit(ch, source); 3980 3981 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function 3982 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 3983 return getPrevImplicit(ch, source); 3984 3985 case SURROGATE_TAG: /* This is a surrogate pair */ 3986 /* essentialy an engaged lead surrogate. 
*/ 3987 /* if you have encountered it here, it means that a */ 3988 /* broken sequence was encountered and this is an error */ 3989 return 0; 3990 3991 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 3992 return 0; /* broken surrogate sequence */ 3993 3994 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 3995 { 3996 UChar32 cp = 0; 3997 UChar prevChar; 3998 const UChar *prev; 3999 if (isAtStartPrevIterate(source)) { 4000 /* we are at the start of the string, wrong place to be at */ 4001 return 0; 4002 } 4003 if (source->pos != source->writableBuffer.getBuffer()) { 4004 prev = source->pos - 1; 4005 } else { 4006 prev = source->fcdPosition; 4007 } 4008 prevChar = *prev; 4009 4010 /* Handles Han and Supplementary characters here.*/ 4011 if (U16_IS_LEAD(prevChar)) { 4012 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 4013 source->pos = prev; 4014 } else { 4015 return 0; /* completely ignorable */ 4016 } 4017 4018 return getPrevImplicit(cp, source); 4019 } 4020 4021 /* UCA is filled with these. 
Tailorings are NOT_FOUND */ 4022 /* not yet implemented */ 4023 case CHARSET_TAG: /* this tag always returns */ 4024 /* probably after 1.8 */ 4025 return UCOL_NOT_FOUND; 4026 4027 default: /* this tag always returns */ 4028 *status = U_INTERNAL_PROGRAM_ERROR; 4029 CE=0; 4030 break; 4031 } 4032 4033 if (CE <= UCOL_NOT_FOUND) { 4034 break; 4035 } 4036 } 4037 4038 return CE; 4039 } 4040 4041 /* This should really be a macro */ 4042 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */ 4043 /* anyway */ 4044 static 4045 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) { 4046 #ifdef UCOL_DEBUG 4047 fprintf(stderr, "."); 4048 #endif 4049 uint8_t *newStart = NULL; 4050 uint32_t offset = (uint32_t)(*secondaries-secStart); 4051 4052 if(secStart==second) { 4053 newStart=(uint8_t*)uprv_malloc(newSize); 4054 if(newStart==NULL) { 4055 *status = U_MEMORY_ALLOCATION_ERROR; 4056 return NULL; 4057 } 4058 uprv_memcpy(newStart, secStart, *secondaries-secStart); 4059 } else { 4060 newStart=(uint8_t*)uprv_realloc(secStart, newSize); 4061 if(newStart==NULL) { 4062 *status = U_MEMORY_ALLOCATION_ERROR; 4063 /* Since we're reallocating, return original reference so we don't loose it. */ 4064 return secStart; 4065 } 4066 } 4067 *secondaries=newStart+offset; 4068 *secSize=newSize; 4069 return newStart; 4070 } 4071 4072 4073 /* This should really be a macro */ 4074 /* This function is used to reverse parts of a buffer. 
We need this operation when doing continuation */ 4075 /* secondaries in French */ 4076 /* 4077 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { 4078 uint8_t temp; 4079 while(start<end) { 4080 temp = *start; 4081 *start++ = *end; 4082 *end-- = temp; 4083 } 4084 } 4085 */ 4086 4087 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \ 4088 TYPE tempA; \ 4089 while((start)<(end)) { \ 4090 tempA = *(start); \ 4091 *(start)++ = *(end); \ 4092 *(end)-- = tempA; \ 4093 } \ 4094 } 4095 4096 /****************************************************************************/ 4097 /* Following are the sortkey generation functions */ 4098 /* */ 4099 /****************************************************************************/ 4100 4101 /** 4102 * Merge two sort keys. 4103 * This is useful, for example, to combine sort keys from first and last names 4104 * to sort such pairs. 4105 * Merged sort keys consider on each collation level the first part first entirely, 4106 * then the second one. 4107 * It is possible to merge multiple sort keys by consecutively merging 4108 * another one with the intermediate result. 4109 * 4110 * The length of the merge result is the sum of the lengths of the input sort keys 4111 * minus 1. 
4112 * 4113 * @param src1 the first sort key 4114 * @param src1Length the length of the first sort key, including the zero byte at the end; 4115 * can be -1 if the function is to find the length 4116 * @param src2 the second sort key 4117 * @param src2Length the length of the second sort key, including the zero byte at the end; 4118 * can be -1 if the function is to find the length 4119 * @param dest the buffer where the merged sort key is written, 4120 * can be NULL if destCapacity==0 4121 * @param destCapacity the number of bytes in the dest buffer 4122 * @return the length of the merged sort key, src1Length+src2Length-1; 4123 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments), 4124 * in which cases the contents of dest is undefined 4125 * 4126 * @draft 4127 */ 4128 U_CAPI int32_t U_EXPORT2 4129 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 4130 const uint8_t *src2, int32_t src2Length, 4131 uint8_t *dest, int32_t destCapacity) { 4132 int32_t destLength; 4133 uint8_t b; 4134 4135 /* check arguments */ 4136 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 4137 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 4138 destCapacity<0 || (destCapacity>0 && dest==NULL) 4139 ) { 4140 /* error, attempt to write a zero byte and return 0 */ 4141 if(dest!=NULL && destCapacity>0) { 4142 *dest=0; 4143 } 4144 return 0; 4145 } 4146 4147 /* check lengths and capacity */ 4148 if(src1Length<0) { 4149 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 4150 } 4151 if(src2Length<0) { 4152 src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 4153 } 4154 4155 destLength=src1Length+src2Length-1; 4156 if(destLength>destCapacity) { 4157 /* the merged sort key does not fit into the destination */ 4158 return destLength; 4159 } 4160 4161 /* merge the sort keys with the same number of levels */ 4162 while(*src1!=0 && *src2!=0) { /* while both have 
another level */ 4163 /* copy level from src1 not including 00 or 01 */ 4164 while((b=*src1)>=2) { 4165 ++src1; 4166 *dest++=b; 4167 } 4168 4169 /* add a 02 merge separator */ 4170 *dest++=2; 4171 4172 /* copy level from src2 not including 00 or 01 */ 4173 while((b=*src2)>=2) { 4174 ++src2; 4175 *dest++=b; 4176 } 4177 4178 /* if both sort keys have another level, then add a 01 level separator and continue */ 4179 if(*src1==1 && *src2==1) { 4180 ++src1; 4181 ++src2; 4182 *dest++=1; 4183 } 4184 } 4185 4186 /* 4187 * here, at least one sort key is finished now, but the other one 4188 * might have some contents left from containing more levels; 4189 * that contents is just appended to the result 4190 */ 4191 if(*src1!=0) { 4192 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 4193 src2=src1; 4194 } 4195 /* append src2, "the other, unfinished sort key" */ 4196 uprv_strcpy((char *)dest, (const char *)src2); 4197 4198 /* trust that neither sort key contained illegally embedded zero bytes */ 4199 return destLength; 4200 } 4201 4202 /* sortkey API */ 4203 U_CAPI int32_t U_EXPORT2 4204 ucol_getSortKey(const UCollator *coll, 4205 const UChar *source, 4206 int32_t sourceLength, 4207 uint8_t *result, 4208 int32_t resultLength) 4209 { 4210 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 4211 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 4212 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 4213 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); 4214 } 4215 4216 UErrorCode status = U_ZERO_ERROR; 4217 int32_t keySize = 0; 4218 4219 if(source != NULL) { 4220 // source == NULL is actually an error situation, but we would need to 4221 // have an error code to return it. 
Until we introduce a new 4222 // API, it stays like this 4223 4224 /* this uses the function pointer that is set in updateinternalstate */ 4225 /* currently, there are two funcs: */ 4226 /*ucol_calcSortKey(...);*/ 4227 /*ucol_calcSortKeySimpleTertiary(...);*/ 4228 4229 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status); 4230 //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) { 4231 // That's not good. Something unusual happened. 4232 // We don't know how much we initialized before we failed. 4233 // NULL terminate for safety. 4234 // We have no way say that we have generated a partial sort key. 4235 //result[0] = 0; 4236 //keySize = 0; 4237 //} 4238 } 4239 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 4240 UTRACE_EXIT_STATUS(status); 4241 return keySize; 4242 } 4243 4244 /* this function is called by the C++ API for sortkey generation */ 4245 U_CFUNC int32_t 4246 ucol_getSortKeyWithAllocation(const UCollator *coll, 4247 const UChar *source, int32_t sourceLength, 4248 uint8_t **pResult, 4249 UErrorCode *pErrorCode) { 4250 *pResult = 0; 4251 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode); 4252 } 4253 4254 #define UCOL_FSEC_BUF_SIZE 256 4255 4256 /* This function tries to get the size of a sortkey. 
It will be invoked if the size of resulting buffer is 0 */ 4257 /* or if we run out of space while making a sortkey and want to return ASAP */ 4258 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) { 4259 UErrorCode status = U_ZERO_ERROR; 4260 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 4261 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4262 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4263 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4264 UBool compareIdent = (strength == UCOL_IDENTICAL); 4265 UBool doCase = (coll->caseLevel == UCOL_ON); 4266 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4267 //UBool qShifted = shifted && (compareQuad == 0); 4268 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4269 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4270 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE]; 4271 uint8_t *fSecs = fSecsBuff; 4272 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE; 4273 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL; 4274 4275 uint32_t variableTopValue = coll->variableTopValue; 4276 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4277 if(doHiragana) { 4278 UCOL_COMMON_BOT4++; 4279 /* allocate one more space for hiragana */ 4280 } 4281 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4282 4283 uint32_t order = UCOL_NO_MORE_CES; 4284 uint8_t primary1 = 0; 4285 uint8_t primary2 = 0; 4286 uint8_t secondary = 0; 4287 uint8_t tertiary = 0; 4288 int32_t caseShift = 0; 4289 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */ 4290 4291 uint8_t caseSwitch = coll->caseSwitch; 4292 uint8_t tertiaryMask = coll->tertiaryMask; 4293 uint8_t tertiaryCommon = coll->tertiaryCommon; 4294 4295 UBool wasShifted = FALSE; 4296 UBool 
notIsContinuation = FALSE; 4297 uint8_t leadPrimary = 0; 4298 4299 4300 for(;;) { 4301 order = ucol_IGetNextCE(coll, s, &status); 4302 if(order == UCOL_NO_MORE_CES) { 4303 break; 4304 } 4305 4306 if(order == 0) { 4307 continue; 4308 } 4309 4310 notIsContinuation = !isContinuation(order); 4311 4312 4313 if(notIsContinuation) { 4314 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK)); 4315 } else { 4316 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4317 } 4318 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4319 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4320 primary1 = (uint8_t)(order >> 8); 4321 4322 4323 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4324 || (!notIsContinuation && wasShifted)) 4325 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */ 4326 /* and other ignorables should be removed if following a shifted code point */ 4327 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4328 /* we should just completely ignore it */ 4329 continue; 4330 } 4331 if(compareQuad == 0) { 4332 if(c4 > 0) { 4333 currentSize += (c2/UCOL_BOT_COUNT4)+1; 4334 c4 = 0; 4335 } 4336 currentSize++; 4337 if(primary2 != 0) { 4338 currentSize++; 4339 } 4340 } 4341 wasShifted = TRUE; 4342 } else { 4343 wasShifted = FALSE; 4344 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. 
*/ 4345 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ 4346 /* calculate sortkey size */ 4347 if(primary1 != UCOL_IGNORABLE) { 4348 if(notIsContinuation) { 4349 if(leadPrimary == primary1) { 4350 currentSize++; 4351 } else { 4352 if(leadPrimary != 0) { 4353 currentSize++; 4354 } 4355 if(primary2 == UCOL_IGNORABLE) { 4356 /* one byter, not compressed */ 4357 currentSize++; 4358 leadPrimary = 0; 4359 } 4360 else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY || 4361 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) { 4362 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { 4363 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) 4364 { 4365 /* not compressible */ 4366 leadPrimary = 0; 4367 currentSize+=2; 4368 } 4369 else { /* compress */ 4370 leadPrimary = primary1; 4371 currentSize+=2; 4372 } 4373 } 4374 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 4375 currentSize++; 4376 if(primary2 != UCOL_IGNORABLE) { 4377 currentSize++; 4378 } 4379 } 4380 } 4381 4382 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */ 4383 if(!isFrenchSec){ 4384 if (secondary == UCOL_COMMON2 && notIsContinuation) { 4385 c2++; 4386 } else { 4387 if(c2 > 0) { 4388 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
4389 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1; 4390 } else { 4391 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1; 4392 } 4393 c2 = 0; 4394 } 4395 currentSize++; 4396 } 4397 } else { 4398 fSecs[fSecsLen++] = secondary; 4399 if(fSecsLen == fSecsMaxLen) { 4400 uint8_t *fSecsTemp; 4401 if(fSecs == fSecsBuff) { 4402 fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen); 4403 } else { 4404 fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen); 4405 } 4406 if(fSecsTemp == NULL) { 4407 status = U_MEMORY_ALLOCATION_ERROR; 4408 return 0; 4409 } 4410 fSecs = fSecsTemp; 4411 fSecsMaxLen *= 2; 4412 } 4413 if(notIsContinuation) { 4414 if (frenchStartPtr != NULL) { 4415 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4416 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4417 frenchStartPtr = NULL; 4418 } 4419 } else { 4420 if (frenchStartPtr == NULL) { 4421 frenchStartPtr = fSecs+fSecsLen-2; 4422 } 4423 frenchEndPtr = fSecs+fSecsLen-1; 4424 } 4425 } 4426 } 4427 4428 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 4429 // do the case level if we need to do it. 
We don't want to calculate 4430 // case level for primary ignorables if we have only primary strength and case level 4431 // otherwise we would break well formedness of CEs 4432 if (caseShift == 0) { 4433 currentSize++; 4434 caseShift = UCOL_CASE_SHIFT_START; 4435 } 4436 if((tertiary&0x3F) > 0 && notIsContinuation) { 4437 caseShift--; 4438 if((tertiary &0xC0) != 0) { 4439 if (caseShift == 0) { 4440 currentSize++; 4441 caseShift = UCOL_CASE_SHIFT_START; 4442 } 4443 caseShift--; 4444 } 4445 } 4446 } else { 4447 if(notIsContinuation) { 4448 tertiary ^= caseSwitch; 4449 } 4450 } 4451 4452 tertiary &= tertiaryMask; 4453 if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */ 4454 if (tertiary == tertiaryCommon && notIsContinuation) { 4455 c3++; 4456 } else { 4457 if(c3 > 0) { 4458 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) 4459 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) { 4460 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1; 4461 } else { 4462 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1; 4463 } 4464 c3 = 0; 4465 } 4466 currentSize++; 4467 } 4468 } 4469 4470 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 4471 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 4472 if(c4>0) { // Close this part 4473 currentSize += (c4/UCOL_BOT_COUNT4)+1; 4474 c4 = 0; 4475 } 4476 currentSize++; // Add the Hiragana 4477 } else { // This wasn't Hiragana, so we can continue adding stuff 4478 c4++; 4479 } 4480 } 4481 } 4482 } 4483 4484 if(!isFrenchSec){ 4485 if(c2 > 0) { 4486 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4487 } 4488 } else { 4489 uint32_t i = 0; 4490 if(frenchStartPtr != NULL) { 4491 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4492 } 4493 for(i = 0; i<fSecsLen; i++) { 4494 secondary = *(fSecs+fSecsLen-i-1); 4495 /* This is compression code. 
*/ 4496 if (secondary == UCOL_COMMON2) { 4497 ++c2; 4498 } else { 4499 if(c2 > 0) { 4500 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4501 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0); 4502 } else { 4503 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4504 } 4505 c2 = 0; 4506 } 4507 currentSize++; 4508 } 4509 } 4510 if(c2 > 0) { 4511 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4512 } 4513 if(fSecs != fSecsBuff) { 4514 uprv_free(fSecs); 4515 } 4516 } 4517 4518 if(c3 > 0) { 4519 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0); 4520 } 4521 4522 if(c4 > 0 && compareQuad == 0) { 4523 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0); 4524 } 4525 4526 if(compareIdent) { 4527 currentSize += u_lengthOfIdenticalLevelRun(s->string, len); 4528 } 4529 return currentSize; 4530 } 4531 4532 static 4533 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) { 4534 if (caseShift == 0) { 4535 *(*cases)++ = UCOL_CASE_BYTE_START; 4536 caseShift = UCOL_CASE_SHIFT_START; 4537 } 4538 } 4539 4540 // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we 4541 // know how many values we wanted to add, even if we didn't add them all 4542 static 4543 inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) { 4544 size++; 4545 if(primaries < limit) { 4546 *(primaries)++ = value; 4547 } 4548 } 4549 4550 // Packs the secondary buffer when processing French locale. Adds the terminator. 
4551 static 4552 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) { 4553 uint8_t secondary; 4554 int32_t count2 = 0; 4555 uint32_t i = 0, size = 0; 4556 // we use i here since the key size already accounts for terminators, so we'll discard the increment 4557 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR); 4558 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */ 4559 if(frenchStartPtr != NULL) { 4560 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4561 } 4562 for(i = 0; i<*secsize; i++) { 4563 secondary = *(secondaries-i-1); 4564 /* This is compression code. */ 4565 if (secondary == UCOL_COMMON2) { 4566 ++count2; 4567 } else { 4568 if (count2 > 0) { 4569 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4570 while (count2 > UCOL_TOP_COUNT2) { 4571 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2)); 4572 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4573 } 4574 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1))); 4575 } else { 4576 while (count2 > UCOL_BOT_COUNT2) { 4577 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4578 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4579 } 4580 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4581 } 4582 count2 = 0; 4583 } 4584 addWithIncrement(primaries, primEnd, size, secondary); 4585 } 4586 } 4587 if (count2 > 0) { 4588 while (count2 > UCOL_BOT_COUNT2) { 4589 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4590 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4591 } 4592 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4593 } 4594 *secsize = size; 4595 return primaries; 4596 } 4597 4598 #define 
DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 4599 4600 /* This is the sortkey work horse function */ 4601 U_CFUNC int32_t U_CALLCONV 4602 ucol_calcSortKey(const UCollator *coll, 4603 const UChar *source, 4604 int32_t sourceLength, 4605 uint8_t **result, 4606 uint32_t resultLength, 4607 UBool allocateSKBuffer, 4608 UErrorCode *status) 4609 { 4610 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 4611 4612 uint32_t i = 0; /* general purpose counter */ 4613 4614 /* Stack allocated buffers for buffers we use */ 4615 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER]; 4616 4617 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad; 4618 4619 if(U_FAILURE(*status)) { 4620 return 0; 4621 } 4622 4623 if(primaries == NULL && allocateSKBuffer == TRUE) { 4624 primaries = *result = prim; 4625 resultLength = UCOL_PRIMARY_MAX_BUFFER; 4626 } 4627 4628 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER, 4629 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER; 4630 4631 uint32_t sortKeySize = 1; /* it is always \0 terminated */ 4632 4633 UnicodeString normSource; 4634 4635 int32_t len = (sourceLength == -1 ? 
u_strlen(source) : sourceLength); 4636 4637 UColAttributeValue strength = coll->strength; 4638 4639 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4640 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4641 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4642 UBool compareIdent = (strength == UCOL_IDENTICAL); 4643 UBool doCase = (coll->caseLevel == UCOL_ON); 4644 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4645 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4646 //UBool qShifted = shifted && (compareQuad == 0); 4647 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4648 /*const uint8_t *scriptOrder = coll->scriptOrder;*/ 4649 4650 uint32_t variableTopValue = coll->variableTopValue; 4651 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no 4652 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. 4653 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4654 uint8_t UCOL_HIRAGANA_QUAD = 0; 4655 if(doHiragana) { 4656 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; 4657 /* allocate one more space for hiragana, value for hiragana */ 4658 } 4659 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4660 4661 /* support for special features like caselevel and funky secondaries */ 4662 uint8_t *frenchStartPtr = NULL; 4663 uint8_t *frenchEndPtr = NULL; 4664 uint32_t caseShift = 0; 4665 4666 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0)); 4667 4668 /* If we need to normalize, we'll do it all at once at the beginning! 
*/ 4669 const Normalizer2 *norm2; 4670 if(compareIdent) { 4671 norm2 = Normalizer2Factory::getNFDInstance(*status); 4672 } else if(coll->normalizationMode != UCOL_OFF) { 4673 norm2 = Normalizer2Factory::getFCDInstance(*status); 4674 } else { 4675 norm2 = NULL; 4676 } 4677 if(norm2 != NULL) { 4678 normSource.setTo(FALSE, source, len); 4679 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 4680 if(qcYesLength != len) { 4681 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 4682 normSource.truncate(qcYesLength); 4683 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 4684 source = normSource.getBuffer(); 4685 len = normSource.length(); 4686 } 4687 } 4688 collIterate s; 4689 IInit_collIterate(coll, source, len, &s, status); 4690 if(U_FAILURE(*status)) { 4691 return 0; 4692 } 4693 if(source == normSource.getBuffer()) { 4694 s.flags &= ~UCOL_ITER_NORM; 4695 } 4696 4697 if(resultLength == 0 || primaries == NULL) { 4698 return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); 4699 } 4700 uint8_t *primarySafeEnd = primaries + resultLength - 1; 4701 if(strength > UCOL_PRIMARY) { 4702 primarySafeEnd--; 4703 } 4704 4705 uint32_t minBufferSize = UCOL_MAX_BUFFER; 4706 4707 uint8_t *primStart = primaries; 4708 uint8_t *secStart = secondaries; 4709 uint8_t *terStart = tertiaries; 4710 uint8_t *caseStart = cases; 4711 uint8_t *quadStart = quads; 4712 4713 uint32_t order = 0; 4714 4715 uint8_t primary1 = 0; 4716 uint8_t primary2 = 0; 4717 uint8_t secondary = 0; 4718 uint8_t tertiary = 0; 4719 uint8_t caseSwitch = coll->caseSwitch; 4720 uint8_t tertiaryMask = coll->tertiaryMask; 4721 int8_t tertiaryAddition = coll->tertiaryAddition; 4722 uint8_t tertiaryTop = coll->tertiaryTop; 4723 uint8_t tertiaryBottom = coll->tertiaryBottom; 4724 uint8_t tertiaryCommon = coll->tertiaryCommon; 4725 uint8_t caseBits = 0; 4726 4727 UBool finished = FALSE; 4728 UBool wasShifted = FALSE; 4729 UBool notIsContinuation = FALSE; 4730 4731 
uint32_t prevBuffSize = 0; 4732 4733 uint32_t count2 = 0, count3 = 0, count4 = 0; 4734 uint8_t leadPrimary = 0; 4735 4736 for(;;) { 4737 for(i=prevBuffSize; i<minBufferSize; ++i) { 4738 4739 order = ucol_IGetNextCE(coll, &s, status); 4740 if(order == UCOL_NO_MORE_CES) { 4741 finished = TRUE; 4742 break; 4743 } 4744 4745 if(order == 0) { 4746 continue; 4747 } 4748 4749 notIsContinuation = !isContinuation(order); 4750 4751 if(notIsContinuation) { 4752 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); 4753 } else { 4754 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4755 } 4756 4757 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4758 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4759 primary1 = (uint8_t)(order >> 8); 4760 4761 /*if(notIsContinuation && scriptOrder != NULL) { 4762 primary1 = scriptOrder[primary1]; 4763 }*/ 4764 4765 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4766 || (!notIsContinuation && wasShifted)) 4767 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 4768 { 4769 /* and other ignorables should be removed if following a shifted code point */ 4770 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4771 /* we should just completely ignore it */ 4772 continue; 4773 } 4774 if(compareQuad == 0) { 4775 if(count4 > 0) { 4776 while (count4 > UCOL_BOT_COUNT4) { 4777 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4778 count4 -= UCOL_BOT_COUNT4; 4779 } 4780 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 4781 count4 = 0; 4782 } 4783 /* We are dealing with a variable and we're treating them as shifted */ 4784 /* This is a shifted ignorable */ 4785 if(primary1 != 0) { /* we need to check this since we could be in continuation */ 4786 *quads++ = primary1; 4787 } 4788 if(primary2 != 0) { 4789 *quads++ = primary2; 4790 } 4791 } 4792 wasShifted = TRUE; 4793 } else { 4794 wasShifted = FALSE; 4795 /* Note: This 
code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 4796 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ 4797 /* regular and simple sortkey calc */ 4798 if(primary1 != UCOL_IGNORABLE) { 4799 if(notIsContinuation) { 4800 if(leadPrimary == primary1) { 4801 *primaries++ = primary2; 4802 } else { 4803 if(leadPrimary != 0) { 4804 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 4805 } 4806 if(primary2 == UCOL_IGNORABLE) { 4807 /* one byter, not compressed */ 4808 *primaries++ = primary1; 4809 leadPrimary = 0; 4810 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY || 4811 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { 4812 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) { 4813 /* not compressible */ 4814 leadPrimary = 0; 4815 *primaries++ = primary1; 4816 if(primaries <= primarySafeEnd) { 4817 *primaries++ = primary2; 4818 } 4819 } else { /* compress */ 4820 *primaries++ = leadPrimary = primary1; 4821 if(primaries <= primarySafeEnd) { 4822 *primaries++ = primary2; 4823 } 4824 } 4825 } 4826 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 4827 *primaries++ = primary1; 4828 if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) { 4829 *primaries++ = primary2; /* second part */ 4830 } 4831 } 4832 } 4833 4834 if(secondary > compareSec) { 4835 if(!isFrenchSec) { 4836 /* This is compression code. */ 4837 if (secondary == UCOL_COMMON2 && notIsContinuation) { 4838 ++count2; 4839 } else { 4840 if (count2 > 0) { 4841 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
4842 while (count2 > UCOL_TOP_COUNT2) { 4843 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 4844 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4845 } 4846 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); 4847 } else { 4848 while (count2 > UCOL_BOT_COUNT2) { 4849 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 4850 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4851 } 4852 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 4853 } 4854 count2 = 0; 4855 } 4856 *secondaries++ = secondary; 4857 } 4858 } else { 4859 *secondaries++ = secondary; 4860 /* Do the special handling for French secondaries */ 4861 /* We need to get continuation elements and do intermediate restore */ 4862 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ 4863 if(notIsContinuation) { 4864 if (frenchStartPtr != NULL) { 4865 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4866 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4867 frenchStartPtr = NULL; 4868 } 4869 } else { 4870 if (frenchStartPtr == NULL) { 4871 frenchStartPtr = secondaries - 2; 4872 } 4873 frenchEndPtr = secondaries-1; 4874 } 4875 } 4876 } 4877 4878 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 4879 // do the case level if we need to do it. 
We don't want to calculate 4880 // case level for primary ignorables if we have only primary strength and case level 4881 // otherwise we would break well formedness of CEs 4882 doCaseShift(&cases, caseShift); 4883 if(notIsContinuation) { 4884 caseBits = (uint8_t)(tertiary & 0xC0); 4885 4886 if(tertiary != 0) { 4887 if(coll->caseFirst == UCOL_UPPER_FIRST) { 4888 if((caseBits & 0xC0) == 0) { 4889 *(cases-1) |= 1 << (--caseShift); 4890 } else { 4891 *(cases-1) |= 0 << (--caseShift); 4892 /* second bit */ 4893 doCaseShift(&cases, caseShift); 4894 *(cases-1) |= ((caseBits>>6)&1) << (--caseShift); 4895 } 4896 } else { 4897 if((caseBits & 0xC0) == 0) { 4898 *(cases-1) |= 0 << (--caseShift); 4899 } else { 4900 *(cases-1) |= 1 << (--caseShift); 4901 /* second bit */ 4902 doCaseShift(&cases, caseShift); 4903 *(cases-1) |= ((caseBits>>7)&1) << (--caseShift); 4904 } 4905 } 4906 } 4907 4908 } 4909 } else { 4910 if(notIsContinuation) { 4911 tertiary ^= caseSwitch; 4912 } 4913 } 4914 4915 tertiary &= tertiaryMask; 4916 if(tertiary > compareTer) { 4917 /* This is compression code. 
*/ 4918 /* sequence size check is included in the if clause */ 4919 if (tertiary == tertiaryCommon && notIsContinuation) { 4920 ++count3; 4921 } else { 4922 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 4923 tertiary += tertiaryAddition; 4924 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 4925 tertiary -= tertiaryAddition; 4926 } 4927 if (count3 > 0) { 4928 if ((tertiary > tertiaryCommon)) { 4929 while (count3 > coll->tertiaryTopCount) { 4930 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 4931 count3 -= (uint32_t)coll->tertiaryTopCount; 4932 } 4933 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); 4934 } else { 4935 while (count3 > coll->tertiaryBottomCount) { 4936 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 4937 count3 -= (uint32_t)coll->tertiaryBottomCount; 4938 } 4939 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 4940 } 4941 count3 = 0; 4942 } 4943 *tertiaries++ = tertiary; 4944 } 4945 } 4946 4947 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 4948 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 4949 if(count4>0) { // Close this part 4950 while (count4 > UCOL_BOT_COUNT4) { 4951 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4952 count4 -= UCOL_BOT_COUNT4; 4953 } 4954 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 4955 count4 = 0; 4956 } 4957 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana 4958 } else { // This wasn't Hiragana, so we can continue adding stuff 4959 count4++; 4960 } 4961 } 4962 } 4963 4964 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ 4965 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ 4966 IInit_collIterate(coll, (UChar *)source, len, &s, status); 4967 if(U_FAILURE(*status)) { 4968 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 4969 finished = TRUE; 4970 break; 4971 } 4972 if(source == 
normSource.getBuffer()) { 4973 s.flags &= ~UCOL_ITER_NORM; 4974 } 4975 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); 4976 *status = U_BUFFER_OVERFLOW_ERROR; 4977 finished = TRUE; 4978 break; 4979 } else { /* It's much nicer if we can actually reallocate */ 4980 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart)); 4981 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); 4982 if(U_SUCCESS(*status)) { 4983 *result = primStart; 4984 primarySafeEnd = primStart + resultLength - 1; 4985 if(strength > UCOL_PRIMARY) { 4986 primarySafeEnd--; 4987 } 4988 } else { 4989 /* We ran out of memory!? We can't recover. */ 4990 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 4991 finished = TRUE; 4992 break; 4993 } 4994 } 4995 } 4996 } 4997 if(finished) { 4998 break; 4999 } else { 5000 prevBuffSize = minBufferSize; 5001 5002 uint32_t frenchStartOffset = 0, frenchEndOffset = 0; 5003 if (frenchStartPtr != NULL) { 5004 frenchStartOffset = (uint32_t)(frenchStartPtr - secStart); 5005 frenchEndOffset = (uint32_t)(frenchEndPtr - secStart); 5006 } 5007 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); 5008 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); 5009 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status); 5010 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status); 5011 if(U_FAILURE(*status)) { 5012 /* We ran out of memory!? We can't recover. 
*/ 5013 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5014 break; 5015 } 5016 if (frenchStartPtr != NULL) { 5017 frenchStartPtr = secStart + frenchStartOffset; 5018 frenchEndPtr = secStart + frenchEndOffset; 5019 } 5020 minBufferSize *= 2; 5021 } 5022 } 5023 5024 /* Here, we are generally done with processing */ 5025 /* bailing out would not be too productive */ 5026 5027 if(U_SUCCESS(*status)) { 5028 sortKeySize += (uint32_t)(primaries - primStart); 5029 /* we have done all the CE's, now let's put them together to form a key */ 5030 if(compareSec == 0) { 5031 if (count2 > 0) { 5032 while (count2 > UCOL_BOT_COUNT2) { 5033 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5034 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5035 } 5036 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5037 } 5038 uint32_t secsize = (uint32_t)(secondaries-secStart); 5039 if(!isFrenchSec) { // Regular situation, we know the length of secondaries 5040 sortKeySize += secsize; 5041 if(sortKeySize <= resultLength) { 5042 *(primaries++) = UCOL_LEVELTERMINATOR; 5043 uprv_memcpy(primaries, secStart, secsize); 5044 primaries += secsize; 5045 } else { 5046 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ 5047 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5048 if(U_SUCCESS(*status)) { 5049 *result = primStart; 5050 *(primaries++) = UCOL_LEVELTERMINATOR; 5051 uprv_memcpy(primaries, secStart, secsize); 5052 primaries += secsize; 5053 } 5054 else { 5055 /* We ran out of memory!? We can't recover. */ 5056 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5057 goto cleanup; 5058 } 5059 } else { 5060 *status = U_BUFFER_OVERFLOW_ERROR; 5061 } 5062 } 5063 } else { // French secondary is on. We will need to pack French. 
packFrench will add the level terminator 5064 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); 5065 sortKeySize += secsize; 5066 if(sortKeySize <= resultLength) { // if we managed to pack fine 5067 primaries = newPrim; // update the primary pointer 5068 } else { // overflow, need to reallocate and redo 5069 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ 5070 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5071 if(U_SUCCESS(*status)) { 5072 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); 5073 } 5074 else { 5075 /* We ran out of memory!? We can't recover. */ 5076 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5077 goto cleanup; 5078 } 5079 } else { 5080 *status = U_BUFFER_OVERFLOW_ERROR; 5081 } 5082 } 5083 } 5084 } 5085 5086 if(doCase) { 5087 uint32_t casesize = (uint32_t)(cases - caseStart); 5088 sortKeySize += casesize; 5089 if(sortKeySize <= resultLength) { 5090 *(primaries++) = UCOL_LEVELTERMINATOR; 5091 uprv_memcpy(primaries, caseStart, casesize); 5092 primaries += casesize; 5093 } else { 5094 if(allocateSKBuffer == TRUE) { 5095 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5096 if(U_SUCCESS(*status)) { 5097 *result = primStart; 5098 *(primaries++) = UCOL_LEVELTERMINATOR; 5099 uprv_memcpy(primaries, caseStart, casesize); 5100 } 5101 else { 5102 /* We ran out of memory!? We can't recover. 
*/ 5103 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5104 goto cleanup; 5105 } 5106 } else { 5107 *status = U_BUFFER_OVERFLOW_ERROR; 5108 } 5109 } 5110 } 5111 5112 if(compareTer == 0) { 5113 if (count3 > 0) { 5114 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { 5115 while (count3 >= coll->tertiaryTopCount) { 5116 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5117 count3 -= (uint32_t)coll->tertiaryTopCount; 5118 } 5119 *tertiaries++ = (uint8_t)(tertiaryTop - count3); 5120 } else { 5121 while (count3 > coll->tertiaryBottomCount) { 5122 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5123 count3 -= (uint32_t)coll->tertiaryBottomCount; 5124 } 5125 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5126 } 5127 } 5128 uint32_t tersize = (uint32_t)(tertiaries - terStart); 5129 sortKeySize += tersize; 5130 if(sortKeySize <= resultLength) { 5131 *(primaries++) = UCOL_LEVELTERMINATOR; 5132 uprv_memcpy(primaries, terStart, tersize); 5133 primaries += tersize; 5134 } else { 5135 if(allocateSKBuffer == TRUE) { 5136 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5137 if(U_SUCCESS(*status)) { 5138 *result = primStart; 5139 *(primaries++) = UCOL_LEVELTERMINATOR; 5140 uprv_memcpy(primaries, terStart, tersize); 5141 } 5142 else { 5143 /* We ran out of memory!? We can't recover. 
*/ 5144 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5145 goto cleanup; 5146 } 5147 } else { 5148 *status = U_BUFFER_OVERFLOW_ERROR; 5149 } 5150 } 5151 5152 if(compareQuad == 0/*qShifted == TRUE*/) { 5153 if(count4 > 0) { 5154 while (count4 > UCOL_BOT_COUNT4) { 5155 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 5156 count4 -= UCOL_BOT_COUNT4; 5157 } 5158 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 5159 } 5160 uint32_t quadsize = (uint32_t)(quads - quadStart); 5161 sortKeySize += quadsize; 5162 if(sortKeySize <= resultLength) { 5163 *(primaries++) = UCOL_LEVELTERMINATOR; 5164 uprv_memcpy(primaries, quadStart, quadsize); 5165 primaries += quadsize; 5166 } else { 5167 if(allocateSKBuffer == TRUE) { 5168 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5169 if(U_SUCCESS(*status)) { 5170 *result = primStart; 5171 *(primaries++) = UCOL_LEVELTERMINATOR; 5172 uprv_memcpy(primaries, quadStart, quadsize); 5173 } 5174 else { 5175 /* We ran out of memory!? We can't recover. */ 5176 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5177 goto cleanup; 5178 } 5179 } else { 5180 *status = U_BUFFER_OVERFLOW_ERROR; 5181 } 5182 } 5183 } 5184 5185 if(compareIdent) { 5186 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len); 5187 if(sortKeySize <= resultLength) { 5188 *(primaries++) = UCOL_LEVELTERMINATOR; 5189 primaries += u_writeIdenticalLevelRun(s.string, len, primaries); 5190 } else { 5191 if(allocateSKBuffer == TRUE) { 5192 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status); 5193 if(U_SUCCESS(*status)) { 5194 *result = primStart; 5195 *(primaries++) = UCOL_LEVELTERMINATOR; 5196 u_writeIdenticalLevelRun(s.string, len, primaries); 5197 } 5198 else { 5199 /* We ran out of memory!? We can't recover. 
*/ 5200 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5201 goto cleanup; 5202 } 5203 } else { 5204 *status = U_BUFFER_OVERFLOW_ERROR; 5205 } 5206 } 5207 } 5208 } 5209 *(primaries++) = '\0'; 5210 } 5211 5212 if(allocateSKBuffer == TRUE) { 5213 *result = (uint8_t*)uprv_malloc(sortKeySize); 5214 /* test for NULL */ 5215 if (*result == NULL) { 5216 *status = U_MEMORY_ALLOCATION_ERROR; 5217 goto cleanup; 5218 } 5219 uprv_memcpy(*result, primStart, sortKeySize); 5220 if(primStart != prim) { 5221 uprv_free(primStart); 5222 } 5223 } 5224 5225 cleanup: 5226 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { 5227 /* NULL terminate for safety */ 5228 **result = 0; 5229 } 5230 if(terStart != tert) { 5231 uprv_free(terStart); 5232 uprv_free(secStart); 5233 uprv_free(caseStart); 5234 uprv_free(quadStart); 5235 } 5236 5237 /* To avoid memory leak, free the offset buffer if necessary. */ 5238 ucol_freeOffsetBuffer(&s); 5239 5240 return sortKeySize; 5241 } 5242 5243 5244 U_CFUNC int32_t U_CALLCONV 5245 ucol_calcSortKeySimpleTertiary(const UCollator *coll, 5246 const UChar *source, 5247 int32_t sourceLength, 5248 uint8_t **result, 5249 uint32_t resultLength, 5250 UBool allocateSKBuffer, 5251 UErrorCode *status) 5252 { 5253 U_ALIGN_CODE(16); 5254 5255 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 5256 uint32_t i = 0; /* general purpose counter */ 5257 5258 /* Stack allocated buffers for buffers we use */ 5259 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER]; 5260 5261 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert; 5262 5263 if(U_FAILURE(*status)) { 5264 return 0; 5265 } 5266 5267 if(primaries == NULL && allocateSKBuffer == TRUE) { 5268 primaries = *result = prim; 5269 resultLength = UCOL_PRIMARY_MAX_BUFFER; 5270 } 5271 5272 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = 
UCOL_TERTIARY_MAX_BUFFER; 5273 5274 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */ 5275 5276 UnicodeString normSource; 5277 5278 int32_t len = sourceLength; 5279 5280 /* If we need to normalize, we'll do it all at once at the beginning! */ 5281 if(coll->normalizationMode != UCOL_OFF) { 5282 normSource.setTo(len < 0, source, len); 5283 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); 5284 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 5285 if(qcYesLength != normSource.length()) { 5286 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 5287 normSource.truncate(qcYesLength); 5288 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 5289 source = normSource.getBuffer(); 5290 len = normSource.length(); 5291 } 5292 } 5293 collIterate s; 5294 IInit_collIterate(coll, (UChar *)source, len, &s, status); 5295 if(U_FAILURE(*status)) { 5296 return 0; 5297 } 5298 if(source == normSource.getBuffer()) { 5299 s.flags &= ~UCOL_ITER_NORM; 5300 } 5301 5302 if(resultLength == 0 || primaries == NULL) { 5303 return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); 5304 } 5305 5306 uint8_t *primarySafeEnd = primaries + resultLength - 2; 5307 5308 uint32_t minBufferSize = UCOL_MAX_BUFFER; 5309 5310 uint8_t *primStart = primaries; 5311 uint8_t *secStart = secondaries; 5312 uint8_t *terStart = tertiaries; 5313 5314 uint32_t order = 0; 5315 5316 uint8_t primary1 = 0; 5317 uint8_t primary2 = 0; 5318 uint8_t secondary = 0; 5319 uint8_t tertiary = 0; 5320 uint8_t caseSwitch = coll->caseSwitch; 5321 uint8_t tertiaryMask = coll->tertiaryMask; 5322 int8_t tertiaryAddition = coll->tertiaryAddition; 5323 uint8_t tertiaryTop = coll->tertiaryTop; 5324 uint8_t tertiaryBottom = coll->tertiaryBottom; 5325 uint8_t tertiaryCommon = coll->tertiaryCommon; 5326 5327 uint32_t prevBuffSize = 0; 5328 5329 UBool finished = FALSE; 5330 UBool notIsContinuation = FALSE; 
5331 5332 uint32_t count2 = 0, count3 = 0; 5333 uint8_t leadPrimary = 0; 5334 5335 for(;;) { 5336 for(i=prevBuffSize; i<minBufferSize; ++i) { 5337 5338 order = ucol_IGetNextCE(coll, &s, status); 5339 5340 if(order == 0) { 5341 continue; 5342 } 5343 5344 if(order == UCOL_NO_MORE_CES) { 5345 finished = TRUE; 5346 break; 5347 } 5348 5349 notIsContinuation = !isContinuation(order); 5350 5351 if(notIsContinuation) { 5352 tertiary = (uint8_t)((order & tertiaryMask)); 5353 } else { 5354 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 5355 } 5356 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5357 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5358 primary1 = (uint8_t)(order >> 8); 5359 5360 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 5361 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ 5362 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ 5363 /* regular and simple sortkey calc */ 5364 if(primary1 != UCOL_IGNORABLE) { 5365 if(notIsContinuation) { 5366 if(leadPrimary == primary1) { 5367 *primaries++ = primary2; 5368 } else { 5369 if(leadPrimary != 0) { 5370 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? 
UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 5371 } 5372 if(primary2 == UCOL_IGNORABLE) { 5373 /* one byter, not compressed */ 5374 *primaries++ = primary1; 5375 leadPrimary = 0; 5376 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY || 5377 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) 5378 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { 5379 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) { 5380 /* not compressible */ 5381 leadPrimary = 0; 5382 *primaries++ = primary1; 5383 *primaries++ = primary2; 5384 } else { /* compress */ 5385 *primaries++ = leadPrimary = primary1; 5386 *primaries++ = primary2; 5387 } 5388 } 5389 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 5390 *primaries++ = primary1; 5391 if(primary2 != UCOL_IGNORABLE) { 5392 *primaries++ = primary2; /* second part */ 5393 } 5394 } 5395 } 5396 5397 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ 5398 /* This is compression code. */ 5399 if (secondary == UCOL_COMMON2 && notIsContinuation) { 5400 ++count2; 5401 } else { 5402 if (count2 > 0) { 5403 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 5404 while (count2 > UCOL_TOP_COUNT2) { 5405 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 5406 count2 -= (uint32_t)UCOL_TOP_COUNT2; 5407 } 5408 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); 5409 } else { 5410 while (count2 > UCOL_BOT_COUNT2) { 5411 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5412 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5413 } 5414 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5415 } 5416 count2 = 0; 5417 } 5418 *secondaries++ = secondary; 5419 } 5420 } 5421 5422 if(notIsContinuation) { 5423 tertiary ^= caseSwitch; 5424 } 5425 5426 if(tertiary > 0) { 5427 /* This is compression code. 
*/ 5428 /* sequence size check is included in the if clause */ 5429 if (tertiary == tertiaryCommon && notIsContinuation) { 5430 ++count3; 5431 } else { 5432 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 5433 tertiary += tertiaryAddition; 5434 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 5435 tertiary -= tertiaryAddition; 5436 } 5437 if (count3 > 0) { 5438 if ((tertiary > tertiaryCommon)) { 5439 while (count3 > coll->tertiaryTopCount) { 5440 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5441 count3 -= (uint32_t)coll->tertiaryTopCount; 5442 } 5443 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); 5444 } else { 5445 while (count3 > coll->tertiaryBottomCount) { 5446 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5447 count3 -= (uint32_t)coll->tertiaryBottomCount; 5448 } 5449 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5450 } 5451 count3 = 0; 5452 } 5453 *tertiaries++ = tertiary; 5454 } 5455 } 5456 5457 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ 5458 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ 5459 IInit_collIterate(coll, (UChar *)source, len, &s, status); 5460 if(U_FAILURE(*status)) { 5461 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5462 finished = TRUE; 5463 break; 5464 } 5465 if(source == normSource.getBuffer()) { 5466 s.flags &= ~UCOL_ITER_NORM; 5467 } 5468 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); 5469 *status = U_BUFFER_OVERFLOW_ERROR; 5470 finished = TRUE; 5471 break; 5472 } else { /* It's much nicer if we can actually reallocate */ 5473 int32_t sks = sortKeySize+(int32_t)((primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)); 5474 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); 5475 if(U_SUCCESS(*status)) { 5476 *result = primStart; 5477 primarySafeEnd = 
primStart + resultLength - 2; 5478 } else { 5479 /* We ran out of memory!? We can't recover. */ 5480 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5481 finished = TRUE; 5482 break; 5483 } 5484 } 5485 } 5486 } 5487 if(finished) { 5488 break; 5489 } else { 5490 prevBuffSize = minBufferSize; 5491 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); 5492 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); 5493 minBufferSize *= 2; 5494 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size 5495 /* We ran out of memory!? We can't recover. */ 5496 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5497 break; 5498 } 5499 } 5500 } 5501 5502 if(U_SUCCESS(*status)) { 5503 sortKeySize += (uint32_t)(primaries - primStart); 5504 /* we have done all the CE's, now let's put them together to form a key */ 5505 if (count2 > 0) { 5506 while (count2 > UCOL_BOT_COUNT2) { 5507 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5508 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5509 } 5510 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5511 } 5512 uint32_t secsize = (uint32_t)(secondaries-secStart); 5513 sortKeySize += secsize; 5514 if(sortKeySize <= resultLength) { 5515 *(primaries++) = UCOL_LEVELTERMINATOR; 5516 uprv_memcpy(primaries, secStart, secsize); 5517 primaries += secsize; 5518 } else { 5519 if(allocateSKBuffer == TRUE) { 5520 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5521 if(U_SUCCESS(*status)) { 5522 *(primaries++) = UCOL_LEVELTERMINATOR; 5523 *result = primStart; 5524 uprv_memcpy(primaries, secStart, secsize); 5525 } 5526 else { 5527 /* We ran out of memory!? We can't recover. 
*/ 5528 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5529 goto cleanup; 5530 } 5531 } else { 5532 *status = U_BUFFER_OVERFLOW_ERROR; 5533 } 5534 } 5535 5536 if (count3 > 0) { 5537 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { 5538 while (count3 >= coll->tertiaryTopCount) { 5539 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5540 count3 -= (uint32_t)coll->tertiaryTopCount; 5541 } 5542 *tertiaries++ = (uint8_t)(tertiaryTop - count3); 5543 } else { 5544 while (count3 > coll->tertiaryBottomCount) { 5545 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5546 count3 -= (uint32_t)coll->tertiaryBottomCount; 5547 } 5548 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5549 } 5550 } 5551 uint32_t tersize = (uint32_t)(tertiaries - terStart); 5552 sortKeySize += tersize; 5553 if(sortKeySize <= resultLength) { 5554 *(primaries++) = UCOL_LEVELTERMINATOR; 5555 uprv_memcpy(primaries, terStart, tersize); 5556 primaries += tersize; 5557 } else { 5558 if(allocateSKBuffer == TRUE) { 5559 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5560 if(U_SUCCESS(*status)) { 5561 *result = primStart; 5562 *(primaries++) = UCOL_LEVELTERMINATOR; 5563 uprv_memcpy(primaries, terStart, tersize); 5564 } 5565 else { 5566 /* We ran out of memory!? We can't recover. 
*/ 5567 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5568 goto cleanup; 5569 } 5570 } else { 5571 *status = U_MEMORY_ALLOCATION_ERROR; 5572 } 5573 } 5574 5575 *(primaries++) = '\0'; 5576 } 5577 5578 if(allocateSKBuffer == TRUE) { 5579 *result = (uint8_t*)uprv_malloc(sortKeySize); 5580 /* test for NULL */ 5581 if (*result == NULL) { 5582 *status = U_MEMORY_ALLOCATION_ERROR; 5583 goto cleanup; 5584 } 5585 uprv_memcpy(*result, primStart, sortKeySize); 5586 if(primStart != prim) { 5587 uprv_free(primStart); 5588 } 5589 } 5590 5591 cleanup: 5592 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { 5593 /* NULL terminate for safety */ 5594 **result = 0; 5595 } 5596 if(terStart != tert) { 5597 uprv_free(terStart); 5598 uprv_free(secStart); 5599 } 5600 5601 /* To avoid memory leak, free the offset buffer if necessary. */ 5602 ucol_freeOffsetBuffer(&s); 5603 5604 return sortKeySize; 5605 } 5606 5607 static inline 5608 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { 5609 UBool notIsContinuation = !isContinuation(CE); 5610 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); 5611 if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) 5612 || (!notIsContinuation && *wasShifted)) 5613 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 5614 { 5615 // The stuff below should probably be in the sortkey code... maybe not... 
5616 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ 5617 /* we should just completely ignore it */ 5618 *wasShifted = TRUE; 5619 //continue; 5620 } 5621 //*wasShifted = TRUE; 5622 return TRUE; 5623 } else { 5624 *wasShifted = FALSE; 5625 return FALSE; 5626 } 5627 } 5628 static inline 5629 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { 5630 if(level < maxLevel) { 5631 dest[i++] = UCOL_LEVELTERMINATOR; 5632 } else { 5633 dest[i++] = 0; 5634 } 5635 } 5636 5637 /** enumeration of level identifiers for partial sort key generation */ 5638 enum { 5639 UCOL_PSK_PRIMARY = 0, 5640 UCOL_PSK_SECONDARY = 1, 5641 UCOL_PSK_CASE = 2, 5642 UCOL_PSK_TERTIARY = 3, 5643 UCOL_PSK_QUATERNARY = 4, 5644 UCOL_PSK_QUIN = 5, /** This is an extra level, not used - but we have three bits to blow */ 5645 UCOL_PSK_IDENTICAL = 6, 5646 UCOL_PSK_NULL = 7, /** level for the end of sort key. Will just produce zeros */ 5647 UCOL_PSK_LIMIT 5648 }; 5649 5650 /** collation state enum. *_SHIFT value is how much to shift right 5651 * to get the state piece to the right. *_MASK value should be 5652 * ANDed with the shifted state. This data is stored in state[1] 5653 * field. 5654 */ 5655 enum { 5656 UCOL_PSK_LEVEL_SHIFT = 0, /** level identificator. 
stores an enum value from above */ 5657 UCOL_PSK_LEVEL_MASK = 7, /** three bits */ 5658 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */ 5659 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1, 5660 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary 5661 * This field is also used to denote that the French secondary level is finished 5662 */ 5663 UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */ 5664 UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */ 5665 UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */ 5666 UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */ 5667 /** When we do French we need to reverse secondary values. However, continuations 5668 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba 5669 */ 5670 UCOL_PSK_BOCSU_BYTES_SHIFT = 7, 5671 UCOL_PSK_BOCSU_BYTES_MASK = 3, 5672 UCOL_PSK_CONSUMED_CES_SHIFT = 9, 5673 UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF 5674 }; 5675 5676 // macro calculating the number of expansion CEs available 5677 #define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn 5678 5679 5680 /** main sortkey part procedure. On the first call, 5681 * you should pass in a collator, an iterator, empty state 5682 * state[0] == state[1] == 0, a buffer to hold results 5683 * number of bytes you need and an error code pointer. 5684 * Make sure your buffer is big enough to hold the wanted 5685 * number of sortkey bytes. I don't check. 5686 * The only meaningful status you can get back is 5687 * U_BUFFER_OVERFLOW_ERROR, which basically means that you 5688 * have been dealt a raw deal and that you probably won't 5689 * be able to use partial sortkey generation for this 5690 * particular combination of string and collator. This 5691 * is highly unlikely, but you should still check the error code. 
5692 * Any other status means that you're not in a sane situation 5693 * anymore. After the first call, preserve state values and 5694 * use them on subsequent calls to obtain more bytes of a sortkey. 5695 * Use until the number of bytes written is smaller than the requested 5696 * number of bytes. Generated sortkey is not compatible with the 5697 * one generated by ucol_getSortKey, as we don't do any compression. 5698 * However, levels are still terminated by a 1 (one) and the sortkey 5699 * is terminated by a 0 (zero). Identical level is the same as in the 5700 * regular sortkey - internal bocu-1 implementation is used. 5701 * For curious, although you cannot do much about this, here is 5702 * the structure of state words. 5703 * state[0] - iterator state. Depends on the iterator implementation, 5704 * but allows the iterator to continue where it stopped in 5705 * the last iteration. 5706 * state[1] - collation processing state. Here is the distribution 5707 * of the bits: 5708 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary 5709 * quaternary, quin (we don't use this one), identical and 5710 * null (producing only zeroes - first one to terminate the 5711 * sortkey and subsequent to fill the buffer). 5712 * 3 - byte count. Number of bytes written on the primary level. 5713 * 4 - was shifted. Whether the previous iteration finished in the 5714 * shifted state. 5715 * 5, 6 - French continuation bytes written. See the comment in the enum 5716 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on 5717 * the identical level. 5718 * 9..31 - CEs consumed. Number of getCE or next32 operations performed 5719 * since thes last successful update of the iterator state. 
5720 */ 5721 U_CAPI int32_t U_EXPORT2 5722 ucol_nextSortKeyPart(const UCollator *coll, 5723 UCharIterator *iter, 5724 uint32_t state[2], 5725 uint8_t *dest, int32_t count, 5726 UErrorCode *status) 5727 { 5728 /* error checking */ 5729 if(status==NULL || U_FAILURE(*status)) { 5730 return 0; 5731 } 5732 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 5733 if( coll==NULL || iter==NULL || 5734 state==NULL || 5735 count<0 || (count>0 && dest==NULL) 5736 ) { 5737 *status=U_ILLEGAL_ARGUMENT_ERROR; 5738 UTRACE_EXIT_STATUS(status); 5739 return 0; 5740 } 5741 5742 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 5743 coll, iter, state[0], state[1], dest, count); 5744 5745 if(count==0) { 5746 /* nothing to do */ 5747 UTRACE_EXIT_VALUE(0); 5748 return 0; 5749 } 5750 /** Setting up situation according to the state we got from the previous iteration */ 5751 // The state of the iterator from the previous invocation 5752 uint32_t iterState = state[0]; 5753 // Has the last iteration ended in the shifted state 5754 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE; 5755 // What is the current level of the sortkey? 5756 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; 5757 // Have we written only one byte from a two byte primary in the previous iteration? 5758 // Also on secondary level - have we finished with the French secondary? 5759 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; 5760 // number of bytes in the continuation buffer for French 5761 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; 5762 // Number of bytes already written from a bocsu sequence. Since 5763 // the longes bocsu sequence is 4 long, this can be up to 3. 
5764 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK; 5765 // Number of elements that need to be consumed in this iteration because 5766 // the iterator returned UITER_NO_STATE at the end of the last iteration, 5767 // so we had to save the last valid state. 5768 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK; 5769 5770 /** values that depend on the collator attributes */ 5771 // strength of the collator. 5772 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); 5773 // maximal level of the partial sortkey. Need to take whether case level is done 5774 int32_t maxLevel = 0; 5775 if(strength < UCOL_TERTIARY) { 5776 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 5777 maxLevel = UCOL_PSK_CASE; 5778 } else { 5779 maxLevel = strength; 5780 } 5781 } else { 5782 if(strength == UCOL_TERTIARY) { 5783 maxLevel = UCOL_PSK_TERTIARY; 5784 } else if(strength == UCOL_QUATERNARY) { 5785 maxLevel = UCOL_PSK_QUATERNARY; 5786 } else { // identical 5787 maxLevel = UCOL_IDENTICAL; 5788 } 5789 } 5790 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation 5791 uint8_t UCOL_HIRAGANA_QUAD = 5792 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF; 5793 // Boundary value that decides whether a CE is shifted or not 5794 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0; 5795 // Are we doing French collation? 
5796 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON); 5797 5798 /** initializing the collation state */ 5799 UBool notIsContinuation = FALSE; 5800 uint32_t CE = UCOL_NO_MORE_CES; 5801 5802 collIterate s; 5803 IInit_collIterate(coll, NULL, -1, &s, status); 5804 if(U_FAILURE(*status)) { 5805 UTRACE_EXIT_STATUS(*status); 5806 return 0; 5807 } 5808 s.iterator = iter; 5809 s.flags |= UCOL_USE_ITERATOR; 5810 // This variable tells us whether we have produced some other levels in this iteration 5811 // before we moved to the identical level. In that case, we need to switch the 5812 // type of the iterator. 5813 UBool doingIdenticalFromStart = FALSE; 5814 // Normalizing iterator 5815 // The division for the array length may truncate the array size to 5816 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 5817 // for all platforms anyway. 5818 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 5819 UNormIterator *normIter = NULL; 5820 // If the normalization is turned on for the collator and we are below identical level 5821 // we will use a FCD normalizing iterator 5822 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { 5823 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5824 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); 5825 s.flags &= ~UCOL_ITER_NORM; 5826 if(U_FAILURE(*status)) { 5827 UTRACE_EXIT_STATUS(*status); 5828 return 0; 5829 } 5830 } else if(level == UCOL_PSK_IDENTICAL) { 5831 // for identical level, we need a NFD iterator. We need to instantiate it here, since we 5832 // will be updating the state - and this cannot be done on an ordinary iterator. 
5833 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5834 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5835 s.flags &= ~UCOL_ITER_NORM; 5836 if(U_FAILURE(*status)) { 5837 UTRACE_EXIT_STATUS(*status); 5838 return 0; 5839 } 5840 doingIdenticalFromStart = TRUE; 5841 } 5842 5843 // This is the tentative new state of the iterator. The problem 5844 // is that the iterator might return an undefined state, in 5845 // which case we should save the last valid state and increase 5846 // the iterator skip value. 5847 uint32_t newState = 0; 5848 5849 // First, we set the iterator to the last valid position 5850 // from the last iteration. This was saved in state[0]. 5851 if(iterState == 0) { 5852 /* initial state */ 5853 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { 5854 s.iterator->move(s.iterator, 0, UITER_LIMIT); 5855 } else { 5856 s.iterator->move(s.iterator, 0, UITER_START); 5857 } 5858 } else { 5859 /* reset to previous state */ 5860 s.iterator->setState(s.iterator, iterState, status); 5861 if(U_FAILURE(*status)) { 5862 UTRACE_EXIT_STATUS(*status); 5863 return 0; 5864 } 5865 } 5866 5867 5868 5869 // This variable tells us whether we can attempt to update the state 5870 // of iterator. Situations where we don't want to update iterator state 5871 // are the existence of expansion CEs that are not yet processed, and 5872 // finishing the case level without enough space in the buffer to insert 5873 // a level terminator. 5874 UBool canUpdateState = TRUE; 5875 5876 // Consume all the CEs that were consumed at the end of the previous 5877 // iteration without updating the iterator state. On identical level, 5878 // consume the code points. 5879 int32_t counter = cces; 5880 if(level < UCOL_PSK_IDENTICAL) { 5881 while(counter-->0) { 5882 // If we're doing French and we are on the secondary level, 5883 // we go backwards. 
5884 if(level == UCOL_PSK_SECONDARY && doingFrench) { 5885 CE = ucol_IGetPrevCE(coll, &s, status); 5886 } else { 5887 CE = ucol_IGetNextCE(coll, &s, status); 5888 } 5889 if(CE==UCOL_NO_MORE_CES) { 5890 /* should not happen */ 5891 *status=U_INTERNAL_PROGRAM_ERROR; 5892 UTRACE_EXIT_STATUS(*status); 5893 return 0; 5894 } 5895 if(uprv_numAvailableExpCEs(s)) { 5896 canUpdateState = FALSE; 5897 } 5898 } 5899 } else { 5900 while(counter-->0) { 5901 uiter_next32(s.iterator); 5902 } 5903 } 5904 5905 // French secondary needs to know whether the iterator state of zero came from previous level OR 5906 // from a new invocation... 5907 UBool wasDoingPrimary = FALSE; 5908 // destination buffer byte counter. When this guy 5909 // gets to count, we're done with the iteration 5910 int32_t i = 0; 5911 // used to count the zero bytes written after we 5912 // have finished with the sort key 5913 int32_t j = 0; 5914 5915 5916 // Hm.... I think we're ready to plunge in. Basic story is as following: 5917 // we have a fall through case based on level. This is used for initial 5918 // positioning on iteration start. Every level processor contains a 5919 // for(;;) which will be broken when we exhaust all the CEs. Other 5920 // way to exit is a goto saveState, which happens when we have filled 5921 // out our buffer. 
5922 switch(level) { 5923 case UCOL_PSK_PRIMARY: 5924 wasDoingPrimary = TRUE; 5925 for(;;) { 5926 if(i==count) { 5927 goto saveState; 5928 } 5929 // We should save the state only if we 5930 // are sure that we are done with the 5931 // previous iterator state 5932 if(canUpdateState && byteCountOrFrenchDone == 0) { 5933 newState = s.iterator->getState(s.iterator); 5934 if(newState != UITER_NO_STATE) { 5935 iterState = newState; 5936 cces = 0; 5937 } 5938 } 5939 CE = ucol_IGetNextCE(coll, &s, status); 5940 cces++; 5941 if(CE==UCOL_NO_MORE_CES) { 5942 // Add the level separator 5943 terminatePSKLevel(level, maxLevel, i, dest); 5944 byteCountOrFrenchDone=0; 5945 // Restart the iteration an move to the 5946 // second level 5947 s.iterator->move(s.iterator, 0, UITER_START); 5948 cces = 0; 5949 level = UCOL_PSK_SECONDARY; 5950 break; 5951 } 5952 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5953 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ 5954 if(CE != 0) { 5955 if(byteCountOrFrenchDone == 0) { 5956 // get the second byte of primary 5957 dest[i++]=(uint8_t)(CE >> 8); 5958 } else { 5959 byteCountOrFrenchDone = 0; 5960 } 5961 if((CE &=0xff)!=0) { 5962 if(i==count) { 5963 /* overflow */ 5964 byteCountOrFrenchDone = 1; 5965 cces--; 5966 goto saveState; 5967 } 5968 dest[i++]=(uint8_t)CE; 5969 } 5970 } 5971 } 5972 if(uprv_numAvailableExpCEs(s)) { 5973 canUpdateState = FALSE; 5974 } else { 5975 canUpdateState = TRUE; 5976 } 5977 } 5978 /* fall through to next level */ 5979 case UCOL_PSK_SECONDARY: 5980 if(strength >= UCOL_SECONDARY) { 5981 if(!doingFrench) { 5982 for(;;) { 5983 if(i == count) { 5984 goto saveState; 5985 } 5986 // We should save the state only if we 5987 // are sure that we are done with the 5988 // previous iterator state 5989 if(canUpdateState) { 5990 newState = s.iterator->getState(s.iterator); 5991 if(newState != UITER_NO_STATE) { 5992 iterState = newState; 5993 cces = 0; 5994 } 5995 } 5996 CE = ucol_IGetNextCE(coll, &s, status); 5997 cces++; 5998 
if(CE==UCOL_NO_MORE_CES) { 5999 // Add the level separator 6000 terminatePSKLevel(level, maxLevel, i, dest); 6001 byteCountOrFrenchDone = 0; 6002 // Restart the iteration an move to the 6003 // second level 6004 s.iterator->move(s.iterator, 0, UITER_START); 6005 cces = 0; 6006 level = UCOL_PSK_CASE; 6007 break; 6008 } 6009 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6010 CE >>= 8; /* get secondary */ 6011 if(CE != 0) { 6012 dest[i++]=(uint8_t)CE; 6013 } 6014 } 6015 if(uprv_numAvailableExpCEs(s)) { 6016 canUpdateState = FALSE; 6017 } else { 6018 canUpdateState = TRUE; 6019 } 6020 } 6021 } else { // French secondary processing 6022 uint8_t frenchBuff[UCOL_MAX_BUFFER]; 6023 int32_t frenchIndex = 0; 6024 // Here we are going backwards. 6025 // If the iterator is at the beggining, it should be 6026 // moved to end. 6027 if(wasDoingPrimary) { 6028 s.iterator->move(s.iterator, 0, UITER_LIMIT); 6029 cces = 0; 6030 } 6031 for(;;) { 6032 if(i == count) { 6033 goto saveState; 6034 } 6035 if(canUpdateState) { 6036 newState = s.iterator->getState(s.iterator); 6037 if(newState != UITER_NO_STATE) { 6038 iterState = newState; 6039 cces = 0; 6040 } 6041 } 6042 CE = ucol_IGetPrevCE(coll, &s, status); 6043 cces++; 6044 if(CE==UCOL_NO_MORE_CES) { 6045 // Add the level separator 6046 terminatePSKLevel(level, maxLevel, i, dest); 6047 byteCountOrFrenchDone = 0; 6048 // Restart the iteration an move to the next level 6049 s.iterator->move(s.iterator, 0, UITER_START); 6050 level = UCOL_PSK_CASE; 6051 break; 6052 } 6053 if(isContinuation(CE)) { // if it's a continuation, we want to save it and 6054 // reverse when we get a first non-continuation CE. 
6055 CE >>= 8; 6056 frenchBuff[frenchIndex++] = (uint8_t)CE; 6057 } else if(!isShiftedCE(CE, LVT, &wasShifted)) { 6058 CE >>= 8; /* get secondary */ 6059 if(!frenchIndex) { 6060 if(CE != 0) { 6061 dest[i++]=(uint8_t)CE; 6062 } 6063 } else { 6064 frenchBuff[frenchIndex++] = (uint8_t)CE; 6065 frenchIndex -= usedFrench; 6066 usedFrench = 0; 6067 while(i < count && frenchIndex) { 6068 dest[i++] = frenchBuff[--frenchIndex]; 6069 usedFrench++; 6070 } 6071 } 6072 } 6073 if(uprv_numAvailableExpCEs(s)) { 6074 canUpdateState = FALSE; 6075 } else { 6076 canUpdateState = TRUE; 6077 } 6078 } 6079 } 6080 } else { 6081 level = UCOL_PSK_CASE; 6082 } 6083 /* fall through to next level */ 6084 case UCOL_PSK_CASE: 6085 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 6086 uint32_t caseShift = UCOL_CASE_SHIFT_START; 6087 uint8_t caseByte = UCOL_CASE_BYTE_START; 6088 uint8_t caseBits = 0; 6089 6090 for(;;) { 6091 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); 6092 if(i == count) { 6093 goto saveState; 6094 } 6095 // We should save the state only if we 6096 // are sure that we are done with the 6097 // previous iterator state 6098 if(canUpdateState) { 6099 newState = s.iterator->getState(s.iterator); 6100 if(newState != UITER_NO_STATE) { 6101 iterState = newState; 6102 cces = 0; 6103 } 6104 } 6105 CE = ucol_IGetNextCE(coll, &s, status); 6106 cces++; 6107 if(CE==UCOL_NO_MORE_CES) { 6108 // On the case level we might have an unfinished 6109 // case byte. Add one if it's started. 6110 if(caseShift != UCOL_CASE_SHIFT_START) { 6111 dest[i++] = caseByte; 6112 } 6113 cces = 0; 6114 // We have finished processing CEs on this level. 6115 // However, we don't know if we have enough space 6116 // to add a case level terminator. 
6117 if(i < count) { 6118 // Add the level separator 6119 terminatePSKLevel(level, maxLevel, i, dest); 6120 // Restart the iteration and move to the 6121 // next level 6122 s.iterator->move(s.iterator, 0, UITER_START); 6123 level = UCOL_PSK_TERTIARY; 6124 } else { 6125 canUpdateState = FALSE; 6126 } 6127 break; 6128 } 6129 6130 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6131 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) { 6132 // do the case level if we need to do it. We don't want to calculate 6133 // case level for primary ignorables if we have only primary strength and case level 6134 // otherwise we would break well formedness of CEs 6135 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 6136 caseBits = (uint8_t)(CE & 0xC0); 6137 // this copies the case level logic from the 6138 // sort key generation code 6139 if(CE != 0) { 6140 if (caseShift == 0) { 6141 dest[i++] = caseByte; 6142 caseShift = UCOL_CASE_SHIFT_START; 6143 caseByte = UCOL_CASE_BYTE_START; 6144 } 6145 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6146 if((caseBits & 0xC0) == 0) { 6147 caseByte |= 1 << (--caseShift); 6148 } else { 6149 caseByte |= 0 << (--caseShift); 6150 /* second bit */ 6151 if(caseShift == 0) { 6152 dest[i++] = caseByte; 6153 caseShift = UCOL_CASE_SHIFT_START; 6154 caseByte = UCOL_CASE_BYTE_START; 6155 } 6156 caseByte |= ((caseBits>>6)&1) << (--caseShift); 6157 } 6158 } else { 6159 if((caseBits & 0xC0) == 0) { 6160 caseByte |= 0 << (--caseShift); 6161 } else { 6162 caseByte |= 1 << (--caseShift); 6163 /* second bit */ 6164 if(caseShift == 0) { 6165 dest[i++] = caseByte; 6166 caseShift = UCOL_CASE_SHIFT_START; 6167 caseByte = UCOL_CASE_BYTE_START; 6168 } 6169 caseByte |= ((caseBits>>7)&1) << (--caseShift); 6170 } 6171 } 6172 } 6173 6174 } 6175 } 6176 // Not sure this is correct for the case level - revisit 6177 if(uprv_numAvailableExpCEs(s)) { 6178 canUpdateState = FALSE; 6179 } else { 6180 canUpdateState = TRUE; 6181 } 6182 } 6183 } else { 6184 level = 
UCOL_PSK_TERTIARY; 6185 } 6186 /* fall through to next level */ 6187 case UCOL_PSK_TERTIARY: 6188 if(strength >= UCOL_TERTIARY) { 6189 for(;;) { 6190 if(i == count) { 6191 goto saveState; 6192 } 6193 // We should save the state only if we 6194 // are sure that we are done with the 6195 // previous iterator state 6196 if(canUpdateState) { 6197 newState = s.iterator->getState(s.iterator); 6198 if(newState != UITER_NO_STATE) { 6199 iterState = newState; 6200 cces = 0; 6201 } 6202 } 6203 CE = ucol_IGetNextCE(coll, &s, status); 6204 cces++; 6205 if(CE==UCOL_NO_MORE_CES) { 6206 // Add the level separator 6207 terminatePSKLevel(level, maxLevel, i, dest); 6208 byteCountOrFrenchDone = 0; 6209 // Restart the iteration an move to the 6210 // second level 6211 s.iterator->move(s.iterator, 0, UITER_START); 6212 cces = 0; 6213 level = UCOL_PSK_QUATERNARY; 6214 break; 6215 } 6216 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6217 notIsContinuation = !isContinuation(CE); 6218 6219 if(notIsContinuation) { 6220 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 6221 CE ^= coll->caseSwitch; 6222 CE &= coll->tertiaryMask; 6223 } else { 6224 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6225 } 6226 6227 if(CE != 0) { 6228 dest[i++]=(uint8_t)CE; 6229 } 6230 } 6231 if(uprv_numAvailableExpCEs(s)) { 6232 canUpdateState = FALSE; 6233 } else { 6234 canUpdateState = TRUE; 6235 } 6236 } 6237 } else { 6238 // if we're not doing tertiary 6239 // skip to the end 6240 level = UCOL_PSK_NULL; 6241 } 6242 /* fall through to next level */ 6243 case UCOL_PSK_QUATERNARY: 6244 if(strength >= UCOL_QUATERNARY) { 6245 for(;;) { 6246 if(i == count) { 6247 goto saveState; 6248 } 6249 // We should save the state only if we 6250 // are sure that we are done with the 6251 // previous iterator state 6252 if(canUpdateState) { 6253 newState = s.iterator->getState(s.iterator); 6254 if(newState != UITER_NO_STATE) { 6255 iterState = newState; 6256 cces = 0; 6257 } 6258 } 6259 CE = ucol_IGetNextCE(coll, &s, status); 6260 cces++; 
6261 if(CE==UCOL_NO_MORE_CES) { 6262 // Add the level separator 6263 terminatePSKLevel(level, maxLevel, i, dest); 6264 //dest[i++] = UCOL_LEVELTERMINATOR; 6265 byteCountOrFrenchDone = 0; 6266 // Restart the iteration an move to the 6267 // second level 6268 s.iterator->move(s.iterator, 0, UITER_START); 6269 cces = 0; 6270 level = UCOL_PSK_QUIN; 6271 break; 6272 } 6273 if(CE==0) 6274 continue; 6275 if(isShiftedCE(CE, LVT, &wasShifted)) { 6276 CE >>= 16; /* get primary */ 6277 if(CE != 0) { 6278 if(byteCountOrFrenchDone == 0) { 6279 dest[i++]=(uint8_t)(CE >> 8); 6280 } else { 6281 byteCountOrFrenchDone = 0; 6282 } 6283 if((CE &=0xff)!=0) { 6284 if(i==count) { 6285 /* overflow */ 6286 byteCountOrFrenchDone = 1; 6287 goto saveState; 6288 } 6289 dest[i++]=(uint8_t)CE; 6290 } 6291 } 6292 } else { 6293 notIsContinuation = !isContinuation(CE); 6294 if(notIsContinuation) { 6295 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 6296 dest[i++] = UCOL_HIRAGANA_QUAD; 6297 } else { 6298 dest[i++] = 0xFF; 6299 } 6300 } 6301 } 6302 if(uprv_numAvailableExpCEs(s)) { 6303 canUpdateState = FALSE; 6304 } else { 6305 canUpdateState = TRUE; 6306 } 6307 } 6308 } else { 6309 // if we're not doing quaternary 6310 // skip to the end 6311 level = UCOL_PSK_NULL; 6312 } 6313 /* fall through to next level */ 6314 case UCOL_PSK_QUIN: 6315 level = UCOL_PSK_IDENTICAL; 6316 /* fall through to next level */ 6317 case UCOL_PSK_IDENTICAL: 6318 if(strength >= UCOL_IDENTICAL) { 6319 UChar32 first, second; 6320 int32_t bocsuBytesWritten = 0; 6321 // We always need to do identical on 6322 // the NFD form of the string. 6323 if(normIter == NULL) { 6324 // we arrived from the level below and 6325 // normalization was not turned on. 
6326 // therefore, we need to make a fresh NFD iterator 6327 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 6328 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 6329 } else if(!doingIdenticalFromStart) { 6330 // there is an iterator, but we did some other levels. 6331 // therefore, we have a FCD iterator - need to make 6332 // a NFD one. 6333 // normIter being at the beginning does not guarantee 6334 // that the underlying iterator is at the beginning 6335 iter->move(iter, 0, UITER_START); 6336 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 6337 } 6338 // At this point we have a NFD iterator that is positioned 6339 // in the right place 6340 if(U_FAILURE(*status)) { 6341 UTRACE_EXIT_STATUS(*status); 6342 return 0; 6343 } 6344 first = uiter_previous32(s.iterator); 6345 // maybe we're at the start of the string 6346 if(first == U_SENTINEL) { 6347 first = 0; 6348 } else { 6349 uiter_next32(s.iterator); 6350 } 6351 6352 j = 0; 6353 for(;;) { 6354 if(i == count) { 6355 if(j+1 < bocsuBytesWritten) { 6356 bocsuBytesUsed = j+1; 6357 } 6358 goto saveState; 6359 } 6360 6361 // On identical level, we will always save 6362 // the state if we reach this point, since 6363 // we don't depend on getNextCE for content 6364 // all the content is in our buffer and we 6365 // already either stored the full buffer OR 6366 // otherwise we won't arrive here. 
6367 newState = s.iterator->getState(s.iterator); 6368 if(newState != UITER_NO_STATE) { 6369 iterState = newState; 6370 cces = 0; 6371 } 6372 6373 uint8_t buff[4]; 6374 second = uiter_next32(s.iterator); 6375 cces++; 6376 6377 // end condition for identical level 6378 if(second == U_SENTINEL) { 6379 terminatePSKLevel(level, maxLevel, i, dest); 6380 level = UCOL_PSK_NULL; 6381 break; 6382 } 6383 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); 6384 first = second; 6385 6386 j = 0; 6387 if(bocsuBytesUsed != 0) { 6388 while(bocsuBytesUsed-->0) { 6389 j++; 6390 } 6391 } 6392 6393 while(i < count && j < bocsuBytesWritten) { 6394 dest[i++] = buff[j++]; 6395 } 6396 } 6397 6398 } else { 6399 level = UCOL_PSK_NULL; 6400 } 6401 /* fall through to next level */ 6402 case UCOL_PSK_NULL: 6403 j = i; 6404 while(j<count) { 6405 dest[j++]=0; 6406 } 6407 break; 6408 default: 6409 *status = U_INTERNAL_PROGRAM_ERROR; 6410 UTRACE_EXIT_STATUS(*status); 6411 return 0; 6412 } 6413 6414 saveState: 6415 // Now we need to return stuff. First we want to see whether we have 6416 // done everything for the current state of iterator. 6417 if(byteCountOrFrenchDone 6418 || canUpdateState == FALSE 6419 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) 6420 { 6421 // Any of above mean that the previous transaction 6422 // wasn't finished and that we should store the 6423 // previous iterator state. 6424 state[0] = iterState; 6425 } else { 6426 // The transaction is complete. We will continue in the next iteration. 6427 state[0] = s.iterator->getState(s.iterator); 6428 cces = 0; 6429 } 6430 // Store the number of bocsu bytes written. 
6431 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { 6432 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6433 } 6434 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT; 6435 6436 // Next we put in the level of comparison 6437 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); 6438 6439 // If we are doing French, we need to store whether we have just finished the French level 6440 if(level == UCOL_PSK_SECONDARY && doingFrench) { 6441 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6442 } else { 6443 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6444 } 6445 6446 // Was the latest CE shifted 6447 if(wasShifted) { 6448 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; 6449 } 6450 // Check for cces overflow 6451 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { 6452 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6453 } 6454 // Store cces 6455 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT); 6456 6457 // Check for French overflow 6458 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { 6459 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6460 } 6461 // Store number of bytes written in the French secondary continuation sequence 6462 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); 6463 6464 6465 // If we have used normalizing iterator, get rid of it 6466 if(normIter != NULL) { 6467 unorm_closeIter(normIter); 6468 } 6469 6470 /* To avoid memory leak, free the offset buffer if necessary. */ 6471 ucol_freeOffsetBuffer(&s); 6472 6473 // Return number of meaningful sortkey bytes. 6474 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 6475 dest,i, state[0], state[1]); 6476 UTRACE_EXIT_VALUE(i); 6477 return i; 6478 } 6479 6480 /** 6481 * Produce a bound for a given sortkey and a number of levels. 
6482 */ 6483 U_CAPI int32_t U_EXPORT2 6484 ucol_getBound(const uint8_t *source, 6485 int32_t sourceLength, 6486 UColBoundMode boundType, 6487 uint32_t noOfLevels, 6488 uint8_t *result, 6489 int32_t resultLength, 6490 UErrorCode *status) 6491 { 6492 // consistency checks 6493 if(status == NULL || U_FAILURE(*status)) { 6494 return 0; 6495 } 6496 if(source == NULL) { 6497 *status = U_ILLEGAL_ARGUMENT_ERROR; 6498 return 0; 6499 } 6500 6501 int32_t sourceIndex = 0; 6502 // Scan the string until we skip enough of the key OR reach the end of the key 6503 do { 6504 sourceIndex++; 6505 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { 6506 noOfLevels--; 6507 } 6508 } while (noOfLevels > 0 6509 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 6510 6511 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 6512 && noOfLevels > 0) { 6513 *status = U_SORT_KEY_TOO_SHORT_WARNING; 6514 } 6515 6516 6517 // READ ME: this code assumes that the values for boundType 6518 // enum will not changes. They are set so that the enum value 6519 // corresponds to the number of extra bytes each bound type 6520 // needs. 6521 if(result != NULL && resultLength >= sourceIndex+boundType) { 6522 uprv_memcpy(result, source, sourceIndex); 6523 switch(boundType) { 6524 // Lower bound just gets terminated. 
No extra bytes 6525 case UCOL_BOUND_LOWER: // = 0 6526 break; 6527 // Upper bound needs one extra byte 6528 case UCOL_BOUND_UPPER: // = 1 6529 result[sourceIndex++] = 2; 6530 break; 6531 // Upper long bound needs two extra bytes 6532 case UCOL_BOUND_UPPER_LONG: // = 2 6533 result[sourceIndex++] = 0xFF; 6534 result[sourceIndex++] = 0xFF; 6535 break; 6536 default: 6537 *status = U_ILLEGAL_ARGUMENT_ERROR; 6538 return 0; 6539 } 6540 result[sourceIndex++] = 0; 6541 6542 return sourceIndex; 6543 } else { 6544 return sourceIndex+boundType+1; 6545 } 6546 } 6547 6548 /****************************************************************************/ 6549 /* Following are the functions that deal with the properties of a collator */ 6550 /* there are new APIs and some compatibility APIs */ 6551 /****************************************************************************/ 6552 6553 static inline void 6554 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, 6555 int32_t *primShift, int32_t *secShift, int32_t *terShift) 6556 { 6557 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; 6558 UBool reverseSecondary = FALSE; 6559 if(!isContinuation(CE)) { 6560 tertiary = (uint8_t)((CE & coll->tertiaryMask)); 6561 tertiary ^= coll->caseSwitch; 6562 reverseSecondary = TRUE; 6563 } else { 6564 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6565 tertiary &= UCOL_REMOVE_CASE; 6566 reverseSecondary = FALSE; 6567 } 6568 6569 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6570 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6571 primary1 = (uint8_t)(CE >> 8); 6572 6573 if(primary1 != 0) { 6574 coll->latinOneCEs[ch] |= (primary1 << *primShift); 6575 *primShift -= 8; 6576 } 6577 if(primary2 != 0) { 6578 if(*primShift < 0) { 6579 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6580 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6581 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6582 return; 6583 } 6584 
coll->latinOneCEs[ch] |= (primary2 << *primShift); 6585 *primShift -= 8; 6586 } 6587 if(secondary != 0) { 6588 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary 6589 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary 6590 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); 6591 } else { // normal case 6592 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift); 6593 } 6594 *secShift -= 8; 6595 } 6596 if(tertiary != 0) { 6597 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift); 6598 *terShift -= 8; 6599 } 6600 } 6601 6602 static inline UBool 6603 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { 6604 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); 6605 if(newTable == NULL) { 6606 *status = U_MEMORY_ALLOCATION_ERROR; 6607 coll->latinOneFailed = TRUE; 6608 return FALSE; 6609 } 6610 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t); 6611 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); 6612 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); 6613 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy); 6614 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy); 6615 coll->latinOneTableLen = size; 6616 uprv_free(coll->latinOneCEs); 6617 coll->latinOneCEs = newTable; 6618 return TRUE; 6619 } 6620 6621 static UBool 6622 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { 6623 UBool result = TRUE; 6624 if(coll->latinOneCEs == NULL) { 6625 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3); 6626 if(coll->latinOneCEs == NULL) { 6627 *status = U_MEMORY_ALLOCATION_ERROR; 6628 return FALSE; 6629 } 6630 coll->latinOneTableLen = UCOL_LATINONETABLELEN; 6631 } 6632 UChar ch = 0; 6633 UCollationElements *it = ucol_openElements(coll, &ch, 1, status); 6634 // Check for null 
pointer 6635 if (U_FAILURE(*status)) { 6636 return FALSE; 6637 } 6638 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3); 6639 6640 int32_t primShift = 24, secShift = 24, terShift = 24; 6641 uint32_t CE = 0; 6642 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; 6643 6644 // TODO: make safe if you get more than you wanted... 6645 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { 6646 primShift = 24; secShift = 24; terShift = 24; 6647 if(ch < 0x100) { 6648 CE = coll->latinOneMapping[ch]; 6649 } else { 6650 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 6651 if(CE == UCOL_NOT_FOUND && coll->UCA) { 6652 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); 6653 } 6654 } 6655 if(CE < UCOL_NOT_FOUND) { 6656 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6657 } else { 6658 switch (getCETag(CE)) { 6659 case EXPANSION_TAG: 6660 case DIGIT_TAG: 6661 ucol_setText(it, &ch, 1, status); 6662 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { 6663 if(primShift < 0 || secShift < 0 || terShift < 0) { 6664 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6665 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6666 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6667 break; 6668 } 6669 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6670 } 6671 break; 6672 case CONTRACTION_TAG: 6673 // here is the trick 6674 // F2 is contraction. We do something very similar to contractions 6675 // but have two indices, one in the real contraction table and the 6676 // other to where we stuffed things. This hopes that we don't have 6677 // many contractions (this should work for latin-1 tables). 
6678 { 6679 if((CE & 0x00FFF000) != 0) { 6680 *status = U_UNSUPPORTED_ERROR; 6681 goto cleanup_after_failure; 6682 } 6683 6684 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); 6685 6686 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table 6687 6688 coll->latinOneCEs[ch] = CE; 6689 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; 6690 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; 6691 6692 // We're going to jump into contraction table, pick the elements 6693 // and use them 6694 do { 6695 CE = *(coll->contractionCEs + 6696 (UCharOffset - coll->contractionIndex)); 6697 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) { 6698 uint32_t size; 6699 uint32_t i; /* general counter */ 6700 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 6701 size = getExpansionCount(CE); 6702 //CE = *CEOffset++; 6703 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 6704 for(i = 0; i<size; i++) { 6705 if(primShift < 0 || secShift < 0 || terShift < 0) { 6706 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6707 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6708 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6709 break; 6710 } 6711 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 6712 } 6713 } else { /* else, we do */ 6714 while(*CEOffset != 0) { 6715 if(primShift < 0 || secShift < 0 || terShift < 0) { 6716 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6717 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6718 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6719 break; 6720 } 6721 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 6722 
} 6723 } 6724 contractionOffset++; 6725 } else if(CE < UCOL_NOT_FOUND) { 6726 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift); 6727 } else { 6728 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6729 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6730 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6731 contractionOffset++; 6732 } 6733 UCharOffset++; 6734 primShift = 24; secShift = 24; terShift = 24; 6735 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate 6736 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) { 6737 goto cleanup_after_failure; 6738 } 6739 } 6740 } while(*UCharOffset != 0xFFFF); 6741 } 6742 break;; 6743 case SPEC_PROC_TAG: 6744 { 6745 // 0xB7 is a precontext character defined in UCA5.1, a special 6746 // handle is implemeted in order to save LatinOne table for 6747 // most locales. 6748 if (ch==0xb7) { 6749 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6750 } 6751 else { 6752 goto cleanup_after_failure; 6753 } 6754 } 6755 break; 6756 default: 6757 goto cleanup_after_failure; 6758 } 6759 } 6760 } 6761 // compact table 6762 if(contractionOffset < coll->latinOneTableLen) { 6763 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { 6764 goto cleanup_after_failure; 6765 } 6766 } 6767 ucol_closeElements(it); 6768 return result; 6769 6770 cleanup_after_failure: 6771 // status should already be set before arriving here. 
6772 coll->latinOneFailed = TRUE; 6773 ucol_closeElements(it); 6774 return FALSE; 6775 } 6776 6777 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { 6778 if(U_SUCCESS(*status)) { 6779 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6780 coll->caseSwitch = UCOL_CASE_SWITCH; 6781 } else { 6782 coll->caseSwitch = UCOL_NO_CASE_SWITCH; 6783 } 6784 6785 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { 6786 coll->tertiaryMask = UCOL_REMOVE_CASE; 6787 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6788 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */ 6789 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; 6790 coll->tertiaryBottom = UCOL_COMMON_BOT3; 6791 } else { 6792 coll->tertiaryMask = UCOL_KEEP_CASE; 6793 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; 6794 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6795 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; 6796 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; 6797 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; 6798 } else { 6799 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6800 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; 6801 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; 6802 } 6803 } 6804 6805 /* Set the compression values */ 6806 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1); 6807 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */ 6808 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount); 6809 6810 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY 6811 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) 6812 { 6813 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; 6814 } else { 6815 coll->sortKeyGen = ucol_calcSortKey; 6816 } 6817 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF 6818 && 
coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) 6819 { 6820 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { 6821 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it 6822 //fprintf(stderr, "F"); 6823 coll->latinOneUse = TRUE; 6824 } else { 6825 coll->latinOneUse = FALSE; 6826 } 6827 if(*status == U_UNSUPPORTED_ERROR) { 6828 *status = U_ZERO_ERROR; 6829 } 6830 } else { // latin1Table exists and it doesn't need to be regenerated, just use it 6831 coll->latinOneUse = TRUE; 6832 } 6833 } else { 6834 coll->latinOneUse = FALSE; 6835 } 6836 } 6837 } 6838 6839 U_CAPI uint32_t U_EXPORT2 6840 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 6841 if(U_FAILURE(*status) || coll == NULL) { 6842 return 0; 6843 } 6844 if(len == -1) { 6845 len = u_strlen(varTop); 6846 } 6847 if(len == 0) { 6848 *status = U_ILLEGAL_ARGUMENT_ERROR; 6849 return 0; 6850 } 6851 6852 collIterate s; 6853 IInit_collIterate(coll, varTop, len, &s, status); 6854 if(U_FAILURE(*status)) { 6855 return 0; 6856 } 6857 6858 uint32_t CE = ucol_IGetNextCE(coll, &s, status); 6859 6860 /* here we check if we have consumed all characters */ 6861 /* you can put in either one character or a contraction */ 6862 /* you shouldn't put more... */ 6863 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { 6864 *status = U_CE_NOT_FOUND_ERROR; 6865 return 0; 6866 } 6867 6868 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); 6869 6870 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { 6871 *status = U_PRIMARY_TOO_LONG_ERROR; 6872 return 0; 6873 } 6874 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { 6875 coll->variableTopValueisDefault = FALSE; 6876 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; 6877 } 6878 6879 /* To avoid memory leak, free the offset buffer if necessary. 
*/ 6880 ucol_freeOffsetBuffer(&s); 6881 6882 return CE & UCOL_PRIMARYMASK; 6883 } 6884 6885 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 6886 if(U_FAILURE(*status) || coll == NULL) { 6887 return 0; 6888 } 6889 return coll->variableTopValue<<16; 6890 } 6891 6892 U_CAPI void U_EXPORT2 6893 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 6894 if(U_FAILURE(*status) || coll == NULL) { 6895 return; 6896 } 6897 6898 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { 6899 coll->variableTopValueisDefault = FALSE; 6900 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; 6901 } 6902 } 6903 /* Attribute setter API */ 6904 U_CAPI void U_EXPORT2 6905 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 6906 if(U_FAILURE(*status) || coll == NULL) { 6907 return; 6908 } 6909 UColAttributeValue oldFrench = coll->frenchCollation; 6910 UColAttributeValue oldCaseFirst = coll->caseFirst; 6911 switch(attr) { 6912 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ 6913 if(value == UCOL_ON) { 6914 coll->numericCollation = UCOL_ON; 6915 coll->numericCollationisDefault = FALSE; 6916 } else if (value == UCOL_OFF) { 6917 coll->numericCollation = UCOL_OFF; 6918 coll->numericCollationisDefault = FALSE; 6919 } else if (value == UCOL_DEFAULT) { 6920 coll->numericCollationisDefault = TRUE; 6921 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; 6922 } else { 6923 *status = U_ILLEGAL_ARGUMENT_ERROR; 6924 } 6925 break; 6926 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ 6927 if(value == UCOL_ON) { 6928 coll->hiraganaQ = UCOL_ON; 6929 coll->hiraganaQisDefault = FALSE; 6930 } else if (value == UCOL_OFF) { 6931 coll->hiraganaQ = UCOL_OFF; 6932 coll->hiraganaQisDefault = FALSE; 6933 } else if (value == UCOL_DEFAULT) { 6934 coll->hiraganaQisDefault = TRUE; 6935 
coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ; 6936 } else { 6937 *status = U_ILLEGAL_ARGUMENT_ERROR; 6938 } 6939 break; 6940 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 6941 if(value == UCOL_ON) { 6942 coll->frenchCollation = UCOL_ON; 6943 coll->frenchCollationisDefault = FALSE; 6944 } else if (value == UCOL_OFF) { 6945 coll->frenchCollation = UCOL_OFF; 6946 coll->frenchCollationisDefault = FALSE; 6947 } else if (value == UCOL_DEFAULT) { 6948 coll->frenchCollationisDefault = TRUE; 6949 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation; 6950 } else { 6951 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6952 } 6953 break; 6954 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 6955 if(value == UCOL_SHIFTED) { 6956 coll->alternateHandling = UCOL_SHIFTED; 6957 coll->alternateHandlingisDefault = FALSE; 6958 } else if (value == UCOL_NON_IGNORABLE) { 6959 coll->alternateHandling = UCOL_NON_IGNORABLE; 6960 coll->alternateHandlingisDefault = FALSE; 6961 } else if (value == UCOL_DEFAULT) { 6962 coll->alternateHandlingisDefault = TRUE; 6963 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ; 6964 } else { 6965 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6966 } 6967 break; 6968 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 6969 if(value == UCOL_LOWER_FIRST) { 6970 coll->caseFirst = UCOL_LOWER_FIRST; 6971 coll->caseFirstisDefault = FALSE; 6972 } else if (value == UCOL_UPPER_FIRST) { 6973 coll->caseFirst = UCOL_UPPER_FIRST; 6974 coll->caseFirstisDefault = FALSE; 6975 } else if (value == UCOL_OFF) { 6976 coll->caseFirst = UCOL_OFF; 6977 coll->caseFirstisDefault = FALSE; 6978 } else if (value == UCOL_DEFAULT) { 6979 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; 6980 coll->caseFirstisDefault = TRUE; 6981 } else { 6982 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6983 } 6984 break; 6985 case UCOL_CASE_LEVEL: /* do we have an extra case 
level */ 6986 if(value == UCOL_ON) { 6987 coll->caseLevel = UCOL_ON; 6988 coll->caseLevelisDefault = FALSE; 6989 } else if (value == UCOL_OFF) { 6990 coll->caseLevel = UCOL_OFF; 6991 coll->caseLevelisDefault = FALSE; 6992 } else if (value == UCOL_DEFAULT) { 6993 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; 6994 coll->caseLevelisDefault = TRUE; 6995 } else { 6996 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6997 } 6998 break; 6999 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 7000 if(value == UCOL_ON) { 7001 coll->normalizationMode = UCOL_ON; 7002 coll->normalizationModeisDefault = FALSE; 7003 } else if (value == UCOL_OFF) { 7004 coll->normalizationMode = UCOL_OFF; 7005 coll->normalizationModeisDefault = FALSE; 7006 } else if (value == UCOL_DEFAULT) { 7007 coll->normalizationModeisDefault = TRUE; 7008 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode; 7009 } else { 7010 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7011 } 7012 break; 7013 case UCOL_STRENGTH: /* attribute for strength */ 7014 if (value == UCOL_DEFAULT) { 7015 coll->strengthisDefault = TRUE; 7016 coll->strength = (UColAttributeValue)coll->options->strength; 7017 } else if (value <= UCOL_IDENTICAL) { 7018 coll->strengthisDefault = FALSE; 7019 coll->strength = value; 7020 } else { 7021 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7022 } 7023 break; 7024 case UCOL_ATTRIBUTE_COUNT: 7025 default: 7026 *status = U_ILLEGAL_ARGUMENT_ERROR; 7027 break; 7028 } 7029 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { 7030 coll->latinOneRegenTable = TRUE; 7031 } else { 7032 coll->latinOneRegenTable = FALSE; 7033 } 7034 ucol_updateInternalState(coll, status); 7035 } 7036 7037 U_CAPI UColAttributeValue U_EXPORT2 7038 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { 7039 if(U_FAILURE(*status) || coll == NULL) { 7040 return UCOL_DEFAULT; 7041 } 7042 switch(attr) { 7043 case UCOL_NUMERIC_COLLATION: 7044 return 
coll->numericCollation; 7045 case UCOL_HIRAGANA_QUATERNARY_MODE: 7046 return coll->hiraganaQ; 7047 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 7048 return coll->frenchCollation; 7049 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 7050 return coll->alternateHandling; 7051 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 7052 return coll->caseFirst; 7053 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 7054 return coll->caseLevel; 7055 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 7056 return coll->normalizationMode; 7057 case UCOL_STRENGTH: /* attribute for strength */ 7058 return coll->strength; 7059 case UCOL_ATTRIBUTE_COUNT: 7060 default: 7061 *status = U_ILLEGAL_ARGUMENT_ERROR; 7062 break; 7063 } 7064 return UCOL_DEFAULT; 7065 } 7066 7067 U_CAPI void U_EXPORT2 7068 ucol_setStrength( UCollator *coll, 7069 UCollationStrength strength) 7070 { 7071 UErrorCode status = U_ZERO_ERROR; 7072 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); 7073 } 7074 7075 U_CAPI UCollationStrength U_EXPORT2 7076 ucol_getStrength(const UCollator *coll) 7077 { 7078 UErrorCode status = U_ZERO_ERROR; 7079 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); 7080 } 7081 7082 /****************************************************************************/ 7083 /* Following are misc functions */ 7084 /* there are new APIs and some compatibility APIs */ 7085 /****************************************************************************/ 7086 7087 U_CAPI void U_EXPORT2 7088 ucol_getVersion(const UCollator* coll, 7089 UVersionInfo versionInfo) 7090 { 7091 /* RunTime version */ 7092 uint8_t rtVersion = UCOL_RUNTIME_VERSION; 7093 /* Builder version*/ 7094 uint8_t bdVersion = coll->image->version[0]; 7095 7096 /* Charset Version. 
Need to get the version from cnv files 7097 * makeconv should populate cnv files with version and 7098 * an api has to be provided in ucnv.h to obtain this version 7099 */ 7100 uint8_t csVersion = 0; 7101 7102 /* combine the version info */ 7103 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion)); 7104 7105 /* Tailoring rules */ 7106 versionInfo[0] = (uint8_t)(cmbVersion>>8); 7107 versionInfo[1] = (uint8_t)cmbVersion; 7108 versionInfo[2] = coll->image->version[1]; 7109 if(coll->UCA) { 7110 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */ 7111 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07); 7112 } else { 7113 versionInfo[3] = 0; 7114 } 7115 } 7116 7117 7118 /* This internal API checks whether a character is tailored or not */ 7119 U_CAPI UBool U_EXPORT2 7120 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { 7121 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { 7122 return FALSE; 7123 } 7124 7125 uint32_t CE = UCOL_NOT_FOUND; 7126 const UChar *ContractionStart = NULL; 7127 if(u < 0x100) { /* latin-1 */ 7128 CE = coll->latinOneMapping[u]; 7129 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { 7130 return FALSE; 7131 } 7132 } else { /* regular */ 7133 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); 7134 } 7135 7136 if(isContraction(CE)) { 7137 ContractionStart = (UChar *)coll->image+getContractOffset(CE); 7138 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); 7139 } 7140 7141 return (UBool)(CE != UCOL_NOT_FOUND); 7142 } 7143 7144 7145 /****************************************************************************/ 7146 /* Following are the string compare functions */ 7147 /* */ 7148 /****************************************************************************/ 7149 7150 7151 /* ucol_checkIdent internal function. Does byte level string compare. 
*/ 7152 /* Used by strcoll if strength == identical and strings */ 7153 /* are otherwise equal. */ 7154 /* */ 7155 /* Comparison must be done on NFD normalized strings. */ 7156 /* FCD is not good enough. */ 7157 7158 static 7159 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status) 7160 { 7161 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both 7162 // of same type, but that doesn't really mean that it will stay that way. 7163 int32_t comparison; 7164 7165 if (sColl->flags & UCOL_USE_ITERATOR) { 7166 // The division for the array length may truncate the array size to 7167 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 7168 // for all platforms anyway. 7169 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 7170 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 7171 UNormIterator *sNIt = NULL, *tNIt = NULL; 7172 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 7173 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 7174 sColl->iterator->move(sColl->iterator, 0, UITER_START); 7175 tColl->iterator->move(tColl->iterator, 0, UITER_START); 7176 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); 7177 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); 7178 comparison = u_strCompareIter(sIt, tIt, TRUE); 7179 unorm_closeIter(sNIt); 7180 unorm_closeIter(tNIt); 7181 } else { 7182 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1; 7183 const UChar *sBuf = sColl->string; 7184 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? 
(int32_t)(tColl->endp - tColl->string) : -1; 7185 const UChar *tBuf = tColl->string; 7186 7187 if (normalize) { 7188 *status = U_ZERO_ERROR; 7189 // Note: We could use Normalizer::compare() or similar, but for short strings 7190 // which may not be in FCD it might be faster to just NFD them. 7191 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than 7192 // NFD'ing immediately might be faster for long strings, 7193 // but string comparison is usually done on relatively short strings. 7194 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen), 7195 sColl->writableBuffer, 7196 *status); 7197 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen), 7198 tColl->writableBuffer, 7199 *status); 7200 if(U_FAILURE(*status)) { 7201 return UCOL_LESS; 7202 } 7203 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer); 7204 } else { 7205 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE); 7206 } 7207 } 7208 7209 if (comparison < 0) { 7210 return UCOL_LESS; 7211 } else if (comparison == 0) { 7212 return UCOL_EQUAL; 7213 } else /* comparison > 0 */ { 7214 return UCOL_GREATER; 7215 } 7216 } 7217 7218 /* CEBuf - A struct and some inline functions to handle the saving */ 7219 /* of CEs in a buffer within ucol_strcoll */ 7220 7221 #define UCOL_CEBUF_SIZE 512 7222 typedef struct ucol_CEBuf { 7223 uint32_t *buf; 7224 uint32_t *endp; 7225 uint32_t *pos; 7226 uint32_t localArray[UCOL_CEBUF_SIZE]; 7227 } ucol_CEBuf; 7228 7229 7230 static 7231 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { 7232 (b)->buf = (b)->pos = (b)->localArray; 7233 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; 7234 } 7235 7236 static 7237 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { 7238 uint32_t oldSize; 7239 uint32_t newSize; 7240 uint32_t *newBuf; 7241 7242 ci->flags |= UCOL_ITER_ALLOCATED; 7243 oldSize = (uint32_t)(b->pos - b->buf); 7244 newSize = oldSize * 2; 7245 newBuf = 
(uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); 7246 if(newBuf == NULL) { 7247 *status = U_MEMORY_ALLOCATION_ERROR; 7248 } 7249 else { 7250 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); 7251 if (b->buf != b->localArray) { 7252 uprv_free(b->buf); 7253 } 7254 b->buf = newBuf; 7255 b->endp = b->buf + newSize; 7256 b->pos = b->buf + oldSize; 7257 } 7258 } 7259 7260 static 7261 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) { 7262 if (b->pos == b->endp) { 7263 ucol_CEBuf_Expand(b, ci, status); 7264 } 7265 if (U_SUCCESS(*status)) { 7266 *(b)->pos++ = ce; 7267 } 7268 } 7269 7270 /* This is a trick string compare function that goes in and uses sortkeys to compare */ 7271 /* It is used when compare gets in trouble and needs to bail out */ 7272 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, 7273 collIterate *tColl, 7274 UErrorCode *status) 7275 { 7276 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; 7277 uint8_t *sourceKeyP = sourceKey; 7278 uint8_t *targetKeyP = targetKey; 7279 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; 7280 const UCollator *coll = sColl->coll; 7281 const UChar *source = NULL; 7282 const UChar *target = NULL; 7283 int32_t result = UCOL_EQUAL; 7284 UnicodeString sourceString, targetString; 7285 int32_t sourceLength; 7286 int32_t targetLength; 7287 7288 if(sColl->flags & UCOL_USE_ITERATOR) { 7289 sColl->iterator->move(sColl->iterator, 0, UITER_START); 7290 tColl->iterator->move(tColl->iterator, 0, UITER_START); 7291 UChar32 c; 7292 while((c=sColl->iterator->next(sColl->iterator))>=0) { 7293 sourceString.append((UChar)c); 7294 } 7295 while((c=tColl->iterator->next(tColl->iterator))>=0) { 7296 targetString.append((UChar)c); 7297 } 7298 source = sourceString.getBuffer(); 7299 sourceLength = sourceString.length(); 7300 target = targetString.getBuffer(); 7301 targetLength = targetString.length(); 7302 } else { // no iterators 7303 
sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1; 7304 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1; 7305 source = sColl->string; 7306 target = tColl->string; 7307 } 7308 7309 7310 7311 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 7312 if(sourceKeyLen > UCOL_MAX_BUFFER) { 7313 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); 7314 if(sourceKeyP == NULL) { 7315 *status = U_MEMORY_ALLOCATION_ERROR; 7316 goto cleanup_and_do_compare; 7317 } 7318 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 7319 } 7320 7321 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 7322 if(targetKeyLen > UCOL_MAX_BUFFER) { 7323 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); 7324 if(targetKeyP == NULL) { 7325 *status = U_MEMORY_ALLOCATION_ERROR; 7326 goto cleanup_and_do_compare; 7327 } 7328 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 7329 } 7330 7331 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); 7332 7333 cleanup_and_do_compare: 7334 if(sourceKeyP != NULL && sourceKeyP != sourceKey) { 7335 uprv_free(sourceKeyP); 7336 } 7337 7338 if(targetKeyP != NULL && targetKeyP != targetKey) { 7339 uprv_free(targetKeyP); 7340 } 7341 7342 if(result<0) { 7343 return UCOL_LESS; 7344 } else if(result>0) { 7345 return UCOL_GREATER; 7346 } else { 7347 return UCOL_EQUAL; 7348 } 7349 } 7350 7351 7352 static UCollationResult 7353 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status) 7354 { 7355 U_ALIGN_CODE(16); 7356 7357 const UCollator *coll = sColl->coll; 7358 7359 7360 // setting up the collator parameters 7361 UColAttributeValue strength = coll->strength; 7362 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); 7363 7364 UBool checkSecTer = initialCheckSecTer; 7365 UBool checkTertiary = 
(strength >= UCOL_TERTIARY); 7366 UBool checkQuad = (strength >= UCOL_QUATERNARY); 7367 UBool checkIdent = (strength == UCOL_IDENTICAL); 7368 UBool checkCase = (coll->caseLevel == UCOL_ON); 7369 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; 7370 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 7371 UBool qShifted = shifted && checkQuad; 7372 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; 7373 7374 if(doHiragana && shifted) { 7375 return (ucol_compareUsingSortKeys(sColl, tColl, status)); 7376 } 7377 uint8_t caseSwitch = coll->caseSwitch; 7378 uint8_t tertiaryMask = coll->tertiaryMask; 7379 7380 // This is the lowest primary value that will not be ignored if shifted 7381 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; 7382 7383 UCollationResult result = UCOL_EQUAL; 7384 UCollationResult hirResult = UCOL_EQUAL; 7385 7386 // Preparing the CE buffers. They will be filled during the primary phase 7387 ucol_CEBuf sCEs; 7388 ucol_CEBuf tCEs; 7389 UCOL_INIT_CEBUF(&sCEs); 7390 UCOL_INIT_CEBUF(&tCEs); 7391 7392 uint32_t secS = 0, secT = 0; 7393 uint32_t sOrder=0, tOrder=0; 7394 7395 // Non shifted primary processing is quite simple 7396 if(!shifted) { 7397 for(;;) { 7398 7399 // We fetch CEs until we hit a non ignorable primary or end. 7400 do { 7401 // We get the next CE 7402 sOrder = ucol_IGetNextCE(coll, sColl, status); 7403 // Stuff it in the buffer 7404 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7405 // And keep just the primary part. 
7406 sOrder &= UCOL_PRIMARYMASK; 7407 } while(sOrder == 0); 7408 7409 // see the comments on the above block 7410 do { 7411 tOrder = ucol_IGetNextCE(coll, tColl, status); 7412 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7413 tOrder &= UCOL_PRIMARYMASK; 7414 } while(tOrder == 0); 7415 7416 // if both primaries are the same 7417 if(sOrder == tOrder) { 7418 // and there are no more CEs, we advance to the next level 7419 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 7420 break; 7421 } 7422 if(doHiragana && hirResult == UCOL_EQUAL) { 7423 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) { 7424 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA)) 7425 ? UCOL_LESS:UCOL_GREATER; 7426 } 7427 } 7428 } else { 7429 // if two primaries are different, we are done 7430 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER; 7431 goto commonReturn; 7432 } 7433 } // no primary difference... do the rest from the buffers 7434 } else { // shifted - do a slightly more complicated processing :) 7435 for(;;) { 7436 UBool sInShifted = FALSE; 7437 UBool tInShifted = FALSE; 7438 // This version of code can be refactored. However, it seems easier to understand this way. 7439 // Source loop. Sam as the target loop. 
7440 for(;;) { 7441 sOrder = ucol_IGetNextCE(coll, sColl, status); 7442 if(sOrder == UCOL_NO_MORE_CES) { 7443 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7444 break; 7445 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) { 7446 /* UCA amendment - ignore ignorables that follow shifted code points */ 7447 continue; 7448 } else if(isContinuation(sOrder)) { 7449 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ 7450 if(sInShifted) { 7451 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 7452 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7453 continue; 7454 } else { 7455 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7456 break; 7457 } 7458 } else { /* Just lower level values */ 7459 if(sInShifted) { 7460 continue; 7461 } else { 7462 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7463 continue; 7464 } 7465 } 7466 } else { /* regular */ 7467 if((sOrder & UCOL_PRIMARYMASK) > LVT) { 7468 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7469 break; 7470 } else { 7471 if((sOrder & UCOL_PRIMARYMASK) > 0) { 7472 sInShifted = TRUE; 7473 sOrder &= UCOL_PRIMARYMASK; 7474 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7475 continue; 7476 } else { 7477 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7478 sInShifted = FALSE; 7479 continue; 7480 } 7481 } 7482 } 7483 } 7484 sOrder &= UCOL_PRIMARYMASK; 7485 sInShifted = FALSE; 7486 7487 for(;;) { 7488 tOrder = ucol_IGetNextCE(coll, tColl, status); 7489 if(tOrder == UCOL_NO_MORE_CES) { 7490 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7491 break; 7492 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) { 7493 /* UCA amendment - ignore ignorables that follow shifted code points */ 7494 continue; 7495 } else if(isContinuation(tOrder)) { 7496 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ 7497 if(tInShifted) { 7498 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 7499 UCOL_CEBUF_PUT(&tCEs, 
tOrder, tColl, status); 7500 continue; 7501 } else { 7502 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7503 break; 7504 } 7505 } else { /* Just lower level values */ 7506 if(tInShifted) { 7507 continue; 7508 } else { 7509 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7510 continue; 7511 } 7512 } 7513 } else { /* regular */ 7514 if((tOrder & UCOL_PRIMARYMASK) > LVT) { 7515 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7516 break; 7517 } else { 7518 if((tOrder & UCOL_PRIMARYMASK) > 0) { 7519 tInShifted = TRUE; 7520 tOrder &= UCOL_PRIMARYMASK; 7521 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7522 continue; 7523 } else { 7524 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7525 tInShifted = FALSE; 7526 continue; 7527 } 7528 } 7529 } 7530 } 7531 tOrder &= UCOL_PRIMARYMASK; 7532 tInShifted = FALSE; 7533 7534 if(sOrder == tOrder) { 7535 /* 7536 if(doHiragana && hirResult == UCOL_EQUAL) { 7537 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) { 7538 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA)) 7539 ? UCOL_LESS:UCOL_GREATER; 7540 } 7541 } 7542 */ 7543 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 7544 break; 7545 } else { 7546 sOrder = 0; 7547 tOrder = 0; 7548 continue; 7549 } 7550 } else { 7551 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; 7552 goto commonReturn; 7553 } 7554 } /* no primary difference... 
do the rest from the buffers */ 7555 } 7556 7557 /* now, we're gonna reexamine collected CEs */ 7558 uint32_t *sCE; 7559 uint32_t *tCE; 7560 7561 /* This is the secondary level of comparison */ 7562 if(checkSecTer) { 7563 if(!isFrenchSec) { /* normal */ 7564 sCE = sCEs.buf; 7565 tCE = tCEs.buf; 7566 for(;;) { 7567 while (secS == 0) { 7568 secS = *(sCE++) & UCOL_SECONDARYMASK; 7569 } 7570 7571 while(secT == 0) { 7572 secT = *(tCE++) & UCOL_SECONDARYMASK; 7573 } 7574 7575 if(secS == secT) { 7576 if(secS == UCOL_NO_MORE_CES_SECONDARY) { 7577 break; 7578 } else { 7579 secS = 0; secT = 0; 7580 continue; 7581 } 7582 } else { 7583 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7584 goto commonReturn; 7585 } 7586 } 7587 } else { /* do the French */ 7588 uint32_t *sCESave = NULL; 7589 uint32_t *tCESave = NULL; 7590 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */ 7591 tCE = tCEs.pos-2; 7592 for(;;) { 7593 while (secS == 0 && sCE >= sCEs.buf) { 7594 if(sCESave == 0) { 7595 secS = *(sCE--); 7596 if(isContinuation(secS)) { 7597 while(isContinuation(secS = *(sCE--))) 7598 ; 7599 /* after this, secS has the start of continuation, and sCEs points before that */ 7600 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */ 7601 sCE+=2; /* need to point to the first continuation CP */ 7602 /* However, now you can just continue doing stuff */ 7603 } 7604 } else { 7605 secS = *(sCE++); 7606 if(!isContinuation(secS)) { /* This means we have finished with this cont */ 7607 sCE = sCESave; /* reset the pointer to before continuation */ 7608 sCESave = 0; 7609 continue; 7610 } 7611 } 7612 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 7613 } 7614 7615 while(secT == 0 && tCE >= tCEs.buf) { 7616 if(tCESave == 0) { 7617 secT = *(tCE--); 7618 if(isContinuation(secT)) { 7619 while(isContinuation(secT = *(tCE--))) 7620 ; 7621 /* after this, secS has the start of continuation, and sCEs points before that */ 
7622 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */ 7623 tCE+=2; /* need to point to the first continuation CP */ 7624 /* However, now you can just continue doing stuff */ 7625 } 7626 } else { 7627 secT = *(tCE++); 7628 if(!isContinuation(secT)) { /* This means we have finished with this cont */ 7629 tCE = tCESave; /* reset the pointer to before continuation */ 7630 tCESave = 0; 7631 continue; 7632 } 7633 } 7634 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 7635 } 7636 7637 if(secS == secT) { 7638 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) { 7639 break; 7640 } else { 7641 secS = 0; secT = 0; 7642 continue; 7643 } 7644 } else { 7645 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7646 goto commonReturn; 7647 } 7648 } 7649 } 7650 } 7651 7652 /* doing the case bit */ 7653 if(checkCase) { 7654 sCE = sCEs.buf; 7655 tCE = tCEs.buf; 7656 for(;;) { 7657 while((secS & UCOL_REMOVE_CASE) == 0) { 7658 if(!isContinuation(*sCE++)) { 7659 secS =*(sCE-1); 7660 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 7661 // primary ignorables should not be considered on the case level when the strength is primary 7662 // otherwise, the CEs stop being well-formed 7663 secS &= UCOL_TERT_CASE_MASK; 7664 secS ^= caseSwitch; 7665 } else { 7666 secS = 0; 7667 } 7668 } else { 7669 secS = 0; 7670 } 7671 } 7672 7673 while((secT & UCOL_REMOVE_CASE) == 0) { 7674 if(!isContinuation(*tCE++)) { 7675 secT = *(tCE-1); 7676 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 7677 // primary ignorables should not be considered on the case level when the strength is primary 7678 // otherwise, the CEs stop being well-formed 7679 secT &= UCOL_TERT_CASE_MASK; 7680 secT ^= caseSwitch; 7681 } else { 7682 secT = 0; 7683 } 7684 } else { 7685 secT = 0; 7686 } 7687 } 7688 7689 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { 7690 result = UCOL_LESS; 7691 goto 
commonReturn; 7692 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) { 7693 result = UCOL_GREATER; 7694 goto commonReturn; 7695 } 7696 7697 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { 7698 break; 7699 } else { 7700 secS = 0; 7701 secT = 0; 7702 } 7703 } 7704 } 7705 7706 /* Tertiary level */ 7707 if(checkTertiary) { 7708 secS = 0; 7709 secT = 0; 7710 sCE = sCEs.buf; 7711 tCE = tCEs.buf; 7712 for(;;) { 7713 while((secS & UCOL_REMOVE_CASE) == 0) { 7714 secS = *(sCE++) & tertiaryMask; 7715 if(!isContinuation(secS)) { 7716 secS ^= caseSwitch; 7717 } else { 7718 secS &= UCOL_REMOVE_CASE; 7719 } 7720 } 7721 7722 while((secT & UCOL_REMOVE_CASE) == 0) { 7723 secT = *(tCE++) & tertiaryMask; 7724 if(!isContinuation(secT)) { 7725 secT ^= caseSwitch; 7726 } else { 7727 secT &= UCOL_REMOVE_CASE; 7728 } 7729 } 7730 7731 if(secS == secT) { 7732 if((secS & UCOL_REMOVE_CASE) == 1) { 7733 break; 7734 } else { 7735 secS = 0; secT = 0; 7736 continue; 7737 } 7738 } else { 7739 result = (secS < secT) ? 
UCOL_LESS : UCOL_GREATER; 7740 goto commonReturn; 7741 } 7742 } 7743 } 7744 7745 7746 if(qShifted /*checkQuad*/) { 7747 UBool sInShifted = TRUE; 7748 UBool tInShifted = TRUE; 7749 secS = 0; 7750 secT = 0; 7751 sCE = sCEs.buf; 7752 tCE = tCEs.buf; 7753 for(;;) { 7754 while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) { 7755 secS = *(sCE++); 7756 if(isContinuation(secS)) { 7757 if(!sInShifted) { 7758 continue; 7759 } 7760 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */ 7761 secS = UCOL_PRIMARYMASK; 7762 sInShifted = FALSE; 7763 } else { 7764 sInShifted = TRUE; 7765 } 7766 } 7767 secS &= UCOL_PRIMARYMASK; 7768 7769 7770 while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) { 7771 secT = *(tCE++); 7772 if(isContinuation(secT)) { 7773 if(!tInShifted) { 7774 continue; 7775 } 7776 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { 7777 secT = UCOL_PRIMARYMASK; 7778 tInShifted = FALSE; 7779 } else { 7780 tInShifted = TRUE; 7781 } 7782 } 7783 secT &= UCOL_PRIMARYMASK; 7784 7785 if(secS == secT) { 7786 if(secS == UCOL_NO_MORE_CES_PRIMARY) { 7787 break; 7788 } else { 7789 secS = 0; secT = 0; 7790 continue; 7791 } 7792 } else { 7793 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7794 goto commonReturn; 7795 } 7796 } 7797 } else if(doHiragana && hirResult != UCOL_EQUAL) { 7798 // If we're fine on quaternaries, we might be different 7799 // on Hiragana. This, however, might fail us in shifted. 7800 result = hirResult; 7801 goto commonReturn; 7802 } 7803 7804 /* For IDENTICAL comparisons, we use a bitwise character comparison */ 7805 /* as a tiebreaker if all else is equal. */ 7806 /* Getting here should be quite rare - strings are not identical - */ 7807 /* that is checked first, but compared == through all other checks. 
*/ 7808 if(checkIdent) 7809 { 7810 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON); 7811 result = ucol_checkIdent(sColl, tColl, TRUE, status); 7812 } 7813 7814 commonReturn: 7815 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { 7816 if (sCEs.buf != sCEs.localArray ) { 7817 uprv_free(sCEs.buf); 7818 } 7819 if (tCEs.buf != tCEs.localArray ) { 7820 uprv_free(tCEs.buf); 7821 } 7822 } 7823 7824 return result; 7825 } 7826 7827 static UCollationResult 7828 ucol_strcollRegular(const UCollator *coll, 7829 const UChar *source, int32_t sourceLength, 7830 const UChar *target, int32_t targetLength, 7831 UErrorCode *status) { 7832 collIterate sColl, tColl; 7833 // Preparing the context objects for iterating over strings 7834 IInit_collIterate(coll, source, sourceLength, &sColl, status); 7835 IInit_collIterate(coll, target, targetLength, &tColl, status); 7836 if(U_FAILURE(*status)) { 7837 return UCOL_LESS; 7838 } 7839 return ucol_strcollRegular(&sColl, &tColl, status); 7840 } 7841 7842 static inline uint32_t 7843 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, 7844 uint32_t CE, const UChar *s, int32_t *index, int32_t len) 7845 { 7846 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); 7847 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; 7848 int32_t offset = 1; 7849 UChar schar = 0, tchar = 0; 7850 7851 for(;;) { 7852 if(len == -1) { 7853 if(s[*index] == 0) { // end of string 7854 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7855 } else { 7856 schar = s[*index]; 7857 } 7858 } else { 7859 if(*index == len) { 7860 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7861 } else { 7862 schar = s[*index]; 7863 } 7864 } 7865 7866 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 7867 offset++; 7868 } 7869 7870 if (schar == tchar) { 7871 (*index)++; 7872 
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 7873 } 7874 else 7875 { 7876 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 7877 return UCOL_BAIL_OUT_CE; 7878 } 7879 // skip completely ignorables 7880 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 7881 if(isZeroCE == 0) { // we have to ignore completely ignorables 7882 (*index)++; 7883 continue; 7884 } 7885 7886 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7887 } 7888 } 7889 } 7890 7891 7892 /** 7893 * This is a fast strcoll, geared towards text in Latin-1. 7894 * It supports contractions of size two, French secondaries 7895 * and case switching. You can use it with strengths primary 7896 * to tertiary. It does not support shifted and case level. 7897 * It relies on the table build by setupLatin1Table. If it 7898 * doesn't understand something, it will go to the regular 7899 * strcoll. 7900 */ 7901 static UCollationResult 7902 ucol_strcollUseLatin1( const UCollator *coll, 7903 const UChar *source, 7904 int32_t sLen, 7905 const UChar *target, 7906 int32_t tLen, 7907 UErrorCode *status) 7908 { 7909 U_ALIGN_CODE(16); 7910 int32_t strength = coll->strength; 7911 7912 int32_t sIndex = 0, tIndex = 0; 7913 UChar sChar = 0, tChar = 0; 7914 uint32_t sOrder=0, tOrder=0; 7915 7916 UBool endOfSource = FALSE; 7917 7918 uint32_t *elements = coll->latinOneCEs; 7919 7920 UBool haveContractions = FALSE; // if we have contractions in our string 7921 // we cannot do French secondary 7922 7923 // Do the primary level 7924 for(;;) { 7925 while(sOrder==0) { // this loop skips primary ignorables 7926 // sOrder=getNextlatinOneCE(source); 7927 if(sLen==-1) { // handling zero terminated strings 7928 sChar=source[sIndex++]; 7929 if(sChar==0) { 7930 endOfSource = TRUE; 7931 break; 7932 } 7933 } else { // handling strings with known length 7934 if(sIndex==sLen) { 7935 endOfSource = TRUE; 7936 break; 7937 } 7938 sChar=source[sIndex++]; 7939 } 7940 
if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 7941 //fprintf(stderr, "R"); 7942 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 7943 } 7944 sOrder = elements[sChar]; 7945 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special 7946 // specials can basically be either contractions or bail-out signs. If we get anything 7947 // else, we'll bail out anywasy 7948 if(getCETag(sOrder) == CONTRACTION_TAG) { 7949 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); 7950 haveContractions = TRUE; // if there are contractions, we cannot do French secondary 7951 // However, if there are contractions in the table, but we always use just one char, 7952 // we might be able to do French. This should be checked out. 7953 } 7954 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 7955 //fprintf(stderr, "S"); 7956 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 7957 } 7958 } 7959 } 7960 7961 while(tOrder==0) { // this loop skips primary ignorables 7962 // tOrder=getNextlatinOneCE(target); 7963 if(tLen==-1) { // handling zero terminated strings 7964 tChar=target[tIndex++]; 7965 if(tChar==0) { 7966 if(endOfSource) { // this is different than source loop, 7967 // as we already know that source loop is done here, 7968 // so we can either finish the primary loop if both 7969 // strings are done or anounce the result if only 7970 // target is done. Same below. 
7971 goto endOfPrimLoop; 7972 } else { 7973 return UCOL_GREATER; 7974 } 7975 } 7976 } else { // handling strings with known length 7977 if(tIndex==tLen) { 7978 if(endOfSource) { 7979 goto endOfPrimLoop; 7980 } else { 7981 return UCOL_GREATER; 7982 } 7983 } 7984 tChar=target[tIndex++]; 7985 } 7986 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 7987 //fprintf(stderr, "R"); 7988 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 7989 } 7990 tOrder = elements[tChar]; 7991 if(tOrder >= UCOL_NOT_FOUND) { 7992 // Handling specials, see the comments for source 7993 if(getCETag(tOrder) == CONTRACTION_TAG) { 7994 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); 7995 haveContractions = TRUE; 7996 } 7997 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 7998 //fprintf(stderr, "S"); 7999 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8000 } 8001 } 8002 } 8003 if(endOfSource) { // source is finished, but target is not, say the result. 
8004 return UCOL_LESS; 8005 } 8006 8007 if(sOrder == tOrder) { // if we have same CEs, we continue the loop 8008 sOrder = 0; tOrder = 0; 8009 continue; 8010 } else { 8011 // compare current top bytes 8012 if(((sOrder^tOrder)&0xFF000000)!=0) { 8013 // top bytes differ, return difference 8014 if(sOrder < tOrder) { 8015 return UCOL_LESS; 8016 } else if(sOrder > tOrder) { 8017 return UCOL_GREATER; 8018 } 8019 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); 8020 // since we must return enum value 8021 } 8022 8023 // top bytes match, continue with following bytes 8024 sOrder<<=8; 8025 tOrder<<=8; 8026 } 8027 } 8028 8029 endOfPrimLoop: 8030 // after primary loop, we definitely know the sizes of strings, 8031 // so we set it and use simpler loop for secondaries and tertiaries 8032 sLen = sIndex; tLen = tIndex; 8033 if(strength >= UCOL_SECONDARY) { 8034 // adjust the table beggining 8035 elements += coll->latinOneTableLen; 8036 endOfSource = FALSE; 8037 8038 if(coll->frenchCollation == UCOL_OFF) { // non French 8039 // This loop is a simplified copy of primary loop 8040 // at this point we know that whole strings are latin-1, so we don't 8041 // check for that. We also know that we only have contractions as 8042 // specials. 
8043 sIndex = 0; tIndex = 0; 8044 for(;;) { 8045 while(sOrder==0) { 8046 if(sIndex==sLen) { 8047 endOfSource = TRUE; 8048 break; 8049 } 8050 sChar=source[sIndex++]; 8051 sOrder = elements[sChar]; 8052 if(sOrder > UCOL_NOT_FOUND) { 8053 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); 8054 } 8055 } 8056 8057 while(tOrder==0) { 8058 if(tIndex==tLen) { 8059 if(endOfSource) { 8060 goto endOfSecLoop; 8061 } else { 8062 return UCOL_GREATER; 8063 } 8064 } 8065 tChar=target[tIndex++]; 8066 tOrder = elements[tChar]; 8067 if(tOrder > UCOL_NOT_FOUND) { 8068 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); 8069 } 8070 } 8071 if(endOfSource) { 8072 return UCOL_LESS; 8073 } 8074 8075 if(sOrder == tOrder) { 8076 sOrder = 0; tOrder = 0; 8077 continue; 8078 } else { 8079 // see primary loop for comments on this 8080 if(((sOrder^tOrder)&0xFF000000)!=0) { 8081 if(sOrder < tOrder) { 8082 return UCOL_LESS; 8083 } else if(sOrder > tOrder) { 8084 return UCOL_GREATER; 8085 } 8086 } 8087 sOrder<<=8; 8088 tOrder<<=8; 8089 } 8090 } 8091 } else { // French 8092 if(haveContractions) { // if we have contractions, we have to bail out 8093 // since we don't really know how to handle them here 8094 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8095 } 8096 // For French, we go backwards 8097 sIndex = sLen; tIndex = tLen; 8098 for(;;) { 8099 while(sOrder==0) { 8100 if(sIndex==0) { 8101 endOfSource = TRUE; 8102 break; 8103 } 8104 sChar=source[--sIndex]; 8105 sOrder = elements[sChar]; 8106 // don't even look for contractions 8107 } 8108 8109 while(tOrder==0) { 8110 if(tIndex==0) { 8111 if(endOfSource) { 8112 goto endOfSecLoop; 8113 } else { 8114 return UCOL_GREATER; 8115 } 8116 } 8117 tChar=target[--tIndex]; 8118 tOrder = elements[tChar]; 8119 // don't even look for contractions 8120 } 8121 if(endOfSource) { 8122 return UCOL_LESS; 8123 } 8124 8125 if(sOrder == tOrder) { 8126 sOrder = 0; tOrder = 
0; 8127 continue; 8128 } else { 8129 // see the primary loop for comments 8130 if(((sOrder^tOrder)&0xFF000000)!=0) { 8131 if(sOrder < tOrder) { 8132 return UCOL_LESS; 8133 } else if(sOrder > tOrder) { 8134 return UCOL_GREATER; 8135 } 8136 } 8137 sOrder<<=8; 8138 tOrder<<=8; 8139 } 8140 } 8141 } 8142 } 8143 8144 endOfSecLoop: 8145 if(strength >= UCOL_TERTIARY) { 8146 // tertiary loop is the same as secondary (except no French) 8147 elements += coll->latinOneTableLen; 8148 sIndex = 0; tIndex = 0; 8149 endOfSource = FALSE; 8150 for(;;) { 8151 while(sOrder==0) { 8152 if(sIndex==sLen) { 8153 endOfSource = TRUE; 8154 break; 8155 } 8156 sChar=source[sIndex++]; 8157 sOrder = elements[sChar]; 8158 if(sOrder > UCOL_NOT_FOUND) { 8159 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); 8160 } 8161 } 8162 while(tOrder==0) { 8163 if(tIndex==tLen) { 8164 if(endOfSource) { 8165 return UCOL_EQUAL; // if both strings are at the end, they are equal 8166 } else { 8167 return UCOL_GREATER; 8168 } 8169 } 8170 tChar=target[tIndex++]; 8171 tOrder = elements[tChar]; 8172 if(tOrder > UCOL_NOT_FOUND) { 8173 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); 8174 } 8175 } 8176 if(endOfSource) { 8177 return UCOL_LESS; 8178 } 8179 if(sOrder == tOrder) { 8180 sOrder = 0; tOrder = 0; 8181 continue; 8182 } else { 8183 if(((sOrder^tOrder)&0xff000000)!=0) { 8184 if(sOrder < tOrder) { 8185 return UCOL_LESS; 8186 } else if(sOrder > tOrder) { 8187 return UCOL_GREATER; 8188 } 8189 } 8190 sOrder<<=8; 8191 tOrder<<=8; 8192 } 8193 } 8194 } 8195 return UCOL_EQUAL; 8196 } 8197 8198 8199 U_CAPI UCollationResult U_EXPORT2 8200 ucol_strcollIter( const UCollator *coll, 8201 UCharIterator *sIter, 8202 UCharIterator *tIter, 8203 UErrorCode *status) 8204 { 8205 if(!status || U_FAILURE(*status)) { 8206 return UCOL_EQUAL; 8207 } 8208 8209 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); 8210 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", 
coll, sIter, tIter); 8211 8212 if (sIter == tIter) { 8213 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8214 return UCOL_EQUAL; 8215 } 8216 if(sIter == NULL || tIter == NULL || coll == NULL) { 8217 *status = U_ILLEGAL_ARGUMENT_ERROR; 8218 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8219 return UCOL_EQUAL; 8220 } 8221 8222 UCollationResult result = UCOL_EQUAL; 8223 8224 // Preparing the context objects for iterating over strings 8225 collIterate sColl, tColl; 8226 IInit_collIterate(coll, NULL, -1, &sColl, status); 8227 IInit_collIterate(coll, NULL, -1, &tColl, status); 8228 if(U_FAILURE(*status)) { 8229 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8230 return UCOL_EQUAL; 8231 } 8232 // The division for the array length may truncate the array size to 8233 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 8234 // for all platforms anyway. 8235 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8236 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8237 UNormIterator *sNormIter = NULL, *tNormIter = NULL; 8238 8239 sColl.iterator = sIter; 8240 sColl.flags |= UCOL_USE_ITERATOR; 8241 tColl.flags |= UCOL_USE_ITERATOR; 8242 tColl.iterator = tIter; 8243 8244 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 8245 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 8246 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); 8247 sColl.flags &= ~UCOL_ITER_NORM; 8248 8249 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 8250 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); 8251 tColl.flags &= ~UCOL_ITER_NORM; 8252 } 8253 8254 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; 8255 8256 while((sChar = sColl.iterator->next(sColl.iterator)) == 8257 (tChar = tColl.iterator->next(tColl.iterator))) { 8258 if(sChar == U_SENTINEL) { 8259 result = UCOL_EQUAL; 8260 goto end_compare; 8261 } 8262 } 8263 8264 if(sChar 
== U_SENTINEL) { 8265 tChar = tColl.iterator->previous(tColl.iterator); 8266 } 8267 8268 if(tChar == U_SENTINEL) { 8269 sChar = sColl.iterator->previous(sColl.iterator); 8270 } 8271 8272 sChar = sColl.iterator->previous(sColl.iterator); 8273 tChar = tColl.iterator->previous(tColl.iterator); 8274 8275 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) 8276 { 8277 // We are stopped in the middle of a contraction. 8278 // Scan backwards through the == part of the string looking for the start of the contraction. 8279 // It doesn't matter which string we scan, since they are the same in this region. 8280 do 8281 { 8282 sChar = sColl.iterator->previous(sColl.iterator); 8283 tChar = tColl.iterator->previous(tColl.iterator); 8284 } 8285 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); 8286 } 8287 8288 8289 if(U_SUCCESS(*status)) { 8290 result = ucol_strcollRegular(&sColl, &tColl, status); 8291 } 8292 8293 end_compare: 8294 if(sNormIter || tNormIter) { 8295 unorm_closeIter(sNormIter); 8296 unorm_closeIter(tNormIter); 8297 } 8298 8299 UTRACE_EXIT_VALUE_STATUS(result, *status) 8300 return result; 8301 } 8302 8303 8304 /* */ 8305 /* ucol_strcoll Main public API string comparison function */ 8306 /* */ 8307 U_CAPI UCollationResult U_EXPORT2 8308 ucol_strcoll( const UCollator *coll, 8309 const UChar *source, 8310 int32_t sourceLength, 8311 const UChar *target, 8312 int32_t targetLength) 8313 { 8314 U_ALIGN_CODE(16); 8315 8316 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); 8317 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 8318 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 8319 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); 8320 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); 8321 } 8322 8323 if(source == NULL || target == NULL) { 8324 // do not crash, but return. Should have 8325 // status argument to return error. 
8326 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8327 return UCOL_EQUAL; 8328 } 8329 8330 /* Quick check if source and target are same strings. */ 8331 /* They should either both be NULL terminated or the explicit length should be set on both. */ 8332 if (source==target && sourceLength==targetLength) { 8333 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8334 return UCOL_EQUAL; 8335 } 8336 8337 /* Scan the strings. Find: */ 8338 /* The length of any leading portion that is equal */ 8339 /* Whether they are exactly equal. (in which case we just return) */ 8340 const UChar *pSrc = source; 8341 const UChar *pTarg = target; 8342 int32_t equalLength; 8343 8344 if (sourceLength == -1 && targetLength == -1) { 8345 // Both strings are null terminated. 8346 // Scan through any leading equal portion. 8347 while (*pSrc == *pTarg && *pSrc != 0) { 8348 pSrc++; 8349 pTarg++; 8350 } 8351 if (*pSrc == 0 && *pTarg == 0) { 8352 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8353 return UCOL_EQUAL; 8354 } 8355 equalLength = (int32_t)(pSrc - source); 8356 } 8357 else 8358 { 8359 // One or both strings has an explicit length. 8360 const UChar *pSrcEnd = source + sourceLength; 8361 const UChar *pTargEnd = target + targetLength; 8362 8363 // Scan while the strings are bitwise ==, or until one is exhausted. 8364 for (;;) { 8365 if (pSrc == pSrcEnd || pTarg == pTargEnd) { 8366 break; 8367 } 8368 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 8369 break; 8370 } 8371 if (*pSrc != *pTarg) { 8372 break; 8373 } 8374 pSrc++; 8375 pTarg++; 8376 } 8377 equalLength = (int32_t)(pSrc - source); 8378 8379 // If we made it all the way through both strings, we are done. They are == 8380 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. 
*/ 8381 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */ 8382 { 8383 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8384 return UCOL_EQUAL; 8385 } 8386 } 8387 if (equalLength > 0) { 8388 /* There is an identical portion at the beginning of the two strings. */ 8389 /* If the identical portion ends within a contraction or a comibining */ 8390 /* character sequence, back up to the start of that sequence. */ 8391 8392 // These values should already be set by the code above. 8393 //pSrc = source + equalLength; /* point to the first differing chars */ 8394 //pTarg = target + equalLength; 8395 if (pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll) || 8396 pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)) 8397 { 8398 // We are stopped in the middle of a contraction. 8399 // Scan backwards through the == part of the string looking for the start of the contraction. 8400 // It doesn't matter which string we scan, since they are the same in this region. 8401 do 8402 { 8403 equalLength--; 8404 pSrc--; 8405 } 8406 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); 8407 } 8408 8409 source += equalLength; 8410 target += equalLength; 8411 if (sourceLength > 0) { 8412 sourceLength -= equalLength; 8413 } 8414 if (targetLength > 0) { 8415 targetLength -= equalLength; 8416 } 8417 } 8418 8419 UErrorCode status = U_ZERO_ERROR; 8420 UCollationResult returnVal; 8421 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { 8422 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status); 8423 } else { 8424 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); 8425 } 8426 UTRACE_EXIT_VALUE(returnVal); 8427 return returnVal; 8428 } 8429 8430 /* convenience function for comparing strings */ 8431 U_CAPI UBool U_EXPORT2 8432 ucol_greater( const UCollator *coll, 8433 const UChar *source, 8434 int32_t sourceLength, 8435 const UChar 
*target, 8436 int32_t targetLength) 8437 { 8438 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8439 == UCOL_GREATER); 8440 } 8441 8442 /* convenience function for comparing strings */ 8443 U_CAPI UBool U_EXPORT2 8444 ucol_greaterOrEqual( const UCollator *coll, 8445 const UChar *source, 8446 int32_t sourceLength, 8447 const UChar *target, 8448 int32_t targetLength) 8449 { 8450 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8451 != UCOL_LESS); 8452 } 8453 8454 /* convenience function for comparing strings */ 8455 U_CAPI UBool U_EXPORT2 8456 ucol_equal( const UCollator *coll, 8457 const UChar *source, 8458 int32_t sourceLength, 8459 const UChar *target, 8460 int32_t targetLength) 8461 { 8462 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8463 == UCOL_EQUAL); 8464 } 8465 8466 U_CAPI void U_EXPORT2 8467 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { 8468 if(coll && coll->UCA) { 8469 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); 8470 } 8471 } 8472 8473 #endif /* #if !UCONFIG_NO_COLLATION */ 8474