1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucol.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * Modification history 12 * Date Name Comments 13 * 1996-1999 various members of ICU team maintained C API for collation framework 14 * 02/16/2001 synwee Added internal method getPrevSpecialCE 15 * 03/01/2001 synwee Added maxexpansion functionality. 16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_COLLATION 22 23 #include "unicode/bytestream.h" 24 #include "unicode/coleitr.h" 25 #include "unicode/unorm.h" 26 #include "unicode/udata.h" 27 #include "unicode/ustring.h" 28 29 #include "ucol_imp.h" 30 #include "bocsu.h" 31 32 #include "normalizer2impl.h" 33 #include "unorm_it.h" 34 #include "umutex.h" 35 #include "cmemory.h" 36 #include "ucln_in.h" 37 #include "cstring.h" 38 #include "utracimp.h" 39 #include "putilimp.h" 40 #include "uassert.h" 41 42 #ifdef UCOL_DEBUG 43 #include <stdio.h> 44 #endif 45 46 U_NAMESPACE_USE 47 48 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 49 50 #define LAST_BYTE_MASK_ 0xFF 51 #define SECOND_LAST_BYTE_SHIFT_ 8 52 53 #define ZERO_CC_LIMIT_ 0xC0 54 55 // this is static pointer to the normalizer fcdTrieIndex 56 // it is always the same between calls to u_cleanup 57 // and therefore writing to it is not synchronized. 58 // It is cleaned in ucol_cleanup 59 static const uint16_t *fcdTrieIndex=NULL; 60 // Code points at fcdHighStart and above have a zero FCD value. 
61 static UChar32 fcdHighStart = 0; 62 63 // These are values from UCA required for 64 // implicit generation and supressing sort key compression 65 // they should regularly be in the UCA, but if one 66 // is running without UCA, it could be a problem 67 static const int32_t maxRegularPrimary = 0x7A; 68 static const int32_t minImplicitPrimary = 0xE0; 69 static const int32_t maxImplicitPrimary = 0xE4; 70 71 U_CDECL_BEGIN 72 static UBool U_CALLCONV 73 ucol_cleanup(void) 74 { 75 fcdTrieIndex = NULL; 76 return TRUE; 77 } 78 79 static int32_t U_CALLCONV 80 _getFoldingOffset(uint32_t data) { 81 return (int32_t)(data&0xFFFFFF); 82 } 83 84 U_CDECL_END 85 86 // init FCD data 87 static inline 88 UBool initializeFCD(UErrorCode *status) { 89 if (fcdTrieIndex != NULL) { 90 return TRUE; 91 } else { 92 // The result is constant, until the library is reloaded. 93 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 94 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 95 return U_SUCCESS(*status); 96 } 97 } 98 99 static 100 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, 101 int32_t sourceLen, collIterate *s, 102 UErrorCode *status) 103 { 104 (s)->string = (s)->pos = sourceString; 105 (s)->origFlags = 0; 106 (s)->flags = 0; 107 if (sourceLen >= 0) { 108 s->flags |= UCOL_ITER_HASLEN; 109 (s)->endp = (UChar *)sourceString+sourceLen; 110 } 111 else { 112 /* change to enable easier checking for end of string for fcdpositon */ 113 (s)->endp = NULL; 114 } 115 (s)->extendCEs = NULL; 116 (s)->extendCEsSize = 0; 117 (s)->CEpos = (s)->toReturn = (s)->CEs; 118 (s)->offsetBuffer = NULL; 119 (s)->offsetBufferSize = 0; 120 (s)->offsetReturn = (s)->offsetStore = NULL; 121 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; 122 (s)->coll = (collator); 123 (s)->nfd = Normalizer2Factory::getNFDInstance(*status); 124 (s)->fcdPosition = 0; 125 if(collator->normalizationMode == UCOL_ON) { 126 (s)->flags |= UCOL_ITER_NORM; 127 } 128 
if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { 129 (s)->flags |= UCOL_HIRAGANA_Q; 130 } 131 (s)->iterator = NULL; 132 //(s)->iteratorIndex = 0; 133 } 134 135 U_CAPI void U_EXPORT2 136 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, 137 int32_t sourceLen, collIterate *s, 138 UErrorCode *status) { 139 /* Out-of-line version for use from other files. */ 140 IInit_collIterate(collator, sourceString, sourceLen, s, status); 141 } 142 143 U_CAPI collIterate * U_EXPORT2 144 uprv_new_collIterate(UErrorCode *status) { 145 if(U_FAILURE(*status)) { 146 return NULL; 147 } 148 collIterate *s = new collIterate; 149 if(s == NULL) { 150 *status = U_MEMORY_ALLOCATION_ERROR; 151 return NULL; 152 } 153 return s; 154 } 155 156 U_CAPI void U_EXPORT2 157 uprv_delete_collIterate(collIterate *s) { 158 delete s; 159 } 160 161 U_CAPI UBool U_EXPORT2 162 uprv_collIterateAtEnd(collIterate *s) { 163 return s == NULL || s->pos == s->endp; 164 } 165 166 /** 167 * Backup the state of the collIterate struct data 168 * @param data collIterate to backup 169 * @param backup storage 170 */ 171 static 172 inline void backupState(const collIterate *data, collIterateState *backup) 173 { 174 backup->fcdPosition = data->fcdPosition; 175 backup->flags = data->flags; 176 backup->origFlags = data->origFlags; 177 backup->pos = data->pos; 178 backup->bufferaddress = data->writableBuffer.getBuffer(); 179 backup->buffersize = data->writableBuffer.length(); 180 backup->iteratorMove = 0; 181 backup->iteratorIndex = 0; 182 if(data->iterator != NULL) { 183 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); 184 backup->iteratorIndex = data->iterator->getState(data->iterator); 185 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE 186 if(backup->iteratorIndex == UITER_NO_STATE) { 187 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { 188 
backup->iteratorMove++; 189 data->iterator->move(data->iterator, -1, UITER_CURRENT); 190 } 191 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 192 } 193 } 194 } 195 196 /** 197 * Loads the state into the collIterate struct data 198 * @param data collIterate to backup 199 * @param backup storage 200 * @param forwards boolean to indicate if forwards iteration is used, 201 * false indicates backwards iteration 202 */ 203 static 204 inline void loadState(collIterate *data, const collIterateState *backup, 205 UBool forwards) 206 { 207 UErrorCode status = U_ZERO_ERROR; 208 data->flags = backup->flags; 209 data->origFlags = backup->origFlags; 210 if(data->iterator != NULL) { 211 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); 212 data->iterator->setState(data->iterator, backup->iteratorIndex, &status); 213 if(backup->iteratorMove != 0) { 214 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 215 } 216 } 217 data->pos = backup->pos; 218 219 if ((data->flags & UCOL_ITER_INNORMBUF) && 220 data->writableBuffer.getBuffer() != backup->bufferaddress) { 221 /* 222 this is when a new buffer has been reallocated and we'll have to 223 calculate the new position. 224 note the new buffer has to contain the contents of the old buffer. 225 */ 226 if (forwards) { 227 data->pos = data->writableBuffer.getTerminatedBuffer() + 228 (data->pos - backup->bufferaddress); 229 } 230 else { 231 /* backwards direction */ 232 int32_t temp = backup->buffersize - 233 (int32_t)(data->pos - backup->bufferaddress); 234 data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp); 235 } 236 } 237 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 238 /* 239 this is alittle tricky. 240 if we are initially not in the normalization buffer, even if we 241 normalize in the later stage, the data in the buffer will be 242 ignored, since we skip back up to the data string. 
243 however if we are already in the normalization buffer, any 244 further normalization will pull data into the normalization 245 buffer and modify the fcdPosition. 246 since we are keeping the data in the buffer for use, the 247 fcdPosition can not be reverted back. 248 arrgghh.... 249 */ 250 data->fcdPosition = backup->fcdPosition; 251 } 252 } 253 254 static UBool 255 reallocCEs(collIterate *data, int32_t newCapacity) { 256 uint32_t *oldCEs = data->extendCEs; 257 if(oldCEs == NULL) { 258 oldCEs = data->CEs; 259 } 260 int32_t length = data->CEpos - oldCEs; 261 uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4); 262 if(newCEs == NULL) { 263 return FALSE; 264 } 265 uprv_memcpy(newCEs, oldCEs, length * 4); 266 uprv_free(data->extendCEs); 267 data->extendCEs = newCEs; 268 data->extendCEsSize = newCapacity; 269 data->CEpos = newCEs + length; 270 return TRUE; 271 } 272 273 static UBool 274 increaseCEsCapacity(collIterate *data) { 275 int32_t oldCapacity; 276 if(data->extendCEs != NULL) { 277 oldCapacity = data->extendCEsSize; 278 } else { 279 oldCapacity = LENGTHOF(data->CEs); 280 } 281 return reallocCEs(data, 2 * oldCapacity); 282 } 283 284 static UBool 285 ensureCEsCapacity(collIterate *data, int32_t minCapacity) { 286 int32_t oldCapacity; 287 if(data->extendCEs != NULL) { 288 oldCapacity = data->extendCEsSize; 289 } else { 290 oldCapacity = LENGTHOF(data->CEs); 291 } 292 if(minCapacity <= oldCapacity) { 293 return TRUE; 294 } 295 oldCapacity *= 2; 296 return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity); 297 } 298 299 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) { 300 if(U_FAILURE(errorCode)) { 301 return; 302 } 303 int32_t length = offsetStore == NULL ? 
0 : (int32_t)(offsetStore - offsetBuffer); 304 if(length >= offsetBufferSize) { 305 int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE; 306 int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4)); 307 if(newBuffer == NULL) { 308 errorCode = U_MEMORY_ALLOCATION_ERROR; 309 return; 310 } 311 if(length > 0) { 312 uprv_memcpy(newBuffer, offsetBuffer, length * 4); 313 } 314 uprv_free(offsetBuffer); 315 offsetBuffer = newBuffer; 316 offsetStore = offsetBuffer + length; 317 offsetBufferSize = newCapacity; 318 } 319 *offsetStore++ = offset; 320 } 321 322 /* 323 * collIter_eos() 324 * Checks for a collIterate being positioned at the end of 325 * its source string. 326 * 327 */ 328 static 329 inline UBool collIter_eos(collIterate *s) { 330 if(s->flags & UCOL_USE_ITERATOR) { 331 return !(s->iterator->hasNext(s->iterator)); 332 } 333 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { 334 // Null terminated string, but not at null, so not at end. 335 // Whether in main or normalization buffer doesn't matter. 336 return FALSE; 337 } 338 339 // String with length. Can't be in normalization buffer, which is always 340 // null termintated. 341 if (s->flags & UCOL_ITER_HASLEN) { 342 return (s->pos == s->endp); 343 } 344 345 // We are at a null termination, could be either normalization buffer or main string. 346 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { 347 // At null at end of main string. 348 return TRUE; 349 } 350 351 // At null at end of normalization buffer. Need to check whether there there are 352 // any characters left in the main buffer. 353 if(s->origFlags & UCOL_USE_ITERATOR) { 354 return !(s->iterator->hasNext(s->iterator)); 355 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { 356 // Null terminated main string. fcdPosition is the 'return' position into main buf. 357 return (*s->fcdPosition == 0); 358 } 359 else { 360 // Main string with an end pointer. 
361 return s->fcdPosition == s->endp; 362 } 363 } 364 365 /* 366 * collIter_bos() 367 * Checks for a collIterate being positioned at the start of 368 * its source string. 369 * 370 */ 371 static 372 inline UBool collIter_bos(collIterate *source) { 373 // if we're going backwards, we need to know whether there is more in the 374 // iterator, even if we are in the side buffer 375 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 376 return !source->iterator->hasPrevious(source->iterator); 377 } 378 if (source->pos <= source->string || 379 ((source->flags & UCOL_ITER_INNORMBUF) && 380 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { 381 return TRUE; 382 } 383 return FALSE; 384 } 385 386 /*static 387 inline UBool collIter_SimpleBos(collIterate *source) { 388 // if we're going backwards, we need to know whether there is more in the 389 // iterator, even if we are in the side buffer 390 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 391 return !source->iterator->hasPrevious(source->iterator); 392 } 393 if (source->pos == source->string) { 394 return TRUE; 395 } 396 return FALSE; 397 }*/ 398 //return (data->pos == data->string) || 399 400 401 /****************************************************************************/ 402 /* Following are the open/close functions */ 403 /* */ 404 /****************************************************************************/ 405 406 static UCollator* 407 ucol_initFromBinary(const uint8_t *bin, int32_t length, 408 const UCollator *base, 409 UCollator *fillIn, 410 UErrorCode *status) 411 { 412 UCollator *result = fillIn; 413 if(U_FAILURE(*status)) { 414 return NULL; 415 } 416 /* 417 if(base == NULL) { 418 // we don't support null base yet 419 *status = U_ILLEGAL_ARGUMENT_ERROR; 420 return NULL; 421 } 422 */ 423 // We need these and we could be running without UCA 424 uprv_uca_initImplicitConstants(status); 425 UCATableHeader *colData = (UCATableHeader *)bin; 426 // 
do we want version check here? We're trying to figure out whether collators are compatible 427 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || 428 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || 429 colData->version[0] != UCOL_BUILDER_VERSION) 430 { 431 *status = U_COLLATOR_VERSION_MISMATCH; 432 return NULL; 433 } 434 else { 435 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { 436 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); 437 if(U_FAILURE(*status)){ 438 return NULL; 439 } 440 result->hasRealData = TRUE; 441 } 442 else { 443 if(base) { 444 result = ucol_initCollator(base->image, result, base, status); 445 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); 446 if(U_FAILURE(*status)){ 447 return NULL; 448 } 449 result->hasRealData = FALSE; 450 } 451 else { 452 *status = U_USELESS_COLLATOR_ERROR; 453 return NULL; 454 } 455 } 456 result->freeImageOnClose = FALSE; 457 } 458 result->actualLocale = NULL; 459 result->validLocale = NULL; 460 result->requestedLocale = NULL; 461 result->rules = NULL; 462 result->rulesLength = 0; 463 result->freeRulesOnClose = FALSE; 464 result->ucaRules = NULL; 465 return result; 466 } 467 468 U_CAPI UCollator* U_EXPORT2 469 ucol_openBinary(const uint8_t *bin, int32_t length, 470 const UCollator *base, 471 UErrorCode *status) 472 { 473 return ucol_initFromBinary(bin, length, base, NULL, status); 474 } 475 476 U_CAPI int32_t U_EXPORT2 477 ucol_cloneBinary(const UCollator *coll, 478 uint8_t *buffer, int32_t capacity, 479 UErrorCode *status) 480 { 481 int32_t length = 0; 482 if(U_FAILURE(*status)) { 483 return length; 484 } 485 if(capacity < 0) { 486 *status = U_ILLEGAL_ARGUMENT_ERROR; 487 return length; 488 } 489 if(coll->hasRealData == TRUE) { 490 length = coll->image->size; 491 if(length <= capacity) { 492 
uprv_memcpy(buffer, coll->image, length); 493 } else { 494 *status = U_BUFFER_OVERFLOW_ERROR; 495 } 496 } else { 497 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 498 if(length <= capacity) { 499 /* build the UCATableHeader with minimal entries */ 500 /* do not copy the header from the UCA file because its values are wrong! */ 501 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 502 503 /* reset everything */ 504 uprv_memset(buffer, 0, length); 505 506 /* set the tailoring-specific values */ 507 UCATableHeader *myData = (UCATableHeader *)buffer; 508 myData->size = length; 509 510 /* offset for the options, the only part of the data that is present after the header */ 511 myData->options = sizeof(UCATableHeader); 512 513 /* need to always set the expansion value for an upper bound of the options */ 514 myData->expansion = myData->options + sizeof(UColOptionSet); 515 516 myData->magic = UCOL_HEADER_MAGIC; 517 myData->isBigEndian = U_IS_BIG_ENDIAN; 518 myData->charSetFamily = U_CHARSET_FAMILY; 519 520 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 521 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 522 523 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 524 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 525 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); 526 myData->jamoSpecial = coll->image->jamoSpecial; 527 528 /* copy the collator options */ 529 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 530 } else { 531 *status = U_BUFFER_OVERFLOW_ERROR; 532 } 533 } 534 return length; 535 } 536 537 U_CAPI UCollator* U_EXPORT2 538 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) 539 { 540 UCollator * localCollator; 541 int32_t bufferSizeNeeded = 
(int32_t)sizeof(UCollator); 542 char *stackBufferChars = (char *)stackBuffer; 543 int32_t imageSize = 0; 544 int32_t rulesSize = 0; 545 int32_t rulesPadding = 0; 546 int32_t defaultReorderCodesSize = 0; 547 int32_t reorderCodesSize = 0; 548 uint8_t *image; 549 UChar *rules; 550 int32_t* defaultReorderCodes; 551 int32_t* reorderCodes; 552 uint8_t* leadBytePermutationTable; 553 UBool colAllocated = FALSE; 554 UBool imageAllocated = FALSE; 555 556 if (status == NULL || U_FAILURE(*status)){ 557 return 0; 558 } 559 if ((stackBuffer && !pBufferSize) || !coll){ 560 *status = U_ILLEGAL_ARGUMENT_ERROR; 561 return 0; 562 } 563 564 if (coll->rules && coll->freeRulesOnClose) { 565 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); 566 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); 567 bufferSizeNeeded += rulesSize + rulesPadding; 568 } 569 // no padding for alignment needed from here since the next two are 4 byte quantities 570 if (coll->defaultReorderCodes) { 571 defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t); 572 bufferSizeNeeded += defaultReorderCodesSize; 573 } 574 if (coll->reorderCodes) { 575 reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t); 576 bufferSizeNeeded += reorderCodesSize; 577 } 578 if (coll->leadBytePermutationTable) { 579 bufferSizeNeeded += 256 * sizeof(uint8_t); 580 } 581 582 if (stackBuffer && *pBufferSize <= 0) { /* 'preflighting' request - set needed size into *pBufferSize */ 583 *pBufferSize = bufferSizeNeeded; 584 return 0; 585 } 586 587 /* Pointers on 64-bit platforms need to be aligned 588 * on a 64-bit boundry in memory. 
589 */ 590 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 591 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); 592 if (*pBufferSize > offsetUp) { 593 *pBufferSize -= offsetUp; 594 stackBufferChars += offsetUp; 595 } 596 else { 597 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */ 598 *pBufferSize = 1; 599 } 600 } 601 stackBuffer = (void *)stackBufferChars; 602 603 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) { 604 /* allocate one here...*/ 605 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); 606 // Null pointer check. 607 if (stackBufferChars == NULL) { 608 *status = U_MEMORY_ALLOCATION_ERROR; 609 return NULL; 610 } 611 colAllocated = TRUE; 612 if (U_SUCCESS(*status)) { 613 *status = U_SAFECLONE_ALLOCATED_WARNING; 614 } 615 } 616 localCollator = (UCollator *)stackBufferChars; 617 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); 618 defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize); 619 reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize); 620 leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize; 621 622 { 623 UErrorCode tempStatus = U_ZERO_ERROR; 624 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); 625 } 626 if (coll->freeImageOnClose) { 627 image = (uint8_t *)uprv_malloc(imageSize); 628 // Null pointer check 629 if (image == NULL) { 630 *status = U_MEMORY_ALLOCATION_ERROR; 631 return NULL; 632 } 633 ucol_cloneBinary(coll, image, imageSize, status); 634 imageAllocated = TRUE; 635 } 636 else { 637 image = (uint8_t *)coll->image; 638 } 639 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); 640 if (U_FAILURE(*status)) { 641 return NULL; 642 } 643 644 if (coll->rules) { 645 if (coll->freeRulesOnClose) { 646 localCollator->rules = u_strcpy(rules, coll->rules); 647 //bufferEnd += rulesSize; 648 } 649 else { 650 localCollator->rules = coll->rules; 651 } 652 
localCollator->freeRulesOnClose = FALSE; 653 localCollator->rulesLength = coll->rulesLength; 654 } 655 656 // collator reordering 657 if (coll->defaultReorderCodes) { 658 localCollator->defaultReorderCodes = 659 (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t)); 660 localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength; 661 localCollator->freeDefaultReorderCodesOnClose = FALSE; 662 } 663 if (coll->reorderCodes) { 664 localCollator->reorderCodes = 665 (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t)); 666 localCollator->reorderCodesLength = coll->reorderCodesLength; 667 localCollator->freeReorderCodesOnClose = FALSE; 668 } 669 if (coll->leadBytePermutationTable) { 670 localCollator->leadBytePermutationTable = 671 (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256); 672 localCollator->freeLeadBytePermutationTableOnClose = FALSE; 673 } 674 675 int32_t i; 676 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 677 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); 678 } 679 // zero copies of pointers 680 localCollator->actualLocale = NULL; 681 localCollator->validLocale = NULL; 682 localCollator->requestedLocale = NULL; 683 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. 
684 localCollator->freeOnClose = colAllocated; 685 localCollator->freeImageOnClose = imageAllocated; 686 return localCollator; 687 } 688 689 U_CAPI void U_EXPORT2 690 ucol_close(UCollator *coll) 691 { 692 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 693 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 694 if(coll != NULL) { 695 // these are always owned by each UCollator struct, 696 // so we always free them 697 if(coll->validLocale != NULL) { 698 uprv_free(coll->validLocale); 699 } 700 if(coll->actualLocale != NULL) { 701 uprv_free(coll->actualLocale); 702 } 703 if(coll->requestedLocale != NULL) { 704 uprv_free(coll->requestedLocale); 705 } 706 if(coll->latinOneCEs != NULL) { 707 uprv_free(coll->latinOneCEs); 708 } 709 if(coll->options != NULL && coll->freeOptionsOnClose) { 710 uprv_free(coll->options); 711 } 712 if(coll->rules != NULL && coll->freeRulesOnClose) { 713 uprv_free((UChar *)coll->rules); 714 } 715 if(coll->image != NULL && coll->freeImageOnClose) { 716 uprv_free((UCATableHeader *)coll->image); 717 } 718 719 if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) { 720 uprv_free(coll->leadBytePermutationTable); 721 } 722 if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) { 723 uprv_free(coll->defaultReorderCodes); 724 } 725 if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { 726 uprv_free(coll->reorderCodes); 727 } 728 729 /* Here, it would be advisable to close: */ 730 /* - UData for UCA (unless we stuff it in the root resb */ 731 /* Again, do we need additional housekeeping... HMMM! */ 732 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); 733 if(coll->freeOnClose){ 734 /* for safeClone, if freeOnClose is FALSE, 735 don't free the other instance data */ 736 uprv_free(coll); 737 } 738 } 739 UTRACE_EXIT(); 740 } 741 742 /* This one is currently used by genrb & tests. 
After constructing from rules (tailoring),*/ 743 /* you should be able to get the binary chunk to write out... Doesn't look very full now */ 744 U_CFUNC uint8_t* U_EXPORT2 745 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) 746 { 747 uint8_t *result = NULL; 748 if(U_FAILURE(*status)) { 749 return NULL; 750 } 751 if(coll->hasRealData == TRUE) { 752 *length = coll->image->size; 753 result = (uint8_t *)uprv_malloc(*length); 754 /* test for NULL */ 755 if (result == NULL) { 756 *status = U_MEMORY_ALLOCATION_ERROR; 757 return NULL; 758 } 759 uprv_memcpy(result, coll->image, *length); 760 } else { 761 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 762 result = (uint8_t *)uprv_malloc(*length); 763 /* test for NULL */ 764 if (result == NULL) { 765 *status = U_MEMORY_ALLOCATION_ERROR; 766 return NULL; 767 } 768 769 /* build the UCATableHeader with minimal entries */ 770 /* do not copy the header from the UCA file because its values are wrong! 
*/ 771 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 772 773 /* reset everything */ 774 uprv_memset(result, 0, *length); 775 776 /* set the tailoring-specific values */ 777 UCATableHeader *myData = (UCATableHeader *)result; 778 myData->size = *length; 779 780 /* offset for the options, the only part of the data that is present after the header */ 781 myData->options = sizeof(UCATableHeader); 782 783 /* need to always set the expansion value for an upper bound of the options */ 784 myData->expansion = myData->options + sizeof(UColOptionSet); 785 786 myData->magic = UCOL_HEADER_MAGIC; 787 myData->isBigEndian = U_IS_BIG_ENDIAN; 788 myData->charSetFamily = U_CHARSET_FAMILY; 789 790 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 791 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 792 793 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 794 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 795 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); 796 myData->jamoSpecial = coll->image->jamoSpecial; 797 798 /* copy the collator options */ 799 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 800 } 801 return result; 802 } 803 804 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { 805 if(U_FAILURE(*status)) { 806 return; 807 } 808 result->caseFirst = (UColAttributeValue)opts->caseFirst; 809 result->caseLevel = (UColAttributeValue)opts->caseLevel; 810 result->frenchCollation = (UColAttributeValue)opts->frenchCollation; 811 result->normalizationMode = (UColAttributeValue)opts->normalizationMode; 812 if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) { 813 return; 814 } 815 result->strength = (UColAttributeValue)opts->strength; 816 result->variableTopValue = opts->variableTopValue; 817 
result->alternateHandling = (UColAttributeValue)opts->alternateHandling; 818 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; 819 result->numericCollation = (UColAttributeValue)opts->numericCollation; 820 result->caseFirstisDefault = TRUE; 821 result->caseLevelisDefault = TRUE; 822 result->frenchCollationisDefault = TRUE; 823 result->normalizationModeisDefault = TRUE; 824 result->strengthisDefault = TRUE; 825 result->variableTopValueisDefault = TRUE; 826 result->alternateHandlingisDefault = TRUE; 827 result->hiraganaQisDefault = TRUE; 828 result->numericCollationisDefault = TRUE; 829 830 ucol_updateInternalState(result, status); 831 832 result->options = opts; 833 } 834 835 836 /** 837 * Approximate determination if a character is at a contraction end. 838 * Guaranteed to be TRUE if a character is at the end of a contraction, 839 * otherwise it is not deterministic. 840 * @param c character to be determined 841 * @param coll collator 842 */ 843 static 844 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { 845 if (c < coll->minContrEndCP) { 846 return FALSE; 847 } 848 849 int32_t hash = c; 850 uint8_t htbyte; 851 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 852 if (U16_IS_TRAIL(c)) { 853 return TRUE; 854 } 855 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 856 } 857 htbyte = coll->contrEndCP[hash>>3]; 858 return (((htbyte >> (hash & 7)) & 1) == 1); 859 } 860 861 862 863 /* 864 * i_getCombiningClass() 865 * A fast, at least partly inline version of u_getCombiningClass() 866 * This is a candidate for further optimization. Used heavily 867 * in contraction processing. 
*
*        Returns 0 without a property lookup for code points below U+0300 and for
*        BMP code points that ucol_unsafeCP() does not flag; every other code point
*        is looked up with u_getCombiningClass().
*/
static
inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
    uint8_t sCC = 0;
    if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
        sCC = u_getCombiningClass(c);
    }
    return sCC;
}

/**
 * Initializes a UCollator from a binary collation image.
 * @param image  binary image; the collator keeps pointers into it and does not copy it
 * @param fillIn storage to initialize, or NULL to allocate a new UCollator
 * @param UCA    value stored in result->UCA
 * @param status in/out error code
 * @return the initialized collator.
 *         NULL if status already indicated failure, if image is NULL, or if the
 *         allocation failed. If unserializing the trie fails, an allocated collator is
 *         freed and NULL is returned, while a caller-provided fillIn is returned as-is.
 *         NOTE(review): the value of ucol_setOptionsFromHeader()'s status is not checked
 *         here, so a non-NULL collator can be returned together with a failure status.
 */
UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    UChar c;
    UCollator *result = fillIn;
    if(U_FAILURE(*status) || image == NULL) {
        return NULL;
    }

    if(result == NULL) {
        result = (UCollator *)uprv_malloc(sizeof(UCollator));
        if(result == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return result;
        }
        // freeOnClose records who owns the struct: TRUE only for storage allocated here.
        result->freeOnClose = TRUE;
    } else {
        result->freeOnClose = FALSE;
    }

    result->image = image;
    result->mapping.getFoldingOffset = _getFoldingOffset;
    // The serialized trie occupies the image bytes from mappingPosition up to endExpansionCE.
    const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
    utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
    if(U_FAILURE(*status)) {
        if(result->freeOnClose == TRUE) {
            uprv_free(result);
            result = NULL;
        }
        return result;
    }

    // All table pointers below are byte offsets from the start of the image.
    result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
    result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
    result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
    result->rules = NULL;
    result->rulesLength = 0;
    result->freeRulesOnClose = FALSE;
    result->defaultReorderCodes = NULL;
    result->defaultReorderCodesLength = 0;
    result->freeDefaultReorderCodesOnClose = FALSE;
    result->reorderCodes = NULL;
    result->reorderCodesLength = 0;
    result->freeReorderCodesOnClose = FALSE;
    result->leadBytePermutationTable = NULL;
    result->freeLeadBytePermutationTableOnClose = FALSE;

    /* get the version info from UCATableHeader and populate the Collator struct*/
    result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
    result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
    result->dataVersion[2] = 0;
    result->dataVersion[3] = 0;

    result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
    // minUnsafeCP is zeroed before the scan and only set afterwards.
    // NOTE(review): presumably ucol_unsafeCP() compares against minUnsafeCP, which is why
    // the order matters - confirm against its definition in ucol_imp.h.
    result->minUnsafeCP = 0;
    for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
        if (ucol_unsafeCP(c, result)) break;
    }
    result->minUnsafeCP = c;

    result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
    // ucol_contractionEndCP() returns FALSE for anything below minContrEndCP,
    // so the field must be 0 while scanning for the real minimum.
    result->minContrEndCP = 0;
    for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
        if (ucol_contractionEndCP(c, result)) break;
    }
    result->minContrEndCP = c;

    /* max expansion tables */
    result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
                                         result->image->endExpansionCE);
    result->lastEndExpansionCE = result->endExpansionCE +
                                 result->image->endExpansionCECount - 1;
    result->expansionCESize = (uint8_t*)result->image +
                                               result->image->expansionCESize;


    //result->errorCode = *status;

    result->latinOneCEs = NULL;

    result->latinOneRegenTable = FALSE;
    result->latinOneFailed = FALSE;
    result->UCA = UCA;

    /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    result->ucaRules = NULL;
    result->actualLocale = NULL;
    result->validLocale = NULL;
    result->requestedLocale = NULL;
    result->hasRealData = FALSE; // real data lives in .dat file...
    result->freeImageOnClose = FALSE;

    /* set attributes */
    // The option set is read in place from the image (see freeOptionsOnClose below).
    ucol_setOptionsFromHeader(
        result,
        (UColOptionSet*)((uint8_t*)result->image+result->image->options),
        status);
    result->freeOptionsOnClose = FALSE;

    return result;
}

/* new Mark's code */

/**
 * For generation of Implicit CEs
 * @author Davis
 *
 * Cleaned up so that changes can be made more easily.
 * Old values:
# First Implicit: E26A792D
# Last Implicit: E3DC70C0
# First CJK: E0030300
# Last CJK: E0A9DD00
# First CJK_A: E0A9DF00
# Last CJK_A: E0DE3100
 */
/* Following is a port of Mark's code for new treatment of implicits.
 * It is positioned here, since ucol_initUCA need to initialize the
 * variables below according to the data in the fractional UCA.
 */

/**
 * Function used to:
 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
 * b) bump any non-CJK characters by 10FFFF.
 * The relevant blocks are:
 * A:    4E00..9FFF; CJK Unified Ideographs
 *       F900..FAFF; CJK Compatibility Ideographs
 * B:    3400..4DBF; CJK Unified Ideographs Extension A
 *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
 * As long as
 *   no new B characters are allocated between 4E00 and FAFF, and
 *   no new A characters are outside of this range,
 * (very high probability) this simple code will work.
 * The reordered blocks are:
 * Block1 is CJK
 * Block2 is CJK_COMPAT_USED
 * Block3 is CJK_A
 * (all contiguous)
 * Any other CJK gets its normal code point
 * Any non-CJK gets +10FFFF
 * When we reorder Block1, we make sure that it is at the very start,
 * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
1023 */ 1024 1025 // CONSTANTS 1026 static const UChar32 1027 NON_CJK_OFFSET = 0x110000, 1028 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 1029 1030 /** 1031 * Precomputed by initImplicitConstants() 1032 */ 1033 static int32_t 1034 final3Multiplier = 0, 1035 final4Multiplier = 0, 1036 final3Count = 0, 1037 final4Count = 0, 1038 medialCount = 0, 1039 min3Primary = 0, 1040 min4Primary = 0, 1041 max4Primary = 0, 1042 minTrail = 0, 1043 maxTrail = 0, 1044 max3Trail = 0, 1045 max4Trail = 0, 1046 min4Boundary = 0; 1047 1048 static const UChar32 1049 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; 1050 // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; 1051 CJK_BASE = 0x4E00, 1052 CJK_LIMIT = 0x9FCB+1, 1053 // Unified CJK ideographs in the compatibility ideographs block. 1054 CJK_COMPAT_USED_BASE = 0xFA0E, 1055 CJK_COMPAT_USED_LIMIT = 0xFA2F+1, 1056 // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 1057 // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 1058 CJK_A_BASE = 0x3400, 1059 CJK_A_LIMIT = 0x4DB5+1, 1060 // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; 1061 // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; 1062 CJK_B_BASE = 0x20000, 1063 CJK_B_LIMIT = 0x2A6D6+1, 1064 // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; 1065 // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; 1066 CJK_C_BASE = 0x2A700, 1067 CJK_C_LIMIT = 0x2B734+1, 1068 // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; 1069 // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; 1070 CJK_D_BASE = 0x2B740, 1071 CJK_D_LIMIT = 0x2B81D+1; 1072 // when adding to this list, look for all occurrences (in project) 1073 // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!! 1074 1075 static UChar32 swapCJK(UChar32 i) { 1076 if (i < CJK_A_BASE) { 1077 // non-CJK 1078 } else if (i < CJK_A_LIMIT) { 1079 // Extension A has lower code points than the original Unihan+compat 1080 // but sorts higher. 
1081 return i - CJK_A_BASE 1082 + (CJK_LIMIT - CJK_BASE) 1083 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1084 } else if (i < CJK_BASE) { 1085 // non-CJK 1086 } else if (i < CJK_LIMIT) { 1087 return i - CJK_BASE; 1088 } else if (i < CJK_COMPAT_USED_BASE) { 1089 // non-CJK 1090 } else if (i < CJK_COMPAT_USED_LIMIT) { 1091 return i - CJK_COMPAT_USED_BASE 1092 + (CJK_LIMIT - CJK_BASE); 1093 } else if (i < CJK_B_BASE) { 1094 // non-CJK 1095 } else if (i < CJK_B_LIMIT) { 1096 return i; // non-BMP-CJK 1097 } else if (i < CJK_C_BASE) { 1098 // non-CJK 1099 } else if (i < CJK_C_LIMIT) { 1100 return i; // non-BMP-CJK 1101 } else if (i < CJK_D_BASE) { 1102 // non-CJK 1103 } else if (i < CJK_D_LIMIT) { 1104 return i; // non-BMP-CJK 1105 } 1106 return i + NON_CJK_OFFSET; // non-CJK 1107 } 1108 1109 U_CAPI UChar32 U_EXPORT2 1110 uprv_uca_getRawFromCodePoint(UChar32 i) { 1111 return swapCJK(i)+1; 1112 } 1113 1114 U_CAPI UChar32 U_EXPORT2 1115 uprv_uca_getCodePointFromRaw(UChar32 i) { 1116 i--; 1117 UChar32 result = 0; 1118 if(i >= NON_CJK_OFFSET) { 1119 result = i - NON_CJK_OFFSET; 1120 } else if(i >= CJK_B_BASE) { 1121 result = i; 1122 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted 1123 if(i < CJK_LIMIT - CJK_BASE) { 1124 result = i + CJK_BASE; 1125 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { 1126 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); 1127 } else { 1128 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 1129 } 1130 } else { 1131 result = -1; 1132 } 1133 return result; 1134 } 1135 1136 // GET IMPLICIT PRIMARY WEIGHTS 1137 // Return value is left justified primary key 1138 U_CAPI uint32_t U_EXPORT2 1139 uprv_uca_getImplicitFromRaw(UChar32 cp) { 1140 /* 1141 if (cp < 0 || cp > UCOL_MAX_INPUT) { 1142 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); 
1143 } 1144 */ 1145 int32_t last0 = cp - min4Boundary; 1146 if (last0 < 0) { 1147 int32_t last1 = cp / final3Count; 1148 last0 = cp % final3Count; 1149 1150 int32_t last2 = last1 / medialCount; 1151 last1 %= medialCount; 1152 1153 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start 1154 last1 = minTrail + last1; // offset 1155 last2 = min3Primary + last2; // offset 1156 /* 1157 if (last2 >= min4Primary) { 1158 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); 1159 } 1160 */ 1161 return (last2 << 24) + (last1 << 16) + (last0 << 8); 1162 } else { 1163 int32_t last1 = last0 / final4Count; 1164 last0 %= final4Count; 1165 1166 int32_t last2 = last1 / medialCount; 1167 last1 %= medialCount; 1168 1169 int32_t last3 = last2 / medialCount; 1170 last2 %= medialCount; 1171 1172 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start 1173 last1 = minTrail + last1; // offset 1174 last2 = minTrail + last2; // offset 1175 last3 = min4Primary + last3; // offset 1176 /* 1177 if (last3 > max4Primary) { 1178 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); 1179 } 1180 */ 1181 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; 1182 } 1183 } 1184 1185 static uint32_t U_EXPORT2 1186 uprv_uca_getImplicitPrimary(UChar32 cp) { 1187 //fprintf(stdout, "Incoming: %04x\n", cp); 1188 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); 1189 1190 cp = swapCJK(cp); 1191 cp++; 1192 // we now have a range of numbers from 0 to 21FFFF. 
1193 1194 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); 1195 //fprintf(stdout, "CJK swapped: %04x\n", cp); 1196 1197 return uprv_uca_getImplicitFromRaw(cp); 1198 } 1199 1200 /** 1201 * Converts implicit CE into raw integer ("code point") 1202 * @param implicit 1203 * @return -1 if illegal format 1204 */ 1205 U_CAPI UChar32 U_EXPORT2 1206 uprv_uca_getRawFromImplicit(uint32_t implicit) { 1207 UChar32 result; 1208 UChar32 b3 = implicit & 0xFF; 1209 UChar32 b2 = (implicit >> 8) & 0xFF; 1210 UChar32 b1 = (implicit >> 16) & 0xFF; 1211 UChar32 b0 = (implicit >> 24) & 0xFF; 1212 1213 // simple parameter checks 1214 if (b0 < min3Primary || b0 > max4Primary 1215 || b1 < minTrail || b1 > maxTrail) 1216 return -1; 1217 // normal offsets 1218 b1 -= minTrail; 1219 1220 // take care of the final values, and compose 1221 if (b0 < min4Primary) { 1222 if (b2 < minTrail || b2 > max3Trail || b3 != 0) 1223 return -1; 1224 b2 -= minTrail; 1225 UChar32 remainder = b2 % final3Multiplier; 1226 if (remainder != 0) 1227 return -1; 1228 b0 -= min3Primary; 1229 b2 /= final3Multiplier; 1230 result = ((b0 * medialCount) + b1) * final3Count + b2; 1231 } else { 1232 if (b2 < minTrail || b2 > maxTrail 1233 || b3 < minTrail || b3 > max4Trail) 1234 return -1; 1235 b2 -= minTrail; 1236 b3 -= minTrail; 1237 UChar32 remainder = b3 % final4Multiplier; 1238 if (remainder != 0) 1239 return -1; 1240 b3 /= final4Multiplier; 1241 b0 -= min4Primary; 1242 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; 1243 } 1244 // final check 1245 if (result < 0 || result > UCOL_MAX_INPUT) 1246 return -1; 1247 return result; 1248 } 1249 1250 1251 static inline int32_t divideAndRoundUp(int a, int b) { 1252 return 1 + (a-1)/b; 1253 } 1254 1255 /* this function is either called from initUCA or from genUCA before 1256 * doing canonical closure for the UCA. 1257 */ 1258 1259 /** 1260 * Set up to generate implicits. 
1261 * Maintenance Note: this function may end up being called more than once, due 1262 * to threading races during initialization. Make sure that 1263 * none of the Constants is ever transiently assigned an 1264 * incorrect value. 1265 * @param minPrimary 1266 * @param maxPrimary 1267 * @param minTrail final byte 1268 * @param maxTrail final byte 1269 * @param gap3 the gap we leave for tailoring for 3-byte forms 1270 * @param gap4 the gap we leave for tailoring for 4-byte forms 1271 */ 1272 static void initImplicitConstants(int minPrimary, int maxPrimary, 1273 int minTrailIn, int maxTrailIn, 1274 int gap3, int primaries3count, 1275 UErrorCode *status) { 1276 // some simple parameter checks 1277 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) 1278 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) 1279 || (primaries3count < 1)) 1280 { 1281 *status = U_ILLEGAL_ARGUMENT_ERROR; 1282 return; 1283 }; 1284 1285 minTrail = minTrailIn; 1286 maxTrail = maxTrailIn; 1287 1288 min3Primary = minPrimary; 1289 max4Primary = maxPrimary; 1290 // compute constants for use later. 1291 // number of values we can use in trailing bytes 1292 // leave room for empty values between AND above, e.g. if gap = 2 1293 // range 3..7 => +3 -4 -5 -6 -7: so 1 value 1294 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values 1295 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values 1296 final3Multiplier = gap3 + 1; 1297 final3Count = (maxTrail - minTrail + 1) / final3Multiplier; 1298 max3Trail = minTrail + (final3Count - 1) * final3Multiplier; 1299 1300 // medials can use full range 1301 medialCount = (maxTrail - minTrail + 1); 1302 // find out how many values fit in each form 1303 int32_t threeByteCount = medialCount * final3Count; 1304 // now determine where the 3/4 boundary is. 
1305 // we use 3 bytes below the boundary, and 4 above 1306 int32_t primariesAvailable = maxPrimary - minPrimary + 1; 1307 int32_t primaries4count = primariesAvailable - primaries3count; 1308 1309 1310 int32_t min3ByteCoverage = primaries3count * threeByteCount; 1311 min4Primary = minPrimary + primaries3count; 1312 min4Boundary = min3ByteCoverage; 1313 // Now expand out the multiplier for the 4 bytes, and redo. 1314 1315 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; 1316 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); 1317 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); 1318 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; 1319 if (gap4 < 1) { 1320 *status = U_ILLEGAL_ARGUMENT_ERROR; 1321 return; 1322 } 1323 final4Multiplier = gap4 + 1; 1324 final4Count = neededPerFinalByte; 1325 max4Trail = minTrail + (final4Count - 1) * final4Multiplier; 1326 } 1327 1328 /** 1329 * Supply parameters for generating implicit CEs 1330 */ 1331 U_CAPI void U_EXPORT2 1332 uprv_uca_initImplicitConstants(UErrorCode *status) { 1333 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. 1334 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); 1335 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); 1336 } 1337 1338 1339 /* collIterNormalize Incremental Normalization happens here. */ 1340 /* pick up the range of chars identifed by FCD, */ 1341 /* normalize it into the collIterate's writable buffer, */ 1342 /* switch the collIterate's state to use the writable buffer. 
*/ 1343 /* */ 1344 static 1345 void collIterNormalize(collIterate *collationSource) 1346 { 1347 UErrorCode status = U_ZERO_ERROR; 1348 const UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ 1349 const UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ 1350 1351 collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)), 1352 collationSource->writableBuffer, 1353 status); 1354 if (U_FAILURE(status)) { 1355 #ifdef UCOL_DEBUG 1356 fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status)); 1357 #endif 1358 return; 1359 } 1360 1361 collationSource->pos = collationSource->writableBuffer.getTerminatedBuffer(); 1362 collationSource->origFlags = collationSource->flags; 1363 collationSource->flags |= UCOL_ITER_INNORMBUF; 1364 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1365 } 1366 1367 1368 // This function takes the iterator and extracts normalized stuff up to the next boundary 1369 // It is similar in the end results to the collIterNormalize, but for the cases when we 1370 // use an iterator 1371 /*static 1372 inline void normalizeIterator(collIterate *collationSource) { 1373 UErrorCode status = U_ZERO_ERROR; 1374 UBool wasNormalized = FALSE; 1375 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); 1376 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); 1377 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1378 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1379 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { 1380 // reallocate and terminate 1381 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, 1382 &collationSource->writableBuffer, 1383 (int32_t *)&collationSource->writableBufSize, normLen + 1, 1384 0) 
1385 ) { 1386 #ifdef UCOL_DEBUG 1387 fprintf(stderr, "normalizeIterator(), out of memory\n"); 1388 #endif 1389 return; 1390 } 1391 status = U_ZERO_ERROR; 1392 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); 1393 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); 1394 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1395 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1396 } 1397 // Terminate the buffer - we already checked that it is big enough 1398 collationSource->writableBuffer[normLen] = 0; 1399 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { 1400 collationSource->flags |= UCOL_ITER_ALLOCATED; 1401 } 1402 collationSource->pos = collationSource->writableBuffer; 1403 collationSource->origFlags = collationSource->flags; 1404 collationSource->flags |= UCOL_ITER_INNORMBUF; 1405 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1406 }*/ 1407 1408 1409 /* Incremental FCD check and normalize */ 1410 /* Called from getNextCE when normalization state is suspect. */ 1411 /* When entering, the state is known to be this: */ 1412 /* o We are working in the main buffer of the collIterate, not the side */ 1413 /* writable buffer. When in the side buffer, normalization mode is always off, */ 1414 /* so we won't get here. */ 1415 /* o The leading combining class from the current character is 0 or */ 1416 /* the trailing combining class of the previous char was zero. */ 1417 /* True because the previous call to this function will have always exited */ 1418 /* that way, and we get called for every char where cc might be non-zero. 
*/
/* Returns TRUE when the scanned run of characters needs to be normalized,      */
/* i.e. a leading combining class was lower than the preceding trailing one.   */
/* Always stores the end of the scanned run in collationSource->fcdPosition.   */
static
inline UBool collIterFCD(collIterate *collationSource) {
    const UChar *srcP, *endP;
    uint8_t     leadingCC;
    uint8_t     prevTrailingCC = 0;
    uint16_t    fcd;
    UBool       needNormalize = FALSE;

    // Start at the character that was just read (pos was already advanced past it).
    srcP = collationSource->pos-1;

    // endP == NULL means "no length known": the scan then relies on a
    // character with a leading combining class of zero to stop it.
    if (collationSource->flags & UCOL_ITER_HASLEN) {
        endP = collationSource->endp;
    } else {
        endP = NULL;
    }

    // Get the trailing combining class of the current character.  If it's zero,
    //   we are OK.
    /* trie access */
    fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
    if (fcd != 0) {
        // low byte of the FCD value: trailing combining class
        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

        if (prevTrailingCC != 0) {
            // The current char has a non-zero trailing CC.  Scan forward until we find
            //   a char with a leading cc of zero.
            while (endP == NULL || srcP != endP)
            {
                const UChar *savedSrcP = srcP;

                /* trie access */
                fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
                // high byte of the FCD value: leading combining class
                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
                if (leadingCC == 0) {
                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
                                           //   back up over it.  (Could be surrogate pair!)
                    break;
                }

                // Combining classes out of order: the run is not FCD.
                if (leadingCC < prevTrailingCC) {
                    needNormalize = TRUE;
                }

                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
            }
        }
    }

    collationSource->fcdPosition = (UChar *)srcP;

    return needNormalize;
}

/****************************************************************************/
/* Following are the CE retrieval functions                                 */
/*                                                                          */
/****************************************************************************/

static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);

/* there should be a macro version of this function in the header file    */
/* This is the first function that tries to fetch a collation element     */
/* If it's not successful or it encounters a more difficult situation     */
/* some more sophisticated and slower functions are invoked               */
/*                                                                        */
/* Returns the next CE, or UCOL_NO_MORE_CES at the end of the source.     */
/* CEs left over from an earlier expansion are returned first.            */
static
inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    uint32_t order = 0;
    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
        order = *(collationSource->toReturn++);                         /* if so, return them */
        if(collationSource->CEpos == collationSource->toReturn) {
            /* buffer drained: reset both pointers to the start of the active CE buffer */
            collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
        }
        return order;
    }

    UChar ch = 0;
    collationSource->offsetReturn = NULL;

    /* The outer loop fetches another character whenever a Hangul-range code unit */
    /* produced an ignorable CE (see the while condition at the bottom).          */
    do {
        for (;;)                           /* Loop handles case when incremental normalize switches   */
        {                                  /*   to or from the side buffer / original string, and we  */
            /*   need to start again to get the next character.        */

            if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
            {
                // The source string is null terminated and we're not working from the side buffer,
                //   and we're not normalizing.  This is the fast path.
                //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
                ch = *collationSource->pos++;
                if (ch != 0) {
                    break;
                }
                else {
                    return UCOL_NO_MORE_CES;
                }
            }

            if (collationSource->flags & UCOL_ITER_HASLEN) {
                // Normal path for strings when length is specified.
                //   (We can't be in side buffer because it is always null terminated.)
                if (collationSource->pos >= collationSource->endp) {
                    // Ran off of the end of the main source string.  We're done.
                    return UCOL_NO_MORE_CES;
                }
                ch = *collationSource->pos++;
            }
            else if(collationSource->flags & UCOL_USE_ITERATOR) {
                // Character iterator input: U_SENTINEL marks the end.
                UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
                if(iterCh == U_SENTINEL) {
                    return UCOL_NO_MORE_CES;
                }
                ch = (UChar)iterCh;
            }
            else
            {
                // Null terminated string.
                ch = *collationSource->pos++;
                if (ch == 0) {
                    // Ran off end of buffer.
                    if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                        // Ran off end of main string. backing up one character.
                        collationSource->pos--;
                        return UCOL_NO_MORE_CES;
                    }
                    else
                    {
                        // Hit null in the normalize side buffer.
                        // Usually this means the end of the normalized data,
                        // except for one odd case: a null followed by combining chars,
                        //   which is the case if we are at the start of the buffer.
                        if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
                            break;
                        }

                        //  Null marked end of side buffer.
                        //   Revert to the main string and
                        //   loop back to top to try again to get a character.
                        collationSource->pos   = collationSource->fcdPosition;
                        collationSource->flags = collationSource->origFlags;
                        continue;
                    }
                }
            }

            if(collationSource->flags&UCOL_HIRAGANA_Q) {
                /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
                 * based on whether the previous codepoint was Hiragana or Katakana.
                 */
                if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
                        ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
                    collationSource->flags |= UCOL_WAS_HIRAGANA;
                } else {
                    collationSource->flags &= ~UCOL_WAS_HIRAGANA;
                }
            }

            // We've got a character.  See if there's any fcd and/or normalization stuff to do.
            //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
            if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
                break;
            }

            if (collationSource->fcdPosition >= collationSource->pos) {
                // An earlier FCD check has already covered the current character.
                // We can go ahead and process this char.
                break;
            }

            if (ch < ZERO_CC_LIMIT_ ) {
                // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                // We need to peek at the next character in order to tell if we are FCD
                if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
                    // We are at the last char of source string.
                    //  It is always OK for FCD check.
                    break;
                }

                // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
                if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }


            // Need a more complete FCD check and possible normalization.
            if (collIterFCD(collationSource)) {
                collIterNormalize(collationSource);
            }
            if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                //  No normalization was needed.  Go ahead and process the char we already had.
                break;
            }

            // Some normalization happened.  Next loop iteration will pick up a char
            //   from the normalization buffer.

        }   // end for (;;)


        if (ch <= 0xFF) {
            /*  For latin-1 characters we never need to fall back to the UCA table        */
            /*    because all of the UCA data is replicated in the latinOneMapping array  */
            order = coll->latinOneMapping[ch];
            if (order > UCOL_NOT_FOUND) {
                /* values above UCOL_NOT_FOUND are special CEs that need further processing */
                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
            }
        }
        else
        {
            // Always use UCA for Han, Hangul
            // (Han extension A is before main Han block)
            // **** Han compatibility chars ?? ****
            if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
                if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
                    // between the two target ranges; do normal lookup
                    // **** this range is YI, Modifier tone letters, ****
                    // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                    // **** Latin-D might be tailored, so we need to ****
                    // **** do the normal lookup for these guys.     ****
                    order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                } else {
                    // in one of the target ranges; use UCA
                    order = UCOL_NOT_FOUND;
                }
            } else {
                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            }

            if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
            }

            if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
                /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
                order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

                if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
                    order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
                }
            }
        }
    } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

    /* Neither the tailoring nor the UCA had a CE: generate an implicit one. */
    if(order == UCOL_NOT_FOUND) {
        order = getImplicit(ch, collationSource);
    }
    return order; /* return the CE */
}

/* ucol_getNextCE, out-of-line version for use from other files.   */
U_CAPI uint32_t  U_EXPORT2
ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    return ucol_IGetNextCE(coll, collationSource, status);
}


/**
 * Incremental previous normalization happens here. Pick up the range of chars
 * identifed by FCD, normalize it into the collIterate's writable buffer,
 * switch the collIterate's state to use the writable buffer.
* @param data collation iterator data
 */
static
void collPrevIterNormalize(collIterate *data)
{
    UErrorCode status  = U_ZERO_ERROR;
    const UChar *pEnd   = data->pos;  /* Last char to normalize (inclusive: note the +1 in the length below) */
    const UChar *pStart;

    /* Start normalize */
    if (data->fcdPosition == NULL) {
        /* the backward FCD scan reached the start of the string */
        pStart = data->string;
    }
    else {
        /* normalize from the character after fcdPosition */
        pStart = data->fcdPosition + 1;
    }

    int32_t normLen =
        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
                             data->writableBuffer,
                             status).
        length();
    if(U_FAILURE(status)) {
        return;
    }
    /*
    this puts the null termination in front of the normalized string instead
    of the end
    */
    data->writableBuffer.insert(0, (UChar)0);

    /*
     * The usual case at this point is that we've got a base
     * character followed by marks that were normalized. If
     * fcdPosition is NULL, that means that we backed up to
     * the beginning of the string and there's no base character.
     *
     * Forward processing will usually normalize when it sees
     * the first mark, so that mark will get its natural offset
     * and the rest will get the offset of the character following
     * the marks. The base character will also get its natural offset.
     *
     * We write the offset of the base character, if there is one,
     * followed by the offset of the first mark and then the offsets
     * of the rest of the marks.
     */
    int32_t firstMarkOffset = 0;
    int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
    int32_t trailCount      = normLen - 1;

    if (data->fcdPosition != NULL) {
        int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
        UChar   baseChar   = *data->fcdPosition;

        firstMarkOffset = baseOffset + 1;

        /*
         * If the base character is the start of a contraction, forward processing
         * will normalize the marks while checking for the contraction, which means
         * that the offset of the first mark will be the same as the other marks.
         *
         * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
         */
        if (baseChar >= 0x100) {
            /* look in the tailoring first, then fall back to the UCA */
            uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);

            if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
                baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
            }

            if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
                firstMarkOffset = trailOffset;
            }
        }

        data->appendOffset(baseOffset, status);
    }

    data->appendOffset(firstMarkOffset, status);

    for (int32_t i = 0; i < trailCount; i += 1) {
        data->appendOffset(trailOffset, status);
    }
    /* NOTE(review): status is not checked after the appendOffset() calls above. */

    data->offsetRepeatValue = trailOffset;

    /* offsets are handed out backward, starting from the last one stored */
    data->offsetReturn = data->offsetStore - 1;
    if (data->offsetReturn == data->offsetBuffer) {
        data->offsetStore = data->offsetBuffer;
    }

    /* Continue backward from the end of the normalized text: index 0 of the   */
    /* side buffer holds the inserted NUL, the normalized text follows it.     */
    data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
    data->origFlags  = data->flags;
    data->flags     |= UCOL_ITER_INNORMBUF;
    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
}


/**
 * Incremental FCD check for previous iteration and normalize. Called from
 * getPrevCE when normalization state is suspect.
1786 * When entering, the state is known to be this: 1787 * o We are working in the main buffer of the collIterate, not the side 1788 * writable buffer. When in the side buffer, normalization mode is always 1789 * off, so we won't get here. 1790 * o The leading combining class from the current character is 0 or the 1791 * trailing combining class of the previous char was zero. 1792 * True because the previous call to this function will have always exited 1793 * that way, and we get called for every char where cc might be non-zero. 1794 * @param data collation iterate struct 1795 * @return normalization status, TRUE for normalization to be done, FALSE 1796 * otherwise 1797 */ 1798 static 1799 inline UBool collPrevIterFCD(collIterate *data) 1800 { 1801 const UChar *src, *start; 1802 uint8_t leadingCC; 1803 uint8_t trailingCC = 0; 1804 uint16_t fcd; 1805 UBool result = FALSE; 1806 1807 start = data->string; 1808 src = data->pos + 1; 1809 1810 /* Get the trailing combining class of the current character. */ 1811 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); 1812 1813 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1814 1815 if (leadingCC != 0) { 1816 /* 1817 The current char has a non-zero leading combining class. 1818 Scan backward until we find a char with a trailing cc of zero. 1819 */ 1820 for (;;) 1821 { 1822 if (start == src) { 1823 data->fcdPosition = NULL; 1824 return result; 1825 } 1826 1827 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); 1828 1829 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1830 1831 if (trailingCC == 0) { 1832 break; 1833 } 1834 1835 if (leadingCC < trailingCC) { 1836 result = TRUE; 1837 } 1838 1839 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1840 } 1841 } 1842 1843 data->fcdPosition = (UChar *)src; 1844 1845 return result; 1846 } 1847 1848 /** gets a code unit from the string at a given offset 1849 * Handles both normal and iterative cases. 1850 * No error checking - caller beware! 
*/
static inline
UChar peekCodeUnit(collIterate *source, int32_t offset) {
    if(source->pos != NULL) {
        // String input: read relative to the current position, without bounds checks.
        return *(source->pos + offset);
    } else if(source->iterator != NULL) {
        UChar32 c;
        if(offset != 0) {
            // Move to the requested unit, read it with next() (which steps one
            // further), then move back by offset+1 to the starting position.
            source->iterator->move(source->iterator, offset, UITER_CURRENT);
            c = source->iterator->next(source->iterator);
            source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
        } else {
            c = source->iterator->current(source->iterator);
        }
        return c >= 0 ? (UChar)c : 0xfffd;  // If the caller works properly, we should never see c<0.
    } else {
        // Neither a string position nor an iterator: return U+FFFD.
        return 0xfffd;
    }
}

// Code point version. Treats the offset as a _code point_ delta.
// offset>=0 reads the code point that starts offset code points after the
// current position; offset<0 reads the one that ends (-offset-1) code points
// before it. Returns U_SENTINEL when there is neither a string position nor
// an iterator.
// We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
// We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
static inline
UChar32 peekCodePoint(collIterate *source, int32_t offset) {
    UChar32 c;
    if(source->pos != NULL) {
        const UChar *p = source->pos;
        if(offset >= 0) {
            // Skip forward over offset code points (the loop body runs offset times).
            while(--offset >= 0) {
                if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
                    ++p;
                }
            }
            // Read the code point there.
            c = *p++;
            UChar trail;
            if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
                c = U16_GET_SUPPLEMENTARY(c, trail);
            }
        } else /* offset<0 */ {
            // Skip backward over (-offset-1) code points.
            while(++offset < 0) {
                if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
                    --p;
                }
            }
            // Read the code point before that.
            c = *--p;
            UChar lead;
            if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
                c = U16_GET_SUPPLEMENTARY(lead, c);
            }
        }
    } else if(source->iterator != NULL) {
        if(offset >= 0) {
            // Skip forward over offset code points.
            int32_t fwd = offset;
            while(fwd-- > 0) {
                uiter_next32(source->iterator);
            }
            // Read the code point there.
            c = uiter_current32(source->iterator);
            // Return to the starting point, skipping backward over offset code points.
            while(offset-- > 0) {
                uiter_previous32(source->iterator);
            }
        } else /* offset<0 */ {
            // Read backward, reading -offset code points, remember only the last-read one.
            int32_t back = offset;
            do {
                c = uiter_previous32(source->iterator);
            } while(++back < 0);
            // Return to the starting position, skipping forward over -offset code points.
            do {
                uiter_next32(source->iterator);
            } while(++offset < 0);
        }
    } else {
        c = U_SENTINEL;
    }
    return c;
}

/**
 * Determines if we are at the start of the data string in the backwards
 * collation iterator
 * @param data collation iterator
 * @return TRUE if we are at the start
 */
static
inline UBool isAtStartPrevIterate(collIterate *data) {
    // Iterator input (no string position): ask the iterator.
    if(data->pos == NULL && data->iterator != NULL) {
        return !data->iterator->hasPrevious(data->iterator);
    }
    //return (collIter_bos(data)) ||
    // String input: either at the first code unit of the string, or in the
    // normalization side buffer with a NUL just before pos and fcdPosition
    // NULL.
    return (data->pos == data->string) ||
              ((data->flags & UCOL_ITER_INNORMBUF) &&
              *(data->pos - 1) == 0 && data->fcdPosition == NULL);
}

/**
 * Steps the collation iterator back by one code unit: moves the character
 * iterator back when it is in use (UCOL_USE_ITERATOR), and decrements the
 * string position when there is one.
 */
static
inline void goBackOne(collIterate *data) {
# if 0
    // somehow, it looks like we need to keep iterator synced up
    // at all times, as above.
    if(data->pos) {
        data->pos--;
    }
    if(data->iterator) {
        data->iterator->previous(data->iterator);
    }
#endif
    if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
        data->iterator->previous(data->iterator);
    }
    if(data->pos) {
        data->pos --;
    }
}

/**
 * Inline function that gets a simple CE.
 * So what it does is that it will first check the expansion buffer.
* If the expansion buffer is not empty, i.e. the return pointer (toReturn) is
* past the start of the CE buffer in use, the return pointer is stepped back
* and the collation element it then points to is returned.
* Otherwise the previous code unit is read from the text (string, side buffer
* or character iterator), run through the FCD check / normalization when
* needed, and mapped to a CE.
* For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE.
* @param coll collator data
* @param data collation iterator struct
* @param status error status
* @return the previous collation element, or UCOL_NO_MORE_CES once the start
*         of the text has been reached
*/
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /* Step the offset bookkeeping back together with the CE being returned. */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
                data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                /* Back at the first stored offset: nothing left to return,
                   storing restarts at the beginning of the offset buffer. */
                data->offsetReturn = NULL;
                data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* extendCEs, when set, is the buffer in use; otherwise the built-in CEs array is. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        /* Buffered CEs are pending: return the one in front of toReturn. */
        data->toReturn -= 1;
        result = *(data->toReturn);
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            /* That was the first CE of the buffer; reset the store position too. */
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        /* Repeated while the result is UCOL_IGNORABLE for a character in the
           UCOL_FIRST_HANGUL..UCOL_LAST_HANGUL range (see the loop condition). */
        do {
            /*
            Loop handles case when incremental normalize switches to or from the
            side buffer / original string, and we need to start again to get the
            next character.
            */
            for (;;) {
                if (data->flags & UCOL_ITER_HASLEN) {
                    /*
                    Normal path for strings when length is specified.
                    Not in side buffer because it is always null terminated.
                    */
                    if (data->pos <= data->string) {
                        /* End of the main source string */
                        return UCOL_NO_MORE_CES;
                    }
                    data->pos --;
                    ch = *data->pos;
                }
                // we are using an iterator to go back. Pray for us!
                else if (data->flags & UCOL_USE_ITERATOR) {
                  UChar32 iterCh = data->iterator->previous(data->iterator);
                  if(iterCh == U_SENTINEL) {
                    return UCOL_NO_MORE_CES;
                  } else {
                    ch = (UChar)iterCh;
                  }
                }
                else {
                    data->pos --;
                    ch = *data->pos;
                    /* we are in the side buffer. */
                    if (ch == 0) {
                        /*
                        At the start of the normalize side buffer.
                        Go back to string.
                        Because pointer points to the last accessed character,
                        hence we have to increment it by one here.
                        */
                        data->flags = data->origFlags;
                        data->offsetRepeatValue = 0;

                        if (data->fcdPosition == NULL) {
                            /* Nothing of the string precedes the normalized text. */
                            data->pos = data->string;
                            return UCOL_NO_MORE_CES;
                        }
                        else {
                            data->pos   = data->fcdPosition + 1;
                        }

                        /* Read again, now from the original string. */
                        continue;
                    }
                }

                /* Track whether the character just read is in 0x3040..0x309f. */
                if(data->flags&UCOL_HIRAGANA_Q) {
                  if(ch>=0x3040 && ch<=0x309f) {
                    data->flags |= UCOL_WAS_HIRAGANA;
                  } else {
                    data->flags &= ~UCOL_WAS_HIRAGANA;
                  }
                }

                /*
                * got a character to determine if there's fcd and/or normalization
                * stuff to do.
                * No check is needed (break) when any of these holds:
                *   the character is below ZERO_CC_LIMIT_,
                *   normalization is off,
                *   fcdPosition is set and not past pos,
                *   the character is the first one of the string.
                * Note if pos is in the writablebuffer, norm is always 0
                */
                if (ch < ZERO_CC_LIMIT_ ||
                  // this should propel us out of the loop in the iterator case
                    (data->flags & UCOL_ITER_NORM) == 0 ||
                    (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                    || data->string == data->pos) {
                    break;
                }

                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    /* if next character is FCD */
                    if (data->pos == data->string) {
                        /* First char of string is always OK for FCD check */
                        break;
                    }

                    /* Not first char of string, do the FCD fast test */
                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                        break;
                    }
                }

                /* Need a more complete FCD check and possible normalization. */
                if (collPrevIterFCD(data)) {
                    collPrevIterNormalize(data);
                }

                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                    /*  No normalization. Go ahead and process the char. */
                    break;
                }

                /*
                Some normalization happened.
                Next loop picks up a char from the normalization buffer.
                */
            }

            /* attempt to handle contractions, after removal of the backwards
            contraction
            */
            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
            } else {
                if (ch <= 0xFF) {
                    /* Direct table lookup for the first 256 code units. */
                    result = coll->latinOneMapping[ch];
                }
                else {
                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
                    // **** [FA0E..FA2F] ?? ****
                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                        (ch >= 0x3400 && ch <= 0xD7AF)) {
                        if (ch > 0x9FFF && ch < 0xAC00) {
                            // between the two target ranges; do normal lookup
                            // **** this range is YI, Modifier tone letters, ****
                            // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                            // **** Latin-D might be tailored, so we need to ****
                            // **** do the normal lookup for these guys.     ****
                            result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                        } else {
                            result = UCOL_NOT_FOUND;
                        }
                    } else {
                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    }
                }
                /* Values above UCOL_NOT_FOUND are handed to the special-CE handler. */
                if (result > UCOL_NOT_FOUND) {
                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
                }
                if (result == UCOL_NOT_FOUND) { // Not found in master list
                    if (!isAtStartPrevIterate(data) &&
                        ucol_contractionEndCP(ch, data->coll))
                    {
                        result = UCOL_CONTRACTION;
                    } else {
                        /* Fall back to the UCA mapping when the collator has one. */
                        if(coll->UCA) {
                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                        }
                    }

                    if (result > UCOL_NOT_FOUND) {
                        if(coll->UCA) {
                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                        }
                    }
                }
            }
        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

        /* Still no mapping: generate an implicit CE for the character. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}


/*   ucol_getPrevCE, out-of-line version for use from other files.  */
U_CFUNC uint32_t  U_EXPORT2
ucol_getPrevCE(const UCollator *coll, collIterate *data,
                        UErrorCode *status) {
    return ucol_IGetPrevCE(coll, data, status);
}


/* this should be connected to special Jamo handling */
/* Returns the first CE for the single code unit u: a one-unit collIterate is
   set up over u and the forward CE getter is called once. Returns 0 when
   *status reports a failure after the iterator has been initialized. */
U_CFUNC uint32_t  U_EXPORT2
ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
    collIterate colIt;
    IInit_collIterate(coll, &u, 1, &colIt, status);
    if(U_FAILURE(*status)) {
        return 0;
    }
    return ucol_IGetNextCE(coll, &colIt, status);
}

/**
* Inserts the argument character into the end of the buffer pushing back the
* null terminator.
2204 * @param data collIterate struct data 2205 * @param ch character to be appended 2206 * @return the position of the new addition 2207 */ 2208 static 2209 inline const UChar * insertBufferEnd(collIterate *data, UChar ch) 2210 { 2211 int32_t oldLength = data->writableBuffer.length(); 2212 return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength; 2213 } 2214 2215 /** 2216 * Inserts the argument string into the end of the buffer pushing back the 2217 * null terminator. 2218 * @param data collIterate struct data 2219 * @param string to be appended 2220 * @param length of the string to be appended 2221 * @return the position of the new addition 2222 */ 2223 static 2224 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length) 2225 { 2226 int32_t oldLength = data->writableBuffer.length(); 2227 return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength; 2228 } 2229 2230 /** 2231 * Special normalization function for contraction in the forwards iterator. 2232 * This normalization sequence will place the current character at source->pos 2233 * and its following normalized sequence into the buffer. 2234 * The fcd position, pos will be changed. 2235 * pos will now point to positions in the buffer. 2236 * Flags will be changed accordingly. 
* A normalization failure is not reported to the caller: the function simply
* returns early, leaving pos and the flags unchanged.
* @param data collation iterator data
*/
static
inline void normalizeNextContraction(collIterate *data)
{
    int32_t     strsize;
    UErrorCode  status     = U_ZERO_ERROR;
    /* because the pointer points to the next character */
    const UChar *pStart    = data->pos - 1;
    const UChar *pEnd;

    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
        /* Not in the buffer yet: start it with the code unit in front of pStart. */
        data->writableBuffer.setTo(*(pStart - 1));
        strsize               = 1;
    }
    else {
        /* Already in the buffer: the normalized text is appended behind its content. */
        strsize = data->writableBuffer.length();
    }

    pEnd = data->fcdPosition;

    /* Normalize [pStart, pEnd) without copying it first and append the result. */
    data->writableBuffer.append(
        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
    if(U_FAILURE(status)) {
        return;
    }

    /* Continue reading at the first newly appended code unit. */
    data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
    data->origFlags  = data->flags;
    data->flags     |= UCOL_ITER_INNORMBUF;
    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
}

/**
* Contraction character management function that returns the next character
* for the forwards iterator.
* Does nothing if the next character is in buffer and not the first character
* in it.
* Else it checks next character in data string to see if it is normalizable.
* If it is not, the character is simply copied into the buffer, else
* the whole normalized substring is copied into the buffer, including the
* current character.
* @param data collation element iterator data
* @return next character; (UChar)-1 is returned on the paths that detect a
*         NULL buffer position
*/
static
inline UChar getNextNormalizedChar(collIterate *data)
{
    UChar  nextch;
    UChar  ch;
    // Here we need to add the iterator code. One problem is the way
    // end of string is handled. If we just return next char, it could
    // be the sentinel. Most of the cases already check for this, but we
    // need to be sure.
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
         /* if no normalization and not in buffer. */
      if(data->flags & UCOL_USE_ITERATOR) {
         return (UChar)data->iterator->next(data->iterator);
      } else {
         return *(data->pos ++);
      }
    }

    //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
      //normalizeIterator(data);
    //}

    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((innormbuf && *data->pos != 0) ||
        (data->fcdPosition != NULL && !innormbuf &&
        data->pos < data->fcdPosition)) {
        /*
        if next character is in normalized buffer, no further normalization
        is required; the same holds in the string while pos is still in front
        of fcdPosition
        */
        return *(data->pos ++);
    }

    if (data->flags & UCOL_ITER_HASLEN) {
        /* in data string */
        if (data->pos + 1 == data->endp) {
            /* Last code unit of the string: nothing follows it to check against. */
            return *(data->pos ++);
        }
    }
    else {
        if (innormbuf) {
            // inside the normalization buffer, but at the end
            // (since we encountered zero). This means, in the
            // case we're using char iterator, that we need to
            // do another round of normalization.
            //if(data->origFlags & UCOL_USE_ITERATOR) {
              // we need to restore original flags,
              // otherwise, we'll lose them
              //data->flags = data->origFlags;
              //normalizeIterator(data);
              //return *(data->pos++);
            //} else {
              /*
              in writable buffer, at this point fcdPosition can not be
              pointing to the end of the data string. see contracting tag.
              */
            if(data->fcdPosition) {
              if (*(data->fcdPosition + 1) == 0 ||
                  data->fcdPosition + 1 == data->endp) {
                  /* at the end of the string, dump it into the normalizer */
                  data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
                  // Check if data->pos received a null pointer
                  // NOTE(review): pos already had 1 added to the helper's
                  // result, so this comparison cannot observe a NULL return.
                  if (data->pos == NULL) {
                      return (UChar)-1; // Return to indicate error.
                  }
                  return *(data->fcdPosition ++);
              }
              /* Continue in the original string where the buffer content ended. */
              data->pos = data->fcdPosition;
            } else if(data->origFlags & UCOL_USE_ITERATOR) {
              // if we are here, we're using a normalizing iterator.
              // we should just continue further.
              data->flags = data->origFlags;
              data->pos = NULL;
              return (UChar)data->iterator->next(data->iterator);
            }
            //}
        }
        else {
            if (*(data->pos + 1) == 0) {
                /* Last code unit before the terminating NUL. */
                return *(data->pos ++);
            }
        }
    }

    ch = *data->pos ++;
    nextch = *data->pos;

    /*
    * The FCD check below is needed only when this stretch of text has not
    * been checked yet (fcdPosition unset or in front of pos) and the current
    * or the next code unit is at or above NFC_ZERO_CC_BLOCK_LIMIT_.
    */
    if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
         ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
            /*
            Need a more complete FCD check and possible normalization.
            normalize substring will be appended to buffer
            */
        if (collIterFCD(data)) {
            normalizeNextContraction(data);
            return *(data->pos ++);
        }
        else if (innormbuf) {
            /* fcdposition shifted even when there's no normalization, if we
            don't input the rest into this, we'll get the wrong position when
            we reach the end of the writableBuffer */
            int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
            data->pos = insertBufferEnd(data, data->pos - 1, length);
            // Check if data->pos received a null pointer
            if (data->pos == NULL) {
                return (UChar)-1; // Return to indicate error.
            }
            return *(data->pos ++);
        }
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        data->pos = insertBufferEnd(data, ch) + 1;
        // Check if data->pos received a null pointer
        // NOTE(review): as above, the +1 has already been applied, so this
        // comparison cannot observe a NULL return from the helper.
        if (data->pos == NULL) {
            return (UChar)-1; // Return to indicate error.
        }
    }

    /* points back to the pos in string */
    return ch;
}



/**
* Function to copy the buffer into writableBuffer and sets the fcd position to
* the correct position
* @param source data string source
* @param buffer character buffer
*/
static
inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
{
    /* okay confusing part here. to ensure that the skipped characters are
    considered later, we need to place it in the appropriate position in the
    normalization buffer and reassign the pos pointer. simple case if pos
    reside in string, simply copy to normalization buffer and
    fcdposition = pos, pos = start of normalization buffer. if pos in
    normalization buffer, we'll insert the copy infront of pos and point pos
    to the start of the normalization buffer. why am i doing these copies?
    well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
    not require any changes, which be really painful. */
    if (source->flags & UCOL_ITER_INNORMBUF) {
        /* Replace everything in front of pos (the part already read) with the
           skipped characters. */
        int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
        source->writableBuffer.replace(0, replaceLength, buffer);
    }
    else {
        /* Switch from the string to the buffer; fcdPosition remembers where
           to continue in the string, origFlags the flags to restore. */
        source->fcdPosition  = source->pos;
        source->origFlags    = source->flags;
        source->flags       |= UCOL_ITER_INNORMBUF;
        source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
        source->writableBuffer = buffer;
    }

    /* Either way reading continues at the start of the buffer. */
    source->pos = source->writableBuffer.getTerminatedBuffer();
}

/**
* Function to get the discontiguos collation element within the source.
* Note this function will set the position to the appropriate places.
* When no discontiguous match is found, the iterator state saved on entry is
* restored and stepped back by one.
* @param coll current collator used
* @param source data string source
* @param constart index to the start character in the contraction table
* @return discontiguos collation element offset
*/
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
    const UChar *temppos      = source->pos;
    /* Collects the characters that were skipped over while matching. */
    UnicodeString buffer;
    const UChar   *tempconstart = constart;
    uint8_t  tempflags    = source->flags;
    UBool    multicontraction = FALSE;
    collIterateState discState;

    /* Remember the entry state so a failed match can be undone. */
    backupState(source, &discState);

    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        /* Stop scanning at the end of the text or at a character whose
           combining class is 0. */
        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
                 source->fcdPosition == NULL ||
                 source->fcdPosition == source->endp ||
                 *(source->fcdPosition) == 0 ||
                 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
                 /* end of string in null terminated string or stopped by a
                 null character, note fcd does not always point to a base
                 character after the discontiguos change */
                 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
                 //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* A partial multi-contraction was matched earlier: use the CE
                   stored at its table start and queue the skipped characters. */
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* The table entries are scanned upward until one is not smaller than schar. */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguos buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* A match is not accepted when the character has the same
               combining class as the code point at offset -2; it is treated
               as skipped instead. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                                      (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
          break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            /* Complete match: queue the skipped characters and return its CE. */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}

/* now uses Mark's getImplicitPrimary code */
/* Produces the CEs for a code point without an explicit mapping: a second CE
   built from the low 16 bits of the implicit primary (shifted into the upper
   half, with 0xC0 in the low byte - presumably the continuation marker,
   confirm) is stored at CEpos, and the first CE is returned. */
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
    uint32_t r = uprv_uca_getImplicitPrimary(cp);
    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
    collationSource->offsetRepeatCount += 1;
    return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
}

/**
* Inserts the argument character into the front of the buffer replacing the
* front null terminator.
* The code unit at index 0 is overwritten with ch and a new NUL is inserted in
* front of it; pos is set to point just behind ch in the resulting buffer.
* @param data collation element iterator data
* @param ch character to be inserted at the front
*/
static
inline void insertBufferFront(collIterate *data, UChar ch)
{
    data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
}

/**
* Special normalization function for contraction in the previous iterator.
* This normalization sequence will place the current character at source->pos
* and its following normalized sequence into the buffer.
* The fcd position, pos will be changed.
* pos will now point to positions in the buffer.
* Flags will be changed accordingly.
2585 * @param data collation iterator data 2586 */ 2587 static 2588 inline void normalizePrevContraction(collIterate *data, UErrorCode *status) 2589 { 2590 const UChar *pEnd = data->pos + 1; /* End normalize + 1 */ 2591 const UChar *pStart; 2592 2593 UnicodeString endOfBuffer; 2594 if (data->flags & UCOL_ITER_HASLEN) { 2595 /* 2596 normalization buffer not used yet, we'll pull down the next 2597 character into the end of the buffer 2598 */ 2599 endOfBuffer.setTo(*pEnd); 2600 } 2601 else { 2602 endOfBuffer.setTo(data->writableBuffer, 1); // after the leading NUL 2603 } 2604 2605 if (data->fcdPosition == NULL) { 2606 pStart = data->string; 2607 } 2608 else { 2609 pStart = data->fcdPosition + 1; 2610 } 2611 int32_t normLen = 2612 data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), 2613 data->writableBuffer, 2614 *status). 2615 length(); 2616 if(U_FAILURE(*status)) { 2617 return; 2618 } 2619 /* 2620 this puts the null termination infront of the normalized string instead 2621 of the end 2622 */ 2623 data->pos = 2624 data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() + 2625 1 + normLen; 2626 data->origFlags = data->flags; 2627 data->flags |= UCOL_ITER_INNORMBUF; 2628 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2629 } 2630 2631 /** 2632 * Contraction character management function that returns the previous character 2633 * for the backwards iterator. 2634 * Does nothing if the previous character is in buffer and not the first 2635 * character in it. 2636 * Else it checks previous character in data string to see if it is 2637 * normalizable. 2638 * If it is not, the character is simply copied into the buffer, else 2639 * the whole normalized substring is copied into the buffer, including the 2640 * current character. 
2641 * @param data collation element iterator data 2642 * @return previous character 2643 */ 2644 static 2645 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status) 2646 { 2647 UChar prevch; 2648 UChar ch; 2649 const UChar *start; 2650 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); 2651 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 || 2652 (innormbuf && *(data->pos - 1) != 0)) { 2653 /* 2654 if no normalization. 2655 if previous character is in normalized buffer, no further normalization 2656 is required 2657 */ 2658 if(data->flags & UCOL_USE_ITERATOR) { 2659 data->iterator->move(data->iterator, -1, UITER_CURRENT); 2660 return (UChar)data->iterator->next(data->iterator); 2661 } else { 2662 return *(data->pos - 1); 2663 } 2664 } 2665 2666 start = data->pos; 2667 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) { 2668 /* in data string */ 2669 if ((start - 1) == data->string) { 2670 return *(start - 1); 2671 } 2672 start --; 2673 ch = *start; 2674 prevch = *(start - 1); 2675 } 2676 else { 2677 /* 2678 in writable buffer, at this point fcdPosition can not be NULL. 2679 see contracting tag. 2680 */ 2681 if (data->fcdPosition == data->string) { 2682 /* at the start of the string, just dump it into the normalizer */ 2683 insertBufferFront(data, *(data->fcdPosition)); 2684 data->fcdPosition = NULL; 2685 return *(data->pos - 1); 2686 } 2687 start = data->fcdPosition; 2688 ch = *start; 2689 prevch = *(start - 1); 2690 } 2691 /* 2692 * if the current character is not fcd. 2693 * Trailing combining class == 0. 2694 */ 2695 if (data->fcdPosition > start && 2696 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_)) 2697 { 2698 /* 2699 Need a more complete FCD check and possible normalization. 
2700 normalize substring will be appended to buffer 2701 */ 2702 const UChar *backuppos = data->pos; 2703 data->pos = start; 2704 if (collPrevIterFCD(data)) { 2705 normalizePrevContraction(data, status); 2706 return *(data->pos - 1); 2707 } 2708 data->pos = backuppos; 2709 data->fcdPosition ++; 2710 } 2711 2712 if (innormbuf) { 2713 /* 2714 no normalization is to be done hence only one character will be 2715 appended to the buffer. 2716 */ 2717 insertBufferFront(data, ch); 2718 data->fcdPosition --; 2719 } 2720 2721 return ch; 2722 } 2723 2724 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */ 2725 /* It is called by getNextCE */ 2726 2727 /* The following should be even */ 2728 #define UCOL_MAX_DIGITS_FOR_NUMBER 254 2729 2730 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) { 2731 collIterateState entryState; 2732 backupState(source, &entryState); 2733 UChar32 cp = ch; 2734 2735 for (;;) { 2736 // This loop will repeat only in the case of contractions, and only when a contraction 2737 // is found and the first CE resulting from that contraction is itself a special 2738 // (an expansion, for example.) All other special CE types are fully handled the 2739 // first time through, and the loop exits. 2740 2741 const uint32_t *CEOffset = NULL; 2742 switch(getCETag(CE)) { 2743 case NOT_FOUND_TAG: 2744 /* This one is not found, and we'll let somebody else bother about it... no more games */ 2745 return CE; 2746 case SPEC_PROC_TAG: 2747 { 2748 // Special processing is getting a CE that is preceded by a certain prefix 2749 // Currently this is only needed for optimizing Japanese length and iteration marks. 2750 // When we encouter a special processing tag, we go backwards and try to see if 2751 // we have a match. 2752 // Contraction tables are used - so the whole process is not unlike contraction. 2753 // prefix data is stored backwards in the table. 
2754 const UChar *UCharOffset; 2755 UChar schar, tchar; 2756 collIterateState prefixState; 2757 backupState(source, &prefixState); 2758 loadState(source, &entryState, TRUE); 2759 goBackOne(source); // We want to look at the point where we entered - actually one 2760 // before that... 2761 2762 for(;;) { 2763 // This loop will run once per source string character, for as long as we 2764 // are matching a potential contraction sequence 2765 2766 // First we position ourselves at the begining of contraction sequence 2767 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2768 if (collIter_bos(source)) { 2769 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2770 break; 2771 } 2772 schar = getPrevNormalizedChar(source, status); 2773 goBackOne(source); 2774 2775 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2776 UCharOffset++; 2777 } 2778 2779 if (schar == tchar) { 2780 // Found the source string char in the table. 2781 // Pick up the corresponding CE from the table. 2782 CE = *(coll->contractionCEs + 2783 (UCharOffset - coll->contractionIndex)); 2784 } 2785 else 2786 { 2787 // Source string char was not in the table. 2788 // We have not found the prefix. 2789 CE = *(coll->contractionCEs + 2790 (ContractionStart - coll->contractionIndex)); 2791 } 2792 2793 if(!isPrefix(CE)) { 2794 // The source string char was in the contraction table, and the corresponding 2795 // CE is not a prefix CE. We found the prefix, break 2796 // out of loop, this CE will end up being returned. This is the normal 2797 // way out of prefix handling when the source actually contained 2798 // the prefix. 
2799 break; 2800 } 2801 } 2802 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue 2803 loadState(source, &prefixState, TRUE); 2804 if(source->origFlags & UCOL_USE_ITERATOR) { 2805 source->flags = source->origFlags; 2806 } 2807 } else { // prefix search was a failure, we have to backup all the way to the start 2808 loadState(source, &entryState, TRUE); 2809 } 2810 break; 2811 } 2812 case CONTRACTION_TAG: 2813 { 2814 /* This should handle contractions */ 2815 collIterateState state; 2816 backupState(source, &state); 2817 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; 2818 const UChar *UCharOffset; 2819 UChar schar, tchar; 2820 2821 for (;;) { 2822 /* This loop will run once per source string character, for as long as we */ 2823 /* are matching a potential contraction sequence */ 2824 2825 /* First we position ourselves at the begining of contraction sequence */ 2826 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2827 2828 if (collIter_eos(source)) { 2829 // Ran off the end of the source string. 2830 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2831 // So we'll pick whatever we have at the point... 2832 if (CE == UCOL_NOT_FOUND) { 2833 // back up the source over all the chars we scanned going into this contraction. 
2834 CE = firstCE; 2835 loadState(source, &state, TRUE); 2836 if(source->origFlags & UCOL_USE_ITERATOR) { 2837 source->flags = source->origFlags; 2838 } 2839 } 2840 break; 2841 } 2842 2843 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ 2844 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); 2845 2846 schar = getNextNormalizedChar(source); 2847 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2848 UCharOffset++; 2849 } 2850 2851 if (schar == tchar) { 2852 // Found the source string char in the contraction table. 2853 // Pick up the corresponding CE from the table. 2854 CE = *(coll->contractionCEs + 2855 (UCharOffset - coll->contractionIndex)); 2856 } 2857 else 2858 { 2859 // Source string char was not in contraction table. 2860 // Unless we have a discontiguous contraction, we have finished 2861 // with this contraction. 2862 // in order to do the proper detection, we 2863 // need to see if we're dealing with a supplementary 2864 /* We test whether the next two char are surrogate pairs. 2865 * This test is done if the iterator is not NULL. 2866 * If there is no surrogate pair, the iterator 2867 * goes back one if needed. 
*/ 2868 UChar32 miss = schar; 2869 if (source->iterator) { 2870 UChar32 surrNextChar; /* the next char in the iteration to test */ 2871 int32_t prevPos; /* holds the previous position before move forward of the source iterator */ 2872 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { 2873 prevPos = source->iterator->index; 2874 surrNextChar = getNextNormalizedChar(source); 2875 if (U16_IS_TRAIL(surrNextChar)) { 2876 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); 2877 } else if (prevPos < source->iterator->index){ 2878 goBackOne(source); 2879 } 2880 } 2881 } else if (U16_IS_LEAD(schar)) { 2882 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); 2883 } 2884 2885 uint8_t sCC; 2886 if (miss < 0x300 || 2887 maxCC == 0 || 2888 (sCC = i_getCombiningClass(miss, coll)) == 0 || 2889 sCC>maxCC || 2890 (allSame != 0 && sCC == maxCC) || 2891 collIter_eos(source)) 2892 { 2893 // Contraction can not be discontiguous. 2894 goBackOne(source); // back up the source string by one, 2895 // because the character we just looked at was 2896 // not part of the contraction. */ 2897 if(U_IS_SUPPLEMENTARY(miss)) { 2898 goBackOne(source); 2899 } 2900 CE = *(coll->contractionCEs + 2901 (ContractionStart - coll->contractionIndex)); 2902 } else { 2903 // 2904 // Contraction is possibly discontiguous. 
2905 // Scan more of source string looking for a match 2906 // 2907 UChar tempchar; 2908 /* find the next character if schar is not a base character 2909 and we are not yet at the end of the string */ 2910 tempchar = getNextNormalizedChar(source); 2911 // probably need another supplementary thingie here 2912 goBackOne(source); 2913 if (i_getCombiningClass(tempchar, coll) == 0) { 2914 goBackOne(source); 2915 if(U_IS_SUPPLEMENTARY(miss)) { 2916 goBackOne(source); 2917 } 2918 /* Spit out the last char of the string, wasn't tasty enough */ 2919 CE = *(coll->contractionCEs + 2920 (ContractionStart - coll->contractionIndex)); 2921 } else { 2922 CE = getDiscontiguous(coll, source, ContractionStart); 2923 } 2924 } 2925 } // else after if(schar == tchar) 2926 2927 if(CE == UCOL_NOT_FOUND) { 2928 /* The Source string did not match the contraction that we were checking. */ 2929 /* Back up the source position to undo the effects of having partially */ 2930 /* scanned through what ultimately proved to not be a contraction. */ 2931 loadState(source, &state, TRUE); 2932 CE = firstCE; 2933 break; 2934 } 2935 2936 if(!isContraction(CE)) { 2937 // The source string char was in the contraction table, and the corresponding 2938 // CE is not a contraction CE. We completed the contraction, break 2939 // out of loop, this CE will end up being returned. This is the normal 2940 // way out of contraction handling when the source actually contained 2941 // the contraction. 2942 break; 2943 } 2944 2945 2946 // The source string char was in the contraction table, and the corresponding 2947 // CE is IS a contraction CE. We will continue looping to check the source 2948 // string for the remaining chars in the contraction. 2949 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); 2950 if(tempCE != UCOL_NOT_FOUND) { 2951 // We have scanned a a section of source string for which there is a 2952 // CE from the contraction table. 
Remember the CE and scan position, so 2953 // that we can return to this point if further scanning fails to 2954 // match a longer contraction sequence. 2955 firstCE = tempCE; 2956 2957 goBackOne(source); 2958 backupState(source, &state); 2959 getNextNormalizedChar(source); 2960 2961 // Another way to do this is: 2962 //collIterateState tempState; 2963 //backupState(source, &tempState); 2964 //goBackOne(source); 2965 //backupState(source, &state); 2966 //loadState(source, &tempState, TRUE); 2967 2968 // The problem is that for incomplete contractions we have to remember the previous 2969 // position. Before, the only thing I needed to do was state.pos--; 2970 // After iterator introduction and especially after introduction of normalizing 2971 // iterators, it became much more difficult to decrease the saved state. 2972 // I'm not yet sure which of the two methods above is faster. 2973 } 2974 } // for(;;) 2975 break; 2976 } // case CONTRACTION_TAG: 2977 case LONG_PRIMARY_TAG: 2978 { 2979 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 2980 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 2981 source->offsetRepeatCount += 1; 2982 return CE; 2983 } 2984 case EXPANSION_TAG: 2985 { 2986 /* This should handle expansion. */ 2987 /* NOTE: we can encounter both continuations and expansions in an expansion! 
*/ 2988 /* I have to decide where continuations are going to be dealt with */ 2989 uint32_t size; 2990 uint32_t i; /* general counter */ 2991 2992 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 2993 size = getExpansionCount(CE); 2994 CE = *CEOffset++; 2995 //source->offsetRepeatCount = -1; 2996 2997 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 2998 for(i = 1; i<size; i++) { 2999 *(source->CEpos++) = *CEOffset++; 3000 source->offsetRepeatCount += 1; 3001 } 3002 } else { /* else, we do */ 3003 while(*CEOffset != 0) { 3004 *(source->CEpos++) = *CEOffset++; 3005 source->offsetRepeatCount += 1; 3006 } 3007 } 3008 3009 return CE; 3010 } 3011 case DIGIT_TAG: 3012 { 3013 /* 3014 We do a check to see if we want to collate digits as numbers; if so we generate 3015 a custom collation key. Otherwise we pull out the value stored in the expansion table. 3016 */ 3017 //uint32_t size; 3018 uint32_t i; /* general counter */ 3019 3020 if (source->coll->numericCollation == UCOL_ON){ 3021 collIterateState digitState = {0,0,0,0,0,0,0,0,0}; 3022 UChar32 char32 = 0; 3023 int32_t digVal = 0; 3024 3025 uint32_t digIndx = 0; 3026 uint32_t endIndex = 0; 3027 uint32_t trailingZeroIndex = 0; 3028 3029 uint8_t collateVal = 0; 3030 3031 UBool nonZeroValReached = FALSE; 3032 3033 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. 3034 /* 3035 We parse the source string until we hit a char that's NOT a digit. 3036 Use this u_charDigitValue. This might be slow because we have to 3037 handle surrogates... 
3038 */ 3039 /* 3040 if (U16_IS_LEAD(ch)){ 3041 if (!collIter_eos(source)) { 3042 backupState(source, &digitState); 3043 UChar trail = getNextNormalizedChar(source); 3044 if(U16_IS_TRAIL(trail)) { 3045 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 3046 } else { 3047 loadState(source, &digitState, TRUE); 3048 char32 = ch; 3049 } 3050 } else { 3051 char32 = ch; 3052 } 3053 } else { 3054 char32 = ch; 3055 } 3056 digVal = u_charDigitValue(char32); 3057 */ 3058 digVal = u_charDigitValue(cp); // if we have arrived here, we have 3059 // already processed possible supplementaries that trigered the digit tag - 3060 // all supplementaries are marked in the UCA. 3061 /* 3062 We pad a zero in front of the first element anyways. This takes 3063 care of the (probably) most common case where people are sorting things followed 3064 by a single digit 3065 */ 3066 digIndx++; 3067 for(;;){ 3068 // Make sure we have enough space. No longer needed; 3069 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER 3070 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough 3071 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). 3072 3073 // Skipping over leading zeroes. 3074 if (digVal != 0) { 3075 nonZeroValReached = TRUE; 3076 } 3077 if (nonZeroValReached) { 3078 /* 3079 We parse the digit string into base 100 numbers (this fits into a byte). 3080 We only add to the buffer in twos, thus if we are parsing an odd character, 3081 that serves as the 'tens' digit while the if we are parsing an even one, that 3082 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3083 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3084 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3085 than all the other bytes. 
3086 */ 3087 3088 if (digIndx % 2 == 1){ 3089 collateVal += (uint8_t)digVal; 3090 3091 // We don't enter the low-order-digit case unless we've already seen 3092 // the high order, or for the first digit, which is always non-zero. 3093 if (collateVal != 0) 3094 trailingZeroIndex = 0; 3095 3096 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3097 collateVal = 0; 3098 } 3099 else{ 3100 // We drop the collation value into the buffer so if we need to do 3101 // a "front patch" we don't have to check to see if we're hitting the 3102 // last element. 3103 collateVal = (uint8_t)(digVal * 10); 3104 3105 // Check for trailing zeroes. 3106 if (collateVal == 0) 3107 { 3108 if (!trailingZeroIndex) 3109 trailingZeroIndex = (digIndx/2) + 2; 3110 } 3111 else 3112 trailingZeroIndex = 0; 3113 3114 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3115 } 3116 digIndx++; 3117 } 3118 3119 // Get next character. 3120 if (!collIter_eos(source)){ 3121 ch = getNextNormalizedChar(source); 3122 if (U16_IS_LEAD(ch)){ 3123 if (!collIter_eos(source)) { 3124 backupState(source, &digitState); 3125 UChar trail = getNextNormalizedChar(source); 3126 if(U16_IS_TRAIL(trail)) { 3127 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 3128 } else { 3129 loadState(source, &digitState, TRUE); 3130 char32 = ch; 3131 } 3132 } 3133 } else { 3134 char32 = ch; 3135 } 3136 3137 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){ 3138 // Resetting position to point to the next unprocessed char. We 3139 // overshot it when doing our test/set for numbers. 3140 if (char32 > 0xFFFF) { // For surrogates. 3141 loadState(source, &digitState, TRUE); 3142 //goBackOne(source); 3143 } 3144 goBackOne(source); 3145 break; 3146 } 3147 } else { 3148 break; 3149 } 3150 } 3151 3152 if (nonZeroValReached == FALSE){ 3153 digIndx = 2; 3154 numTempBuf[2] = 6; 3155 } 3156 3157 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; 3158 if (digIndx % 2 != 0){ 3159 /* 3160 We missed a value. 
Since digIndx isn't even, stuck too many values into the buffer (this is what 3161 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. 3162 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a 3163 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. 3164 */ 3165 3166 for(i = 2; i < endIndex; i++){ 3167 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + 3168 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; 3169 } 3170 --digIndx; 3171 } 3172 3173 // Subtract one off of the last byte. 3174 numTempBuf[endIndex-1] -= 1; 3175 3176 /* 3177 We want to skip over the first two slots in the buffer. The first slot 3178 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3179 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 3180 */ 3181 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3182 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); 3183 3184 // Now transfer the collation key to our collIterate struct. 3185 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. 3186 //size = ((endIndex+1) & ~1)/2; 3187 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3188 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3189 UCOL_BYTE_COMMON; // Tertiary weight. 3190 i = 2; // Reset the index into the buffer. 
3191 while(i < endIndex) 3192 { 3193 uint32_t primWeight = numTempBuf[i++] << 8; 3194 if ( i < endIndex) 3195 primWeight |= numTempBuf[i++]; 3196 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3197 } 3198 3199 } else { 3200 // no numeric mode, we'll just switch to whatever we stashed and continue 3201 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 3202 CE = *CEOffset++; 3203 break; 3204 } 3205 return CE; 3206 } 3207 /* various implicits optimization */ 3208 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 3209 /* UCA is filled with these. Tailorings are NOT_FOUND */ 3210 return getImplicit(cp, source); 3211 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 3212 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit 3213 return getImplicit(cp, source); 3214 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3215 { 3216 static const uint32_t 3217 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3218 //const uint32_t LCount = 19; 3219 static const uint32_t VCount = 21; 3220 static const uint32_t TCount = 28; 3221 //const uint32_t NCount = VCount * TCount; // 588 3222 //const uint32_t SCount = LCount * NCount; // 11172 3223 uint32_t L = ch - SBase; 3224 3225 // divide into pieces 3226 3227 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation 3228 L /= TCount; 3229 uint32_t V = L % VCount; 3230 L /= VCount; 3231 3232 // offset them 3233 3234 L += LBase; 3235 V += VBase; 3236 T += TBase; 3237 3238 // return the first CE, but first put the rest into the expansion buffer 3239 if (!source->coll->image->jamoSpecial) { // FAST PATH 3240 3241 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3242 if (T != TBase) { 3243 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3244 } 3245 3246 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3247 3248 } 
else { // Jamo is Special 3249 // Since Hanguls pass the FCD check, it is 3250 // guaranteed that we won't be in 3251 // the normalization buffer if something like this happens 3252 3253 // However, if we are using a uchar iterator and normalization 3254 // is ON, the Hangul that lead us here is going to be in that 3255 // normalization buffer. Here we want to restore the uchar 3256 // iterator state and pull out of the normalization buffer 3257 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { 3258 source->flags = source->origFlags; // restore the iterator 3259 source->pos = NULL; 3260 } 3261 3262 // Move Jamos into normalization buffer 3263 UChar *buffer = source->writableBuffer.getBuffer(4); 3264 int32_t bufferLength; 3265 buffer[0] = (UChar)L; 3266 buffer[1] = (UChar)V; 3267 if (T != TBase) { 3268 buffer[2] = (UChar)T; 3269 bufferLength = 3; 3270 } else { 3271 bufferLength = 2; 3272 } 3273 source->writableBuffer.releaseBuffer(bufferLength); 3274 3275 // Indicate where to continue in main input string after exhausting the writableBuffer 3276 source->fcdPosition = source->pos; 3277 3278 source->pos = source->writableBuffer.getTerminatedBuffer(); 3279 source->origFlags = source->flags; 3280 source->flags |= UCOL_ITER_INNORMBUF; 3281 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 3282 3283 return(UCOL_IGNORABLE); 3284 } 3285 } 3286 case SURROGATE_TAG: 3287 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ 3288 /* two things can happen here: next code point can be a trailing surrogate - we will use it */ 3289 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ 3290 /* we treat it like an unassigned code point. 
*/ 3291 { 3292 UChar trail; 3293 collIterateState state; 3294 backupState(source, &state); 3295 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { 3296 // we chould have stepped one char forward and it might have turned that it 3297 // was not a trail surrogate. In that case, we have to backup. 3298 loadState(source, &state, TRUE); 3299 return UCOL_NOT_FOUND; 3300 } else { 3301 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ 3302 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail); 3303 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. 3304 // We need to backup 3305 loadState(source, &state, TRUE); 3306 return CE; 3307 } 3308 // calculate the supplementary code point value, if surrogate was not tailored 3309 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 3310 } 3311 } 3312 break; 3313 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 3314 UChar nextChar; 3315 if( source->flags & UCOL_USE_ITERATOR) { 3316 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { 3317 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3318 source->iterator->next(source->iterator); 3319 return getImplicit(cp, source); 3320 } 3321 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) && 3322 U_IS_TRAIL((nextChar=*source->pos))) { 3323 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3324 source->pos++; 3325 return getImplicit(cp, source); 3326 } 3327 return UCOL_NOT_FOUND; 3328 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 3329 return UCOL_NOT_FOUND; /* broken surrogate sequence */ 3330 case CHARSET_TAG: 3331 /* not yet implemented */ 3332 /* probably after 1.8 */ 3333 return UCOL_NOT_FOUND; 3334 default: 3335 *status = U_INTERNAL_PROGRAM_ERROR; 3336 CE=0; 3337 break; 3338 } 3339 if (CE <= UCOL_NOT_FOUND) break; 3340 } 3341 return CE; 3342 } 3343 3344 3345 /* now uses Mark's 
getImplicitPrimary code */ 3346 static 3347 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { 3348 uint32_t r = uprv_uca_getImplicitPrimary(cp); 3349 3350 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; 3351 collationSource->toReturn = collationSource->CEpos; 3352 3353 // **** doesn't work if using iterator **** 3354 if (collationSource->flags & UCOL_ITER_INNORMBUF) { 3355 collationSource->offsetRepeatCount = 1; 3356 } else { 3357 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string); 3358 3359 UErrorCode errorCode = U_ZERO_ERROR; 3360 collationSource->appendOffset(firstOffset, errorCode); 3361 collationSource->appendOffset(firstOffset + 1, errorCode); 3362 3363 collationSource->offsetReturn = collationSource->offsetStore - 1; 3364 *(collationSource->offsetBuffer) = firstOffset; 3365 if (collationSource->offsetReturn == collationSource->offsetBuffer) { 3366 collationSource->offsetStore = collationSource->offsetBuffer; 3367 } 3368 } 3369 3370 return ((r & 0x0000FFFF)<<16) | 0x000000C0; 3371 } 3372 3373 /** 3374 * This function handles the special CEs like contractions, expansions, 3375 * surrogates, Thai. 
3376 * It is called by both getPrevCE 3377 */ 3378 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, 3379 collIterate *source, 3380 UErrorCode *status) 3381 { 3382 const uint32_t *CEOffset = NULL; 3383 UChar *UCharOffset = NULL; 3384 UChar schar; 3385 const UChar *constart = NULL; 3386 uint32_t size; 3387 UChar buffer[UCOL_MAX_BUFFER]; 3388 uint32_t *endCEBuffer; 3389 UChar *strbuffer; 3390 int32_t noChars = 0; 3391 int32_t CECount = 0; 3392 3393 for(;;) 3394 { 3395 /* the only ces that loops are thai and contractions */ 3396 switch (getCETag(CE)) 3397 { 3398 case NOT_FOUND_TAG: /* this tag always returns */ 3399 return CE; 3400 3401 case SPEC_PROC_TAG: 3402 { 3403 // Special processing is getting a CE that is preceded by a certain prefix 3404 // Currently this is only needed for optimizing Japanese length and iteration marks. 3405 // When we encouter a special processing tag, we go backwards and try to see if 3406 // we have a match. 3407 // Contraction tables are used - so the whole process is not unlike contraction. 3408 // prefix data is stored backwards in the table. 
3409 const UChar *UCharOffset; 3410 UChar schar, tchar; 3411 collIterateState prefixState; 3412 backupState(source, &prefixState); 3413 for(;;) { 3414 // This loop will run once per source string character, for as long as we 3415 // are matching a potential contraction sequence 3416 3417 // First we position ourselves at the begining of contraction sequence 3418 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 3419 3420 if (collIter_bos(source)) { 3421 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 3422 break; 3423 } 3424 schar = getPrevNormalizedChar(source, status); 3425 goBackOne(source); 3426 3427 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 3428 UCharOffset++; 3429 } 3430 3431 if (schar == tchar) { 3432 // Found the source string char in the table. 3433 // Pick up the corresponding CE from the table. 3434 CE = *(coll->contractionCEs + 3435 (UCharOffset - coll->contractionIndex)); 3436 } 3437 else 3438 { 3439 // if there is a completely ignorable code point in the middle of 3440 // a prefix, we need to act as if it's not there 3441 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) 3442 // lone surrogates cannot be set to zero as it would break other processing 3443 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 3444 // it's easy for BMP code points 3445 if(isZeroCE == 0) { 3446 continue; 3447 } else if(U16_IS_SURROGATE(schar)) { 3448 // for supplementary code points, we have to check the next one 3449 // situations where we are going to ignore 3450 // 1. beginning of the string: schar is a lone surrogate 3451 // 2. schar is a lone surrogate 3452 // 3. schar is a trail surrogate in a valid surrogate sequence 3453 // that is explicitly set to zero. 
3454 if (!collIter_bos(source)) { 3455 UChar lead; 3456 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) { 3457 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead); 3458 if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) { 3459 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); 3460 if(finalCE == 0) { 3461 // this is a real, assigned completely ignorable code point 3462 goBackOne(source); 3463 continue; 3464 } 3465 } 3466 } else { 3467 // lone surrogate, treat like unassigned 3468 return UCOL_NOT_FOUND; 3469 } 3470 } else { 3471 // lone surrogate at the beggining, treat like unassigned 3472 return UCOL_NOT_FOUND; 3473 } 3474 } 3475 // Source string char was not in the table. 3476 // We have not found the prefix. 3477 CE = *(coll->contractionCEs + 3478 (ContractionStart - coll->contractionIndex)); 3479 } 3480 3481 if(!isPrefix(CE)) { 3482 // The source string char was in the contraction table, and the corresponding 3483 // CE is not a prefix CE. We found the prefix, break 3484 // out of loop, this CE will end up being returned. This is the normal 3485 // way out of prefix handling when the source actually contained 3486 // the prefix. 3487 break; 3488 } 3489 } 3490 loadState(source, &prefixState, TRUE); 3491 break; 3492 } 3493 3494 case CONTRACTION_TAG: { 3495 /* to ensure that the backwards and forwards iteration matches, we 3496 take the current region of most possible match and pass it through 3497 the forward iteration. this will ensure that the obstinate problem of 3498 overlapping contractions will not occur. 
3499 */ 3500 schar = peekCodeUnit(source, 0); 3501 constart = (UChar *)coll->image + getContractOffset(CE); 3502 if (isAtStartPrevIterate(source) 3503 /* commented away contraction end checks after adding the checks 3504 in getPrevCE */) { 3505 /* start of string or this is not the end of any contraction */ 3506 CE = *(coll->contractionCEs + 3507 (constart - coll->contractionIndex)); 3508 break; 3509 } 3510 strbuffer = buffer; 3511 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); 3512 *(UCharOffset --) = 0; 3513 noChars = 0; 3514 // have to swap thai characters 3515 while (ucol_unsafeCP(schar, coll)) { 3516 *(UCharOffset) = schar; 3517 noChars++; 3518 UCharOffset --; 3519 schar = getPrevNormalizedChar(source, status); 3520 goBackOne(source); 3521 // TODO: when we exhaust the contraction buffer, 3522 // it needs to get reallocated. The problem is 3523 // that the size depends on the string which is 3524 // not iterated over. However, since we're travelling 3525 // backwards, we already had to set the iterator at 3526 // the end - so we might as well know where we are? 
3527 if (UCharOffset + 1 == buffer) { 3528 /* we have exhausted the buffer */ 3529 int32_t newsize = 0; 3530 if(source->pos) { // actually dealing with a position 3531 newsize = (int32_t)(source->pos - source->string + 1); 3532 } else { // iterator 3533 newsize = 4 * UCOL_MAX_BUFFER; 3534 } 3535 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * 3536 (newsize + UCOL_MAX_BUFFER)); 3537 /* test for NULL */ 3538 if (strbuffer == NULL) { 3539 *status = U_MEMORY_ALLOCATION_ERROR; 3540 return UCOL_NO_MORE_CES; 3541 } 3542 UCharOffset = strbuffer + newsize; 3543 uprv_memcpy(UCharOffset, buffer, 3544 UCOL_MAX_BUFFER * sizeof(UChar)); 3545 UCharOffset --; 3546 } 3547 if ((source->pos && (source->pos == source->string || 3548 ((source->flags & UCOL_ITER_INNORMBUF) && 3549 *(source->pos - 1) == 0 && source->fcdPosition == NULL))) 3550 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { 3551 break; 3552 } 3553 } 3554 /* adds the initial base character to the string */ 3555 *(UCharOffset) = schar; 3556 noChars++; 3557 3558 int32_t offsetBias; 3559 3560 // **** doesn't work if using iterator **** 3561 if (source->flags & UCOL_ITER_INNORMBUF) { 3562 offsetBias = -1; 3563 } else { 3564 offsetBias = (int32_t)(source->pos - source->string); 3565 } 3566 3567 /* a new collIterate is used to simplify things, since using the current 3568 collIterate will mean that the forward and backwards iteration will 3569 share and change the same buffers. we don't want to get into that. */ 3570 collIterate temp; 3571 int32_t rawOffset; 3572 3573 IInit_collIterate(coll, UCharOffset, noChars, &temp, status); 3574 if(U_FAILURE(*status)) { 3575 return UCOL_NULLORDER; 3576 } 3577 temp.flags &= ~UCOL_ITER_NORM; 3578 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; 3579 3580 rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero? 
3581 CE = ucol_IGetNextCE(coll, &temp, status); 3582 3583 if (source->extendCEs) { 3584 endCEBuffer = source->extendCEs + source->extendCEsSize; 3585 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t)); 3586 } else { 3587 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; 3588 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t)); 3589 } 3590 3591 while (CE != UCOL_NO_MORE_CES) { 3592 *(source->CEpos ++) = CE; 3593 3594 if (offsetBias >= 0) { 3595 source->appendOffset(rawOffset + offsetBias, *status); 3596 } 3597 3598 CECount++; 3599 if (source->CEpos == endCEBuffer) { 3600 /* ran out of CE space, reallocate to new buffer. 3601 If reallocation fails, reset pointers and bail out, 3602 there's no guarantee of the right character position after 3603 this bail*/ 3604 if (!increaseCEsCapacity(source)) { 3605 *status = U_MEMORY_ALLOCATION_ERROR; 3606 break; 3607 } 3608 3609 endCEBuffer = source->extendCEs + source->extendCEsSize; 3610 } 3611 3612 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { 3613 rawOffset = (int32_t)(temp.fcdPosition - temp.string); 3614 } else { 3615 rawOffset = (int32_t)(temp.pos - temp.string); 3616 } 3617 3618 CE = ucol_IGetNextCE(coll, &temp, status); 3619 } 3620 3621 if (strbuffer != buffer) { 3622 uprv_free(strbuffer); 3623 } 3624 if (U_FAILURE(*status)) { 3625 return (uint32_t)UCOL_NULLORDER; 3626 } 3627 3628 if (source->offsetRepeatValue != 0) { 3629 if (CECount > noChars) { 3630 source->offsetRepeatCount += temp.offsetRepeatCount; 3631 } else { 3632 // **** does this really skip the right offsets? 
**** 3633 source->offsetReturn -= (noChars - CECount); 3634 } 3635 } 3636 3637 if (offsetBias >= 0) { 3638 source->offsetReturn = source->offsetStore - 1; 3639 if (source->offsetReturn == source->offsetBuffer) { 3640 source->offsetStore = source->offsetBuffer; 3641 } 3642 } 3643 3644 source->toReturn = source->CEpos - 1; 3645 if (source->toReturn == source->CEs) { 3646 source->CEpos = source->CEs; 3647 } 3648 3649 return *(source->toReturn); 3650 } 3651 case LONG_PRIMARY_TAG: 3652 { 3653 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 3654 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 3655 source->toReturn = source->CEpos - 1; 3656 3657 if (source->flags & UCOL_ITER_INNORMBUF) { 3658 source->offsetRepeatCount = 1; 3659 } else { 3660 int32_t firstOffset = (int32_t)(source->pos - source->string); 3661 3662 source->appendOffset(firstOffset, *status); 3663 source->appendOffset(firstOffset + 1, *status); 3664 3665 source->offsetReturn = source->offsetStore - 1; 3666 *(source->offsetBuffer) = firstOffset; 3667 if (source->offsetReturn == source->offsetBuffer) { 3668 source->offsetStore = source->offsetBuffer; 3669 } 3670 } 3671 3672 3673 return *(source->toReturn); 3674 } 3675 3676 case EXPANSION_TAG: /* this tag always returns */ 3677 { 3678 /* 3679 This should handle expansion. 3680 NOTE: we can encounter both continuations and expansions in an expansion! 3681 I have to decide where continuations are going to be dealt with 3682 */ 3683 int32_t firstOffset = (int32_t)(source->pos - source->string); 3684 3685 // **** doesn't work if using iterator **** 3686 if (source->offsetReturn != NULL) { 3687 if (! 
(source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) { 3688 source->offsetStore = source->offsetBuffer; 3689 }else { 3690 firstOffset = -1; 3691 } 3692 } 3693 3694 /* find the offset to expansion table */ 3695 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3696 size = getExpansionCount(CE); 3697 if (size != 0) { 3698 /* 3699 if there are less than 16 elements in expansion, we don't terminate 3700 */ 3701 uint32_t count; 3702 3703 for (count = 0; count < size; count++) { 3704 *(source->CEpos ++) = *CEOffset++; 3705 3706 if (firstOffset >= 0) { 3707 source->appendOffset(firstOffset + 1, *status); 3708 } 3709 } 3710 } else { 3711 /* else, we do */ 3712 while (*CEOffset != 0) { 3713 *(source->CEpos ++) = *CEOffset ++; 3714 3715 if (firstOffset >= 0) { 3716 source->appendOffset(firstOffset + 1, *status); 3717 } 3718 } 3719 } 3720 3721 if (firstOffset >= 0) { 3722 source->offsetReturn = source->offsetStore - 1; 3723 *(source->offsetBuffer) = firstOffset; 3724 if (source->offsetReturn == source->offsetBuffer) { 3725 source->offsetStore = source->offsetBuffer; 3726 } 3727 } else { 3728 source->offsetRepeatCount += size - 1; 3729 } 3730 3731 source->toReturn = source->CEpos - 1; 3732 // in case of one element expansion, we 3733 // want to immediately return CEpos 3734 if(source->toReturn == source->CEs) { 3735 source->CEpos = source->CEs; 3736 } 3737 3738 return *(source->toReturn); 3739 } 3740 3741 case DIGIT_TAG: 3742 { 3743 /* 3744 We do a check to see if we want to collate digits as numbers; if so we generate 3745 a custom collation key. Otherwise we pull out the value stored in the expansion table. 
3746 */ 3747 uint32_t i; /* general counter */ 3748 3749 if (source->coll->numericCollation == UCOL_ON){ 3750 uint32_t digIndx = 0; 3751 uint32_t endIndex = 0; 3752 uint32_t leadingZeroIndex = 0; 3753 uint32_t trailingZeroCount = 0; 3754 3755 uint8_t collateVal = 0; 3756 3757 UBool nonZeroValReached = FALSE; 3758 3759 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs. 3760 /* 3761 We parse the source string until we hit a char that's NOT a digit. 3762 Use this u_charDigitValue. This might be slow because we have to 3763 handle surrogates... 3764 */ 3765 /* 3766 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less, 3767 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation 3768 element we process when going backward. To determine how long that chunk might be, we may need to make 3769 two passes through the loop that collects digits - one to see how long the string is (and how much is 3770 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has 3771 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation 3772 element chunk after resetting the state to the initialState at the right side of the digit string. 
3773 */ 3774 uint32_t ceLimit = 0; 3775 UChar initial_ch = ch; 3776 collIterateState initialState = {0,0,0,0,0,0,0,0,0}; 3777 backupState(source, &initialState); 3778 3779 for(;;) { 3780 collIterateState state = {0,0,0,0,0,0,0,0,0}; 3781 UChar32 char32 = 0; 3782 int32_t digVal = 0; 3783 3784 if (U16_IS_TRAIL (ch)) { 3785 if (!collIter_bos(source)){ 3786 UChar lead = getPrevNormalizedChar(source, status); 3787 if(U16_IS_LEAD(lead)) { 3788 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3789 goBackOne(source); 3790 } else { 3791 char32 = ch; 3792 } 3793 } else { 3794 char32 = ch; 3795 } 3796 } else { 3797 char32 = ch; 3798 } 3799 digVal = u_charDigitValue(char32); 3800 3801 for(;;) { 3802 // Make sure we have enough space. No longer needed; 3803 // at this point the largest value of digIndx when we need to save data in numTempBuf 3804 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure 3805 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2). 3806 3807 // Skip over trailing zeroes, and keep a count of them. 3808 if (digVal != 0) 3809 nonZeroValReached = TRUE; 3810 3811 if (nonZeroValReached) { 3812 /* 3813 We parse the digit string into base 100 numbers (this fits into a byte). 3814 We only add to the buffer in twos, thus if we are parsing an odd character, 3815 that serves as the 'tens' digit while the if we are parsing an even one, that 3816 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3817 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3818 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3819 than all the other bytes. 3820 3821 Since we're doing in this reverse we want to put the first digit encountered into the 3822 ones place and the second digit encountered into the tens place. 
3823 */ 3824 3825 if ((digIndx + trailingZeroCount) % 2 == 1) { 3826 // High-order digit case (tens place) 3827 collateVal += (uint8_t)(digVal * 10); 3828 3829 // We cannot set leadingZeroIndex unless it has been set for the 3830 // low-order digit. Therefore, all we can do for the high-order 3831 // digit is turn it off, never on. 3832 // The only time we will have a high digit without a low is for 3833 // the very first non-zero digit, so no zero check is necessary. 3834 if (collateVal != 0) 3835 leadingZeroIndex = 0; 3836 3837 // The first pass through, digIndx may exceed the limit, but in that case 3838 // we no longer care about numTempBuf contents since they will be discarded 3839 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) { 3840 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3841 } 3842 collateVal = 0; 3843 } else { 3844 // Low-order digit case (ones place) 3845 collateVal = (uint8_t)digVal; 3846 3847 // Check for leading zeroes. 3848 if (collateVal == 0) { 3849 if (!leadingZeroIndex) 3850 leadingZeroIndex = (digIndx/2) + 2; 3851 } else 3852 leadingZeroIndex = 0; 3853 3854 // No need to write to buffer; the case of a last odd digit 3855 // is handled below. 3856 } 3857 ++digIndx; 3858 } else 3859 ++trailingZeroCount; 3860 3861 if (!collIter_bos(source)) { 3862 ch = getPrevNormalizedChar(source, status); 3863 //goBackOne(source); 3864 if (U16_IS_TRAIL(ch)) { 3865 backupState(source, &state); 3866 if (!collIter_bos(source)) { 3867 goBackOne(source); 3868 UChar lead = getPrevNormalizedChar(source, status); 3869 3870 if(U16_IS_LEAD(lead)) { 3871 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3872 } else { 3873 loadState(source, &state, FALSE); 3874 char32 = ch; 3875 } 3876 } 3877 } else 3878 char32 = ch; 3879 3880 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { 3881 if (char32 > 0xFFFF) {// For surrogates. 
3882 loadState(source, &state, FALSE); 3883 } 3884 // Don't need to "reverse" the goBackOne call, 3885 // as this points to the next position to process.. 3886 //if (char32 > 0xFFFF) // For surrogates. 3887 //getNextNormalizedChar(source); 3888 break; 3889 } 3890 3891 goBackOne(source); 3892 }else 3893 break; 3894 } 3895 3896 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) { 3897 // our collation element is not too big, go ahead and finish with it 3898 break; 3899 } 3900 // our digit string is too long for a collation element; 3901 // set the limit for it, reset the state and begin again 3902 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER; 3903 if ( ceLimit == 0 ) { 3904 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; 3905 } 3906 ch = initial_ch; 3907 loadState(source, &initialState, FALSE); 3908 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0; 3909 collateVal = 0; 3910 nonZeroValReached = FALSE; 3911 } 3912 3913 if (! nonZeroValReached) { 3914 digIndx = 2; 3915 trailingZeroCount = 0; 3916 numTempBuf[2] = 6; 3917 } 3918 3919 if ((digIndx + trailingZeroCount) % 2 != 0) { 3920 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; 3921 digIndx += 1; // The implicit leading zero 3922 } 3923 if (trailingZeroCount % 2 != 0) { 3924 // We had to consume one trailing zero for the low digit 3925 // of the least significant byte 3926 digIndx += 1; // The trailing zero not in the exponent 3927 trailingZeroCount -= 1; 3928 } 3929 3930 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; 3931 3932 // Subtract one off of the last byte. Really the first byte here, but it's reversed... 3933 numTempBuf[2] -= 1; 3934 3935 /* 3936 We want to skip over the first two slots in the buffer. The first slot 3937 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3938 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 
3939 The exponent must be adjusted by the number of leading zeroes, and the number of 3940 trailing zeroes. 3941 */ 3942 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3943 uint32_t exponent = (digIndx+trailingZeroCount)/2; 3944 if (leadingZeroIndex) 3945 exponent -= ((digIndx/2) + 2 - leadingZeroIndex); 3946 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); 3947 3948 // Now transfer the collation key to our collIterate struct. 3949 // The total size for our collation key is half of endIndex, rounded up. 3950 int32_t size = (endIndex+1)/2; 3951 if(!ensureCEsCapacity(source, size)) { 3952 return UCOL_NULLORDER; 3953 } 3954 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3955 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3956 UCOL_BYTE_COMMON; // Tertiary weight. 3957 i = endIndex - 1; // Reset the index into the buffer. 3958 while(i >= 2) { 3959 uint32_t primWeight = numTempBuf[i--] << 8; 3960 if ( i >= 2) 3961 primWeight |= numTempBuf[i--]; 3962 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3963 } 3964 3965 source->toReturn = source->CEpos -1; 3966 return *(source->toReturn); 3967 } else { 3968 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3969 CE = *(CEOffset++); 3970 break; 3971 } 3972 } 3973 3974 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3975 { 3976 static const uint32_t 3977 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3978 //const uint32_t LCount = 19; 3979 static const uint32_t VCount = 21; 3980 static const uint32_t TCount = 28; 3981 //const uint32_t NCount = VCount * TCount; /* 588 */ 3982 //const uint32_t SCount = LCount * NCount; /* 11172 */ 3983 3984 uint32_t L = ch - SBase; 3985 /* 3986 divide into pieces. 
3987 we do it in this order since some compilers can do % and / in one 3988 operation 3989 */ 3990 uint32_t T = L % TCount; 3991 L /= TCount; 3992 uint32_t V = L % VCount; 3993 L /= VCount; 3994 3995 /* offset them */ 3996 L += LBase; 3997 V += VBase; 3998 T += TBase; 3999 4000 int32_t firstOffset = (int32_t)(source->pos - source->string); 4001 source->appendOffset(firstOffset, *status); 4002 4003 /* 4004 * return the first CE, but first put the rest into the expansion buffer 4005 */ 4006 if (!source->coll->image->jamoSpecial) { 4007 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 4008 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 4009 source->appendOffset(firstOffset + 1, *status); 4010 4011 if (T != TBase) { 4012 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 4013 source->appendOffset(firstOffset + 1, *status); 4014 } 4015 4016 source->toReturn = source->CEpos - 1; 4017 4018 source->offsetReturn = source->offsetStore - 1; 4019 if (source->offsetReturn == source->offsetBuffer) { 4020 source->offsetStore = source->offsetBuffer; 4021 } 4022 4023 return *(source->toReturn); 4024 } else { 4025 // Since Hanguls pass the FCD check, it is 4026 // guaranteed that we won't be in 4027 // the normalization buffer if something like this happens 4028 4029 // Move Jamos into normalization buffer 4030 UChar *tempbuffer = source->writableBuffer.getBuffer(5); 4031 int32_t tempbufferLength, jamoOffset; 4032 tempbuffer[0] = 0; 4033 tempbuffer[1] = (UChar)L; 4034 tempbuffer[2] = (UChar)V; 4035 if (T != TBase) { 4036 tempbuffer[3] = (UChar)T; 4037 tempbufferLength = 4; 4038 } else { 4039 tempbufferLength = 3; 4040 } 4041 source->writableBuffer.releaseBuffer(tempbufferLength); 4042 4043 // Indicate where to continue in main input string after exhausting the writableBuffer 4044 if (source->pos == source->string) { 4045 jamoOffset = 0; 4046 source->fcdPosition = NULL; 4047 } else { 4048 jamoOffset = source->pos - source->string; 4049 
source->fcdPosition = source->pos-1; 4050 } 4051 4052 // Append offsets for the additional chars 4053 // (not the 0, and not the L whose offsets match the original Hangul) 4054 int32_t jamoRemaining = tempbufferLength - 2; 4055 jamoOffset++; // appended offsets should match end of original Hangul 4056 while (jamoRemaining-- > 0) { 4057 source->appendOffset(jamoOffset, *status); 4058 } 4059 4060 source->offsetRepeatValue = jamoOffset; 4061 4062 source->offsetReturn = source->offsetStore - 1; 4063 if (source->offsetReturn == source->offsetBuffer) { 4064 source->offsetStore = source->offsetBuffer; 4065 } 4066 4067 source->pos = source->writableBuffer.getTerminatedBuffer() + tempbufferLength; 4068 source->origFlags = source->flags; 4069 source->flags |= UCOL_ITER_INNORMBUF; 4070 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 4071 4072 return(UCOL_IGNORABLE); 4073 } 4074 } 4075 4076 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 4077 return getPrevImplicit(ch, source); 4078 4079 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function 4080 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 4081 return getPrevImplicit(ch, source); 4082 4083 case SURROGATE_TAG: /* This is a surrogate pair */ 4084 /* essentially an engaged lead surrogate. 
*/ 4085 /* if you have encountered it here, it means that a */ 4086 /* broken sequence was encountered and this is an error */ 4087 return UCOL_NOT_FOUND; 4088 4089 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 4090 return UCOL_NOT_FOUND; /* broken surrogate sequence */ 4091 4092 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 4093 { 4094 UChar32 cp = 0; 4095 UChar prevChar; 4096 const UChar *prev; 4097 if (isAtStartPrevIterate(source)) { 4098 /* we are at the start of the string, wrong place to be at */ 4099 return UCOL_NOT_FOUND; 4100 } 4101 if (source->pos != source->writableBuffer.getBuffer()) { 4102 prev = source->pos - 1; 4103 } else { 4104 prev = source->fcdPosition; 4105 } 4106 prevChar = *prev; 4107 4108 /* Handles Han and Supplementary characters here.*/ 4109 if (U16_IS_LEAD(prevChar)) { 4110 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 4111 source->pos = prev; 4112 } else { 4113 return UCOL_NOT_FOUND; /* like unassigned */ 4114 } 4115 4116 return getPrevImplicit(cp, source); 4117 } 4118 4119 /* UCA is filled with these. Tailorings are NOT_FOUND */ 4120 /* not yet implemented */ 4121 case CHARSET_TAG: /* this tag always returns */ 4122 /* probably after 1.8 */ 4123 return UCOL_NOT_FOUND; 4124 4125 default: /* this tag always returns */ 4126 *status = U_INTERNAL_PROGRAM_ERROR; 4127 CE=0; 4128 break; 4129 } 4130 4131 if (CE <= UCOL_NOT_FOUND) { 4132 break; 4133 } 4134 } 4135 4136 return CE; 4137 } 4138 4139 /* This should really be a macro */ 4140 /* This function is used to reverse parts of a buffer. 
We need this operation when doing continuation */ 4141 /* secondaries in French */ 4142 /* 4143 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { 4144 uint8_t temp; 4145 while(start<end) { 4146 temp = *start; 4147 *start++ = *end; 4148 *end-- = temp; 4149 } 4150 } 4151 */ 4152 4153 #define uprv_ucol_reverse_buffer(TYPE, start, end) { \ 4154 TYPE tempA; \ 4155 while((start)<(end)) { \ 4156 tempA = *(start); \ 4157 *(start)++ = *(end); \ 4158 *(end)-- = tempA; \ 4159 } \ 4160 } 4161 4162 /****************************************************************************/ 4163 /* Following are the sortkey generation functions */ 4164 /* */ 4165 /****************************************************************************/ 4166 4167 /** 4168 * Merge two sort keys. 4169 * This is useful, for example, to combine sort keys from first and last names 4170 * to sort such pairs. 4171 * Merged sort keys consider on each collation level the first part first entirely, 4172 * then the second one. 4173 * It is possible to merge multiple sort keys by consecutively merging 4174 * another one with the intermediate result. 4175 * 4176 * The length of the merge result is the sum of the lengths of the input sort keys 4177 * minus 1. 
4178 * 4179 * @param src1 the first sort key 4180 * @param src1Length the length of the first sort key, including the zero byte at the end; 4181 * can be -1 if the function is to find the length 4182 * @param src2 the second sort key 4183 * @param src2Length the length of the second sort key, including the zero byte at the end; 4184 * can be -1 if the function is to find the length 4185 * @param dest the buffer where the merged sort key is written, 4186 * can be NULL if destCapacity==0 4187 * @param destCapacity the number of bytes in the dest buffer 4188 * @return the length of the merged sort key, src1Length+src2Length-1; 4189 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments), 4190 * in which cases the contents of dest is undefined 4191 * 4192 * @draft 4193 */ 4194 U_CAPI int32_t U_EXPORT2 4195 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 4196 const uint8_t *src2, int32_t src2Length, 4197 uint8_t *dest, int32_t destCapacity) { 4198 int32_t destLength; 4199 uint8_t b; 4200 4201 /* check arguments */ 4202 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 4203 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 4204 destCapacity<0 || (destCapacity>0 && dest==NULL) 4205 ) { 4206 /* error, attempt to write a zero byte and return 0 */ 4207 if(dest!=NULL && destCapacity>0) { 4208 *dest=0; 4209 } 4210 return 0; 4211 } 4212 4213 /* check lengths and capacity */ 4214 if(src1Length<0) { 4215 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 4216 } 4217 if(src2Length<0) { 4218 src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 4219 } 4220 4221 destLength=src1Length+src2Length-1; 4222 if(destLength>destCapacity) { 4223 /* the merged sort key does not fit into the destination */ 4224 return destLength; 4225 } 4226 4227 /* merge the sort keys with the same number of levels */ 4228 while(*src1!=0 && *src2!=0) { /* while both have 
another level */ 4229 /* copy level from src1 not including 00 or 01 */ 4230 while((b=*src1)>=2) { 4231 ++src1; 4232 *dest++=b; 4233 } 4234 4235 /* add a 02 merge separator */ 4236 *dest++=2; 4237 4238 /* copy level from src2 not including 00 or 01 */ 4239 while((b=*src2)>=2) { 4240 ++src2; 4241 *dest++=b; 4242 } 4243 4244 /* if both sort keys have another level, then add a 01 level separator and continue */ 4245 if(*src1==1 && *src2==1) { 4246 ++src1; 4247 ++src2; 4248 *dest++=1; 4249 } 4250 } 4251 4252 /* 4253 * here, at least one sort key is finished now, but the other one 4254 * might have some contents left from containing more levels; 4255 * that contents is just appended to the result 4256 */ 4257 if(*src1!=0) { 4258 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 4259 src2=src1; 4260 } 4261 /* append src2, "the other, unfinished sort key" */ 4262 uprv_strcpy((char *)dest, (const char *)src2); 4263 4264 /* trust that neither sort key contained illegally embedded zero bytes */ 4265 return destLength; 4266 } 4267 4268 U_NAMESPACE_BEGIN 4269 4270 class SortKeyByteSink : public ByteSink { 4271 public: 4272 static const uint32_t FILL_ORIGINAL_BUFFER = 1; 4273 static const uint32_t DONT_GROW = 2; 4274 SortKeyByteSink(char *dest, int32_t destCapacity, uint32_t flags=0) 4275 : ownedBuffer_(NULL), buffer_(dest), capacity_(destCapacity), 4276 appended_(0), 4277 fill_(flags & FILL_ORIGINAL_BUFFER), 4278 grow_((flags & DONT_GROW) == 0) { 4279 if (buffer_ == NULL || capacity_ < 0) { 4280 buffer_ = reinterpret_cast<char *>(&lastResortByte_); 4281 capacity_ = 0; 4282 } 4283 } 4284 virtual ~SortKeyByteSink() { uprv_free(ownedBuffer_); } 4285 4286 virtual void Append(const char *bytes, int32_t n); 4287 void Append(const uint8_t *bytes, int32_t n) { Append(reinterpret_cast<const char *>(bytes), n); } 4288 void Append(uint8_t b) { 4289 if (appended_ < capacity_) { 4290 buffer_[appended_++] = (char)b; 4291 } else { 4292 Append(&b, 1); 4293 } 4294 } 4295 
void Append(uint8_t b1, uint8_t b2) { 4296 int32_t a2 = appended_ + 2; 4297 if (a2 <= capacity_) { 4298 buffer_[appended_] = (char)b1; 4299 buffer_[appended_ + 1] = (char)b2; 4300 appended_ = a2; 4301 } else { 4302 char bytes[2] = { (char)b1, (char)b2 }; 4303 Append(bytes, 2); 4304 } 4305 } 4306 void Append(const SortKeyByteSink &other) { Append(other.buffer_, other.appended_); } 4307 virtual char *GetAppendBuffer(int32_t min_capacity, 4308 int32_t desired_capacity_hint, 4309 char *scratch, int32_t scratch_capacity, 4310 int32_t *result_capacity); 4311 int32_t NumberOfBytesAppended() const { return appended_; } 4312 uint8_t &LastByte() { 4313 if (buffer_ != NULL && appended_ > 0) { 4314 return reinterpret_cast<uint8_t *>(buffer_)[appended_ - 1]; 4315 } else { 4316 return lastResortByte_; 4317 } 4318 } 4319 uint8_t *GetLastFewBytes(int32_t n) { 4320 if (buffer_ != NULL && appended_ >= n) { 4321 return reinterpret_cast<uint8_t *>(buffer_) + appended_ - n; 4322 } else { 4323 return NULL; 4324 } 4325 } 4326 char *GetBuffer() { return buffer_; } 4327 uint8_t *GetUnsignedBuffer() { return reinterpret_cast<uint8_t *>(buffer_); } 4328 uint8_t *OrphanUnsignedBuffer(int32_t &orphanedCapacity); 4329 UBool IsOk() const { return buffer_ != NULL; } // otherwise out-of-memory 4330 4331 private: 4332 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented 4333 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented 4334 4335 UBool Resize(int32_t appendCapacity, int32_t length); 4336 void SetNotOk() { 4337 buffer_ = NULL; 4338 capacity_ = 0; 4339 } 4340 4341 static uint8_t lastResortByte_; // last-resort return value from LastByte() 4342 4343 char *ownedBuffer_; 4344 char *buffer_; 4345 int32_t capacity_; 4346 int32_t appended_; 4347 UBool fill_; 4348 UBool grow_; 4349 }; 4350 4351 uint8_t SortKeyByteSink::lastResortByte_ = 0; 4352 4353 void 4354 SortKeyByteSink::Append(const char *bytes, int32_t n) { 4355 if (n <= 0) { 4356 
return; 4357 } 4358 int32_t length = appended_; 4359 appended_ += n; 4360 if ((buffer_ + length) == bytes) { 4361 return; // the caller used GetAppendBuffer() and wrote the bytes already 4362 } 4363 if (buffer_ == NULL) { 4364 return; // allocation failed before already 4365 } 4366 int32_t available = capacity_ - length; 4367 if (bytes == NULL) { 4368 // assume that the caller failed to allocate memory 4369 if (fill_) { 4370 if (n > available) { 4371 n = available; 4372 } 4373 uprv_memset(buffer_, 0, n); 4374 } 4375 SetNotOk(); // propagate the out-of-memory error 4376 return; 4377 } 4378 if (n > available) { 4379 if (fill_ && available > 0) { 4380 // Fill the original buffer completely. 4381 uprv_memcpy(buffer_ + length, bytes, available); 4382 bytes += available; 4383 length += available; 4384 n -= available; 4385 available = 0; 4386 } 4387 fill_ = FALSE; 4388 if (!Resize(n, length)) { 4389 SetNotOk(); 4390 return; 4391 } 4392 } 4393 uprv_memcpy(buffer_ + length, bytes, n); 4394 } 4395 4396 char * 4397 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity, 4398 int32_t desired_capacity_hint, 4399 char *scratch, 4400 int32_t scratch_capacity, 4401 int32_t *result_capacity) { 4402 if (min_capacity < 1 || scratch_capacity < min_capacity) { 4403 *result_capacity = 0; 4404 return NULL; 4405 } 4406 int32_t available = capacity_ - appended_; 4407 if (available >= min_capacity) { 4408 *result_capacity = available; 4409 return buffer_ + appended_; 4410 } else if (Resize(desired_capacity_hint, appended_)) { 4411 *result_capacity = capacity_ - appended_; 4412 return buffer_ + appended_; 4413 } else { 4414 *result_capacity = scratch_capacity; 4415 return scratch; 4416 } 4417 } 4418 4419 UBool 4420 SortKeyByteSink::Resize(int32_t appendCapacity, int32_t length) { 4421 if (!grow_) { 4422 return FALSE; 4423 } 4424 int32_t newCapacity = 2 * capacity_; 4425 int32_t altCapacity = length + 2 * appendCapacity; 4426 if (newCapacity < altCapacity) { 4427 newCapacity = altCapacity; 
4428 } 4429 if (newCapacity < 1024) { 4430 newCapacity = 1024; 4431 } 4432 char *newBuffer = (char *)uprv_malloc(newCapacity); 4433 if (newBuffer == NULL) { 4434 return FALSE; 4435 } 4436 uprv_memcpy(newBuffer, buffer_, length); 4437 uprv_free(ownedBuffer_); 4438 ownedBuffer_ = buffer_ = newBuffer; 4439 capacity_ = newCapacity; 4440 return TRUE; 4441 } 4442 4443 uint8_t * 4444 SortKeyByteSink::OrphanUnsignedBuffer(int32_t &orphanedCapacity) { 4445 if (buffer_ == NULL || appended_ == 0) { 4446 orphanedCapacity = 0; 4447 return NULL; 4448 } 4449 if (ownedBuffer_ != NULL) { 4450 // orphan & forget the ownedBuffer_ 4451 uint8_t *returnBuffer = reinterpret_cast<uint8_t *>(ownedBuffer_); 4452 ownedBuffer_ = buffer_ = NULL; 4453 orphanedCapacity = capacity_; 4454 capacity_ = appended_ = 0; 4455 return returnBuffer; 4456 } 4457 // clone the buffer_ 4458 uint8_t *newBuffer = (uint8_t *)uprv_malloc(appended_); 4459 if (newBuffer == NULL) { 4460 orphanedCapacity = 0; 4461 return NULL; 4462 } 4463 uprv_memcpy(newBuffer, buffer_, appended_); 4464 orphanedCapacity = appended_; 4465 return newBuffer; 4466 } 4467 4468 U_NAMESPACE_END 4469 4470 /* sortkey API */ 4471 U_CAPI int32_t U_EXPORT2 4472 ucol_getSortKey(const UCollator *coll, 4473 const UChar *source, 4474 int32_t sourceLength, 4475 uint8_t *result, 4476 int32_t resultLength) 4477 { 4478 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 4479 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 4480 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 4481 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); 4482 } 4483 4484 UErrorCode status = U_ZERO_ERROR; 4485 int32_t keySize = 0; 4486 4487 if(source != NULL) { 4488 // source == NULL is actually an error situation, but we would need to 4489 // have an error code to return it. 
Until we introduce a new 4490 // API, it stays like this 4491 4492 /* this uses the function pointer that is set in updateinternalstate */ 4493 /* currently, there are two funcs: */ 4494 /*ucol_calcSortKey(...);*/ 4495 /*ucol_calcSortKeySimpleTertiary(...);*/ 4496 4497 SortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength, 4498 SortKeyByteSink::FILL_ORIGINAL_BUFFER | SortKeyByteSink::DONT_GROW); 4499 coll->sortKeyGen(coll, source, sourceLength, sink, &status); 4500 keySize = sink.NumberOfBytesAppended(); 4501 } 4502 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 4503 UTRACE_EXIT_STATUS(status); 4504 return keySize; 4505 } 4506 4507 /* this function is called by the C++ API for sortkey generation */ 4508 U_CFUNC int32_t 4509 ucol_getSortKeyWithAllocation(const UCollator *coll, 4510 const UChar *source, int32_t sourceLength, 4511 uint8_t *&result, int32_t &resultCapacity, 4512 UErrorCode *pErrorCode) { 4513 SortKeyByteSink sink(reinterpret_cast<char *>(result), resultCapacity); 4514 coll->sortKeyGen(coll, source, sourceLength, sink, pErrorCode); 4515 int32_t resultLen = sink.NumberOfBytesAppended(); 4516 if (U_SUCCESS(*pErrorCode)) { 4517 if (!sink.IsOk()) { 4518 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 4519 } else if (result != sink.GetUnsignedBuffer()) { 4520 result = sink.OrphanUnsignedBuffer(resultCapacity); 4521 } 4522 } 4523 return resultLen; 4524 } 4525 4526 // Is this primary weight compressible? 4527 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). 4528 // TODO: This should use per-lead-byte flags from FractionalUCA.txt. 
4529 static inline UBool 4530 isCompressible(const UCollator * /*coll*/, uint8_t primary1) { 4531 return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary; 4532 } 4533 4534 static 4535 inline void doCaseShift(SortKeyByteSink &cases, uint32_t &caseShift) { 4536 if (caseShift == 0) { 4537 cases.Append(UCOL_CASE_BYTE_START); 4538 caseShift = UCOL_CASE_SHIFT_START; 4539 } 4540 } 4541 4542 // Packs the secondary buffer when processing French locale. 4543 static void 4544 packFrench(uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) { 4545 secondaries += secsize; // We read the secondary-level bytes back to front. 4546 uint8_t secondary; 4547 int32_t count2 = 0; 4548 int32_t i = 0; 4549 // we use i here since the key size already accounts for terminators, so we'll discard the increment 4550 for(i = 0; i<secsize; i++) { 4551 secondary = *(secondaries-i-1); 4552 /* This is compression code. */ 4553 if (secondary == UCOL_COMMON2) { 4554 ++count2; 4555 } else { 4556 if (count2 > 0) { 4557 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
4558 while (count2 > UCOL_TOP_COUNT2) { 4559 result.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2)); 4560 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4561 } 4562 result.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1))); 4563 } else { 4564 while (count2 > UCOL_BOT_COUNT2) { 4565 result.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4566 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4567 } 4568 result.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4569 } 4570 count2 = 0; 4571 } 4572 result.Append(secondary); 4573 } 4574 } 4575 if (count2 > 0) { 4576 while (count2 > UCOL_BOT_COUNT2) { 4577 result.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4578 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4579 } 4580 result.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4581 } 4582 } 4583 4584 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 4585 4586 /* This is the sortkey work horse function */ 4587 U_CFUNC void U_CALLCONV 4588 ucol_calcSortKey(const UCollator *coll, 4589 const UChar *source, 4590 int32_t sourceLength, 4591 SortKeyByteSink &result, 4592 UErrorCode *status) 4593 { 4594 if(U_FAILURE(*status)) { 4595 return; 4596 } 4597 4598 /* Stack allocated buffers for buffers we use */ 4599 char second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER]; 4600 char caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER]; 4601 4602 SortKeyByteSink &primaries = result; 4603 SortKeyByteSink secondaries(second, LENGTHOF(second)); 4604 SortKeyByteSink tertiaries(tert, LENGTHOF(tert)); 4605 SortKeyByteSink cases(caseB, LENGTHOF(caseB)); 4606 SortKeyByteSink quads(quad, LENGTHOF(quad)); 4607 4608 UnicodeString normSource; 4609 4610 int32_t len = (sourceLength == -1 ? 
u_strlen(source) : sourceLength); 4611 4612 UColAttributeValue strength = coll->strength; 4613 4614 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4615 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4616 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4617 UBool compareIdent = (strength == UCOL_IDENTICAL); 4618 UBool doCase = (coll->caseLevel == UCOL_ON); 4619 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4620 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4621 //UBool qShifted = shifted && (compareQuad == 0); 4622 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4623 4624 uint32_t variableTopValue = coll->variableTopValue; 4625 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no 4626 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. 4627 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4628 uint8_t UCOL_HIRAGANA_QUAD = 0; 4629 if(doHiragana) { 4630 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; 4631 /* allocate one more space for hiragana, value for hiragana */ 4632 } 4633 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4634 4635 /* support for special features like caselevel and funky secondaries */ 4636 int32_t lastSecondaryLength = 0; 4637 uint32_t caseShift = 0; 4638 4639 /* If we need to normalize, we'll do it all at once at the beginning! 
*/ 4640 const Normalizer2 *norm2; 4641 if(compareIdent) { 4642 norm2 = Normalizer2Factory::getNFDInstance(*status); 4643 } else if(coll->normalizationMode != UCOL_OFF) { 4644 norm2 = Normalizer2Factory::getFCDInstance(*status); 4645 } else { 4646 norm2 = NULL; 4647 } 4648 if(norm2 != NULL) { 4649 normSource.setTo(FALSE, source, len); 4650 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 4651 if(qcYesLength != len) { 4652 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 4653 normSource.truncate(qcYesLength); 4654 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 4655 source = normSource.getBuffer(); 4656 len = normSource.length(); 4657 } 4658 } 4659 collIterate s; 4660 IInit_collIterate(coll, source, len, &s, status); 4661 if(U_FAILURE(*status)) { 4662 return; 4663 } 4664 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 4665 4666 uint32_t order = 0; 4667 4668 uint8_t primary1 = 0; 4669 uint8_t primary2 = 0; 4670 uint8_t secondary = 0; 4671 uint8_t tertiary = 0; 4672 uint8_t caseSwitch = coll->caseSwitch; 4673 uint8_t tertiaryMask = coll->tertiaryMask; 4674 int8_t tertiaryAddition = coll->tertiaryAddition; 4675 uint8_t tertiaryTop = coll->tertiaryTop; 4676 uint8_t tertiaryBottom = coll->tertiaryBottom; 4677 uint8_t tertiaryCommon = coll->tertiaryCommon; 4678 uint8_t caseBits = 0; 4679 4680 UBool wasShifted = FALSE; 4681 UBool notIsContinuation = FALSE; 4682 4683 uint32_t count2 = 0, count3 = 0, count4 = 0; 4684 uint8_t leadPrimary = 0; 4685 4686 for(;;) { 4687 order = ucol_IGetNextCE(coll, &s, status); 4688 if(order == UCOL_NO_MORE_CES) { 4689 break; 4690 } 4691 4692 if(order == 0) { 4693 continue; 4694 } 4695 4696 notIsContinuation = !isContinuation(order); 4697 4698 if(notIsContinuation) { 4699 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); 4700 } else { 4701 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4702 } 4703 4704 secondary = (uint8_t)((order >>= 8) & 
UCOL_BYTE_SIZE_MASK); 4705 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4706 primary1 = (uint8_t)(order >> 8); 4707 4708 uint8_t originalPrimary1 = primary1; 4709 if(notIsContinuation && coll->leadBytePermutationTable != NULL) { 4710 primary1 = coll->leadBytePermutationTable[primary1]; 4711 } 4712 4713 if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4714 || (!notIsContinuation && wasShifted))) 4715 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 4716 { 4717 /* and other ignorables should be removed if following a shifted code point */ 4718 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4719 /* we should just completely ignore it */ 4720 continue; 4721 } 4722 if(compareQuad == 0) { 4723 if(count4 > 0) { 4724 while (count4 > UCOL_BOT_COUNT4) { 4725 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4)); 4726 count4 -= UCOL_BOT_COUNT4; 4727 } 4728 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1))); 4729 count4 = 0; 4730 } 4731 /* We are dealing with a variable and we're treating them as shifted */ 4732 /* This is a shifted ignorable */ 4733 if(primary1 != 0) { /* we need to check this since we could be in continuation */ 4734 quads.Append(primary1); 4735 } 4736 if(primary2 != 0) { 4737 quads.Append(primary2); 4738 } 4739 } 4740 wasShifted = TRUE; 4741 } else { 4742 wasShifted = FALSE; 4743 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 4744 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 4745 /* regular and simple sortkey calc */ 4746 if(primary1 != UCOL_IGNORABLE) { 4747 if(notIsContinuation) { 4748 if(leadPrimary == primary1) { 4749 primaries.Append(primary2); 4750 } else { 4751 if(leadPrimary != 0) { 4752 primaries.Append((uint8_t)((primary1 > leadPrimary) ? 
UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN)); 4753 } 4754 if(primary2 == UCOL_IGNORABLE) { 4755 /* one byter, not compressed */ 4756 primaries.Append(primary1); 4757 leadPrimary = 0; 4758 } else if(isCompressible(coll, originalPrimary1)) { 4759 /* compress */ 4760 primaries.Append(leadPrimary = primary1, primary2); 4761 } else { 4762 leadPrimary = 0; 4763 primaries.Append(primary1, primary2); 4764 } 4765 } 4766 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 4767 if(primary2 == UCOL_IGNORABLE) { 4768 primaries.Append(primary1); 4769 } else { 4770 primaries.Append(primary1, primary2); 4771 } 4772 } 4773 } 4774 4775 if(secondary > compareSec) { 4776 if(!isFrenchSec) { 4777 /* This is compression code. */ 4778 if (secondary == UCOL_COMMON2 && notIsContinuation) { 4779 ++count2; 4780 } else { 4781 if (count2 > 0) { 4782 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4783 while (count2 > UCOL_TOP_COUNT2) { 4784 secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2)); 4785 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4786 } 4787 secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1))); 4788 } else { 4789 while (count2 > UCOL_BOT_COUNT2) { 4790 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4791 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4792 } 4793 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4794 } 4795 count2 = 0; 4796 } 4797 secondaries.Append(secondary); 4798 } 4799 } else { 4800 /* Do the special handling for French secondaries */ 4801 /* We need to get continuation elements and do intermediate restore */ 4802 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ 4803 if(notIsContinuation) { 4804 if (lastSecondaryLength > 1) { 4805 uint8_t *frenchStartPtr = secondaries.GetLastFewBytes(lastSecondaryLength); 4806 if (frenchStartPtr != NULL) { 4807 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4808 uint8_t 
*frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1; 4809 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4810 } 4811 } 4812 lastSecondaryLength = 1; 4813 } else { 4814 ++lastSecondaryLength; 4815 } 4816 secondaries.Append(secondary); 4817 } 4818 } 4819 4820 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 4821 // do the case level if we need to do it. We don't want to calculate 4822 // case level for primary ignorables if we have only primary strength and case level 4823 // otherwise we would break well formedness of CEs 4824 doCaseShift(cases, caseShift); 4825 if(notIsContinuation) { 4826 caseBits = (uint8_t)(tertiary & 0xC0); 4827 4828 if(tertiary != 0) { 4829 if(coll->caseFirst == UCOL_UPPER_FIRST) { 4830 if((caseBits & 0xC0) == 0) { 4831 cases.LastByte() |= 1 << (--caseShift); 4832 } else { 4833 cases.LastByte() |= 0 << (--caseShift); 4834 /* second bit */ 4835 doCaseShift(cases, caseShift); 4836 cases.LastByte() |= ((caseBits>>6)&1) << (--caseShift); 4837 } 4838 } else { 4839 if((caseBits & 0xC0) == 0) { 4840 cases.LastByte() |= 0 << (--caseShift); 4841 } else { 4842 cases.LastByte() |= 1 << (--caseShift); 4843 /* second bit */ 4844 doCaseShift(cases, caseShift); 4845 cases.LastByte() |= ((caseBits>>7)&1) << (--caseShift); 4846 } 4847 } 4848 } 4849 } 4850 } else { 4851 if(notIsContinuation) { 4852 tertiary ^= caseSwitch; 4853 } 4854 } 4855 4856 tertiary &= tertiaryMask; 4857 if(tertiary > compareTer) { 4858 /* This is compression code. 
*/ 4859 /* sequence size check is included in the if clause */ 4860 if (tertiary == tertiaryCommon && notIsContinuation) { 4861 ++count3; 4862 } else { 4863 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 4864 tertiary += tertiaryAddition; 4865 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 4866 tertiary -= tertiaryAddition; 4867 } 4868 if (count3 > 0) { 4869 if ((tertiary > tertiaryCommon)) { 4870 while (count3 > coll->tertiaryTopCount) { 4871 tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount)); 4872 count3 -= (uint32_t)coll->tertiaryTopCount; 4873 } 4874 tertiaries.Append((uint8_t)(tertiaryTop - (count3-1))); 4875 } else { 4876 while (count3 > coll->tertiaryBottomCount) { 4877 tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount)); 4878 count3 -= (uint32_t)coll->tertiaryBottomCount; 4879 } 4880 tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1))); 4881 } 4882 count3 = 0; 4883 } 4884 tertiaries.Append(tertiary); 4885 } 4886 } 4887 4888 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 4889 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 4890 if(count4>0) { // Close this part 4891 while (count4 > UCOL_BOT_COUNT4) { 4892 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4)); 4893 count4 -= UCOL_BOT_COUNT4; 4894 } 4895 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1))); 4896 count4 = 0; 4897 } 4898 quads.Append(UCOL_HIRAGANA_QUAD); // Add the Hiragana 4899 } else { // This wasn't Hiragana, so we can continue adding stuff 4900 count4++; 4901 } 4902 } 4903 } 4904 } 4905 4906 /* Here, we are generally done with processing */ 4907 /* bailing out would not be too productive */ 4908 4909 if(U_SUCCESS(*status)) { 4910 /* we have done all the CE's, now let's put them together to form a key */ 4911 if(compareSec == 0) { 4912 if (count2 > 0) { 4913 while (count2 > UCOL_BOT_COUNT2) { 4914 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 
+ UCOL_BOT_COUNT2)); 4915 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4916 } 4917 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4918 } 4919 result.Append(UCOL_LEVELTERMINATOR); 4920 if(!isFrenchSec || !secondaries.IsOk()) { 4921 result.Append(secondaries); 4922 } else { 4923 // If there are any unresolved continuation secondaries, 4924 // reverse them here so that we can reverse the whole secondary thing. 4925 if (lastSecondaryLength > 1) { 4926 uint8_t *frenchStartPtr = secondaries.GetLastFewBytes(lastSecondaryLength); 4927 if (frenchStartPtr != NULL) { 4928 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4929 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1; 4930 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4931 } 4932 } 4933 packFrench(secondaries.GetUnsignedBuffer(), secondaries.NumberOfBytesAppended(), result); 4934 } 4935 } 4936 4937 if(doCase) { 4938 result.Append(UCOL_LEVELTERMINATOR); 4939 result.Append(cases); 4940 } 4941 4942 if(compareTer == 0) { 4943 if (count3 > 0) { 4944 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { 4945 while (count3 >= coll->tertiaryTopCount) { 4946 tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount)); 4947 count3 -= (uint32_t)coll->tertiaryTopCount; 4948 } 4949 tertiaries.Append((uint8_t)(tertiaryTop - count3)); 4950 } else { 4951 while (count3 > coll->tertiaryBottomCount) { 4952 tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount)); 4953 count3 -= (uint32_t)coll->tertiaryBottomCount; 4954 } 4955 tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1))); 4956 } 4957 } 4958 result.Append(UCOL_LEVELTERMINATOR); 4959 result.Append(tertiaries); 4960 4961 if(compareQuad == 0/*qShifted == TRUE*/) { 4962 if(count4 > 0) { 4963 while (count4 > UCOL_BOT_COUNT4) { 4964 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4)); 4965 count4 -= UCOL_BOT_COUNT4; 4966 } 4967 quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1))); 4968 } 4969 
result.Append(UCOL_LEVELTERMINATOR); 4970 result.Append(quads); 4971 } 4972 4973 if(compareIdent) { 4974 result.Append(UCOL_LEVELTERMINATOR); 4975 u_writeIdenticalLevelRun(s.string, len, result); 4976 } 4977 } 4978 result.Append(0); 4979 } 4980 4981 /* To avoid memory leak, free the offset buffer if necessary. */ 4982 ucol_freeOffsetBuffer(&s); 4983 } 4984 4985 4986 U_CFUNC void U_CALLCONV 4987 ucol_calcSortKeySimpleTertiary(const UCollator *coll, 4988 const UChar *source, 4989 int32_t sourceLength, 4990 SortKeyByteSink &result, 4991 UErrorCode *status) 4992 { 4993 U_ALIGN_CODE(16); 4994 4995 if(U_FAILURE(*status)) { 4996 return; 4997 } 4998 4999 /* Stack allocated buffers for buffers we use */ 5000 char second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER]; 5001 5002 SortKeyByteSink &primaries = result; 5003 SortKeyByteSink secondaries(second, LENGTHOF(second)); 5004 SortKeyByteSink tertiaries(tert, LENGTHOF(tert)); 5005 5006 UnicodeString normSource; 5007 5008 int32_t len = sourceLength; 5009 5010 /* If we need to normalize, we'll do it all at once at the beginning! */ 5011 if(coll->normalizationMode != UCOL_OFF) { 5012 normSource.setTo(len < 0, source, len); 5013 const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status); 5014 int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status); 5015 if(qcYesLength != normSource.length()) { 5016 UnicodeString unnormalized = normSource.tempSubString(qcYesLength); 5017 normSource.truncate(qcYesLength); 5018 norm2->normalizeSecondAndAppend(normSource, unnormalized, *status); 5019 source = normSource.getBuffer(); 5020 len = normSource.length(); 5021 } 5022 } 5023 collIterate s; 5024 IInit_collIterate(coll, (UChar *)source, len, &s, status); 5025 if(U_FAILURE(*status)) { 5026 return; 5027 } 5028 s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized. 
5029 5030 uint32_t order = 0; 5031 5032 uint8_t primary1 = 0; 5033 uint8_t primary2 = 0; 5034 uint8_t secondary = 0; 5035 uint8_t tertiary = 0; 5036 uint8_t caseSwitch = coll->caseSwitch; 5037 uint8_t tertiaryMask = coll->tertiaryMask; 5038 int8_t tertiaryAddition = coll->tertiaryAddition; 5039 uint8_t tertiaryTop = coll->tertiaryTop; 5040 uint8_t tertiaryBottom = coll->tertiaryBottom; 5041 uint8_t tertiaryCommon = coll->tertiaryCommon; 5042 5043 UBool notIsContinuation = FALSE; 5044 5045 uint32_t count2 = 0, count3 = 0; 5046 uint8_t leadPrimary = 0; 5047 5048 for(;;) { 5049 order = ucol_IGetNextCE(coll, &s, status); 5050 5051 if(order == 0) { 5052 continue; 5053 } 5054 5055 if(order == UCOL_NO_MORE_CES) { 5056 break; 5057 } 5058 5059 notIsContinuation = !isContinuation(order); 5060 5061 if(notIsContinuation) { 5062 tertiary = (uint8_t)((order & tertiaryMask)); 5063 } else { 5064 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 5065 } 5066 5067 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5068 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5069 primary1 = (uint8_t)(order >> 8); 5070 5071 uint8_t originalPrimary1 = primary1; 5072 if (coll->leadBytePermutationTable != NULL && notIsContinuation) { 5073 primary1 = coll->leadBytePermutationTable[primary1]; 5074 } 5075 5076 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 5077 /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */ 5078 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ 5079 /* regular and simple sortkey calc */ 5080 if(primary1 != UCOL_IGNORABLE) { 5081 if(notIsContinuation) { 5082 if(leadPrimary == primary1) { 5083 primaries.Append(primary2); 5084 } else { 5085 if(leadPrimary != 0) { 5086 primaries.Append((uint8_t)((primary1 > leadPrimary) ? 
UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN)); 5087 } 5088 if(primary2 == UCOL_IGNORABLE) { 5089 /* one byter, not compressed */ 5090 primaries.Append(primary1); 5091 leadPrimary = 0; 5092 } else if(isCompressible(coll, originalPrimary1)) { 5093 /* compress */ 5094 primaries.Append(leadPrimary = primary1, primary2); 5095 } else { 5096 leadPrimary = 0; 5097 primaries.Append(primary1, primary2); 5098 } 5099 } 5100 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 5101 if(primary2 == UCOL_IGNORABLE) { 5102 primaries.Append(primary1); 5103 } else { 5104 primaries.Append(primary1, primary2); 5105 } 5106 } 5107 } 5108 5109 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ 5110 /* This is compression code. */ 5111 if (secondary == UCOL_COMMON2 && notIsContinuation) { 5112 ++count2; 5113 } else { 5114 if (count2 > 0) { 5115 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 5116 while (count2 > UCOL_TOP_COUNT2) { 5117 secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2)); 5118 count2 -= (uint32_t)UCOL_TOP_COUNT2; 5119 } 5120 secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1))); 5121 } else { 5122 while (count2 > UCOL_BOT_COUNT2) { 5123 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 5124 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5125 } 5126 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 5127 } 5128 count2 = 0; 5129 } 5130 secondaries.Append(secondary); 5131 } 5132 } 5133 5134 if(notIsContinuation) { 5135 tertiary ^= caseSwitch; 5136 } 5137 5138 if(tertiary > 0) { 5139 /* This is compression code. 
*/ 5140 /* sequence size check is included in the if clause */ 5141 if (tertiary == tertiaryCommon && notIsContinuation) { 5142 ++count3; 5143 } else { 5144 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 5145 tertiary += tertiaryAddition; 5146 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 5147 tertiary -= tertiaryAddition; 5148 } 5149 if (count3 > 0) { 5150 if ((tertiary > tertiaryCommon)) { 5151 while (count3 > coll->tertiaryTopCount) { 5152 tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount)); 5153 count3 -= (uint32_t)coll->tertiaryTopCount; 5154 } 5155 tertiaries.Append((uint8_t)(tertiaryTop - (count3-1))); 5156 } else { 5157 while (count3 > coll->tertiaryBottomCount) { 5158 tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount)); 5159 count3 -= (uint32_t)coll->tertiaryBottomCount; 5160 } 5161 tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1))); 5162 } 5163 count3 = 0; 5164 } 5165 tertiaries.Append(tertiary); 5166 } 5167 } 5168 } 5169 5170 if(U_SUCCESS(*status)) { 5171 /* we have done all the CE's, now let's put them together to form a key */ 5172 if (count2 > 0) { 5173 while (count2 > UCOL_BOT_COUNT2) { 5174 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 5175 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5176 } 5177 secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 5178 } 5179 result.Append(UCOL_LEVELTERMINATOR); 5180 result.Append(secondaries); 5181 5182 if (count3 > 0) { 5183 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { 5184 while (count3 >= coll->tertiaryTopCount) { 5185 tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount)); 5186 count3 -= (uint32_t)coll->tertiaryTopCount; 5187 } 5188 tertiaries.Append((uint8_t)(tertiaryTop - count3)); 5189 } else { 5190 while (count3 > coll->tertiaryBottomCount) { 5191 tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount)); 5192 count3 -= 
(uint32_t)coll->tertiaryBottomCount; 5193 } 5194 tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1))); 5195 } 5196 } 5197 result.Append(UCOL_LEVELTERMINATOR); 5198 result.Append(tertiaries); 5199 5200 result.Append(0); 5201 } 5202 5203 /* To avoid memory leak, free the offset buffer if necessary. */ 5204 ucol_freeOffsetBuffer(&s); 5205 5206 if (U_SUCCESS(*status) && !result.IsOk()) { 5207 *status = U_BUFFER_OVERFLOW_ERROR; 5208 } 5209 } 5210 5211 static inline 5212 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { 5213 UBool notIsContinuation = !isContinuation(CE); 5214 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); 5215 if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) 5216 || (!notIsContinuation && *wasShifted))) 5217 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 5218 { 5219 // The stuff below should probably be in the sortkey code... maybe not... 5220 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ 5221 /* we should just completely ignore it */ 5222 *wasShifted = TRUE; 5223 //continue; 5224 } 5225 //*wasShifted = TRUE; 5226 return TRUE; 5227 } else { 5228 *wasShifted = FALSE; 5229 return FALSE; 5230 } 5231 } 5232 static inline 5233 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { 5234 if(level < maxLevel) { 5235 dest[i++] = UCOL_LEVELTERMINATOR; 5236 } else { 5237 dest[i++] = 0; 5238 } 5239 } 5240 5241 /** enumeration of level identifiers for partial sort key generation */ 5242 enum { 5243 UCOL_PSK_PRIMARY = 0, 5244 UCOL_PSK_SECONDARY = 1, 5245 UCOL_PSK_CASE = 2, 5246 UCOL_PSK_TERTIARY = 3, 5247 UCOL_PSK_QUATERNARY = 4, 5248 UCOL_PSK_QUIN = 5, /** This is an extra level, not used - but we have three bits to blow */ 5249 UCOL_PSK_IDENTICAL = 6, 5250 UCOL_PSK_NULL = 7, /** level for the end of sort key. Will just produce zeros */ 5251 UCOL_PSK_LIMIT 5252 }; 5253 5254 /** collation state enum. 
*_SHIFT value is how much to shift right 5255 * to get the state piece to the right. *_MASK value should be 5256 * ANDed with the shifted state. This data is stored in state[1] 5257 * field. 5258 */ 5259 enum { 5260 UCOL_PSK_LEVEL_SHIFT = 0, /** level identificator. stores an enum value from above */ 5261 UCOL_PSK_LEVEL_MASK = 7, /** three bits */ 5262 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */ 5263 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1, 5264 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary 5265 * This field is also used to denote that the French secondary level is finished 5266 */ 5267 UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */ 5268 UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */ 5269 UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */ 5270 UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. See comment just below */ 5271 /** When we do French we need to reverse secondary values. However, continuations 5272 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba 5273 */ 5274 UCOL_PSK_BOCSU_BYTES_SHIFT = 7, 5275 UCOL_PSK_BOCSU_BYTES_MASK = 3, 5276 UCOL_PSK_CONSUMED_CES_SHIFT = 9, 5277 UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF 5278 }; 5279 5280 // macro calculating the number of expansion CEs available 5281 #define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn 5282 5283 5284 /** main sortkey part procedure. On the first call, 5285 * you should pass in a collator, an iterator, empty state 5286 * state[0] == state[1] == 0, a buffer to hold results 5287 * number of bytes you need and an error code pointer. 5288 * Make sure your buffer is big enough to hold the wanted 5289 * number of sortkey bytes. I don't check. 
 *  The only meaningful status you can get back is
 *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
 *  have been dealt a raw deal and that you probably won't
 *  be able to use partial sortkey generation for this
 *  particular combination of string and collator. This
 *  is highly unlikely, but you should still check the error code.
 *  Any other status means that you're not in a sane situation
 *  anymore. After the first call, preserve the state values and
 *  pass them to subsequent calls to obtain more bytes of the sortkey.
 *  Keep calling until the number of bytes written is smaller than the
 *  requested number of bytes. The generated sortkey is not compatible with
 *  the one generated by ucol_getSortKey, as we don't do any compression.
 *  However, levels are still terminated by a 1 (one) and the sortkey
 *  is terminated by a 0 (zero). Identical level is the same as in the
 *  regular sortkey - internal bocu-1 implementation is used.
 *  For the curious, although you cannot do much about this, here is
 *  the structure of the state words.
 *  state[0] - iterator state. Depends on the iterator implementation,
 *             but allows the iterator to continue where it stopped in
 *             the last iteration.
 *  state[1] - collation processing state. Here is the distribution
 *             of the bits:
 *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary,
 *             quaternary, quin (we don't use this one), identical and
 *             null (producing only zeroes - first one to terminate the
 *             sortkey and subsequent ones to fill the buffer).
 *   3       - byte count. Number of bytes written on the primary level.
 *             The same bit also marks the French secondary level as finished.
 *   4       - was shifted. Whether the previous iteration finished in the
 *             shifted state.
 *   5, 6    - French continuation bytes written. See the comment in the enum
 *   7, 8    - Bocsu bytes used. Number of bytes from a bocu sequence on
 *             the identical level.
 *   9..31   - CEs consumed (read back through UCOL_PSK_CONSUMED_CES_MASK).
Number of getCE or next32 operations performed 5323 * since thes last successful update of the iterator state. 5324 */ 5325 U_CAPI int32_t U_EXPORT2 5326 ucol_nextSortKeyPart(const UCollator *coll, 5327 UCharIterator *iter, 5328 uint32_t state[2], 5329 uint8_t *dest, int32_t count, 5330 UErrorCode *status) 5331 { 5332 /* error checking */ 5333 if(status==NULL || U_FAILURE(*status)) { 5334 return 0; 5335 } 5336 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 5337 if( coll==NULL || iter==NULL || 5338 state==NULL || 5339 count<0 || (count>0 && dest==NULL) 5340 ) { 5341 *status=U_ILLEGAL_ARGUMENT_ERROR; 5342 UTRACE_EXIT_STATUS(status); 5343 return 0; 5344 } 5345 5346 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 5347 coll, iter, state[0], state[1], dest, count); 5348 5349 if(count==0) { 5350 /* nothing to do */ 5351 UTRACE_EXIT_VALUE(0); 5352 return 0; 5353 } 5354 /** Setting up situation according to the state we got from the previous iteration */ 5355 // The state of the iterator from the previous invocation 5356 uint32_t iterState = state[0]; 5357 // Has the last iteration ended in the shifted state 5358 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE; 5359 // What is the current level of the sortkey? 5360 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; 5361 // Have we written only one byte from a two byte primary in the previous iteration? 5362 // Also on secondary level - have we finished with the French secondary? 5363 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; 5364 // number of bytes in the continuation buffer for French 5365 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; 5366 // Number of bytes already written from a bocsu sequence. Since 5367 // the longes bocsu sequence is 4 long, this can be up to 3. 
5368 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK; 5369 // Number of elements that need to be consumed in this iteration because 5370 // the iterator returned UITER_NO_STATE at the end of the last iteration, 5371 // so we had to save the last valid state. 5372 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK; 5373 5374 /** values that depend on the collator attributes */ 5375 // strength of the collator. 5376 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); 5377 // maximal level of the partial sortkey. Need to take whether case level is done 5378 int32_t maxLevel = 0; 5379 if(strength < UCOL_TERTIARY) { 5380 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 5381 maxLevel = UCOL_PSK_CASE; 5382 } else { 5383 maxLevel = strength; 5384 } 5385 } else { 5386 if(strength == UCOL_TERTIARY) { 5387 maxLevel = UCOL_PSK_TERTIARY; 5388 } else if(strength == UCOL_QUATERNARY) { 5389 maxLevel = UCOL_PSK_QUATERNARY; 5390 } else { // identical 5391 maxLevel = UCOL_IDENTICAL; 5392 } 5393 } 5394 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation 5395 uint8_t UCOL_HIRAGANA_QUAD = 5396 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF; 5397 // Boundary value that decides whether a CE is shifted or not 5398 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0; 5399 // Are we doing French collation? 
5400 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON); 5401 5402 /** initializing the collation state */ 5403 UBool notIsContinuation = FALSE; 5404 uint32_t CE = UCOL_NO_MORE_CES; 5405 5406 collIterate s; 5407 IInit_collIterate(coll, NULL, -1, &s, status); 5408 if(U_FAILURE(*status)) { 5409 UTRACE_EXIT_STATUS(*status); 5410 return 0; 5411 } 5412 s.iterator = iter; 5413 s.flags |= UCOL_USE_ITERATOR; 5414 // This variable tells us whether we have produced some other levels in this iteration 5415 // before we moved to the identical level. In that case, we need to switch the 5416 // type of the iterator. 5417 UBool doingIdenticalFromStart = FALSE; 5418 // Normalizing iterator 5419 // The division for the array length may truncate the array size to 5420 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 5421 // for all platforms anyway. 5422 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 5423 UNormIterator *normIter = NULL; 5424 // If the normalization is turned on for the collator and we are below identical level 5425 // we will use a FCD normalizing iterator 5426 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { 5427 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5428 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); 5429 s.flags &= ~UCOL_ITER_NORM; 5430 if(U_FAILURE(*status)) { 5431 UTRACE_EXIT_STATUS(*status); 5432 return 0; 5433 } 5434 } else if(level == UCOL_PSK_IDENTICAL) { 5435 // for identical level, we need a NFD iterator. We need to instantiate it here, since we 5436 // will be updating the state - and this cannot be done on an ordinary iterator. 
5437 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5438 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5439 s.flags &= ~UCOL_ITER_NORM; 5440 if(U_FAILURE(*status)) { 5441 UTRACE_EXIT_STATUS(*status); 5442 return 0; 5443 } 5444 doingIdenticalFromStart = TRUE; 5445 } 5446 5447 // This is the tentative new state of the iterator. The problem 5448 // is that the iterator might return an undefined state, in 5449 // which case we should save the last valid state and increase 5450 // the iterator skip value. 5451 uint32_t newState = 0; 5452 5453 // First, we set the iterator to the last valid position 5454 // from the last iteration. This was saved in state[0]. 5455 if(iterState == 0) { 5456 /* initial state */ 5457 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { 5458 s.iterator->move(s.iterator, 0, UITER_LIMIT); 5459 } else { 5460 s.iterator->move(s.iterator, 0, UITER_START); 5461 } 5462 } else { 5463 /* reset to previous state */ 5464 s.iterator->setState(s.iterator, iterState, status); 5465 if(U_FAILURE(*status)) { 5466 UTRACE_EXIT_STATUS(*status); 5467 return 0; 5468 } 5469 } 5470 5471 5472 5473 // This variable tells us whether we can attempt to update the state 5474 // of iterator. Situations where we don't want to update iterator state 5475 // are the existence of expansion CEs that are not yet processed, and 5476 // finishing the case level without enough space in the buffer to insert 5477 // a level terminator. 5478 UBool canUpdateState = TRUE; 5479 5480 // Consume all the CEs that were consumed at the end of the previous 5481 // iteration without updating the iterator state. On identical level, 5482 // consume the code points. 5483 int32_t counter = cces; 5484 if(level < UCOL_PSK_IDENTICAL) { 5485 while(counter-->0) { 5486 // If we're doing French and we are on the secondary level, 5487 // we go backwards. 
5488 if(level == UCOL_PSK_SECONDARY && doingFrench) { 5489 CE = ucol_IGetPrevCE(coll, &s, status); 5490 } else { 5491 CE = ucol_IGetNextCE(coll, &s, status); 5492 } 5493 if(CE==UCOL_NO_MORE_CES) { 5494 /* should not happen */ 5495 *status=U_INTERNAL_PROGRAM_ERROR; 5496 UTRACE_EXIT_STATUS(*status); 5497 return 0; 5498 } 5499 if(uprv_numAvailableExpCEs(s)) { 5500 canUpdateState = FALSE; 5501 } 5502 } 5503 } else { 5504 while(counter-->0) { 5505 uiter_next32(s.iterator); 5506 } 5507 } 5508 5509 // French secondary needs to know whether the iterator state of zero came from previous level OR 5510 // from a new invocation... 5511 UBool wasDoingPrimary = FALSE; 5512 // destination buffer byte counter. When this guy 5513 // gets to count, we're done with the iteration 5514 int32_t i = 0; 5515 // used to count the zero bytes written after we 5516 // have finished with the sort key 5517 int32_t j = 0; 5518 5519 5520 // Hm.... I think we're ready to plunge in. Basic story is as following: 5521 // we have a fall through case based on level. This is used for initial 5522 // positioning on iteration start. Every level processor contains a 5523 // for(;;) which will be broken when we exhaust all the CEs. Other 5524 // way to exit is a goto saveState, which happens when we have filled 5525 // out our buffer. 
5526 switch(level) { 5527 case UCOL_PSK_PRIMARY: 5528 wasDoingPrimary = TRUE; 5529 for(;;) { 5530 if(i==count) { 5531 goto saveState; 5532 } 5533 // We should save the state only if we 5534 // are sure that we are done with the 5535 // previous iterator state 5536 if(canUpdateState && byteCountOrFrenchDone == 0) { 5537 newState = s.iterator->getState(s.iterator); 5538 if(newState != UITER_NO_STATE) { 5539 iterState = newState; 5540 cces = 0; 5541 } 5542 } 5543 CE = ucol_IGetNextCE(coll, &s, status); 5544 cces++; 5545 if(CE==UCOL_NO_MORE_CES) { 5546 // Add the level separator 5547 terminatePSKLevel(level, maxLevel, i, dest); 5548 byteCountOrFrenchDone=0; 5549 // Restart the iteration an move to the 5550 // second level 5551 s.iterator->move(s.iterator, 0, UITER_START); 5552 cces = 0; 5553 level = UCOL_PSK_SECONDARY; 5554 break; 5555 } 5556 if(!isContinuation(CE)){ 5557 if(coll->leadBytePermutationTable != NULL){ 5558 CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF); 5559 } 5560 } 5561 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5562 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ 5563 if(CE != 0) { 5564 if(byteCountOrFrenchDone == 0) { 5565 // get the second byte of primary 5566 dest[i++]=(uint8_t)(CE >> 8); 5567 } else { 5568 byteCountOrFrenchDone = 0; 5569 } 5570 if((CE &=0xff)!=0) { 5571 if(i==count) { 5572 /* overflow */ 5573 byteCountOrFrenchDone = 1; 5574 cces--; 5575 goto saveState; 5576 } 5577 dest[i++]=(uint8_t)CE; 5578 } 5579 } 5580 } 5581 if(uprv_numAvailableExpCEs(s)) { 5582 canUpdateState = FALSE; 5583 } else { 5584 canUpdateState = TRUE; 5585 } 5586 } 5587 /* fall through to next level */ 5588 case UCOL_PSK_SECONDARY: 5589 if(strength >= UCOL_SECONDARY) { 5590 if(!doingFrench) { 5591 for(;;) { 5592 if(i == count) { 5593 goto saveState; 5594 } 5595 // We should save the state only if we 5596 // are sure that we are done with the 5597 // previous iterator state 5598 if(canUpdateState) { 5599 newState = 
s.iterator->getState(s.iterator); 5600 if(newState != UITER_NO_STATE) { 5601 iterState = newState; 5602 cces = 0; 5603 } 5604 } 5605 CE = ucol_IGetNextCE(coll, &s, status); 5606 cces++; 5607 if(CE==UCOL_NO_MORE_CES) { 5608 // Add the level separator 5609 terminatePSKLevel(level, maxLevel, i, dest); 5610 byteCountOrFrenchDone = 0; 5611 // Restart the iteration an move to the 5612 // second level 5613 s.iterator->move(s.iterator, 0, UITER_START); 5614 cces = 0; 5615 level = UCOL_PSK_CASE; 5616 break; 5617 } 5618 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5619 CE >>= 8; /* get secondary */ 5620 if(CE != 0) { 5621 dest[i++]=(uint8_t)CE; 5622 } 5623 } 5624 if(uprv_numAvailableExpCEs(s)) { 5625 canUpdateState = FALSE; 5626 } else { 5627 canUpdateState = TRUE; 5628 } 5629 } 5630 } else { // French secondary processing 5631 uint8_t frenchBuff[UCOL_MAX_BUFFER]; 5632 int32_t frenchIndex = 0; 5633 // Here we are going backwards. 5634 // If the iterator is at the beggining, it should be 5635 // moved to end. 5636 if(wasDoingPrimary) { 5637 s.iterator->move(s.iterator, 0, UITER_LIMIT); 5638 cces = 0; 5639 } 5640 for(;;) { 5641 if(i == count) { 5642 goto saveState; 5643 } 5644 if(canUpdateState) { 5645 newState = s.iterator->getState(s.iterator); 5646 if(newState != UITER_NO_STATE) { 5647 iterState = newState; 5648 cces = 0; 5649 } 5650 } 5651 CE = ucol_IGetPrevCE(coll, &s, status); 5652 cces++; 5653 if(CE==UCOL_NO_MORE_CES) { 5654 // Add the level separator 5655 terminatePSKLevel(level, maxLevel, i, dest); 5656 byteCountOrFrenchDone = 0; 5657 // Restart the iteration an move to the next level 5658 s.iterator->move(s.iterator, 0, UITER_START); 5659 level = UCOL_PSK_CASE; 5660 break; 5661 } 5662 if(isContinuation(CE)) { // if it's a continuation, we want to save it and 5663 // reverse when we get a first non-continuation CE. 
5664 CE >>= 8; 5665 frenchBuff[frenchIndex++] = (uint8_t)CE; 5666 } else if(!isShiftedCE(CE, LVT, &wasShifted)) { 5667 CE >>= 8; /* get secondary */ 5668 if(!frenchIndex) { 5669 if(CE != 0) { 5670 dest[i++]=(uint8_t)CE; 5671 } 5672 } else { 5673 frenchBuff[frenchIndex++] = (uint8_t)CE; 5674 frenchIndex -= usedFrench; 5675 usedFrench = 0; 5676 while(i < count && frenchIndex) { 5677 dest[i++] = frenchBuff[--frenchIndex]; 5678 usedFrench++; 5679 } 5680 } 5681 } 5682 if(uprv_numAvailableExpCEs(s)) { 5683 canUpdateState = FALSE; 5684 } else { 5685 canUpdateState = TRUE; 5686 } 5687 } 5688 } 5689 } else { 5690 level = UCOL_PSK_CASE; 5691 } 5692 /* fall through to next level */ 5693 case UCOL_PSK_CASE: 5694 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 5695 uint32_t caseShift = UCOL_CASE_SHIFT_START; 5696 uint8_t caseByte = UCOL_CASE_BYTE_START; 5697 uint8_t caseBits = 0; 5698 5699 for(;;) { 5700 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START); 5701 if(i == count) { 5702 goto saveState; 5703 } 5704 // We should save the state only if we 5705 // are sure that we are done with the 5706 // previous iterator state 5707 if(canUpdateState) { 5708 newState = s.iterator->getState(s.iterator); 5709 if(newState != UITER_NO_STATE) { 5710 iterState = newState; 5711 cces = 0; 5712 } 5713 } 5714 CE = ucol_IGetNextCE(coll, &s, status); 5715 cces++; 5716 if(CE==UCOL_NO_MORE_CES) { 5717 // On the case level we might have an unfinished 5718 // case byte. Add one if it's started. 5719 if(caseShift != UCOL_CASE_SHIFT_START) { 5720 dest[i++] = caseByte; 5721 } 5722 cces = 0; 5723 // We have finished processing CEs on this level. 5724 // However, we don't know if we have enough space 5725 // to add a case level terminator. 
5726 if(i < count) { 5727 // Add the level separator 5728 terminatePSKLevel(level, maxLevel, i, dest); 5729 // Restart the iteration and move to the 5730 // next level 5731 s.iterator->move(s.iterator, 0, UITER_START); 5732 level = UCOL_PSK_TERTIARY; 5733 } else { 5734 canUpdateState = FALSE; 5735 } 5736 break; 5737 } 5738 5739 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5740 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) { 5741 // do the case level if we need to do it. We don't want to calculate 5742 // case level for primary ignorables if we have only primary strength and case level 5743 // otherwise we would break well formedness of CEs 5744 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 5745 caseBits = (uint8_t)(CE & 0xC0); 5746 // this copies the case level logic from the 5747 // sort key generation code 5748 if(CE != 0) { 5749 if (caseShift == 0) { 5750 dest[i++] = caseByte; 5751 caseShift = UCOL_CASE_SHIFT_START; 5752 caseByte = UCOL_CASE_BYTE_START; 5753 } 5754 if(coll->caseFirst == UCOL_UPPER_FIRST) { 5755 if((caseBits & 0xC0) == 0) { 5756 caseByte |= 1 << (--caseShift); 5757 } else { 5758 caseByte |= 0 << (--caseShift); 5759 /* second bit */ 5760 if(caseShift == 0) { 5761 dest[i++] = caseByte; 5762 caseShift = UCOL_CASE_SHIFT_START; 5763 caseByte = UCOL_CASE_BYTE_START; 5764 } 5765 caseByte |= ((caseBits>>6)&1) << (--caseShift); 5766 } 5767 } else { 5768 if((caseBits & 0xC0) == 0) { 5769 caseByte |= 0 << (--caseShift); 5770 } else { 5771 caseByte |= 1 << (--caseShift); 5772 /* second bit */ 5773 if(caseShift == 0) { 5774 dest[i++] = caseByte; 5775 caseShift = UCOL_CASE_SHIFT_START; 5776 caseByte = UCOL_CASE_BYTE_START; 5777 } 5778 caseByte |= ((caseBits>>7)&1) << (--caseShift); 5779 } 5780 } 5781 } 5782 5783 } 5784 } 5785 // Not sure this is correct for the case level - revisit 5786 if(uprv_numAvailableExpCEs(s)) { 5787 canUpdateState = FALSE; 5788 } else { 5789 canUpdateState = TRUE; 5790 } 5791 } 5792 } else { 5793 level = 
UCOL_PSK_TERTIARY; 5794 } 5795 /* fall through to next level */ 5796 case UCOL_PSK_TERTIARY: 5797 if(strength >= UCOL_TERTIARY) { 5798 for(;;) { 5799 if(i == count) { 5800 goto saveState; 5801 } 5802 // We should save the state only if we 5803 // are sure that we are done with the 5804 // previous iterator state 5805 if(canUpdateState) { 5806 newState = s.iterator->getState(s.iterator); 5807 if(newState != UITER_NO_STATE) { 5808 iterState = newState; 5809 cces = 0; 5810 } 5811 } 5812 CE = ucol_IGetNextCE(coll, &s, status); 5813 cces++; 5814 if(CE==UCOL_NO_MORE_CES) { 5815 // Add the level separator 5816 terminatePSKLevel(level, maxLevel, i, dest); 5817 byteCountOrFrenchDone = 0; 5818 // Restart the iteration an move to the 5819 // second level 5820 s.iterator->move(s.iterator, 0, UITER_START); 5821 cces = 0; 5822 level = UCOL_PSK_QUATERNARY; 5823 break; 5824 } 5825 if(!isShiftedCE(CE, LVT, &wasShifted)) { 5826 notIsContinuation = !isContinuation(CE); 5827 5828 if(notIsContinuation) { 5829 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 5830 CE ^= coll->caseSwitch; 5831 CE &= coll->tertiaryMask; 5832 } else { 5833 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 5834 } 5835 5836 if(CE != 0) { 5837 dest[i++]=(uint8_t)CE; 5838 } 5839 } 5840 if(uprv_numAvailableExpCEs(s)) { 5841 canUpdateState = FALSE; 5842 } else { 5843 canUpdateState = TRUE; 5844 } 5845 } 5846 } else { 5847 // if we're not doing tertiary 5848 // skip to the end 5849 level = UCOL_PSK_NULL; 5850 } 5851 /* fall through to next level */ 5852 case UCOL_PSK_QUATERNARY: 5853 if(strength >= UCOL_QUATERNARY) { 5854 for(;;) { 5855 if(i == count) { 5856 goto saveState; 5857 } 5858 // We should save the state only if we 5859 // are sure that we are done with the 5860 // previous iterator state 5861 if(canUpdateState) { 5862 newState = s.iterator->getState(s.iterator); 5863 if(newState != UITER_NO_STATE) { 5864 iterState = newState; 5865 cces = 0; 5866 } 5867 } 5868 CE = ucol_IGetNextCE(coll, &s, status); 5869 cces++; 
5870 if(CE==UCOL_NO_MORE_CES) { 5871 // Add the level separator 5872 terminatePSKLevel(level, maxLevel, i, dest); 5873 //dest[i++] = UCOL_LEVELTERMINATOR; 5874 byteCountOrFrenchDone = 0; 5875 // Restart the iteration an move to the 5876 // second level 5877 s.iterator->move(s.iterator, 0, UITER_START); 5878 cces = 0; 5879 level = UCOL_PSK_QUIN; 5880 break; 5881 } 5882 if(CE==0) 5883 continue; 5884 if(isShiftedCE(CE, LVT, &wasShifted)) { 5885 CE >>= 16; /* get primary */ 5886 if(CE != 0) { 5887 if(byteCountOrFrenchDone == 0) { 5888 dest[i++]=(uint8_t)(CE >> 8); 5889 } else { 5890 byteCountOrFrenchDone = 0; 5891 } 5892 if((CE &=0xff)!=0) { 5893 if(i==count) { 5894 /* overflow */ 5895 byteCountOrFrenchDone = 1; 5896 goto saveState; 5897 } 5898 dest[i++]=(uint8_t)CE; 5899 } 5900 } 5901 } else { 5902 notIsContinuation = !isContinuation(CE); 5903 if(notIsContinuation) { 5904 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 5905 dest[i++] = UCOL_HIRAGANA_QUAD; 5906 } else { 5907 dest[i++] = 0xFF; 5908 } 5909 } 5910 } 5911 if(uprv_numAvailableExpCEs(s)) { 5912 canUpdateState = FALSE; 5913 } else { 5914 canUpdateState = TRUE; 5915 } 5916 } 5917 } else { 5918 // if we're not doing quaternary 5919 // skip to the end 5920 level = UCOL_PSK_NULL; 5921 } 5922 /* fall through to next level */ 5923 case UCOL_PSK_QUIN: 5924 level = UCOL_PSK_IDENTICAL; 5925 /* fall through to next level */ 5926 case UCOL_PSK_IDENTICAL: 5927 if(strength >= UCOL_IDENTICAL) { 5928 UChar32 first, second; 5929 int32_t bocsuBytesWritten = 0; 5930 // We always need to do identical on 5931 // the NFD form of the string. 5932 if(normIter == NULL) { 5933 // we arrived from the level below and 5934 // normalization was not turned on. 
5935 // therefore, we need to make a fresh NFD iterator 5936 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 5937 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5938 } else if(!doingIdenticalFromStart) { 5939 // there is an iterator, but we did some other levels. 5940 // therefore, we have a FCD iterator - need to make 5941 // a NFD one. 5942 // normIter being at the beginning does not guarantee 5943 // that the underlying iterator is at the beginning 5944 iter->move(iter, 0, UITER_START); 5945 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 5946 } 5947 // At this point we have a NFD iterator that is positioned 5948 // in the right place 5949 if(U_FAILURE(*status)) { 5950 UTRACE_EXIT_STATUS(*status); 5951 return 0; 5952 } 5953 first = uiter_previous32(s.iterator); 5954 // maybe we're at the start of the string 5955 if(first == U_SENTINEL) { 5956 first = 0; 5957 } else { 5958 uiter_next32(s.iterator); 5959 } 5960 5961 j = 0; 5962 for(;;) { 5963 if(i == count) { 5964 if(j+1 < bocsuBytesWritten) { 5965 bocsuBytesUsed = j+1; 5966 } 5967 goto saveState; 5968 } 5969 5970 // On identical level, we will always save 5971 // the state if we reach this point, since 5972 // we don't depend on getNextCE for content 5973 // all the content is in our buffer and we 5974 // already either stored the full buffer OR 5975 // otherwise we won't arrive here. 
5976 newState = s.iterator->getState(s.iterator); 5977 if(newState != UITER_NO_STATE) { 5978 iterState = newState; 5979 cces = 0; 5980 } 5981 5982 uint8_t buff[4]; 5983 second = uiter_next32(s.iterator); 5984 cces++; 5985 5986 // end condition for identical level 5987 if(second == U_SENTINEL) { 5988 terminatePSKLevel(level, maxLevel, i, dest); 5989 level = UCOL_PSK_NULL; 5990 break; 5991 } 5992 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); 5993 first = second; 5994 5995 j = 0; 5996 if(bocsuBytesUsed != 0) { 5997 while(bocsuBytesUsed-->0) { 5998 j++; 5999 } 6000 } 6001 6002 while(i < count && j < bocsuBytesWritten) { 6003 dest[i++] = buff[j++]; 6004 } 6005 } 6006 6007 } else { 6008 level = UCOL_PSK_NULL; 6009 } 6010 /* fall through to next level */ 6011 case UCOL_PSK_NULL: 6012 j = i; 6013 while(j<count) { 6014 dest[j++]=0; 6015 } 6016 break; 6017 default: 6018 *status = U_INTERNAL_PROGRAM_ERROR; 6019 UTRACE_EXIT_STATUS(*status); 6020 return 0; 6021 } 6022 6023 saveState: 6024 // Now we need to return stuff. First we want to see whether we have 6025 // done everything for the current state of iterator. 6026 if(byteCountOrFrenchDone 6027 || canUpdateState == FALSE 6028 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) 6029 { 6030 // Any of above mean that the previous transaction 6031 // wasn't finished and that we should store the 6032 // previous iterator state. 6033 state[0] = iterState; 6034 } else { 6035 // The transaction is complete. We will continue in the next iteration. 6036 state[0] = s.iterator->getState(s.iterator); 6037 cces = 0; 6038 } 6039 // Store the number of bocsu bytes written. 
6040 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { 6041 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6042 } 6043 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT; 6044 6045 // Next we put in the level of comparison 6046 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); 6047 6048 // If we are doing French, we need to store whether we have just finished the French level 6049 if(level == UCOL_PSK_SECONDARY && doingFrench) { 6050 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6051 } else { 6052 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6053 } 6054 6055 // Was the latest CE shifted 6056 if(wasShifted) { 6057 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; 6058 } 6059 // Check for cces overflow 6060 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { 6061 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6062 } 6063 // Store cces 6064 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT); 6065 6066 // Check for French overflow 6067 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { 6068 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6069 } 6070 // Store number of bytes written in the French secondary continuation sequence 6071 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); 6072 6073 6074 // If we have used normalizing iterator, get rid of it 6075 if(normIter != NULL) { 6076 unorm_closeIter(normIter); 6077 } 6078 6079 /* To avoid memory leak, free the offset buffer if necessary. */ 6080 ucol_freeOffsetBuffer(&s); 6081 6082 // Return number of meaningful sortkey bytes. 6083 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 6084 dest,i, state[0], state[1]); 6085 UTRACE_EXIT_VALUE(i); 6086 return i; 6087 } 6088 6089 /** 6090 * Produce a bound for a given sortkey and a number of levels. 
6091 */ 6092 U_CAPI int32_t U_EXPORT2 6093 ucol_getBound(const uint8_t *source, 6094 int32_t sourceLength, 6095 UColBoundMode boundType, 6096 uint32_t noOfLevels, 6097 uint8_t *result, 6098 int32_t resultLength, 6099 UErrorCode *status) 6100 { 6101 // consistency checks 6102 if(status == NULL || U_FAILURE(*status)) { 6103 return 0; 6104 } 6105 if(source == NULL) { 6106 *status = U_ILLEGAL_ARGUMENT_ERROR; 6107 return 0; 6108 } 6109 6110 int32_t sourceIndex = 0; 6111 // Scan the string until we skip enough of the key OR reach the end of the key 6112 do { 6113 sourceIndex++; 6114 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { 6115 noOfLevels--; 6116 } 6117 } while (noOfLevels > 0 6118 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 6119 6120 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 6121 && noOfLevels > 0) { 6122 *status = U_SORT_KEY_TOO_SHORT_WARNING; 6123 } 6124 6125 6126 // READ ME: this code assumes that the values for boundType 6127 // enum will not changes. They are set so that the enum value 6128 // corresponds to the number of extra bytes each bound type 6129 // needs. 6130 if(result != NULL && resultLength >= sourceIndex+boundType) { 6131 uprv_memcpy(result, source, sourceIndex); 6132 switch(boundType) { 6133 // Lower bound just gets terminated. 
No extra bytes 6134 case UCOL_BOUND_LOWER: // = 0 6135 break; 6136 // Upper bound needs one extra byte 6137 case UCOL_BOUND_UPPER: // = 1 6138 result[sourceIndex++] = 2; 6139 break; 6140 // Upper long bound needs two extra bytes 6141 case UCOL_BOUND_UPPER_LONG: // = 2 6142 result[sourceIndex++] = 0xFF; 6143 result[sourceIndex++] = 0xFF; 6144 break; 6145 default: 6146 *status = U_ILLEGAL_ARGUMENT_ERROR; 6147 return 0; 6148 } 6149 result[sourceIndex++] = 0; 6150 6151 return sourceIndex; 6152 } else { 6153 return sourceIndex+boundType+1; 6154 } 6155 } 6156 6157 /****************************************************************************/ 6158 /* Following are the functions that deal with the properties of a collator */ 6159 /* there are new APIs and some compatibility APIs */ 6160 /****************************************************************************/ 6161 6162 static inline void 6163 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, 6164 int32_t *primShift, int32_t *secShift, int32_t *terShift) 6165 { 6166 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; 6167 UBool reverseSecondary = FALSE; 6168 UBool continuation = isContinuation(CE); 6169 if(!continuation) { 6170 tertiary = (uint8_t)((CE & coll->tertiaryMask)); 6171 tertiary ^= coll->caseSwitch; 6172 reverseSecondary = TRUE; 6173 } else { 6174 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6175 tertiary &= UCOL_REMOVE_CASE; 6176 reverseSecondary = FALSE; 6177 } 6178 6179 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6180 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6181 primary1 = (uint8_t)(CE >> 8); 6182 6183 if(primary1 != 0) { 6184 if (coll->leadBytePermutationTable != NULL && !continuation) { 6185 primary1 = coll->leadBytePermutationTable[primary1]; 6186 } 6187 6188 coll->latinOneCEs[ch] |= (primary1 << *primShift); 6189 *primShift -= 8; 6190 } 6191 if(primary2 != 0) { 6192 if(*primShift < 0) { 6193 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6194 
coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6195 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6196 return; 6197 } 6198 coll->latinOneCEs[ch] |= (primary2 << *primShift); 6199 *primShift -= 8; 6200 } 6201 if(secondary != 0) { 6202 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary 6203 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary 6204 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); 6205 } else { // normal case 6206 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift); 6207 } 6208 *secShift -= 8; 6209 } 6210 if(tertiary != 0) { 6211 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift); 6212 *terShift -= 8; 6213 } 6214 } 6215 6216 static inline UBool 6217 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { 6218 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); 6219 if(newTable == NULL) { 6220 *status = U_MEMORY_ALLOCATION_ERROR; 6221 coll->latinOneFailed = TRUE; 6222 return FALSE; 6223 } 6224 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t); 6225 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); 6226 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); 6227 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy); 6228 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy); 6229 coll->latinOneTableLen = size; 6230 uprv_free(coll->latinOneCEs); 6231 coll->latinOneCEs = newTable; 6232 return TRUE; 6233 } 6234 6235 static UBool 6236 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { 6237 UBool result = TRUE; 6238 if(coll->latinOneCEs == NULL) { 6239 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3); 6240 if(coll->latinOneCEs == NULL) { 6241 *status = U_MEMORY_ALLOCATION_ERROR; 6242 return FALSE; 6243 } 6244 
coll->latinOneTableLen = UCOL_LATINONETABLELEN; 6245 } 6246 UChar ch = 0; 6247 UCollationElements *it = ucol_openElements(coll, &ch, 1, status); 6248 // Check for null pointer 6249 if (U_FAILURE(*status)) { 6250 return FALSE; 6251 } 6252 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3); 6253 6254 int32_t primShift = 24, secShift = 24, terShift = 24; 6255 uint32_t CE = 0; 6256 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; 6257 6258 // TODO: make safe if you get more than you wanted... 6259 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { 6260 primShift = 24; secShift = 24; terShift = 24; 6261 if(ch < 0x100) { 6262 CE = coll->latinOneMapping[ch]; 6263 } else { 6264 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 6265 if(CE == UCOL_NOT_FOUND && coll->UCA) { 6266 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); 6267 } 6268 } 6269 if(CE < UCOL_NOT_FOUND) { 6270 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6271 } else { 6272 switch (getCETag(CE)) { 6273 case EXPANSION_TAG: 6274 case DIGIT_TAG: 6275 ucol_setText(it, &ch, 1, status); 6276 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { 6277 if(primShift < 0 || secShift < 0 || terShift < 0) { 6278 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6279 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6280 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6281 break; 6282 } 6283 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6284 } 6285 break; 6286 case CONTRACTION_TAG: 6287 // here is the trick 6288 // F2 is contraction. We do something very similar to contractions 6289 // but have two indices, one in the real contraction table and the 6290 // other to where we stuffed things. This hopes that we don't have 6291 // many contractions (this should work for latin-1 tables). 
6292 { 6293 if((CE & 0x00FFF000) != 0) { 6294 *status = U_UNSUPPORTED_ERROR; 6295 goto cleanup_after_failure; 6296 } 6297 6298 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); 6299 6300 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table 6301 6302 coll->latinOneCEs[ch] = CE; 6303 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; 6304 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; 6305 6306 // We're going to jump into contraction table, pick the elements 6307 // and use them 6308 do { 6309 CE = *(coll->contractionCEs + 6310 (UCharOffset - coll->contractionIndex)); 6311 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) { 6312 uint32_t size; 6313 uint32_t i; /* general counter */ 6314 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 6315 size = getExpansionCount(CE); 6316 //CE = *CEOffset++; 6317 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 6318 for(i = 0; i<size; i++) { 6319 if(primShift < 0 || secShift < 0 || terShift < 0) { 6320 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6321 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6322 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6323 break; 6324 } 6325 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 6326 } 6327 } else { /* else, we do */ 6328 while(*CEOffset != 0) { 6329 if(primShift < 0 || secShift < 0 || terShift < 0) { 6330 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6331 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6332 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6333 break; 6334 } 6335 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 6336 
} 6337 } 6338 contractionOffset++; 6339 } else if(CE < UCOL_NOT_FOUND) { 6340 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift); 6341 } else { 6342 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6343 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6344 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6345 contractionOffset++; 6346 } 6347 UCharOffset++; 6348 primShift = 24; secShift = 24; terShift = 24; 6349 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate 6350 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) { 6351 goto cleanup_after_failure; 6352 } 6353 } 6354 } while(*UCharOffset != 0xFFFF); 6355 } 6356 break;; 6357 case SPEC_PROC_TAG: 6358 { 6359 // 0xB7 is a precontext character defined in UCA5.1, a special 6360 // handle is implemeted in order to save LatinOne table for 6361 // most locales. 6362 if (ch==0xb7) { 6363 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6364 } 6365 else { 6366 goto cleanup_after_failure; 6367 } 6368 } 6369 break; 6370 default: 6371 goto cleanup_after_failure; 6372 } 6373 } 6374 } 6375 // compact table 6376 if(contractionOffset < coll->latinOneTableLen) { 6377 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { 6378 goto cleanup_after_failure; 6379 } 6380 } 6381 ucol_closeElements(it); 6382 return result; 6383 6384 cleanup_after_failure: 6385 // status should already be set before arriving here. 
6386 coll->latinOneFailed = TRUE; 6387 ucol_closeElements(it); 6388 return FALSE; 6389 } 6390 6391 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { 6392 if(U_SUCCESS(*status)) { 6393 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6394 coll->caseSwitch = UCOL_CASE_SWITCH; 6395 } else { 6396 coll->caseSwitch = UCOL_NO_CASE_SWITCH; 6397 } 6398 6399 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { 6400 coll->tertiaryMask = UCOL_REMOVE_CASE; 6401 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6402 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */ 6403 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; 6404 coll->tertiaryBottom = UCOL_COMMON_BOT3; 6405 } else { 6406 coll->tertiaryMask = UCOL_KEEP_CASE; 6407 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; 6408 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6409 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; 6410 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; 6411 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; 6412 } else { 6413 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6414 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; 6415 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; 6416 } 6417 } 6418 6419 /* Set the compression values */ 6420 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1); 6421 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */ 6422 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount); 6423 6424 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY 6425 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) 6426 { 6427 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; 6428 } else { 6429 coll->sortKeyGen = ucol_calcSortKey; 6430 } 6431 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF 6432 && 
coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) 6433 { 6434 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { 6435 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it 6436 //fprintf(stderr, "F"); 6437 coll->latinOneUse = TRUE; 6438 } else { 6439 coll->latinOneUse = FALSE; 6440 } 6441 if(*status == U_UNSUPPORTED_ERROR) { 6442 *status = U_ZERO_ERROR; 6443 } 6444 } else { // latin1Table exists and it doesn't need to be regenerated, just use it 6445 coll->latinOneUse = TRUE; 6446 } 6447 } else { 6448 coll->latinOneUse = FALSE; 6449 } 6450 } 6451 } 6452 6453 U_CAPI uint32_t U_EXPORT2 6454 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 6455 if(U_FAILURE(*status) || coll == NULL) { 6456 return 0; 6457 } 6458 if(len == -1) { 6459 len = u_strlen(varTop); 6460 } 6461 if(len == 0) { 6462 *status = U_ILLEGAL_ARGUMENT_ERROR; 6463 return 0; 6464 } 6465 6466 collIterate s; 6467 IInit_collIterate(coll, varTop, len, &s, status); 6468 if(U_FAILURE(*status)) { 6469 return 0; 6470 } 6471 6472 uint32_t CE = ucol_IGetNextCE(coll, &s, status); 6473 6474 /* here we check if we have consumed all characters */ 6475 /* you can put in either one character or a contraction */ 6476 /* you shouldn't put more... */ 6477 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { 6478 *status = U_CE_NOT_FOUND_ERROR; 6479 return 0; 6480 } 6481 6482 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); 6483 6484 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { 6485 *status = U_PRIMARY_TOO_LONG_ERROR; 6486 return 0; 6487 } 6488 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { 6489 coll->variableTopValueisDefault = FALSE; 6490 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; 6491 } 6492 6493 /* To avoid memory leak, free the offset buffer if necessary. 
*/ 6494 ucol_freeOffsetBuffer(&s); 6495 6496 return CE & UCOL_PRIMARYMASK; 6497 } 6498 6499 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 6500 if(U_FAILURE(*status) || coll == NULL) { 6501 return 0; 6502 } 6503 return coll->variableTopValue<<16; 6504 } 6505 6506 U_CAPI void U_EXPORT2 6507 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 6508 if(U_FAILURE(*status) || coll == NULL) { 6509 return; 6510 } 6511 6512 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { 6513 coll->variableTopValueisDefault = FALSE; 6514 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; 6515 } 6516 } 6517 /* Attribute setter API */ 6518 U_CAPI void U_EXPORT2 6519 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 6520 if(U_FAILURE(*status) || coll == NULL) { 6521 return; 6522 } 6523 6524 UColAttributeValue oldFrench = coll->frenchCollation; 6525 UColAttributeValue oldCaseFirst = coll->caseFirst; 6526 switch(attr) { 6527 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ 6528 if(value == UCOL_ON) { 6529 coll->numericCollation = UCOL_ON; 6530 coll->numericCollationisDefault = FALSE; 6531 } else if (value == UCOL_OFF) { 6532 coll->numericCollation = UCOL_OFF; 6533 coll->numericCollationisDefault = FALSE; 6534 } else if (value == UCOL_DEFAULT) { 6535 coll->numericCollationisDefault = TRUE; 6536 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; 6537 } else { 6538 *status = U_ILLEGAL_ARGUMENT_ERROR; 6539 } 6540 break; 6541 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ 6542 if(value == UCOL_ON) { 6543 coll->hiraganaQ = UCOL_ON; 6544 coll->hiraganaQisDefault = FALSE; 6545 } else if (value == UCOL_OFF) { 6546 coll->hiraganaQ = UCOL_OFF; 6547 coll->hiraganaQisDefault = FALSE; 6548 } else if (value == UCOL_DEFAULT) { 6549 coll->hiraganaQisDefault = TRUE; 6550 
coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ; 6551 } else { 6552 *status = U_ILLEGAL_ARGUMENT_ERROR; 6553 } 6554 break; 6555 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 6556 if(value == UCOL_ON) { 6557 coll->frenchCollation = UCOL_ON; 6558 coll->frenchCollationisDefault = FALSE; 6559 } else if (value == UCOL_OFF) { 6560 coll->frenchCollation = UCOL_OFF; 6561 coll->frenchCollationisDefault = FALSE; 6562 } else if (value == UCOL_DEFAULT) { 6563 coll->frenchCollationisDefault = TRUE; 6564 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation; 6565 } else { 6566 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6567 } 6568 break; 6569 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 6570 if(value == UCOL_SHIFTED) { 6571 coll->alternateHandling = UCOL_SHIFTED; 6572 coll->alternateHandlingisDefault = FALSE; 6573 } else if (value == UCOL_NON_IGNORABLE) { 6574 coll->alternateHandling = UCOL_NON_IGNORABLE; 6575 coll->alternateHandlingisDefault = FALSE; 6576 } else if (value == UCOL_DEFAULT) { 6577 coll->alternateHandlingisDefault = TRUE; 6578 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ; 6579 } else { 6580 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6581 } 6582 break; 6583 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 6584 if(value == UCOL_LOWER_FIRST) { 6585 coll->caseFirst = UCOL_LOWER_FIRST; 6586 coll->caseFirstisDefault = FALSE; 6587 } else if (value == UCOL_UPPER_FIRST) { 6588 coll->caseFirst = UCOL_UPPER_FIRST; 6589 coll->caseFirstisDefault = FALSE; 6590 } else if (value == UCOL_OFF) { 6591 coll->caseFirst = UCOL_OFF; 6592 coll->caseFirstisDefault = FALSE; 6593 } else if (value == UCOL_DEFAULT) { 6594 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; 6595 coll->caseFirstisDefault = TRUE; 6596 } else { 6597 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6598 } 6599 break; 6600 case UCOL_CASE_LEVEL: /* do we have an extra case 
level */ 6601 if(value == UCOL_ON) { 6602 coll->caseLevel = UCOL_ON; 6603 coll->caseLevelisDefault = FALSE; 6604 } else if (value == UCOL_OFF) { 6605 coll->caseLevel = UCOL_OFF; 6606 coll->caseLevelisDefault = FALSE; 6607 } else if (value == UCOL_DEFAULT) { 6608 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; 6609 coll->caseLevelisDefault = TRUE; 6610 } else { 6611 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6612 } 6613 break; 6614 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 6615 if(value == UCOL_ON) { 6616 coll->normalizationMode = UCOL_ON; 6617 coll->normalizationModeisDefault = FALSE; 6618 initializeFCD(status); 6619 } else if (value == UCOL_OFF) { 6620 coll->normalizationMode = UCOL_OFF; 6621 coll->normalizationModeisDefault = FALSE; 6622 } else if (value == UCOL_DEFAULT) { 6623 coll->normalizationModeisDefault = TRUE; 6624 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode; 6625 if(coll->normalizationMode == UCOL_ON) { 6626 initializeFCD(status); 6627 } 6628 } else { 6629 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6630 } 6631 break; 6632 case UCOL_STRENGTH: /* attribute for strength */ 6633 if (value == UCOL_DEFAULT) { 6634 coll->strengthisDefault = TRUE; 6635 coll->strength = (UColAttributeValue)coll->options->strength; 6636 } else if (value <= UCOL_IDENTICAL) { 6637 coll->strengthisDefault = FALSE; 6638 coll->strength = value; 6639 } else { 6640 *status = U_ILLEGAL_ARGUMENT_ERROR ; 6641 } 6642 break; 6643 case UCOL_ATTRIBUTE_COUNT: 6644 default: 6645 *status = U_ILLEGAL_ARGUMENT_ERROR; 6646 break; 6647 } 6648 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { 6649 coll->latinOneRegenTable = TRUE; 6650 } else { 6651 coll->latinOneRegenTable = FALSE; 6652 } 6653 ucol_updateInternalState(coll, status); 6654 } 6655 6656 U_CAPI UColAttributeValue U_EXPORT2 6657 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { 6658 if(U_FAILURE(*status) || coll == NULL) { 
6659 return UCOL_DEFAULT; 6660 } 6661 switch(attr) { 6662 case UCOL_NUMERIC_COLLATION: 6663 return coll->numericCollation; 6664 case UCOL_HIRAGANA_QUATERNARY_MODE: 6665 return coll->hiraganaQ; 6666 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 6667 return coll->frenchCollation; 6668 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 6669 return coll->alternateHandling; 6670 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 6671 return coll->caseFirst; 6672 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 6673 return coll->caseLevel; 6674 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 6675 return coll->normalizationMode; 6676 case UCOL_STRENGTH: /* attribute for strength */ 6677 return coll->strength; 6678 case UCOL_ATTRIBUTE_COUNT: 6679 default: 6680 *status = U_ILLEGAL_ARGUMENT_ERROR; 6681 break; 6682 } 6683 return UCOL_DEFAULT; 6684 } 6685 6686 U_CAPI void U_EXPORT2 6687 ucol_setStrength( UCollator *coll, 6688 UCollationStrength strength) 6689 { 6690 UErrorCode status = U_ZERO_ERROR; 6691 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); 6692 } 6693 6694 U_CAPI UCollationStrength U_EXPORT2 6695 ucol_getStrength(const UCollator *coll) 6696 { 6697 UErrorCode status = U_ZERO_ERROR; 6698 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); 6699 } 6700 6701 U_DRAFT int32_t U_EXPORT2 6702 ucol_getReorderCodes(const UCollator *coll, 6703 int32_t *dest, 6704 int32_t destCapacity, 6705 UErrorCode *status) { 6706 if (U_FAILURE(*status)) { 6707 return 0; 6708 } 6709 6710 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 6711 *status = U_ILLEGAL_ARGUMENT_ERROR; 6712 return 0; 6713 } 6714 6715 #ifdef UCOL_DEBUG 6716 printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength); 6717 printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength); 6718 #endif 6719 6720 if (coll->reorderCodesLength > destCapacity) { 6721 *status = 
U_BUFFER_OVERFLOW_ERROR; 6722 return coll->reorderCodesLength; 6723 } 6724 for (int32_t i = 0; i < coll->reorderCodesLength; i++) { 6725 dest[i] = coll->reorderCodes[i]; 6726 } 6727 return coll->reorderCodesLength; 6728 } 6729 6730 U_DRAFT void U_EXPORT2 6731 ucol_setReorderCodes(UCollator* coll, 6732 const int32_t* reorderCodes, 6733 int32_t reorderCodesLength, 6734 UErrorCode *status) { 6735 if (U_FAILURE(*status)) { 6736 return; 6737 } 6738 6739 if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) { 6740 *status = U_ILLEGAL_ARGUMENT_ERROR; 6741 return; 6742 } 6743 6744 if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) { 6745 uprv_free(coll->reorderCodes); 6746 } 6747 coll->reorderCodes = NULL; 6748 coll->reorderCodesLength = 0; 6749 if (reorderCodesLength == 0) { 6750 if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) { 6751 uprv_free(coll->leadBytePermutationTable); 6752 } 6753 coll->leadBytePermutationTable = NULL; 6754 return; 6755 } 6756 coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t)); 6757 if (coll->reorderCodes == NULL) { 6758 *status = U_MEMORY_ALLOCATION_ERROR; 6759 return; 6760 } 6761 coll->freeReorderCodesOnClose = TRUE; 6762 for (int32_t i = 0; i < reorderCodesLength; i++) { 6763 coll->reorderCodes[i] = reorderCodes[i]; 6764 } 6765 coll->reorderCodesLength = reorderCodesLength; 6766 ucol_buildPermutationTable(coll, status); 6767 } 6768 6769 U_DRAFT int32_t U_EXPORT2 6770 ucol_getEquivalentReorderCodes(int32_t reorderCode, 6771 int32_t* dest, 6772 int32_t destCapacity, 6773 UErrorCode *pErrorCode) { 6774 bool equivalentCodesSet[USCRIPT_CODE_LIMIT]; 6775 uint16_t leadBytes[256]; 6776 int leadBytesCount; 6777 int leadByteIndex; 6778 int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT]; 6779 int reorderCodesForLeadByteCount; 6780 int reorderCodeIndex; 6781 6782 int32_t equivalentCodesCount = 0; 6783 int setIndex; 6784 6785 if 
(U_FAILURE(*pErrorCode)) { 6786 return 0; 6787 } 6788 6789 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { 6790 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 6791 return 0; 6792 } 6793 6794 uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool)); 6795 6796 const UCollator* uca = ucol_initUCA(pErrorCode); 6797 if (U_FAILURE(*pErrorCode)) { 6798 return 0; 6799 } 6800 leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256); 6801 for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) { 6802 reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte( 6803 uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT); 6804 for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) { 6805 equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true; 6806 } 6807 } 6808 6809 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { 6810 if (equivalentCodesSet[setIndex] == true) { 6811 equivalentCodesCount++; 6812 } 6813 } 6814 6815 if (destCapacity == 0) { 6816 return equivalentCodesCount; 6817 } 6818 6819 equivalentCodesCount = 0; 6820 for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) { 6821 if (equivalentCodesSet[setIndex] == true) { 6822 dest[equivalentCodesCount++] = setIndex; 6823 if (equivalentCodesCount >= destCapacity) { 6824 break; 6825 } 6826 } 6827 } 6828 return equivalentCodesCount; 6829 } 6830 6831 6832 /****************************************************************************/ 6833 /* Following are misc functions */ 6834 /* there are new APIs and some compatibility APIs */ 6835 /****************************************************************************/ 6836 6837 U_CAPI void U_EXPORT2 6838 ucol_getVersion(const UCollator* coll, 6839 UVersionInfo versionInfo) 6840 { 6841 /* RunTime version */ 6842 uint8_t rtVersion = UCOL_RUNTIME_VERSION; 6843 /* Builder version*/ 6844 uint8_t bdVersion = 
coll->image->version[0]; 6845 6846 /* Charset Version. Need to get the version from cnv files 6847 * makeconv should populate cnv files with version and 6848 * an api has to be provided in ucnv.h to obtain this version 6849 */ 6850 uint8_t csVersion = 0; 6851 6852 /* combine the version info */ 6853 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion)); 6854 6855 /* Tailoring rules */ 6856 versionInfo[0] = (uint8_t)(cmbVersion>>8); 6857 versionInfo[1] = (uint8_t)cmbVersion; 6858 versionInfo[2] = coll->image->version[1]; 6859 if(coll->UCA) { 6860 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */ 6861 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07); 6862 } else { 6863 versionInfo[3] = 0; 6864 } 6865 } 6866 6867 6868 /* This internal API checks whether a character is tailored or not */ 6869 U_CAPI UBool U_EXPORT2 6870 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { 6871 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { 6872 return FALSE; 6873 } 6874 6875 uint32_t CE = UCOL_NOT_FOUND; 6876 const UChar *ContractionStart = NULL; 6877 if(u < 0x100) { /* latin-1 */ 6878 CE = coll->latinOneMapping[u]; 6879 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { 6880 return FALSE; 6881 } 6882 } else { /* regular */ 6883 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); 6884 } 6885 6886 if(isContraction(CE)) { 6887 ContractionStart = (UChar *)coll->image+getContractOffset(CE); 6888 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); 6889 } 6890 6891 return (UBool)(CE != UCOL_NOT_FOUND); 6892 } 6893 6894 6895 /****************************************************************************/ 6896 /* Following are the string compare functions */ 6897 /* */ 6898 /****************************************************************************/ 6899 6900 6901 /* ucol_checkIdent internal function. 
Does byte level string compare. */ 6902 /* Used by strcoll if strength == identical and strings */ 6903 /* are otherwise equal. */ 6904 /* */ 6905 /* Comparison must be done on NFD normalized strings. */ 6906 /* FCD is not good enough. */ 6907 6908 static 6909 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status) 6910 { 6911 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both 6912 // of same type, but that doesn't really mean that it will stay that way. 6913 int32_t comparison; 6914 6915 if (sColl->flags & UCOL_USE_ITERATOR) { 6916 // The division for the array length may truncate the array size to 6917 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 6918 // for all platforms anyway. 6919 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 6920 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 6921 UNormIterator *sNIt = NULL, *tNIt = NULL; 6922 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 6923 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 6924 sColl->iterator->move(sColl->iterator, 0, UITER_START); 6925 tColl->iterator->move(tColl->iterator, 0, UITER_START); 6926 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); 6927 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); 6928 comparison = u_strCompareIter(sIt, tIt, TRUE); 6929 unorm_closeIter(sNIt); 6930 unorm_closeIter(tNIt); 6931 } else { 6932 int32_t sLen = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1; 6933 const UChar *sBuf = sColl->string; 6934 int32_t tLen = (tColl->flags & UCOL_ITER_HASLEN) ? 
(int32_t)(tColl->endp - tColl->string) : -1; 6935 const UChar *tBuf = tColl->string; 6936 6937 if (normalize) { 6938 *status = U_ZERO_ERROR; 6939 // Note: We could use Normalizer::compare() or similar, but for short strings 6940 // which may not be in FCD it might be faster to just NFD them. 6941 // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than 6942 // NFD'ing immediately might be faster for long strings, 6943 // but string comparison is usually done on relatively short strings. 6944 sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen), 6945 sColl->writableBuffer, 6946 *status); 6947 tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen), 6948 tColl->writableBuffer, 6949 *status); 6950 if(U_FAILURE(*status)) { 6951 return UCOL_LESS; 6952 } 6953 comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer); 6954 } else { 6955 comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE); 6956 } 6957 } 6958 6959 if (comparison < 0) { 6960 return UCOL_LESS; 6961 } else if (comparison == 0) { 6962 return UCOL_EQUAL; 6963 } else /* comparison > 0 */ { 6964 return UCOL_GREATER; 6965 } 6966 } 6967 6968 /* CEBuf - A struct and some inline functions to handle the saving */ 6969 /* of CEs in a buffer within ucol_strcoll */ 6970 6971 #define UCOL_CEBUF_SIZE 512 6972 typedef struct ucol_CEBuf { 6973 uint32_t *buf; 6974 uint32_t *endp; 6975 uint32_t *pos; 6976 uint32_t localArray[UCOL_CEBUF_SIZE]; 6977 } ucol_CEBuf; 6978 6979 6980 static 6981 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { 6982 (b)->buf = (b)->pos = (b)->localArray; 6983 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; 6984 } 6985 6986 static 6987 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { 6988 uint32_t oldSize; 6989 uint32_t newSize; 6990 uint32_t *newBuf; 6991 6992 ci->flags |= UCOL_ITER_ALLOCATED; 6993 oldSize = (uint32_t)(b->pos - b->buf); 6994 newSize = oldSize * 2; 6995 newBuf = 
(uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); 6996 if(newBuf == NULL) { 6997 *status = U_MEMORY_ALLOCATION_ERROR; 6998 } 6999 else { 7000 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); 7001 if (b->buf != b->localArray) { 7002 uprv_free(b->buf); 7003 } 7004 b->buf = newBuf; 7005 b->endp = b->buf + newSize; 7006 b->pos = b->buf + oldSize; 7007 } 7008 } 7009 7010 static 7011 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) { 7012 if (b->pos == b->endp) { 7013 ucol_CEBuf_Expand(b, ci, status); 7014 } 7015 if (U_SUCCESS(*status)) { 7016 *(b)->pos++ = ce; 7017 } 7018 } 7019 7020 /* This is a trick string compare function that goes in and uses sortkeys to compare */ 7021 /* It is used when compare gets in trouble and needs to bail out */ 7022 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, 7023 collIterate *tColl, 7024 UErrorCode *status) 7025 { 7026 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; 7027 uint8_t *sourceKeyP = sourceKey; 7028 uint8_t *targetKeyP = targetKey; 7029 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; 7030 const UCollator *coll = sColl->coll; 7031 const UChar *source = NULL; 7032 const UChar *target = NULL; 7033 int32_t result = UCOL_EQUAL; 7034 UnicodeString sourceString, targetString; 7035 int32_t sourceLength; 7036 int32_t targetLength; 7037 7038 if(sColl->flags & UCOL_USE_ITERATOR) { 7039 sColl->iterator->move(sColl->iterator, 0, UITER_START); 7040 tColl->iterator->move(tColl->iterator, 0, UITER_START); 7041 UChar32 c; 7042 while((c=sColl->iterator->next(sColl->iterator))>=0) { 7043 sourceString.append((UChar)c); 7044 } 7045 while((c=tColl->iterator->next(tColl->iterator))>=0) { 7046 targetString.append((UChar)c); 7047 } 7048 source = sourceString.getBuffer(); 7049 sourceLength = sourceString.length(); 7050 target = targetString.getBuffer(); 7051 targetLength = targetString.length(); 7052 } else { // no iterators 7053 
/**
 * General string comparison over two prepared collIterate objects.
 *
 * The comparison proceeds level by level:
 *  - Primary: CEs are pulled from both iterates with ucol_IGetNextCE and
 *    every CE that is kept is appended to a ucol_CEBuf (sCEs / tCEs). The
 *    function leaves through commonReturn as soon as two primaries differ.
 *    There is one loop for non-shifted and one for shifted alternate handling.
 *  - Secondary (forward, or backward for French), case level, tertiary and
 *    shifted quaternary: each enabled level re-reads the buffered CEs.
 *  - Hiragana: a difference noted during the non-shifted primary pass is
 *    returned when the quaternary pass is not run.
 *  - Identical: ucol_checkIdent is the final tie breaker.
 *
 * When both Hiragana quaternary and shifted handling are active the whole
 * comparison is handed to ucol_compareUsingSortKeys instead.
 *
 * @param sColl  source iterate; also supplies the collator
 * @param tColl  target iterate
 * @param status in/out error code, passed on to the helpers
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static UCollationResult
ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
{
    U_ALIGN_CODE(16);

    const UCollator *coll = sColl->coll;


    // setting up the collator parameters
    UColAttributeValue strength = coll->strength;
    UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);

    UBool checkSecTer = initialCheckSecTer;
    UBool checkTertiary = (strength  >= UCOL_TERTIARY);
    UBool checkQuad = (strength  >= UCOL_QUATERNARY);
    UBool checkIdent = (strength == UCOL_IDENTICAL);
    UBool checkCase = (coll->caseLevel == UCOL_ON);
    UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
    UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
    UBool qShifted = shifted && checkQuad;
    UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;

    // Hiragana quaternary combined with shifted is not handled by the loops
    // below; fall back to comparing sort keys. No CE buffer exists yet, so
    // there is nothing to release on this path.
    if(doHiragana && shifted) {
        return (ucol_compareUsingSortKeys(sColl, tColl, status));
    }
    uint8_t caseSwitch = coll->caseSwitch;
    uint8_t tertiaryMask = coll->tertiaryMask;

    // This is the lowest primary value that will not be ignored if shifted
    uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;

    UCollationResult result = UCOL_EQUAL;
    UCollationResult hirResult = UCOL_EQUAL;

    // Preparing the CE buffers. They will be filled during the primary phase
    ucol_CEBuf   sCEs;
    ucol_CEBuf   tCEs;
    UCOL_INIT_CEBUF(&sCEs);
    UCOL_INIT_CEBUF(&tCEs);

    uint32_t secS = 0, secT = 0;
    uint32_t sOrder=0, tOrder=0;

    // Non shifted primary processing is quite simple
    if(!shifted) {
        for(;;) {

            // We fetch CEs until we hit a non ignorable primary or end.
            do {
                // We get the next CE
                sOrder = ucol_IGetNextCE(coll, sColl, status);
                // Stuff it in the buffer
                UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                // And keep just the primary part.
                sOrder &= UCOL_PRIMARYMASK;
            } while(sOrder == 0);

            // see the comments on the above block
            do {
                tOrder = ucol_IGetNextCE(coll, tColl, status);
                UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                tOrder &= UCOL_PRIMARYMASK;
            } while(tOrder == 0);

            // if both primaries are the same
            if(sOrder == tOrder) {
                // and there are no more CEs, we advance to the next level
                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                }
                // Remember the first place where only one side was Hiragana;
                // it is used after the tertiary level if nothing else differs.
                if(doHiragana && hirResult == UCOL_EQUAL) {
                    if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
                        hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
                            ? UCOL_LESS:UCOL_GREATER;
                    }
                }
            } else {
                // only need to check one for continuation
                // if one is then the other must be or the preceding CE would be a prefix of the other
                if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
                    // apply script reordering to the lead byte before comparing
                    sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
                    tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
                }
                // if two primaries are different, we are done
                result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
                goto commonReturn;
            }
        } // no primary difference... do the rest from the buffers
    } else { // shifted - do a slightly more complicated processing :)
        for(;;) {
            UBool sInShifted = FALSE;
            UBool tInShifted = FALSE;
            // This version of code can be refactored. However, it seems easier to understand this way.
            // Source loop. Same as the target loop.
            for(;;) {
                sOrder = ucol_IGetNextCE(coll, sColl, status);
                if(sOrder == UCOL_NO_MORE_CES) {
                    UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                    break;
                } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
                    /* UCA amendment - ignore ignorables that follow shifted code points */
                    continue;
                } else if(isContinuation(sOrder)) {
                    if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
                        if(sInShifted) {
                            sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            break;
                        }
                    } else { /* Just lower level values */
                        if(sInShifted) {
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        }
                    }
                } else { /* regular */
                    if(coll->leadBytePermutationTable != NULL){
                        sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
                    }
                    if((sOrder & UCOL_PRIMARYMASK) > LVT) {
                        /* above variable top: a real primary, stop and compare */
                        UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                        break;
                    } else {
                        if((sOrder & UCOL_PRIMARYMASK) > 0) {
                            /* variable CE: keep only its primary for the quaternary pass */
                            sInShifted = TRUE;
                            sOrder &= UCOL_PRIMARYMASK;
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
                            sInShifted = FALSE;
                            continue;
                        }
                    }
                }
            }
            sOrder &= UCOL_PRIMARYMASK;
            sInShifted = FALSE;

            // Target loop; mirrors the source loop above.
            for(;;) {
                tOrder = ucol_IGetNextCE(coll, tColl, status);
                if(tOrder == UCOL_NO_MORE_CES) {
                    UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                    break;
                } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
                    /* UCA amendment - ignore ignorables that follow shifted code points */
                    continue;
                } else if(isContinuation(tOrder)) {
                    if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
                        if(tInShifted) {
                            tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            break;
                        }
                    } else { /* Just lower level values */
                        if(tInShifted) {
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        }
                    }
                } else { /* regular */
                    if(coll->leadBytePermutationTable != NULL){
                        tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
                    }
                    if((tOrder & UCOL_PRIMARYMASK) > LVT) {
                        UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                        break;
                    } else {
                        if((tOrder & UCOL_PRIMARYMASK) > 0) {
                            tInShifted = TRUE;
                            tOrder &= UCOL_PRIMARYMASK;
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            continue;
                        } else {
                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
                            tInShifted = FALSE;
                            continue;
                        }
                    }
                }
            }
            tOrder &= UCOL_PRIMARYMASK;
            tInShifted = FALSE;

            if(sOrder == tOrder) {
                /*
                if(doHiragana && hirResult == UCOL_EQUAL) {
                if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
                hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
                ? UCOL_LESS:UCOL_GREATER;
                }
                }
                */
                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                } else {
                    sOrder = 0;
                    tOrder = 0;
                    continue;
                }
            } else {
                result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        } /* no primary difference... do the rest from the buffers */
    }

    /* now, we're gonna reexamine collected CEs */
    uint32_t    *sCE;
    uint32_t    *tCE;

    /* This is the secondary level of comparison */
    if(checkSecTer) {
        if(!isFrenchSec) { /* normal */
            sCE = sCEs.buf;
            tCE = tCEs.buf;
            for(;;) {
                /* skip CEs whose secondary weight is zero */
                while (secS == 0) {
                    secS = *(sCE++) & UCOL_SECONDARYMASK;
                }

                while(secT == 0) {
                    secT = *(tCE++) & UCOL_SECONDARYMASK;
                }

                if(secS == secT) {
                    if(secS == UCOL_NO_MORE_CES_SECONDARY) {
                        break;
                    } else {
                        secS = 0; secT = 0;
                        continue;
                    }
                } else {
                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                    goto commonReturn;
                }
            }
        } else { /* do the French */
            /* French secondaries are compared from the end of the buffers   */
            /* towards the start, but the CEs of one continuation sequence   */
            /* are still read front to back; sCESave / tCESave remember      */
            /* where to resume the backward walk after such a sequence.      */
            uint32_t *sCESave = NULL;
            uint32_t *tCESave = NULL;
            sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
            tCE = tCEs.pos-2;
            for(;;) {
                while (secS == 0 && sCE >= sCEs.buf) {
                    if(sCESave == NULL) {
                        secS = *(sCE--);
                        if(isContinuation(secS)) {
                            while(isContinuation(secS = *(sCE--)))
                                ;
                            /* after this, secS has the start of continuation, and sCE points before that */
                            sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
                            sCE+=2;  /* need to point to the first continuation CP */
                            /* However, now you can just continue doing stuff */
                        }
                    } else {
                        secS = *(sCE++);
                        if(!isContinuation(secS)) { /* This means we have finished with this cont */
                            sCE = sCESave;            /* reset the pointer to before continuation */
                            sCESave = NULL;
                            secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
                            continue;
                        }
                    }
                    secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
                }

                while(secT == 0 && tCE >= tCEs.buf) {
                    if(tCESave == NULL) {
                        secT = *(tCE--);
                        if(isContinuation(secT)) {
                            while(isContinuation(secT = *(tCE--)))
                                ;
                            /* after this, secT has the start of continuation, and tCE points before that */
                            tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
                            tCE+=2;  /* need to point to the first continuation CP */
                            /* However, now you can just continue doing stuff */
                        }
                    } else {
                        secT = *(tCE++);
                        if(!isContinuation(secT)) { /* This means we have finished with this cont */
                            tCE = tCESave;          /* reset the pointer to before continuation */
                            tCESave = NULL;
                            secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
                            continue;
                        }
                    }
                    secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
                }

                if(secS == secT) {
                    if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
                        break;
                    } else {
                        secS = 0; secT = 0;
                        continue;
                    }
                } else {
                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                    goto commonReturn;
                }
            }
        }
    }

    /* doing the case bit */
    if(checkCase) {
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            while((secS & UCOL_REMOVE_CASE) == 0) {
                if(!isContinuation(*sCE++)) {
                    secS =*(sCE-1);
                    if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
                        // primary ignorables should not be considered on the case level when the strength is primary
                        // otherwise, the CEs stop being well-formed
                        secS &= UCOL_TERT_CASE_MASK;
                        secS ^= caseSwitch;
                    } else {
                        secS = 0;
                    }
                } else {
                    secS = 0;
                }
            }

            while((secT & UCOL_REMOVE_CASE) == 0) {
                if(!isContinuation(*tCE++)) {
                    secT = *(tCE-1);
                    if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
                        // primary ignorables should not be considered on the case level when the strength is primary
                        // otherwise, the CEs stop being well-formed
                        secT &= UCOL_TERT_CASE_MASK;
                        secT ^= caseSwitch;
                    } else {
                        secT = 0;
                    }
                } else {
                    secT = 0;
                }
            }

            if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
                result = UCOL_LESS;
                goto commonReturn;
            } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
                result = UCOL_GREATER;
                goto commonReturn;
            }

            if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
                break;
            } else {
                secS = 0;
                secT = 0;
            }
        }
    }

    /* Tertiary level */
    if(checkTertiary) {
        secS = 0;
        secT = 0;
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            while((secS & UCOL_REMOVE_CASE) == 0) {
                secS = *(sCE++) & tertiaryMask;
                if(!isContinuation(secS)) {
                    secS ^= caseSwitch;
                } else {
                    secS &= UCOL_REMOVE_CASE;
                }
            }

            while((secT & UCOL_REMOVE_CASE)  == 0) {
                secT = *(tCE++) & tertiaryMask;
                if(!isContinuation(secT)) {
                    secT ^= caseSwitch;
                } else {
                    secT &= UCOL_REMOVE_CASE;
                }
            }

            if(secS == secT) {
                /* NOTE(review): the literal 1 looks like the tertiary end marker */
                /* (cf. UCOL_NO_MORE_CES_TERTIARY in the case level) - confirm.   */
                if((secS & UCOL_REMOVE_CASE) == 1) {
                    break;
                } else {
                    secS = 0; secT = 0;
                    continue;
                }
            } else {
                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        }
    }


    if(qShifted /*checkQuad*/) {
        /* Quaternary level for shifted handling: non-continuation CEs above  */
        /* LVT or without a primary are replaced by UCOL_PRIMARYMASK, the     */
        /* others contribute their primary weight.                            */
        UBool sInShifted = TRUE;
        UBool tInShifted = TRUE;
        secS = 0;
        secT = 0;
        sCE = sCEs.buf;
        tCE = tCEs.buf;
        for(;;) {
            while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
                secS = *(sCE++);
                if(isContinuation(secS)) {
                    if(!sInShifted) {
                        continue;
                    }
                } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
                    secS = UCOL_PRIMARYMASK;
                    sInShifted = FALSE;
                } else {
                    sInShifted = TRUE;
                }
            }
            secS &= UCOL_PRIMARYMASK;


            while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
                secT = *(tCE++);
                if(isContinuation(secT)) {
                    if(!tInShifted) {
                        continue;
                    }
                } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
                    secT = UCOL_PRIMARYMASK;
                    tInShifted = FALSE;
                } else {
                    tInShifted = TRUE;
                }
            }
            secT &= UCOL_PRIMARYMASK;

            if(secS == secT) {
                if(secS == UCOL_NO_MORE_CES_PRIMARY) {
                    break;
                } else {
                    secS = 0; secT = 0;
                    continue;
                }
            } else {
                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
                goto commonReturn;
            }
        }
    } else if(doHiragana && hirResult != UCOL_EQUAL) {
        // If we're fine on quaternaries, we might be different
        // on Hiragana. This, however, might fail us in shifted.
        result = hirResult;
        goto commonReturn;
    }

    /*  For IDENTICAL comparisons, we use a bitwise character comparison */
    /*  as a tiebreaker if all else is equal.                                */
    /*  Getting here  should be quite rare - strings are not identical -     */
    /*     that is checked first, but compared == through all other checks.  */
    if(checkIdent)
    {
        //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
        result = ucol_checkIdent(sColl, tColl, TRUE, status);
    }

commonReturn:
    // Release CE buffers that were moved to the heap. The flag is set by
    // ucol_CEBuf_Expand on whichever iterate triggered the expansion.
    if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
        if (sCEs.buf != sCEs.localArray ) {
            uprv_free(sCEs.buf);
        }
        if (tCEs.buf != tCEs.localArray ) {
            uprv_free(tCEs.buf);
        }
    }

    return result;
}

/**
 * String front end for the collIterate based ucol_strcollRegular: wraps both
 * strings in collIterate objects and runs the comparison.
 *
 * @param coll         collator to use
 * @param source       source text
 * @param sourceLength length of source, or a negative value if NUL-terminated
 * @param target       target text
 * @param targetLength length of target, or a negative value if NUL-terminated
 * @param status       in/out error code
 * @return the comparison result; UCOL_LESS if setting up the iterates left a
 *         failure in *status
 */
static UCollationResult
ucol_strcollRegular(const UCollator *coll,
                    const UChar *source, int32_t sourceLength,
                    const UChar *target, int32_t targetLength,
                    UErrorCode *status) {
    collIterate sColl, tColl;
    // Preparing the context objects for iterating over strings
    IInit_collIterate(coll, source, sourceLength, &sColl, status);
    IInit_collIterate(coll, target, targetLength, &tColl, status);
    if(U_FAILURE(*status)) {
        return UCOL_LESS;
    }
    return ucol_strcollRegular(&sColl, &tColl, status);
}
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7625 } else { 7626 schar = s[*index]; 7627 } 7628 } 7629 7630 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 7631 offset++; 7632 } 7633 7634 if (schar == tchar) { 7635 (*index)++; 7636 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 7637 } 7638 else 7639 { 7640 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 7641 return UCOL_BAIL_OUT_CE; 7642 } 7643 // skip completely ignorables 7644 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 7645 if(isZeroCE == 0) { // we have to ignore completely ignorables 7646 (*index)++; 7647 continue; 7648 } 7649 7650 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 7651 } 7652 } 7653 } 7654 7655 7656 /** 7657 * This is a fast strcoll, geared towards text in Latin-1. 7658 * It supports contractions of size two, French secondaries 7659 * and case switching. You can use it with strengths primary 7660 * to tertiary. It does not support shifted and case level. 7661 * It relies on the table build by setupLatin1Table. If it 7662 * doesn't understand something, it will go to the regular 7663 * strcoll. 
7664 */ 7665 static UCollationResult 7666 ucol_strcollUseLatin1( const UCollator *coll, 7667 const UChar *source, 7668 int32_t sLen, 7669 const UChar *target, 7670 int32_t tLen, 7671 UErrorCode *status) 7672 { 7673 U_ALIGN_CODE(16); 7674 int32_t strength = coll->strength; 7675 7676 int32_t sIndex = 0, tIndex = 0; 7677 UChar sChar = 0, tChar = 0; 7678 uint32_t sOrder=0, tOrder=0; 7679 7680 UBool endOfSource = FALSE; 7681 7682 uint32_t *elements = coll->latinOneCEs; 7683 7684 UBool haveContractions = FALSE; // if we have contractions in our string 7685 // we cannot do French secondary 7686 7687 // Do the primary level 7688 for(;;) { 7689 while(sOrder==0) { // this loop skips primary ignorables 7690 // sOrder=getNextlatinOneCE(source); 7691 if(sLen==-1) { // handling zero terminated strings 7692 sChar=source[sIndex++]; 7693 if(sChar==0) { 7694 endOfSource = TRUE; 7695 break; 7696 } 7697 } else { // handling strings with known length 7698 if(sIndex==sLen) { 7699 endOfSource = TRUE; 7700 break; 7701 } 7702 sChar=source[sIndex++]; 7703 } 7704 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 7705 //fprintf(stderr, "R"); 7706 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 7707 } 7708 sOrder = elements[sChar]; 7709 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special 7710 // specials can basically be either contractions or bail-out signs. If we get anything 7711 // else, we'll bail out anywasy 7712 if(getCETag(sOrder) == CONTRACTION_TAG) { 7713 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); 7714 haveContractions = TRUE; // if there are contractions, we cannot do French secondary 7715 // However, if there are contractions in the table, but we always use just one char, 7716 // we might be able to do French. This should be checked out. 
7717 } 7718 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 7719 //fprintf(stderr, "S"); 7720 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 7721 } 7722 } 7723 } 7724 7725 while(tOrder==0) { // this loop skips primary ignorables 7726 // tOrder=getNextlatinOneCE(target); 7727 if(tLen==-1) { // handling zero terminated strings 7728 tChar=target[tIndex++]; 7729 if(tChar==0) { 7730 if(endOfSource) { // this is different than source loop, 7731 // as we already know that source loop is done here, 7732 // so we can either finish the primary loop if both 7733 // strings are done or anounce the result if only 7734 // target is done. Same below. 7735 goto endOfPrimLoop; 7736 } else { 7737 return UCOL_GREATER; 7738 } 7739 } 7740 } else { // handling strings with known length 7741 if(tIndex==tLen) { 7742 if(endOfSource) { 7743 goto endOfPrimLoop; 7744 } else { 7745 return UCOL_GREATER; 7746 } 7747 } 7748 tChar=target[tIndex++]; 7749 } 7750 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 7751 //fprintf(stderr, "R"); 7752 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 7753 } 7754 tOrder = elements[tChar]; 7755 if(tOrder >= UCOL_NOT_FOUND) { 7756 // Handling specials, see the comments for source 7757 if(getCETag(tOrder) == CONTRACTION_TAG) { 7758 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); 7759 haveContractions = TRUE; 7760 } 7761 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 7762 //fprintf(stderr, "S"); 7763 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 7764 } 7765 } 7766 } 7767 if(endOfSource) { // source is finished, but target is not, say the result. 
7768 return UCOL_LESS; 7769 } 7770 7771 if(sOrder == tOrder) { // if we have same CEs, we continue the loop 7772 sOrder = 0; tOrder = 0; 7773 continue; 7774 } else { 7775 // compare current top bytes 7776 if(((sOrder^tOrder)&0xFF000000)!=0) { 7777 // top bytes differ, return difference 7778 if(sOrder < tOrder) { 7779 return UCOL_LESS; 7780 } else if(sOrder > tOrder) { 7781 return UCOL_GREATER; 7782 } 7783 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); 7784 // since we must return enum value 7785 } 7786 7787 // top bytes match, continue with following bytes 7788 sOrder<<=8; 7789 tOrder<<=8; 7790 } 7791 } 7792 7793 endOfPrimLoop: 7794 // after primary loop, we definitely know the sizes of strings, 7795 // so we set it and use simpler loop for secondaries and tertiaries 7796 sLen = sIndex; tLen = tIndex; 7797 if(strength >= UCOL_SECONDARY) { 7798 // adjust the table beggining 7799 elements += coll->latinOneTableLen; 7800 endOfSource = FALSE; 7801 7802 if(coll->frenchCollation == UCOL_OFF) { // non French 7803 // This loop is a simplified copy of primary loop 7804 // at this point we know that whole strings are latin-1, so we don't 7805 // check for that. We also know that we only have contractions as 7806 // specials. 
7807 sIndex = 0; tIndex = 0; 7808 for(;;) { 7809 while(sOrder==0) { 7810 if(sIndex==sLen) { 7811 endOfSource = TRUE; 7812 break; 7813 } 7814 sChar=source[sIndex++]; 7815 sOrder = elements[sChar]; 7816 if(sOrder > UCOL_NOT_FOUND) { 7817 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); 7818 } 7819 } 7820 7821 while(tOrder==0) { 7822 if(tIndex==tLen) { 7823 if(endOfSource) { 7824 goto endOfSecLoop; 7825 } else { 7826 return UCOL_GREATER; 7827 } 7828 } 7829 tChar=target[tIndex++]; 7830 tOrder = elements[tChar]; 7831 if(tOrder > UCOL_NOT_FOUND) { 7832 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); 7833 } 7834 } 7835 if(endOfSource) { 7836 return UCOL_LESS; 7837 } 7838 7839 if(sOrder == tOrder) { 7840 sOrder = 0; tOrder = 0; 7841 continue; 7842 } else { 7843 // see primary loop for comments on this 7844 if(((sOrder^tOrder)&0xFF000000)!=0) { 7845 if(sOrder < tOrder) { 7846 return UCOL_LESS; 7847 } else if(sOrder > tOrder) { 7848 return UCOL_GREATER; 7849 } 7850 } 7851 sOrder<<=8; 7852 tOrder<<=8; 7853 } 7854 } 7855 } else { // French 7856 if(haveContractions) { // if we have contractions, we have to bail out 7857 // since we don't really know how to handle them here 7858 return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 7859 } 7860 // For French, we go backwards 7861 sIndex = sLen; tIndex = tLen; 7862 for(;;) { 7863 while(sOrder==0) { 7864 if(sIndex==0) { 7865 endOfSource = TRUE; 7866 break; 7867 } 7868 sChar=source[--sIndex]; 7869 sOrder = elements[sChar]; 7870 // don't even look for contractions 7871 } 7872 7873 while(tOrder==0) { 7874 if(tIndex==0) { 7875 if(endOfSource) { 7876 goto endOfSecLoop; 7877 } else { 7878 return UCOL_GREATER; 7879 } 7880 } 7881 tChar=target[--tIndex]; 7882 tOrder = elements[tChar]; 7883 // don't even look for contractions 7884 } 7885 if(endOfSource) { 7886 return UCOL_LESS; 7887 } 7888 7889 if(sOrder == tOrder) { 7890 sOrder = 0; tOrder = 
0; 7891 continue; 7892 } else { 7893 // see the primary loop for comments 7894 if(((sOrder^tOrder)&0xFF000000)!=0) { 7895 if(sOrder < tOrder) { 7896 return UCOL_LESS; 7897 } else if(sOrder > tOrder) { 7898 return UCOL_GREATER; 7899 } 7900 } 7901 sOrder<<=8; 7902 tOrder<<=8; 7903 } 7904 } 7905 } 7906 } 7907 7908 endOfSecLoop: 7909 if(strength >= UCOL_TERTIARY) { 7910 // tertiary loop is the same as secondary (except no French) 7911 elements += coll->latinOneTableLen; 7912 sIndex = 0; tIndex = 0; 7913 endOfSource = FALSE; 7914 for(;;) { 7915 while(sOrder==0) { 7916 if(sIndex==sLen) { 7917 endOfSource = TRUE; 7918 break; 7919 } 7920 sChar=source[sIndex++]; 7921 sOrder = elements[sChar]; 7922 if(sOrder > UCOL_NOT_FOUND) { 7923 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); 7924 } 7925 } 7926 while(tOrder==0) { 7927 if(tIndex==tLen) { 7928 if(endOfSource) { 7929 return UCOL_EQUAL; // if both strings are at the end, they are equal 7930 } else { 7931 return UCOL_GREATER; 7932 } 7933 } 7934 tChar=target[tIndex++]; 7935 tOrder = elements[tChar]; 7936 if(tOrder > UCOL_NOT_FOUND) { 7937 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); 7938 } 7939 } 7940 if(endOfSource) { 7941 return UCOL_LESS; 7942 } 7943 if(sOrder == tOrder) { 7944 sOrder = 0; tOrder = 0; 7945 continue; 7946 } else { 7947 if(((sOrder^tOrder)&0xff000000)!=0) { 7948 if(sOrder < tOrder) { 7949 return UCOL_LESS; 7950 } else if(sOrder > tOrder) { 7951 return UCOL_GREATER; 7952 } 7953 } 7954 sOrder<<=8; 7955 tOrder<<=8; 7956 } 7957 } 7958 } 7959 return UCOL_EQUAL; 7960 } 7961 7962 7963 U_CAPI UCollationResult U_EXPORT2 7964 ucol_strcollIter( const UCollator *coll, 7965 UCharIterator *sIter, 7966 UCharIterator *tIter, 7967 UErrorCode *status) 7968 { 7969 if(!status || U_FAILURE(*status)) { 7970 return UCOL_EQUAL; 7971 } 7972 7973 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); 7974 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", 
coll, sIter, tIter); 7975 7976 if (sIter == tIter) { 7977 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 7978 return UCOL_EQUAL; 7979 } 7980 if(sIter == NULL || tIter == NULL || coll == NULL) { 7981 *status = U_ILLEGAL_ARGUMENT_ERROR; 7982 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 7983 return UCOL_EQUAL; 7984 } 7985 7986 UCollationResult result = UCOL_EQUAL; 7987 7988 // Preparing the context objects for iterating over strings 7989 collIterate sColl, tColl; 7990 IInit_collIterate(coll, NULL, -1, &sColl, status); 7991 IInit_collIterate(coll, NULL, -1, &tColl, status); 7992 if(U_FAILURE(*status)) { 7993 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 7994 return UCOL_EQUAL; 7995 } 7996 // The division for the array length may truncate the array size to 7997 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 7998 // for all platforms anyway. 7999 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8000 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8001 UNormIterator *sNormIter = NULL, *tNormIter = NULL; 8002 8003 sColl.iterator = sIter; 8004 sColl.flags |= UCOL_USE_ITERATOR; 8005 tColl.flags |= UCOL_USE_ITERATOR; 8006 tColl.iterator = tIter; 8007 8008 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 8009 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 8010 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); 8011 sColl.flags &= ~UCOL_ITER_NORM; 8012 8013 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 8014 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); 8015 tColl.flags &= ~UCOL_ITER_NORM; 8016 } 8017 8018 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; 8019 8020 while((sChar = sColl.iterator->next(sColl.iterator)) == 8021 (tChar = tColl.iterator->next(tColl.iterator))) { 8022 if(sChar == U_SENTINEL) { 8023 result = UCOL_EQUAL; 8024 goto end_compare; 8025 } 8026 } 8027 8028 if(sChar 
== U_SENTINEL) { 8029 tChar = tColl.iterator->previous(tColl.iterator); 8030 } 8031 8032 if(tChar == U_SENTINEL) { 8033 sChar = sColl.iterator->previous(sColl.iterator); 8034 } 8035 8036 sChar = sColl.iterator->previous(sColl.iterator); 8037 tChar = tColl.iterator->previous(tColl.iterator); 8038 8039 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) 8040 { 8041 // We are stopped in the middle of a contraction. 8042 // Scan backwards through the == part of the string looking for the start of the contraction. 8043 // It doesn't matter which string we scan, since they are the same in this region. 8044 do 8045 { 8046 sChar = sColl.iterator->previous(sColl.iterator); 8047 tChar = tColl.iterator->previous(tColl.iterator); 8048 } 8049 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); 8050 } 8051 8052 8053 if(U_SUCCESS(*status)) { 8054 result = ucol_strcollRegular(&sColl, &tColl, status); 8055 } 8056 8057 end_compare: 8058 if(sNormIter || tNormIter) { 8059 unorm_closeIter(sNormIter); 8060 unorm_closeIter(tNormIter); 8061 } 8062 8063 UTRACE_EXIT_VALUE_STATUS(result, *status) 8064 return result; 8065 } 8066 8067 8068 /* */ 8069 /* ucol_strcoll Main public API string comparison function */ 8070 /* */ 8071 U_CAPI UCollationResult U_EXPORT2 8072 ucol_strcoll( const UCollator *coll, 8073 const UChar *source, 8074 int32_t sourceLength, 8075 const UChar *target, 8076 int32_t targetLength) 8077 { 8078 U_ALIGN_CODE(16); 8079 8080 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); 8081 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 8082 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 8083 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); 8084 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); 8085 } 8086 8087 if(source == NULL || target == NULL) { 8088 // do not crash, but return. Should have 8089 // status argument to return error. 
8090 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8091 return UCOL_EQUAL; 8092 } 8093 8094 /* Quick check if source and target are same strings. */ 8095 /* They should either both be NULL terminated or the explicit length should be set on both. */ 8096 if (source==target && sourceLength==targetLength) { 8097 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8098 return UCOL_EQUAL; 8099 } 8100 8101 /* Scan the strings. Find: */ 8102 /* The length of any leading portion that is equal */ 8103 /* Whether they are exactly equal. (in which case we just return) */ 8104 const UChar *pSrc = source; 8105 const UChar *pTarg = target; 8106 int32_t equalLength; 8107 8108 if (sourceLength == -1 && targetLength == -1) { 8109 // Both strings are null terminated. 8110 // Scan through any leading equal portion. 8111 while (*pSrc == *pTarg && *pSrc != 0) { 8112 pSrc++; 8113 pTarg++; 8114 } 8115 if (*pSrc == 0 && *pTarg == 0) { 8116 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8117 return UCOL_EQUAL; 8118 } 8119 equalLength = (int32_t)(pSrc - source); 8120 } 8121 else 8122 { 8123 // One or both strings has an explicit length. 8124 const UChar *pSrcEnd = source + sourceLength; 8125 const UChar *pTargEnd = target + targetLength; 8126 8127 // Scan while the strings are bitwise ==, or until one is exhausted. 8128 for (;;) { 8129 if (pSrc == pSrcEnd || pTarg == pTargEnd) { 8130 break; 8131 } 8132 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 8133 break; 8134 } 8135 if (*pSrc != *pTarg) { 8136 break; 8137 } 8138 pSrc++; 8139 pTarg++; 8140 } 8141 equalLength = (int32_t)(pSrc - source); 8142 8143 // If we made it all the way through both strings, we are done. They are == 8144 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. 
*/ 8145 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */ 8146 { 8147 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8148 return UCOL_EQUAL; 8149 } 8150 } 8151 if (equalLength > 0) { 8152 /* There is an identical portion at the beginning of the two strings. */ 8153 /* If the identical portion ends within a contraction or a comibining */ 8154 /* character sequence, back up to the start of that sequence. */ 8155 8156 // These values should already be set by the code above. 8157 //pSrc = source + equalLength; /* point to the first differing chars */ 8158 //pTarg = target + equalLength; 8159 if ((pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) || 8160 (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))) 8161 { 8162 // We are stopped in the middle of a contraction. 8163 // Scan backwards through the == part of the string looking for the start of the contraction. 8164 // It doesn't matter which string we scan, since they are the same in this region. 
8165 do 8166 { 8167 equalLength--; 8168 pSrc--; 8169 } 8170 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); 8171 } 8172 8173 source += equalLength; 8174 target += equalLength; 8175 if (sourceLength > 0) { 8176 sourceLength -= equalLength; 8177 } 8178 if (targetLength > 0) { 8179 targetLength -= equalLength; 8180 } 8181 } 8182 8183 UErrorCode status = U_ZERO_ERROR; 8184 UCollationResult returnVal; 8185 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { 8186 returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status); 8187 } else { 8188 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); 8189 } 8190 UTRACE_EXIT_VALUE(returnVal); 8191 return returnVal; 8192 } 8193 8194 /* convenience function for comparing strings */ 8195 U_CAPI UBool U_EXPORT2 8196 ucol_greater( const UCollator *coll, 8197 const UChar *source, 8198 int32_t sourceLength, 8199 const UChar *target, 8200 int32_t targetLength) 8201 { 8202 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8203 == UCOL_GREATER); 8204 } 8205 8206 /* convenience function for comparing strings */ 8207 U_CAPI UBool U_EXPORT2 8208 ucol_greaterOrEqual( const UCollator *coll, 8209 const UChar *source, 8210 int32_t sourceLength, 8211 const UChar *target, 8212 int32_t targetLength) 8213 { 8214 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8215 != UCOL_LESS); 8216 } 8217 8218 /* convenience function for comparing strings */ 8219 U_CAPI UBool U_EXPORT2 8220 ucol_equal( const UCollator *coll, 8221 const UChar *source, 8222 int32_t sourceLength, 8223 const UChar *target, 8224 int32_t targetLength) 8225 { 8226 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8227 == UCOL_EQUAL); 8228 } 8229 8230 U_CAPI void U_EXPORT2 8231 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { 8232 if(coll && coll->UCA) { 8233 uprv_memcpy(info, 
coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); 8234 } 8235 } 8236 8237 #endif /* #if !UCONFIG_NO_COLLATION */ 8238