1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: ucol.cpp 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * Modification history 14 * Date Name Comments 15 * 1996-1999 various members of ICU team maintained C API for collation framework 16 * 02/16/2001 synwee Added internal method getPrevSpecialCE 17 * 03/01/2001 synwee Added maxexpansion functionality. 18 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant 19 * 2012-2014 markus Rewritten in C++ again. 20 */ 21 22 #include "unicode/utypes.h" 23 24 #if !UCONFIG_NO_COLLATION 25 26 #include "unicode/coll.h" 27 #include "unicode/tblcoll.h" 28 #include "unicode/bytestream.h" 29 #include "unicode/coleitr.h" 30 #include "unicode/ucoleitr.h" 31 #include "unicode/ustring.h" 32 #include "cmemory.h" 33 #include "collation.h" 34 #include "cstring.h" 35 #include "putilimp.h" 36 #include "uassert.h" 37 #include "utracimp.h" 38 39 U_NAMESPACE_USE 40 41 U_CAPI UCollator* U_EXPORT2 42 ucol_openBinary(const uint8_t *bin, int32_t length, 43 const UCollator *base, 44 UErrorCode *status) 45 { 46 if(U_FAILURE(*status)) { return NULL; } 47 RuleBasedCollator *coll = new RuleBasedCollator( 48 bin, length, 49 RuleBasedCollator::rbcFromUCollator(base), 50 *status); 51 if(coll == NULL) { 52 *status = U_MEMORY_ALLOCATION_ERROR; 53 return NULL; 54 } 55 if(U_FAILURE(*status)) { 56 delete coll; 57 return NULL; 58 } 59 return coll->toUCollator(); 60 } 61 62 U_CAPI int32_t U_EXPORT2 63 ucol_cloneBinary(const UCollator *coll, 64 uint8_t *buffer, int32_t capacity, 65 UErrorCode *status) 66 { 67 if(U_FAILURE(*status)) { 68 return 0; 69 } 70 const 
RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); 71 if(rbc == NULL && coll != NULL) { 72 *status = U_UNSUPPORTED_ERROR; 73 return 0; 74 } 75 return rbc->cloneBinary(buffer, capacity, *status); 76 } 77 78 U_CAPI UCollator* U_EXPORT2 79 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status) 80 { 81 if (status == NULL || U_FAILURE(*status)){ 82 return NULL; 83 } 84 if (coll == NULL) { 85 *status = U_ILLEGAL_ARGUMENT_ERROR; 86 return NULL; 87 } 88 if (pBufferSize != NULL) { 89 int32_t inputSize = *pBufferSize; 90 *pBufferSize = 1; 91 if (inputSize == 0) { 92 return NULL; // preflighting for deprecated functionality 93 } 94 } 95 Collator *newColl = Collator::fromUCollator(coll)->clone(); 96 if (newColl == NULL) { 97 *status = U_MEMORY_ALLOCATION_ERROR; 98 return nullptr; 99 } else { 100 *status = U_SAFECLONE_ALLOCATED_WARNING; 101 } 102 return newColl->toUCollator(); 103 } 104 105 U_CAPI void U_EXPORT2 106 ucol_close(UCollator *coll) 107 { 108 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 109 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 110 if(coll != NULL) { 111 delete Collator::fromUCollator(coll); 112 } 113 UTRACE_EXIT(); 114 } 115 116 U_CAPI int32_t U_EXPORT2 117 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 118 const uint8_t *src2, int32_t src2Length, 119 uint8_t *dest, int32_t destCapacity) { 120 /* check arguments */ 121 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 122 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 123 destCapacity<0 || (destCapacity>0 && dest==NULL) 124 ) { 125 /* error, attempt to write a zero byte and return 0 */ 126 if(dest!=NULL && destCapacity>0) { 127 *dest=0; 128 } 129 return 0; 130 } 131 132 /* check lengths and capacity */ 133 if(src1Length<0) { 134 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 135 } 136 if(src2Length<0) { 137 
src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 138 } 139 140 int32_t destLength=src1Length+src2Length; 141 if(destLength>destCapacity) { 142 /* the merged sort key does not fit into the destination */ 143 return destLength; 144 } 145 146 /* merge the sort keys with the same number of levels */ 147 uint8_t *p=dest; 148 for(;;) { 149 /* copy level from src1 not including 00 or 01 */ 150 uint8_t b; 151 while((b=*src1)>=2) { 152 ++src1; 153 *p++=b; 154 } 155 156 /* add a 02 merge separator */ 157 *p++=2; 158 159 /* copy level from src2 not including 00 or 01 */ 160 while((b=*src2)>=2) { 161 ++src2; 162 *p++=b; 163 } 164 165 /* if both sort keys have another level, then add a 01 level separator and continue */ 166 if(*src1==1 && *src2==1) { 167 ++src1; 168 ++src2; 169 *p++=1; 170 } else { 171 break; 172 } 173 } 174 175 /* 176 * here, at least one sort key is finished now, but the other one 177 * might have some contents left from containing more levels; 178 * that contents is just appended to the result 179 */ 180 if(*src1!=0) { 181 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 182 src2=src1; 183 } 184 /* append src2, "the other, unfinished sort key" */ 185 while((*p++=*src2++)!=0) {} 186 187 /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */ 188 return (int32_t)(p-dest); 189 } 190 191 U_CAPI int32_t U_EXPORT2 192 ucol_getSortKey(const UCollator *coll, 193 const UChar *source, 194 int32_t sourceLength, 195 uint8_t *result, 196 int32_t resultLength) 197 { 198 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 199 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 200 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 201 ((sourceLength==-1 && source!=NULL) ? 
u_strlen(source) : sourceLength)); 202 } 203 204 int32_t keySize = Collator::fromUCollator(coll)-> 205 getSortKey(source, sourceLength, result, resultLength); 206 207 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 208 UTRACE_EXIT_VALUE(keySize); 209 return keySize; 210 } 211 212 U_CAPI int32_t U_EXPORT2 213 ucol_nextSortKeyPart(const UCollator *coll, 214 UCharIterator *iter, 215 uint32_t state[2], 216 uint8_t *dest, int32_t count, 217 UErrorCode *status) 218 { 219 /* error checking */ 220 if(status==NULL || U_FAILURE(*status)) { 221 return 0; 222 } 223 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 224 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 225 coll, iter, state[0], state[1], dest, count); 226 227 int32_t i = Collator::fromUCollator(coll)-> 228 internalNextSortKeyPart(iter, state, dest, count, *status); 229 230 // Return number of meaningful sortkey bytes. 231 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 232 dest,i, state[0], state[1]); 233 UTRACE_EXIT_VALUE_STATUS(i, *status); 234 return i; 235 } 236 237 /** 238 * Produce a bound for a given sortkey and a number of levels. 
239 */ 240 U_CAPI int32_t U_EXPORT2 241 ucol_getBound(const uint8_t *source, 242 int32_t sourceLength, 243 UColBoundMode boundType, 244 uint32_t noOfLevels, 245 uint8_t *result, 246 int32_t resultLength, 247 UErrorCode *status) 248 { 249 // consistency checks 250 if(status == NULL || U_FAILURE(*status)) { 251 return 0; 252 } 253 if(source == NULL) { 254 *status = U_ILLEGAL_ARGUMENT_ERROR; 255 return 0; 256 } 257 258 int32_t sourceIndex = 0; 259 // Scan the string until we skip enough of the key OR reach the end of the key 260 do { 261 sourceIndex++; 262 if(source[sourceIndex] == Collation::LEVEL_SEPARATOR_BYTE) { 263 noOfLevels--; 264 } 265 } while (noOfLevels > 0 266 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 267 268 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 269 && noOfLevels > 0) { 270 *status = U_SORT_KEY_TOO_SHORT_WARNING; 271 } 272 273 274 // READ ME: this code assumes that the values for boundType 275 // enum will not changes. They are set so that the enum value 276 // corresponds to the number of extra bytes each bound type 277 // needs. 278 if(result != NULL && resultLength >= sourceIndex+boundType) { 279 uprv_memcpy(result, source, sourceIndex); 280 switch(boundType) { 281 // Lower bound just gets terminated. 
No extra bytes 282 case UCOL_BOUND_LOWER: // = 0 283 break; 284 // Upper bound needs one extra byte 285 case UCOL_BOUND_UPPER: // = 1 286 result[sourceIndex++] = 2; 287 break; 288 // Upper long bound needs two extra bytes 289 case UCOL_BOUND_UPPER_LONG: // = 2 290 result[sourceIndex++] = 0xFF; 291 result[sourceIndex++] = 0xFF; 292 break; 293 default: 294 *status = U_ILLEGAL_ARGUMENT_ERROR; 295 return 0; 296 } 297 result[sourceIndex++] = 0; 298 299 return sourceIndex; 300 } else { 301 return sourceIndex+boundType+1; 302 } 303 } 304 305 U_CAPI void U_EXPORT2 306 ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCode) { 307 if(U_FAILURE(*pErrorCode)) { return; } 308 Collator::fromUCollator(coll)->setMaxVariable(group, *pErrorCode); 309 } 310 311 U_CAPI UColReorderCode U_EXPORT2 312 ucol_getMaxVariable(const UCollator *coll) { 313 return Collator::fromUCollator(coll)->getMaxVariable(); 314 } 315 316 U_CAPI uint32_t U_EXPORT2 317 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 318 if(U_FAILURE(*status) || coll == NULL) { 319 return 0; 320 } 321 return Collator::fromUCollator(coll)->setVariableTop(varTop, len, *status); 322 } 323 324 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 325 if(U_FAILURE(*status) || coll == NULL) { 326 return 0; 327 } 328 return Collator::fromUCollator(coll)->getVariableTop(*status); 329 } 330 331 U_CAPI void U_EXPORT2 332 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 333 if(U_FAILURE(*status) || coll == NULL) { 334 return; 335 } 336 Collator::fromUCollator(coll)->setVariableTop(varTop, *status); 337 } 338 339 U_CAPI void U_EXPORT2 340 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 341 if(U_FAILURE(*status) || coll == NULL) { 342 return; 343 } 344 345 Collator::fromUCollator(coll)->setAttribute(attr, value, *status); 346 } 347 348 
/**
 * Reads a collation attribute via Collator::getAttribute().
 *
 * @param coll   the collator to query
 * @param attr   the attribute to read
 * @param status in/out error code
 * @return the attribute value, or UCOL_DEFAULT if status already indicates a
 *         failure or coll is null
 */
U_CAPI UColAttributeValue  U_EXPORT2
ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return UCOL_DEFAULT;
    }
    if (coll == nullptr) {
        return UCOL_DEFAULT;
    }
    const Collator *collator = Collator::fromUCollator(coll);
    return collator->getAttribute(attr, *status);
}

/**
 * Convenience setter for UCOL_STRENGTH; any error from ucol_setAttribute() is
 * discarded.
 *
 * @param coll     the collator to modify
 * @param strength the new strength
 */
U_CAPI void U_EXPORT2
ucol_setStrength(    UCollator                *coll,
            UCollationStrength        strength)
{
    UErrorCode ignoredStatus = U_ZERO_ERROR;
    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &ignoredStatus);
}

/**
 * Convenience getter for UCOL_STRENGTH; any error from ucol_getAttribute() is
 * discarded.
 *
 * @param coll the collator to query
 * @return the value ucol_getAttribute() reports for UCOL_STRENGTH
 */
U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator *coll)
{
    UErrorCode ignoredStatus = U_ZERO_ERROR;
    return ucol_getAttribute(coll, UCOL_STRENGTH, &ignoredStatus);
}

/**
 * Retrieves the reorder codes via Collator::getReorderCodes().
 *
 * @param coll         the collator to query
 * @param dest         destination array
 * @param destCapacity capacity of dest
 * @param status       in/out error code
 * @return the collator's return value, or 0 if status already indicates a failure
 */
U_CAPI int32_t U_EXPORT2
ucol_getReorderCodes(const UCollator *coll,
                    int32_t *dest,
                    int32_t destCapacity,
                    UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return 0;
    }
    const Collator *collator = Collator::fromUCollator(coll);
    return collator->getReorderCodes(dest, destCapacity, *status);
}

/**
 * Installs reorder codes via Collator::setReorderCodes().
 *
 * @param coll               the collator to modify
 * @param reorderCodes       array of reorder codes
 * @param reorderCodesLength number of entries in reorderCodes
 * @param status             in/out error code; nothing is done on a prior failure
 */
U_CAPI void U_EXPORT2
ucol_setReorderCodes(UCollator* coll,
                    const int32_t* reorderCodes,
                    int32_t reorderCodesLength,
                    UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return;
    }
    Collator *collator = Collator::fromUCollator(coll);
    collator->setReorderCodes(reorderCodes, reorderCodesLength, *status);
}

/**
 * Forwards to the static Collator::getEquivalentReorderCodes().
 *
 * @param reorderCode  the reorder code to look up
 * @param dest         destination array
 * @param destCapacity capacity of dest
 * @param pErrorCode   in/out error code, handed to the Collator function as is
 * @return the value returned by Collator::getEquivalentReorderCodes()
 */
U_CAPI int32_t U_EXPORT2
ucol_getEquivalentReorderCodes(int32_t reorderCode,
                    int32_t* dest,
                    int32_t destCapacity,
                    UErrorCode *pErrorCode) {
    return Collator::getEquivalentReorderCodes(
            reorderCode, dest, destCapacity, *pErrorCode);
}

/**
 * Fills in the collator's version via Collator::getVersion().
 *
 * @param coll        the collator to query
 * @param versionInfo receives the version
 */
U_CAPI void U_EXPORT2
ucol_getVersion(const UCollator* coll,
                UVersionInfo versionInfo)
{
    const Collator *collator = Collator::fromUCollator(coll);
    collator->getVersion(versionInfo);
}

/**
 * Compares the text of two character iterators via Collator::compare().
 *
 * @param coll   the collator
 * @param sIter  iterator over the source text
 * @param tIter  iterator over the target text
 * @param status in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR if coll or
 *               either iterator is null
 * @return the comparison result, or UCOL_EQUAL on any early exit
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcollIter( const UCollator    *coll,
                 UCharIterator *sIter,
                 UCharIterator *tIter,
                 UErrorCode         *status)
{
    if (status == nullptr || U_FAILURE(*status)) {
        return UCOL_EQUAL;
    }

    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
    UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);

    // All three pointers are required.
    if (sIter == nullptr || tIter == nullptr || coll == nullptr) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
        return UCOL_EQUAL;
    }

    const Collator *collator = Collator::fromUCollator(coll);
    UCollationResult order = collator->compare(*sIter, *tIter, *status);

    UTRACE_EXIT_VALUE_STATUS(order, *status);
    return order;
}


/*                                                                      */
/* ucol_strcoll     Main public API string comparison function          */
/*                                                                      */
/**
 * Compares two UTF-16 strings via Collator::compare(). The error code used for
 * the comparison is local and is only reported through tracing.
 *
 * @param coll         the collator
 * @param source       the source string
 * @param sourceLength length of source, passed through unchanged
 * @param target       the target string
 * @param targetLength length of target, passed through unchanged
 * @return the comparison result from the collator
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcoll( const UCollator    *coll,
              const UChar        *source,
              int32_t            sourceLength,
              const UChar        *target,
              int32_t            targetLength)
{
    UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
    }

    UErrorCode localStatus = U_ZERO_ERROR;
    const Collator *collator = Collator::fromUCollator(coll);
    UCollationResult order =
            collator->compare(source, sourceLength, target, targetLength, localStatus);
    UTRACE_EXIT_VALUE_STATUS(order, localStatus);
    return order;
}

/**
 * Compares two UTF-8 strings via Collator::internalCompareUTF8().
 *
 * @param coll         the collator
 * @param source       the source string
 * @param sourceLength length of source, passed through unchanged
 * @param target       the target string
 * @param targetLength length of target, passed through unchanged
 * @param status       in/out error code
 * @return the comparison result, or UCOL_EQUAL if status already indicates a failure
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcollUTF8(
        const UCollator *coll,
        const char      *source,
        int32_t         sourceLength,
        const char      *target,
        int32_t         targetLength,
        UErrorCode      *status)
{
    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
    }

    if (U_FAILURE(*status)) {
        /* do nothing */
        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
        return UCOL_EQUAL;
    }

    const Collator *collator = Collator::fromUCollator(coll);
    UCollationResult order = collator->internalCompareUTF8(
            source, sourceLength, target, targetLength, *status);
    UTRACE_EXIT_VALUE_STATUS(order, *status);
    return order;
}


/* convenience function for comparing strings */
/** @return whether ucol_strcoll() reports UCOL_GREATER for source vs. target */
U_CAPI UBool U_EXPORT2
ucol_greater(    const    UCollator        *coll,
        const    UChar            *source,
        int32_t            sourceLength,
        const    UChar            *target,
        int32_t            targetLength)
{
    const UCollationResult order =
            ucol_strcoll(coll, source, sourceLength, target, targetLength);
    return order == UCOL_GREATER;
}

/* convenience function for comparing strings */
/** @return whether ucol_strcoll() reports anything other than UCOL_LESS */
U_CAPI UBool U_EXPORT2
ucol_greaterOrEqual(    const    UCollator    *coll,
            const    UChar        *source,
            int32_t        sourceLength,
            const    UChar        *target,
            int32_t        targetLength)
{
    const UCollationResult order =
            ucol_strcoll(coll, source, sourceLength, target, targetLength);
    return order != UCOL_LESS;
}

/* convenience function for comparing strings */
/** @return whether ucol_strcoll() reports UCOL_EQUAL for source vs. target */
U_CAPI UBool U_EXPORT2
ucol_equal(        const    UCollator        *coll,
            const    UChar            *source,
            int32_t            sourceLength,
            const    UChar            *target,
            int32_t            targetLength)
{
    const UCollationResult order =
            ucol_strcoll(coll, source, sourceLength, target, targetLength);
    return order == UCOL_EQUAL;
}

/**
 * Derives the UCA version from the collator's overall version.
 *
 * @param coll the collator to query; if it maps to a null Collator, info is left
 *             untouched
 * @param info receives the UCA version
 */
U_CAPI void U_EXPORT2
ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
    const Collator *collator = Collator::fromUCollator(coll);
    if (collator == nullptr) {
        return;
    }
    UVersionInfo full;
    collator->getVersion(full);
    // Note: This is tied to how the current implementation encodes the UCA version
    // in the overall getVersion().
    // Alternatively, we could load the root collator and get at lower-level data from there.
    // Either way, it will reflect the input collator's UCA version only
    // if it is a known implementation.
    // It would be cleaner to make this a virtual Collator method.
    info[0] = full[1] >> 3;
    info[1] = full[1] & 7;
    info[2] = full[2] >> 6;
    info[3] = 0;
}

/**
 * Returns the collator's rule string.
 *
 * @param coll   the collator to query
 * @param length receives the length of the returned string
 * @return the rules of a rule-based collator, or a pointer to a static empty
 *         string if a non-null coll is not rule-based
 */
U_CAPI const UChar * U_EXPORT2
ucol_getRules(const UCollator *coll, int32_t *length) {
    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
    if (rbc == nullptr && coll != nullptr) {
        // Not a rule-based collator: report an empty rule string.
        static const UChar _NUL = 0;
        *length = 0;
        return &_NUL;
    }
    // OK to crash if coll==NULL: We do not want to check "this" pointers.
    const UnicodeString &ruleString = rbc->getRules();
    U_ASSERT(ruleString.getBuffer()[ruleString.length()] == 0);
    *length = ruleString.length();
    return ruleString.getBuffer();
}

/**
 * Copies the collator's rules, selected by delta, into a caller buffer.
 *
 * @param coll      the collator to query
 * @param delta     rule option passed to RuleBasedCollator::getRules()
 * @param buffer    destination buffer, may be null
 * @param bufferLen capacity of buffer
 * @return the result of UnicodeString::extract() when a usable buffer is given,
 *         otherwise the length of the rule string
 */
U_CAPI int32_t U_EXPORT2
ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
    UnicodeString ruleString;
    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
    // Skipped only for a non-null collator that is not rule-based.
    if (!(rbc == nullptr && coll != nullptr)) {
        rbc->getRules(delta, ruleString);
    }
    if (buffer == nullptr || bufferLen <= 0) {
        return ruleString.length();
    }
    UErrorCode errorCode = U_ZERO_ERROR;
    return ruleString.extract(buffer, bufferLen, errorCode);
}

/**
 * Same as ucol_getLocaleByType(); simply forwards to it.
 */
U_CAPI const char * U_EXPORT2
ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
    return ucol_getLocaleByType(coll, type, status);
}

/**
 * Returns a locale ID of the collator via
 * RuleBasedCollator::internalGetLocaleID().
 *
 * @param coll   the collator to query
 * @param type   which locale to return
 * @param status in/out error code; set to U_UNSUPPORTED_ERROR when a non-null
 *               coll is not rule-based
 * @return the locale ID, or null on error
 */
U_CAPI const char * U_EXPORT2
ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return nullptr;
    }
    UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE);
    UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll);

    const char *localeID = nullptr;
    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
    if (rbc != nullptr || coll == nullptr) {
        localeID = rbc->internalGetLocaleID(type, *status);
    } else {
        *status = U_UNSUPPORTED_ERROR;
    }

    UTRACE_DATA1(UTRACE_INFO, "result = %s", localeID);
    UTRACE_EXIT_STATUS(*status);
    return localeID;
}

U_CAPI USet * U_EXPORT2 603 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) { 604 if(U_FAILURE(*status)) { 605 return NULL; 606 } 607 UnicodeSet *set = Collator::fromUCollator(coll)->getTailoredSet(*status); 608 if(U_FAILURE(*status)) { 609 delete set; 610 return NULL; 611 } 612 return set->toUSet(); 613 } 614 615 U_CAPI UBool U_EXPORT2 616 ucol_equals(const UCollator *source, const UCollator *target) { 617 return source == target || 618 (*Collator::fromUCollator(source)) == (*Collator::fromUCollator(target)); 619 } 620 621 #endif /* #if !UCONFIG_NO_COLLATION */ 622