1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: ucol.cpp 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * Modification history 14 * Date Name Comments 15 * 1996-1999 various members of ICU team maintained C API for collation framework 16 * 02/16/2001 synwee Added internal method getPrevSpecialCE 17 * 03/01/2001 synwee Added maxexpansion functionality. 18 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant 19 * 2012-2014 markus Rewritten in C++ again. 20 */ 21 22 #include "unicode/utypes.h" 23 24 #if !UCONFIG_NO_COLLATION 25 26 #include "unicode/coll.h" 27 #include "unicode/tblcoll.h" 28 #include "unicode/bytestream.h" 29 #include "unicode/coleitr.h" 30 #include "unicode/ucoleitr.h" 31 #include "unicode/ustring.h" 32 #include "cmemory.h" 33 #include "collation.h" 34 #include "cstring.h" 35 #include "putilimp.h" 36 #include "uassert.h" 37 #include "utracimp.h" 38 39 U_NAMESPACE_USE 40 41 U_CAPI UCollator* U_EXPORT2 42 ucol_openBinary(const uint8_t *bin, int32_t length, 43 const UCollator *base, 44 UErrorCode *status) 45 { 46 if(U_FAILURE(*status)) { return NULL; } 47 RuleBasedCollator *coll = new RuleBasedCollator( 48 bin, length, 49 RuleBasedCollator::rbcFromUCollator(base), 50 *status); 51 if(coll == NULL) { 52 *status = U_MEMORY_ALLOCATION_ERROR; 53 return NULL; 54 } 55 if(U_FAILURE(*status)) { 56 delete coll; 57 return NULL; 58 } 59 return coll->toUCollator(); 60 } 61 62 U_CAPI int32_t U_EXPORT2 63 ucol_cloneBinary(const UCollator *coll, 64 uint8_t *buffer, int32_t capacity, 65 UErrorCode *status) 66 { 67 if(U_FAILURE(*status)) { 68 return 0; 69 } 70 const 
RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); 71 if(rbc == NULL && coll != NULL) { 72 *status = U_UNSUPPORTED_ERROR; 73 return 0; 74 } 75 return rbc->cloneBinary(buffer, capacity, *status); 76 } 77 78 U_CAPI UCollator* U_EXPORT2 79 ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status) 80 { 81 if (status == NULL || U_FAILURE(*status)){ 82 return NULL; 83 } 84 if (coll == NULL) { 85 *status = U_ILLEGAL_ARGUMENT_ERROR; 86 return NULL; 87 } 88 if (pBufferSize != NULL) { 89 int32_t inputSize = *pBufferSize; 90 *pBufferSize = 1; 91 if (inputSize == 0) { 92 return NULL; // preflighting for deprecated functionality 93 } 94 } 95 Collator *newColl = Collator::fromUCollator(coll)->clone(); 96 if (newColl == NULL) { 97 *status = U_MEMORY_ALLOCATION_ERROR; 98 } else { 99 *status = U_SAFECLONE_ALLOCATED_WARNING; 100 } 101 return newColl->toUCollator(); 102 } 103 104 U_CAPI void U_EXPORT2 105 ucol_close(UCollator *coll) 106 { 107 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 108 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 109 if(coll != NULL) { 110 delete Collator::fromUCollator(coll); 111 } 112 UTRACE_EXIT(); 113 } 114 115 U_CAPI int32_t U_EXPORT2 116 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 117 const uint8_t *src2, int32_t src2Length, 118 uint8_t *dest, int32_t destCapacity) { 119 /* check arguments */ 120 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 121 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 122 destCapacity<0 || (destCapacity>0 && dest==NULL) 123 ) { 124 /* error, attempt to write a zero byte and return 0 */ 125 if(dest!=NULL && destCapacity>0) { 126 *dest=0; 127 } 128 return 0; 129 } 130 131 /* check lengths and capacity */ 132 if(src1Length<0) { 133 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 134 } 135 if(src2Length<0) { 136 src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 
137 } 138 139 int32_t destLength=src1Length+src2Length; 140 if(destLength>destCapacity) { 141 /* the merged sort key does not fit into the destination */ 142 return destLength; 143 } 144 145 /* merge the sort keys with the same number of levels */ 146 uint8_t *p=dest; 147 for(;;) { 148 /* copy level from src1 not including 00 or 01 */ 149 uint8_t b; 150 while((b=*src1)>=2) { 151 ++src1; 152 *p++=b; 153 } 154 155 /* add a 02 merge separator */ 156 *p++=2; 157 158 /* copy level from src2 not including 00 or 01 */ 159 while((b=*src2)>=2) { 160 ++src2; 161 *p++=b; 162 } 163 164 /* if both sort keys have another level, then add a 01 level separator and continue */ 165 if(*src1==1 && *src2==1) { 166 ++src1; 167 ++src2; 168 *p++=1; 169 } else { 170 break; 171 } 172 } 173 174 /* 175 * here, at least one sort key is finished now, but the other one 176 * might have some contents left from containing more levels; 177 * that contents is just appended to the result 178 */ 179 if(*src1!=0) { 180 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 181 src2=src1; 182 } 183 /* append src2, "the other, unfinished sort key" */ 184 while((*p++=*src2++)!=0) {} 185 186 /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */ 187 return (int32_t)(p-dest); 188 } 189 190 U_CAPI int32_t U_EXPORT2 191 ucol_getSortKey(const UCollator *coll, 192 const UChar *source, 193 int32_t sourceLength, 194 uint8_t *result, 195 int32_t resultLength) 196 { 197 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 198 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 199 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 200 ((sourceLength==-1 && source!=NULL) ? 
u_strlen(source) : sourceLength)); 201 } 202 203 int32_t keySize = Collator::fromUCollator(coll)-> 204 getSortKey(source, sourceLength, result, resultLength); 205 206 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 207 UTRACE_EXIT_VALUE(keySize); 208 return keySize; 209 } 210 211 U_CAPI int32_t U_EXPORT2 212 ucol_nextSortKeyPart(const UCollator *coll, 213 UCharIterator *iter, 214 uint32_t state[2], 215 uint8_t *dest, int32_t count, 216 UErrorCode *status) 217 { 218 /* error checking */ 219 if(status==NULL || U_FAILURE(*status)) { 220 return 0; 221 } 222 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 223 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 224 coll, iter, state[0], state[1], dest, count); 225 226 int32_t i = Collator::fromUCollator(coll)-> 227 internalNextSortKeyPart(iter, state, dest, count, *status); 228 229 // Return number of meaningful sortkey bytes. 230 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 231 dest,i, state[0], state[1]); 232 UTRACE_EXIT_VALUE_STATUS(i, *status); 233 return i; 234 } 235 236 /** 237 * Produce a bound for a given sortkey and a number of levels. 
238 */ 239 U_CAPI int32_t U_EXPORT2 240 ucol_getBound(const uint8_t *source, 241 int32_t sourceLength, 242 UColBoundMode boundType, 243 uint32_t noOfLevels, 244 uint8_t *result, 245 int32_t resultLength, 246 UErrorCode *status) 247 { 248 // consistency checks 249 if(status == NULL || U_FAILURE(*status)) { 250 return 0; 251 } 252 if(source == NULL) { 253 *status = U_ILLEGAL_ARGUMENT_ERROR; 254 return 0; 255 } 256 257 int32_t sourceIndex = 0; 258 // Scan the string until we skip enough of the key OR reach the end of the key 259 do { 260 sourceIndex++; 261 if(source[sourceIndex] == Collation::LEVEL_SEPARATOR_BYTE) { 262 noOfLevels--; 263 } 264 } while (noOfLevels > 0 265 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 266 267 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 268 && noOfLevels > 0) { 269 *status = U_SORT_KEY_TOO_SHORT_WARNING; 270 } 271 272 273 // READ ME: this code assumes that the values for boundType 274 // enum will not changes. They are set so that the enum value 275 // corresponds to the number of extra bytes each bound type 276 // needs. 277 if(result != NULL && resultLength >= sourceIndex+boundType) { 278 uprv_memcpy(result, source, sourceIndex); 279 switch(boundType) { 280 // Lower bound just gets terminated. 
No extra bytes 281 case UCOL_BOUND_LOWER: // = 0 282 break; 283 // Upper bound needs one extra byte 284 case UCOL_BOUND_UPPER: // = 1 285 result[sourceIndex++] = 2; 286 break; 287 // Upper long bound needs two extra bytes 288 case UCOL_BOUND_UPPER_LONG: // = 2 289 result[sourceIndex++] = 0xFF; 290 result[sourceIndex++] = 0xFF; 291 break; 292 default: 293 *status = U_ILLEGAL_ARGUMENT_ERROR; 294 return 0; 295 } 296 result[sourceIndex++] = 0; 297 298 return sourceIndex; 299 } else { 300 return sourceIndex+boundType+1; 301 } 302 } 303 304 U_CAPI void U_EXPORT2 305 ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCode) { 306 if(U_FAILURE(*pErrorCode)) { return; } 307 Collator::fromUCollator(coll)->setMaxVariable(group, *pErrorCode); 308 } 309 310 U_CAPI UColReorderCode U_EXPORT2 311 ucol_getMaxVariable(const UCollator *coll) { 312 return Collator::fromUCollator(coll)->getMaxVariable(); 313 } 314 315 U_CAPI uint32_t U_EXPORT2 316 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 317 if(U_FAILURE(*status) || coll == NULL) { 318 return 0; 319 } 320 return Collator::fromUCollator(coll)->setVariableTop(varTop, len, *status); 321 } 322 323 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 324 if(U_FAILURE(*status) || coll == NULL) { 325 return 0; 326 } 327 return Collator::fromUCollator(coll)->getVariableTop(*status); 328 } 329 330 U_CAPI void U_EXPORT2 331 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 332 if(U_FAILURE(*status) || coll == NULL) { 333 return; 334 } 335 Collator::fromUCollator(coll)->setVariableTop(varTop, *status); 336 } 337 338 U_CAPI void U_EXPORT2 339 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 340 if(U_FAILURE(*status) || coll == NULL) { 341 return; 342 } 343 344 Collator::fromUCollator(coll)->setAttribute(attr, value, *status); 345 } 346 347 
/**
 * Read a collation attribute via Collator::getAttribute().
 *
 * @return UCOL_DEFAULT if *status is already a failure or `coll` is NULL;
 *         otherwise the value returned by getAttribute().
 */
U_CAPI UColAttributeValue  U_EXPORT2
ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return UCOL_DEFAULT;
    }
    if(coll == NULL) {
        return UCOL_DEFAULT;
    }

    const Collator *collator = Collator::fromUCollator(coll);
    return collator->getAttribute(attr, *status);
}

/**
 * Set the UCOL_STRENGTH attribute through ucol_setAttribute().
 * Any error reported by the setter is discarded.
 */
U_CAPI void U_EXPORT2
ucol_setStrength(    UCollator                *coll,
            UCollationStrength        strength)
{
    UErrorCode ignoredStatus = U_ZERO_ERROR;
    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &ignoredStatus);
}

/**
 * Return the UCOL_STRENGTH attribute through ucol_getAttribute().
 * Any error reported by the getter is discarded.
 */
U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator *coll)
{
    UErrorCode ignoredStatus = U_ZERO_ERROR;
    return ucol_getAttribute(coll, UCOL_STRENGTH, &ignoredStatus);
}

/**
 * Retrieve the reorder codes via Collator::getReorderCodes().
 *
 * @return 0 if *status is already a failure; otherwise the value returned
 *         by getReorderCodes().
 */
U_CAPI int32_t U_EXPORT2
ucol_getReorderCodes(const UCollator *coll,
                    int32_t *dest,
                    int32_t destCapacity,
                    UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return 0;
    }

    const Collator *collator = Collator::fromUCollator(coll);
    return collator->getReorderCodes(dest, destCapacity, *status);
}

/**
 * Set the reorder codes via Collator::setReorderCodes().
 * Does nothing if *status is already a failure.
 */
U_CAPI void U_EXPORT2
ucol_setReorderCodes(UCollator* coll,
                    const int32_t* reorderCodes,
                    int32_t reorderCodesLength,
                    UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return;
    }

    Collator *collator = Collator::fromUCollator(coll);
    collator->setReorderCodes(reorderCodes, reorderCodesLength, *status);
}

/**
 * Forward to the static Collator::getEquivalentReorderCodes() and return
 * its result unchanged.
 */
U_CAPI int32_t U_EXPORT2
ucol_getEquivalentReorderCodes(int32_t reorderCode,
                    int32_t* dest,
                    int32_t destCapacity,
                    UErrorCode *pErrorCode) {
    const int32_t codeCount =
        Collator::getEquivalentReorderCodes(reorderCode, dest, destCapacity, *pErrorCode);
    return codeCount;
}

/**
 * Fill `versionInfo` via Collator::getVersion().
 */
U_CAPI void U_EXPORT2
ucol_getVersion(const UCollator* coll,
                UVersionInfo versionInfo)
{
    const Collator *collator = Collator::fromUCollator(coll);
    collator->getVersion(versionInfo);
}

/**
 * Compare the text supplied by two character iterators.
 *
 * @return UCOL_EQUAL if `status` is NULL or already a failure, or if any of
 *         `coll`, `sIter`, `tIter` is NULL (*status is then set to
 *         U_ILLEGAL_ARGUMENT_ERROR); otherwise the result of
 *         Collator::compare() on the two iterators.
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcollIter( const UCollator    *coll,
                 UCharIterator *sIter,
                 UCharIterator *tIter,
                 UErrorCode         *status)
{
    if(status == NULL || U_FAILURE(*status)) {
        return UCOL_EQUAL;
    }

    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
    UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);

    UCollationResult order = UCOL_EQUAL;
    if(sIter != NULL && tIter != NULL && coll != NULL) {
        order = Collator::fromUCollator(coll)->compare(*sIter, *tIter, *status);
    } else {
        // A missing argument is reported as an error; the result stays UCOL_EQUAL.
        *status = U_ILLEGAL_ARGUMENT_ERROR;
    }

    UTRACE_EXIT_VALUE_STATUS(order, *status);
    return order;
}


/*                                                                      */
/* ucol_strcoll     Main public API string comparison function          */
/*                                                                      */
/* Forwards to Collator::compare() with a local error code that is only */
/* passed to the trace exit macro, never to the caller.                 */
/*                                                                      */
U_CAPI UCollationResult U_EXPORT2
ucol_strcoll( const UCollator    *coll,
              const UChar        *source,
              int32_t            sourceLength,
              const UChar        *target,
              int32_t            targetLength)
{
    UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
    }

    UErrorCode errorCode = U_ZERO_ERROR;
    const Collator *collator = Collator::fromUCollator(coll);
    UCollationResult order =
        collator->compare(source, sourceLength, target, targetLength, errorCode);
    UTRACE_EXIT_VALUE_STATUS(order, errorCode);
    return order;
}

/**
 * Compare two UTF-8 strings via Collator::internalCompareUTF8().
 *
 * @return UCOL_EQUAL if *status is already a failure; otherwise the result
 *         of internalCompareUTF8().
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcollUTF8(
        const UCollator *coll,
        const char      *source,
        int32_t         sourceLength,
        const char      *target,
        int32_t         targetLength,
        UErrorCode      *status)
{
    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
    }

    if (U_FAILURE(*status)) {
        /* nothing to compare; only close the trace */
        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
        return UCOL_EQUAL;
    }

    const Collator *collator = Collator::fromUCollator(coll);
    UCollationResult order = collator->internalCompareUTF8(
            source, sourceLength, target, targetLength, *status);
    UTRACE_EXIT_VALUE_STATUS(order, *status);
    return order;
}


/* convenience function: true if ucol_strcoll() reports UCOL_GREATER */
U_CAPI UBool U_EXPORT2
ucol_greater(    const    UCollator        *coll,
        const    UChar            *source,
        int32_t            sourceLength,
        const    UChar            *target,
        int32_t            targetLength)
{
    const UCollationResult order =
        ucol_strcoll(coll, source, sourceLength, target, targetLength);
    return order == UCOL_GREATER;
}

/* convenience function: true unless ucol_strcoll() reports UCOL_LESS */
U_CAPI UBool U_EXPORT2
ucol_greaterOrEqual(    const    UCollator    *coll,
            const    UChar        *source,
            int32_t        sourceLength,
            const    UChar        *target,
            int32_t        targetLength)
{
    const UCollationResult order =
        ucol_strcoll(coll, source, sourceLength, target, targetLength);
    return order != UCOL_LESS;
}

/* convenience function: true if ucol_strcoll() reports UCOL_EQUAL */
U_CAPI UBool U_EXPORT2
ucol_equal(        const    UCollator        *coll,
            const    UChar            *source,
            int32_t            sourceLength,
            const    UChar            *target,
            int32_t            targetLength)
{
    const UCollationResult order =
        ucol_strcoll(coll, source, sourceLength, target, targetLength);
    return order == UCOL_EQUAL;
}

/**
 * Extract the UCA version from the collator's overall version.
 * `info` is left untouched if the collator pointer is NULL.
 */
U_CAPI void U_EXPORT2
ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
    const Collator *c = Collator::fromUCollator(coll);
    if(c == NULL) {
        return;
    }

    UVersionInfo fullVersion;
    c->getVersion(fullVersion);
    // Note: This is tied to how the current implementation encodes the UCA version
    // in the overall getVersion().
    // Alternatively, we could load the root collator and get at lower-level data from there.
    // Either way, it will reflect the input collator's UCA version only
    // if it is a known implementation.
    // It would be cleaner to make this a virtual Collator method.
    info[0] = fullVersion[1] >> 3;
    info[1] = fullVersion[1] & 7;
    info[2] = fullVersion[2] >> 6;
    info[3] = 0;
}

/**
 * Return a pointer to the collator's rule string and store its length in
 * *length. For a non-NULL collator that is not a RuleBasedCollator, a
 * pointer to a static empty string is returned and *length is set to 0.
 */
U_CAPI const UChar * U_EXPORT2
ucol_getRules(const UCollator *coll, int32_t *length) {
    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
    if(rbc == NULL && coll != NULL) {
        // Not a rule-based collator: there are no rules to expose.
        static const UChar _NUL = 0;
        *length = 0;
        return &_NUL;
    }

    // OK to crash if coll==NULL: We do not want to check "this" pointers.
    const UnicodeString &rules = rbc->getRules();
    U_ASSERT(rules.getBuffer()[rules.length()] == 0);
    *length = rules.length();
    return rules.getBuffer();
}

/**
 * Copy the collator's rules, selected by `delta`, into `buffer`.
 *
 * For a non-NULL collator that is not a RuleBasedCollator the rules are
 * empty. If `buffer` is NULL or `bufferLen` is not positive, only the length
 * of the rules is returned; otherwise the result of UnicodeString::extract()
 * is returned and its error code is discarded.
 */
U_CAPI int32_t U_EXPORT2
ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
    UnicodeString rules;
    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
    if(!(rbc == NULL && coll != NULL)) {
        rbc->getRules(delta, rules);
    }

    if(buffer == NULL || bufferLen <= 0) {
        return rules.length();
    }
    UErrorCode errorCode = U_ZERO_ERROR;
    return rules.extract(buffer, bufferLen, errorCode);
}

/**
 * Same as ucol_getLocaleByType(); forwards all arguments unchanged.
 */
U_CAPI const char * U_EXPORT2
ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
    const char *localeID = ucol_getLocaleByType(coll, type, status);
    return localeID;
}

/**
 * Return the locale ID of the requested type via
 * RuleBasedCollator::internalGetLocaleID().
 *
 * @return NULL if *status is already a failure, or if `coll` is non-NULL but
 *         not a RuleBasedCollator (*status is then U_UNSUPPORTED_ERROR);
 *         otherwise the value returned by internalGetLocaleID().
 */
U_CAPI const char * U_EXPORT2
ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return NULL;
    }
    UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE);
    UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll);

    const char *localeID = NULL;
    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
    if(rbc != NULL || coll == NULL) {
        localeID = rbc->internalGetLocaleID(type, *status);
    } else {
        *status = U_UNSUPPORTED_ERROR;
    }

    UTRACE_DATA1(UTRACE_INFO, "result = %s", localeID);
    UTRACE_EXIT_STATUS(*status);
    return localeID;
}

U_CAPI USet * U_EXPORT2 602 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) { 603 if(U_FAILURE(*status)) { 604 return NULL; 605 } 606 UnicodeSet *set = Collator::fromUCollator(coll)->getTailoredSet(*status); 607 if(U_FAILURE(*status)) { 608 delete set; 609 return NULL; 610 } 611 return set->toUSet(); 612 } 613 614 U_CAPI UBool U_EXPORT2 615 ucol_equals(const UCollator *source, const UCollator *target) { 616 return source == target || 617 (*Collator::fromUCollator(source)) == (*Collator::fromUCollator(target)); 618 } 619 620 #endif /* #if !UCONFIG_NO_COLLATION */ 621