1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2011, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uset.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002mar07 16 * created by: Markus W. Scherer 17 * 18 * There are functions to efficiently serialize a USet into an array of uint16_t 19 * and functions to use such a serialized form efficiently without 20 * instantiating a new USet. 21 */ 22 23 #include "unicode/utypes.h" 24 #include "unicode/uobject.h" 25 #include "unicode/uset.h" 26 #include "unicode/uniset.h" 27 #include "cmemory.h" 28 #include "unicode/ustring.h" 29 #include "unicode/parsepos.h" 30 31 U_NAMESPACE_USE 32 33 U_CAPI USet* U_EXPORT2 34 uset_openEmpty() { 35 return (USet*) new UnicodeSet(); 36 } 37 38 U_CAPI USet* U_EXPORT2 39 uset_open(UChar32 start, UChar32 end) { 40 return (USet*) new UnicodeSet(start, end); 41 } 42 43 U_CAPI void U_EXPORT2 44 uset_close(USet* set) { 45 delete (UnicodeSet*) set; 46 } 47 48 U_CAPI USet * U_EXPORT2 49 uset_clone(const USet *set) { 50 return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone()); 51 } 52 53 U_CAPI UBool U_EXPORT2 54 uset_isFrozen(const USet *set) { 55 return ((UnicodeSet*) set)->UnicodeSet::isFrozen(); 56 } 57 58 U_CAPI void U_EXPORT2 59 uset_freeze(USet *set) { 60 ((UnicodeSet*) set)->UnicodeSet::freeze(); 61 } 62 63 U_CAPI USet * U_EXPORT2 64 uset_cloneAsThawed(const USet *set) { 65 return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed()); 66 } 67 68 U_CAPI void U_EXPORT2 69 uset_set(USet* set, 70 UChar32 start, UChar32 end) { 71 ((UnicodeSet*) set)->UnicodeSet::set(start, end); 72 } 73 74 U_CAPI void U_EXPORT2 75 uset_addAll(USet* set, const USet *additionalSet) { 76 ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet)); 77 } 78 79 U_CAPI void U_EXPORT2 80 uset_add(USet* set, UChar32 c) { 81 ((UnicodeSet*) set)->UnicodeSet::add(c); 82 } 83 84 U_CAPI void U_EXPORT2 85 uset_addRange(USet* set, UChar32 start, UChar32 end) { 86 ((UnicodeSet*) set)->UnicodeSet::add(start, end); 87 } 88 89 U_CAPI void U_EXPORT2 90 uset_addString(USet* set, const UChar* str, int32_t strLen) { 91 // UnicodeString handles -1 for strLen 92 UnicodeString s(strLen<0, str, strLen); 93 ((UnicodeSet*) set)->UnicodeSet::add(s); 94 } 95 96 U_CAPI void U_EXPORT2 97 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) { 98 // UnicodeString handles -1 for strLen 99 UnicodeString s(str, strLen); 100 ((UnicodeSet*) set)->UnicodeSet::addAll(s); 101 } 102 103 U_CAPI void U_EXPORT2 104 uset_remove(USet* set, UChar32 c) { 105 ((UnicodeSet*) set)->UnicodeSet::remove(c); 106 } 107 108 U_CAPI void U_EXPORT2 109 uset_removeRange(USet* set, UChar32 start, UChar32 end) { 110 ((UnicodeSet*) set)->UnicodeSet::remove(start, end); 111 } 112 113 U_CAPI void U_EXPORT2 114 uset_removeString(USet* set, const UChar* str, int32_t strLen) { 115 UnicodeString s(strLen==-1, str, strLen); 116 ((UnicodeSet*) set)->UnicodeSet::remove(s); 117 } 118 119 U_CAPI void U_EXPORT2 120 uset_removeAll(USet* set, const USet* remove) { 121 ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove); 122 } 123 124 U_CAPI void U_EXPORT2 125 uset_retain(USet* set, UChar32 start, UChar32 end) { 126 ((UnicodeSet*) set)->UnicodeSet::retain(start, end); 127 } 128 129 U_CAPI void U_EXPORT2 130 uset_retainAll(USet* set, const USet* retain) { 131 ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain); 132 } 133 134 U_CAPI void U_EXPORT2 135 uset_compact(USet* set) { 136 ((UnicodeSet*) set)->UnicodeSet::compact(); 137 } 138 139 U_CAPI void U_EXPORT2 140 uset_complement(USet* set) { 141 ((UnicodeSet*) set)->UnicodeSet::complement(); 142 } 143 144 U_CAPI void U_EXPORT2 145 uset_complementAll(USet* set, const USet* complement) { 146 ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement); 147 } 148 149 U_CAPI void U_EXPORT2 150 uset_clear(USet* set) { 151 ((UnicodeSet*) set)->UnicodeSet::clear(); 152 } 153 154 U_CAPI void U_EXPORT2 155 uset_removeAllStrings(USet* set) { 156 ((UnicodeSet*) set)->UnicodeSet::removeAllStrings(); 157 } 158 159 U_CAPI UBool U_EXPORT2 160 uset_isEmpty(const USet* set) { 161 return ((const UnicodeSet*) set)->UnicodeSet::isEmpty(); 162 } 163 164 U_CAPI UBool U_EXPORT2 165 uset_contains(const USet* set, UChar32 c) { 166 return ((const UnicodeSet*) set)->UnicodeSet::contains(c); 167 } 168 169 U_CAPI UBool U_EXPORT2 170 uset_containsRange(const USet* set, UChar32 start, UChar32 end) { 171 return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end); 172 } 173 174 U_CAPI UBool U_EXPORT2 175 uset_containsString(const USet* set, const UChar* str, int32_t strLen) { 176 UnicodeString s(strLen==-1, str, strLen); 177 return ((const UnicodeSet*) set)->UnicodeSet::contains(s); 178 } 179 180 U_CAPI UBool U_EXPORT2 181 uset_containsAll(const USet* set1, const USet* set2) { 182 return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2); 183 } 184 185 U_CAPI UBool U_EXPORT2 186 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) { 187 // Create a string alias, since nothing is being added to the set. 188 UnicodeString s(strLen==-1, str, strLen); 189 return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s); 190 } 191 192 U_CAPI UBool U_EXPORT2 193 uset_containsNone(const USet* set1, const USet* set2) { 194 return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2); 195 } 196 197 U_CAPI UBool U_EXPORT2 198 uset_containsSome(const USet* set1, const USet* set2) { 199 return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2); 200 } 201 202 U_CAPI int32_t U_EXPORT2 203 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 204 return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition); 205 } 206 207 U_CAPI int32_t U_EXPORT2 208 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 209 return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition); 210 } 211 212 U_CAPI int32_t U_EXPORT2 213 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 214 return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition); 215 } 216 217 U_CAPI int32_t U_EXPORT2 218 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 219 return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition); 220 } 221 222 U_CAPI UBool U_EXPORT2 223 uset_equals(const USet* set1, const USet* set2) { 224 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2; 225 } 226 227 U_CAPI int32_t U_EXPORT2 228 uset_indexOf(const USet* set, UChar32 c) { 229 return ((UnicodeSet*) set)->UnicodeSet::indexOf(c); 230 } 231 232 U_CAPI UChar32 U_EXPORT2 233 uset_charAt(const USet* set, int32_t index) { 234 return ((UnicodeSet*) set)->UnicodeSet::charAt(index); 235 } 236 237 U_CAPI int32_t U_EXPORT2 238 uset_size(const USet* set) { 239 return ((const UnicodeSet*) set)->UnicodeSet::size(); 240 } 241 242 U_NAMESPACE_BEGIN 243 /** 244 * This class only exists to provide access to the UnicodeSet private 245 * USet support API. Declaring a class a friend is more portable than 246 * trying to declare extern "C" functions as friends. 247 */ 248 class USetAccess /* not : public UObject because all methods are static */ { 249 public: 250 /* Try to have the compiler inline these*/ 251 inline static int32_t getStringCount(const UnicodeSet& set) { 252 return set.stringsSize(); 253 } 254 inline static const UnicodeString* getString(const UnicodeSet& set, 255 int32_t i) { 256 return set.getString(i); 257 } 258 private: 259 /* do not instantiate*/ 260 USetAccess(); 261 }; 262 U_NAMESPACE_END 263 264 U_CAPI int32_t U_EXPORT2 265 uset_getItemCount(const USet* uset) { 266 const UnicodeSet& set = *(const UnicodeSet*)uset; 267 return set.getRangeCount() + USetAccess::getStringCount(set); 268 } 269 270 U_CAPI int32_t U_EXPORT2 271 uset_getItem(const USet* uset, int32_t itemIndex, 272 UChar32* start, UChar32* end, 273 UChar* str, int32_t strCapacity, 274 UErrorCode* ec) { 275 if (U_FAILURE(*ec)) return 0; 276 const UnicodeSet& set = *(const UnicodeSet*)uset; 277 int32_t rangeCount; 278 279 if (itemIndex < 0) { 280 *ec = U_ILLEGAL_ARGUMENT_ERROR; 281 return -1; 282 } else if (itemIndex < (rangeCount = set.getRangeCount())) { 283 *start = set.getRangeStart(itemIndex); 284 *end = set.getRangeEnd(itemIndex); 285 return 0; 286 } else { 287 itemIndex -= rangeCount; 288 if (itemIndex < USetAccess::getStringCount(set)) { 289 const UnicodeString* s = USetAccess::getString(set, itemIndex); 290 return s->extract(str, strCapacity, *ec); 291 } else { 292 *ec = U_INDEX_OUTOFBOUNDS_ERROR; 293 return -1; 294 } 295 } 296 } 297 298 //U_CAPI int32_t U_EXPORT2 299 //uset_getRangeCount(const USet* set) { 300 // return ((const UnicodeSet*) set)->getRangeCount(); 301 //} 302 // 303 //U_CAPI UBool U_EXPORT2 304 //uset_getRange(const USet* set, int32_t rangeIndex, 305 // UChar32* pStart, UChar32* pEnd) { 306 // if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) { 307 // return FALSE; 308 // } 309 // const UnicodeSet* us = (const UnicodeSet*) set; 310 // *pStart = us->getRangeStart(rangeIndex); 311 // *pEnd = us->getRangeEnd(rangeIndex); 312 // return TRUE; 313 //} 314 315 /* 316 * Serialize a USet into 16-bit units. 317 * Store BMP code points as themselves with one 16-bit unit each. 318 * 319 * Important: the code points in the array are in ascending order, 320 * therefore all BMP code points precede all supplementary code points. 321 * 322 * Store each supplementary code point in 2 16-bit units, 323 * simply with higher-then-lower 16-bit halfs. 324 * 325 * Precede the entire list with the length. 326 * If there are supplementary code points, then set bit 15 in the length 327 * and add the bmpLength between it and the array. 328 * 329 * In other words: 330 * - all BMP: (length=bmpLength) BMP, .., BMP 331 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, .. 332 */ 333 U_CAPI int32_t U_EXPORT2 334 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) { 335 if (ec==NULL || U_FAILURE(*ec)) { 336 return 0; 337 } 338 339 return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec); 340 } 341 342 U_CAPI UBool U_EXPORT2 343 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) { 344 int32_t length; 345 346 if(fillSet==NULL) { 347 return FALSE; 348 } 349 if(src==NULL || srcLength<=0) { 350 fillSet->length=fillSet->bmpLength=0; 351 return FALSE; 352 } 353 354 length=*src++; 355 if(length&0x8000) { 356 /* there are supplementary values */ 357 length&=0x7fff; 358 if(srcLength<(2+length)) { 359 fillSet->length=fillSet->bmpLength=0; 360 return FALSE; 361 } 362 fillSet->bmpLength=*src++; 363 } else { 364 /* only BMP values */ 365 if(srcLength<(1+length)) { 366 fillSet->length=fillSet->bmpLength=0; 367 return FALSE; 368 } 369 fillSet->bmpLength=length; 370 } 371 fillSet->array=src; 372 fillSet->length=length; 373 return TRUE; 374 } 375 376 U_CAPI void U_EXPORT2 377 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) { 378 if(fillSet==NULL || (uint32_t)c>0x10ffff) { 379 return; 380 } 381 382 fillSet->array=fillSet->staticArray; 383 if(c<0xffff) { 384 fillSet->bmpLength=fillSet->length=2; 385 fillSet->staticArray[0]=(uint16_t)c; 386 fillSet->staticArray[1]=(uint16_t)c+1; 387 } else if(c==0xffff) { 388 fillSet->bmpLength=1; 389 fillSet->length=3; 390 fillSet->staticArray[0]=0xffff; 391 fillSet->staticArray[1]=1; 392 fillSet->staticArray[2]=0; 393 } else if(c<0x10ffff) { 394 fillSet->bmpLength=0; 395 fillSet->length=4; 396 fillSet->staticArray[0]=(uint16_t)(c>>16); 397 fillSet->staticArray[1]=(uint16_t)c; 398 ++c; 399 fillSet->staticArray[2]=(uint16_t)(c>>16); 400 fillSet->staticArray[3]=(uint16_t)c; 401 } else /* c==0x10ffff */ { 402 fillSet->bmpLength=0; 403 fillSet->length=2; 404 fillSet->staticArray[0]=0x10; 405 fillSet->staticArray[1]=0xffff; 406 } 407 } 408 409 U_CAPI UBool U_EXPORT2 410 uset_serializedContains(const USerializedSet* set, UChar32 c) { 411 const uint16_t* array; 412 413 if(set==NULL || (uint32_t)c>0x10ffff) { 414 return FALSE; 415 } 416 417 array=set->array; 418 if(c<=0xffff) { 419 /* find c in the BMP part */ 420 int32_t lo = 0; 421 int32_t hi = set->bmpLength-1; 422 if (c < array[0]) { 423 hi = 0; 424 } else if (c < array[hi]) { 425 for(;;) { 426 int32_t i = (lo + hi) >> 1; 427 if (i == lo) { 428 break; // Done! 429 } else if (c < array[i]) { 430 hi = i; 431 } else { 432 lo = i; 433 } 434 } 435 } else { 436 hi += 1; 437 } 438 return (UBool)(hi&1); 439 } else { 440 /* find c in the supplementary part */ 441 uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c; 442 int32_t base = set->bmpLength; 443 int32_t lo = 0; 444 int32_t hi = set->length - 2 - base; 445 if (high < array[base] || (high==array[base] && low<array[base+1])) { 446 hi = 0; 447 } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) { 448 for (;;) { 449 int32_t i = ((lo + hi) >> 1) & ~1; // Guarantee even result 450 int32_t iabs = i + base; 451 if (i == lo) { 452 break; // Done! 453 } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) { 454 hi = i; 455 } else { 456 lo = i; 457 } 458 } 459 } else { 460 hi += 2; 461 } 462 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */ 463 return (UBool)(((hi+(base<<1))&2)!=0); 464 } 465 } 466 467 U_CAPI int32_t U_EXPORT2 468 uset_getSerializedRangeCount(const USerializedSet* set) { 469 if(set==NULL) { 470 return 0; 471 } 472 473 return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2; 474 } 475 476 U_CAPI UBool U_EXPORT2 477 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 478 UChar32* pStart, UChar32* pEnd) { 479 const uint16_t* array; 480 int32_t bmpLength, length; 481 482 if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) { 483 return FALSE; 484 } 485 486 array=set->array; 487 length=set->length; 488 bmpLength=set->bmpLength; 489 490 rangeIndex*=2; /* address start/limit pairs */ 491 if(rangeIndex<bmpLength) { 492 *pStart=array[rangeIndex++]; 493 if(rangeIndex<bmpLength) { 494 *pEnd=array[rangeIndex]-1; 495 } else if(rangeIndex<length) { 496 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; 497 } else { 498 *pEnd=0x10ffff; 499 } 500 return TRUE; 501 } else { 502 rangeIndex-=bmpLength; 503 rangeIndex*=2; /* address pairs of pairs of units */ 504 length-=bmpLength; 505 if(rangeIndex<length) { 506 array+=bmpLength; 507 *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1]; 508 rangeIndex+=2; 509 if(rangeIndex<length) { 510 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; 511 } else { 512 *pEnd=0x10ffff; 513 } 514 return TRUE; 515 } else { 516 return FALSE; 517 } 518 } 519 } 520 521 // TODO The old, internal uset.c had an efficient uset_containsOne function. 522 // Returned the one and only code point, or else -1 or something. 523 // Consider adding such a function to both C and C++ UnicodeSet/uset. 524 // See tools/gennorm/store.c for usage, now usetContainsOne there. 525 526 // TODO Investigate incorporating this code into UnicodeSet to improve 527 // efficiency. 528 // --- 529 // #define USET_GROW_DELTA 20 530 // 531 // static int32_t 532 // findChar(const UChar32* array, int32_t length, UChar32 c) { 533 // int32_t i; 534 // 535 // /* check the last range limit first for more efficient appending */ 536 // if(length>0) { 537 // if(c>=array[length-1]) { 538 // return length; 539 // } 540 // 541 // /* do not check the last range limit again in the loop below */ 542 // --length; 543 // } 544 // 545 // for(i=0; i<length && c>=array[i]; ++i) {} 546 // return i; 547 // } 548 // 549 // static UBool 550 // addRemove(USet* set, UChar32 c, int32_t doRemove) { 551 // int32_t i, length, more; 552 // 553 // if(set==NULL || (uint32_t)c>0x10ffff) { 554 // return FALSE; 555 // } 556 // 557 // length=set->length; 558 // i=findChar(set->array, length, c); 559 // if((i&1)^doRemove) { 560 // /* c is already in the set */ 561 // return TRUE; 562 // } 563 // 564 // /* how many more array items do we need? */ 565 // if(i<length && (c+1)==set->array[i]) { 566 // /* c is just before the following range, extend that in-place by one */ 567 // set->array[i]=c; 568 // if(i>0) { 569 // --i; 570 // if(c==set->array[i]) { 571 // /* the previous range collapsed, remove it */ 572 // set->length=length-=2; 573 // if(i<length) { 574 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 575 // } 576 // } 577 // } 578 // return TRUE; 579 // } else if(i>0 && c==set->array[i-1]) { 580 // /* c is just after the previous range, extend that in-place by one */ 581 // if(++c<=0x10ffff) { 582 // set->array[i-1]=c; 583 // if(i<length && c==set->array[i]) { 584 // /* the following range collapsed, remove it */ 585 // --i; 586 // set->length=length-=2; 587 // if(i<length) { 588 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 589 // } 590 // } 591 // } else { 592 // /* extend the previous range (had limit 0x10ffff) to the end of Unicode */ 593 // set->length=i-1; 594 // } 595 // return TRUE; 596 // } else if(i==length && c==0x10ffff) { 597 // /* insert one range limit c */ 598 // more=1; 599 // } else { 600 // /* insert two range limits c, c+1 */ 601 // more=2; 602 // } 603 // 604 // /* insert <more> range limits */ 605 // if(length+more>set->capacity) { 606 // /* reallocate */ 607 // int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA; 608 // UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4); 609 // if(newArray==NULL) { 610 // return FALSE; 611 // } 612 // set->capacity=newCapacity; 613 // uprv_memcpy(newArray, set->array, length*4); 614 // 615 // if(set->array!=set->staticBuffer) { 616 // uprv_free(set->array); 617 // } 618 // set->array=newArray; 619 // } 620 // 621 // if(i<length) { 622 // uprv_memmove(set->array+i+more, set->array+i, (length-i)*4); 623 // } 624 // set->array[i]=c; 625 // if(more==2) { 626 // set->array[i+1]=c+1; 627 // } 628 // set->length+=more; 629 // 630 // return TRUE; 631 // } 632 // 633 // U_CAPI UBool U_EXPORT2 634 // uset_add(USet* set, UChar32 c) { 635 // return addRemove(set, c, 0); 636 // } 637 // 638 // U_CAPI void U_EXPORT2 639 // uset_remove(USet* set, UChar32 c) { 640 // addRemove(set, c, 1); 641 // } 642