1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uset.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002mar07 14 * created by: Markus W. Scherer 15 * 16 * There are functions to efficiently serialize a USet into an array of uint16_t 17 * and functions to use such a serialized form efficiently without 18 * instantiating a new USet. 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/uobject.h" 23 #include "unicode/uset.h" 24 #include "unicode/uniset.h" 25 #include "cmemory.h" 26 #include "unicode/ustring.h" 27 #include "unicode/parsepos.h" 28 29 U_NAMESPACE_USE 30 31 U_CAPI USet* U_EXPORT2 32 uset_openEmpty() { 33 return (USet*) new UnicodeSet(); 34 } 35 36 U_CAPI USet* U_EXPORT2 37 uset_open(UChar32 start, UChar32 end) { 38 return (USet*) new UnicodeSet(start, end); 39 } 40 41 U_CAPI void U_EXPORT2 42 uset_close(USet* set) { 43 delete (UnicodeSet*) set; 44 } 45 46 U_CAPI USet * U_EXPORT2 47 uset_clone(const USet *set) { 48 return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone()); 49 } 50 51 U_CAPI UBool U_EXPORT2 52 uset_isFrozen(const USet *set) { 53 return ((UnicodeSet*) set)->UnicodeSet::isFrozen(); 54 } 55 56 U_CAPI void U_EXPORT2 57 uset_freeze(USet *set) { 58 ((UnicodeSet*) set)->UnicodeSet::freeze(); 59 } 60 61 U_CAPI USet * U_EXPORT2 62 uset_cloneAsThawed(const USet *set) { 63 return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed()); 64 } 65 66 U_CAPI void U_EXPORT2 67 uset_set(USet* set, 68 UChar32 start, UChar32 end) { 69 ((UnicodeSet*) set)->UnicodeSet::set(start, end); 70 } 71 72 U_CAPI void U_EXPORT2 73 uset_addAll(USet* set, const USet *additionalSet) { 74 ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet)); 75 } 76 77 U_CAPI void U_EXPORT2 78 uset_add(USet* set, UChar32 c) { 79 ((UnicodeSet*) set)->UnicodeSet::add(c); 80 } 81 82 U_CAPI void U_EXPORT2 83 uset_addRange(USet* set, UChar32 start, UChar32 end) { 84 ((UnicodeSet*) set)->UnicodeSet::add(start, end); 85 } 86 87 U_CAPI void U_EXPORT2 88 uset_addString(USet* set, const UChar* str, int32_t strLen) { 89 // UnicodeString handles -1 for strLen 90 UnicodeString s(strLen<0, str, strLen); 91 ((UnicodeSet*) set)->UnicodeSet::add(s); 92 } 93 94 U_CAPI void U_EXPORT2 95 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) { 96 // UnicodeString handles -1 for strLen 97 UnicodeString s(str, strLen); 98 ((UnicodeSet*) set)->UnicodeSet::addAll(s); 99 } 100 101 U_CAPI void U_EXPORT2 102 uset_remove(USet* set, UChar32 c) { 103 ((UnicodeSet*) set)->UnicodeSet::remove(c); 104 } 105 106 U_CAPI void U_EXPORT2 107 uset_removeRange(USet* set, UChar32 start, UChar32 end) { 108 ((UnicodeSet*) set)->UnicodeSet::remove(start, end); 109 } 110 111 U_CAPI void U_EXPORT2 112 uset_removeString(USet* set, const UChar* str, int32_t strLen) { 113 UnicodeString s(strLen==-1, str, strLen); 114 ((UnicodeSet*) set)->UnicodeSet::remove(s); 115 } 116 117 U_CAPI void U_EXPORT2 118 uset_removeAll(USet* set, const USet* remove) { 119 ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove); 120 } 121 122 U_CAPI void U_EXPORT2 123 uset_retain(USet* set, UChar32 start, UChar32 end) { 124 ((UnicodeSet*) set)->UnicodeSet::retain(start, end); 125 } 126 127 U_CAPI void U_EXPORT2 128 uset_retainAll(USet* set, const USet* retain) { 129 ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain); 130 } 131 132 U_CAPI void U_EXPORT2 133 uset_compact(USet* set) { 134 ((UnicodeSet*) set)->UnicodeSet::compact(); 135 } 136 137 U_CAPI void U_EXPORT2 138 uset_complement(USet* set) { 139 ((UnicodeSet*) set)->UnicodeSet::complement(); 140 } 141 142 U_CAPI void U_EXPORT2 143 uset_complementAll(USet* set, const USet* complement) { 144 ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement); 145 } 146 147 U_CAPI void U_EXPORT2 148 uset_clear(USet* set) { 149 ((UnicodeSet*) set)->UnicodeSet::clear(); 150 } 151 152 U_CAPI void U_EXPORT2 153 uset_removeAllStrings(USet* set) { 154 ((UnicodeSet*) set)->UnicodeSet::removeAllStrings(); 155 } 156 157 U_CAPI UBool U_EXPORT2 158 uset_isEmpty(const USet* set) { 159 return ((const UnicodeSet*) set)->UnicodeSet::isEmpty(); 160 } 161 162 U_CAPI UBool U_EXPORT2 163 uset_contains(const USet* set, UChar32 c) { 164 return ((const UnicodeSet*) set)->UnicodeSet::contains(c); 165 } 166 167 U_CAPI UBool U_EXPORT2 168 uset_containsRange(const USet* set, UChar32 start, UChar32 end) { 169 return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end); 170 } 171 172 U_CAPI UBool U_EXPORT2 173 uset_containsString(const USet* set, const UChar* str, int32_t strLen) { 174 UnicodeString s(strLen==-1, str, strLen); 175 return ((const UnicodeSet*) set)->UnicodeSet::contains(s); 176 } 177 178 U_CAPI UBool U_EXPORT2 179 uset_containsAll(const USet* set1, const USet* set2) { 180 return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2); 181 } 182 183 U_CAPI UBool U_EXPORT2 184 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) { 185 // Create a string alias, since nothing is being added to the set. 186 UnicodeString s(strLen==-1, str, strLen); 187 return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s); 188 } 189 190 U_CAPI UBool U_EXPORT2 191 uset_containsNone(const USet* set1, const USet* set2) { 192 return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2); 193 } 194 195 U_CAPI UBool U_EXPORT2 196 uset_containsSome(const USet* set1, const USet* set2) { 197 return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2); 198 } 199 200 U_CAPI int32_t U_EXPORT2 201 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 202 return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition); 203 } 204 205 U_CAPI int32_t U_EXPORT2 206 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 207 return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition); 208 } 209 210 U_CAPI int32_t U_EXPORT2 211 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 212 return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition); 213 } 214 215 U_CAPI int32_t U_EXPORT2 216 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 217 return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition); 218 } 219 220 U_CAPI UBool U_EXPORT2 221 uset_equals(const USet* set1, const USet* set2) { 222 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2; 223 } 224 225 U_CAPI int32_t U_EXPORT2 226 uset_indexOf(const USet* set, UChar32 c) { 227 return ((UnicodeSet*) set)->UnicodeSet::indexOf(c); 228 } 229 230 U_CAPI UChar32 U_EXPORT2 231 uset_charAt(const USet* set, int32_t index) { 232 return ((UnicodeSet*) set)->UnicodeSet::charAt(index); 233 } 234 235 U_CAPI int32_t U_EXPORT2 236 uset_size(const USet* set) { 237 return ((const UnicodeSet*) set)->UnicodeSet::size(); 238 } 239 240 U_NAMESPACE_BEGIN 241 /** 242 * This class only exists to provide access to the UnicodeSet private 243 * USet support API. Declaring a class a friend is more portable than 244 * trying to declare extern "C" functions as friends. 245 */ 246 class USetAccess /* not : public UObject because all methods are static */ { 247 public: 248 /* Try to have the compiler inline these*/ 249 inline static int32_t getStringCount(const UnicodeSet& set) { 250 return set.getStringCount(); 251 } 252 inline static const UnicodeString* getString(const UnicodeSet& set, 253 int32_t i) { 254 return set.getString(i); 255 } 256 private: 257 /* do not instantiate*/ 258 USetAccess(); 259 }; 260 U_NAMESPACE_END 261 262 U_CAPI int32_t U_EXPORT2 263 uset_getItemCount(const USet* uset) { 264 const UnicodeSet& set = *(const UnicodeSet*)uset; 265 return set.getRangeCount() + USetAccess::getStringCount(set); 266 } 267 268 U_CAPI int32_t U_EXPORT2 269 uset_getItem(const USet* uset, int32_t itemIndex, 270 UChar32* start, UChar32* end, 271 UChar* str, int32_t strCapacity, 272 UErrorCode* ec) { 273 if (U_FAILURE(*ec)) return 0; 274 const UnicodeSet& set = *(const UnicodeSet*)uset; 275 int32_t rangeCount; 276 277 if (itemIndex < 0) { 278 *ec = U_ILLEGAL_ARGUMENT_ERROR; 279 return -1; 280 } else if (itemIndex < (rangeCount = set.getRangeCount())) { 281 *start = set.getRangeStart(itemIndex); 282 *end = set.getRangeEnd(itemIndex); 283 return 0; 284 } else { 285 itemIndex -= rangeCount; 286 if (itemIndex < USetAccess::getStringCount(set)) { 287 const UnicodeString* s = USetAccess::getString(set, itemIndex); 288 return s->extract(str, strCapacity, *ec); 289 } else { 290 *ec = U_INDEX_OUTOFBOUNDS_ERROR; 291 return -1; 292 } 293 } 294 } 295 296 //U_CAPI int32_t U_EXPORT2 297 //uset_getRangeCount(const USet* set) { 298 // return ((const UnicodeSet*) set)->getRangeCount(); 299 //} 300 // 301 //U_CAPI UBool U_EXPORT2 302 //uset_getRange(const USet* set, int32_t rangeIndex, 303 // UChar32* pStart, UChar32* pEnd) { 304 // if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) { 305 // return FALSE; 306 // } 307 // const UnicodeSet* us = (const UnicodeSet*) set; 308 // *pStart = us->getRangeStart(rangeIndex); 309 // *pEnd = us->getRangeEnd(rangeIndex); 310 // return TRUE; 311 //} 312 313 /* 314 * Serialize a USet into 16-bit units. 315 * Store BMP code points as themselves with one 16-bit unit each. 316 * 317 * Important: the code points in the array are in ascending order, 318 * therefore all BMP code points precede all supplementary code points. 319 * 320 * Store each supplementary code point in 2 16-bit units, 321 * simply with higher-then-lower 16-bit halfs. 322 * 323 * Precede the entire list with the length. 324 * If there are supplementary code points, then set bit 15 in the length 325 * and add the bmpLength between it and the array. 326 * 327 * In other words: 328 * - all BMP: (length=bmpLength) BMP, .., BMP 329 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, .. 330 */ 331 U_CAPI int32_t U_EXPORT2 332 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) { 333 if (ec==NULL || U_FAILURE(*ec)) { 334 return 0; 335 } 336 337 return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec); 338 } 339 340 U_CAPI UBool U_EXPORT2 341 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) { 342 int32_t length; 343 344 if(fillSet==NULL) { 345 return FALSE; 346 } 347 if(src==NULL || srcLength<=0) { 348 fillSet->length=fillSet->bmpLength=0; 349 return FALSE; 350 } 351 352 length=*src++; 353 if(length&0x8000) { 354 /* there are supplementary values */ 355 length&=0x7fff; 356 if(srcLength<(2+length)) { 357 fillSet->length=fillSet->bmpLength=0; 358 return FALSE; 359 } 360 fillSet->bmpLength=*src++; 361 } else { 362 /* only BMP values */ 363 if(srcLength<(1+length)) { 364 fillSet->length=fillSet->bmpLength=0; 365 return FALSE; 366 } 367 fillSet->bmpLength=length; 368 } 369 fillSet->array=src; 370 fillSet->length=length; 371 return TRUE; 372 } 373 374 U_CAPI void U_EXPORT2 375 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) { 376 if(fillSet==NULL || (uint32_t)c>0x10ffff) { 377 return; 378 } 379 380 fillSet->array=fillSet->staticArray; 381 if(c<0xffff) { 382 fillSet->bmpLength=fillSet->length=2; 383 fillSet->staticArray[0]=(uint16_t)c; 384 fillSet->staticArray[1]=(uint16_t)c+1; 385 } else if(c==0xffff) { 386 fillSet->bmpLength=1; 387 fillSet->length=3; 388 fillSet->staticArray[0]=0xffff; 389 fillSet->staticArray[1]=1; 390 fillSet->staticArray[2]=0; 391 } else if(c<0x10ffff) { 392 fillSet->bmpLength=0; 393 fillSet->length=4; 394 fillSet->staticArray[0]=(uint16_t)(c>>16); 395 fillSet->staticArray[1]=(uint16_t)c; 396 ++c; 397 fillSet->staticArray[2]=(uint16_t)(c>>16); 398 fillSet->staticArray[3]=(uint16_t)c; 399 } else /* c==0x10ffff */ { 400 fillSet->bmpLength=0; 401 fillSet->length=2; 402 fillSet->staticArray[0]=0x10; 403 fillSet->staticArray[1]=0xffff; 404 } 405 } 406 407 U_CAPI UBool U_EXPORT2 408 uset_serializedContains(const USerializedSet* set, UChar32 c) { 409 const uint16_t* array; 410 411 if(set==NULL || (uint32_t)c>0x10ffff) { 412 return FALSE; 413 } 414 415 array=set->array; 416 if(c<=0xffff) { 417 /* find c in the BMP part */ 418 int32_t lo = 0; 419 int32_t hi = set->bmpLength-1; 420 if (c < array[0]) { 421 hi = 0; 422 } else if (c < array[hi]) { 423 for(;;) { 424 int32_t i = (lo + hi) >> 1; 425 if (i == lo) { 426 break; // Done! 427 } else if (c < array[i]) { 428 hi = i; 429 } else { 430 lo = i; 431 } 432 } 433 } else { 434 hi += 1; 435 } 436 return (UBool)(hi&1); 437 } else { 438 /* find c in the supplementary part */ 439 uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c; 440 int32_t base = set->bmpLength; 441 int32_t lo = 0; 442 int32_t hi = set->length - 2 - base; 443 if (high < array[base] || (high==array[base] && low<array[base+1])) { 444 hi = 0; 445 } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) { 446 for (;;) { 447 int32_t i = ((lo + hi) >> 1) & ~1; // Guarantee even result 448 int32_t iabs = i + base; 449 if (i == lo) { 450 break; // Done! 451 } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) { 452 hi = i; 453 } else { 454 lo = i; 455 } 456 } 457 } else { 458 hi += 2; 459 } 460 /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */ 461 return (UBool)(((hi+(base<<1))&2)!=0); 462 } 463 } 464 465 U_CAPI int32_t U_EXPORT2 466 uset_getSerializedRangeCount(const USerializedSet* set) { 467 if(set==NULL) { 468 return 0; 469 } 470 471 return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2; 472 } 473 474 U_CAPI UBool U_EXPORT2 475 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 476 UChar32* pStart, UChar32* pEnd) { 477 const uint16_t* array; 478 int32_t bmpLength, length; 479 480 if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) { 481 return FALSE; 482 } 483 484 array=set->array; 485 length=set->length; 486 bmpLength=set->bmpLength; 487 488 rangeIndex*=2; /* address start/limit pairs */ 489 if(rangeIndex<bmpLength) { 490 *pStart=array[rangeIndex++]; 491 if(rangeIndex<bmpLength) { 492 *pEnd=array[rangeIndex]-1; 493 } else if(rangeIndex<length) { 494 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; 495 } else { 496 *pEnd=0x10ffff; 497 } 498 return TRUE; 499 } else { 500 rangeIndex-=bmpLength; 501 rangeIndex*=2; /* address pairs of pairs of units */ 502 length-=bmpLength; 503 if(rangeIndex<length) { 504 array+=bmpLength; 505 *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1]; 506 rangeIndex+=2; 507 if(rangeIndex<length) { 508 *pEnd=((((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1])-1; 509 } else { 510 *pEnd=0x10ffff; 511 } 512 return TRUE; 513 } else { 514 return FALSE; 515 } 516 } 517 } 518 519 // TODO The old, internal uset.c had an efficient uset_containsOne function. 520 // Returned the one and only code point, or else -1 or something. 521 // Consider adding such a function to both C and C++ UnicodeSet/uset. 522 // See tools/gennorm/store.c for usage, now usetContainsOne there. 523 524 // TODO Investigate incorporating this code into UnicodeSet to improve 525 // efficiency. 526 // --- 527 // #define USET_GROW_DELTA 20 528 // 529 // static int32_t 530 // findChar(const UChar32* array, int32_t length, UChar32 c) { 531 // int32_t i; 532 // 533 // /* check the last range limit first for more efficient appending */ 534 // if(length>0) { 535 // if(c>=array[length-1]) { 536 // return length; 537 // } 538 // 539 // /* do not check the last range limit again in the loop below */ 540 // --length; 541 // } 542 // 543 // for(i=0; i<length && c>=array[i]; ++i) {} 544 // return i; 545 // } 546 // 547 // static UBool 548 // addRemove(USet* set, UChar32 c, int32_t doRemove) { 549 // int32_t i, length, more; 550 // 551 // if(set==NULL || (uint32_t)c>0x10ffff) { 552 // return FALSE; 553 // } 554 // 555 // length=set->length; 556 // i=findChar(set->array, length, c); 557 // if((i&1)^doRemove) { 558 // /* c is already in the set */ 559 // return TRUE; 560 // } 561 // 562 // /* how many more array items do we need? */ 563 // if(i<length && (c+1)==set->array[i]) { 564 // /* c is just before the following range, extend that in-place by one */ 565 // set->array[i]=c; 566 // if(i>0) { 567 // --i; 568 // if(c==set->array[i]) { 569 // /* the previous range collapsed, remove it */ 570 // set->length=length-=2; 571 // if(i<length) { 572 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 573 // } 574 // } 575 // } 576 // return TRUE; 577 // } else if(i>0 && c==set->array[i-1]) { 578 // /* c is just after the previous range, extend that in-place by one */ 579 // if(++c<=0x10ffff) { 580 // set->array[i-1]=c; 581 // if(i<length && c==set->array[i]) { 582 // /* the following range collapsed, remove it */ 583 // --i; 584 // set->length=length-=2; 585 // if(i<length) { 586 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 587 // } 588 // } 589 // } else { 590 // /* extend the previous range (had limit 0x10ffff) to the end of Unicode */ 591 // set->length=i-1; 592 // } 593 // return TRUE; 594 // } else if(i==length && c==0x10ffff) { 595 // /* insert one range limit c */ 596 // more=1; 597 // } else { 598 // /* insert two range limits c, c+1 */ 599 // more=2; 600 // } 601 // 602 // /* insert <more> range limits */ 603 // if(length+more>set->capacity) { 604 // /* reallocate */ 605 // int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA; 606 // UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4); 607 // if(newArray==NULL) { 608 // return FALSE; 609 // } 610 // set->capacity=newCapacity; 611 // uprv_memcpy(newArray, set->array, length*4); 612 // 613 // if(set->array!=set->staticBuffer) { 614 // uprv_free(set->array); 615 // } 616 // set->array=newArray; 617 // } 618 // 619 // if(i<length) { 620 // uprv_memmove(set->array+i+more, set->array+i, (length-i)*4); 621 // } 622 // set->array[i]=c; 623 // if(more==2) { 624 // set->array[i+1]=c+1; 625 // } 626 // set->length+=more; 627 // 628 // return TRUE; 629 // } 630 // 631 // U_CAPI UBool U_EXPORT2 632 // uset_add(USet* set, UChar32 c) { 633 // return addRemove(set, c, 0); 634 // } 635 // 636 // U_CAPI void U_EXPORT2 637 // uset_remove(USet* set, UChar32 c) { 638 // addRemove(set, c, 1); 639 // } 640