1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uset.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002mar07 14 * created by: Markus W. Scherer 15 * 16 * There are functions to efficiently serialize a USet into an array of uint16_t 17 * and functions to use such a serialized form efficiently without 18 * instantiating a new USet. 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/uobject.h" 23 #include "unicode/uset.h" 24 #include "unicode/uniset.h" 25 #include "cmemory.h" 26 #include "unicode/ustring.h" 27 #include "unicode/parsepos.h" 28 29 U_NAMESPACE_USE 30 31 U_CAPI USet* U_EXPORT2 32 uset_openEmpty() { 33 return (USet*) new UnicodeSet(); 34 } 35 36 U_CAPI USet* U_EXPORT2 37 uset_open(UChar32 start, UChar32 end) { 38 return (USet*) new UnicodeSet(start, end); 39 } 40 41 U_CAPI void U_EXPORT2 42 uset_close(USet* set) { 43 delete (UnicodeSet*) set; 44 } 45 46 U_CAPI USet * U_EXPORT2 47 uset_clone(const USet *set) { 48 return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone()); 49 } 50 51 U_CAPI UBool U_EXPORT2 52 uset_isFrozen(const USet *set) { 53 return ((UnicodeSet*) set)->UnicodeSet::isFrozen(); 54 } 55 56 U_CAPI void U_EXPORT2 57 uset_freeze(USet *set) { 58 ((UnicodeSet*) set)->UnicodeSet::freeze(); 59 } 60 61 U_CAPI USet * U_EXPORT2 62 uset_cloneAsThawed(const USet *set) { 63 return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed()); 64 } 65 66 U_CAPI void U_EXPORT2 67 uset_set(USet* set, 68 UChar32 start, UChar32 end) { 69 ((UnicodeSet*) set)->UnicodeSet::set(start, end); 70 } 71 72 U_CAPI void U_EXPORT2 73 uset_addAll(USet* set, const USet *additionalSet) { 74 ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet)); 75 } 
76 77 U_CAPI void U_EXPORT2 78 uset_add(USet* set, UChar32 c) { 79 ((UnicodeSet*) set)->UnicodeSet::add(c); 80 } 81 82 U_CAPI void U_EXPORT2 83 uset_addRange(USet* set, UChar32 start, UChar32 end) { 84 ((UnicodeSet*) set)->UnicodeSet::add(start, end); 85 } 86 87 U_CAPI void U_EXPORT2 88 uset_addString(USet* set, const UChar* str, int32_t strLen) { 89 // UnicodeString handles -1 for strLen 90 UnicodeString s(strLen<0, str, strLen); 91 ((UnicodeSet*) set)->UnicodeSet::add(s); 92 } 93 94 U_CAPI void U_EXPORT2 95 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) { 96 // UnicodeString handles -1 for strLen 97 UnicodeString s(str, strLen); 98 ((UnicodeSet*) set)->UnicodeSet::addAll(s); 99 } 100 101 U_CAPI void U_EXPORT2 102 uset_remove(USet* set, UChar32 c) { 103 ((UnicodeSet*) set)->UnicodeSet::remove(c); 104 } 105 106 U_CAPI void U_EXPORT2 107 uset_removeRange(USet* set, UChar32 start, UChar32 end) { 108 ((UnicodeSet*) set)->UnicodeSet::remove(start, end); 109 } 110 111 U_CAPI void U_EXPORT2 112 uset_removeString(USet* set, const UChar* str, int32_t strLen) { 113 UnicodeString s(strLen==-1, str, strLen); 114 ((UnicodeSet*) set)->UnicodeSet::remove(s); 115 } 116 117 U_CAPI void U_EXPORT2 118 uset_removeAll(USet* set, const USet* remove) { 119 ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove); 120 } 121 122 U_CAPI void U_EXPORT2 123 uset_retain(USet* set, UChar32 start, UChar32 end) { 124 ((UnicodeSet*) set)->UnicodeSet::retain(start, end); 125 } 126 127 U_CAPI void U_EXPORT2 128 uset_retainAll(USet* set, const USet* retain) { 129 ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain); 130 } 131 132 U_CAPI void U_EXPORT2 133 uset_compact(USet* set) { 134 ((UnicodeSet*) set)->UnicodeSet::compact(); 135 } 136 137 U_CAPI void U_EXPORT2 138 uset_complement(USet* set) { 139 ((UnicodeSet*) set)->UnicodeSet::complement(); 140 } 141 142 U_CAPI void U_EXPORT2 143 uset_complementAll(USet* set, const USet* complement) { 144 
((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement); 145 } 146 147 U_CAPI void U_EXPORT2 148 uset_clear(USet* set) { 149 ((UnicodeSet*) set)->UnicodeSet::clear(); 150 } 151 152 U_CAPI void U_EXPORT2 153 uset_closeOver(USet* set, int32_t attributes) { 154 ((UnicodeSet*) set)->UnicodeSet::closeOver(attributes); 155 } 156 157 U_CAPI void U_EXPORT2 158 uset_removeAllStrings(USet* set) { 159 ((UnicodeSet*) set)->UnicodeSet::removeAllStrings(); 160 } 161 162 U_CAPI UBool U_EXPORT2 163 uset_isEmpty(const USet* set) { 164 return ((const UnicodeSet*) set)->UnicodeSet::isEmpty(); 165 } 166 167 U_CAPI UBool U_EXPORT2 168 uset_contains(const USet* set, UChar32 c) { 169 return ((const UnicodeSet*) set)->UnicodeSet::contains(c); 170 } 171 172 U_CAPI UBool U_EXPORT2 173 uset_containsRange(const USet* set, UChar32 start, UChar32 end) { 174 return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end); 175 } 176 177 U_CAPI UBool U_EXPORT2 178 uset_containsString(const USet* set, const UChar* str, int32_t strLen) { 179 UnicodeString s(strLen==-1, str, strLen); 180 return ((const UnicodeSet*) set)->UnicodeSet::contains(s); 181 } 182 183 U_CAPI UBool U_EXPORT2 184 uset_containsAll(const USet* set1, const USet* set2) { 185 return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2); 186 } 187 188 U_CAPI UBool U_EXPORT2 189 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) { 190 // Create a string alias, since nothing is being added to the set. 
191 UnicodeString s(strLen==-1, str, strLen); 192 return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s); 193 } 194 195 U_CAPI UBool U_EXPORT2 196 uset_containsNone(const USet* set1, const USet* set2) { 197 return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2); 198 } 199 200 U_CAPI UBool U_EXPORT2 201 uset_containsSome(const USet* set1, const USet* set2) { 202 return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2); 203 } 204 205 U_CAPI int32_t U_EXPORT2 206 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 207 return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition); 208 } 209 210 U_CAPI int32_t U_EXPORT2 211 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { 212 return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition); 213 } 214 215 U_CAPI int32_t U_EXPORT2 216 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 217 return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition); 218 } 219 220 U_CAPI int32_t U_EXPORT2 221 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { 222 return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition); 223 } 224 225 U_CAPI UBool U_EXPORT2 226 uset_equals(const USet* set1, const USet* set2) { 227 return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2; 228 } 229 230 U_CAPI int32_t U_EXPORT2 231 uset_indexOf(const USet* set, UChar32 c) { 232 return ((UnicodeSet*) set)->UnicodeSet::indexOf(c); 233 } 234 235 U_CAPI UChar32 U_EXPORT2 236 uset_charAt(const USet* set, int32_t index) { 237 return ((UnicodeSet*) set)->UnicodeSet::charAt(index); 238 } 239 240 U_CAPI int32_t U_EXPORT2 241 uset_size(const USet* set) { 242 return ((const UnicodeSet*) set)->UnicodeSet::size(); 243 } 244 245 U_NAMESPACE_BEGIN 246 /** 247 * 
This class only exists to provide access to the UnicodeSet private 248 * USet support API. Declaring a class a friend is more portable than 249 * trying to declare extern "C" functions as friends. 250 */ 251 class USetAccess /* not : public UObject because all methods are static */ { 252 public: 253 /* Try to have the compiler inline these*/ 254 inline static int32_t getStringCount(const UnicodeSet& set) { 255 return set.getStringCount(); 256 } 257 inline static const UnicodeString* getString(const UnicodeSet& set, 258 int32_t i) { 259 return set.getString(i); 260 } 261 private: 262 /* do not instantiate*/ 263 USetAccess(); 264 }; 265 U_NAMESPACE_END 266 267 U_CAPI int32_t U_EXPORT2 268 uset_getItemCount(const USet* uset) { 269 const UnicodeSet& set = *(const UnicodeSet*)uset; 270 return set.getRangeCount() + USetAccess::getStringCount(set); 271 } 272 273 U_CAPI int32_t U_EXPORT2 274 uset_getItem(const USet* uset, int32_t itemIndex, 275 UChar32* start, UChar32* end, 276 UChar* str, int32_t strCapacity, 277 UErrorCode* ec) { 278 if (U_FAILURE(*ec)) return 0; 279 const UnicodeSet& set = *(const UnicodeSet*)uset; 280 int32_t rangeCount; 281 282 if (itemIndex < 0) { 283 *ec = U_ILLEGAL_ARGUMENT_ERROR; 284 return -1; 285 } else if (itemIndex < (rangeCount = set.getRangeCount())) { 286 *start = set.getRangeStart(itemIndex); 287 *end = set.getRangeEnd(itemIndex); 288 return 0; 289 } else { 290 itemIndex -= rangeCount; 291 if (itemIndex < USetAccess::getStringCount(set)) { 292 const UnicodeString* s = USetAccess::getString(set, itemIndex); 293 return s->extract(str, strCapacity, *ec); 294 } else { 295 *ec = U_INDEX_OUTOFBOUNDS_ERROR; 296 return -1; 297 } 298 } 299 } 300 301 //U_CAPI int32_t U_EXPORT2 302 //uset_getRangeCount(const USet* set) { 303 // return ((const UnicodeSet*) set)->getRangeCount(); 304 //} 305 // 306 //U_CAPI UBool U_EXPORT2 307 //uset_getRange(const USet* set, int32_t rangeIndex, 308 // UChar32* pStart, UChar32* pEnd) { 309 // if ((uint32_t) rangeIndex 
//            >= (uint32_t) uset_getRangeCount(set)) {
//        return FALSE;
//    }
//    const UnicodeSet* us = (const UnicodeSet*) set;
//    *pStart = us->getRangeStart(rangeIndex);
//    *pEnd = us->getRangeEnd(rangeIndex);
//    return TRUE;
//}

/*
 * Serialize a USet into 16-bit units.
 * Store BMP code points as themselves with one 16-bit unit each.
 *
 * Important: the code points in the array are in ascending order,
 * therefore all BMP code points precede all supplementary code points.
 *
 * Store each supplementary code point in 2 16-bit units,
 * simply with higher-then-lower 16-bit halves.
 *
 * Precede the entire list with the length.
 * If there are supplementary code points, then set bit 15 in the length
 * and add the bmpLength between it and the array.
 *
 * In other words:
 * - all BMP:            (length=bmpLength) BMP, .., BMP
 * - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, ..
 */
U_CAPI int32_t U_EXPORT2
uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) {
    if (ec==NULL || U_FAILURE(*ec)) {
        return 0;
    }

    return ((const UnicodeSet*) set)->UnicodeSet::serialize(dest, destCapacity,* ec);
}

/*
 * Points fillSet at the serialized data in src (srcLength 16-bit units).
 *
 * Returns FALSE if fillSet is NULL. Returns FALSE and zeroes
 * fillSet->length/bmpLength if src is NULL, srcLength<=0, or srcLength is
 * too small for the length announced in the first unit. Otherwise sets
 * fillSet->array to the first unit after the header, fills in
 * length/bmpLength and returns TRUE. No data is copied: fillSet->array
 * points into src.
 */
U_CAPI UBool U_EXPORT2
uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) {
    int32_t length;

    if(fillSet==NULL) {
        return FALSE;
    }
    if(src==NULL || srcLength<=0) {
        fillSet->length=fillSet->bmpLength=0;
        return FALSE;
    }

    length=*src++;
    if(length&0x8000) {
        /* there are supplementary values */
        length&=0x7fff;
        if(srcLength<(2+length)) {
            fillSet->length=fillSet->bmpLength=0;
            return FALSE;
        }
        fillSet->bmpLength=*src++;
    } else {
        /* only BMP values */
        if(srcLength<(1+length)) {
            fillSet->length=fillSet->bmpLength=0;
            return FALSE;
        }
        fillSet->bmpLength=length;
    }
    fillSet->array=src;
    fillSet->length=length;
    return TRUE;
}

/*
 * Fills fillSet, using its own staticArray as storage, with the serialized
 * form of the single code point c, i.e. the boundaries c and c+1.
 * Does nothing if fillSet is NULL or c is outside 0..0x10ffff.
 *
 * The four branches differ in whether each boundary is a BMP value (one
 * unit) or a supplementary value (two units), and in that the limit
 * 0x110000 after c==0x10ffff is not stored.
 */
U_CAPI void U_EXPORT2
uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) {
    if(fillSet==NULL || (uint32_t)c>0x10ffff) {
        return;
    }

    fillSet->array=fillSet->staticArray;
    if(c<0xffff) {
        /* both c and c+1 are BMP values */
        fillSet->bmpLength=fillSet->length=2;
        fillSet->staticArray[0]=(uint16_t)c;
        fillSet->staticArray[1]=(uint16_t)c+1;
    } else if(c==0xffff) {
        /* start is BMP, limit 0x10000 is stored as the pair (1, 0) */
        fillSet->bmpLength=1;
        fillSet->length=3;
        fillSet->staticArray[0]=0xffff;
        fillSet->staticArray[1]=1;
        fillSet->staticArray[2]=0;
    } else if(c<0x10ffff) {
        /* both start and limit are supplementary pairs */
        fillSet->bmpLength=0;
        fillSet->length=4;
        fillSet->staticArray[0]=(uint16_t)(c>>16);
        fillSet->staticArray[1]=(uint16_t)c;
        ++c;
        fillSet->staticArray[2]=(uint16_t)(c>>16);
        fillSet->staticArray[3]=(uint16_t)c;
    } else /* c==0x10ffff */ {
        /* only the start pair is stored */
        fillSet->bmpLength=0;
        fillSet->length=2;
        fillSet->staticArray[0]=0x10;
        fillSet->staticArray[1]=0xffff;
    }
}

/*
 * Tests whether the serialized set contains the code point c.
 *
 * Returns FALSE if set is NULL or c is outside 0..0x10ffff. Otherwise
 * searches the boundary list (BMP part for c<=0xffff, supplementary part
 * otherwise) for the number of boundaries that are <= c; c is contained
 * when that number is odd.
 *
 * An empty part is handled before any array access: the searches below
 * unconditionally read the first and last entry of the part, which would be
 * out of bounds (array[-1], or past the end of the data) when the part has
 * no entries.
 */
U_CAPI UBool U_EXPORT2
uset_serializedContains(const USerializedSet* set, UChar32 c) {
    const uint16_t* array;

    if(set==NULL || (uint32_t)c>0x10ffff) {
        return FALSE;
    }

    array=set->array;
    if(c<=0xffff) {
        /* find c in the BMP part */
        int32_t lo = 0;
        int32_t hi = set->bmpLength-1;
        if (hi < 0) {
            /* no BMP boundaries at all: zero boundaries are <= c */
            return FALSE;
        }
        if (c < array[0]) {
            hi = 0;
        } else if (c < array[hi]) {
            for(;;) {
                int32_t i = (lo + hi) >> 1;
                if (i == lo) {
                    break;  // Done!
                } else if (c < array[i]) {
                    hi = i;
                } else {
                    lo = i;
                }
            }
        } else {
            hi += 1;
        }
        return (UBool)(hi&1);
    } else {
        /* find c in the supplementary part */
        uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c;
        int32_t base = set->bmpLength;
        int32_t lo = 0;
        int32_t hi = set->length - 2 - base;
        if (hi < 0) {
            /*
             * no supplementary boundaries: only the BMP boundaries are <= c,
             * which is what the formula at the end yields for hi==0
             */
            return (UBool)(base&1);
        }
        if (high < array[base] || (high==array[base] && low<array[base+1])) {
            hi = 0;
        } else if (high < array[base+hi] || (high==array[base+hi] && low<array[base+hi+1])) {
            for (;;) {
                int32_t i = ((lo + hi) >> 1) & ~1;  // Guarantee even result
                int32_t iabs = i + base;
                if (i == lo) {
                    break;  // Done!
                } else if (high < array[iabs] || (high==array[iabs] && low<array[iabs+1])) {
                    hi = i;
                } else {
                    lo = i;
                }
            }
        } else {
            hi += 2;
        }
        /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */
        return (UBool)(((hi+(base<<1))&2)!=0);
    }
}

/*
 * Returns the number of ranges in the serialized set, or 0 if set is NULL.
 * The boundary count is bmpLength plus half the number of supplementary
 * units; it is halved rounding up, so a trailing start without a stored
 * limit still counts as a range.
 */
U_CAPI int32_t U_EXPORT2
uset_getSerializedRangeCount(const USerializedSet* set) {
    if(set==NULL) {
        return 0;
    }

    return (set->bmpLength+(set->length-set->bmpLength)/2+1)/2;
}

/*
 * Retrieves range number rangeIndex of the serialized set.
 *
 * Returns FALSE if set, pStart or pEnd is NULL, if rangeIndex is negative,
 * or if rangeIndex is past the last range. Otherwise writes the inclusive
 * range to *pStart and *pEnd and returns TRUE. When a range start has no
 * stored limit, 0x110000 is used as the limit, so *pEnd becomes 0x10ffff.
 */
U_CAPI UBool U_EXPORT2
uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
                        UChar32* pStart, UChar32* pEnd) {
    const uint16_t* array;
    int32_t bmpLength, length;

    if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) {
        return FALSE;
    }

    array=set->array;
    length=set->length;
    bmpLength=set->bmpLength;

    rangeIndex*=2; /* address start/limit pairs */
    if(rangeIndex<bmpLength) {
        *pStart=array[rangeIndex++];
        if(rangeIndex<bmpLength) {
            *pEnd=array[rangeIndex];
        } else if(rangeIndex<length) {
            /* the limit is the first supplementary value */
            *pEnd=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
        } else {
            *pEnd=0x110000;
        }
        --*pEnd; /* stored limit is exclusive, the returned end is inclusive */
        return TRUE;
    } else {
        rangeIndex-=bmpLength;
        rangeIndex*=2; /* address pairs of pairs of units */
        length-=bmpLength;
        if(rangeIndex<length) {
            array+=bmpLength;
            *pStart=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
            rangeIndex+=2;
            if(rangeIndex<length) {
                *pEnd=(((int32_t)array[rangeIndex])<<16)|array[rangeIndex+1];
            } else {
                *pEnd=0x110000;
            }
            --*pEnd; /* stored limit is exclusive, the returned end is inclusive */
            return TRUE;
        } else {
            return FALSE;
        }
    }
}

// TODO The old, internal uset.c had an efficient uset_containsOne function.
// Returned the one and only code point, or else -1 or something.
// Consider adding such a function to both C and C++ UnicodeSet/uset.
// See tools/gennorm/store.c for usage, now usetContainsOne there.

// TODO Investigate incorporating this code into UnicodeSet to improve
// efficiency.
// ---
// #define USET_GROW_DELTA 20
//
// static U_INLINE int32_t
// findChar(const UChar32* array, int32_t length, UChar32 c) {
//     int32_t i;
//
//     /* check the last range limit first for more efficient appending */
//     if(length>0) {
//         if(c>=array[length-1]) {
//             return length;
//         }
//
//         /* do not check the last range limit again in the loop below */
//         --length;
//     }
//
//     for(i=0; i<length && c>=array[i]; ++i) {}
//     return i;
// }
//
// static UBool
// addRemove(USet* set, UChar32 c, int32_t doRemove) {
//     int32_t i, length, more;
//
//     if(set==NULL || (uint32_t)c>0x10ffff) {
//         return FALSE;
//     }
//
//     length=set->length;
//     i=findChar(set->array, length, c);
//     if((i&1)^doRemove) {
//         /* c is already in the set */
//         return TRUE;
//     }
//
//     /* how many more array items do we need?
*/ 570 // if(i<length && (c+1)==set->array[i]) { 571 // /* c is just before the following range, extend that in-place by one */ 572 // set->array[i]=c; 573 // if(i>0) { 574 // --i; 575 // if(c==set->array[i]) { 576 // /* the previous range collapsed, remove it */ 577 // set->length=length-=2; 578 // if(i<length) { 579 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 580 // } 581 // } 582 // } 583 // return TRUE; 584 // } else if(i>0 && c==set->array[i-1]) { 585 // /* c is just after the previous range, extend that in-place by one */ 586 // if(++c<=0x10ffff) { 587 // set->array[i-1]=c; 588 // if(i<length && c==set->array[i]) { 589 // /* the following range collapsed, remove it */ 590 // --i; 591 // set->length=length-=2; 592 // if(i<length) { 593 // uprv_memmove(set->array+i, set->array+i+2, (length-i)*4); 594 // } 595 // } 596 // } else { 597 // /* extend the previous range (had limit 0x10ffff) to the end of Unicode */ 598 // set->length=i-1; 599 // } 600 // return TRUE; 601 // } else if(i==length && c==0x10ffff) { 602 // /* insert one range limit c */ 603 // more=1; 604 // } else { 605 // /* insert two range limits c, c+1 */ 606 // more=2; 607 // } 608 // 609 // /* insert <more> range limits */ 610 // if(length+more>set->capacity) { 611 // /* reallocate */ 612 // int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA; 613 // UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4); 614 // if(newArray==NULL) { 615 // return FALSE; 616 // } 617 // set->capacity=newCapacity; 618 // uprv_memcpy(newArray, set->array, length*4); 619 // 620 // if(set->array!=set->staticBuffer) { 621 // uprv_free(set->array); 622 // } 623 // set->array=newArray; 624 // } 625 // 626 // if(i<length) { 627 // uprv_memmove(set->array+i+more, set->array+i, (length-i)*4); 628 // } 629 // set->array[i]=c; 630 // if(more==2) { 631 // set->array[i+1]=c+1; 632 // } 633 // set->length+=more; 634 // 635 // return TRUE; 636 // } 637 // 638 // U_CAPI UBool U_EXPORT2 639 // 
uset_add(USet* set, UChar32 c) { 640 // return addRemove(set, c, 0); 641 // } 642 // 643 // U_CAPI void U_EXPORT2 644 // uset_remove(USet* set, UChar32 c) { 645 // addRemove(set, c, 1); 646 // } 647