1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2014, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uset.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002mar07 14 * created by: Markus W. Scherer 15 * 16 * C version of UnicodeSet. 17 */ 18 19 20 /** 21 * \file 22 * \brief C API: Unicode Set 23 * 24 * <p>This is a C wrapper around the C++ UnicodeSet class.</p> 25 */ 26 27 #ifndef __USET_H__ 28 #define __USET_H__ 29 30 #include "unicode/utypes.h" 31 #include "unicode/uchar.h" 32 #include "unicode/localpointer.h" 33 34 #ifndef UCNV_H 35 struct USet; 36 /** 37 * A UnicodeSet. Use the uset_* API to manipulate. Create with 38 * uset_open*, and destroy with uset_close. 39 * @stable ICU 2.4 40 */ 41 typedef struct USet USet; 42 #endif 43 44 /** 45 * Bitmask values to be passed to uset_openPatternOptions() or 46 * uset_applyPattern() taking an option parameter. 47 * @stable ICU 2.4 48 */ 49 enum { 50 /** 51 * Ignore white space within patterns unless quoted or escaped. 52 * @stable ICU 2.4 53 */ 54 USET_IGNORE_SPACE = 1, 55 56 /** 57 * Enable case insensitive matching. E.g., "[ab]" with this flag 58 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 59 * match all except 'a', 'A', 'b', and 'B'. This performs a full 60 * closure over case mappings, e.g. U+017F for s. 61 * 62 * The resulting set is a superset of the input for the code points but 63 * not for the strings. 64 * It performs a case mapping closure of the code points and adds 65 * full case folding strings for the code points, and reduces strings of 66 * the original set to their full case folding equivalents. 67 * 68 * This is designed for case-insensitive matches, for example 69 * in regular expressions. The full code point case closure allows checking of 70 * an input character directly against the closure set. 71 * Strings are matched by comparing the case-folded form from the closure 72 * set with an incremental case folding of the string in question. 73 * 74 * The closure set will also contain single code points if the original 75 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 76 * This is not necessary (that is, redundant) for the above matching method 77 * but results in the same closure sets regardless of whether the original 78 * set contained the code point or a string. 79 * 80 * @stable ICU 2.4 81 */ 82 USET_CASE_INSENSITIVE = 2, 83 84 /** 85 * Enable case insensitive matching. E.g., "[ab]" with this flag 86 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 87 * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, 88 * title-, and uppercase mappings as well as the case folding 89 * of each existing element in the set. 90 * @stable ICU 3.2 91 */ 92 USET_ADD_CASE_MAPPINGS = 4 93 }; 94 95 /** 96 * Argument values for whether span() and similar functions continue while 97 * the current character is contained vs. not contained in the set. 98 * 99 * The functionality is straightforward for sets with only single code points, 100 * without strings (which is the common case): 101 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. 102 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. 103 * - span() and spanBack() partition any string the same way when 104 * alternating between span(USET_SPAN_NOT_CONTAINED) and 105 * span(either "contained" condition). 106 * - Using a complemented (inverted) set and the opposite span conditions 107 * yields the same results. 108 * 109 * When a set contains multi-code point strings, then these statements may not 110 * be true, depending on the strings in the set (for example, whether they 111 * overlap with each other) and the string that is processed. 112 * For a set with strings: 113 * - The complement of the set contains the opposite set of code points, 114 * but the same set of strings. 115 * Therefore, complementing both the set and the span conditions 116 * may yield different results. 117 * - When starting spans at different positions in a string 118 * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 119 * because a set string may start before the later position. 120 * - span(USET_SPAN_SIMPLE) may be shorter than 121 * span(USET_SPAN_CONTAINED) because it will not recursively try 122 * all possible paths. 123 * For example, with a set which contains the three strings "xy", "xya" and "ax", 124 * span("xyax", USET_SPAN_CONTAINED) will return 4 but 125 * span("xyax", USET_SPAN_SIMPLE) will return 3. 126 * span(USET_SPAN_SIMPLE) will never be longer than 127 * span(USET_SPAN_CONTAINED). 128 * - With either "contained" condition, span() and spanBack() may partition 129 * a string in different ways. 130 * For example, with a set which contains the two strings "ab" and "ba", 131 * and when processing the string "aba", 132 * span() will yield contained/not-contained boundaries of { 0, 2, 3 } 133 * while spanBack() will yield boundaries of { 0, 1, 3 }. 134 * 135 * Note: If it is important to get the same boundaries whether iterating forward 136 * or backward through a string, then either only span() should be used and 137 * the boundaries cached for backward operation, or an ICU BreakIterator 138 * could be used. 139 * 140 * Note: Unpaired surrogates are treated like surrogate code points. 141 * Similarly, set strings match only on code point boundaries, 142 * never in the middle of a surrogate pair. 143 * Illegal UTF-8 sequences are treated like U+FFFD. 144 * When processing UTF-8 strings, malformed set strings 145 * (strings with unpaired surrogates which cannot be converted to UTF-8) 146 * are ignored. 147 * 148 * @stable ICU 3.8 149 */ 150 typedef enum USetSpanCondition { 151 /** 152 * Continues a span() while there is no set element at the current position. 153 * Increments by one code point at a time. 154 * Stops before the first set element (character or string). 155 * (For code points only, this is like while contains(current)==FALSE). 156 * 157 * When span() returns, the substring between where it started and the position 158 * it returned consists only of characters that are not in the set, 159 * and none of its strings overlap with the span. 160 * 161 * @stable ICU 3.8 162 */ 163 USET_SPAN_NOT_CONTAINED = 0, 164 /** 165 * Spans the longest substring that is a concatenation of set elements (characters or strings). 166 * (For characters only, this is like while contains(current)==TRUE). 167 * 168 * When span() returns, the substring between where it started and the position 169 * it returned consists only of set elements (characters or strings) that are in the set. 170 * 171 * If a set contains strings, then the span will be the longest substring for which there 172 * exists at least one non-overlapping concatenation of set elements (characters or strings). 173 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. 174 * (Java/ICU/Perl regex stops at the first match of an OR.) 175 * 176 * @stable ICU 3.8 177 */ 178 USET_SPAN_CONTAINED = 1, 179 /** 180 * Continues a span() while there is a set element at the current position. 181 * Increments by the longest matching element at each position. 182 * (For characters only, this is like while contains(current)==TRUE). 183 * 184 * When span() returns, the substring between where it started and the position 185 * it returned consists only of set elements (characters or strings) that are in the set. 186 * 187 * If a set only contains single characters, then this is the same 188 * as USET_SPAN_CONTAINED. 189 * 190 * If a set contains strings, then the span will be the longest substring 191 * with a match at each position with the longest single set element (character or string). 192 * 193 * Use this span condition together with other longest-match algorithms, 194 * such as ICU converters (ucnv_getUnicodeSet()). 195 * 196 * @stable ICU 3.8 197 */ 198 USET_SPAN_SIMPLE = 2, 199 /** 200 * One more than the last span condition. 201 * @stable ICU 3.8 202 */ 203 USET_SPAN_CONDITION_COUNT 204 } USetSpanCondition; 205 206 enum { 207 /** 208 * Capacity of USerializedSet::staticArray. 209 * Enough for any single-code point set. 210 * Also provides padding for nice sizeof(USerializedSet). 211 * @stable ICU 2.4 212 */ 213 USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 214 }; 215 216 /** 217 * A serialized form of a Unicode set. Limited manipulations are 218 * possible directly on a serialized set. See below. 219 * @stable ICU 2.4 220 */ 221 typedef struct USerializedSet { 222 /** 223 * The serialized Unicode Set. 224 * @stable ICU 2.4 225 */ 226 const uint16_t *array; 227 /** 228 * The length of the array that contains BMP characters. 229 * @stable ICU 2.4 230 */ 231 int32_t bmpLength; 232 /** 233 * The total length of the array. 234 * @stable ICU 2.4 235 */ 236 int32_t length; 237 /** 238 * A small buffer for the array to reduce memory allocations. 239 * @stable ICU 2.4 240 */ 241 uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; 242 } USerializedSet; 243 244 /********************************************************************* 245 * USet API 246 *********************************************************************/ 247 248 /** 249 * Create an empty USet object. 250 * Equivalent to uset_open(1, 0). 251 * @return a newly created USet. The caller must call uset_close() on 252 * it when done. 253 * @stable ICU 4.2 254 */ 255 U_STABLE USet* U_EXPORT2 256 uset_openEmpty(void); 257 258 /** 259 * Creates a USet object that contains the range of characters 260 * start..end, inclusive. If <code>start > end</code> 261 * then an empty set is created (same as using uset_openEmpty()). 262 * @param start first character of the range, inclusive 263 * @param end last character of the range, inclusive 264 * @return a newly created USet. The caller must call uset_close() on 265 * it when done. 266 * @stable ICU 2.4 267 */ 268 U_STABLE USet* U_EXPORT2 269 uset_open(UChar32 start, UChar32 end); 270 271 /** 272 * Creates a set from the given pattern. See the UnicodeSet class 273 * description for the syntax of the pattern language. 274 * @param pattern a string specifying what characters are in the set 275 * @param patternLength the length of the pattern, or -1 if null 276 * terminated 277 * @param ec the error code 278 * @stable ICU 2.4 279 */ 280 U_STABLE USet* U_EXPORT2 281 uset_openPattern(const UChar* pattern, int32_t patternLength, 282 UErrorCode* ec); 283 284 /** 285 * Creates a set from the given pattern. See the UnicodeSet class 286 * description for the syntax of the pattern language. 287 * @param pattern a string specifying what characters are in the set 288 * @param patternLength the length of the pattern, or -1 if null 289 * terminated 290 * @param options bitmask for options to apply to the pattern. 291 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 292 * @param ec the error code 293 * @stable ICU 2.4 294 */ 295 U_STABLE USet* U_EXPORT2 296 uset_openPatternOptions(const UChar* pattern, int32_t patternLength, 297 uint32_t options, 298 UErrorCode* ec); 299 300 /** 301 * Disposes of the storage used by a USet object. This function should 302 * be called exactly once for objects returned by uset_open(). 303 * @param set the object to dispose of 304 * @stable ICU 2.4 305 */ 306 U_STABLE void U_EXPORT2 307 uset_close(USet* set); 308 309 #if U_SHOW_CPLUSPLUS_API 310 311 U_NAMESPACE_BEGIN 312 313 /** 314 * \class LocalUSetPointer 315 * "Smart pointer" class, closes a USet via uset_close(). 316 * For most methods see the LocalPointerBase base class. 317 * 318 * @see LocalPointerBase 319 * @see LocalPointer 320 * @stable ICU 4.4 321 */ 322 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close); 323 324 U_NAMESPACE_END 325 326 #endif 327 328 /** 329 * Returns a copy of this object. 330 * If this set is frozen, then the clone will be frozen as well. 331 * Use uset_cloneAsThawed() for a mutable clone of a frozen set. 332 * @param set the original set 333 * @return the newly allocated copy of the set 334 * @see uset_cloneAsThawed 335 * @stable ICU 3.8 336 */ 337 U_STABLE USet * U_EXPORT2 338 uset_clone(const USet *set); 339 340 /** 341 * Determines whether the set has been frozen (made immutable) or not. 342 * See the ICU4J Freezable interface for details. 343 * @param set the set 344 * @return TRUE/FALSE for whether the set has been frozen 345 * @see uset_freeze 346 * @see uset_cloneAsThawed 347 * @stable ICU 3.8 348 */ 349 U_STABLE UBool U_EXPORT2 350 uset_isFrozen(const USet *set); 351 352 /** 353 * Freeze the set (make it immutable). 354 * Once frozen, it cannot be unfrozen and is therefore thread-safe 355 * until it is deleted. 356 * See the ICU4J Freezable interface for details. 357 * Freezing the set may also make some operations faster, for example 358 * uset_contains() and uset_span(). 359 * A frozen set will not be modified. (It remains frozen.) 360 * @param set the set 361 * @return the same set, now frozen 362 * @see uset_isFrozen 363 * @see uset_cloneAsThawed 364 * @stable ICU 3.8 365 */ 366 U_STABLE void U_EXPORT2 367 uset_freeze(USet *set); 368 369 /** 370 * Clone the set and make the clone mutable. 371 * See the ICU4J Freezable interface for details. 372 * @param set the set 373 * @return the mutable clone 374 * @see uset_freeze 375 * @see uset_isFrozen 376 * @see uset_clone 377 * @stable ICU 3.8 378 */ 379 U_STABLE USet * U_EXPORT2 380 uset_cloneAsThawed(const USet *set); 381 382 /** 383 * Causes the USet object to represent the range <code>start - end</code>. 384 * If <code>start > end</code> then this USet is set to an empty range. 385 * A frozen set will not be modified. 386 * @param set the object to set to the given range 387 * @param start first character in the set, inclusive 388 * @param end last character in the set, inclusive 389 * @stable ICU 3.2 390 */ 391 U_STABLE void U_EXPORT2 392 uset_set(USet* set, 393 UChar32 start, UChar32 end); 394 395 /** 396 * Modifies the set to represent the set specified by the given 397 * pattern. See the UnicodeSet class description for the syntax of 398 * the pattern language. See also the User Guide chapter about UnicodeSet. 399 * <em>Empties the set passed before applying the pattern.</em> 400 * A frozen set will not be modified. 401 * @param set The set to which the pattern is to be applied. 402 * @param pattern A pointer to UChar string specifying what characters are in the set. 403 * The character at pattern[0] must be a '['. 404 * @param patternLength The length of the UChar string. -1 if NUL terminated. 405 * @param options A bitmask for options to apply to the pattern. 406 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 407 * @param status Returns an error if the pattern cannot be parsed. 408 * @return Upon successful parse, the value is either 409 * the index of the character after the closing ']' 410 * of the parsed pattern. 411 * If the status code indicates failure, then the return value 412 * is the index of the error in the source. 413 * 414 * @stable ICU 2.8 415 */ 416 U_STABLE int32_t U_EXPORT2 417 uset_applyPattern(USet *set, 418 const UChar *pattern, int32_t patternLength, 419 uint32_t options, 420 UErrorCode *status); 421 422 /** 423 * Modifies the set to contain those code points which have the given value 424 * for the given binary or enumerated property, as returned by 425 * u_getIntPropertyValue. Prior contents of this set are lost. 426 * A frozen set will not be modified. 427 * 428 * @param set the object to contain the code points defined by the property 429 * 430 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 431 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 432 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 433 * 434 * @param value a value in the range u_getIntPropertyMinValue(prop).. 435 * u_getIntPropertyMaxValue(prop), with one exception. If prop is 436 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 437 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped 438 * categories such as [:L:] to be represented. 439 * 440 * @param ec error code input/output parameter 441 * 442 * @stable ICU 3.2 443 */ 444 U_STABLE void U_EXPORT2 445 uset_applyIntPropertyValue(USet* set, 446 UProperty prop, int32_t value, UErrorCode* ec); 447 448 /** 449 * Modifies the set to contain those code points which have the 450 * given value for the given property. Prior contents of this 451 * set are lost. 452 * A frozen set will not be modified. 453 * 454 * @param set the object to contain the code points defined by the given 455 * property and value alias 456 * 457 * @param prop a string specifying a property alias, either short or long. 458 * The name is matched loosely. See PropertyAliases.txt for names and a 459 * description of loose matching. If the value string is empty, then this 460 * string is interpreted as either a General_Category value alias, a Script 461 * value alias, a binary property alias, or a special ID. Special IDs are 462 * matched loosely and correspond to the following sets: 463 * 464 * "ANY" = [\\u0000-\\U0010FFFF], 465 * "ASCII" = [\\u0000-\\u007F], 466 * "Assigned" = [:^Cn:]. 467 * 468 * @param propLength the length of the prop, or -1 if NULL 469 * 470 * @param value a string specifying a value alias, either short or long. 471 * The name is matched loosely. See PropertyValueAliases.txt for names 472 * and a description of loose matching. In addition to aliases listed, 473 * numeric values and canonical combining classes may be expressed 474 * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string 475 * may also be empty. 476 * 477 * @param valueLength the length of the value, or -1 if NULL 478 * 479 * @param ec error code input/output parameter 480 * 481 * @stable ICU 3.2 482 */ 483 U_STABLE void U_EXPORT2 484 uset_applyPropertyAlias(USet* set, 485 const UChar *prop, int32_t propLength, 486 const UChar *value, int32_t valueLength, 487 UErrorCode* ec); 488 489 /** 490 * Return true if the given position, in the given pattern, appears 491 * to be the start of a UnicodeSet pattern. 492 * 493 * @param pattern a string specifying the pattern 494 * @param patternLength the length of the pattern, or -1 if NULL 495 * @param pos the given position 496 * @stable ICU 3.2 497 */ 498 U_STABLE UBool U_EXPORT2 499 uset_resemblesPattern(const UChar *pattern, int32_t patternLength, 500 int32_t pos); 501 502 /** 503 * Returns a string representation of this set. If the result of 504 * calling this function is passed to a uset_openPattern(), it 505 * will produce another set that is equal to this one. 506 * @param set the set 507 * @param result the string to receive the rules, may be NULL 508 * @param resultCapacity the capacity of result, may be 0 if result is NULL 509 * @param escapeUnprintable if TRUE then convert unprintable 510 * character to their hex escape representations, \\uxxxx or 511 * \\Uxxxxxxxx. Unprintable characters are those other than 512 * U+000A, U+0020..U+007E. 513 * @param ec error code. 514 * @return length of string, possibly larger than resultCapacity 515 * @stable ICU 2.4 516 */ 517 U_STABLE int32_t U_EXPORT2 518 uset_toPattern(const USet* set, 519 UChar* result, int32_t resultCapacity, 520 UBool escapeUnprintable, 521 UErrorCode* ec); 522 523 /** 524 * Adds the given character to the given USet. After this call, 525 * uset_contains(set, c) will return TRUE. 526 * A frozen set will not be modified. 527 * @param set the object to which to add the character 528 * @param c the character to add 529 * @stable ICU 2.4 530 */ 531 U_STABLE void U_EXPORT2 532 uset_add(USet* set, UChar32 c); 533 534 /** 535 * Adds all of the elements in the specified set to this set if 536 * they're not already present. This operation effectively 537 * modifies this set so that its value is the <i>union</i> of the two 538 * sets. The behavior of this operation is unspecified if the specified 539 * collection is modified while the operation is in progress. 540 * A frozen set will not be modified. 541 * 542 * @param set the object to which to add the set 543 * @param additionalSet the source set whose elements are to be added to this set. 544 * @stable ICU 2.6 545 */ 546 U_STABLE void U_EXPORT2 547 uset_addAll(USet* set, const USet *additionalSet); 548 549 /** 550 * Adds the given range of characters to the given USet. After this call, 551 * uset_contains(set, start, end) will return TRUE. 552 * A frozen set will not be modified. 553 * @param set the object to which to add the character 554 * @param start the first character of the range to add, inclusive 555 * @param end the last character of the range to add, inclusive 556 * @stable ICU 2.2 557 */ 558 U_STABLE void U_EXPORT2 559 uset_addRange(USet* set, UChar32 start, UChar32 end); 560 561 /** 562 * Adds the given string to the given USet. After this call, 563 * uset_containsString(set, str, strLen) will return TRUE. 564 * A frozen set will not be modified. 565 * @param set the object to which to add the character 566 * @param str the string to add 567 * @param strLen the length of the string or -1 if null terminated. 568 * @stable ICU 2.4 569 */ 570 U_STABLE void U_EXPORT2 571 uset_addString(USet* set, const UChar* str, int32_t strLen); 572 573 /** 574 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 575 * If this set already any particular character, it has no effect on that character. 576 * A frozen set will not be modified. 577 * @param set the object to which to add the character 578 * @param str the source string 579 * @param strLen the length of the string or -1 if null terminated. 580 * @stable ICU 3.4 581 */ 582 U_STABLE void U_EXPORT2 583 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); 584 585 /** 586 * Removes the given character from the given USet. After this call, 587 * uset_contains(set, c) will return FALSE. 588 * A frozen set will not be modified. 589 * @param set the object from which to remove the character 590 * @param c the character to remove 591 * @stable ICU 2.4 592 */ 593 U_STABLE void U_EXPORT2 594 uset_remove(USet* set, UChar32 c); 595 596 /** 597 * Removes the given range of characters from the given USet. After this call, 598 * uset_contains(set, start, end) will return FALSE. 599 * A frozen set will not be modified. 600 * @param set the object to which to add the character 601 * @param start the first character of the range to remove, inclusive 602 * @param end the last character of the range to remove, inclusive 603 * @stable ICU 2.2 604 */ 605 U_STABLE void U_EXPORT2 606 uset_removeRange(USet* set, UChar32 start, UChar32 end); 607 608 /** 609 * Removes the given string to the given USet. After this call, 610 * uset_containsString(set, str, strLen) will return FALSE. 611 * A frozen set will not be modified. 612 * @param set the object to which to add the character 613 * @param str the string to remove 614 * @param strLen the length of the string or -1 if null terminated. 615 * @stable ICU 2.4 616 */ 617 U_STABLE void U_EXPORT2 618 uset_removeString(USet* set, const UChar* str, int32_t strLen); 619 620 /** 621 * Removes from this set all of its elements that are contained in the 622 * specified set. This operation effectively modifies this 623 * set so that its value is the <i>asymmetric set difference</i> of 624 * the two sets. 625 * A frozen set will not be modified. 626 * @param set the object from which the elements are to be removed 627 * @param removeSet the object that defines which elements will be 628 * removed from this set 629 * @stable ICU 3.2 630 */ 631 U_STABLE void U_EXPORT2 632 uset_removeAll(USet* set, const USet* removeSet); 633 634 /** 635 * Retain only the elements in this set that are contained in the 636 * specified range. If <code>start > end</code> then an empty range is 637 * retained, leaving the set empty. This is equivalent to 638 * a boolean logic AND, or a set INTERSECTION. 639 * A frozen set will not be modified. 640 * 641 * @param set the object for which to retain only the specified range 642 * @param start first character, inclusive, of range to be retained 643 * to this set. 644 * @param end last character, inclusive, of range to be retained 645 * to this set. 646 * @stable ICU 3.2 647 */ 648 U_STABLE void U_EXPORT2 649 uset_retain(USet* set, UChar32 start, UChar32 end); 650 651 /** 652 * Retains only the elements in this set that are contained in the 653 * specified set. In other words, removes from this set all of 654 * its elements that are not contained in the specified set. This 655 * operation effectively modifies this set so that its value is 656 * the <i>intersection</i> of the two sets. 657 * A frozen set will not be modified. 658 * 659 * @param set the object on which to perform the retain 660 * @param retain set that defines which elements this set will retain 661 * @stable ICU 3.2 662 */ 663 U_STABLE void U_EXPORT2 664 uset_retainAll(USet* set, const USet* retain); 665 666 /** 667 * Reallocate this objects internal structures to take up the least 668 * possible space, without changing this object's value. 669 * A frozen set will not be modified. 670 * 671 * @param set the object on which to perfrom the compact 672 * @stable ICU 3.2 673 */ 674 U_STABLE void U_EXPORT2 675 uset_compact(USet* set); 676 677 /** 678 * Inverts this set. This operation modifies this set so that 679 * its value is its complement. This operation does not affect 680 * the multicharacter strings, if any. 681 * A frozen set will not be modified. 682 * @param set the set 683 * @stable ICU 2.4 684 */ 685 U_STABLE void U_EXPORT2 686 uset_complement(USet* set); 687 688 /** 689 * Complements in this set all elements contained in the specified 690 * set. Any character in the other set will be removed if it is 691 * in this set, or will be added if it is not in this set. 692 * A frozen set will not be modified. 693 * 694 * @param set the set with which to complement 695 * @param complement set that defines which elements will be xor'ed 696 * from this set. 697 * @stable ICU 3.2 698 */ 699 U_STABLE void U_EXPORT2 700 uset_complementAll(USet* set, const USet* complement); 701 702 /** 703 * Removes all of the elements from this set. This set will be 704 * empty after this call returns. 705 * A frozen set will not be modified. 706 * @param set the set 707 * @stable ICU 2.4 708 */ 709 U_STABLE void U_EXPORT2 710 uset_clear(USet* set); 711 712 /** 713 * Close this set over the given attribute. For the attribute 714 * USET_CASE, the result is to modify this set so that: 715 * 716 * 1. For each character or string 'a' in this set, all strings or 717 * characters 'b' such that foldCase(a) == foldCase(b) are added 718 * to this set. 719 * 720 * 2. For each string 'e' in the resulting set, if e != 721 * foldCase(e), 'e' will be removed. 722 * 723 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 724 * 725 * (Here foldCase(x) refers to the operation u_strFoldCase, and a 726 * == b denotes that the contents are the same, not pointer 727 * comparison.) 728 * 729 * A frozen set will not be modified. 730 * 731 * @param set the set 732 * 733 * @param attributes bitmask for attributes to close over. 734 * Currently only the USET_CASE bit is supported. Any undefined bits 735 * are ignored. 736 * @stable ICU 4.2 737 */ 738 U_STABLE void U_EXPORT2 739 uset_closeOver(USet* set, int32_t attributes); 740 741 /** 742 * Remove all strings from this set. 743 * 744 * @param set the set 745 * @stable ICU 4.2 746 */ 747 U_STABLE void U_EXPORT2 748 uset_removeAllStrings(USet* set); 749 750 /** 751 * Returns TRUE if the given USet contains no characters and no 752 * strings. 753 * @param set the set 754 * @return true if set is empty 755 * @stable ICU 2.4 756 */ 757 U_STABLE UBool U_EXPORT2 758 uset_isEmpty(const USet* set); 759 760 /** 761 * Returns TRUE if the given USet contains the given character. 762 * This function works faster with a frozen set. 763 * @param set the set 764 * @param c The codepoint to check for within the set 765 * @return true if set contains c 766 * @stable ICU 2.4 767 */ 768 U_STABLE UBool U_EXPORT2 769 uset_contains(const USet* set, UChar32 c); 770 771 /** 772 * Returns TRUE if the given USet contains all characters c 773 * where start <= c && c <= end. 774 * @param set the set 775 * @param start the first character of the range to test, inclusive 776 * @param end the last character of the range to test, inclusive 777 * @return TRUE if set contains the range 778 * @stable ICU 2.2 779 */ 780 U_STABLE UBool U_EXPORT2 781 uset_containsRange(const USet* set, UChar32 start, UChar32 end); 782 783 /** 784 * Returns TRUE if the given USet contains the given string. 785 * @param set the set 786 * @param str the string 787 * @param strLen the length of the string or -1 if null terminated. 788 * @return true if set contains str 789 * @stable ICU 2.4 790 */ 791 U_STABLE UBool U_EXPORT2 792 uset_containsString(const USet* set, const UChar* str, int32_t strLen); 793 794 /** 795 * Returns the index of the given character within this set, where 796 * the set is ordered by ascending code point. If the character 797 * is not in this set, return -1. The inverse of this method is 798 * <code>charAt()</code>. 799 * @param set the set 800 * @param c the character to obtain the index for 801 * @return an index from 0..size()-1, or -1 802 * @stable ICU 3.2 803 */ 804 U_STABLE int32_t U_EXPORT2 805 uset_indexOf(const USet* set, UChar32 c); 806 807 /** 808 * Returns the character at the given index within this set, where 809 * the set is ordered by ascending code point. If the index is 810 * out of range, return (UChar32)-1. The inverse of this method is 811 * <code>indexOf()</code>. 812 * @param set the set 813 * @param charIndex an index from 0..size()-1 to obtain the char for 814 * @return the character at the given index, or (UChar32)-1. 815 * @stable ICU 3.2 816 */ 817 U_STABLE UChar32 U_EXPORT2 818 uset_charAt(const USet* set, int32_t charIndex); 819 820 /** 821 * Returns the number of characters and strings contained in the given 822 * USet. 823 * @param set the set 824 * @return a non-negative integer counting the characters and strings 825 * contained in set 826 * @stable ICU 2.4 827 */ 828 U_STABLE int32_t U_EXPORT2 829 uset_size(const USet* set); 830 831 /** 832 * Returns the number of items in this set. An item is either a range 833 * of characters or a single multicharacter string. 834 * @param set the set 835 * @return a non-negative integer counting the character ranges 836 * and/or strings contained in set 837 * @stable ICU 2.4 838 */ 839 U_STABLE int32_t U_EXPORT2 840 uset_getItemCount(const USet* set); 841 842 /** 843 * Returns an item of this set. An item is either a range of 844 * characters or a single multicharacter string. 845 * @param set the set 846 * @param itemIndex a non-negative integer in the range 0.. 847 * uset_getItemCount(set)-1 848 * @param start pointer to variable to receive first character 849 * in range, inclusive 850 * @param end pointer to variable to receive last character in range, 851 * inclusive 852 * @param str buffer to receive the string, may be NULL 853 * @param strCapacity capacity of str, or 0 if str is NULL 854 * @param ec error code 855 * @return the length of the string (>= 2), or 0 if the item is a 856 * range, in which case it is the range *start..*end, or -1 if 857 * itemIndex is out of range 858 * @stable ICU 2.4 859 */ 860 U_STABLE int32_t U_EXPORT2 861 uset_getItem(const USet* set, int32_t itemIndex, 862 UChar32* start, UChar32* end, 863 UChar* str, int32_t strCapacity, 864 UErrorCode* ec); 865 866 /** 867 * Returns true if set1 contains all the characters and strings 868 * of set2. It answers the question, 'Is set1 a superset of set2?' 869 * @param set1 set to be checked for containment 870 * @param set2 set to be checked for containment 871 * @return true if the test condition is met 872 * @stable ICU 3.2 873 */ 874 U_STABLE UBool U_EXPORT2 875 uset_containsAll(const USet* set1, const USet* set2); 876 877 /** 878 * Returns true if this set contains all the characters 879 * of the given string. This is does not check containment of grapheme 880 * clusters, like uset_containsString. 881 * @param set set of characters to be checked for containment 882 * @param str string containing codepoints to be checked for containment 883 * @param strLen the length of the string or -1 if null terminated. 884 * @return true if the test condition is met 885 * @stable ICU 3.4 886 */ 887 U_STABLE UBool U_EXPORT2 888 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); 889 890 /** 891 * Returns true if set1 contains none of the characters and strings 892 * of set2. It answers the question, 'Is set1 a disjoint set of set2?' 893 * @param set1 set to be checked for containment 894 * @param set2 set to be checked for containment 895 * @return true if the test condition is met 896 * @stable ICU 3.2 897 */ 898 U_STABLE UBool U_EXPORT2 899 uset_containsNone(const USet* set1, const USet* set2); 900 901 /** 902 * Returns true if set1 contains some of the characters and strings 903 * of set2. It answers the question, 'Does set1 and set2 have an intersection?' 904 * @param set1 set to be checked for containment 905 * @param set2 set to be checked for containment 906 * @return true if the test condition is met 907 * @stable ICU 3.2 908 */ 909 U_STABLE UBool U_EXPORT2 910 uset_containsSome(const USet* set1, const USet* set2); 911 912 /** 913 * Returns the length of the initial substring of the input string which 914 * consists only of characters and strings that are contained in this set 915 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 916 * or only of characters and strings that are not contained 917 * in this set (USET_SPAN_NOT_CONTAINED). 918 * See USetSpanCondition for details. 919 * Similar to the strspn() C library function. 920 * Unpaired surrogates are treated according to contains() of their surrogate code points. 921 * This function works faster with a frozen set and with a non-negative string length argument. 922 * @param set the set 923 * @param s start of the string 924 * @param length of the string; can be -1 for NUL-terminated 925 * @param spanCondition specifies the containment condition 926 * @return the length of the initial substring according to the spanCondition; 927 * 0 if the start of the string does not fit the spanCondition 928 * @stable ICU 3.8 929 * @see USetSpanCondition 930 */ 931 U_STABLE int32_t U_EXPORT2 932 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 933 934 /** 935 * Returns the start of the trailing substring of the input string which 936 * consists only of characters and strings that are contained in this set 937 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 938 * or only of characters and strings that are not contained 939 * in this set (USET_SPAN_NOT_CONTAINED). 940 * See USetSpanCondition for details. 941 * Unpaired surrogates are treated according to contains() of their surrogate code points. 942 * This function works faster with a frozen set and with a non-negative string length argument. 943 * @param set the set 944 * @param s start of the string 945 * @param length of the string; can be -1 for NUL-terminated 946 * @param spanCondition specifies the containment condition 947 * @return the start of the trailing substring according to the spanCondition; 948 * the string length if the end of the string does not fit the spanCondition 949 * @stable ICU 3.8 950 * @see USetSpanCondition 951 */ 952 U_STABLE int32_t U_EXPORT2 953 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 954 955 /** 956 * Returns the length of the initial substring of the input string which 957 * consists only of characters and strings that are contained in this set 958 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 959 * or only of characters and strings that are not contained 960 * in this set (USET_SPAN_NOT_CONTAINED). 961 * See USetSpanCondition for details. 962 * Similar to the strspn() C library function. 963 * Malformed byte sequences are treated according to contains(0xfffd). 964 * This function works faster with a frozen set and with a non-negative string length argument. 965 * @param set the set 966 * @param s start of the string (UTF-8) 967 * @param length of the string; can be -1 for NUL-terminated 968 * @param spanCondition specifies the containment condition 969 * @return the length of the initial substring according to the spanCondition; 970 * 0 if the start of the string does not fit the spanCondition 971 * @stable ICU 3.8 972 * @see USetSpanCondition 973 */ 974 U_STABLE int32_t U_EXPORT2 975 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 976 977 /** 978 * Returns the start of the trailing substring of the input string which 979 * consists only of characters and strings that are contained in this set 980 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 981 * or only of characters and strings that are not contained 982 * in this set (USET_SPAN_NOT_CONTAINED). 983 * See USetSpanCondition for details. 984 * Malformed byte sequences are treated according to contains(0xfffd). 985 * This function works faster with a frozen set and with a non-negative string length argument. 986 * @param set the set 987 * @param s start of the string (UTF-8) 988 * @param length of the string; can be -1 for NUL-terminated 989 * @param spanCondition specifies the containment condition 990 * @return the start of the trailing substring according to the spanCondition; 991 * the string length if the end of the string does not fit the spanCondition 992 * @stable ICU 3.8 993 * @see USetSpanCondition 994 */ 995 U_STABLE int32_t U_EXPORT2 996 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 997 998 /** 999 * Returns true if set1 contains all of the characters and strings 1000 * of set2, and vis versa. It answers the question, 'Is set1 equal to set2?' 1001 * @param set1 set to be checked for containment 1002 * @param set2 set to be checked for containment 1003 * @return true if the test condition is met 1004 * @stable ICU 3.2 1005 */ 1006 U_STABLE UBool U_EXPORT2 1007 uset_equals(const USet* set1, const USet* set2); 1008 1009 /********************************************************************* 1010 * Serialized set API 1011 *********************************************************************/ 1012 1013 /** 1014 * Serializes this set into an array of 16-bit integers. Serialization 1015 * (currently) only records the characters in the set; multicharacter 1016 * strings are ignored. 1017 * 1018 * The array 1019 * has following format (each line is one 16-bit integer): 1020 * 1021 * length = (n+2*m) | (m!=0?0x8000:0) 1022 * bmpLength = n; present if m!=0 1023 * bmp[0] 1024 * bmp[1] 1025 * ... 1026 * bmp[n-1] 1027 * supp-high[0] 1028 * supp-low[0] 1029 * supp-high[1] 1030 * supp-low[1] 1031 * ... 1032 * supp-high[m-1] 1033 * supp-low[m-1] 1034 * 1035 * The array starts with a header. After the header are n bmp 1036 * code points, then m supplementary code points. Either n or m 1037 * or both may be zero. n+2*m is always <= 0x7FFF. 1038 * 1039 * If there are no supplementary characters (if m==0) then the 1040 * header is one 16-bit integer, 'length', with value n. 1041 * 1042 * If there are supplementary characters (if m!=0) then the header 1043 * is two 16-bit integers. The first, 'length', has value 1044 * (n+2*m)|0x8000. The second, 'bmpLength', has value n. 1045 * 1046 * After the header the code points are stored in ascending order. 1047 * Supplementary code points are stored as most significant 16 1048 * bits followed by least significant 16 bits. 1049 * 1050 * @param set the set 1051 * @param dest pointer to buffer of destCapacity 16-bit integers. 1052 * May be NULL only if destCapacity is zero. 1053 * @param destCapacity size of dest, or zero. Must not be negative. 1054 * @param pErrorCode pointer to the error code. Will be set to 1055 * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to 1056 * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. 1057 * @return the total length of the serialized format, including 1058 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 1059 * than U_BUFFER_OVERFLOW_ERROR. 1060 * @stable ICU 2.4 1061 */ 1062 U_STABLE int32_t U_EXPORT2 1063 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); 1064 1065 /** 1066 * Given a serialized array, fill in the given serialized set object. 1067 * @param fillSet pointer to result 1068 * @param src pointer to start of array 1069 * @param srcLength length of array 1070 * @return true if the given array is valid, otherwise false 1071 * @stable ICU 2.4 1072 */ 1073 U_STABLE UBool U_EXPORT2 1074 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); 1075 1076 /** 1077 * Set the USerializedSet to contain the given character (and nothing 1078 * else). 1079 * @param fillSet pointer to result 1080 * @param c The codepoint to set 1081 * @stable ICU 2.4 1082 */ 1083 U_STABLE void U_EXPORT2 1084 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); 1085 1086 /** 1087 * Returns TRUE if the given USerializedSet contains the given 1088 * character. 1089 * @param set the serialized set 1090 * @param c The codepoint to check for within the set 1091 * @return true if set contains c 1092 * @stable ICU 2.4 1093 */ 1094 U_STABLE UBool U_EXPORT2 1095 uset_serializedContains(const USerializedSet* set, UChar32 c); 1096 1097 /** 1098 * Returns the number of disjoint ranges of characters contained in 1099 * the given serialized set. Ignores any strings contained in the 1100 * set. 1101 * @param set the serialized set 1102 * @return a non-negative integer counting the character ranges 1103 * contained in set 1104 * @stable ICU 2.4 1105 */ 1106 U_STABLE int32_t U_EXPORT2 1107 uset_getSerializedRangeCount(const USerializedSet* set); 1108 1109 /** 1110 * Returns a range of characters contained in the given serialized 1111 * set. 1112 * @param set the serialized set 1113 * @param rangeIndex a non-negative integer in the range 0.. 1114 * uset_getSerializedRangeCount(set)-1 1115 * @param pStart pointer to variable to receive first character 1116 * in range, inclusive 1117 * @param pEnd pointer to variable to receive last character in range, 1118 * inclusive 1119 * @return true if rangeIndex is valid, otherwise false 1120 * @stable ICU 2.4 1121 */ 1122 U_STABLE UBool U_EXPORT2 1123 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 1124 UChar32* pStart, UChar32* pEnd); 1125 1126 #endif 1127