1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uset.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002mar07 14 * created by: Markus W. Scherer 15 * 16 * C version of UnicodeSet. 17 */ 18 19 20 /** 21 * \file 22 * \brief C API: Unicode Set 23 * 24 * <p>This is a C wrapper around the C++ UnicodeSet class.</p> 25 */ 26 27 #ifndef __USET_H__ 28 #define __USET_H__ 29 30 #include "unicode/utypes.h" 31 #include "unicode/uchar.h" 32 33 #ifndef UCNV_H 34 struct USet; 35 /** 36 * A UnicodeSet. Use the uset_* API to manipulate. Create with 37 * uset_open*, and destroy with uset_close. 38 * @stable ICU 2.4 39 */ 40 typedef struct USet USet; 41 #endif 42 43 /** 44 * Bitmask values to be passed to uset_openPatternOptions() or 45 * uset_applyPattern() taking an option parameter. 46 * @stable ICU 2.4 47 */ 48 enum { 49 /** 50 * Ignore white space within patterns unless quoted or escaped. 51 * @stable ICU 2.4 52 */ 53 USET_IGNORE_SPACE = 1, 54 55 /** 56 * Enable case insensitive matching. E.g., "[ab]" with this flag 57 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 58 * match all except 'a', 'A', 'b', and 'B'. This performs a full 59 * closure over case mappings, e.g. U+017F for s. 60 * 61 * The resulting set is a superset of the input for the code points but 62 * not for the strings. 63 * It performs a case mapping closure of the code points and adds 64 * full case folding strings for the code points, and reduces strings of 65 * the original set to their full case folding equivalents. 66 * 67 * This is designed for case-insensitive matches, for example 68 * in regular expressions. The full code point case closure allows checking of 69 * an input character directly against the closure set. 70 * Strings are matched by comparing the case-folded form from the closure 71 * set with an incremental case folding of the string in question. 72 * 73 * The closure set will also contain single code points if the original 74 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 75 * This is not necessary (that is, redundant) for the above matching method 76 * but results in the same closure sets regardless of whether the original 77 * set contained the code point or a string. 78 * 79 * @stable ICU 2.4 80 */ 81 USET_CASE_INSENSITIVE = 2, 82 83 /** 84 * Enable case insensitive matching. E.g., "[ab]" with this flag 85 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 86 * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, 87 * title-, and uppercase mappings as well as the case folding 88 * of each existing element in the set. 89 * @stable ICU 3.2 90 */ 91 USET_ADD_CASE_MAPPINGS = 4, 92 93 /** 94 * Enough for any single-code point set 95 * @internal 96 */ 97 USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 98 }; 99 100 /** 101 * Argument values for whether span() and similar functions continue while 102 * the current character is contained vs. not contained in the set. 103 * 104 * The functionality is straightforward for sets with only single code points, 105 * without strings (which is the common case): 106 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE 107 * work the same. 108 * - span() and spanBack() partition any string the same way when 109 * alternating between span(USET_SPAN_NOT_CONTAINED) and 110 * span(either "contained" condition). 111 * - Using a complemented (inverted) set and the opposite span conditions 112 * yields the same results. 113 * 114 * When a set contains multi-code point strings, then these statements may not 115 * be true, depending on the strings in the set (for example, whether they 116 * overlap with each other) and the string that is processed. 117 * For a set with strings: 118 * - The complement of the set contains the opposite set of code points, 119 * but the same set of strings. 120 * Therefore, complementing both the set and the span conditions 121 * may yield different results. 122 * - When starting spans at different positions in a string 123 * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 124 * because a set string may start before the later position. 125 * - span(USET_SPAN_SIMPLE) may be shorter than 126 * span(USET_SPAN_CONTAINED) because it will not recursively try 127 * all possible paths. 128 * For example, with a set which contains the three strings "xy", "xya" and "ax", 129 * span("xyax", USET_SPAN_CONTAINED) will return 4 but 130 * span("xyax", USET_SPAN_SIMPLE) will return 3. 131 * span(USET_SPAN_SIMPLE) will never be longer than 132 * span(USET_SPAN_CONTAINED). 133 * - With either "contained" condition, span() and spanBack() may partition 134 * a string in different ways. 135 * For example, with a set which contains the two strings "ab" and "ba", 136 * and when processing the string "aba", 137 * span() will yield contained/not-contained boundaries of { 0, 2, 3 } 138 * while spanBack() will yield boundaries of { 0, 1, 3 }. 139 * 140 * Note: If it is important to get the same boundaries whether iterating forward 141 * or backward through a string, then either only span() should be used and 142 * the boundaries cached for backward operation, or an ICU BreakIterator 143 * could be used. 144 * 145 * Note: Unpaired surrogates are treated like surrogate code points. 146 * Similarly, set strings match only on code point boundaries, 147 * never in the middle of a surrogate pair. 148 * Illegal UTF-8 sequences are treated like U+FFFD. 149 * When processing UTF-8 strings, malformed set strings 150 * (strings with unpaired surrogates which cannot be converted to UTF-8) 151 * are ignored. 152 * 153 * @stable ICU 3.8 154 */ 155 typedef enum USetSpanCondition { 156 /** 157 * Continue a span() while there is no set element at the current position. 158 * Stops before the first set element (character or string). 159 * (For code points only, this is like while contains(current)==FALSE). 160 * 161 * When span() returns, the substring between where it started and the position 162 * it returned consists only of characters that are not in the set, 163 * and none of its strings overlap with the span. 164 * 165 * @stable ICU 3.8 166 */ 167 USET_SPAN_NOT_CONTAINED = 0, 168 /** 169 * Continue a span() while there is a set element at the current position. 170 * (For characters only, this is like while contains(current)==TRUE). 171 * 172 * When span() returns, the substring between where it started and the position 173 * it returned consists only of set elements (characters or strings) that are in the set. 174 * 175 * If a set contains strings, then the span will be the longest substring 176 * matching any of the possible concatenations of set elements (characters or strings). 177 * (There must be a single, non-overlapping concatenation of characters or strings.) 178 * This is equivalent to a POSIX regular expression for (OR of each set element)*. 179 * 180 * @stable ICU 3.8 181 */ 182 USET_SPAN_CONTAINED = 1, 183 /** 184 * Continue a span() while there is a set element at the current position. 185 * (For characters only, this is like while contains(current)==TRUE). 186 * 187 * When span() returns, the substring between where it started and the position 188 * it returned consists only of set elements (characters or strings) that are in the set. 189 * 190 * If a set only contains single characters, then this is the same 191 * as USET_SPAN_CONTAINED. 192 * 193 * If a set contains strings, then the span will be the longest substring 194 * with a match at each position with the longest single set element (character or string). 195 * 196 * Use this span condition together with other longest-match algorithms, 197 * such as ICU converters (ucnv_getUnicodeSet()). 198 * 199 * @stable ICU 3.8 200 */ 201 USET_SPAN_SIMPLE = 2, 202 /** 203 * One more than the last span condition. 204 * @stable ICU 3.8 205 */ 206 USET_SPAN_CONDITION_COUNT 207 } USetSpanCondition; 208 209 /** 210 * A serialized form of a Unicode set. Limited manipulations are 211 * possible directly on a serialized set. See below. 212 * @stable ICU 2.4 213 */ 214 typedef struct USerializedSet { 215 /** 216 * The serialized Unicode Set. 217 * @stable ICU 2.4 218 */ 219 const uint16_t *array; 220 /** 221 * The length of the array that contains BMP characters. 222 * @stable ICU 2.4 223 */ 224 int32_t bmpLength; 225 /** 226 * The total length of the array. 227 * @stable ICU 2.4 228 */ 229 int32_t length; 230 /** 231 * A small buffer for the array to reduce memory allocations. 232 * @stable ICU 2.4 233 */ 234 uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; 235 } USerializedSet; 236 237 /********************************************************************* 238 * USet API 239 *********************************************************************/ 240 241 /** 242 * Create an empty USet object. 243 * Equivalent to uset_open(1, 0). 244 * @return a newly created USet. The caller must call uset_close() on 245 * it when done. 246 * @draft ICU 4.2 247 */ 248 U_DRAFT USet* U_EXPORT2 249 uset_openEmpty(); 250 251 /** 252 * Creates a USet object that contains the range of characters 253 * start..end, inclusive. If <code>start > end</code> 254 * then an empty set is created (same as using uset_openEmpty()). 255 * @param start first character of the range, inclusive 256 * @param end last character of the range, inclusive 257 * @return a newly created USet. The caller must call uset_close() on 258 * it when done. 259 * @stable ICU 2.4 260 */ 261 U_STABLE USet* U_EXPORT2 262 uset_open(UChar32 start, UChar32 end); 263 264 /** 265 * Creates a set from the given pattern. See the UnicodeSet class 266 * description for the syntax of the pattern language. 267 * @param pattern a string specifying what characters are in the set 268 * @param patternLength the length of the pattern, or -1 if null 269 * terminated 270 * @param ec the error code 271 * @stable ICU 2.4 272 */ 273 U_STABLE USet* U_EXPORT2 274 uset_openPattern(const UChar* pattern, int32_t patternLength, 275 UErrorCode* ec); 276 277 /** 278 * Creates a set from the given pattern. See the UnicodeSet class 279 * description for the syntax of the pattern language. 280 * @param pattern a string specifying what characters are in the set 281 * @param patternLength the length of the pattern, or -1 if null 282 * terminated 283 * @param options bitmask for options to apply to the pattern. 284 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 285 * @param ec the error code 286 * @stable ICU 2.4 287 */ 288 U_STABLE USet* U_EXPORT2 289 uset_openPatternOptions(const UChar* pattern, int32_t patternLength, 290 uint32_t options, 291 UErrorCode* ec); 292 293 /** 294 * Disposes of the storage used by a USet object. This function should 295 * be called exactly once for objects returned by uset_open(). 296 * @param set the object to dispose of 297 * @stable ICU 2.4 298 */ 299 U_STABLE void U_EXPORT2 300 uset_close(USet* set); 301 302 /** 303 * Returns a copy of this object. 304 * If this set is frozen, then the clone will be frozen as well. 305 * Use uset_cloneAsThawed() for a mutable clone of a frozen set. 306 * @param set the original set 307 * @return the newly allocated copy of the set 308 * @see uset_cloneAsThawed 309 * @stable ICU 3.8 310 */ 311 U_DRAFT USet * U_EXPORT2 312 uset_clone(const USet *set); 313 314 /** 315 * Determines whether the set has been frozen (made immutable) or not. 316 * See the ICU4J Freezable interface for details. 317 * @param set the set 318 * @return TRUE/FALSE for whether the set has been frozen 319 * @see uset_freeze 320 * @see uset_cloneAsThawed 321 * @stable ICU 3.8 322 */ 323 U_DRAFT UBool U_EXPORT2 324 uset_isFrozen(const USet *set); 325 326 /** 327 * Freeze the set (make it immutable). 328 * Once frozen, it cannot be unfrozen and is therefore thread-safe 329 * until it is deleted. 330 * See the ICU4J Freezable interface for details. 331 * Freezing the set may also make some operations faster, for example 332 * uset_contains() and uset_span(). 333 * A frozen set will not be modified. (It remains frozen.) 334 * @param set the set 335 * @return the same set, now frozen 336 * @see uset_isFrozen 337 * @see uset_cloneAsThawed 338 * @stable ICU 3.8 339 */ 340 U_DRAFT void U_EXPORT2 341 uset_freeze(USet *set); 342 343 /** 344 * Clone the set and make the clone mutable. 345 * See the ICU4J Freezable interface for details. 346 * @param set the set 347 * @return the mutable clone 348 * @see uset_freeze 349 * @see uset_isFrozen 350 * @see uset_clone 351 * @stable ICU 3.8 352 */ 353 U_DRAFT USet * U_EXPORT2 354 uset_cloneAsThawed(const USet *set); 355 356 /** 357 * Causes the USet object to represent the range <code>start - end</code>. 358 * If <code>start > end</code> then this USet is set to an empty range. 359 * A frozen set will not be modified. 360 * @param set the object to set to the given range 361 * @param start first character in the set, inclusive 362 * @param end last character in the set, inclusive 363 * @stable ICU 3.2 364 */ 365 U_STABLE void U_EXPORT2 366 uset_set(USet* set, 367 UChar32 start, UChar32 end); 368 369 /** 370 * Modifies the set to represent the set specified by the given 371 * pattern. See the UnicodeSet class description for the syntax of 372 * the pattern language. See also the User Guide chapter about UnicodeSet. 373 * <em>Empties the set passed before applying the pattern.</em> 374 * A frozen set will not be modified. 375 * @param set The set to which the pattern is to be applied. 376 * @param pattern A pointer to UChar string specifying what characters are in the set. 377 * The character at pattern[0] must be a '['. 378 * @param patternLength The length of the UChar string. -1 if NUL terminated. 379 * @param options A bitmask for options to apply to the pattern. 380 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 381 * @param status Returns an error if the pattern cannot be parsed. 382 * @return Upon successful parse, the value is either 383 * the index of the character after the closing ']' 384 * of the parsed pattern. 385 * If the status code indicates failure, then the return value 386 * is the index of the error in the source. 387 * 388 * @stable ICU 2.8 389 */ 390 U_STABLE int32_t U_EXPORT2 391 uset_applyPattern(USet *set, 392 const UChar *pattern, int32_t patternLength, 393 uint32_t options, 394 UErrorCode *status); 395 396 /** 397 * Modifies the set to contain those code points which have the given value 398 * for the given binary or enumerated property, as returned by 399 * u_getIntPropertyValue. Prior contents of this set are lost. 400 * A frozen set will not be modified. 401 * 402 * @param set the object to contain the code points defined by the property 403 * 404 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 405 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 406 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 407 * 408 * @param value a value in the range u_getIntPropertyMinValue(prop).. 409 * u_getIntPropertyMaxValue(prop), with one exception. If prop is 410 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 411 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped 412 * categories such as [:L:] to be represented. 413 * 414 * @param ec error code input/output parameter 415 * 416 * @stable ICU 3.2 417 */ 418 U_STABLE void U_EXPORT2 419 uset_applyIntPropertyValue(USet* set, 420 UProperty prop, int32_t value, UErrorCode* ec); 421 422 /** 423 * Modifies the set to contain those code points which have the 424 * given value for the given property. Prior contents of this 425 * set are lost. 426 * A frozen set will not be modified. 427 * 428 * @param set the object to contain the code points defined by the given 429 * property and value alias 430 * 431 * @param prop a string specifying a property alias, either short or long. 432 * The name is matched loosely. See PropertyAliases.txt for names and a 433 * description of loose matching. If the value string is empty, then this 434 * string is interpreted as either a General_Category value alias, a Script 435 * value alias, a binary property alias, or a special ID. Special IDs are 436 * matched loosely and correspond to the following sets: 437 * 438 * "ANY" = [\\u0000-\\U0010FFFF], 439 * "ASCII" = [\\u0000-\\u007F], 440 * "Assigned" = [:^Cn:]. 441 * 442 * @param propLength the length of the prop, or -1 if NULL 443 * 444 * @param value a string specifying a value alias, either short or long. 445 * The name is matched loosely. See PropertyValueAliases.txt for names 446 * and a description of loose matching. In addition to aliases listed, 447 * numeric values and canonical combining classes may be expressed 448 * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string 449 * may also be empty. 450 * 451 * @param valueLength the length of the value, or -1 if NULL 452 * 453 * @param ec error code input/output parameter 454 * 455 * @stable ICU 3.2 456 */ 457 U_STABLE void U_EXPORT2 458 uset_applyPropertyAlias(USet* set, 459 const UChar *prop, int32_t propLength, 460 const UChar *value, int32_t valueLength, 461 UErrorCode* ec); 462 463 /** 464 * Return true if the given position, in the given pattern, appears 465 * to be the start of a UnicodeSet pattern. 466 * 467 * @param pattern a string specifying the pattern 468 * @param patternLength the length of the pattern, or -1 if NULL 469 * @param pos the given position 470 * @stable ICU 3.2 471 */ 472 U_STABLE UBool U_EXPORT2 473 uset_resemblesPattern(const UChar *pattern, int32_t patternLength, 474 int32_t pos); 475 476 /** 477 * Returns a string representation of this set. If the result of 478 * calling this function is passed to a uset_openPattern(), it 479 * will produce another set that is equal to this one. 480 * @param set the set 481 * @param result the string to receive the rules, may be NULL 482 * @param resultCapacity the capacity of result, may be 0 if result is NULL 483 * @param escapeUnprintable if TRUE then convert unprintable 484 * character to their hex escape representations, \\uxxxx or 485 * \\Uxxxxxxxx. Unprintable characters are those other than 486 * U+000A, U+0020..U+007E. 487 * @param ec error code. 488 * @return length of string, possibly larger than resultCapacity 489 * @stable ICU 2.4 490 */ 491 U_STABLE int32_t U_EXPORT2 492 uset_toPattern(const USet* set, 493 UChar* result, int32_t resultCapacity, 494 UBool escapeUnprintable, 495 UErrorCode* ec); 496 497 /** 498 * Adds the given character to the given USet. After this call, 499 * uset_contains(set, c) will return TRUE. 500 * A frozen set will not be modified. 501 * @param set the object to which to add the character 502 * @param c the character to add 503 * @stable ICU 2.4 504 */ 505 U_STABLE void U_EXPORT2 506 uset_add(USet* set, UChar32 c); 507 508 /** 509 * Adds all of the elements in the specified set to this set if 510 * they're not already present. This operation effectively 511 * modifies this set so that its value is the <i>union</i> of the two 512 * sets. The behavior of this operation is unspecified if the specified 513 * collection is modified while the operation is in progress. 514 * A frozen set will not be modified. 515 * 516 * @param set the object to which to add the set 517 * @param additionalSet the source set whose elements are to be added to this set. 518 * @stable ICU 2.6 519 */ 520 U_STABLE void U_EXPORT2 521 uset_addAll(USet* set, const USet *additionalSet); 522 523 /** 524 * Adds the given range of characters to the given USet. After this call, 525 * uset_contains(set, start, end) will return TRUE. 526 * A frozen set will not be modified. 527 * @param set the object to which to add the character 528 * @param start the first character of the range to add, inclusive 529 * @param end the last character of the range to add, inclusive 530 * @stable ICU 2.2 531 */ 532 U_STABLE void U_EXPORT2 533 uset_addRange(USet* set, UChar32 start, UChar32 end); 534 535 /** 536 * Adds the given string to the given USet. After this call, 537 * uset_containsString(set, str, strLen) will return TRUE. 538 * A frozen set will not be modified. 539 * @param set the object to which to add the character 540 * @param str the string to add 541 * @param strLen the length of the string or -1 if null terminated. 542 * @stable ICU 2.4 543 */ 544 U_STABLE void U_EXPORT2 545 uset_addString(USet* set, const UChar* str, int32_t strLen); 546 547 /** 548 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 549 * If this set already any particular character, it has no effect on that character. 550 * A frozen set will not be modified. 551 * @param set the object to which to add the character 552 * @param str the source string 553 * @param strLen the length of the string or -1 if null terminated. 554 * @stable ICU 3.4 555 */ 556 U_STABLE void U_EXPORT2 557 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); 558 559 /** 560 * Removes the given character from the given USet. After this call, 561 * uset_contains(set, c) will return FALSE. 562 * A frozen set will not be modified. 563 * @param set the object from which to remove the character 564 * @param c the character to remove 565 * @stable ICU 2.4 566 */ 567 U_STABLE void U_EXPORT2 568 uset_remove(USet* set, UChar32 c); 569 570 /** 571 * Removes the given range of characters from the given USet. After this call, 572 * uset_contains(set, start, end) will return FALSE. 573 * A frozen set will not be modified. 574 * @param set the object to which to add the character 575 * @param start the first character of the range to remove, inclusive 576 * @param end the last character of the range to remove, inclusive 577 * @stable ICU 2.2 578 */ 579 U_STABLE void U_EXPORT2 580 uset_removeRange(USet* set, UChar32 start, UChar32 end); 581 582 /** 583 * Removes the given string to the given USet. After this call, 584 * uset_containsString(set, str, strLen) will return FALSE. 585 * A frozen set will not be modified. 586 * @param set the object to which to add the character 587 * @param str the string to remove 588 * @param strLen the length of the string or -1 if null terminated. 589 * @stable ICU 2.4 590 */ 591 U_STABLE void U_EXPORT2 592 uset_removeString(USet* set, const UChar* str, int32_t strLen); 593 594 /** 595 * Removes from this set all of its elements that are contained in the 596 * specified set. This operation effectively modifies this 597 * set so that its value is the <i>asymmetric set difference</i> of 598 * the two sets. 599 * A frozen set will not be modified. 600 * @param set the object from which the elements are to be removed 601 * @param removeSet the object that defines which elements will be 602 * removed from this set 603 * @stable ICU 3.2 604 */ 605 U_STABLE void U_EXPORT2 606 uset_removeAll(USet* set, const USet* removeSet); 607 608 /** 609 * Retain only the elements in this set that are contained in the 610 * specified range. If <code>start > end</code> then an empty range is 611 * retained, leaving the set empty. This is equivalent to 612 * a boolean logic AND, or a set INTERSECTION. 613 * A frozen set will not be modified. 614 * 615 * @param set the object for which to retain only the specified range 616 * @param start first character, inclusive, of range to be retained 617 * to this set. 618 * @param end last character, inclusive, of range to be retained 619 * to this set. 620 * @stable ICU 3.2 621 */ 622 U_STABLE void U_EXPORT2 623 uset_retain(USet* set, UChar32 start, UChar32 end); 624 625 /** 626 * Retains only the elements in this set that are contained in the 627 * specified set. In other words, removes from this set all of 628 * its elements that are not contained in the specified set. This 629 * operation effectively modifies this set so that its value is 630 * the <i>intersection</i> of the two sets. 631 * A frozen set will not be modified. 632 * 633 * @param set the object on which to perform the retain 634 * @param retain set that defines which elements this set will retain 635 * @stable ICU 3.2 636 */ 637 U_STABLE void U_EXPORT2 638 uset_retainAll(USet* set, const USet* retain); 639 640 /** 641 * Reallocate this objects internal structures to take up the least 642 * possible space, without changing this object's value. 643 * A frozen set will not be modified. 644 * 645 * @param set the object on which to perfrom the compact 646 * @stable ICU 3.2 647 */ 648 U_STABLE void U_EXPORT2 649 uset_compact(USet* set); 650 651 /** 652 * Inverts this set. This operation modifies this set so that 653 * its value is its complement. This operation does not affect 654 * the multicharacter strings, if any. 655 * A frozen set will not be modified. 656 * @param set the set 657 * @stable ICU 2.4 658 */ 659 U_STABLE void U_EXPORT2 660 uset_complement(USet* set); 661 662 /** 663 * Complements in this set all elements contained in the specified 664 * set. Any character in the other set will be removed if it is 665 * in this set, or will be added if it is not in this set. 666 * A frozen set will not be modified. 667 * 668 * @param set the set with which to complement 669 * @param complement set that defines which elements will be xor'ed 670 * from this set. 671 * @stable ICU 3.2 672 */ 673 U_STABLE void U_EXPORT2 674 uset_complementAll(USet* set, const USet* complement); 675 676 /** 677 * Removes all of the elements from this set. This set will be 678 * empty after this call returns. 679 * A frozen set will not be modified. 680 * @param set the set 681 * @stable ICU 2.4 682 */ 683 U_STABLE void U_EXPORT2 684 uset_clear(USet* set); 685 686 /** 687 * Close this set over the given attribute. For the attribute 688 * USET_CASE, the result is to modify this set so that: 689 * 690 * 1. For each character or string 'a' in this set, all strings or 691 * characters 'b' such that foldCase(a) == foldCase(b) are added 692 * to this set. 693 * 694 * 2. For each string 'e' in the resulting set, if e != 695 * foldCase(e), 'e' will be removed. 696 * 697 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 698 * 699 * (Here foldCase(x) refers to the operation u_strFoldCase, and a 700 * == b denotes that the contents are the same, not pointer 701 * comparison.) 702 * 703 * A frozen set will not be modified. 704 * 705 * @param set the set 706 * 707 * @param attributes bitmask for attributes to close over. 708 * Currently only the USET_CASE bit is supported. Any undefined bits 709 * are ignored. 710 * @draft ICU 4.2 711 */ 712 U_DRAFT void U_EXPORT2 713 uset_closeOver(USet* set, int32_t attributes); 714 715 /** 716 * Remove all strings from this set. 717 * 718 * @param set the set 719 * @draft ICU 4.2 720 */ 721 U_DRAFT void U_EXPORT2 722 uset_removeAllStrings(USet* set); 723 724 /** 725 * Returns TRUE if the given USet contains no characters and no 726 * strings. 727 * @param set the set 728 * @return true if set is empty 729 * @stable ICU 2.4 730 */ 731 U_STABLE UBool U_EXPORT2 732 uset_isEmpty(const USet* set); 733 734 /** 735 * Returns TRUE if the given USet contains the given character. 736 * This function works faster with a frozen set. 737 * @param set the set 738 * @param c The codepoint to check for within the set 739 * @return true if set contains c 740 * @stable ICU 2.4 741 */ 742 U_STABLE UBool U_EXPORT2 743 uset_contains(const USet* set, UChar32 c); 744 745 /** 746 * Returns TRUE if the given USet contains all characters c 747 * where start <= c && c <= end. 748 * @param set the set 749 * @param start the first character of the range to test, inclusive 750 * @param end the last character of the range to test, inclusive 751 * @return TRUE if set contains the range 752 * @stable ICU 2.2 753 */ 754 U_STABLE UBool U_EXPORT2 755 uset_containsRange(const USet* set, UChar32 start, UChar32 end); 756 757 /** 758 * Returns TRUE if the given USet contains the given string. 759 * @param set the set 760 * @param str the string 761 * @param strLen the length of the string or -1 if null terminated. 762 * @return true if set contains str 763 * @stable ICU 2.4 764 */ 765 U_STABLE UBool U_EXPORT2 766 uset_containsString(const USet* set, const UChar* str, int32_t strLen); 767 768 /** 769 * Returns the index of the given character within this set, where 770 * the set is ordered by ascending code point. If the character 771 * is not in this set, return -1. The inverse of this method is 772 * <code>charAt()</code>. 773 * @param set the set 774 * @param c the character to obtain the index for 775 * @return an index from 0..size()-1, or -1 776 * @stable ICU 3.2 777 */ 778 U_STABLE int32_t U_EXPORT2 779 uset_indexOf(const USet* set, UChar32 c); 780 781 /** 782 * Returns the character at the given index within this set, where 783 * the set is ordered by ascending code point. If the index is 784 * out of range, return (UChar32)-1. The inverse of this method is 785 * <code>indexOf()</code>. 786 * @param set the set 787 * @param charIndex an index from 0..size()-1 to obtain the char for 788 * @return the character at the given index, or (UChar32)-1. 789 * @stable ICU 3.2 790 */ 791 U_STABLE UChar32 U_EXPORT2 792 uset_charAt(const USet* set, int32_t charIndex); 793 794 /** 795 * Returns the number of characters and strings contained in the given 796 * USet. 797 * @param set the set 798 * @return a non-negative integer counting the characters and strings 799 * contained in set 800 * @stable ICU 2.4 801 */ 802 U_STABLE int32_t U_EXPORT2 803 uset_size(const USet* set); 804 805 /** 806 * Returns the number of items in this set. An item is either a range 807 * of characters or a single multicharacter string. 808 * @param set the set 809 * @return a non-negative integer counting the character ranges 810 * and/or strings contained in set 811 * @stable ICU 2.4 812 */ 813 U_STABLE int32_t U_EXPORT2 814 uset_getItemCount(const USet* set); 815 816 /** 817 * Returns an item of this set. An item is either a range of 818 * characters or a single multicharacter string. 819 * @param set the set 820 * @param itemIndex a non-negative integer in the range 0.. 821 * uset_getItemCount(set)-1 822 * @param start pointer to variable to receive first character 823 * in range, inclusive 824 * @param end pointer to variable to receive last character in range, 825 * inclusive 826 * @param str buffer to receive the string, may be NULL 827 * @param strCapacity capacity of str, or 0 if str is NULL 828 * @param ec error code 829 * @return the length of the string (>= 2), or 0 if the item is a 830 * range, in which case it is the range *start..*end, or -1 if 831 * itemIndex is out of range 832 * @stable ICU 2.4 833 */ 834 U_STABLE int32_t U_EXPORT2 835 uset_getItem(const USet* set, int32_t itemIndex, 836 UChar32* start, UChar32* end, 837 UChar* str, int32_t strCapacity, 838 UErrorCode* ec); 839 840 /** 841 * Returns true if set1 contains all the characters and strings 842 * of set2. It answers the question, 'Is set1 a superset of set2?' 843 * @param set1 set to be checked for containment 844 * @param set2 set to be checked for containment 845 * @return true if the test condition is met 846 * @stable ICU 3.2 847 */ 848 U_STABLE UBool U_EXPORT2 849 uset_containsAll(const USet* set1, const USet* set2); 850 851 /** 852 * Returns true if this set contains all the characters 853 * of the given string. This is does not check containment of grapheme 854 * clusters, like uset_containsString. 855 * @param set set of characters to be checked for containment 856 * @param str string containing codepoints to be checked for containment 857 * @param strLen the length of the string or -1 if null terminated. 858 * @return true if the test condition is met 859 * @stable ICU 3.4 860 */ 861 U_STABLE UBool U_EXPORT2 862 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); 863 864 /** 865 * Returns true if set1 contains none of the characters and strings 866 * of set2. It answers the question, 'Is set1 a disjoint set of set2?' 867 * @param set1 set to be checked for containment 868 * @param set2 set to be checked for containment 869 * @return true if the test condition is met 870 * @stable ICU 3.2 871 */ 872 U_STABLE UBool U_EXPORT2 873 uset_containsNone(const USet* set1, const USet* set2); 874 875 /** 876 * Returns true if set1 contains some of the characters and strings 877 * of set2. It answers the question, 'Does set1 and set2 have an intersection?' 878 * @param set1 set to be checked for containment 879 * @param set2 set to be checked for containment 880 * @return true if the test condition is met 881 * @stable ICU 3.2 882 */ 883 U_STABLE UBool U_EXPORT2 884 uset_containsSome(const USet* set1, const USet* set2); 885 886 /** 887 * Returns the length of the initial substring of the input string which 888 * consists only of characters and strings that are contained in this set 889 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 890 * or only of characters and strings that are not contained 891 * in this set (USET_SPAN_NOT_CONTAINED). 892 * See USetSpanCondition for details. 893 * Similar to the strspn() C library function. 894 * Unpaired surrogates are treated according to contains() of their surrogate code points. 895 * This function works faster with a frozen set and with a non-negative string length argument. 896 * @param set the set 897 * @param s start of the string 898 * @param length of the string; can be -1 for NUL-terminated 899 * @param spanCondition specifies the containment condition 900 * @return the length of the initial substring according to the spanCondition; 901 * 0 if the start of the string does not fit the spanCondition 902 * @stable ICU 3.8 903 * @see USetSpanCondition 904 */ 905 U_DRAFT int32_t U_EXPORT2 906 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 907 908 /** 909 * Returns the start of the trailing substring of the input string which 910 * consists only of characters and strings that are contained in this set 911 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 912 * or only of characters and strings that are not contained 913 * in this set (USET_SPAN_NOT_CONTAINED). 914 * See USetSpanCondition for details. 915 * Unpaired surrogates are treated according to contains() of their surrogate code points. 916 * This function works faster with a frozen set and with a non-negative string length argument. 917 * @param set the set 918 * @param s start of the string 919 * @param length of the string; can be -1 for NUL-terminated 920 * @param spanCondition specifies the containment condition 921 * @return the start of the trailing substring according to the spanCondition; 922 * the string length if the end of the string does not fit the spanCondition 923 * @stable ICU 3.8 924 * @see USetSpanCondition 925 */ 926 U_DRAFT int32_t U_EXPORT2 927 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 928 929 /** 930 * Returns the length of the initial substring of the input string which 931 * consists only of characters and strings that are contained in this set 932 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 933 * or only of characters and strings that are not contained 934 * in this set (USET_SPAN_NOT_CONTAINED). 935 * See USetSpanCondition for details. 936 * Similar to the strspn() C library function. 937 * Malformed byte sequences are treated according to contains(0xfffd). 938 * This function works faster with a frozen set and with a non-negative string length argument. 939 * @param set the set 940 * @param s start of the string (UTF-8) 941 * @param length of the string; can be -1 for NUL-terminated 942 * @param spanCondition specifies the containment condition 943 * @return the length of the initial substring according to the spanCondition; 944 * 0 if the start of the string does not fit the spanCondition 945 * @stable ICU 3.8 946 * @see USetSpanCondition 947 */ 948 U_DRAFT int32_t U_EXPORT2 949 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 950 951 /** 952 * Returns the start of the trailing substring of the input string which 953 * consists only of characters and strings that are contained in this set 954 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 955 * or only of characters and strings that are not contained 956 * in this set (USET_SPAN_NOT_CONTAINED). 957 * See USetSpanCondition for details. 958 * Malformed byte sequences are treated according to contains(0xfffd). 959 * This function works faster with a frozen set and with a non-negative string length argument. 960 * @param set the set 961 * @param s start of the string (UTF-8) 962 * @param length of the string; can be -1 for NUL-terminated 963 * @param spanCondition specifies the containment condition 964 * @return the start of the trailing substring according to the spanCondition; 965 * the string length if the end of the string does not fit the spanCondition 966 * @stable ICU 3.8 967 * @see USetSpanCondition 968 */ 969 U_DRAFT int32_t U_EXPORT2 970 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 971 972 /** 973 * Returns true if set1 contains all of the characters and strings 974 * of set2, and vis versa. It answers the question, 'Is set1 equal to set2?' 975 * @param set1 set to be checked for containment 976 * @param set2 set to be checked for containment 977 * @return true if the test condition is met 978 * @stable ICU 3.2 979 */ 980 U_STABLE UBool U_EXPORT2 981 uset_equals(const USet* set1, const USet* set2); 982 983 /********************************************************************* 984 * Serialized set API 985 *********************************************************************/ 986 987 /** 988 * Serializes this set into an array of 16-bit integers. Serialization 989 * (currently) only records the characters in the set; multicharacter 990 * strings are ignored. 991 * 992 * The array 993 * has following format (each line is one 16-bit integer): 994 * 995 * length = (n+2*m) | (m!=0?0x8000:0) 996 * bmpLength = n; present if m!=0 997 * bmp[0] 998 * bmp[1] 999 * ... 1000 * bmp[n-1] 1001 * supp-high[0] 1002 * supp-low[0] 1003 * supp-high[1] 1004 * supp-low[1] 1005 * ... 1006 * supp-high[m-1] 1007 * supp-low[m-1] 1008 * 1009 * The array starts with a header. After the header are n bmp 1010 * code points, then m supplementary code points. Either n or m 1011 * or both may be zero. n+2*m is always <= 0x7FFF. 1012 * 1013 * If there are no supplementary characters (if m==0) then the 1014 * header is one 16-bit integer, 'length', with value n. 1015 * 1016 * If there are supplementary characters (if m!=0) then the header 1017 * is two 16-bit integers. The first, 'length', has value 1018 * (n+2*m)|0x8000. The second, 'bmpLength', has value n. 1019 * 1020 * After the header the code points are stored in ascending order. 1021 * Supplementary code points are stored as most significant 16 1022 * bits followed by least significant 16 bits. 1023 * 1024 * @param set the set 1025 * @param dest pointer to buffer of destCapacity 16-bit integers. 1026 * May be NULL only if destCapacity is zero. 1027 * @param destCapacity size of dest, or zero. Must not be negative. 1028 * @param pErrorCode pointer to the error code. Will be set to 1029 * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to 1030 * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. 1031 * @return the total length of the serialized format, including 1032 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 1033 * than U_BUFFER_OVERFLOW_ERROR. 1034 * @stable ICU 2.4 1035 */ 1036 U_STABLE int32_t U_EXPORT2 1037 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); 1038 1039 /** 1040 * Given a serialized array, fill in the given serialized set object. 1041 * @param fillSet pointer to result 1042 * @param src pointer to start of array 1043 * @param srcLength length of array 1044 * @return true if the given array is valid, otherwise false 1045 * @stable ICU 2.4 1046 */ 1047 U_STABLE UBool U_EXPORT2 1048 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); 1049 1050 /** 1051 * Set the USerializedSet to contain the given character (and nothing 1052 * else). 1053 * @param fillSet pointer to result 1054 * @param c The codepoint to set 1055 * @stable ICU 2.4 1056 */ 1057 U_STABLE void U_EXPORT2 1058 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); 1059 1060 /** 1061 * Returns TRUE if the given USerializedSet contains the given 1062 * character. 1063 * @param set the serialized set 1064 * @param c The codepoint to check for within the set 1065 * @return true if set contains c 1066 * @stable ICU 2.4 1067 */ 1068 U_STABLE UBool U_EXPORT2 1069 uset_serializedContains(const USerializedSet* set, UChar32 c); 1070 1071 /** 1072 * Returns the number of disjoint ranges of characters contained in 1073 * the given serialized set. Ignores any strings contained in the 1074 * set. 1075 * @param set the serialized set 1076 * @return a non-negative integer counting the character ranges 1077 * contained in set 1078 * @stable ICU 2.4 1079 */ 1080 U_STABLE int32_t U_EXPORT2 1081 uset_getSerializedRangeCount(const USerializedSet* set); 1082 1083 /** 1084 * Returns a range of characters contained in the given serialized 1085 * set. 1086 * @param set the serialized set 1087 * @param rangeIndex a non-negative integer in the range 0.. 1088 * uset_getSerializedRangeCount(set)-1 1089 * @param pStart pointer to variable to receive first character 1090 * in range, inclusive 1091 * @param pEnd pointer to variable to receive last character in range, 1092 * inclusive 1093 * @return true if rangeIndex is valid, otherwise false 1094 * @stable ICU 2.4 1095 */ 1096 U_STABLE UBool U_EXPORT2 1097 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 1098 UChar32* pStart, UChar32* pEnd); 1099 1100 #endif 1101