1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.io.IOException; 12 import java.text.ParsePosition; 13 import java.util.ArrayList; 14 import java.util.Arrays; 15 import java.util.Collection; 16 import java.util.Collections; 17 import java.util.Iterator; 18 import java.util.NoSuchElementException; 19 import java.util.SortedSet; 20 import java.util.TreeSet; 21 22 import com.ibm.icu.impl.BMPSet; 23 import com.ibm.icu.impl.CharacterPropertiesImpl; 24 import com.ibm.icu.impl.PatternProps; 25 import com.ibm.icu.impl.RuleCharacterIterator; 26 import com.ibm.icu.impl.SortedSetRelation; 27 import com.ibm.icu.impl.StringRange; 28 import com.ibm.icu.impl.UCaseProps; 29 import com.ibm.icu.impl.UPropertyAliases; 30 import com.ibm.icu.impl.UnicodeSetStringSpan; 31 import com.ibm.icu.impl.Utility; 32 import com.ibm.icu.lang.CharSequences; 33 import com.ibm.icu.lang.CharacterProperties; 34 import com.ibm.icu.lang.UCharacter; 35 import com.ibm.icu.lang.UProperty; 36 import com.ibm.icu.lang.UScript; 37 import com.ibm.icu.util.Freezable; 38 import com.ibm.icu.util.ICUUncheckedIOException; 39 import com.ibm.icu.util.OutputInt; 40 import com.ibm.icu.util.ULocale; 41 import com.ibm.icu.util.VersionInfo; 42 43 /** 44 * A mutable set of Unicode characters and multicharacter strings. 45 * Objects of this class represent <em>character classes</em> used 46 * in regular expressions. A character specifies a subset of Unicode 47 * code points. Legal code points are U+0000 to U+10FFFF, inclusive. 
48 * 49 * Note: method freeze() will not only make the set immutable, but 50 * also makes important methods much higher performance: 51 * contains(c), containsNone(...), span(...), spanBack(...) etc. 52 * After the object is frozen, any subsequent call that wants to change 53 * the object will throw UnsupportedOperationException. 54 * 55 * <p>The UnicodeSet class is not designed to be subclassed. 56 * 57 * <p><code>UnicodeSet</code> supports two APIs. The first is the 58 * <em>operand</em> API that allows the caller to modify the value of 59 * a <code>UnicodeSet</code> object. It conforms to Java 2's 60 * <code>java.util.Set</code> interface, although 61 * <code>UnicodeSet</code> does not actually implement that 62 * interface. All methods of <code>Set</code> are supported, with the 63 * modification that they take a character range or single character 64 * instead of an <code>Object</code>, and they take a 65 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 66 * operand API may be thought of in terms of boolean logic: a boolean 67 * OR is implemented by <code>add</code>, a boolean AND is implemented 68 * by <code>retain</code>, a boolean XOR is implemented by 69 * <code>complement</code> taking an argument, and a boolean NOT is 70 * implemented by <code>complement</code> with no argument. In terms 71 * of traditional set theory function names, <code>add</code> is a 72 * union, <code>retain</code> is an intersection, <code>remove</code> 73 * is an asymmetric difference, and <code>complement</code> with no 74 * argument is a set complement with respect to the superset range 75 * <code>MIN_VALUE-MAX_VALUE</code> 76 * 77 * <p>The second API is the 78 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 79 * <code>java.text.Format</code>-derived classes. 
Unlike the 80 * methods that add characters, add categories, and control the logic 81 * of the set, the method <code>applyPattern()</code> sets all 82 * attributes of a <code>UnicodeSet</code> at once, based on a 83 * string pattern. 84 * 85 * <p><b>Pattern syntax</b></p> 86 * 87 * Patterns are accepted by the constructors and the 88 * <code>applyPattern()</code> methods and returned by the 89 * <code>toPattern()</code> method. These patterns follow a syntax 90 * similar to that employed by version 8 regular expression character 91 * classes. Here are some simple examples: 92 * 93 * <blockquote> 94 * <table> 95 * <tr style="vertical-align: top"> 96 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td> 97 * <td style="vertical-align: top;">No characters</td> 98 * </tr><tr style="vertical-align: top"> 99 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td> 100 * <td style="vertical-align: top;">The character 'a'</td> 101 * </tr><tr style="vertical-align: top"> 102 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td> 103 * <td style="vertical-align: top;">The characters 'a' and 'e'</td> 104 * </tr> 105 * <tr> 106 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td> 107 * <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code 108 * point order</td> 109 * </tr> 110 * <tr> 111 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td> 112 * <td style="vertical-align: top;">The character U+4E01</td> 113 * </tr> 114 * <tr> 115 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td> 116 * <td style="vertical-align: top;">The character 'a' and the multicharacter strings "ab" and 117 * "ac"</td> 118 * </tr> 119 * <tr> 120 * <td 
style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td> 121 * <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td> 122 * </tr> 123 * </table> 124 * </blockquote> 125 * 126 * Any character may be preceded by a backslash in order to remove any special 127 * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are 128 * ignored, unless they are escaped. 129 * 130 * <p>Property patterns specify a set of characters having a certain 131 * property as defined by the Unicode standard. Both the POSIX-like 132 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a 133 * complete list of supported property patterns, see the User's Guide 134 * for UnicodeSet at 135 * <a href="http://www.icu-project.org/userguide/unicodeSet.html"> 136 * http://www.icu-project.org/userguide/unicodeSet.html</a>. 137 * Actual determination of property data is defined by the underlying 138 * Unicode database as implemented by UCharacter. 139 * 140 * <p>Patterns specify individual characters, ranges of characters, and 141 * Unicode property sets. When elements are concatenated, they 142 * specify their union. To complement a set, place a '^' immediately 143 * after the opening '['. Property patterns are inverted by modifying 144 * their delimiters; "[:^foo:]" and "\P{foo}". In any other location, 145 * '^' has no special meaning. 146 * 147 * <p>Ranges are indicated by placing a '-' between two 148 * characters, as in "a-z". This specifies the range of all 149 * characters from the left to the right, in Unicode order. If the 150 * left character is greater than or equal to the 151 * right character it is a syntax error. If a '-' occurs as the first 152 * character after the opening '[' or '[^', or if it occurs as the 153 * last character before the closing ']', then it is taken as a 154 * literal. 
Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same 155 * set of three characters, 'a', 'b', and '-'. 156 * 157 * <p>Sets may be intersected using the '&' operator or the asymmetric 158 * set difference may be taken using the '-' operator, for example, 159 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters 160 * with values less than 4096. Operators ('&' and '-') have equal 161 * precedence and bind left-to-right. Thus 162 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to 163 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for 164 * difference; intersection is commutative. 165 * 166 * <table> 167 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a' 168 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a' 169 * through 'z' and all letters in between, in Unicode order 170 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing 171 * all characters but 'a' through 'z', 172 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 173 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 174 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 175 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 176 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 177 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 178 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 179 * <em>pat2</em> 180 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code> 181 * <td>The set of characters having the specified 182 * Unicode property; in 183 * this case, Unicode uppercase letters 184 * <tr 
style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code> 185 * <td>The set of characters <em>not</em> having the given 186 * Unicode property 187 * </table> 188 * 189 * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p> 190 * 191 * <p><b>Formal syntax</b></p> 192 * 193 * <blockquote> 194 * <table> 195 * <tr style="vertical-align: top"> 196 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern := </code></td> 197 * <td style="vertical-align: top;"><code>('[' '^'? item* ']') | 198 * property</code></td> 199 * </tr> 200 * <tr style="vertical-align: top"> 201 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item := </code></td> 202 * <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br> 203 * </code></td> 204 * </tr> 205 * <tr style="vertical-align: top"> 206 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr := </code></td> 207 * <td style="vertical-align: top;"><code>pattern | pattern-expr pattern | 208 * pattern-expr op pattern<br> 209 * </code></td> 210 * </tr> 211 * <tr style="vertical-align: top"> 212 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op := </code></td> 213 * <td style="vertical-align: top;"><code>'&' | '-'<br> 214 * </code></td> 215 * </tr> 216 * <tr style="vertical-align: top"> 217 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special := </code></td> 218 * <td style="vertical-align: top;"><code>'[' | ']' | '-'<br> 219 * </code></td> 220 * </tr> 221 * <tr style="vertical-align: top"> 222 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char := </code></td> 223 * <td style="vertical-align: top;"><em>any character that is not</em><code> special<br> 224 * | ('\\' </code><em>any character</em><code>)<br> 225 * | ('\u' hex hex hex hex)<br> 226 * </code></td> 227 * </tr> 228 * <tr 
style="vertical-align: top"> 229 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex := </code></td> 230 * <td style="vertical-align: top;"><em>any character for which 231 * </em><code>Character.digit(c, 16)</code><em> 232 * returns a non-negative result</em></td> 233 * </tr> 234 * <tr> 235 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property := </code></td> 236 * <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td> 237 * </tr> 238 * </table> 239 * <br> 240 * <table border="1"> 241 * <tr> 242 * <td>Legend: <table> 243 * <tr> 244 * <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td> 245 * <td style="width: 20; vertical-align: top;"> </td> 246 * <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td> 247 * </tr> 248 * <tr> 249 * <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td> 250 * <td style="vertical-align: top;"></td> 251 * <td style="vertical-align: top;">zero or one instance of <code>a</code><br> 252 * </td> 253 * </tr> 254 * <tr> 255 * <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td> 256 * <td style="vertical-align: top;"></td> 257 * <td style="vertical-align: top;">zero or more instances of <code>a</code><br> 258 * </td> 259 * </tr> 260 * <tr> 261 * <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td> 262 * <td style="vertical-align: top;"></td> 263 * <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br> 264 * </td> 265 * </tr> 266 * <tr> 267 * <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td> 268 * <td style="vertical-align: top;"></td> 269 * <td style="vertical-align: top;">the literal string between the quotes </td> 270 * </tr> 271 * </table> 272 * </td> 273 * </tr> 274 * </table> 275 * </blockquote> 276 * <p>To iterate over contents of UnicodeSet, the following are available: 277 * 
<ul><li>{@link #ranges()} to iterate through the ranges</li> 278 * <li>{@link #strings()} to iterate through the strings</li> 279 * <li>{@link #iterator()} to iterate through the entire contents in a single loop. 280 * That method is, however, not particularly efficient, since it "boxes" each code point into a String. 281 * </ul> 282 * All of the above can be used in <b>for</b> loops. 283 * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops. 284 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 285 * 286 * @author Alan Liu 287 * @stable ICU 2.0 288 * @see UnicodeSetIterator 289 * @see UnicodeSetSpanner 290 */ 291 public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> { 292 private static final SortedSet<String> EMPTY_STRINGS = 293 Collections.unmodifiableSortedSet(new TreeSet<String>()); 294 295 /** 296 * Constant for the empty set. 297 * @stable ICU 4.8 298 */ 299 public static final UnicodeSet EMPTY = new UnicodeSet().freeze(); 300 /** 301 * Constant for the set of all code points. (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.) 302 * @stable ICU 4.8 303 */ 304 public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze(); 305 306 private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the the function processing 307 308 private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints 309 private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. 310 // 110000 for codepoints 311 312 /** 313 * Enough for sets with few ranges. 314 * For example, White_Space has 10 ranges, list length 21. 
315 */ 316 private static final int INITIAL_CAPACITY = 25; 317 318 /** Max list [0, 1, 2, ..., max code point, HIGH] */ 319 private static final int MAX_LENGTH = HIGH + 1; 320 321 /** 322 * Minimum value that can be stored in a UnicodeSet. 323 * @stable ICU 2.0 324 */ 325 public static final int MIN_VALUE = LOW; 326 327 /** 328 * Maximum value that can be stored in a UnicodeSet. 329 * @stable ICU 2.0 330 */ 331 public static final int MAX_VALUE = HIGH - 1; 332 333 private int len; // length used; list may be longer to minimize reallocs 334 private int[] list; // MUST be terminated with HIGH 335 private int[] rangeList; // internal buffer 336 private int[] buffer; // internal buffer 337 338 // is not private so that UnicodeSetIterator can get access 339 SortedSet<String> strings = EMPTY_STRINGS; 340 341 /** 342 * The pattern representation of this set. This may not be the 343 * most economical pattern. It is the pattern supplied to 344 * applyPattern(), with variables substituted and whitespace 345 * removed. For sets constructed without applyPattern(), or 346 * modified using the non-pattern API, this string will be null, 347 * indicating that toPattern() must generate a pattern 348 * representation from the inversion list. 349 */ 350 private String pat = null; 351 352 // Special property set IDs 353 private static final String ANY_ID = "ANY"; // [\u0000-\U0010FFFF] 354 private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F] 355 private static final String ASSIGNED = "Assigned"; // [:^Cn:] 356 357 private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. 358 private volatile UnicodeSetStringSpan stringSpan; 359 //---------------------------------------------------------------- 360 // Public API 361 //---------------------------------------------------------------- 362 363 /** 364 * Constructs an empty set. 
365 * @stable ICU 2.0 366 */ 367 public UnicodeSet() { 368 list = new int[INITIAL_CAPACITY]; 369 list[0] = HIGH; 370 len = 1; 371 } 372 373 /** 374 * Constructs a copy of an existing set. 375 * @stable ICU 2.0 376 */ 377 public UnicodeSet(UnicodeSet other) { 378 set(other); 379 } 380 381 /** 382 * Constructs a set containing the given range. If <code>end > 383 * start</code> then an empty set is created. 384 * 385 * @param start first character, inclusive, of range 386 * @param end last character, inclusive, of range 387 * @stable ICU 2.0 388 */ 389 public UnicodeSet(int start, int end) { 390 this(); 391 add(start, end); 392 } 393 394 /** 395 * Quickly constructs a set from a set of ranges <s0, e0, s1, e1, s2, e2, ..., sn, en>. 396 * There must be an even number of integers, and they must be all greater than zero, 397 * all less than or equal to Character.MAX_CODE_POINT. 398 * In each pair (..., si, ei, ...) it must be true that si <= ei 399 * Between adjacent pairs (...ei, sj...), it must be true that ei+1 < sj 400 * @param pairs pairs of character representing ranges 401 * @stable ICU 4.4 402 */ 403 public UnicodeSet(int... pairs) { 404 if ((pairs.length & 1) != 0) { 405 throw new IllegalArgumentException("Must have even number of integers"); 406 } 407 list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set. 408 len = list.length; 409 int last = -1; // used to ensure that the results are monotonically increasing. 410 int i = 0; 411 while (i < pairs.length) { 412 int start = pairs[i]; 413 if (last >= start) { 414 throw new IllegalArgumentException("Must be monotonically increasing."); 415 } 416 list[i++] = start; 417 int limit = pairs[i] + 1; 418 if (start >= limit) { 419 throw new IllegalArgumentException("Must be monotonically increasing."); 420 } 421 list[i++] = last = limit; 422 } 423 list[i] = HIGH; // terminate 424 } 425 426 /** 427 * Constructs a set from the given pattern. 
See the class description 428 * for the syntax of the pattern language. Whitespace is ignored. 429 * @param pattern a string specifying what characters are in the set 430 * @exception java.lang.IllegalArgumentException if the pattern contains 431 * a syntax error. 432 * @stable ICU 2.0 433 */ 434 public UnicodeSet(String pattern) { 435 this(); 436 applyPattern(pattern, null, null, IGNORE_SPACE); 437 } 438 439 /** 440 * Constructs a set from the given pattern. See the class description 441 * for the syntax of the pattern language. 442 * @param pattern a string specifying what characters are in the set 443 * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters 444 * @exception java.lang.IllegalArgumentException if the pattern contains 445 * a syntax error. 446 * @stable ICU 2.0 447 */ 448 public UnicodeSet(String pattern, boolean ignoreWhitespace) { 449 this(); 450 applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 451 } 452 453 /** 454 * Constructs a set from the given pattern. See the class description 455 * for the syntax of the pattern language. 456 * @param pattern a string specifying what characters are in the set 457 * @param options a bitmask indicating which options to apply. 458 * Valid options are IGNORE_SPACE and CASE. 459 * @exception java.lang.IllegalArgumentException if the pattern contains 460 * a syntax error. 461 * @stable ICU 3.8 462 */ 463 public UnicodeSet(String pattern, int options) { 464 this(); 465 applyPattern(pattern, null, null, options); 466 } 467 468 /** 469 * Constructs a set from the given pattern. See the class description 470 * for the syntax of the pattern language. 471 * @param pattern a string specifying what characters are in the set 472 * @param pos on input, the position in pattern at which to start parsing. 473 * On output, the position after the last character parsed. 
474 * @param symbols a symbol table mapping variables to char[] arrays 475 * and chars to UnicodeSets 476 * @exception java.lang.IllegalArgumentException if the pattern 477 * contains a syntax error. 478 * @stable ICU 2.0 479 */ 480 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) { 481 this(); 482 applyPattern(pattern, pos, symbols, IGNORE_SPACE); 483 } 484 485 /** 486 * Constructs a set from the given pattern. See the class description 487 * for the syntax of the pattern language. 488 * @param pattern a string specifying what characters are in the set 489 * @param pos on input, the position in pattern at which to start parsing. 490 * On output, the position after the last character parsed. 491 * @param symbols a symbol table mapping variables to char[] arrays 492 * and chars to UnicodeSets 493 * @param options a bitmask indicating which options to apply. 494 * Valid options are IGNORE_SPACE and CASE. 495 * @exception java.lang.IllegalArgumentException if the pattern 496 * contains a syntax error. 497 * @stable ICU 3.2 498 */ 499 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) { 500 this(); 501 applyPattern(pattern, pos, symbols, options); 502 } 503 504 505 /** 506 * Return a new set that is equivalent to this one. 507 * @stable ICU 2.0 508 */ 509 @Override 510 public Object clone() { 511 if (isFrozen()) { 512 return this; 513 } 514 return new UnicodeSet(this); 515 } 516 517 /** 518 * Make this object represent the range <code>start - end</code>. 519 * If <code>end > start</code> then this object is set to an empty range. 520 * 521 * @param start first character in the set, inclusive 522 * @param end last character in the set, inclusive 523 * @stable ICU 2.0 524 */ 525 public UnicodeSet set(int start, int end) { 526 checkFrozen(); 527 clear(); 528 complement(start, end); 529 return this; 530 } 531 532 /** 533 * Make this object represent the same set as <code>other</code>. 
534 * @param other a <code>UnicodeSet</code> whose value will be 535 * copied to this object 536 * @stable ICU 2.0 537 */ 538 public UnicodeSet set(UnicodeSet other) { 539 checkFrozen(); 540 list = Arrays.copyOf(other.list, other.len); 541 len = other.len; 542 pat = other.pat; 543 if (other.hasStrings()) { 544 strings = new TreeSet<>(other.strings); 545 } else { 546 strings = EMPTY_STRINGS; 547 } 548 return this; 549 } 550 551 /** 552 * Modifies this set to represent the set specified by the given pattern. 553 * See the class description for the syntax of the pattern language. 554 * Whitespace is ignored. 555 * @param pattern a string specifying what characters are in the set 556 * @exception java.lang.IllegalArgumentException if the pattern 557 * contains a syntax error. 558 * @stable ICU 2.0 559 */ 560 public final UnicodeSet applyPattern(String pattern) { 561 checkFrozen(); 562 return applyPattern(pattern, null, null, IGNORE_SPACE); 563 } 564 565 /** 566 * Modifies this set to represent the set specified by the given pattern, 567 * optionally ignoring whitespace. 568 * See the class description for the syntax of the pattern language. 569 * @param pattern a string specifying what characters are in the set 570 * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored 571 * @exception java.lang.IllegalArgumentException if the pattern 572 * contains a syntax error. 573 * @stable ICU 2.0 574 */ 575 public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) { 576 checkFrozen(); 577 return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 578 } 579 580 /** 581 * Modifies this set to represent the set specified by the given pattern, 582 * optionally ignoring whitespace. 583 * See the class description for the syntax of the pattern language. 584 * @param pattern a string specifying what characters are in the set 585 * @param options a bitmask indicating which options to apply. 
586 * Valid options are IGNORE_SPACE and CASE. 587 * @exception java.lang.IllegalArgumentException if the pattern 588 * contains a syntax error. 589 * @stable ICU 3.8 590 */ 591 public UnicodeSet applyPattern(String pattern, int options) { 592 checkFrozen(); 593 return applyPattern(pattern, null, null, options); 594 } 595 596 /** 597 * Return true if the given position, in the given pattern, appears 598 * to be the start of a UnicodeSet pattern. 599 * @stable ICU 2.0 600 */ 601 public static boolean resemblesPattern(String pattern, int pos) { 602 return ((pos+1) < pattern.length() && 603 pattern.charAt(pos) == '[') || 604 resemblesPropertyPattern(pattern, pos); 605 } 606 607 /** 608 * TODO: create Appendable version of UTF16.append(buf, c), 609 * maybe in new class Appendables? 610 * @throws IOException 611 */ 612 private static void appendCodePoint(Appendable app, int c) { 613 assert 0 <= c && c <= 0x10ffff; 614 try { 615 if (c <= 0xffff) { 616 app.append((char) c); 617 } else { 618 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c)); 619 } 620 } catch (IOException e) { 621 throw new ICUUncheckedIOException(e); 622 } 623 } 624 625 /** 626 * TODO: create class Appendables? 627 * @throws IOException 628 */ 629 private static void append(Appendable app, CharSequence s) { 630 try { 631 app.append(s); 632 } catch (IOException e) { 633 throw new ICUUncheckedIOException(e); 634 } 635 } 636 637 /** 638 * Append the <code>toPattern()</code> representation of a 639 * string to the given <code>Appendable</code>. 640 */ 641 private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) { 642 int cp; 643 for (int i = 0; i < s.length(); i += Character.charCount(cp)) { 644 cp = s.codePointAt(i); 645 _appendToPat(buf, cp, escapeUnprintable); 646 } 647 return buf; 648 } 649 650 /** 651 * Append the <code>toPattern()</code> representation of a 652 * character to the given <code>Appendable</code>. 
653 */ 654 private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) { 655 try { 656 if (escapeUnprintable && Utility.isUnprintable(c)) { 657 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything 658 // unprintable 659 if (Utility.escapeUnprintable(buf, c)) { 660 return buf; 661 } 662 } 663 // Okay to let ':' pass through 664 switch (c) { 665 case '[': // SET_OPEN: 666 case ']': // SET_CLOSE: 667 case '-': // HYPHEN: 668 case '^': // COMPLEMENT: 669 case '&': // INTERSECTION: 670 case '\\': //BACKSLASH: 671 case '{': 672 case '}': 673 case '$': 674 case ':': 675 buf.append('\\'); 676 break; 677 default: 678 // Escape whitespace 679 if (PatternProps.isWhiteSpace(c)) { 680 buf.append('\\'); 681 } 682 break; 683 } 684 appendCodePoint(buf, c); 685 return buf; 686 } catch (IOException e) { 687 throw new ICUUncheckedIOException(e); 688 } 689 } 690 691 /** 692 * Returns a string representation of this set. If the result of 693 * calling this function is passed to a UnicodeSet constructor, it 694 * will produce another set that is equal to this one. 695 * @stable ICU 2.0 696 */ 697 @Override 698 public String toPattern(boolean escapeUnprintable) { 699 if (pat != null && !escapeUnprintable) { 700 return pat; 701 } 702 StringBuilder result = new StringBuilder(); 703 return _toPattern(result, escapeUnprintable).toString(); 704 } 705 706 /** 707 * Append a string representation of this set to result. This will be 708 * a cleaned version of the string passed to applyPattern(), if there 709 * is one. Otherwise it will be generated. 
710 */ 711 private <T extends Appendable> T _toPattern(T result, 712 boolean escapeUnprintable) { 713 if (pat == null) { 714 return appendNewPattern(result, escapeUnprintable, true); 715 } 716 try { 717 if (!escapeUnprintable) { 718 result.append(pat); 719 return result; 720 } 721 boolean oddNumberOfBackslashes = false; 722 for (int i=0; i<pat.length(); ) { 723 int c = pat.codePointAt(i); 724 i += Character.charCount(c); 725 if (Utility.isUnprintable(c)) { 726 // If the unprintable character is preceded by an odd 727 // number of backslashes, then it has been escaped 728 // and we omit the last backslash. 729 Utility.escapeUnprintable(result, c); 730 oddNumberOfBackslashes = false; 731 } else if (!oddNumberOfBackslashes && c == '\\') { 732 // Temporarily withhold an odd-numbered backslash. 733 oddNumberOfBackslashes = true; 734 } else { 735 if (oddNumberOfBackslashes) { 736 result.append('\\'); 737 } 738 appendCodePoint(result, c); 739 oddNumberOfBackslashes = false; 740 } 741 } 742 if (oddNumberOfBackslashes) { 743 result.append('\\'); 744 } 745 return result; 746 } catch (IOException e) { 747 throw new ICUUncheckedIOException(e); 748 } 749 } 750 751 /** 752 * Generate and append a string representation of this set to result. 753 * This does not use this.pat, the cleaned up copy of the string 754 * passed to applyPattern(). 755 * @param result the buffer into which to generate the pattern 756 * @param escapeUnprintable escape unprintable characters if true 757 * @stable ICU 2.0 758 */ 759 public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) { 760 return _generatePattern(result, escapeUnprintable, true); 761 } 762 763 /** 764 * Generate and append a string representation of this set to result. 765 * This does not use this.pat, the cleaned up copy of the string 766 * passed to applyPattern(). 767 * @param includeStrings if false, doesn't include the strings. 
768 * @stable ICU 3.8 769 */ 770 public StringBuffer _generatePattern(StringBuffer result, 771 boolean escapeUnprintable, boolean includeStrings) { 772 return appendNewPattern(result, escapeUnprintable, includeStrings); 773 } 774 775 private <T extends Appendable> T appendNewPattern( 776 T result, boolean escapeUnprintable, boolean includeStrings) { 777 try { 778 result.append('['); 779 780 int count = getRangeCount(); 781 782 // If the set contains at least 2 intervals and includes both 783 // MIN_VALUE and MAX_VALUE, then the inverse representation will 784 // be more economical. 785 if (count > 1 && 786 getRangeStart(0) == MIN_VALUE && 787 getRangeEnd(count-1) == MAX_VALUE) { 788 789 // Emit the inverse 790 result.append('^'); 791 792 for (int i = 1; i < count; ++i) { 793 int start = getRangeEnd(i-1)+1; 794 int end = getRangeStart(i)-1; 795 _appendToPat(result, start, escapeUnprintable); 796 if (start != end) { 797 if ((start+1) != end) { 798 result.append('-'); 799 } 800 _appendToPat(result, end, escapeUnprintable); 801 } 802 } 803 } 804 805 // Default; emit the ranges as pairs 806 else { 807 for (int i = 0; i < count; ++i) { 808 int start = getRangeStart(i); 809 int end = getRangeEnd(i); 810 _appendToPat(result, start, escapeUnprintable); 811 if (start != end) { 812 if ((start+1) != end) { 813 result.append('-'); 814 } 815 _appendToPat(result, end, escapeUnprintable); 816 } 817 } 818 } 819 820 if (includeStrings && hasStrings()) { 821 for (String s : strings) { 822 result.append('{'); 823 _appendToPat(result, s, escapeUnprintable); 824 result.append('}'); 825 } 826 } 827 result.append(']'); 828 return result; 829 } catch (IOException e) { 830 throw new ICUUncheckedIOException(e); 831 } 832 } 833 834 boolean hasStrings() { 835 return !strings.isEmpty(); 836 } 837 838 /** 839 * Returns the number of elements in this set (its cardinality) 840 * Note than the elements of a set may include both individual 841 * codepoints and strings. 
*
     * @return the number of elements in this set (its cardinality).
     * @stable ICU 2.0
     */
    public int size() {
        int n = 0;
        int count = getRangeCount();
        // Each range contributes end - start + 1 code points.
        for (int i = 0; i < count; ++i) {
            n += getRangeEnd(i) - getRangeStart(i) + 1;
        }
        // Every string counts as one element.
        return n + strings.size();
    }

    /**
     * Returns <tt>true</tt> if this set contains no elements.
     *
     * @return <tt>true</tt> if this set contains no elements.
     * @stable ICU 2.0
     */
    public boolean isEmpty() {
        // len == 1 means the list holds only its terminating entry, i.e. no ranges.
        return len == 1 && !hasStrings();
    }

    /**
     * Implementation of UnicodeMatcher API.  Returns <tt>true</tt> if
     * this set contains any character whose low byte is the given
     * value.  This is used by <tt>RuleBasedTransliterator</tt> for
     * indexing.
     * @param v the index value (low byte) to test
     * @return <tt>true</tt> if some range of this set, or the first code point
     * of some string in this set, has v as its low byte
     * @stable ICU 2.0
     */
    @Override
    public boolean matchesIndexValue(int v) {
        /* The index value v, in the range [0,255], is contained in this set if
         * it is contained in any pair of this set.  Pairs either have the high
         * bytes equal, or unequal.  If the high bytes are equal, then we have
         * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
         * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
         * Then v is contained if xx <= v || v <= yy.  (This is identical to the
         * time zone month containment logic.)
         */
        for (int i=0; i<getRangeCount(); ++i) {
            int low = getRangeStart(i);
            int high = getRangeEnd(i);
            if ((low & ~0xFF) == (high & ~0xFF)) {
                if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
                    return true;
                }
            } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
                return true;
            }
        }
        if (hasStrings()) {
            for (String s : strings) {
                //if (s.length() == 0) {
                //    // Empty strings match everything
                //    return true;
                //}
                // assert(s.length() != 0); // We enforce this elsewhere
                // Only the first code point of a string takes part in indexing.
                int c = UTF16.charAt(s, 0);
                if ((c & 0xFF) == v) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Implementation of UnicodeMatcher.matches().  Always matches the
     * longest possible multichar string.
     * @param text the text to match against
     * @param offset one-element array; offset[0] is the position to match at
     * and is advanced (or moved back, in the reverse direction) past a full
     * string match
     * @param limit the limit offset; matching is forward when offset[0] &lt; limit
     * and backward otherwise
     * @param incremental if true, a match that reaches limit is reported as
     * U_PARTIAL_MATCH
     * @return U_MATCH, U_PARTIAL_MATCH or U_MISMATCH; when no string matches,
     * the result of super.matches()
     * @stable ICU 2.0
     */
    @Override
    public int matches(Replaceable text,
            int[] offset,
            int limit,
            boolean incremental) {

        if (offset[0] == limit) {
            // Strings, if any, have length != 0, so we don't worry
            // about them here.  If we ever allow zero-length strings
            // we must check for them here.
            if (contains(UnicodeMatcher.ETHER)) {
                return incremental ? U_PARTIAL_MATCH : U_MATCH;
            } else {
                return U_MISMATCH;
            }
        } else {
            if (hasStrings()) { // try strings first

                // might separate forward and backward loops later
                // for now they are combined

                // TODO Improve efficiency of this, at least in the forward
                // direction, if not in both.  In the forward direction we
                // can assume the strings are sorted.

                boolean forward = offset[0] < limit;

                // firstChar is the leftmost char to match in the
                // forward direction or the rightmost char to match in
                // the reverse direction.
                char firstChar = text.charAt(offset[0]);

                // If there are multiple strings that can match we
                // return the longest match.
                int highWaterLength = 0;

                for (String trial : strings) {
                    //if (trial.length() == 0) {
                    //    return U_MATCH; // null-string always matches
                    //}
                    // assert(trial.length() != 0); // We ensure this elsewhere

                    // Compare the char of trial that lines up with firstChar:
                    // its first char going forward, its last char going backward.
                    char c = trial.charAt(forward ? 0 : trial.length() - 1);

                    // Strings are sorted, so we can optimize in the
                    // forward direction.
                    if (forward && c > firstChar) break;
                    if (c != firstChar) continue;

                    int length = matchRest(text, offset[0], limit, trial);

                    if (incremental) {
                        int maxLen = forward ? limit-offset[0] : offset[0]-limit;
                        if (length == maxLen) {
                            // We have successfully matched but only up to limit.
                            return U_PARTIAL_MATCH;
                        }
                    }

                    if (length == trial.length()) {
                        // We have successfully matched the whole string.
                        if (length > highWaterLength) {
                            highWaterLength = length;
                        }
                        // In the forward direction we know strings
                        // are sorted so we can bail early.
                        if (forward && length < highWaterLength) {
                            break;
                        }
                        continue;
                    }
                }

                // We've checked all strings without a partial match.
                // If we have full matches, return the longest one.
                if (highWaterLength != 0) {
                    offset[0] += forward ? highWaterLength : -highWaterLength;
                    return U_MATCH;
                }
            }
            // No string matched: fall back to the superclass implementation.
            return super.matches(text, offset, limit, incremental);
        }
    }

    /**
     * Returns the longest match for s in text at the given position.
     * If limit &gt; start then match forward from start+1 to limit
     * matching all characters except s.charAt(0).  If limit &lt; start,
     * go backward starting from start-1 matching all characters
     * except s.charAt(s.length()-1).  This method assumes that the
     * first character, text.charAt(start), matches s, so it does not
     * check it.
     * @param text the text to match
     * @param start the first character to match.  In the forward
     * direction, text.charAt(start) is matched against s.charAt(0).
1009 * In the reverse direction, it is matched against 1010 * s.charAt(s.length()-1). 1011 * @param limit the limit offset for matching, either last+1 in 1012 * the forward direction, or last-1 in the reverse direction, 1013 * where last is the index of the last character to match. 1014 * @return If part of s matches up to the limit, return |limit - 1015 * start|. If all of s matches before reaching the limit, return 1016 * s.length(). If there is a mismatch between s and text, return 1017 * 0 1018 */ 1019 private static int matchRest (Replaceable text, int start, int limit, String s) { 1020 int maxLen; 1021 int slen = s.length(); 1022 if (start < limit) { 1023 maxLen = limit - start; 1024 if (maxLen > slen) maxLen = slen; 1025 for (int i = 1; i < maxLen; ++i) { 1026 if (text.charAt(start + i) != s.charAt(i)) return 0; 1027 } 1028 } else { 1029 maxLen = start - limit; 1030 if (maxLen > slen) maxLen = slen; 1031 --slen; // <=> slen = s.length() - 1; 1032 for (int i = 1; i < maxLen; ++i) { 1033 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 1034 } 1035 } 1036 return maxLen; 1037 } 1038 1039 /** 1040 * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1. 1041 * @internal 1042 * @deprecated This API is ICU internal only. 
1043 */ 1044 @Deprecated 1045 public int matchesAt(CharSequence text, int offset) { 1046 int lastLen = -1; 1047 strings: 1048 if (hasStrings()) { 1049 char firstChar = text.charAt(offset); 1050 String trial = null; 1051 // find the first string starting with firstChar 1052 Iterator<String> it = strings.iterator(); 1053 while (it.hasNext()) { 1054 trial = it.next(); 1055 char firstStringChar = trial.charAt(0); 1056 if (firstStringChar < firstChar) continue; 1057 if (firstStringChar > firstChar) break strings; 1058 } 1059 1060 // now keep checking string until we get the longest one 1061 for (;;) { 1062 int tempLen = matchesAt(text, offset, trial); 1063 if (lastLen > tempLen) break strings; 1064 lastLen = tempLen; 1065 if (!it.hasNext()) break; 1066 trial = it.next(); 1067 } 1068 } 1069 1070 if (lastLen < 2) { 1071 int cp = UTF16.charAt(text, offset); 1072 if (contains(cp)) lastLen = UTF16.getCharCount(cp); 1073 } 1074 1075 return offset+lastLen; 1076 } 1077 1078 /** 1079 * Does one string contain another, starting at a specific offset? 1080 * @param text text to match 1081 * @param offsetInText offset within that text 1082 * @param substring substring to match at offset in text 1083 * @return -1 if match fails, otherwise other.length() 1084 */ 1085 // Note: This method was moved from CollectionUtilities 1086 private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) { 1087 int len = substring.length(); 1088 int textLength = text.length(); 1089 if (textLength + offsetInText > len) { 1090 return -1; 1091 } 1092 int i = 0; 1093 for (int j = offsetInText; i < len; ++i, ++j) { 1094 char pc = substring.charAt(i); 1095 char tc = text.charAt(j); 1096 if (pc != tc) return -1; 1097 } 1098 return i; 1099 } 1100 1101 /** 1102 * Implementation of UnicodeMatcher API. Union the set of all 1103 * characters that may be matched by this object into the given 1104 * set. 
1105 * @param toUnionTo the set into which to union the source characters 1106 * @stable ICU 2.2 1107 */ 1108 @Override 1109 public void addMatchSetTo(UnicodeSet toUnionTo) { 1110 toUnionTo.addAll(this); 1111 } 1112 1113 /** 1114 * Returns the index of the given character within this set, where 1115 * the set is ordered by ascending code point. If the character 1116 * is not in this set, return -1. The inverse of this method is 1117 * <code>charAt()</code>. 1118 * @return an index from 0..size()-1, or -1 1119 * @stable ICU 2.0 1120 */ 1121 public int indexOf(int c) { 1122 if (c < MIN_VALUE || c > MAX_VALUE) { 1123 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1124 } 1125 int i = 0; 1126 int n = 0; 1127 for (;;) { 1128 int start = list[i++]; 1129 if (c < start) { 1130 return -1; 1131 } 1132 int limit = list[i++]; 1133 if (c < limit) { 1134 return n + c - start; 1135 } 1136 n += limit - start; 1137 } 1138 } 1139 1140 /** 1141 * Returns the character at the given index within this set, where 1142 * the set is ordered by ascending code point. If the index is 1143 * out of range, return -1. The inverse of this method is 1144 * <code>indexOf()</code>. 1145 * @param index an index from 0..size()-1 1146 * @return the character at the given index, or -1. 1147 * @stable ICU 2.0 1148 */ 1149 public int charAt(int index) { 1150 if (index >= 0) { 1151 // len2 is the largest even integer <= len, that is, it is len 1152 // for even values and len-1 for odd values. With odd values 1153 // the last entry is UNICODESET_HIGH. 1154 int len2 = len & ~1; 1155 for (int i=0; i < len2;) { 1156 int start = list[i++]; 1157 int count = list[i++] - start; 1158 if (index < count) { 1159 return start + index; 1160 } 1161 index -= count; 1162 } 1163 } 1164 return -1; 1165 } 1166 1167 /** 1168 * Adds the specified range to this set if it is not already 1169 * present. If this set already contains the specified range, 1170 * the call leaves this set unchanged. 
If <code>end &lt; start</code>
     * then an empty range is added, leaving the set unchanged.
     *
     * @param start first character, inclusive, of range to be added
     * to this set.
     * @param end last character, inclusive, of range to be added
     * to this set.
     * @return this object, for chaining
     * @throws IllegalArgumentException if start or end is not in
     * MIN_VALUE..MAX_VALUE
     * @stable ICU 2.0
     */
    public UnicodeSet add(int start, int end) {
        checkFrozen();
        return add_unchecked(start, end);
    }

    /**
     * Adds all characters in range (uses preferred naming convention).
     * @param start The index of where to start on adding all characters.
     * @param end The index of where to end on adding all characters.
     * @return a reference to this object
     * @throws IllegalArgumentException if start or end is not in
     * MIN_VALUE..MAX_VALUE
     * @stable ICU 4.4
     */
    public UnicodeSet addAll(int start, int end) {
        checkFrozen();
        return add_unchecked(start, end);
    }

    // for internal use, after checkFrozen has been called
    // Adds start..end (inclusive): validates both ends, then either appends to
    // or extends the last range directly (fast path), or falls back to the
    // general list merge. Does nothing when end < start.
    private UnicodeSet add_unchecked(int start, int end) {
        if (start < MIN_VALUE || start > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
        }
        if (end < MIN_VALUE || end > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
        }
        if (start < end) {
            // The list stores exclusive limits, hence end + 1.
            int limit = end + 1;
            // Fast path for adding a new range after the last one.
            // Odd list length: [..., lastStart, lastLimit, HIGH]
            if ((len & 1) != 0) {
                // If the list is empty, set lastLimit low enough to not be adjacent to 0.
                int lastLimit = len == 1 ? -2 : list[len - 2];
                if (lastLimit <= start) {
                    checkFrozen();
                    if (lastLimit == start) {
                        // Extend the last range.
                        list[len - 2] = limit;
                        if (limit == HIGH) {
                            // The extended limit now doubles as the final HIGH
                            // entry, so the old terminator is dropped.
                            --len;
                        }
                    } else {
                        // Overwrite the old terminator with the new range start.
                        list[len - 1] = start;
                        if (limit < HIGH) {
                            ensureCapacity(len + 2);
                            list[len++] = limit;
                            list[len++] = HIGH;
                        } else {  // limit == HIGH
                            ensureCapacity(len + 1);
                            list[len++] = HIGH;
                        }
                    }
                    // The cached pattern no longer describes the set.
                    pat = null;
                    return this;
                }
            }
            // This is slow. Could be much faster using findCodePoint(start)
            // and modifying the list, dealing with adjacent & overlapping ranges.
            add(range(start, end), 2, 0);
        } else if (start == end) {
            // Single code point: use the dedicated single-character insert.
            add(start);
        }
        return this;
    }

    //    /**
    //     * Format out the inversion list as a string, for debugging.  Uncomment when
    //     * needed.
    //     */
    //    public final String dump() {
    //        StringBuffer buf = new StringBuffer("[");
    //        for (int i=0; i<len; ++i) {
    //            if (i != 0) buf.append(", ");
    //            int c = list[i];
    //            //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
    //            //    buf.append((char) c);
    //            //} else {
    //            buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
    //            //}
    //        }
    //        buf.append("]");
    //        return buf.toString();
    //    }

    /**
     * Adds the specified character to this set if it is not already
     * present.  If this set already contains the specified character,
     * the call leaves this set unchanged.
* @stable ICU 2.0
     */
    public final UnicodeSet add(int c) {
        checkFrozen();
        return add_unchecked(c);
    }

    // for internal use only, after checkFrozen has been called
    // Inserts the single code point c into the inversion list, merging it with
    // an adjacent range on either side when possible.
    private final UnicodeSet add_unchecked(int c) {
        if (c < MIN_VALUE || c > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
        }

        // find smallest i such that c < list[i]
        // if odd, then it is IN the set
        // if even, then it is OUT of the set
        int i = findCodePoint(c);

        // already in set?
        if ((i & 1) != 0) return this;

        // HIGH is 0x110000
        // assert(list[len-1] == HIGH);

        // empty = [HIGH]
        // [start_0, limit_0, start_1, limit_1, HIGH]

        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
        //                             ^
        //                             list[i]

        // i == 0 means c is before the first range
        // NOTE: an earlier TODO asked whether "list[i]-1" is a typo that makes
        // the "c == MAX_VALUE" branch unreachable. It is reachable: list[i]
        // can be the terminating HIGH entry, and c == HIGH-1 then satisfies
        // this test (assuming MAX_VALUE == HIGH-1, i.e. 0x10FFFF).
        if (c == list[i]-1) {
            // c is before start of next range
            list[i] = c;
            // if we touched the HIGH mark, then add a new one
            if (c == MAX_VALUE) {
                ensureCapacity(len+1);
                list[len++] = HIGH;
            }
            if (i > 0 && c == list[i-1]) {
                // collapse adjacent ranges

                // [..., start_k-1, c, c, limit_k, ..., HIGH]
                //                     ^
                //                     list[i]
                System.arraycopy(list, i+1, list, i-1, len-i-1);
                len -= 2;
            }
        }

        else if (i > 0 && c == list[i-1]) {
            // c is after end of prior range
            list[i-1]++;
            // no need to check for collapse here
        }

        else {
            // At this point we know the new char is not adjacent to
            // any existing ranges, and it is not 10FFFF.


            // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // Don't use ensureCapacity() to save on copying.
            // NOTE: This has no measurable impact on performance,
            // but it might help in some usage patterns.
            if (len+2 > list.length) {
                // Grow and open the two-slot gap at i in a single pass.
                int[] temp = new int[nextCapacity(len + 2)];
                if (i != 0) System.arraycopy(list, 0, temp, 0, i);
                System.arraycopy(list, i, temp, i+2, len-i);
                list = temp;
            } else {
                // Enough room: shift the tail right by two slots.
                System.arraycopy(list, i, list, i+2, len-i);
            }

            list[i] = c;
            list[i+1] = c+1;
            len += 2;
        }

        // The cached pattern no longer describes the set.
        pat = null;
        return this;
    }

    /**
     * Adds the specified multicharacter to this set if it is not already
     * present.  If this set already contains the multicharacter,
     * the call leaves this set unchanged.
     * Thus "ch" =&gt; {"ch"}
     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
     * @param s the source string
     * @return this object, for chaining
     * @stable ICU 2.0
     */
    public final UnicodeSet add(CharSequence s) {
        checkFrozen();
        // A string that is exactly one code point is stored as a character,
        // not as a string.
        int cp = getSingleCP(s);
        if (cp < 0) {
            String str = s.toString();
            if (!strings.contains(str)) {
                addString(str);
                pat = null;
            }
        } else {
            add_unchecked(cp, cp);
        }
        return this;
    }

    // Stores s in the string set, first replacing the shared EMPTY_STRINGS
    // instance with a fresh TreeSet if this set has no strings of its own yet.
    private void addString(CharSequence s) {
        if (strings == EMPTY_STRINGS) {
            strings = new TreeSet<>();
        }
        strings.add(s.toString());
    }

    /**
     * Utility for getting code point from single code point CharSequence.
     * See the public UTF16.getSingleCodePoint()
     * @return a code point IF the string consists of a single one.
     * otherwise returns -1.
1398 * @param s to test 1399 */ 1400 private static int getSingleCP(CharSequence s) { 1401 if (s.length() < 1) { 1402 throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); 1403 } 1404 if (s.length() > 2) return -1; 1405 if (s.length() == 1) return s.charAt(0); 1406 1407 // at this point, len = 2 1408 int cp = UTF16.charAt(s, 0); 1409 if (cp > 0xFFFF) { // is surrogate pair 1410 return cp; 1411 } 1412 return -1; 1413 } 1414 1415 /** 1416 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1417 * If this set already any particular character, it has no effect on that character. 1418 * @param s the source string 1419 * @return this object, for chaining 1420 * @stable ICU 2.0 1421 */ 1422 public final UnicodeSet addAll(CharSequence s) { 1423 checkFrozen(); 1424 int cp; 1425 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1426 cp = UTF16.charAt(s, i); 1427 add_unchecked(cp, cp); 1428 } 1429 return this; 1430 } 1431 1432 /** 1433 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1434 * If this set already any particular character, it has no effect on that character. 1435 * @param s the source string 1436 * @return this object, for chaining 1437 * @stable ICU 2.0 1438 */ 1439 public final UnicodeSet retainAll(CharSequence s) { 1440 return retainAll(fromAll(s)); 1441 } 1442 1443 /** 1444 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1445 * If this set already any particular character, it has no effect on that character. 1446 * @param s the source string 1447 * @return this object, for chaining 1448 * @stable ICU 2.0 1449 */ 1450 public final UnicodeSet complementAll(CharSequence s) { 1451 return complementAll(fromAll(s)); 1452 } 1453 1454 /** 1455 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1456 * If this set already any particular character, it has no effect on that character. 
1457 * @param s the source string 1458 * @return this object, for chaining 1459 * @stable ICU 2.0 1460 */ 1461 public final UnicodeSet removeAll(CharSequence s) { 1462 return removeAll(fromAll(s)); 1463 } 1464 1465 /** 1466 * Remove all strings from this UnicodeSet 1467 * @return this object, for chaining 1468 * @stable ICU 4.2 1469 */ 1470 public final UnicodeSet removeAllStrings() { 1471 checkFrozen(); 1472 if (hasStrings()) { 1473 strings.clear(); 1474 pat = null; 1475 } 1476 return this; 1477 } 1478 1479 /** 1480 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1481 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1482 * @param s the source string 1483 * @return a newly created set containing the given string 1484 * @stable ICU 2.0 1485 */ 1486 public static UnicodeSet from(CharSequence s) { 1487 return new UnicodeSet().add(s); 1488 } 1489 1490 1491 /** 1492 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1493 * @param s the source string 1494 * @return a newly created set containing the given characters 1495 * @stable ICU 2.0 1496 */ 1497 public static UnicodeSet fromAll(CharSequence s) { 1498 return new UnicodeSet().addAll(s); 1499 } 1500 1501 1502 /** 1503 * Retain only the elements in this set that are contained in the 1504 * specified range. If <code>end > start</code> then an empty range is 1505 * retained, leaving the set empty. 1506 * 1507 * @param start first character, inclusive, of range to be retained 1508 * to this set. 1509 * @param end last character, inclusive, of range to be retained 1510 * to this set. 
1511 * @stable ICU 2.0 1512 */ 1513 public UnicodeSet retain(int start, int end) { 1514 checkFrozen(); 1515 if (start < MIN_VALUE || start > MAX_VALUE) { 1516 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1517 } 1518 if (end < MIN_VALUE || end > MAX_VALUE) { 1519 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1520 } 1521 if (start <= end) { 1522 retain(range(start, end), 2, 0); 1523 } else { 1524 clear(); 1525 } 1526 return this; 1527 } 1528 1529 /** 1530 * Retain the specified character from this set if it is present. 1531 * Upon return this set will be empty if it did not contain c, or 1532 * will only contain c if it did contain c. 1533 * @param c the character to be retained 1534 * @return this object, for chaining 1535 * @stable ICU 2.0 1536 */ 1537 public final UnicodeSet retain(int c) { 1538 return retain(c, c); 1539 } 1540 1541 /** 1542 * Retain the specified string in this set if it is present. 1543 * Upon return this set will be empty if it did not contain s, or 1544 * will only contain s if it did contain s. 1545 * @param cs the string to be retained 1546 * @return this object, for chaining 1547 * @stable ICU 2.0 1548 */ 1549 public final UnicodeSet retain(CharSequence cs) { 1550 int cp = getSingleCP(cs); 1551 if (cp < 0) { 1552 checkFrozen(); 1553 String s = cs.toString(); 1554 boolean isIn = strings.contains(s); 1555 if (isIn && size() == 1) { 1556 return this; 1557 } 1558 clear(); 1559 addString(s); 1560 pat = null; 1561 } else { 1562 retain(cp, cp); 1563 } 1564 return this; 1565 } 1566 1567 /** 1568 * Removes the specified range from this set if it is present. 1569 * The set will not contain the specified range once the call 1570 * returns. If <code>end > start</code> then an empty range is 1571 * removed, leaving the set unchanged. 1572 * 1573 * @param start first character, inclusive, of range to be removed 1574 * from this set. 
1575 * @param end last character, inclusive, of range to be removed 1576 * from this set. 1577 * @stable ICU 2.0 1578 */ 1579 public UnicodeSet remove(int start, int end) { 1580 checkFrozen(); 1581 if (start < MIN_VALUE || start > MAX_VALUE) { 1582 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1583 } 1584 if (end < MIN_VALUE || end > MAX_VALUE) { 1585 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1586 } 1587 if (start <= end) { 1588 retain(range(start, end), 2, 2); 1589 } 1590 return this; 1591 } 1592 1593 /** 1594 * Removes the specified character from this set if it is present. 1595 * The set will not contain the specified character once the call 1596 * returns. 1597 * @param c the character to be removed 1598 * @return this object, for chaining 1599 * @stable ICU 2.0 1600 */ 1601 public final UnicodeSet remove(int c) { 1602 return remove(c, c); 1603 } 1604 1605 /** 1606 * Removes the specified string from this set if it is present. 1607 * The set will not contain the specified string once the call 1608 * returns. 1609 * @param s the string to be removed 1610 * @return this object, for chaining 1611 * @stable ICU 2.0 1612 */ 1613 public final UnicodeSet remove(CharSequence s) { 1614 int cp = getSingleCP(s); 1615 if (cp < 0) { 1616 checkFrozen(); 1617 String str = s.toString(); 1618 if (strings.contains(str)) { 1619 strings.remove(str); 1620 pat = null; 1621 } 1622 } else { 1623 remove(cp, cp); 1624 } 1625 return this; 1626 } 1627 1628 /** 1629 * Complements the specified range in this set. Any character in 1630 * the range will be removed if it is in this set, or will be 1631 * added if it is not in this set. If <code>end > start</code> 1632 * then an empty range is complemented, leaving the set unchanged. 1633 * 1634 * @param start first character, inclusive, of range to be removed 1635 * from this set. 
1636 * @param end last character, inclusive, of range to be removed 1637 * from this set. 1638 * @stable ICU 2.0 1639 */ 1640 public UnicodeSet complement(int start, int end) { 1641 checkFrozen(); 1642 if (start < MIN_VALUE || start > MAX_VALUE) { 1643 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1644 } 1645 if (end < MIN_VALUE || end > MAX_VALUE) { 1646 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1647 } 1648 if (start <= end) { 1649 xor(range(start, end), 2, 0); 1650 } 1651 pat = null; 1652 return this; 1653 } 1654 1655 /** 1656 * Complements the specified character in this set. The character 1657 * will be removed if it is in this set, or will be added if it is 1658 * not in this set. 1659 * @stable ICU 2.0 1660 */ 1661 public final UnicodeSet complement(int c) { 1662 return complement(c, c); 1663 } 1664 1665 /** 1666 * This is equivalent to 1667 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1668 * @stable ICU 2.0 1669 */ 1670 public UnicodeSet complement() { 1671 checkFrozen(); 1672 if (list[0] == LOW) { 1673 System.arraycopy(list, 1, list, 0, len-1); 1674 --len; 1675 } else { 1676 ensureCapacity(len+1); 1677 System.arraycopy(list, 0, list, 1, len); 1678 list[0] = LOW; 1679 ++len; 1680 } 1681 pat = null; 1682 return this; 1683 } 1684 1685 /** 1686 * Complement the specified string in this set. 1687 * The set will not contain the specified string once the call 1688 * returns. 
1689 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1690 * @param s the string to complement 1691 * @return this object, for chaining 1692 * @stable ICU 2.0 1693 */ 1694 public final UnicodeSet complement(CharSequence s) { 1695 checkFrozen(); 1696 int cp = getSingleCP(s); 1697 if (cp < 0) { 1698 String s2 = s.toString(); 1699 if (strings.contains(s2)) { 1700 strings.remove(s2); 1701 } else { 1702 addString(s2); 1703 } 1704 pat = null; 1705 } else { 1706 complement(cp, cp); 1707 } 1708 return this; 1709 } 1710 1711 /** 1712 * Returns true if this set contains the given character. 1713 * @param c character to be checked for containment 1714 * @return true if the test condition is met 1715 * @stable ICU 2.0 1716 */ 1717 @Override 1718 public boolean contains(int c) { 1719 if (c < MIN_VALUE || c > MAX_VALUE) { 1720 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1721 } 1722 if (bmpSet != null) { 1723 return bmpSet.contains(c); 1724 } 1725 if (stringSpan != null) { 1726 return stringSpan.contains(c); 1727 } 1728 1729 /* 1730 // Set i to the index of the start item greater than ch 1731 // We know we will terminate without length test! 1732 int i = -1; 1733 while (true) { 1734 if (c < list[++i]) break; 1735 } 1736 */ 1737 1738 int i = findCodePoint(c); 1739 1740 return ((i & 1) != 0); // return true if odd 1741 } 1742 1743 /** 1744 * Returns the smallest value i such that c < list[i]. Caller 1745 * must ensure that c is a legal value or this method will enter 1746 * an infinite loop. This method performs a binary search. 
* @param c a character in the range MIN_VALUE..MAX_VALUE
     * inclusive
     * @return the smallest integer i in the range 0..len-1,
     * inclusive, such that c &lt; list[i]
     */
    private final int findCodePoint(int c) {
        /* Examples:
                                           findCodePoint(c)
           set              list[]         c=0 1 3 4 7 8
           ===              ==============   ===========
           []               [110000]         0 0 0 0 0 0
           [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
           [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
           [:all:]          [0, 110000]      1 1 1 1 1 1
         */

        // Return the smallest i such that c < list[i].  Assume
        // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
        if (c < list[0]) return 0;
        // High runner test.  c is often after the last range, so an
        // initial check for this condition pays off.
        if (len >= 2 && c >= list[len-2]) return len-1;
        int lo = 0;
        int hi = len - 1;
        // invariant: c >= list[lo]
        // invariant: c < list[hi]
        for (;;) {
            // Unsigned shift gives the midpoint of lo and hi.
            int i = (lo + hi) >>> 1;
            // lo and hi are adjacent: hi is the first index with c < list[hi].
            if (i == lo) return hi;
            if (c < list[i]) {
                hi = i;
            } else {
                lo = i;
            }
        }
    }

    //    //----------------------------------------------------------------
    //    // Unrolled binary search
    //    //----------------------------------------------------------------
    //
    //    private int validLen = -1; // validated value of len
    //    private int topOfLow;
    //    private int topOfHigh;
    //    private int power;
    //    private int deltaStart;
    //
    //    private void validate() {
    //        if (len <= 1) {
    //            throw new IllegalArgumentException("list.len==" + len + "; must be >1");
    //        }
    //
    //        // find greatest power of 2 less than or equal to len
    //        for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
    //
    //        // assert(exp2[power] <= len);
    //
    //        // determine the starting points
    //        topOfLow = exp2[power] - 1;
    //        topOfHigh = len - 1;
    //        deltaStart = exp2[power-1];
    //        validLen = len;
    //    }
    //
    //    private static final int exp2[] = {
    //        0x1, 0x2, 0x4, 0x8,
    //        0x10, 0x20, 0x40, 0x80,
    //        0x100, 0x200, 0x400, 0x800,
    //        0x1000, 0x2000, 0x4000, 0x8000,
    //        0x10000, 0x20000, 0x40000, 0x80000,
    //        0x100000, 0x200000, 0x400000, 0x800000,
    //        0x1000000, 0x2000000, 0x4000000, 0x8000000,
    //        0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
    //    };
    //
    //    /**
    //     * Unrolled lowest index GT.
    //     */
    //    private final int leastIndexGT(int searchValue) {
    //
    //        if (len != validLen) {
    //            if (len == 1) return 0;
    //            validate();
    //        }
    //        int temp;
    //
    //        // set up initial range to search. Each subrange is a power of two in length
    //        int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
    //
    //        // Completely unrolled binary search, following "Programming Pearls"
    //        // Each case deliberately falls through to the next
    //        // Logically, list[-1] < all_search_values && list[count] > all_search_values
    //        // although the values -1 and count are never actually touched.
    //
    //        // The bounds at each point are low & high,
    //        // where low == high - delta*2
    //        // so high - delta is the midpoint
    //
    //        // The invariant AFTER each line is that list[low] < searchValue <= list[high]
    //
    //        switch (power) {
    //        //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
    //        case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
    //        case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
    //
    //        case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
    //        case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
    //        case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
    //        case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
    //
    //        case 24: if (searchValue < list[temp = high-  0x800000]) high = temp;
    //        case 23: if (searchValue < list[temp = high-  0x400000]) high = temp;
    //        case 22: if (searchValue < list[temp = high-  0x200000]) high = temp;
    //        case 21: if (searchValue < list[temp = high-  0x100000]) high = temp;
    //
    //        case 20: if (searchValue < list[temp = high-   0x80000]) high = temp;
    //        case 19: if (searchValue < list[temp = high-   0x40000]) high = temp;
    //        case 18: if (searchValue < list[temp = high-   0x20000]) high = temp;
    //        case 17: if (searchValue < list[temp = high-   0x10000]) high = temp;
    //
    //        case 16: if (searchValue < list[temp = high-    0x8000]) high = temp;
    //        case 15: if (searchValue < list[temp = high-    0x4000]) high = temp;
    //        case 14: if (searchValue < list[temp = high-    0x2000]) high = temp;
    //        case 13: if (searchValue < list[temp = high-    0x1000]) high = temp;
    //
    //        case 12: if (searchValue < list[temp = high-     0x800]) high = temp;
    //        case 11: if (searchValue < list[temp = high-     0x400]) high = temp;
    //        case 10: if (searchValue < list[temp = high-     0x200]) high = temp;
    //        case  9: if (searchValue < list[temp = high-     0x100]) high = temp;
    //
    //        case  8: if (searchValue < list[temp = high-      0x80]) high = temp;
    //        case  7: if (searchValue < list[temp = high-      0x40]) high = temp;
    //        case  6: if (searchValue < list[temp = high-      0x20]) high = temp;
    //        case  5: if (searchValue < list[temp = high-      0x10]) high = temp;
    //
    //        case  4: if (searchValue < list[temp = high-       0x8]) high = temp;
    //        case  3: if (searchValue < list[temp = high-       0x4]) high = temp;
    //        case  2: if (searchValue < list[temp = high-       0x2]) high = temp;
    //        case  1: if (searchValue < list[temp = high-       0x1]) high = temp;
    //        }
    //
    //        return high;
    //    }
    //
    //    // For debugging only
    //    public int len() {
    //        return len;
    //    }
    //
    //    //----------------------------------------------------------------
    //    //----------------------------------------------------------------

    /**
     * Returns true if this set contains every character
     * of the given range.
     * @param start first character, inclusive, of the range
     * @param end last character, inclusive, of the range
     * @return true if the test condition is met
     * @throws IllegalArgumentException if start or end is not in
     * MIN_VALUE..MAX_VALUE
     * @stable ICU 2.0
     */
    public boolean contains(int start, int end) {
        if (start < MIN_VALUE || start > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
        }
        if (end < MIN_VALUE || end > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
        }
        //int i = -1;
        //while (true) {
        //    if (start < list[++i]) break;
        //}
        int i = findCodePoint(start);
        // start must lie inside a range (odd i), and end must come before
        // that same range's limit list[i].
        return ((i & 1) != 0 && end < list[i]);
    }

    /**
     * Returns <tt>true</tt> if this set contains the given
     * multicharacter string.
1925 * @param s string to be checked for containment 1926 * @return <tt>true</tt> if this set contains the specified string 1927 * @stable ICU 2.0 1928 */ 1929 public final boolean contains(CharSequence s) { 1930 1931 int cp = getSingleCP(s); 1932 if (cp < 0) { 1933 return strings.contains(s.toString()); 1934 } else { 1935 return contains(cp); 1936 } 1937 } 1938 1939 /** 1940 * Returns true if this set contains all the characters and strings 1941 * of the given set. 1942 * @param b set to be checked for containment 1943 * @return true if the test condition is met 1944 * @stable ICU 2.0 1945 */ 1946 public boolean containsAll(UnicodeSet b) { 1947 // The specified set is a subset if all of its pairs are contained in 1948 // this set. This implementation accesses the lists directly for speed. 1949 // TODO: this could be faster if size() were cached. But that would affect building speed 1950 // so it needs investigation. 1951 int[] listB = b.list; 1952 boolean needA = true; 1953 boolean needB = true; 1954 int aPtr = 0; 1955 int bPtr = 0; 1956 int aLen = len - 1; 1957 int bLen = b.len - 1; 1958 int startA = 0, startB = 0, limitA = 0, limitB = 0; 1959 while (true) { 1960 // double iterations are such a pain... 1961 if (needA) { 1962 if (aPtr >= aLen) { 1963 // ran out of A. If B is also exhausted, then break; 1964 if (needB && bPtr >= bLen) { 1965 break; 1966 } 1967 return false; 1968 } 1969 startA = list[aPtr++]; 1970 limitA = list[aPtr++]; 1971 } 1972 if (needB) { 1973 if (bPtr >= bLen) { 1974 // ran out of B. 
Since we got this far, we have an A and we are ok so far 1975 break; 1976 } 1977 startB = listB[bPtr++]; 1978 limitB = listB[bPtr++]; 1979 } 1980 // if B doesn't overlap and is greater than A, get new A 1981 if (startB >= limitA) { 1982 needA = true; 1983 needB = false; 1984 continue; 1985 } 1986 // if B is wholy contained in A, then get a new B 1987 if (startB >= startA && limitB <= limitA) { 1988 needA = false; 1989 needB = true; 1990 continue; 1991 } 1992 // all other combinations mean we fail 1993 return false; 1994 } 1995 1996 if (!strings.containsAll(b.strings)) return false; 1997 return true; 1998 } 1999 2000 // /** 2001 // * Returns true if this set contains all the characters and strings 2002 // * of the given set. 2003 // * @param c set to be checked for containment 2004 // * @return true if the test condition is met 2005 // * @stable ICU 2.0 2006 // */ 2007 // public boolean containsAllOld(UnicodeSet c) { 2008 // // The specified set is a subset if all of its pairs are contained in 2009 // // this set. It's possible to code this more efficiently in terms of 2010 // // direct manipulation of the inversion lists if the need arises. 2011 // int n = c.getRangeCount(); 2012 // for (int i=0; i<n; ++i) { 2013 // if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 2014 // return false; 2015 // } 2016 // } 2017 // if (!strings.containsAll(c.strings)) return false; 2018 // return true; 2019 // } 2020 2021 /** 2022 * Returns true if there is a partition of the string such that this set contains each of the partitioned strings. 
2023 * For example, for the Unicode set [a{bc}{cd}]<br> 2024 * containsAll is true for each of: "a", "bc", ""cdbca"<br> 2025 * containsAll is false for each of: "acb", "bcda", "bcx"<br> 2026 * @param s string containing characters to be checked for containment 2027 * @return true if the test condition is met 2028 * @stable ICU 2.0 2029 */ 2030 public boolean containsAll(String s) { 2031 int cp; 2032 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 2033 cp = UTF16.charAt(s, i); 2034 if (!contains(cp)) { 2035 if (!hasStrings()) { 2036 return false; 2037 } 2038 return containsAll(s, 0); 2039 } 2040 } 2041 return true; 2042 } 2043 2044 /** 2045 * Recursive routine called if we fail to find a match in containsAll, and there are strings 2046 * @param s source string 2047 * @param i point to match to the end on 2048 * @return true if ok 2049 */ 2050 private boolean containsAll(String s, int i) { 2051 if (i >= s.length()) { 2052 return true; 2053 } 2054 int cp= UTF16.charAt(s, i); 2055 if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) { 2056 return true; 2057 } 2058 for (String setStr : strings) { 2059 if (s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) { 2060 return true; 2061 } 2062 } 2063 return false; 2064 2065 } 2066 2067 /** 2068 * Get the Regex equivalent for this UnicodeSet 2069 * @return regex pattern equivalent to this UnicodeSet 2070 * @internal 2071 * @deprecated This API is ICU internal only. 2072 */ 2073 @Deprecated 2074 public String getRegexEquivalent() { 2075 if (!hasStrings()) { 2076 return toString(); 2077 } 2078 StringBuilder result = new StringBuilder("(?:"); 2079 appendNewPattern(result, true, false); 2080 for (String s : strings) { 2081 result.append('|'); 2082 _appendToPat(result, s, true); 2083 } 2084 return result.append(")").toString(); 2085 } 2086 2087 /** 2088 * Returns true if this set contains none of the characters 2089 * of the given range. 
2090 * @param start first character, inclusive, of the range 2091 * @param end last character, inclusive, of the range 2092 * @return true if the test condition is met 2093 * @stable ICU 2.0 2094 */ 2095 public boolean containsNone(int start, int end) { 2096 if (start < MIN_VALUE || start > MAX_VALUE) { 2097 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 2098 } 2099 if (end < MIN_VALUE || end > MAX_VALUE) { 2100 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 2101 } 2102 int i = -1; 2103 while (true) { 2104 if (start < list[++i]) break; 2105 } 2106 return ((i & 1) == 0 && end < list[i]); 2107 } 2108 2109 /** 2110 * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2111 * For example, for the Unicode set [a{bc}{cd}]<br> 2112 * containsNone is true for: "xy", "cb"<br> 2113 * containsNone is false for: "a", "bc", "bcd"<br> 2114 * @param b set to be checked for containment 2115 * @return true if the test condition is met 2116 * @stable ICU 2.0 2117 */ 2118 public boolean containsNone(UnicodeSet b) { 2119 // The specified set is a subset if some of its pairs overlap with some of this set's pairs. 2120 // This implementation accesses the lists directly for speed. 2121 int[] listB = b.list; 2122 boolean needA = true; 2123 boolean needB = true; 2124 int aPtr = 0; 2125 int bPtr = 0; 2126 int aLen = len - 1; 2127 int bLen = b.len - 1; 2128 int startA = 0, startB = 0, limitA = 0, limitB = 0; 2129 while (true) { 2130 // double iterations are such a pain... 
2131 if (needA) { 2132 if (aPtr >= aLen) { 2133 // ran out of A: break so we test strings 2134 break; 2135 } 2136 startA = list[aPtr++]; 2137 limitA = list[aPtr++]; 2138 } 2139 if (needB) { 2140 if (bPtr >= bLen) { 2141 // ran out of B: break so we test strings 2142 break; 2143 } 2144 startB = listB[bPtr++]; 2145 limitB = listB[bPtr++]; 2146 } 2147 // if B is higher than any part of A, get new A 2148 if (startB >= limitA) { 2149 needA = true; 2150 needB = false; 2151 continue; 2152 } 2153 // if A is higher than any part of B, get new B 2154 if (startA >= limitB) { 2155 needA = false; 2156 needB = true; 2157 continue; 2158 } 2159 // all other combinations mean we fail 2160 return false; 2161 } 2162 2163 if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false; 2164 return true; 2165 } 2166 2167 // /** 2168 // * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2169 // * For example, for the Unicode set [a{bc}{cd}]<br> 2170 // * containsNone is true for: "xy", "cb"<br> 2171 // * containsNone is false for: "a", "bc", "bcd"<br> 2172 // * @param c set to be checked for containment 2173 // * @return true if the test condition is met 2174 // * @stable ICU 2.0 2175 // */ 2176 // public boolean containsNoneOld(UnicodeSet c) { 2177 // // The specified set is a subset if all of its pairs are contained in 2178 // // this set. It's possible to code this more efficiently in terms of 2179 // // direct manipulation of the inversion lists if the need arises. 2180 // int n = c.getRangeCount(); 2181 // for (int i=0; i<n; ++i) { 2182 // if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 2183 // return false; 2184 // } 2185 // } 2186 // if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false; 2187 // return true; 2188 // } 2189 2190 /** 2191 * Returns true if this set contains none of the characters 2192 * of the given string. 
2193 * @param s string containing characters to be checked for containment 2194 * @return true if the test condition is met 2195 * @stable ICU 2.0 2196 */ 2197 public boolean containsNone(CharSequence s) { 2198 return span(s, SpanCondition.NOT_CONTAINED) == s.length(); 2199 } 2200 2201 /** 2202 * Returns true if this set contains one or more of the characters 2203 * in the given range. 2204 * @param start first character, inclusive, of the range 2205 * @param end last character, inclusive, of the range 2206 * @return true if the condition is met 2207 * @stable ICU 2.0 2208 */ 2209 public final boolean containsSome(int start, int end) { 2210 return !containsNone(start, end); 2211 } 2212 2213 /** 2214 * Returns true if this set contains one or more of the characters 2215 * and strings of the given set. 2216 * @param s set to be checked for containment 2217 * @return true if the condition is met 2218 * @stable ICU 2.0 2219 */ 2220 public final boolean containsSome(UnicodeSet s) { 2221 return !containsNone(s); 2222 } 2223 2224 /** 2225 * Returns true if this set contains one or more of the characters 2226 * of the given string. 2227 * @param s string containing characters to be checked for containment 2228 * @return true if the condition is met 2229 * @stable ICU 2.0 2230 */ 2231 public final boolean containsSome(CharSequence s) { 2232 return !containsNone(s); 2233 } 2234 2235 2236 /** 2237 * Adds all of the elements in the specified set to this set if 2238 * they're not already present. This operation effectively 2239 * modifies this set so that its value is the <i>union</i> of the two 2240 * sets. The behavior of this operation is unspecified if the specified 2241 * collection is modified while the operation is in progress. 2242 * 2243 * @param c set whose elements are to be added to this set. 
2244 * @stable ICU 2.0 2245 */ 2246 public UnicodeSet addAll(UnicodeSet c) { 2247 checkFrozen(); 2248 add(c.list, c.len, 0); 2249 if (c.hasStrings()) { 2250 if (strings == EMPTY_STRINGS) { 2251 strings = new TreeSet<>(c.strings); 2252 } else { 2253 strings.addAll(c.strings); 2254 } 2255 } 2256 return this; 2257 } 2258 2259 /** 2260 * Retains only the elements in this set that are contained in the 2261 * specified set. In other words, removes from this set all of 2262 * its elements that are not contained in the specified set. This 2263 * operation effectively modifies this set so that its value is 2264 * the <i>intersection</i> of the two sets. 2265 * 2266 * @param c set that defines which elements this set will retain. 2267 * @stable ICU 2.0 2268 */ 2269 public UnicodeSet retainAll(UnicodeSet c) { 2270 checkFrozen(); 2271 retain(c.list, c.len, 0); 2272 if (hasStrings()) { 2273 if (!c.hasStrings()) { 2274 strings.clear(); 2275 } else { 2276 strings.retainAll(c.strings); 2277 } 2278 } 2279 return this; 2280 } 2281 2282 /** 2283 * Removes from this set all of its elements that are contained in the 2284 * specified set. This operation effectively modifies this 2285 * set so that its value is the <i>asymmetric set difference</i> of 2286 * the two sets. 2287 * 2288 * @param c set that defines which elements will be removed from 2289 * this set. 2290 * @stable ICU 2.0 2291 */ 2292 public UnicodeSet removeAll(UnicodeSet c) { 2293 checkFrozen(); 2294 retain(c.list, c.len, 2); 2295 if (hasStrings() && c.hasStrings()) { 2296 strings.removeAll(c.strings); 2297 } 2298 return this; 2299 } 2300 2301 /** 2302 * Complements in this set all elements contained in the specified 2303 * set. Any character in the other set will be removed if it is 2304 * in this set, or will be added if it is not in this set. 2305 * 2306 * @param c set that defines which elements will be complemented from 2307 * this set. 
2308 * @stable ICU 2.0 2309 */ 2310 public UnicodeSet complementAll(UnicodeSet c) { 2311 checkFrozen(); 2312 xor(c.list, c.len, 0); 2313 if (c.hasStrings()) { 2314 if (strings == EMPTY_STRINGS) { 2315 strings = new TreeSet<>(c.strings); 2316 } else { 2317 SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings); 2318 } 2319 } 2320 return this; 2321 } 2322 2323 /** 2324 * Removes all of the elements from this set. This set will be 2325 * empty after this call returns. 2326 * @stable ICU 2.0 2327 */ 2328 public UnicodeSet clear() { 2329 checkFrozen(); 2330 list[0] = HIGH; 2331 len = 1; 2332 pat = null; 2333 if (hasStrings()) { 2334 strings.clear(); 2335 } 2336 return this; 2337 } 2338 2339 /** 2340 * Iteration method that returns the number of ranges contained in 2341 * this set. 2342 * @see #getRangeStart 2343 * @see #getRangeEnd 2344 * @stable ICU 2.0 2345 */ 2346 public int getRangeCount() { 2347 return len/2; 2348 } 2349 2350 /** 2351 * Iteration method that returns the first character in the 2352 * specified range of this set. 2353 * @exception ArrayIndexOutOfBoundsException if index is outside 2354 * the range <code>0..getRangeCount()-1</code> 2355 * @see #getRangeCount 2356 * @see #getRangeEnd 2357 * @stable ICU 2.0 2358 */ 2359 public int getRangeStart(int index) { 2360 return list[index*2]; 2361 } 2362 2363 /** 2364 * Iteration method that returns the last character in the 2365 * specified range of this set. 2366 * @exception ArrayIndexOutOfBoundsException if index is outside 2367 * the range <code>0..getRangeCount()-1</code> 2368 * @see #getRangeStart 2369 * @see #getRangeEnd 2370 * @stable ICU 2.0 2371 */ 2372 public int getRangeEnd(int index) { 2373 return (list[index*2 + 1] - 1); 2374 } 2375 2376 /** 2377 * Reallocate this objects internal structures to take up the least 2378 * possible space, without changing this object's value. 
2379 * @stable ICU 2.0 2380 */ 2381 public UnicodeSet compact() { 2382 checkFrozen(); 2383 if ((len + 7) < list.length) { 2384 // If we have more than a little unused capacity, shrink it to len. 2385 list = Arrays.copyOf(list, len); 2386 } 2387 rangeList = null; 2388 buffer = null; 2389 if (strings != EMPTY_STRINGS && strings.isEmpty()) { 2390 strings = EMPTY_STRINGS; 2391 } 2392 return this; 2393 } 2394 2395 /** 2396 * Compares the specified object with this set for equality. Returns 2397 * <tt>true</tt> if the specified object is also a set, the two sets 2398 * have the same size, and every member of the specified set is 2399 * contained in this set (or equivalently, every member of this set is 2400 * contained in the specified set). 2401 * 2402 * @param o Object to be compared for equality with this set. 2403 * @return <tt>true</tt> if the specified Object is equal to this set. 2404 * @stable ICU 2.0 2405 */ 2406 @Override 2407 public boolean equals(Object o) { 2408 if (o == null) { 2409 return false; 2410 } 2411 if (this == o) { 2412 return true; 2413 } 2414 try { 2415 UnicodeSet that = (UnicodeSet) o; 2416 if (len != that.len) return false; 2417 for (int i = 0; i < len; ++i) { 2418 if (list[i] != that.list[i]) return false; 2419 } 2420 if (!strings.equals(that.strings)) return false; 2421 } catch (Exception e) { 2422 return false; 2423 } 2424 return true; 2425 } 2426 2427 /** 2428 * Returns the hash code value for this set. 2429 * 2430 * @return the hash code value for this set. 2431 * @see java.lang.Object#hashCode() 2432 * @stable ICU 2.0 2433 */ 2434 @Override 2435 public int hashCode() { 2436 int result = len; 2437 for (int i = 0; i < len; ++i) { 2438 result *= 1000003; 2439 result += list[i]; 2440 } 2441 return result; 2442 } 2443 2444 /** 2445 * Return a programmer-readable string representation of this object. 
2446 * @stable ICU 2.0 2447 */ 2448 @Override 2449 public String toString() { 2450 return toPattern(true); 2451 } 2452 2453 //---------------------------------------------------------------- 2454 // Implementation: Pattern parsing 2455 //---------------------------------------------------------------- 2456 2457 /** 2458 * Parses the given pattern, starting at the given position. The character 2459 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. 2460 * Parsing continues until the corresponding closing ']'. If a syntax error 2461 * is encountered between the opening and closing brace, the parse fails. 2462 * Upon return from a successful parse, the ParsePosition is updated to 2463 * point to the character following the closing ']', and an inversion 2464 * list for the parsed pattern is returned. This method 2465 * calls itself recursively to parse embedded subpatterns. 2466 * 2467 * @param pattern the string containing the pattern to be parsed. The 2468 * portion of the string from pos.getIndex(), which must be a '[', to the 2469 * corresponding closing ']', is parsed. 2470 * @param pos upon entry, the position at which to being parsing. The 2471 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return 2472 * from a successful parse, pos.getIndex() is either the character after the 2473 * closing ']' of the parsed pattern, or pattern.length() if the closing ']' 2474 * is the last character of the pattern string. 2475 * @return an inversion list for the parsed substring 2476 * of <code>pattern</code> 2477 * @exception java.lang.IllegalArgumentException if the parse fails. 2478 * @internal 2479 * @deprecated This API is ICU internal only. 2480 */ 2481 @Deprecated 2482 public UnicodeSet applyPattern(String pattern, 2483 ParsePosition pos, 2484 SymbolTable symbols, 2485 int options) { 2486 2487 // Need to build the pattern in a temporary string because 2488 // _applyPattern calls add() etc., which set pat to empty. 
2489 boolean parsePositionWasNull = pos == null; 2490 if (parsePositionWasNull) { 2491 pos = new ParsePosition(0); 2492 } 2493 2494 StringBuilder rebuiltPat = new StringBuilder(); 2495 RuleCharacterIterator chars = 2496 new RuleCharacterIterator(pattern, symbols, pos); 2497 applyPattern(chars, symbols, rebuiltPat, options, 0); 2498 if (chars.inVariable()) { 2499 syntaxError(chars, "Extra chars in variable value"); 2500 } 2501 pat = rebuiltPat.toString(); 2502 if (parsePositionWasNull) { 2503 int i = pos.getIndex(); 2504 2505 // Skip over trailing whitespace 2506 if ((options & IGNORE_SPACE) != 0) { 2507 i = PatternProps.skipWhiteSpace(pattern, i); 2508 } 2509 2510 if (i != pattern.length()) { 2511 throw new IllegalArgumentException("Parse of \"" + pattern + 2512 "\" failed at " + i); 2513 } 2514 } 2515 return this; 2516 } 2517 2518 // Add constants to make the applyPattern() code easier to follow. 2519 2520 private static final int LAST0_START = 0, 2521 LAST1_RANGE = 1, 2522 LAST2_SET = 2; 2523 2524 private static final int MODE0_NONE = 0, 2525 MODE1_INBRACKET = 1, 2526 MODE2_OUTBRACKET = 2; 2527 2528 private static final int SETMODE0_NONE = 0, 2529 SETMODE1_UNICODESET = 1, 2530 SETMODE2_PROPERTYPAT = 2, 2531 SETMODE3_PREPARSED = 3; 2532 2533 private static final int MAX_DEPTH = 100; 2534 2535 /** 2536 * Parse the pattern from the given RuleCharacterIterator. The 2537 * iterator is advanced over the parsed pattern. 2538 * @param chars iterator over the pattern characters. Upon return 2539 * it will be advanced to the first character after the parsed 2540 * pattern, or the end of the iteration if all characters are 2541 * parsed. 2542 * @param symbols symbol table to use to parse and dereference 2543 * variables, or null if none. 2544 * @param rebuiltPat the pattern that was parsed, rebuilt or 2545 * copied from the input pattern, as appropriate. 2546 * @param options a bit mask of zero or more of the following: 2547 * IGNORE_SPACE, CASE. 
2548 */ 2549 private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, 2550 Appendable rebuiltPat, int options, int depth) { 2551 if (depth > MAX_DEPTH) { 2552 syntaxError(chars, "Pattern nested too deeply"); 2553 } 2554 2555 // Syntax characters: [ ] ^ - & { } 2556 2557 // Recognized special forms for chars, sets: c-c s-s s&s 2558 2559 int opts = RuleCharacterIterator.PARSE_VARIABLES | 2560 RuleCharacterIterator.PARSE_ESCAPES; 2561 if ((options & IGNORE_SPACE) != 0) { 2562 opts |= RuleCharacterIterator.SKIP_WHITESPACE; 2563 } 2564 2565 StringBuilder patBuf = new StringBuilder(), buf = null; 2566 boolean usePat = false; 2567 UnicodeSet scratch = null; 2568 Object backup = null; 2569 2570 // mode: 0=before [, 1=between [...], 2=after ] 2571 // lastItem: 0=none, 1=char, 2=set 2572 int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE; 2573 char op = 0; 2574 2575 boolean invert = false; 2576 2577 clear(); 2578 String lastString = null; 2579 2580 while (mode != MODE2_OUTBRACKET && !chars.atEnd()) { 2581 //Eclipse stated the following is "dead code" 2582 /* 2583 if (false) { 2584 // Debugging assertion 2585 if (!((lastItem == 0 && op == 0) || 2586 (lastItem == 1 && (op == 0 || op == '-')) || 2587 (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) { 2588 throw new IllegalArgumentException(); 2589 } 2590 }*/ 2591 2592 int c = 0; 2593 boolean literal = false; 2594 UnicodeSet nested = null; 2595 2596 // -------- Check for property pattern 2597 2598 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 2599 int setMode = SETMODE0_NONE; 2600 if (resemblesPropertyPattern(chars, opts)) { 2601 setMode = SETMODE2_PROPERTYPAT; 2602 } 2603 2604 // -------- Parse '[' of opening delimiter OR nested set. 2605 // If there is a nested set, use `setMode' to define how 2606 // the set should be parsed. If the '[' is part of the 2607 // opening delimiter for this pattern, parse special 2608 // strings "[", "[^", "[-", and "[^-". 
Check for stand-in 2609 // characters representing a nested set in the symbol 2610 // table. 2611 2612 else { 2613 // Prepare to backup if necessary 2614 backup = chars.getPos(backup); 2615 c = chars.next(opts); 2616 literal = chars.isEscaped(); 2617 2618 if (c == '[' && !literal) { 2619 if (mode == MODE1_INBRACKET) { 2620 chars.setPos(backup); // backup 2621 setMode = SETMODE1_UNICODESET; 2622 } else { 2623 // Handle opening '[' delimiter 2624 mode = MODE1_INBRACKET; 2625 patBuf.append('['); 2626 backup = chars.getPos(backup); // prepare to backup 2627 c = chars.next(opts); 2628 literal = chars.isEscaped(); 2629 if (c == '^' && !literal) { 2630 invert = true; 2631 patBuf.append('^'); 2632 backup = chars.getPos(backup); // prepare to backup 2633 c = chars.next(opts); 2634 literal = chars.isEscaped(); 2635 } 2636 // Fall through to handle special leading '-'; 2637 // otherwise restart loop for nested [], \p{}, etc. 2638 if (c == '-') { 2639 literal = true; 2640 // Fall through to handle literal '-' below 2641 } else { 2642 chars.setPos(backup); // backup 2643 continue; 2644 } 2645 } 2646 } else if (symbols != null) { 2647 UnicodeMatcher m = symbols.lookupMatcher(c); // may be null 2648 if (m != null) { 2649 try { 2650 nested = (UnicodeSet) m; 2651 setMode = SETMODE3_PREPARSED; 2652 } catch (ClassCastException e) { 2653 syntaxError(chars, "Syntax error"); 2654 } 2655 } 2656 } 2657 } 2658 2659 // -------- Handle a nested set. This either is inline in 2660 // the pattern or represented by a stand-in that has 2661 // previously been parsed and was looked up in the symbol 2662 // table. 
2663 2664 if (setMode != SETMODE0_NONE) { 2665 if (lastItem == LAST1_RANGE) { 2666 if (op != 0) { 2667 syntaxError(chars, "Char expected after operator"); 2668 } 2669 add_unchecked(lastChar, lastChar); 2670 _appendToPat(patBuf, lastChar, false); 2671 lastItem = LAST0_START; 2672 op = 0; 2673 } 2674 2675 if (op == '-' || op == '&') { 2676 patBuf.append(op); 2677 } 2678 2679 if (nested == null) { 2680 if (scratch == null) scratch = new UnicodeSet(); 2681 nested = scratch; 2682 } 2683 switch (setMode) { 2684 case SETMODE1_UNICODESET: 2685 nested.applyPattern(chars, symbols, patBuf, options, depth + 1); 2686 break; 2687 case SETMODE2_PROPERTYPAT: 2688 chars.skipIgnored(opts); 2689 nested.applyPropertyPattern(chars, patBuf, symbols); 2690 break; 2691 case SETMODE3_PREPARSED: // `nested' already parsed 2692 nested._toPattern(patBuf, false); 2693 break; 2694 } 2695 2696 usePat = true; 2697 2698 if (mode == MODE0_NONE) { 2699 // Entire pattern is a category; leave parse loop 2700 set(nested); 2701 mode = MODE2_OUTBRACKET; 2702 break; 2703 } 2704 2705 switch (op) { 2706 case '-': 2707 removeAll(nested); 2708 break; 2709 case '&': 2710 retainAll(nested); 2711 break; 2712 case 0: 2713 addAll(nested); 2714 break; 2715 } 2716 2717 op = 0; 2718 lastItem = LAST2_SET; 2719 2720 continue; 2721 } 2722 2723 if (mode == MODE0_NONE) { 2724 syntaxError(chars, "Missing '['"); 2725 } 2726 2727 // -------- Parse special (syntax) characters. If the 2728 // current character is not special, or if it is escaped, 2729 // then fall through and handle it below. 
2730 2731 if (!literal) { 2732 switch (c) { 2733 case ']': 2734 if (lastItem == LAST1_RANGE) { 2735 add_unchecked(lastChar, lastChar); 2736 _appendToPat(patBuf, lastChar, false); 2737 } 2738 // Treat final trailing '-' as a literal 2739 if (op == '-') { 2740 add_unchecked(op, op); 2741 patBuf.append(op); 2742 } else if (op == '&') { 2743 syntaxError(chars, "Trailing '&'"); 2744 } 2745 patBuf.append(']'); 2746 mode = MODE2_OUTBRACKET; 2747 continue; 2748 case '-': 2749 if (op == 0) { 2750 if (lastItem != LAST0_START) { 2751 op = (char) c; 2752 continue; 2753 } else if (lastString != null) { 2754 op = (char) c; 2755 continue; 2756 } else { 2757 // Treat final trailing '-' as a literal 2758 add_unchecked(c, c); 2759 c = chars.next(opts); 2760 literal = chars.isEscaped(); 2761 if (c == ']' && !literal) { 2762 patBuf.append("-]"); 2763 mode = MODE2_OUTBRACKET; 2764 continue; 2765 } 2766 } 2767 } 2768 syntaxError(chars, "'-' not after char, string, or set"); 2769 break; 2770 case '&': 2771 if (lastItem == LAST2_SET && op == 0) { 2772 op = (char) c; 2773 continue; 2774 } 2775 syntaxError(chars, "'&' not after set"); 2776 break; 2777 case '^': 2778 syntaxError(chars, "'^' not after '['"); 2779 break; 2780 case '{': 2781 if (op != 0 && op != '-') { 2782 syntaxError(chars, "Missing operand after operator"); 2783 } 2784 if (lastItem == LAST1_RANGE) { 2785 add_unchecked(lastChar, lastChar); 2786 _appendToPat(patBuf, lastChar, false); 2787 } 2788 lastItem = LAST0_START; 2789 if (buf == null) { 2790 buf = new StringBuilder(); 2791 } else { 2792 buf.setLength(0); 2793 } 2794 boolean ok = false; 2795 while (!chars.atEnd()) { 2796 c = chars.next(opts); 2797 literal = chars.isEscaped(); 2798 if (c == '}' && !literal) { 2799 ok = true; 2800 break; 2801 } 2802 appendCodePoint(buf, c); 2803 } 2804 if (buf.length() < 1 || !ok) { 2805 syntaxError(chars, "Invalid multicharacter string"); 2806 } 2807 // We have new string. 
Add it to set and continue; 2808 // we don't need to drop through to the further 2809 // processing 2810 String curString = buf.toString(); 2811 if (op == '-') { 2812 int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString); 2813 int curSingle = CharSequences.getSingleCodePoint(curString); 2814 if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) { 2815 add(lastSingle,curSingle); 2816 } else { 2817 if (strings == EMPTY_STRINGS) { 2818 strings = new TreeSet<>(); 2819 } 2820 try { 2821 StringRange.expand(lastString, curString, true, strings); 2822 } catch (Exception e) { 2823 syntaxError(chars, e.getMessage()); 2824 } 2825 } 2826 lastString = null; 2827 op = 0; 2828 } else { 2829 add(curString); 2830 lastString = curString; 2831 } 2832 patBuf.append('{'); 2833 _appendToPat(patBuf, curString, false); 2834 patBuf.append('}'); 2835 continue; 2836 case SymbolTable.SYMBOL_REF: 2837 // symbols nosymbols 2838 // [a-$] error error (ambiguous) 2839 // [a$] anchor anchor 2840 // [a-$x] var "x"* literal '$' 2841 // [a-$.] error literal '$' 2842 // *We won't get here in the case of var "x" 2843 backup = chars.getPos(backup); 2844 c = chars.next(opts); 2845 literal = chars.isEscaped(); 2846 boolean anchor = (c == ']' && !literal); 2847 if (symbols == null && !anchor) { 2848 c = SymbolTable.SYMBOL_REF; 2849 chars.setPos(backup); 2850 break; // literal '$' 2851 } 2852 if (anchor && op == 0) { 2853 if (lastItem == LAST1_RANGE) { 2854 add_unchecked(lastChar, lastChar); 2855 _appendToPat(patBuf, lastChar, false); 2856 } 2857 add_unchecked(UnicodeMatcher.ETHER); 2858 usePat = true; 2859 patBuf.append(SymbolTable.SYMBOL_REF).append(']'); 2860 mode = MODE2_OUTBRACKET; 2861 continue; 2862 } 2863 syntaxError(chars, "Unquoted '$'"); 2864 break; 2865 default: 2866 break; 2867 } 2868 } 2869 2870 // -------- Parse literal characters. This includes both 2871 // escaped chars ("\u4E01") and non-syntax characters 2872 // ("a"). 
2873 2874 switch (lastItem) { 2875 case LAST0_START: 2876 if (op == '-' && lastString != null) { 2877 syntaxError(chars, "Invalid range"); 2878 } 2879 lastItem = LAST1_RANGE; 2880 lastChar = c; 2881 lastString = null; 2882 break; 2883 case LAST1_RANGE: 2884 if (op == '-') { 2885 if (lastString != null) { 2886 syntaxError(chars, "Invalid range"); 2887 } 2888 if (lastChar >= c) { 2889 // Don't allow redundant (a-a) or empty (b-a) ranges; 2890 // these are most likely typos. 2891 syntaxError(chars, "Invalid range"); 2892 } 2893 add_unchecked(lastChar, c); 2894 _appendToPat(patBuf, lastChar, false); 2895 patBuf.append(op); 2896 _appendToPat(patBuf, c, false); 2897 lastItem = LAST0_START; 2898 op = 0; 2899 } else { 2900 add_unchecked(lastChar, lastChar); 2901 _appendToPat(patBuf, lastChar, false); 2902 lastChar = c; 2903 } 2904 break; 2905 case LAST2_SET: 2906 if (op != 0) { 2907 syntaxError(chars, "Set expected after operator"); 2908 } 2909 lastChar = c; 2910 lastItem = LAST1_RANGE; 2911 break; 2912 } 2913 } 2914 2915 if (mode != MODE2_OUTBRACKET) { 2916 syntaxError(chars, "Missing ']'"); 2917 } 2918 2919 chars.skipIgnored(opts); 2920 2921 /** 2922 * Handle global flags (invert, case insensitivity). If this 2923 * pattern should be compiled case-insensitive, then we need 2924 * to close over case BEFORE COMPLEMENTING. This makes 2925 * patterns like /[^abc]/i work. 2926 */ 2927 if ((options & CASE) != 0) { 2928 closeOver(CASE); 2929 } 2930 if (invert) { 2931 complement(); 2932 } 2933 2934 // Use the rebuilt pattern (pat) only if necessary. Prefer the 2935 // generated pattern. 
if (usePat) {
    append(rebuiltPat, patBuf.toString());
} else {
    appendNewPattern(rebuiltPat, false, true);
}
}

/**
 * Reports a pattern syntax error. This method always throws and never
 * returns normally.
 *
 * @param chars iterator over the pattern being parsed; its toString() value
 *              is passed through Utility.escape and quoted in the message
 * @param msg   description of the problem
 * @throws IllegalArgumentException always, with the text
 *         {@code Error: <msg> at "<escaped chars>"}
 */
private static void syntaxError(RuleCharacterIterator chars, String msg) {
    throw new IllegalArgumentException("Error: " + msg + " at \"" +
            Utility.escape(chars.toString()) +
            '"');
}

/**
 * Add the contents of the UnicodeSet (as strings) into a collection.
 * Delegates to the static addAllTo helper with this set as the source.
 * @param target collection to add into
 * @return whatever the static helper returns -- presumably target itself
 *         (helper is not visible here; TODO confirm)
 * @stable ICU 4.4
 */
public <T extends Collection<String>> T addAllTo(T target) {
    return addAllTo(this, target);
}


/**
 * Add the contents of the UnicodeSet (as strings) into an array.
 * Delegates to the static addAllTo helper with this set as the source.
 * @param target array to add into
 * @return whatever the static helper returns -- presumably target itself
 *         (helper is not visible here; TODO confirm)
 * @stable ICU 4.4
 */
public String[] addAllTo(String[] target) {
    return addAllTo(this, target);
}

/**
 * Returns the contents of the given UnicodeSet (as strings) in an array.
 * A new String array with set.size() slots is allocated and handed to the
 * static addAllTo helper; the helper's result is returned.
 * @param set the set to convert
 * @stable ICU 4.4
 */
public static String[] toArray(UnicodeSet set) {
    return addAllTo(set, new String[set.size()]);
}

/**
 * Add the contents of the collection (as strings) into this UnicodeSet.
 * The collection must not contain null.
 * This is a synonym for addAll(source).
 * @param source the collection to add
 * @return a reference to this object
 * @stable ICU 4.4
 */
public UnicodeSet add(Iterable<?> source) {
    return addAll(source);
}

/**
 * Add a collection (as strings) into this UnicodeSet.
 * Uses standard naming convention.
2990 * @param source collection to add into 2991 * @return a reference to this object 2992 * @stable ICU 4.4 2993 */ 2994 public UnicodeSet addAll(Iterable<?> source) { 2995 checkFrozen(); 2996 for (Object o : source) { 2997 add(o.toString()); 2998 } 2999 return this; 3000 } 3001 3002 //---------------------------------------------------------------- 3003 // Implementation: Utility methods 3004 //---------------------------------------------------------------- 3005 3006 private int nextCapacity(int minCapacity) { 3007 // Grow exponentially to reduce the frequency of allocations. 3008 if (minCapacity < INITIAL_CAPACITY) { 3009 return minCapacity + INITIAL_CAPACITY; 3010 } else if (minCapacity <= 2500) { 3011 return 5 * minCapacity; 3012 } else { 3013 int newCapacity = 2 * minCapacity; 3014 if (newCapacity > MAX_LENGTH) { 3015 newCapacity = MAX_LENGTH; 3016 } 3017 return newCapacity; 3018 } 3019 } 3020 3021 private void ensureCapacity(int newLen) { 3022 if (newLen > MAX_LENGTH) { 3023 newLen = MAX_LENGTH; 3024 } 3025 if (newLen <= list.length) return; 3026 int newCapacity = nextCapacity(newLen); 3027 int[] temp = new int[newCapacity]; 3028 // Copy only the actual contents. 3029 System.arraycopy(list, 0, temp, 0, len); 3030 list = temp; 3031 } 3032 3033 private void ensureBufferCapacity(int newLen) { 3034 if (newLen > MAX_LENGTH) { 3035 newLen = MAX_LENGTH; 3036 } 3037 if (buffer != null && newLen <= buffer.length) return; 3038 int newCapacity = nextCapacity(newLen); 3039 buffer = new int[newCapacity]; 3040 // The buffer has no contents to be copied. 3041 // It is always filled from scratch after this call. 3042 } 3043 3044 /** 3045 * Assumes start <= end. 
*/
private int[] range(int start, int end) {
    // rangeList is a reusable scratch array holding { start, end+1, HIGH };
    // it is allocated on first use and overwritten by every later call, so
    // the returned array is only valid until the next call to range().
    if (rangeList == null) {
        rangeList = new int[] { start, end+1, HIGH };
    } else {
        rangeList[0] = start;
        rangeList[1] = end+1;
    }
    return rangeList;
}

//----------------------------------------------------------------
// Implementation: Fundamental operations
//----------------------------------------------------------------

// The three routines below share one shape: they walk list[] (cursor i,
// current value a) and other[] (cursor j, current value b) in parallel,
// write the merged boundaries into buffer[] (cursor k), terminate the result
// with HIGH, store its length in len, swap list and buffer, and reset pat
// to null. Both inputs are read until the value HIGH is reached, so each
// must be terminated by HIGH.

// polarity = 0, 3 is normal: x xor y
// polarity = 1, 2: x xor ~y == x === y

/**
 * Replaces this set's list with the symmetric difference of list and other.
 *
 * @param other    boundary array to combine with; must end with HIGH
 * @param otherLen length of other, used only to size the buffer
 * @param polarity see the table above
 * @return this
 */
private UnicodeSet xor(int[] other, int otherLen, int polarity) {
    ensureBufferCapacity(len + otherLen);
    int i = 0, j = 0, k = 0;
    int a = list[i++];
    int b;
    // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used
    // so the following if statement will not be called.
    ///CLOVER:OFF
    if (polarity == 1 || polarity == 2) {
        b = LOW;
        if (other[j] == LOW) { // skip base if already LOW
            ++j;
            b = other[j];
        }
        ///CLOVER:ON
    } else {
        b = other[j++];
    }
    // simplest of all the routines
    // sort the values, discarding identicals!
    while (true) {
        if (a < b) {
            buffer[k++] = a;
            a = list[i++];
        } else if (b < a) {
            buffer[k++] = b;
            b = other[j++];
        } else if (a != HIGH) { // at this point, a == b
            // discard both values!
            a = list[i++];
            b = other[j++];
        } else { // DONE!
            buffer[k++] = HIGH;
            len = k;
            break;
        }
    }
    // swap list and buffer
    int[] temp = list;
    list = buffer;
    buffer = temp;
    pat = null;
    return this;
}

// polarity = 0 is normal: x union y
// polarity = 2: x union ~y
// polarity = 1: ~x union y
// polarity = 3: ~x union ~y

/**
 * Replaces this set's list with the union of list and other, with either
 * operand optionally complemented as selected by polarity.
 *
 * @param other    boundary array to combine with; must end with HIGH
 * @param otherLen length of other, used only to size the buffer
 * @param polarity initial state, see the table above; it is toggled
 *                 (bit 1 for a, bit 2 for b) every time a cursor advances
 * @return this
 */
private UnicodeSet add(int[] other, int otherLen, int polarity) {
    ensureBufferCapacity(len + otherLen);
    int i = 0, j = 0, k = 0;
    int a = list[i++];
    int b = other[j++];
    // change from xor is that we have to check overlapping pairs
    // polarity bit 1 means a is second, bit 2 means b is.
    main:
    while (true) {
        switch (polarity) {
        case 0: // both first; take lower if unequal
            if (a < b) { // take a
                // Back up over overlapping ranges in buffer[]
                if (k > 0 && a <= buffer[k-1]) {
                    // Pick latter end value in buffer[] vs. list[]
                    a = max(list[i], buffer[--k]);
                } else {
                    // No overlap
                    buffer[k++] = a;
                    a = list[i];
                }
                i++; // Common if/else code factored out
                polarity ^= 1;
            } else if (b < a) { // take b
                if (k > 0 && b <= buffer[k-1]) {
                    b = max(other[j], buffer[--k]);
                } else {
                    buffer[k++] = b;
                    b = other[j];
                }
                j++;
                polarity ^= 2;
            } else { // a == b, take a, drop b
                if (a == HIGH) break main;
                // This is symmetrical; it doesn't matter if
                // we backtrack with a or b. - liu
                if (k > 0 && a <= buffer[k-1]) {
                    a = max(list[i], buffer[--k]);
                } else {
                    // No overlap
                    buffer[k++] = a;
                    a = list[i];
                }
                i++;
                polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        case 3: // both second; take higher if unequal, and drop other
            if (b <= a) { // take a
                if (a == HIGH) break main;
                buffer[k++] = a;
            } else { // take b
                if (b == HIGH) break main;
                buffer[k++] = b;
            }
            a = list[i++]; polarity ^= 1; // factored common code
            b = other[j++]; polarity ^= 2;
            break;
        case 1: // a second, b first; if b < a, overlap
            if (a < b) { // no overlap, take a
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
            } else if (b < a) { // OVERLAP, drop b
                b = other[j++]; polarity ^= 2;
            } else { // a == b, drop both!
                if (a == HIGH) break main;
                a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        case 2: // a first, b second; if a < b, overlap
            if (b < a) { // no overlap, take b
                buffer[k++] = b; b = other[j++]; polarity ^= 2;
            } else if (a < b) { // OVERLAP, drop a
                a = list[i++]; polarity ^= 1;
            } else { // a == b, drop both!
                if (a == HIGH) break main;
                a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        }
    }
    buffer[k++] = HIGH; // terminate
    len = k;
    // swap list and buffer
    int[] temp = list;
    list = buffer;
    buffer = temp;
    pat = null;
    return this;
}

// polarity = 0 is normal: x intersect y
// polarity = 2: x intersect ~y == set-minus
// polarity = 1: ~x intersect y
// polarity = 3: ~x intersect ~y

/**
 * Replaces this set's list with the intersection of list and other, with
 * either operand optionally complemented as selected by polarity.
 *
 * @param other    boundary array to combine with; must end with HIGH
 * @param otherLen length of other, used only to size the buffer
 * @param polarity initial state, see the table above; it is toggled
 *                 (bit 1 for a, bit 2 for b) every time a cursor advances
 * @return this
 */
private UnicodeSet retain(int[] other, int otherLen, int polarity) {
    ensureBufferCapacity(len + otherLen);
    int i = 0, j = 0, k = 0;
    int a = list[i++];
    int b = other[j++];
    // change from xor is that we have to check overlapping pairs
    // polarity bit 1 means a is second, bit 2 means b is.
    main:
    while (true) {
        switch (polarity) {
        case 0: // both first; drop the smaller
            if (a < b) { // drop a
                a = list[i++]; polarity ^= 1;
            } else if (b < a) { // drop b
                b = other[j++]; polarity ^= 2;
            } else { // a == b, take one, drop other
                if (a == HIGH) break main;
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        case 3: // both second; take lower if unequal
            if (a < b) { // take a
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
            } else if (b < a) { // take b
                buffer[k++] = b; b = other[j++]; polarity ^= 2;
            } else { // a == b, take one, drop other
                if (a == HIGH) break main;
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        case 1: // a second, b first;
            if (a < b) { // NO OVERLAP, drop a
                a = list[i++]; polarity ^= 1;
            } else if (b < a) { // OVERLAP, take b
                buffer[k++] = b; b = other[j++]; polarity ^= 2;
            } else { // a == b, drop both!
                if (a == HIGH) break main;
                a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        case 2: // a first, b second; if a < b, overlap
            if (b < a) { // no overlap, drop b
                b = other[j++]; polarity ^= 2;
            } else if (a < b) { // OVERLAP, take a
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
            } else { // a == b, drop both!
                if (a == HIGH) break main;
                a = list[i++]; polarity ^= 1;
                b = other[j++]; polarity ^= 2;
            }
            break;
        }
    }
    buffer[k++] = HIGH; // terminate
    len = k;
    // swap list and buffer
    int[] temp = list;
    list = buffer;
    buffer = temp;
    pat = null;
    return this;
}

/** Returns the larger of a and b (b when they are equal). */
private static final int max(int a, int b) {
    return (a > b) ? a : b;
}

//----------------------------------------------------------------
// Generic filter-based scanning code
//----------------------------------------------------------------

/** Predicate over code points, consumed by applyFilter. */
private static interface Filter {
    /** Returns true if the code point belongs to the set being built. */
    boolean contains(int codePoint);
}

/**
 * Accepts code points whose UCharacter.getUnicodeNumericValue result is
 * exactly equal (double ==) to the configured value.
 */
private static final class NumericValueFilter implements Filter {
    double value;
    NumericValueFilter(double value) { this.value = value; }
    @Override
    public boolean contains(int ch) {
        return UCharacter.getUnicodeNumericValue(ch) == value;
    }
}

/**
 * Accepts code points whose general category, expressed as the bit
 * (1 << UCharacter.getType(ch)), is present in the configured mask.
 */
private static final class GeneralCategoryMaskFilter implements Filter {
    int mask;
    GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
    @Override
    public boolean contains(int ch) {
        return ((1 << UCharacter.getType(ch)) & mask) != 0;
    }
}

/**
 * Accepts code points for which UCharacter.getIntPropertyValue(ch, prop)
 * equals the configured value.
 */
private static final class IntPropertyFilter implements Filter {
    int prop;
    int value;
    IntPropertyFilter(int prop, int value) {
        this.prop = prop;
        this.value = value;
    }
    @Override
    public boolean contains(int ch) {
        return UCharacter.getIntPropertyValue(ch, prop) == value;
    }
}

private static final class ScriptExtensionsFilter implements Filter { 3322 int script; 3323 ScriptExtensionsFilter(int script) { this.script = script; } 3324 @Override 3325 public boolean contains(int c) { 3326 return UScript.hasScript(c, script); 3327 } 3328 } 3329 3330 // VersionInfo for unassigned characters 3331 private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); 3332 3333 private static final class VersionFilter implements Filter { 3334 VersionInfo version; 3335 VersionFilter(VersionInfo version) { this.version = version; } 3336 @Override 3337 public boolean contains(int ch) { 3338 VersionInfo v = UCharacter.getAge(ch); 3339 // Reference comparison ok; VersionInfo caches and reuses 3340 // unique objects. 3341 return !Utility.sameObjects(v, NO_VERSION) && 3342 v.compareTo(version) <= 0; 3343 } 3344 } 3345 3346 /** 3347 * Generic filter-based scanning code for UCD property UnicodeSets. 3348 */ 3349 private void applyFilter(Filter filter, UnicodeSet inclusions) { 3350 // Logically, walk through all Unicode characters, noting the start 3351 // and end of each range for which filter.contain(c) is 3352 // true. Add each range to a set. 3353 // 3354 // To improve performance, use an inclusions set which 3355 // encodes information about character ranges that are known 3356 // to have identical properties. 3357 // inclusions contains the first characters of 3358 // same-value ranges for the given property. 
3359 3360 clear(); 3361 3362 int startHasProperty = -1; 3363 int limitRange = inclusions.getRangeCount(); 3364 3365 for (int j=0; j<limitRange; ++j) { 3366 // get current range 3367 int start = inclusions.getRangeStart(j); 3368 int end = inclusions.getRangeEnd(j); 3369 3370 // for all the code points in the range, process 3371 for (int ch = start; ch <= end; ++ch) { 3372 // only add to the unicodeset on inflection points -- 3373 // where the hasProperty value changes to false 3374 if (filter.contains(ch)) { 3375 if (startHasProperty < 0) { 3376 startHasProperty = ch; 3377 } 3378 } else if (startHasProperty >= 0) { 3379 add_unchecked(startHasProperty, ch-1); 3380 startHasProperty = -1; 3381 } 3382 } 3383 } 3384 if (startHasProperty >= 0) { 3385 add_unchecked(startHasProperty, 0x10FFFF); 3386 } 3387 } 3388 3389 /** 3390 * Remove leading and trailing Pattern_White_Space and compress 3391 * internal Pattern_White_Space to a single space character. 3392 */ 3393 private static String mungeCharName(String source) { 3394 source = PatternProps.trimWhiteSpace(source); 3395 StringBuilder buf = null; 3396 for (int i=0; i<source.length(); ++i) { 3397 char ch = source.charAt(i); 3398 if (PatternProps.isWhiteSpace(ch)) { 3399 if (buf == null) { 3400 buf = new StringBuilder().append(source, 0, i); 3401 } else if (buf.charAt(buf.length() - 1) == ' ') { 3402 continue; 3403 } 3404 ch = ' '; // convert to ' ' 3405 } 3406 if (buf != null) { 3407 buf.append(ch); 3408 } 3409 } 3410 return buf == null ? source : buf.toString(); 3411 } 3412 3413 //---------------------------------------------------------------- 3414 // Property set API 3415 //---------------------------------------------------------------- 3416 3417 /** 3418 * Modifies this set to contain those code points which have the 3419 * given value for the given binary or enumerated property, as 3420 * returned by UCharacter.getIntPropertyValue. Prior contents of 3421 * this set are lost. 
3422 * 3423 * @param prop a property in the range 3424 * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or 3425 * UProperty.INT_START..UProperty.INT_LIMIT-1 or. 3426 * UProperty.MASK_START..UProperty.MASK_LIMIT-1. 3427 * 3428 * @param value a value in the range 3429 * UCharacter.getIntPropertyMinValue(prop).. 3430 * UCharacter.getIntPropertyMaxValue(prop), with one exception. 3431 * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be 3432 * a UCharacter.getType() result, but rather a mask value produced 3433 * by logically ORing (1 << UCharacter.getType()) values together. 3434 * This allows grouped categories such as [:L:] to be represented. 3435 * 3436 * @return a reference to this set 3437 * 3438 * @stable ICU 2.4 3439 */ 3440 public UnicodeSet applyIntPropertyValue(int prop, int value) { 3441 // All of the following include checkFrozen() before modifying this set. 3442 if (prop == UProperty.GENERAL_CATEGORY_MASK) { 3443 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3444 applyFilter(new GeneralCategoryMaskFilter(value), inclusions); 3445 } else if (prop == UProperty.SCRIPT_EXTENSIONS) { 3446 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3447 applyFilter(new ScriptExtensionsFilter(value), inclusions); 3448 } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) { 3449 if (value == 0 || value == 1) { 3450 set(CharacterProperties.getBinaryPropertySet(prop)); 3451 if (value == 0) { 3452 complement(); 3453 } 3454 } else { 3455 clear(); 3456 } 3457 } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) { 3458 UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop); 3459 applyFilter(new IntPropertyFilter(prop, value), inclusions); 3460 } else { 3461 throw new IllegalArgumentException("unsupported property " + prop); 3462 } 3463 return this; 3464 } 3465 3466 3467 3468 /** 3469 * Modifies this set to contain those code points which have the 3470 * given 
value for the given property. Prior contents of this 3471 * set are lost. 3472 * 3473 * @param propertyAlias a property alias, either short or long. 3474 * The name is matched loosely. See PropertyAliases.txt for names 3475 * and a description of loose matching. If the value string is 3476 * empty, then this string is interpreted as either a 3477 * General_Category value alias, a Script value alias, a binary 3478 * property alias, or a special ID. Special IDs are matched 3479 * loosely and correspond to the following sets: 3480 * 3481 * "ANY" = [\\u0000-\\U0010FFFF], 3482 * "ASCII" = [\\u0000-\\u007F]. 3483 * 3484 * @param valueAlias a value alias, either short or long. The 3485 * name is matched loosely. See PropertyValueAliases.txt for 3486 * names and a description of loose matching. In addition to 3487 * aliases listed, numeric values and canonical combining classes 3488 * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc", 3489 * "220"). The value string may also be empty. 3490 * 3491 * @return a reference to this set 3492 * 3493 * @stable ICU 2.4 3494 */ 3495 public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) { 3496 return applyPropertyAlias(propertyAlias, valueAlias, null); 3497 } 3498 3499 /** 3500 * Modifies this set to contain those code points which have the 3501 * given value for the given property. Prior contents of this 3502 * set are lost. 3503 * @param propertyAlias A string of the property alias. 3504 * @param valueAlias A string of the value alias. 3505 * @param symbols if not null, then symbols are first called to see if a property 3506 * is available. If true, then everything else is skipped. 
* @return this set
* @throws IllegalArgumentException if the property or value cannot be
*         resolved, or the property is not supported
* @stable ICU 3.2
*/
public UnicodeSet applyPropertyAlias(String propertyAlias,
        String valueAlias, SymbolTable symbols) {
    checkFrozen();
    // p and v end up holding the property enum and value that are handed to
    // applyIntPropertyValue(); every path that does not set both returns or
    // throws before reaching that call.
    int p;
    int v;
    boolean invert = false;

    // Give the caller-supplied extended symbol table the first chance.
    if (symbols != null
            && (symbols instanceof XSymbolTable)
            && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
        return this;
    }

    // Then the globally installed extended symbol table, if any.
    if (XSYMBOL_TABLE != null) {
        if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) {
            return this;
        }
    }

    if (valueAlias.length() > 0) {
        p = UCharacter.getPropertyEnum(propertyAlias);

        // Treat gc as gcm
        if (p == UProperty.GENERAL_CATEGORY) {
            p = UProperty.GENERAL_CATEGORY_MASK;
        }

        if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) ||
                (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) ||
                (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) {
            try {
                v = UCharacter.getPropertyValueEnum(p, valueAlias);
            } catch (IllegalArgumentException e) {
                // Handle numeric CCC
                if (p == UProperty.CANONICAL_COMBINING_CLASS ||
                        p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
                        p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
                    // A non-numeric string makes parseInt throw
                    // NumberFormatException, which is itself an
                    // IllegalArgumentException.
                    v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias));
                    // Anything between 0 and 255 is valid even if unused.
                    if (v < 0 || v > 255) throw e;
                } else {
                    throw e;
                }
            }
        }

        else {
            switch (p) {
            case UProperty.NUMERIC_VALUE:
            {
                double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias));
                applyFilter(new NumericValueFilter(value),
                        CharacterPropertiesImpl.getInclusionsForProperty(p));
                return this;
            }
            case UProperty.NAME:
            {
                // Must munge name, since
                // UCharacter.charFromName() does not do
                // 'loose' matching.
                String buf = mungeCharName(valueAlias);
                int ch = UCharacter.getCharFromExtendedName(buf);
                if (ch == -1) {
                    throw new IllegalArgumentException("Invalid character name");
                }
                clear();
                add_unchecked(ch);
                return this;
            }
            case UProperty.UNICODE_1_NAME:
                // ICU 49 deprecates the Unicode_1_Name property APIs.
                throw new IllegalArgumentException("Unicode_1_Name (na1) not supported");
            case UProperty.AGE:
            {
                // Must munge name, since
                // VersionInfo.getInstance() does not do
                // 'loose' matching.
                VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
                applyFilter(new VersionFilter(version),
                        CharacterPropertiesImpl.getInclusionsForProperty(p));
                return this;
            }
            case UProperty.SCRIPT_EXTENSIONS:
                // The value is looked up as a Script value alias.
                v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias);
                // Leave the switch and continue to the applyIntPropertyValue()
                // call at the end of the method (this is not a case fall-through).
                break;
            default:
                // p is a non-binary, non-enumerated property that we
                // don't support (yet).
                throw new IllegalArgumentException("Unsupported property");
            }
        }
    }

    else {
        // valueAlias is empty.  Interpret as General Category, Script,
        // Binary property, or ANY or ASCII.  Upon success, p and v will
        // be set.
        UPropertyAliases pnames = UPropertyAliases.INSTANCE;
        p = UProperty.GENERAL_CATEGORY_MASK;
        v = pnames.getPropertyValueEnum(p, propertyAlias);
        if (v == UProperty.UNDEFINED) {
            p = UProperty.SCRIPT;
            v = pnames.getPropertyValueEnum(p, propertyAlias);
            if (v == UProperty.UNDEFINED) {
                p = pnames.getPropertyEnum(propertyAlias);
                if (p == UProperty.UNDEFINED) {
                    p = -1;
                }
                if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) {
                    // A bare binary property name means "property is true".
                    v = 1;
                } else if (p == -1) {
                    if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) {
                        set(MIN_VALUE, MAX_VALUE);
                        return this;
                    } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) {
                        set(0, 0x7F);
                        return this;
                    } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) {
                        // [:Assigned:]=[:^Cn:]
                        p = UProperty.GENERAL_CATEGORY_MASK;
                        v = (1<<UCharacter.UNASSIGNED);
                        invert = true;
                    } else {
                        // Property name was never matched.
                        throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias);
                    }
                } else {
                    // Valid property name, but it isn't binary, so the value
                    // must be supplied.
                    throw new IllegalArgumentException("Missing property value");
                }
            }
        }
    }

    applyIntPropertyValue(p, v);
    if(invert) {
        complement();
    }

    return this;
}

//----------------------------------------------------------------
// Property set patterns
//----------------------------------------------------------------

/**
 * Return true if the given position, in the given pattern, appears
 * to be the start of a property set pattern.
3661 */ 3662 private static boolean resemblesPropertyPattern(String pattern, int pos) { 3663 // Patterns are at least 5 characters long 3664 if ((pos+5) > pattern.length()) { 3665 return false; 3666 } 3667 3668 // Look for an opening [:, [:^, \p, or \P 3669 return pattern.regionMatches(pos, "[:", 0, 2) || 3670 pattern.regionMatches(true, pos, "\\p", 0, 2) || 3671 pattern.regionMatches(pos, "\\N", 0, 2); 3672 } 3673 3674 /** 3675 * Return true if the given iterator appears to point at a 3676 * property pattern. Regardless of the result, return with the 3677 * iterator unchanged. 3678 * @param chars iterator over the pattern characters. Upon return 3679 * it will be unchanged. 3680 * @param iterOpts RuleCharacterIterator options 3681 */ 3682 private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, 3683 int iterOpts) { 3684 boolean result = false; 3685 iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; 3686 Object pos = chars.getPos(null); 3687 int c = chars.next(iterOpts); 3688 if (c == '[' || c == '\\') { 3689 int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); 3690 result = (c == '[') ? (d == ':') : 3691 (d == 'N' || d == 'p' || d == 'P'); 3692 } 3693 chars.setPos(pos); 3694 return result; 3695 } 3696 3697 /** 3698 * Parse the given property pattern at the given parse position. 3699 * @param symbols TODO 3700 */ 3701 private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { 3702 int pos = ppos.getIndex(); 3703 3704 // On entry, ppos should point to one of the following locations: 3705 3706 // Minimum length is 5 characters, e.g. 
\p{L} 3707 if ((pos+5) > pattern.length()) { 3708 return null; 3709 } 3710 3711 boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 3712 boolean isName = false; // true for \N{pat}, o/w false 3713 boolean invert = false; 3714 3715 // Look for an opening [:, [:^, \p, or \P 3716 if (pattern.regionMatches(pos, "[:", 0, 2)) { 3717 posix = true; 3718 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3719 if (pos < pattern.length() && pattern.charAt(pos) == '^') { 3720 ++pos; 3721 invert = true; 3722 } 3723 } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || 3724 pattern.regionMatches(pos, "\\N", 0, 2)) { 3725 char c = pattern.charAt(pos+1); 3726 invert = (c == 'P'); 3727 isName = (c == 'N'); 3728 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3729 if (pos == pattern.length() || pattern.charAt(pos++) != '{') { 3730 // Syntax error; "\p" or "\P" not followed by "{" 3731 return null; 3732 } 3733 } else { 3734 // Open delimiter not seen 3735 return null; 3736 } 3737 3738 // Look for the matching close delimiter, either :] or } 3739 int close = pattern.indexOf(posix ? ":]" : "}", pos); 3740 if (close < 0) { 3741 // Syntax error; close delimiter missing 3742 return null; 3743 } 3744 3745 // Look for an '=' sign. If this is present, we will parse a 3746 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 3747 // pattern. 
3748 int equals = pattern.indexOf('=', pos); 3749 String propName, valueName; 3750 if (equals >= 0 && equals < close && !isName) { 3751 // Equals seen; parse medium/long pattern 3752 propName = pattern.substring(pos, equals); 3753 valueName = pattern.substring(equals+1, close); 3754 } 3755 3756 else { 3757 // Handle case where no '=' is seen, and \N{} 3758 propName = pattern.substring(pos, close); 3759 valueName = ""; 3760 3761 // Handle \N{name} 3762 if (isName) { 3763 // This is a little inefficient since it means we have to 3764 // parse "na" back to UProperty.NAME even though we already 3765 // know it's UProperty.NAME. If we refactor the API to 3766 // support args of (int, String) then we can remove 3767 // "na" and make this a little more efficient. 3768 valueName = propName; 3769 propName = "na"; 3770 } 3771 } 3772 3773 applyPropertyAlias(propName, valueName, symbols); 3774 3775 if (invert) { 3776 complement(); 3777 } 3778 3779 // Move to the limit position after the close delimiter 3780 ppos.setIndex(close + (posix ? 2 : 1)); 3781 3782 return this; 3783 } 3784 3785 /** 3786 * Parse a property pattern. 3787 * @param chars iterator over the pattern characters. Upon return 3788 * it will be advanced to the first character after the parsed 3789 * pattern, or the end of the iteration if all characters are 3790 * parsed. 3791 * @param rebuiltPat the pattern that was parsed, rebuilt or 3792 * copied from the input pattern, as appropriate. 
3793 * @param symbols TODO 3794 */ 3795 private void applyPropertyPattern(RuleCharacterIterator chars, 3796 Appendable rebuiltPat, SymbolTable symbols) { 3797 String patStr = chars.lookahead(); 3798 ParsePosition pos = new ParsePosition(0); 3799 applyPropertyPattern(patStr, pos, symbols); 3800 if (pos.getIndex() == 0) { 3801 syntaxError(chars, "Invalid property pattern"); 3802 } 3803 chars.jumpahead(pos.getIndex()); 3804 append(rebuiltPat, patStr.substring(0, pos.getIndex())); 3805 } 3806 3807 //---------------------------------------------------------------- 3808 // Case folding API 3809 //---------------------------------------------------------------- 3810 3811 /** 3812 * Bitmask for constructor and applyPattern() indicating that 3813 * white space should be ignored. If set, ignore Unicode Pattern_White_Space characters, 3814 * unless they are quoted or escaped. This may be ORed together 3815 * with other selectors. 3816 * @stable ICU 3.8 3817 */ 3818 public static final int IGNORE_SPACE = 1; 3819 3820 /** 3821 * Bitmask for constructor, applyPattern(), and closeOver() 3822 * indicating letter case. This may be ORed together with other 3823 * selectors. 3824 * 3825 * Enable case insensitive matching. E.g., "[ab]" with this flag 3826 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3827 * match all except 'a', 'A', 'b', and 'B'. This performs a full 3828 * closure over case mappings, e.g. U+017F for s. 3829 * 3830 * The resulting set is a superset of the input for the code points but 3831 * not for the strings. 3832 * It performs a case mapping closure of the code points and adds 3833 * full case folding strings for the code points, and reduces strings of 3834 * the original set to their full case folding equivalents. 3835 * 3836 * This is designed for case-insensitive matches, for example 3837 * in regular expressions. The full code point case closure allows checking of 3838 * an input character directly against the closure set. 
3839 * Strings are matched by comparing the case-folded form from the closure 3840 * set with an incremental case folding of the string in question. 3841 * 3842 * The closure set will also contain single code points if the original 3843 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 3844 * This is not necessary (that is, redundant) for the above matching method 3845 * but results in the same closure sets regardless of whether the original 3846 * set contained the code point or a string. 3847 * @stable ICU 3.8 3848 */ 3849 public static final int CASE = 2; 3850 3851 /** 3852 * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C 3853 * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h). 3854 * @see #CASE 3855 * @stable ICU 3.4 3856 */ 3857 public static final int CASE_INSENSITIVE = 2; 3858 3859 /** 3860 * Bitmask for constructor, applyPattern(), and closeOver() 3861 * indicating letter case. This may be ORed together with other 3862 * selectors. 3863 * 3864 * Enable case insensitive matching. E.g., "[ab]" with this flag 3865 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3866 * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, 3867 * title-, and uppercase mappings as well as the case folding 3868 * of each existing element in the set. 
3869 * @stable ICU 3.4 3870 */ 3871 public static final int ADD_CASE_MAPPINGS = 4; 3872 3873 // add the result of a full case mapping to the set 3874 // use str as a temporary string to avoid constructing one 3875 private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) { 3876 if(result >= 0) { 3877 if(result > UCaseProps.MAX_STRING_LENGTH) { 3878 // add a single-code point case mapping 3879 set.add(result); 3880 } else { 3881 // add a string case mapping from full with length result 3882 set.add(full.toString()); 3883 full.setLength(0); 3884 } 3885 } 3886 // result < 0: the code point mapped to itself, no need to add it 3887 // see UCaseProps 3888 } 3889 3890 /** 3891 * Close this set over the given attribute. For the attribute 3892 * CASE, the result is to modify this set so that: 3893 * 3894 * 1. For each character or string 'a' in this set, all strings 3895 * 'b' such that foldCase(a) == foldCase(b) are added to this set. 3896 * (For most 'a' that are single characters, 'b' will have 3897 * b.length() == 1.) 3898 * 3899 * 2. For each string 'e' in the resulting set, if e != 3900 * foldCase(e), 'e' will be removed. 3901 * 3902 * Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] 3903 * 3904 * (Here foldCase(x) refers to the operation 3905 * UCharacter.foldCase(x, true), and a == b actually denotes 3906 * a.equals(b), not pointer comparison.) 3907 * 3908 * @param attribute bitmask for attributes to close over. 3909 * Currently only the CASE bit is supported. Any undefined bits 3910 * are ignored. 3911 * @return a reference to this set. 
3912 * @stable ICU 3.8 3913 */ 3914 public UnicodeSet closeOver(int attribute) { 3915 checkFrozen(); 3916 if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) { 3917 UCaseProps csp = UCaseProps.INSTANCE; 3918 UnicodeSet foldSet = new UnicodeSet(this); 3919 ULocale root = ULocale.ROOT; 3920 3921 // start with input set to guarantee inclusion 3922 // CASE: remove strings because the strings will actually be reduced (folded); 3923 // therefore, start with no strings and add only those needed 3924 if((attribute & CASE) != 0 && foldSet.hasStrings()) { 3925 foldSet.strings.clear(); 3926 } 3927 3928 int n = getRangeCount(); 3929 int result; 3930 StringBuilder full = new StringBuilder(); 3931 3932 for (int i=0; i<n; ++i) { 3933 int start = getRangeStart(i); 3934 int end = getRangeEnd(i); 3935 3936 if((attribute & CASE) != 0) { 3937 // full case closure 3938 for (int cp=start; cp<=end; ++cp) { 3939 csp.addCaseClosure(cp, foldSet); 3940 } 3941 } else { 3942 // add case mappings 3943 // (does not add long s for regular s, or Kelvin for k, for example) 3944 for (int cp=start; cp<=end; ++cp) { 3945 result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT); 3946 addCaseMapping(foldSet, result, full); 3947 3948 result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT); 3949 addCaseMapping(foldSet, result, full); 3950 3951 result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT); 3952 addCaseMapping(foldSet, result, full); 3953 3954 result = csp.toFullFolding(cp, full, 0); 3955 addCaseMapping(foldSet, result, full); 3956 } 3957 } 3958 } 3959 if (hasStrings()) { 3960 if ((attribute & CASE) != 0) { 3961 for (String s : strings) { 3962 String str = UCharacter.foldCase(s, 0); 3963 if(!csp.addStringCaseClosure(str, foldSet)) { 3964 foldSet.add(str); // does not map to code points: add the folded string itself 3965 } 3966 } 3967 } else { 3968 BreakIterator bi = BreakIterator.getWordInstance(root); 3969 for (String str : strings) { 3970 // TODO: call lower-level functions 3971 
foldSet.add(UCharacter.toLowerCase(root, str)); 3972 foldSet.add(UCharacter.toTitleCase(root, str, bi)); 3973 foldSet.add(UCharacter.toUpperCase(root, str)); 3974 foldSet.add(UCharacter.foldCase(str, 0)); 3975 } 3976 } 3977 } 3978 set(foldSet); 3979 } 3980 return this; 3981 } 3982 3983 /** 3984 * Internal class for customizing UnicodeSet parsing of properties. 3985 * TODO: extend to allow customizing of codepoint ranges 3986 * @draft ICU3.8 (retain) 3987 * @provisional This API might change or be removed in a future release. 3988 * @author medavis 3989 */ 3990 abstract public static class XSymbolTable implements SymbolTable { 3991 /** 3992 * Default constructor 3993 * @draft ICU3.8 (retain) 3994 * @provisional This API might change or be removed in a future release. 3995 */ 3996 public XSymbolTable(){} 3997 /** 3998 * Supplies default implementation for SymbolTable (no action). 3999 * @draft ICU3.8 (retain) 4000 * @provisional This API might change or be removed in a future release. 4001 */ 4002 @Override 4003 public UnicodeMatcher lookupMatcher(int i) { 4004 return null; 4005 } 4006 4007 /** 4008 * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style 4009 * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be. 4010 * <p> 4011 * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a 4012 * propertyName+propertyValue combination. 4013 * 4014 * @param propertyName 4015 * the name of the property 4016 * @param propertyValue 4017 * the name of the property value 4018 * @param result UnicodeSet value to change 4019 * a set to which the characters having the propertyName+propertyValue are to be added. 
4020 * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters 4021 * with that property have been added to the UnicodeSet, and returns false if the 4022 * propertyName+propertyValue combination is not recognized (in which case result is unaltered). 4023 * @draft ICU3.8 (retain) 4024 * @provisional This API might change or be removed in a future release. 4025 */ 4026 public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { 4027 return false; 4028 } 4029 /** 4030 * Supplies default implementation for SymbolTable (no action). 4031 * @draft ICU3.8 (retain) 4032 * @provisional This API might change or be removed in a future release. 4033 */ 4034 @Override 4035 public char[] lookup(String s) { 4036 return null; 4037 } 4038 /** 4039 * Supplies default implementation for SymbolTable (no action). 4040 * @draft ICU3.8 (retain) 4041 * @provisional This API might change or be removed in a future release. 4042 */ 4043 @Override 4044 public String parseReference(String text, ParsePosition pos, int limit) { 4045 return null; 4046 } 4047 } 4048 4049 /** 4050 * Is this frozen, according to the Freezable interface? 4051 * 4052 * @return value 4053 * @stable ICU 3.8 4054 */ 4055 @Override 4056 public boolean isFrozen() { 4057 return (bmpSet != null || stringSpan != null); 4058 } 4059 4060 /** 4061 * Freeze this class, according to the Freezable interface. 4062 * 4063 * @return this 4064 * @stable ICU 4.4 4065 */ 4066 @Override 4067 public UnicodeSet freeze() { 4068 if (!isFrozen()) { 4069 compact(); 4070 4071 // Optimize contains() and span() and similar functions. 4072 if (hasStrings()) { 4073 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), UnicodeSetStringSpan.ALL); 4074 } 4075 if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { 4076 // Optimize for code point spans. 4077 // There are no strings, or 4078 // all strings are irrelevant for span() etc. 
because 4079 // all of each string's code points are contained in this set. 4080 // However, fully contained strings are relevant for spanAndCount(), 4081 // so we create both objects. 4082 bmpSet = new BMPSet(list, len); 4083 } 4084 } 4085 return this; 4086 } 4087 4088 /** 4089 * Span a string using this UnicodeSet. 4090 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4091 * @param s The string to be spanned 4092 * @param spanCondition The span condition 4093 * @return the length of the span 4094 * @stable ICU 4.4 4095 */ 4096 public int span(CharSequence s, SpanCondition spanCondition) { 4097 return span(s, 0, spanCondition); 4098 } 4099 4100 /** 4101 * Span a string using this UnicodeSet. 4102 * If the start index is less than 0, span will start from 0. 4103 * If the start index is greater than the string length, span returns the string length. 4104 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4105 * @param s The string to be spanned 4106 * @param start The start index that the span begins 4107 * @param spanCondition The span condition 4108 * @return the string index which ends the span (i.e. exclusive) 4109 * @stable ICU 4.4 4110 */ 4111 public int span(CharSequence s, int start, SpanCondition spanCondition) { 4112 int end = s.length(); 4113 if (start < 0) { 4114 start = 0; 4115 } else if (start >= end) { 4116 return end; 4117 } 4118 if (bmpSet != null) { 4119 // Frozen set without strings, or no string is relevant for span(). 4120 return bmpSet.span(s, start, spanCondition, null); 4121 } 4122 if (stringSpan != null) { 4123 return stringSpan.span(s, start, spanCondition); 4124 } else if (hasStrings()) { 4125 int which = spanCondition == SpanCondition.NOT_CONTAINED ? 
UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4126 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4127 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4128 if (strSpan.needsStringSpanUTF16()) { 4129 return strSpan.span(s, start, spanCondition); 4130 } 4131 } 4132 4133 return spanCodePointsAndCount(s, start, spanCondition, null); 4134 } 4135 4136 /** 4137 * Same as span() but also counts the smallest number of set elements on any path across the span. 4138 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4139 * @param outCount An output-only object (must not be null) for returning the count. 4140 * @return the limit (exclusive end) of the span 4141 * @internal 4142 * @deprecated This API is ICU internal only. 4143 */ 4144 @Deprecated 4145 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { 4146 if (outCount == null) { 4147 throw new IllegalArgumentException("outCount must not be null"); 4148 } 4149 int end = s.length(); 4150 if (start < 0) { 4151 start = 0; 4152 } else if (start >= end) { 4153 return end; 4154 } 4155 if (stringSpan != null) { 4156 // We might also have bmpSet != null, 4157 // but fully-contained strings are relevant for counting elements. 4158 return stringSpan.spanAndCount(s, start, spanCondition, outCount); 4159 } else if (bmpSet != null) { 4160 return bmpSet.span(s, start, spanCondition, outCount); 4161 } else if (hasStrings()) { 4162 int which = spanCondition == SpanCondition.NOT_CONTAINED ? 
UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4163 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4164 which |= UnicodeSetStringSpan.WITH_COUNT; 4165 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4166 return strSpan.spanAndCount(s, start, spanCondition, outCount); 4167 } 4168 4169 return spanCodePointsAndCount(s, start, spanCondition, outCount); 4170 } 4171 4172 private int spanCodePointsAndCount(CharSequence s, int start, 4173 SpanCondition spanCondition, OutputInt outCount) { 4174 // Pin to 0/1 values. 4175 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4176 4177 int c; 4178 int next = start; 4179 int length = s.length(); 4180 int count = 0; 4181 do { 4182 c = Character.codePointAt(s, next); 4183 if (spanContained != contains(c)) { 4184 break; 4185 } 4186 ++count; 4187 next += Character.charCount(c); 4188 } while (next < length); 4189 if (outCount != null) { outCount.value = count; } 4190 return next; 4191 } 4192 4193 /** 4194 * Span a string backwards (from the end) using this UnicodeSet. 4195 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4196 * @param s The string to be spanned 4197 * @param spanCondition The span condition 4198 * @return The string index which starts the span (i.e. inclusive). 4199 * @stable ICU 4.4 4200 */ 4201 public int spanBack(CharSequence s, SpanCondition spanCondition) { 4202 return spanBack(s, s.length(), spanCondition); 4203 } 4204 4205 /** 4206 * Span a string backwards (from the fromIndex) using this UnicodeSet. 4207 * If the fromIndex is less than 0, spanBack will return 0. 4208 * If fromIndex is greater than the string length, spanBack will start from the string length. 4209 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 
4210 * @param s The string to be spanned 4211 * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards 4212 * @param spanCondition The span condition 4213 * @return The string index which starts the span (i.e. inclusive). 4214 * @stable ICU 4.4 4215 */ 4216 public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { 4217 if (fromIndex <= 0) { 4218 return 0; 4219 } 4220 if (fromIndex > s.length()) { 4221 fromIndex = s.length(); 4222 } 4223 if (bmpSet != null) { 4224 // Frozen set without strings, or no string is relevant for spanBack(). 4225 return bmpSet.spanBack(s, fromIndex, spanCondition); 4226 } 4227 if (stringSpan != null) { 4228 return stringSpan.spanBack(s, fromIndex, spanCondition); 4229 } else if (hasStrings()) { 4230 int which = (spanCondition == SpanCondition.NOT_CONTAINED) 4231 ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED 4232 : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; 4233 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<>(strings), which); 4234 if (strSpan.needsStringSpanUTF16()) { 4235 return strSpan.spanBack(s, fromIndex, spanCondition); 4236 } 4237 } 4238 4239 // Pin to 0/1 values. 4240 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4241 4242 int c; 4243 int prev = fromIndex; 4244 do { 4245 c = Character.codePointBefore(s, prev); 4246 if (spanContained != contains(c)) { 4247 break; 4248 } 4249 prev -= Character.charCount(c); 4250 } while (prev > 0); 4251 return prev; 4252 } 4253 4254 /** 4255 * Clone a thawed version of this class, according to the Freezable interface. 
4256 * @return the clone, not frozen 4257 * @stable ICU 4.4 4258 */ 4259 @Override 4260 public UnicodeSet cloneAsThawed() { 4261 UnicodeSet result = new UnicodeSet(this); 4262 assert !result.isFrozen(); 4263 return result; 4264 } 4265 4266 // internal function 4267 private void checkFrozen() { 4268 if (isFrozen()) { 4269 throw new UnsupportedOperationException("Attempt to modify frozen object"); 4270 } 4271 } 4272 4273 // ************************ 4274 // Additional methods for integration with Generics and Collections 4275 // ************************ 4276 4277 /** 4278 * A struct-like class used for iteration through ranges, for faster iteration than by String. 4279 * Read about the restrictions on usage in {@link UnicodeSet#ranges()}. 4280 * 4281 * @stable ICU 54 4282 */ 4283 public static class EntryRange { 4284 /** 4285 * The starting code point of the range. 4286 * 4287 * @stable ICU 54 4288 */ 4289 public int codepoint; 4290 /** 4291 * The ending code point of the range 4292 * 4293 * @stable ICU 54 4294 */ 4295 public int codepointEnd; 4296 4297 EntryRange() { 4298 } 4299 4300 /** 4301 * {@inheritDoc} 4302 * 4303 * @stable ICU 54 4304 */ 4305 @Override 4306 public String toString() { 4307 StringBuilder b = new StringBuilder(); 4308 return ( 4309 codepoint == codepointEnd ? _appendToPat(b, codepoint, false) 4310 : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false)) 4311 .toString(); 4312 } 4313 } 4314 4315 /** 4316 * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points. 4317 * The UnicodeSet must not be altered during the iteration. 4318 * The EntryRange instance is the same each time; the contents are just reset. 4319 * 4320 * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings. 4321 * 4322 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4323 * Do not alter the UnicodeSet while iterating. 
4324 * 4325 * <pre> 4326 * // Sample code 4327 * for (EntryRange range : us1.ranges()) { 4328 * // do something with code points between range.codepoint and range.codepointEnd; 4329 * } 4330 * for (String s : us1.strings()) { 4331 * // do something with each string; 4332 * } 4333 * </pre> 4334 * 4335 * @stable ICU 54 4336 */ 4337 public Iterable<EntryRange> ranges() { 4338 return new EntryRangeIterable(); 4339 } 4340 4341 private class EntryRangeIterable implements Iterable<EntryRange> { 4342 @Override 4343 public Iterator<EntryRange> iterator() { 4344 return new EntryRangeIterator(); 4345 } 4346 } 4347 4348 private class EntryRangeIterator implements Iterator<EntryRange> { 4349 int pos; 4350 EntryRange result = new EntryRange(); 4351 4352 @Override 4353 public boolean hasNext() { 4354 return pos < len-1; 4355 } 4356 @Override 4357 public EntryRange next() { 4358 if (pos < len-1) { 4359 result.codepoint = list[pos++]; 4360 result.codepointEnd = list[pos++]-1; 4361 } else { 4362 throw new NoSuchElementException(); 4363 } 4364 return result; 4365 } 4366 @Override 4367 public void remove() { 4368 throw new UnsupportedOperationException(); 4369 } 4370 } 4371 4372 4373 /** 4374 * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}. 4375 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4376 * Do not alter the UnicodeSet while iterating. 4377 * @see java.util.Set#iterator() 4378 * @stable ICU 4.4 4379 */ 4380 @Override 4381 public Iterator<String> iterator() { 4382 return new UnicodeSetIterator2(this); 4383 } 4384 4385 // Cover for string iteration. 
4386 private static class UnicodeSetIterator2 implements Iterator<String> { 4387 // Invariants: 4388 // sourceList != null then sourceList[item] is a valid character 4389 // sourceList == null then delegates to stringIterator 4390 private int[] sourceList; 4391 private int len; 4392 private int item; 4393 private int current; 4394 private int limit; 4395 private SortedSet<String> sourceStrings; 4396 private Iterator<String> stringIterator; 4397 private char[] buffer; 4398 4399 UnicodeSetIterator2(UnicodeSet source) { 4400 // set according to invariants 4401 len = source.len - 1; 4402 if (len > 0) { 4403 sourceStrings = source.strings; 4404 sourceList = source.list; 4405 current = sourceList[item++]; 4406 limit = sourceList[item++]; 4407 } else { 4408 stringIterator = source.strings.iterator(); 4409 sourceList = null; 4410 } 4411 } 4412 4413 /* (non-Javadoc) 4414 * @see java.util.Iterator#hasNext() 4415 */ 4416 @Override 4417 public boolean hasNext() { 4418 return sourceList != null || stringIterator.hasNext(); 4419 } 4420 4421 /* (non-Javadoc) 4422 * @see java.util.Iterator#next() 4423 */ 4424 @Override 4425 public String next() { 4426 if (sourceList == null) { 4427 return stringIterator.next(); 4428 } 4429 int codepoint = current++; 4430 // we have the codepoint we need, but we may need to adjust the state 4431 if (current >= limit) { 4432 if (item >= len) { 4433 stringIterator = sourceStrings.iterator(); 4434 sourceList = null; 4435 } else { 4436 current = sourceList[item++]; 4437 limit = sourceList[item++]; 4438 } 4439 } 4440 // Now return. Single code point is easy 4441 if (codepoint <= 0xFFFF) { 4442 return String.valueOf((char)codepoint); 4443 } 4444 // But Java lacks a valueOfCodePoint, so we handle ourselves for speed 4445 // allocate a buffer the first time, to make conversion faster. 
4446 if (buffer == null) { 4447 buffer = new char[2]; 4448 } 4449 // compute ourselves, to save tests and calls 4450 int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT; 4451 buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE); 4452 buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE); 4453 return String.valueOf(buffer); 4454 } 4455 4456 /* (non-Javadoc) 4457 * @see java.util.Iterator#remove() 4458 */ 4459 @Override 4460 public void remove() { 4461 throw new UnsupportedOperationException(); 4462 } 4463 } 4464 4465 /** 4466 * @see #containsAll(com.ibm.icu.text.UnicodeSet) 4467 * @stable ICU 4.4 4468 */ 4469 public <T extends CharSequence> boolean containsAll(Iterable<T> collection) { 4470 for (T o : collection) { 4471 if (!contains(o)) { 4472 return false; 4473 } 4474 } 4475 return true; 4476 } 4477 4478 /** 4479 * @see #containsNone(com.ibm.icu.text.UnicodeSet) 4480 * @stable ICU 4.4 4481 */ 4482 public <T extends CharSequence> boolean containsNone(Iterable<T> collection) { 4483 for (T o : collection) { 4484 if (contains(o)) { 4485 return false; 4486 } 4487 } 4488 return true; 4489 } 4490 4491 /** 4492 * @see #containsAll(com.ibm.icu.text.UnicodeSet) 4493 * @stable ICU 4.4 4494 */ 4495 public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) { 4496 return !containsNone(collection); 4497 } 4498 4499 /** 4500 * @see #addAll(com.ibm.icu.text.UnicodeSet) 4501 * @stable ICU 4.4 4502 */ 4503 @SuppressWarnings("unchecked") // See ticket #11395, this is safe. 4504 public <T extends CharSequence> UnicodeSet addAll(T... 
collection) { 4505 checkFrozen(); 4506 for (T str : collection) { 4507 add(str); 4508 } 4509 return this; 4510 } 4511 4512 4513 /** 4514 * @see #removeAll(com.ibm.icu.text.UnicodeSet) 4515 * @stable ICU 4.4 4516 */ 4517 public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) { 4518 checkFrozen(); 4519 for (T o : collection) { 4520 remove(o); 4521 } 4522 return this; 4523 } 4524 4525 /** 4526 * @see #retainAll(com.ibm.icu.text.UnicodeSet) 4527 * @stable ICU 4.4 4528 */ 4529 public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) { 4530 checkFrozen(); 4531 // TODO optimize 4532 UnicodeSet toRetain = new UnicodeSet(); 4533 toRetain.addAll(collection); 4534 retainAll(toRetain); 4535 return this; 4536 } 4537 4538 /** 4539 * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}. 4540 * @stable ICU 4.4 4541 */ 4542 public enum ComparisonStyle { 4543 /** 4544 * @stable ICU 4.4 4545 */ 4546 SHORTER_FIRST, 4547 /** 4548 * @stable ICU 4.4 4549 */ 4550 LEXICOGRAPHIC, 4551 /** 4552 * @stable ICU 4.4 4553 */ 4554 LONGER_FIRST 4555 } 4556 4557 /** 4558 * Compares UnicodeSets, where shorter come first, and otherwise lexigraphically 4559 * (according to the comparison of the first characters that differ). 4560 * @see java.lang.Comparable#compareTo(java.lang.Object) 4561 * @stable ICU 4.4 4562 */ 4563 @Override 4564 public int compareTo(UnicodeSet o) { 4565 return compareTo(o, ComparisonStyle.SHORTER_FIRST); 4566 } 4567 /** 4568 * Compares UnicodeSets, in three different ways. 4569 * @see java.lang.Comparable#compareTo(java.lang.Object) 4570 * @stable ICU 4.4 4571 */ 4572 public int compareTo(UnicodeSet o, ComparisonStyle style) { 4573 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4574 int diff = size() - o.size(); 4575 if (diff != 0) { 4576 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? 
-1 : 1; 4577 } 4578 } 4579 int result; 4580 for (int i = 0; ; ++i) { 4581 if (0 != (result = list[i] - o.list[i])) { 4582 // if either list ran out, compare to the last string 4583 if (list[i] == HIGH) { 4584 if (!hasStrings()) return 1; 4585 String item = strings.first(); 4586 return compare(item, o.list[i]); 4587 } 4588 if (o.list[i] == HIGH) { 4589 if (!o.hasStrings()) return -1; 4590 String item = o.strings.first(); 4591 int compareResult = compare(item, list[i]); 4592 return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order. 4593 } 4594 // otherwise return the result if even index, or the reversal if not 4595 return (i & 1) == 0 ? result : -result; 4596 } 4597 if (list[i] == HIGH) { 4598 break; 4599 } 4600 } 4601 return compare(strings, o.strings); 4602 } 4603 4604 /** 4605 * @stable ICU 4.4 4606 */ 4607 public int compareTo(Iterable<String> other) { 4608 return compare(this, other); 4609 } 4610 4611 /** 4612 * Utility to compare a string to a code point. 4613 * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString()) 4614 * and comparing, but much faster (no object creation). 4615 * Actually, there is one difference; a null compares as less. 4616 * Note that this (=String) order is UTF-16 order -- *not* code point order. 4617 * @stable ICU 4.4 4618 */ 4619 4620 public static int compare(CharSequence string, int codePoint) { 4621 return CharSequences.compare(string, codePoint); 4622 } 4623 4624 /** 4625 * Utility to compare a string to a code point. 4626 * Same results as turning the code point into a string and comparing, but much faster (no object creation). 4627 * Actually, there is one difference; a null compares as less. 4628 * Note that this (=String) order is UTF-16 order -- *not* code point order. 
4629 * @stable ICU 4.4 4630 */ 4631 public static int compare(int codePoint, CharSequence string) { 4632 return -CharSequences.compare(string, codePoint); 4633 } 4634 4635 4636 /** 4637 * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered, 4638 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4639 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4640 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4641 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4642 * @stable ICU 4.4 4643 */ 4644 public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) { 4645 return compare(collection1.iterator(), collection2.iterator()); 4646 } 4647 4648 /** 4649 * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered, 4650 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration. 4651 * That means that sets can't be compared directly with this method, unless they are TreeSets without 4652 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of 4653 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances. 4654 * @internal 4655 * @deprecated This API is ICU internal only. 4656 */ 4657 @Deprecated 4658 public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) { 4659 while (true) { 4660 if (!first.hasNext()) { 4661 return other.hasNext() ? 
-1 : 0; 4662 } else if (!other.hasNext()) { 4663 return 1; 4664 } 4665 T item1 = first.next(); 4666 T item2 = other.next(); 4667 int result = item1.compareTo(item2); 4668 if (result != 0) { 4669 return result; 4670 } 4671 } 4672 } 4673 4674 4675 /** 4676 * Utility to compare two collections, optionally by size, and then lexicographically. 4677 * @stable ICU 4.4 4678 */ 4679 public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) { 4680 if (style != ComparisonStyle.LEXICOGRAPHIC) { 4681 int diff = collection1.size() - collection2.size(); 4682 if (diff != 0) { 4683 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1; 4684 } 4685 } 4686 return compare(collection1, collection2); 4687 } 4688 4689 /** 4690 * Utility for adding the contents of an iterable to a collection. 4691 * @stable ICU 4.4 4692 */ 4693 public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) { 4694 for (T item : source) { 4695 target.add(item); 4696 } 4697 return target; 4698 } 4699 4700 /** 4701 * Utility for adding the contents of an iterable to a collection. 4702 * @stable ICU 4.4 4703 */ 4704 public static <T> T[] addAllTo(Iterable<T> source, T[] target) { 4705 int i = 0; 4706 for (T item : source) { 4707 target[i++] = item; 4708 } 4709 return target; 4710 } 4711 4712 /** 4713 * For iterating through the strings in the set. Example: 4714 * <pre> 4715 * for (String key : myUnicodeSet.strings()) { 4716 * doSomethingWith(key); 4717 * } 4718 * </pre> 4719 * @stable ICU 4.4 4720 */ 4721 public Collection<String> strings() { 4722 if (hasStrings()) { 4723 return Collections.unmodifiableSortedSet(strings); 4724 } else { 4725 return EMPTY_STRINGS; 4726 } 4727 } 4728 4729 /** 4730 * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE. 4731 * @internal 4732 * @deprecated This API is ICU internal only. 
4733 */ 4734 @Deprecated 4735 public static int getSingleCodePoint(CharSequence s) { 4736 return CharSequences.getSingleCodePoint(s); 4737 } 4738 4739 /** 4740 * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set. 4741 * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E 4742 * if the dontCare set includes unassigned characters (for a particular version of Unicode). 4743 * @param dontCare Set with the don't-care characters for spanning 4744 * @return the input set, modified 4745 * @internal 4746 * @deprecated This API is ICU internal only. 4747 */ 4748 @Deprecated 4749 public UnicodeSet addBridges(UnicodeSet dontCare) { 4750 UnicodeSet notInInput = new UnicodeSet(this).complement(); 4751 for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) { 4752 if (it.codepoint != 0 && it.codepoint != UnicodeSetIterator.IS_STRING && it.codepointEnd != 0x10FFFF && dontCare.contains(it.codepoint,it.codepointEnd)) { 4753 add(it.codepoint,it.codepointEnd); 4754 } 4755 } 4756 return this; 4757 } 4758 4759 /** 4760 * Find the first index at or after fromIndex where the UnicodeSet matches at that index. 4761 * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match. 4762 * If there is no match, length is returned. 4763 * @internal 4764 * @deprecated This API is ICU internal only. Use span instead. 
4765 */ 4766 @Deprecated 4767 public int findIn(CharSequence value, int fromIndex, boolean findNot) { 4768 //TODO add strings, optimize, using ICU4C algorithms 4769 int cp; 4770 for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) { 4771 cp = UTF16.charAt(value, fromIndex); 4772 if (contains(cp) != findNot) { 4773 break; 4774 } 4775 } 4776 return fromIndex; 4777 } 4778 4779 /** 4780 * Find the last index before fromIndex where the UnicodeSet matches at that index. 4781 * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match. 4782 * If there is no match, -1 is returned. 4783 * BEFORE index is not in the UnicodeSet. 4784 * @internal 4785 * @deprecated This API is ICU internal only. Use spanBack instead. 4786 */ 4787 @Deprecated 4788 public int findLastIn(CharSequence value, int fromIndex, boolean findNot) { 4789 //TODO add strings, optimize, using ICU4C algorithms 4790 int cp; 4791 fromIndex -= 1; 4792 for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) { 4793 cp = UTF16.charAt(value, fromIndex); 4794 if (contains(cp) != findNot) { 4795 break; 4796 } 4797 } 4798 return fromIndex < 0 ? -1 : fromIndex; 4799 } 4800 4801 /** 4802 * Strips code points from source. If matches is true, script all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match. 4803 * @param source The source of the CharSequence to strip from. 4804 * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object. 4805 * @return The string after it has been stripped. 4806 * @internal 4807 * @deprecated This API is ICU internal only. Use replaceFrom. 
4808 */ 4809 @Deprecated 4810 public String stripFrom(CharSequence source, boolean matches) { 4811 StringBuilder result = new StringBuilder(); 4812 for (int pos = 0; pos < source.length();) { 4813 int inside = findIn(source, pos, !matches); 4814 result.append(source.subSequence(pos, inside)); 4815 pos = findIn(source, inside, matches); // get next start 4816 } 4817 return result.toString(); 4818 } 4819 4820 /** 4821 * Argument values for whether span() and similar functions continue while the current character is contained vs. 4822 * not contained in the set. 4823 * <p> 4824 * The functionality is straightforward for sets with only single code points, without strings (which is the common 4825 * case): 4826 * <ul> 4827 * <li>CONTAINED and SIMPLE work the same. 4828 * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED. 4829 * <li>span() and spanBack() partition any string the 4830 * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition). 4831 * <li>Using a 4832 * complemented (inverted) set and the opposite span conditions yields the same results. 4833 * </ul> 4834 * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in 4835 * the set (for example, whether they overlap with each other) and the string that is processed. For a set with 4836 * strings: 4837 * <ul> 4838 * <li>The complement of the set contains the opposite set of code points, but the same set of strings. 4839 * Therefore, complementing both the set and the span conditions may yield different results. 4840 * <li>When starting spans 4841 * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 4842 * because a set string may start before the later position. 4843 * <li>span(SIMPLE) may be shorter than 4844 * span(CONTAINED) because it will not recursively try all possible paths. 
For example, with a set which
     * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
     * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
     * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
     * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
     * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
     * </ul>
     * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
     * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
     * be used.
     * <p>
     * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
     * boundaries, never in the middle of a surrogate pair.
     *
     * @stable ICU 4.4
     */
    public enum SpanCondition {
        /**
         * Continues a span() while there is no set element at the current position.
         * Increments by one code point at a time.
         * Stops before the first set element (character or string).
         * (For code points only, this is like while contains(current)==false).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of
         * characters that are not in the set, and none of its strings overlap with the span.
         *
         * @stable ICU 4.4
         */
        NOT_CONTAINED,

        /**
         * Spans the longest substring that is a concatenation of set elements (characters or strings).
         * (For characters only, this is like while contains(current)==true).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of set
         * elements (characters or strings) that are in the set.
         * <p>
         * If a set contains strings, then the span will be the longest substring for which there
         * exists at least one non-overlapping concatenation of set elements (characters or strings).
         * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
         * (Java/ICU/Perl regex stops at the first match of an OR.)
         *
         * @stable ICU 4.4
         */
        CONTAINED,

        /**
         * Continues a span() while there is a set element at the current position.
         * Increments by the longest matching element at each position.
         * (For characters only, this is like while contains(current)==true).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of set
         * elements (characters or strings) that are in the set.
         * <p>
         * If a set only contains single characters, then this is the same as CONTAINED.
         * <p>
         * If a set contains strings, then the span will be the longest substring with a match at each position with the
         * longest single set element (character or string).
         * <p>
         * Use this span condition together with other longest-match algorithms, such as ICU converters
         * (ucnv_getUnicodeSet()).
         *
         * @stable ICU 4.4
         */
        SIMPLE,

        /**
         * One more than the last span condition.
         * <p>
         * Declared after {@link #SIMPLE}, so it is the final constant of this enum.
         * NOTE(review): presumably a count sentinel rather than a condition meant to be passed to span();
         * confirm against the span()/spanBack() implementations.
         *
         * @stable ICU 4.4
         */
        CONDITION_COUNT
    }

    /**
     * Get the default symbol table. Null means ordinary processing. For internal use only.
     * @return the symbol table
     * @internal
     * @deprecated This API is ICU internal only.
*/
    @Deprecated
    public static XSymbolTable getDefaultXSymbolTable() {
        // Returns the current static default unchanged; per the Javadoc, null means ordinary processing.
        return XSYMBOL_TABLE;
    }

    /**
     * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing
     * of UnicodeSets.
     * <p>
     * As a side effect, this method calls {@code CharacterPropertiesImpl.clear()} before storing the new table.
     * <p>
     * WARNING: If this function is used with a UnicodeProperty, and the
     * Unassigned characters (gc=Cn) are different than in ICU, you MUST call
     * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
     * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
     *
     * @param xSymbolTable the new default symbol table.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
        // If the properties override inclusions, these have to be regenerated.
        // TODO: Check if the Unicode Tools or Unicode Utilities really need this.
        // NOTE(review): the clear and the assignment below are two separate steps with no locking
        // visible in this method; confirm callers do not race this with concurrent parsing.
        CharacterPropertiesImpl.clear();
        XSYMBOL_TABLE = xSymbolTable;
    }
}
//eof