1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package android.icu.text; 11 12 import java.io.IOException; 13 import java.text.ParsePosition; 14 import java.util.ArrayList; 15 import java.util.Collection; 16 import java.util.Collections; 17 import java.util.Iterator; 18 import java.util.NoSuchElementException; 19 import java.util.TreeSet; 20 21 import android.icu.impl.BMPSet; 22 import android.icu.impl.Norm2AllModes; 23 import android.icu.impl.PatternProps; 24 import android.icu.impl.RuleCharacterIterator; 25 import android.icu.impl.SortedSetRelation; 26 import android.icu.impl.StringRange; 27 import android.icu.impl.UBiDiProps; 28 import android.icu.impl.UCaseProps; 29 import android.icu.impl.UCharacterProperty; 30 import android.icu.impl.UPropertyAliases; 31 import android.icu.impl.UnicodeSetStringSpan; 32 import android.icu.impl.Utility; 33 import android.icu.lang.CharSequences; 34 import android.icu.lang.UCharacter; 35 import android.icu.lang.UProperty; 36 import android.icu.lang.UScript; 37 import android.icu.util.Freezable; 38 import android.icu.util.ICUUncheckedIOException; 39 import android.icu.util.OutputInt; 40 import android.icu.util.ULocale; 41 import android.icu.util.VersionInfo; 42 43 /** 44 * A mutable set of Unicode characters and multicharacter strings. 45 * Objects of this class represent <em>character classes</em> used 46 * in regular expressions. A character specifies a subset of Unicode 47 * code points. Legal code points are U+0000 to U+10FFFF, inclusive. 
48 * 49 * Note: method freeze() will not only make the set immutable, but 50 * also makes important methods much higher performance: 51 * contains(c), containsNone(...), span(...), spanBack(...) etc. 52 * After the object is frozen, any subsequent call that wants to change 53 * the object will throw UnsupportedOperationException. 54 * 55 * <p>The UnicodeSet class is not designed to be subclassed. 56 * 57 * <p><code>UnicodeSet</code> supports two APIs. The first is the 58 * <em>operand</em> API that allows the caller to modify the value of 59 * a <code>UnicodeSet</code> object. It conforms to Java 2's 60 * <code>java.util.Set</code> interface, although 61 * <code>UnicodeSet</code> does not actually implement that 62 * interface. All methods of <code>Set</code> are supported, with the 63 * modification that they take a character range or single character 64 * instead of an <code>Object</code>, and they take a 65 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 66 * operand API may be thought of in terms of boolean logic: a boolean 67 * OR is implemented by <code>add</code>, a boolean AND is implemented 68 * by <code>retain</code>, a boolean XOR is implemented by 69 * <code>complement</code> taking an argument, and a boolean NOT is 70 * implemented by <code>complement</code> with no argument. In terms 71 * of traditional set theory function names, <code>add</code> is a 72 * union, <code>retain</code> is an intersection, <code>remove</code> 73 * is an asymmetric difference, and <code>complement</code> with no 74 * argument is a set complement with respect to the superset range 75 * <code>MIN_VALUE-MAX_VALUE</code> 76 * 77 * <p>The second API is the 78 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 79 * <code>java.text.Format</code>-derived classes. 
Unlike the 80 * methods that add characters, add categories, and control the logic 81 * of the set, the method <code>applyPattern()</code> sets all 82 * attributes of a <code>UnicodeSet</code> at once, based on a 83 * string pattern. 84 * 85 * <p><b>Pattern syntax</b></p> 86 * 87 * Patterns are accepted by the constructors and the 88 * <code>applyPattern()</code> methods and returned by the 89 * <code>toPattern()</code> method. These patterns follow a syntax 90 * similar to that employed by version 8 regular expression character 91 * classes. Here are some simple examples: 92 * 93 * <blockquote> 94 * <table> 95 * <tr style="vertical-align: top"> 96 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td> 97 * <td style="vertical-align: top;">No characters</td> 98 * </tr><tr style="vertical-align: top"> 99 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td> 100 * <td style="vertical-align: top;">The character 'a'</td> 101 * </tr><tr style="vertical-align: top"> 102 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td> 103 * <td style="vertical-align: top;">The characters 'a' and 'e'</td> 104 * </tr> 105 * <tr> 106 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td> 107 * <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code 108 * point order</td> 109 * </tr> 110 * <tr> 111 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td> 112 * <td style="vertical-align: top;">The character U+4E01</td> 113 * </tr> 114 * <tr> 115 * <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td> 116 * <td style="vertical-align: top;">The character 'a' and the multicharacter strings "ab" and 117 * "ac"</td> 118 * </tr> 119 * <tr> 120 * <td 
style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td>
 * <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td>
 * </tr>
 * </table>
 * </blockquote>
 *
 * Any character may be preceded by a backslash in order to remove any special
 * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are
 * ignored, unless they are escaped.
 *
 * <p>Property patterns specify a set of characters having a certain
 * property as defined by the Unicode standard. Both the POSIX-like
 * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
 * complete list of supported property patterns, see the User's Guide
 * for UnicodeSet at
 * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
 * http://www.icu-project.org/userguide/unicodeSet.html</a>.
 * Actual determination of property data is defined by the underlying
 * Unicode database as implemented by UCharacter.
 *
 * <p>Patterns specify individual characters, ranges of characters, and
 * Unicode property sets. When elements are concatenated, they
 * specify their union. To complement a set, place a '^' immediately
 * after the opening '['. Property patterns are inverted by modifying
 * their delimiters; "[:^foo:]" and "\P{foo}". In any other location,
 * '^' has no special meaning.
 *
 * <p>Ranges are indicated by placing a '-' between two
 * characters, as in "a-z". This specifies the range of all
 * characters from the left to the right, in Unicode order. If the
 * left character is greater than or equal to the
 * right character it is a syntax error. If a '-' occurs as the first
 * character after the opening '[' or '[^', or if it occurs as the
 * last character before the closing ']', then it is taken as a
 * literal.
Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
 * set of three characters, 'a', 'b', and '-'.
 *
 * <p>Sets may be intersected using the '&' operator or the asymmetric
 * set difference may be taken using the '-' operator, for example,
 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
 * with values less than 4096. Operators ('&' and '-') have equal
 * precedence and bind left-to-right. Thus
 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
 * difference; intersection is commutative.
 *
 * <table>
 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a'
 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a'
 * through 'z' and all letters in between, in Unicode order
 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing
 * all characters but 'a' through 'z',
 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
 * <td>The asymmetric difference of sets specified by <em>pat1</em> and
 * <em>pat2</em>
 * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code>
 * <td>The set of characters having the specified
 * Unicode property; in
 * this case, Unicode uppercase letters
 * <tr
style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code> 185 * <td>The set of characters <em>not</em> having the given 186 * Unicode property 187 * </table> 188 * 189 * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p> 190 * 191 * <p><b>Formal syntax</b></p> 192 * 193 * <blockquote> 194 * <table> 195 * <tr style="vertical-align: top"> 196 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern := </code></td> 197 * <td style="vertical-align: top;"><code>('[' '^'? item* ']') | 198 * property</code></td> 199 * </tr> 200 * <tr style="vertical-align: top"> 201 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item := </code></td> 202 * <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br> 203 * </code></td> 204 * </tr> 205 * <tr style="vertical-align: top"> 206 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr := </code></td> 207 * <td style="vertical-align: top;"><code>pattern | pattern-expr pattern | 208 * pattern-expr op pattern<br> 209 * </code></td> 210 * </tr> 211 * <tr style="vertical-align: top"> 212 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op := </code></td> 213 * <td style="vertical-align: top;"><code>'&' | '-'<br> 214 * </code></td> 215 * </tr> 216 * <tr style="vertical-align: top"> 217 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special := </code></td> 218 * <td style="vertical-align: top;"><code>'[' | ']' | '-'<br> 219 * </code></td> 220 * </tr> 221 * <tr style="vertical-align: top"> 222 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char := </code></td> 223 * <td style="vertical-align: top;"><em>any character that is not</em><code> special<br> 224 * | ('\\' </code><em>any character</em><code>)<br> 225 * | ('\u' hex hex hex hex)<br> 226 * </code></td> 227 * </tr> 228 * <tr 
style="vertical-align: top">
 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex := </code></td>
 * <td style="vertical-align: top;"><em>any character for which
 * </em><code>Character.digit(c, 16)</code><em>
 * returns a non-negative result</em></td>
 * </tr>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property := </code></td>
 * <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td>
 * </tr>
 * </table>
 * <br>
 * <table border="1">
 * <tr>
 * <td>Legend: <table>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td>
 * <td style="width: 20; vertical-align: top;"> </td>
 * <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td>
 * </tr>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td>
 * <td style="vertical-align: top;"></td>
 * <td style="vertical-align: top;">zero or one instance of <code>a</code><br>
 * </td>
 * </tr>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td>
 * <td style="vertical-align: top;"></td>
 * <td style="vertical-align: top;">zero or more instances of <code>a</code><br>
 * </td>
 * </tr>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td>
 * <td style="vertical-align: top;"></td>
 * <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br>
 * </td>
 * </tr>
 * <tr>
 * <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td>
 * <td style="vertical-align: top;"></td>
 * <td style="vertical-align: top;">the literal string between the quotes </td>
 * </tr>
 * </table>
 * </td>
 * </tr>
 * </table>
 * </blockquote>
 * <p>To iterate over contents of UnicodeSet, the following are available:
 *
<ul><li>{@link #ranges()} to iterate through the ranges</li>
 * <li>{@link #strings()} to iterate through the strings</li>
 * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
 * That method is, however, not particularly efficient, since it "boxes" each code point into a String.
 * </ul>
 * All of the above can be used in <b>for</b> loops.
 * The {@link android.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
 *
 * @author Alan Liu
 * @see UnicodeSetIterator
 * @see UnicodeSetSpanner
 */
public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> {

    /**
     * Constant for the empty set. The instance is frozen, so it cannot be modified.
     */
    public static final UnicodeSet EMPTY = new UnicodeSet().freeze();
    /**
     * Constant for the set of all code points, U+0000 through U+10FFFF. The instance is frozen.
     * (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.)
     */
    public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze();

    private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the function processing

    private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
    private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
    // 110000 for codepoints

    /**
     * Minimum value that can be stored in a UnicodeSet.
     */
    public static final int MIN_VALUE = LOW;

    /**
     * Maximum value that can be stored in a UnicodeSet.
     */
    public static final int MAX_VALUE = HIGH - 1;

    // Number of entries of 'list' that are in use, including the HIGH terminator
    // (an empty set therefore has len == 1).
    private int len; // length used; list may be longer to minimize reallocs
    // The inversion list: alternating range starts and range limits (end + 1),
    // in ascending order.
    private int[] list; // MUST be terminated with HIGH
    private int[] rangeList; // internal buffer
    private int[] buffer; // internal buffer

    // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
    // is not private so that UnicodeSetIterator can get access
    TreeSet<String> strings = new TreeSet<String>();

    /**
     * The pattern representation of this set. This may not be the
     * most economical pattern. It is the pattern supplied to
     * applyPattern(), with variables substituted and whitespace
     * removed. For sets constructed without applyPattern(), or
     * modified using the non-pattern API, this string will be null,
     * indicating that toPattern() must generate a pattern
     * representation from the inversion list.
     */
    private String pat = null;

    private static final int START_EXTRA = 16; // initial storage. Must be >= 0
    private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0

    // Special property set IDs
    private static final String ANY_ID = "ANY"; // [\u0000-\U0010FFFF]
    private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
    private static final String ASSIGNED = "Assigned"; // [:^Cn:]

    /**
     * A set of all characters _except_ the second through last characters of
     * certain ranges. These ranges are ranges of characters whose
     * properties are all exactly alike, e.g. CJK Ideographs from
     * U+4E00 to U+9FA5.
     */
    private static UnicodeSet INCLUSIONS[] = null;

    private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
354 private volatile UnicodeSetStringSpan stringSpan; 355 //---------------------------------------------------------------- 356 // Public API 357 //---------------------------------------------------------------- 358 359 /** 360 * Constructs an empty set. 361 */ 362 public UnicodeSet() { 363 list = new int[1 + START_EXTRA]; 364 list[len++] = HIGH; 365 } 366 367 /** 368 * Constructs a copy of an existing set. 369 */ 370 public UnicodeSet(UnicodeSet other) { 371 set(other); 372 } 373 374 /** 375 * Constructs a set containing the given range. If <code>end > 376 * start</code> then an empty set is created. 377 * 378 * @param start first character, inclusive, of range 379 * @param end last character, inclusive, of range 380 */ 381 public UnicodeSet(int start, int end) { 382 this(); 383 complement(start, end); 384 } 385 386 /** 387 * Quickly constructs a set from a set of ranges <s0, e0, s1, e1, s2, e2, ..., sn, en>. 388 * There must be an even number of integers, and they must be all greater than zero, 389 * all less than or equal to Character.MAX_CODE_POINT. 390 * In each pair (..., si, ei, ...) it must be true that si <= ei 391 * Between adjacent pairs (...ei, sj...), it must be true that ei+1 < sj 392 * @param pairs pairs of character representing ranges 393 */ 394 public UnicodeSet(int... pairs) { 395 if ((pairs.length & 1) != 0) { 396 throw new IllegalArgumentException("Must have even number of integers"); 397 } 398 list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set. 399 len = list.length; 400 int last = -1; // used to ensure that the results are monotonically increasing. 
401 int i = 0; 402 while (i < pairs.length) { 403 // start of pair 404 int start = pairs[i]; 405 if (last >= start) { 406 throw new IllegalArgumentException("Must be monotonically increasing."); 407 } 408 list[i++] = last = start; 409 // end of pair 410 int end = pairs[i] + 1; 411 if (last >= end) { 412 throw new IllegalArgumentException("Must be monotonically increasing."); 413 } 414 list[i++] = last = end; 415 } 416 list[i] = HIGH; // terminate 417 } 418 419 /** 420 * Constructs a set from the given pattern. See the class description 421 * for the syntax of the pattern language. Whitespace is ignored. 422 * @param pattern a string specifying what characters are in the set 423 * @exception java.lang.IllegalArgumentException if the pattern contains 424 * a syntax error. 425 */ 426 public UnicodeSet(String pattern) { 427 this(); 428 applyPattern(pattern, null, null, IGNORE_SPACE); 429 } 430 431 /** 432 * Constructs a set from the given pattern. See the class description 433 * for the syntax of the pattern language. 434 * @param pattern a string specifying what characters are in the set 435 * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters 436 * @exception java.lang.IllegalArgumentException if the pattern contains 437 * a syntax error. 438 */ 439 public UnicodeSet(String pattern, boolean ignoreWhitespace) { 440 this(); 441 applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 442 } 443 444 /** 445 * Constructs a set from the given pattern. See the class description 446 * for the syntax of the pattern language. 447 * @param pattern a string specifying what characters are in the set 448 * @param options a bitmask indicating which options to apply. 449 * Valid options are IGNORE_SPACE and CASE. 450 * @exception java.lang.IllegalArgumentException if the pattern contains 451 * a syntax error. 
452 */ 453 public UnicodeSet(String pattern, int options) { 454 this(); 455 applyPattern(pattern, null, null, options); 456 } 457 458 /** 459 * Constructs a set from the given pattern. See the class description 460 * for the syntax of the pattern language. 461 * @param pattern a string specifying what characters are in the set 462 * @param pos on input, the position in pattern at which to start parsing. 463 * On output, the position after the last character parsed. 464 * @param symbols a symbol table mapping variables to char[] arrays 465 * and chars to UnicodeSets 466 * @exception java.lang.IllegalArgumentException if the pattern 467 * contains a syntax error. 468 */ 469 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) { 470 this(); 471 applyPattern(pattern, pos, symbols, IGNORE_SPACE); 472 } 473 474 /** 475 * Constructs a set from the given pattern. See the class description 476 * for the syntax of the pattern language. 477 * @param pattern a string specifying what characters are in the set 478 * @param pos on input, the position in pattern at which to start parsing. 479 * On output, the position after the last character parsed. 480 * @param symbols a symbol table mapping variables to char[] arrays 481 * and chars to UnicodeSets 482 * @param options a bitmask indicating which options to apply. 483 * Valid options are IGNORE_SPACE and CASE. 484 * @exception java.lang.IllegalArgumentException if the pattern 485 * contains a syntax error. 486 */ 487 public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) { 488 this(); 489 applyPattern(pattern, pos, symbols, options); 490 } 491 492 493 /** 494 * Return a new set that is equivalent to this one. 
495 */ 496 @Override 497 public Object clone() { 498 if (isFrozen()) { 499 return this; 500 } 501 UnicodeSet result = new UnicodeSet(this); 502 result.bmpSet = this.bmpSet; 503 result.stringSpan = this.stringSpan; 504 return result; 505 } 506 507 /** 508 * Make this object represent the range <code>start - end</code>. 509 * If <code>end > start</code> then this object is set to an 510 * an empty range. 511 * 512 * @param start first character in the set, inclusive 513 * @param end last character in the set, inclusive 514 */ 515 public UnicodeSet set(int start, int end) { 516 checkFrozen(); 517 clear(); 518 complement(start, end); 519 return this; 520 } 521 522 /** 523 * Make this object represent the same set as <code>other</code>. 524 * @param other a <code>UnicodeSet</code> whose value will be 525 * copied to this object 526 */ 527 public UnicodeSet set(UnicodeSet other) { 528 checkFrozen(); 529 list = other.list.clone(); 530 len = other.len; 531 pat = other.pat; 532 strings = new TreeSet<String>(other.strings); 533 return this; 534 } 535 536 /** 537 * Modifies this set to represent the set specified by the given pattern. 538 * See the class description for the syntax of the pattern language. 539 * Whitespace is ignored. 540 * @param pattern a string specifying what characters are in the set 541 * @exception java.lang.IllegalArgumentException if the pattern 542 * contains a syntax error. 543 */ 544 public final UnicodeSet applyPattern(String pattern) { 545 checkFrozen(); 546 return applyPattern(pattern, null, null, IGNORE_SPACE); 547 } 548 549 /** 550 * Modifies this set to represent the set specified by the given pattern, 551 * optionally ignoring whitespace. 552 * See the class description for the syntax of the pattern language. 
553 * @param pattern a string specifying what characters are in the set 554 * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored 555 * @exception java.lang.IllegalArgumentException if the pattern 556 * contains a syntax error. 557 */ 558 public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) { 559 checkFrozen(); 560 return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0); 561 } 562 563 /** 564 * Modifies this set to represent the set specified by the given pattern, 565 * optionally ignoring whitespace. 566 * See the class description for the syntax of the pattern language. 567 * @param pattern a string specifying what characters are in the set 568 * @param options a bitmask indicating which options to apply. 569 * Valid options are IGNORE_SPACE and CASE. 570 * @exception java.lang.IllegalArgumentException if the pattern 571 * contains a syntax error. 572 */ 573 public UnicodeSet applyPattern(String pattern, int options) { 574 checkFrozen(); 575 return applyPattern(pattern, null, null, options); 576 } 577 578 /** 579 * Return true if the given position, in the given pattern, appears 580 * to be the start of a UnicodeSet pattern. 581 * @hide unsupported on Android 582 */ 583 public static boolean resemblesPattern(String pattern, int pos) { 584 return ((pos+1) < pattern.length() && 585 pattern.charAt(pos) == '[') || 586 resemblesPropertyPattern(pattern, pos); 587 } 588 589 /** 590 * TODO: create Appendable version of UTF16.append(buf, c), 591 * maybe in new class Appendables? 592 * @throws IOException 593 */ 594 private static void appendCodePoint(Appendable app, int c) { 595 assert 0 <= c && c <= 0x10ffff; 596 try { 597 if (c <= 0xffff) { 598 app.append((char) c); 599 } else { 600 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c)); 601 } 602 } catch (IOException e) { 603 throw new ICUUncheckedIOException(e); 604 } 605 } 606 607 /** 608 * TODO: create class Appendables? 
609 * @throws IOException 610 */ 611 private static void append(Appendable app, CharSequence s) { 612 try { 613 app.append(s); 614 } catch (IOException e) { 615 throw new ICUUncheckedIOException(e); 616 } 617 } 618 619 /** 620 * Append the <code>toPattern()</code> representation of a 621 * string to the given <code>Appendable</code>. 622 */ 623 private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) { 624 int cp; 625 for (int i = 0; i < s.length(); i += Character.charCount(cp)) { 626 cp = s.codePointAt(i); 627 _appendToPat(buf, cp, escapeUnprintable); 628 } 629 return buf; 630 } 631 632 /** 633 * Append the <code>toPattern()</code> representation of a 634 * character to the given <code>Appendable</code>. 635 */ 636 private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) { 637 try { 638 if (escapeUnprintable && Utility.isUnprintable(c)) { 639 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything 640 // unprintable 641 if (Utility.escapeUnprintable(buf, c)) { 642 return buf; 643 } 644 } 645 // Okay to let ':' pass through 646 switch (c) { 647 case '[': // SET_OPEN: 648 case ']': // SET_CLOSE: 649 case '-': // HYPHEN: 650 case '^': // COMPLEMENT: 651 case '&': // INTERSECTION: 652 case '\\': //BACKSLASH: 653 case '{': 654 case '}': 655 case '$': 656 case ':': 657 buf.append('\\'); 658 break; 659 default: 660 // Escape whitespace 661 if (PatternProps.isWhiteSpace(c)) { 662 buf.append('\\'); 663 } 664 break; 665 } 666 appendCodePoint(buf, c); 667 return buf; 668 } catch (IOException e) { 669 throw new ICUUncheckedIOException(e); 670 } 671 } 672 673 /** 674 * Returns a string representation of this set. If the result of 675 * calling this function is passed to a UnicodeSet constructor, it 676 * will produce another set that is equal to this one. 
677 */ 678 @Override 679 public String toPattern(boolean escapeUnprintable) { 680 if (pat != null && !escapeUnprintable) { 681 return pat; 682 } 683 StringBuilder result = new StringBuilder(); 684 return _toPattern(result, escapeUnprintable).toString(); 685 } 686 687 /** 688 * Append a string representation of this set to result. This will be 689 * a cleaned version of the string passed to applyPattern(), if there 690 * is one. Otherwise it will be generated. 691 */ 692 private <T extends Appendable> T _toPattern(T result, 693 boolean escapeUnprintable) { 694 if (pat == null) { 695 return appendNewPattern(result, escapeUnprintable, true); 696 } 697 try { 698 if (!escapeUnprintable) { 699 result.append(pat); 700 return result; 701 } 702 boolean oddNumberOfBackslashes = false; 703 for (int i=0; i<pat.length(); ) { 704 int c = pat.codePointAt(i); 705 i += Character.charCount(c); 706 if (Utility.isUnprintable(c)) { 707 // If the unprintable character is preceded by an odd 708 // number of backslashes, then it has been escaped 709 // and we omit the last backslash. 710 Utility.escapeUnprintable(result, c); 711 oddNumberOfBackslashes = false; 712 } else if (!oddNumberOfBackslashes && c == '\\') { 713 // Temporarily withhold an odd-numbered backslash. 714 oddNumberOfBackslashes = true; 715 } else { 716 if (oddNumberOfBackslashes) { 717 result.append('\\'); 718 } 719 appendCodePoint(result, c); 720 oddNumberOfBackslashes = false; 721 } 722 } 723 if (oddNumberOfBackslashes) { 724 result.append('\\'); 725 } 726 return result; 727 } catch (IOException e) { 728 throw new ICUUncheckedIOException(e); 729 } 730 } 731 732 /** 733 * Generate and append a string representation of this set to result. 734 * This does not use this.pat, the cleaned up copy of the string 735 * passed to applyPattern(). 
736 * @param result the buffer into which to generate the pattern 737 * @param escapeUnprintable escape unprintable characters if true 738 */ 739 public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) { 740 return _generatePattern(result, escapeUnprintable, true); 741 } 742 743 /** 744 * Generate and append a string representation of this set to result. 745 * This does not use this.pat, the cleaned up copy of the string 746 * passed to applyPattern(). 747 * @param includeStrings if false, doesn't include the strings. 748 */ 749 public StringBuffer _generatePattern(StringBuffer result, 750 boolean escapeUnprintable, boolean includeStrings) { 751 return appendNewPattern(result, escapeUnprintable, includeStrings); 752 } 753 754 private <T extends Appendable> T appendNewPattern( 755 T result, boolean escapeUnprintable, boolean includeStrings) { 756 try { 757 result.append('['); 758 759 int count = getRangeCount(); 760 761 // If the set contains at least 2 intervals and includes both 762 // MIN_VALUE and MAX_VALUE, then the inverse representation will 763 // be more economical. 
764 if (count > 1 && 765 getRangeStart(0) == MIN_VALUE && 766 getRangeEnd(count-1) == MAX_VALUE) { 767 768 // Emit the inverse 769 result.append('^'); 770 771 for (int i = 1; i < count; ++i) { 772 int start = getRangeEnd(i-1)+1; 773 int end = getRangeStart(i)-1; 774 _appendToPat(result, start, escapeUnprintable); 775 if (start != end) { 776 if ((start+1) != end) { 777 result.append('-'); 778 } 779 _appendToPat(result, end, escapeUnprintable); 780 } 781 } 782 } 783 784 // Default; emit the ranges as pairs 785 else { 786 for (int i = 0; i < count; ++i) { 787 int start = getRangeStart(i); 788 int end = getRangeEnd(i); 789 _appendToPat(result, start, escapeUnprintable); 790 if (start != end) { 791 if ((start+1) != end) { 792 result.append('-'); 793 } 794 _appendToPat(result, end, escapeUnprintable); 795 } 796 } 797 } 798 799 if (includeStrings && strings.size() > 0) { 800 for (String s : strings) { 801 result.append('{'); 802 _appendToPat(result, s, escapeUnprintable); 803 result.append('}'); 804 } 805 } 806 result.append(']'); 807 return result; 808 } catch (IOException e) { 809 throw new ICUUncheckedIOException(e); 810 } 811 } 812 813 /** 814 * Returns the number of elements in this set (its cardinality) 815 * Note than the elements of a set may include both individual 816 * codepoints and strings. 817 * 818 * @return the number of elements in this set (its cardinality). 819 */ 820 public int size() { 821 int n = 0; 822 int count = getRangeCount(); 823 for (int i = 0; i < count; ++i) { 824 n += getRangeEnd(i) - getRangeStart(i) + 1; 825 } 826 return n + strings.size(); 827 } 828 829 /** 830 * Returns <tt>true</tt> if this set contains no elements. 831 * 832 * @return <tt>true</tt> if this set contains no elements. 833 */ 834 public boolean isEmpty() { 835 return len == 1 && strings.size() == 0; 836 } 837 838 /** 839 * Implementation of UnicodeMatcher API. Returns <tt>true</tt> if 840 * this set contains any character whose low byte is the given 841 * value. 
This is used by <tt>RuleBasedTransliterator</tt> for 842 * indexing. 843 */ 844 @Override 845 public boolean matchesIndexValue(int v) { 846 /* The index value v, in the range [0,255], is contained in this set if 847 * it is contained in any pair of this set. Pairs either have the high 848 * bytes equal, or unequal. If the high bytes are equal, then we have 849 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= 850 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. 851 * Then v is contained if xx <= v || v <= yy. (This is identical to the 852 * time zone month containment logic.) 853 */ 854 for (int i=0; i<getRangeCount(); ++i) { 855 int low = getRangeStart(i); 856 int high = getRangeEnd(i); 857 if ((low & ~0xFF) == (high & ~0xFF)) { 858 if ((low & 0xFF) <= v && v <= (high & 0xFF)) { 859 return true; 860 } 861 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { 862 return true; 863 } 864 } 865 if (strings.size() != 0) { 866 for (String s : strings) { 867 //if (s.length() == 0) { 868 // // Empty strings match everything 869 // return true; 870 //} 871 // assert(s.length() != 0); // We enforce this elsewhere 872 int c = UTF16.charAt(s, 0); 873 if ((c & 0xFF) == v) { 874 return true; 875 } 876 } 877 } 878 return false; 879 } 880 881 /** 882 * Implementation of UnicodeMatcher.matches(). Always matches the 883 * longest possible multichar string. 884 */ 885 @Override 886 public int matches(Replaceable text, 887 int[] offset, 888 int limit, 889 boolean incremental) { 890 891 if (offset[0] == limit) { 892 // Strings, if any, have length != 0, so we don't worry 893 // about them here. If we ever allow zero-length strings 894 // we much check for them here. 895 if (contains(UnicodeMatcher.ETHER)) { 896 return incremental ? 
U_PARTIAL_MATCH : U_MATCH; 897 } else { 898 return U_MISMATCH; 899 } 900 } else { 901 if (strings.size() != 0) { // try strings first 902 903 // might separate forward and backward loops later 904 // for now they are combined 905 906 // TODO Improve efficiency of this, at least in the forward 907 // direction, if not in both. In the forward direction we 908 // can assume the strings are sorted. 909 910 boolean forward = offset[0] < limit; 911 912 // firstChar is the leftmost char to match in the 913 // forward direction or the rightmost char to match in 914 // the reverse direction. 915 char firstChar = text.charAt(offset[0]); 916 917 // If there are multiple strings that can match we 918 // return the longest match. 919 int highWaterLength = 0; 920 921 for (String trial : strings) { 922 //if (trial.length() == 0) { 923 // return U_MATCH; // null-string always matches 924 //} 925 // assert(trial.length() != 0); // We ensure this elsewhere 926 927 char c = trial.charAt(forward ? 0 : trial.length() - 1); 928 929 // Strings are sorted, so we can optimize in the 930 // forward direction. 931 if (forward && c > firstChar) break; 932 if (c != firstChar) continue; 933 934 int length = matchRest(text, offset[0], limit, trial); 935 936 if (incremental) { 937 int maxLen = forward ? limit-offset[0] : offset[0]-limit; 938 if (length == maxLen) { 939 // We have successfully matched but only up to limit. 940 return U_PARTIAL_MATCH; 941 } 942 } 943 944 if (length == trial.length()) { 945 // We have successfully matched the whole string. 946 if (length > highWaterLength) { 947 highWaterLength = length; 948 } 949 // In the forward direction we know strings 950 // are sorted so we can bail early. 951 if (forward && length < highWaterLength) { 952 break; 953 } 954 continue; 955 } 956 } 957 958 // We've checked all strings without a partial match. 959 // If we have full matches, return the longest one. 960 if (highWaterLength != 0) { 961 offset[0] += forward ? 
highWaterLength : -highWaterLength; 962 return U_MATCH; 963 } 964 } 965 return super.matches(text, offset, limit, incremental); 966 } 967 } 968 969 /** 970 * Returns the longest match for s in text at the given position. 971 * If limit > start then match forward from start+1 to limit 972 * matching all characters except s.charAt(0). If limit < start, 973 * go backward starting from start-1 matching all characters 974 * except s.charAt(s.length()-1). This method assumes that the 975 * first character, text.charAt(start), matches s, so it does not 976 * check it. 977 * @param text the text to match 978 * @param start the first character to match. In the forward 979 * direction, text.charAt(start) is matched against s.charAt(0). 980 * In the reverse direction, it is matched against 981 * s.charAt(s.length()-1). 982 * @param limit the limit offset for matching, either last+1 in 983 * the forward direction, or last-1 in the reverse direction, 984 * where last is the index of the last character to match. 985 * @return If part of s matches up to the limit, return |limit - 986 * start|. If all of s matches before reaching the limit, return 987 * s.length(). If there is a mismatch between s and text, return 988 * 0 989 */ 990 private static int matchRest (Replaceable text, int start, int limit, String s) { 991 int maxLen; 992 int slen = s.length(); 993 if (start < limit) { 994 maxLen = limit - start; 995 if (maxLen > slen) maxLen = slen; 996 for (int i = 1; i < maxLen; ++i) { 997 if (text.charAt(start + i) != s.charAt(i)) return 0; 998 } 999 } else { 1000 maxLen = start - limit; 1001 if (maxLen > slen) maxLen = slen; 1002 --slen; // <=> slen = s.length() - 1; 1003 for (int i = 1; i < maxLen; ++i) { 1004 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 1005 } 1006 } 1007 return maxLen; 1008 } 1009 1010 /** 1011 * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1. 
1012 * @deprecated This API is ICU internal only. 1013 * @hide original deprecated declaration 1014 * @hide draft / provisional / internal are hidden on Android 1015 */ 1016 @Deprecated 1017 public int matchesAt(CharSequence text, int offset) { 1018 int lastLen = -1; 1019 strings: 1020 if (strings.size() != 0) { 1021 char firstChar = text.charAt(offset); 1022 String trial = null; 1023 // find the first string starting with firstChar 1024 Iterator<String> it = strings.iterator(); 1025 while (it.hasNext()) { 1026 trial = it.next(); 1027 char firstStringChar = trial.charAt(0); 1028 if (firstStringChar < firstChar) continue; 1029 if (firstStringChar > firstChar) break strings; 1030 } 1031 1032 // now keep checking string until we get the longest one 1033 for (;;) { 1034 int tempLen = matchesAt(text, offset, trial); 1035 if (lastLen > tempLen) break strings; 1036 lastLen = tempLen; 1037 if (!it.hasNext()) break; 1038 trial = it.next(); 1039 } 1040 } 1041 1042 if (lastLen < 2) { 1043 int cp = UTF16.charAt(text, offset); 1044 if (contains(cp)) lastLen = UTF16.getCharCount(cp); 1045 } 1046 1047 return offset+lastLen; 1048 } 1049 1050 /** 1051 * Does one string contain another, starting at a specific offset? 1052 * @param text text to match 1053 * @param offsetInText offset within that text 1054 * @param substring substring to match at offset in text 1055 * @return -1 if match fails, otherwise other.length() 1056 */ 1057 // Note: This method was moved from CollectionUtilities 1058 private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) { 1059 int len = substring.length(); 1060 int textLength = text.length(); 1061 if (textLength + offsetInText > len) { 1062 return -1; 1063 } 1064 int i = 0; 1065 for (int j = offsetInText; i < len; ++i, ++j) { 1066 char pc = substring.charAt(i); 1067 char tc = text.charAt(j); 1068 if (pc != tc) return -1; 1069 } 1070 return i; 1071 } 1072 1073 /** 1074 * Implementation of UnicodeMatcher API. 
Union the set of all 1075 * characters that may be matched by this object into the given 1076 * set. 1077 * @param toUnionTo the set into which to union the source characters 1078 */ 1079 @Override 1080 public void addMatchSetTo(UnicodeSet toUnionTo) { 1081 toUnionTo.addAll(this); 1082 } 1083 1084 /** 1085 * Returns the index of the given character within this set, where 1086 * the set is ordered by ascending code point. If the character 1087 * is not in this set, return -1. The inverse of this method is 1088 * <code>charAt()</code>. 1089 * @return an index from 0..size()-1, or -1 1090 */ 1091 public int indexOf(int c) { 1092 if (c < MIN_VALUE || c > MAX_VALUE) { 1093 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1094 } 1095 int i = 0; 1096 int n = 0; 1097 for (;;) { 1098 int start = list[i++]; 1099 if (c < start) { 1100 return -1; 1101 } 1102 int limit = list[i++]; 1103 if (c < limit) { 1104 return n + c - start; 1105 } 1106 n += limit - start; 1107 } 1108 } 1109 1110 /** 1111 * Returns the character at the given index within this set, where 1112 * the set is ordered by ascending code point. If the index is 1113 * out of range, return -1. The inverse of this method is 1114 * <code>indexOf()</code>. 1115 * @param index an index from 0..size()-1 1116 * @return the character at the given index, or -1. 1117 */ 1118 public int charAt(int index) { 1119 if (index >= 0) { 1120 // len2 is the largest even integer <= len, that is, it is len 1121 // for even values and len-1 for odd values. With odd values 1122 // the last entry is UNICODESET_HIGH. 1123 int len2 = len & ~1; 1124 for (int i=0; i < len2;) { 1125 int start = list[i++]; 1126 int count = list[i++] - start; 1127 if (index < count) { 1128 return start + index; 1129 } 1130 index -= count; 1131 } 1132 } 1133 return -1; 1134 } 1135 1136 /** 1137 * Adds the specified range to this set if it is not already 1138 * present. 
If this set already contains the specified range, 1139 * the call leaves this set unchanged. If <code>end > start</code> 1140 * then an empty range is added, leaving the set unchanged. 1141 * 1142 * @param start first character, inclusive, of range to be added 1143 * to this set. 1144 * @param end last character, inclusive, of range to be added 1145 * to this set. 1146 */ 1147 public UnicodeSet add(int start, int end) { 1148 checkFrozen(); 1149 return add_unchecked(start, end); 1150 } 1151 1152 /** 1153 * Adds all characters in range (uses preferred naming convention). 1154 * @param start The index of where to start on adding all characters. 1155 * @param end The index of where to end on adding all characters. 1156 * @return a reference to this object 1157 */ 1158 public UnicodeSet addAll(int start, int end) { 1159 checkFrozen(); 1160 return add_unchecked(start, end); 1161 } 1162 1163 // for internal use, after checkFrozen has been called 1164 private UnicodeSet add_unchecked(int start, int end) { 1165 if (start < MIN_VALUE || start > MAX_VALUE) { 1166 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1167 } 1168 if (end < MIN_VALUE || end > MAX_VALUE) { 1169 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1170 } 1171 if (start < end) { 1172 add(range(start, end), 2, 0); 1173 } else if (start == end) { 1174 add(start); 1175 } 1176 return this; 1177 } 1178 1179 // /** 1180 // * Format out the inversion list as a string, for debugging. Uncomment when 1181 // * needed. 
//     */
//    public final String dump() {
//        StringBuffer buf = new StringBuffer("[");
//        for (int i=0; i<len; ++i) {
//            if (i != 0) buf.append(", ");
//            int c = list[i];
//            //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
//            //    buf.append((char) c);
//            //} else {
//                buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
//            //}
//        }
//        buf.append("]");
//        return buf.toString();
//    }

    /**
     * Adds the specified character to this set if it is not already
     * present.  If this set already contains the specified character,
     * the call leaves this set unchanged.
     * @param c the code point to add
     * @return this object, for chaining
     * @throws IllegalArgumentException if c is not in MIN_VALUE..MAX_VALUE
     */
    public final UnicodeSet add(int c) {
        checkFrozen();
        return add_unchecked(c);
    }

    // for internal use only, after checkFrozen has been called
    // Inserts c directly into the inversion list, extending or merging the
    // neighbouring ranges when c is adjacent to them.
    private final UnicodeSet add_unchecked(int c) {
        if (c < MIN_VALUE || c > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
        }

        // find smallest i such that c < list[i]
        // if odd, then it is IN the set
        // if even, then it is OUT of the set
        int i = findCodePoint(c);

        // already in set?
        if ((i & 1) != 0) return this;

        // HIGH is 0x110000
        // assert(list[len-1] == HIGH);

        // empty = [HIGH]
        // [start_0, limit_0, start_1, limit_1, HIGH]

        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
        //                             ^
        //                             list[i]

        // i == 0 means c is before the first range
        // NOTE(review): an earlier TODO asked whether "list[i]-1" is a typo.
        // It does not appear to be: when list[i] is the HIGH terminator
        // (0x110000), list[i]-1 is 0x10FFFF, so the "c == MAX_VALUE" branch
        // below is reachable (assuming MAX_VALUE is 0x10FFFF, as the class
        // documentation states).
        if (c == list[i]-1) {
            // c is before start of next range
            list[i] = c;
            // if we touched the HIGH mark, then add a new one
            if (c == MAX_VALUE) {
                ensureCapacity(len+1);
                list[len++] = HIGH;
            }
            if (i > 0 && c == list[i-1]) {
                // collapse adjacent ranges

                // [..., start_k-1, c, c, limit_k, ..., HIGH]
                //                     ^
                //                     list[i]
                // Drop the two equal entries list[i-1] and list[i] by shifting
                // the tail two slots to the left.
                System.arraycopy(list, i+1, list, i-1, len-i-1);
                len -= 2;
            }
        }

        else if (i > 0 && c == list[i-1]) {
            // c is after end of prior range
            list[i-1]++;
            // no need to check for collapse here
        }

        else {
            // At this point we know the new char is not adjacent to
            // any existing ranges, and it is not 10FFFF.


            // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // Don't use ensureCapacity() to save on copying.
            // NOTE: This has no measurable impact on performance,
            // but it might help in some usage patterns.
            if (len+2 > list.length) {
                // Grow into a new array, leaving a two-slot gap at i.
                int[] temp = new int[len + 2 + GROW_EXTRA];
                if (i != 0) System.arraycopy(list, 0, temp, 0, i);
                System.arraycopy(list, i, temp, i+2, len-i);
                list = temp;
            } else {
                // Enough room: open a two-slot gap at i in place.
                System.arraycopy(list, i, list, i+2, len-i);
            }

            list[i] = c;
            list[i+1] = c+1;
            len += 2;
        }

        // The set changed, so the cached pattern is no longer valid.
        pat = null;
        return this;
    }

    /**
     * Adds the specified multicharacter to this set if it is not already
     * present.  If this set already contains the multicharacter,
     * the call leaves this set unchanged.
1300 * Thus "ch" => {"ch"} 1301 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1302 * @param s the source string 1303 * @return this object, for chaining 1304 */ 1305 public final UnicodeSet add(CharSequence s) { 1306 checkFrozen(); 1307 int cp = getSingleCP(s); 1308 if (cp < 0) { 1309 strings.add(s.toString()); 1310 pat = null; 1311 } else { 1312 add_unchecked(cp, cp); 1313 } 1314 return this; 1315 } 1316 1317 /** 1318 * Utility for getting code point from single code point CharSequence. 1319 * See the public UTF16.getSingleCodePoint() 1320 * @return a code point IF the string consists of a single one. 1321 * otherwise returns -1. 1322 * @param s to test 1323 */ 1324 private static int getSingleCP(CharSequence s) { 1325 if (s.length() < 1) { 1326 throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); 1327 } 1328 if (s.length() > 2) return -1; 1329 if (s.length() == 1) return s.charAt(0); 1330 1331 // at this point, len = 2 1332 int cp = UTF16.charAt(s, 0); 1333 if (cp > 0xFFFF) { // is surrogate pair 1334 return cp; 1335 } 1336 return -1; 1337 } 1338 1339 /** 1340 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1341 * If this set already any particular character, it has no effect on that character. 1342 * @param s the source string 1343 * @return this object, for chaining 1344 */ 1345 public final UnicodeSet addAll(CharSequence s) { 1346 checkFrozen(); 1347 int cp; 1348 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1349 cp = UTF16.charAt(s, i); 1350 add_unchecked(cp, cp); 1351 } 1352 return this; 1353 } 1354 1355 /** 1356 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1357 * If this set already any particular character, it has no effect on that character. 
1358 * @param s the source string 1359 * @return this object, for chaining 1360 */ 1361 public final UnicodeSet retainAll(CharSequence s) { 1362 return retainAll(fromAll(s)); 1363 } 1364 1365 /** 1366 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1367 * If this set already any particular character, it has no effect on that character. 1368 * @param s the source string 1369 * @return this object, for chaining 1370 */ 1371 public final UnicodeSet complementAll(CharSequence s) { 1372 return complementAll(fromAll(s)); 1373 } 1374 1375 /** 1376 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1377 * If this set already any particular character, it has no effect on that character. 1378 * @param s the source string 1379 * @return this object, for chaining 1380 */ 1381 public final UnicodeSet removeAll(CharSequence s) { 1382 return removeAll(fromAll(s)); 1383 } 1384 1385 /** 1386 * Remove all strings from this UnicodeSet 1387 * @return this object, for chaining 1388 */ 1389 public final UnicodeSet removeAllStrings() { 1390 checkFrozen(); 1391 if (strings.size() != 0) { 1392 strings.clear(); 1393 pat = null; 1394 } 1395 return this; 1396 } 1397 1398 /** 1399 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1400 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1401 * @param s the source string 1402 * @return a newly created set containing the given string 1403 */ 1404 public static UnicodeSet from(CharSequence s) { 1405 return new UnicodeSet().add(s); 1406 } 1407 1408 1409 /** 1410 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1411 * @param s the source string 1412 * @return a newly created set containing the given characters 1413 */ 1414 public static UnicodeSet fromAll(CharSequence s) { 1415 return new UnicodeSet().addAll(s); 1416 } 1417 1418 1419 /** 1420 * Retain only the elements in this set that are contained in the 1421 * specified range. 
If <code>end > start</code> then an empty range is 1422 * retained, leaving the set empty. 1423 * 1424 * @param start first character, inclusive, of range to be retained 1425 * to this set. 1426 * @param end last character, inclusive, of range to be retained 1427 * to this set. 1428 */ 1429 public UnicodeSet retain(int start, int end) { 1430 checkFrozen(); 1431 if (start < MIN_VALUE || start > MAX_VALUE) { 1432 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1433 } 1434 if (end < MIN_VALUE || end > MAX_VALUE) { 1435 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1436 } 1437 if (start <= end) { 1438 retain(range(start, end), 2, 0); 1439 } else { 1440 clear(); 1441 } 1442 return this; 1443 } 1444 1445 /** 1446 * Retain the specified character from this set if it is present. 1447 * Upon return this set will be empty if it did not contain c, or 1448 * will only contain c if it did contain c. 1449 * @param c the character to be retained 1450 * @return this object, for chaining 1451 */ 1452 public final UnicodeSet retain(int c) { 1453 return retain(c, c); 1454 } 1455 1456 /** 1457 * Retain the specified string in this set if it is present. 1458 * Upon return this set will be empty if it did not contain s, or 1459 * will only contain s if it did contain s. 1460 * @param cs the string to be retained 1461 * @return this object, for chaining 1462 */ 1463 public final UnicodeSet retain(CharSequence cs) { 1464 1465 int cp = getSingleCP(cs); 1466 if (cp < 0) { 1467 String s = cs.toString(); 1468 boolean isIn = strings.contains(s); 1469 if (isIn && size() == 1) { 1470 return this; 1471 } 1472 clear(); 1473 strings.add(s); 1474 pat = null; 1475 } else { 1476 retain(cp, cp); 1477 } 1478 return this; 1479 } 1480 1481 /** 1482 * Removes the specified range from this set if it is present. 1483 * The set will not contain the specified range once the call 1484 * returns. 
If <code>end > start</code> then an empty range is 1485 * removed, leaving the set unchanged. 1486 * 1487 * @param start first character, inclusive, of range to be removed 1488 * from this set. 1489 * @param end last character, inclusive, of range to be removed 1490 * from this set. 1491 */ 1492 public UnicodeSet remove(int start, int end) { 1493 checkFrozen(); 1494 if (start < MIN_VALUE || start > MAX_VALUE) { 1495 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1496 } 1497 if (end < MIN_VALUE || end > MAX_VALUE) { 1498 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1499 } 1500 if (start <= end) { 1501 retain(range(start, end), 2, 2); 1502 } 1503 return this; 1504 } 1505 1506 /** 1507 * Removes the specified character from this set if it is present. 1508 * The set will not contain the specified character once the call 1509 * returns. 1510 * @param c the character to be removed 1511 * @return this object, for chaining 1512 */ 1513 public final UnicodeSet remove(int c) { 1514 return remove(c, c); 1515 } 1516 1517 /** 1518 * Removes the specified string from this set if it is present. 1519 * The set will not contain the specified string once the call 1520 * returns. 1521 * @param s the string to be removed 1522 * @return this object, for chaining 1523 */ 1524 public final UnicodeSet remove(CharSequence s) { 1525 int cp = getSingleCP(s); 1526 if (cp < 0) { 1527 strings.remove(s.toString()); 1528 pat = null; 1529 } else { 1530 remove(cp, cp); 1531 } 1532 return this; 1533 } 1534 1535 /** 1536 * Complements the specified range in this set. Any character in 1537 * the range will be removed if it is in this set, or will be 1538 * added if it is not in this set. If <code>end > start</code> 1539 * then an empty range is complemented, leaving the set unchanged. 1540 * 1541 * @param start first character, inclusive, of range to be removed 1542 * from this set. 
1543 * @param end last character, inclusive, of range to be removed 1544 * from this set. 1545 */ 1546 public UnicodeSet complement(int start, int end) { 1547 checkFrozen(); 1548 if (start < MIN_VALUE || start > MAX_VALUE) { 1549 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1550 } 1551 if (end < MIN_VALUE || end > MAX_VALUE) { 1552 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1553 } 1554 if (start <= end) { 1555 xor(range(start, end), 2, 0); 1556 } 1557 pat = null; 1558 return this; 1559 } 1560 1561 /** 1562 * Complements the specified character in this set. The character 1563 * will be removed if it is in this set, or will be added if it is 1564 * not in this set. 1565 */ 1566 public final UnicodeSet complement(int c) { 1567 return complement(c, c); 1568 } 1569 1570 /** 1571 * This is equivalent to 1572 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1573 */ 1574 public UnicodeSet complement() { 1575 checkFrozen(); 1576 if (list[0] == LOW) { 1577 System.arraycopy(list, 1, list, 0, len-1); 1578 --len; 1579 } else { 1580 ensureCapacity(len+1); 1581 System.arraycopy(list, 0, list, 1, len); 1582 list[0] = LOW; 1583 ++len; 1584 } 1585 pat = null; 1586 return this; 1587 } 1588 1589 /** 1590 * Complement the specified string in this set. 1591 * The set will not contain the specified string once the call 1592 * returns. 
1593 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1594 * @param s the string to complement 1595 * @return this object, for chaining 1596 */ 1597 public final UnicodeSet complement(CharSequence s) { 1598 checkFrozen(); 1599 int cp = getSingleCP(s); 1600 if (cp < 0) { 1601 String s2 = s.toString(); 1602 if (strings.contains(s2)) { 1603 strings.remove(s2); 1604 } else { 1605 strings.add(s2); 1606 } 1607 pat = null; 1608 } else { 1609 complement(cp, cp); 1610 } 1611 return this; 1612 } 1613 1614 /** 1615 * Returns true if this set contains the given character. 1616 * @param c character to be checked for containment 1617 * @return true if the test condition is met 1618 */ 1619 @Override 1620 public boolean contains(int c) { 1621 if (c < MIN_VALUE || c > MAX_VALUE) { 1622 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); 1623 } 1624 if (bmpSet != null) { 1625 return bmpSet.contains(c); 1626 } 1627 if (stringSpan != null) { 1628 return stringSpan.contains(c); 1629 } 1630 1631 /* 1632 // Set i to the index of the start item greater than ch 1633 // We know we will terminate without length test! 1634 int i = -1; 1635 while (true) { 1636 if (c < list[++i]) break; 1637 } 1638 */ 1639 1640 int i = findCodePoint(c); 1641 1642 return ((i & 1) != 0); // return true if odd 1643 } 1644 1645 /** 1646 * Returns the smallest value i such that c < list[i]. Caller 1647 * must ensure that c is a legal value or this method will enter 1648 * an infinite loop. This method performs a binary search. 
* @param c a character in the range MIN_VALUE..MAX_VALUE
     * inclusive
     * @return the smallest integer i in the range 0..len-1,
     * inclusive, such that c < list[i]
     */
    private final int findCodePoint(int c) {
        /* Examples:
                                           findCodePoint(c)
           set              list[]         c=0 1 3 4 7 8
           ===              ==============   ===========
           []               [110000]         0 0 0 0 0 0
           [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
           [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
           [:all:]          [0, 110000]      1 1 1 1 1 1
         */

        // Return the smallest i such that c < list[i].  Assume
        // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
        if (c < list[0]) return 0;
        // High runner test.  c is often after the last range, so an
        // initial check for this condition pays off.
        if (len >= 2 && c >= list[len-2]) return len-1;
        int lo = 0;
        int hi = len - 1;
        // invariant: c >= list[lo]
        // invariant: c < list[hi]
        for (;;) {
            // unsigned shift keeps the midpoint correct even if lo + hi overflowed
            int i = (lo + hi) >>> 1;
            // lo and hi are adjacent: hi is the smallest index with c < list[hi]
            if (i == lo) return hi;
            if (c < list[i]) {
                hi = i;
            } else {
                lo = i;
            }
        }
    }

    //    //----------------------------------------------------------------
    //    // Unrolled binary search
    //    //----------------------------------------------------------------
    //
    //    private int validLen = -1; // validated value of len
    //    private int topOfLow;
    //    private int topOfHigh;
    //    private int power;
    //    private int deltaStart;
    //
    //    private void validate() {
    //        if (len <= 1) {
    //            throw new IllegalArgumentException("list.len==" + len + "; must be >1");
    //        }
    //
    //        // find greatest power of 2 less than or equal to len
    //        for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
    //
    //        // assert(exp2[power] <= len);
    //
    //        // determine the starting points
    //        topOfLow = exp2[power] - 1;
    //        topOfHigh = len - 1;
    //        deltaStart = exp2[power-1];
    //        validLen = len;
    //    }
    //
    //    private static final int exp2[] = {
    //        0x1, 0x2, 0x4, 0x8,
    //        0x10, 0x20, 0x40, 0x80,
    //        0x100, 0x200, 0x400, 0x800,
    //        0x1000, 0x2000, 0x4000, 0x8000,
    //        0x10000, 0x20000, 0x40000, 0x80000,
    //        0x100000, 0x200000, 0x400000, 0x800000,
    //        0x1000000, 0x2000000, 0x4000000, 0x8000000,
    //        0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
    //    };
    //
    //    /**
    //     * Unrolled lowest index GT.
    //     */
    //    private final int leastIndexGT(int searchValue) {
    //
    //        if (len != validLen) {
    //            if (len == 1) return 0;
    //            validate();
    //        }
    //        int temp;
    //
    //        // set up initial range to search. Each subrange is a power of two in length
    //        int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
    //
    //        // Completely unrolled binary search, following "Programming Pearls"
    //        // Each case deliberately falls through to the next
    //        // Logically, list[-1] < all_search_values && list[count] > all_search_values
    //        // although the values -1 and count are never actually touched.
    //
    //        // The bounds at each point are low & high,
    //        // where low == high - delta*2
    //        // so high - delta is the midpoint
    //
    //        // The invariant AFTER each line is that list[low] < searchValue <= list[high]
    //
    //        switch (power) {
    //        //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
    //        case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
    //        case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
    //
    //        case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
    //        case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
    //        case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
    //        case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
    //
    //        case 24: if (searchValue < list[temp = high-  0x800000]) high = temp;
    //        case 23: if (searchValue < list[temp = high-  0x400000]) high = temp;
    //        case 22: if (searchValue < list[temp = high-  0x200000]) high = temp;
    //        case 21: if (searchValue < list[temp = high-  0x100000]) high = temp;
    //
    //        case 20: if (searchValue < list[temp = high-   0x80000]) high = temp;
    //        case 19: if (searchValue < list[temp = high-   0x40000]) high = temp;
    //        case 18: if (searchValue < list[temp = high-   0x20000]) high = temp;
    //        case 17: if (searchValue < list[temp = high-   0x10000]) high = temp;
    //
    //        case 16: if (searchValue < list[temp = high-    0x8000]) high = temp;
    //        case 15: if (searchValue < list[temp = high-    0x4000]) high = temp;
    //        case 14: if (searchValue < list[temp = high-    0x2000]) high = temp;
    //        case 13: if (searchValue < list[temp = high-    0x1000]) high = temp;
    //
    //        case 12: if (searchValue < list[temp = high-     0x800]) high = temp;
    //        case 11: if (searchValue < list[temp = high-     0x400]) high = temp;
    //        case 10: if (searchValue < list[temp = high-     0x200]) high = temp;
    //        case  9: if (searchValue < list[temp = high-     0x100]) high = temp;
    //
    //        case  8: if (searchValue < list[temp = high-      0x80]) high = temp;
    //        case  7: if (searchValue < list[temp = high-      0x40]) high = temp;
    //        case  6: if (searchValue < list[temp = high-      0x20]) high = temp;
    //        case  5: if (searchValue < list[temp = high-      0x10]) high = temp;
    //
    //        case  4: if (searchValue < list[temp = high-       0x8]) high = temp;
    //        case  3: if (searchValue < list[temp = high-       0x4]) high = temp;
    //        case  2: if (searchValue < list[temp = high-       0x2]) high = temp;
    //        case  1: if (searchValue < list[temp = high-       0x1]) high = temp;
    //        }
    //
    //        return high;
    //    }
    //
    //    // For debugging only
    //    public int len() {
    //        return len;
    //    }
    //
    //    //----------------------------------------------------------------
    //    //----------------------------------------------------------------

    /**
     * Returns true if this set contains every character
     * of the given range.
     * @param start first character, inclusive, of the range
     * @param end last character, inclusive, of the range
     * @return true if the test condition is met
     * @throws IllegalArgumentException if start or end is not in
     * MIN_VALUE..MAX_VALUE
     */
    public boolean contains(int start, int end) {
        if (start < MIN_VALUE || start > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
        }
        if (end < MIN_VALUE || end > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
        }
        //int i = -1;
        //while (true) {
        //    if (start < list[++i]) break;
        //}
        int i = findCodePoint(start);
        // An odd i means start lies inside a range whose limit is list[i];
        // the whole range is contained if end is below that limit as well.
        return ((i & 1) != 0 && end < list[i]);
    }

    /**
     * Returns <tt>true</tt> if this set contains the given
     * multicharacter string.
1826 * @param s string to be checked for containment 1827 * @return <tt>true</tt> if this set contains the specified string 1828 */ 1829 public final boolean contains(CharSequence s) { 1830 1831 int cp = getSingleCP(s); 1832 if (cp < 0) { 1833 return strings.contains(s.toString()); 1834 } else { 1835 return contains(cp); 1836 } 1837 } 1838 1839 /** 1840 * Returns true if this set contains all the characters and strings 1841 * of the given set. 1842 * @param b set to be checked for containment 1843 * @return true if the test condition is met 1844 */ 1845 public boolean containsAll(UnicodeSet b) { 1846 // The specified set is a subset if all of its pairs are contained in 1847 // this set. This implementation accesses the lists directly for speed. 1848 // TODO: this could be faster if size() were cached. But that would affect building speed 1849 // so it needs investigation. 1850 int[] listB = b.list; 1851 boolean needA = true; 1852 boolean needB = true; 1853 int aPtr = 0; 1854 int bPtr = 0; 1855 int aLen = len - 1; 1856 int bLen = b.len - 1; 1857 int startA = 0, startB = 0, limitA = 0, limitB = 0; 1858 while (true) { 1859 // double iterations are such a pain... 1860 if (needA) { 1861 if (aPtr >= aLen) { 1862 // ran out of A. If B is also exhausted, then break; 1863 if (needB && bPtr >= bLen) { 1864 break; 1865 } 1866 return false; 1867 } 1868 startA = list[aPtr++]; 1869 limitA = list[aPtr++]; 1870 } 1871 if (needB) { 1872 if (bPtr >= bLen) { 1873 // ran out of B. 
Since we got this far, we have an A and we are ok so far 1874 break; 1875 } 1876 startB = listB[bPtr++]; 1877 limitB = listB[bPtr++]; 1878 } 1879 // if B doesn't overlap and is greater than A, get new A 1880 if (startB >= limitA) { 1881 needA = true; 1882 needB = false; 1883 continue; 1884 } 1885 // if B is wholy contained in A, then get a new B 1886 if (startB >= startA && limitB <= limitA) { 1887 needA = false; 1888 needB = true; 1889 continue; 1890 } 1891 // all other combinations mean we fail 1892 return false; 1893 } 1894 1895 if (!strings.containsAll(b.strings)) return false; 1896 return true; 1897 } 1898 1899 // /** 1900 // * Returns true if this set contains all the characters and strings 1901 // * of the given set. 1902 // * @param c set to be checked for containment 1903 // * @return true if the test condition is met 1904 // * @stable ICU 2.0 1905 // */ 1906 // public boolean containsAllOld(UnicodeSet c) { 1907 // // The specified set is a subset if all of its pairs are contained in 1908 // // this set. It's possible to code this more efficiently in terms of 1909 // // direct manipulation of the inversion lists if the need arises. 1910 // int n = c.getRangeCount(); 1911 // for (int i=0; i<n; ++i) { 1912 // if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 1913 // return false; 1914 // } 1915 // } 1916 // if (!strings.containsAll(c.strings)) return false; 1917 // return true; 1918 // } 1919 1920 /** 1921 * Returns true if there is a partition of the string such that this set contains each of the partitioned strings. 
1922 * For example, for the Unicode set [a{bc}{cd}]<br> 1923 * containsAll is true for each of: "a", "bc", ""cdbca"<br> 1924 * containsAll is false for each of: "acb", "bcda", "bcx"<br> 1925 * @param s string containing characters to be checked for containment 1926 * @return true if the test condition is met 1927 */ 1928 public boolean containsAll(String s) { 1929 int cp; 1930 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { 1931 cp = UTF16.charAt(s, i); 1932 if (!contains(cp)) { 1933 if (strings.size() == 0) { 1934 return false; 1935 } 1936 return containsAll(s, 0); 1937 } 1938 } 1939 return true; 1940 } 1941 1942 /** 1943 * Recursive routine called if we fail to find a match in containsAll, and there are strings 1944 * @param s source string 1945 * @param i point to match to the end on 1946 * @return true if ok 1947 */ 1948 private boolean containsAll(String s, int i) { 1949 if (i >= s.length()) { 1950 return true; 1951 } 1952 int cp= UTF16.charAt(s, i); 1953 if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) { 1954 return true; 1955 } 1956 for (String setStr : strings) { 1957 if (s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) { 1958 return true; 1959 } 1960 } 1961 return false; 1962 1963 } 1964 1965 /** 1966 * Get the Regex equivalent for this UnicodeSet 1967 * @return regex pattern equivalent to this UnicodeSet 1968 * @deprecated This API is ICU internal only. 
1969 * @hide original deprecated declaration 1970 * @hide draft / provisional / internal are hidden on Android 1971 */ 1972 @Deprecated 1973 public String getRegexEquivalent() { 1974 if (strings.size() == 0) { 1975 return toString(); 1976 } 1977 StringBuilder result = new StringBuilder("(?:"); 1978 appendNewPattern(result, true, false); 1979 for (String s : strings) { 1980 result.append('|'); 1981 _appendToPat(result, s, true); 1982 } 1983 return result.append(")").toString(); 1984 } 1985 1986 /** 1987 * Returns true if this set contains none of the characters 1988 * of the given range. 1989 * @param start first character, inclusive, of the range 1990 * @param end last character, inclusive, of the range 1991 * @return true if the test condition is met 1992 */ 1993 public boolean containsNone(int start, int end) { 1994 if (start < MIN_VALUE || start > MAX_VALUE) { 1995 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); 1996 } 1997 if (end < MIN_VALUE || end > MAX_VALUE) { 1998 throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); 1999 } 2000 int i = -1; 2001 while (true) { 2002 if (start < list[++i]) break; 2003 } 2004 return ((i & 1) == 0 && end < list[i]); 2005 } 2006 2007 /** 2008 * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2009 * For example, for the Unicode set [a{bc}{cd}]<br> 2010 * containsNone is true for: "xy", "cb"<br> 2011 * containsNone is false for: "a", "bc", "bcd"<br> 2012 * @param b set to be checked for containment 2013 * @return true if the test condition is met 2014 */ 2015 public boolean containsNone(UnicodeSet b) { 2016 // The specified set is a subset if some of its pairs overlap with some of this set's pairs. 2017 // This implementation accesses the lists directly for speed. 
2018 int[] listB = b.list; 2019 boolean needA = true; 2020 boolean needB = true; 2021 int aPtr = 0; 2022 int bPtr = 0; 2023 int aLen = len - 1; 2024 int bLen = b.len - 1; 2025 int startA = 0, startB = 0, limitA = 0, limitB = 0; 2026 while (true) { 2027 // double iterations are such a pain... 2028 if (needA) { 2029 if (aPtr >= aLen) { 2030 // ran out of A: break so we test strings 2031 break; 2032 } 2033 startA = list[aPtr++]; 2034 limitA = list[aPtr++]; 2035 } 2036 if (needB) { 2037 if (bPtr >= bLen) { 2038 // ran out of B: break so we test strings 2039 break; 2040 } 2041 startB = listB[bPtr++]; 2042 limitB = listB[bPtr++]; 2043 } 2044 // if B is higher than any part of A, get new A 2045 if (startB >= limitA) { 2046 needA = true; 2047 needB = false; 2048 continue; 2049 } 2050 // if A is higher than any part of B, get new B 2051 if (startA >= limitB) { 2052 needA = false; 2053 needB = true; 2054 continue; 2055 } 2056 // all other combinations mean we fail 2057 return false; 2058 } 2059 2060 if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false; 2061 return true; 2062 } 2063 2064 // /** 2065 // * Returns true if none of the characters or strings in this UnicodeSet appears in the string. 2066 // * For example, for the Unicode set [a{bc}{cd}]<br> 2067 // * containsNone is true for: "xy", "cb"<br> 2068 // * containsNone is false for: "a", "bc", "bcd"<br> 2069 // * @param c set to be checked for containment 2070 // * @return true if the test condition is met 2071 // * @stable ICU 2.0 2072 // */ 2073 // public boolean containsNoneOld(UnicodeSet c) { 2074 // // The specified set is a subset if all of its pairs are contained in 2075 // // this set. It's possible to code this more efficiently in terms of 2076 // // direct manipulation of the inversion lists if the need arises. 
2077 // int n = c.getRangeCount(); 2078 // for (int i=0; i<n; ++i) { 2079 // if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 2080 // return false; 2081 // } 2082 // } 2083 // if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false; 2084 // return true; 2085 // } 2086 2087 /** 2088 * Returns true if this set contains none of the characters 2089 * of the given string. 2090 * @param s string containing characters to be checked for containment 2091 * @return true if the test condition is met 2092 */ 2093 public boolean containsNone(CharSequence s) { 2094 return span(s, SpanCondition.NOT_CONTAINED) == s.length(); 2095 } 2096 2097 /** 2098 * Returns true if this set contains one or more of the characters 2099 * in the given range. 2100 * @param start first character, inclusive, of the range 2101 * @param end last character, inclusive, of the range 2102 * @return true if the condition is met 2103 */ 2104 public final boolean containsSome(int start, int end) { 2105 return !containsNone(start, end); 2106 } 2107 2108 /** 2109 * Returns true if this set contains one or more of the characters 2110 * and strings of the given set. 2111 * @param s set to be checked for containment 2112 * @return true if the condition is met 2113 */ 2114 public final boolean containsSome(UnicodeSet s) { 2115 return !containsNone(s); 2116 } 2117 2118 /** 2119 * Returns true if this set contains one or more of the characters 2120 * of the given string. 2121 * @param s string containing characters to be checked for containment 2122 * @return true if the condition is met 2123 */ 2124 public final boolean containsSome(CharSequence s) { 2125 return !containsNone(s); 2126 } 2127 2128 2129 /** 2130 * Adds all of the elements in the specified set to this set if 2131 * they're not already present. This operation effectively 2132 * modifies this set so that its value is the <i>union</i> of the two 2133 * sets. 
The behavior of this operation is unspecified if the specified 2134 * collection is modified while the operation is in progress. 2135 * 2136 * @param c set whose elements are to be added to this set. 2137 */ 2138 public UnicodeSet addAll(UnicodeSet c) { 2139 checkFrozen(); 2140 add(c.list, c.len, 0); 2141 strings.addAll(c.strings); 2142 return this; 2143 } 2144 2145 /** 2146 * Retains only the elements in this set that are contained in the 2147 * specified set. In other words, removes from this set all of 2148 * its elements that are not contained in the specified set. This 2149 * operation effectively modifies this set so that its value is 2150 * the <i>intersection</i> of the two sets. 2151 * 2152 * @param c set that defines which elements this set will retain. 2153 */ 2154 public UnicodeSet retainAll(UnicodeSet c) { 2155 checkFrozen(); 2156 retain(c.list, c.len, 0); 2157 strings.retainAll(c.strings); 2158 return this; 2159 } 2160 2161 /** 2162 * Removes from this set all of its elements that are contained in the 2163 * specified set. This operation effectively modifies this 2164 * set so that its value is the <i>asymmetric set difference</i> of 2165 * the two sets. 2166 * 2167 * @param c set that defines which elements will be removed from 2168 * this set. 2169 */ 2170 public UnicodeSet removeAll(UnicodeSet c) { 2171 checkFrozen(); 2172 retain(c.list, c.len, 2); 2173 strings.removeAll(c.strings); 2174 return this; 2175 } 2176 2177 /** 2178 * Complements in this set all elements contained in the specified 2179 * set. Any character in the other set will be removed if it is 2180 * in this set, or will be added if it is not in this set. 2181 * 2182 * @param c set that defines which elements will be complemented from 2183 * this set. 
2184 */ 2185 public UnicodeSet complementAll(UnicodeSet c) { 2186 checkFrozen(); 2187 xor(c.list, c.len, 0); 2188 SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings); 2189 return this; 2190 } 2191 2192 /** 2193 * Removes all of the elements from this set. This set will be 2194 * empty after this call returns. 2195 */ 2196 public UnicodeSet clear() { 2197 checkFrozen(); 2198 list[0] = HIGH; 2199 len = 1; 2200 pat = null; 2201 strings.clear(); 2202 return this; 2203 } 2204 2205 /** 2206 * Iteration method that returns the number of ranges contained in 2207 * this set. 2208 * @see #getRangeStart 2209 * @see #getRangeEnd 2210 */ 2211 public int getRangeCount() { 2212 return len/2; 2213 } 2214 2215 /** 2216 * Iteration method that returns the first character in the 2217 * specified range of this set. 2218 * @exception ArrayIndexOutOfBoundsException if index is outside 2219 * the range <code>0..getRangeCount()-1</code> 2220 * @see #getRangeCount 2221 * @see #getRangeEnd 2222 */ 2223 public int getRangeStart(int index) { 2224 return list[index*2]; 2225 } 2226 2227 /** 2228 * Iteration method that returns the last character in the 2229 * specified range of this set. 2230 * @exception ArrayIndexOutOfBoundsException if index is outside 2231 * the range <code>0..getRangeCount()-1</code> 2232 * @see #getRangeStart 2233 * @see #getRangeEnd 2234 */ 2235 public int getRangeEnd(int index) { 2236 return (list[index*2 + 1] - 1); 2237 } 2238 2239 /** 2240 * Reallocate this objects internal structures to take up the least 2241 * possible space, without changing this object's value. 2242 */ 2243 public UnicodeSet compact() { 2244 checkFrozen(); 2245 if (len != list.length) { 2246 int[] temp = new int[len]; 2247 System.arraycopy(list, 0, temp, 0, len); 2248 list = temp; 2249 } 2250 rangeList = null; 2251 buffer = null; 2252 return this; 2253 } 2254 2255 /** 2256 * Compares the specified object with this set for equality. 
Returns 2257 * <tt>true</tt> if the specified object is also a set, the two sets 2258 * have the same size, and every member of the specified set is 2259 * contained in this set (or equivalently, every member of this set is 2260 * contained in the specified set). 2261 * 2262 * @param o Object to be compared for equality with this set. 2263 * @return <tt>true</tt> if the specified Object is equal to this set. 2264 */ 2265 @Override 2266 public boolean equals(Object o) { 2267 if (o == null) { 2268 return false; 2269 } 2270 if (this == o) { 2271 return true; 2272 } 2273 try { 2274 UnicodeSet that = (UnicodeSet) o; 2275 if (len != that.len) return false; 2276 for (int i = 0; i < len; ++i) { 2277 if (list[i] != that.list[i]) return false; 2278 } 2279 if (!strings.equals(that.strings)) return false; 2280 } catch (Exception e) { 2281 return false; 2282 } 2283 return true; 2284 } 2285 2286 /** 2287 * Returns the hash code value for this set. 2288 * 2289 * @return the hash code value for this set. 2290 * @see java.lang.Object#hashCode() 2291 */ 2292 @Override 2293 public int hashCode() { 2294 int result = len; 2295 for (int i = 0; i < len; ++i) { 2296 result *= 1000003; 2297 result += list[i]; 2298 } 2299 return result; 2300 } 2301 2302 /** 2303 * Return a programmer-readable string representation of this object. 2304 */ 2305 @Override 2306 public String toString() { 2307 return toPattern(true); 2308 } 2309 2310 //---------------------------------------------------------------- 2311 // Implementation: Pattern parsing 2312 //---------------------------------------------------------------- 2313 2314 /** 2315 * Parses the given pattern, starting at the given position. The character 2316 * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. 2317 * Parsing continues until the corresponding closing ']'. If a syntax error 2318 * is encountered between the opening and closing brace, the parse fails. 
2319 * Upon return from a successful parse, the ParsePosition is updated to 2320 * point to the character following the closing ']', and an inversion 2321 * list for the parsed pattern is returned. This method 2322 * calls itself recursively to parse embedded subpatterns. 2323 * 2324 * @param pattern the string containing the pattern to be parsed. The 2325 * portion of the string from pos.getIndex(), which must be a '[', to the 2326 * corresponding closing ']', is parsed. 2327 * @param pos upon entry, the position at which to being parsing. The 2328 * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return 2329 * from a successful parse, pos.getIndex() is either the character after the 2330 * closing ']' of the parsed pattern, or pattern.length() if the closing ']' 2331 * is the last character of the pattern string. 2332 * @return an inversion list for the parsed substring 2333 * of <code>pattern</code> 2334 * @exception java.lang.IllegalArgumentException if the parse fails. 2335 * @deprecated This API is ICU internal only. 2336 * @hide original deprecated declaration 2337 * @hide draft / provisional / internal are hidden on Android 2338 */ 2339 @Deprecated 2340 public UnicodeSet applyPattern(String pattern, 2341 ParsePosition pos, 2342 SymbolTable symbols, 2343 int options) { 2344 2345 // Need to build the pattern in a temporary string because 2346 // _applyPattern calls add() etc., which set pat to empty. 
2347 boolean parsePositionWasNull = pos == null; 2348 if (parsePositionWasNull) { 2349 pos = new ParsePosition(0); 2350 } 2351 2352 StringBuilder rebuiltPat = new StringBuilder(); 2353 RuleCharacterIterator chars = 2354 new RuleCharacterIterator(pattern, symbols, pos); 2355 applyPattern(chars, symbols, rebuiltPat, options); 2356 if (chars.inVariable()) { 2357 syntaxError(chars, "Extra chars in variable value"); 2358 } 2359 pat = rebuiltPat.toString(); 2360 if (parsePositionWasNull) { 2361 int i = pos.getIndex(); 2362 2363 // Skip over trailing whitespace 2364 if ((options & IGNORE_SPACE) != 0) { 2365 i = PatternProps.skipWhiteSpace(pattern, i); 2366 } 2367 2368 if (i != pattern.length()) { 2369 throw new IllegalArgumentException("Parse of \"" + pattern + 2370 "\" failed at " + i); 2371 } 2372 } 2373 return this; 2374 } 2375 2376 // Add constants to make the applyPattern() code easier to follow. 2377 2378 private static final int LAST0_START = 0, 2379 LAST1_RANGE = 1, 2380 LAST2_SET = 2; 2381 2382 private static final int MODE0_NONE = 0, 2383 MODE1_INBRACKET = 1, 2384 MODE2_OUTBRACKET = 2; 2385 2386 private static final int SETMODE0_NONE = 0, 2387 SETMODE1_UNICODESET = 1, 2388 SETMODE2_PROPERTYPAT = 2, 2389 SETMODE3_PREPARSED = 3; 2390 2391 /** 2392 * Parse the pattern from the given RuleCharacterIterator. The 2393 * iterator is advanced over the parsed pattern. 2394 * @param chars iterator over the pattern characters. Upon return 2395 * it will be advanced to the first character after the parsed 2396 * pattern, or the end of the iteration if all characters are 2397 * parsed. 2398 * @param symbols symbol table to use to parse and dereference 2399 * variables, or null if none. 2400 * @param rebuiltPat the pattern that was parsed, rebuilt or 2401 * copied from the input pattern, as appropriate. 2402 * @param options a bit mask of zero or more of the following: 2403 * IGNORE_SPACE, CASE. 
*/
    private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
            Appendable rebuiltPat, int options) {

        // Syntax characters: [ ] ^ - & { }

        // Recognized special forms for chars, sets: c-c s-s s&s

        // Iterator options: always parse variables and escapes; skip white
        // space only when the caller asked for IGNORE_SPACE.
        int opts = RuleCharacterIterator.PARSE_VARIABLES |
                RuleCharacterIterator.PARSE_ESCAPES;
        if ((options & IGNORE_SPACE) != 0) {
            opts |= RuleCharacterIterator.SKIP_WHITESPACE;
        }

        // patBuf collects the pattern text as it is parsed; buf is a reusable
        // scratch buffer for the contents of a {multicharacter string}.
        StringBuilder patBuf = new StringBuilder(), buf = null;
        // Set when patBuf, rather than a freshly generated pattern, must be
        // handed back through rebuiltPat (see the end of this method).
        boolean usePat = false;
        // Lazily created set that receives nested sets parsed inline.
        UnicodeSet scratch = null;
        // Saved iterator position, passed back to chars.setPos() to re-read input.
        Object backup = null;

        // mode: 0=before [, 1=between [...], 2=after ]
        // lastItem: 0=none, 1=char, 2=set
        int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE;
        // Pending binary operator: 0 (none), '-' or '&'.
        char op = 0;

        // True once a '^' directly after the opening '[' has been seen.
        boolean invert = false;

        clear();
        // Most recent {string} element, kept as the possible start of a string range.
        String lastString = null;

        while (mode != MODE2_OUTBRACKET && !chars.atEnd()) {
            //Eclipse stated the following is "dead code"
            /*
            if (false) {
                // Debugging assertion
                if (!((lastItem == 0 && op == 0) ||
                        (lastItem == 1 && (op == 0 || op == '-')) ||
                        (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
                    throw new IllegalArgumentException();
                }
            }*/

            int c = 0;
            boolean literal = false;
            UnicodeSet nested = null;

            // -------- Check for property pattern

            // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
            int setMode = SETMODE0_NONE;
            if (resemblesPropertyPattern(chars, opts)) {
                setMode = SETMODE2_PROPERTYPAT;
            }

            // -------- Parse '[' of opening delimiter OR nested set.
            // If there is a nested set, use `setMode' to define how
            // the set should be parsed.  If the '[' is part of the
            // opening delimiter for this pattern, parse special
            // strings "[", "[^", "[-", and "[^-".  Check for stand-in
            // characters representing a nested set in the symbol
            // table.

            else {
                // Prepare to backup if necessary
                backup = chars.getPos(backup);
                c = chars.next(opts);
                literal = chars.isEscaped();

                if (c == '[' && !literal) {
                    if (mode == MODE1_INBRACKET) {
                        // A '[' inside brackets opens a nested set; rewind so the
                        // recursive parse below sees the '[' itself.
                        chars.setPos(backup); // backup
                        setMode = SETMODE1_UNICODESET;
                    } else {
                        // Handle opening '[' delimiter
                        mode = MODE1_INBRACKET;
                        patBuf.append('[');
                        backup = chars.getPos(backup); // prepare to backup
                        c = chars.next(opts);
                        literal = chars.isEscaped();
                        if (c == '^' && !literal) {
                            invert = true;
                            patBuf.append('^');
                            backup = chars.getPos(backup); // prepare to backup
                            c = chars.next(opts);
                            literal = chars.isEscaped();
                        }
                        // Fall through to handle special leading '-';
                        // otherwise restart loop for nested [], \p{}, etc.
                        if (c == '-') {
                            literal = true;
                            // Fall through to handle literal '-' below
                        } else {
                            chars.setPos(backup); // backup
                            continue;
                        }
                    }
                } else if (symbols != null) {
                    UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
                    if (m != null) {
                        // NOTE(review): the ClassCastException doubles as the type
                        // test here; a matcher that is not a UnicodeSet ends up
                        // as a syntax error.
                        try {
                            nested = (UnicodeSet) m;
                            setMode = SETMODE3_PREPARSED;
                        } catch (ClassCastException e) {
                            syntaxError(chars, "Syntax error");
                        }
                    }
                }
            }

            // -------- Handle a nested set.  This either is inline in
            // the pattern or represented by a stand-in that has
            // previously been parsed and was looked up in the symbol
            // table.

            if (setMode != SETMODE0_NONE) {
                if (lastItem == LAST1_RANGE) {
                    // A single character is still pending: it cannot be followed
                    // by an operator here, so flush it into the set first.
                    if (op != 0) {
                        syntaxError(chars, "Char expected after operator");
                    }
                    add_unchecked(lastChar, lastChar);
                    _appendToPat(patBuf, lastChar, false);
                    lastItem = LAST0_START;
                    op = 0;
                }

                if (op == '-' || op == '&') {
                    patBuf.append(op);
                }

                if (nested == null) {
                    if (scratch == null) scratch = new UnicodeSet();
                    nested = scratch;
                }
                switch (setMode) {
                case SETMODE1_UNICODESET:
                    nested.applyPattern(chars, symbols, patBuf, options);
                    break;
                case SETMODE2_PROPERTYPAT:
                    chars.skipIgnored(opts);
                    nested.applyPropertyPattern(chars, patBuf, symbols);
                    break;
                case SETMODE3_PREPARSED: // `nested' already parsed
                    nested._toPattern(patBuf, false);
                    break;
                }

                usePat = true;

                if (mode == MODE0_NONE) {
                    // Entire pattern is a category; leave parse loop
                    set(nested);
                    mode = MODE2_OUTBRACKET;
                    break;
                }

                // Combine the nested set with what has been built so far,
                // according to the pending operator.
                switch (op) {
                case '-':
                    removeAll(nested);
                    break;
                case '&':
                    retainAll(nested);
                    break;
                case 0:
                    addAll(nested);
                    break;
                }

                op = 0;
                lastItem = LAST2_SET;

                continue;
            }

            if (mode == MODE0_NONE) {
                syntaxError(chars, "Missing '['");
            }

            // -------- Parse special (syntax) characters.  If the
            // current character is not special, or if it is escaped,
            // then fall through and handle it below.

            if (!literal) {
                switch (c) {
                case ']':
                    if (lastItem == LAST1_RANGE) {
                        add_unchecked(lastChar, lastChar);
                        _appendToPat(patBuf, lastChar, false);
                    }
                    // Treat final trailing '-' as a literal
                    if (op == '-') {
                        add_unchecked(op, op);
                        patBuf.append(op);
                    } else if (op == '&') {
                        syntaxError(chars, "Trailing '&'");
                    }
                    patBuf.append(']');
                    mode = MODE2_OUTBRACKET;
                    continue;
                case '-':
                    if (op == 0) {
                        if (lastItem != LAST0_START) {
                            op = (char) c;
                            continue;
                        } else if (lastString != null) {
                            // '-' after a {string}: possible string range.
                            op = (char) c;
                            continue;
                        } else {
                            // Treat final trailing '-' as a literal
                            add_unchecked(c, c);
                            c = chars.next(opts);
                            literal = chars.isEscaped();
                            if (c == ']' && !literal) {
                                patBuf.append("-]");
                                mode = MODE2_OUTBRACKET;
                                continue;
                            }
                        }
                    }
                    syntaxError(chars, "'-' not after char, string, or set");
                    break;
                case '&':
                    if (lastItem == LAST2_SET && op == 0) {
                        op = (char) c;
                        continue;
                    }
                    syntaxError(chars, "'&' not after set");
                    break;
                case '^':
                    syntaxError(chars, "'^' not after '['");
                    break;
                case '{':
                    if (op != 0 && op != '-') {
                        syntaxError(chars, "Missing operand after operator");
                    }
                    if (lastItem == LAST1_RANGE) {
                        add_unchecked(lastChar, lastChar);
                        _appendToPat(patBuf, lastChar, false);
                    }
                    lastItem = LAST0_START;
                    if (buf == null) {
                        buf = new StringBuilder();
                    } else {
                        buf.setLength(0);
                    }
                    // Collect everything up to the unescaped closing '}'.
                    boolean ok = false;
                    while (!chars.atEnd()) {
                        c = chars.next(opts);
                        literal = chars.isEscaped();
                        if (c == '}' && !literal) {
                            ok = true;
                            break;
                        }
                        appendCodePoint(buf, c);
                    }
                    if (buf.length() < 1 || !ok) {
                        syntaxError(chars, "Invalid multicharacter string");
                    }
                    // We have new string. Add it to set and continue;
                    // we don't need to drop through to the further
                    // processing
                    String curString = buf.toString();
                    if (op == '-') {
                        // Range whose end is a {string}. Integer.MAX_VALUE is the
                        // value tested for "not a single code point".
                        int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString);
                        int curSingle = CharSequences.getSingleCodePoint(curString);
                        if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) {
                            add(lastSingle,curSingle);
                        } else {
                            // NOTE(review): only the message of the caught exception
                            // is forwarded; the exception itself is not kept as a cause.
                            try {
                                StringRange.expand(lastString, curString, true, strings);
                            } catch (Exception e) {
                                syntaxError(chars, e.getMessage());
                            }
                        }
                        lastString = null;
                        op = 0;
                    } else {
                        add(curString);
                        lastString = curString;
                    }
                    patBuf.append('{');
                    _appendToPat(patBuf, curString, false);
                    patBuf.append('}');
                    continue;
                case SymbolTable.SYMBOL_REF:
                    //         symbols  nosymbols
                    // [a-$]   error    error (ambiguous)
                    // [a$]    anchor   anchor
                    // [a-$x]  var "x"* literal '$'
                    // [a-$.]  error    literal '$'
                    // *We won't get here in the case of var "x"
                    backup = chars.getPos(backup);
                    c = chars.next(opts);
                    literal = chars.isEscaped();
                    boolean anchor = (c == ']' && !literal);
                    if (symbols == null && !anchor) {
                        c = SymbolTable.SYMBOL_REF;
                        chars.setPos(backup);
                        break; // literal '$'
                    }
                    if (anchor && op == 0) {
                        if (lastItem == LAST1_RANGE) {
                            add_unchecked(lastChar, lastChar);
                            _appendToPat(patBuf, lastChar, false);
                        }
                        add_unchecked(UnicodeMatcher.ETHER);
                        usePat = true;
                        patBuf.append(SymbolTable.SYMBOL_REF).append(']');
                        mode = MODE2_OUTBRACKET;
                        continue;
                    }
                    syntaxError(chars, "Unquoted '$'");
                    break;
                default:
                    break;
                }
            }

            // -------- Parse literal characters.  This includes both
            // escaped chars ("\u4E01") and non-syntax characters
            // ("a").

            switch (lastItem) {
            case LAST0_START:
                if (op == '-' && lastString != null) {
                    syntaxError(chars, "Invalid range");
                }
                lastItem = LAST1_RANGE;
                lastChar = c;
                lastString = null;
                break;
            case LAST1_RANGE:
                if (op == '-') {
                    if (lastString != null) {
                        syntaxError(chars, "Invalid range");
                    }
                    if (lastChar >= c) {
                        // Don't allow redundant (a-a) or empty (b-a) ranges;
                        // these are most likely typos.
                        syntaxError(chars, "Invalid range");
                    }
                    add_unchecked(lastChar, c);
                    _appendToPat(patBuf, lastChar, false);
                    patBuf.append(op);
                    _appendToPat(patBuf, c, false);
                    lastItem = LAST0_START;
                    op = 0;
                } else {
                    // Two characters in a row: commit the earlier one and keep
                    // the new one pending.
                    add_unchecked(lastChar, lastChar);
                    _appendToPat(patBuf, lastChar, false);
                    lastChar = c;
                }
                break;
            case LAST2_SET:
                if (op != 0) {
                    syntaxError(chars, "Set expected after operator");
                }
                lastChar = c;
                lastItem = LAST1_RANGE;
                break;
            }
        }

        if (mode != MODE2_OUTBRACKET) {
            syntaxError(chars, "Missing ']'");
        }

        chars.skipIgnored(opts);

        /**
         * Handle global flags (invert, case insensitivity).  If this
         * pattern should be compiled case-insensitive, then we need
         * to close over case BEFORE COMPLEMENTING.  This makes
         * patterns like /[^abc]/i work.
         */
        if ((options & CASE) != 0) {
            closeOver(CASE);
        }
        if (invert) {
            complement();
        }

        // Use the rebuilt pattern (pat) only if necessary.  Prefer the
        // generated pattern.
        if (usePat) {
            append(rebuiltPat, patBuf.toString());
        } else {
            appendNewPattern(rebuiltPat, false, true);
        }
    }

    /**
     * Reports a pattern syntax error. Always throws; never returns normally.
     * @param chars iterator whose string form is quoted (escaped) in the message
     * @param msg description of the problem
     * @throws IllegalArgumentException always
     */
    private static void syntaxError(RuleCharacterIterator chars, String msg) {
        throw new IllegalArgumentException("Error: " + msg + " at \"" +
                Utility.escape(chars.toString()) +
                '"');
    }

    /**
     * Add the contents of the UnicodeSet (as strings) into a collection.
     * @param target collection to add into
     * @return the result of the static <code>addAllTo(this, target)</code>
     */
    public <T extends Collection<String>> T addAllTo(T target) {
        return addAllTo(this, target);
    }


    /**
     * Add the contents of the UnicodeSet (as strings) into an array.
     * @param target array to add into
     * @return the result of the static <code>addAllTo(this, target)</code>
     * @hide unsupported on Android
     */
    public String[] addAllTo(String[] target) {
        return addAllTo(this, target);
    }

    /**
     * Add the contents of the UnicodeSet (as strings) into an array.
     * The array is newly allocated with <code>set.size()</code> slots.
     * @param set the set whose contents are copied
     * @return the result of <code>addAllTo(set, newArray)</code>
     * @hide unsupported on Android
     */
    public static String[] toArray(UnicodeSet set) {
        return addAllTo(set, new String[set.size()]);
    }

    /**
     * Add the contents of the collection (as strings) into this UnicodeSet.
     * The collection must not contain null.
     * Simply delegates to {@link #addAll(Iterable)}.
     * @param source the collection to add
     * @return a reference to this object
     */
    public UnicodeSet add(Iterable<?> source) {
        return addAll(source);
    }

    /**
     * Add a collection (as strings) into this UnicodeSet.
     * Uses standard naming convention.
* @param source collection to add into
     * @return a reference to this object
     */
    public UnicodeSet addAll(Iterable<?> source) {
        checkFrozen();
        // Each element is added through its toString() form, so a null
        // element fails here with a NullPointerException.
        for (Object o : source) {
            add(o.toString());
        }
        return this;
    }

    //----------------------------------------------------------------
    // Implementation: Utility methods
    //----------------------------------------------------------------

    /**
     * Makes sure <code>list</code> can hold at least <code>newLen</code>
     * entries. When it has to grow, GROW_EXTRA additional slots are
     * allocated and the first <code>len</code> entries are copied over.
     */
    private void ensureCapacity(int newLen) {
        if (newLen <= list.length) return;
        int[] temp = new int[newLen + GROW_EXTRA];
        System.arraycopy(list, 0, temp, 0, len);
        list = temp;
    }

    /**
     * Makes sure <code>buffer</code> exists and can hold at least
     * <code>newLen</code> entries. A replacement buffer is allocated
     * empty: the old contents are not copied.
     */
    private void ensureBufferCapacity(int newLen) {
        if (buffer != null && newLen <= buffer.length) return;
        buffer = new int[newLen + GROW_EXTRA];
    }

    /**
     * Assumes start <= end.
     * Returns the shared <code>rangeList</code> array filled with
     * <code>{ start, end+1, HIGH }</code>. The same array is reused
     * and overwritten by every call.
     */
    private int[] range(int start, int end) {
        if (rangeList == null) {
            rangeList = new int[] { start, end+1, HIGH };
        } else {
            // Slot 2 already holds HIGH from the initial allocation.
            rangeList[0] = start;
            rangeList[1] = end+1;
        }
        return rangeList;
    }

    //----------------------------------------------------------------
    // Implementation: Fundamental operations
    //----------------------------------------------------------------

    // polarity = 0, 3 is normal: x xor y
    // polarity = 1, 2: x xor ~y == x === y

    // Merges list[] and other[] into buffer[], dropping every value that
    // occurs in both, then swaps buffer and list and clears the cached pattern.
    // Both arrays are read until their HIGH terminator is reached.
    private UnicodeSet xor(int[] other, int otherLen, int polarity) {
        ensureBufferCapacity(len + otherLen);
        // i, j and k index list[], other[] and buffer[] respectively.
        int i = 0, j = 0, k = 0;
        int a = list[i++];
        int b;
        // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used
        //       so the following if statement will not be called.
        ///CLOVER:OFF
        if (polarity == 1 || polarity == 2) {
            b = LOW;
            if (other[j] == LOW) { // skip base if already LOW
                ++j;
                b = other[j];
            }
            ///CLOVER:ON
        } else {
            b = other[j++];
        }
        // simplest of all the routines
        // sort the values, discarding identicals!
        while (true) {
            if (a < b) {
                buffer[k++] = a;
                a = list[i++];
            } else if (b < a) {
                buffer[k++] = b;
                b = other[j++];
            } else if (a != HIGH) { // at this point, a == b
                // discard both values!
                a = list[i++];
                b = other[j++];
            } else { // DONE!
                // Both inputs reached HIGH: terminate the output and record its length.
                buffer[k++] = HIGH;
                len = k;
                break;
            }
        }
        // swap list and buffer
        int[] temp = list;
        list = buffer;
        buffer = temp;
        pat = null;
        return this;
    }

    // polarity = 0 is normal: x union y
    // polarity = 2: x union ~y
    // polarity = 1: ~x union y
    // polarity = 3: ~x union ~y

    private UnicodeSet add(int[] other, int otherLen, int polarity) {
        ensureBufferCapacity(len + otherLen);
        int i = 0, j = 0, k = 0;
        int a = list[i++];
        int b = other[j++];
        // change from xor is that we have to check overlapping pairs
        // polarity bit 1 means a is second, bit 2 means b is.
        main:
            while (true) {
                switch (polarity) {
                case 0: // both first; take lower if unequal
                    if (a < b) { // take a
                        // Back up over overlapping ranges in buffer[]
                        if (k > 0 && a <= buffer[k-1]) {
                            // Pick latter end value in buffer[] vs.
list[] 2950 a = max(list[i], buffer[--k]); 2951 } else { 2952 // No overlap 2953 buffer[k++] = a; 2954 a = list[i]; 2955 } 2956 i++; // Common if/else code factored out 2957 polarity ^= 1; 2958 } else if (b < a) { // take b 2959 if (k > 0 && b <= buffer[k-1]) { 2960 b = max(other[j], buffer[--k]); 2961 } else { 2962 buffer[k++] = b; 2963 b = other[j]; 2964 } 2965 j++; 2966 polarity ^= 2; 2967 } else { // a == b, take a, drop b 2968 if (a == HIGH) break main; 2969 // This is symmetrical; it doesn't matter if 2970 // we backtrack with a or b. - liu 2971 if (k > 0 && a <= buffer[k-1]) { 2972 a = max(list[i], buffer[--k]); 2973 } else { 2974 // No overlap 2975 buffer[k++] = a; 2976 a = list[i]; 2977 } 2978 i++; 2979 polarity ^= 1; 2980 b = other[j++]; polarity ^= 2; 2981 } 2982 break; 2983 case 3: // both second; take higher if unequal, and drop other 2984 if (b <= a) { // take a 2985 if (a == HIGH) break main; 2986 buffer[k++] = a; 2987 } else { // take b 2988 if (b == HIGH) break main; 2989 buffer[k++] = b; 2990 } 2991 a = list[i++]; polarity ^= 1; // factored common code 2992 b = other[j++]; polarity ^= 2; 2993 break; 2994 case 1: // a second, b first; if b < a, overlap 2995 if (a < b) { // no overlap, take a 2996 buffer[k++] = a; a = list[i++]; polarity ^= 1; 2997 } else if (b < a) { // OVERLAP, drop b 2998 b = other[j++]; polarity ^= 2; 2999 } else { // a == b, drop both! 3000 if (a == HIGH) break main; 3001 a = list[i++]; polarity ^= 1; 3002 b = other[j++]; polarity ^= 2; 3003 } 3004 break; 3005 case 2: // a first, b second; if a < b, overlap 3006 if (b < a) { // no overlap, take b 3007 buffer[k++] = b; b = other[j++]; polarity ^= 2; 3008 } else if (a < b) { // OVERLAP, drop a 3009 a = list[i++]; polarity ^= 1; 3010 } else { // a == b, drop both! 
3011 if (a == HIGH) break main; 3012 a = list[i++]; polarity ^= 1; 3013 b = other[j++]; polarity ^= 2; 3014 } 3015 break; 3016 } 3017 } 3018 buffer[k++] = HIGH; // terminate 3019 len = k; 3020 // swap list and buffer 3021 int[] temp = list; 3022 list = buffer; 3023 buffer = temp; 3024 pat = null; 3025 return this; 3026 } 3027 3028 // polarity = 0 is normal: x intersect y 3029 // polarity = 2: x intersect ~y == set-minus 3030 // polarity = 1: ~x intersect y 3031 // polarity = 3: ~x intersect ~y 3032 3033 private UnicodeSet retain(int[] other, int otherLen, int polarity) { 3034 ensureBufferCapacity(len + otherLen); 3035 int i = 0, j = 0, k = 0; 3036 int a = list[i++]; 3037 int b = other[j++]; 3038 // change from xor is that we have to check overlapping pairs 3039 // polarity bit 1 means a is second, bit 2 means b is. 3040 main: 3041 while (true) { 3042 switch (polarity) { 3043 case 0: // both first; drop the smaller 3044 if (a < b) { // drop a 3045 a = list[i++]; polarity ^= 1; 3046 } else if (b < a) { // drop b 3047 b = other[j++]; polarity ^= 2; 3048 } else { // a == b, take one, drop other 3049 if (a == HIGH) break main; 3050 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3051 b = other[j++]; polarity ^= 2; 3052 } 3053 break; 3054 case 3: // both second; take lower if unequal 3055 if (a < b) { // take a 3056 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3057 } else if (b < a) { // take b 3058 buffer[k++] = b; b = other[j++]; polarity ^= 2; 3059 } else { // a == b, take one, drop other 3060 if (a == HIGH) break main; 3061 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3062 b = other[j++]; polarity ^= 2; 3063 } 3064 break; 3065 case 1: // a second, b first; 3066 if (a < b) { // NO OVERLAP, drop a 3067 a = list[i++]; polarity ^= 1; 3068 } else if (b < a) { // OVERLAP, take b 3069 buffer[k++] = b; b = other[j++]; polarity ^= 2; 3070 } else { // a == b, drop both! 
3071 if (a == HIGH) break main; 3072 a = list[i++]; polarity ^= 1; 3073 b = other[j++]; polarity ^= 2; 3074 } 3075 break; 3076 case 2: // a first, b second; if a < b, overlap 3077 if (b < a) { // no overlap, drop b 3078 b = other[j++]; polarity ^= 2; 3079 } else if (a < b) { // OVERLAP, take a 3080 buffer[k++] = a; a = list[i++]; polarity ^= 1; 3081 } else { // a == b, drop both! 3082 if (a == HIGH) break main; 3083 a = list[i++]; polarity ^= 1; 3084 b = other[j++]; polarity ^= 2; 3085 } 3086 break; 3087 } 3088 } 3089 buffer[k++] = HIGH; // terminate 3090 len = k; 3091 // swap list and buffer 3092 int[] temp = list; 3093 list = buffer; 3094 buffer = temp; 3095 pat = null; 3096 return this; 3097 } 3098 3099 private static final int max(int a, int b) { 3100 return (a > b) ? a : b; 3101 } 3102 3103 //---------------------------------------------------------------- 3104 // Generic filter-based scanning code 3105 //---------------------------------------------------------------- 3106 3107 private static interface Filter { 3108 boolean contains(int codePoint); 3109 } 3110 3111 private static class NumericValueFilter implements Filter { 3112 double value; 3113 NumericValueFilter(double value) { this.value = value; } 3114 @Override 3115 public boolean contains(int ch) { 3116 return UCharacter.getUnicodeNumericValue(ch) == value; 3117 } 3118 } 3119 3120 private static class GeneralCategoryMaskFilter implements Filter { 3121 int mask; 3122 GeneralCategoryMaskFilter(int mask) { this.mask = mask; } 3123 @Override 3124 public boolean contains(int ch) { 3125 return ((1 << UCharacter.getType(ch)) & mask) != 0; 3126 } 3127 } 3128 3129 private static class IntPropertyFilter implements Filter { 3130 int prop; 3131 int value; 3132 IntPropertyFilter(int prop, int value) { 3133 this.prop = prop; 3134 this.value = value; 3135 } 3136 @Override 3137 public boolean contains(int ch) { 3138 return UCharacter.getIntPropertyValue(ch, prop) == value; 3139 } 3140 } 3141 3142 private static class 
ScriptExtensionsFilter implements Filter { 3143 int script; 3144 ScriptExtensionsFilter(int script) { this.script = script; } 3145 @Override 3146 public boolean contains(int c) { 3147 return UScript.hasScript(c, script); 3148 } 3149 } 3150 3151 // VersionInfo for unassigned characters 3152 private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); 3153 3154 private static class VersionFilter implements Filter { 3155 VersionInfo version; 3156 VersionFilter(VersionInfo version) { this.version = version; } 3157 @Override 3158 public boolean contains(int ch) { 3159 VersionInfo v = UCharacter.getAge(ch); 3160 // Reference comparison ok; VersionInfo caches and reuses 3161 // unique objects. 3162 return !Utility.sameObjects(v, NO_VERSION) && 3163 v.compareTo(version) <= 0; 3164 } 3165 } 3166 3167 private static synchronized UnicodeSet getInclusions(int src) { 3168 if (INCLUSIONS == null) { 3169 INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT]; 3170 } 3171 if(INCLUSIONS[src] == null) { 3172 UnicodeSet incl = new UnicodeSet(); 3173 switch(src) { 3174 case UCharacterProperty.SRC_CHAR: 3175 UCharacterProperty.INSTANCE.addPropertyStarts(incl); 3176 break; 3177 case UCharacterProperty.SRC_PROPSVEC: 3178 UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl); 3179 break; 3180 case UCharacterProperty.SRC_CHAR_AND_PROPSVEC: 3181 UCharacterProperty.INSTANCE.addPropertyStarts(incl); 3182 UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl); 3183 break; 3184 case UCharacterProperty.SRC_CASE_AND_NORM: 3185 Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl); 3186 UCaseProps.INSTANCE.addPropertyStarts(incl); 3187 break; 3188 case UCharacterProperty.SRC_NFC: 3189 Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl); 3190 break; 3191 case UCharacterProperty.SRC_NFKC: 3192 Norm2AllModes.getNFKCInstance().impl.addPropertyStarts(incl); 3193 break; 3194 case UCharacterProperty.SRC_NFKC_CF: 3195 
Norm2AllModes.getNFKC_CFInstance().impl.addPropertyStarts(incl); 3196 break; 3197 case UCharacterProperty.SRC_NFC_CANON_ITER: 3198 Norm2AllModes.getNFCInstance().impl.addCanonIterPropertyStarts(incl); 3199 break; 3200 case UCharacterProperty.SRC_CASE: 3201 UCaseProps.INSTANCE.addPropertyStarts(incl); 3202 break; 3203 case UCharacterProperty.SRC_BIDI: 3204 UBiDiProps.INSTANCE.addPropertyStarts(incl); 3205 break; 3206 default: 3207 throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")"); 3208 } 3209 INCLUSIONS[src] = incl; 3210 } 3211 return INCLUSIONS[src]; 3212 } 3213 3214 /** 3215 * Generic filter-based scanning code for UCD property UnicodeSets. 3216 */ 3217 private UnicodeSet applyFilter(Filter filter, int src) { 3218 // Logically, walk through all Unicode characters, noting the start 3219 // and end of each range for which filter.contain(c) is 3220 // true. Add each range to a set. 3221 // 3222 // To improve performance, use an inclusions set which 3223 // encodes information about character ranges that are known 3224 // to have identical properties. 3225 // getInclusions(src) contains exactly the first characters of 3226 // same-value ranges for the given properties "source". 
3227 3228 clear(); 3229 3230 int startHasProperty = -1; 3231 UnicodeSet inclusions = getInclusions(src); 3232 int limitRange = inclusions.getRangeCount(); 3233 3234 for (int j=0; j<limitRange; ++j) { 3235 // get current range 3236 int start = inclusions.getRangeStart(j); 3237 int end = inclusions.getRangeEnd(j); 3238 3239 // for all the code points in the range, process 3240 for (int ch = start; ch <= end; ++ch) { 3241 // only add to the unicodeset on inflection points -- 3242 // where the hasProperty value changes to false 3243 if (filter.contains(ch)) { 3244 if (startHasProperty < 0) { 3245 startHasProperty = ch; 3246 } 3247 } else if (startHasProperty >= 0) { 3248 add_unchecked(startHasProperty, ch-1); 3249 startHasProperty = -1; 3250 } 3251 } 3252 } 3253 if (startHasProperty >= 0) { 3254 add_unchecked(startHasProperty, 0x10FFFF); 3255 } 3256 3257 return this; 3258 } 3259 3260 3261 /** 3262 * Remove leading and trailing Pattern_White_Space and compress 3263 * internal Pattern_White_Space to a single space character. 3264 */ 3265 private static String mungeCharName(String source) { 3266 source = PatternProps.trimWhiteSpace(source); 3267 StringBuilder buf = null; 3268 for (int i=0; i<source.length(); ++i) { 3269 char ch = source.charAt(i); 3270 if (PatternProps.isWhiteSpace(ch)) { 3271 if (buf == null) { 3272 buf = new StringBuilder().append(source, 0, i); 3273 } else if (buf.charAt(buf.length() - 1) == ' ') { 3274 continue; 3275 } 3276 ch = ' '; // convert to ' ' 3277 } 3278 if (buf != null) { 3279 buf.append(ch); 3280 } 3281 } 3282 return buf == null ? source : buf.toString(); 3283 } 3284 3285 //---------------------------------------------------------------- 3286 // Property set API 3287 //---------------------------------------------------------------- 3288 3289 /** 3290 * Modifies this set to contain those code points which have the 3291 * given value for the given binary or enumerated property, as 3292 * returned by UCharacter.getIntPropertyValue. 
Prior contents of 3293 * this set are lost. 3294 * 3295 * @param prop a property in the range 3296 * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or 3297 * UProperty.INT_START..UProperty.INT_LIMIT-1 or. 3298 * UProperty.MASK_START..UProperty.MASK_LIMIT-1. 3299 * 3300 * @param value a value in the range 3301 * UCharacter.getIntPropertyMinValue(prop).. 3302 * UCharacter.getIntPropertyMaxValue(prop), with one exception. 3303 * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be 3304 * a UCharacter.getType() result, but rather a mask value produced 3305 * by logically ORing (1 << UCharacter.getType()) values together. 3306 * This allows grouped categories such as [:L:] to be represented. 3307 * 3308 * @return a reference to this set 3309 */ 3310 public UnicodeSet applyIntPropertyValue(int prop, int value) { 3311 checkFrozen(); 3312 if (prop == UProperty.GENERAL_CATEGORY_MASK) { 3313 applyFilter(new GeneralCategoryMaskFilter(value), UCharacterProperty.SRC_CHAR); 3314 } else if (prop == UProperty.SCRIPT_EXTENSIONS) { 3315 applyFilter(new ScriptExtensionsFilter(value), UCharacterProperty.SRC_PROPSVEC); 3316 } else { 3317 applyFilter(new IntPropertyFilter(prop, value), UCharacterProperty.INSTANCE.getSource(prop)); 3318 } 3319 return this; 3320 } 3321 3322 3323 3324 /** 3325 * Modifies this set to contain those code points which have the 3326 * given value for the given property. Prior contents of this 3327 * set are lost. 3328 * 3329 * @param propertyAlias a property alias, either short or long. 3330 * The name is matched loosely. See PropertyAliases.txt for names 3331 * and a description of loose matching. If the value string is 3332 * empty, then this string is interpreted as either a 3333 * General_Category value alias, a Script value alias, a binary 3334 * property alias, or a special ID. Special IDs are matched 3335 * loosely and correspond to the following sets: 3336 * 3337 * "ANY" = [\\u0000-\\U0010FFFF], 3338 * "ASCII" = [\\u0000-\\u007F]. 
3339 * 3340 * @param valueAlias a value alias, either short or long. The 3341 * name is matched loosely. See PropertyValueAliases.txt for 3342 * names and a description of loose matching. In addition to 3343 * aliases listed, numeric values and canonical combining classes 3344 * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc", 3345 * "220"). The value string may also be empty. 3346 * 3347 * @return a reference to this set 3348 */ 3349 public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) { 3350 return applyPropertyAlias(propertyAlias, valueAlias, null); 3351 } 3352 3353 /** 3354 * Modifies this set to contain those code points which have the 3355 * given value for the given property. Prior contents of this 3356 * set are lost. 3357 * @param propertyAlias A string of the property alias. 3358 * @param valueAlias A string of the value alias. 3359 * @param symbols if not null, then symbols are first called to see if a property 3360 * is available. If true, then everything else is skipped. 
     * @return this set
     */
    public UnicodeSet applyPropertyAlias(String propertyAlias,
            String valueAlias, SymbolTable symbols) {
        checkFrozen();
        // p = property, v = value; both are handed to applyIntPropertyValue()
        // at the end unless one of the branches below returns or throws first.
        int p;
        int v;
        boolean invert = false;

        // Give a caller-supplied XSymbolTable the first chance to handle
        // the property; when it reports success we are done.
        if (symbols != null
                && (symbols instanceof XSymbolTable)
                && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
            return this;
        }

        // Then the class-level XSYMBOL_TABLE, if one is set.
        if (XSYMBOL_TABLE != null) {
            if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) {
                return this;
            }
        }

        if (valueAlias.length() > 0) {
            p = UCharacter.getPropertyEnum(propertyAlias);

            // Treat gc as gcm
            if (p == UProperty.GENERAL_CATEGORY) {
                p = UProperty.GENERAL_CATEGORY_MASK;
            }

            if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) ||
                    (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) ||
                    (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) {
                try {
                    v = UCharacter.getPropertyValueEnum(p, valueAlias);
                } catch (IllegalArgumentException e) {
                    // Handle numeric CCC
                    if (p == UProperty.CANONICAL_COMBINING_CLASS ||
                            p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
                            p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
                        v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias));
                        // Anything between 0 and 255 is valid even if unused.
                        if (v < 0 || v > 255) throw e;
                    } else {
                        throw e;
                    }
                }
            }

            else {
                // Not a binary/int/mask property: handle the special
                // properties individually.
                switch (p) {
                case UProperty.NUMERIC_VALUE:
                {
                    double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias));
                    applyFilter(new NumericValueFilter(value), UCharacterProperty.SRC_CHAR);
                    return this;
                }
                case UProperty.NAME:
                {
                    // Must munge name, since
                    // UCharacter.charFromName() does not do
                    // 'loose' matching.
                    String buf = mungeCharName(valueAlias);
                    int ch = UCharacter.getCharFromExtendedName(buf);
                    if (ch == -1) {
                        throw new IllegalArgumentException("Invalid character name");
                    }
                    // The result is the single named character.
                    clear();
                    add_unchecked(ch);
                    return this;
                }
                case UProperty.UNICODE_1_NAME:
                    // ICU 49 deprecates the Unicode_1_Name property APIs.
                    throw new IllegalArgumentException("Unicode_1_Name (na1) not supported");
                case UProperty.AGE:
                {
                    // Must munge name, since
                    // VersionInfo.getInstance() does not do
                    // 'loose' matching.
                    VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
                    applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
                    return this;
                }
                case UProperty.SCRIPT_EXTENSIONS:
                    // The value is looked up as a Script value; p stays SCRIPT_EXTENSIONS.
                    v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias);
                    // fall through to calling applyIntPropertyValue()
                    break;
                default:
                    // p is a non-binary, non-enumerated property that we
                    // don't support (yet).
                    throw new IllegalArgumentException("Unsupported property");
                }
            }
        }

        else {
            // valueAlias is empty.  Interpret as General Category, Script,
            // Binary property, or ANY or ASCII.  Upon success, p and v will
            // be set.
            UPropertyAliases pnames = UPropertyAliases.INSTANCE;
            // 1st attempt: propertyAlias is a General_Category value alias.
            p = UProperty.GENERAL_CATEGORY_MASK;
            v = pnames.getPropertyValueEnum(p, propertyAlias);
            if (v == UProperty.UNDEFINED) {
                // 2nd attempt: a Script value alias.
                p = UProperty.SCRIPT;
                v = pnames.getPropertyValueEnum(p, propertyAlias);
                if (v == UProperty.UNDEFINED) {
                    // 3rd attempt: a property name.
                    p = pnames.getPropertyEnum(propertyAlias);
                    if (p == UProperty.UNDEFINED) {
                        p = -1;
                    }
                    if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) {
                        // A binary property with no value means "property is true".
                        v = 1;
                    } else if (p == -1) {
                        // Not a property name either: try the special IDs.
                        if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) {
                            set(MIN_VALUE, MAX_VALUE);
                            return this;
                        } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) {
                            set(0, 0x7F);
                            return this;
                        } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) {
                            // [:Assigned:]=[:^Cn:]
                            p = UProperty.GENERAL_CATEGORY_MASK;
                            v = (1<<UCharacter.UNASSIGNED);
                            invert = true;
                        } else {
                            // Property name was never matched.
                            throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias);
                        }
                    } else {
                        // Valid property name, but it isn't binary, so the value
                        // must be supplied.
                        throw new IllegalArgumentException("Missing property value");
                    }
                }
            }
        }

        applyIntPropertyValue(p, v);
        if(invert) {
            complement();
        }

        return this;
    }

    //----------------------------------------------------------------
    // Property set patterns
    //----------------------------------------------------------------

    /**
     * Return true if the given position, in the given pattern, appears
     * to be the start of a property set pattern.
3512 */ 3513 private static boolean resemblesPropertyPattern(String pattern, int pos) { 3514 // Patterns are at least 5 characters long 3515 if ((pos+5) > pattern.length()) { 3516 return false; 3517 } 3518 3519 // Look for an opening [:, [:^, \p, or \P 3520 return pattern.regionMatches(pos, "[:", 0, 2) || 3521 pattern.regionMatches(true, pos, "\\p", 0, 2) || 3522 pattern.regionMatches(pos, "\\N", 0, 2); 3523 } 3524 3525 /** 3526 * Return true if the given iterator appears to point at a 3527 * property pattern. Regardless of the result, return with the 3528 * iterator unchanged. 3529 * @param chars iterator over the pattern characters. Upon return 3530 * it will be unchanged. 3531 * @param iterOpts RuleCharacterIterator options 3532 */ 3533 private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, 3534 int iterOpts) { 3535 boolean result = false; 3536 iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; 3537 Object pos = chars.getPos(null); 3538 int c = chars.next(iterOpts); 3539 if (c == '[' || c == '\\') { 3540 int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); 3541 result = (c == '[') ? (d == ':') : 3542 (d == 'N' || d == 'p' || d == 'P'); 3543 } 3544 chars.setPos(pos); 3545 return result; 3546 } 3547 3548 /** 3549 * Parse the given property pattern at the given parse position. 3550 * @param symbols TODO 3551 */ 3552 private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { 3553 int pos = ppos.getIndex(); 3554 3555 // On entry, ppos should point to one of the following locations: 3556 3557 // Minimum length is 5 characters, e.g. 
\p{L} 3558 if ((pos+5) > pattern.length()) { 3559 return null; 3560 } 3561 3562 boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 3563 boolean isName = false; // true for \N{pat}, o/w false 3564 boolean invert = false; 3565 3566 // Look for an opening [:, [:^, \p, or \P 3567 if (pattern.regionMatches(pos, "[:", 0, 2)) { 3568 posix = true; 3569 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3570 if (pos < pattern.length() && pattern.charAt(pos) == '^') { 3571 ++pos; 3572 invert = true; 3573 } 3574 } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || 3575 pattern.regionMatches(pos, "\\N", 0, 2)) { 3576 char c = pattern.charAt(pos+1); 3577 invert = (c == 'P'); 3578 isName = (c == 'N'); 3579 pos = PatternProps.skipWhiteSpace(pattern, (pos+2)); 3580 if (pos == pattern.length() || pattern.charAt(pos++) != '{') { 3581 // Syntax error; "\p" or "\P" not followed by "{" 3582 return null; 3583 } 3584 } else { 3585 // Open delimiter not seen 3586 return null; 3587 } 3588 3589 // Look for the matching close delimiter, either :] or } 3590 int close = pattern.indexOf(posix ? ":]" : "}", pos); 3591 if (close < 0) { 3592 // Syntax error; close delimiter missing 3593 return null; 3594 } 3595 3596 // Look for an '=' sign. If this is present, we will parse a 3597 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 3598 // pattern. 
3599 int equals = pattern.indexOf('=', pos); 3600 String propName, valueName; 3601 if (equals >= 0 && equals < close && !isName) { 3602 // Equals seen; parse medium/long pattern 3603 propName = pattern.substring(pos, equals); 3604 valueName = pattern.substring(equals+1, close); 3605 } 3606 3607 else { 3608 // Handle case where no '=' is seen, and \N{} 3609 propName = pattern.substring(pos, close); 3610 valueName = ""; 3611 3612 // Handle \N{name} 3613 if (isName) { 3614 // This is a little inefficient since it means we have to 3615 // parse "na" back to UProperty.NAME even though we already 3616 // know it's UProperty.NAME. If we refactor the API to 3617 // support args of (int, String) then we can remove 3618 // "na" and make this a little more efficient. 3619 valueName = propName; 3620 propName = "na"; 3621 } 3622 } 3623 3624 applyPropertyAlias(propName, valueName, symbols); 3625 3626 if (invert) { 3627 complement(); 3628 } 3629 3630 // Move to the limit position after the close delimiter 3631 ppos.setIndex(close + (posix ? 2 : 1)); 3632 3633 return this; 3634 } 3635 3636 /** 3637 * Parse a property pattern. 3638 * @param chars iterator over the pattern characters. Upon return 3639 * it will be advanced to the first character after the parsed 3640 * pattern, or the end of the iteration if all characters are 3641 * parsed. 3642 * @param rebuiltPat the pattern that was parsed, rebuilt or 3643 * copied from the input pattern, as appropriate. 
3644 * @param symbols TODO 3645 */ 3646 private void applyPropertyPattern(RuleCharacterIterator chars, 3647 Appendable rebuiltPat, SymbolTable symbols) { 3648 String patStr = chars.lookahead(); 3649 ParsePosition pos = new ParsePosition(0); 3650 applyPropertyPattern(patStr, pos, symbols); 3651 if (pos.getIndex() == 0) { 3652 syntaxError(chars, "Invalid property pattern"); 3653 } 3654 chars.jumpahead(pos.getIndex()); 3655 append(rebuiltPat, patStr.substring(0, pos.getIndex())); 3656 } 3657 3658 //---------------------------------------------------------------- 3659 // Case folding API 3660 //---------------------------------------------------------------- 3661 3662 /** 3663 * Bitmask for constructor and applyPattern() indicating that 3664 * white space should be ignored. If set, ignore Unicode Pattern_White_Space characters, 3665 * unless they are quoted or escaped. This may be ORed together 3666 * with other selectors. 3667 */ 3668 public static final int IGNORE_SPACE = 1; 3669 3670 /** 3671 * Bitmask for constructor, applyPattern(), and closeOver() 3672 * indicating letter case. This may be ORed together with other 3673 * selectors. 3674 * 3675 * Enable case insensitive matching. E.g., "[ab]" with this flag 3676 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3677 * match all except 'a', 'A', 'b', and 'B'. This performs a full 3678 * closure over case mappings, e.g. U+017F for s. 3679 * 3680 * The resulting set is a superset of the input for the code points but 3681 * not for the strings. 3682 * It performs a case mapping closure of the code points and adds 3683 * full case folding strings for the code points, and reduces strings of 3684 * the original set to their full case folding equivalents. 3685 * 3686 * This is designed for case-insensitive matches, for example 3687 * in regular expressions. The full code point case closure allows checking of 3688 * an input character directly against the closure set. 
3689 * Strings are matched by comparing the case-folded form from the closure 3690 * set with an incremental case folding of the string in question. 3691 * 3692 * The closure set will also contain single code points if the original 3693 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 3694 * This is not necessary (that is, redundant) for the above matching method 3695 * but results in the same closure sets regardless of whether the original 3696 * set contained the code point or a string. 3697 */ 3698 public static final int CASE = 2; 3699 3700 /** 3701 * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C 3702 * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h). 3703 * @see #CASE 3704 */ 3705 public static final int CASE_INSENSITIVE = 2; 3706 3707 /** 3708 * Bitmask for constructor, applyPattern(), and closeOver() 3709 * indicating letter case. This may be ORed together with other 3710 * selectors. 3711 * 3712 * Enable case insensitive matching. E.g., "[ab]" with this flag 3713 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 3714 * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, 3715 * title-, and uppercase mappings as well as the case folding 3716 * of each existing element in the set. 
3717 */ 3718 public static final int ADD_CASE_MAPPINGS = 4; 3719 3720 // add the result of a full case mapping to the set 3721 // use str as a temporary string to avoid constructing one 3722 private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) { 3723 if(result >= 0) { 3724 if(result > UCaseProps.MAX_STRING_LENGTH) { 3725 // add a single-code point case mapping 3726 set.add(result); 3727 } else { 3728 // add a string case mapping from full with length result 3729 set.add(full.toString()); 3730 full.setLength(0); 3731 } 3732 } 3733 // result < 0: the code point mapped to itself, no need to add it 3734 // see UCaseProps 3735 } 3736 3737 /** 3738 * Close this set over the given attribute. For the attribute 3739 * CASE, the result is to modify this set so that: 3740 * 3741 * 1. For each character or string 'a' in this set, all strings 3742 * 'b' such that foldCase(a) == foldCase(b) are added to this set. 3743 * (For most 'a' that are single characters, 'b' will have 3744 * b.length() == 1.) 3745 * 3746 * 2. For each string 'e' in the resulting set, if e != 3747 * foldCase(e), 'e' will be removed. 3748 * 3749 * Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}] 3750 * 3751 * (Here foldCase(x) refers to the operation 3752 * UCharacter.foldCase(x, true), and a == b actually denotes 3753 * a.equals(b), not pointer comparison.) 3754 * 3755 * @param attribute bitmask for attributes to close over. 3756 * Currently only the CASE bit is supported. Any undefined bits 3757 * are ignored. 3758 * @return a reference to this set. 
3759 */ 3760 public UnicodeSet closeOver(int attribute) { 3761 checkFrozen(); 3762 if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) { 3763 UCaseProps csp = UCaseProps.INSTANCE; 3764 UnicodeSet foldSet = new UnicodeSet(this); 3765 ULocale root = ULocale.ROOT; 3766 3767 // start with input set to guarantee inclusion 3768 // CASE: remove strings because the strings will actually be reduced (folded); 3769 // therefore, start with no strings and add only those needed 3770 if((attribute & CASE) != 0) { 3771 foldSet.strings.clear(); 3772 } 3773 3774 int n = getRangeCount(); 3775 int result; 3776 StringBuilder full = new StringBuilder(); 3777 3778 for (int i=0; i<n; ++i) { 3779 int start = getRangeStart(i); 3780 int end = getRangeEnd(i); 3781 3782 if((attribute & CASE) != 0) { 3783 // full case closure 3784 for (int cp=start; cp<=end; ++cp) { 3785 csp.addCaseClosure(cp, foldSet); 3786 } 3787 } else { 3788 // add case mappings 3789 // (does not add long s for regular s, or Kelvin for k, for example) 3790 for (int cp=start; cp<=end; ++cp) { 3791 result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT); 3792 addCaseMapping(foldSet, result, full); 3793 3794 result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT); 3795 addCaseMapping(foldSet, result, full); 3796 3797 result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT); 3798 addCaseMapping(foldSet, result, full); 3799 3800 result = csp.toFullFolding(cp, full, 0); 3801 addCaseMapping(foldSet, result, full); 3802 } 3803 } 3804 } 3805 if (!strings.isEmpty()) { 3806 if ((attribute & CASE) != 0) { 3807 for (String s : strings) { 3808 String str = UCharacter.foldCase(s, 0); 3809 if(!csp.addStringCaseClosure(str, foldSet)) { 3810 foldSet.add(str); // does not map to code points: add the folded string itself 3811 } 3812 } 3813 } else { 3814 BreakIterator bi = BreakIterator.getWordInstance(root); 3815 for (String str : strings) { 3816 // TODO: call lower-level functions 3817 foldSet.add(UCharacter.toLowerCase(root, 
str)); 3818 foldSet.add(UCharacter.toTitleCase(root, str, bi)); 3819 foldSet.add(UCharacter.toUpperCase(root, str)); 3820 foldSet.add(UCharacter.foldCase(str, 0)); 3821 } 3822 } 3823 } 3824 set(foldSet); 3825 } 3826 return this; 3827 } 3828 3829 /** 3830 * Internal class for customizing UnicodeSet parsing of properties. 3831 * TODO: extend to allow customizing of codepoint ranges 3832 * @author medavis 3833 * @hide draft / provisional / internal are hidden on Android 3834 */ 3835 abstract public static class XSymbolTable implements SymbolTable { 3836 /** 3837 * Default constructor 3838 * @hide draft / provisional / internal are hidden on Android 3839 */ 3840 public XSymbolTable(){} 3841 /** 3842 * Supplies default implementation for SymbolTable (no action). 3843 * @hide draft / provisional / internal are hidden on Android 3844 */ 3845 @Override 3846 public UnicodeMatcher lookupMatcher(int i) { 3847 return null; 3848 } 3849 3850 /** 3851 * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style 3852 * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be. 3853 * <p> 3854 * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a 3855 * propertyName+propertyValue combination. 3856 * 3857 * @param propertyName 3858 * the name of the property 3859 * @param propertyValue 3860 * the name of the property value 3861 * @param result UnicodeSet value to change 3862 * a set to which the characters having the propertyName+propertyValue are to be added. 3863 * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters 3864 * with that property have been added to the UnicodeSet, and returns false if the 3865 * propertyName+propertyValue combination is not recognized (in which case result is unaltered). 
3866 * @hide draft / provisional / internal are hidden on Android 3867 */ 3868 public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { 3869 return false; 3870 } 3871 /** 3872 * Supplies default implementation for SymbolTable (no action). 3873 * @hide draft / provisional / internal are hidden on Android 3874 */ 3875 @Override 3876 public char[] lookup(String s) { 3877 return null; 3878 } 3879 /** 3880 * Supplies default implementation for SymbolTable (no action). 3881 * @hide draft / provisional / internal are hidden on Android 3882 */ 3883 @Override 3884 public String parseReference(String text, ParsePosition pos, int limit) { 3885 return null; 3886 } 3887 } 3888 3889 /** 3890 * Is this frozen, according to the Freezable interface? 3891 * 3892 * @return value 3893 */ 3894 @Override 3895 public boolean isFrozen() { 3896 return (bmpSet != null || stringSpan != null); 3897 } 3898 3899 /** 3900 * Freeze this class, according to the Freezable interface. 3901 * 3902 * @return this 3903 */ 3904 @Override 3905 public UnicodeSet freeze() { 3906 if (!isFrozen()) { 3907 // Do most of what compact() does before freezing because 3908 // compact() will not work when the set is frozen. 3909 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). 3910 3911 // Delete buffer first to defragment memory less. 3912 buffer = null; 3913 if (list.length > (len + GROW_EXTRA)) { 3914 // Make the capacity equal to len or 1. 3915 // We don't want to realloc of 0 size. 3916 int capacity = (len == 0) ? 1 : len; 3917 int[] oldList = list; 3918 list = new int[capacity]; 3919 for (int i = capacity; i-- > 0;) { 3920 list[i] = oldList[i]; 3921 } 3922 } 3923 3924 // Optimize contains() and span() and similar functions. 
3925 if (!strings.isEmpty()) { 3926 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL); 3927 } 3928 if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { 3929 // Optimize for code point spans. 3930 // There are no strings, or 3931 // all strings are irrelevant for span() etc. because 3932 // all of each string's code points are contained in this set. 3933 // However, fully contained strings are relevant for spanAndCount(), 3934 // so we create both objects. 3935 bmpSet = new BMPSet(list, len); 3936 } 3937 } 3938 return this; 3939 } 3940 3941 /** 3942 * Span a string using this UnicodeSet. 3943 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 3944 * @param s The string to be spanned 3945 * @param spanCondition The span condition 3946 * @return the length of the span 3947 */ 3948 public int span(CharSequence s, SpanCondition spanCondition) { 3949 return span(s, 0, spanCondition); 3950 } 3951 3952 /** 3953 * Span a string using this UnicodeSet. 3954 * If the start index is less than 0, span will start from 0. 3955 * If the start index is greater than the string length, span returns the string length. 3956 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 3957 * @param s The string to be spanned 3958 * @param start The start index that the span begins 3959 * @param spanCondition The span condition 3960 * @return the string index which ends the span (i.e. exclusive) 3961 */ 3962 public int span(CharSequence s, int start, SpanCondition spanCondition) { 3963 int end = s.length(); 3964 if (start < 0) { 3965 start = 0; 3966 } else if (start >= end) { 3967 return end; 3968 } 3969 if (bmpSet != null) { 3970 // Frozen set without strings, or no string is relevant for span(). 
3971 return bmpSet.span(s, start, spanCondition, null); 3972 } 3973 if (stringSpan != null) { 3974 return stringSpan.span(s, start, spanCondition); 3975 } else if (!strings.isEmpty()) { 3976 int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 3977 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 3978 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which); 3979 if (strSpan.needsStringSpanUTF16()) { 3980 return strSpan.span(s, start, spanCondition); 3981 } 3982 } 3983 3984 return spanCodePointsAndCount(s, start, spanCondition, null); 3985 } 3986 3987 /** 3988 * Same as span() but also counts the smallest number of set elements on any path across the span. 3989 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 3990 * @param outCount An output-only object (must not be null) for returning the count. 3991 * @return the limit (exclusive end) of the span 3992 * @deprecated This API is ICU internal only. 3993 * @hide original deprecated declaration 3994 * @hide draft / provisional / internal are hidden on Android 3995 */ 3996 @Deprecated 3997 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { 3998 if (outCount == null) { 3999 throw new IllegalArgumentException("outCount must not be null"); 4000 } 4001 int end = s.length(); 4002 if (start < 0) { 4003 start = 0; 4004 } else if (start >= end) { 4005 return end; 4006 } 4007 if (stringSpan != null) { 4008 // We might also have bmpSet != null, 4009 // but fully-contained strings are relevant for counting elements. 4010 return stringSpan.spanAndCount(s, start, spanCondition, outCount); 4011 } else if (bmpSet != null) { 4012 return bmpSet.span(s, start, spanCondition, outCount); 4013 } else if (!strings.isEmpty()) { 4014 int which = spanCondition == SpanCondition.NOT_CONTAINED ? 
UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED 4015 : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; 4016 which |= UnicodeSetStringSpan.WITH_COUNT; 4017 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which); 4018 return strSpan.spanAndCount(s, start, spanCondition, outCount); 4019 } 4020 4021 return spanCodePointsAndCount(s, start, spanCondition, outCount); 4022 } 4023 4024 private int spanCodePointsAndCount(CharSequence s, int start, 4025 SpanCondition spanCondition, OutputInt outCount) { 4026 // Pin to 0/1 values. 4027 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4028 4029 int c; 4030 int next = start; 4031 int length = s.length(); 4032 int count = 0; 4033 do { 4034 c = Character.codePointAt(s, next); 4035 if (spanContained != contains(c)) { 4036 break; 4037 } 4038 ++count; 4039 next += Character.charCount(c); 4040 } while (next < length); 4041 if (outCount != null) { outCount.value = count; } 4042 return next; 4043 } 4044 4045 /** 4046 * Span a string backwards (from the end) using this UnicodeSet. 4047 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 4048 * @param s The string to be spanned 4049 * @param spanCondition The span condition 4050 * @return The string index which starts the span (i.e. inclusive). 4051 */ 4052 public int spanBack(CharSequence s, SpanCondition spanCondition) { 4053 return spanBack(s, s.length(), spanCondition); 4054 } 4055 4056 /** 4057 * Span a string backwards (from the fromIndex) using this UnicodeSet. 4058 * If the fromIndex is less than 0, spanBack will return 0. 4059 * If fromIndex is greater than the string length, spanBack will start from the string length. 4060 * <p>To replace, count elements, or delete spans, see {@link android.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 
4061 * @param s The string to be spanned 4062 * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards 4063 * @param spanCondition The span condition 4064 * @return The string index which starts the span (i.e. inclusive). 4065 */ 4066 public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { 4067 if (fromIndex <= 0) { 4068 return 0; 4069 } 4070 if (fromIndex > s.length()) { 4071 fromIndex = s.length(); 4072 } 4073 if (bmpSet != null) { 4074 // Frozen set without strings, or no string is relevant for spanBack(). 4075 return bmpSet.spanBack(s, fromIndex, spanCondition); 4076 } 4077 if (stringSpan != null) { 4078 return stringSpan.spanBack(s, fromIndex, spanCondition); 4079 } else if (!strings.isEmpty()) { 4080 int which = (spanCondition == SpanCondition.NOT_CONTAINED) 4081 ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED 4082 : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; 4083 UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which); 4084 if (strSpan.needsStringSpanUTF16()) { 4085 return strSpan.spanBack(s, fromIndex, spanCondition); 4086 } 4087 } 4088 4089 // Pin to 0/1 values. 4090 boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); 4091 4092 int c; 4093 int prev = fromIndex; 4094 do { 4095 c = Character.codePointBefore(s, prev); 4096 if (spanContained != contains(c)) { 4097 break; 4098 } 4099 prev -= Character.charCount(c); 4100 } while (prev > 0); 4101 return prev; 4102 } 4103 4104 /** 4105 * Clone a thawed version of this class, according to the Freezable interface. 
4106 * @return the clone, not frozen 4107 */ 4108 @Override 4109 public UnicodeSet cloneAsThawed() { 4110 UnicodeSet result = new UnicodeSet(this); 4111 assert !result.isFrozen(); 4112 return result; 4113 } 4114 4115 // internal function 4116 private void checkFrozen() { 4117 if (isFrozen()) { 4118 throw new UnsupportedOperationException("Attempt to modify frozen object"); 4119 } 4120 } 4121 4122 // ************************ 4123 // Additional methods for integration with Generics and Collections 4124 // ************************ 4125 4126 /** 4127 * A struct-like class used for iteration through ranges, for faster iteration than by String. 4128 * Read about the restrictions on usage in {@link UnicodeSet#ranges()}. 4129 */ 4130 public static class EntryRange { 4131 /** 4132 * The starting code point of the range. 4133 */ 4134 public int codepoint; 4135 /** 4136 * The ending code point of the range 4137 */ 4138 public int codepointEnd; 4139 4140 EntryRange() { 4141 } 4142 4143 /** 4144 * {@inheritDoc} 4145 */ 4146 @Override 4147 public String toString() { 4148 StringBuilder b = new StringBuilder(); 4149 return ( 4150 codepoint == codepointEnd ? _appendToPat(b, codepoint, false) 4151 : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false)) 4152 .toString(); 4153 } 4154 } 4155 4156 /** 4157 * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points. 4158 * The UnicodeSet must not be altered during the iteration. 4159 * The EntryRange instance is the same each time; the contents are just reset. 4160 * 4161 * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings. 4162 * 4163 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4164 * Do not alter the UnicodeSet while iterating. 
4165 * 4166 * <pre> 4167 * // Sample code 4168 * for (EntryRange range : us1.ranges()) { 4169 * // do something with code points between range.codepoint and range.codepointEnd; 4170 * } 4171 * for (String s : us1.strings()) { 4172 * // do something with each string; 4173 * } 4174 * </pre> 4175 */ 4176 public Iterable<EntryRange> ranges() { 4177 return new EntryRangeIterable(); 4178 } 4179 4180 private class EntryRangeIterable implements Iterable<EntryRange> { 4181 @Override 4182 public Iterator<EntryRange> iterator() { 4183 return new EntryRangeIterator(); 4184 } 4185 } 4186 4187 private class EntryRangeIterator implements Iterator<EntryRange> { 4188 int pos; 4189 EntryRange result = new EntryRange(); 4190 4191 @Override 4192 public boolean hasNext() { 4193 return pos < len-1; 4194 } 4195 @Override 4196 public EntryRange next() { 4197 if (pos < len-1) { 4198 result.codepoint = list[pos++]; 4199 result.codepointEnd = list[pos++]-1; 4200 } else { 4201 throw new NoSuchElementException(); 4202 } 4203 return result; 4204 } 4205 @Override 4206 public void remove() { 4207 throw new UnsupportedOperationException(); 4208 } 4209 } 4210 4211 4212 /** 4213 * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}. 4214 * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification. 4215 * Do not alter the UnicodeSet while iterating. 4216 * @see java.util.Set#iterator() 4217 */ 4218 @Override 4219 public Iterator<String> iterator() { 4220 return new UnicodeSetIterator2(this); 4221 } 4222 4223 // Cover for string iteration. 
private static class UnicodeSetIterator2 implements Iterator<String> {
    // Returns each code point of the source set's ranges as a String,
    // then each of the source set's strings.
    // Invariants:
    // sourceList != null then sourceList[item] is a valid character
    // sourceList == null then delegates to stringIterator
    private int[] sourceList;
    private int len;        // source.len - 1; presumably excludes a trailing terminator entry -- TODO confirm
    private int item;       // index of the next entry to read from sourceList
    private int current;    // next code point to return
    private int limit;      // exclusive end of the range being walked
    private TreeSet<String> sourceStrings;
    private Iterator<String> stringIterator;
    private char[] buffer;  // lazily allocated scratch space for surrogate pairs

    UnicodeSetIterator2(UnicodeSet source) {
        // set according to invariants
        len = source.len - 1;
        if (len > 0) {
            sourceStrings = source.strings;
            sourceList = source.list;
            current = sourceList[item++];
            limit = sourceList[item++];
        } else {
            // No ranges: go straight to the strings.
            stringIterator = source.strings.iterator();
            sourceList = null;
        }
    }

    /* (non-Javadoc)
     * @see java.util.Iterator#hasNext()
     */
    @Override
    public boolean hasNext() {
        return sourceList != null || stringIterator.hasNext();
    }

    /* (non-Javadoc)
     * @see java.util.Iterator#next()
     */
    @Override
    public String next() {
        if (sourceList == null) {
            return stringIterator.next();
        }
        int codepoint = current++;
        // we have the codepoint we need, but we may need to adjust the state
        if (current >= limit) {
            if (item >= len) {
                // All ranges consumed: subsequent calls are served from the strings.
                stringIterator = sourceStrings.iterator();
                sourceList = null;
            } else {
                current = sourceList[item++];
                limit = sourceList[item++];
            }
        }
        // Now return. Single code point is easy
        if (codepoint <= 0xFFFF) {
            return String.valueOf((char)codepoint);
        }
        // But Java lacks a valueOfCodePoint, so we handle ourselves for speed
        // allocate a buffer the first time, to make conversion faster.
        if (buffer == null) {
            buffer = new char[2];
        }
        // compute ourselves, to save tests and calls
        int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT;
        buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE);
        buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE);
        return String.valueOf(buffer);
    }

    /* (non-Javadoc)
     * @see java.util.Iterator#remove()
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}

/**
 * Returns true if every element of the collection is contained in this set
 * (and therefore true for an empty collection).
 * @see #containsAll(android.icu.text.UnicodeSet)
 */
public <T extends CharSequence> boolean containsAll(Iterable<T> collection) {
    for (T o : collection) {
        if (!contains(o)) {
            return false;
        }
    }
    return true;
}

/**
 * Returns true if no element of the collection is contained in this set
 * (and therefore true for an empty collection).
 * @see #containsNone(android.icu.text.UnicodeSet)
 */
public <T extends CharSequence> boolean containsNone(Iterable<T> collection) {
    for (T o : collection) {
        if (contains(o)) {
            return false;
        }
    }
    return true;
}

/**
 * Returns true if at least one element of the collection is contained in this set;
 * this is the negation of {@code containsNone(collection)}.
 * @see #containsAll(android.icu.text.UnicodeSet)
 */
public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) {
    return !containsNone(collection);
}

/**
 * Adds each of the given items to this set.
 * @return this
 * @see #addAll(android.icu.text.UnicodeSet)
 */
@SuppressWarnings("unchecked")  // See ticket #11395, this is safe.
public <T extends CharSequence> UnicodeSet addAll(T... collection) {
    checkFrozen();
    for (T str : collection) {
        add(str);
    }
    return this;
}


/**
 * Removes each element of the collection from this set.
 * @return this
 * @see #removeAll(android.icu.text.UnicodeSet)
 */
public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) {
    checkFrozen();
    for (T o : collection) {
        remove(o);
    }
    return this;
}

/**
 * Retains only the elements that are also in the collection. Implemented by
 * building a temporary UnicodeSet from the collection and retaining that.
 * @return this
 * @see #retainAll(android.icu.text.UnicodeSet)
 */
public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) {
    checkFrozen();
    // TODO optimize
    UnicodeSet toRetain = new UnicodeSet();
    toRetain.addAll(collection);
    retainAll(toRetain);
    return this;
}

/**
 * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}.
 */
public enum ComparisonStyle {
    /**
     * Compare sizes first; the smaller set sorts first. Sets of equal size are compared by content.
     */
    SHORTER_FIRST,
    /**
     * Compare by content only, without comparing sizes first.
     */
    LEXICOGRAPHIC,
    /**
     * Compare sizes first; the larger set sorts first. Sets of equal size are compared by content.
     */
    LONGER_FIRST
}

/**
 * Compares UnicodeSets, where shorter come first, and otherwise lexicographically
 * (according to the comparison of the first characters that differ).
 * @see java.lang.Comparable#compareTo(java.lang.Object)
 */
@Override
public int compareTo(UnicodeSet o) {
    return compareTo(o, ComparisonStyle.SHORTER_FIRST);
}
/**
 * Compares UnicodeSets, in three different ways.
 * @param o the set to compare with
 * @param style whether sizes are compared first and, if so, which size sorts first
 * @see java.lang.Comparable#compareTo(java.lang.Object)
 */
public int compareTo(UnicodeSet o, ComparisonStyle style) {
    if (style != ComparisonStyle.LEXICOGRAPHIC) {
        int diff = size() - o.size();
        if (diff != 0) {
            // diff < 0 means this set is smaller: that sorts first only for SHORTER_FIRST.
            return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1;
        }
    }
    int result;
    // Walk both lists in parallel until the first differing entry or the end marker.
    for (int i = 0; ; ++i) {
        if (0 != (result = list[i] - o.list[i])) {
            // if either list ran out, compare to the last string
            if (list[i] == HIGH) {
                if (strings.isEmpty()) return 1;
                String item = strings.first();
                return compare(item, o.list[i]);
            }
            if (o.list[i] == HIGH) {
                if (o.strings.isEmpty()) return -1;
                String item = o.strings.first();
                int compareResult = compare(item, list[i]);
                return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order.
            }
            // otherwise return the result if even index, or the reversal if not
            return (i & 1) == 0 ? result : -result;
        }
        if (list[i] == HIGH) {
            break;
        }
    }
    // Lists are identical: the strings decide.
    return compare(strings, o.strings);
}

/**
 * Compares the elements of this set, in iteration order, with the elements of the
 * other iterable, using {@link #compare(Iterable, Iterable)}.
 */
public int compareTo(Iterable<String> other) {
    return compare(this, other);
}

/**
 * Utility to compare a string to a code point.
 * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString())
 * and comparing, but much faster (no object creation).
 * Actually, there is one difference; a null compares as less.
 * Note that this (=String) order is UTF-16 order -- *not* code point order.
 * @hide unsupported on Android
 */

public static int compare(CharSequence string, int codePoint) {
    return CharSequences.compare(string, codePoint);
}

/**
 * Utility to compare a string to a code point.
 * Same results as turning the code point into a string and comparing, but much faster (no object creation).
 * Actually, there is one difference; a null compares as less.
 * Note that this (=String) order is UTF-16 order -- *not* code point order.
 * @hide unsupported on Android
 */
public static int compare(int codePoint, CharSequence string) {
    // Same comparison as compare(string, codePoint) with the sign flipped.
    return -CharSequences.compare(string, codePoint);
}


/**
 * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered,
 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
 * That means that sets can't be compared directly with this method, unless they are TreeSets without
 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of
 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances.
 * @hide unsupported on Android
 */
public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) {
    return compare(collection1.iterator(), collection2.iterator());
}

/**
 * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered,
 * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
 * That means that sets can't be compared directly with this method, unless they are TreeSets without
 * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of
 * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances.
 * @deprecated This API is ICU internal only.
 * @hide original deprecated declaration
 * @hide draft / provisional / internal are hidden on Android
 */
@Deprecated
public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) {
    while (true) {
        // The iterator that runs out first compares as less; both running out together means equal.
        if (!first.hasNext()) {
            return other.hasNext() ? -1 : 0;
        } else if (!other.hasNext()) {
            return 1;
        }
        T item1 = first.next();
        T item2 = other.next();
        int result = item1.compareTo(item2);
        if (result != 0) {
            return result;
        }
    }
}


/**
 * Utility to compare two collections, optionally by size, and then lexicographically.
 * @hide unsupported on Android
 */
public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) {
    if (style != ComparisonStyle.LEXICOGRAPHIC) {
        int diff = collection1.size() - collection2.size();
        if (diff != 0) {
            return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1;
        }
    }
    return compare(collection1, collection2);
}

/**
 * Utility for adding the contents of an iterable to a collection.
 * @return the target collection
 * @hide unsupported on Android
 */
public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) {
    for (T item : source) {
        target.add(item);
    }
    return target;
}

/**
 * Utility for adding the contents of an iterable to an array, starting at index 0.
 * The array is not resized, so it must be large enough to hold every item.
 * @return the target array
 * @hide unsupported on Android
 */
public static <T> T[] addAllTo(Iterable<T> source, T[] target) {
    int i = 0;
    for (T item : source) {
        target[i++] = item;
    }
    return target;
}

/**
 * For iterating through the strings in the set. Example:
 * <pre>
 * for (String key : myUnicodeSet.strings()) {
 *   doSomethingWith(key);
 * }
 * </pre>
 * The returned collection is an unmodifiable view.
 */
public Collection<String> strings() {
    return Collections.unmodifiableSortedSet(strings);
}

/**
 * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE.
 * @deprecated This API is ICU internal only.
 * @hide original deprecated declaration
 * @hide draft / provisional / internal are hidden on Android
 */
@Deprecated
public static int getSingleCodePoint(CharSequence s) {
    return CharSequences.getSingleCodePoint(s);
}

/**
 * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set.
 * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E
 * if the dontCare set includes unassigned characters (for a particular version of Unicode).
 * @param dontCare Set with the don't-care characters for spanning
 * @return the input set, modified
 * @deprecated This API is ICU internal only.
 * @hide original deprecated declaration
 * @hide draft / provisional / internal are hidden on Android
 */
@Deprecated
public UnicodeSet addBridges(UnicodeSet dontCare) {
    UnicodeSet notInInput = new UnicodeSet(this).complement();
    for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) {
        // Only fill interior gaps: skip a gap starting at 0 or ending at 0x10FFFF, skip strings,
        // and require the whole gap to be covered by dontCare.
        if (it.codepoint != 0 && it.codepoint != UnicodeSetIterator.IS_STRING && it.codepointEnd != 0x10FFFF && dontCare.contains(it.codepoint,it.codepointEnd)) {
            add(it.codepoint,it.codepointEnd);
        }
    }
    return this;
}

/**
 * Find the first index at or after fromIndex where the UnicodeSet matches at that index.
 * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match.
 * If there is no match, length is returned.
 * @deprecated This API is ICU internal only. Use span instead.
 * @hide original deprecated declaration
 * @hide draft / provisional / internal are hidden on Android
 */
@Deprecated
public int findIn(CharSequence value, int fromIndex, boolean findNot) {
    //TODO add strings, optimize, using ICU4C algorithms
    int cp;
    for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) {
        cp = UTF16.charAt(value, fromIndex);
        if (contains(cp) != findNot) {
            break;
        }
    }
    return fromIndex;
}

/**
 * Find the last index before fromIndex where the UnicodeSet matches at that index.
 * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match.
 * If there is no match, -1 is returned.
 * BEFORE index is not in the UnicodeSet.
 * @deprecated This API is ICU internal only. Use spanBack instead.
 * @hide original deprecated declaration
 * @hide draft / provisional / internal are hidden on Android
 */
@Deprecated
public int findLastIn(CharSequence value, int fromIndex, boolean findNot) {
    //TODO add strings, optimize, using ICU4C algorithms
    int cp;
    fromIndex -= 1;
    for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) {
        cp = UTF16.charAt(value, fromIndex);
        if (contains(cp) != findNot) {
            break;
        }
    }
    return fromIndex < 0 ? -1 : fromIndex;
}

/**
 * Strips code points from source. If matches is true, strip all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match.
 * @param source The source of the CharSequence to strip from.
 * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object.
 * @return The string after it has been stripped.
 * @deprecated This API is ICU internal only. Use replaceFrom.
 * @hide original deprecated declaration
 * @hide draft / provisional / internal are hidden on Android
 */
@Deprecated
public String stripFrom(CharSequence source, boolean matches) {
    StringBuilder result = new StringBuilder();
    for (int pos = 0; pos < source.length();) {
        // [pos, inside) is the run to keep; it is followed by a run to drop.
        int inside = findIn(source, pos, !matches);
        result.append(source.subSequence(pos, inside));
        pos = findIn(source, inside, matches); // get next start
    }
    return result.toString();
}

/**
 * Argument values for whether span() and similar functions continue while the current character is contained vs.
 * not contained in the set.
 * <p>
 * The functionality is straightforward for sets with only single code points, without strings (which is the common
 * case):
 * <ul>
 * <li>CONTAINED and SIMPLE work the same.
 * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
 * <li>span() and spanBack() partition any string the
 * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
 * <li>Using a
 * complemented (inverted) set and the opposite span conditions yields the same results.
 * </ul>
 * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
 * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
 * strings:
 * <ul>
 * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
 * Therefore, complementing both the set and the span conditions may yield different results.
 * <li>When starting spans
 * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
 * because a set string may start before the later position.
 * <li>span(SIMPLE) may be shorter than
 * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
 * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
 * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
 * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
 * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
 * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
 * </ul>
 * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
 * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
 * be used.
 * <p>
 * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
 * boundaries, never in the middle of a surrogate pair.
 */
public enum SpanCondition {
    /**
     * Continues a span() while there is no set element at the current position.
     * Increments by one code point at a time.
     * Stops before the first set element (character or string).
     * (For code points only, this is like while contains(current)==false).
     * <p>
     * When span() returns, the substring between where it started and the position it returned consists only of
     * characters that are not in the set, and none of its strings overlap with the span.
     */
    NOT_CONTAINED,

    /**
     * Spans the longest substring that is a concatenation of set elements (characters or strings).
     * (For characters only, this is like while contains(current)==true).
     * <p>
     * When span() returns, the substring between where it started and the position it returned consists only of set
     * elements (characters or strings) that are in the set.
     * <p>
     * If a set contains strings, then the span will be the longest substring for which there
     * exists at least one non-overlapping concatenation of set elements (characters or strings).
     * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
     * (Java/ICU/Perl regex stops at the first match of an OR.)
     */
    CONTAINED,

    /**
     * Continues a span() while there is a set element at the current position.
     * Increments by the longest matching element at each position.
     * (For characters only, this is like while contains(current)==true).
     * <p>
     * When span() returns, the substring between where it started and the position it returned consists only of set
     * elements (characters or strings) that are in the set.
     * <p>
     * If a set only contains single characters, then this is the same as CONTAINED.
     * <p>
     * If a set contains strings, then the span will be the longest substring with a match at each position with the
     * longest single set element (character or string).
     * <p>
     * Use this span condition together with other longest-match algorithms, such as ICU converters
     * (ucnv_getUnicodeSet()).
     */
    SIMPLE,

    /**
     * One more than the last span condition.
     */
    CONDITION_COUNT
}

/**
 * Get the default symbol table. Null means ordinary processing. For internal use only.
 * @return the symbol table
 * @deprecated This API is ICU internal only.
 * @hide original deprecated declaration
 * @hide draft / provisional / internal are hidden on Android
 */
@Deprecated
public static XSymbolTable getDefaultXSymbolTable() {
    return XSYMBOL_TABLE;
}

/**
 * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing
 * of UnicodeSets.
 * <p>
 * WARNING: If this function is used with a UnicodeProperty, and the
 * Unassigned characters (gc=Cn) are different than in ICU, you MUST call
 * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
 * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
 *
 * @param xSymbolTable the new default symbol table.
 * @deprecated This API is ICU internal only.
 * @hide original deprecated declaration
 * @hide draft / provisional / internal are hidden on Android
 */
@Deprecated
public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
    INCLUSIONS = null; // If the properties override inclusions, these have to be regenerated.
    XSYMBOL_TABLE = xSymbolTable;
}
} // closes the enclosing class, whose header lies outside this chunk
//eof