Home | History | Annotate | Download | only in text
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 1996-2016, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.text;
     10 
     11 import java.io.IOException;
     12 import java.text.ParsePosition;
     13 import java.util.ArrayList;
     14 import java.util.Collection;
     15 import java.util.Collections;
     16 import java.util.Iterator;
     17 import java.util.NoSuchElementException;
     18 import java.util.TreeSet;
     19 
     20 import com.ibm.icu.impl.BMPSet;
     21 import com.ibm.icu.impl.Norm2AllModes;
     22 import com.ibm.icu.impl.PatternProps;
     23 import com.ibm.icu.impl.RuleCharacterIterator;
     24 import com.ibm.icu.impl.SortedSetRelation;
     25 import com.ibm.icu.impl.StringRange;
     26 import com.ibm.icu.impl.UBiDiProps;
     27 import com.ibm.icu.impl.UCaseProps;
     28 import com.ibm.icu.impl.UCharacterProperty;
     29 import com.ibm.icu.impl.UPropertyAliases;
     30 import com.ibm.icu.impl.UnicodeSetStringSpan;
     31 import com.ibm.icu.impl.Utility;
     32 import com.ibm.icu.lang.CharSequences;
     33 import com.ibm.icu.lang.UCharacter;
     34 import com.ibm.icu.lang.UProperty;
     35 import com.ibm.icu.lang.UScript;
     36 import com.ibm.icu.util.Freezable;
     37 import com.ibm.icu.util.ICUUncheckedIOException;
     38 import com.ibm.icu.util.OutputInt;
     39 import com.ibm.icu.util.ULocale;
     40 import com.ibm.icu.util.VersionInfo;
     41 
     42 /**
     43  * A mutable set of Unicode characters and multicharacter strings.
     44  * Objects of this class represent <em>character classes</em> used
 * in regular expressions. A character class specifies a subset of Unicode
     46  * code points.  Legal code points are U+0000 to U+10FFFF, inclusive.
     47  *
     48  * Note: method freeze() will not only make the set immutable, but
     49  * also makes important methods much higher performance:
     50  * contains(c), containsNone(...), span(...), spanBack(...) etc.
     51  * After the object is frozen, any subsequent call that wants to change
     52  * the object will throw UnsupportedOperationException.
     53  *
     54  * <p>The UnicodeSet class is not designed to be subclassed.
     55  *
     56  * <p><code>UnicodeSet</code> supports two APIs. The first is the
     57  * <em>operand</em> API that allows the caller to modify the value of
     58  * a <code>UnicodeSet</code> object. It conforms to Java 2's
     59  * <code>java.util.Set</code> interface, although
     60  * <code>UnicodeSet</code> does not actually implement that
     61  * interface. All methods of <code>Set</code> are supported, with the
     62  * modification that they take a character range or single character
     63  * instead of an <code>Object</code>, and they take a
     64  * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
     65  * operand API may be thought of in terms of boolean logic: a boolean
     66  * OR is implemented by <code>add</code>, a boolean AND is implemented
     67  * by <code>retain</code>, a boolean XOR is implemented by
     68  * <code>complement</code> taking an argument, and a boolean NOT is
     69  * implemented by <code>complement</code> with no argument.  In terms
     70  * of traditional set theory function names, <code>add</code> is a
     71  * union, <code>retain</code> is an intersection, <code>remove</code>
     72  * is an asymmetric difference, and <code>complement</code> with no
     73  * argument is a set complement with respect to the superset range
     74  * <code>MIN_VALUE-MAX_VALUE</code>
     75  *
     76  * <p>The second API is the
     77  * <code>applyPattern()</code>/<code>toPattern()</code> API from the
     78  * <code>java.text.Format</code>-derived classes.  Unlike the
     79  * methods that add characters, add categories, and control the logic
     80  * of the set, the method <code>applyPattern()</code> sets all
     81  * attributes of a <code>UnicodeSet</code> at once, based on a
     82  * string pattern.
     83  *
     84  * <p><b>Pattern syntax</b></p>
     85  *
     86  * Patterns are accepted by the constructors and the
     87  * <code>applyPattern()</code> methods and returned by the
     88  * <code>toPattern()</code> method.  These patterns follow a syntax
     89  * similar to that employed by version 8 regular expression character
     90  * classes.  Here are some simple examples:
     91  *
     92  * <blockquote>
     93  *   <table>
     94  *     <tr style="vertical-align: top">
     95  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[]</code></td>
     96  *       <td style="vertical-align: top;">No characters</td>
     97  *     </tr><tr style="vertical-align: top">
     98  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a]</code></td>
     99  *       <td style="vertical-align: top;">The character 'a'</td>
    100  *     </tr><tr style="vertical-align: top">
    101  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[ae]</code></td>
    102  *       <td style="vertical-align: top;">The characters 'a' and 'e'</td>
    103  *     </tr>
    104  *     <tr>
    105  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a-e]</code></td>
    106  *       <td style="vertical-align: top;">The characters 'a' through 'e' inclusive, in Unicode code
    107  *       point order</td>
    108  *     </tr>
    109  *     <tr>
    110  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\\u4E01]</code></td>
    111  *       <td style="vertical-align: top;">The character U+4E01</td>
    112  *     </tr>
    113  *     <tr>
    114  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[a{ab}{ac}]</code></td>
    115  *       <td style="vertical-align: top;">The character 'a' and the multicharacter strings &quot;ab&quot; and
    116  *       &quot;ac&quot;</td>
    117  *     </tr>
    118  *     <tr>
    119  *       <td style="white-space: nowrap; vertical-align: top; horizontal-align: left;"><code>[\p{Lu}]</code></td>
    120  *       <td style="vertical-align: top;">All characters in the general category Uppercase Letter</td>
    121  *     </tr>
    122  *   </table>
    123  * </blockquote>
    124  *
    125  * Any character may be preceded by a backslash in order to remove any special
    126  * meaning.  White space characters, as defined by the Unicode Pattern_White_Space property, are
    127  * ignored, unless they are escaped.
    128  *
    129  * <p>Property patterns specify a set of characters having a certain
    130  * property as defined by the Unicode standard.  Both the POSIX-like
    131  * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized.  For a
    132  * complete list of supported property patterns, see the User's Guide
    133  * for UnicodeSet at
    134  * <a href="http://www.icu-project.org/userguide/unicodeSet.html">
    135  * http://www.icu-project.org/userguide/unicodeSet.html</a>.
    136  * Actual determination of property data is defined by the underlying
    137  * Unicode database as implemented by UCharacter.
    138  *
    139  * <p>Patterns specify individual characters, ranges of characters, and
    140  * Unicode property sets.  When elements are concatenated, they
    141  * specify their union.  To complement a set, place a '^' immediately
    142  * after the opening '['.  Property patterns are inverted by modifying
 * their delimiters; "[:^foo:]" and "\P{foo}".  In any other location,
    144  * '^' has no special meaning.
    145  *
 * <p>Ranges are indicated by placing a '-' between two
    147  * characters, as in "a-z".  This specifies the range of all
    148  * characters from the left to the right, in Unicode order.  If the
    149  * left character is greater than or equal to the
    150  * right character it is a syntax error.  If a '-' occurs as the first
    151  * character after the opening '[' or '[^', or if it occurs as the
    152  * last character before the closing ']', then it is taken as a
    153  * literal.  Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
    154  * set of three characters, 'a', 'b', and '-'.
    155  *
    156  * <p>Sets may be intersected using the '&amp;' operator or the asymmetric
    157  * set difference may be taken using the '-' operator, for example,
    158  * "[[:L:]&amp;[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
 * with values less than 4096.  Operators ('&amp;' and '-') have equal
    160  * precedence and bind left-to-right.  Thus
    161  * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
    162  * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
    163  * difference; intersection is commutative.
    164  *
    165  * <table>
    166  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a]</code><td>The set containing 'a'
    167  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[a-z]</code><td>The set containing 'a'
    168  * through 'z' and all letters in between, in Unicode order
    169  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[^a-z]</code><td>The set containing
    170  * all characters but 'a' through 'z',
    171  * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
    172  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
    173  * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
    174  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]&amp;[<em>pat2</em>]]</code>
    175  * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
    176  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
    177  * <td>The asymmetric difference of sets specified by <em>pat1</em> and
    178  * <em>pat2</em>
    179  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:Lu:] or \p{Lu}</code>
    180  * <td>The set of characters having the specified
    181  * Unicode property; in
    182  * this case, Unicode uppercase letters
    183  * <tr style="vertical-align: top;"><td style="white-space: nowrap;"><code>[:^Lu:] or \P{Lu}</code>
    184  * <td>The set of characters <em>not</em> having the given
    185  * Unicode property
    186  * </table>
    187  *
    188  * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
    189  *
    190  * <p><b>Formal syntax</b></p>
    191  *
    192  * <blockquote>
    193  *   <table>
    194  *     <tr style="vertical-align: top">
    195  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern :=&nbsp; </code></td>
    196  *       <td style="vertical-align: top;"><code>('[' '^'? item* ']') |
    197  *       property</code></td>
    198  *     </tr>
    199  *     <tr style="vertical-align: top">
    200  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>item :=&nbsp; </code></td>
    201  *       <td style="vertical-align: top;"><code>char | (char '-' char) | pattern-expr<br>
    202  *       </code></td>
    203  *     </tr>
    204  *     <tr style="vertical-align: top">
    205  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>pattern-expr :=&nbsp; </code></td>
    206  *       <td style="vertical-align: top;"><code>pattern | pattern-expr pattern |
    207  *       pattern-expr op pattern<br>
    208  *       </code></td>
    209  *     </tr>
    210  *     <tr style="vertical-align: top">
    211  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>op :=&nbsp; </code></td>
    212  *       <td style="vertical-align: top;"><code>'&amp;' | '-'<br>
    213  *       </code></td>
    214  *     </tr>
    215  *     <tr style="vertical-align: top">
    216  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>special :=&nbsp; </code></td>
    217  *       <td style="vertical-align: top;"><code>'[' | ']' | '-'<br>
    218  *       </code></td>
    219  *     </tr>
    220  *     <tr style="vertical-align: top">
    221  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>char :=&nbsp; </code></td>
    222  *       <td style="vertical-align: top;"><em>any character that is not</em><code> special<br>
    223  *       | ('\\' </code><em>any character</em><code>)<br>
    224  *       | ('&#92;u' hex hex hex hex)<br>
    225  *       </code></td>
    226  *     </tr>
    227  *     <tr style="vertical-align: top">
    228  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex :=&nbsp; </code></td>
    229  *       <td style="vertical-align: top;"><em>any character for which
    230  *       </em><code>Character.digit(c, 16)</code><em>
    231  *       returns a non-negative result</em></td>
    232  *     </tr>
    233  *     <tr>
    234  *       <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property :=&nbsp; </code></td>
    235  *       <td style="vertical-align: top;"><em>a Unicode property set pattern</em></td>
    236  *     </tr>
    237  *   </table>
    238  *   <br>
    239  *   <table border="1">
    240  *     <tr>
    241  *       <td>Legend: <table>
    242  *         <tr>
    243  *           <td style="white-space: nowrap; vertical-align: top;"><code>a := b</code></td>
    244  *           <td style="width: 20; vertical-align: top;">&nbsp; </td>
    245  *           <td style="vertical-align: top;"><code>a</code> may be replaced by <code>b</code> </td>
    246  *         </tr>
    247  *         <tr>
    248  *           <td style="white-space: nowrap; vertical-align: top;"><code>a?</code></td>
    249  *           <td style="vertical-align: top;"></td>
    250  *           <td style="vertical-align: top;">zero or one instance of <code>a</code><br>
    251  *           </td>
    252  *         </tr>
    253  *         <tr>
    254  *           <td style="white-space: nowrap; vertical-align: top;"><code>a*</code></td>
    255  *           <td style="vertical-align: top;"></td>
 *           <td style="vertical-align: top;">zero or more instances of <code>a</code><br>
    257  *           </td>
    258  *         </tr>
    259  *         <tr>
    260  *           <td style="white-space: nowrap; vertical-align: top;"><code>a | b</code></td>
    261  *           <td style="vertical-align: top;"></td>
    262  *           <td style="vertical-align: top;">either <code>a</code> or <code>b</code><br>
    263  *           </td>
    264  *         </tr>
    265  *         <tr>
    266  *           <td style="white-space: nowrap; vertical-align: top;"><code>'a'</code></td>
    267  *           <td style="vertical-align: top;"></td>
    268  *           <td style="vertical-align: top;">the literal string between the quotes </td>
    269  *         </tr>
    270  *       </table>
    271  *       </td>
    272  *     </tr>
    273  *   </table>
    274  * </blockquote>
    275  * <p>To iterate over contents of UnicodeSet, the following are available:
    276  * <ul><li>{@link #ranges()} to iterate through the ranges</li>
    277  * <li>{@link #strings()} to iterate through the strings</li>
    278  * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
    279  * That method is, however, not particularly efficient, since it "boxes" each code point into a String.
    280  * </ul>
    281  * All of the above can be used in <b>for</b> loops.
    282  * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
    283  * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
    284  *
    285  * @author Alan Liu
    286  * @stable ICU 2.0
    287  * @see UnicodeSetIterator
    288  * @see UnicodeSetSpanner
    289  */
    290 public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Comparable<UnicodeSet>, Freezable<UnicodeSet> {
    291 
    292     /**
    293      * Constant for the empty set.
    294      * @stable ICU 4.8
    295      */
    296     public static final UnicodeSet EMPTY = new UnicodeSet().freeze();
    297     /**
    298      * Constant for the set of all code points. (Since UnicodeSets can include strings, does not include everything that a UnicodeSet can.)
    299      * @stable ICU 4.8
    300      */
    301     public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze();
    302 
    private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the function processing
    304 
    305     private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
    306     private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
    307     // 110000 for codepoints
    308 
    309     /**
    310      * Minimum value that can be stored in a UnicodeSet.
    311      * @stable ICU 2.0
    312      */
    313     public static final int MIN_VALUE = LOW;
    314 
    315     /**
    316      * Maximum value that can be stored in a UnicodeSet.
    317      * @stable ICU 2.0
    318      */
    319     public static final int MAX_VALUE = HIGH - 1;
    320 
    321     private int len;      // length used; list may be longer to minimize reallocs
    322     private int[] list;   // MUST be terminated with HIGH
    323     private int[] rangeList; // internal buffer
    324     private int[] buffer; // internal buffer
    325 
    326     // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
    327     // is not private so that UnicodeSetIterator can get access
    328     TreeSet<String> strings = new TreeSet<String>();
    329 
    330     /**
    331      * The pattern representation of this set.  This may not be the
    332      * most economical pattern.  It is the pattern supplied to
    333      * applyPattern(), with variables substituted and whitespace
    334      * removed.  For sets constructed without applyPattern(), or
    335      * modified using the non-pattern API, this string will be null,
    336      * indicating that toPattern() must generate a pattern
    337      * representation from the inversion list.
    338      */
    339     private String pat = null;
    340 
    341     private static final int START_EXTRA = 16;         // initial storage. Must be >= 0
    342     private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
    343 
    344     // Special property set IDs
    345     private static final String ANY_ID   = "ANY";   // [\u0000-\U0010FFFF]
    346     private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
    347     private static final String ASSIGNED = "Assigned"; // [:^Cn:]
    348 
    349     /**
    350      * A set of all characters _except_ the second through last characters of
    351      * certain ranges.  These ranges are ranges of characters whose
    352      * properties are all exactly alike, e.g. CJK Ideographs from
    353      * U+4E00 to U+9FA5.
    354      */
    355     private static UnicodeSet INCLUSIONS[] = null;
    356 
    357     private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null.
    358     private volatile UnicodeSetStringSpan stringSpan;
    359     //----------------------------------------------------------------
    360     // Public API
    361     //----------------------------------------------------------------
    362 
    363     /**
    364      * Constructs an empty set.
    365      * @stable ICU 2.0
    366      */
    367     public UnicodeSet() {
    368         list = new int[1 + START_EXTRA];
    369         list[len++] = HIGH;
    370     }
    371 
    /**
     * Constructs a copy of an existing set.
     * @param other the set whose contents are copied into the new set
     * @stable ICU 2.0
     */
    public UnicodeSet(UnicodeSet other) {
        // All copying is delegated to set(UnicodeSet).
        this.set(other);
    }
    379 
    /**
     * Constructs a set containing the given range. If <code>start &gt;
     * end</code> then an empty set is created.
     *
     * @param start first character, inclusive, of range
     * @param end last character, inclusive, of range
     * @stable ICU 2.0
     */
    public UnicodeSet(int start, int end) {
        this();
        // Starting from the empty set, complementing [start, end] yields that range.
        complement(start, end);
    }
    392 
    393     /**
    394      * Quickly constructs a set from a set of ranges &lt;s0, e0, s1, e1, s2, e2, ..., sn, en&gt;.
    395      * There must be an even number of integers, and they must be all greater than zero,
    396      * all less than or equal to Character.MAX_CODE_POINT.
    397      * In each pair (..., si, ei, ...) it must be true that si &lt;= ei
    398      * Between adjacent pairs (...ei, sj...), it must be true that ei+1 &lt; sj
    399      * @param pairs pairs of character representing ranges
    400      * @stable ICU 4.4
    401      */
    402     public UnicodeSet(int... pairs) {
    403         if ((pairs.length & 1) != 0) {
    404             throw new IllegalArgumentException("Must have even number of integers");
    405         }
    406         list = new int[pairs.length + 1]; // don't allocate extra space, because it is likely that this is a fixed set.
    407         len = list.length;
    408         int last = -1; // used to ensure that the results are monotonically increasing.
    409         int i = 0;
    410         while (i < pairs.length) {
    411             // start of pair
    412             int start = pairs[i];
    413             if (last >= start) {
    414                 throw new IllegalArgumentException("Must be monotonically increasing.");
    415             }
    416             list[i++] = last = start;
    417             // end of pair
    418             int end = pairs[i] + 1;
    419             if (last >= end) {
    420                 throw new IllegalArgumentException("Must be monotonically increasing.");
    421             }
    422             list[i++] = last = end;
    423         }
    424         list[i] = HIGH; // terminate
    425     }
    426 
    /**
     * Constructs a set from the given pattern, applying the
     * <code>IGNORE_SPACE</code> option.  See the class description
     * for the syntax of the pattern language.
     * @param pattern a string specifying what characters are in the set
     * @exception java.lang.IllegalArgumentException if the pattern contains
     * a syntax error.
     * @stable ICU 2.0
     */
    public UnicodeSet(String pattern) {
        this();
        // No parse position and no symbol table are supplied.
        this.applyPattern(pattern, null, null, IGNORE_SPACE);
    }
    439 
    440     /**
    441      * Constructs a set from the given pattern.  See the class description
    442      * for the syntax of the pattern language.
    443      * @param pattern a string specifying what characters are in the set
    444      * @param ignoreWhitespace if true, ignore Unicode Pattern_White_Space characters
    445      * @exception java.lang.IllegalArgumentException if the pattern contains
    446      * a syntax error.
    447      * @stable ICU 2.0
    448      */
    449     public UnicodeSet(String pattern, boolean ignoreWhitespace) {
    450         this();
    451         applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
    452     }
    453 
    /**
     * Constructs a set from the given pattern, parsed with the given
     * options.  See the class description for the syntax of the
     * pattern language.
     * @param pattern a string specifying what characters are in the set
     * @param options a bitmask indicating which options to apply.
     * Valid options are IGNORE_SPACE and CASE.
     * @exception java.lang.IllegalArgumentException if the pattern contains
     * a syntax error.
     * @stable ICU 3.8
     */
    public UnicodeSet(String pattern, int options) {
        this();
        // No parse position and no symbol table are supplied.
        this.applyPattern(pattern, null, null, options);
    }
    468 
    /**
     * Constructs a set from the given pattern, starting at a parse
     * position and resolving variables through a symbol table.  The
     * <code>IGNORE_SPACE</code> option is applied.  See the class
     * description for the syntax of the pattern language.
     * @param pattern a string specifying what characters are in the set
     * @param pos on input, the position in pattern at which to start parsing.
     * On output, the position after the last character parsed.
     * @param symbols a symbol table mapping variables to char[] arrays
     * and chars to UnicodeSets
     * @exception java.lang.IllegalArgumentException if the pattern
     * contains a syntax error.
     * @stable ICU 2.0
     */
    public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) {
        this();
        this.applyPattern(pattern, pos, symbols, IGNORE_SPACE);
    }
    485 
    /**
     * Constructs a set from the given pattern, starting at a parse
     * position, resolving variables through a symbol table, and parsing
     * with the given options.  See the class description for the syntax
     * of the pattern language.
     * @param pattern a string specifying what characters are in the set
     * @param pos on input, the position in pattern at which to start parsing.
     * On output, the position after the last character parsed.
     * @param symbols a symbol table mapping variables to char[] arrays
     * and chars to UnicodeSets
     * @param options a bitmask indicating which options to apply.
     * Valid options are IGNORE_SPACE and CASE.
     * @exception java.lang.IllegalArgumentException if the pattern
     * contains a syntax error.
     * @stable ICU 3.2
     */
    public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) {
        this();
        this.applyPattern(pattern, pos, symbols, options);
    }
    504 
    505 
    506     /**
    507      * Return a new set that is equivalent to this one.
    508      * @stable ICU 2.0
    509      */
    510     @Override
    511     public Object clone() {
    512         if (isFrozen()) {
    513             return this;
    514         }
    515         UnicodeSet result = new UnicodeSet(this);
    516         result.bmpSet = this.bmpSet;
    517         result.stringSpan = this.stringSpan;
    518         return result;
    519     }
    520 
    521     /**
    522      * Make this object represent the range <code>start - end</code>.
    523      * If <code>end &gt; start</code> then this object is set to an
    524      * an empty range.
    525      *
    526      * @param start first character in the set, inclusive
    527      * @param end last character in the set, inclusive
    528      * @stable ICU 2.0
    529      */
    530     public UnicodeSet set(int start, int end) {
    531         checkFrozen();
    532         clear();
    533         complement(start, end);
    534         return this;
    535     }
    536 
    537     /**
    538      * Make this object represent the same set as <code>other</code>.
    539      * @param other a <code>UnicodeSet</code> whose value will be
    540      * copied to this object
    541      * @stable ICU 2.0
    542      */
    543     public UnicodeSet set(UnicodeSet other) {
    544         checkFrozen();
    545         list = other.list.clone();
    546         len = other.len;
    547         pat = other.pat;
    548         strings = new TreeSet<String>(other.strings);
    549         return this;
    550     }
    551 
    552     /**
    553      * Modifies this set to represent the set specified by the given pattern.
    554      * See the class description for the syntax of the pattern language.
    555      * Whitespace is ignored.
    556      * @param pattern a string specifying what characters are in the set
    557      * @exception java.lang.IllegalArgumentException if the pattern
    558      * contains a syntax error.
    559      * @stable ICU 2.0
    560      */
    561     public final UnicodeSet applyPattern(String pattern) {
    562         checkFrozen();
    563         return applyPattern(pattern, null, null, IGNORE_SPACE);
    564     }
    565 
    566     /**
    567      * Modifies this set to represent the set specified by the given pattern,
    568      * optionally ignoring whitespace.
    569      * See the class description for the syntax of the pattern language.
    570      * @param pattern a string specifying what characters are in the set
    571      * @param ignoreWhitespace if true then Unicode Pattern_White_Space characters are ignored
    572      * @exception java.lang.IllegalArgumentException if the pattern
    573      * contains a syntax error.
    574      * @stable ICU 2.0
    575      */
    576     public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) {
    577         checkFrozen();
    578         return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
    579     }
    580 
    581     /**
    582      * Modifies this set to represent the set specified by the given pattern,
    583      * optionally ignoring whitespace.
    584      * See the class description for the syntax of the pattern language.
    585      * @param pattern a string specifying what characters are in the set
    586      * @param options a bitmask indicating which options to apply.
    587      * Valid options are IGNORE_SPACE and CASE.
    588      * @exception java.lang.IllegalArgumentException if the pattern
    589      * contains a syntax error.
    590      * @stable ICU 3.8
    591      */
    592     public UnicodeSet applyPattern(String pattern, int options) {
    593         checkFrozen();
    594         return applyPattern(pattern, null, null, options);
    595     }
    596 
    597     /**
    598      * Return true if the given position, in the given pattern, appears
    599      * to be the start of a UnicodeSet pattern.
    600      * @stable ICU 2.0
    601      */
    602     public static boolean resemblesPattern(String pattern, int pos) {
    603         return ((pos+1) < pattern.length() &&
    604                 pattern.charAt(pos) == '[') ||
    605                 resemblesPropertyPattern(pattern, pos);
    606     }
    607 
    608     /**
    609      * TODO: create Appendable version of UTF16.append(buf, c),
    610      * maybe in new class Appendables?
    611      * @throws IOException
    612      */
    613     private static void appendCodePoint(Appendable app, int c) {
    614         assert 0 <= c && c <= 0x10ffff;
    615         try {
    616             if (c <= 0xffff) {
    617                 app.append((char) c);
    618             } else {
    619                 app.append(UTF16.getLeadSurrogate(c)).append(UTF16.getTrailSurrogate(c));
    620             }
    621         } catch (IOException e) {
    622             throw new ICUUncheckedIOException(e);
    623         }
    624     }
    625 
    626     /**
    627      * TODO: create class Appendables?
    628      * @throws IOException
    629      */
    630     private static void append(Appendable app, CharSequence s) {
    631         try {
    632             app.append(s);
    633         } catch (IOException e) {
    634             throw new ICUUncheckedIOException(e);
    635         }
    636     }
    637 
    638     /**
    639      * Append the <code>toPattern()</code> representation of a
    640      * string to the given <code>Appendable</code>.
    641      */
    642     private static <T extends Appendable> T _appendToPat(T buf, String s, boolean escapeUnprintable) {
    643         int cp;
    644         for (int i = 0; i < s.length(); i += Character.charCount(cp)) {
    645             cp = s.codePointAt(i);
    646             _appendToPat(buf, cp, escapeUnprintable);
    647         }
    648         return buf;
    649     }
    650 
    651     /**
    652      * Append the <code>toPattern()</code> representation of a
    653      * character to the given <code>Appendable</code>.
    654      */
    655     private static <T extends Appendable> T _appendToPat(T buf, int c, boolean escapeUnprintable) {
    656         try {
    657             if (escapeUnprintable && Utility.isUnprintable(c)) {
    658                 // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
    659                 // unprintable
    660                 if (Utility.escapeUnprintable(buf, c)) {
    661                     return buf;
    662                 }
    663             }
    664             // Okay to let ':' pass through
    665             switch (c) {
    666             case '[': // SET_OPEN:
    667             case ']': // SET_CLOSE:
    668             case '-': // HYPHEN:
    669             case '^': // COMPLEMENT:
    670             case '&': // INTERSECTION:
    671             case '\\': //BACKSLASH:
    672             case '{':
    673             case '}':
    674             case '$':
    675             case ':':
    676                 buf.append('\\');
    677                 break;
    678             default:
    679                 // Escape whitespace
    680                 if (PatternProps.isWhiteSpace(c)) {
    681                     buf.append('\\');
    682                 }
    683                 break;
    684             }
    685             appendCodePoint(buf, c);
    686             return buf;
    687         } catch (IOException e) {
    688             throw new ICUUncheckedIOException(e);
    689         }
    690     }
    691 
    692     /**
    693      * Returns a string representation of this set.  If the result of
    694      * calling this function is passed to a UnicodeSet constructor, it
    695      * will produce another set that is equal to this one.
    696      * @stable ICU 2.0
    697      */
    698     @Override
    699     public String toPattern(boolean escapeUnprintable) {
    700         if (pat != null && !escapeUnprintable) {
    701             return pat;
    702         }
    703         StringBuilder result = new StringBuilder();
    704         return _toPattern(result, escapeUnprintable).toString();
    705     }
    706 
    707     /**
    708      * Append a string representation of this set to result.  This will be
    709      * a cleaned version of the string passed to applyPattern(), if there
    710      * is one.  Otherwise it will be generated.
    711      */
    712     private <T extends Appendable> T _toPattern(T result,
    713             boolean escapeUnprintable) {
    714         if (pat == null) {
    715             return appendNewPattern(result, escapeUnprintable, true);
    716         }
    717         try {
    718             if (!escapeUnprintable) {
    719                 result.append(pat);
    720                 return result;
    721             }
    722             boolean oddNumberOfBackslashes = false;
    723             for (int i=0; i<pat.length(); ) {
    724                 int c = pat.codePointAt(i);
    725                 i += Character.charCount(c);
    726                 if (Utility.isUnprintable(c)) {
    727                     // If the unprintable character is preceded by an odd
    728                     // number of backslashes, then it has been escaped
    729                     // and we omit the last backslash.
    730                     Utility.escapeUnprintable(result, c);
    731                     oddNumberOfBackslashes = false;
    732                 } else if (!oddNumberOfBackslashes && c == '\\') {
    733                     // Temporarily withhold an odd-numbered backslash.
    734                     oddNumberOfBackslashes = true;
    735                 } else {
    736                     if (oddNumberOfBackslashes) {
    737                         result.append('\\');
    738                     }
    739                     appendCodePoint(result, c);
    740                     oddNumberOfBackslashes = false;
    741                 }
    742             }
    743             if (oddNumberOfBackslashes) {
    744                 result.append('\\');
    745             }
    746             return result;
    747         } catch (IOException e) {
    748             throw new ICUUncheckedIOException(e);
    749         }
    750     }
    751 
    752     /**
    753      * Generate and append a string representation of this set to result.
    754      * This does not use this.pat, the cleaned up copy of the string
    755      * passed to applyPattern().
    756      * @param result the buffer into which to generate the pattern
    757      * @param escapeUnprintable escape unprintable characters if true
    758      * @stable ICU 2.0
    759      */
    760     public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) {
    761         return _generatePattern(result, escapeUnprintable, true);
    762     }
    763 
    764     /**
    765      * Generate and append a string representation of this set to result.
    766      * This does not use this.pat, the cleaned up copy of the string
    767      * passed to applyPattern().
    768      * @param includeStrings if false, doesn't include the strings.
    769      * @stable ICU 3.8
    770      */
    771     public StringBuffer _generatePattern(StringBuffer result,
    772             boolean escapeUnprintable, boolean includeStrings) {
    773         return appendNewPattern(result, escapeUnprintable, includeStrings);
    774     }
    775 
    776     private <T extends Appendable> T appendNewPattern(
    777             T result, boolean escapeUnprintable, boolean includeStrings) {
    778         try {
    779             result.append('[');
    780 
    781             int count = getRangeCount();
    782 
    783             // If the set contains at least 2 intervals and includes both
    784             // MIN_VALUE and MAX_VALUE, then the inverse representation will
    785             // be more economical.
    786             if (count > 1 &&
    787                     getRangeStart(0) == MIN_VALUE &&
    788                     getRangeEnd(count-1) == MAX_VALUE) {
    789 
    790                 // Emit the inverse
    791                 result.append('^');
    792 
    793                 for (int i = 1; i < count; ++i) {
    794                     int start = getRangeEnd(i-1)+1;
    795                     int end = getRangeStart(i)-1;
    796                     _appendToPat(result, start, escapeUnprintable);
    797                     if (start != end) {
    798                         if ((start+1) != end) {
    799                             result.append('-');
    800                         }
    801                         _appendToPat(result, end, escapeUnprintable);
    802                     }
    803                 }
    804             }
    805 
    806             // Default; emit the ranges as pairs
    807             else {
    808                 for (int i = 0; i < count; ++i) {
    809                     int start = getRangeStart(i);
    810                     int end = getRangeEnd(i);
    811                     _appendToPat(result, start, escapeUnprintable);
    812                     if (start != end) {
    813                         if ((start+1) != end) {
    814                             result.append('-');
    815                         }
    816                         _appendToPat(result, end, escapeUnprintable);
    817                     }
    818                 }
    819             }
    820 
    821             if (includeStrings && strings.size() > 0) {
    822                 for (String s : strings) {
    823                     result.append('{');
    824                     _appendToPat(result, s, escapeUnprintable);
    825                     result.append('}');
    826                 }
    827             }
    828             result.append(']');
    829             return result;
    830         } catch (IOException e) {
    831             throw new ICUUncheckedIOException(e);
    832         }
    833     }
    834 
    835     /**
    836      * Returns the number of elements in this set (its cardinality)
    837      * Note than the elements of a set may include both individual
    838      * codepoints and strings.
    839      *
    840      * @return the number of elements in this set (its cardinality).
    841      * @stable ICU 2.0
    842      */
    843     public int size() {
    844         int n = 0;
    845         int count = getRangeCount();
    846         for (int i = 0; i < count; ++i) {
    847             n += getRangeEnd(i) - getRangeStart(i) + 1;
    848         }
    849         return n + strings.size();
    850     }
    851 
    852     /**
    853      * Returns <tt>true</tt> if this set contains no elements.
    854      *
    855      * @return <tt>true</tt> if this set contains no elements.
    856      * @stable ICU 2.0
    857      */
    858     public boolean isEmpty() {
    859         return len == 1 && strings.size() == 0;
    860     }
    861 
    862     /**
    863      * Implementation of UnicodeMatcher API.  Returns <tt>true</tt> if
    864      * this set contains any character whose low byte is the given
    865      * value.  This is used by <tt>RuleBasedTransliterator</tt> for
    866      * indexing.
    867      * @stable ICU 2.0
    868      */
    869     @Override
    870     public boolean matchesIndexValue(int v) {
    871         /* The index value v, in the range [0,255], is contained in this set if
    872          * it is contained in any pair of this set.  Pairs either have the high
    873          * bytes equal, or unequal.  If the high bytes are equal, then we have
    874          * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
    875          * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
    876          * Then v is contained if xx <= v || v <= yy.  (This is identical to the
    877          * time zone month containment logic.)
    878          */
    879         for (int i=0; i<getRangeCount(); ++i) {
    880             int low = getRangeStart(i);
    881             int high = getRangeEnd(i);
    882             if ((low & ~0xFF) == (high & ~0xFF)) {
    883                 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
    884                     return true;
    885                 }
    886             } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
    887                 return true;
    888             }
    889         }
    890         if (strings.size() != 0) {
    891             for (String s : strings) {
    892                 //if (s.length() == 0) {
    893                 //    // Empty strings match everything
    894                 //    return true;
    895                 //}
    896                 // assert(s.length() != 0); // We enforce this elsewhere
    897                 int c = UTF16.charAt(s, 0);
    898                 if ((c & 0xFF) == v) {
    899                     return true;
    900                 }
    901             }
    902         }
    903         return false;
    904     }
    905 
    906     /**
    907      * Implementation of UnicodeMatcher.matches().  Always matches the
    908      * longest possible multichar string.
    909      * @stable ICU 2.0
    910      */
    911     @Override
    912     public int matches(Replaceable text,
    913             int[] offset,
    914             int limit,
    915             boolean incremental) {
    916 
    917         if (offset[0] == limit) {
    918             // Strings, if any, have length != 0, so we don't worry
    919             // about them here.  If we ever allow zero-length strings
    920             // we much check for them here.
    921             if (contains(UnicodeMatcher.ETHER)) {
    922                 return incremental ? U_PARTIAL_MATCH : U_MATCH;
    923             } else {
    924                 return U_MISMATCH;
    925             }
    926         } else {
    927             if (strings.size() != 0) { // try strings first
    928 
    929                 // might separate forward and backward loops later
    930                 // for now they are combined
    931 
    932                 // TODO Improve efficiency of this, at least in the forward
    933                 // direction, if not in both.  In the forward direction we
    934                 // can assume the strings are sorted.
    935 
    936                 boolean forward = offset[0] < limit;
    937 
    938                 // firstChar is the leftmost char to match in the
    939                 // forward direction or the rightmost char to match in
    940                 // the reverse direction.
    941                 char firstChar = text.charAt(offset[0]);
    942 
    943                 // If there are multiple strings that can match we
    944                 // return the longest match.
    945                 int highWaterLength = 0;
    946 
    947                 for (String trial : strings) {
    948                     //if (trial.length() == 0) {
    949                     //    return U_MATCH; // null-string always matches
    950                     //}
    951                     // assert(trial.length() != 0); // We ensure this elsewhere
    952 
    953                     char c = trial.charAt(forward ? 0 : trial.length() - 1);
    954 
    955                     // Strings are sorted, so we can optimize in the
    956                     // forward direction.
    957                     if (forward && c > firstChar) break;
    958                     if (c != firstChar) continue;
    959 
    960                     int length = matchRest(text, offset[0], limit, trial);
    961 
    962                     if (incremental) {
    963                         int maxLen = forward ? limit-offset[0] : offset[0]-limit;
    964                         if (length == maxLen) {
    965                             // We have successfully matched but only up to limit.
    966                             return U_PARTIAL_MATCH;
    967                         }
    968                     }
    969 
    970                     if (length == trial.length()) {
    971                         // We have successfully matched the whole string.
    972                         if (length > highWaterLength) {
    973                             highWaterLength = length;
    974                         }
    975                         // In the forward direction we know strings
    976                         // are sorted so we can bail early.
    977                         if (forward && length < highWaterLength) {
    978                             break;
    979                         }
    980                         continue;
    981                     }
    982                 }
    983 
    984                 // We've checked all strings without a partial match.
    985                 // If we have full matches, return the longest one.
    986                 if (highWaterLength != 0) {
    987                     offset[0] += forward ? highWaterLength : -highWaterLength;
    988                     return U_MATCH;
    989                 }
    990             }
    991             return super.matches(text, offset, limit, incremental);
    992         }
    993     }
    994 
    995     /**
    996      * Returns the longest match for s in text at the given position.
    997      * If limit > start then match forward from start+1 to limit
    998      * matching all characters except s.charAt(0).  If limit < start,
    999      * go backward starting from start-1 matching all characters
   1000      * except s.charAt(s.length()-1).  This method assumes that the
   1001      * first character, text.charAt(start), matches s, so it does not
   1002      * check it.
   1003      * @param text the text to match
   1004      * @param start the first character to match.  In the forward
   1005      * direction, text.charAt(start) is matched against s.charAt(0).
   1006      * In the reverse direction, it is matched against
   1007      * s.charAt(s.length()-1).
   1008      * @param limit the limit offset for matching, either last+1 in
   1009      * the forward direction, or last-1 in the reverse direction,
   1010      * where last is the index of the last character to match.
   1011      * @return If part of s matches up to the limit, return |limit -
   1012      * start|.  If all of s matches before reaching the limit, return
   1013      * s.length().  If there is a mismatch between s and text, return
   1014      * 0
   1015      */
   1016     private static int matchRest (Replaceable text, int start, int limit, String s) {
   1017         int maxLen;
   1018         int slen = s.length();
   1019         if (start < limit) {
   1020             maxLen = limit - start;
   1021             if (maxLen > slen) maxLen = slen;
   1022             for (int i = 1; i < maxLen; ++i) {
   1023                 if (text.charAt(start + i) != s.charAt(i)) return 0;
   1024             }
   1025         } else {
   1026             maxLen = start - limit;
   1027             if (maxLen > slen) maxLen = slen;
   1028             --slen; // <=> slen = s.length() - 1;
   1029             for (int i = 1; i < maxLen; ++i) {
   1030                 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
   1031             }
   1032         }
   1033         return maxLen;
   1034     }
   1035 
    /**
     * Tests whether the text matches at the offset. If so, returns the end of the longest substring that it matches. If not, returns -1.
     * <p>Strings are tried first; if the best string result is shorter than
     * two chars (or there is none), the code point at <code>offset</code> is
     * tested with <code>contains</code> instead.
     * <p>NOTE(review): the value returned is <code>offset + lastLen</code>.
     * When nothing matches, <code>lastLen</code> is still -1, so the result
     * is <code>offset - 1</code>, which equals -1 only for offset 0. Confirm
     * whether callers rely on this.
     * @param text the text to match against
     * @param offset the index in <code>text</code> at which to match
     * @return <code>offset</code> plus the matched length (see the note above)
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public int matchesAt(CharSequence text, int offset) {
        // Length of the best match so far; -1 means nothing matched yet.
        int lastLen = -1;
        // Labeled block: "break strings" abandons the string search and
        // continues with the single-code-point test below.
        strings:
            if (strings.size() != 0) {
                char firstChar = text.charAt(offset);
                String trial = null;
                // find the first string starting with firstChar
                // NOTE(review): when firstStringChar == firstChar neither
                // test below fires, so the loop keeps advancing instead of
                // stopping there.  It ends either via "break strings" or
                // with the iterator exhausted and trial holding the last
                // string.  Confirm whether a plain "break" was intended.
                Iterator<String> it = strings.iterator();
                while (it.hasNext()) {
                    trial = it.next();
                    char firstStringChar = trial.charAt(0);
                    if (firstStringChar < firstChar) continue;
                    if (firstStringChar > firstChar) break strings;
                }

                // now keep checking string until we get the longest one
                for (;;) {
                    int tempLen = matchesAt(text, offset, trial);
                    // Stop as soon as a result is worse than the best so far.
                    if (lastLen > tempLen) break strings;
                    lastLen = tempLen;
                    if (!it.hasNext()) break;
                    trial = it.next();
                }
            }

        // No string match of two or more chars: fall back to testing the
        // code point at offset, which replaces any shorter string result.
        if (lastLen < 2) {
            int cp = UTF16.charAt(text, offset);
            if (contains(cp)) lastLen = UTF16.getCharCount(cp);
        }

        return offset+lastLen;
    }
   1074 
   1075     /**
   1076      * Does one string contain another, starting at a specific offset?
   1077      * @param text text to match
   1078      * @param offsetInText offset within that text
   1079      * @param substring substring to match at offset in text
   1080      * @return -1 if match fails, otherwise other.length()
   1081      */
   1082     // Note: This method was moved from CollectionUtilities
   1083     private static int matchesAt(CharSequence text, int offsetInText, CharSequence substring) {
   1084         int len = substring.length();
   1085         int textLength = text.length();
   1086         if (textLength + offsetInText > len) {
   1087             return -1;
   1088         }
   1089         int i = 0;
   1090         for (int j = offsetInText; i < len; ++i, ++j) {
   1091             char pc = substring.charAt(i);
   1092             char tc = text.charAt(j);
   1093             if (pc != tc) return -1;
   1094         }
   1095         return i;
   1096     }
   1097 
   1098     /**
   1099      * Implementation of UnicodeMatcher API.  Union the set of all
   1100      * characters that may be matched by this object into the given
   1101      * set.
   1102      * @param toUnionTo the set into which to union the source characters
   1103      * @stable ICU 2.2
   1104      */
   1105     @Override
   1106     public void addMatchSetTo(UnicodeSet toUnionTo) {
   1107         toUnionTo.addAll(this);
   1108     }
   1109 
   1110     /**
   1111      * Returns the index of the given character within this set, where
   1112      * the set is ordered by ascending code point.  If the character
   1113      * is not in this set, return -1.  The inverse of this method is
   1114      * <code>charAt()</code>.
   1115      * @return an index from 0..size()-1, or -1
   1116      * @stable ICU 2.0
   1117      */
   1118     public int indexOf(int c) {
   1119         if (c < MIN_VALUE || c > MAX_VALUE) {
   1120             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
   1121         }
   1122         int i = 0;
   1123         int n = 0;
   1124         for (;;) {
   1125             int start = list[i++];
   1126             if (c < start) {
   1127                 return -1;
   1128             }
   1129             int limit = list[i++];
   1130             if (c < limit) {
   1131                 return n + c - start;
   1132             }
   1133             n += limit - start;
   1134         }
   1135     }
   1136 
   1137     /**
   1138      * Returns the character at the given index within this set, where
   1139      * the set is ordered by ascending code point.  If the index is
   1140      * out of range, return -1.  The inverse of this method is
   1141      * <code>indexOf()</code>.
   1142      * @param index an index from 0..size()-1
   1143      * @return the character at the given index, or -1.
   1144      * @stable ICU 2.0
   1145      */
   1146     public int charAt(int index) {
   1147         if (index >= 0) {
   1148             // len2 is the largest even integer <= len, that is, it is len
   1149             // for even values and len-1 for odd values.  With odd values
   1150             // the last entry is UNICODESET_HIGH.
   1151             int len2 = len & ~1;
   1152             for (int i=0; i < len2;) {
   1153                 int start = list[i++];
   1154                 int count = list[i++] - start;
   1155                 if (index < count) {
   1156                     return start + index;
   1157                 }
   1158                 index -= count;
   1159             }
   1160         }
   1161         return -1;
   1162     }
   1163 
   1164     /**
   1165      * Adds the specified range to this set if it is not already
   1166      * present.  If this set already contains the specified range,
   1167      * the call leaves this set unchanged.  If <code>end &gt; start</code>
   1168      * then an empty range is added, leaving the set unchanged.
   1169      *
   1170      * @param start first character, inclusive, of range to be added
   1171      * to this set.
   1172      * @param end last character, inclusive, of range to be added
   1173      * to this set.
   1174      * @stable ICU 2.0
   1175      */
   1176     public UnicodeSet add(int start, int end) {
   1177         checkFrozen();
   1178         return add_unchecked(start, end);
   1179     }
   1180 
   1181     /**
   1182      * Adds all characters in range (uses preferred naming convention).
   1183      * @param start The index of where to start on adding all characters.
   1184      * @param end The index of where to end on adding all characters.
   1185      * @return a reference to this object
   1186      * @stable ICU 4.4
   1187      */
   1188     public UnicodeSet addAll(int start, int end) {
   1189         checkFrozen();
   1190         return add_unchecked(start, end);
   1191     }
   1192 
   1193     // for internal use, after checkFrozen has been called
   1194     private UnicodeSet add_unchecked(int start, int end) {
   1195         if (start < MIN_VALUE || start > MAX_VALUE) {
   1196             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
   1197         }
   1198         if (end < MIN_VALUE || end > MAX_VALUE) {
   1199             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
   1200         }
   1201         if (start < end) {
   1202             add(range(start, end), 2, 0);
   1203         } else if (start == end) {
   1204             add(start);
   1205         }
   1206         return this;
   1207     }
   1208 
   1209     //    /**
   1210     //     * Format out the inversion list as a string, for debugging.  Uncomment when
   1211     //     * needed.
   1212     //     */
   1213     //    public final String dump() {
   1214     //        StringBuffer buf = new StringBuffer("[");
   1215     //        for (int i=0; i<len; ++i) {
   1216     //            if (i != 0) buf.append(", ");
   1217     //            int c = list[i];
   1218     //            //if (c <= 0x7F && c != '\n' && c != '\r' && c != '\t' && c != ' ') {
   1219     //            //    buf.append((char) c);
   1220     //            //} else {
   1221     //                buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
   1222     //            //}
   1223     //        }
   1224     //        buf.append("]");
   1225     //        return buf.toString();
   1226     //    }
   1227 
   1228     /**
   1229      * Adds the specified character to this set if it is not already
   1230      * present.  If this set already contains the specified character,
   1231      * the call leaves this set unchanged.
   1232      * @stable ICU 2.0
   1233      */
   1234     public final UnicodeSet add(int c) {
   1235         checkFrozen();
   1236         return add_unchecked(c);
   1237     }
   1238 
    // for internal use only, after checkFrozen has been called
    //
    // Adds the single code point c by editing the inversion list in place.
    // Three cases are handled: c directly precedes the next range (that
    // range is extended downward, possibly merging with the previous one),
    // c directly follows the previous range (that range is extended
    // upward), or c touches no range (a new one-element range is inserted).
    // Throws IllegalArgumentException if c is outside MIN_VALUE..MAX_VALUE.
    // Returns this.
    private final UnicodeSet add_unchecked(int c) {
        if (c < MIN_VALUE || c > MAX_VALUE) {
            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
        }

        // find smallest i such that c < list[i]
        // if odd, then it is IN the set
        // if even, then it is OUT of the set
        int i = findCodePoint(c);

        // already in set?
        if ((i & 1) != 0) return this;

        // HIGH is 0x110000
        // assert(list[len-1] == HIGH);

        // empty = [HIGH]
        // [start_0, limit_0, start_1, limit_1, HIGH]

        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
        //                             ^
        //                             list[i]

        // i == 0 means c is before the first range
        // NOTE(review): an earlier TODO here asked whether "list[i]-1" is a
        // typo that makes the "c == MAX_VALUE" test below unreachable.  It
        // does not look like one: list[i] may be the terminating HIGH entry,
        // and assuming MAX_VALUE == HIGH-1 (0x10FFFF, as the comments in
        // this method indicate) the comparison then holds for
        // c == MAX_VALUE.  Confirm against the constant definitions.
        if (c == list[i]-1) {
            // c is before start of next range
            list[i] = c;
            // if we touched the HIGH mark, then add a new one
            if (c == MAX_VALUE) {
                ensureCapacity(len+1);
                list[len++] = HIGH;
            }
            if (i > 0 && c == list[i-1]) {
                // collapse adjacent ranges

                // [..., start_k-1, c, c, limit_k, ..., HIGH]
                //                     ^
                //                     list[i]
                // Entries i-1 and i are now both c; shifting the tail left
                // by two removes them and joins the two ranges.
                System.arraycopy(list, i+1, list, i-1, len-i-1);
                len -= 2;
            }
        }

        else if (i > 0 && c == list[i-1]) {
            // c is after end of prior range
            list[i-1]++;
            // no need to check for collapse here
        }

        else {
            // At this point we know the new char is not adjacent to
            // any existing ranges, and it is not 10FFFF.


            // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
            //                             ^
            //                             list[i]

            // Don't use ensureCapacity() to save on copying.
            // NOTE: This has no measurable impact on performance,
            // but it might help in some usage patterns.
            if (len+2 > list.length) {
                // Grow and open the two-slot gap at i in a single pass.
                int[] temp = new int[len + 2 + GROW_EXTRA];
                if (i != 0) System.arraycopy(list, 0, temp, 0, i);
                System.arraycopy(list, i, temp, i+2, len-i);
                list = temp;
            } else {
                // Enough room: shift the tail right by two within the array.
                System.arraycopy(list, i, list, i+2, len-i);
            }

            list[i] = c;
            list[i+1] = c+1;
            len += 2;
        }

        // The contents changed, so the stored pattern no longer applies.
        pat = null;
        return this;
    }
   1326 
   1327     /**
   1328      * Adds the specified multicharacter to this set if it is not already
   1329      * present.  If this set already contains the multicharacter,
   1330      * the call leaves this set unchanged.
   1331      * Thus "ch" =&gt; {"ch"}
   1332      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   1333      * @param s the source string
   1334      * @return this object, for chaining
   1335      * @stable ICU 2.0
   1336      */
   1337     public final UnicodeSet add(CharSequence s) {
   1338         checkFrozen();
   1339         int cp = getSingleCP(s);
   1340         if (cp < 0) {
   1341             strings.add(s.toString());
   1342             pat = null;
   1343         } else {
   1344             add_unchecked(cp, cp);
   1345         }
   1346         return this;
   1347     }
   1348 
   1349     /**
   1350      * Utility for getting code point from single code point CharSequence.
   1351      * See the public UTF16.getSingleCodePoint()
   1352      * @return a code point IF the string consists of a single one.
   1353      * otherwise returns -1.
   1354      * @param s to test
   1355      */
   1356     private static int getSingleCP(CharSequence s) {
   1357         if (s.length() < 1) {
   1358             throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
   1359         }
   1360         if (s.length() > 2) return -1;
   1361         if (s.length() == 1) return s.charAt(0);
   1362 
   1363         // at this point, len = 2
   1364         int cp = UTF16.charAt(s, 0);
   1365         if (cp > 0xFFFF) { // is surrogate pair
   1366             return cp;
   1367         }
   1368         return -1;
   1369     }
   1370 
   1371     /**
   1372      * Adds each of the characters in this string to the set. Thus "ch" =&gt; {"c", "h"}
   1373      * If this set already any particular character, it has no effect on that character.
   1374      * @param s the source string
   1375      * @return this object, for chaining
   1376      * @stable ICU 2.0
   1377      */
   1378     public final UnicodeSet addAll(CharSequence s) {
   1379         checkFrozen();
   1380         int cp;
   1381         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
   1382             cp = UTF16.charAt(s, i);
   1383             add_unchecked(cp, cp);
   1384         }
   1385         return this;
   1386     }
   1387 
   1388     /**
   1389      * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1390      * If this set already any particular character, it has no effect on that character.
   1391      * @param s the source string
   1392      * @return this object, for chaining
   1393      * @stable ICU 2.0
   1394      */
   1395     public final UnicodeSet retainAll(CharSequence s) {
   1396         return retainAll(fromAll(s));
   1397     }
   1398 
   1399     /**
   1400      * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1401      * If this set already any particular character, it has no effect on that character.
   1402      * @param s the source string
   1403      * @return this object, for chaining
   1404      * @stable ICU 2.0
   1405      */
   1406     public final UnicodeSet complementAll(CharSequence s) {
   1407         return complementAll(fromAll(s));
   1408     }
   1409 
   1410     /**
   1411      * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1412      * If this set already any particular character, it has no effect on that character.
   1413      * @param s the source string
   1414      * @return this object, for chaining
   1415      * @stable ICU 2.0
   1416      */
   1417     public final UnicodeSet removeAll(CharSequence s) {
   1418         return removeAll(fromAll(s));
   1419     }
   1420 
   1421     /**
   1422      * Remove all strings from this UnicodeSet
   1423      * @return this object, for chaining
   1424      * @stable ICU 4.2
   1425      */
   1426     public final UnicodeSet removeAllStrings() {
   1427         checkFrozen();
   1428         if (strings.size() != 0) {
   1429             strings.clear();
   1430             pat = null;
   1431         }
   1432         return this;
   1433     }
   1434 
   1435     /**
   1436      * Makes a set from a multicharacter string. Thus "ch" =&gt; {"ch"}
   1437      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   1438      * @param s the source string
   1439      * @return a newly created set containing the given string
   1440      * @stable ICU 2.0
   1441      */
   1442     public static UnicodeSet from(CharSequence s) {
   1443         return new UnicodeSet().add(s);
   1444     }
   1445 
   1446 
   1447     /**
   1448      * Makes a set from each of the characters in the string. Thus "ch" =&gt; {"c", "h"}
   1449      * @param s the source string
   1450      * @return a newly created set containing the given characters
   1451      * @stable ICU 2.0
   1452      */
   1453     public static UnicodeSet fromAll(CharSequence s) {
   1454         return new UnicodeSet().addAll(s);
   1455     }
   1456 
   1457 
   1458     /**
   1459      * Retain only the elements in this set that are contained in the
   1460      * specified range.  If <code>end &gt; start</code> then an empty range is
   1461      * retained, leaving the set empty.
   1462      *
   1463      * @param start first character, inclusive, of range to be retained
   1464      * to this set.
   1465      * @param end last character, inclusive, of range to be retained
   1466      * to this set.
   1467      * @stable ICU 2.0
   1468      */
   1469     public UnicodeSet retain(int start, int end) {
   1470         checkFrozen();
   1471         if (start < MIN_VALUE || start > MAX_VALUE) {
   1472             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
   1473         }
   1474         if (end < MIN_VALUE || end > MAX_VALUE) {
   1475             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
   1476         }
   1477         if (start <= end) {
   1478             retain(range(start, end), 2, 0);
   1479         } else {
   1480             clear();
   1481         }
   1482         return this;
   1483     }
   1484 
   1485     /**
   1486      * Retain the specified character from this set if it is present.
   1487      * Upon return this set will be empty if it did not contain c, or
   1488      * will only contain c if it did contain c.
   1489      * @param c the character to be retained
   1490      * @return this object, for chaining
   1491      * @stable ICU 2.0
   1492      */
   1493     public final UnicodeSet retain(int c) {
   1494         return retain(c, c);
   1495     }
   1496 
   1497     /**
   1498      * Retain the specified string in this set if it is present.
   1499      * Upon return this set will be empty if it did not contain s, or
   1500      * will only contain s if it did contain s.
   1501      * @param cs the string to be retained
   1502      * @return this object, for chaining
   1503      * @stable ICU 2.0
   1504      */
   1505     public final UnicodeSet retain(CharSequence cs) {
   1506 
   1507         int cp = getSingleCP(cs);
   1508         if (cp < 0) {
   1509             String s = cs.toString();
   1510             boolean isIn = strings.contains(s);
   1511             if (isIn && size() == 1) {
   1512                 return this;
   1513             }
   1514             clear();
   1515             strings.add(s);
   1516             pat = null;
   1517         } else {
   1518             retain(cp, cp);
   1519         }
   1520         return this;
   1521     }
   1522 
   1523     /**
   1524      * Removes the specified range from this set if it is present.
   1525      * The set will not contain the specified range once the call
   1526      * returns.  If <code>end &gt; start</code> then an empty range is
   1527      * removed, leaving the set unchanged.
   1528      *
   1529      * @param start first character, inclusive, of range to be removed
   1530      * from this set.
   1531      * @param end last character, inclusive, of range to be removed
   1532      * from this set.
   1533      * @stable ICU 2.0
   1534      */
   1535     public UnicodeSet remove(int start, int end) {
   1536         checkFrozen();
   1537         if (start < MIN_VALUE || start > MAX_VALUE) {
   1538             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
   1539         }
   1540         if (end < MIN_VALUE || end > MAX_VALUE) {
   1541             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
   1542         }
   1543         if (start <= end) {
   1544             retain(range(start, end), 2, 2);
   1545         }
   1546         return this;
   1547     }
   1548 
   1549     /**
   1550      * Removes the specified character from this set if it is present.
   1551      * The set will not contain the specified character once the call
   1552      * returns.
   1553      * @param c the character to be removed
   1554      * @return this object, for chaining
   1555      * @stable ICU 2.0
   1556      */
   1557     public final UnicodeSet remove(int c) {
   1558         return remove(c, c);
   1559     }
   1560 
   1561     /**
   1562      * Removes the specified string from this set if it is present.
   1563      * The set will not contain the specified string once the call
   1564      * returns.
   1565      * @param s the string to be removed
   1566      * @return this object, for chaining
   1567      * @stable ICU 2.0
   1568      */
   1569     public final UnicodeSet remove(CharSequence s) {
   1570         int cp = getSingleCP(s);
   1571         if (cp < 0) {
   1572             strings.remove(s.toString());
   1573             pat = null;
   1574         } else {
   1575             remove(cp, cp);
   1576         }
   1577         return this;
   1578     }
   1579 
   1580     /**
   1581      * Complements the specified range in this set.  Any character in
   1582      * the range will be removed if it is in this set, or will be
   1583      * added if it is not in this set.  If <code>end &gt; start</code>
   1584      * then an empty range is complemented, leaving the set unchanged.
   1585      *
   1586      * @param start first character, inclusive, of range to be removed
   1587      * from this set.
   1588      * @param end last character, inclusive, of range to be removed
   1589      * from this set.
   1590      * @stable ICU 2.0
   1591      */
   1592     public UnicodeSet complement(int start, int end) {
   1593         checkFrozen();
   1594         if (start < MIN_VALUE || start > MAX_VALUE) {
   1595             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
   1596         }
   1597         if (end < MIN_VALUE || end > MAX_VALUE) {
   1598             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
   1599         }
   1600         if (start <= end) {
   1601             xor(range(start, end), 2, 0);
   1602         }
   1603         pat = null;
   1604         return this;
   1605     }
   1606 
   1607     /**
   1608      * Complements the specified character in this set.  The character
   1609      * will be removed if it is in this set, or will be added if it is
   1610      * not in this set.
   1611      * @stable ICU 2.0
   1612      */
   1613     public final UnicodeSet complement(int c) {
   1614         return complement(c, c);
   1615     }
   1616 
   1617     /**
   1618      * This is equivalent to
   1619      * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
   1620      * @stable ICU 2.0
   1621      */
   1622     public UnicodeSet complement() {
   1623         checkFrozen();
   1624         if (list[0] == LOW) {
   1625             System.arraycopy(list, 1, list, 0, len-1);
   1626             --len;
   1627         } else {
   1628             ensureCapacity(len+1);
   1629             System.arraycopy(list, 0, list, 1, len);
   1630             list[0] = LOW;
   1631             ++len;
   1632         }
   1633         pat = null;
   1634         return this;
   1635     }
   1636 
   1637     /**
   1638      * Complement the specified string in this set.
   1639      * The set will not contain the specified string once the call
   1640      * returns.
   1641      * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   1642      * @param s the string to complement
   1643      * @return this object, for chaining
   1644      * @stable ICU 2.0
   1645      */
   1646     public final UnicodeSet complement(CharSequence s) {
   1647         checkFrozen();
   1648         int cp = getSingleCP(s);
   1649         if (cp < 0) {
   1650             String s2 = s.toString();
   1651             if (strings.contains(s2)) {
   1652                 strings.remove(s2);
   1653             } else {
   1654                 strings.add(s2);
   1655             }
   1656             pat = null;
   1657         } else {
   1658             complement(cp, cp);
   1659         }
   1660         return this;
   1661     }
   1662 
   1663     /**
   1664      * Returns true if this set contains the given character.
   1665      * @param c character to be checked for containment
   1666      * @return true if the test condition is met
   1667      * @stable ICU 2.0
   1668      */
   1669     @Override
   1670     public boolean contains(int c) {
   1671         if (c < MIN_VALUE || c > MAX_VALUE) {
   1672             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
   1673         }
   1674         if (bmpSet != null) {
   1675             return bmpSet.contains(c);
   1676         }
   1677         if (stringSpan != null) {
   1678             return stringSpan.contains(c);
   1679         }
   1680 
   1681         /*
   1682         // Set i to the index of the start item greater than ch
   1683         // We know we will terminate without length test!
   1684         int i = -1;
   1685         while (true) {
   1686             if (c < list[++i]) break;
   1687         }
   1688          */
   1689 
   1690         int i = findCodePoint(c);
   1691 
   1692         return ((i & 1) != 0); // return true if odd
   1693     }
   1694 
   1695     /**
   1696      * Returns the smallest value i such that c < list[i].  Caller
   1697      * must ensure that c is a legal value or this method will enter
   1698      * an infinite loop.  This method performs a binary search.
   1699      * @param c a character in the range MIN_VALUE..MAX_VALUE
   1700      * inclusive
   1701      * @return the smallest integer i in the range 0..len-1,
   1702      * inclusive, such that c < list[i]
   1703      */
   1704     private final int findCodePoint(int c) {
   1705         /* Examples:
   1706                                            findCodePoint(c)
   1707            set              list[]         c=0 1 3 4 7 8
   1708            ===              ==============   ===========
   1709            []               [110000]         0 0 0 0 0 0
   1710            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
   1711            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
   1712            [:all:]          [0, 110000]      1 1 1 1 1 1
   1713          */
   1714 
   1715         // Return the smallest i such that c < list[i].  Assume
   1716         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
   1717         if (c < list[0]) return 0;
   1718         // High runner test.  c is often after the last range, so an
   1719         // initial check for this condition pays off.
   1720         if (len >= 2 && c >= list[len-2]) return len-1;
   1721         int lo = 0;
   1722         int hi = len - 1;
   1723         // invariant: c >= list[lo]
   1724         // invariant: c < list[hi]
   1725         for (;;) {
   1726             int i = (lo + hi) >>> 1;
   1727         if (i == lo) return hi;
   1728         if (c < list[i]) {
   1729             hi = i;
   1730         } else {
   1731             lo = i;
   1732         }
   1733         }
   1734     }
   1735 
   1736     //    //----------------------------------------------------------------
   1737     //    // Unrolled binary search
   1738     //    //----------------------------------------------------------------
   1739     //
   1740     //    private int validLen = -1; // validated value of len
   1741     //    private int topOfLow;
   1742     //    private int topOfHigh;
   1743     //    private int power;
   1744     //    private int deltaStart;
   1745     //
   1746     //    private void validate() {
   1747     //        if (len <= 1) {
   1748     //            throw new IllegalArgumentException("list.len==" + len + "; must be >1");
   1749     //        }
   1750     //
   1751     //        // find greatest power of 2 less than or equal to len
   1752     //        for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
   1753     //
   1754     //        // assert(exp2[power] <= len);
   1755     //
   1756     //        // determine the starting points
   1757     //        topOfLow = exp2[power] - 1;
   1758     //        topOfHigh = len - 1;
   1759     //        deltaStart = exp2[power-1];
   1760     //        validLen = len;
   1761     //    }
   1762     //
   1763     //    private static final int exp2[] = {
   1764     //        0x1, 0x2, 0x4, 0x8,
   1765     //        0x10, 0x20, 0x40, 0x80,
   1766     //        0x100, 0x200, 0x400, 0x800,
   1767     //        0x1000, 0x2000, 0x4000, 0x8000,
   1768     //        0x10000, 0x20000, 0x40000, 0x80000,
   1769     //        0x100000, 0x200000, 0x400000, 0x800000,
   1770     //        0x1000000, 0x2000000, 0x4000000, 0x8000000,
   1771     //        0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
   1772     //    };
   1773     //
   1774     //    /**
   1775     //     * Unrolled lowest index GT.
   1776     //     */
   1777     //    private final int leastIndexGT(int searchValue) {
   1778     //
   1779     //        if (len != validLen) {
   1780     //            if (len == 1) return 0;
   1781     //            validate();
   1782     //        }
   1783     //        int temp;
   1784     //
   1785     //        // set up initial range to search. Each subrange is a power of two in length
   1786     //        int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
   1787     //
   1788     //        // Completely unrolled binary search, following "Programming Pearls"
   1789     //        // Each case deliberately falls through to the next
   1790     //        // Logically, list[-1] < all_search_values && list[count] > all_search_values
   1791     //        // although the values -1 and count are never actually touched.
   1792     //
   1793     //        // The bounds at each point are low & high,
   1794     //        // where low == high - delta*2
   1795     //        // so high - delta is the midpoint
   1796     //
   1797     //        // The invariant AFTER each line is that list[low] < searchValue <= list[high]
   1798     //
   1799     //        switch (power) {
   1800     //        //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
   1801     //        case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
   1802     //        case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
   1803     //
   1804     //        case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
   1805     //        case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
   1806     //        case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
   1807     //        case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
   1808     //
   1809     //        case 24: if (searchValue < list[temp = high-  0x800000]) high = temp;
   1810     //        case 23: if (searchValue < list[temp = high-  0x400000]) high = temp;
   1811     //        case 22: if (searchValue < list[temp = high-  0x200000]) high = temp;
   1812     //        case 21: if (searchValue < list[temp = high-  0x100000]) high = temp;
   1813     //
   1814     //        case 20: if (searchValue < list[temp = high-   0x80000]) high = temp;
   1815     //        case 19: if (searchValue < list[temp = high-   0x40000]) high = temp;
   1816     //        case 18: if (searchValue < list[temp = high-   0x20000]) high = temp;
   1817     //        case 17: if (searchValue < list[temp = high-   0x10000]) high = temp;
   1818     //
   1819     //        case 16: if (searchValue < list[temp = high-    0x8000]) high = temp;
   1820     //        case 15: if (searchValue < list[temp = high-    0x4000]) high = temp;
   1821     //        case 14: if (searchValue < list[temp = high-    0x2000]) high = temp;
   1822     //        case 13: if (searchValue < list[temp = high-    0x1000]) high = temp;
   1823     //
   1824     //        case 12: if (searchValue < list[temp = high-     0x800]) high = temp;
   1825     //        case 11: if (searchValue < list[temp = high-     0x400]) high = temp;
   1826     //        case 10: if (searchValue < list[temp = high-     0x200]) high = temp;
   1827     //        case  9: if (searchValue < list[temp = high-     0x100]) high = temp;
   1828     //
   1829     //        case  8: if (searchValue < list[temp = high-      0x80]) high = temp;
   1830     //        case  7: if (searchValue < list[temp = high-      0x40]) high = temp;
   1831     //        case  6: if (searchValue < list[temp = high-      0x20]) high = temp;
   1832     //        case  5: if (searchValue < list[temp = high-      0x10]) high = temp;
   1833     //
   1834     //        case  4: if (searchValue < list[temp = high-       0x8]) high = temp;
   1835     //        case  3: if (searchValue < list[temp = high-       0x4]) high = temp;
   1836     //        case  2: if (searchValue < list[temp = high-       0x2]) high = temp;
   1837     //        case  1: if (searchValue < list[temp = high-       0x1]) high = temp;
   1838     //        }
   1839     //
   1840     //        return high;
   1841     //    }
   1842     //
   1843     //    // For debugging only
   1844     //    public int len() {
   1845     //        return len;
   1846     //    }
   1847     //
   1848     //    //----------------------------------------------------------------
   1849     //    //----------------------------------------------------------------
   1850 
   1851     /**
   1852      * Returns true if this set contains every character
   1853      * of the given range.
   1854      * @param start first character, inclusive, of the range
   1855      * @param end last character, inclusive, of the range
   1856      * @return true if the test condition is met
   1857      * @stable ICU 2.0
   1858      */
   1859     public boolean contains(int start, int end) {
   1860         if (start < MIN_VALUE || start > MAX_VALUE) {
   1861             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
   1862         }
   1863         if (end < MIN_VALUE || end > MAX_VALUE) {
   1864             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
   1865         }
   1866         //int i = -1;
   1867         //while (true) {
   1868         //    if (start < list[++i]) break;
   1869         //}
   1870         int i = findCodePoint(start);
   1871         return ((i & 1) != 0 && end < list[i]);
   1872     }
   1873 
   1874     /**
   1875      * Returns <tt>true</tt> if this set contains the given
   1876      * multicharacter string.
   1877      * @param s string to be checked for containment
   1878      * @return <tt>true</tt> if this set contains the specified string
   1879      * @stable ICU 2.0
   1880      */
   1881     public final boolean contains(CharSequence s) {
   1882 
   1883         int cp = getSingleCP(s);
   1884         if (cp < 0) {
   1885             return strings.contains(s.toString());
   1886         } else {
   1887             return contains(cp);
   1888         }
   1889     }
   1890 
   1891     /**
   1892      * Returns true if this set contains all the characters and strings
   1893      * of the given set.
   1894      * @param b set to be checked for containment
   1895      * @return true if the test condition is met
   1896      * @stable ICU 2.0
   1897      */
   1898     public boolean containsAll(UnicodeSet b) {
   1899         // The specified set is a subset if all of its pairs are contained in
   1900         // this set. This implementation accesses the lists directly for speed.
   1901         // TODO: this could be faster if size() were cached. But that would affect building speed
   1902         // so it needs investigation.
   1903         int[] listB = b.list;
   1904         boolean needA = true;
   1905         boolean needB = true;
   1906         int aPtr = 0;
   1907         int bPtr = 0;
   1908         int aLen = len - 1;
   1909         int bLen = b.len - 1;
   1910         int startA = 0, startB = 0, limitA = 0, limitB = 0;
   1911         while (true) {
   1912             // double iterations are such a pain...
   1913             if (needA) {
   1914                 if (aPtr >= aLen) {
   1915                     // ran out of A. If B is also exhausted, then break;
   1916                     if (needB && bPtr >= bLen) {
   1917                         break;
   1918                     }
   1919                     return false;
   1920                 }
   1921                 startA = list[aPtr++];
   1922                 limitA = list[aPtr++];
   1923             }
   1924             if (needB) {
   1925                 if (bPtr >= bLen) {
   1926                     // ran out of B. Since we got this far, we have an A and we are ok so far
   1927                     break;
   1928                 }
   1929                 startB = listB[bPtr++];
   1930                 limitB = listB[bPtr++];
   1931             }
   1932             // if B doesn't overlap and is greater than A, get new A
   1933             if (startB >= limitA) {
   1934                 needA = true;
   1935                 needB = false;
   1936                 continue;
   1937             }
   1938             // if B is wholy contained in A, then get a new B
   1939             if (startB >= startA && limitB <= limitA) {
   1940                 needA = false;
   1941                 needB = true;
   1942                 continue;
   1943             }
   1944             // all other combinations mean we fail
   1945             return false;
   1946         }
   1947 
   1948         if (!strings.containsAll(b.strings)) return false;
   1949         return true;
   1950     }
   1951 
   1952     //    /**
   1953     //     * Returns true if this set contains all the characters and strings
   1954     //     * of the given set.
   1955     //     * @param c set to be checked for containment
   1956     //     * @return true if the test condition is met
   1957     //     * @stable ICU 2.0
   1958     //     */
   1959     //    public boolean containsAllOld(UnicodeSet c) {
   1960     //        // The specified set is a subset if all of its pairs are contained in
   1961     //        // this set.  It's possible to code this more efficiently in terms of
   1962     //        // direct manipulation of the inversion lists if the need arises.
   1963     //        int n = c.getRangeCount();
   1964     //        for (int i=0; i<n; ++i) {
   1965     //            if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
   1966     //                return false;
   1967     //            }
   1968     //        }
   1969     //        if (!strings.containsAll(c.strings)) return false;
   1970     //        return true;
   1971     //    }
   1972 
   1973     /**
   1974      * Returns true if there is a partition of the string such that this set contains each of the partitioned strings.
   1975      * For example, for the Unicode set [a{bc}{cd}]<br>
   1976      * containsAll is true for each of: "a", "bc", ""cdbca"<br>
   1977      * containsAll is false for each of: "acb", "bcda", "bcx"<br>
   1978      * @param s string containing characters to be checked for containment
   1979      * @return true if the test condition is met
   1980      * @stable ICU 2.0
   1981      */
   1982     public boolean containsAll(String s) {
   1983         int cp;
   1984         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
   1985             cp = UTF16.charAt(s, i);
   1986             if (!contains(cp))  {
   1987                 if (strings.size() == 0) {
   1988                     return false;
   1989                 }
   1990                 return containsAll(s, 0);
   1991             }
   1992         }
   1993         return true;
   1994     }
   1995 
   1996     /**
   1997      * Recursive routine called if we fail to find a match in containsAll, and there are strings
   1998      * @param s source string
   1999      * @param i point to match to the end on
   2000      * @return true if ok
   2001      */
   2002     private boolean containsAll(String s, int i) {
   2003         if (i >= s.length()) {
   2004             return true;
   2005         }
   2006         int  cp= UTF16.charAt(s, i);
   2007         if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) {
   2008             return true;
   2009         }
   2010         for (String setStr : strings) {
   2011             if (s.startsWith(setStr, i) &&  containsAll(s, i+setStr.length())) {
   2012                 return true;
   2013             }
   2014         }
   2015         return false;
   2016 
   2017     }
   2018 
   2019     /**
   2020      * Get the Regex equivalent for this UnicodeSet
   2021      * @return regex pattern equivalent to this UnicodeSet
   2022      * @internal
   2023      * @deprecated This API is ICU internal only.
   2024      */
   2025     @Deprecated
   2026     public String getRegexEquivalent() {
   2027         if (strings.size() == 0) {
   2028             return toString();
   2029         }
   2030         StringBuilder result = new StringBuilder("(?:");
   2031         appendNewPattern(result, true, false);
   2032         for (String s : strings) {
   2033             result.append('|');
   2034             _appendToPat(result, s, true);
   2035         }
   2036         return result.append(")").toString();
   2037     }
   2038 
   2039     /**
   2040      * Returns true if this set contains none of the characters
   2041      * of the given range.
   2042      * @param start first character, inclusive, of the range
   2043      * @param end last character, inclusive, of the range
   2044      * @return true if the test condition is met
   2045      * @stable ICU 2.0
   2046      */
   2047     public boolean containsNone(int start, int end) {
   2048         if (start < MIN_VALUE || start > MAX_VALUE) {
   2049             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
   2050         }
   2051         if (end < MIN_VALUE || end > MAX_VALUE) {
   2052             throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
   2053         }
   2054         int i = -1;
   2055         while (true) {
   2056             if (start < list[++i]) break;
   2057         }
   2058         return ((i & 1) == 0 && end < list[i]);
   2059     }
   2060 
   2061     /**
   2062      * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
   2063      * For example, for the Unicode set [a{bc}{cd}]<br>
   2064      * containsNone is true for: "xy", "cb"<br>
   2065      * containsNone is false for: "a", "bc", "bcd"<br>
   2066      * @param b set to be checked for containment
   2067      * @return true if the test condition is met
   2068      * @stable ICU 2.0
   2069      */
   2070     public boolean containsNone(UnicodeSet b) {
   2071         // The specified set is a subset if some of its pairs overlap with some of this set's pairs.
   2072         // This implementation accesses the lists directly for speed.
   2073         int[] listB = b.list;
   2074         boolean needA = true;
   2075         boolean needB = true;
   2076         int aPtr = 0;
   2077         int bPtr = 0;
   2078         int aLen = len - 1;
   2079         int bLen = b.len - 1;
   2080         int startA = 0, startB = 0, limitA = 0, limitB = 0;
   2081         while (true) {
   2082             // double iterations are such a pain...
   2083             if (needA) {
   2084                 if (aPtr >= aLen) {
   2085                     // ran out of A: break so we test strings
   2086                     break;
   2087                 }
   2088                 startA = list[aPtr++];
   2089                 limitA = list[aPtr++];
   2090             }
   2091             if (needB) {
   2092                 if (bPtr >= bLen) {
   2093                     // ran out of B: break so we test strings
   2094                     break;
   2095                 }
   2096                 startB = listB[bPtr++];
   2097                 limitB = listB[bPtr++];
   2098             }
   2099             // if B is higher than any part of A, get new A
   2100             if (startB >= limitA) {
   2101                 needA = true;
   2102                 needB = false;
   2103                 continue;
   2104             }
   2105             // if A is higher than any part of B, get new B
   2106             if (startA >= limitB) {
   2107                 needA = false;
   2108                 needB = true;
   2109                 continue;
   2110             }
   2111             // all other combinations mean we fail
   2112             return false;
   2113         }
   2114 
   2115         if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings)) return false;
   2116         return true;
   2117     }
   2118 
   2119     //    /**
   2120     //     * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
   2121     //     * For example, for the Unicode set [a{bc}{cd}]<br>
   2122     //     * containsNone is true for: "xy", "cb"<br>
   2123     //     * containsNone is false for: "a", "bc", "bcd"<br>
   2124     //     * @param c set to be checked for containment
   2125     //     * @return true if the test condition is met
   2126     //     * @stable ICU 2.0
   2127     //     */
   2128     //    public boolean containsNoneOld(UnicodeSet c) {
   2129     //        // The specified set is a subset if all of its pairs are contained in
   2130     //        // this set.  It's possible to code this more efficiently in terms of
   2131     //        // direct manipulation of the inversion lists if the need arises.
   2132     //        int n = c.getRangeCount();
   2133     //        for (int i=0; i<n; ++i) {
   2134     //            if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
   2135     //                return false;
   2136     //            }
   2137     //        }
   2138     //        if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false;
   2139     //        return true;
   2140     //    }
   2141 
   2142     /**
   2143      * Returns true if this set contains none of the characters
   2144      * of the given string.
   2145      * @param s string containing characters to be checked for containment
   2146      * @return true if the test condition is met
   2147      * @stable ICU 2.0
   2148      */
   2149     public boolean containsNone(CharSequence s) {
   2150         return span(s, SpanCondition.NOT_CONTAINED) == s.length();
   2151     }
   2152 
   2153     /**
   2154      * Returns true if this set contains one or more of the characters
   2155      * in the given range.
   2156      * @param start first character, inclusive, of the range
   2157      * @param end last character, inclusive, of the range
   2158      * @return true if the condition is met
   2159      * @stable ICU 2.0
   2160      */
   2161     public final boolean containsSome(int start, int end) {
   2162         return !containsNone(start, end);
   2163     }
   2164 
   2165     /**
   2166      * Returns true if this set contains one or more of the characters
   2167      * and strings of the given set.
   2168      * @param s set to be checked for containment
   2169      * @return true if the condition is met
   2170      * @stable ICU 2.0
   2171      */
   2172     public final boolean containsSome(UnicodeSet s) {
   2173         return !containsNone(s);
   2174     }
   2175 
   2176     /**
   2177      * Returns true if this set contains one or more of the characters
   2178      * of the given string.
   2179      * @param s string containing characters to be checked for containment
   2180      * @return true if the condition is met
   2181      * @stable ICU 2.0
   2182      */
   2183     public final boolean containsSome(CharSequence s) {
   2184         return !containsNone(s);
   2185     }
   2186 
   2187 
   2188     /**
   2189      * Adds all of the elements in the specified set to this set if
   2190      * they're not already present.  This operation effectively
   2191      * modifies this set so that its value is the <i>union</i> of the two
   2192      * sets.  The behavior of this operation is unspecified if the specified
   2193      * collection is modified while the operation is in progress.
   2194      *
   2195      * @param c set whose elements are to be added to this set.
   2196      * @stable ICU 2.0
   2197      */
   2198     public UnicodeSet addAll(UnicodeSet c) {
   2199         checkFrozen();
   2200         add(c.list, c.len, 0);
   2201         strings.addAll(c.strings);
   2202         return this;
   2203     }
   2204 
   2205     /**
   2206      * Retains only the elements in this set that are contained in the
   2207      * specified set.  In other words, removes from this set all of
   2208      * its elements that are not contained in the specified set.  This
   2209      * operation effectively modifies this set so that its value is
   2210      * the <i>intersection</i> of the two sets.
   2211      *
   2212      * @param c set that defines which elements this set will retain.
   2213      * @stable ICU 2.0
   2214      */
   2215     public UnicodeSet retainAll(UnicodeSet c) {
   2216         checkFrozen();
   2217         retain(c.list, c.len, 0);
   2218         strings.retainAll(c.strings);
   2219         return this;
   2220     }
   2221 
   2222     /**
   2223      * Removes from this set all of its elements that are contained in the
   2224      * specified set.  This operation effectively modifies this
   2225      * set so that its value is the <i>asymmetric set difference</i> of
   2226      * the two sets.
   2227      *
   2228      * @param c set that defines which elements will be removed from
   2229      *          this set.
   2230      * @stable ICU 2.0
   2231      */
   2232     public UnicodeSet removeAll(UnicodeSet c) {
   2233         checkFrozen();
   2234         retain(c.list, c.len, 2);
   2235         strings.removeAll(c.strings);
   2236         return this;
   2237     }
   2238 
   2239     /**
   2240      * Complements in this set all elements contained in the specified
   2241      * set.  Any character in the other set will be removed if it is
   2242      * in this set, or will be added if it is not in this set.
   2243      *
   2244      * @param c set that defines which elements will be complemented from
   2245      *          this set.
   2246      * @stable ICU 2.0
   2247      */
   2248     public UnicodeSet complementAll(UnicodeSet c) {
   2249         checkFrozen();
   2250         xor(c.list, c.len, 0);
   2251         SortedSetRelation.doOperation(strings, SortedSetRelation.COMPLEMENTALL, c.strings);
   2252         return this;
   2253     }
   2254 
   2255     /**
   2256      * Removes all of the elements from this set.  This set will be
   2257      * empty after this call returns.
   2258      * @stable ICU 2.0
   2259      */
   2260     public UnicodeSet clear() {
   2261         checkFrozen();
   2262         list[0] = HIGH;
   2263         len = 1;
   2264         pat = null;
   2265         strings.clear();
   2266         return this;
   2267     }
   2268 
   2269     /**
   2270      * Iteration method that returns the number of ranges contained in
   2271      * this set.
   2272      * @see #getRangeStart
   2273      * @see #getRangeEnd
   2274      * @stable ICU 2.0
   2275      */
   2276     public int getRangeCount() {
   2277         return len/2;
   2278     }
   2279 
   2280     /**
   2281      * Iteration method that returns the first character in the
   2282      * specified range of this set.
   2283      * @exception ArrayIndexOutOfBoundsException if index is outside
   2284      * the range <code>0..getRangeCount()-1</code>
   2285      * @see #getRangeCount
   2286      * @see #getRangeEnd
   2287      * @stable ICU 2.0
   2288      */
   2289     public int getRangeStart(int index) {
   2290         return list[index*2];
   2291     }
   2292 
   2293     /**
   2294      * Iteration method that returns the last character in the
   2295      * specified range of this set.
   2296      * @exception ArrayIndexOutOfBoundsException if index is outside
   2297      * the range <code>0..getRangeCount()-1</code>
   2298      * @see #getRangeStart
   2299      * @see #getRangeEnd
   2300      * @stable ICU 2.0
   2301      */
   2302     public int getRangeEnd(int index) {
   2303         return (list[index*2 + 1] - 1);
   2304     }
   2305 
   2306     /**
   2307      * Reallocate this objects internal structures to take up the least
   2308      * possible space, without changing this object's value.
   2309      * @stable ICU 2.0
   2310      */
   2311     public UnicodeSet compact() {
   2312         checkFrozen();
   2313         if (len != list.length) {
   2314             int[] temp = new int[len];
   2315             System.arraycopy(list, 0, temp, 0, len);
   2316             list = temp;
   2317         }
   2318         rangeList = null;
   2319         buffer = null;
   2320         return this;
   2321     }
   2322 
   2323     /**
   2324      * Compares the specified object with this set for equality.  Returns
   2325      * <tt>true</tt> if the specified object is also a set, the two sets
   2326      * have the same size, and every member of the specified set is
   2327      * contained in this set (or equivalently, every member of this set is
   2328      * contained in the specified set).
   2329      *
   2330      * @param o Object to be compared for equality with this set.
   2331      * @return <tt>true</tt> if the specified Object is equal to this set.
   2332      * @stable ICU 2.0
   2333      */
   2334     @Override
   2335     public boolean equals(Object o) {
   2336         if (o == null) {
   2337             return false;
   2338         }
   2339         if (this == o) {
   2340             return true;
   2341         }
   2342         try {
   2343             UnicodeSet that = (UnicodeSet) o;
   2344             if (len != that.len) return false;
   2345             for (int i = 0; i < len; ++i) {
   2346                 if (list[i] != that.list[i]) return false;
   2347             }
   2348             if (!strings.equals(that.strings)) return false;
   2349         } catch (Exception e) {
   2350             return false;
   2351         }
   2352         return true;
   2353     }
   2354 
   2355     /**
   2356      * Returns the hash code value for this set.
   2357      *
   2358      * @return the hash code value for this set.
   2359      * @see java.lang.Object#hashCode()
   2360      * @stable ICU 2.0
   2361      */
   2362     @Override
   2363     public int hashCode() {
   2364         int result = len;
   2365         for (int i = 0; i < len; ++i) {
   2366             result *= 1000003;
   2367             result += list[i];
   2368         }
   2369         return result;
   2370     }
   2371 
   2372     /**
   2373      * Return a programmer-readable string representation of this object.
   2374      * @stable ICU 2.0
   2375      */
   2376     @Override
   2377     public String toString() {
   2378         return toPattern(true);
   2379     }
   2380 
   2381     //----------------------------------------------------------------
   2382     // Implementation: Pattern parsing
   2383     //----------------------------------------------------------------
   2384 
   2385     /**
   2386      * Parses the given pattern, starting at the given position.  The character
   2387      * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
   2388      * Parsing continues until the corresponding closing ']'.  If a syntax error
   2389      * is encountered between the opening and closing brace, the parse fails.
   2390      * Upon return from a successful parse, the ParsePosition is updated to
   2391      * point to the character following the closing ']', and an inversion
   2392      * list for the parsed pattern is returned.  This method
   2393      * calls itself recursively to parse embedded subpatterns.
   2394      *
   2395      * @param pattern the string containing the pattern to be parsed.  The
   2396      * portion of the string from pos.getIndex(), which must be a '[', to the
   2397      * corresponding closing ']', is parsed.
   2398      * @param pos upon entry, the position at which to being parsing.  The
   2399      * character at pattern.charAt(pos.getIndex()) must be a '['.  Upon return
   2400      * from a successful parse, pos.getIndex() is either the character after the
   2401      * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
   2402      * is the last character of the pattern string.
   2403      * @return an inversion list for the parsed substring
   2404      * of <code>pattern</code>
   2405      * @exception java.lang.IllegalArgumentException if the parse fails.
   2406      * @internal
   2407      * @deprecated This API is ICU internal only.
   2408      */
   2409     @Deprecated
   2410     public UnicodeSet applyPattern(String pattern,
   2411             ParsePosition pos,
   2412             SymbolTable symbols,
   2413             int options) {
   2414 
   2415         // Need to build the pattern in a temporary string because
   2416         // _applyPattern calls add() etc., which set pat to empty.
   2417         boolean parsePositionWasNull = pos == null;
   2418         if (parsePositionWasNull) {
   2419             pos = new ParsePosition(0);
   2420         }
   2421 
   2422         StringBuilder rebuiltPat = new StringBuilder();
   2423         RuleCharacterIterator chars =
   2424                 new RuleCharacterIterator(pattern, symbols, pos);
   2425         applyPattern(chars, symbols, rebuiltPat, options);
   2426         if (chars.inVariable()) {
   2427             syntaxError(chars, "Extra chars in variable value");
   2428         }
   2429         pat = rebuiltPat.toString();
   2430         if (parsePositionWasNull) {
   2431             int i = pos.getIndex();
   2432 
   2433             // Skip over trailing whitespace
   2434             if ((options & IGNORE_SPACE) != 0) {
   2435                 i = PatternProps.skipWhiteSpace(pattern, i);
   2436             }
   2437 
   2438             if (i != pattern.length()) {
   2439                 throw new IllegalArgumentException("Parse of \"" + pattern +
   2440                         "\" failed at " + i);
   2441             }
   2442         }
   2443         return this;
   2444     }
   2445 
   2446     // Add constants to make the applyPattern() code easier to follow.
   2447 
   2448     private static final int LAST0_START = 0,
   2449             LAST1_RANGE = 1,
   2450             LAST2_SET = 2;
   2451 
   2452     private static final int MODE0_NONE = 0,
   2453             MODE1_INBRACKET = 1,
   2454             MODE2_OUTBRACKET = 2;
   2455 
   2456     private static final int SETMODE0_NONE = 0,
   2457             SETMODE1_UNICODESET = 1,
   2458             SETMODE2_PROPERTYPAT = 2,
   2459             SETMODE3_PREPARSED = 3;
   2460 
   2461     /**
   2462      * Parse the pattern from the given RuleCharacterIterator.  The
   2463      * iterator is advanced over the parsed pattern.
   2464      * @param chars iterator over the pattern characters.  Upon return
   2465      * it will be advanced to the first character after the parsed
   2466      * pattern, or the end of the iteration if all characters are
   2467      * parsed.
   2468      * @param symbols symbol table to use to parse and dereference
   2469      * variables, or null if none.
   2470      * @param rebuiltPat the pattern that was parsed, rebuilt or
   2471      * copied from the input pattern, as appropriate.
   2472      * @param options a bit mask of zero or more of the following:
   2473      * IGNORE_SPACE, CASE.
   2474      */
   2475     private void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
   2476             Appendable rebuiltPat, int options) {
   2477 
   2478         // Syntax characters: [ ] ^ - & { }
   2479 
   2480         // Recognized special forms for chars, sets: c-c s-s s&s
   2481 
   2482         int opts = RuleCharacterIterator.PARSE_VARIABLES |
   2483                 RuleCharacterIterator.PARSE_ESCAPES;
   2484         if ((options & IGNORE_SPACE) != 0) {
   2485             opts |= RuleCharacterIterator.SKIP_WHITESPACE;
   2486         }
   2487 
   2488         StringBuilder patBuf = new StringBuilder(), buf = null;
   2489         boolean usePat = false;
   2490         UnicodeSet scratch = null;
   2491         Object backup = null;
   2492 
   2493         // mode: 0=before [, 1=between [...], 2=after ]
   2494         // lastItem: 0=none, 1=char, 2=set
   2495         int lastItem = LAST0_START, lastChar = 0, mode = MODE0_NONE;
   2496         char op = 0;
   2497 
   2498         boolean invert = false;
   2499 
   2500         clear();
   2501         String lastString = null;
   2502 
   2503         while (mode != MODE2_OUTBRACKET && !chars.atEnd()) {
   2504             //Eclipse stated the following is "dead code"
   2505             /*
   2506             if (false) {
   2507                 // Debugging assertion
   2508                 if (!((lastItem == 0 && op == 0) ||
   2509                         (lastItem == 1 && (op == 0 || op == '-')) ||
   2510                         (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
   2511                     throw new IllegalArgumentException();
   2512                 }
   2513             }*/
   2514 
   2515             int c = 0;
   2516             boolean literal = false;
   2517             UnicodeSet nested = null;
   2518 
   2519             // -------- Check for property pattern
   2520 
   2521             // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
   2522             int setMode = SETMODE0_NONE;
   2523             if (resemblesPropertyPattern(chars, opts)) {
   2524                 setMode = SETMODE2_PROPERTYPAT;
   2525             }
   2526 
   2527             // -------- Parse '[' of opening delimiter OR nested set.
   2528             // If there is a nested set, use `setMode' to define how
   2529             // the set should be parsed.  If the '[' is part of the
   2530             // opening delimiter for this pattern, parse special
   2531             // strings "[", "[^", "[-", and "[^-".  Check for stand-in
   2532             // characters representing a nested set in the symbol
   2533             // table.
   2534 
   2535             else {
   2536                 // Prepare to backup if necessary
   2537                 backup = chars.getPos(backup);
   2538                 c = chars.next(opts);
   2539                 literal = chars.isEscaped();
   2540 
   2541                 if (c == '[' && !literal) {
   2542                     if (mode == MODE1_INBRACKET) {
   2543                         chars.setPos(backup); // backup
   2544                         setMode = SETMODE1_UNICODESET;
   2545                     } else {
   2546                         // Handle opening '[' delimiter
   2547                         mode = MODE1_INBRACKET;
   2548                         patBuf.append('[');
   2549                         backup = chars.getPos(backup); // prepare to backup
   2550                         c = chars.next(opts);
   2551                         literal = chars.isEscaped();
   2552                         if (c == '^' && !literal) {
   2553                             invert = true;
   2554                             patBuf.append('^');
   2555                             backup = chars.getPos(backup); // prepare to backup
   2556                             c = chars.next(opts);
   2557                             literal = chars.isEscaped();
   2558                         }
   2559                         // Fall through to handle special leading '-';
   2560                         // otherwise restart loop for nested [], \p{}, etc.
   2561                         if (c == '-') {
   2562                             literal = true;
   2563                             // Fall through to handle literal '-' below
   2564                         } else {
   2565                             chars.setPos(backup); // backup
   2566                             continue;
   2567                         }
   2568                     }
   2569                 } else if (symbols != null) {
   2570                     UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
   2571                     if (m != null) {
   2572                         try {
   2573                             nested = (UnicodeSet) m;
   2574                             setMode = SETMODE3_PREPARSED;
   2575                         } catch (ClassCastException e) {
   2576                             syntaxError(chars, "Syntax error");
   2577                         }
   2578                     }
   2579                 }
   2580             }
   2581 
   2582             // -------- Handle a nested set.  This either is inline in
   2583             // the pattern or represented by a stand-in that has
   2584             // previously been parsed and was looked up in the symbol
   2585             // table.
   2586 
   2587             if (setMode != SETMODE0_NONE) {
   2588                 if (lastItem == LAST1_RANGE) {
   2589                     if (op != 0) {
   2590                         syntaxError(chars, "Char expected after operator");
   2591                     }
   2592                     add_unchecked(lastChar, lastChar);
   2593                     _appendToPat(patBuf, lastChar, false);
   2594                     lastItem = LAST0_START;
   2595                     op = 0;
   2596                 }
   2597 
   2598                 if (op == '-' || op == '&') {
   2599                     patBuf.append(op);
   2600                 }
   2601 
   2602                 if (nested == null) {
   2603                     if (scratch == null) scratch = new UnicodeSet();
   2604                     nested = scratch;
   2605                 }
   2606                 switch (setMode) {
   2607                 case SETMODE1_UNICODESET:
   2608                     nested.applyPattern(chars, symbols, patBuf, options);
   2609                     break;
   2610                 case SETMODE2_PROPERTYPAT:
   2611                     chars.skipIgnored(opts);
   2612                     nested.applyPropertyPattern(chars, patBuf, symbols);
   2613                     break;
   2614                 case SETMODE3_PREPARSED: // `nested' already parsed
   2615                     nested._toPattern(patBuf, false);
   2616                     break;
   2617                 }
   2618 
   2619                 usePat = true;
   2620 
   2621                 if (mode == MODE0_NONE) {
   2622                     // Entire pattern is a category; leave parse loop
   2623                     set(nested);
   2624                     mode = MODE2_OUTBRACKET;
   2625                     break;
   2626                 }
   2627 
   2628                 switch (op) {
   2629                 case '-':
   2630                     removeAll(nested);
   2631                     break;
   2632                 case '&':
   2633                     retainAll(nested);
   2634                     break;
   2635                 case 0:
   2636                     addAll(nested);
   2637                     break;
   2638                 }
   2639 
   2640                 op = 0;
   2641                 lastItem = LAST2_SET;
   2642 
   2643                 continue;
   2644             }
   2645 
   2646             if (mode == MODE0_NONE) {
   2647                 syntaxError(chars, "Missing '['");
   2648             }
   2649 
   2650             // -------- Parse special (syntax) characters.  If the
   2651             // current character is not special, or if it is escaped,
   2652             // then fall through and handle it below.
   2653 
   2654             if (!literal) {
   2655                 switch (c) {
   2656                 case ']':
   2657                     if (lastItem == LAST1_RANGE) {
   2658                         add_unchecked(lastChar, lastChar);
   2659                         _appendToPat(patBuf, lastChar, false);
   2660                     }
   2661                     // Treat final trailing '-' as a literal
   2662                     if (op == '-') {
   2663                         add_unchecked(op, op);
   2664                         patBuf.append(op);
   2665                     } else if (op == '&') {
   2666                         syntaxError(chars, "Trailing '&'");
   2667                     }
   2668                     patBuf.append(']');
   2669                     mode = MODE2_OUTBRACKET;
   2670                     continue;
   2671                 case '-':
   2672                     if (op == 0) {
   2673                         if (lastItem != LAST0_START) {
   2674                             op = (char) c;
   2675                             continue;
   2676                         } else if (lastString != null) {
   2677                             op = (char) c;
   2678                             continue;
   2679                         } else {
   2680                             // Treat final trailing '-' as a literal
   2681                             add_unchecked(c, c);
   2682                             c = chars.next(opts);
   2683                             literal = chars.isEscaped();
   2684                             if (c == ']' && !literal) {
   2685                                 patBuf.append("-]");
   2686                                 mode = MODE2_OUTBRACKET;
   2687                                 continue;
   2688                             }
   2689                         }
   2690                     }
   2691                     syntaxError(chars, "'-' not after char, string, or set");
   2692                     break;
   2693                 case '&':
   2694                     if (lastItem == LAST2_SET && op == 0) {
   2695                         op = (char) c;
   2696                         continue;
   2697                     }
   2698                     syntaxError(chars, "'&' not after set");
   2699                     break;
   2700                 case '^':
   2701                     syntaxError(chars, "'^' not after '['");
   2702                     break;
   2703                 case '{':
   2704                     if (op != 0 && op != '-') {
   2705                         syntaxError(chars, "Missing operand after operator");
   2706                     }
   2707                     if (lastItem == LAST1_RANGE) {
   2708                         add_unchecked(lastChar, lastChar);
   2709                         _appendToPat(patBuf, lastChar, false);
   2710                     }
   2711                     lastItem = LAST0_START;
   2712                     if (buf == null) {
   2713                         buf = new StringBuilder();
   2714                     } else {
   2715                         buf.setLength(0);
   2716                     }
   2717                     boolean ok = false;
   2718                     while (!chars.atEnd()) {
   2719                         c = chars.next(opts);
   2720                         literal = chars.isEscaped();
   2721                         if (c == '}' && !literal) {
   2722                             ok = true;
   2723                             break;
   2724                         }
   2725                         appendCodePoint(buf, c);
   2726                     }
   2727                     if (buf.length() < 1 || !ok) {
   2728                         syntaxError(chars, "Invalid multicharacter string");
   2729                     }
   2730                     // We have new string. Add it to set and continue;
   2731                     // we don't need to drop through to the further
   2732                     // processing
   2733                     String curString = buf.toString();
   2734                     if (op == '-') {
   2735                         int lastSingle = CharSequences.getSingleCodePoint(lastString == null ? "" : lastString);
   2736                         int curSingle = CharSequences.getSingleCodePoint(curString);
   2737                         if (lastSingle != Integer.MAX_VALUE && curSingle != Integer.MAX_VALUE) {
   2738                             add(lastSingle,curSingle);
   2739                         } else {
   2740                             try {
   2741                                 StringRange.expand(lastString, curString, true, strings);
   2742                             } catch (Exception e) {
   2743                                 syntaxError(chars, e.getMessage());
   2744                             }
   2745                         }
   2746                         lastString = null;
   2747                         op = 0;
   2748                     } else {
   2749                         add(curString);
   2750                         lastString = curString;
   2751                     }
   2752                     patBuf.append('{');
   2753                     _appendToPat(patBuf, curString, false);
   2754                     patBuf.append('}');
   2755                     continue;
   2756                 case SymbolTable.SYMBOL_REF:
   2757                     //         symbols  nosymbols
   2758                     // [a-$]   error    error (ambiguous)
   2759                     // [a$]    anchor   anchor
   2760                     // [a-$x]  var "x"* literal '$'
   2761                     // [a-$.]  error    literal '$'
   2762                     // *We won't get here in the case of var "x"
   2763                     backup = chars.getPos(backup);
   2764                     c = chars.next(opts);
   2765                     literal = chars.isEscaped();
   2766                     boolean anchor = (c == ']' && !literal);
   2767                     if (symbols == null && !anchor) {
   2768                         c = SymbolTable.SYMBOL_REF;
   2769                         chars.setPos(backup);
   2770                         break; // literal '$'
   2771                     }
   2772                     if (anchor && op == 0) {
   2773                         if (lastItem == LAST1_RANGE) {
   2774                             add_unchecked(lastChar, lastChar);
   2775                             _appendToPat(patBuf, lastChar, false);
   2776                         }
   2777                         add_unchecked(UnicodeMatcher.ETHER);
   2778                         usePat = true;
   2779                         patBuf.append(SymbolTable.SYMBOL_REF).append(']');
   2780                         mode = MODE2_OUTBRACKET;
   2781                         continue;
   2782                     }
   2783                     syntaxError(chars, "Unquoted '$'");
   2784                     break;
   2785                 default:
   2786                     break;
   2787                 }
   2788             }
   2789 
   2790             // -------- Parse literal characters.  This includes both
   2791             // escaped chars ("\u4E01") and non-syntax characters
   2792             // ("a").
   2793 
   2794             switch (lastItem) {
   2795             case LAST0_START:
   2796                 if (op == '-' && lastString != null) {
   2797                     syntaxError(chars, "Invalid range");
   2798                 }
   2799                 lastItem = LAST1_RANGE;
   2800                 lastChar = c;
   2801                 lastString = null;
   2802                 break;
   2803             case LAST1_RANGE:
   2804                 if (op == '-') {
   2805                     if (lastString != null) {
   2806                         syntaxError(chars, "Invalid range");
   2807                     }
   2808                     if (lastChar >= c) {
   2809                         // Don't allow redundant (a-a) or empty (b-a) ranges;
   2810                         // these are most likely typos.
   2811                         syntaxError(chars, "Invalid range");
   2812                     }
   2813                     add_unchecked(lastChar, c);
   2814                     _appendToPat(patBuf, lastChar, false);
   2815                     patBuf.append(op);
   2816                     _appendToPat(patBuf, c, false);
   2817                     lastItem = LAST0_START;
   2818                     op = 0;
   2819                 } else {
   2820                     add_unchecked(lastChar, lastChar);
   2821                     _appendToPat(patBuf, lastChar, false);
   2822                     lastChar = c;
   2823                 }
   2824                 break;
   2825             case LAST2_SET:
   2826                 if (op != 0) {
   2827                     syntaxError(chars, "Set expected after operator");
   2828                 }
   2829                 lastChar = c;
   2830                 lastItem = LAST1_RANGE;
   2831                 break;
   2832             }
   2833         }
   2834 
   2835         if (mode != MODE2_OUTBRACKET) {
   2836             syntaxError(chars, "Missing ']'");
   2837         }
   2838 
   2839         chars.skipIgnored(opts);
   2840 
   2841         /**
   2842          * Handle global flags (invert, case insensitivity).  If this
   2843          * pattern should be compiled case-insensitive, then we need
   2844          * to close over case BEFORE COMPLEMENTING.  This makes
   2845          * patterns like /[^abc]/i work.
   2846          */
   2847         if ((options & CASE) != 0) {
   2848             closeOver(CASE);
   2849         }
   2850         if (invert) {
   2851             complement();
   2852         }
   2853 
   2854         // Use the rebuilt pattern (pat) only if necessary.  Prefer the
   2855         // generated pattern.
   2856         if (usePat) {
   2857             append(rebuiltPat, patBuf.toString());
   2858         } else {
   2859             appendNewPattern(rebuiltPat, false, true);
   2860         }
   2861     }
   2862 
   2863     private static void syntaxError(RuleCharacterIterator chars, String msg) {
   2864         throw new IllegalArgumentException("Error: " + msg + " at \"" +
   2865                 Utility.escape(chars.toString()) +
   2866                 '"');
   2867     }
   2868 
   2869     /**
   2870      * Add the contents of the UnicodeSet (as strings) into a collection.
   2871      * @param target collection to add into
   2872      * @stable ICU 4.4
   2873      */
   2874     public <T extends Collection<String>> T addAllTo(T target) {
   2875         return addAllTo(this, target);
   2876     }
   2877 
   2878 
   2879     /**
   2880      * Add the contents of the UnicodeSet (as strings) into a collection.
   2881      * @param target collection to add into
   2882      * @stable ICU 4.4
   2883      */
   2884     public String[] addAllTo(String[] target) {
   2885         return addAllTo(this, target);
   2886     }
   2887 
   2888     /**
   2889      * Add the contents of the UnicodeSet (as strings) into an array.
   2890      * @stable ICU 4.4
   2891      */
   2892     public static String[] toArray(UnicodeSet set) {
   2893         return addAllTo(set, new String[set.size()]);
   2894     }
   2895 
   2896     /**
   2897      * Add the contents of the collection (as strings) into this UnicodeSet.
   2898      * The collection must not contain null.
   2899      * @param source the collection to add
   2900      * @return a reference to this object
   2901      * @stable ICU 4.4
   2902      */
   2903     public UnicodeSet add(Iterable<?> source) {
   2904         return addAll(source);
   2905     }
   2906 
   2907     /**
   2908      * Add a collection (as strings) into this UnicodeSet.
   2909      * Uses standard naming convention.
   2910      * @param source collection to add into
   2911      * @return a reference to this object
   2912      * @stable ICU 4.4
   2913      */
   2914     public UnicodeSet addAll(Iterable<?> source) {
   2915         checkFrozen();
   2916         for (Object o : source) {
   2917             add(o.toString());
   2918         }
   2919         return this;
   2920     }
   2921 
   2922     //----------------------------------------------------------------
   2923     // Implementation: Utility methods
   2924     //----------------------------------------------------------------
   2925 
   2926     private void ensureCapacity(int newLen) {
   2927         if (newLen <= list.length) return;
   2928         int[] temp = new int[newLen + GROW_EXTRA];
   2929         System.arraycopy(list, 0, temp, 0, len);
   2930         list = temp;
   2931     }
   2932 
   2933     private void ensureBufferCapacity(int newLen) {
   2934         if (buffer != null && newLen <= buffer.length) return;
   2935         buffer = new int[newLen + GROW_EXTRA];
   2936     }
   2937 
   2938     /**
   2939      * Assumes start <= end.
   2940      */
   2941     private int[] range(int start, int end) {
   2942         if (rangeList == null) {
   2943             rangeList = new int[] { start, end+1, HIGH };
   2944         } else {
   2945             rangeList[0] = start;
   2946             rangeList[1] = end+1;
   2947         }
   2948         return rangeList;
   2949     }
   2950 
   2951     //----------------------------------------------------------------
   2952     // Implementation: Fundamental operations
   2953     //----------------------------------------------------------------
   2954 
   2955     // polarity = 0, 3 is normal: x xor y
   2956     // polarity = 1, 2: x xor ~y == x === y
   2957 
   2958     private UnicodeSet xor(int[] other, int otherLen, int polarity) {
   2959         ensureBufferCapacity(len + otherLen);
   2960         int i = 0, j = 0, k = 0;
   2961         int a = list[i++];
   2962         int b;
   2963         // TODO: Based on the call hierarchy, polarity of 1 or 2 is never used
   2964         //      so the following if statement will not be called.
   2965         ///CLOVER:OFF
   2966         if (polarity == 1 || polarity == 2) {
   2967             b = LOW;
   2968             if (other[j] == LOW) { // skip base if already LOW
   2969                 ++j;
   2970                 b = other[j];
   2971             }
   2972             ///CLOVER:ON
   2973         } else {
   2974             b = other[j++];
   2975         }
   2976         // simplest of all the routines
   2977         // sort the values, discarding identicals!
   2978         while (true) {
   2979             if (a < b) {
   2980                 buffer[k++] = a;
   2981                 a = list[i++];
   2982             } else if (b < a) {
   2983                 buffer[k++] = b;
   2984                 b = other[j++];
   2985             } else if (a != HIGH) { // at this point, a == b
   2986                 // discard both values!
   2987                 a = list[i++];
   2988                 b = other[j++];
   2989             } else { // DONE!
   2990                 buffer[k++] = HIGH;
   2991                 len = k;
   2992                 break;
   2993             }
   2994         }
   2995         // swap list and buffer
   2996         int[] temp = list;
   2997         list = buffer;
   2998         buffer = temp;
   2999         pat = null;
   3000         return this;
   3001     }
   3002 
   3003     // polarity = 0 is normal: x union y
   3004     // polarity = 2: x union ~y
   3005     // polarity = 1: ~x union y
   3006     // polarity = 3: ~x union ~y
   3007 
   3008     private UnicodeSet add(int[] other, int otherLen, int polarity) {
   3009         ensureBufferCapacity(len + otherLen);
   3010         int i = 0, j = 0, k = 0;
   3011         int a = list[i++];
   3012         int b = other[j++];
   3013         // change from xor is that we have to check overlapping pairs
   3014         // polarity bit 1 means a is second, bit 2 means b is.
   3015         main:
   3016             while (true) {
   3017                 switch (polarity) {
   3018                 case 0: // both first; take lower if unequal
   3019                     if (a < b) { // take a
   3020                         // Back up over overlapping ranges in buffer[]
   3021                         if (k > 0 && a <= buffer[k-1]) {
   3022                             // Pick latter end value in buffer[] vs. list[]
   3023                             a = max(list[i], buffer[--k]);
   3024                         } else {
   3025                             // No overlap
   3026                             buffer[k++] = a;
   3027                             a = list[i];
   3028                         }
   3029                         i++; // Common if/else code factored out
   3030                         polarity ^= 1;
   3031                     } else if (b < a) { // take b
   3032                         if (k > 0 && b <= buffer[k-1]) {
   3033                             b = max(other[j], buffer[--k]);
   3034                         } else {
   3035                             buffer[k++] = b;
   3036                             b = other[j];
   3037                         }
   3038                         j++;
   3039                         polarity ^= 2;
   3040                     } else { // a == b, take a, drop b
   3041                         if (a == HIGH) break main;
   3042                         // This is symmetrical; it doesn't matter if
   3043                         // we backtrack with a or b. - liu
   3044                         if (k > 0 && a <= buffer[k-1]) {
   3045                             a = max(list[i], buffer[--k]);
   3046                         } else {
   3047                             // No overlap
   3048                             buffer[k++] = a;
   3049                             a = list[i];
   3050                         }
   3051                         i++;
   3052                         polarity ^= 1;
   3053                         b = other[j++]; polarity ^= 2;
   3054                     }
   3055                     break;
   3056                 case 3: // both second; take higher if unequal, and drop other
   3057                     if (b <= a) { // take a
   3058                         if (a == HIGH) break main;
   3059                         buffer[k++] = a;
   3060                     } else { // take b
   3061                         if (b == HIGH) break main;
   3062                         buffer[k++] = b;
   3063                     }
   3064                     a = list[i++]; polarity ^= 1;   // factored common code
   3065                     b = other[j++]; polarity ^= 2;
   3066                     break;
   3067                 case 1: // a second, b first; if b < a, overlap
   3068                     if (a < b) { // no overlap, take a
   3069                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
   3070                     } else if (b < a) { // OVERLAP, drop b
   3071                         b = other[j++]; polarity ^= 2;
   3072                     } else { // a == b, drop both!
   3073                         if (a == HIGH) break main;
   3074                         a = list[i++]; polarity ^= 1;
   3075                         b = other[j++]; polarity ^= 2;
   3076                     }
   3077                     break;
   3078                 case 2: // a first, b second; if a < b, overlap
   3079                     if (b < a) { // no overlap, take b
   3080                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
   3081                     } else  if (a < b) { // OVERLAP, drop a
   3082                         a = list[i++]; polarity ^= 1;
   3083                     } else { // a == b, drop both!
   3084                         if (a == HIGH) break main;
   3085                         a = list[i++]; polarity ^= 1;
   3086                         b = other[j++]; polarity ^= 2;
   3087                     }
   3088                     break;
   3089                 }
   3090             }
   3091         buffer[k++] = HIGH;    // terminate
   3092         len = k;
   3093         // swap list and buffer
   3094         int[] temp = list;
   3095         list = buffer;
   3096         buffer = temp;
   3097         pat = null;
   3098         return this;
   3099     }
   3100 
   3101     // polarity = 0 is normal: x intersect y
   3102     // polarity = 2: x intersect ~y == set-minus
   3103     // polarity = 1: ~x intersect y
   3104     // polarity = 3: ~x intersect ~y
   3105 
   3106     private UnicodeSet retain(int[] other, int otherLen, int polarity) {
   3107         ensureBufferCapacity(len + otherLen);
   3108         int i = 0, j = 0, k = 0;
   3109         int a = list[i++];
   3110         int b = other[j++];
   3111         // change from xor is that we have to check overlapping pairs
   3112         // polarity bit 1 means a is second, bit 2 means b is.
   3113         main:
   3114             while (true) {
   3115                 switch (polarity) {
   3116                 case 0: // both first; drop the smaller
   3117                     if (a < b) { // drop a
   3118                         a = list[i++]; polarity ^= 1;
   3119                     } else if (b < a) { // drop b
   3120                         b = other[j++]; polarity ^= 2;
   3121                     } else { // a == b, take one, drop other
   3122                         if (a == HIGH) break main;
   3123                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
   3124                         b = other[j++]; polarity ^= 2;
   3125                     }
   3126                     break;
   3127                 case 3: // both second; take lower if unequal
   3128                     if (a < b) { // take a
   3129                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
   3130                     } else if (b < a) { // take b
   3131                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
   3132                     } else { // a == b, take one, drop other
   3133                         if (a == HIGH) break main;
   3134                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
   3135                         b = other[j++]; polarity ^= 2;
   3136                     }
   3137                     break;
   3138                 case 1: // a second, b first;
   3139                     if (a < b) { // NO OVERLAP, drop a
   3140                         a = list[i++]; polarity ^= 1;
   3141                     } else if (b < a) { // OVERLAP, take b
   3142                         buffer[k++] = b; b = other[j++]; polarity ^= 2;
   3143                     } else { // a == b, drop both!
   3144                         if (a == HIGH) break main;
   3145                         a = list[i++]; polarity ^= 1;
   3146                         b = other[j++]; polarity ^= 2;
   3147                     }
   3148                     break;
   3149                 case 2: // a first, b second; if a < b, overlap
   3150                     if (b < a) { // no overlap, drop b
   3151                         b = other[j++]; polarity ^= 2;
   3152                     } else  if (a < b) { // OVERLAP, take a
   3153                         buffer[k++] = a; a = list[i++]; polarity ^= 1;
   3154                     } else { // a == b, drop both!
   3155                         if (a == HIGH) break main;
   3156                         a = list[i++]; polarity ^= 1;
   3157                         b = other[j++]; polarity ^= 2;
   3158                     }
   3159                     break;
   3160                 }
   3161             }
   3162         buffer[k++] = HIGH;    // terminate
   3163         len = k;
   3164         // swap list and buffer
   3165         int[] temp = list;
   3166         list = buffer;
   3167         buffer = temp;
   3168         pat = null;
   3169         return this;
   3170     }
   3171 
   3172     private static final int max(int a, int b) {
   3173         return (a > b) ? a : b;
   3174     }
   3175 
   3176     //----------------------------------------------------------------
   3177     // Generic filter-based scanning code
   3178     //----------------------------------------------------------------
   3179 
   3180     private static interface Filter {
   3181         boolean contains(int codePoint);
   3182     }
   3183 
   3184     private static class NumericValueFilter implements Filter {
   3185         double value;
   3186         NumericValueFilter(double value) { this.value = value; }
   3187         @Override
   3188         public boolean contains(int ch) {
   3189             return UCharacter.getUnicodeNumericValue(ch) == value;
   3190         }
   3191     }
   3192 
   3193     private static class GeneralCategoryMaskFilter implements Filter {
   3194         int mask;
   3195         GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
   3196         @Override
   3197         public boolean contains(int ch) {
   3198             return ((1 << UCharacter.getType(ch)) & mask) != 0;
   3199         }
   3200     }
   3201 
   3202     private static class IntPropertyFilter implements Filter {
   3203         int prop;
   3204         int value;
   3205         IntPropertyFilter(int prop, int value) {
   3206             this.prop = prop;
   3207             this.value = value;
   3208         }
   3209         @Override
   3210         public boolean contains(int ch) {
   3211             return UCharacter.getIntPropertyValue(ch, prop) == value;
   3212         }
   3213     }
   3214 
   3215     private static class ScriptExtensionsFilter implements Filter {
   3216         int script;
   3217         ScriptExtensionsFilter(int script) { this.script = script; }
   3218         @Override
   3219         public boolean contains(int c) {
   3220             return UScript.hasScript(c, script);
   3221         }
   3222     }
   3223 
   3224     // VersionInfo for unassigned characters
   3225     private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
   3226 
   3227     private static class VersionFilter implements Filter {
   3228         VersionInfo version;
   3229         VersionFilter(VersionInfo version) { this.version = version; }
   3230         @Override
   3231         public boolean contains(int ch) {
   3232             VersionInfo v = UCharacter.getAge(ch);
   3233             // Reference comparison ok; VersionInfo caches and reuses
   3234             // unique objects.
   3235             return !Utility.sameObjects(v, NO_VERSION) &&
   3236                     v.compareTo(version) <= 0;
   3237         }
   3238     }
   3239 
   3240     private static synchronized UnicodeSet getInclusions(int src) {
   3241         if (INCLUSIONS == null) {
   3242             INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
   3243         }
   3244         if(INCLUSIONS[src] == null) {
   3245             UnicodeSet incl = new UnicodeSet();
   3246             switch(src) {
   3247             case UCharacterProperty.SRC_CHAR:
   3248                 UCharacterProperty.INSTANCE.addPropertyStarts(incl);
   3249                 break;
   3250             case UCharacterProperty.SRC_PROPSVEC:
   3251                 UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
   3252                 break;
   3253             case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
   3254                 UCharacterProperty.INSTANCE.addPropertyStarts(incl);
   3255                 UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
   3256                 break;
   3257             case UCharacterProperty.SRC_CASE_AND_NORM:
   3258                 Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl);
   3259                 UCaseProps.INSTANCE.addPropertyStarts(incl);
   3260                 break;
   3261             case UCharacterProperty.SRC_NFC:
   3262                 Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl);
   3263                 break;
   3264             case UCharacterProperty.SRC_NFKC:
   3265                 Norm2AllModes.getNFKCInstance().impl.addPropertyStarts(incl);
   3266                 break;
   3267             case UCharacterProperty.SRC_NFKC_CF:
   3268                 Norm2AllModes.getNFKC_CFInstance().impl.addPropertyStarts(incl);
   3269                 break;
   3270             case UCharacterProperty.SRC_NFC_CANON_ITER:
   3271                 Norm2AllModes.getNFCInstance().impl.addCanonIterPropertyStarts(incl);
   3272                 break;
   3273             case UCharacterProperty.SRC_CASE:
   3274                 UCaseProps.INSTANCE.addPropertyStarts(incl);
   3275                 break;
   3276             case UCharacterProperty.SRC_BIDI:
   3277                 UBiDiProps.INSTANCE.addPropertyStarts(incl);
   3278                 break;
   3279             default:
   3280                 throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
   3281             }
   3282             INCLUSIONS[src] = incl;
   3283         }
   3284         return INCLUSIONS[src];
   3285     }
   3286 
   3287     /**
   3288      * Generic filter-based scanning code for UCD property UnicodeSets.
   3289      */
   3290     private UnicodeSet applyFilter(Filter filter, int src) {
   3291         // Logically, walk through all Unicode characters, noting the start
   3292         // and end of each range for which filter.contain(c) is
   3293         // true.  Add each range to a set.
   3294         //
   3295         // To improve performance, use an inclusions set which
   3296         // encodes information about character ranges that are known
   3297         // to have identical properties.
   3298         // getInclusions(src) contains exactly the first characters of
   3299         // same-value ranges for the given properties "source".
   3300 
   3301         clear();
   3302 
   3303         int startHasProperty = -1;
   3304         UnicodeSet inclusions = getInclusions(src);
   3305         int limitRange = inclusions.getRangeCount();
   3306 
   3307         for (int j=0; j<limitRange; ++j) {
   3308             // get current range
   3309             int start = inclusions.getRangeStart(j);
   3310             int end = inclusions.getRangeEnd(j);
   3311 
   3312             // for all the code points in the range, process
   3313             for (int ch = start; ch <= end; ++ch) {
   3314                 // only add to the unicodeset on inflection points --
   3315                 // where the hasProperty value changes to false
   3316                 if (filter.contains(ch)) {
   3317                     if (startHasProperty < 0) {
   3318                         startHasProperty = ch;
   3319                     }
   3320                 } else if (startHasProperty >= 0) {
   3321                     add_unchecked(startHasProperty, ch-1);
   3322                     startHasProperty = -1;
   3323                 }
   3324             }
   3325         }
   3326         if (startHasProperty >= 0) {
   3327             add_unchecked(startHasProperty, 0x10FFFF);
   3328         }
   3329 
   3330         return this;
   3331     }
   3332 
   3333 
   3334     /**
   3335      * Remove leading and trailing Pattern_White_Space and compress
   3336      * internal Pattern_White_Space to a single space character.
   3337      */
   3338     private static String mungeCharName(String source) {
   3339         source = PatternProps.trimWhiteSpace(source);
   3340         StringBuilder buf = null;
   3341         for (int i=0; i<source.length(); ++i) {
   3342             char ch = source.charAt(i);
   3343             if (PatternProps.isWhiteSpace(ch)) {
   3344                 if (buf == null) {
   3345                     buf = new StringBuilder().append(source, 0, i);
   3346                 } else if (buf.charAt(buf.length() - 1) == ' ') {
   3347                     continue;
   3348                 }
   3349                 ch = ' '; // convert to ' '
   3350             }
   3351             if (buf != null) {
   3352                 buf.append(ch);
   3353             }
   3354         }
   3355         return buf == null ? source : buf.toString();
   3356     }
   3357 
   3358     //----------------------------------------------------------------
   3359     // Property set API
   3360     //----------------------------------------------------------------
   3361 
   3362     /**
   3363      * Modifies this set to contain those code points which have the
   3364      * given value for the given binary or enumerated property, as
   3365      * returned by UCharacter.getIntPropertyValue.  Prior contents of
   3366      * this set are lost.
   3367      *
   3368      * @param prop a property in the range
   3369      * UProperty.BIN_START..UProperty.BIN_LIMIT-1 or
   3370      * UProperty.INT_START..UProperty.INT_LIMIT-1 or.
   3371      * UProperty.MASK_START..UProperty.MASK_LIMIT-1.
   3372      *
   3373      * @param value a value in the range
   3374      * UCharacter.getIntPropertyMinValue(prop)..
   3375      * UCharacter.getIntPropertyMaxValue(prop), with one exception.
   3376      * If prop is UProperty.GENERAL_CATEGORY_MASK, then value should not be
   3377      * a UCharacter.getType() result, but rather a mask value produced
   3378      * by logically ORing (1 &lt;&lt; UCharacter.getType()) values together.
   3379      * This allows grouped categories such as [:L:] to be represented.
   3380      *
   3381      * @return a reference to this set
   3382      *
   3383      * @stable ICU 2.4
   3384      */
   3385     public UnicodeSet applyIntPropertyValue(int prop, int value) {
   3386         checkFrozen();
   3387         if (prop == UProperty.GENERAL_CATEGORY_MASK) {
   3388             applyFilter(new GeneralCategoryMaskFilter(value), UCharacterProperty.SRC_CHAR);
   3389         } else if (prop == UProperty.SCRIPT_EXTENSIONS) {
   3390             applyFilter(new ScriptExtensionsFilter(value), UCharacterProperty.SRC_PROPSVEC);
   3391         } else {
   3392             applyFilter(new IntPropertyFilter(prop, value), UCharacterProperty.INSTANCE.getSource(prop));
   3393         }
   3394         return this;
   3395     }
   3396 
   3397 
   3398 
   3399     /**
   3400      * Modifies this set to contain those code points which have the
   3401      * given value for the given property.  Prior contents of this
   3402      * set are lost.
   3403      *
   3404      * @param propertyAlias a property alias, either short or long.
   3405      * The name is matched loosely.  See PropertyAliases.txt for names
   3406      * and a description of loose matching.  If the value string is
   3407      * empty, then this string is interpreted as either a
   3408      * General_Category value alias, a Script value alias, a binary
   3409      * property alias, or a special ID.  Special IDs are matched
   3410      * loosely and correspond to the following sets:
   3411      *
   3412      * "ANY" = [\\u0000-\\U0010FFFF],
   3413      * "ASCII" = [\\u0000-\\u007F].
   3414      *
   3415      * @param valueAlias a value alias, either short or long.  The
   3416      * name is matched loosely.  See PropertyValueAliases.txt for
   3417      * names and a description of loose matching.  In addition to
   3418      * aliases listed, numeric values and canonical combining classes
   3419      * may be expressed numerically, e.g., ("nv", "0.5") or ("ccc",
   3420      * "220").  The value string may also be empty.
   3421      *
   3422      * @return a reference to this set
   3423      *
   3424      * @stable ICU 2.4
   3425      */
   3426     public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) {
   3427         return applyPropertyAlias(propertyAlias, valueAlias, null);
   3428     }
   3429 
   3430     /**
   3431      * Modifies this set to contain those code points which have the
   3432      * given value for the given property.  Prior contents of this
   3433      * set are lost.
   3434      * @param propertyAlias A string of the property alias.
   3435      * @param valueAlias A string of the value alias.
   3436      * @param symbols if not null, then symbols are first called to see if a property
   3437      * is available. If true, then everything else is skipped.
   3438      * @return this set
   3439      * @stable ICU 3.2
   3440      */
   3441     public UnicodeSet applyPropertyAlias(String propertyAlias,
   3442             String valueAlias, SymbolTable symbols) {
   3443         checkFrozen();
   3444         int p;
   3445         int v;
   3446         boolean invert = false;
   3447 
   3448         if (symbols != null
   3449                 && (symbols instanceof XSymbolTable)
   3450                 && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) {
   3451             return this;
   3452         }
   3453 
   3454         if (XSYMBOL_TABLE != null) {
   3455             if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) {
   3456                 return this;
   3457             }
   3458         }
   3459 
   3460         if (valueAlias.length() > 0) {
   3461             p = UCharacter.getPropertyEnum(propertyAlias);
   3462 
   3463             // Treat gc as gcm
   3464             if (p == UProperty.GENERAL_CATEGORY) {
   3465                 p = UProperty.GENERAL_CATEGORY_MASK;
   3466             }
   3467 
   3468             if ((p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) ||
   3469                     (p >= UProperty.INT_START && p < UProperty.INT_LIMIT) ||
   3470                     (p >= UProperty.MASK_START && p < UProperty.MASK_LIMIT)) {
   3471                 try {
   3472                     v = UCharacter.getPropertyValueEnum(p, valueAlias);
   3473                 } catch (IllegalArgumentException e) {
   3474                     // Handle numeric CCC
   3475                     if (p == UProperty.CANONICAL_COMBINING_CLASS ||
   3476                             p == UProperty.LEAD_CANONICAL_COMBINING_CLASS ||
   3477                             p == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) {
   3478                         v = Integer.parseInt(PatternProps.trimWhiteSpace(valueAlias));
   3479                         // Anything between 0 and 255 is valid even if unused.
   3480                         if (v < 0 || v > 255) throw e;
   3481                     } else {
   3482                         throw e;
   3483                     }
   3484                 }
   3485             }
   3486 
   3487             else {
   3488                 switch (p) {
   3489                 case UProperty.NUMERIC_VALUE:
   3490                 {
   3491                     double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias));
   3492                     applyFilter(new NumericValueFilter(value), UCharacterProperty.SRC_CHAR);
   3493                     return this;
   3494                 }
   3495                 case UProperty.NAME:
   3496                 {
   3497                     // Must munge name, since
   3498                     // UCharacter.charFromName() does not do
   3499                     // 'loose' matching.
   3500                     String buf = mungeCharName(valueAlias);
   3501                     int ch = UCharacter.getCharFromExtendedName(buf);
   3502                     if (ch == -1) {
   3503                         throw new IllegalArgumentException("Invalid character name");
   3504                     }
   3505                     clear();
   3506                     add_unchecked(ch);
   3507                     return this;
   3508                 }
   3509                 case UProperty.UNICODE_1_NAME:
   3510                     // ICU 49 deprecates the Unicode_1_Name property APIs.
   3511                     throw new IllegalArgumentException("Unicode_1_Name (na1) not supported");
   3512                 case UProperty.AGE:
   3513                 {
   3514                     // Must munge name, since
   3515                     // VersionInfo.getInstance() does not do
   3516                     // 'loose' matching.
   3517                     VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
   3518                     applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
   3519                     return this;
   3520                 }
   3521                 case UProperty.SCRIPT_EXTENSIONS:
   3522                     v = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, valueAlias);
   3523                     // fall through to calling applyIntPropertyValue()
   3524                     break;
   3525                 default:
   3526                     // p is a non-binary, non-enumerated property that we
   3527                     // don't support (yet).
   3528                     throw new IllegalArgumentException("Unsupported property");
   3529                 }
   3530             }
   3531         }
   3532 
   3533         else {
   3534             // valueAlias is empty.  Interpret as General Category, Script,
   3535             // Binary property, or ANY or ASCII.  Upon success, p and v will
   3536             // be set.
   3537             UPropertyAliases pnames = UPropertyAliases.INSTANCE;
   3538             p = UProperty.GENERAL_CATEGORY_MASK;
   3539             v = pnames.getPropertyValueEnum(p, propertyAlias);
   3540             if (v == UProperty.UNDEFINED) {
   3541                 p = UProperty.SCRIPT;
   3542                 v = pnames.getPropertyValueEnum(p, propertyAlias);
   3543                 if (v == UProperty.UNDEFINED) {
   3544                     p = pnames.getPropertyEnum(propertyAlias);
   3545                     if (p == UProperty.UNDEFINED) {
   3546                         p = -1;
   3547                     }
   3548                     if (p >= UProperty.BINARY_START && p < UProperty.BINARY_LIMIT) {
   3549                         v = 1;
   3550                     } else if (p == -1) {
   3551                         if (0 == UPropertyAliases.compare(ANY_ID, propertyAlias)) {
   3552                             set(MIN_VALUE, MAX_VALUE);
   3553                             return this;
   3554                         } else if (0 == UPropertyAliases.compare(ASCII_ID, propertyAlias)) {
   3555                             set(0, 0x7F);
   3556                             return this;
   3557                         } else if (0 == UPropertyAliases.compare(ASSIGNED, propertyAlias)) {
   3558                             // [:Assigned:]=[:^Cn:]
   3559                             p = UProperty.GENERAL_CATEGORY_MASK;
   3560                             v = (1<<UCharacter.UNASSIGNED);
   3561                             invert = true;
   3562                         } else {
   3563                             // Property name was never matched.
   3564                             throw new IllegalArgumentException("Invalid property alias: " + propertyAlias + "=" + valueAlias);
   3565                         }
   3566                     } else {
   3567                         // Valid propery name, but it isn't binary, so the value
   3568                         // must be supplied.
   3569                         throw new IllegalArgumentException("Missing property value");
   3570                     }
   3571                 }
   3572             }
   3573         }
   3574 
   3575         applyIntPropertyValue(p, v);
   3576         if(invert) {
   3577             complement();
   3578         }
   3579 
   3580         return this;
   3581     }
   3582 
   3583     //----------------------------------------------------------------
   3584     // Property set patterns
   3585     //----------------------------------------------------------------
   3586 
   3587     /**
   3588      * Return true if the given position, in the given pattern, appears
   3589      * to be the start of a property set pattern.
   3590      */
   3591     private static boolean resemblesPropertyPattern(String pattern, int pos) {
   3592         // Patterns are at least 5 characters long
   3593         if ((pos+5) > pattern.length()) {
   3594             return false;
   3595         }
   3596 
   3597         // Look for an opening [:, [:^, \p, or \P
   3598         return pattern.regionMatches(pos, "[:", 0, 2) ||
   3599                 pattern.regionMatches(true, pos, "\\p", 0, 2) ||
   3600                 pattern.regionMatches(pos, "\\N", 0, 2);
   3601     }
   3602 
   3603     /**
   3604      * Return true if the given iterator appears to point at a
   3605      * property pattern.  Regardless of the result, return with the
   3606      * iterator unchanged.
   3607      * @param chars iterator over the pattern characters.  Upon return
   3608      * it will be unchanged.
   3609      * @param iterOpts RuleCharacterIterator options
   3610      */
   3611     private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
   3612             int iterOpts) {
   3613         boolean result = false;
   3614         iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
   3615         Object pos = chars.getPos(null);
   3616         int c = chars.next(iterOpts);
   3617         if (c == '[' || c == '\\') {
   3618             int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
   3619             result = (c == '[') ? (d == ':') :
   3620                 (d == 'N' || d == 'p' || d == 'P');
   3621         }
   3622         chars.setPos(pos);
   3623         return result;
   3624     }
   3625 
   3626     /**
   3627      * Parse the given property pattern at the given parse position.
   3628      * @param symbols TODO
   3629      */
   3630     private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
   3631         int pos = ppos.getIndex();
   3632 
   3633         // On entry, ppos should point to one of the following locations:
   3634 
   3635         // Minimum length is 5 characters, e.g. \p{L}
   3636         if ((pos+5) > pattern.length()) {
   3637             return null;
   3638         }
   3639 
   3640         boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
   3641         boolean isName = false; // true for \N{pat}, o/w false
   3642         boolean invert = false;
   3643 
   3644         // Look for an opening [:, [:^, \p, or \P
   3645         if (pattern.regionMatches(pos, "[:", 0, 2)) {
   3646             posix = true;
   3647             pos = PatternProps.skipWhiteSpace(pattern, (pos+2));
   3648             if (pos < pattern.length() && pattern.charAt(pos) == '^') {
   3649                 ++pos;
   3650                 invert = true;
   3651             }
   3652         } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) ||
   3653                 pattern.regionMatches(pos, "\\N", 0, 2)) {
   3654             char c = pattern.charAt(pos+1);
   3655             invert = (c == 'P');
   3656             isName = (c == 'N');
   3657             pos = PatternProps.skipWhiteSpace(pattern, (pos+2));
   3658             if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
   3659                 // Syntax error; "\p" or "\P" not followed by "{"
   3660                 return null;
   3661             }
   3662         } else {
   3663             // Open delimiter not seen
   3664             return null;
   3665         }
   3666 
   3667         // Look for the matching close delimiter, either :] or }
   3668         int close = pattern.indexOf(posix ? ":]" : "}", pos);
   3669         if (close < 0) {
   3670             // Syntax error; close delimiter missing
   3671             return null;
   3672         }
   3673 
   3674         // Look for an '=' sign.  If this is present, we will parse a
   3675         // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
   3676         // pattern.
   3677         int equals = pattern.indexOf('=', pos);
   3678         String propName, valueName;
   3679         if (equals >= 0 && equals < close && !isName) {
   3680             // Equals seen; parse medium/long pattern
   3681             propName = pattern.substring(pos, equals);
   3682             valueName = pattern.substring(equals+1, close);
   3683         }
   3684 
   3685         else {
   3686             // Handle case where no '=' is seen, and \N{}
   3687             propName = pattern.substring(pos, close);
   3688             valueName = "";
   3689 
   3690             // Handle \N{name}
   3691             if (isName) {
   3692                 // This is a little inefficient since it means we have to
   3693                 // parse "na" back to UProperty.NAME even though we already
   3694                 // know it's UProperty.NAME.  If we refactor the API to
   3695                 // support args of (int, String) then we can remove
   3696                 // "na" and make this a little more efficient.
   3697                 valueName = propName;
   3698                 propName = "na";
   3699             }
   3700         }
   3701 
   3702         applyPropertyAlias(propName, valueName, symbols);
   3703 
   3704         if (invert) {
   3705             complement();
   3706         }
   3707 
   3708         // Move to the limit position after the close delimiter
   3709         ppos.setIndex(close + (posix ? 2 : 1));
   3710 
   3711         return this;
   3712     }
   3713 
   3714     /**
   3715      * Parse a property pattern.
   3716      * @param chars iterator over the pattern characters.  Upon return
   3717      * it will be advanced to the first character after the parsed
   3718      * pattern, or the end of the iteration if all characters are
   3719      * parsed.
   3720      * @param rebuiltPat the pattern that was parsed, rebuilt or
   3721      * copied from the input pattern, as appropriate.
   3722      * @param symbols TODO
   3723      */
   3724     private void applyPropertyPattern(RuleCharacterIterator chars,
   3725             Appendable rebuiltPat, SymbolTable symbols) {
   3726         String patStr = chars.lookahead();
   3727         ParsePosition pos = new ParsePosition(0);
   3728         applyPropertyPattern(patStr, pos, symbols);
   3729         if (pos.getIndex() == 0) {
   3730             syntaxError(chars, "Invalid property pattern");
   3731         }
   3732         chars.jumpahead(pos.getIndex());
   3733         append(rebuiltPat, patStr.substring(0, pos.getIndex()));
   3734     }
   3735 
   3736     //----------------------------------------------------------------
   3737     // Case folding API
   3738     //----------------------------------------------------------------
   3739 
   3740     /**
   3741      * Bitmask for constructor and applyPattern() indicating that
   3742      * white space should be ignored.  If set, ignore Unicode Pattern_White_Space characters,
   3743      * unless they are quoted or escaped.  This may be ORed together
   3744      * with other selectors.
   3745      * @stable ICU 3.8
   3746      */
   3747     public static final int IGNORE_SPACE = 1;
   3748 
   3749     /**
   3750      * Bitmask for constructor, applyPattern(), and closeOver()
   3751      * indicating letter case.  This may be ORed together with other
   3752      * selectors.
   3753      *
   3754      * Enable case insensitive matching.  E.g., "[ab]" with this flag
   3755      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
   3756      * match all except 'a', 'A', 'b', and 'B'. This performs a full
   3757      * closure over case mappings, e.g. U+017F for s.
   3758      *
   3759      * The resulting set is a superset of the input for the code points but
   3760      * not for the strings.
   3761      * It performs a case mapping closure of the code points and adds
   3762      * full case folding strings for the code points, and reduces strings of
   3763      * the original set to their full case folding equivalents.
   3764      *
   3765      * This is designed for case-insensitive matches, for example
   3766      * in regular expressions. The full code point case closure allows checking of
   3767      * an input character directly against the closure set.
   3768      * Strings are matched by comparing the case-folded form from the closure
   3769      * set with an incremental case folding of the string in question.
   3770      *
   3771      * The closure set will also contain single code points if the original
   3772      * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
   3773      * This is not necessary (that is, redundant) for the above matching method
   3774      * but results in the same closure sets regardless of whether the original
   3775      * set contained the code point or a string.
   3776      * @stable ICU 3.8
   3777      */
   3778     public static final int CASE = 2;
   3779 
   3780     /**
   3781      * Alias for UnicodeSet.CASE, for ease of porting from C++ where ICU4C
   3782      * also has both USET_CASE and USET_CASE_INSENSITIVE (see uset.h).
   3783      * @see #CASE
   3784      * @stable ICU 3.4
   3785      */
   3786     public static final int CASE_INSENSITIVE = 2;
   3787 
   3788     /**
   3789      * Bitmask for constructor, applyPattern(), and closeOver()
   3790      * indicating letter case.  This may be ORed together with other
   3791      * selectors.
   3792      *
   3793      * Enable case insensitive matching.  E.g., "[ab]" with this flag
   3794      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
   3795      * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
   3796      * title-, and uppercase mappings as well as the case folding
   3797      * of each existing element in the set.
   3798      * @stable ICU 3.4
   3799      */
   3800     public static final int ADD_CASE_MAPPINGS = 4;
   3801 
   3802     //  add the result of a full case mapping to the set
   3803     //  use str as a temporary string to avoid constructing one
   3804     private static final void addCaseMapping(UnicodeSet set, int result, StringBuilder full) {
   3805         if(result >= 0) {
   3806             if(result > UCaseProps.MAX_STRING_LENGTH) {
   3807                 // add a single-code point case mapping
   3808                 set.add(result);
   3809             } else {
   3810                 // add a string case mapping from full with length result
   3811                 set.add(full.toString());
   3812                 full.setLength(0);
   3813             }
   3814         }
   3815         // result < 0: the code point mapped to itself, no need to add it
   3816         // see UCaseProps
   3817     }
   3818 
   3819     /**
   3820      * Close this set over the given attribute.  For the attribute
   3821      * CASE, the result is to modify this set so that:
   3822      *
   3823      * 1. For each character or string 'a' in this set, all strings
   3824      * 'b' such that foldCase(a) == foldCase(b) are added to this set.
   3825      * (For most 'a' that are single characters, 'b' will have
   3826      * b.length() == 1.)
   3827      *
   3828      * 2. For each string 'e' in the resulting set, if e !=
   3829      * foldCase(e), 'e' will be removed.
   3830      *
   3831      * Example: [aq\u00DF{Bc}{bC}{Fi}] =&gt; [aAqQ\u00DF\uFB01{ss}{bc}{fi}]
   3832      *
   3833      * (Here foldCase(x) refers to the operation
   3834      * UCharacter.foldCase(x, true), and a == b actually denotes
   3835      * a.equals(b), not pointer comparison.)
   3836      *
   3837      * @param attribute bitmask for attributes to close over.
   3838      * Currently only the CASE bit is supported.  Any undefined bits
   3839      * are ignored.
   3840      * @return a reference to this set.
   3841      * @stable ICU 3.8
   3842      */
   3843     public UnicodeSet closeOver(int attribute) {
   3844         checkFrozen();
   3845         if ((attribute & (CASE | ADD_CASE_MAPPINGS)) != 0) {
   3846             UCaseProps csp = UCaseProps.INSTANCE;
   3847             UnicodeSet foldSet = new UnicodeSet(this);
   3848             ULocale root = ULocale.ROOT;
   3849 
   3850             // start with input set to guarantee inclusion
   3851             // CASE: remove strings because the strings will actually be reduced (folded);
   3852             //       therefore, start with no strings and add only those needed
   3853             if((attribute & CASE) != 0) {
   3854                 foldSet.strings.clear();
   3855             }
   3856 
   3857             int n = getRangeCount();
   3858             int result;
   3859             StringBuilder full = new StringBuilder();
   3860 
   3861             for (int i=0; i<n; ++i) {
   3862                 int start = getRangeStart(i);
   3863                 int end   = getRangeEnd(i);
   3864 
   3865                 if((attribute & CASE) != 0) {
   3866                     // full case closure
   3867                     for (int cp=start; cp<=end; ++cp) {
   3868                         csp.addCaseClosure(cp, foldSet);
   3869                     }
   3870                 } else {
   3871                     // add case mappings
   3872                     // (does not add long s for regular s, or Kelvin for k, for example)
   3873                     for (int cp=start; cp<=end; ++cp) {
   3874                         result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
   3875                         addCaseMapping(foldSet, result, full);
   3876 
   3877                         result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
   3878                         addCaseMapping(foldSet, result, full);
   3879 
   3880                         result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
   3881                         addCaseMapping(foldSet, result, full);
   3882 
   3883                         result = csp.toFullFolding(cp, full, 0);
   3884                         addCaseMapping(foldSet, result, full);
   3885                     }
   3886                 }
   3887             }
   3888             if (!strings.isEmpty()) {
   3889                 if ((attribute & CASE) != 0) {
   3890                     for (String s : strings) {
   3891                         String str = UCharacter.foldCase(s, 0);
   3892                         if(!csp.addStringCaseClosure(str, foldSet)) {
   3893                             foldSet.add(str); // does not map to code points: add the folded string itself
   3894                         }
   3895                     }
   3896                 } else {
   3897                     BreakIterator bi = BreakIterator.getWordInstance(root);
   3898                     for (String str : strings) {
   3899                         // TODO: call lower-level functions
   3900                         foldSet.add(UCharacter.toLowerCase(root, str));
   3901                         foldSet.add(UCharacter.toTitleCase(root, str, bi));
   3902                         foldSet.add(UCharacter.toUpperCase(root, str));
   3903                         foldSet.add(UCharacter.foldCase(str, 0));
   3904                     }
   3905                 }
   3906             }
   3907             set(foldSet);
   3908         }
   3909         return this;
   3910     }
   3911 
   3912     /**
   3913      * Internal class for customizing UnicodeSet parsing of properties.
   3914      * TODO: extend to allow customizing of codepoint ranges
   3915      * @draft ICU3.8 (retain)
   3916      * @provisional This API might change or be removed in a future release.
   3917      * @author medavis
   3918      */
   3919     abstract public static class XSymbolTable implements SymbolTable {
   3920         /**
   3921          * Default constructor
   3922          * @draft ICU3.8 (retain)
   3923          * @provisional This API might change or be removed in a future release.
   3924          */
   3925         public XSymbolTable(){}
   3926         /**
   3927          * Supplies default implementation for SymbolTable (no action).
   3928          * @draft ICU3.8 (retain)
   3929          * @provisional This API might change or be removed in a future release.
   3930          */
   3931         @Override
   3932         public UnicodeMatcher lookupMatcher(int i) {
   3933             return null;
   3934         }
   3935 
   3936         /**
   3937          * Override the interpretation of the sequence [:propertyName=propertyValue:] (and its negated and Perl-style
   3938          * variant). The propertyName and propertyValue may be existing Unicode aliases, or may not be.
   3939          * <p>
   3940          * This routine will be called whenever the parsing of a UnicodeSet pattern finds such a
   3941          * propertyName+propertyValue combination.
   3942          *
   3943          * @param propertyName
   3944          *            the name of the property
   3945          * @param propertyValue
   3946          *            the name of the property value
   3947          * @param result UnicodeSet value to change
   3948          *            a set to which the characters having the propertyName+propertyValue are to be added.
   3949          * @return returns true if the propertyName+propertyValue combination is to be overridden, and the characters
   3950          *         with that property have been added to the UnicodeSet, and returns false if the
   3951          *         propertyName+propertyValue combination is not recognized (in which case result is unaltered).
   3952          * @draft ICU3.8 (retain)
   3953          * @provisional This API might change or be removed in a future release.
   3954          */
   3955         public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) {
   3956             return false;
   3957         }
   3958         /**
   3959          * Supplies default implementation for SymbolTable (no action).
   3960          * @draft ICU3.8 (retain)
   3961          * @provisional This API might change or be removed in a future release.
   3962          */
   3963         @Override
   3964         public char[] lookup(String s) {
   3965             return null;
   3966         }
   3967         /**
   3968          * Supplies default implementation for SymbolTable (no action).
   3969          * @draft ICU3.8 (retain)
   3970          * @provisional This API might change or be removed in a future release.
   3971          */
   3972         @Override
   3973         public String parseReference(String text, ParsePosition pos, int limit) {
   3974             return null;
   3975         }
   3976     }
   3977 
   3978     /**
   3979      * Is this frozen, according to the Freezable interface?
   3980      *
   3981      * @return value
   3982      * @stable ICU 3.8
   3983      */
   3984     @Override
   3985     public boolean isFrozen() {
   3986         return (bmpSet != null || stringSpan != null);
   3987     }
   3988 
   3989     /**
   3990      * Freeze this class, according to the Freezable interface.
   3991      *
   3992      * @return this
   3993      * @stable ICU 4.4
   3994      */
   3995     @Override
   3996     public UnicodeSet freeze() {
   3997         if (!isFrozen()) {
   3998             // Do most of what compact() does before freezing because
   3999             // compact() will not work when the set is frozen.
   4000             // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
   4001 
   4002             // Delete buffer first to defragment memory less.
   4003             buffer = null;
   4004             if (list.length > (len + GROW_EXTRA)) {
   4005                 // Make the capacity equal to len or 1.
   4006                 // We don't want to realloc of 0 size.
   4007                 int capacity = (len == 0) ? 1 : len;
   4008                 int[] oldList = list;
   4009                 list = new int[capacity];
   4010                 for (int i = capacity; i-- > 0;) {
   4011                     list[i] = oldList[i];
   4012                 }
   4013             }
   4014 
   4015             // Optimize contains() and span() and similar functions.
   4016             if (!strings.isEmpty()) {
   4017                 stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL);
   4018             }
   4019             if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) {
   4020                 // Optimize for code point spans.
   4021                 // There are no strings, or
   4022                 // all strings are irrelevant for span() etc. because
   4023                 // all of each string's code points are contained in this set.
   4024                 // However, fully contained strings are relevant for spanAndCount(),
   4025                 // so we create both objects.
   4026                 bmpSet = new BMPSet(list, len);
   4027             }
   4028         }
   4029         return this;
   4030     }
   4031 
   4032     /**
   4033      * Span a string using this UnicodeSet.
   4034      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
   4035      * @param s The string to be spanned
   4036      * @param spanCondition The span condition
   4037      * @return the length of the span
   4038      * @stable ICU 4.4
   4039      */
   4040     public int span(CharSequence s, SpanCondition spanCondition) {
   4041         return span(s, 0, spanCondition);
   4042     }
   4043 
   4044     /**
   4045      * Span a string using this UnicodeSet.
   4046      *   If the start index is less than 0, span will start from 0.
   4047      *   If the start index is greater than the string length, span returns the string length.
   4048      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
   4049      * @param s The string to be spanned
   4050      * @param start The start index that the span begins
   4051      * @param spanCondition The span condition
   4052      * @return the string index which ends the span (i.e. exclusive)
   4053      * @stable ICU 4.4
   4054      */
   4055     public int span(CharSequence s, int start, SpanCondition spanCondition) {
   4056         int end = s.length();
   4057         if (start < 0) {
   4058             start = 0;
   4059         } else if (start >= end) {
   4060             return end;
   4061         }
   4062         if (bmpSet != null) {
   4063             // Frozen set without strings, or no string is relevant for span().
   4064             return bmpSet.span(s, start, spanCondition, null);
   4065         }
   4066         if (stringSpan != null) {
   4067             return stringSpan.span(s, start, spanCondition);
   4068         } else if (!strings.isEmpty()) {
   4069             int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
   4070                     : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
   4071             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
   4072             if (strSpan.needsStringSpanUTF16()) {
   4073                 return strSpan.span(s, start, spanCondition);
   4074             }
   4075         }
   4076 
   4077         return spanCodePointsAndCount(s, start, spanCondition, null);
   4078     }
   4079 
   4080     /**
   4081      * Same as span() but also counts the smallest number of set elements on any path across the span.
   4082      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
   4083      * @param outCount An output-only object (must not be null) for returning the count.
   4084      * @return the limit (exclusive end) of the span
   4085      * @internal
   4086      * @deprecated This API is ICU internal only.
   4087      */
   4088     @Deprecated
   4089     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
   4090         if (outCount == null) {
   4091             throw new IllegalArgumentException("outCount must not be null");
   4092         }
   4093         int end = s.length();
   4094         if (start < 0) {
   4095             start = 0;
   4096         } else if (start >= end) {
   4097             return end;
   4098         }
   4099         if (stringSpan != null) {
   4100             // We might also have bmpSet != null,
   4101             // but fully-contained strings are relevant for counting elements.
   4102             return stringSpan.spanAndCount(s, start, spanCondition, outCount);
   4103         } else if (bmpSet != null) {
   4104             return bmpSet.span(s, start, spanCondition, outCount);
   4105         } else if (!strings.isEmpty()) {
   4106             int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
   4107                     : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
   4108             which |= UnicodeSetStringSpan.WITH_COUNT;
   4109             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
   4110             return strSpan.spanAndCount(s, start, spanCondition, outCount);
   4111         }
   4112 
   4113         return spanCodePointsAndCount(s, start, spanCondition, outCount);
   4114     }
   4115 
   4116     private int spanCodePointsAndCount(CharSequence s, int start,
   4117             SpanCondition spanCondition, OutputInt outCount) {
   4118         // Pin to 0/1 values.
   4119         boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
   4120 
   4121         int c;
   4122         int next = start;
   4123         int length = s.length();
   4124         int count = 0;
   4125         do {
   4126             c = Character.codePointAt(s, next);
   4127             if (spanContained != contains(c)) {
   4128                 break;
   4129             }
   4130             ++count;
   4131             next += Character.charCount(c);
   4132         } while (next < length);
   4133         if (outCount != null) { outCount.value = count; }
   4134         return next;
   4135     }
   4136 
   4137     /**
   4138      * Span a string backwards (from the end) using this UnicodeSet.
   4139      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
   4140      * @param s The string to be spanned
   4141      * @param spanCondition The span condition
   4142      * @return The string index which starts the span (i.e. inclusive).
   4143      * @stable ICU 4.4
   4144      */
   4145     public int spanBack(CharSequence s, SpanCondition spanCondition) {
   4146         return spanBack(s, s.length(), spanCondition);
   4147     }
   4148 
   4149     /**
   4150      * Span a string backwards (from the fromIndex) using this UnicodeSet.
   4151      * If the fromIndex is less than 0, spanBack will return 0.
   4152      * If fromIndex is greater than the string length, spanBack will start from the string length.
   4153      * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
   4154      * @param s The string to be spanned
   4155      * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards
   4156      * @param spanCondition The span condition
   4157      * @return The string index which starts the span (i.e. inclusive).
   4158      * @stable ICU 4.4
   4159      */
   4160     public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
   4161         if (fromIndex <= 0) {
   4162             return 0;
   4163         }
   4164         if (fromIndex > s.length()) {
   4165             fromIndex = s.length();
   4166         }
   4167         if (bmpSet != null) {
   4168             // Frozen set without strings, or no string is relevant for spanBack().
   4169             return bmpSet.spanBack(s, fromIndex, spanCondition);
   4170         }
   4171         if (stringSpan != null) {
   4172             return stringSpan.spanBack(s, fromIndex, spanCondition);
   4173         } else if (!strings.isEmpty()) {
   4174             int which = (spanCondition == SpanCondition.NOT_CONTAINED)
   4175                     ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
   4176                             : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
   4177             UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which);
   4178             if (strSpan.needsStringSpanUTF16()) {
   4179                 return strSpan.spanBack(s, fromIndex, spanCondition);
   4180             }
   4181         }
   4182 
   4183         // Pin to 0/1 values.
   4184         boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED);
   4185 
   4186         int c;
   4187         int prev = fromIndex;
   4188         do {
   4189             c = Character.codePointBefore(s, prev);
   4190             if (spanContained != contains(c)) {
   4191                 break;
   4192             }
   4193             prev -= Character.charCount(c);
   4194         } while (prev > 0);
   4195         return prev;
   4196     }
   4197 
   4198     /**
   4199      * Clone a thawed version of this class, according to the Freezable interface.
   4200      * @return the clone, not frozen
   4201      * @stable ICU 4.4
   4202      */
   4203     @Override
   4204     public UnicodeSet cloneAsThawed() {
   4205         UnicodeSet result = new UnicodeSet(this);
   4206         assert !result.isFrozen();
   4207         return result;
   4208     }
   4209 
   4210     // internal function
   4211     private void checkFrozen() {
   4212         if (isFrozen()) {
   4213             throw new UnsupportedOperationException("Attempt to modify frozen object");
   4214         }
   4215     }
   4216 
   4217     // ************************
   4218     // Additional methods for integration with Generics and Collections
   4219     // ************************
   4220 
   4221     /**
   4222      * A struct-like class used for iteration through ranges, for faster iteration than by String.
   4223      * Read about the restrictions on usage in {@link UnicodeSet#ranges()}.
   4224      *
   4225      * @stable ICU 54
   4226      */
   4227     public static class EntryRange {
   4228         /**
   4229          * The starting code point of the range.
   4230          *
   4231          * @stable ICU 54
   4232          */
   4233         public int codepoint;
   4234         /**
   4235          * The ending code point of the range
   4236          *
   4237          * @stable ICU 54
   4238          */
   4239         public int codepointEnd;
   4240 
   4241         EntryRange() {
   4242         }
   4243 
   4244         /**
   4245          * {@inheritDoc}
   4246          *
   4247          * @stable ICU 54
   4248          */
   4249         @Override
   4250         public String toString() {
   4251             StringBuilder b = new StringBuilder();
   4252             return (
   4253                     codepoint == codepointEnd ? _appendToPat(b, codepoint, false)
   4254                             : _appendToPat(_appendToPat(b, codepoint, false).append('-'), codepointEnd, false))
   4255                             .toString();
   4256         }
   4257     }
   4258 
   4259     /**
   4260      * Provide for faster iteration than by String. Returns an Iterable/Iterator over ranges of code points.
   4261      * The UnicodeSet must not be altered during the iteration.
   4262      * The EntryRange instance is the same each time; the contents are just reset.
   4263      *
   4264      * <p><b>Warning: </b>To iterate over the full contents, you have to also iterate over the strings.
   4265      *
   4266      * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
   4267      * Do not alter the UnicodeSet while iterating.
   4268      *
   4269      * <pre>
   4270      * // Sample code
   4271      * for (EntryRange range : us1.ranges()) {
   4272      *     // do something with code points between range.codepoint and range.codepointEnd;
   4273      * }
   4274      * for (String s : us1.strings()) {
   4275      *     // do something with each string;
   4276      * }
   4277      * </pre>
   4278      *
   4279      * @stable ICU 54
   4280      */
   4281     public Iterable<EntryRange> ranges() {
   4282         return new EntryRangeIterable();
   4283     }
   4284 
   4285     private class EntryRangeIterable implements Iterable<EntryRange> {
   4286         @Override
   4287         public Iterator<EntryRange> iterator() {
   4288             return new EntryRangeIterator();
   4289         }
   4290     }
   4291 
   4292     private class EntryRangeIterator implements Iterator<EntryRange> {
   4293         int pos;
   4294         EntryRange result = new EntryRange();
   4295 
   4296         @Override
   4297         public boolean hasNext() {
   4298             return pos < len-1;
   4299         }
   4300         @Override
   4301         public EntryRange next() {
   4302             if (pos < len-1) {
   4303                 result.codepoint = list[pos++];
   4304                 result.codepointEnd = list[pos++]-1;
   4305             } else {
   4306                 throw new NoSuchElementException();
   4307             }
   4308             return result;
   4309         }
   4310         @Override
   4311         public void remove() {
   4312             throw new UnsupportedOperationException();
   4313         }
   4314     }
   4315 
   4316 
   4317     /**
   4318      * Returns a string iterator. Uses the same order of iteration as {@link UnicodeSetIterator}.
   4319      * <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
   4320      * Do not alter the UnicodeSet while iterating.
   4321      * @see java.util.Set#iterator()
   4322      * @stable ICU 4.4
   4323      */
   4324     @Override
   4325     public Iterator<String> iterator() {
   4326         return new UnicodeSetIterator2(this);
   4327     }
   4328 
   4329     // Cover for string iteration.
   4330     private static class UnicodeSetIterator2 implements Iterator<String> {
   4331         // Invariants:
   4332         // sourceList != null then sourceList[item] is a valid character
   4333         // sourceList == null then delegates to stringIterator
   4334         private int[] sourceList;
   4335         private int len;
   4336         private int item;
   4337         private int current;
   4338         private int limit;
   4339         private TreeSet<String> sourceStrings;
   4340         private Iterator<String> stringIterator;
   4341         private char[] buffer;
   4342 
   4343         UnicodeSetIterator2(UnicodeSet source) {
   4344             // set according to invariants
   4345             len = source.len - 1;
   4346             if (len > 0) {
   4347                 sourceStrings = source.strings;
   4348                 sourceList = source.list;
   4349                 current = sourceList[item++];
   4350                 limit = sourceList[item++];
   4351             } else {
   4352                 stringIterator = source.strings.iterator();
   4353                 sourceList = null;
   4354             }
   4355         }
   4356 
   4357         /* (non-Javadoc)
   4358          * @see java.util.Iterator#hasNext()
   4359          */
   4360         @Override
   4361         public boolean hasNext() {
   4362             return sourceList != null || stringIterator.hasNext();
   4363         }
   4364 
   4365         /* (non-Javadoc)
   4366          * @see java.util.Iterator#next()
   4367          */
   4368         @Override
   4369         public String next() {
   4370             if (sourceList == null) {
   4371                 return stringIterator.next();
   4372             }
   4373             int codepoint = current++;
   4374             // we have the codepoint we need, but we may need to adjust the state
   4375             if (current >= limit) {
   4376                 if (item >= len) {
   4377                     stringIterator = sourceStrings.iterator();
   4378                     sourceList = null;
   4379                 } else {
   4380                     current = sourceList[item++];
   4381                     limit = sourceList[item++];
   4382                 }
   4383             }
   4384             // Now return. Single code point is easy
   4385             if (codepoint <= 0xFFFF) {
   4386                 return String.valueOf((char)codepoint);
   4387             }
   4388             // But Java lacks a valueOfCodePoint, so we handle ourselves for speed
   4389             // allocate a buffer the first time, to make conversion faster.
   4390             if (buffer == null) {
   4391                 buffer = new char[2];
   4392             }
   4393             // compute ourselves, to save tests and calls
   4394             int offset = codepoint - Character.MIN_SUPPLEMENTARY_CODE_POINT;
   4395             buffer[0] = (char)((offset >>> 10) + Character.MIN_HIGH_SURROGATE);
   4396             buffer[1] = (char)((offset & 0x3ff) + Character.MIN_LOW_SURROGATE);
   4397             return String.valueOf(buffer);
   4398         }
   4399 
   4400         /* (non-Javadoc)
   4401          * @see java.util.Iterator#remove()
   4402          */
   4403         @Override
   4404         public void remove() {
   4405             throw new UnsupportedOperationException();
   4406         }
   4407     }
   4408 
   4409     /**
   4410      * @see #containsAll(com.ibm.icu.text.UnicodeSet)
   4411      * @stable ICU 4.4
   4412      */
   4413     public <T extends CharSequence> boolean containsAll(Iterable<T> collection) {
   4414         for (T o : collection) {
   4415             if (!contains(o)) {
   4416                 return false;
   4417             }
   4418         }
   4419         return true;
   4420     }
   4421 
   4422     /**
   4423      * @see #containsNone(com.ibm.icu.text.UnicodeSet)
   4424      * @stable ICU 4.4
   4425      */
   4426     public <T extends CharSequence> boolean containsNone(Iterable<T> collection) {
   4427         for (T o : collection) {
   4428             if (contains(o)) {
   4429                 return false;
   4430             }
   4431         }
   4432         return true;
   4433     }
   4434 
   4435     /**
   4436      * @see #containsAll(com.ibm.icu.text.UnicodeSet)
   4437      * @stable ICU 4.4
   4438      */
   4439     public final <T extends CharSequence> boolean containsSome(Iterable<T> collection) {
   4440         return !containsNone(collection);
   4441     }
   4442 
   4443     /**
   4444      * @see #addAll(com.ibm.icu.text.UnicodeSet)
   4445      * @stable ICU 4.4
   4446      */
   4447     @SuppressWarnings("unchecked")  // See ticket #11395, this is safe.
   4448     public <T extends CharSequence> UnicodeSet addAll(T... collection) {
   4449         checkFrozen();
   4450         for (T str : collection) {
   4451             add(str);
   4452         }
   4453         return this;
   4454     }
   4455 
   4456 
   4457     /**
   4458      * @see #removeAll(com.ibm.icu.text.UnicodeSet)
   4459      * @stable ICU 4.4
   4460      */
   4461     public <T extends CharSequence> UnicodeSet removeAll(Iterable<T> collection) {
   4462         checkFrozen();
   4463         for (T o : collection) {
   4464             remove(o);
   4465         }
   4466         return this;
   4467     }
   4468 
   4469     /**
   4470      * @see #retainAll(com.ibm.icu.text.UnicodeSet)
   4471      * @stable ICU 4.4
   4472      */
   4473     public <T extends CharSequence> UnicodeSet retainAll(Iterable<T> collection) {
   4474         checkFrozen();
   4475         // TODO optimize
   4476         UnicodeSet toRetain = new UnicodeSet();
   4477         toRetain.addAll(collection);
   4478         retainAll(toRetain);
   4479         return this;
   4480     }
   4481 
    /**
     * Comparison style enums used by {@link UnicodeSet#compareTo(UnicodeSet, ComparisonStyle)}.
     * The style decides whether, and in which direction, the sizes of the two operands
     * are compared before their contents are compared.
     * @stable ICU 4.4
     */
    public enum ComparisonStyle {
        /**
         * If the sizes differ, the smaller operand sorts first; contents are compared only for equal sizes.
         * @stable ICU 4.4
         */
        SHORTER_FIRST,
        /**
         * Sizes are not compared; only the contents are compared.
         * @stable ICU 4.4
         */
        LEXICOGRAPHIC,
        /**
         * If the sizes differ, the larger operand sorts first; contents are compared only for equal sizes.
         * @stable ICU 4.4
         */
        LONGER_FIRST
    }
   4500 
   4501     /**
   4502      * Compares UnicodeSets, where shorter come first, and otherwise lexigraphically
   4503      * (according to the comparison of the first characters that differ).
   4504      * @see java.lang.Comparable#compareTo(java.lang.Object)
   4505      * @stable ICU 4.4
   4506      */
   4507     @Override
   4508     public int compareTo(UnicodeSet o) {
   4509         return compareTo(o, ComparisonStyle.SHORTER_FIRST);
   4510     }
   4511     /**
   4512      * Compares UnicodeSets, in three different ways.
   4513      * @see java.lang.Comparable#compareTo(java.lang.Object)
   4514      * @stable ICU 4.4
   4515      */
   4516     public int compareTo(UnicodeSet o, ComparisonStyle style) {
   4517         if (style != ComparisonStyle.LEXICOGRAPHIC) {
   4518             int diff = size() - o.size();
   4519             if (diff != 0) {
   4520                 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1;
   4521             }
   4522         }
   4523         int result;
   4524         for (int i = 0; ; ++i) {
   4525             if (0 != (result = list[i] - o.list[i])) {
   4526                 // if either list ran out, compare to the last string
   4527                 if (list[i] == HIGH) {
   4528                     if (strings.isEmpty()) return 1;
   4529                     String item = strings.first();
   4530                     return compare(item, o.list[i]);
   4531                 }
   4532                 if (o.list[i] == HIGH) {
   4533                     if (o.strings.isEmpty()) return -1;
   4534                     String item = o.strings.first();
   4535                     int compareResult = compare(item, list[i]);
   4536                     return compareResult > 0 ? -1 : compareResult < 0 ? 1 : 0; // Reverse the order.
   4537                 }
   4538                 // otherwise return the result if even index, or the reversal if not
   4539                 return (i & 1) == 0 ? result : -result;
   4540             }
   4541             if (list[i] == HIGH) {
   4542                 break;
   4543             }
   4544         }
   4545         return compare(strings, o.strings);
   4546     }
   4547 
   4548     /**
   4549      * @stable ICU 4.4
   4550      */
   4551     public int compareTo(Iterable<String> other) {
   4552         return compare(this, other);
   4553     }
   4554 
   4555     /**
   4556      * Utility to compare a string to a code point.
   4557      * Same results as turning the code point into a string (with the [ugly] new StringBuilder().appendCodePoint(codepoint).toString())
   4558      * and comparing, but much faster (no object creation).
   4559      * Actually, there is one difference; a null compares as less.
   4560      * Note that this (=String) order is UTF-16 order -- *not* code point order.
   4561      * @stable ICU 4.4
   4562      */
   4563 
   4564     public static int compare(CharSequence string, int codePoint) {
   4565         return CharSequences.compare(string, codePoint);
   4566     }
   4567 
   4568     /**
   4569      * Utility to compare a string to a code point.
   4570      * Same results as turning the code point into a string and comparing, but much faster (no object creation).
   4571      * Actually, there is one difference; a null compares as less.
   4572      * Note that this (=String) order is UTF-16 order -- *not* code point order.
   4573      * @stable ICU 4.4
   4574      */
   4575     public static int compare(int codePoint, CharSequence string) {
   4576         return -CharSequences.compare(string, codePoint);
   4577     }
   4578 
   4579 
   4580     /**
   4581      * Utility to compare two iterables. Warning: the ordering in iterables is important. For Collections that are ordered,
   4582      * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
   4583      * That means that sets can't be compared directly with this method, unless they are TreeSets without
   4584      * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of
   4585      * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances.
   4586      * @stable ICU 4.4
   4587      */
   4588     public static <T extends Comparable<T>> int compare(Iterable<T> collection1, Iterable<T> collection2) {
   4589         return compare(collection1.iterator(), collection2.iterator());
   4590     }
   4591 
   4592     /**
   4593      * Utility to compare two iterators. Warning: the ordering in iterables is important. For Collections that are ordered,
   4594      * like Lists, that is expected. However, Sets in Java violate Leibniz's law when it comes to iteration.
   4595      * That means that sets can't be compared directly with this method, unless they are TreeSets without
   4596      * (or with the same) comparator. Unfortunately, it is impossible to reliably detect in Java whether subclass of
   4597      * Collection satisfies the right criteria, so it is left to the user to avoid those circumstances.
   4598      * @internal
   4599      * @deprecated This API is ICU internal only.
   4600      */
   4601     @Deprecated
   4602     public static <T extends Comparable<T>> int compare(Iterator<T> first, Iterator<T> other) {
   4603         while (true) {
   4604             if (!first.hasNext()) {
   4605                 return other.hasNext() ? -1 : 0;
   4606             } else if (!other.hasNext()) {
   4607                 return 1;
   4608             }
   4609             T item1 = first.next();
   4610             T item2 = other.next();
   4611             int result = item1.compareTo(item2);
   4612             if (result != 0) {
   4613                 return result;
   4614             }
   4615         }
   4616     }
   4617 
   4618 
   4619     /**
   4620      * Utility to compare two collections, optionally by size, and then lexicographically.
   4621      * @stable ICU 4.4
   4622      */
   4623     public static <T extends Comparable<T>> int compare(Collection<T> collection1, Collection<T> collection2, ComparisonStyle style) {
   4624         if (style != ComparisonStyle.LEXICOGRAPHIC) {
   4625             int diff = collection1.size() - collection2.size();
   4626             if (diff != 0) {
   4627                 return (diff < 0) == (style == ComparisonStyle.SHORTER_FIRST) ? -1 : 1;
   4628             }
   4629         }
   4630         return compare(collection1, collection2);
   4631     }
   4632 
   4633     /**
   4634      * Utility for adding the contents of an iterable to a collection.
   4635      * @stable ICU 4.4
   4636      */
   4637     public static <T, U extends Collection<T>> U addAllTo(Iterable<T> source, U target) {
   4638         for (T item : source) {
   4639             target.add(item);
   4640         }
   4641         return target;
   4642     }
   4643 
   4644     /**
   4645      * Utility for adding the contents of an iterable to a collection.
   4646      * @stable ICU 4.4
   4647      */
   4648     public static <T> T[] addAllTo(Iterable<T> source, T[] target) {
   4649         int i = 0;
   4650         for (T item : source) {
   4651             target[i++] = item;
   4652         }
   4653         return target;
   4654     }
   4655 
   4656     /**
   4657      * For iterating through the strings in the set. Example:
   4658      * <pre>
   4659      * for (String key : myUnicodeSet.strings()) {
   4660      *   doSomethingWith(key);
   4661      * }
   4662      * </pre>
   4663      * @stable ICU 4.4
   4664      */
   4665     public Collection<String> strings() {
   4666         return Collections.unmodifiableSortedSet(strings);
   4667     }
   4668 
   4669     /**
   4670      * Return the value of the first code point, if the string is exactly one code point. Otherwise return Integer.MAX_VALUE.
   4671      * @internal
   4672      * @deprecated This API is ICU internal only.
   4673      */
   4674     @Deprecated
   4675     public static int getSingleCodePoint(CharSequence s) {
   4676         return CharSequences.getSingleCodePoint(s);
   4677     }
   4678 
   4679     /**
   4680      * Simplify the ranges in a Unicode set by merging any ranges that are only separated by characters in the dontCare set.
   4681      * For example, the ranges: \\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3000-\\u303E change to \\u2E80-\\u303E
   4682      * if the dontCare set includes unassigned characters (for a particular version of Unicode).
   4683      * @param dontCare Set with the don't-care characters for spanning
   4684      * @return the input set, modified
   4685      * @internal
   4686      * @deprecated This API is ICU internal only.
   4687      */
   4688     @Deprecated
   4689     public UnicodeSet addBridges(UnicodeSet dontCare) {
   4690         UnicodeSet notInInput = new UnicodeSet(this).complement();
   4691         for (UnicodeSetIterator it = new UnicodeSetIterator(notInInput); it.nextRange();) {
   4692             if (it.codepoint != 0 && it.codepoint != UnicodeSetIterator.IS_STRING && it.codepointEnd != 0x10FFFF && dontCare.contains(it.codepoint,it.codepointEnd)) {
   4693                 add(it.codepoint,it.codepointEnd);
   4694             }
   4695         }
   4696         return this;
   4697     }
   4698 
   4699     /**
   4700      * Find the first index at or after fromIndex where the UnicodeSet matches at that index.
   4701      * If findNot is true, then reverse the sense of the match: find the first place where the UnicodeSet doesn't match.
   4702      * If there is no match, length is returned.
   4703      * @internal
   4704      * @deprecated This API is ICU internal only. Use span instead.
   4705      */
   4706     @Deprecated
   4707     public int findIn(CharSequence value, int fromIndex, boolean findNot) {
   4708         //TODO add strings, optimize, using ICU4C algorithms
   4709         int cp;
   4710         for (; fromIndex < value.length(); fromIndex += UTF16.getCharCount(cp)) {
   4711             cp = UTF16.charAt(value, fromIndex);
   4712             if (contains(cp) != findNot) {
   4713                 break;
   4714             }
   4715         }
   4716         return fromIndex;
   4717     }
   4718 
   4719     /**
   4720      * Find the last index before fromIndex where the UnicodeSet matches at that index.
   4721      * If findNot is true, then reverse the sense of the match: find the last place where the UnicodeSet doesn't match.
   4722      * If there is no match, -1 is returned.
   4723      * BEFORE index is not in the UnicodeSet.
   4724      * @internal
   4725      * @deprecated This API is ICU internal only. Use spanBack instead.
   4726      */
   4727     @Deprecated
   4728     public int findLastIn(CharSequence value, int fromIndex, boolean findNot) {
   4729         //TODO add strings, optimize, using ICU4C algorithms
   4730         int cp;
   4731         fromIndex -= 1;
   4732         for (; fromIndex >= 0; fromIndex -= UTF16.getCharCount(cp)) {
   4733             cp = UTF16.charAt(value, fromIndex);
   4734             if (contains(cp) != findNot) {
   4735                 break;
   4736             }
   4737         }
   4738         return fromIndex < 0 ? -1 : fromIndex;
   4739     }
   4740 
   4741     /**
   4742      * Strips code points from source. If matches is true, script all that match <i>this</i>. If matches is false, then strip all that <i>don't</i> match.
   4743      * @param source The source of the CharSequence to strip from.
   4744      * @param matches A boolean to either strip all that matches or don't match with the current UnicodeSet object.
   4745      * @return The string after it has been stripped.
   4746      * @internal
   4747      * @deprecated This API is ICU internal only. Use replaceFrom.
   4748      */
   4749     @Deprecated
   4750     public String stripFrom(CharSequence source, boolean matches) {
   4751         StringBuilder result = new StringBuilder();
   4752         for (int pos = 0; pos < source.length();) {
   4753             int inside = findIn(source, pos, !matches);
   4754             result.append(source.subSequence(pos, inside));
   4755             pos = findIn(source, inside, matches); // get next start
   4756         }
   4757         return result.toString();
   4758     }
   4759 
    /**
     * Argument values for whether span() and similar functions continue while the current character is contained vs.
     * not contained in the set.
     * <p>
     * The functionality is straightforward for sets with only single code points, without strings (which is the common
     * case):
     * <ul>
     * <li>CONTAINED and SIMPLE work the same.
     * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED.
     * <li>span() and spanBack() partition any string the
     * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition).
     * <li>Using a
     * complemented (inverted) set and the opposite span conditions yields the same results.
     * </ul>
     * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
     * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
     * strings:
     * <ul>
     * <li>The complement of the set contains the opposite set of code points, but the same set of strings.
     * Therefore, complementing both the set and the span conditions may yield different results.
     * <li>When starting spans
     * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
     * because a set string may start before the later position.
     * <li>span(SIMPLE) may be shorter than
     * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which
     * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax",
     * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED).
     * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example,
     * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield
     * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.
     * </ul>
     * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
     * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
     * be used.
     * <p>
     * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
     * boundaries, never in the middle of a surrogate pair.
     *
     * @stable ICU 4.4
     */
    public enum SpanCondition {
        /**
         * Continues a span() while there is no set element at the current position.
         * Increments by one code point at a time.
         * Stops before the first set element (character or string).
         * (For code points only, this is like <code>while contains(current)==false</code>).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of
         * characters that are not in the set, and none of its strings overlap with the span.
         *
         * @stable ICU 4.4
         */
        NOT_CONTAINED,

        /**
         * Spans the longest substring that is a concatenation of set elements (characters or strings).
         * (For characters only, this is like <code>while contains(current)==true</code>).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of set
         * elements (characters or strings) that are in the set.
         * <p>
         * If a set contains strings, then the span will be the longest substring for which there
         * exists at least one non-overlapping concatenation of set elements (characters or strings).
         * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
         * (Java/ICU/Perl regex stops at the first match of an OR.)
         *
         * @stable ICU 4.4
         */
        CONTAINED,

        /**
         * Continues a span() while there is a set element at the current position.
         * Increments by the longest matching element at each position.
         * (For characters only, this is like <code>while contains(current)==true</code>).
         * <p>
         * When span() returns, the substring between where it started and the position it returned consists only of set
         * elements (characters or strings) that are in the set.
         * <p>
         * If a set only contains single characters, then this is the same as CONTAINED.
         * <p>
         * If a set contains strings, then the span will be the longest substring with a match at each position with the
         * longest single set element (character or string).
         * <p>
         * Use this span condition together with other longest-match algorithms, such as ICU converters
         * (ucnv_getUnicodeSet()).
         *
         * @stable ICU 4.4
         */
        SIMPLE,

        /**
         * One more than the last span condition.
         *
         * @stable ICU 4.4
         */
        CONDITION_COUNT
    }
   4857 
   4858     /**
   4859      * Returns the default symbol table held in {@code XSYMBOL_TABLE}. Null means ordinary processing. For internal use only.
   4860      * @return the current default symbol table; null means ordinary processing
   4861      * @internal
   4862      * @deprecated This API is ICU internal only.
   4863      */
   4864     @Deprecated
   4865     public static XSymbolTable getDefaultXSymbolTable() {
   4866         return XSYMBOL_TABLE;
   4867     }
   4868 
   4869     /**
   4870      * Sets the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing
   4871      * of UnicodeSets. Also resets {@code INCLUSIONS} to null: if the properties override inclusions, these have to be regenerated.
   4872      * <p>
   4873      * WARNING: If this function is used with a UnicodeProperty, and the
   4874      * Unassigned characters (gc=Cn) are different than in ICU, you MUST call
   4875      * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
   4876      * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
   4877      *
   4878      * @param xSymbolTable the new default symbol table; null means ordinary processing.
   4879      * @internal
   4880      * @deprecated This API is ICU internal only.
   4881      */
   4882     @Deprecated
   4883     public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
   4884         INCLUSIONS = null; // If the properties override inclusions, these have to be regenerated.
   4885         XSYMBOL_TABLE = xSymbolTable;
   4886     }
   4887 }
   4888 //eof
   4889