Home | History | Annotate | Download | only in common
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 1999-2009, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   10/20/99    alan        Creation.
      8 **********************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 #include "unicode/uniset.h"
     13 #include "unicode/parsepos.h"
     14 #include "unicode/symtable.h"
     15 #include "ruleiter.h"
     16 #include "cmemory.h"
     17 #include "cstring.h"
     18 #include "uhash.h"
     19 #include "util.h"
     20 #include "uvector.h"
     21 #include "charstr.h"
     22 #include "ustrfmt.h"
     23 #include "uassert.h"
     24 #include "hash.h"
     25 #include "bmpset.h"
     26 #include "unisetspan.h"
     27 
     28 // Define UChar constants using hex for EBCDIC compatibility
     29 // Used #define to reduce private static exports and memory access time.
     30 #define SET_OPEN        ((UChar)0x005B) /*[*/
     31 #define SET_CLOSE       ((UChar)0x005D) /*]*/
     32 #define HYPHEN          ((UChar)0x002D) /*-*/
     33 #define COMPLEMENT      ((UChar)0x005E) /*^*/
     34 #define COLON           ((UChar)0x003A) /*:*/
     35 #define BACKSLASH       ((UChar)0x005C) /*\*/
     36 #define INTERSECTION    ((UChar)0x0026) /*&*/
     37 #define UPPER_U         ((UChar)0x0055) /*U*/
     38 #define LOWER_U         ((UChar)0x0075) /*u*/
     39 #define OPEN_BRACE      ((UChar)123)    /*{*/
     40 #define CLOSE_BRACE     ((UChar)125)    /*}*/
     41 #define UPPER_P         ((UChar)0x0050) /*P*/
     42 #define LOWER_P         ((UChar)0x0070) /*p*/
     43 #define UPPER_N         ((UChar)78)     /*N*/
     44 #define EQUALS          ((UChar)0x003D) /*=*/
     45 
     46 // HIGH_VALUE > all valid values. 110000 for codepoints
     47 #define UNICODESET_HIGH 0x0110000
     48 
     49 // LOW <= all valid values. ZERO for codepoints
     50 #define UNICODESET_LOW 0x000000
     51 
     52 // initial storage. Must be >= 0
     53 #define START_EXTRA 16
     54 
     55 // extra amount for growth. Must be >= 0
     56 #define GROW_EXTRA START_EXTRA
     57 
     58 U_NAMESPACE_BEGIN
     59 
     60 SymbolTable::~SymbolTable() {}
     61 
     62 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
     63 
     64 /**
     65  * Modify the given UChar32 variable so that it is in range, by
     66  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
     67  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
     68  * It modifies its argument in-place and also returns it.
     69  */
     70 static inline UChar32 pinCodePoint(UChar32& c) {
     71     if (c < UNICODESET_LOW) {
     72         c = UNICODESET_LOW;
     73     } else if (c > (UNICODESET_HIGH-1)) {
     74         c = (UNICODESET_HIGH-1);
     75     }
     76     return c;
     77 }
     78 
     79 //----------------------------------------------------------------
     80 // Debugging
     81 //----------------------------------------------------------------
     82 
     83 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
     84 // To enable the debugging, define the symbol DEBUG_MEM in the line
     85 // below.  This will result in text being sent to stdout that looks
     86 // like this:
     87 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
     88 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
     89 // Each line lists a construction (ct) or destruction (dt) event, the
     90 // object address, the number of outstanding objects after the event,
     91 // and the pattern of the object in question.
     92 
     93 // #define DEBUG_MEM
     94 
     95 #ifdef DEBUG_MEM
     96 #include <stdio.h>
     97 static int32_t _dbgCount = 0;
     98 
     99 static inline void _dbgct(UnicodeSet* set) {
    100     UnicodeString str;
    101     set->toPattern(str, TRUE);
    102     char buf[40];
    103     str.extract(0, 39, buf, "");
    104     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
    105 }
    106 
    107 static inline void _dbgdt(UnicodeSet* set) {
    108     UnicodeString str;
    109     set->toPattern(str, TRUE);
    110     char buf[40];
    111     str.extract(0, 39, buf, "");
    112     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
    113 }
    114 
    115 #else
    116 
    117 #define _dbgct(set)
    118 #define _dbgdt(set)
    119 
    120 #endif
    121 
    122 //----------------------------------------------------------------
    123 // UnicodeString in UVector support
    124 //----------------------------------------------------------------
    125 
    126 static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) {
    127     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
    128 }
    129 
    130 static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {
    131     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
    132     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
    133     return a.compare(b);
    134 }
    135 
    136 //----------------------------------------------------------------
    137 // Constructors &c
    138 //----------------------------------------------------------------
    139 
    140 /**
    141  * Constructs an empty set.
    142  */
    143 UnicodeSet::UnicodeSet() :
    144     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
    145     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
    146     fFlags(0)
    147 {
    148     UErrorCode status = U_ZERO_ERROR;
    149     allocateStrings(status);
    150     if (U_FAILURE(status)) {
    151         return;
    152     }
    153     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
    154     if(list!=NULL){
    155         list[0] = UNICODESET_HIGH;
    156     } else { // If memory allocation failed, set to bogus state.
    157         setToBogus();
    158         return;
    159     }
    160     _dbgct(this);
    161 }
    162 
    163 /**
    164  * Constructs a set containing the given range. If <code>end >
    165  * start</code> then an empty set is created.
    166  *
    167  * @param start first character, inclusive, of range
    168  * @param end last character, inclusive, of range
    169  */
    170 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
    171     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
    172     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
    173     fFlags(0)
    174 {
    175     UErrorCode status = U_ZERO_ERROR;
    176     allocateStrings(status);
    177     if (U_FAILURE(status)) {
    178         return;
    179     }
    180     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
    181     if(list!=NULL){
    182         list[0] = UNICODESET_HIGH;
    183         complement(start, end);
    184     } else { // If memory allocation failed, set to bogus state.
    185         setToBogus();
    186         return;
    187     }
    188     _dbgct(this);
    189 }
    190 
    191 /**
    192  * Constructs a set that is identical to the given UnicodeSet.
    193  */
    194 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
    195     UnicodeFilter(o),
    196     len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
    197     bmpSet(0),
    198     buffer(0), bufferCapacity(0),
    199     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
    200     fFlags(0)
    201 {
    202     UErrorCode status = U_ZERO_ERROR;
    203     allocateStrings(status);
    204     if (U_FAILURE(status)) {
    205         return;
    206     }
    207     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
    208     if(list!=NULL){
    209         *this = o;
    210     } else { // If memory allocation failed, set to bogus state.
    211         setToBogus();
    212         return;
    213     }
    214     _dbgct(this);
    215 }
    216 
    217 // Copy-construct as thawed.
    218 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
    219     UnicodeFilter(o),
    220     len(0), capacity(o.len + GROW_EXTRA), list(0),
    221     bmpSet(0),
    222     buffer(0), bufferCapacity(0),
    223     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
    224     fFlags(0)
    225 {
    226     UErrorCode status = U_ZERO_ERROR;
    227     allocateStrings(status);
    228     if (U_FAILURE(status)) {
    229         return;
    230     }
    231     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
    232     if(list!=NULL){
    233         // *this = o except for bmpSet and stringSpan
    234         len = o.len;
    235         uprv_memcpy(list, o.list, len*sizeof(UChar32));
    236         if (strings != NULL && o.strings != NULL) {
    237             strings->assign(*o.strings, cloneUnicodeString, status);
    238         } else { // Invalid strings.
    239             setToBogus();
    240             return;
    241         }
    242         if (o.pat) {
    243             setPattern(UnicodeString(o.pat, o.patLen));
    244         }
    245     } else { // If memory allocation failed, set to bogus state.
    246         setToBogus();
    247         return;
    248     }
    249     _dbgct(this);
    250 }
    251 
    252 /**
    253  * Destructs the set.
    254  */
    255 UnicodeSet::~UnicodeSet() {
    256     _dbgdt(this); // first!
    257     uprv_free(list);
    258     delete bmpSet;
    259     if (buffer) {
    260         uprv_free(buffer);
    261     }
    262     delete strings;
    263     delete stringSpan;
    264     releasePattern();
    265 }
    266 
    267 /**
    268  * Assigns this object to be a copy of another.
    269  */
    270 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
    271     if (this == &o) {
    272         return *this;
    273     }
    274     if (isFrozen()) {
    275         return *this;
    276     }
    277     if (o.isBogus()) {
    278         setToBogus();
    279         return *this;
    280     }
    281     UErrorCode ec = U_ZERO_ERROR;
    282     ensureCapacity(o.len, ec);
    283     if (U_FAILURE(ec)) {
    284         return *this; // There is no way to report this error :-(
    285     }
    286     len = o.len;
    287     uprv_memcpy(list, o.list, len*sizeof(UChar32));
    288     if (o.bmpSet == NULL) {
    289         bmpSet = NULL;
    290     } else {
    291         bmpSet = new BMPSet(*o.bmpSet, list, len);
    292         if (bmpSet == NULL) { // Check for memory allocation error.
    293             setToBogus();
    294             return *this;
    295         }
    296     }
    297     if (strings != NULL && o.strings != NULL) {
    298         strings->assign(*o.strings, cloneUnicodeString, ec);
    299     } else { // Invalid strings.
    300         setToBogus();
    301         return *this;
    302     }
    303     if (o.stringSpan == NULL) {
    304         stringSpan = NULL;
    305     } else {
    306         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
    307         if (stringSpan == NULL) { // Check for memory allocation error.
    308             setToBogus();
    309             return *this;
    310         }
    311     }
    312     releasePattern();
    313     if (o.pat) {
    314         setPattern(UnicodeString(o.pat, o.patLen));
    315     }
    316     return *this;
    317 }
    318 
    319 /**
    320  * Returns a copy of this object.  All UnicodeMatcher objects have
    321  * to support cloning in order to allow classes using
    322  * UnicodeMatchers, such as Transliterator, to implement cloning.
    323  */
    324 UnicodeFunctor* UnicodeSet::clone() const {
    325     return new UnicodeSet(*this);
    326 }
    327 
    328 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
    329     return new UnicodeSet(*this, TRUE);
    330 }
    331 
    332 /**
    333  * Compares the specified object with this set for equality.  Returns
    334  * <tt>true</tt> if the two sets
    335  * have the same size, and every member of the specified set is
    336  * contained in this set (or equivalently, every member of this set is
    337  * contained in the specified set).
    338  *
    339  * @param o set to be compared for equality with this set.
    340  * @return <tt>true</tt> if the specified set is equal to this set.
    341  */
    342 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
    343     if (len != o.len) return FALSE;
    344     for (int32_t i = 0; i < len; ++i) {
    345         if (list[i] != o.list[i]) return FALSE;
    346     }
    347     if (*strings != *o.strings) return FALSE;
    348     return TRUE;
    349 }
    350 
    351 /**
    352  * Returns the hash code value for this set.
    353  *
    354  * @return the hash code value for this set.
    355  * @see Object#hashCode()
    356  */
    357 int32_t UnicodeSet::hashCode(void) const {
    358     int32_t result = len;
    359     for (int32_t i = 0; i < len; ++i) {
    360         result *= 1000003;
    361         result += list[i];
    362     }
    363     return result;
    364 }
    365 
    366 //----------------------------------------------------------------
    367 // Public API
    368 //----------------------------------------------------------------
    369 
    370 /**
    371  * Returns the number of elements in this set (its cardinality),
    372  * Note than the elements of a set may include both individual
    373  * codepoints and strings.
    374  *
    375  * @return the number of elements in this set (its cardinality).
    376  */
    377 int32_t UnicodeSet::size(void) const {
    378     int32_t n = 0;
    379     int32_t count = getRangeCount();
    380     for (int32_t i = 0; i < count; ++i) {
    381         n += getRangeEnd(i) - getRangeStart(i) + 1;
    382     }
    383     return n + strings->size();
    384 }
    385 
    386 /**
    387  * Returns <tt>true</tt> if this set contains no elements.
    388  *
    389  * @return <tt>true</tt> if this set contains no elements.
    390  */
    391 UBool UnicodeSet::isEmpty(void) const {
    392     return len == 1 && strings->size() == 0;
    393 }
    394 
    395 /**
    396  * Returns true if this set contains the given character.
    397  * @param c character to be checked for containment
    398  * @return true if the test condition is met
    399  */
    400 UBool UnicodeSet::contains(UChar32 c) const {
    401     // Set i to the index of the start item greater than ch
    402     // We know we will terminate without length test!
    403     // LATER: for large sets, add binary search
    404     //int32_t i = -1;
    405     //for (;;) {
    406     //    if (c < list[++i]) break;
    407     //}
    408     if (bmpSet != NULL) {
    409         return bmpSet->contains(c);
    410     }
    411     if (stringSpan != NULL) {
    412         return stringSpan->contains(c);
    413     }
    414     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
    415         return FALSE;
    416     }
    417     int32_t i = findCodePoint(c);
    418     return (UBool)(i & 1); // return true if odd
    419 }
    420 
    421 /**
    422  * Returns the smallest value i such that c < list[i].  Caller
    423  * must ensure that c is a legal value or this method will enter
    424  * an infinite loop.  This method performs a binary search.
    425  * @param c a character in the range MIN_VALUE..MAX_VALUE
    426  * inclusive
    427  * @return the smallest integer i in the range 0..len-1,
    428  * inclusive, such that c < list[i]
    429  */
    430 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
    431     /* Examples:
    432                                        findCodePoint(c)
    433        set              list[]         c=0 1 3 4 7 8
    434        ===              ==============   ===========
    435        []               [110000]         0 0 0 0 0 0
    436        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
    437        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
    438        [:Any:]          [0, 110000]      1 1 1 1 1 1
    439      */
    440 
    441     // Return the smallest i such that c < list[i].  Assume
    442     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
    443     if (c < list[0])
    444         return 0;
    445     // High runner test.  c is often after the last range, so an
    446     // initial check for this condition pays off.
    447     int32_t lo = 0;
    448     int32_t hi = len - 1;
    449     if (lo >= hi || c >= list[hi-1])
    450         return hi;
    451     // invariant: c >= list[lo]
    452     // invariant: c < list[hi]
    453     for (;;) {
    454         int32_t i = (lo + hi) >> 1;
    455         if (i == lo) {
    456             break; // Found!
    457         } else if (c < list[i]) {
    458             hi = i;
    459         } else {
    460             lo = i;
    461         }
    462     }
    463     return hi;
    464 }
    465 
    466 /**
    467  * Returns true if this set contains every character
    468  * of the given range.
    469  * @param start first character, inclusive, of the range
    470  * @param end last character, inclusive, of the range
    471  * @return true if the test condition is met
    472  */
    473 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
    474     //int32_t i = -1;
    475     //for (;;) {
    476     //    if (start < list[++i]) break;
    477     //}
    478     int32_t i = findCodePoint(start);
    479     return ((i & 1) != 0 && end < list[i]);
    480 }
    481 
    482 /**
    483  * Returns <tt>true</tt> if this set contains the given
    484  * multicharacter string.
    485  * @param s string to be checked for containment
    486  * @return <tt>true</tt> if this set contains the specified string
    487  */
    488 UBool UnicodeSet::contains(const UnicodeString& s) const {
    489     if (s.length() == 0) return FALSE;
    490     int32_t cp = getSingleCP(s);
    491     if (cp < 0) {
    492         return strings->contains((void*) &s);
    493     } else {
    494         return contains((UChar32) cp);
    495     }
    496 }
    497 
    498 /**
    499  * Returns true if this set contains all the characters and strings
    500  * of the given set.
    501  * @param c set to be checked for containment
    502  * @return true if the test condition is met
    503  */
    504 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
    505     // The specified set is a subset if all of its pairs are contained in
    506     // this set.  It's possible to code this more efficiently in terms of
    507     // direct manipulation of the inversion lists if the need arises.
    508     int32_t n = c.getRangeCount();
    509     for (int i=0; i<n; ++i) {
    510         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
    511             return FALSE;
    512         }
    513     }
    514     if (!strings->containsAll(*c.strings)) return FALSE;
    515     return TRUE;
    516 }
    517 
    518 /**
    519  * Returns true if this set contains all the characters
    520  * of the given string.
    521  * @param s string containing characters to be checked for containment
    522  * @return true if the test condition is met
    523  */
    524 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
    525     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
    526                    s.length());
    527 }
    528 
    529 /**
    530  * Returns true if this set contains none of the characters
    531  * of the given range.
    532  * @param start first character, inclusive, of the range
    533  * @param end last character, inclusive, of the range
    534  * @return true if the test condition is met
    535  */
    536 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
    537     //int32_t i = -1;
    538     //for (;;) {
    539     //    if (start < list[++i]) break;
    540     //}
    541     int32_t i = findCodePoint(start);
    542     return ((i & 1) == 0 && end < list[i]);
    543 }
    544 
    545 /**
    546  * Returns true if this set contains none of the characters and strings
    547  * of the given set.
    548  * @param c set to be checked for containment
    549  * @return true if the test condition is met
    550  */
    551 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
    552     // The specified set is a subset if all of its pairs are contained in
    553     // this set.  It's possible to code this more efficiently in terms of
    554     // direct manipulation of the inversion lists if the need arises.
    555     int32_t n = c.getRangeCount();
    556     for (int32_t i=0; i<n; ++i) {
    557         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
    558             return FALSE;
    559         }
    560     }
    561     if (!strings->containsNone(*c.strings)) return FALSE;
    562     return TRUE;
    563 }
    564 
    565 /**
    566  * Returns true if this set contains none of the characters
    567  * of the given string.
    568  * @param s string containing characters to be checked for containment
    569  * @return true if the test condition is met
    570  */
    571 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
    572     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
    573                    s.length());
    574 }
    575 
    576 /**
    577  * Returns <tt>true</tt> if this set contains any character whose low byte
    578  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
    579  * indexing.
    580  */
    581 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
    582     /* The index value v, in the range [0,255], is contained in this set if
    583      * it is contained in any pair of this set.  Pairs either have the high
    584      * bytes equal, or unequal.  If the high bytes are equal, then we have
    585      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
    586      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
    587      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
    588      * time zone month containment logic.)
    589      */
    590     int32_t i;
    591     int32_t rangeCount=getRangeCount();
    592     for (i=0; i<rangeCount; ++i) {
    593         UChar32 low = getRangeStart(i);
    594         UChar32 high = getRangeEnd(i);
    595         if ((low & ~0xFF) == (high & ~0xFF)) {
    596             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
    597                 return TRUE;
    598             }
    599         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
    600             return TRUE;
    601         }
    602     }
    603     if (strings->size() != 0) {
    604         for (i=0; i<strings->size(); ++i) {
    605             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
    606             //if (s.length() == 0) {
    607             //    // Empty strings match everything
    608             //    return TRUE;
    609             //}
    610             // assert(s.length() != 0); // We enforce this elsewhere
    611             UChar32 c = s.char32At(0);
    612             if ((c & 0xFF) == v) {
    613                 return TRUE;
    614             }
    615         }
    616     }
    617     return FALSE;
    618 }
    619 
    620 /**
    621  * Implementation of UnicodeMatcher::matches().  Always matches the
    622  * longest possible multichar string.
    623  */
    624 UMatchDegree UnicodeSet::matches(const Replaceable& text,
    625                                  int32_t& offset,
    626                                  int32_t limit,
    627                                  UBool incremental) {
    628     if (offset == limit) {
    629         // Strings, if any, have length != 0, so we don't worry
    630         // about them here.  If we ever allow zero-length strings
    631         // we much check for them here.
    632         if (contains(U_ETHER)) {
    633             return incremental ? U_PARTIAL_MATCH : U_MATCH;
    634         } else {
    635             return U_MISMATCH;
    636         }
    637     } else {
    638         if (strings->size() != 0) { // try strings first
    639 
    640             // might separate forward and backward loops later
    641             // for now they are combined
    642 
    643             // TODO Improve efficiency of this, at least in the forward
    644             // direction, if not in both.  In the forward direction we
    645             // can assume the strings are sorted.
    646 
    647             int32_t i;
    648             UBool forward = offset < limit;
    649 
    650             // firstChar is the leftmost char to match in the
    651             // forward direction or the rightmost char to match in
    652             // the reverse direction.
    653             UChar firstChar = text.charAt(offset);
    654 
    655             // If there are multiple strings that can match we
    656             // return the longest match.
    657             int32_t highWaterLength = 0;
    658 
    659             for (i=0; i<strings->size(); ++i) {
    660                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
    661 
    662                 //if (trial.length() == 0) {
    663                 //    return U_MATCH; // null-string always matches
    664                 //}
    665                 // assert(trial.length() != 0); // We ensure this elsewhere
    666 
    667                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
    668 
    669                 // Strings are sorted, so we can optimize in the
    670                 // forward direction.
    671                 if (forward && c > firstChar) break;
    672                 if (c != firstChar) continue;
    673 
    674                 int32_t matchLen = matchRest(text, offset, limit, trial);
    675 
    676                 if (incremental) {
    677                     int32_t maxLen = forward ? limit-offset : offset-limit;
    678                     if (matchLen == maxLen) {
    679                         // We have successfully matched but only up to limit.
    680                         return U_PARTIAL_MATCH;
    681                     }
    682                 }
    683 
    684                 if (matchLen == trial.length()) {
    685                     // We have successfully matched the whole string.
    686                     if (matchLen > highWaterLength) {
    687                         highWaterLength = matchLen;
    688                     }
    689                     // In the forward direction we know strings
    690                     // are sorted so we can bail early.
    691                     if (forward && matchLen < highWaterLength) {
    692                         break;
    693                     }
    694                     continue;
    695                 }
    696             }
    697 
    698             // We've checked all strings without a partial match.
    699             // If we have full matches, return the longest one.
    700             if (highWaterLength != 0) {
    701                 offset += forward ? highWaterLength : -highWaterLength;
    702                 return U_MATCH;
    703             }
    704         }
    705         return UnicodeFilter::matches(text, offset, limit, incremental);
    706     }
    707 }
    708 
    709 /**
    710  * Returns the longest match for s in text at the given position.
    711  * If limit > start then match forward from start+1 to limit
    712  * matching all characters except s.charAt(0).  If limit < start,
    713  * go backward starting from start-1 matching all characters
    714  * except s.charAt(s.length()-1).  This method assumes that the
    715  * first character, text.charAt(start), matches s, so it does not
    716  * check it.
    717  * @param text the text to match
    718  * @param start the first character to match.  In the forward
    719  * direction, text.charAt(start) is matched against s.charAt(0).
    720  * In the reverse direction, it is matched against
    721  * s.charAt(s.length()-1).
    722  * @param limit the limit offset for matching, either last+1 in
    723  * the forward direction, or last-1 in the reverse direction,
    724  * where last is the index of the last character to match.
    725  * @return If part of s matches up to the limit, return |limit -
    726  * start|.  If all of s matches before reaching the limit, return
    727  * s.length().  If there is a mismatch between s and text, return
    728  * 0
    729  */
    730 int32_t UnicodeSet::matchRest(const Replaceable& text,
    731                               int32_t start, int32_t limit,
    732                               const UnicodeString& s) {
    733     int32_t i;
    734     int32_t maxLen;
    735     int32_t slen = s.length();
    736     if (start < limit) {
    737         maxLen = limit - start;
    738         if (maxLen > slen) maxLen = slen;
    739         for (i = 1; i < maxLen; ++i) {
    740             if (text.charAt(start + i) != s.charAt(i)) return 0;
    741         }
    742     } else {
    743         maxLen = start - limit;
    744         if (maxLen > slen) maxLen = slen;
    745         --slen; // <=> slen = s.length() - 1;
    746         for (i = 1; i < maxLen; ++i) {
    747             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
    748         }
    749     }
    750     return maxLen;
    751 }
    752 
    753 /**
    754  * Implement of UnicodeMatcher
    755  */
    756 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
    757     toUnionTo.addAll(*this);
    758 }
    759 
    760 /**
    761  * Returns the index of the given character within this set, where
    762  * the set is ordered by ascending code point.  If the character
    763  * is not in this set, return -1.  The inverse of this method is
    764  * <code>charAt()</code>.
    765  * @return an index from 0..size()-1, or -1
    766  */
    767 int32_t UnicodeSet::indexOf(UChar32 c) const {
    768     if (c < MIN_VALUE || c > MAX_VALUE) {
    769         return -1;
    770     }
    771     int32_t i = 0;
    772     int32_t n = 0;
    773     for (;;) {
    774         UChar32 start = list[i++];
    775         if (c < start) {
    776             return -1;
    777         }
    778         UChar32 limit = list[i++];
    779         if (c < limit) {
    780             return n + c - start;
    781         }
    782         n += limit - start;
    783     }
    784 }
    785 
    786 /**
    787  * Returns the character at the given index within this set, where
    788  * the set is ordered by ascending code point.  If the index is
    789  * out of range, return (UChar32)-1.  The inverse of this method is
    790  * <code>indexOf()</code>.
    791  * @param index an index from 0..size()-1
    792  * @return the character at the given index, or (UChar32)-1.
    793  */
    794 UChar32 UnicodeSet::charAt(int32_t index) const {
    795     if (index >= 0) {
    796         // len2 is the largest even integer <= len, that is, it is len
    797         // for even values and len-1 for odd values.  With odd values
    798         // the last entry is UNICODESET_HIGH.
    799         int32_t len2 = len & ~1;
    800         for (int32_t i=0; i < len2;) {
    801             UChar32 start = list[i++];
    802             int32_t count = list[i++] - start;
    803             if (index < count) {
    804                 return (UChar32)(start + index);
    805             }
    806             index -= count;
    807         }
    808     }
    809     return (UChar32)-1;
    810 }
    811 
    812 /**
    813  * Make this object represent the range <code>start - end</code>.
    814  * If <code>end > start</code> then this object is set to an
    815  * an empty range.
    816  *
    817  * @param start first character in the set, inclusive
    818  * @rparam end last character in the set, inclusive
    819  */
    820 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
    821     clear();
    822     complement(start, end);
    823     return *this;
    824 }
    825 
    826 /**
    827  * Adds the specified range to this set if it is not already
    828  * present.  If this set already contains the specified range,
    829  * the call leaves this set unchanged.  If <code>end > start</code>
    830  * then an empty range is added, leaving the set unchanged.
    831  *
    832  * @param start first character, inclusive, of range to be added
    833  * to this set.
    834  * @param end last character, inclusive, of range to be added
    835  * to this set.
    836  */
    837 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
    838     if (pinCodePoint(start) < pinCodePoint(end)) {
    839         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
    840         add(range, 2, 0);
    841     } else if (start == end) {
    842         add(start);
    843     }
    844     return *this;
    845 }
    846 
    847 // #define DEBUG_US_ADD
    848 
    849 #ifdef DEBUG_US_ADD
    850 #include <stdio.h>
    851 void dump(UChar32 c) {
    852     if (c <= 0xFF) {
    853         printf("%c", (char)c);
    854     } else {
    855         printf("U+%04X", c);
    856     }
    857 }
    858 void dump(const UChar32* list, int32_t len) {
    859     printf("[");
    860     for (int32_t i=0; i<len; ++i) {
    861         if (i != 0) printf(", ");
    862         dump(list[i]);
    863     }
    864     printf("]");
    865 }
    866 #endif
    867 
    868 /**
    869  * Adds the specified character to this set if it is not already
    870  * present.  If this set already contains the specified character,
    871  * the call leaves this set unchanged.
    872  */
    873 UnicodeSet& UnicodeSet::add(UChar32 c) {
    874     // find smallest i such that c < list[i]
    875     // if odd, then it is IN the set
    876     // if even, then it is OUT of the set
    877     int32_t i = findCodePoint(pinCodePoint(c));
    878 
    879     // already in set?
    880     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
    881 
    882     // HIGH is 0x110000
    883     // assert(list[len-1] == HIGH);
    884 
    885     // empty = [HIGH]
    886     // [start_0, limit_0, start_1, limit_1, HIGH]
    887 
    888     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
    889     //                             ^
    890     //                             list[i]
    891 
    892     // i == 0 means c is before the first range
    893 
    894 #ifdef DEBUG_US_ADD
    895     printf("Add of ");
    896     dump(c);
    897     printf(" found at %d", i);
    898     printf(": ");
    899     dump(list, len);
    900     printf(" => ");
    901 #endif
    902 
    903     if (c == list[i]-1) {
    904         // c is before start of next range
    905         list[i] = c;
    906         // if we touched the HIGH mark, then add a new one
    907         if (c == (UNICODESET_HIGH - 1)) {
    908             UErrorCode status = U_ZERO_ERROR;
    909             ensureCapacity(len+1, status);
    910             if (U_FAILURE(status)) {
    911                 return *this; // There is no way to report this error :-(
    912             }
    913             list[len++] = UNICODESET_HIGH;
    914         }
    915         if (i > 0 && c == list[i-1]) {
    916             // collapse adjacent ranges
    917 
    918             // [..., start_k-1, c, c, limit_k, ..., HIGH]
    919             //                     ^
    920             //                     list[i]
    921 
    922             //for (int32_t k=i-1; k<len-2; ++k) {
    923             //    list[k] = list[k+2];
    924             //}
    925             UChar32* dst = list + i - 1;
    926             UChar32* src = dst + 2;
    927             UChar32* srclimit = list + len;
    928             while (src < srclimit) *(dst++) = *(src++);
    929 
    930             len -= 2;
    931         }
    932     }
    933 
    934     else if (i > 0 && c == list[i-1]) {
    935         // c is after end of prior range
    936         list[i-1]++;
    937         // no need to check for collapse here
    938     }
    939 
    940     else {
    941         // At this point we know the new char is not adjacent to
    942         // any existing ranges, and it is not 10FFFF.
    943 
    944 
    945         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
    946         //                             ^
    947         //                             list[i]
    948 
    949         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
    950         //                             ^
    951         //                             list[i]
    952 
    953         UErrorCode status = U_ZERO_ERROR;
    954         ensureCapacity(len+2, status);
    955         if (U_FAILURE(status)) {
    956             return *this; // There is no way to report this error :-(
    957         }
    958 
    959         //for (int32_t k=len-1; k>=i; --k) {
    960         //    list[k+2] = list[k];
    961         //}
    962         UChar32* src = list + len;
    963         UChar32* dst = src + 2;
    964         UChar32* srclimit = list + i;
    965         while (src > srclimit) *(--dst) = *(--src);
    966 
    967         list[i] = c;
    968         list[i+1] = c+1;
    969         len += 2;
    970     }
    971 
    972 #ifdef DEBUG_US_ADD
    973     dump(list, len);
    974     printf("\n");
    975 
    976     for (i=1; i<len; ++i) {
    977         if (list[i] <= list[i-1]) {
    978             // Corrupt array!
    979             printf("ERROR: list has been corrupted\n");
    980             exit(1);
    981         }
    982     }
    983 #endif
    984 
    985     releasePattern();
    986     return *this;
    987 }
    988 
    989 /**
    990  * Adds the specified multicharacter to this set if it is not already
    991  * present.  If this set already contains the multicharacter,
    992  * the call leaves this set unchanged.
    993  * Thus "ch" => {"ch"}
    994  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
    995  * @param s the source string
    996  * @return the modified set, for chaining
    997  */
    998 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
    999     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
   1000     int32_t cp = getSingleCP(s);
   1001     if (cp < 0) {
   1002         if (!strings->contains((void*) &s)) {
   1003             _add(s);
   1004             releasePattern();
   1005         }
   1006     } else {
   1007         add((UChar32)cp);
   1008     }
   1009     return *this;
   1010 }
   1011 
   1012 /**
   1013  * Adds the given string, in order, to 'strings'.  The given string
   1014  * must have been checked by the caller to not be empty and to not
   1015  * already be in 'strings'.
   1016  */
   1017 void UnicodeSet::_add(const UnicodeString& s) {
   1018     if (isFrozen() || isBogus()) {
   1019         return;
   1020     }
   1021     UnicodeString* t = new UnicodeString(s);
   1022     if (t == NULL) { // Check for memory allocation error.
   1023         setToBogus();
   1024         return;
   1025     }
   1026     UErrorCode ec = U_ZERO_ERROR;
   1027     strings->sortedInsert(t, compareUnicodeString, ec);
   1028     if (U_FAILURE(ec)) {
   1029         setToBogus();
   1030         delete t;
   1031     }
   1032 }
   1033 
   1034 /**
   1035  * @return a code point IF the string consists of a single one.
   1036  * otherwise returns -1.
   1037  * @param string to test
   1038  */
   1039 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
   1040     //if (s.length() < 1) {
   1041     //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
   1042     //}
   1043     if (s.length() > 2) return -1;
   1044     if (s.length() == 1) return s.charAt(0);
   1045 
   1046     // at this point, len = 2
   1047     UChar32 cp = s.char32At(0);
   1048     if (cp > 0xFFFF) { // is surrogate pair
   1049         return cp;
   1050     }
   1051     return -1;
   1052 }
   1053 
   1054 /**
   1055  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
   1056  * If this set already any particular character, it has no effect on that character.
   1057  * @param the source string
   1058  * @return the modified set, for chaining
   1059  */
   1060 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
   1061     UChar32 cp;
   1062     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
   1063         cp = s.char32At(i);
   1064         add(cp);
   1065     }
   1066     return *this;
   1067 }
   1068 
   1069 /**
   1070  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1071  * If this set already any particular character, it has no effect on that character.
   1072  * @param the source string
   1073  * @return the modified set, for chaining
   1074  */
   1075 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
   1076     UnicodeSet set;
   1077     set.addAll(s);
   1078     retainAll(set);
   1079     return *this;
   1080 }
   1081 
   1082 /**
   1083  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1084  * If this set already any particular character, it has no effect on that character.
   1085  * @param the source string
   1086  * @return the modified set, for chaining
   1087  */
   1088 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
   1089     UnicodeSet set;
   1090     set.addAll(s);
   1091     complementAll(set);
   1092     return *this;
   1093 }
   1094 
   1095 /**
   1096  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
   1097  * If this set already any particular character, it has no effect on that character.
   1098  * @param the source string
   1099  * @return the modified set, for chaining
   1100  */
   1101 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
   1102     UnicodeSet set;
   1103     set.addAll(s);
   1104     removeAll(set);
   1105     return *this;
   1106 }
   1107 
   1108 UnicodeSet& UnicodeSet::removeAllStrings() {
   1109     strings->removeAllElements();
   1110     return *this;
   1111 }
   1112 
   1113 
   1114 /**
   1115  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
   1116  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   1117  * @param the source string
   1118  * @return a newly created set containing the given string
   1119  */
   1120 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
   1121     UnicodeSet *set = new UnicodeSet();
   1122     if (set != NULL) { // Check for memory allocation error.
   1123         set->add(s);
   1124     }
   1125     return set;
   1126 }
   1127 
   1128 
   1129 /**
   1130  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
   1131  * @param the source string
   1132  * @return a newly created set containing the given characters
   1133  */
   1134 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
   1135     UnicodeSet *set = new UnicodeSet();
   1136     if (set != NULL) { // Check for memory allocation error.
   1137         set->addAll(s);
   1138     }
   1139     return set;
   1140 }
   1141 
   1142 /**
   1143  * Retain only the elements in this set that are contained in the
   1144  * specified range.  If <code>end > start</code> then an empty range is
   1145  * retained, leaving the set empty.
   1146  *
   1147  * @param start first character, inclusive, of range to be retained
   1148  * to this set.
   1149  * @param end last character, inclusive, of range to be retained
   1150  * to this set.
   1151  */
   1152 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
   1153     if (pinCodePoint(start) <= pinCodePoint(end)) {
   1154         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
   1155         retain(range, 2, 0);
   1156     } else {
   1157         clear();
   1158     }
   1159     return *this;
   1160 }
   1161 
   1162 UnicodeSet& UnicodeSet::retain(UChar32 c) {
   1163     return retain(c, c);
   1164 }
   1165 
   1166 /**
   1167  * Removes the specified range from this set if it is present.
   1168  * The set will not contain the specified range once the call
   1169  * returns.  If <code>end > start</code> then an empty range is
   1170  * removed, leaving the set unchanged.
   1171  *
   1172  * @param start first character, inclusive, of range to be removed
   1173  * from this set.
   1174  * @param end last character, inclusive, of range to be removed
   1175  * from this set.
   1176  */
   1177 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
   1178     if (pinCodePoint(start) <= pinCodePoint(end)) {
   1179         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
   1180         retain(range, 2, 2);
   1181     }
   1182     return *this;
   1183 }
   1184 
   1185 /**
   1186  * Removes the specified character from this set if it is present.
   1187  * The set will not contain the specified range once the call
   1188  * returns.
   1189  */
   1190 UnicodeSet& UnicodeSet::remove(UChar32 c) {
   1191     return remove(c, c);
   1192 }
   1193 
   1194 /**
   1195  * Removes the specified string from this set if it is present.
   1196  * The set will not contain the specified character once the call
   1197  * returns.
   1198  * @param the source string
   1199  * @return the modified set, for chaining
   1200  */
   1201 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
   1202     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
   1203     int32_t cp = getSingleCP(s);
   1204     if (cp < 0) {
   1205         strings->removeElement((void*) &s);
   1206         releasePattern();
   1207     } else {
   1208         remove((UChar32)cp, (UChar32)cp);
   1209     }
   1210     return *this;
   1211 }
   1212 
   1213 /**
   1214  * Complements the specified range in this set.  Any character in
   1215  * the range will be removed if it is in this set, or will be
   1216  * added if it is not in this set.  If <code>end > start</code>
   1217  * then an empty range is xor'ed, leaving the set unchanged.
   1218  *
   1219  * @param start first character, inclusive, of range to be removed
   1220  * from this set.
   1221  * @param end last character, inclusive, of range to be removed
   1222  * from this set.
   1223  */
   1224 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
   1225     if (isFrozen() || isBogus()) {
   1226         return *this;
   1227     }
   1228     if (pinCodePoint(start) <= pinCodePoint(end)) {
   1229         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
   1230         exclusiveOr(range, 2, 0);
   1231     }
   1232     releasePattern();
   1233     return *this;
   1234 }
   1235 
   1236 UnicodeSet& UnicodeSet::complement(UChar32 c) {
   1237     return complement(c, c);
   1238 }
   1239 
   1240 /**
   1241  * This is equivalent to
   1242  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
   1243  */
   1244 UnicodeSet& UnicodeSet::complement(void) {
   1245     if (isFrozen() || isBogus()) {
   1246         return *this;
   1247     }
   1248     UErrorCode status = U_ZERO_ERROR;
   1249     if (list[0] == UNICODESET_LOW) {
   1250         ensureBufferCapacity(len-1, status);
   1251         if (U_FAILURE(status)) {
   1252             return *this;
   1253         }
   1254         uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
   1255         --len;
   1256     } else {
   1257         ensureBufferCapacity(len+1, status);
   1258         if (U_FAILURE(status)) {
   1259             return *this;
   1260         }
   1261         uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
   1262         buffer[0] = UNICODESET_LOW;
   1263         ++len;
   1264     }
   1265     swapBuffers();
   1266     releasePattern();
   1267     return *this;
   1268 }
   1269 
   1270 /**
   1271  * Complement the specified string in this set.
   1272  * The set will not contain the specified string once the call
   1273  * returns.
   1274  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
   1275  * @param s the string to complement
   1276  * @return this object, for chaining
   1277  */
   1278 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
   1279     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
   1280     int32_t cp = getSingleCP(s);
   1281     if (cp < 0) {
   1282         if (strings->contains((void*) &s)) {
   1283             strings->removeElement((void*) &s);
   1284         } else {
   1285             _add(s);
   1286         }
   1287         releasePattern();
   1288     } else {
   1289         complement((UChar32)cp, (UChar32)cp);
   1290     }
   1291     return *this;
   1292 }
   1293 
   1294 /**
   1295  * Adds all of the elements in the specified set to this set if
   1296  * they're not already present.  This operation effectively
   1297  * modifies this set so that its value is the <i>union</i> of the two
   1298  * sets.  The behavior of this operation is unspecified if the specified
   1299  * collection is modified while the operation is in progress.
   1300  *
   1301  * @param c set whose elements are to be added to this set.
   1302  * @see #add(char, char)
   1303  */
   1304 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
   1305     if ( c.len>0 && c.list!=NULL ) {
   1306         add(c.list, c.len, 0);
   1307     }
   1308 
   1309     // Add strings in order
   1310     if ( c.strings!=NULL ) {
   1311         for (int32_t i=0; i<c.strings->size(); ++i) {
   1312             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
   1313             if (!strings->contains((void*) s)) {
   1314                 _add(*s);
   1315             }
   1316         }
   1317     }
   1318     return *this;
   1319 }
   1320 
   1321 /**
   1322  * Retains only the elements in this set that are contained in the
   1323  * specified set.  In other words, removes from this set all of
   1324  * its elements that are not contained in the specified set.  This
   1325  * operation effectively modifies this set so that its value is
   1326  * the <i>intersection</i> of the two sets.
   1327  *
   1328  * @param c set that defines which elements this set will retain.
   1329  */
   1330 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
   1331     if (isFrozen() || isBogus()) {
   1332         return *this;
   1333     }
   1334     retain(c.list, c.len, 0);
   1335     strings->retainAll(*c.strings);
   1336     return *this;
   1337 }
   1338 
   1339 /**
   1340  * Removes from this set all of its elements that are contained in the
   1341  * specified set.  This operation effectively modifies this
   1342  * set so that its value is the <i>asymmetric set difference</i> of
   1343  * the two sets.
   1344  *
   1345  * @param c set that defines which elements will be removed from
   1346  *          this set.
   1347  */
   1348 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
   1349     if (isFrozen() || isBogus()) {
   1350         return *this;
   1351     }
   1352     retain(c.list, c.len, 2);
   1353     strings->removeAll(*c.strings);
   1354     return *this;
   1355 }
   1356 
   1357 /**
   1358  * Complements in this set all elements contained in the specified
   1359  * set.  Any character in the other set will be removed if it is
   1360  * in this set, or will be added if it is not in this set.
   1361  *
   1362  * @param c set that defines which elements will be xor'ed from
   1363  *          this set.
   1364  */
   1365 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
   1366     if (isFrozen() || isBogus()) {
   1367         return *this;
   1368     }
   1369     exclusiveOr(c.list, c.len, 0);
   1370 
   1371     for (int32_t i=0; i<c.strings->size(); ++i) {
   1372         void* e = c.strings->elementAt(i);
   1373         if (!strings->removeElement(e)) {
   1374             _add(*(const UnicodeString*)e);
   1375         }
   1376     }
   1377     return *this;
   1378 }
   1379 
   1380 /**
   1381  * Removes all of the elements from this set.  This set will be
   1382  * empty after this call returns.
   1383  */
   1384 UnicodeSet& UnicodeSet::clear(void) {
   1385     if (isFrozen()) {
   1386         return *this;
   1387     }
   1388     if (list != NULL) {
   1389         list[0] = UNICODESET_HIGH;
   1390     }
   1391     len = 1;
   1392     releasePattern();
   1393     if (strings != NULL) {
   1394         strings->removeAllElements();
   1395     }
   1396     if (list != NULL && strings != NULL) {
   1397         // Remove bogus
   1398         fFlags = 0;
   1399     }
   1400     return *this;
   1401 }
   1402 
   1403 /**
   1404  * Iteration method that returns the number of ranges contained in
   1405  * this set.
   1406  * @see #getRangeStart
   1407  * @see #getRangeEnd
   1408  */
   1409 int32_t UnicodeSet::getRangeCount() const {
   1410     return len/2;
   1411 }
   1412 
   1413 /**
   1414  * Iteration method that returns the first character in the
   1415  * specified range of this set.
   1416  * @see #getRangeCount
   1417  * @see #getRangeEnd
   1418  */
   1419 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
   1420     return list[index*2];
   1421 }
   1422 
   1423 /**
   1424  * Iteration method that returns the last character in the
   1425  * specified range of this set.
   1426  * @see #getRangeStart
   1427  * @see #getRangeEnd
   1428  */
   1429 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
   1430     return list[index*2 + 1] - 1;
   1431 }
   1432 
   1433 int32_t UnicodeSet::getStringCount() const {
   1434     return strings->size();
   1435 }
   1436 
   1437 const UnicodeString* UnicodeSet::getString(int32_t index) const {
   1438     return (const UnicodeString*) strings->elementAt(index);
   1439 }
   1440 
   1441 /**
   1442  * Reallocate this objects internal structures to take up the least
   1443  * possible space, without changing this object's value.
   1444  */
   1445 UnicodeSet& UnicodeSet::compact() {
   1446     if (isFrozen() || isBogus()) {
   1447         return *this;
   1448     }
   1449     // Delete buffer first to defragment memory less.
   1450     if (buffer != NULL) {
   1451         uprv_free(buffer);
   1452         buffer = NULL;
   1453     }
   1454     if (len < capacity) {
   1455         // Make the capacity equal to len or 1.
   1456         // We don't want to realloc of 0 size.
   1457         int32_t newCapacity = len + (len == 0);
   1458         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
   1459         if (temp) {
   1460             list = temp;
   1461             capacity = newCapacity;
   1462         }
   1463         // else what the heck happened?! We allocated less memory!
   1464         // Oh well. We'll keep our original array.
   1465     }
   1466     return *this;
   1467 }
   1468 
   1469 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
   1470     int32_t bmpLength, length, destLength;
   1471 
   1472     if (U_FAILURE(ec)) {
   1473         return 0;
   1474     }
   1475 
   1476     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
   1477         ec=U_ILLEGAL_ARGUMENT_ERROR;
   1478         return 0;
   1479     }
   1480 
   1481     /* count necessary 16-bit units */
   1482     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
   1483     // assert(length>=0);
   1484     if (length==0) {
   1485         /* empty set */
   1486         if (destCapacity>0) {
   1487             *dest=0;
   1488         } else {
   1489             ec=U_BUFFER_OVERFLOW_ERROR;
   1490         }
   1491         return 1;
   1492     }
   1493     /* now length>0 */
   1494 
   1495     if (this->list[length-1]<=0xffff) {
   1496         /* all BMP */
   1497         bmpLength=length;
   1498     } else if (this->list[0]>=0x10000) {
   1499         /* all supplementary */
   1500         bmpLength=0;
   1501         length*=2;
   1502     } else {
   1503         /* some BMP, some supplementary */
   1504         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
   1505         length=bmpLength+2*(length-bmpLength);
   1506     }
   1507 
   1508     /* length: number of 16-bit array units */
   1509     if (length>0x7fff) {
   1510         /* there are only 15 bits for the length in the first serialized word */
   1511         ec=U_INDEX_OUTOFBOUNDS_ERROR;
   1512         return 0;
   1513     }
   1514 
   1515     /*
   1516      * total serialized length:
   1517      * number of 16-bit array units (length) +
   1518      * 1 length unit (always) +
   1519      * 1 bmpLength unit (if there are supplementary values)
   1520      */
   1521     destLength=length+((length>bmpLength)?2:1);
   1522     if (destLength<=destCapacity) {
   1523         const UChar32 *p;
   1524         int32_t i;
   1525 
   1526         *dest=(uint16_t)length;
   1527         if (length>bmpLength) {
   1528             *dest|=0x8000;
   1529             *++dest=(uint16_t)bmpLength;
   1530         }
   1531         ++dest;
   1532 
   1533         /* write the BMP part of the array */
   1534         p=this->list;
   1535         for (i=0; i<bmpLength; ++i) {
   1536             *dest++=(uint16_t)*p++;
   1537         }
   1538 
   1539         /* write the supplementary part of the array */
   1540         for (; i<length; i+=2) {
   1541             *dest++=(uint16_t)(*p>>16);
   1542             *dest++=(uint16_t)*p++;
   1543         }
   1544     } else {
   1545         ec=U_BUFFER_OVERFLOW_ERROR;
   1546     }
   1547     return destLength;
   1548 }
   1549 
   1550 //----------------------------------------------------------------
   1551 // Implementation: Utility methods
   1552 //----------------------------------------------------------------
   1553 
   1554 /**
   1555  * Allocate our strings vector and return TRUE if successful.
   1556  */
   1557 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
   1558     if (U_FAILURE(status)) {
   1559         return FALSE;
   1560     }
   1561     strings = new UVector(uhash_deleteUnicodeString,
   1562                           uhash_compareUnicodeString, 1, status);
   1563     if (strings == NULL) { // Check for memory allocation error.
   1564         status = U_MEMORY_ALLOCATION_ERROR;
   1565         return FALSE;
   1566     }
   1567     if (U_FAILURE(status)) {
   1568         delete strings;
   1569         strings = NULL;
   1570         return FALSE;
   1571     }
   1572     return TRUE;
   1573 }
   1574 
   1575 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
   1576     if (newLen <= capacity)
   1577         return;
   1578     UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
   1579     if (temp == NULL) {
   1580         ec = U_MEMORY_ALLOCATION_ERROR;
   1581         setToBogus();
   1582         return;
   1583     }
   1584     list = temp;
   1585     capacity = newLen + GROW_EXTRA;
   1586     // else we keep the original contents on the memory failure.
   1587 }
   1588 
   1589 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
   1590     if (buffer != NULL && newLen <= bufferCapacity)
   1591         return;
   1592     UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
   1593     if (temp == NULL) {
   1594         ec = U_MEMORY_ALLOCATION_ERROR;
   1595         setToBogus();
   1596         return;
   1597     }
   1598     buffer = temp;
   1599     bufferCapacity = newLen + GROW_EXTRA;
   1600     // else we keep the original contents on the memory failure.
   1601 }
   1602 
   1603 /**
   1604  * Swap list and buffer.
   1605  */
   1606 void UnicodeSet::swapBuffers(void) {
   1607     // swap list and buffer
   1608     UChar32* temp = list;
   1609     list = buffer;
   1610     buffer = temp;
   1611 
   1612     int32_t c = capacity;
   1613     capacity = bufferCapacity;
   1614     bufferCapacity = c;
   1615 }
   1616 
   1617 void UnicodeSet::setToBogus() {
   1618     clear(); // Remove everything in the set.
   1619     fFlags = kIsBogus;
   1620 }
   1621 
   1622 //----------------------------------------------------------------
   1623 // Implementation: Fundamental operators
   1624 //----------------------------------------------------------------
   1625 
   1626 static inline UChar32 max(UChar32 a, UChar32 b) {
   1627     return (a > b) ? a : b;
   1628 }
   1629 
   1630 // polarity = 0, 3 is normal: x xor y
   1631 // polarity = 1, 2: x xor ~y == x === y
   1632 
   1633 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
   1634     if (isFrozen() || isBogus()) {
   1635         return;
   1636     }
   1637     UErrorCode status = U_ZERO_ERROR;
   1638     ensureBufferCapacity(len + otherLen, status);
   1639     if (U_FAILURE(status)) {
   1640         return;
   1641     }
   1642 
   1643     int32_t i = 0, j = 0, k = 0;
   1644     UChar32 a = list[i++];
   1645     UChar32 b;
   1646     if (polarity == 1 || polarity == 2) {
   1647         b = UNICODESET_LOW;
   1648         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
   1649             ++j;
   1650             b = other[j];
   1651         }
   1652     } else {
   1653         b = other[j++];
   1654     }
   1655     // simplest of all the routines
   1656     // sort the values, discarding identicals!
   1657     for (;;) {
   1658         if (a < b) {
   1659             buffer[k++] = a;
   1660             a = list[i++];
   1661         } else if (b < a) {
   1662             buffer[k++] = b;
   1663             b = other[j++];
   1664         } else if (a != UNICODESET_HIGH) { // at this point, a == b
   1665             // discard both values!
   1666             a = list[i++];
   1667             b = other[j++];
   1668         } else { // DONE!
   1669             buffer[k++] = UNICODESET_HIGH;
   1670             len = k;
   1671             break;
   1672         }
   1673     }
   1674     swapBuffers();
   1675     releasePattern();
   1676 }
   1677 
   1678 // polarity = 0 is normal: x union y
   1679 // polarity = 2: x union ~y
   1680 // polarity = 1: ~x union y
   1681 // polarity = 3: ~x union ~y
   1682 
   1683 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
   1684     if (isFrozen() || isBogus() || other==NULL) {
   1685         return;
   1686     }
   1687     UErrorCode status = U_ZERO_ERROR;
   1688     ensureBufferCapacity(len + otherLen, status);
   1689     if (U_FAILURE(status)) {
   1690         return;
   1691     }
   1692 
   1693     int32_t i = 0, j = 0, k = 0;
   1694     UChar32 a = list[i++];
   1695     UChar32 b = other[j++];
   1696     // change from xor is that we have to check overlapping pairs
   1697     // polarity bit 1 means a is second, bit 2 means b is.
   1698     for (;;) {
   1699         switch (polarity) {
   1700           case 0: // both first; take lower if unequal
   1701             if (a < b) { // take a
   1702                 // Back up over overlapping ranges in buffer[]
   1703                 if (k > 0 && a <= buffer[k-1]) {
   1704                     // Pick latter end value in buffer[] vs. list[]
   1705                     a = max(list[i], buffer[--k]);
   1706                 } else {
   1707                     // No overlap
   1708                     buffer[k++] = a;
   1709                     a = list[i];
   1710                 }
   1711                 i++; // Common if/else code factored out
   1712                 polarity ^= 1;
   1713             } else if (b < a) { // take b
   1714                 if (k > 0 && b <= buffer[k-1]) {
   1715                     b = max(other[j], buffer[--k]);
   1716                 } else {
   1717                     buffer[k++] = b;
   1718                     b = other[j];
   1719                 }
   1720                 j++;
   1721                 polarity ^= 2;
   1722             } else { // a == b, take a, drop b
   1723                 if (a == UNICODESET_HIGH) goto loop_end;
   1724                 // This is symmetrical; it doesn't matter if
   1725                 // we backtrack with a or b. - liu
   1726                 if (k > 0 && a <= buffer[k-1]) {
   1727                     a = max(list[i], buffer[--k]);
   1728                 } else {
   1729                     // No overlap
   1730                     buffer[k++] = a;
   1731                     a = list[i];
   1732                 }
   1733                 i++;
   1734                 polarity ^= 1;
   1735                 b = other[j++];
   1736                 polarity ^= 2;
   1737             }
   1738             break;
   1739           case 3: // both second; take higher if unequal, and drop other
   1740             if (b <= a) { // take a
   1741                 if (a == UNICODESET_HIGH) goto loop_end;
   1742                 buffer[k++] = a;
   1743             } else { // take b
   1744                 if (b == UNICODESET_HIGH) goto loop_end;
   1745                 buffer[k++] = b;
   1746             }
   1747             a = list[i++];
   1748             polarity ^= 1;   // factored common code
   1749             b = other[j++];
   1750             polarity ^= 2;
   1751             break;
   1752           case 1: // a second, b first; if b < a, overlap
   1753             if (a < b) { // no overlap, take a
   1754                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
   1755             } else if (b < a) { // OVERLAP, drop b
   1756                 b = other[j++];
   1757                 polarity ^= 2;
   1758             } else { // a == b, drop both!
   1759                 if (a == UNICODESET_HIGH) goto loop_end;
   1760                 a = list[i++];
   1761                 polarity ^= 1;
   1762                 b = other[j++];
   1763                 polarity ^= 2;
   1764             }
   1765             break;
   1766           case 2: // a first, b second; if a < b, overlap
   1767             if (b < a) { // no overlap, take b
   1768                 buffer[k++] = b;
   1769                 b = other[j++];
   1770                 polarity ^= 2;
   1771             } else  if (a < b) { // OVERLAP, drop a
   1772                 a = list[i++];
   1773                 polarity ^= 1;
   1774             } else { // a == b, drop both!
   1775                 if (a == UNICODESET_HIGH) goto loop_end;
   1776                 a = list[i++];
   1777                 polarity ^= 1;
   1778                 b = other[j++];
   1779                 polarity ^= 2;
   1780             }
   1781             break;
   1782         }
   1783     }
   1784  loop_end:
   1785     buffer[k++] = UNICODESET_HIGH;    // terminate
   1786     len = k;
   1787     swapBuffers();
   1788     releasePattern();
   1789 }
   1790 
   1791 // polarity = 0 is normal: x intersect y
   1792 // polarity = 2: x intersect ~y == set-minus
   1793 // polarity = 1: ~x intersect y
   1794 // polarity = 3: ~x intersect ~y
   1795 
   1796 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
   1797     if (isFrozen() || isBogus()) {
   1798         return;
   1799     }
   1800     UErrorCode status = U_ZERO_ERROR;
   1801     ensureBufferCapacity(len + otherLen, status);
   1802     if (U_FAILURE(status)) {
   1803         return;
   1804     }
   1805 
   1806     int32_t i = 0, j = 0, k = 0;
   1807     UChar32 a = list[i++];
   1808     UChar32 b = other[j++];
   1809     // change from xor is that we have to check overlapping pairs
   1810     // polarity bit 1 means a is second, bit 2 means b is.
   1811     for (;;) {
   1812         switch (polarity) {
   1813           case 0: // both first; drop the smaller
   1814             if (a < b) { // drop a
   1815                 a = list[i++];
   1816                 polarity ^= 1;
   1817             } else if (b < a) { // drop b
   1818                 b = other[j++];
   1819                 polarity ^= 2;
   1820             } else { // a == b, take one, drop other
   1821                 if (a == UNICODESET_HIGH) goto loop_end;
   1822                 buffer[k++] = a;
   1823                 a = list[i++];
   1824                 polarity ^= 1;
   1825                 b = other[j++];
   1826                 polarity ^= 2;
   1827             }
   1828             break;
   1829           case 3: // both second; take lower if unequal
   1830             if (a < b) { // take a
   1831                 buffer[k++] = a;
   1832                 a = list[i++];
   1833                 polarity ^= 1;
   1834             } else if (b < a) { // take b
   1835                 buffer[k++] = b;
   1836                 b = other[j++];
   1837                 polarity ^= 2;
   1838             } else { // a == b, take one, drop other
   1839                 if (a == UNICODESET_HIGH) goto loop_end;
   1840                 buffer[k++] = a;
   1841                 a = list[i++];
   1842                 polarity ^= 1;
   1843                 b = other[j++];
   1844                 polarity ^= 2;
   1845             }
   1846             break;
   1847           case 1: // a second, b first;
   1848             if (a < b) { // NO OVERLAP, drop a
   1849                 a = list[i++];
   1850                 polarity ^= 1;
   1851             } else if (b < a) { // OVERLAP, take b
   1852                 buffer[k++] = b;
   1853                 b = other[j++];
   1854                 polarity ^= 2;
   1855             } else { // a == b, drop both!
   1856                 if (a == UNICODESET_HIGH) goto loop_end;
   1857                 a = list[i++];
   1858                 polarity ^= 1;
   1859                 b = other[j++];
   1860                 polarity ^= 2;
   1861             }
   1862             break;
   1863           case 2: // a first, b second; if a < b, overlap
   1864             if (b < a) { // no overlap, drop b
   1865                 b = other[j++];
   1866                 polarity ^= 2;
   1867             } else  if (a < b) { // OVERLAP, take a
   1868                 buffer[k++] = a;
   1869                 a = list[i++];
   1870                 polarity ^= 1;
   1871             } else { // a == b, drop both!
   1872                 if (a == UNICODESET_HIGH) goto loop_end;
   1873                 a = list[i++];
   1874                 polarity ^= 1;
   1875                 b = other[j++];
   1876                 polarity ^= 2;
   1877             }
   1878             break;
   1879         }
   1880     }
   1881  loop_end:
   1882     buffer[k++] = UNICODESET_HIGH;    // terminate
   1883     len = k;
   1884     swapBuffers();
   1885     releasePattern();
   1886 }
   1887 
   1888 /**
   1889  * Append the <code>toPattern()</code> representation of a
   1890  * string to the given <code>StringBuffer</code>.
   1891  */
   1892 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
   1893 escapeUnprintable) {
   1894     UChar32 cp;
   1895     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
   1896         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
   1897     }
   1898 }
   1899 
   1900 /**
   1901  * Append the <code>toPattern()</code> representation of a
   1902  * character to the given <code>StringBuffer</code>.
   1903  */
   1904 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
   1905 escapeUnprintable) {
   1906     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
   1907         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
   1908         // unprintable
   1909         if (ICU_Utility::escapeUnprintable(buf, c)) {
   1910             return;
   1911         }
   1912     }
   1913     // Okay to let ':' pass through
   1914     switch (c) {
   1915     case SET_OPEN:
   1916     case SET_CLOSE:
   1917     case HYPHEN:
   1918     case COMPLEMENT:
   1919     case INTERSECTION:
   1920     case BACKSLASH:
   1921     case OPEN_BRACE:
   1922     case CLOSE_BRACE:
   1923     case COLON:
   1924     case SymbolTable::SYMBOL_REF:
   1925         buf.append(BACKSLASH);
   1926         break;
   1927     default:
   1928         // Escape whitespace
   1929         if (uprv_isRuleWhiteSpace(c)) {
   1930             buf.append(BACKSLASH);
   1931         }
   1932         break;
   1933     }
   1934     buf.append(c);
   1935 }
   1936 
   1937 /**
   1938  * Append a string representation of this set to result.  This will be
   1939  * a cleaned version of the string passed to applyPattern(), if there
   1940  * is one.  Otherwise it will be generated.
   1941  */
   1942 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
   1943                                       UBool escapeUnprintable) const
   1944 {
   1945     if (pat != NULL) {
   1946         int32_t i;
   1947         int32_t backslashCount = 0;
   1948         for (i=0; i<patLen; ) {
   1949             UChar32 c;
   1950             U16_NEXT(pat, i, patLen, c);
   1951             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
   1952                 // If the unprintable character is preceded by an odd
   1953                 // number of backslashes, then it has been escaped.
   1954                 // Before unescaping it, we delete the final
   1955                 // backslash.
   1956                 if ((backslashCount % 2) == 1) {
   1957                     result.truncate(result.length() - 1);
   1958                 }
   1959                 ICU_Utility::escapeUnprintable(result, c);
   1960                 backslashCount = 0;
   1961             } else {
   1962                 result.append(c);
   1963                 if (c == BACKSLASH) {
   1964                     ++backslashCount;
   1965                 } else {
   1966                     backslashCount = 0;
   1967                 }
   1968             }
   1969         }
   1970         return result;
   1971     }
   1972 
   1973     return _generatePattern(result, escapeUnprintable);
   1974 }
   1975 
   1976 /**
   1977  * Returns a string representation of this set.  If the result of
   1978  * calling this function is passed to a UnicodeSet constructor, it
   1979  * will produce another set that is equal to this one.
   1980  */
   1981 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
   1982                                      UBool escapeUnprintable) const
   1983 {
   1984     result.truncate(0);
   1985     return _toPattern(result, escapeUnprintable);
   1986 }
   1987 
   1988 /**
   1989  * Generate and append a string representation of this set to result.
   1990  * This does not use this.pat, the cleaned up copy of the string
   1991  * passed to applyPattern().
   1992  */
   1993 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
   1994                                             UBool escapeUnprintable) const
   1995 {
   1996     result.append(SET_OPEN);
   1997 
   1998 //  // Check against the predefined categories.  We implicitly build
   1999 //  // up ALL category sets the first time toPattern() is called.
   2000 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
   2001 //      if (*this == getCategorySet(cat)) {
   2002 //          result.append(COLON);
   2003 //          result.append(CATEGORY_NAMES, cat*2, 2);
   2004 //          return result.append(CATEGORY_CLOSE);
   2005 //      }
   2006 //  }
   2007 
   2008     int32_t count = getRangeCount();
   2009 
   2010     // If the set contains at least 2 intervals and includes both
   2011     // MIN_VALUE and MAX_VALUE, then the inverse representation will
   2012     // be more economical.
   2013     if (count > 1 &&
   2014         getRangeStart(0) == MIN_VALUE &&
   2015         getRangeEnd(count-1) == MAX_VALUE) {
   2016 
   2017         // Emit the inverse
   2018         result.append(COMPLEMENT);
   2019 
   2020         for (int32_t i = 1; i < count; ++i) {
   2021             UChar32 start = getRangeEnd(i-1)+1;
   2022             UChar32 end = getRangeStart(i)-1;
   2023             _appendToPat(result, start, escapeUnprintable);
   2024             if (start != end) {
   2025                 if ((start+1) != end) {
   2026                     result.append(HYPHEN);
   2027                 }
   2028                 _appendToPat(result, end, escapeUnprintable);
   2029             }
   2030         }
   2031     }
   2032 
   2033     // Default; emit the ranges as pairs
   2034     else {
   2035         for (int32_t i = 0; i < count; ++i) {
   2036             UChar32 start = getRangeStart(i);
   2037             UChar32 end = getRangeEnd(i);
   2038             _appendToPat(result, start, escapeUnprintable);
   2039             if (start != end) {
   2040                 if ((start+1) != end) {
   2041                     result.append(HYPHEN);
   2042                 }
   2043                 _appendToPat(result, end, escapeUnprintable);
   2044             }
   2045         }
   2046     }
   2047 
   2048     for (int32_t i = 0; i<strings->size(); ++i) {
   2049         result.append(OPEN_BRACE);
   2050         _appendToPat(result,
   2051                      *(const UnicodeString*) strings->elementAt(i),
   2052                      escapeUnprintable);
   2053         result.append(CLOSE_BRACE);
   2054     }
   2055     return result.append(SET_CLOSE);
   2056 }
   2057 
   2058 /**
   2059 * Release existing cached pattern
   2060 */
   2061 void UnicodeSet::releasePattern() {
   2062     if (pat) {
   2063         uprv_free(pat);
   2064         pat = NULL;
   2065         patLen = 0;
   2066     }
   2067 }
   2068 
   2069 /**
   2070 * Set the new pattern to cache.
   2071 */
   2072 void UnicodeSet::setPattern(const UnicodeString& newPat) {
   2073     releasePattern();
   2074     int32_t newPatLen = newPat.length();
   2075     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
   2076     if (pat) {
   2077         patLen = newPatLen;
   2078         newPat.extractBetween(0, patLen, pat);
   2079         pat[patLen] = 0;
   2080     }
   2081     // else we don't care if malloc failed. This was just a nice cache.
   2082     // We can regenerate an equivalent pattern later when requested.
   2083 }
   2084 
   2085 UnicodeFunctor *UnicodeSet::freeze() {
   2086     if(!isFrozen() && !isBogus()) {
   2087         // Do most of what compact() does before freezing because
   2088         // compact() will not work when the set is frozen.
   2089         // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
   2090 
   2091         // Delete buffer first to defragment memory less.
   2092         if (buffer != NULL) {
   2093             uprv_free(buffer);
   2094             buffer = NULL;
   2095         }
   2096         if (capacity > (len + GROW_EXTRA)) {
   2097             // Make the capacity equal to len or 1.
   2098             // We don't want to realloc of 0 size.
   2099             capacity = len + (len == 0);
   2100             list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
   2101             if (list == NULL) { // Check for memory allocation error.
   2102                 setToBogus();
   2103                 return this;
   2104             }
   2105         }
   2106 
   2107         // Optimize contains() and span() and similar functions.
   2108         if (!strings->isEmpty()) {
   2109             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
   2110             if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
   2111                 // All strings are irrelevant for span() etc. because
   2112                 // all of each string's code points are contained in this set.
   2113                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
   2114                 // many relevant strings as UTF-16.
   2115                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
   2116                 delete stringSpan;
   2117                 stringSpan = NULL;
   2118             }
   2119         }
   2120         if (stringSpan == NULL) {
   2121             // No span-relevant strings: Optimize for code point spans.
   2122             bmpSet=new BMPSet(list, len);
   2123             if (bmpSet == NULL) { // Check for memory allocation error.
   2124                 setToBogus();
   2125             }
   2126         }
   2127     }
   2128     return this;
   2129 }
   2130 
   2131 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
   2132     if(length>0 && bmpSet!=NULL) {
   2133         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
   2134     }
   2135     if(length<0) {
   2136         length=u_strlen(s);
   2137     }
   2138     if(length==0) {
   2139         return 0;
   2140     }
   2141     if(stringSpan!=NULL) {
   2142         return stringSpan->span(s, length, spanCondition);
   2143     } else if(!strings->isEmpty()) {
   2144         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
   2145                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
   2146                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
   2147         UnicodeSetStringSpan strSpan(*this, *strings, which);
   2148         if(strSpan.needsStringSpanUTF16()) {
   2149             return strSpan.span(s, length, spanCondition);
   2150         }
   2151     }
   2152 
   2153     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2154         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2155     }
   2156 
   2157     UChar32 c;
   2158     int32_t start=0, prev=0;
   2159     do {
   2160         U16_NEXT(s, start, length, c);
   2161         if(spanCondition!=contains(c)) {
   2162             break;
   2163         }
   2164     } while((prev=start)<length);
   2165     return prev;
   2166 }
   2167 
   2168 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
   2169     if(length>0 && bmpSet!=NULL) {
   2170         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
   2171     }
   2172     if(length<0) {
   2173         length=u_strlen(s);
   2174     }
   2175     if(length==0) {
   2176         return 0;
   2177     }
   2178     if(stringSpan!=NULL) {
   2179         return stringSpan->spanBack(s, length, spanCondition);
   2180     } else if(!strings->isEmpty()) {
   2181         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
   2182                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
   2183                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
   2184         UnicodeSetStringSpan strSpan(*this, *strings, which);
   2185         if(strSpan.needsStringSpanUTF16()) {
   2186             return strSpan.spanBack(s, length, spanCondition);
   2187         }
   2188     }
   2189 
   2190     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2191         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2192     }
   2193 
   2194     UChar32 c;
   2195     int32_t prev=length;
   2196     do {
   2197         U16_PREV(s, 0, length, c);
   2198         if(spanCondition!=contains(c)) {
   2199             break;
   2200         }
   2201     } while((prev=length)>0);
   2202     return prev;
   2203 }
   2204 
   2205 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
   2206     if(length>0 && bmpSet!=NULL) {
   2207         const uint8_t *s0=(const uint8_t *)s;
   2208         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
   2209     }
   2210     if(length<0) {
   2211         length=(int32_t)uprv_strlen(s);
   2212     }
   2213     if(length==0) {
   2214         return 0;
   2215     }
   2216     if(stringSpan!=NULL) {
   2217         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
   2218     } else if(!strings->isEmpty()) {
   2219         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
   2220                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
   2221                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
   2222         UnicodeSetStringSpan strSpan(*this, *strings, which);
   2223         if(strSpan.needsStringSpanUTF8()) {
   2224             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
   2225         }
   2226     }
   2227 
   2228     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2229         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2230     }
   2231 
   2232     UChar32 c;
   2233     int32_t start=0, prev=0;
   2234     do {
   2235         U8_NEXT(s, start, length, c);
   2236         if(c<0) {
   2237             c=0xfffd;
   2238         }
   2239         if(spanCondition!=contains(c)) {
   2240             break;
   2241         }
   2242     } while((prev=start)<length);
   2243     return prev;
   2244 }
   2245 
   2246 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
   2247     if(length>0 && bmpSet!=NULL) {
   2248         const uint8_t *s0=(const uint8_t *)s;
   2249         return bmpSet->spanBackUTF8(s0, length, spanCondition);
   2250     }
   2251     if(length<0) {
   2252         length=(int32_t)uprv_strlen(s);
   2253     }
   2254     if(length==0) {
   2255         return 0;
   2256     }
   2257     if(stringSpan!=NULL) {
   2258         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
   2259     } else if(!strings->isEmpty()) {
   2260         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
   2261                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
   2262                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
   2263         UnicodeSetStringSpan strSpan(*this, *strings, which);
   2264         if(strSpan.needsStringSpanUTF8()) {
   2265             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
   2266         }
   2267     }
   2268 
   2269     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2270         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2271     }
   2272 
   2273     UChar32 c;
   2274     int32_t prev=length;
   2275     do {
   2276         U8_PREV(s, 0, length, c);
   2277         if(c<0) {
   2278             c=0xfffd;
   2279         }
   2280         if(spanCondition!=contains(c)) {
   2281             break;
   2282         }
   2283     } while((prev=length)>0);
   2284     return prev;
   2285 }
   2286 
   2287 U_NAMESPACE_END
   2288