Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  uniset_props.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2004aug25
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Character property dependent functions moved here from uniset.cpp
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/parsepos.h"
     22 #include "unicode/uchar.h"
     23 #include "unicode/uscript.h"
     24 #include "unicode/symtable.h"
     25 #include "unicode/uset.h"
     26 #include "unicode/locid.h"
     27 #include "unicode/brkiter.h"
     28 #include "uset_imp.h"
     29 #include "ruleiter.h"
     30 #include "cmemory.h"
     31 #include "ucln_cmn.h"
     32 #include "util.h"
     33 #include "uvector.h"
     34 #include "uprops.h"
     35 #include "propname.h"
     36 #include "normalizer2impl.h"
     37 #include "ucase.h"
     38 #include "ubidi_props.h"
     39 #include "uinvchar.h"
     40 #include "uprops.h"
     41 #include "charstr.h"
     42 #include "cstring.h"
     43 #include "mutex.h"
     44 #include "umutex.h"
     45 #include "uassert.h"
     46 #include "hash.h"
     47 
     48 U_NAMESPACE_USE
     49 
     50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     51 
     52 // initial storage. Must be >= 0
     53 // *** same as in uniset.cpp ! ***
     54 #define START_EXTRA 16
     55 
     56 // Define UChar constants using hex for EBCDIC compatibility
     57 // Used #define to reduce private static exports and memory access time.
     58 #define SET_OPEN        ((UChar)0x005B) /*[*/
     59 #define SET_CLOSE       ((UChar)0x005D) /*]*/
     60 #define HYPHEN          ((UChar)0x002D) /*-*/
     61 #define COMPLEMENT      ((UChar)0x005E) /*^*/
     62 #define COLON           ((UChar)0x003A) /*:*/
     63 #define BACKSLASH       ((UChar)0x005C) /*\*/
     64 #define INTERSECTION    ((UChar)0x0026) /*&*/
     65 #define UPPER_U         ((UChar)0x0055) /*U*/
     66 #define LOWER_U         ((UChar)0x0075) /*u*/
     67 #define OPEN_BRACE      ((UChar)123)    /*{*/
     68 #define CLOSE_BRACE     ((UChar)125)    /*}*/
     69 #define UPPER_P         ((UChar)0x0050) /*P*/
     70 #define LOWER_P         ((UChar)0x0070) /*p*/
     71 #define UPPER_N         ((UChar)78)     /*N*/
     72 #define EQUALS          ((UChar)0x003D) /*=*/
     73 
     74 //static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
     75 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
     76 //static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
     77 static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
     78 //static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
     79 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
     80 
     81 // Special property set IDs
     82 static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
     83 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
     84 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
     85 
     86 // Unicode name property alias
     87 #define NAME_PROP "na"
     88 #define NAME_PROP_LENGTH 2
     89 
     90 /**
     91  * Delimiter string used in patterns to close a category reference:
     92  * ":]".  Example: "[:Lu:]".
     93  */
     94 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
     95 
     96 // Cached sets ------------------------------------------------------------- ***
     97 
     98 U_CDECL_BEGIN
     99 static UBool U_CALLCONV uset_cleanup();
    100 U_CDECL_END
    101 
    102 // Not a TriStateSingletonWrapper because we think the UnicodeSet constructor
    103 // can only fail with an out-of-memory error
    104 // if we have a correct pattern and the properties data is hardcoded and always available.
    105 class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> {
    106 public:
    107     UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) :
    108             SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {}
    109     UnicodeSet *getInstance(UErrorCode &errorCode) {
    110         return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode);
    111     }
    112 private:
    113     static void *createInstance(const void *context, UErrorCode &errorCode) {
    114         UnicodeString pattern((const char *)context, -1, US_INV);
    115         UnicodeSet *set=new UnicodeSet(pattern, errorCode);
    116         if(set==NULL) {
    117             errorCode=U_MEMORY_ALLOCATION_ERROR;
    118         }
    119         set->freeze();
    120         ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
    121         return set;
    122     }
    123 
    124     const char *fPattern;
    125 };
    126 
    127 U_CDECL_BEGIN
    128 
    129 static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
    130 
    131 STATIC_SIMPLE_SINGLETON(uni32Singleton);
    132 
    133 //----------------------------------------------------------------
    134 // Inclusions list
    135 //----------------------------------------------------------------
    136 
    137 // USetAdder implementation
    138 // Does not use uset.h to reduce code dependencies
    139 static void U_CALLCONV
    140 _set_add(USet *set, UChar32 c) {
    141     ((UnicodeSet *)set)->add(c);
    142 }
    143 
    144 static void U_CALLCONV
    145 _set_addRange(USet *set, UChar32 start, UChar32 end) {
    146     ((UnicodeSet *)set)->add(start, end);
    147 }
    148 
    149 static void U_CALLCONV
    150 _set_addString(USet *set, const UChar *str, int32_t length) {
    151     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
    152 }
    153 
    154 /**
    155  * Cleanup function for UnicodeSet
    156  */
    157 static UBool U_CALLCONV uset_cleanup(void) {
    158     int32_t i;
    159 
    160     for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
    161         if (INCLUSIONS[i] != NULL) {
    162             delete INCLUSIONS[i];
    163             INCLUSIONS[i] = NULL;
    164         }
    165     }
    166     UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance();
    167     return TRUE;
    168 }
    169 
    170 U_CDECL_END
    171 
    172 U_NAMESPACE_BEGIN
    173 
    174 /*
    175 Reduce excessive reallocation, and make it easier to detect initialization
    176 problems.
    177 Usually you don't see smaller sets than this for Unicode 5.0.
    178 */
    179 #define DEFAULT_INCLUSION_CAPACITY 3072
    180 
    181 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
    182     UBool needInit;
    183     UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit);
    184     if (needInit) {
    185         UnicodeSet* incl = new UnicodeSet();
    186         USetAdder sa = {
    187             (USet *)incl,
    188             _set_add,
    189             _set_addRange,
    190             _set_addString,
    191             NULL, // don't need remove()
    192             NULL // don't need removeRange()
    193         };
    194         incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
    195         if (incl != NULL) {
    196             switch(src) {
    197             case UPROPS_SRC_CHAR:
    198                 uchar_addPropertyStarts(&sa, &status);
    199                 break;
    200             case UPROPS_SRC_PROPSVEC:
    201                 upropsvec_addPropertyStarts(&sa, &status);
    202                 break;
    203             case UPROPS_SRC_CHAR_AND_PROPSVEC:
    204                 uchar_addPropertyStarts(&sa, &status);
    205                 upropsvec_addPropertyStarts(&sa, &status);
    206                 break;
    207 #if !UCONFIG_NO_NORMALIZATION
    208             case UPROPS_SRC_CASE_AND_NORM: {
    209                 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
    210                 if(U_SUCCESS(status)) {
    211                     impl->addPropertyStarts(&sa, status);
    212                 }
    213                 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
    214                 break;
    215             }
    216             case UPROPS_SRC_NFC: {
    217                 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
    218                 if(U_SUCCESS(status)) {
    219                     impl->addPropertyStarts(&sa, status);
    220                 }
    221                 break;
    222             }
    223             case UPROPS_SRC_NFKC: {
    224                 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
    225                 if(U_SUCCESS(status)) {
    226                     impl->addPropertyStarts(&sa, status);
    227                 }
    228                 break;
    229             }
    230             case UPROPS_SRC_NFKC_CF: {
    231                 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
    232                 if(U_SUCCESS(status)) {
    233                     impl->addPropertyStarts(&sa, status);
    234                 }
    235                 break;
    236             }
    237             case UPROPS_SRC_NFC_CANON_ITER: {
    238                 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
    239                 if(U_SUCCESS(status)) {
    240                     impl->addCanonIterPropertyStarts(&sa, status);
    241                 }
    242                 break;
    243             }
    244 #endif
    245             case UPROPS_SRC_CASE:
    246                 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
    247                 break;
    248             case UPROPS_SRC_BIDI:
    249                 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);
    250                 break;
    251             default:
    252                 status = U_INTERNAL_PROGRAM_ERROR;
    253                 break;
    254             }
    255             if (U_SUCCESS(status)) {
    256                 // Compact for caching
    257                 incl->compact();
    258                 umtx_lock(NULL);
    259                 if (INCLUSIONS[src] == NULL) {
    260                     INCLUSIONS[src] = incl;
    261                     incl = NULL;
    262                     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
    263                 }
    264                 umtx_unlock(NULL);
    265             }
    266             delete incl;
    267         } else {
    268             status = U_MEMORY_ALLOCATION_ERROR;
    269         }
    270     }
    271     return INCLUSIONS[src];
    272 }
    273 
    274 // Cache some sets for other services -------------------------------------- ***
    275 
    276 U_CFUNC UnicodeSet *
    277 uniset_getUnicode32Instance(UErrorCode &errorCode) {
    278     return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode);
    279 }
    280 
    281 // helper functions for matching of pattern syntax pieces ------------------ ***
    282 // these functions are parallel to the PERL_OPEN etc. strings above
    283 
    284 // using these functions is not only faster than UnicodeString::compare() and
    285 // caseCompare(), but they also make UnicodeSet work for simple patterns when
    286 // no Unicode properties data is available - when caseCompare() fails
    287 
    288 static inline UBool
    289 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
    290     UChar c;
    291     return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
    292 }
    293 
    294 /*static inline UBool
    295 isPerlClose(const UnicodeString &pattern, int32_t pos) {
    296     return pattern.charAt(pos)==CLOSE_BRACE;
    297 }*/
    298 
    299 static inline UBool
    300 isNameOpen(const UnicodeString &pattern, int32_t pos) {
    301     return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
    302 }
    303 
    304 static inline UBool
    305 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
    306     return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
    307 }
    308 
    309 /*static inline UBool
    310 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
    311     return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
    312 }*/
    313 
    314 // TODO memory debugging provided inside uniset.cpp
    315 // could be made available here but probably obsolete with use of modern
    316 // memory leak checker tools
    317 #define _dbgct(me)
    318 
    319 //----------------------------------------------------------------
    320 // Constructors &c
    321 //----------------------------------------------------------------
    322 
    323 /**
    324  * Constructs a set from the given pattern, optionally ignoring
    325  * white space.  See the class description for the syntax of the
    326  * pattern language.
    327  * @param pattern a string specifying what characters are in the set
    328  */
    329 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
    330                        UErrorCode& status) :
    331     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
    332     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
    333     fFlags(0)
    334 {
    335     if(U_SUCCESS(status)){
    336         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
    337         /* test for NULL */
    338         if(list == NULL) {
    339             status = U_MEMORY_ALLOCATION_ERROR;
    340         }else{
    341             allocateStrings(status);
    342             applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
    343         }
    344     }
    345     _dbgct(this);
    346 }
    347 
    348 /**
    349  * Constructs a set from the given pattern, optionally ignoring
    350  * white space.  See the class description for the syntax of the
    351  * pattern language.
    352  * @param pattern a string specifying what characters are in the set
    353  * @param options bitmask for options to apply to the pattern.
    354  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
    355  */
    356 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
    357                        uint32_t options,
    358                        const SymbolTable* symbols,
    359                        UErrorCode& status) :
    360     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
    361     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
    362     fFlags(0)
    363 {
    364     if(U_SUCCESS(status)){
    365         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
    366         /* test for NULL */
    367         if(list == NULL) {
    368             status = U_MEMORY_ALLOCATION_ERROR;
    369         }else{
    370             allocateStrings(status);
    371             applyPattern(pattern, options, symbols, status);
    372         }
    373     }
    374     _dbgct(this);
    375 }
    376 
    377 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
    378                        uint32_t options,
    379                        const SymbolTable* symbols,
    380                        UErrorCode& status) :
    381     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
    382     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
    383     fFlags(0)
    384 {
    385     if(U_SUCCESS(status)){
    386         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
    387         /* test for NULL */
    388         if(list == NULL) {
    389             status = U_MEMORY_ALLOCATION_ERROR;
    390         }else{
    391             allocateStrings(status);
    392             applyPattern(pattern, pos, options, symbols, status);
    393         }
    394     }
    395     _dbgct(this);
    396 }
    397 
    398 //----------------------------------------------------------------
    399 // Public API
    400 //----------------------------------------------------------------
    401 
    402 /**
    403  * Modifies this set to represent the set specified by the given
    404  * pattern, optionally ignoring white space.  See the class
    405  * description for the syntax of the pattern language.
    406  * @param pattern a string specifying what characters are in the set
    407  * @param ignoreSpaces if <code>true</code>, all spaces in the
    408  * pattern are ignored.  Spaces are those characters for which
    409  * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>.
    410  * Characters preceded by '\\' are escaped, losing any special
    411  * meaning they otherwise have.  Spaces may be included by
    412  * escaping them.
    413  * @exception <code>IllegalArgumentException</code> if the pattern
    414  * contains a syntax error.
    415  */
    416 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
    417                                      UErrorCode& status) {
    418     return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
    419 }
    420 
    421 
    422 /**
    423  * Modifies this set to represent the set specified by the given
    424  * pattern, optionally ignoring white space.  See the class
    425  * description for the syntax of the pattern language.
    426  * @param pattern a string specifying what characters are in the set
    427  * @param options bitmask for options to apply to the pattern.
    428  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
    429  */
    430 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
    431                                      uint32_t options,
    432                                      const SymbolTable* symbols,
    433                                      UErrorCode& status) {
    434     if (U_FAILURE(status) || isFrozen()) {
    435         return *this;
    436     }
    437 
    438     ParsePosition pos(0);
    439     applyPattern(pattern, pos, options, symbols, status);
    440     if (U_FAILURE(status)) return *this;
    441 
    442     int32_t i = pos.getIndex();
    443 
    444     if (options & USET_IGNORE_SPACE) {
    445         // Skip over trailing whitespace
    446         ICU_Utility::skipWhitespace(pattern, i, TRUE);
    447     }
    448 
    449     if (i != pattern.length()) {
    450         status = U_ILLEGAL_ARGUMENT_ERROR;
    451     }
    452     return *this;
    453 }
    454 
    455 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
    456                               ParsePosition& pos,
    457                               uint32_t options,
    458                               const SymbolTable* symbols,
    459                               UErrorCode& status) {
    460     if (U_FAILURE(status) || isFrozen()) {
    461         return *this;
    462     }
    463     // Need to build the pattern in a temporary string because
    464     // _applyPattern calls add() etc., which set pat to empty.
    465     UnicodeString rebuiltPat;
    466     RuleCharacterIterator chars(pattern, symbols, pos);
    467     applyPattern(chars, symbols, rebuiltPat, options, status);
    468     if (U_FAILURE(status)) return *this;
    469     if (chars.inVariable()) {
    470         // syntaxError(chars, "Extra chars in variable value");
    471         status = U_MALFORMED_SET;
    472         return *this;
    473     }
    474     setPattern(rebuiltPat);
    475     return *this;
    476 }
    477 
    478 /**
    479  * Return true if the given position, in the given pattern, appears
    480  * to be the start of a UnicodeSet pattern.
    481  */
    482 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
    483     return ((pos+1) < pattern.length() &&
    484             pattern.charAt(pos) == (UChar)91/*[*/) ||
    485         resemblesPropertyPattern(pattern, pos);
    486 }
    487 
    488 //----------------------------------------------------------------
    489 // Implementation: Pattern parsing
    490 //----------------------------------------------------------------
    491 
    492 /**
    493  * A small all-inline class to manage a UnicodeSet pointer.  Add
    494  * operator->() etc. as needed.
    495  */
    496 class UnicodeSetPointer {
    497     UnicodeSet* p;
    498 public:
    499     inline UnicodeSetPointer() : p(0) {}
    500     inline ~UnicodeSetPointer() { delete p; }
    501     inline UnicodeSet* pointer() { return p; }
    502     inline UBool allocate() {
    503         if (p == 0) {
    504             p = new UnicodeSet();
    505         }
    506         return p != 0;
    507     }
    508 };
    509 
    510 /**
    511  * Parse the pattern from the given RuleCharacterIterator.  The
    512  * iterator is advanced over the parsed pattern.
    513  * @param chars iterator over the pattern characters.  Upon return
    514  * it will be advanced to the first character after the parsed
    515  * pattern, or the end of the iteration if all characters are
    516  * parsed.
    517  * @param symbols symbol table to use to parse and dereference
    518  * variables, or null if none.
    519  * @param rebuiltPat the pattern that was parsed, rebuilt or
    520  * copied from the input pattern, as appropriate.
    521  * @param options a bit mask of zero or more of the following:
    522  * IGNORE_SPACE, CASE.
    523  */
    524 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
    525                               const SymbolTable* symbols,
    526                               UnicodeString& rebuiltPat,
    527                               uint32_t options,
    528                               UErrorCode& ec) {
    529     if (U_FAILURE(ec)) return;
    530 
    531     // Syntax characters: [ ] ^ - & { }
    532 
    533     // Recognized special forms for chars, sets: c-c s-s s&s
    534 
    535     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
    536                    RuleCharacterIterator::PARSE_ESCAPES;
    537     if ((options & USET_IGNORE_SPACE) != 0) {
    538         opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    539     }
    540 
    541     UnicodeString patLocal, buf;
    542     UBool usePat = FALSE;
    543     UnicodeSetPointer scratch;
    544     RuleCharacterIterator::Pos backup;
    545 
    546     // mode: 0=before [, 1=between [...], 2=after ]
    547     // lastItem: 0=none, 1=char, 2=set
    548     int8_t lastItem = 0, mode = 0;
    549     UChar32 lastChar = 0;
    550     UChar op = 0;
    551 
    552     UBool invert = FALSE;
    553 
    554     clear();
    555 
    556     while (mode != 2 && !chars.atEnd()) {
    557         U_ASSERT((lastItem == 0 && op == 0) ||
    558                  (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
    559                  (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
    560                                     op == INTERSECTION /*'&'*/)));
    561 
    562         UChar32 c = 0;
    563         UBool literal = FALSE;
    564         UnicodeSet* nested = 0; // alias - do not delete
    565 
    566         // -------- Check for property pattern
    567 
    568         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
    569         int8_t setMode = 0;
    570         if (resemblesPropertyPattern(chars, opts)) {
    571             setMode = 2;
    572         }
    573 
    574         // -------- Parse '[' of opening delimiter OR nested set.
    575         // If there is a nested set, use `setMode' to define how
    576         // the set should be parsed.  If the '[' is part of the
    577         // opening delimiter for this pattern, parse special
    578         // strings "[", "[^", "[-", and "[^-".  Check for stand-in
    579         // characters representing a nested set in the symbol
    580         // table.
    581 
    582         else {
    583             // Prepare to backup if necessary
    584             chars.getPos(backup);
    585             c = chars.next(opts, literal, ec);
    586             if (U_FAILURE(ec)) return;
    587 
    588             if (c == 0x5B /*'['*/ && !literal) {
    589                 if (mode == 1) {
    590                     chars.setPos(backup); // backup
    591                     setMode = 1;
    592                 } else {
    593                     // Handle opening '[' delimiter
    594                     mode = 1;
    595                     patLocal.append((UChar) 0x5B /*'['*/);
    596                     chars.getPos(backup); // prepare to backup
    597                     c = chars.next(opts, literal, ec);
    598                     if (U_FAILURE(ec)) return;
    599                     if (c == 0x5E /*'^'*/ && !literal) {
    600                         invert = TRUE;
    601                         patLocal.append((UChar) 0x5E /*'^'*/);
    602                         chars.getPos(backup); // prepare to backup
    603                         c = chars.next(opts, literal, ec);
    604                         if (U_FAILURE(ec)) return;
    605                     }
    606                     // Fall through to handle special leading '-';
    607                     // otherwise restart loop for nested [], \p{}, etc.
    608                     if (c == HYPHEN /*'-'*/) {
    609                         literal = TRUE;
    610                         // Fall through to handle literal '-' below
    611                     } else {
    612                         chars.setPos(backup); // backup
    613                         continue;
    614                     }
    615                 }
    616             } else if (symbols != 0) {
    617                 const UnicodeFunctor *m = symbols->lookupMatcher(c);
    618                 if (m != 0) {
    619                     const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
    620                     if (ms == NULL) {
    621                         ec = U_MALFORMED_SET;
    622                         return;
    623                     }
    624                     // casting away const, but `nested' won't be modified
    625                     // (important not to modify stored set)
    626                     nested = const_cast<UnicodeSet*>(ms);
    627                     setMode = 3;
    628                 }
    629             }
    630         }
    631 
    632         // -------- Handle a nested set.  This either is inline in
    633         // the pattern or represented by a stand-in that has
    634         // previously been parsed and was looked up in the symbol
    635         // table.
    636 
    637         if (setMode != 0) {
    638             if (lastItem == 1) {
    639                 if (op != 0) {
    640                     // syntaxError(chars, "Char expected after operator");
    641                     ec = U_MALFORMED_SET;
    642                     return;
    643                 }
    644                 add(lastChar, lastChar);
    645                 _appendToPat(patLocal, lastChar, FALSE);
    646                 lastItem = 0;
    647                 op = 0;
    648             }
    649 
    650             if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
    651                 patLocal.append(op);
    652             }
    653 
    654             if (nested == 0) {
    655                 // lazy allocation
    656                 if (!scratch.allocate()) {
    657                     ec = U_MEMORY_ALLOCATION_ERROR;
    658                     return;
    659                 }
    660                 nested = scratch.pointer();
    661             }
    662             switch (setMode) {
    663             case 1:
    664                 nested->applyPattern(chars, symbols, patLocal, options, ec);
    665                 break;
    666             case 2:
    667                 chars.skipIgnored(opts);
    668                 nested->applyPropertyPattern(chars, patLocal, ec);
    669                 if (U_FAILURE(ec)) return;
    670                 break;
    671             case 3: // `nested' already parsed
    672                 nested->_toPattern(patLocal, FALSE);
    673                 break;
    674             }
    675 
    676             usePat = TRUE;
    677 
    678             if (mode == 0) {
    679                 // Entire pattern is a category; leave parse loop
    680                 *this = *nested;
    681                 mode = 2;
    682                 break;
    683             }
    684 
    685             switch (op) {
    686             case HYPHEN: /*'-'*/
    687                 removeAll(*nested);
    688                 break;
    689             case INTERSECTION: /*'&'*/
    690                 retainAll(*nested);
    691                 break;
    692             case 0:
    693                 addAll(*nested);
    694                 break;
    695             }
    696 
    697             op = 0;
    698             lastItem = 2;
    699 
    700             continue;
    701         }
    702 
    703         if (mode == 0) {
    704             // syntaxError(chars, "Missing '['");
    705             ec = U_MALFORMED_SET;
    706             return;
    707         }
    708 
    709         // -------- Parse special (syntax) characters.  If the
    710         // current character is not special, or if it is escaped,
    711         // then fall through and handle it below.
    712 
    713         if (!literal) {
    714             switch (c) {
    715             case 0x5D /*']'*/:
    716                 if (lastItem == 1) {
    717                     add(lastChar, lastChar);
    718                     _appendToPat(patLocal, lastChar, FALSE);
    719                 }
    720                 // Treat final trailing '-' as a literal
    721                 if (op == HYPHEN /*'-'*/) {
    722                     add(op, op);
    723                     patLocal.append(op);
    724                 } else if (op == INTERSECTION /*'&'*/) {
    725                     // syntaxError(chars, "Trailing '&'");
    726                     ec = U_MALFORMED_SET;
    727                     return;
    728                 }
    729                 patLocal.append((UChar) 0x5D /*']'*/);
    730                 mode = 2;
    731                 continue;
    732             case HYPHEN /*'-'*/:
    733                 if (op == 0) {
    734                     if (lastItem != 0) {
    735                         op = (UChar) c;
    736                         continue;
    737                     } else {
    738                         // Treat final trailing '-' as a literal
    739                         add(c, c);
    740                         c = chars.next(opts, literal, ec);
    741                         if (U_FAILURE(ec)) return;
    742                         if (c == 0x5D /*']'*/ && !literal) {
    743                             patLocal.append(HYPHEN_RIGHT_BRACE);
    744                             mode = 2;
    745                             continue;
    746                         }
    747                     }
    748                 }
    749                 // syntaxError(chars, "'-' not after char or set");
    750                 ec = U_MALFORMED_SET;
    751                 return;
    752             case INTERSECTION /*'&'*/:
    753                 if (lastItem == 2 && op == 0) {
    754                     op = (UChar) c;
    755                     continue;
    756                 }
    757                 // syntaxError(chars, "'&' not after set");
    758                 ec = U_MALFORMED_SET;
    759                 return;
    760             case 0x5E /*'^'*/:
    761                 // syntaxError(chars, "'^' not after '['");
    762                 ec = U_MALFORMED_SET;
    763                 return;
    764             case 0x7B /*'{'*/:
    765                 if (op != 0) {
    766                     // syntaxError(chars, "Missing operand after operator");
    767                     ec = U_MALFORMED_SET;
    768                     return;
    769                 }
    770                 if (lastItem == 1) {
    771                     add(lastChar, lastChar);
    772                     _appendToPat(patLocal, lastChar, FALSE);
    773                 }
    774                 lastItem = 0;
    775                 buf.truncate(0);
    776                 {
    777                     UBool ok = FALSE;
    778                     while (!chars.atEnd()) {
    779                         c = chars.next(opts, literal, ec);
    780                         if (U_FAILURE(ec)) return;
    781                         if (c == 0x7D /*'}'*/ && !literal) {
    782                             ok = TRUE;
    783                             break;
    784                         }
    785                         buf.append(c);
    786                     }
    787                     if (buf.length() < 1 || !ok) {
    788                         // syntaxError(chars, "Invalid multicharacter string");
    789                         ec = U_MALFORMED_SET;
    790                         return;
    791                     }
    792                 }
    793                 // We have new string. Add it to set and continue;
    794                 // we don't need to drop through to the further
    795                 // processing
    796                 add(buf);
    797                 patLocal.append((UChar) 0x7B /*'{'*/);
    798                 _appendToPat(patLocal, buf, FALSE);
    799                 patLocal.append((UChar) 0x7D /*'}'*/);
    800                 continue;
    801             case SymbolTable::SYMBOL_REF:
    802                 //         symbols  nosymbols
    803                 // [a-$]   error    error (ambiguous)
    804                 // [a$]    anchor   anchor
    805                 // [a-$x]  var "x"* literal '$'
    806                 // [a-$.]  error    literal '$'
    807                 // *We won't get here in the case of var "x"
    808                 {
    809                     chars.getPos(backup);
    810                     c = chars.next(opts, literal, ec);
    811                     if (U_FAILURE(ec)) return;
    812                     UBool anchor = (c == 0x5D /*']'*/ && !literal);
    813                     if (symbols == 0 && !anchor) {
    814                         c = SymbolTable::SYMBOL_REF;
    815                         chars.setPos(backup);
    816                         break; // literal '$'
    817                     }
    818                     if (anchor && op == 0) {
    819                         if (lastItem == 1) {
    820                             add(lastChar, lastChar);
    821                             _appendToPat(patLocal, lastChar, FALSE);
    822                         }
    823                         add(U_ETHER);
    824                         usePat = TRUE;
    825                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);
    826                         patLocal.append((UChar) 0x5D /*']'*/);
    827                         mode = 2;
    828                         continue;
    829                     }
    830                     // syntaxError(chars, "Unquoted '$'");
    831                     ec = U_MALFORMED_SET;
    832                     return;
    833                 }
    834             default:
    835                 break;
    836             }
    837         }
    838 
    839         // -------- Parse literal characters.  This includes both
    840         // escaped chars ("\u4E01") and non-syntax characters
    841         // ("a").
    842 
    843         switch (lastItem) {
    844         case 0:
    845             lastItem = 1;
    846             lastChar = c;
    847             break;
    848         case 1:
    849             if (op == HYPHEN /*'-'*/) {
    850                 if (lastChar >= c) {
    851                     // Don't allow redundant (a-a) or empty (b-a) ranges;
    852                     // these are most likely typos.
    853                     // syntaxError(chars, "Invalid range");
    854                     ec = U_MALFORMED_SET;
    855                     return;
    856                 }
    857                 add(lastChar, c);
    858                 _appendToPat(patLocal, lastChar, FALSE);
    859                 patLocal.append(op);
    860                 _appendToPat(patLocal, c, FALSE);
    861                 lastItem = 0;
    862                 op = 0;
    863             } else {
    864                 add(lastChar, lastChar);
    865                 _appendToPat(patLocal, lastChar, FALSE);
    866                 lastChar = c;
    867             }
    868             break;
    869         case 2:
    870             if (op != 0) {
    871                 // syntaxError(chars, "Set expected after operator");
    872                 ec = U_MALFORMED_SET;
    873                 return;
    874             }
    875             lastChar = c;
    876             lastItem = 1;
    877             break;
    878         }
    879     }
    880 
    881     if (mode != 2) {
    882         // syntaxError(chars, "Missing ']'");
    883         ec = U_MALFORMED_SET;
    884         return;
    885     }
    886 
    887     chars.skipIgnored(opts);
    888 
    889     /**
    890      * Handle global flags (invert, case insensitivity).  If this
    891      * pattern should be compiled case-insensitive, then we need
    892      * to close over case BEFORE COMPLEMENTING.  This makes
    893      * patterns like /[^abc]/i work.
    894      */
    895     if ((options & USET_CASE_INSENSITIVE) != 0) {
    896         closeOver(USET_CASE_INSENSITIVE);
    897     }
    898     else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
    899         closeOver(USET_ADD_CASE_MAPPINGS);
    900     }
    901     if (invert) {
    902         complement();
    903     }
    904 
    905     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
    906     // generated pattern.
    907     if (usePat) {
    908         rebuiltPat.append(patLocal);
    909     } else {
    910         _generatePattern(rebuiltPat, FALSE);
    911     }
    912     if (isBogus() && U_SUCCESS(ec)) {
    913         // We likely ran out of memory. AHHH!
    914         ec = U_MEMORY_ALLOCATION_ERROR;
    915     }
    916 }
    917 
    918 //----------------------------------------------------------------
    919 // Property set implementation
    920 //----------------------------------------------------------------
    921 
    922 static UBool numericValueFilter(UChar32 ch, void* context) {
    923     return u_getNumericValue(ch) == *(double*)context;
    924 }
    925 
    926 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
    927     int32_t value = *(int32_t*)context;
    928     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
    929 }
    930 
    931 static UBool versionFilter(UChar32 ch, void* context) {
    932     static const UVersionInfo none = { 0, 0, 0, 0 };
    933     UVersionInfo v;
    934     u_charAge(ch, v);
    935     UVersionInfo* version = (UVersionInfo*)context;
    936     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
    937 }
    938 
    939 typedef struct {
    940     UProperty prop;
    941     int32_t value;
    942 } IntPropertyContext;
    943 
    944 static UBool intPropertyFilter(UChar32 ch, void* context) {
    945     IntPropertyContext* c = (IntPropertyContext*)context;
    946     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
    947 }
    948 
    949 static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
    950     return uscript_hasScript(ch, *(UScriptCode*)context);
    951 }
    952 
    953 /**
    954  * Generic filter-based scanning code for UCD property UnicodeSets.
    955  */
    956 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
    957                              void* context,
    958                              int32_t src,
    959                              UErrorCode &status) {
    960     if (U_FAILURE(status)) return;
    961 
    962     // Logically, walk through all Unicode characters, noting the start
    963     // and end of each range for which filter.contain(c) is
    964     // true.  Add each range to a set.
    965     //
    966     // To improve performance, use an inclusions set which
    967     // encodes information about character ranges that are known
    968     // to have identical properties.
    969     // getInclusions(src) contains exactly the first characters of
    970     // same-value ranges for the given properties "source".
    971     const UnicodeSet* inclusions = getInclusions(src, status);
    972     if (U_FAILURE(status)) {
    973         return;
    974     }
    975 
    976     clear();
    977 
    978     UChar32 startHasProperty = -1;
    979     int32_t limitRange = inclusions->getRangeCount();
    980 
    981     for (int j=0; j<limitRange; ++j) {
    982         // get current range
    983         UChar32 start = inclusions->getRangeStart(j);
    984         UChar32 end = inclusions->getRangeEnd(j);
    985 
    986         // for all the code points in the range, process
    987         for (UChar32 ch = start; ch <= end; ++ch) {
    988             // only add to this UnicodeSet on inflection points --
    989             // where the hasProperty value changes to false
    990             if ((*filter)(ch, context)) {
    991                 if (startHasProperty < 0) {
    992                     startHasProperty = ch;
    993                 }
    994             } else if (startHasProperty >= 0) {
    995                 add(startHasProperty, ch-1);
    996                 startHasProperty = -1;
    997             }
    998         }
    999     }
   1000     if (startHasProperty >= 0) {
   1001         add((UChar32)startHasProperty, (UChar32)0x10FFFF);
   1002     }
   1003     if (isBogus() && U_SUCCESS(status)) {
   1004         // We likely ran out of memory. AHHH!
   1005         status = U_MEMORY_ALLOCATION_ERROR;
   1006     }
   1007 }
   1008 
   1009 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
   1010     /* Note: we use ' ' in compiler code page */
   1011     int32_t j = 0;
   1012     char ch;
   1013     --dstCapacity; /* make room for term. zero */
   1014     while ((ch = *src++) != 0) {
   1015         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
   1016             continue;
   1017         }
   1018         if (j >= dstCapacity) return FALSE;
   1019         dst[j++] = ch;
   1020     }
   1021     if (j > 0 && dst[j-1] == ' ') --j;
   1022     dst[j] = 0;
   1023     return TRUE;
   1024 }
   1025 
   1026 //----------------------------------------------------------------
   1027 // Property set API
   1028 //----------------------------------------------------------------
   1029 
   1030 #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
   1031 
   1032 UnicodeSet&
   1033 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
   1034     if (U_FAILURE(ec) || isFrozen()) return *this;
   1035 
   1036     if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
   1037         applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
   1038     } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
   1039         UScriptCode script = (UScriptCode)value;
   1040         applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
   1041     } else {
   1042         IntPropertyContext c = {prop, value};
   1043         applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
   1044     }
   1045     return *this;
   1046 }
   1047 
   1048 UnicodeSet&
   1049 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
   1050                                const UnicodeString& value,
   1051                                UErrorCode& ec) {
   1052     if (U_FAILURE(ec) || isFrozen()) return *this;
   1053 
   1054     // prop and value used to be converted to char * using the default
   1055     // converter instead of the invariant conversion.
   1056     // This should not be necessary because all Unicode property and value
   1057     // names use only invariant characters.
   1058     // If there are any variant characters, then we won't find them anyway.
   1059     // Checking first avoids assertion failures in the conversion.
   1060     if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
   1061         !uprv_isInvariantUString(value.getBuffer(), value.length())
   1062     ) {
   1063         FAIL(ec);
   1064     }
   1065     CharString pname, vname;
   1066     pname.appendInvariantChars(prop, ec);
   1067     vname.appendInvariantChars(value, ec);
   1068     if (U_FAILURE(ec)) return *this;
   1069 
   1070     UProperty p;
   1071     int32_t v;
   1072     UBool mustNotBeEmpty = FALSE, invert = FALSE;
   1073 
   1074     if (value.length() > 0) {
   1075         p = u_getPropertyEnum(pname.data());
   1076         if (p == UCHAR_INVALID_CODE) FAIL(ec);
   1077 
   1078         // Treat gc as gcm
   1079         if (p == UCHAR_GENERAL_CATEGORY) {
   1080             p = UCHAR_GENERAL_CATEGORY_MASK;
   1081         }
   1082 
   1083         if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
   1084             (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
   1085             (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
   1086             v = u_getPropertyValueEnum(p, vname.data());
   1087             if (v == UCHAR_INVALID_CODE) {
   1088                 // Handle numeric CCC
   1089                 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
   1090                     p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
   1091                     p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
   1092                     char* end;
   1093                     double value = uprv_strtod(vname.data(), &end);
   1094                     v = (int32_t) value;
   1095                     if (v != value || v < 0 || *end != 0) {
   1096                         // non-integral or negative value, or trailing junk
   1097                         FAIL(ec);
   1098                     }
   1099                     // If the resultant set is empty then the numeric value
   1100                     // was invalid.
   1101                     mustNotBeEmpty = TRUE;
   1102                 } else {
   1103                     FAIL(ec);
   1104                 }
   1105             }
   1106         }
   1107 
   1108         else {
   1109 
   1110             switch (p) {
   1111             case UCHAR_NUMERIC_VALUE:
   1112                 {
   1113                     char* end;
   1114                     double value = uprv_strtod(vname.data(), &end);
   1115                     if (*end != 0) {
   1116                         FAIL(ec);
   1117                     }
   1118                     applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
   1119                     return *this;
   1120                 }
   1121                 break;
   1122             case UCHAR_NAME:
   1123             case UCHAR_UNICODE_1_NAME:
   1124                 {
   1125                     // Must munge name, since u_charFromName() does not do
   1126                     // 'loose' matching.
   1127                     char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
   1128                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
   1129                     UCharNameChoice choice = (p == UCHAR_NAME) ?
   1130                         U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;
   1131                     UChar32 ch = u_charFromName(choice, buf, &ec);
   1132                     if (U_SUCCESS(ec)) {
   1133                         clear();
   1134                         add(ch);
   1135                         return *this;
   1136                     } else {
   1137                         FAIL(ec);
   1138                     }
   1139                 }
   1140                 break;
   1141             case UCHAR_AGE:
   1142                 {
   1143                     // Must munge name, since u_versionFromString() does not do
   1144                     // 'loose' matching.
   1145                     char buf[128];
   1146                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
   1147                     UVersionInfo version;
   1148                     u_versionFromString(version, buf);
   1149                     applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
   1150                     return *this;
   1151                 }
   1152                 break;
   1153             case UCHAR_SCRIPT_EXTENSIONS:
   1154                 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
   1155                 if (v == UCHAR_INVALID_CODE) {
   1156                     FAIL(ec);
   1157                 }
   1158                 // fall through to calling applyIntPropertyValue()
   1159                 break;
   1160             default:
   1161                 // p is a non-binary, non-enumerated property that we
   1162                 // don't support (yet).
   1163                 FAIL(ec);
   1164             }
   1165         }
   1166     }
   1167 
   1168     else {
   1169         // value is empty.  Interpret as General Category, Script, or
   1170         // Binary property.
   1171         p = UCHAR_GENERAL_CATEGORY_MASK;
   1172         v = u_getPropertyValueEnum(p, pname.data());
   1173         if (v == UCHAR_INVALID_CODE) {
   1174             p = UCHAR_SCRIPT;
   1175             v = u_getPropertyValueEnum(p, pname.data());
   1176             if (v == UCHAR_INVALID_CODE) {
   1177                 p = u_getPropertyEnum(pname.data());
   1178                 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
   1179                     v = 1;
   1180                 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
   1181                     set(MIN_VALUE, MAX_VALUE);
   1182                     return *this;
   1183                 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
   1184                     set(0, 0x7F);
   1185                     return *this;
   1186                 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
   1187                     // [:Assigned:]=[:^Cn:]
   1188                     p = UCHAR_GENERAL_CATEGORY_MASK;
   1189                     v = U_GC_CN_MASK;
   1190                     invert = TRUE;
   1191                 } else {
   1192                     FAIL(ec);
   1193                 }
   1194             }
   1195         }
   1196     }
   1197 
   1198     applyIntPropertyValue(p, v, ec);
   1199     if(invert) {
   1200         complement();
   1201     }
   1202 
   1203     if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
   1204         // mustNotBeEmpty is set to true if an empty set indicates
   1205         // invalid input.
   1206         ec = U_ILLEGAL_ARGUMENT_ERROR;
   1207     }
   1208 
   1209     if (isBogus() && U_SUCCESS(ec)) {
   1210         // We likely ran out of memory. AHHH!
   1211         ec = U_MEMORY_ALLOCATION_ERROR;
   1212     }
   1213     return *this;
   1214 }
   1215 
   1216 //----------------------------------------------------------------
   1217 // Property set patterns
   1218 //----------------------------------------------------------------
   1219 
   1220 /**
   1221  * Return true if the given position, in the given pattern, appears
   1222  * to be the start of a property set pattern.
   1223  */
   1224 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
   1225                                            int32_t pos) {
   1226     // Patterns are at least 5 characters long
   1227     if ((pos+5) > pattern.length()) {
   1228         return FALSE;
   1229     }
   1230 
   1231     // Look for an opening [:, [:^, \p, or \P
   1232     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
   1233 }
   1234 
   1235 /**
   1236  * Return true if the given iterator appears to point at a
   1237  * property pattern.  Regardless of the result, return with the
   1238  * iterator unchanged.
   1239  * @param chars iterator over the pattern characters.  Upon return
   1240  * it will be unchanged.
   1241  * @param iterOpts RuleCharacterIterator options
   1242  */
   1243 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
   1244                                            int32_t iterOpts) {
   1245     // NOTE: literal will always be FALSE, because we don't parse escapes.
   1246     UBool result = FALSE, literal;
   1247     UErrorCode ec = U_ZERO_ERROR;
   1248     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
   1249     RuleCharacterIterator::Pos pos;
   1250     chars.getPos(pos);
   1251     UChar32 c = chars.next(iterOpts, literal, ec);
   1252     if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
   1253         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
   1254                                literal, ec);
   1255         result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
   1256                  (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
   1257     }
   1258     chars.setPos(pos);
   1259     return result && U_SUCCESS(ec);
   1260 }
   1261 
   1262 /**
   1263  * Parse the given property pattern at the given parse position.
   1264  */
   1265 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
   1266                                              ParsePosition& ppos,
   1267                                              UErrorCode &ec) {
   1268     int32_t pos = ppos.getIndex();
   1269 
   1270     UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
   1271     UBool isName = FALSE; // true for \N{pat}, o/w false
   1272     UBool invert = FALSE;
   1273 
   1274     if (U_FAILURE(ec)) return *this;
   1275 
   1276     // Minimum length is 5 characters, e.g. \p{L}
   1277     if ((pos+5) > pattern.length()) {
   1278         FAIL(ec);
   1279     }
   1280 
   1281     // On entry, ppos should point to one of the following locations:
   1282     // Look for an opening [:, [:^, \p, or \P
   1283     if (isPOSIXOpen(pattern, pos)) {
   1284         posix = TRUE;
   1285         pos += 2;
   1286         pos = ICU_Utility::skipWhitespace(pattern, pos);
   1287         if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
   1288             ++pos;
   1289             invert = TRUE;
   1290         }
   1291     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
   1292         UChar c = pattern.charAt(pos+1);
   1293         invert = (c == UPPER_P);
   1294         isName = (c == UPPER_N);
   1295         pos += 2;
   1296         pos = ICU_Utility::skipWhitespace(pattern, pos);
   1297         if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
   1298             // Syntax error; "\p" or "\P" not followed by "{"
   1299             FAIL(ec);
   1300         }
   1301     } else {
   1302         // Open delimiter not seen
   1303         FAIL(ec);
   1304     }
   1305 
   1306     // Look for the matching close delimiter, either :] or }
   1307     int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
   1308     if (close < 0) {
   1309         // Syntax error; close delimiter missing
   1310         FAIL(ec);
   1311     }
   1312 
   1313     // Look for an '=' sign.  If this is present, we will parse a
   1314     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
   1315     // pattern.
   1316     int32_t equals = pattern.indexOf(EQUALS, pos);
   1317     UnicodeString propName, valueName;
   1318     if (equals >= 0 && equals < close && !isName) {
   1319         // Equals seen; parse medium/long pattern
   1320         pattern.extractBetween(pos, equals, propName);
   1321         pattern.extractBetween(equals+1, close, valueName);
   1322     }
   1323 
   1324     else {
   1325         // Handle case where no '=' is seen, and \N{}
   1326         pattern.extractBetween(pos, close, propName);
   1327 
   1328         // Handle \N{name}
   1329         if (isName) {
   1330             // This is a little inefficient since it means we have to
   1331             // parse NAME_PROP back to UCHAR_NAME even though we already
   1332             // know it's UCHAR_NAME.  If we refactor the API to
   1333             // support args of (UProperty, char*) then we can remove
   1334             // NAME_PROP and make this a little more efficient.
   1335             valueName = propName;
   1336             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
   1337         }
   1338     }
   1339 
   1340     applyPropertyAlias(propName, valueName, ec);
   1341 
   1342     if (U_SUCCESS(ec)) {
   1343         if (invert) {
   1344             complement();
   1345         }
   1346 
   1347         // Move to the limit position after the close delimiter if the
   1348         // parse succeeded.
   1349         ppos.setIndex(close + (posix ? 2 : 1));
   1350     }
   1351 
   1352     return *this;
   1353 }
   1354 
   1355 /**
   1356  * Parse a property pattern.
   1357  * @param chars iterator over the pattern characters.  Upon return
   1358  * it will be advanced to the first character after the parsed
   1359  * pattern, or the end of the iteration if all characters are
   1360  * parsed.
   1361  * @param rebuiltPat the pattern that was parsed, rebuilt or
   1362  * copied from the input pattern, as appropriate.
   1363  */
   1364 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
   1365                                       UnicodeString& rebuiltPat,
   1366                                       UErrorCode& ec) {
   1367     if (U_FAILURE(ec)) return;
   1368     UnicodeString pattern;
   1369     chars.lookahead(pattern);
   1370     ParsePosition pos(0);
   1371     applyPropertyPattern(pattern, pos, ec);
   1372     if (U_FAILURE(ec)) return;
   1373     if (pos.getIndex() == 0) {
   1374         // syntaxError(chars, "Invalid property pattern");
   1375         ec = U_MALFORMED_SET;
   1376         return;
   1377     }
   1378     chars.jumpahead(pos.getIndex());
   1379     rebuiltPat.append(pattern, 0, pos.getIndex());
   1380 }
   1381 
   1382 //----------------------------------------------------------------
   1383 // Case folding API
   1384 //----------------------------------------------------------------
   1385 
   1386 // add the result of a full case mapping to the set
   1387 // use str as a temporary string to avoid constructing one
   1388 static inline void
   1389 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {
   1390     if(result >= 0) {
   1391         if(result > UCASE_MAX_STRING_LENGTH) {
   1392             // add a single-code point case mapping
   1393             set.add(result);
   1394         } else {
   1395             // add a string case mapping from full with length result
   1396             str.setTo((UBool)FALSE, full, result);
   1397             set.add(str);
   1398         }
   1399     }
   1400     // result < 0: the code point mapped to itself, no need to add it
   1401     // see ucase.h
   1402 }
   1403 
   1404 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
   1405     if (isFrozen() || isBogus()) {
   1406         return *this;
   1407     }
   1408     if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
   1409         const UCaseProps *csp = ucase_getSingleton();
   1410         {
   1411             UnicodeSet foldSet(*this);
   1412             UnicodeString str;
   1413             USetAdder sa = {
   1414                 foldSet.toUSet(),
   1415                 _set_add,
   1416                 _set_addRange,
   1417                 _set_addString,
   1418                 NULL, // don't need remove()
   1419                 NULL // don't need removeRange()
   1420             };
   1421 
   1422             // start with input set to guarantee inclusion
   1423             // USET_CASE: remove strings because the strings will actually be reduced (folded);
   1424             //            therefore, start with no strings and add only those needed
   1425             if (attribute & USET_CASE_INSENSITIVE) {
   1426                 foldSet.strings->removeAllElements();
   1427             }
   1428 
   1429             int32_t n = getRangeCount();
   1430             UChar32 result;
   1431             const UChar *full;
   1432             int32_t locCache = 0;
   1433 
   1434             for (int32_t i=0; i<n; ++i) {
   1435                 UChar32 start = getRangeStart(i);
   1436                 UChar32 end   = getRangeEnd(i);
   1437 
   1438                 if (attribute & USET_CASE_INSENSITIVE) {
   1439                     // full case closure
   1440                     for (UChar32 cp=start; cp<=end; ++cp) {
   1441                         ucase_addCaseClosure(csp, cp, &sa);
   1442                     }
   1443                 } else {
   1444                     // add case mappings
   1445                     // (does not add long s for regular s, or Kelvin for k, for example)
   1446                     for (UChar32 cp=start; cp<=end; ++cp) {
   1447                         result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
   1448                         addCaseMapping(foldSet, result, full, str);
   1449 
   1450                         result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
   1451                         addCaseMapping(foldSet, result, full, str);
   1452 
   1453                         result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
   1454                         addCaseMapping(foldSet, result, full, str);
   1455 
   1456                         result = ucase_toFullFolding(csp, cp, &full, 0);
   1457                         addCaseMapping(foldSet, result, full, str);
   1458                     }
   1459                 }
   1460             }
   1461             if (strings != NULL && strings->size() > 0) {
   1462                 if (attribute & USET_CASE_INSENSITIVE) {
   1463                     for (int32_t j=0; j<strings->size(); ++j) {
   1464                         str = *(const UnicodeString *) strings->elementAt(j);
   1465                         str.foldCase();
   1466                         if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
   1467                             foldSet.add(str); // does not map to code points: add the folded string itself
   1468                         }
   1469                     }
   1470                 } else {
   1471                     Locale root("");
   1472 #if !UCONFIG_NO_BREAK_ITERATION
   1473                     UErrorCode status = U_ZERO_ERROR;
   1474                     BreakIterator *bi = BreakIterator::createWordInstance(root, status);
   1475                     if (U_SUCCESS(status)) {
   1476 #endif
   1477                         const UnicodeString *pStr;
   1478 
   1479                         for (int32_t j=0; j<strings->size(); ++j) {
   1480                             pStr = (const UnicodeString *) strings->elementAt(j);
   1481                             (str = *pStr).toLower(root);
   1482                             foldSet.add(str);
   1483 #if !UCONFIG_NO_BREAK_ITERATION
   1484                             (str = *pStr).toTitle(bi, root);
   1485                             foldSet.add(str);
   1486 #endif
   1487                             (str = *pStr).toUpper(root);
   1488                             foldSet.add(str);
   1489                             (str = *pStr).foldCase();
   1490                             foldSet.add(str);
   1491                         }
   1492 #if !UCONFIG_NO_BREAK_ITERATION
   1493                     }
   1494                     delete bi;
   1495 #endif
   1496                 }
   1497             }
   1498             *this = foldSet;
   1499         }
   1500     }
   1501     return *this;
   1502 }
   1503 
   1504 U_NAMESPACE_END
   1505