Home | History | Annotate | Download | only in intltest
      1 /*
      2 ********************************************************************************
      3 *   Copyright (C) 1999-2010 International Business Machines Corporation and
      4 *   others. All Rights Reserved.
      5 ********************************************************************************
      6 *   Date        Name        Description
      7 *   10/20/99    alan        Creation.
      8 *   03/22/2000  Madhu       Added additional tests
      9 ********************************************************************************
     10 */
     11 
     12 #include <stdio.h>
     13 
     14 #include <string.h>
     15 #include "unicode/utypes.h"
     16 #include "usettest.h"
     17 #include "unicode/ucnv.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/usetiter.h"
     21 #include "unicode/ustring.h"
     22 #include "unicode/parsepos.h"
     23 #include "unicode/symtable.h"
     24 #include "unicode/uversion.h"
     25 #include "hash.h"
     26 
     27 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     28 
     29 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
     30     dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
     31     u_errorName(status));}}
     32 
     33 #define TEST_ASSERT(expr) {if (!(expr)) { \
     34     dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
     35 
     36 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
     37     UnicodeString pat;
     38     set.toPattern(pat);
     39     return left + UnicodeSetTest::escape(pat);
     40 }
     41 
     42 #define CASE(id,test) case id:                          \
     43                           name = #test;                 \
     44                           if (exec) {                   \
     45                               logln(#test "---");       \
     46                               logln();                  \
     47                               test();                   \
     48                           }                             \
     49                           break
     50 
     51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
     52 }
     53 
     54 UConverter *UnicodeSetTest::openUTF8Converter() {
     55     if(utf8Cnv==NULL) {
     56         UErrorCode errorCode=U_ZERO_ERROR;
     57         utf8Cnv=ucnv_open("UTF-8", &errorCode);
     58     }
     59     return utf8Cnv;
     60 }
     61 
     62 UnicodeSetTest::~UnicodeSetTest() {
     63     ucnv_close(utf8Cnv);
     64 }
     65 
     66 void
     67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     68                                const char* &name, char* /*par*/) {
     69     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
     70     switch (index) {
     71         CASE(0,TestPatterns);
     72         CASE(1,TestAddRemove);
     73         CASE(2,TestCategories);
     74         CASE(3,TestCloneEqualHash);
     75         CASE(4,TestMinimalRep);
     76         CASE(5,TestAPI);
     77         CASE(6,TestScriptSet);
     78         CASE(7,TestPropertySet);
     79         CASE(8,TestClone);
     80         CASE(9,TestExhaustive);
     81         CASE(10,TestToPattern);
     82         CASE(11,TestIndexOf);
     83         CASE(12,TestStrings);
     84         CASE(13,Testj2268);
     85         CASE(14,TestCloseOver);
     86         CASE(15,TestEscapePattern);
     87         CASE(16,TestInvalidCodePoint);
     88         CASE(17,TestSymbolTable);
     89         CASE(18,TestSurrogate);
     90         CASE(19,TestPosixClasses);
     91         CASE(20,TestIteration);
     92         CASE(21,TestFreezable);
     93         CASE(22,TestSpan);
     94         CASE(23,TestStringSpan);
     95         default: name = ""; break;
     96     }
     97 }
     98 
     99 static const char NOT[] = "%%%%";
    100 
    101 /**
    102  * UVector was improperly copying contents
    103  * This code will crash this is still true
    104  */
    105 void UnicodeSetTest::Testj2268() {
    106   UnicodeSet t;
    107   t.add(UnicodeString("abc"));
    108   UnicodeSet test(t);
    109   UnicodeString ustrPat;
    110   test.toPattern(ustrPat, TRUE);
    111 }
    112 
    113 /**
    114  * Test toPattern().
    115  */
    116 void UnicodeSetTest::TestToPattern() {
    117     UErrorCode ec = U_ZERO_ERROR;
    118 
    119     // Test that toPattern() round trips with syntax characters and
    120     // whitespace.
    121     {
    122         static const char* OTHER_TOPATTERN_TESTS[] = {
    123             "[[:latin:]&[:greek:]]",
    124             "[[:latin:]-[:greek:]]",
    125             "[:nonspacing mark:]",
    126             NULL
    127         };
    128 
    129         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
    130             ec = U_ZERO_ERROR;
    131             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
    132             if (U_FAILURE(ec)) {
    133                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
    134                 continue;
    135             }
    136             checkPat(OTHER_TOPATTERN_TESTS[j], s);
    137         }
    138 
    139         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
    140             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
    141 
    142                 // check various combinations to make sure they all work.
    143                 if (i != 0 && !toPatternAux(i, i)){
    144                     continue;
    145                 }
    146                 if (!toPatternAux(0, i)){
    147                     continue;
    148                 }
    149                 if (!toPatternAux(i, 0xFFFF)){
    150                     continue;
    151                 }
    152             }
    153         }
    154     }
    155 
    156     // Test pattern behavior of multicharacter strings.
    157     {
    158         ec = U_ZERO_ERROR;
    159         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
    160 
    161         // This loop isn't a loop.  It's here to make the compiler happy.
    162         // If you're curious, try removing it and changing the 'break'
    163         // statements (except for the last) to goto's.
    164         for (;;) {
    165             if (U_FAILURE(ec)) break;
    166             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
    167             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
    168 
    169             s->add("ac");
    170             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
    171             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
    172 
    173             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
    174             if (U_FAILURE(ec)) break;
    175             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
    176             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
    177 
    178             s->add("[]");
    179             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
    180             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
    181 
    182             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
    183             if (U_FAILURE(ec)) break;
    184             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
    185             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
    186 
    187             // j2189
    188             s->clear();
    189             s->add(UnicodeString("abc", ""));
    190             s->add(UnicodeString("abc", ""));
    191             const char* exp6[] = {"abc", NOT, "ab", NULL};
    192             expectToPattern(*s, "[{abc}]", exp6);
    193 
    194             break;
    195         }
    196 
    197         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
    198         delete s;
    199     }
    200 
    201     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
    202     UnicodeSet s;
    203     s.add((UChar)97, (UChar)98); // 'a', 'b'
    204     expectToPattern(s, "[ab]", NULL);
    205 }
    206 
    207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
    208 
    209     // use Integer.toString because Utility.hex doesn't handle ints
    210     UnicodeString pat = "";
    211     // TODO do these in hex
    212     //String source = "0x" + Integer.toString(start,16).toUpperCase();
    213     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
    214     UnicodeString source;
    215     source = source + (uint32_t)start;
    216     if (start != end)
    217         source = source + ".." + (uint32_t)end;
    218     UnicodeSet testSet;
    219     testSet.add(start, end);
    220     return checkPat(source, testSet);
    221 }
    222 
    223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
    224                                const UnicodeSet& testSet) {
    225     // What we want to make sure of is that a pattern generated
    226     // by toPattern(), with or without escaped unprintables, can
    227     // be passed back into the UnicodeSet constructor.
    228     UnicodeString pat0;
    229 
    230     testSet.toPattern(pat0, TRUE);
    231 
    232     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
    233 
    234     //String pat1 = unescapeLeniently(pat0);
    235     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
    236 
    237     UnicodeString pat2;
    238     testSet.toPattern(pat2, FALSE);
    239     if (!checkPat(source, testSet, pat2)) return FALSE;
    240 
    241     //String pat3 = unescapeLeniently(pat2);
    242     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
    243 
    244     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
    245     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
    246     return TRUE;
    247 }
    248 
    249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
    250                                const UnicodeSet& testSet,
    251                                const UnicodeString& pat) {
    252     UErrorCode ec = U_ZERO_ERROR;
    253     UnicodeSet testSet2(pat, ec);
    254     if (testSet2 != testSet) {
    255         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
    256         return FALSE;
    257     }
    258     return TRUE;
    259 }
    260 
    261 void
    262 UnicodeSetTest::TestPatterns(void) {
    263     UnicodeSet set;
    264     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
    265     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
    266     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
    267     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
    268     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
    269     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
    270 
    271     // Throw in a test of complement
    272     set.complement();
    273     UnicodeString exp;
    274     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
    275     expectPairs(set, exp);
    276 }
    277 
    278 void
    279 UnicodeSetTest::TestCategories(void) {
    280     UErrorCode status = U_ZERO_ERROR;
    281     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
    282     UnicodeSet set(pat, status);
    283     if (U_FAILURE(status)) {
    284         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
    285         return;
    286     } else {
    287         expectContainment(set, pat, "ABC", "abc");
    288     }
    289 
    290     UChar32 i;
    291     int32_t failures = 0;
    292     // Make sure generation of L doesn't pollute cached Lu set
    293     // First generate L, then Lu
    294     set.applyPattern("[:L:]", status);
    295     if (U_FAILURE(status)) { errln("FAIL"); return; }
    296     for (i=0; i<0x200; ++i) {
    297         UBool l = u_isalpha((UChar)i);
    298         if (l != set.contains(i)) {
    299             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
    300                   set.contains(i));
    301             if (++failures == 10) break;
    302         }
    303     }
    304 
    305     set.applyPattern("[:Lu:]", status);
    306     if (U_FAILURE(status)) { errln("FAIL"); return; }
    307     for (i=0; i<0x200; ++i) {
    308         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
    309         if (lu != set.contains(i)) {
    310             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
    311                   set.contains(i));
    312             if (++failures == 20) break;
    313         }
    314     }
    315 }
    316 void
    317 UnicodeSetTest::TestCloneEqualHash(void) {
    318     UErrorCode status = U_ZERO_ERROR;
    319     // set1 and set2 used to be built with the obsolete constructor taking
    320     // UCharCategory values; replaced with pattern constructors
    321     // markus 20030502
    322     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
    323     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
    324     if (U_FAILURE(status)){
    325         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
    326         return;
    327     }
    328     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
    329     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
    330     if (U_FAILURE(status)){
    331         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
    332         return;
    333     }
    334 
    335     if (*set1 != *set1a) {
    336         errln("FAIL: category constructor for Ll broken");
    337     }
    338     if (*set2 != *set2a) {
    339         errln("FAIL: category constructor for Nd broken");
    340     }
    341     delete set1a;
    342     delete set2a;
    343 
    344     logln("Testing copy construction");
    345     UnicodeSet *set1copy=new UnicodeSet(*set1);
    346     if(*set1 != *set1copy || *set1 == *set2 ||
    347         getPairs(*set1) != getPairs(*set1copy) ||
    348         set1->hashCode() != set1copy->hashCode()){
    349         errln("FAIL : Error in copy construction");
    350         return;
    351     }
    352 
    353     logln("Testing =operator");
    354     UnicodeSet set1equal=*set1;
    355     UnicodeSet set2equal=*set2;
    356     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
    357         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
    358         errln("FAIL: Error in =operator");
    359     }
    360 
    361     logln("Testing clone()");
    362     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
    363     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
    364     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
    365         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
    366         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
    367         errln("FAIL: Error in clone");
    368     }
    369 
    370     logln("Testing hashcode");
    371     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
    372         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
    373         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
    374         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
    375         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
    376         errln("FAIL: Error in hashCode()");
    377     }
    378 
    379     delete set1;
    380     delete set1copy;
    381     delete set2;
    382     delete set1clone;
    383     delete set2clone;
    384 
    385 
    386 }
    387 void
    388 UnicodeSetTest::TestAddRemove(void) {
    389     UnicodeSet set; // Construct empty set
    390     doAssert(set.isEmpty() == TRUE, "set should be empty");
    391     doAssert(set.size() == 0, "size should be 0");
    392     set.complement();
    393     doAssert(set.size() == 0x110000, "size should be 0x110000");
    394     set.clear();
    395     set.add(0x0061, 0x007a);
    396     expectPairs(set, "az");
    397     doAssert(set.isEmpty() == FALSE, "set should not be empty");
    398     doAssert(set.size() != 0, "size should not be equal to 0");
    399     doAssert(set.size() == 26, "size should be equal to 26");
    400     set.remove(0x006d, 0x0070);
    401     expectPairs(set, "alqz");
    402     doAssert(set.size() == 22, "size should be equal to 22");
    403     set.remove(0x0065, 0x0067);
    404     expectPairs(set, "adhlqz");
    405     doAssert(set.size() == 19, "size should be equal to 19");
    406     set.remove(0x0064, 0x0069);
    407     expectPairs(set, "acjlqz");
    408     doAssert(set.size() == 16, "size should be equal to 16");
    409     set.remove(0x0063, 0x0072);
    410     expectPairs(set, "absz");
    411     doAssert(set.size() == 10, "size should be equal to 10");
    412     set.add(0x0066, 0x0071);
    413     expectPairs(set, "abfqsz");
    414     doAssert(set.size() == 22, "size should be equal to 22");
    415     set.remove(0x0061, 0x0067);
    416     expectPairs(set, "hqsz");
    417     set.remove(0x0061, 0x007a);
    418     expectPairs(set, "");
    419     doAssert(set.isEmpty() == TRUE, "set should be empty");
    420     doAssert(set.size() == 0, "size should be 0");
    421     set.add(0x0061);
    422     doAssert(set.isEmpty() == FALSE, "set should not be empty");
    423     doAssert(set.size() == 1, "size should not be equal to 1");
    424     set.add(0x0062);
    425     set.add(0x0063);
    426     expectPairs(set, "ac");
    427     doAssert(set.size() == 3, "size should not be equal to 3");
    428     set.add(0x0070);
    429     set.add(0x0071);
    430     expectPairs(set, "acpq");
    431     doAssert(set.size() == 5, "size should not be equal to 5");
    432     set.clear();
    433     expectPairs(set, "");
    434     doAssert(set.isEmpty() == TRUE, "set should be empty");
    435     doAssert(set.size() == 0, "size should be 0");
    436 
    437     // Try removing an entire set from another set
    438     expectPattern(set, "[c-x]", "cx");
    439     UnicodeSet set2;
    440     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
    441     set.removeAll(set2);
    442     expectPairs(set, "deluxx");
    443 
    444     // Try adding an entire set to another set
    445     expectPattern(set, "[jackiemclean]", "aacceein");
    446     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
    447     set.addAll(set2);
    448     expectPairs(set, "aacehort");
    449     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
    450 
    451     // Try retaining an set of elements contained in another set (intersection)
    452     UnicodeSet set3;
    453     expectPattern(set3, "[a-c]", "ac");
    454     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
    455     set3.remove(0x0062);
    456     expectPairs(set3, "aacc");
    457     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
    458     set.retainAll(set3);
    459     expectPairs(set, "aacc");
    460     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
    461     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
    462     set.clear();
    463     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
    464 
    465     // Test commutativity
    466     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
    467     expectPattern(set2, "[jackiemclean]", "aacceein");
    468     set.addAll(set2);
    469     expectPairs(set, "aacehort");
    470     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
    471 
    472 
    473 
    474 
    475 }
    476 
    477 /**
    478  * Make sure minimal representation is maintained.
    479  */
    480 void UnicodeSetTest::TestMinimalRep() {
    481     UErrorCode status = U_ZERO_ERROR;
    482     // This is pretty thoroughly tested by checkCanonicalRep()
    483     // run against the exhaustive operation results.  Use the code
    484     // here for debugging specific spot problems.
    485 
    486     // 1 overlap against 2
    487     UnicodeSet set("[h-km-q]", status);
    488     if (U_FAILURE(status)) { errln("FAIL"); return; }
    489     UnicodeSet set2("[i-o]", status);
    490     if (U_FAILURE(status)) { errln("FAIL"); return; }
    491     set.addAll(set2);
    492     expectPairs(set, "hq");
    493     // right
    494     set.applyPattern("[a-m]", status);
    495     if (U_FAILURE(status)) { errln("FAIL"); return; }
    496     set2.applyPattern("[e-o]", status);
    497     if (U_FAILURE(status)) { errln("FAIL"); return; }
    498     set.addAll(set2);
    499     expectPairs(set, "ao");
    500     // left
    501     set.applyPattern("[e-o]", status);
    502     if (U_FAILURE(status)) { errln("FAIL"); return; }
    503     set2.applyPattern("[a-m]", status);
    504     if (U_FAILURE(status)) { errln("FAIL"); return; }
    505     set.addAll(set2);
    506     expectPairs(set, "ao");
    507     // 1 overlap against 3
    508     set.applyPattern("[a-eg-mo-w]", status);
    509     if (U_FAILURE(status)) { errln("FAIL"); return; }
    510     set2.applyPattern("[d-q]", status);
    511     if (U_FAILURE(status)) { errln("FAIL"); return; }
    512     set.addAll(set2);
    513     expectPairs(set, "aw");
    514 }
    515 
    516 void UnicodeSetTest::TestAPI() {
    517     UErrorCode status = U_ZERO_ERROR;
    518     // default ct
    519     UnicodeSet set;
    520     if (!set.isEmpty() || set.getRangeCount() != 0) {
    521         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
    522               set);
    523     }
    524 
    525     // clear(), isEmpty()
    526     set.add(0x0061);
    527     if (set.isEmpty()) {
    528         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
    529               set);
    530     }
    531     set.clear();
    532     if (!set.isEmpty()) {
    533         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
    534               set);
    535     }
    536 
    537     // size()
    538     set.clear();
    539     if (set.size() != 0) {
    540         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
    541               ": " + set);
    542     }
    543     set.add(0x0061);
    544     if (set.size() != 1) {
    545         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
    546               ": " + set);
    547     }
    548     set.add(0x0031, 0x0039);
    549     if (set.size() != 10) {
    550         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
    551               ": " + set);
    552     }
    553 
    554     // contains(first, last)
    555     set.clear();
    556     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
    557     if (U_FAILURE(status)) { errln("FAIL"); return; }
    558     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
    559         UChar32 a = set.getRangeStart(i);
    560         UChar32 b = set.getRangeEnd(i);
    561         if (!set.contains(a, b)) {
    562             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
    563                   " but doesn't: " + set);
    564         }
    565         if (set.contains((UChar32)(a-1), b)) {
    566             errln((UnicodeString)"FAIL, shouldn't contain " +
    567                   (unsigned short)(a-1) + '-' + (unsigned short)b +
    568                   " but does: " + set);
    569         }
    570         if (set.contains(a, (UChar32)(b+1))) {
    571             errln((UnicodeString)"FAIL, shouldn't contain " +
    572                   (unsigned short)a + '-' + (unsigned short)(b+1) +
    573                   " but does: " + set);
    574         }
    575     }
    576 
    577     // Ported InversionList test.
    578     UnicodeSet a((UChar32)3,(UChar32)10);
    579     UnicodeSet b((UChar32)7,(UChar32)15);
    580     UnicodeSet c;
    581 
    582     logln((UnicodeString)"a [3-10]: " + a);
    583     logln((UnicodeString)"b [7-15]: " + b);
    584     c = a;
    585     c.addAll(b);
    586     UnicodeSet exp((UChar32)3,(UChar32)15);
    587     if (c == exp) {
    588         logln((UnicodeString)"c.set(a).add(b): " + c);
    589     } else {
    590         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
    591     }
    592     c.complement();
    593     exp.set((UChar32)0, (UChar32)2);
    594     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
    595     if (c == exp) {
    596         logln((UnicodeString)"c.complement(): " + c);
    597     } else {
    598         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    599     }
    600     c.complement();
    601     exp.set((UChar32)3, (UChar32)15);
    602     if (c == exp) {
    603         logln((UnicodeString)"c.complement(): " + c);
    604     } else {
    605         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    606     }
    607     c = a;
    608     c.complementAll(b);
    609     exp.set((UChar32)3,(UChar32)6);
    610     exp.add((UChar32)11,(UChar32) 15);
    611     if (c == exp) {
    612         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
    613     } else {
    614         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
    615     }
    616 
    617     exp = c;
    618     bitsToSet(setToBits(c), c);
    619     if (c == exp) {
    620         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
    621     } else {
    622         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
    623     }
    624 
    625     // Additional tests for coverage JB#2118
    626     //UnicodeSet::complement(class UnicodeString const &)
    627     //UnicodeSet::complementAll(class UnicodeString const &)
    628     //UnicodeSet::containsNone(class UnicodeSet const &)
    629     //UnicodeSet::containsNone(long,long)
    630     //UnicodeSet::containsSome(class UnicodeSet const &)
    631     //UnicodeSet::containsSome(long,long)
    632     //UnicodeSet::removeAll(class UnicodeString const &)
    633     //UnicodeSet::retain(long)
    634     //UnicodeSet::retainAll(class UnicodeString const &)
    635     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
    636     //UnicodeSetIterator::getString(void)
    637     set.clear();
    638     set.complement("ab");
    639     exp.applyPattern("[{ab}]", status);
    640     if (U_FAILURE(status)) { errln("FAIL"); return; }
    641     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
    642 
    643     UnicodeSetIterator iset(set);
    644     if (!iset.next() || !iset.isString()) {
    645         errln("FAIL: UnicodeSetIterator::next/isString");
    646     } else if (iset.getString() != "ab") {
    647         errln("FAIL: UnicodeSetIterator::getString");
    648     }
    649 
    650     set.add((UChar32)0x61, (UChar32)0x7A);
    651     set.complementAll("alan");
    652     exp.applyPattern("[{ab}b-kmo-z]", status);
    653     if (U_FAILURE(status)) { errln("FAIL"); return; }
    654     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
    655 
    656     exp.applyPattern("[a-z]", status);
    657     if (U_FAILURE(status)) { errln("FAIL"); return; }
    658     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    659     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    660     exp.applyPattern("[aln]", status);
    661     if (U_FAILURE(status)) { errln("FAIL"); return; }
    662     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    663     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    664 
    665     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
    666         errln("FAIL: containsNone(UChar32, UChar32)");
    667     }
    668     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
    669         errln("FAIL: containsSome(UChar32, UChar32)");
    670     }
    671     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
    672         errln("FAIL: containsNone(UChar32, UChar32)");
    673     }
    674     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
    675         errln("FAIL: containsSome(UChar32, UChar32)");
    676     }
    677 
    678     set.removeAll("liu");
    679     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
    680     if (U_FAILURE(status)) { errln("FAIL"); return; }
    681     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
    682 
    683     set.retainAll("star");
    684     exp.applyPattern("[rst]", status);
    685     if (U_FAILURE(status)) { errln("FAIL"); return; }
    686     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
    687 
    688     set.retain((UChar32)0x73);
    689     exp.applyPattern("[s]", status);
    690     if (U_FAILURE(status)) { errln("FAIL"); return; }
    691     if (set != exp) { errln("FAIL: retain('s')"); return; }
    692 
    693     uint16_t buf[32];
    694     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
    695     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
    696     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
    697         errln("FAIL: serialize");
    698         return;
    699     }
    700 
    701     // Conversions to and from USet
    702     UnicodeSet *uniset = &set;
    703     USet *uset = uniset->toUSet();
    704     TEST_ASSERT((void *)uset == (void *)uniset);
    705     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
    706     TEST_ASSERT((void *)setx == (void *)uset);
    707     const UnicodeSet *constSet = uniset;
    708     const USet *constUSet = constSet->toUSet();
    709     TEST_ASSERT((void *)constUSet == (void *)constSet);
    710     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
    711     TEST_ASSERT((void *)constSetx == (void *)constUSet);
    712 
    713     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
    714     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
    715     UnicodeSet ac(0x61, 0x63);
    716     ac.remove(0x62).freeze();
    717     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
    718         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
    719         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
    720         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
    721         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
    722         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
    723         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
    724         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
    725         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
    726         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
    727     ) {
    728         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
    729     }
    730     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
    731         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
    732         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
    733         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
    734         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
    735         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
    736         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
    737         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
    738         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
    739         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
    740     ) {
    741         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
    742     }
    743 }
    744 
    745 void UnicodeSetTest::TestIteration() {
    746     UErrorCode ec = U_ZERO_ERROR;
    747     int i = 0;
    748     int outerLoop;
    749 
    750     // 6 code points, 3 ranges, 2 strings, 8 total elements
    751     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
    752     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
    753     TEST_ASSERT_SUCCESS(ec);
    754     UnicodeSetIterator it(set);
    755 
    756     for (outerLoop=0; outerLoop<3; outerLoop++) {
    757         // Run the test multiple times, to check that iterator.reset() is working.
    758         for (i=0; i<10; i++) {
    759             UBool         nextv        = it.next();
    760             UBool         isString     = it.isString();
    761             int32_t       codePoint    = it.getCodepoint();
    762             //int32_t       codePointEnd = it.getCodepointEnd();
    763             UnicodeString s   = it.getString();
    764             switch (i) {
    765             case 0:
    766                 TEST_ASSERT(nextv == TRUE);
    767                 TEST_ASSERT(isString == FALSE);
    768                 TEST_ASSERT(codePoint==0x61);
    769                 TEST_ASSERT(s == "a");
    770                 break;
    771             case 1:
    772                 TEST_ASSERT(nextv == TRUE);
    773                 TEST_ASSERT(isString == FALSE);
    774                 TEST_ASSERT(codePoint==0x62);
    775                 TEST_ASSERT(s == "b");
    776                 break;
    777             case 2:
    778                 TEST_ASSERT(nextv == TRUE);
    779                 TEST_ASSERT(isString == FALSE);
    780                 TEST_ASSERT(codePoint==0x63);
    781                 TEST_ASSERT(s == "c");
    782                 break;
    783             case 3:
    784                 TEST_ASSERT(nextv == TRUE);
    785                 TEST_ASSERT(isString == FALSE);
    786                 TEST_ASSERT(codePoint==0x79);
    787                 TEST_ASSERT(s == "y");
    788                 break;
    789             case 4:
    790                 TEST_ASSERT(nextv == TRUE);
    791                 TEST_ASSERT(isString == FALSE);
    792                 TEST_ASSERT(codePoint==0x7a);
    793                 TEST_ASSERT(s == "z");
    794                 break;
    795             case 5:
    796                 TEST_ASSERT(nextv == TRUE);
    797                 TEST_ASSERT(isString == FALSE);
    798                 TEST_ASSERT(codePoint==0x1abcd);
    799                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
    800                 break;
    801             case 6:
    802                 TEST_ASSERT(nextv == TRUE);
    803                 TEST_ASSERT(isString == TRUE);
    804                 TEST_ASSERT(s == "str1");
    805                 break;
    806             case 7:
    807                 TEST_ASSERT(nextv == TRUE);
    808                 TEST_ASSERT(isString == TRUE);
    809                 TEST_ASSERT(s == "str2");
    810                 break;
    811             case 8:
    812                 TEST_ASSERT(nextv == FALSE);
    813                 break;
    814             case 9:
    815                 TEST_ASSERT(nextv == FALSE);
    816                 break;
    817             }
    818         }
    819         it.reset();  // prepare to run the iteration again.
    820     }
    821 }
    822 
    823 
    824 
    825 
    826 void UnicodeSetTest::TestStrings() {
    827     UErrorCode ec = U_ZERO_ERROR;
    828 
    829     UnicodeSet* testList[] = {
    830         UnicodeSet::createFromAll("abc"),
    831         new UnicodeSet("[a-c]", ec),
    832 
    833         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
    834         new UnicodeSet("[{ll}{ch}a-z]", ec),
    835 
    836         UnicodeSet::createFrom("ab}c"),
    837         new UnicodeSet("[{ab\\}c}]", ec),
    838 
    839         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
    840         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
    841 
    842         NULL
    843     };
    844 
    845     if (U_FAILURE(ec)) {
    846         errln("FAIL: couldn't construct test sets");
    847     }
    848 
    849     for (int32_t i = 0; testList[i] != NULL; i+=2) {
    850         if (U_SUCCESS(ec)) {
    851             UnicodeString pat0, pat1;
    852             testList[i]->toPattern(pat0, TRUE);
    853             testList[i+1]->toPattern(pat1, TRUE);
    854             if (*testList[i] == *testList[i+1]) {
    855                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
    856             } else {
    857                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
    858             }
    859         }
    860         delete testList[i];
    861         delete testList[i+1];
    862     }
    863 }
    864 
    865 /**
    866  * Test the [:Latin:] syntax.
    867  */
    868 void UnicodeSetTest::TestScriptSet() {
    869     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
    870 
    871     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
    872 
    873     /* Jitterbug 1423 */
    874     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
    875 
    876 }
    877 
    878 /**
    879  * Test the [:Latin:] syntax.
    880  */
    881 void UnicodeSetTest::TestPropertySet() {
    882     static const char* const DATA[] = {
    883         // Pattern, Chars IN, Chars NOT in
    884 
    885         "[:Latin:]",
    886         "aA",
    887         "\\u0391\\u03B1",
    888 
    889         "[\\p{Greek}]",
    890         "\\u0391\\u03B1",
    891         "aA",
    892 
    893         "\\P{ GENERAL Category = upper case letter }",
    894         "abc",
    895         "ABC",
    896 
    897 #if !UCONFIG_NO_NORMALIZATION
    898         // Combining class: @since ICU 2.2
    899         // Check both symbolic and numeric
    900         "\\p{ccc=Nukta}",
    901         "\\u0ABC",
    902         "abc",
    903 
    904         "\\p{Canonical Combining Class = 11}",
    905         "\\u05B1",
    906         "\\u05B2",
    907 
    908         "[:c c c = iota subscript :]",
    909         "\\u0345",
    910         "xyz",
    911 #endif
    912 
    913         // Bidi class: @since ICU 2.2
    914         "\\p{bidiclass=lefttoright}",
    915         "abc",
    916         "\\u0671\\u0672",
    917 
    918         // Binary properties: @since ICU 2.2
    919         "\\p{ideographic}",
    920         "\\u4E0A",
    921         "x",
    922 
    923         "[:math=false:]",
    924         "q)*(",
    925         // weiv: )(and * were removed from math in Unicode 4.0.1
    926         //"(*+)",
    927         "+<>^",
    928 
    929         // JB#1767 \N{}, \p{ASCII}
    930         "[:Ascii:]",
    931         "abc\\u0000\\u007F",
    932         "\\u0080\\u4E00",
    933 
    934         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
    935         "az",
    936         "qrs",
    937 
    938         // JB#2015
    939         "[:any:]",
    940         "a\\U0010FFFF",
    941         "",
    942 
    943         "[:nv=0.5:]",
    944         "\\u00BD\\u0F2A",
    945         "\\u00BC",
    946 
    947         // JB#2653: Age
    948         "[:Age=1.1:]",
    949         "\\u03D6", // 1.1
    950         "\\u03D8\\u03D9", // 3.2
    951 
    952         "[:Age=3.1:]",
    953         "\\u1800\\u3400\\U0002f800",
    954         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
    955 
    956         // JB#2350: Case_Sensitive
    957         "[:Case Sensitive:]",
    958         "A\\u1FFC\\U00010410",
    959         ";\\u00B4\\U00010500",
    960 
    961         // JB#2832: C99-compatibility props
    962         "[:blank:]",
    963         " \\u0009",
    964         "1-9A-Z",
    965 
    966         "[:graph:]",
    967         "19AZ",
    968         " \\u0003\\u0007\\u0009\\u000A\\u000D",
    969 
    970         "[:punct:]",
    971         "!@#%&*()[]{}-_\\/;:,.?'\"",
    972         "09azAZ",
    973 
    974         "[:xdigit:]",
    975         "09afAF",
    976         "gG!",
    977 
    978         // Regex compatibility test
    979         "[-b]", // leading '-' is literal
    980         "-b",
    981         "ac",
    982 
    983         "[^-b]", // leading '-' is literal
    984         "ac",
    985         "-b",
    986 
    987         "[b-]", // trailing '-' is literal
    988         "-b",
    989         "ac",
    990 
    991         "[^b-]", // trailing '-' is literal
    992         "ac",
    993         "-b",
    994 
    995         "[a-b-]", // trailing '-' is literal
    996         "ab-",
    997         "c=",
    998 
    999         "[[a-q]&[p-z]-]", // trailing '-' is literal
   1000         "pq-",
   1001         "or=",
   1002 
   1003         "[\\s|\\)|:|$|\\>]", // from regex tests
   1004         "s|):$>",
   1005         "abc",
   1006 
   1007         "[\\uDC00cd]", // JB#2906: isolated trail at start
   1008         "cd\\uDC00",
   1009         "ab\\uD800\\U00010000",
   1010 
   1011         "[ab\\uD800]", // JB#2906: isolated trail at start
   1012         "ab\\uD800",
   1013         "cd\\uDC00\\U00010000",
   1014 
   1015         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
   1016         "abcd\\uD800",
   1017         "ef\\uDC00\\U00010000",
   1018 
   1019         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
   1020         "abcd\\uDC00",
   1021         "ef\\uD800\\U00010000",
   1022 
   1023 #if !UCONFIG_NO_NORMALIZATION
   1024         "[:^lccc=0:]", // Lead canonical class
   1025         "\\u0300\\u0301",
   1026         "abcd\\u00c0\\u00c5",
   1027 
   1028         "[:^tccc=0:]", // Trail canonical class
   1029         "\\u0300\\u0301\\u00c0\\u00c5",
   1030         "abcd",
   1031 
   1032         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
   1033         "\\u0300\\u0301\\u00c0\\u00c5",
   1034         "abcd",
   1035 
   1036         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
   1037         "",
   1038         "abcd\\u0300\\u0301\\u00c0\\u00c5",
   1039 
   1040         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
   1041         "\\u0F73\\u0F75\\u0F81",
   1042         "abcd\\u0300\\u0301\\u00c0\\u00c5",
   1043 #endif /* !UCONFIG_NO_NORMALIZATION */
   1044 
   1045         "[:Assigned:]",
   1046         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
   1047         "\\u0888\\uFDD3\\uFFFE\\U00050005",
   1048 
   1049         // Script_Extensions, new in Unicode 6.0
   1050         "[:scx=Arab:]",
   1051         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
   1052         "\\u061D\\u065F\\uFDEF\\uFDFE",
   1053 
   1054         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
   1055         // so scx-sc is missing U+FDF2.
   1056         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
   1057         "\\u0640\\u064B\\u0650\\u0655\\uFDFD",
   1058         "\\uFDF2"
   1059     };
   1060 
   1061     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
   1062 
   1063     for (int32_t i=0; i<DATA_LEN; i+=3) {
   1064         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
   1065                           CharsToUnicodeString(DATA[i+2]));
   1066     }
   1067 }
   1068 
   1069 /**
   1070   * Test that Posix style character classes [:digit:], etc.
   1071   *   have the Unicode definitions from TR 18.
   1072   */
   1073 void UnicodeSetTest::TestPosixClasses() {
   1074     {
   1075         UErrorCode status = U_ZERO_ERROR;
   1076         UnicodeSet s1("[:alpha:]", status);
   1077         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
   1078         TEST_ASSERT_SUCCESS(status);
   1079         TEST_ASSERT(s1==s2);
   1080     }
   1081     {
   1082         UErrorCode status = U_ZERO_ERROR;
   1083         UnicodeSet s1("[:lower:]", status);
   1084         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
   1085         TEST_ASSERT_SUCCESS(status);
   1086         TEST_ASSERT(s1==s2);
   1087     }
   1088     {
   1089         UErrorCode status = U_ZERO_ERROR;
   1090         UnicodeSet s1("[:upper:]", status);
   1091         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
   1092         TEST_ASSERT_SUCCESS(status);
   1093         TEST_ASSERT(s1==s2);
   1094     }
   1095     {
   1096         UErrorCode status = U_ZERO_ERROR;
   1097         UnicodeSet s1("[:punct:]", status);
   1098         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
   1099         TEST_ASSERT_SUCCESS(status);
   1100         TEST_ASSERT(s1==s2);
   1101     }
   1102     {
   1103         UErrorCode status = U_ZERO_ERROR;
   1104         UnicodeSet s1("[:digit:]", status);
   1105         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
   1106         TEST_ASSERT_SUCCESS(status);
   1107         TEST_ASSERT(s1==s2);
   1108     }
   1109     {
   1110         UErrorCode status = U_ZERO_ERROR;
   1111         UnicodeSet s1("[:xdigit:]", status);
   1112         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
   1113         TEST_ASSERT_SUCCESS(status);
   1114         TEST_ASSERT(s1==s2);
   1115     }
   1116     {
   1117         UErrorCode status = U_ZERO_ERROR;
   1118         UnicodeSet s1("[:alnum:]", status);
   1119         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
   1120         TEST_ASSERT_SUCCESS(status);
   1121         TEST_ASSERT(s1==s2);
   1122     }
   1123     {
   1124         UErrorCode status = U_ZERO_ERROR;
   1125         UnicodeSet s1("[:space:]", status);
   1126         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
   1127         TEST_ASSERT_SUCCESS(status);
   1128         TEST_ASSERT(s1==s2);
   1129     }
   1130     {
   1131         UErrorCode status = U_ZERO_ERROR;
   1132         UnicodeSet s1("[:blank:]", status);
   1133         TEST_ASSERT_SUCCESS(status);
   1134         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
   1135             status);
   1136         TEST_ASSERT_SUCCESS(status);
   1137         TEST_ASSERT(s1==s2);
   1138     }
   1139     {
   1140         UErrorCode status = U_ZERO_ERROR;
   1141         UnicodeSet s1("[:cntrl:]", status);
   1142         TEST_ASSERT_SUCCESS(status);
   1143         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
   1144         TEST_ASSERT_SUCCESS(status);
   1145         TEST_ASSERT(s1==s2);
   1146     }
   1147     {
   1148         UErrorCode status = U_ZERO_ERROR;
   1149         UnicodeSet s1("[:graph:]", status);
   1150         TEST_ASSERT_SUCCESS(status);
   1151         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
   1152         TEST_ASSERT_SUCCESS(status);
   1153         TEST_ASSERT(s1==s2);
   1154     }
   1155     {
   1156         UErrorCode status = U_ZERO_ERROR;
   1157         UnicodeSet s1("[:print:]", status);
   1158         TEST_ASSERT_SUCCESS(status);
   1159         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
   1160         TEST_ASSERT_SUCCESS(status);
   1161         TEST_ASSERT(s1==s2);
   1162     }
   1163 }
   1164 /**
   1165  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
   1166  */
   1167 void UnicodeSetTest::TestClone() {
   1168     UErrorCode ec = U_ZERO_ERROR;
   1169     UnicodeSet s("[abcxyz]", ec);
   1170     UnicodeSet t(s);
   1171     expectContainment(t, "abc", "def");
   1172 }
   1173 
   1174 /**
   1175  * Test the indexOf() and charAt() methods.
   1176  */
   1177 void UnicodeSetTest::TestIndexOf() {
   1178     UErrorCode ec = U_ZERO_ERROR;
   1179     UnicodeSet set("[a-cx-y3578]", ec);
   1180     if (U_FAILURE(ec)) {
   1181         errln("FAIL: UnicodeSet constructor");
   1182         return;
   1183     }
   1184     for (int32_t i=0; i<set.size(); ++i) {
   1185         UChar32 c = set.charAt(i);
   1186         if (set.indexOf(c) != i) {
   1187             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
   1188                 i, c, set.indexOf(c));
   1189         }
   1190     }
   1191     UChar32 c = set.charAt(set.size());
   1192     if (c != -1) {
   1193         errln("FAIL: charAt(<out of range>) = %X", c);
   1194     }
   1195     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
   1196     if (j != -1) {
   1197         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
   1198     }
   1199 }
   1200 
   1201 /**
   1202  * Test closure API.
   1203  */
   1204 void UnicodeSetTest::TestCloseOver() {
   1205     UErrorCode ec = U_ZERO_ERROR;
   1206 
   1207     char CASE[] = {(char)USET_CASE_INSENSITIVE};
   1208     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
   1209     const char* DATA[] = {
   1210         // selector, input, output
   1211         CASE,
   1212         "[aq\\u00DF{Bc}{bC}{Fi}]",
   1213         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
   1214 
   1215         CASE,
   1216         "[\\u01F1]", // 'DZ'
   1217         "[\\u01F1\\u01F2\\u01F3]",
   1218 
   1219         CASE,
   1220         "[\\u1FB4]",
   1221         "[\\u1FB4{\\u03AC\\u03B9}]",
   1222 
   1223         CASE,
   1224         "[{F\\uFB01}]",
   1225         "[\\uFB03{ffi}]",
   1226 
   1227         CASE, // make sure binary search finds limits
   1228         "[a\\uFF3A]",
   1229         "[aA\\uFF3A\\uFF5A]",
   1230 
   1231         CASE,
   1232         "[a-z]","[A-Za-z\\u017F\\u212A]",
   1233         CASE,
   1234         "[abc]","[A-Ca-c]",
   1235         CASE,
   1236         "[ABC]","[A-Ca-c]",
   1237 
   1238         CASE, "[i]", "[iI]",
   1239 
   1240         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
   1241         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
   1242 
   1243         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
   1244 
   1245         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
   1246 
   1247         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
   1248 
   1249         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
   1250 
   1251         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
   1252 
   1253         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
   1254 
   1255         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
   1256         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
   1257 
   1258         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
   1259 
   1260         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
   1261 
   1262         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
   1263 
   1264 #if !UCONFIG_NO_FILE_IO
   1265         CASE_MAPPINGS,
   1266         "[aq\\u00DF{Bc}{bC}{Fi}]",
   1267         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
   1268 #endif
   1269 
   1270         CASE_MAPPINGS,
   1271         "[\\u01F1]", // 'DZ'
   1272         "[\\u01F1\\u01F2\\u01F3]",
   1273 
   1274         CASE_MAPPINGS,
   1275         "[a-z]",
   1276         "[A-Za-z]",
   1277 
   1278         NULL
   1279     };
   1280 
   1281     UnicodeSet s;
   1282     UnicodeSet t;
   1283     UnicodeString buf;
   1284     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
   1285         int32_t selector = DATA[i][0];
   1286         UnicodeString pat(DATA[i+1], -1, US_INV);
   1287         UnicodeString exp(DATA[i+2], -1, US_INV);
   1288         s.applyPattern(pat, ec);
   1289         s.closeOver(selector);
   1290         t.applyPattern(exp, ec);
   1291         if (U_FAILURE(ec)) {
   1292             errln("FAIL: applyPattern failed");
   1293             continue;
   1294         }
   1295         if (s == t) {
   1296             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
   1297         } else {
   1298             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
   1299                   s.toPattern(buf, TRUE) + ", expected " + exp);
   1300         }
   1301     }
   1302 
   1303 #if 0
   1304     /*
   1305      * Unused test code.
   1306      * This was used to compare the old implementation (using USET_CASE)
   1307      * with the new one (using 0x100 temporarily)
   1308      * while transitioning from hardcoded case closure tables in uniset.cpp
   1309      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
   1310      * and using ucase.c functions for closure.
   1311      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
   1312      *
   1313      * Note: The old and new implementation never fully matched because
   1314      * the old implementation turned out to not map U+0130 and U+0131 correctly
   1315      * (dotted I and dotless i) and because the old implementation's data tables
   1316      * were outdated compared to Unicode 4.0.1 at the time of the change to the
   1317      * new implementation. (So sigmas and some other characters were not handled
   1318      * according to the newer Unicode version.)
   1319      */
   1320     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
   1321     UnicodeSetIterator si(sens);
   1322     UnicodeString str, buf2;
   1323     const UnicodeString *pStr;
   1324     UChar32 c;
   1325     while(si.next()) {
   1326         if(!si.isString()) {
   1327             c=si.getCodepoint();
   1328             s.clear();
   1329             s.add(c);
   1330 
   1331             str.setTo(c);
   1332             str.foldCase();
   1333             sens2.add(str);
   1334 
   1335             t=s;
   1336             s.closeOver(USET_CASE);
   1337             t.closeOver(0x100);
   1338             if(s!=t) {
   1339                 errln("FAIL: closeOver(U+%04x) differs: ", c);
   1340                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
   1341             }
   1342         }
   1343     }
   1344     // remove all code points
   1345     // should contain all full case folding mapping strings
   1346     sens2.remove(0, 0x10ffff);
   1347     si.reset(sens2);
   1348     while(si.next()) {
   1349         if(si.isString()) {
   1350             pStr=&si.getString();
   1351             s.clear();
   1352             s.add(*pStr);
   1353             t=s2=s;
   1354             s.closeOver(USET_CASE);
   1355             t.closeOver(0x100);
   1356             if(s!=t) {
   1357                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
   1358                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
   1359             }
   1360         }
   1361     }
   1362 #endif
   1363 
   1364     // Test the pattern API
   1365     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
   1366     if (U_FAILURE(ec)) {
   1367         errln("FAIL: applyPattern failed");
   1368     } else {
   1369         expectContainment(s, "abcABC", "defDEF");
   1370     }
   1371     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
   1372     if (U_FAILURE(ec)) {
   1373         errln("FAIL: constructor failed");
   1374     } else {
   1375         expectContainment(v, "defDEF", "abcABC");
   1376     }
   1377     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
   1378     if (U_FAILURE(ec)) {
   1379         errln("FAIL: construct w/case mappings failed");
   1380     } else {
   1381         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
   1382     }
   1383 }
   1384 
   1385 void UnicodeSetTest::TestEscapePattern() {
   1386     const char pattern[] =
   1387         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
   1388     const char exp[] =
   1389         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
   1390     // We test this with two passes; in the second pass we
   1391     // pre-unescape the pattern.  Since U+200E is rule whitespace,
   1392     // this fails -- which is what we expect.
   1393     for (int32_t pass=1; pass<=2; ++pass) {
   1394         UErrorCode ec = U_ZERO_ERROR;
   1395         UnicodeString pat(pattern, -1, US_INV);
   1396         if (pass==2) {
   1397             pat = pat.unescape();
   1398         }
   1399         // Pattern is only good for pass 1
   1400         UBool isPatternValid = (pass==1);
   1401 
   1402         UnicodeSet set(pat, ec);
   1403         if (U_SUCCESS(ec) != isPatternValid){
   1404             errln((UnicodeString)"FAIL: applyPattern(" +
   1405                   escape(pat) + ") => " +
   1406                   u_errorName(ec));
   1407             continue;
   1408         }
   1409         if (U_FAILURE(ec)) {
   1410             continue;
   1411         }
   1412         if (set.contains((UChar)0x0644)){
   1413             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
   1414         }
   1415 
   1416         UnicodeString newpat;
   1417         set.toPattern(newpat, TRUE);
   1418         if (newpat == UnicodeString(exp, -1, US_INV)) {
   1419             logln(escape(pat) + " => " + newpat);
   1420         } else {
   1421             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
   1422         }
   1423 
   1424         for (int32_t i=0; i<set.getRangeCount(); ++i) {
   1425             UnicodeString str("Range ");
   1426             str.append((UChar)(0x30 + i))
   1427                 .append(": ")
   1428                 .append((UChar32)set.getRangeStart(i))
   1429                 .append(" - ")
   1430                 .append((UChar32)set.getRangeEnd(i));
   1431             str = str + " (" + set.getRangeStart(i) + " - " +
   1432                 set.getRangeEnd(i) + ")";
   1433             if (set.getRangeStart(i) < 0) {
   1434                 errln((UnicodeString)"FAIL: " + escape(str));
   1435             } else {
   1436                 logln(escape(str));
   1437             }
   1438         }
   1439     }
   1440 }
   1441 
   1442 void UnicodeSetTest::expectRange(const UnicodeString& label,
   1443                                  const UnicodeSet& set,
   1444                                  UChar32 start, UChar32 end) {
   1445     UnicodeSet exp(start, end);
   1446     UnicodeString pat;
   1447     if (set == exp) {
   1448         logln(label + " => " + set.toPattern(pat, TRUE));
   1449     } else {
   1450         UnicodeString xpat;
   1451         errln((UnicodeString)"FAIL: " + label + " => " +
   1452               set.toPattern(pat, TRUE) +
   1453               ", expected " + exp.toPattern(xpat, TRUE));
   1454     }
   1455 }
   1456 
   1457 void UnicodeSetTest::TestInvalidCodePoint() {
   1458 
   1459     const UChar32 DATA[] = {
   1460         // Test range             Expected range
   1461         0, 0x10FFFF,              0, 0x10FFFF,
   1462         (UChar32)-1, 8,           0, 8,
   1463         8, 0x110000,              8, 0x10FFFF
   1464     };
   1465     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
   1466 
   1467     UnicodeString pat;
   1468     int32_t i;
   1469 
   1470     for (i=0; i<DATA_LENGTH; i+=4) {
   1471         UChar32 start  = DATA[i];
   1472         UChar32 end    = DATA[i+1];
   1473         UChar32 xstart = DATA[i+2];
   1474         UChar32 xend   = DATA[i+3];
   1475 
   1476         // Try various API using the test code points
   1477 
   1478         UnicodeSet set(start, end);
   1479         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
   1480                     set, xstart, xend);
   1481 
   1482         set.clear();
   1483         set.set(start, end);
   1484         expectRange((UnicodeString)"set(" + start + "," + end + ")",
   1485                     set, xstart, xend);
   1486 
   1487         UBool b = set.contains(start);
   1488         b = set.contains(start, end);
   1489         b = set.containsNone(start, end);
   1490         b = set.containsSome(start, end);
   1491 
   1492         /*int32_t index = set.indexOf(start);*/
   1493 
   1494         set.clear();
   1495         set.add(start);
   1496         set.add(start, end);
   1497         expectRange((UnicodeString)"add(" + start + "," + end + ")",
   1498                     set, xstart, xend);
   1499 
   1500         set.set(0, 0x10FFFF);
   1501         set.retain(start, end);
   1502         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
   1503                     set, xstart, xend);
   1504         set.retain(start);
   1505 
   1506         set.set(0, 0x10FFFF);
   1507         set.remove(start);
   1508         set.remove(start, end);
   1509         set.complement();
   1510         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
   1511                     set, xstart, xend);
   1512 
   1513         set.set(0, 0x10FFFF);
   1514         set.complement(start, end);
   1515         set.complement();
   1516         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
   1517                     set, xstart, xend);
   1518         set.complement(start);
   1519     }
   1520 
   1521     const UChar32 DATA2[] = {
   1522         0,
   1523         0x10FFFF,
   1524         (UChar32)-1,
   1525         0x110000
   1526     };
   1527     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
   1528 
   1529     for (i=0; i<DATA2_LENGTH; ++i) {
   1530         UChar32 c = DATA2[i], end = 0x10FFFF;
   1531         UBool valid = (c >= 0 && c <= 0x10FFFF);
   1532 
   1533         UnicodeSet set(0, 0x10FFFF);
   1534 
   1535         // For single-codepoint contains, invalid codepoints are NOT contained
   1536         UBool b = set.contains(c);
   1537         if (b == valid) {
   1538             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
   1539                   ") = " + b);
   1540         } else {
   1541             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
   1542                   ") = " + b);
   1543         }
   1544 
   1545         // For codepoint range contains, containsNone, and containsSome,
   1546         // invalid or empty (start > end) ranges have UNDEFINED behavior.
   1547         b = set.contains(c, end);
   1548         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
   1549               "," + end + ") = " + b);
   1550 
   1551         b = set.containsNone(c, end);
   1552         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
   1553               "," + end + ") = " + b);
   1554 
   1555         b = set.containsSome(c, end);
   1556         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
   1557               "," + end + ") = " + b);
   1558 
   1559         int32_t index = set.indexOf(c);
   1560         if ((index >= 0) == valid) {
   1561             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
   1562                   ") = " + index);
   1563         } else {
   1564             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
   1565                   ") = " + index);
   1566         }
   1567     }
   1568 }
   1569 
   1570 // Used by TestSymbolTable
   1571 class TokenSymbolTable : public SymbolTable {
   1572 public:
   1573     Hashtable contents;
   1574 
   1575     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
   1576         contents.setValueDeleter(uhash_deleteUnicodeString);
   1577     }
   1578 
   1579     ~TokenSymbolTable() {}
   1580 
   1581     /**
   1582      * (Non-SymbolTable API) Add the given variable and value to
   1583      * the table.  Variable should NOT contain leading '$'.
   1584      */
   1585     void add(const UnicodeString& var, const UnicodeString& value,
   1586              UErrorCode& ec) {
   1587         if (U_SUCCESS(ec)) {
   1588             contents.put(var, new UnicodeString(value), ec);
   1589         }
   1590     }
   1591 
   1592     /**
   1593      * SymbolTable API
   1594      */
   1595     virtual const UnicodeString* lookup(const UnicodeString& s) const {
   1596         return (const UnicodeString*) contents.get(s);
   1597     }
   1598 
   1599     /**
   1600      * SymbolTable API
   1601      */
   1602     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
   1603         return NULL;
   1604     }
   1605 
   1606     /**
   1607      * SymbolTable API
   1608      */
   1609     virtual UnicodeString parseReference(const UnicodeString& text,
   1610                                          ParsePosition& pos, int32_t limit) const {
   1611         int32_t start = pos.getIndex();
   1612         int32_t i = start;
   1613         UnicodeString result;
   1614         while (i < limit) {
   1615             UChar c = text.charAt(i);
   1616             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
   1617                 break;
   1618             }
   1619             ++i;
   1620         }
   1621         if (i == start) { // No valid name chars
   1622             return result; // Indicate failure with empty string
   1623         }
   1624         pos.setIndex(i);
   1625         text.extractBetween(start, i, result);
   1626         return result;
   1627     }
   1628 };
   1629 
   1630 void UnicodeSetTest::TestSymbolTable() {
   1631     // Multiple test cases can be set up here.  Each test case
   1632     // is terminated by null:
   1633     // var, value, var, value,..., input pat., exp. output pat., null
   1634     const char* DATA[] = {
   1635         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
   1636         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
   1637         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
   1638         NULL
   1639     };
   1640 
   1641     for (int32_t i=0; DATA[i]!=NULL; ++i) {
   1642         UErrorCode ec = U_ZERO_ERROR;
   1643         TokenSymbolTable sym(ec);
   1644         if (U_FAILURE(ec)) {
   1645             errln("FAIL: couldn't construct TokenSymbolTable");
   1646             continue;
   1647         }
   1648 
   1649         // Set up variables
   1650         while (DATA[i+2] != NULL) {
   1651             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
   1652             if (U_FAILURE(ec)) {
   1653                 errln("FAIL: couldn't add to TokenSymbolTable");
   1654                 continue;
   1655             }
   1656             i += 2;
   1657         }
   1658 
   1659         // Input pattern and expected output pattern
   1660         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
   1661         i += 2;
   1662 
   1663         ParsePosition pos(0);
   1664         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
   1665         if (U_FAILURE(ec)) {
   1666             errln("FAIL: couldn't construct UnicodeSet");
   1667             continue;
   1668         }
   1669 
   1670         // results
   1671         if (pos.getIndex() != inpat.length()) {
   1672             errln((UnicodeString)"Failed to read to end of string \""
   1673                   + inpat + "\": read to "
   1674                   + pos.getIndex() + ", length is "
   1675                   + inpat.length());
   1676         }
   1677 
   1678         UnicodeSet us2(exppat, ec);
   1679         if (U_FAILURE(ec)) {
   1680             errln("FAIL: couldn't construct expected UnicodeSet");
   1681             continue;
   1682         }
   1683 
   1684         UnicodeString a, b;
   1685         if (us != us2) {
   1686             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
   1687                   ", expected " + us2.toPattern(b, TRUE));
   1688         } else {
   1689             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
   1690         }
   1691     }
   1692 }
   1693 
   1694 void UnicodeSetTest::TestSurrogate() {
   1695     const char* DATA[] = {
   1696         // These should all behave identically
   1697         "[abc\\uD800\\uDC00]",
   1698         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
   1699         "[abc\\U00010000]",
   1700         0
   1701     };
   1702     for (int i=0; DATA[i] != 0; ++i) {
   1703         UErrorCode ec = U_ZERO_ERROR;
   1704         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
   1705         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
   1706         UnicodeSet set(str, ec);
   1707         if (U_FAILURE(ec)) {
   1708             errln("FAIL: UnicodeSet constructor");
   1709             continue;
   1710         }
   1711         expectContainment(set,
   1712                           CharsToUnicodeString("abc\\U00010000"),
   1713                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
   1714         if (set.size() != 4) {
   1715             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
   1716                   set.size() + ", expected 4");
   1717         }
   1718     }
   1719 }
   1720 
   1721 void UnicodeSetTest::TestExhaustive() {
   1722     // exhaustive tests. Simulate UnicodeSets with integers.
   1723     // That gives us very solid tests (except for large memory tests).
   1724 
   1725     int32_t limit = 128;
   1726 
   1727     UnicodeSet x, y, z, aa;
   1728 
   1729     for (int32_t i = 0; i < limit; ++i) {
   1730         bitsToSet(i, x);
   1731         logln((UnicodeString)"Testing " + i + ", " + x);
   1732         _testComplement(i, x, y);
   1733 
   1734         // AS LONG AS WE ARE HERE, check roundtrip
   1735         checkRoundTrip(bitsToSet(i, aa));
   1736 
   1737         for (int32_t j = 0; j < limit; ++j) {
   1738             _testAdd(i,j,  x,y,z);
   1739             _testXor(i,j,  x,y,z);
   1740             _testRetain(i,j,  x,y,z);
   1741             _testRemove(i,j,  x,y,z);
   1742         }
   1743     }
   1744 }
   1745 
   1746 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
   1747     bitsToSet(a, x);
   1748     z = x;
   1749     z.complement();
   1750     int32_t c = setToBits(z);
   1751     if (c != (~a)) {
   1752         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
   1753         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
   1754     }
   1755     checkCanonicalRep(z, (UnicodeString)"complement " + a);
   1756 }
   1757 
   1758 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1759     bitsToSet(a, x);
   1760     bitsToSet(b, y);
   1761     z = x;
   1762     z.addAll(y);
   1763     int32_t c = setToBits(z);
   1764     if (c != (a | b)) {
   1765         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
   1766         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
   1767     }
   1768     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
   1769 }
   1770 
   1771 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1772     bitsToSet(a, x);
   1773     bitsToSet(b, y);
   1774     z = x;
   1775     z.retainAll(y);
   1776     int32_t c = setToBits(z);
   1777     if (c != (a & b)) {
   1778         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
   1779         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
   1780     }
   1781     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
   1782 }
   1783 
   1784 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1785     bitsToSet(a, x);
   1786     bitsToSet(b, y);
   1787     z = x;
   1788     z.removeAll(y);
   1789     int32_t c = setToBits(z);
   1790     if (c != (a &~ b)) {
   1791         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
   1792         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
   1793     }
   1794     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
   1795 }
   1796 
   1797 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1798     bitsToSet(a, x);
   1799     bitsToSet(b, y);
   1800     z = x;
   1801     z.complementAll(y);
   1802     int32_t c = setToBits(z);
   1803     if (c != (a ^ b)) {
   1804         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
   1805         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
   1806     }
   1807     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
   1808 }
   1809 
   1810 /**
   1811  * Check that ranges are monotonically increasing and non-
   1812  * overlapping.
   1813  */
   1814 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
   1815     int32_t n = set.getRangeCount();
   1816     if (n < 0) {
   1817         errln((UnicodeString)"FAIL result of " + msg +
   1818               ": range count should be >= 0 but is " +
   1819               n /*+ " for " + set.toPattern())*/);
   1820         return;
   1821     }
   1822     UChar32 last = 0;
   1823     for (int32_t i=0; i<n; ++i) {
   1824         UChar32 start = set.getRangeStart(i);
   1825         UChar32 end = set.getRangeEnd(i);
   1826         if (start > end) {
   1827             errln((UnicodeString)"FAIL result of " + msg +
   1828                   ": range " + (i+1) +
   1829                   " start > end: " + (int)start + ", " + (int)end +
   1830                   " for " + set);
   1831         }
   1832         if (i > 0 && start <= last) {
   1833             errln((UnicodeString)"FAIL result of " + msg +
   1834                   ": range " + (i+1) +
   1835                   " overlaps previous range: " + (int)start + ", " + (int)end +
   1836                   " for " + set);
   1837         }
   1838         last = end;
   1839     }
   1840 }
   1841 
   1842 /**
   1843  * Convert a bitmask to a UnicodeSet.
   1844  */
   1845 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
   1846     result.clear();
   1847     for (UChar32 i = 0; i < 32; ++i) {
   1848         if ((a & (1<<i)) != 0) {
   1849             result.add(i);
   1850         }
   1851     }
   1852     return result;
   1853 }
   1854 
   1855 /**
   1856  * Convert a UnicodeSet to a bitmask.  Only the characters
   1857  * U+0000 to U+0020 are represented in the bitmask.
   1858  */
   1859 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
   1860     int32_t result = 0;
   1861     for (int32_t i = 0; i < 32; ++i) {
   1862         if (x.contains((UChar32)i)) {
   1863             result |= (1<<i);
   1864         }
   1865     }
   1866     return result;
   1867 }
   1868 
   1869 /**
   1870  * Return the representation of an inversion list based UnicodeSet
   1871  * as a pairs list.  Ranges are listed in ascending Unicode order.
   1872  * For example, the set [a-zA-M3] is represented as "33AMaz".
   1873  */
   1874 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
   1875     UnicodeString pairs;
   1876     for (int32_t i=0; i<set.getRangeCount(); ++i) {
   1877         UChar32 start = set.getRangeStart(i);
   1878         UChar32 end = set.getRangeEnd(i);
   1879         if (end > 0xFFFF) {
   1880             end = 0xFFFF;
   1881             i = set.getRangeCount(); // Should be unnecessary
   1882         }
   1883         pairs.append((UChar)start).append((UChar)end);
   1884     }
   1885     return pairs;
   1886 }
   1887 
   1888 /**
   1889  * Basic consistency check for a few items.
   1890  * That the iterator works, and that we can create a pattern and
   1891  * get the same thing back
   1892  */
   1893 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
   1894     UErrorCode ec = U_ZERO_ERROR;
   1895 
   1896     UnicodeSet t(s);
   1897     checkEqual(s, t, "copy ct");
   1898 
   1899     t = s;
   1900     checkEqual(s, t, "operator=");
   1901 
   1902     copyWithIterator(t, s, FALSE);
   1903     checkEqual(s, t, "iterator roundtrip");
   1904 
   1905     copyWithIterator(t, s, TRUE); // try range
   1906     checkEqual(s, t, "iterator roundtrip");
   1907 
   1908     UnicodeString pat; s.toPattern(pat, FALSE);
   1909     t.applyPattern(pat, ec);
   1910     if (U_FAILURE(ec)) {
   1911         errln("FAIL: applyPattern");
   1912         return;
   1913     } else {
   1914         checkEqual(s, t, "toPattern(false)");
   1915     }
   1916 
   1917     s.toPattern(pat, TRUE);
   1918     t.applyPattern(pat, ec);
   1919     if (U_FAILURE(ec)) {
   1920         errln("FAIL: applyPattern");
   1921         return;
   1922     } else {
   1923         checkEqual(s, t, "toPattern(true)");
   1924     }
   1925 }
   1926 
   1927 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
   1928     t.clear();
   1929     UnicodeSetIterator it(s);
   1930     if (withRange) {
   1931         while (it.nextRange()) {
   1932             if (it.isString()) {
   1933                 t.add(it.getString());
   1934             } else {
   1935                 t.add(it.getCodepoint(), it.getCodepointEnd());
   1936             }
   1937         }
   1938     } else {
   1939         while (it.next()) {
   1940             if (it.isString()) {
   1941                 t.add(it.getString());
   1942             } else {
   1943                 t.add(it.getCodepoint());
   1944             }
   1945         }
   1946     }
   1947 }
   1948 
   1949 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
   1950     UnicodeString source; s.toPattern(source, TRUE);
   1951     UnicodeString result; t.toPattern(result, TRUE);
   1952     if (s != t) {
   1953         errln((UnicodeString)"FAIL: " + message
   1954               + "; source = " + source
   1955               + "; result = " + result
   1956               );
   1957         return FALSE;
   1958     } else {
   1959         logln((UnicodeString)"Ok: " + message
   1960               + "; source = " + source
   1961               + "; result = " + result
   1962               );
   1963     }
   1964     return TRUE;
   1965 }
   1966 
   1967 void
   1968 UnicodeSetTest::expectContainment(const UnicodeString& pat,
   1969                                   const UnicodeString& charsIn,
   1970                                   const UnicodeString& charsOut) {
   1971     UErrorCode ec = U_ZERO_ERROR;
   1972     UnicodeSet set(pat, ec);
   1973     if (U_FAILURE(ec)) {
   1974         dataerrln((UnicodeString)"FAIL: pattern \"" +
   1975               pat + "\" => " + u_errorName(ec));
   1976         return;
   1977     }
   1978     expectContainment(set, pat, charsIn, charsOut);
   1979 }
   1980 
   1981 void
   1982 UnicodeSetTest::expectContainment(const UnicodeSet& set,
   1983                                   const UnicodeString& charsIn,
   1984                                   const UnicodeString& charsOut) {
   1985     UnicodeString pat;
   1986     set.toPattern(pat);
   1987     expectContainment(set, pat, charsIn, charsOut);
   1988 }
   1989 
   1990 void
   1991 UnicodeSetTest::expectContainment(const UnicodeSet& set,
   1992                                   const UnicodeString& setName,
   1993                                   const UnicodeString& charsIn,
   1994                                   const UnicodeString& charsOut) {
   1995     UnicodeString bad;
   1996     UChar32 c;
   1997     int32_t i;
   1998 
   1999     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
   2000         c = charsIn.char32At(i);
   2001         if (!set.contains(c)) {
   2002             bad.append(c);
   2003         }
   2004     }
   2005     if (bad.length() > 0) {
   2006         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
   2007               ", expected containment of " + prettify(charsIn));
   2008     } else {
   2009         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
   2010     }
   2011 
   2012     bad.truncate(0);
   2013     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
   2014         c = charsOut.char32At(i);
   2015         if (set.contains(c)) {
   2016             bad.append(c);
   2017         }
   2018     }
   2019     if (bad.length() > 0) {
   2020         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
   2021               ", expected non-containment of " + prettify(charsOut));
   2022     } else {
   2023         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
   2024     }
   2025 }
   2026 
   2027 void
   2028 UnicodeSetTest::expectPattern(UnicodeSet& set,
   2029                               const UnicodeString& pattern,
   2030                               const UnicodeString& expectedPairs){
   2031     UErrorCode status = U_ZERO_ERROR;
   2032     set.applyPattern(pattern, status);
   2033     if (U_FAILURE(status)) {
   2034         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
   2035               "\") failed");
   2036         return;
   2037     } else {
   2038         if (getPairs(set) != expectedPairs ) {
   2039             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
   2040                   "\") => pairs \"" +
   2041                   escape(getPairs(set)) + "\", expected \"" +
   2042                   escape(expectedPairs) + "\"");
   2043         } else {
   2044             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
   2045                   "\") => pairs \"" +
   2046                   escape(getPairs(set)) + "\"");
   2047         }
   2048     }
   2049     // the result of calling set.toPattern(), which is the string representation of
   2050     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
   2051     // will produce another set that is equal to this one.
   2052     UnicodeString temppattern;
   2053     set.toPattern(temppattern);
   2054     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
   2055     if (U_FAILURE(status)) {
   2056         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
   2057         return;
   2058     }
   2059     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
   2060         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
   2061             escape(getPairs(set)) + "\""));
   2062     } else{
   2063         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
   2064     }
   2065 
   2066     delete tempset;
   2067 
   2068 }
   2069 
   2070 void
   2071 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
   2072     if (getPairs(set) != expectedPairs) {
   2073         errln(UnicodeString("FAIL: Expected pair list \"") +
   2074               escape(expectedPairs) + "\", got \"" +
   2075               escape(getPairs(set)) + "\"");
   2076     }
   2077 }
   2078 
   2079 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
   2080                                      const UnicodeString& expPat,
   2081                                      const char** expStrings) {
   2082     UnicodeString pat;
   2083     set.toPattern(pat, TRUE);
   2084     if (pat == expPat) {
   2085         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
   2086     } else {
   2087         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
   2088         return;
   2089     }
   2090     if (expStrings == NULL) {
   2091         return;
   2092     }
   2093     UBool in = TRUE;
   2094     for (int32_t i=0; expStrings[i] != NULL; ++i) {
   2095         if (expStrings[i] == NOT) { // sic; pointer comparison
   2096             in = FALSE;
   2097             continue;
   2098         }
   2099         UnicodeString s = CharsToUnicodeString(expStrings[i]);
   2100         UBool contained = set.contains(s);
   2101         if (contained == in) {
   2102             logln((UnicodeString)"Ok: " + expPat +
   2103                   (contained ? " contains {" : " does not contain {") +
   2104                   escape(expStrings[i]) + "}");
   2105         } else {
   2106             errln((UnicodeString)"FAIL: " + expPat +
   2107                   (contained ? " contains {" : " does not contain {") +
   2108                   escape(expStrings[i]) + "}");
   2109         }
   2110     }
   2111 }
   2112 
   2113 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
   2114 
   2115 void
   2116 UnicodeSetTest::doAssert(UBool condition, const char *message)
   2117 {
   2118     if (!condition) {
   2119         errln(UnicodeString("ERROR : ") + message);
   2120     }
   2121 }
   2122 
   2123 UnicodeString
   2124 UnicodeSetTest::escape(const UnicodeString& s) {
   2125     UnicodeString buf;
   2126     for (int32_t i=0; i<s.length(); )
   2127     {
   2128         UChar32 c = s.char32At(i);
   2129         if (0x0020 <= c && c <= 0x007F) {
   2130             buf += c;
   2131         } else {
   2132             if (c <= 0xFFFF) {
   2133                 buf += (UChar)0x5c; buf += (UChar)0x75;
   2134             } else {
   2135                 buf += (UChar)0x5c; buf += (UChar)0x55;
   2136                 buf += toHexString((c & 0xF0000000) >> 28);
   2137                 buf += toHexString((c & 0x0F000000) >> 24);
   2138                 buf += toHexString((c & 0x00F00000) >> 20);
   2139                 buf += toHexString((c & 0x000F0000) >> 16);
   2140             }
   2141             buf += toHexString((c & 0xF000) >> 12);
   2142             buf += toHexString((c & 0x0F00) >> 8);
   2143             buf += toHexString((c & 0x00F0) >> 4);
   2144             buf += toHexString(c & 0x000F);
   2145         }
   2146         i += U16_LENGTH(c);
   2147     }
   2148     return buf;
   2149 }
   2150 
   2151 void UnicodeSetTest::TestFreezable() {
   2152     UErrorCode errorCode=U_ZERO_ERROR;
   2153     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
   2154     UnicodeSet idSet(idPattern, errorCode);
   2155     if(U_FAILURE(errorCode)) {
   2156         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
   2157         return;
   2158     }
   2159 
   2160     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
   2161     UnicodeSet wsSet(wsPattern, errorCode);
   2162     if(U_FAILURE(errorCode)) {
   2163         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
   2164         return;
   2165     }
   2166 
   2167     idSet.add(idPattern);
   2168     UnicodeSet frozen(idSet);
   2169     frozen.freeze();
   2170 
   2171     if(idSet.isFrozen() || !frozen.isFrozen()) {
   2172         errln("FAIL: isFrozen() is wrong");
   2173     }
   2174     if(frozen!=idSet || !(frozen==idSet)) {
   2175         errln("FAIL: a copy-constructed frozen set differs from its original");
   2176     }
   2177 
   2178     frozen=wsSet;
   2179     if(frozen!=idSet || !(frozen==idSet)) {
   2180         errln("FAIL: a frozen set was modified by operator=");
   2181     }
   2182 
   2183     UnicodeSet frozen2(frozen);
   2184     if(frozen2!=frozen || frozen2!=idSet) {
   2185         errln("FAIL: a copied frozen set differs from its frozen original");
   2186     }
   2187     if(!frozen2.isFrozen()) {
   2188         errln("FAIL: copy-constructing a frozen set results in a thawed one");
   2189     }
   2190     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
   2191     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
   2192         errln("FAIL: UnicodeSet(5, 55) failed");
   2193     }
   2194     frozen3=frozen;
   2195     if(!frozen3.isFrozen()) {
   2196         errln("FAIL: copying a frozen set results in a thawed one");
   2197     }
   2198 
   2199     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
   2200     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
   2201         errln("FAIL: clone() failed");
   2202     }
   2203     cloned->add(0xd802, 0xd805);
   2204     if(cloned->containsSome(0xd802, 0xd805)) {
   2205         errln("FAIL: unable to modify clone");
   2206     }
   2207     delete cloned;
   2208 
   2209     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
   2210     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
   2211         errln("FAIL: cloneAsThawed() failed");
   2212     }
   2213     thawed->add(0xd802, 0xd805);
   2214     if(!thawed->contains(0xd802, 0xd805)) {
   2215         errln("FAIL: unable to modify thawed clone");
   2216     }
   2217     delete thawed;
   2218 
   2219     frozen.set(5, 55);
   2220     if(frozen!=idSet || !(frozen==idSet)) {
   2221         errln("FAIL: UnicodeSet::set() modified a frozen set");
   2222     }
   2223 
   2224     frozen.clear();
   2225     if(frozen!=idSet || !(frozen==idSet)) {
   2226         errln("FAIL: UnicodeSet::clear() modified a frozen set");
   2227     }
   2228 
   2229     frozen.closeOver(USET_CASE_INSENSITIVE);
   2230     if(frozen!=idSet || !(frozen==idSet)) {
   2231         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
   2232     }
   2233 
   2234     frozen.compact();
   2235     if(frozen!=idSet || !(frozen==idSet)) {
   2236         errln("FAIL: UnicodeSet::compact() modified a frozen set");
   2237     }
   2238 
   2239     ParsePosition pos;
   2240     frozen.
   2241         applyPattern(wsPattern, errorCode).
   2242         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
   2243         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
   2244         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
   2245         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
   2246     if(frozen!=idSet || !(frozen==idSet)) {
   2247         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
   2248     }
   2249 
   2250     frozen.
   2251         add(0xd800).
   2252         add(0xd802, 0xd805).
   2253         add(wsPattern).
   2254         addAll(idPattern).
   2255         addAll(wsSet);
   2256     if(frozen!=idSet || !(frozen==idSet)) {
   2257         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
   2258     }
   2259 
   2260     frozen.
   2261         retain(0x62).
   2262         retain(0x64, 0x69).
   2263         retainAll(wsPattern).
   2264         retainAll(wsSet);
   2265     if(frozen!=idSet || !(frozen==idSet)) {
   2266         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
   2267     }
   2268 
   2269     frozen.
   2270         remove(0x62).
   2271         remove(0x64, 0x69).
   2272         remove(idPattern).
   2273         removeAll(idPattern).
   2274         removeAll(idSet);
   2275     if(frozen!=idSet || !(frozen==idSet)) {
   2276         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
   2277     }
   2278 
   2279     frozen.
   2280         complement().
   2281         complement(0x62).
   2282         complement(0x64, 0x69).
   2283         complement(idPattern).
   2284         complementAll(idPattern).
   2285         complementAll(idSet);
   2286     if(frozen!=idSet || !(frozen==idSet)) {
   2287         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
   2288     }
   2289 }
   2290 
   2291 // Test span() etc. -------------------------------------------------------- ***
   2292 
   2293 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
   2294 static int32_t
   2295 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
   2296     UErrorCode errorCode=U_ZERO_ERROR;
   2297     int32_t length8=0;
   2298     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
   2299     if(U_SUCCESS(errorCode)) {
   2300         return length8;
   2301     } else {
   2302         // The string contains an unpaired surrogate.
   2303         // Ignore this string.
   2304         return 0;
   2305     }
   2306 }
   2307 
   2308 class UnicodeSetWithStringsIterator;
   2309 
   2310 // Make the strings in a UnicodeSet easily accessible.
   2311 class UnicodeSetWithStrings {
   2312 public:
   2313     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
   2314             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
   2315         int32_t size=set.size();
   2316         if(size>0 && set.charAt(size-1)<0) {
   2317             // If a set's last element is not a code point, then it must contain strings.
   2318             // Iterate over the set, skip all code point ranges, and cache the strings.
   2319             // Convert them to UTF-8 for spanUTF8().
   2320             UnicodeSetIterator iter(set);
   2321             const UnicodeString *s;
   2322             char *s8=utf8;
   2323             int32_t length8, utf8Count=0;
   2324             while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
   2325                 if(iter.isString()) {
   2326                     // Store the pointer to the set's string element
   2327                     // which we happen to know is a stable pointer.
   2328                     strings[stringsLength]=s=&iter.getString();
   2329                     utf8Count+=
   2330                         utf8Lengths[stringsLength]=length8=
   2331                         appendUTF8(s->getBuffer(), s->length(),
   2332                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
   2333                     if(length8==0) {
   2334                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
   2335                     }
   2336                     s8+=length8;
   2337                     ++stringsLength;
   2338                 }
   2339             }
   2340         }
   2341     }
   2342 
   2343     const UnicodeSet &getSet() const {
   2344         return set;
   2345     }
   2346 
   2347     UBool hasStrings() const {
   2348         return (UBool)(stringsLength>0);
   2349     }
   2350 
   2351     UBool hasStringsWithSurrogates() const {
   2352         return hasSurrogates;
   2353     }
   2354 
   2355 private:
   2356     friend class UnicodeSetWithStringsIterator;
   2357 
   2358     const UnicodeSet &set;
   2359 
   2360     const UnicodeString *strings[20];
   2361     int32_t stringsLength;
   2362     UBool hasSurrogates;
   2363 
   2364     char utf8[1024];
   2365     int32_t utf8Lengths[20];
   2366 
   2367     int32_t nextStringIndex;
   2368     int32_t nextUTF8Start;
   2369 };
   2370 
   2371 class UnicodeSetWithStringsIterator {
   2372 public:
   2373     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
   2374             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
   2375     }
   2376 
   2377     void reset() {
   2378         nextStringIndex=nextUTF8Start=0;
   2379     }
   2380 
   2381     const UnicodeString *nextString() {
   2382         if(nextStringIndex<fSet.stringsLength) {
   2383             return fSet.strings[nextStringIndex++];
   2384         } else {
   2385             return NULL;
   2386         }
   2387     }
   2388 
   2389     // Do not mix with calls to nextString().
   2390     const char *nextUTF8(int32_t &length) {
   2391         if(nextStringIndex<fSet.stringsLength) {
   2392             const char *s8=fSet.utf8+nextUTF8Start;
   2393             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
   2394             return s8;
   2395         } else {
   2396             length=0;
   2397             return NULL;
   2398         }
   2399     }
   2400 
   2401 private:
   2402     const UnicodeSetWithStrings &fSet;
   2403     int32_t nextStringIndex;
   2404     int32_t nextUTF8Start;
   2405 };
   2406 
   2407 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
   2408 // at code point boundaries.
   2409 // That is, each edge of a match must not be in the middle of a surrogate pair.
   2410 static inline UBool
   2411 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
   2412     s+=start;
   2413     limit-=start;
   2414     int32_t length=t.length();
   2415     return 0==t.compare(s, length) &&
   2416            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
   2417            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
   2418 }
   2419 
// Implement span() with contains() for comparison.
//
// Computes a forward span over the UTF-16 text s[0..length) using only
// contains() on single code points and explicit comparisons with the set's strings.
// Returns the index at which the span stops (0..length).
//
// - USET_SPAN_NOT_CONTAINED: spans until a code point is in the set or
//   one of the set's strings matches at the current position.
// - USET_SPAN_CONTAINED: tries every combination of code point and string matches
//   (recursing for alternatives) and returns the farthest limit reached.
// - USET_SPAN_SIMPLE: at each position continues with the longest match only.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Without strings, CONTAINED and SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev is the start of the code point that is being tested.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; also stop if any string matches here.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit is the farthest limit found by the recursive calls.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "no match from start so far".
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        // The match reaches the end of the text: cannot span farther.
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                // After the swap, matchLimit holds the longer match.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iteration result and the best recursion result.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
   2518 
// Backward counterpart of containsSpanUTF16():
// computes a span that ends at s[length] and extends toward the start of the text,
// using only contains() on single code points and comparisons with the set's strings.
// Returns the index at which the backward span stops (0..length).
// Note that the length parameter is reused as the moving index;
// length0 (where present) keeps the original text length for matches16CPB().
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Without strings, CONTAINED and SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        // prev is the limit of the code point that is being tested.
        int32_t prev=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; also stop if any string ends at prev.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart is the smallest start index found by the recursive calls.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "no match ending at prev so far".
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        // The match reaches the start of the text: cannot span farther.
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                // After the swap, matchStart holds the longer match.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iteration result and the best recursion result.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
   2617 
// UTF-8 counterpart of containsSpanUTF16():
// computes a forward span over the UTF-8 text s[0..length) using only
// contains() on single code points and byte comparisons with the
// UTF-8 forms of the set's strings.
// Returns the byte index at which the span stops (0..length).
// A negative code point from U8_NEXT is treated as U+FFFD.
// Strings with a UTF-8 length of 0 are skipped
// (presumably strings that could not be converted to UTF-8).
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Without strings, CONTAINED and SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev is the start of the code point that is being tested.
        while((prev=start)<length) {
            U8_NEXT(s, start, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; also stop if any string matches here.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit is the farthest limit found by the recursive calls.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "no match from start so far".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        // The match reaches the end of the text: cannot span farther.
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                // After the swap, matchLimit holds the longer match.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iteration result and the best recursion result.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
   2726 
// Backward counterpart of containsSpanUTF8():
// computes a span that ends at s[length] and extends toward the start of the UTF-8 text,
// using only contains() on single code points and byte comparisons with the
// UTF-8 forms of the set's strings.
// Returns the byte index at which the backward span stops (0..length).
// Note that the length parameter is reused as the moving index.
// A negative code point from U8_PREV is treated as U+FFFD.
// Strings with a UTF-8 length of 0 are skipped
// (presumably strings that could not be converted to UTF-8).
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Without strings, CONTAINED and SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        // prev is the limit of the code point that is being tested.
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; also stop if any string ends at prev.
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart is the smallest start index found by the recursive calls.
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "no match ending at prev so far".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        // The match reaches the start of the text: cannot span farther.
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                // After the swap, matchStart holds the longer match.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iteration result and the best recursion result.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
   2836 
   2837 // spans to be performed and compared
   2838 enum {
   2839     SPAN_UTF16          =1,
   2840     SPAN_UTF8           =2,
   2841     SPAN_UTFS           =3,
   2842 
   2843     SPAN_SET            =4,
   2844     SPAN_COMPLEMENT     =8,
   2845     SPAN_POLARITY       =0xc,
   2846 
   2847     SPAN_FWD            =0x10,
   2848     SPAN_BACK           =0x20,
   2849     SPAN_DIRS           =0x30,
   2850 
   2851     SPAN_CONTAINED      =0x100,
   2852     SPAN_SIMPLE         =0x200,
   2853     SPAN_CONDITION      =0x300,
   2854 
   2855     SPAN_ALL            =0x33f
   2856 };
   2857 
   2858 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
   2859     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
   2860 }
   2861 
   2862 static inline int32_t slen(const void *s, UBool isUTF16) {
   2863     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
   2864 }
   2865 
   2866 /*
   2867  * Count spans on a string with the method according to type and set the span limits.
   2868  * The set may be the complement of the original.
   2869  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
   2870  * according to the expected number of spans.
   2871  * Sets typeName to an empty string if there is no such type.
   2872  * Returns -1 if the span option is filtered out.
   2873  */
   2874 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
   2875                         const void *s, int32_t length, UBool isUTF16,
   2876                         uint32_t whichSpans,
   2877                         int type, const char *&typeName,
   2878                         int32_t limits[], int32_t limitsCapacity,
   2879                         int32_t expectCount) {
   2880     const UnicodeSet &realSet(set.getSet());
   2881     int32_t start, count;
   2882     USetSpanCondition spanCondition, firstSpanCondition, contained;
   2883     UBool isForward;
   2884 
   2885     if(type<0 || 7<type) {
   2886         typeName="";
   2887         return 0;
   2888     }
   2889 
   2890     static const char *const typeNames16[]={
   2891         "contains", "contains(LM)",
   2892         "span", "span(LM)",
   2893         "containsBack", "containsBack(LM)",
   2894         "spanBack", "spanBack(LM)"
   2895     };
   2896 
   2897     static const char *const typeNames8[]={
   2898         "containsUTF8", "containsUTF8(LM)",
   2899         "spanUTF8", "spanUTF8(LM)",
   2900         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
   2901         "spanBackUTF8", "spanBackUTF8(LM)"
   2902     };
   2903 
   2904     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
   2905 
   2906     // filter span options
   2907     if(type<=3) {
   2908         // span forward
   2909         if((whichSpans&SPAN_FWD)==0) {
   2910             return -1;
   2911         }
   2912         isForward=TRUE;
   2913     } else {
   2914         // span backward
   2915         if((whichSpans&SPAN_BACK)==0) {
   2916             return -1;
   2917         }
   2918         isForward=FALSE;
   2919     }
   2920     if((type&1)==0) {
   2921         // use USET_SPAN_CONTAINED
   2922         if((whichSpans&SPAN_CONTAINED)==0) {
   2923             return -1;
   2924         }
   2925         contained=USET_SPAN_CONTAINED;
   2926     } else {
   2927         // use USET_SPAN_SIMPLE
   2928         if((whichSpans&SPAN_SIMPLE)==0) {
   2929             return -1;
   2930         }
   2931         contained=USET_SPAN_SIMPLE;
   2932     }
   2933 
   2934     // Default first span condition for going forward with an uncomplemented set.
   2935     spanCondition=USET_SPAN_NOT_CONTAINED;
   2936     if(isComplement) {
   2937         spanCondition=invertSpanCondition(spanCondition, contained);
   2938     }
   2939 
   2940     // First span condition for span(), used to terminate the spanBack() iteration.
   2941     firstSpanCondition=spanCondition;
   2942 
   2943     // spanBack(): Its initial span condition is span()'s last span condition,
   2944     // which is the opposite of span()'s first span condition
   2945     // if we expect an even number of spans.
   2946     // (The loop inverts spanCondition (expectCount-1) times
   2947     // before the expectCount'th span() call.)
   2948     // If we do not compare forward and backward directions, then we do not have an
   2949     // expectCount and just start with firstSpanCondition.
   2950     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
   2951         spanCondition=invertSpanCondition(spanCondition, contained);
   2952     }
   2953 
   2954     count=0;
   2955     switch(type) {
   2956     case 0:
   2957     case 1:
   2958         start=0;
   2959         if(length<0) {
   2960             length=slen(s, isUTF16);
   2961         }
   2962         for(;;) {
   2963             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
   2964                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
   2965             if(count<limitsCapacity) {
   2966                 limits[count]=start;
   2967             }
   2968             ++count;
   2969             if(start>=length) {
   2970                 break;
   2971             }
   2972             spanCondition=invertSpanCondition(spanCondition, contained);
   2973         }
   2974         break;
   2975     case 2:
   2976     case 3:
   2977         start=0;
   2978         for(;;) {
   2979             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
   2980                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
   2981             if(count<limitsCapacity) {
   2982                 limits[count]=start;
   2983             }
   2984             ++count;
   2985             if(length>=0 ? start>=length :
   2986                            isUTF16 ? ((const UChar *)s)[start]==0 :
   2987                                      ((const char *)s)[start]==0
   2988             ) {
   2989                 break;
   2990             }
   2991             spanCondition=invertSpanCondition(spanCondition, contained);
   2992         }
   2993         break;
   2994     case 4:
   2995     case 5:
   2996         if(length<0) {
   2997             length=slen(s, isUTF16);
   2998         }
   2999         for(;;) {
   3000             ++count;
   3001             if(count<=limitsCapacity) {
   3002                 limits[limitsCapacity-count]=length;
   3003             }
   3004             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
   3005                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
   3006             if(length==0 && spanCondition==firstSpanCondition) {
   3007                 break;
   3008             }
   3009             spanCondition=invertSpanCondition(spanCondition, contained);
   3010         }
   3011         if(count<limitsCapacity) {
   3012             memmove(limits, limits+(limitsCapacity-count), count*4);
   3013         }
   3014         break;
   3015     case 6:
   3016     case 7:
   3017         for(;;) {
   3018             ++count;
   3019             if(count<=limitsCapacity) {
   3020                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
   3021             }
   3022             // Note: Length<0 is tested only for the first spanBack().
   3023             // If we wanted to keep length<0 for all spanBack()s, we would have to
   3024             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
   3025             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
   3026                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
   3027             if(length==0 && spanCondition==firstSpanCondition) {
   3028                 break;
   3029             }
   3030             spanCondition=invertSpanCondition(spanCondition, contained);
   3031         }
   3032         if(count<limitsCapacity) {
   3033             memmove(limits, limits+(limitsCapacity-count), count*4);
   3034         }
   3035         break;
   3036     default:
   3037         typeName="";
   3038         return -1;
   3039     }
   3040 
   3041     return count;
   3042 }
   3043 
   3044 // sets to be tested; odd index=isComplement
   3045 enum {
   3046     SLOW,
   3047     SLOW_NOT,
   3048     FAST,
   3049     FAST_NOT,
   3050     SET_COUNT
   3051 };
   3052 
   3053 static const char *const setNames[SET_COUNT]={
   3054     "slow",
   3055     "slow.not",
   3056     "fast",
   3057     "fast.not"
   3058 };
   3059 
   3060 /*
   3061  * Verify that we get the same results whether we look at text with contains(),
   3062  * span() or spanBack(), using unfrozen or frozen versions of the set,
   3063  * and using the set or its complement (switching the spanConditions accordingly).
   3064  * The latter verifies that
   3065  *   set.span(spanCondition) == set.complement().span(!spanCondition).
   3066  *
   3067  * The expectLimits[] are either provided by the caller (with expectCount>=0)
   3068  * or returned to the caller (with an input expectCount<0).
   3069  */
   3070 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
   3071                               const void *s, int32_t length, UBool isUTF16,
   3072                               uint32_t whichSpans,
   3073                               int32_t expectLimits[], int32_t &expectCount,
   3074                               const char *testName, int32_t index) {
   3075     int32_t limits[500];
   3076     int32_t limitsCount;
   3077     int i, j;
   3078 
   3079     const char *typeName;
   3080     int type;
   3081 
   3082     for(i=0; i<SET_COUNT; ++i) {
   3083         if((i&1)==0) {
   3084             // Even-numbered sets are original, uncomplemented sets.
   3085             if((whichSpans&SPAN_SET)==0) {
   3086                 continue;
   3087             }
   3088         } else {
   3089             // Odd-numbered sets are complemented.
   3090             if((whichSpans&SPAN_COMPLEMENT)==0) {
   3091                 continue;
   3092             }
   3093         }
   3094         for(type=0;; ++type) {
   3095             limitsCount=getSpans(*sets[i], (UBool)(i&1),
   3096                                  s, length, isUTF16,
   3097                                  whichSpans,
   3098                                  type, typeName,
   3099                                  limits, LENGTHOF(limits), expectCount);
   3100             if(typeName[0]==0) {
   3101                 break; // All types tried.
   3102             }
   3103             if(limitsCount<0) {
   3104                 continue; // Span option filtered out.
   3105             }
   3106             if(expectCount<0) {
   3107                 expectCount=limitsCount;
   3108                 if(limitsCount>LENGTHOF(limits)) {
   3109                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
   3110                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
   3111                     return;
   3112                 }
   3113                 memcpy(expectLimits, limits, limitsCount*4);
   3114             } else if(limitsCount!=expectCount) {
   3115                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
   3116                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
   3117             } else {
   3118                 for(j=0; j<limitsCount; ++j) {
   3119                     if(limits[j]!=expectLimits[j]) {
   3120                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
   3121                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
   3122                               j, (long)limits[j], (long)expectLimits[j]);
   3123                         break;
   3124                     }
   3125                 }
   3126             }
   3127         }
   3128     }
   3129 
   3130     // Compare span() with containsAll()/containsNone(),
   3131     // but only if we have expectLimits[] from the uncomplemented set.
   3132     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
   3133         const UChar *s16=(const UChar *)s;
   3134         UnicodeString string;
   3135         int32_t prev=0, limit, length;
   3136         for(i=0; i<expectCount; ++i) {
   3137             limit=expectLimits[i];
   3138             length=limit-prev;
   3139             if(length>0) {
   3140                 string.setTo(FALSE, s16+prev, length);  // read-only alias
   3141                 if(i&1) {
   3142                     if(!sets[SLOW]->getSet().containsAll(string)) {
   3143                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
   3144                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
   3145                         return;
   3146                     }
   3147                     if(!sets[FAST]->getSet().containsAll(string)) {
   3148                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
   3149                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
   3150                         return;
   3151                     }
   3152                 } else {
   3153                     if(!sets[SLOW]->getSet().containsNone(string)) {
   3154                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
   3155                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
   3156                         return;
   3157                     }
   3158                     if(!sets[FAST]->getSet().containsNone(string)) {
   3159                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
   3160                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
   3161                         return;
   3162                     }
   3163                 }
   3164             }
   3165             prev=limit;
   3166         }
   3167     }
   3168 }
   3169 
   3170 // Specifically test either UTF-16 or UTF-8.
   3171 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
   3172                               const void *s, int32_t length, UBool isUTF16,
   3173                               uint32_t whichSpans,
   3174                               const char *testName, int32_t index) {
   3175     int32_t expectLimits[500];
   3176     int32_t expectCount=-1;
   3177     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
   3178 }
   3179 
   3180 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
   3181     UChar c, c2;
   3182 
   3183     if(length>=0) {
   3184         while(length>0) {
   3185             c=*s++;
   3186             --length;
   3187             if(0xd800<=c && c<0xe000) {
   3188                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
   3189                     return TRUE;
   3190                 }
   3191                 --length;
   3192             }
   3193         }
   3194     } else {
   3195         while((c=*s++)!=0) {
   3196             if(0xd800<=c && c<0xe000) {
   3197                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
   3198                     return TRUE;
   3199                 }
   3200             }
   3201         }
   3202     }
   3203     return FALSE;
   3204 }
   3205 
   3206 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
   3207 // unless either UTF is turned off in whichSpans.
   3208 // Testing UTF-16 and UTF-8 together requires that surrogate code points
   3209 // have the same contains(c) value as U+FFFD.
   3210 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
   3211                                       const UChar *s16, int32_t length16,
   3212                                       uint32_t whichSpans,
   3213                                       const char *testName, int32_t index) {
   3214     int32_t expectLimits[500];
   3215     int32_t expectCount;
   3216 
   3217     expectCount=-1;  // Get expectLimits[] from testSpan().
   3218 
   3219     if((whichSpans&SPAN_UTF16)!=0) {
   3220         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
   3221     }
   3222     if((whichSpans&SPAN_UTF8)==0) {
   3223         return;
   3224     }
   3225 
   3226     // Convert s16[] and expectLimits[] to UTF-8.
   3227     uint8_t s8[3000];
   3228     int32_t offsets[3000];
   3229 
   3230     const UChar *s16Limit=s16+length16;
   3231     char *t=(char *)s8;
   3232     char *tLimit=t+sizeof(s8);
   3233     int32_t *o=offsets;
   3234     UErrorCode errorCode=U_ZERO_ERROR;
   3235 
   3236     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
   3237     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
   3238     if(U_FAILURE(errorCode)) {
   3239         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
   3240               testName, (long)index, u_errorName(errorCode));
   3241         ucnv_resetFromUnicode(utf8Cnv);
   3242         return;
   3243     }
   3244     int32_t length8=(int32_t)(t-(char *)s8);
   3245 
   3246     // Convert expectLimits[].
   3247     int32_t i, j, expect;
   3248     for(i=j=0; i<expectCount; ++i) {
   3249         expect=expectLimits[i];
   3250         if(expect==length16) {
   3251             expectLimits[i]=length8;
   3252         } else {
   3253             while(offsets[j]<expect) {
   3254                 ++j;
   3255             }
   3256             expectLimits[i]=j;
   3257         }
   3258     }
   3259 
   3260     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
   3261 }
   3262 
   3263 static UChar32 nextCodePoint(UChar32 c) {
   3264     // Skip some large and boring ranges.
   3265     switch(c) {
   3266     case 0x3441:
   3267         return 0x4d7f;
   3268     case 0x5100:
   3269         return 0x9f00;
   3270     case 0xb040:
   3271         return 0xd780;
   3272     case 0xe041:
   3273         return 0xf8fe;
   3274     case 0x10100:
   3275         return 0x20000;
   3276     case 0x20041:
   3277         return 0xe0000;
   3278     case 0xe0101:
   3279         return 0x10fffd;
   3280     default:
   3281         return c+1;
   3282     }
   3283 }
   3284 
   3285 // Verify that all implementations represent the same set.
   3286 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3287     // contains(U+FFFD) is inconsistent with contains(some surrogates),
   3288     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
   3289     // Skip the UTF-8 part of the test - if the string contains surrogates -
   3290     // because it is likely to produce a different result.
   3291     UBool inconsistentSurrogates=
   3292             (!(sets[0]->getSet().contains(0xfffd) ?
   3293                sets[0]->getSet().contains(0xd800, 0xdfff) :
   3294                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
   3295              sets[0]->hasStringsWithSurrogates());
   3296 
   3297     UChar s[1000];
   3298     int32_t length=0;
   3299     uint32_t localWhichSpans;
   3300 
   3301     UChar32 c, first;
   3302     for(first=c=0;; c=nextCodePoint(c)) {
   3303         if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
   3304             localWhichSpans=whichSpans;
   3305             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
   3306                 localWhichSpans&=~SPAN_UTF8;
   3307             }
   3308             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
   3309             if(c>0x10ffff) {
   3310                 break;
   3311             }
   3312             length=0;
   3313             first=c;
   3314         }
   3315         U16_APPEND_UNSAFE(s, length, c);
   3316     }
   3317 }
   3318 
   3319 // Test with a particular, interesting string.
   3320 // Specify length and try NUL-termination.
   3321 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3322     static const UChar s[]={
   3323         0x61, 0x62, 0x20,                       // Latin, space
   3324         0x3b1, 0x3b2, 0x3b3,                    // Greek
   3325         0xd900,                                 // lead surrogate
   3326         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
   3327         0xdc05,                                 // trail surrogate
   3328         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
   3329         0xd900, 0xdc05,                         // unassigned supplementary
   3330         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
   3331         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
   3332         0                                       // NUL
   3333     };
   3334 
   3335     if((whichSpans&SPAN_UTF16)==0) {
   3336         return;
   3337     }
   3338     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
   3339     testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
   3340 }
   3341 
   3342 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3343     static const char s[]={
   3344         "abc"                                   // Latin
   3345 
   3346         /* trail byte in lead position */
   3347         "\x80"
   3348 
   3349         " "                                     // space
   3350 
   3351         /* truncated multi-byte sequences */
   3352         "\xd0"
   3353         "\xe0"
   3354         "\xe1"
   3355         "\xed"
   3356         "\xee"
   3357         "\xf0"
   3358         "\xf1"
   3359         "\xf4"
   3360         "\xf8"
   3361         "\xfc"
   3362 
   3363         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
   3364 
   3365         /* trail byte in lead position */
   3366         "\x80"
   3367 
   3368         "\xe0\x80"
   3369         "\xe0\xa0"
   3370         "\xe1\x80"
   3371         "\xed\x80"
   3372         "\xed\xa0"
   3373         "\xee\x80"
   3374         "\xf0\x80"
   3375         "\xf0\x90"
   3376         "\xf1\x80"
   3377         "\xf4\x80"
   3378         "\xf4\x90"
   3379         "\xf8\x80"
   3380         "\xfc\x80"
   3381 
   3382         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
   3383 
   3384         /* trail byte in lead position */
   3385         "\x80"
   3386 
   3387         "\xf0\x80\x80"
   3388         "\xf0\x90\x80"
   3389         "\xf1\x80\x80"
   3390         "\xf4\x80\x80"
   3391         "\xf4\x90\x80"
   3392         "\xf8\x80\x80"
   3393         "\xfc\x80\x80"
   3394 
   3395         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
   3396 
   3397         /* trail byte in lead position */
   3398         "\x80"
   3399 
   3400         "\xf8\x80\x80\x80"
   3401         "\xfc\x80\x80\x80"
   3402 
   3403         "\xF1\x90\x80\x85"                      // unassigned supplementary
   3404 
   3405         /* trail byte in lead position */
   3406         "\x80"
   3407 
   3408         "\xfc\x80\x80\x80\x80"
   3409 
   3410         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
   3411 
   3412         /* trail byte in lead position */
   3413         "\x80"
   3414 
   3415         /* complete sequences but non-shortest forms or out of range etc. */
   3416         "\xc0\x80"
   3417         "\xe0\x80\x80"
   3418         "\xed\xa0\x80"
   3419         "\xf0\x80\x80\x80"
   3420         "\xf4\x90\x80\x80"
   3421         "\xf8\x80\x80\x80\x80"
   3422         "\xfc\x80\x80\x80\x80\x80"
   3423         "\xfe"
   3424         "\xff"
   3425 
   3426         /* trail byte in lead position */
   3427         "\x80"
   3428 
   3429         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
   3430     };
   3431 
   3432     if((whichSpans&SPAN_UTF8)==0) {
   3433         return;
   3434     }
   3435     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
   3436     testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
   3437 }
   3438 
   3439 // Take a set of span options and multiply them so that
   3440 // each portion only has one of the options a, b and c.
   3441 // If b==0, then the set of options is just modified with mask and a.
   3442 // If b!=0 and c==0, then the set of options is just modified with mask, a and b.
   3443 static int32_t
   3444 addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
   3445                uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
   3446     uint32_t s;
   3447     int32_t i;
   3448 
   3449     for(i=0; i<whichSpansCount; ++i) {
   3450         s=whichSpans[i]&mask;
   3451         whichSpans[i]=s|a;
   3452         if(b!=0) {
   3453             whichSpans[whichSpansCount+i]=s|b;
   3454             if(c!=0) {
   3455                 whichSpans[2*whichSpansCount+i]=s|c;
   3456             }
   3457         }
   3458     }
   3459     return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount;
   3460 }
   3461 
   3462 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
   3463 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
   3464 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
   3465 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
   3466 
   3467 void UnicodeSetTest::TestSpan() {
   3468     // "[...]" is a UnicodeSet pattern.
   3469     // "*" performs tests on all Unicode code points and on a selection of
   3470     //   malformed UTF-8/16 strings.
   3471     // "-options" limits the scope of testing for the current set.
   3472     //   By default, the test verifies that equivalent boundaries are found
   3473     //   for UTF-16 and UTF-8, going forward and backward,
   3474     //   alternating USET_SPAN_NOT_CONTAINED with
   3475     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
   3476     //   Single-character options:
   3477     //     8 -- UTF-16 and UTF-8 boundaries may differ.
   3478     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
   3479     //          or the set contains strings with unpaired surrogates
   3480     //          which do not translate to valid UTF-8.
   3481     //     c -- set.span() and set.complement().span() boundaries may differ.
   3482     //          Cause: Set strings are not complemented.
   3483     //     b -- span() and spanBack() boundaries may differ.
   3484     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
   3485     //          and spanBack(USET_SPAN_SIMPLE) are defined to
   3486     //          match with non-overlapping substrings.
   3487     //          For example, with a set containing "ab" and "ba",
   3488     //          span() of "aba" yields boundaries { 0, 2, 3 }
   3489     //          because the initial "ab" matches from 0 to 2,
   3490     //          while spanBack() yields boundaries { 0, 1, 3 }
   3491     //          because the final "ba" matches from 1 to 3.
   3492     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
   3493     //          Cause: Strings in the set overlap, and a longer match may
   3494     //          require a sequence including non-longest substrings.
   3495     //          For example, with a set containing "ab", "abc" and "cd",
   3496     //          span(contained) of "abcd" spans the entire string
   3497     //          but span(longest match) only spans the first 3 characters.
   3498     //   Each "-options" first resets all options and then applies the specified options.
   3499     //   A "-" without options resets the options.
   3500     //   The options are also reset for each new set.
   3501     // Other strings will be spanned.
   3502     static const char *const testdata[]={
   3503         "[:ID_Continue:]",
   3504         "*",
   3505         "[:White_Space:]",
   3506         "*",
   3507         "[]",
   3508         "*",
   3509         "[\\u0000-\\U0010FFFF]",
   3510         "*",
   3511         "[\\u0000\\u0080\\u0800\\U00010000]",
   3512         "*",
   3513         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
   3514         "*",
   3515         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
   3516         "-c",
   3517         "*",
   3518         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
   3519         "-c",
   3520         "*",
   3521 
   3522         // Overlapping strings cause overlapping attempts to match.
   3523         "[x{xy}{xya}{axy}{ax}]",
   3524         "-cl",
   3525 
   3526         // More repetitions of "xya" would take too long with the recursive
   3527         // reference implementation.
   3528         // containsAll()=FALSE
   3529         // test_string 0x14
   3530         "xx"
   3531         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
   3532         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
   3533         "xyaxyaxyaxya"
   3534         "xx"
   3535         "xyaxyaxyaxya"  // span() ends here.
   3536         "aaa",
   3537 
   3538         // containsAll()=TRUE
   3539         // test_string 0x15
   3540         "xx"
   3541         "xyaxyaxyaxya"
   3542         "xx"
   3543         "xyaxyaxyaxya"
   3544         "xx"
   3545         "xyaxyaxyaxy",
   3546 
   3547         "-bc",
   3548         // test_string 0x17
   3549         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
   3550         "-c",
   3551         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
   3552         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
   3553         "-",
   3554         "byaya",     // span() -> { 5 }
   3555         "byay",      // span() -> { 4 }
   3556         "bya",       // span() -> { 3 }
   3557 
   3558         // span(longest match) will not span the whole string.
   3559         "[a{ab}{bc}]",
   3560         "-cl",
   3561         // test_string 0x21
   3562         "abc",
   3563 
   3564         "[a{ab}{abc}{cd}]",
   3565         "-cl",
   3566         "acdabcdabccd",
   3567 
   3568         // spanBack(longest match) will not span the whole string.
   3569         "[c{ab}{bc}]",
   3570         "-cl",
   3571         "abc",
   3572 
   3573         "[d{cd}{bcd}{ab}]",
   3574         "-cl",
   3575         "abbcdabcdabd",
   3576 
   3577         // Test with non-ASCII set strings - test proper handling of surrogate pairs
   3578         // and UTF-8 trail bytes.
   3579         // Copies of above test sets and strings, but transliterated to have
   3580         // different code points with similar trail units.
   3581         // Previous: a      b         c            d
   3582         // Unicode:  042B   30AB      200AB        204AB
   3583         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
   3584         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
   3585         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
   3586         "-cl",
   3587         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
   3588 
   3589         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
   3590         "-cl",
   3591         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
   3592 
   3593         // Stress bookkeeping and recursion.
   3594         // The following strings are barely doable with the recursive
   3595         // reference implementation.
   3596         // The not-contained character at the end prevents an early exit from the span().
   3597         "[b{bb}]",
   3598         "-c",
   3599         // test_string 0x33
   3600         "bbbbbbbbbbbbbbbbbbbbbbbb-",
   3601         // On complement sets, span() and spanBack() get different results
   3602         // because b is not in the complement set and there is an odd number of b's
   3603         // in the test string.
   3604         "-bc",
   3605         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
   3606 
   3607         // Test with set strings with an initial or final code point span
   3608         // longer than 254.
   3609         "[a{" _64_a _64_a _64_a _64_a "b}"
   3610           "{a" _64_b _64_b _64_b _64_b "}]",
   3611         "-c",
   3612         _64_a _64_a _64_a _63_a "b",
   3613         _64_a _64_a _64_a _64_a "b",
   3614         _64_a _64_a _64_a _64_a "aaaabbbb",
   3615         "a" _64_b _64_b _64_b _63_b,
   3616         "a" _64_b _64_b _64_b _64_b,
   3617         "aaaabbbb" _64_b _64_b _64_b _64_b,
   3618 
   3619         // Test with strings containing unpaired surrogates.
   3620         // They are not representable in UTF-8, and a leading trail surrogate
   3621         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
   3622         // U+20001 == \\uD840\\uDC01
   3623         // U+20400 == \\uD841\\uDC00
   3624         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
   3625         "-8cl",
   3626         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
   3627     };
   3628     uint32_t whichSpans[96]={ SPAN_ALL };
   3629     int32_t whichSpansCount=1;
   3630 
   3631     UnicodeSet *sets[SET_COUNT]={ NULL };
   3632     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
   3633 
   3634     char testName[1024];
   3635     char *testNameLimit=testName;
   3636 
   3637     int32_t i, j;
   3638     for(i=0; i<LENGTHOF(testdata); ++i) {
   3639         const char *s=testdata[i];
   3640         if(s[0]=='[') {
   3641             // Create new test sets from this pattern.
   3642             for(j=0; j<SET_COUNT; ++j) {
   3643                 delete sets_with_str[j];
   3644                 delete sets[j];
   3645             }
   3646             UErrorCode errorCode=U_ZERO_ERROR;
   3647             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
   3648             if(U_FAILURE(errorCode)) {
   3649                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
   3650                 break;
   3651             }
   3652             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
   3653             sets[SLOW_NOT]->complement();
   3654             // Intermediate set: Test cloning of a frozen set.
   3655             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
   3656             fast->freeze();
   3657             sets[FAST]=(UnicodeSet *)fast->clone();
   3658             delete fast;
   3659             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
   3660             fastNot->freeze();
   3661             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
   3662             delete fastNot;
   3663 
   3664             for(j=0; j<SET_COUNT; ++j) {
   3665                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
   3666             }
   3667 
   3668             strcpy(testName, s);
   3669             testNameLimit=strchr(testName, 0);
   3670             *testNameLimit++=':';
   3671             *testNameLimit=0;
   3672 
   3673             whichSpans[0]=SPAN_ALL;
   3674             whichSpansCount=1;
   3675         } else if(s[0]=='-') {
   3676             whichSpans[0]=SPAN_ALL;
   3677             whichSpansCount=1;
   3678 
   3679             while(*++s!=0) {
   3680                 switch(*s) {
   3681                 case 'c':
   3682                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3683                                                    ~SPAN_POLARITY,
   3684                                                    SPAN_SET,
   3685                                                    SPAN_COMPLEMENT,
   3686                                                    0);
   3687                     break;
   3688                 case 'b':
   3689                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3690                                                    ~SPAN_DIRS,
   3691                                                    SPAN_FWD,
   3692                                                    SPAN_BACK,
   3693                                                    0);
   3694                     break;
   3695                 case 'l':
   3696                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
   3697                     // USET_SPAN_SIMPLE only FWD, and separately
   3698                     // USET_SPAN_SIMPLE only BACK
   3699                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3700                                                    ~(SPAN_DIRS|SPAN_CONDITION),
   3701                                                    SPAN_DIRS|SPAN_CONTAINED,
   3702                                                    SPAN_FWD|SPAN_SIMPLE,
   3703                                                    SPAN_BACK|SPAN_SIMPLE);
   3704                     break;
   3705                 case '8':
   3706                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3707                                                    ~SPAN_UTFS,
   3708                                                    SPAN_UTF16,
   3709                                                    SPAN_UTF8,
   3710                                                    0);
   3711                     break;
   3712                 default:
   3713                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
   3714                     break;
   3715                 }
   3716             }
   3717         } else if(0==strcmp(s, "*")) {
   3718             strcpy(testNameLimit, "bad_string");
   3719             for(j=0; j<whichSpansCount; ++j) {
   3720                 if(whichSpansCount>1) {
   3721                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
   3722                             "%%0x%3x",
   3723                             whichSpans[j]);
   3724                 }
   3725                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
   3726                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
   3727             }
   3728 
   3729             strcpy(testNameLimit, "contents");
   3730             for(j=0; j<whichSpansCount; ++j) {
   3731                 if(whichSpansCount>1) {
   3732                     sprintf(testNameLimit+8 /* strlen("contents") */,
   3733                             "%%0x%3x",
   3734                             whichSpans[j]);
   3735                 }
   3736                 testSpanContents(sets_with_str, whichSpans[j], testName);
   3737             }
   3738         } else {
   3739             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
   3740             strcpy(testNameLimit, "test_string");
   3741             for(j=0; j<whichSpansCount; ++j) {
   3742                 if(whichSpansCount>1) {
   3743                     sprintf(testNameLimit+11 /* strlen("test_string") */,
   3744                             "%%0x%3x",
   3745                             whichSpans[j]);
   3746                 }
   3747                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
   3748             }
   3749         }
   3750     }
   3751     for(j=0; j<SET_COUNT; ++j) {
   3752         delete sets_with_str[j];
   3753         delete sets[j];
   3754     }
   3755 }
   3756 
   3757 // Test select patterns and strings with containsAll(), span() and spanBack(), including
   3758 // USET_SPAN_SIMPLE. Failures are reported via errln(); a pattern that fails to parse ends the test.
   3759 void UnicodeSetTest::TestStringSpan() {
   3760     const char *pattern="[x{xy}{xya}{axy}{ax}]";  // not static: reassigned below, so each run must start here
   3761     static const char *const string="xx"
   3762         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
   3763         "xx"
   3764         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
   3765         "xx"
   3766         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
   3767         "aaaa";
   3768 
   3769     UErrorCode errorCode=U_ZERO_ERROR;
   3770     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
   3771     UnicodeSet set(pattern16, errorCode);
   3772     if(U_FAILURE(errorCode)) {
   3773         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3774         return;
   3775     }
   3776 
   3777     // With its trailing "aaaa" the test string is expected not to be fully contained.
   3778     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
   3779     if(set.containsAll(string16)) {
   3780         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
   3781     }
   3782 
   3783     // Without the trailing "aaaa" it is expected to be fully contained.
   3784     string16.truncate(string16.length()-4);
   3785     if(!set.containsAll(string16)) {
   3786         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
   3787     }
   3788 
   3789     // span() while not contained: over the whole text first, then over shorter prefixes of it.
   3790     string16=UNICODE_STRING_SIMPLE("byayaxya");
   3791     const UChar *s16=string16.getBuffer();
   3792     int32_t length16=string16.length();
   3793     if( set.span(s16, length16, USET_SPAN_NOT_CONTAINED)!=4 ||
   3794         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
   3795         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
   3796         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
   3797         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
   3798         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3) {
   3799         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
   3800     }
   3801 
   3802     // Second pattern: USET_SPAN_CONTAINED over the whole text, then USET_SPAN_SIMPLE over it and its last 5 units.
   3803     pattern="[a{ab}{abc}{cd}]";
   3804     pattern16=UnicodeString(pattern, -1, US_INV);
   3805     set.applyPattern(pattern16, errorCode);
   3806     if(U_FAILURE(errorCode)) {
   3807         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3808         return;
   3809     }
   3810     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
   3811     s16=string16.getBuffer();
   3812     length16=string16.length();
   3813     if( set.span(s16, length16, USET_SPAN_CONTAINED)!=length16 ||
   3814         set.span(s16, length16, USET_SPAN_SIMPLE)!=6 ||
   3815         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5) {
   3816         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
   3817     }
   3818 
   3819     // Third pattern: freeze() is chained onto applyPattern(), so spanBack() is checked on a frozen set.
   3820     pattern="[d{cd}{bcd}{ab}]";
   3821     pattern16=UnicodeString(pattern, -1, US_INV);
   3822     set.applyPattern(pattern16, errorCode).freeze();
   3823     if(U_FAILURE(errorCode)) {
   3824         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3825         return;
   3826     }
   3827     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
   3828     s16=string16.getBuffer();
   3829     length16=string16.length();
   3830     if( set.spanBack(s16, length16, USET_SPAN_CONTAINED)!=0 ||
   3831         set.spanBack(s16, length16, USET_SPAN_SIMPLE)!=6 ||
   3832         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0) {
   3833         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
   3834     }
   3835 }
   3836