Home | History | Annotate | Download | only in intltest
      1 /*
      2 ********************************************************************************
      3 *   Copyright (C) 1999-2013 International Business Machines Corporation and
      4 *   others. All Rights Reserved.
      5 ********************************************************************************
      6 *   Date        Name        Description
      7 *   10/20/99    alan        Creation.
      8 *   03/22/2000  Madhu       Added additional tests
      9 ********************************************************************************
     10 */
     11 
     12 #include <stdio.h>
     13 
     14 #include <string.h>
     15 #include "unicode/utypes.h"
     16 #include "usettest.h"
     17 #include "unicode/ucnv.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/usetiter.h"
     21 #include "unicode/ustring.h"
     22 #include "unicode/parsepos.h"
     23 #include "unicode/symtable.h"
     24 #include "unicode/uversion.h"
     25 #include "hash.h"
     26 
// Number of elements in a statically sized array, cast to int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

// Reports a data error (file, line and the ICU error name) when status is a
// failure code. Expands to a brace-enclosed block, not a do/while(0).
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
    dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
    u_errorName(status));}}

// Reports a data error (file and line) when expr evaluates to false.
// Expands to a brace-enclosed block, not a do/while(0).
#define TEST_ASSERT(expr) {if (!(expr)) { \
    dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
     35 
     36 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
     37     UnicodeString pat;
     38     set.toPattern(pat);
     39     return left + UnicodeSetTest::escape(pat);
     40 }
     41 
// Emits one switch case for runIndexedTest(): stores the stringified test
// name in `name` and, only when `exec` is set, logs a header and calls the
// test method. The trailing `break` deliberately has no semicolon; the
// caller supplies it.
#define CASE(id,test) case id:                          \
                          name = #test;                 \
                          if (exec) {                   \
                              logln(#test "---");       \
                              logln();                  \
                              test();                   \
                          }                             \
                          break
     50 
     51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
     52 }
     53 
     54 UConverter *UnicodeSetTest::openUTF8Converter() {
     55     if(utf8Cnv==NULL) {
     56         UErrorCode errorCode=U_ZERO_ERROR;
     57         utf8Cnv=ucnv_open("UTF-8", &errorCode);
     58     }
     59     return utf8Cnv;
     60 }
     61 
     62 UnicodeSetTest::~UnicodeSetTest() {
     63     ucnv_close(utf8Cnv);
     64 }
     65 
/**
 * Test dispatcher. Maps a test index to the corresponding test method via
 * the CASE macro.
 *
 * @param index  index of the requested test
 * @param exec   when true the selected test is run; when false only
 *               `name` is filled in
 * @param name   receives the test method's name, or "" when the index
 *               does not correspond to any test
 */
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
                               const char* &name, char* /*par*/) {
    // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
    switch (index) {
        CASE(0,TestPatterns);
        CASE(1,TestAddRemove);
        CASE(2,TestCategories);
        CASE(3,TestCloneEqualHash);
        CASE(4,TestMinimalRep);
        CASE(5,TestAPI);
        CASE(6,TestScriptSet);
        CASE(7,TestPropertySet);
        CASE(8,TestClone);
        CASE(9,TestExhaustive);
        CASE(10,TestToPattern);
        CASE(11,TestIndexOf);
        CASE(12,TestStrings);
        CASE(13,Testj2268);
        CASE(14,TestCloseOver);
        CASE(15,TestEscapePattern);
        CASE(16,TestInvalidCodePoint);
        CASE(17,TestSymbolTable);
        CASE(18,TestSurrogate);
        CASE(19,TestPosixClasses);
        CASE(20,TestIteration);
        CASE(21,TestFreezable);
        CASE(22,TestSpan);
        CASE(23,TestStringSpan);
        // An empty name tells the caller there is no test at this index.
        default: name = ""; break;
    }
}
     98 
// Sentinel placed inside the string lists handed to expectToPattern().
// NOTE(review): presumably entries before NOT must be contained in the set
// and entries after it must not be -- confirm against expectToPattern().
static const char NOT[] = "%%%%";
    100 
    101 /**
    102  * UVector was improperly copying contents
 * This code will crash if this is still true
    104  */
    105 void UnicodeSetTest::Testj2268() {
    106   UnicodeSet t;
    107   t.add(UnicodeString("abc"));
    108   UnicodeSet test(t);
    109   UnicodeString ustrPat;
    110   test.toPattern(ustrPat, TRUE);
    111 }
    112 
    113 /**
    114  * Test toPattern().
    115  */
    116 void UnicodeSetTest::TestToPattern() {
    117     UErrorCode ec = U_ZERO_ERROR;
    118 
    119     // Test that toPattern() round trips with syntax characters and
    120     // whitespace.
    121     {
    122         static const char* OTHER_TOPATTERN_TESTS[] = {
    123             "[[:latin:]&[:greek:]]",
    124             "[[:latin:]-[:greek:]]",
    125             "[:nonspacing mark:]",
    126             NULL
    127         };
    128 
    129         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
    130             ec = U_ZERO_ERROR;
    131             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
    132             if (U_FAILURE(ec)) {
    133                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
    134                 continue;
    135             }
    136             checkPat(OTHER_TOPATTERN_TESTS[j], s);
    137         }
    138 
    139         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
    140             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
    141 
    142                 // check various combinations to make sure they all work.
    143                 if (i != 0 && !toPatternAux(i, i)){
    144                     continue;
    145                 }
    146                 if (!toPatternAux(0, i)){
    147                     continue;
    148                 }
    149                 if (!toPatternAux(i, 0xFFFF)){
    150                     continue;
    151                 }
    152             }
    153         }
    154     }
    155 
    156     // Test pattern behavior of multicharacter strings.
    157     {
    158         ec = U_ZERO_ERROR;
    159         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
    160 
    161         // This loop isn't a loop.  It's here to make the compiler happy.
    162         // If you're curious, try removing it and changing the 'break'
    163         // statements (except for the last) to goto's.
    164         for (;;) {
    165             if (U_FAILURE(ec)) break;
    166             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
    167             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
    168 
    169             s->add("ac");
    170             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
    171             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
    172 
    173             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
    174             if (U_FAILURE(ec)) break;
    175             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
    176             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
    177 
    178             s->add("[]");
    179             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
    180             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
    181 
    182             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
    183             if (U_FAILURE(ec)) break;
    184             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
    185             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
    186 
    187             // j2189
    188             s->clear();
    189             s->add(UnicodeString("abc", ""));
    190             s->add(UnicodeString("abc", ""));
    191             const char* exp6[] = {"abc", NOT, "ab", NULL};
    192             expectToPattern(*s, "[{abc}]", exp6);
    193 
    194             break;
    195         }
    196 
    197         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
    198         delete s;
    199     }
    200 
    201     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
    202     UnicodeSet s;
    203     s.add((UChar)97, (UChar)98); // 'a', 'b'
    204     expectToPattern(s, "[ab]", NULL);
    205 }
    206 
    207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
    208 
    209     // use Integer.toString because Utility.hex doesn't handle ints
    210     UnicodeString pat = "";
    211     // TODO do these in hex
    212     //String source = "0x" + Integer.toString(start,16).toUpperCase();
    213     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
    214     UnicodeString source;
    215     source = source + (uint32_t)start;
    216     if (start != end)
    217         source = source + ".." + (uint32_t)end;
    218     UnicodeSet testSet;
    219     testSet.add(start, end);
    220     return checkPat(source, testSet);
    221 }
    222 
    223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
    224                                const UnicodeSet& testSet) {
    225     // What we want to make sure of is that a pattern generated
    226     // by toPattern(), with or without escaped unprintables, can
    227     // be passed back into the UnicodeSet constructor.
    228     UnicodeString pat0;
    229 
    230     testSet.toPattern(pat0, TRUE);
    231 
    232     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
    233 
    234     //String pat1 = unescapeLeniently(pat0);
    235     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
    236 
    237     UnicodeString pat2;
    238     testSet.toPattern(pat2, FALSE);
    239     if (!checkPat(source, testSet, pat2)) return FALSE;
    240 
    241     //String pat3 = unescapeLeniently(pat2);
    242     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
    243 
    244     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
    245     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
    246     return TRUE;
    247 }
    248 
    249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
    250                                const UnicodeSet& testSet,
    251                                const UnicodeString& pat) {
    252     UErrorCode ec = U_ZERO_ERROR;
    253     UnicodeSet testSet2(pat, ec);
    254     if (testSet2 != testSet) {
    255         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
    256         return FALSE;
    257     }
    258     return TRUE;
    259 }
    260 
    261 void
    262 UnicodeSetTest::TestPatterns(void) {
    263     UnicodeSet set;
    264     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
    265     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
    266     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
    267     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
    268     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
    269     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
    270 
    271     // Throw in a test of complement
    272     set.complement();
    273     UnicodeString exp;
    274     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
    275     expectPairs(set, exp);
    276 }
    277 
    278 void
    279 UnicodeSetTest::TestCategories(void) {
    280     UErrorCode status = U_ZERO_ERROR;
    281     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
    282     UnicodeSet set(pat, status);
    283     if (U_FAILURE(status)) {
    284         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
    285         return;
    286     } else {
    287         expectContainment(set, pat, "ABC", "abc");
    288     }
    289 
    290     UChar32 i;
    291     int32_t failures = 0;
    292     // Make sure generation of L doesn't pollute cached Lu set
    293     // First generate L, then Lu
    294     set.applyPattern("[:L:]", status);
    295     if (U_FAILURE(status)) { errln("FAIL"); return; }
    296     for (i=0; i<0x200; ++i) {
    297         UBool l = u_isalpha((UChar)i);
    298         if (l != set.contains(i)) {
    299             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
    300                   set.contains(i));
    301             if (++failures == 10) break;
    302         }
    303     }
    304 
    305     set.applyPattern("[:Lu:]", status);
    306     if (U_FAILURE(status)) { errln("FAIL"); return; }
    307     for (i=0; i<0x200; ++i) {
    308         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
    309         if (lu != set.contains(i)) {
    310             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
    311                   set.contains(i));
    312             if (++failures == 20) break;
    313         }
    314     }
    315 }
    316 void
    317 UnicodeSetTest::TestCloneEqualHash(void) {
    318     UErrorCode status = U_ZERO_ERROR;
    319     // set1 and set2 used to be built with the obsolete constructor taking
    320     // UCharCategory values; replaced with pattern constructors
    321     // markus 20030502
    322     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
    323     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
    324     if (U_FAILURE(status)){
    325         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
    326         return;
    327     }
    328     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
    329     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
    330     if (U_FAILURE(status)){
    331         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
    332         return;
    333     }
    334 
    335     if (*set1 != *set1a) {
    336         errln("FAIL: category constructor for Ll broken");
    337     }
    338     if (*set2 != *set2a) {
    339         errln("FAIL: category constructor for Nd broken");
    340     }
    341     delete set1a;
    342     delete set2a;
    343 
    344     logln("Testing copy construction");
    345     UnicodeSet *set1copy=new UnicodeSet(*set1);
    346     if(*set1 != *set1copy || *set1 == *set2 ||
    347         getPairs(*set1) != getPairs(*set1copy) ||
    348         set1->hashCode() != set1copy->hashCode()){
    349         errln("FAIL : Error in copy construction");
    350         return;
    351     }
    352 
    353     logln("Testing =operator");
    354     UnicodeSet set1equal=*set1;
    355     UnicodeSet set2equal=*set2;
    356     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
    357         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
    358         errln("FAIL: Error in =operator");
    359     }
    360 
    361     logln("Testing clone()");
    362     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
    363     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
    364     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
    365         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
    366         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
    367         errln("FAIL: Error in clone");
    368     }
    369 
    370     logln("Testing hashcode");
    371     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
    372         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
    373         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
    374         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
    375         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
    376         errln("FAIL: Error in hashCode()");
    377     }
    378 
    379     delete set1;
    380     delete set1copy;
    381     delete set2;
    382     delete set1clone;
    383     delete set2clone;
    384 
    385 
    386 }
    387 void
    388 UnicodeSetTest::TestAddRemove(void) {
    389     UnicodeSet set; // Construct empty set
    390     doAssert(set.isEmpty() == TRUE, "set should be empty");
    391     doAssert(set.size() == 0, "size should be 0");
    392     set.complement();
    393     doAssert(set.size() == 0x110000, "size should be 0x110000");
    394     set.clear();
    395     set.add(0x0061, 0x007a);
    396     expectPairs(set, "az");
    397     doAssert(set.isEmpty() == FALSE, "set should not be empty");
    398     doAssert(set.size() != 0, "size should not be equal to 0");
    399     doAssert(set.size() == 26, "size should be equal to 26");
    400     set.remove(0x006d, 0x0070);
    401     expectPairs(set, "alqz");
    402     doAssert(set.size() == 22, "size should be equal to 22");
    403     set.remove(0x0065, 0x0067);
    404     expectPairs(set, "adhlqz");
    405     doAssert(set.size() == 19, "size should be equal to 19");
    406     set.remove(0x0064, 0x0069);
    407     expectPairs(set, "acjlqz");
    408     doAssert(set.size() == 16, "size should be equal to 16");
    409     set.remove(0x0063, 0x0072);
    410     expectPairs(set, "absz");
    411     doAssert(set.size() == 10, "size should be equal to 10");
    412     set.add(0x0066, 0x0071);
    413     expectPairs(set, "abfqsz");
    414     doAssert(set.size() == 22, "size should be equal to 22");
    415     set.remove(0x0061, 0x0067);
    416     expectPairs(set, "hqsz");
    417     set.remove(0x0061, 0x007a);
    418     expectPairs(set, "");
    419     doAssert(set.isEmpty() == TRUE, "set should be empty");
    420     doAssert(set.size() == 0, "size should be 0");
    421     set.add(0x0061);
    422     doAssert(set.isEmpty() == FALSE, "set should not be empty");
    423     doAssert(set.size() == 1, "size should not be equal to 1");
    424     set.add(0x0062);
    425     set.add(0x0063);
    426     expectPairs(set, "ac");
    427     doAssert(set.size() == 3, "size should not be equal to 3");
    428     set.add(0x0070);
    429     set.add(0x0071);
    430     expectPairs(set, "acpq");
    431     doAssert(set.size() == 5, "size should not be equal to 5");
    432     set.clear();
    433     expectPairs(set, "");
    434     doAssert(set.isEmpty() == TRUE, "set should be empty");
    435     doAssert(set.size() == 0, "size should be 0");
    436 
    437     // Try removing an entire set from another set
    438     expectPattern(set, "[c-x]", "cx");
    439     UnicodeSet set2;
    440     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
    441     set.removeAll(set2);
    442     expectPairs(set, "deluxx");
    443 
    444     // Try adding an entire set to another set
    445     expectPattern(set, "[jackiemclean]", "aacceein");
    446     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
    447     set.addAll(set2);
    448     expectPairs(set, "aacehort");
    449     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
    450 
    451     // Try retaining an set of elements contained in another set (intersection)
    452     UnicodeSet set3;
    453     expectPattern(set3, "[a-c]", "ac");
    454     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
    455     set3.remove(0x0062);
    456     expectPairs(set3, "aacc");
    457     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
    458     set.retainAll(set3);
    459     expectPairs(set, "aacc");
    460     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
    461     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
    462     set.clear();
    463     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
    464 
    465     // Test commutativity
    466     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
    467     expectPattern(set2, "[jackiemclean]", "aacceein");
    468     set.addAll(set2);
    469     expectPairs(set, "aacehort");
    470     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
    471 
    472 
    473 
    474 
    475 }
    476 
    477 /**
    478  * Make sure minimal representation is maintained.
    479  */
    480 void UnicodeSetTest::TestMinimalRep() {
    481     UErrorCode status = U_ZERO_ERROR;
    482     // This is pretty thoroughly tested by checkCanonicalRep()
    483     // run against the exhaustive operation results.  Use the code
    484     // here for debugging specific spot problems.
    485 
    486     // 1 overlap against 2
    487     UnicodeSet set("[h-km-q]", status);
    488     if (U_FAILURE(status)) { errln("FAIL"); return; }
    489     UnicodeSet set2("[i-o]", status);
    490     if (U_FAILURE(status)) { errln("FAIL"); return; }
    491     set.addAll(set2);
    492     expectPairs(set, "hq");
    493     // right
    494     set.applyPattern("[a-m]", status);
    495     if (U_FAILURE(status)) { errln("FAIL"); return; }
    496     set2.applyPattern("[e-o]", status);
    497     if (U_FAILURE(status)) { errln("FAIL"); return; }
    498     set.addAll(set2);
    499     expectPairs(set, "ao");
    500     // left
    501     set.applyPattern("[e-o]", status);
    502     if (U_FAILURE(status)) { errln("FAIL"); return; }
    503     set2.applyPattern("[a-m]", status);
    504     if (U_FAILURE(status)) { errln("FAIL"); return; }
    505     set.addAll(set2);
    506     expectPairs(set, "ao");
    507     // 1 overlap against 3
    508     set.applyPattern("[a-eg-mo-w]", status);
    509     if (U_FAILURE(status)) { errln("FAIL"); return; }
    510     set2.applyPattern("[d-q]", status);
    511     if (U_FAILURE(status)) { errln("FAIL"); return; }
    512     set.addAll(set2);
    513     expectPairs(set, "aw");
    514 }
    515 
/**
 * Broad API smoke test for UnicodeSet: construction, clear()/isEmpty(),
 * size(), range containment, set algebra (addAll, complement,
 * complementAll), the string-based complement/complementAll/removeAll/
 * retainAll overloads, containsNone/containsSome, retain(), serialize(),
 * USet conversions, and span()/spanBack() on a UnicodeString.
 *
 * The variables `set`, `exp` and `c` are reused from section to section,
 * so the statements depend on the order in which they appear.
 */
void UnicodeSetTest::TestAPI() {
    UErrorCode status = U_ZERO_ERROR;
    // default ct
    UnicodeSet set;
    if (!set.isEmpty() || set.getRangeCount() != 0) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // clear(), isEmpty()
    set.add(0x0061);
    if (set.isEmpty()) {
        errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
              set);
    }
    set.clear();
    if (!set.isEmpty()) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // size()
    set.clear();
    if (set.size() != 0) {
        errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
              ": " + set);
    }
    set.add(0x0061);
    if (set.size() != 1) {
        errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
              ": " + set);
    }
    set.add(0x0031, 0x0039);
    if (set.size() != 10) {
        errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
              ": " + set);
    }

    // contains(first, last)
    set.clear();
    set.applyPattern("[A-Y 1-8 b-d l-y]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    // Each range must be contained exactly: widening it by one code point
    // on either side must make contains() fail.
    for (int32_t i = 0; i<set.getRangeCount(); ++i) {
        UChar32 a = set.getRangeStart(i);
        UChar32 b = set.getRangeEnd(i);
        if (!set.contains(a, b)) {
            errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
                  " but doesn't: " + set);
        }
        if (set.contains((UChar32)(a-1), b)) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)(a-1) + '-' + (unsigned short)b +
                  " but does: " + set);
        }
        if (set.contains(a, (UChar32)(b+1))) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)a + '-' + (unsigned short)(b+1) +
                  " but does: " + set);
        }
    }

    // Ported InversionList test.
    UnicodeSet a((UChar32)3,(UChar32)10);
    UnicodeSet b((UChar32)7,(UChar32)15);
    UnicodeSet c;

    logln((UnicodeString)"a [3-10]: " + a);
    logln((UnicodeString)"b [7-15]: " + b);
    // Union of a and b.
    c = a;
    c.addAll(b);
    UnicodeSet exp((UChar32)3,(UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).add(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
    }
    // Complement of the union: everything outside 3..15.
    c.complement();
    exp.set((UChar32)0, (UChar32)2);
    exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Complementing again must restore 3..15.
    c.complement();
    exp.set((UChar32)3, (UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Symmetric difference of a and b: 3..6 plus 11..15.
    c = a;
    c.complementAll(b);
    exp.set((UChar32)3,(UChar32)6);
    exp.add((UChar32)11,(UChar32) 15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
    }

    // Converting through setToBits()/bitsToSet() must leave c unchanged.
    exp = c;
    bitsToSet(setToBits(c), c);
    if (c == exp) {
        logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
    } else {
        errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
    }

    // Additional tests for coverage JB#2118
    //UnicodeSet::complement(class UnicodeString const &)
    //UnicodeSet::complementAll(class UnicodeString const &)
    //UnicodeSet::containsNone(class UnicodeSet const &)
    //UnicodeSet::containsNone(long,long)
    //UnicodeSet::containsSome(class UnicodeSet const &)
    //UnicodeSet::containsSome(long,long)
    //UnicodeSet::removeAll(class UnicodeString const &)
    //UnicodeSet::retain(long)
    //UnicodeSet::retainAll(class UnicodeString const &)
    //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
    //UnicodeSetIterator::getString(void)
    // complement(string) on an empty set is expected to yield the set
    // holding just that string.
    set.clear();
    set.complement("ab");
    exp.applyPattern("[{ab}]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }

    // The only element of the set must iterate as the string "ab".
    UnicodeSetIterator iset(set);
    if (!iset.next() || !iset.isString()) {
        errln("FAIL: UnicodeSetIterator::next/isString");
    } else if (iset.getString() != "ab") {
        errln("FAIL: UnicodeSetIterator::getString");
    }

    // complementAll("alan") is expected to toggle the characters a, l, n
    // in [a-z{ab}].
    set.add((UChar32)0x61, (UChar32)0x7A);
    set.complementAll("alan");
    exp.applyPattern("[{ab}b-kmo-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }

    // containsNone/containsSome against an overlapping set ([a-z]) and a
    // disjoint one ([aln]).
    exp.applyPattern("[a-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    exp.applyPattern("[aln]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }

    // Same checks with the range overloads: a-z overlaps, A-Z does not.
    if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }
    if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }

    set.removeAll("liu");
    exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }

    set.retainAll("star");
    exp.applyPattern("[rst]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }

    set.retain((UChar32)0x73);
    exp.applyPattern("[s]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retain('s')"); return; }

    // Serialize the single-element set [s]; exactly three 16-bit units are
    // expected: 2, 0x73, 0x74.
    // NOTE(review): presumably a length unit followed by the range
    // boundaries -- confirm against the serialize() documentation.
    uint16_t buf[32];
    int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
    if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
    if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
        errln("FAIL: serialize");
        return;
    }

    // Conversions to and from USet
    // Each conversion must hand back the very same address.
    UnicodeSet *uniset = &set;
    USet *uset = uniset->toUSet();
    TEST_ASSERT((void *)uset == (void *)uniset);
    UnicodeSet *setx = UnicodeSet::fromUSet(uset);
    TEST_ASSERT((void *)setx == (void *)uset);
    const UnicodeSet *constSet = uniset;
    const USet *constUSet = constSet->toUSet();
    TEST_ASSERT((void *)constUSet == (void *)constSet);
    const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
    TEST_ASSERT((void *)constSetx == (void *)constUSet);

    // span(UnicodeString) and spanBack(UnicodeString) convenience methods
    // The string is 10 x 'a', 10 x 'b', 10 x 'c'; the frozen set holds
    // only 'a' and 'c'. Start/limit values outside the string (-5, 35,
    // INT32_MAX) are included on purpose.
    UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
    UnicodeSet ac(0x61, 0x63);
    ac.remove(0x62).freeze();
    if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
    ) {
        errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
    }
    if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
    ) {
        errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
    }
}
    744 
    745 void UnicodeSetTest::TestIteration() {
    746     UErrorCode ec = U_ZERO_ERROR;
    747     int i = 0;
    748     int outerLoop;
    749 
    750     // 6 code points, 3 ranges, 2 strings, 8 total elements
    751     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
    752     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
    753     TEST_ASSERT_SUCCESS(ec);
    754     UnicodeSetIterator it(set);
    755 
    756     for (outerLoop=0; outerLoop<3; outerLoop++) {
    757         // Run the test multiple times, to check that iterator.reset() is working.
    758         for (i=0; i<10; i++) {
    759             UBool         nextv        = it.next();
    760             UBool         isString     = it.isString();
    761             int32_t       codePoint    = it.getCodepoint();
    762             //int32_t       codePointEnd = it.getCodepointEnd();
    763             UnicodeString s   = it.getString();
    764             switch (i) {
    765             case 0:
    766                 TEST_ASSERT(nextv == TRUE);
    767                 TEST_ASSERT(isString == FALSE);
    768                 TEST_ASSERT(codePoint==0x61);
    769                 TEST_ASSERT(s == "a");
    770                 break;
    771             case 1:
    772                 TEST_ASSERT(nextv == TRUE);
    773                 TEST_ASSERT(isString == FALSE);
    774                 TEST_ASSERT(codePoint==0x62);
    775                 TEST_ASSERT(s == "b");
    776                 break;
    777             case 2:
    778                 TEST_ASSERT(nextv == TRUE);
    779                 TEST_ASSERT(isString == FALSE);
    780                 TEST_ASSERT(codePoint==0x63);
    781                 TEST_ASSERT(s == "c");
    782                 break;
    783             case 3:
    784                 TEST_ASSERT(nextv == TRUE);
    785                 TEST_ASSERT(isString == FALSE);
    786                 TEST_ASSERT(codePoint==0x79);
    787                 TEST_ASSERT(s == "y");
    788                 break;
    789             case 4:
    790                 TEST_ASSERT(nextv == TRUE);
    791                 TEST_ASSERT(isString == FALSE);
    792                 TEST_ASSERT(codePoint==0x7a);
    793                 TEST_ASSERT(s == "z");
    794                 break;
    795             case 5:
    796                 TEST_ASSERT(nextv == TRUE);
    797                 TEST_ASSERT(isString == FALSE);
    798                 TEST_ASSERT(codePoint==0x1abcd);
    799                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
    800                 break;
    801             case 6:
    802                 TEST_ASSERT(nextv == TRUE);
    803                 TEST_ASSERT(isString == TRUE);
    804                 TEST_ASSERT(s == "str1");
    805                 break;
    806             case 7:
    807                 TEST_ASSERT(nextv == TRUE);
    808                 TEST_ASSERT(isString == TRUE);
    809                 TEST_ASSERT(s == "str2");
    810                 break;
    811             case 8:
    812                 TEST_ASSERT(nextv == FALSE);
    813                 break;
    814             case 9:
    815                 TEST_ASSERT(nextv == FALSE);
    816                 break;
    817             }
    818         }
    819         it.reset();  // prepare to run the iteration again.
    820     }
    821 }
    822 
    823 
    824 
    825 
    826 void UnicodeSetTest::TestStrings() {
    827     UErrorCode ec = U_ZERO_ERROR;
    828 
    829     UnicodeSet* testList[] = {
    830         UnicodeSet::createFromAll("abc"),
    831         new UnicodeSet("[a-c]", ec),
    832 
    833         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
    834         new UnicodeSet("[{ll}{ch}a-z]", ec),
    835 
    836         UnicodeSet::createFrom("ab}c"),
    837         new UnicodeSet("[{ab\\}c}]", ec),
    838 
    839         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
    840         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
    841 
    842         NULL
    843     };
    844 
    845     if (U_FAILURE(ec)) {
    846         errln("FAIL: couldn't construct test sets");
    847     }
    848 
    849     for (int32_t i = 0; testList[i] != NULL; i+=2) {
    850         if (U_SUCCESS(ec)) {
    851             UnicodeString pat0, pat1;
    852             testList[i]->toPattern(pat0, TRUE);
    853             testList[i+1]->toPattern(pat1, TRUE);
    854             if (*testList[i] == *testList[i+1]) {
    855                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
    856             } else {
    857                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
    858             }
    859         }
    860         delete testList[i];
    861         delete testList[i+1];
    862     }
    863 }
    864 
    865 /**
    866  * Test the [:Latin:] syntax.
    867  */
    868 void UnicodeSetTest::TestScriptSet() {
    869     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
    870 
    871     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
    872 
    873     /* Jitterbug 1423 */
    874     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
    875 
    876 }
    877 
    878 /**
    879  * Test the [:Latin:] syntax.
    880  */
    881 void UnicodeSetTest::TestPropertySet() {
    882     static const char* const DATA[] = {
    883         // Pattern, Chars IN, Chars NOT in
    884 
    885         "[:Latin:]",
    886         "aA",
    887         "\\u0391\\u03B1",
    888 
    889         "[\\p{Greek}]",
    890         "\\u0391\\u03B1",
    891         "aA",
    892 
    893         "\\P{ GENERAL Category = upper case letter }",
    894         "abc",
    895         "ABC",
    896 
    897 #if !UCONFIG_NO_NORMALIZATION
    898         // Combining class: @since ICU 2.2
    899         // Check both symbolic and numeric
    900         "\\p{ccc=Nukta}",
    901         "\\u0ABC",
    902         "abc",
    903 
    904         "\\p{Canonical Combining Class = 11}",
    905         "\\u05B1",
    906         "\\u05B2",
    907 
    908         "[:c c c = iota subscript :]",
    909         "\\u0345",
    910         "xyz",
    911 #endif
    912 
    913         // Bidi class: @since ICU 2.2
    914         "\\p{bidiclass=lefttoright}",
    915         "abc",
    916         "\\u0671\\u0672",
    917 
    918         // Binary properties: @since ICU 2.2
    919         "\\p{ideographic}",
    920         "\\u4E0A",
    921         "x",
    922 
    923         "[:math=false:]",
    924         "q)*(",
    925         // weiv: )(and * were removed from math in Unicode 4.0.1
    926         //"(*+)",
    927         "+<>^",
    928 
    929         // JB#1767 \N{}, \p{ASCII}
    930         "[:Ascii:]",
    931         "abc\\u0000\\u007F",
    932         "\\u0080\\u4E00",
    933 
    934         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
    935         "az",
    936         "qrs",
    937 
    938         // JB#2015
    939         "[:any:]",
    940         "a\\U0010FFFF",
    941         "",
    942 
    943         "[:nv=0.5:]",
    944         "\\u00BD\\u0F2A",
    945         "\\u00BC",
    946 
    947         // JB#2653: Age
    948         "[:Age=1.1:]",
    949         "\\u03D6", // 1.1
    950         "\\u03D8\\u03D9", // 3.2
    951 
    952         "[:Age=3.1:]",
    953         "\\u1800\\u3400\\U0002f800",
    954         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
    955 
    956         // JB#2350: Case_Sensitive
    957         "[:Case Sensitive:]",
    958         "A\\u1FFC\\U00010410",
    959         ";\\u00B4\\U00010500",
    960 
    961         // JB#2832: C99-compatibility props
    962         "[:blank:]",
    963         " \\u0009",
    964         "1-9A-Z",
    965 
    966         "[:graph:]",
    967         "19AZ",
    968         " \\u0003\\u0007\\u0009\\u000A\\u000D",
    969 
    970         "[:punct:]",
    971         "!@#%&*()[]{}-_\\/;:,.?'\"",
    972         "09azAZ",
    973 
    974         "[:xdigit:]",
    975         "09afAF",
    976         "gG!",
    977 
    978         // Regex compatibility test
    979         "[-b]", // leading '-' is literal
    980         "-b",
    981         "ac",
    982 
    983         "[^-b]", // leading '-' is literal
    984         "ac",
    985         "-b",
    986 
    987         "[b-]", // trailing '-' is literal
    988         "-b",
    989         "ac",
    990 
    991         "[^b-]", // trailing '-' is literal
    992         "ac",
    993         "-b",
    994 
    995         "[a-b-]", // trailing '-' is literal
    996         "ab-",
    997         "c=",
    998 
    999         "[[a-q]&[p-z]-]", // trailing '-' is literal
   1000         "pq-",
   1001         "or=",
   1002 
   1003         "[\\s|\\)|:|$|\\>]", // from regex tests
   1004         "s|):$>",
   1005         "abc",
   1006 
   1007         "[\\uDC00cd]", // JB#2906: isolated trail at start
   1008         "cd\\uDC00",
   1009         "ab\\uD800\\U00010000",
   1010 
   1011         "[ab\\uD800]", // JB#2906: isolated trail at start
   1012         "ab\\uD800",
   1013         "cd\\uDC00\\U00010000",
   1014 
   1015         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
   1016         "abcd\\uD800",
   1017         "ef\\uDC00\\U00010000",
   1018 
   1019         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
   1020         "abcd\\uDC00",
   1021         "ef\\uD800\\U00010000",
   1022 
   1023 #if !UCONFIG_NO_NORMALIZATION
   1024         "[:^lccc=0:]", // Lead canonical class
   1025         "\\u0300\\u0301",
   1026         "abcd\\u00c0\\u00c5",
   1027 
   1028         "[:^tccc=0:]", // Trail canonical class
   1029         "\\u0300\\u0301\\u00c0\\u00c5",
   1030         "abcd",
   1031 
   1032         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
   1033         "\\u0300\\u0301\\u00c0\\u00c5",
   1034         "abcd",
   1035 
   1036         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
   1037         "",
   1038         "abcd\\u0300\\u0301\\u00c0\\u00c5",
   1039 
   1040         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
   1041         "\\u0F73\\u0F75\\u0F81",
   1042         "abcd\\u0300\\u0301\\u00c0\\u00c5",
   1043 #endif /* !UCONFIG_NO_NORMALIZATION */
   1044 
   1045         "[:Assigned:]",
   1046         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
   1047         "\\u0888\\uFDD3\\uFFFE\\U00050005",
   1048 
   1049         // Script_Extensions, new in Unicode 6.0
   1050         "[:scx=Arab:]",
   1051         "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
   1052         "\\u061D\\uFDEF\\uFDFE",
   1053 
   1054         // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
   1055         // so scx-sc is missing U+FDF2.
   1056         "[[:Script_Extensions=Arabic:]-[:Arab:]]",
   1057         "\\u0640\\u064B\\u0650\\u0655\\uFDFD",
   1058         "\\uFDF2"
   1059     };
   1060 
   1061     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
   1062 
   1063     for (int32_t i=0; i<DATA_LEN; i+=3) {
   1064         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
   1065                           CharsToUnicodeString(DATA[i+2]));
   1066     }
   1067 }
   1068 
   1069 /**
   1070   * Test that Posix style character classes [:digit:], etc.
   1071   *   have the Unicode definitions from TR 18.
   1072   */
   1073 void UnicodeSetTest::TestPosixClasses() {
   1074     {
   1075         UErrorCode status = U_ZERO_ERROR;
   1076         UnicodeSet s1("[:alpha:]", status);
   1077         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
   1078         TEST_ASSERT_SUCCESS(status);
   1079         TEST_ASSERT(s1==s2);
   1080     }
   1081     {
   1082         UErrorCode status = U_ZERO_ERROR;
   1083         UnicodeSet s1("[:lower:]", status);
   1084         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
   1085         TEST_ASSERT_SUCCESS(status);
   1086         TEST_ASSERT(s1==s2);
   1087     }
   1088     {
   1089         UErrorCode status = U_ZERO_ERROR;
   1090         UnicodeSet s1("[:upper:]", status);
   1091         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
   1092         TEST_ASSERT_SUCCESS(status);
   1093         TEST_ASSERT(s1==s2);
   1094     }
   1095     {
   1096         UErrorCode status = U_ZERO_ERROR;
   1097         UnicodeSet s1("[:punct:]", status);
   1098         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
   1099         TEST_ASSERT_SUCCESS(status);
   1100         TEST_ASSERT(s1==s2);
   1101     }
   1102     {
   1103         UErrorCode status = U_ZERO_ERROR;
   1104         UnicodeSet s1("[:digit:]", status);
   1105         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
   1106         TEST_ASSERT_SUCCESS(status);
   1107         TEST_ASSERT(s1==s2);
   1108     }
   1109     {
   1110         UErrorCode status = U_ZERO_ERROR;
   1111         UnicodeSet s1("[:xdigit:]", status);
   1112         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
   1113         TEST_ASSERT_SUCCESS(status);
   1114         TEST_ASSERT(s1==s2);
   1115     }
   1116     {
   1117         UErrorCode status = U_ZERO_ERROR;
   1118         UnicodeSet s1("[:alnum:]", status);
   1119         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
   1120         TEST_ASSERT_SUCCESS(status);
   1121         TEST_ASSERT(s1==s2);
   1122     }
   1123     {
   1124         UErrorCode status = U_ZERO_ERROR;
   1125         UnicodeSet s1("[:space:]", status);
   1126         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
   1127         TEST_ASSERT_SUCCESS(status);
   1128         TEST_ASSERT(s1==s2);
   1129     }
   1130     {
   1131         UErrorCode status = U_ZERO_ERROR;
   1132         UnicodeSet s1("[:blank:]", status);
   1133         TEST_ASSERT_SUCCESS(status);
   1134         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
   1135             status);
   1136         TEST_ASSERT_SUCCESS(status);
   1137         TEST_ASSERT(s1==s2);
   1138     }
   1139     {
   1140         UErrorCode status = U_ZERO_ERROR;
   1141         UnicodeSet s1("[:cntrl:]", status);
   1142         TEST_ASSERT_SUCCESS(status);
   1143         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
   1144         TEST_ASSERT_SUCCESS(status);
   1145         TEST_ASSERT(s1==s2);
   1146     }
   1147     {
   1148         UErrorCode status = U_ZERO_ERROR;
   1149         UnicodeSet s1("[:graph:]", status);
   1150         TEST_ASSERT_SUCCESS(status);
   1151         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
   1152         TEST_ASSERT_SUCCESS(status);
   1153         TEST_ASSERT(s1==s2);
   1154     }
   1155     {
   1156         UErrorCode status = U_ZERO_ERROR;
   1157         UnicodeSet s1("[:print:]", status);
   1158         TEST_ASSERT_SUCCESS(status);
   1159         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
   1160         TEST_ASSERT_SUCCESS(status);
   1161         TEST_ASSERT(s1==s2);
   1162     }
   1163 }
   1164 /**
   1165  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
   1166  */
   1167 void UnicodeSetTest::TestClone() {
   1168     UErrorCode ec = U_ZERO_ERROR;
   1169     UnicodeSet s("[abcxyz]", ec);
   1170     UnicodeSet t(s);
   1171     expectContainment(t, "abc", "def");
   1172 }
   1173 
   1174 /**
   1175  * Test the indexOf() and charAt() methods.
   1176  */
   1177 void UnicodeSetTest::TestIndexOf() {
   1178     UErrorCode ec = U_ZERO_ERROR;
   1179     UnicodeSet set("[a-cx-y3578]", ec);
   1180     if (U_FAILURE(ec)) {
   1181         errln("FAIL: UnicodeSet constructor");
   1182         return;
   1183     }
   1184     for (int32_t i=0; i<set.size(); ++i) {
   1185         UChar32 c = set.charAt(i);
   1186         if (set.indexOf(c) != i) {
   1187             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
   1188                 i, c, set.indexOf(c));
   1189         }
   1190     }
   1191     UChar32 c = set.charAt(set.size());
   1192     if (c != -1) {
   1193         errln("FAIL: charAt(<out of range>) = %X", c);
   1194     }
   1195     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
   1196     if (j != -1) {
   1197         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
   1198     }
   1199 }
   1200 
   1201 /**
   1202  * Test closure API.
   1203  */
   1204 void UnicodeSetTest::TestCloseOver() {
   1205     UErrorCode ec = U_ZERO_ERROR;
   1206 
   1207     char CASE[] = {(char)USET_CASE_INSENSITIVE};
   1208     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
   1209     const char* DATA[] = {
   1210         // selector, input, output
   1211         CASE,
   1212         "[aq\\u00DF{Bc}{bC}{Fi}]",
   1213         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
   1214 
   1215         CASE,
   1216         "[\\u01F1]", // 'DZ'
   1217         "[\\u01F1\\u01F2\\u01F3]",
   1218 
   1219         CASE,
   1220         "[\\u1FB4]",
   1221         "[\\u1FB4{\\u03AC\\u03B9}]",
   1222 
   1223         CASE,
   1224         "[{F\\uFB01}]",
   1225         "[\\uFB03{ffi}]",
   1226 
   1227         CASE, // make sure binary search finds limits
   1228         "[a\\uFF3A]",
   1229         "[aA\\uFF3A\\uFF5A]",
   1230 
   1231         CASE,
   1232         "[a-z]","[A-Za-z\\u017F\\u212A]",
   1233         CASE,
   1234         "[abc]","[A-Ca-c]",
   1235         CASE,
   1236         "[ABC]","[A-Ca-c]",
   1237 
   1238         CASE, "[i]", "[iI]",
   1239 
   1240         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
   1241         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
   1242 
   1243         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
   1244 
   1245         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
   1246 
   1247         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
   1248 
   1249         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
   1250 
   1251         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
   1252 
   1253         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
   1254 
   1255         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
   1256         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
   1257 
   1258         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
   1259 
   1260         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
   1261 
   1262         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
   1263 
   1264 #if !UCONFIG_NO_FILE_IO
   1265         CASE_MAPPINGS,
   1266         "[aq\\u00DF{Bc}{bC}{Fi}]",
   1267         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
   1268 #endif
   1269 
   1270         CASE_MAPPINGS,
   1271         "[\\u01F1]", // 'DZ'
   1272         "[\\u01F1\\u01F2\\u01F3]",
   1273 
   1274         CASE_MAPPINGS,
   1275         "[a-z]",
   1276         "[A-Za-z]",
   1277 
   1278         NULL
   1279     };
   1280 
   1281     UnicodeSet s;
   1282     UnicodeSet t;
   1283     UnicodeString buf;
   1284     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
   1285         int32_t selector = DATA[i][0];
   1286         UnicodeString pat(DATA[i+1], -1, US_INV);
   1287         UnicodeString exp(DATA[i+2], -1, US_INV);
   1288         s.applyPattern(pat, ec);
   1289         s.closeOver(selector);
   1290         t.applyPattern(exp, ec);
   1291         if (U_FAILURE(ec)) {
   1292             errln("FAIL: applyPattern failed");
   1293             continue;
   1294         }
   1295         if (s == t) {
   1296             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
   1297         } else {
   1298             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
   1299                   s.toPattern(buf, TRUE) + ", expected " + exp);
   1300         }
   1301     }
   1302 
   1303 #if 0
   1304     /*
   1305      * Unused test code.
   1306      * This was used to compare the old implementation (using USET_CASE)
   1307      * with the new one (using 0x100 temporarily)
   1308      * while transitioning from hardcoded case closure tables in uniset.cpp
   1309      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
   1310      * and using ucase.c functions for closure.
   1311      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
   1312      *
   1313      * Note: The old and new implementation never fully matched because
   1314      * the old implementation turned out to not map U+0130 and U+0131 correctly
   1315      * (dotted I and dotless i) and because the old implementation's data tables
   1316      * were outdated compared to Unicode 4.0.1 at the time of the change to the
   1317      * new implementation. (So sigmas and some other characters were not handled
   1318      * according to the newer Unicode version.)
   1319      */
   1320     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
   1321     UnicodeSetIterator si(sens);
   1322     UnicodeString str, buf2;
   1323     const UnicodeString *pStr;
   1324     UChar32 c;
   1325     while(si.next()) {
   1326         if(!si.isString()) {
   1327             c=si.getCodepoint();
   1328             s.clear();
   1329             s.add(c);
   1330 
   1331             str.setTo(c);
   1332             str.foldCase();
   1333             sens2.add(str);
   1334 
   1335             t=s;
   1336             s.closeOver(USET_CASE);
   1337             t.closeOver(0x100);
   1338             if(s!=t) {
   1339                 errln("FAIL: closeOver(U+%04x) differs: ", c);
   1340                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
   1341             }
   1342         }
   1343     }
   1344     // remove all code points
   1345     // should contain all full case folding mapping strings
   1346     sens2.remove(0, 0x10ffff);
   1347     si.reset(sens2);
   1348     while(si.next()) {
   1349         if(si.isString()) {
   1350             pStr=&si.getString();
   1351             s.clear();
   1352             s.add(*pStr);
   1353             t=s2=s;
   1354             s.closeOver(USET_CASE);
   1355             t.closeOver(0x100);
   1356             if(s!=t) {
   1357                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
   1358                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
   1359             }
   1360         }
   1361     }
   1362 #endif
   1363 
   1364     // Test the pattern API
   1365     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
   1366     if (U_FAILURE(ec)) {
   1367         errln("FAIL: applyPattern failed");
   1368     } else {
   1369         expectContainment(s, "abcABC", "defDEF");
   1370     }
   1371     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
   1372     if (U_FAILURE(ec)) {
   1373         errln("FAIL: constructor failed");
   1374     } else {
   1375         expectContainment(v, "defDEF", "abcABC");
   1376     }
   1377     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
   1378     if (U_FAILURE(ec)) {
   1379         errln("FAIL: construct w/case mappings failed");
   1380     } else {
   1381         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
   1382     }
   1383 }
   1384 
   1385 void UnicodeSetTest::TestEscapePattern() {
   1386     const char pattern[] =
   1387         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
   1388     const char exp[] =
   1389         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
   1390     // We test this with two passes; in the second pass we
   1391     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
   1392     // this fails -- which is what we expect.
   1393     for (int32_t pass=1; pass<=2; ++pass) {
   1394         UErrorCode ec = U_ZERO_ERROR;
   1395         UnicodeString pat(pattern, -1, US_INV);
   1396         if (pass==2) {
   1397             pat = pat.unescape();
   1398         }
   1399         // Pattern is only good for pass 1
   1400         UBool isPatternValid = (pass==1);
   1401 
   1402         UnicodeSet set(pat, ec);
   1403         if (U_SUCCESS(ec) != isPatternValid){
   1404             errln((UnicodeString)"FAIL: applyPattern(" +
   1405                   escape(pat) + ") => " +
   1406                   u_errorName(ec));
   1407             continue;
   1408         }
   1409         if (U_FAILURE(ec)) {
   1410             continue;
   1411         }
   1412         if (set.contains((UChar)0x0644)){
   1413             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
   1414         }
   1415 
   1416         UnicodeString newpat;
   1417         set.toPattern(newpat, TRUE);
   1418         if (newpat == UnicodeString(exp, -1, US_INV)) {
   1419             logln(escape(pat) + " => " + newpat);
   1420         } else {
   1421             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
   1422         }
   1423 
   1424         for (int32_t i=0; i<set.getRangeCount(); ++i) {
   1425             UnicodeString str("Range ");
   1426             str.append((UChar)(0x30 + i))
   1427                 .append(": ")
   1428                 .append((UChar32)set.getRangeStart(i))
   1429                 .append(" - ")
   1430                 .append((UChar32)set.getRangeEnd(i));
   1431             str = str + " (" + set.getRangeStart(i) + " - " +
   1432                 set.getRangeEnd(i) + ")";
   1433             if (set.getRangeStart(i) < 0) {
   1434                 errln((UnicodeString)"FAIL: " + escape(str));
   1435             } else {
   1436                 logln(escape(str));
   1437             }
   1438         }
   1439     }
   1440 }
   1441 
   1442 void UnicodeSetTest::expectRange(const UnicodeString& label,
   1443                                  const UnicodeSet& set,
   1444                                  UChar32 start, UChar32 end) {
   1445     UnicodeSet exp(start, end);
   1446     UnicodeString pat;
   1447     if (set == exp) {
   1448         logln(label + " => " + set.toPattern(pat, TRUE));
   1449     } else {
   1450         UnicodeString xpat;
   1451         errln((UnicodeString)"FAIL: " + label + " => " +
   1452               set.toPattern(pat, TRUE) +
   1453               ", expected " + exp.toPattern(xpat, TRUE));
   1454     }
   1455 }
   1456 
   1457 void UnicodeSetTest::TestInvalidCodePoint() {
   1458 
   1459     const UChar32 DATA[] = {
   1460         // Test range             Expected range
   1461         0, 0x10FFFF,              0, 0x10FFFF,
   1462         (UChar32)-1, 8,           0, 8,
   1463         8, 0x110000,              8, 0x10FFFF
   1464     };
   1465     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
   1466 
   1467     UnicodeString pat;
   1468     int32_t i;
   1469 
   1470     for (i=0; i<DATA_LENGTH; i+=4) {
   1471         UChar32 start  = DATA[i];
   1472         UChar32 end    = DATA[i+1];
   1473         UChar32 xstart = DATA[i+2];
   1474         UChar32 xend   = DATA[i+3];
   1475 
   1476         // Try various API using the test code points
   1477 
   1478         UnicodeSet set(start, end);
   1479         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
   1480                     set, xstart, xend);
   1481 
   1482         set.clear();
   1483         set.set(start, end);
   1484         expectRange((UnicodeString)"set(" + start + "," + end + ")",
   1485                     set, xstart, xend);
   1486 
   1487         UBool b = set.contains(start);
   1488         b = set.contains(start, end);
   1489         b = set.containsNone(start, end);
   1490         b = set.containsSome(start, end);
   1491         (void)b;   // Suppress set but not used warning.
   1492 
   1493         /*int32_t index = set.indexOf(start);*/
   1494 
   1495         set.clear();
   1496         set.add(start);
   1497         set.add(start, end);
   1498         expectRange((UnicodeString)"add(" + start + "," + end + ")",
   1499                     set, xstart, xend);
   1500 
   1501         set.set(0, 0x10FFFF);
   1502         set.retain(start, end);
   1503         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
   1504                     set, xstart, xend);
   1505         set.retain(start);
   1506 
   1507         set.set(0, 0x10FFFF);
   1508         set.remove(start);
   1509         set.remove(start, end);
   1510         set.complement();
   1511         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
   1512                     set, xstart, xend);
   1513 
   1514         set.set(0, 0x10FFFF);
   1515         set.complement(start, end);
   1516         set.complement();
   1517         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
   1518                     set, xstart, xend);
   1519         set.complement(start);
   1520     }
   1521 
   1522     const UChar32 DATA2[] = {
   1523         0,
   1524         0x10FFFF,
   1525         (UChar32)-1,
   1526         0x110000
   1527     };
   1528     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
   1529 
   1530     for (i=0; i<DATA2_LENGTH; ++i) {
   1531         UChar32 c = DATA2[i], end = 0x10FFFF;
   1532         UBool valid = (c >= 0 && c <= 0x10FFFF);
   1533 
   1534         UnicodeSet set(0, 0x10FFFF);
   1535 
   1536         // For single-codepoint contains, invalid codepoints are NOT contained
   1537         UBool b = set.contains(c);
   1538         if (b == valid) {
   1539             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
   1540                   ") = " + b);
   1541         } else {
   1542             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
   1543                   ") = " + b);
   1544         }
   1545 
   1546         // For codepoint range contains, containsNone, and containsSome,
   1547         // invalid or empty (start > end) ranges have UNDEFINED behavior.
   1548         b = set.contains(c, end);
   1549         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
   1550               "," + end + ") = " + b);
   1551 
   1552         b = set.containsNone(c, end);
   1553         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
   1554               "," + end + ") = " + b);
   1555 
   1556         b = set.containsSome(c, end);
   1557         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
   1558               "," + end + ") = " + b);
   1559 
   1560         int32_t index = set.indexOf(c);
   1561         if ((index >= 0) == valid) {
   1562             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
   1563                   ") = " + index);
   1564         } else {
   1565             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
   1566                   ") = " + index);
   1567         }
   1568     }
   1569 }
   1570 
   1571 // Used by TestSymbolTable
   1572 class TokenSymbolTable : public SymbolTable {
   1573 public:
   1574     Hashtable contents;
   1575 
   1576     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
   1577         contents.setValueDeleter(uprv_deleteUObject);
   1578     }
   1579 
   1580     ~TokenSymbolTable() {}
   1581 
   1582     /**
   1583      * (Non-SymbolTable API) Add the given variable and value to
   1584      * the table.  Variable should NOT contain leading '$'.
   1585      */
   1586     void add(const UnicodeString& var, const UnicodeString& value,
   1587              UErrorCode& ec) {
   1588         if (U_SUCCESS(ec)) {
   1589             contents.put(var, new UnicodeString(value), ec);
   1590         }
   1591     }
   1592 
   1593     /**
   1594      * SymbolTable API
   1595      */
   1596     virtual const UnicodeString* lookup(const UnicodeString& s) const {
   1597         return (const UnicodeString*) contents.get(s);
   1598     }
   1599 
   1600     /**
   1601      * SymbolTable API
   1602      */
   1603     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
   1604         return NULL;
   1605     }
   1606 
   1607     /**
   1608      * SymbolTable API
   1609      */
   1610     virtual UnicodeString parseReference(const UnicodeString& text,
   1611                                          ParsePosition& pos, int32_t limit) const {
   1612         int32_t start = pos.getIndex();
   1613         int32_t i = start;
   1614         UnicodeString result;
   1615         while (i < limit) {
   1616             UChar c = text.charAt(i);
   1617             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
   1618                 break;
   1619             }
   1620             ++i;
   1621         }
   1622         if (i == start) { // No valid name chars
   1623             return result; // Indicate failure with empty string
   1624         }
   1625         pos.setIndex(i);
   1626         text.extractBetween(start, i, result);
   1627         return result;
   1628     }
   1629 };
   1630 
   1631 void UnicodeSetTest::TestSymbolTable() {
   1632     // Multiple test cases can be set up here.  Each test case
   1633     // is terminated by null:
   1634     // var, value, var, value,..., input pat., exp. output pat., null
   1635     const char* DATA[] = {
   1636         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
   1637         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
   1638         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
   1639         NULL
   1640     };
   1641 
   1642     for (int32_t i=0; DATA[i]!=NULL; ++i) {
   1643         UErrorCode ec = U_ZERO_ERROR;
   1644         TokenSymbolTable sym(ec);
   1645         if (U_FAILURE(ec)) {
   1646             errln("FAIL: couldn't construct TokenSymbolTable");
   1647             continue;
   1648         }
   1649 
   1650         // Set up variables
   1651         while (DATA[i+2] != NULL) {
   1652             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
   1653             if (U_FAILURE(ec)) {
   1654                 errln("FAIL: couldn't add to TokenSymbolTable");
   1655                 continue;
   1656             }
   1657             i += 2;
   1658         }
   1659 
   1660         // Input pattern and expected output pattern
   1661         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
   1662         i += 2;
   1663 
   1664         ParsePosition pos(0);
   1665         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
   1666         if (U_FAILURE(ec)) {
   1667             errln("FAIL: couldn't construct UnicodeSet");
   1668             continue;
   1669         }
   1670 
   1671         // results
   1672         if (pos.getIndex() != inpat.length()) {
   1673             errln((UnicodeString)"Failed to read to end of string \""
   1674                   + inpat + "\": read to "
   1675                   + pos.getIndex() + ", length is "
   1676                   + inpat.length());
   1677         }
   1678 
   1679         UnicodeSet us2(exppat, ec);
   1680         if (U_FAILURE(ec)) {
   1681             errln("FAIL: couldn't construct expected UnicodeSet");
   1682             continue;
   1683         }
   1684 
   1685         UnicodeString a, b;
   1686         if (us != us2) {
   1687             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
   1688                   ", expected " + us2.toPattern(b, TRUE));
   1689         } else {
   1690             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
   1691         }
   1692     }
   1693 }
   1694 
   1695 void UnicodeSetTest::TestSurrogate() {
   1696     const char* DATA[] = {
   1697         // These should all behave identically
   1698         "[abc\\uD800\\uDC00]",
   1699         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
   1700         "[abc\\U00010000]",
   1701         0
   1702     };
   1703     for (int i=0; DATA[i] != 0; ++i) {
   1704         UErrorCode ec = U_ZERO_ERROR;
   1705         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
   1706         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
   1707         UnicodeSet set(str, ec);
   1708         if (U_FAILURE(ec)) {
   1709             errln("FAIL: UnicodeSet constructor");
   1710             continue;
   1711         }
   1712         expectContainment(set,
   1713                           CharsToUnicodeString("abc\\U00010000"),
   1714                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
   1715         if (set.size() != 4) {
   1716             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
   1717                   set.size() + ", expected 4");
   1718         }
   1719     }
   1720 }
   1721 
   1722 void UnicodeSetTest::TestExhaustive() {
   1723     // exhaustive tests. Simulate UnicodeSets with integers.
   1724     // That gives us very solid tests (except for large memory tests).
   1725 
   1726     int32_t limit = 128;
   1727 
   1728     UnicodeSet x, y, z, aa;
   1729 
   1730     for (int32_t i = 0; i < limit; ++i) {
   1731         bitsToSet(i, x);
   1732         logln((UnicodeString)"Testing " + i + ", " + x);
   1733         _testComplement(i, x, y);
   1734 
   1735         // AS LONG AS WE ARE HERE, check roundtrip
   1736         checkRoundTrip(bitsToSet(i, aa));
   1737 
   1738         for (int32_t j = 0; j < limit; ++j) {
   1739             _testAdd(i,j,  x,y,z);
   1740             _testXor(i,j,  x,y,z);
   1741             _testRetain(i,j,  x,y,z);
   1742             _testRemove(i,j,  x,y,z);
   1743         }
   1744     }
   1745 }
   1746 
   1747 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
   1748     bitsToSet(a, x);
   1749     z = x;
   1750     z.complement();
   1751     int32_t c = setToBits(z);
   1752     if (c != (~a)) {
   1753         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
   1754         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
   1755     }
   1756     checkCanonicalRep(z, (UnicodeString)"complement " + a);
   1757 }
   1758 
   1759 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1760     bitsToSet(a, x);
   1761     bitsToSet(b, y);
   1762     z = x;
   1763     z.addAll(y);
   1764     int32_t c = setToBits(z);
   1765     if (c != (a | b)) {
   1766         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
   1767         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
   1768     }
   1769     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
   1770 }
   1771 
   1772 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1773     bitsToSet(a, x);
   1774     bitsToSet(b, y);
   1775     z = x;
   1776     z.retainAll(y);
   1777     int32_t c = setToBits(z);
   1778     if (c != (a & b)) {
   1779         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
   1780         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
   1781     }
   1782     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
   1783 }
   1784 
   1785 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1786     bitsToSet(a, x);
   1787     bitsToSet(b, y);
   1788     z = x;
   1789     z.removeAll(y);
   1790     int32_t c = setToBits(z);
   1791     if (c != (a &~ b)) {
   1792         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
   1793         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
   1794     }
   1795     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
   1796 }
   1797 
   1798 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1799     bitsToSet(a, x);
   1800     bitsToSet(b, y);
   1801     z = x;
   1802     z.complementAll(y);
   1803     int32_t c = setToBits(z);
   1804     if (c != (a ^ b)) {
   1805         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
   1806         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
   1807     }
   1808     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
   1809 }
   1810 
   1811 /**
   1812  * Check that ranges are monotonically increasing and non-
   1813  * overlapping.
   1814  */
   1815 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
   1816     int32_t n = set.getRangeCount();
   1817     if (n < 0) {
   1818         errln((UnicodeString)"FAIL result of " + msg +
   1819               ": range count should be >= 0 but is " +
   1820               n /*+ " for " + set.toPattern())*/);
   1821         return;
   1822     }
   1823     UChar32 last = 0;
   1824     for (int32_t i=0; i<n; ++i) {
   1825         UChar32 start = set.getRangeStart(i);
   1826         UChar32 end = set.getRangeEnd(i);
   1827         if (start > end) {
   1828             errln((UnicodeString)"FAIL result of " + msg +
   1829                   ": range " + (i+1) +
   1830                   " start > end: " + (int)start + ", " + (int)end +
   1831                   " for " + set);
   1832         }
   1833         if (i > 0 && start <= last) {
   1834             errln((UnicodeString)"FAIL result of " + msg +
   1835                   ": range " + (i+1) +
   1836                   " overlaps previous range: " + (int)start + ", " + (int)end +
   1837                   " for " + set);
   1838         }
   1839         last = end;
   1840     }
   1841 }
   1842 
   1843 /**
   1844  * Convert a bitmask to a UnicodeSet.
   1845  */
   1846 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
   1847     result.clear();
   1848     for (UChar32 i = 0; i < 32; ++i) {
   1849         if ((a & (1<<i)) != 0) {
   1850             result.add(i);
   1851         }
   1852     }
   1853     return result;
   1854 }
   1855 
   1856 /**
   1857  * Convert a UnicodeSet to a bitmask.  Only the characters
   1858  * U+0000 to U+0020 are represented in the bitmask.
   1859  */
   1860 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
   1861     int32_t result = 0;
   1862     for (int32_t i = 0; i < 32; ++i) {
   1863         if (x.contains((UChar32)i)) {
   1864             result |= (1<<i);
   1865         }
   1866     }
   1867     return result;
   1868 }
   1869 
   1870 /**
   1871  * Return the representation of an inversion list based UnicodeSet
   1872  * as a pairs list.  Ranges are listed in ascending Unicode order.
   1873  * For example, the set [a-zA-M3] is represented as "33AMaz".
   1874  */
   1875 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
   1876     UnicodeString pairs;
   1877     for (int32_t i=0; i<set.getRangeCount(); ++i) {
   1878         UChar32 start = set.getRangeStart(i);
   1879         UChar32 end = set.getRangeEnd(i);
   1880         if (end > 0xFFFF) {
   1881             end = 0xFFFF;
   1882             i = set.getRangeCount(); // Should be unnecessary
   1883         }
   1884         pairs.append((UChar)start).append((UChar)end);
   1885     }
   1886     return pairs;
   1887 }
   1888 
   1889 /**
   1890  * Basic consistency check for a few items.
   1891  * That the iterator works, and that we can create a pattern and
   1892  * get the same thing back
   1893  */
   1894 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
   1895     UErrorCode ec = U_ZERO_ERROR;
   1896 
   1897     UnicodeSet t(s);
   1898     checkEqual(s, t, "copy ct");
   1899 
   1900     t = s;
   1901     checkEqual(s, t, "operator=");
   1902 
   1903     copyWithIterator(t, s, FALSE);
   1904     checkEqual(s, t, "iterator roundtrip");
   1905 
   1906     copyWithIterator(t, s, TRUE); // try range
   1907     checkEqual(s, t, "iterator roundtrip");
   1908 
   1909     UnicodeString pat; s.toPattern(pat, FALSE);
   1910     t.applyPattern(pat, ec);
   1911     if (U_FAILURE(ec)) {
   1912         errln("FAIL: applyPattern");
   1913         return;
   1914     } else {
   1915         checkEqual(s, t, "toPattern(false)");
   1916     }
   1917 
   1918     s.toPattern(pat, TRUE);
   1919     t.applyPattern(pat, ec);
   1920     if (U_FAILURE(ec)) {
   1921         errln("FAIL: applyPattern");
   1922         return;
   1923     } else {
   1924         checkEqual(s, t, "toPattern(true)");
   1925     }
   1926 }
   1927 
   1928 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
   1929     t.clear();
   1930     UnicodeSetIterator it(s);
   1931     if (withRange) {
   1932         while (it.nextRange()) {
   1933             if (it.isString()) {
   1934                 t.add(it.getString());
   1935             } else {
   1936                 t.add(it.getCodepoint(), it.getCodepointEnd());
   1937             }
   1938         }
   1939     } else {
   1940         while (it.next()) {
   1941             if (it.isString()) {
   1942                 t.add(it.getString());
   1943             } else {
   1944                 t.add(it.getCodepoint());
   1945             }
   1946         }
   1947     }
   1948 }
   1949 
   1950 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
   1951     UnicodeString source; s.toPattern(source, TRUE);
   1952     UnicodeString result; t.toPattern(result, TRUE);
   1953     if (s != t) {
   1954         errln((UnicodeString)"FAIL: " + message
   1955               + "; source = " + source
   1956               + "; result = " + result
   1957               );
   1958         return FALSE;
   1959     } else {
   1960         logln((UnicodeString)"Ok: " + message
   1961               + "; source = " + source
   1962               + "; result = " + result
   1963               );
   1964     }
   1965     return TRUE;
   1966 }
   1967 
   1968 void
   1969 UnicodeSetTest::expectContainment(const UnicodeString& pat,
   1970                                   const UnicodeString& charsIn,
   1971                                   const UnicodeString& charsOut) {
   1972     UErrorCode ec = U_ZERO_ERROR;
   1973     UnicodeSet set(pat, ec);
   1974     if (U_FAILURE(ec)) {
   1975         dataerrln((UnicodeString)"FAIL: pattern \"" +
   1976               pat + "\" => " + u_errorName(ec));
   1977         return;
   1978     }
   1979     expectContainment(set, pat, charsIn, charsOut);
   1980 }
   1981 
   1982 void
   1983 UnicodeSetTest::expectContainment(const UnicodeSet& set,
   1984                                   const UnicodeString& charsIn,
   1985                                   const UnicodeString& charsOut) {
   1986     UnicodeString pat;
   1987     set.toPattern(pat);
   1988     expectContainment(set, pat, charsIn, charsOut);
   1989 }
   1990 
   1991 void
   1992 UnicodeSetTest::expectContainment(const UnicodeSet& set,
   1993                                   const UnicodeString& setName,
   1994                                   const UnicodeString& charsIn,
   1995                                   const UnicodeString& charsOut) {
   1996     UnicodeString bad;
   1997     UChar32 c;
   1998     int32_t i;
   1999 
   2000     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
   2001         c = charsIn.char32At(i);
   2002         if (!set.contains(c)) {
   2003             bad.append(c);
   2004         }
   2005     }
   2006     if (bad.length() > 0) {
   2007         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
   2008               ", expected containment of " + prettify(charsIn));
   2009     } else {
   2010         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
   2011     }
   2012 
   2013     bad.truncate(0);
   2014     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
   2015         c = charsOut.char32At(i);
   2016         if (set.contains(c)) {
   2017             bad.append(c);
   2018         }
   2019     }
   2020     if (bad.length() > 0) {
   2021         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
   2022               ", expected non-containment of " + prettify(charsOut));
   2023     } else {
   2024         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
   2025     }
   2026 }
   2027 
   2028 void
   2029 UnicodeSetTest::expectPattern(UnicodeSet& set,
   2030                               const UnicodeString& pattern,
   2031                               const UnicodeString& expectedPairs){
   2032     UErrorCode status = U_ZERO_ERROR;
   2033     set.applyPattern(pattern, status);
   2034     if (U_FAILURE(status)) {
   2035         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
   2036               "\") failed");
   2037         return;
   2038     } else {
   2039         if (getPairs(set) != expectedPairs ) {
   2040             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
   2041                   "\") => pairs \"" +
   2042                   escape(getPairs(set)) + "\", expected \"" +
   2043                   escape(expectedPairs) + "\"");
   2044         } else {
   2045             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
   2046                   "\") => pairs \"" +
   2047                   escape(getPairs(set)) + "\"");
   2048         }
   2049     }
   2050     // the result of calling set.toPattern(), which is the string representation of
   2051     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
   2052     // will produce another set that is equal to this one.
   2053     UnicodeString temppattern;
   2054     set.toPattern(temppattern);
   2055     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
   2056     if (U_FAILURE(status)) {
   2057         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
   2058         return;
   2059     }
   2060     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
   2061         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
   2062             escape(getPairs(set)) + "\""));
   2063     } else{
   2064         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
   2065     }
   2066 
   2067     delete tempset;
   2068 
   2069 }
   2070 
   2071 void
   2072 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
   2073     if (getPairs(set) != expectedPairs) {
   2074         errln(UnicodeString("FAIL: Expected pair list \"") +
   2075               escape(expectedPairs) + "\", got \"" +
   2076               escape(getPairs(set)) + "\"");
   2077     }
   2078 }
   2079 
   2080 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
   2081                                      const UnicodeString& expPat,
   2082                                      const char** expStrings) {
   2083     UnicodeString pat;
   2084     set.toPattern(pat, TRUE);
   2085     if (pat == expPat) {
   2086         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
   2087     } else {
   2088         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
   2089         return;
   2090     }
   2091     if (expStrings == NULL) {
   2092         return;
   2093     }
   2094     UBool in = TRUE;
   2095     for (int32_t i=0; expStrings[i] != NULL; ++i) {
   2096         if (expStrings[i] == NOT) { // sic; pointer comparison
   2097             in = FALSE;
   2098             continue;
   2099         }
   2100         UnicodeString s = CharsToUnicodeString(expStrings[i]);
   2101         UBool contained = set.contains(s);
   2102         if (contained == in) {
   2103             logln((UnicodeString)"Ok: " + expPat +
   2104                   (contained ? " contains {" : " does not contain {") +
   2105                   escape(expStrings[i]) + "}");
   2106         } else {
   2107             errln((UnicodeString)"FAIL: " + expPat +
   2108                   (contained ? " contains {" : " does not contain {") +
   2109                   escape(expStrings[i]) + "}");
   2110         }
   2111     }
   2112 }
   2113 
   2114 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
   2115 
   2116 void
   2117 UnicodeSetTest::doAssert(UBool condition, const char *message)
   2118 {
   2119     if (!condition) {
   2120         errln(UnicodeString("ERROR : ") + message);
   2121     }
   2122 }
   2123 
   2124 UnicodeString
   2125 UnicodeSetTest::escape(const UnicodeString& s) {
   2126     UnicodeString buf;
   2127     for (int32_t i=0; i<s.length(); )
   2128     {
   2129         UChar32 c = s.char32At(i);
   2130         if (0x0020 <= c && c <= 0x007F) {
   2131             buf += c;
   2132         } else {
   2133             if (c <= 0xFFFF) {
   2134                 buf += (UChar)0x5c; buf += (UChar)0x75;
   2135             } else {
   2136                 buf += (UChar)0x5c; buf += (UChar)0x55;
   2137                 buf += toHexString((c & 0xF0000000) >> 28);
   2138                 buf += toHexString((c & 0x0F000000) >> 24);
   2139                 buf += toHexString((c & 0x00F00000) >> 20);
   2140                 buf += toHexString((c & 0x000F0000) >> 16);
   2141             }
   2142             buf += toHexString((c & 0xF000) >> 12);
   2143             buf += toHexString((c & 0x0F00) >> 8);
   2144             buf += toHexString((c & 0x00F0) >> 4);
   2145             buf += toHexString(c & 0x000F);
   2146         }
   2147         i += U16_LENGTH(c);
   2148     }
   2149     return buf;
   2150 }
   2151 
   2152 void UnicodeSetTest::TestFreezable() {
   2153     UErrorCode errorCode=U_ZERO_ERROR;
   2154     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
   2155     UnicodeSet idSet(idPattern, errorCode);
   2156     if(U_FAILURE(errorCode)) {
   2157         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
   2158         return;
   2159     }
   2160 
   2161     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
   2162     UnicodeSet wsSet(wsPattern, errorCode);
   2163     if(U_FAILURE(errorCode)) {
   2164         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
   2165         return;
   2166     }
   2167 
   2168     idSet.add(idPattern);
   2169     UnicodeSet frozen(idSet);
   2170     frozen.freeze();
   2171 
   2172     if(idSet.isFrozen() || !frozen.isFrozen()) {
   2173         errln("FAIL: isFrozen() is wrong");
   2174     }
   2175     if(frozen!=idSet || !(frozen==idSet)) {
   2176         errln("FAIL: a copy-constructed frozen set differs from its original");
   2177     }
   2178 
   2179     frozen=wsSet;
   2180     if(frozen!=idSet || !(frozen==idSet)) {
   2181         errln("FAIL: a frozen set was modified by operator=");
   2182     }
   2183 
   2184     UnicodeSet frozen2(frozen);
   2185     if(frozen2!=frozen || frozen2!=idSet) {
   2186         errln("FAIL: a copied frozen set differs from its frozen original");
   2187     }
   2188     if(!frozen2.isFrozen()) {
   2189         errln("FAIL: copy-constructing a frozen set results in a thawed one");
   2190     }
   2191     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
   2192     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
   2193         errln("FAIL: UnicodeSet(5, 55) failed");
   2194     }
   2195     frozen3=frozen;
   2196     if(!frozen3.isFrozen()) {
   2197         errln("FAIL: copying a frozen set results in a thawed one");
   2198     }
   2199 
   2200     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
   2201     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
   2202         errln("FAIL: clone() failed");
   2203     }
   2204     cloned->add(0xd802, 0xd805);
   2205     if(cloned->containsSome(0xd802, 0xd805)) {
   2206         errln("FAIL: unable to modify clone");
   2207     }
   2208     delete cloned;
   2209 
   2210     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
   2211     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
   2212         errln("FAIL: cloneAsThawed() failed");
   2213     }
   2214     thawed->add(0xd802, 0xd805);
   2215     if(!thawed->contains(0xd802, 0xd805)) {
   2216         errln("FAIL: unable to modify thawed clone");
   2217     }
   2218     delete thawed;
   2219 
   2220     frozen.set(5, 55);
   2221     if(frozen!=idSet || !(frozen==idSet)) {
   2222         errln("FAIL: UnicodeSet::set() modified a frozen set");
   2223     }
   2224 
   2225     frozen.clear();
   2226     if(frozen!=idSet || !(frozen==idSet)) {
   2227         errln("FAIL: UnicodeSet::clear() modified a frozen set");
   2228     }
   2229 
   2230     frozen.closeOver(USET_CASE_INSENSITIVE);
   2231     if(frozen!=idSet || !(frozen==idSet)) {
   2232         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
   2233     }
   2234 
   2235     frozen.compact();
   2236     if(frozen!=idSet || !(frozen==idSet)) {
   2237         errln("FAIL: UnicodeSet::compact() modified a frozen set");
   2238     }
   2239 
   2240     ParsePosition pos;
   2241     frozen.
   2242         applyPattern(wsPattern, errorCode).
   2243         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
   2244         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
   2245         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
   2246         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
   2247     if(frozen!=idSet || !(frozen==idSet)) {
   2248         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
   2249     }
   2250 
   2251     frozen.
   2252         add(0xd800).
   2253         add(0xd802, 0xd805).
   2254         add(wsPattern).
   2255         addAll(idPattern).
   2256         addAll(wsSet);
   2257     if(frozen!=idSet || !(frozen==idSet)) {
   2258         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
   2259     }
   2260 
   2261     frozen.
   2262         retain(0x62).
   2263         retain(0x64, 0x69).
   2264         retainAll(wsPattern).
   2265         retainAll(wsSet);
   2266     if(frozen!=idSet || !(frozen==idSet)) {
   2267         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
   2268     }
   2269 
   2270     frozen.
   2271         remove(0x62).
   2272         remove(0x64, 0x69).
   2273         remove(idPattern).
   2274         removeAll(idPattern).
   2275         removeAll(idSet);
   2276     if(frozen!=idSet || !(frozen==idSet)) {
   2277         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
   2278     }
   2279 
   2280     frozen.
   2281         complement().
   2282         complement(0x62).
   2283         complement(0x64, 0x69).
   2284         complement(idPattern).
   2285         complementAll(idPattern).
   2286         complementAll(idSet);
   2287     if(frozen!=idSet || !(frozen==idSet)) {
   2288         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
   2289     }
   2290 }
   2291 
   2292 // Test span() etc. -------------------------------------------------------- ***
   2293 
   2294 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
   2295 static int32_t
   2296 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
   2297     UErrorCode errorCode=U_ZERO_ERROR;
   2298     int32_t length8=0;
   2299     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
   2300     if(U_SUCCESS(errorCode)) {
   2301         return length8;
   2302     } else {
   2303         // The string contains an unpaired surrogate.
   2304         // Ignore this string.
   2305         return 0;
   2306     }
   2307 }
   2308 
   2309 class UnicodeSetWithStringsIterator;
   2310 
   2311 // Make the strings in a UnicodeSet easily accessible.
   2312 class UnicodeSetWithStrings {
   2313 public:
   2314     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
   2315             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
   2316         int32_t size=set.size();
   2317         if(size>0 && set.charAt(size-1)<0) {
   2318             // If a set's last element is not a code point, then it must contain strings.
   2319             // Iterate over the set, skip all code point ranges, and cache the strings.
   2320             // Convert them to UTF-8 for spanUTF8().
   2321             UnicodeSetIterator iter(set);
   2322             const UnicodeString *s;
   2323             char *s8=utf8;
   2324             int32_t length8, utf8Count=0;
   2325             while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
   2326                 if(iter.isString()) {
   2327                     // Store the pointer to the set's string element
   2328                     // which we happen to know is a stable pointer.
   2329                     strings[stringsLength]=s=&iter.getString();
   2330                     utf8Count+=
   2331                         utf8Lengths[stringsLength]=length8=
   2332                         appendUTF8(s->getBuffer(), s->length(),
   2333                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
   2334                     if(length8==0) {
   2335                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
   2336                     }
   2337                     s8+=length8;
   2338                     ++stringsLength;
   2339                 }
   2340             }
   2341         }
   2342     }
   2343 
   2344     const UnicodeSet &getSet() const {
   2345         return set;
   2346     }
   2347 
   2348     UBool hasStrings() const {
   2349         return (UBool)(stringsLength>0);
   2350     }
   2351 
   2352     UBool hasStringsWithSurrogates() const {
   2353         return hasSurrogates;
   2354     }
   2355 
   2356 private:
   2357     friend class UnicodeSetWithStringsIterator;
   2358 
   2359     const UnicodeSet &set;
   2360 
   2361     const UnicodeString *strings[20];
   2362     int32_t stringsLength;
   2363     UBool hasSurrogates;
   2364 
   2365     char utf8[1024];
   2366     int32_t utf8Lengths[20];
   2367 };
   2368 
   2369 class UnicodeSetWithStringsIterator {
   2370 public:
   2371     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
   2372             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
   2373     }
   2374 
   2375     void reset() {
   2376         nextStringIndex=nextUTF8Start=0;
   2377     }
   2378 
   2379     const UnicodeString *nextString() {
   2380         if(nextStringIndex<fSet.stringsLength) {
   2381             return fSet.strings[nextStringIndex++];
   2382         } else {
   2383             return NULL;
   2384         }
   2385     }
   2386 
   2387     // Do not mix with calls to nextString().
   2388     const char *nextUTF8(int32_t &length) {
   2389         if(nextStringIndex<fSet.stringsLength) {
   2390             const char *s8=fSet.utf8+nextUTF8Start;
   2391             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
   2392             return s8;
   2393         } else {
   2394             length=0;
   2395             return NULL;
   2396         }
   2397     }
   2398 
   2399 private:
   2400     const UnicodeSetWithStrings &fSet;
   2401     int32_t nextStringIndex;
   2402     int32_t nextUTF8Start;
   2403 };
   2404 
   2405 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
   2406 // at code point boundaries.
   2407 // That is, each edge of a match must not be in the middle of a surrogate pair.
   2408 static inline UBool
   2409 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
   2410     s+=start;
   2411     limit-=start;
   2412     int32_t length=t.length();
   2413     return 0==t.compare(s, length) &&
   2414            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
   2415            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
   2416 }
   2417 
   2418 // Implement span() with contains() for comparison.
        //
        // Walks the UTF-16 text s[0..length) from the front and returns the index at which
        // the span for spanCondition ends, using only realSet.contains() and explicit
        // comparisons against the strings cached in set.
        //
        //   set            the set together with its cached strings
        //   s, length      the text and its length in UChars
        //   spanCondition  USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
        //
        // When the set has strings, USET_SPAN_CONTAINED follows every string match
        // (recursing for all but the shortest match at a position) and returns the
        // largest limit reached, while USET_SPAN_SIMPLE continues after the longest
        // match at each position.
   2419 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
   2420                                  USetSpanCondition spanCondition) {
   2421     const UnicodeSet &realSet(set.getSet());
   2422     if(!set.hasStrings()) {
                // Code points only: advance while contains() agrees with the condition.
   2423         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2424             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2425         }
   2426 
   2427         UChar32 c;
   2428         int32_t start=0, prev;
   2429         while((prev=start)<length) {
   2430             U16_NEXT(s, start, length, c);
   2431             if(realSet.contains(c)!=spanCondition) {
   2432                 break;
   2433             }
   2434         }
                // prev is the start of the code point that ended the span, or length.
   2435         return prev;
   2436     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
                // Stop at the first index where a set code point or a set string begins.
   2437         UnicodeSetWithStringsIterator iter(set);
   2438         UChar32 c;
   2439         int32_t start, next;
   2440         for(start=next=0; start<length;) {
   2441             U16_NEXT(s, next, length, c);
   2442             if(realSet.contains(c)) {
   2443                 break;
   2444             }
   2445             const UnicodeString *str;
   2446             iter.reset();
   2447             while((str=iter.nextString())!=NULL) {
   2448                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
   2449                     // spanNeedsStrings=TRUE;
   2450                     return start;
   2451                 }
   2452             }
   2453             start=next;
   2454         }
   2455         return start;
   2456     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2457         UnicodeSetWithStringsIterator iter(set);
   2458         UChar32 c;
                // start: current position; next: where to continue from start
                // (next==start means nothing matched at start so far);
                // maxSpanLimit: largest limit found by the recursive calls.
   2459         int32_t start, next, maxSpanLimit=0;
   2460         for(start=next=0; start<length;) {
   2461             U16_NEXT(s, next, length, c);
   2462             if(!realSet.contains(c)) {
   2463                 next=start;  // Do not span this single, not-contained code point.
   2464             }
   2465             const UnicodeString *str;
   2466             iter.reset();
   2467             while((str=iter.nextString())!=NULL) {
   2468                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
   2469                     // spanNeedsStrings=TRUE;
   2470                     int32_t matchLimit=start+str->length();
   2471                     if(matchLimit==length) {
                                // A match that reaches the end of the text spans everything.
   2472                         return length;
   2473                     }
   2474                     if(spanCondition==USET_SPAN_CONTAINED) {
   2475                         // Iterate for the shortest match at each position.
   2476                         // Recurse for each but the shortest match.
   2477                         if(next==start) {
   2478                             next=matchLimit;  // First match from start.
   2479                         } else {
   2480                             if(matchLimit<next) {
   2481                                 // Remember shortest match from start for iteration.
   2482                                 int32_t temp=next;
   2483                                 next=matchLimit;
   2484                                 matchLimit=temp;
   2485                             }
   2486                             // Recurse for non-shortest match from start.
   2487                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
   2488                                                                  USET_SPAN_CONTAINED);
   2489                             if((matchLimit+spanLength)>maxSpanLimit) {
   2490                                 maxSpanLimit=matchLimit+spanLength;
   2491                                 if(maxSpanLimit==length) {
   2492                                     return length;
   2493                                 }
   2494                             }
   2495                         }
   2496                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2497                         if(matchLimit>next) {
   2498                             // Remember longest match from start.
   2499                             next=matchLimit;
   2500                         }
   2501                     }
   2502                 }
   2503             }
   2504             if(next==start) {
   2505                 break;  // No match from start.
   2506             }
   2507             start=next;
   2508         }
                // The result is the larger of the iterated position and the best recursive limit.
   2509         if(start>maxSpanLimit) {
   2510             return start;
   2511         } else {
   2512             return maxSpanLimit;
   2513         }
   2514     }
   2515 }
   2516 
        // Backward counterpart of containsSpanUTF16(): walks the UTF-16 text s[0..length)
        // from the end toward the start and returns the index at which the backward span
        // for spanCondition begins, using only realSet.contains() and explicit comparisons
        // against the strings cached in set. Returns 0 immediately for an empty text.
        //
        // Inside the loops, length is moved backward by U16_PREV() while prev holds the
        // limit of the span found so far; length0 keeps the original text length for
        // the boundary check in matches16CPB().
   2517 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
   2518                                      USetSpanCondition spanCondition) {
   2519     if(length==0) {
   2520         return 0;
   2521     }
   2522     const UnicodeSet &realSet(set.getSet());
   2523     if(!set.hasStrings()) {
                // Code points only: step backward while contains() agrees with the condition.
   2524         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2525             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2526         }
   2527 
   2528         UChar32 c;
   2529         int32_t prev=length;
   2530         do {
   2531             U16_PREV(s, 0, length, c);
   2532             if(realSet.contains(c)!=spanCondition) {
   2533                 break;
   2534             }
   2535         } while((prev=length)>0);
   2536         return prev;
   2537     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
                // Stop at the first index (from the end) where a set code point
                // or a set string ends.
   2538         UnicodeSetWithStringsIterator iter(set);
   2539         UChar32 c;
   2540         int32_t prev=length, length0=length;
   2541         do {
   2542             U16_PREV(s, 0, length, c);
   2543             if(realSet.contains(c)) {
   2544                 break;
   2545             }
   2546             const UnicodeString *str;
   2547             iter.reset();
   2548             while((str=iter.nextString())!=NULL) {
   2549                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
   2550                     // spanNeedsStrings=TRUE;
   2551                     return prev;
   2552                 }
   2553             }
   2554         } while((prev=length)>0);
   2555         return prev;
   2556     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2557         UnicodeSetWithStringsIterator iter(set);
   2558         UChar32 c;
                // minSpanStart: smallest start index found by the recursive calls.
   2559         int32_t prev=length, minSpanStart=length, length0=length;
   2560         do {
   2561             U16_PREV(s, 0, length, c);
   2562             if(!realSet.contains(c)) {
   2563                 length=prev;  // Do not span this single, not-contained code point.
   2564             }
   2565             const UnicodeString *str;
   2566             iter.reset();
   2567             while((str=iter.nextString())!=NULL) {
   2568                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
   2569                     // spanNeedsStrings=TRUE;
   2570                     int32_t matchStart=prev-str->length();
   2571                     if(matchStart==0) {
                                // A match that reaches the start of the text spans everything.
   2572                         return 0;
   2573                     }
   2574                     if(spanCondition==USET_SPAN_CONTAINED) {
   2575                         // Iterate for the shortest match at each position.
   2576                         // Recurse for each but the shortest match.
   2577                         if(length==prev) {
   2578                             length=matchStart;  // First match from prev.
   2579                         } else {
   2580                             if(matchStart>length) {
   2581                                 // Remember shortest match from prev for iteration.
   2582                                 int32_t temp=length;
   2583                                 length=matchStart;
   2584                                 matchStart=temp;
   2585                             }
   2586                             // Recurse for non-shortest match from prev.
   2587                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
   2588                                                                     USET_SPAN_CONTAINED);
   2589                             if(spanStart<minSpanStart) {
   2590                                 minSpanStart=spanStart;
   2591                                 if(minSpanStart==0) {
   2592                                     return 0;
   2593                                 }
   2594                             }
   2595                         }
   2596                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2597                         if(matchStart<length) {
   2598                             // Remember longest match from prev.
   2599                             length=matchStart;
   2600                         }
   2601                     }
   2602                 }
   2603             }
   2604             if(length==prev) {
   2605                 break;  // No match from prev.
   2606             }
   2607         } while((prev=length)>0);
                // The result is the smaller of the iterated position and the best recursive start.
   2608         if(prev<minSpanStart) {
   2609             return prev;
   2610         } else {
   2611             return minSpanStart;
   2612         }
   2613     }
   2614 }
   2615 
        // UTF-8 counterpart of containsSpanUTF16(): walks the byte string s[0..length)
        // from the front and returns the byte index at which the span for spanCondition
        // ends. Code points are read with U8_NEXT_OR_FFFD(); set strings are compared
        // bytewise (memcmp) against their cached UTF-8 forms, and cached entries with
        // a UTF-8 length of 0 are skipped.
   2616 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
   2617                                 USetSpanCondition spanCondition) {
   2618     const UnicodeSet &realSet(set.getSet());
   2619     if(!set.hasStrings()) {
                // Code points only: advance while contains() agrees with the condition.
   2620         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2621             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2622         }
   2623 
   2624         UChar32 c;
   2625         int32_t start=0, prev;
   2626         while((prev=start)<length) {
   2627             U8_NEXT_OR_FFFD(s, start, length, c);
   2628             if(realSet.contains(c)!=spanCondition) {
   2629                 break;
   2630             }
   2631         }
   2632         return prev;
   2633     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
                // Stop at the first index where a set code point or a set string begins.
   2634         UnicodeSetWithStringsIterator iter(set);
   2635         UChar32 c;
   2636         int32_t start, next;
   2637         for(start=next=0; start<length;) {
   2638             U8_NEXT_OR_FFFD(s, next, length, c);
   2639             if(realSet.contains(c)) {
   2640                 break;
   2641             }
   2642             const char *s8;
   2643             int32_t length8;
   2644             iter.reset();
   2645             while((s8=iter.nextUTF8(length8))!=NULL) {
   2646                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
   2647                     // spanNeedsStrings=TRUE;
   2648                     return start;
   2649                 }
   2650             }
   2651             start=next;
   2652         }
   2653         return start;
   2654     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2655         UnicodeSetWithStringsIterator iter(set);
   2656         UChar32 c;
                // start: current position; next: where to continue from start
                // (next==start means nothing matched at start so far);
                // maxSpanLimit: largest limit found by the recursive calls.
   2657         int32_t start, next, maxSpanLimit=0;
   2658         for(start=next=0; start<length;) {
   2659             U8_NEXT_OR_FFFD(s, next, length, c);
   2660             if(!realSet.contains(c)) {
   2661                 next=start;  // Do not span this single, not-contained code point.
   2662             }
   2663             const char *s8;
   2664             int32_t length8;
   2665             iter.reset();
   2666             while((s8=iter.nextUTF8(length8))!=NULL) {
   2667                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
   2668                     // spanNeedsStrings=TRUE;
   2669                     int32_t matchLimit=start+length8;
   2670                     if(matchLimit==length) {
                                // A match that reaches the end of the text spans everything.
   2671                         return length;
   2672                     }
   2673                     if(spanCondition==USET_SPAN_CONTAINED) {
   2674                         // Iterate for the shortest match at each position.
   2675                         // Recurse for each but the shortest match.
   2676                         if(next==start) {
   2677                             next=matchLimit;  // First match from start.
   2678                         } else {
   2679                             if(matchLimit<next) {
   2680                                 // Remember shortest match from start for iteration.
   2681                                 int32_t temp=next;
   2682                                 next=matchLimit;
   2683                                 matchLimit=temp;
   2684                             }
   2685                             // Recurse for non-shortest match from start.
   2686                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
   2687                                                                 USET_SPAN_CONTAINED);
   2688                             if((matchLimit+spanLength)>maxSpanLimit) {
   2689                                 maxSpanLimit=matchLimit+spanLength;
   2690                                 if(maxSpanLimit==length) {
   2691                                     return length;
   2692                                 }
   2693                             }
   2694                         }
   2695                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2696                         if(matchLimit>next) {
   2697                             // Remember longest match from start.
   2698                             next=matchLimit;
   2699                         }
   2700                     }
   2701                 }
   2702             }
   2703             if(next==start) {
   2704                 break;  // No match from start.
   2705             }
   2706             start=next;
   2707         }
                // The result is the larger of the iterated position and the best recursive limit.
   2708         if(start>maxSpanLimit) {
   2709             return start;
   2710         } else {
   2711             return maxSpanLimit;
   2712         }
   2713     }
   2714 }
   2715 
        // UTF-8 counterpart of containsSpanBackUTF16(): walks the byte string s[0..length)
        // from the end toward the start and returns the byte index at which the backward
        // span for spanCondition begins. Code points are read with U8_PREV_OR_FFFD();
        // set strings are compared bytewise (memcmp) against their cached UTF-8 forms,
        // and cached entries with a UTF-8 length of 0 are skipped.
        // Returns 0 immediately for an empty text.
        //
        // Inside the loops, length is moved backward by U8_PREV_OR_FFFD() while prev
        // holds the limit of the span found so far.
   2716 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
   2717                                     USetSpanCondition spanCondition) {
   2718     if(length==0) {
   2719         return 0;
   2720     }
   2721     const UnicodeSet &realSet(set.getSet());
   2722     if(!set.hasStrings()) {
                // Code points only: step backward while contains() agrees with the condition.
   2723         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2724             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2725         }
   2726 
   2727         UChar32 c;
   2728         int32_t prev=length;
   2729         do {
   2730             U8_PREV_OR_FFFD(s, 0, length, c);
   2731             if(realSet.contains(c)!=spanCondition) {
   2732                 break;
   2733             }
   2734         } while((prev=length)>0);
   2735         return prev;
   2736     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
                // Stop at the first index (from the end) where a set code point
                // or a set string ends.
   2737         UnicodeSetWithStringsIterator iter(set);
   2738         UChar32 c;
   2739         int32_t prev=length;
   2740         do {
   2741             U8_PREV_OR_FFFD(s, 0, length, c);
   2742             if(realSet.contains(c)) {
   2743                 break;
   2744             }
   2745             const char *s8;
   2746             int32_t length8;
   2747             iter.reset();
   2748             while((s8=iter.nextUTF8(length8))!=NULL) {
   2749                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
   2750                     // spanNeedsStrings=TRUE;
   2751                     return prev;
   2752                 }
   2753             }
   2754         } while((prev=length)>0);
   2755         return prev;
   2756     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2757         UnicodeSetWithStringsIterator iter(set);
   2758         UChar32 c;
                // minSpanStart: smallest start index found by the recursive calls.
   2759         int32_t prev=length, minSpanStart=length;
   2760         do {
   2761             U8_PREV_OR_FFFD(s, 0, length, c);
   2762             if(!realSet.contains(c)) {
   2763                 length=prev;  // Do not span this single, not-contained code point.
   2764             }
   2765             const char *s8;
   2766             int32_t length8;
   2767             iter.reset();
   2768             while((s8=iter.nextUTF8(length8))!=NULL) {
   2769                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
   2770                     // spanNeedsStrings=TRUE;
   2771                     int32_t matchStart=prev-length8;
   2772                     if(matchStart==0) {
                                // A match that reaches the start of the text spans everything.
   2773                         return 0;
   2774                     }
   2775                     if(spanCondition==USET_SPAN_CONTAINED) {
   2776                         // Iterate for the shortest match at each position.
   2777                         // Recurse for each but the shortest match.
   2778                         if(length==prev) {
   2779                             length=matchStart;  // First match from prev.
   2780                         } else {
   2781                             if(matchStart>length) {
   2782                                 // Remember shortest match from prev for iteration.
   2783                                 int32_t temp=length;
   2784                                 length=matchStart;
   2785                                 matchStart=temp;
   2786                             }
   2787                             // Recurse for non-shortest match from prev.
   2788                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
   2789                                                                    USET_SPAN_CONTAINED);
   2790                             if(spanStart<minSpanStart) {
   2791                                 minSpanStart=spanStart;
   2792                                 if(minSpanStart==0) {
   2793                                     return 0;
   2794                                 }
   2795                             }
   2796                         }
   2797                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2798                         if(matchStart<length) {
   2799                             // Remember longest match from prev.
   2800                             length=matchStart;
   2801                         }
   2802                     }
   2803                 }
   2804             }
   2805             if(length==prev) {
   2806                 break;  // No match from prev.
   2807             }
   2808         } while((prev=length)>0);
                // The result is the smaller of the iterated position and the best recursive start.
   2809         if(prev<minSpanStart) {
   2810             return prev;
   2811         } else {
   2812             return minSpanStart;
   2813         }
   2814     }
   2815 }
   2816 
   2817 // spans to be performed and compared
        // Bit flags that are tested against the whichSpans argument of getSpans() and testSpan().
        // The flags come in pairs; the third constant of each group is the
        // mask that combines the pair.
   2818 enum {
   2819     SPAN_UTF16          =1,
   2820     SPAN_UTF8           =2,
   2821     SPAN_UTFS           =3,      // SPAN_UTF16|SPAN_UTF8
   2822 
   2823     SPAN_SET            =4,      // the original, uncomplemented sets
   2824     SPAN_COMPLEMENT     =8,      // the complemented sets
   2825     SPAN_POLARITY       =0xc,    // SPAN_SET|SPAN_COMPLEMENT
   2826 
   2827     SPAN_FWD            =0x10,   // forward spans
   2828     SPAN_BACK           =0x20,   // backward spans
   2829     SPAN_DIRS           =0x30,   // SPAN_FWD|SPAN_BACK
   2830 
   2831     SPAN_CONTAINED      =0x100,  // spans using USET_SPAN_CONTAINED
   2832     SPAN_SIMPLE         =0x200,  // spans using USET_SPAN_SIMPLE
   2833     SPAN_CONDITION      =0x300,  // SPAN_CONTAINED|SPAN_SIMPLE
   2834 
   2835     SPAN_ALL            =0x33f   // all of the masks above combined
   2836 };
   2837 
   2838 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
   2839     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
   2840 }
   2841 
   2842 static inline int32_t slen(const void *s, UBool isUTF16) {
   2843     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
   2844 }
   2845 
   2846 /*
   2847  * Count spans on a string with the method according to type and set the span limits.
   2848  * The set may be the complement of the original.
   2849  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
   2850  * according to the expected number of spans.
   2851  * Sets typeName to an empty string if there is no such type.
   2852  * Returns -1 if the span option is filtered out.
   2853  */
   2854 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
   2855                         const void *s, int32_t length, UBool isUTF16,
   2856                         uint32_t whichSpans,
   2857                         int type, const char *&typeName,
   2858                         int32_t limits[], int32_t limitsCapacity,
   2859                         int32_t expectCount) {
   2860     const UnicodeSet &realSet(set.getSet());
   2861     int32_t start, count;
   2862     USetSpanCondition spanCondition, firstSpanCondition, contained;
   2863     UBool isForward;
   2864 
   2865     if(type<0 || 7<type) {
   2866         typeName="";
   2867         return 0;
   2868     }
   2869 
   2870     static const char *const typeNames16[]={
   2871         "contains", "contains(LM)",
   2872         "span", "span(LM)",
   2873         "containsBack", "containsBack(LM)",
   2874         "spanBack", "spanBack(LM)"
   2875     };
   2876 
   2877     static const char *const typeNames8[]={
   2878         "containsUTF8", "containsUTF8(LM)",
   2879         "spanUTF8", "spanUTF8(LM)",
   2880         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
   2881         "spanBackUTF8", "spanBackUTF8(LM)"
   2882     };
   2883 
   2884     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
   2885 
   2886     // filter span options
   2887     if(type<=3) {
   2888         // span forward
   2889         if((whichSpans&SPAN_FWD)==0) {
   2890             return -1;
   2891         }
   2892         isForward=TRUE;
   2893     } else {
   2894         // span backward
   2895         if((whichSpans&SPAN_BACK)==0) {
   2896             return -1;
   2897         }
   2898         isForward=FALSE;
   2899     }
   2900     if((type&1)==0) {
   2901         // use USET_SPAN_CONTAINED
   2902         if((whichSpans&SPAN_CONTAINED)==0) {
   2903             return -1;
   2904         }
   2905         contained=USET_SPAN_CONTAINED;
   2906     } else {
   2907         // use USET_SPAN_SIMPLE
   2908         if((whichSpans&SPAN_SIMPLE)==0) {
   2909             return -1;
   2910         }
   2911         contained=USET_SPAN_SIMPLE;
   2912     }
   2913 
   2914     // Default first span condition for going forward with an uncomplemented set.
   2915     spanCondition=USET_SPAN_NOT_CONTAINED;
   2916     if(isComplement) {
   2917         spanCondition=invertSpanCondition(spanCondition, contained);
   2918     }
   2919 
   2920     // First span condition for span(), used to terminate the spanBack() iteration.
   2921     firstSpanCondition=spanCondition;
   2922 
   2923     // spanBack(): Its initial span condition is span()'s last span condition,
   2924     // which is the opposite of span()'s first span condition
   2925     // if we expect an even number of spans.
   2926     // (The loop inverts spanCondition (expectCount-1) times
   2927     // before the expectCount'th span() call.)
   2928     // If we do not compare forward and backward directions, then we do not have an
   2929     // expectCount and just start with firstSpanCondition.
   2930     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
   2931         spanCondition=invertSpanCondition(spanCondition, contained);
   2932     }
   2933 
   2934     count=0;
   2935     switch(type) {
   2936     case 0:
   2937     case 1:
   2938         start=0;
   2939         if(length<0) {
   2940             length=slen(s, isUTF16);
   2941         }
   2942         for(;;) {
   2943             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
   2944                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
   2945             if(count<limitsCapacity) {
   2946                 limits[count]=start;
   2947             }
   2948             ++count;
   2949             if(start>=length) {
   2950                 break;
   2951             }
   2952             spanCondition=invertSpanCondition(spanCondition, contained);
   2953         }
   2954         break;
   2955     case 2:
   2956     case 3:
   2957         start=0;
   2958         for(;;) {
   2959             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
   2960                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
   2961             if(count<limitsCapacity) {
   2962                 limits[count]=start;
   2963             }
   2964             ++count;
   2965             if(length>=0 ? start>=length :
   2966                            isUTF16 ? ((const UChar *)s)[start]==0 :
   2967                                      ((const char *)s)[start]==0
   2968             ) {
   2969                 break;
   2970             }
   2971             spanCondition=invertSpanCondition(spanCondition, contained);
   2972         }
   2973         break;
   2974     case 4:
   2975     case 5:
   2976         if(length<0) {
   2977             length=slen(s, isUTF16);
   2978         }
   2979         for(;;) {
   2980             ++count;
   2981             if(count<=limitsCapacity) {
   2982                 limits[limitsCapacity-count]=length;
   2983             }
   2984             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
   2985                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
   2986             if(length==0 && spanCondition==firstSpanCondition) {
   2987                 break;
   2988             }
   2989             spanCondition=invertSpanCondition(spanCondition, contained);
   2990         }
   2991         if(count<limitsCapacity) {
   2992             memmove(limits, limits+(limitsCapacity-count), count*4);
   2993         }
   2994         break;
   2995     case 6:
   2996     case 7:
   2997         for(;;) {
   2998             ++count;
   2999             if(count<=limitsCapacity) {
   3000                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
   3001             }
   3002             // Note: Length<0 is tested only for the first spanBack().
   3003             // If we wanted to keep length<0 for all spanBack()s, we would have to
   3004             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
   3005             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
   3006                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
   3007             if(length==0 && spanCondition==firstSpanCondition) {
   3008                 break;
   3009             }
   3010             spanCondition=invertSpanCondition(spanCondition, contained);
   3011         }
   3012         if(count<limitsCapacity) {
   3013             memmove(limits, limits+(limitsCapacity-count), count*4);
   3014         }
   3015         break;
   3016     default:
   3017         typeName="";
   3018         return -1;
   3019     }
   3020 
   3021     return count;
   3022 }
   3023 
   3024 // sets to be tested; odd index=isComplement
        // These values index the sets[] array passed to testSpan() and the setNames[] table.
   3025 enum {
   3026     SLOW,
   3027     SLOW_NOT,
   3028     FAST,
   3029     FAST_NOT,
   3030     SET_COUNT   // number of sets, not a set itself
   3031 };
   3032 
        // Names of the tested sets, indexed by the SLOW..FAST_NOT enum constants.
   3033 static const char *const setNames[SET_COUNT]={
   3034     "slow",
   3035     "slow.not",
   3036     "fast",
   3037     "fast.not"
   3038 };
   3039 
   3040 /*
   3041  * Verify that we get the same results whether we look at text with contains(),
   3042  * span() or spanBack(), using unfrozen or frozen versions of the set,
   3043  * and using the set or its complement (switching the spanConditions accordingly).
   3044  * The latter verifies that
   3045  *   set.span(spanCondition) == set.complement().span(!spanCondition).
   3046  *
   3047  * The expectLimits[] are either provided by the caller (with expectCount>=0)
   3048  * or returned to the caller (with an input expectCount<0).
   3049  */
   3050 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
   3051                               const void *s, int32_t length, UBool isUTF16,
   3052                               uint32_t whichSpans,
   3053                               int32_t expectLimits[], int32_t &expectCount,
   3054                               const char *testName, int32_t index) {
   3055     int32_t limits[500];
   3056     int32_t limitsCount;
   3057     int i, j;
   3058 
   3059     const char *typeName;
   3060     int type;
   3061 
   3062     for(i=0; i<SET_COUNT; ++i) {
   3063         if((i&1)==0) {
   3064             // Even-numbered sets are original, uncomplemented sets.
   3065             if((whichSpans&SPAN_SET)==0) {
   3066                 continue;
   3067             }
   3068         } else {
   3069             // Odd-numbered sets are complemented.
   3070             if((whichSpans&SPAN_COMPLEMENT)==0) {
   3071                 continue;
   3072             }
   3073         }
   3074         for(type=0;; ++type) {
   3075             limitsCount=getSpans(*sets[i], (UBool)(i&1),
   3076                                  s, length, isUTF16,
   3077                                  whichSpans,
   3078                                  type, typeName,
   3079                                  limits, LENGTHOF(limits), expectCount);
   3080             if(typeName[0]==0) {
   3081                 break; // All types tried.
   3082             }
   3083             if(limitsCount<0) {
   3084                 continue; // Span option filtered out.
   3085             }
   3086             if(expectCount<0) {
   3087                 expectCount=limitsCount;
   3088                 if(limitsCount>LENGTHOF(limits)) {
   3089                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
   3090                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
   3091                     return;
   3092                 }
   3093                 memcpy(expectLimits, limits, limitsCount*4);
   3094             } else if(limitsCount!=expectCount) {
   3095                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
   3096                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
   3097             } else {
   3098                 for(j=0; j<limitsCount; ++j) {
   3099                     if(limits[j]!=expectLimits[j]) {
   3100                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
   3101                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
   3102                               j, (long)limits[j], (long)expectLimits[j]);
   3103                         break;
   3104                     }
   3105                 }
   3106             }
   3107         }
   3108     }
   3109 
   3110     // Compare span() with containsAll()/containsNone(),
   3111     // but only if we have expectLimits[] from the uncomplemented set.
   3112     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
   3113         const UChar *s16=(const UChar *)s;
   3114         UnicodeString string;
   3115         int32_t prev=0, limit, length;
   3116         for(i=0; i<expectCount; ++i) {
   3117             limit=expectLimits[i];
   3118             length=limit-prev;
   3119             if(length>0) {
   3120                 string.setTo(FALSE, s16+prev, length);  // read-only alias
   3121                 if(i&1) {
   3122                     if(!sets[SLOW]->getSet().containsAll(string)) {
   3123                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
   3124                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
   3125                         return;
   3126                     }
   3127                     if(!sets[FAST]->getSet().containsAll(string)) {
   3128                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
   3129                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
   3130                         return;
   3131                     }
   3132                 } else {
   3133                     if(!sets[SLOW]->getSet().containsNone(string)) {
   3134                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
   3135                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
   3136                         return;
   3137                     }
   3138                     if(!sets[FAST]->getSet().containsNone(string)) {
   3139                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
   3140                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
   3141                         return;
   3142                     }
   3143                 }
   3144             }
   3145             prev=limit;
   3146         }
   3147     }
   3148 }
   3149 
   3150 // Specifically test either UTF-16 or UTF-8.
   3151 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
   3152                               const void *s, int32_t length, UBool isUTF16,
   3153                               uint32_t whichSpans,
   3154                               const char *testName, int32_t index) {
   3155     int32_t expectLimits[500];
   3156     int32_t expectCount=-1;
   3157     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
   3158 }
   3159 
   3160 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
   3161     UChar c, c2;
   3162 
   3163     if(length>=0) {
   3164         while(length>0) {
   3165             c=*s++;
   3166             --length;
   3167             if(0xd800<=c && c<0xe000) {
   3168                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
   3169                     return TRUE;
   3170                 }
   3171                 --length;
   3172             }
   3173         }
   3174     } else {
   3175         while((c=*s++)!=0) {
   3176             if(0xd800<=c && c<0xe000) {
   3177                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
   3178                     return TRUE;
   3179                 }
   3180             }
   3181         }
   3182     }
   3183     return FALSE;
   3184 }
   3185 
   3186 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
   3187 // unless either UTF is turned off in whichSpans.
   3188 // Testing UTF-16 and UTF-8 together requires that surrogate code points
   3189 // have the same contains(c) value as U+FFFD.
   3190 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
   3191                                       const UChar *s16, int32_t length16,
   3192                                       uint32_t whichSpans,
   3193                                       const char *testName, int32_t index) {
   3194     int32_t expectLimits[500];
   3195     int32_t expectCount;
   3196 
   3197     expectCount=-1;  // Get expectLimits[] from testSpan().
   3198 
   3199     if((whichSpans&SPAN_UTF16)!=0) {
   3200         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
   3201     }
   3202     if((whichSpans&SPAN_UTF8)==0) {
   3203         return;
   3204     }
   3205 
   3206     // Convert s16[] and expectLimits[] to UTF-8.
   3207     uint8_t s8[3000];
   3208     int32_t offsets[3000];
   3209 
   3210     const UChar *s16Limit=s16+length16;
   3211     char *t=(char *)s8;
   3212     char *tLimit=t+sizeof(s8);
   3213     int32_t *o=offsets;
   3214     UErrorCode errorCode=U_ZERO_ERROR;
   3215 
   3216     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
   3217     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
   3218     if(U_FAILURE(errorCode)) {
   3219         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
   3220               testName, (long)index, u_errorName(errorCode));
   3221         ucnv_resetFromUnicode(utf8Cnv);
   3222         return;
   3223     }
   3224     int32_t length8=(int32_t)(t-(char *)s8);
   3225 
   3226     // Convert expectLimits[].
   3227     int32_t i, j, expect;
   3228     for(i=j=0; i<expectCount; ++i) {
   3229         expect=expectLimits[i];
   3230         if(expect==length16) {
   3231             expectLimits[i]=length8;
   3232         } else {
   3233             while(offsets[j]<expect) {
   3234                 ++j;
   3235             }
   3236             expectLimits[i]=j;
   3237         }
   3238     }
   3239 
   3240     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
   3241 }
   3242 
   3243 static UChar32 nextCodePoint(UChar32 c) {
   3244     // Skip some large and boring ranges.
   3245     switch(c) {
   3246     case 0x3441:
   3247         return 0x4d7f;
   3248     case 0x5100:
   3249         return 0x9f00;
   3250     case 0xb040:
   3251         return 0xd780;
   3252     case 0xe041:
   3253         return 0xf8fe;
   3254     case 0x10100:
   3255         return 0x20000;
   3256     case 0x20041:
   3257         return 0xe0000;
   3258     case 0xe0101:
   3259         return 0x10fffd;
   3260     default:
   3261         return c+1;
   3262     }
   3263 }
   3264 
   3265 // Verify that all implementations represent the same set.
   3266 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3267     // contains(U+FFFD) is inconsistent with contains(some surrogates),
   3268     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
   3269     // Skip the UTF-8 part of the test - if the string contains surrogates -
   3270     // because it is likely to produce a different result.
   3271     UBool inconsistentSurrogates=
   3272             (!(sets[0]->getSet().contains(0xfffd) ?
   3273                sets[0]->getSet().contains(0xd800, 0xdfff) :
   3274                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
   3275              sets[0]->hasStringsWithSurrogates());
   3276 
   3277     UChar s[1000];
   3278     int32_t length=0;
   3279     uint32_t localWhichSpans;
   3280 
   3281     UChar32 c, first;
   3282     for(first=c=0;; c=nextCodePoint(c)) {
   3283         if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
   3284             localWhichSpans=whichSpans;
   3285             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
   3286                 localWhichSpans&=~SPAN_UTF8;
   3287             }
   3288             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
   3289             if(c>0x10ffff) {
   3290                 break;
   3291             }
   3292             length=0;
   3293             first=c;
   3294         }
   3295         U16_APPEND_UNSAFE(s, length, c);
   3296     }
   3297 }
   3298 
   3299 // Test with a particular, interesting string.
   3300 // Specify length and try NUL-termination.
   3301 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3302     static const UChar s[]={
   3303         0x61, 0x62, 0x20,                       // Latin, space
   3304         0x3b1, 0x3b2, 0x3b3,                    // Greek
   3305         0xd900,                                 // lead surrogate
   3306         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
   3307         0xdc05,                                 // trail surrogate
   3308         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
   3309         0xd900, 0xdc05,                         // unassigned supplementary
   3310         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
   3311         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
   3312         0                                       // NUL
   3313     };
   3314 
   3315     if((whichSpans&SPAN_UTF16)==0) {
   3316         return;
   3317     }
   3318     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
   3319     testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
   3320 }
   3321 
   3322 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3323     static const char s[]={
   3324         "abc"                                   // Latin
   3325 
   3326         /* trail byte in lead position */
   3327         "\x80"
   3328 
   3329         " "                                     // space
   3330 
   3331         /* truncated multi-byte sequences */
   3332         "\xd0"
   3333         "\xe0"
   3334         "\xe1"
   3335         "\xed"
   3336         "\xee"
   3337         "\xf0"
   3338         "\xf1"
   3339         "\xf4"
   3340         "\xf8"
   3341         "\xfc"
   3342 
   3343         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
   3344 
   3345         /* trail byte in lead position */
   3346         "\x80"
   3347 
   3348         "\xe0\x80"
   3349         "\xe0\xa0"
   3350         "\xe1\x80"
   3351         "\xed\x80"
   3352         "\xed\xa0"
   3353         "\xee\x80"
   3354         "\xf0\x80"
   3355         "\xf0\x90"
   3356         "\xf1\x80"
   3357         "\xf4\x80"
   3358         "\xf4\x90"
   3359         "\xf8\x80"
   3360         "\xfc\x80"
   3361 
   3362         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
   3363 
   3364         /* trail byte in lead position */
   3365         "\x80"
   3366 
   3367         "\xf0\x80\x80"
   3368         "\xf0\x90\x80"
   3369         "\xf1\x80\x80"
   3370         "\xf4\x80\x80"
   3371         "\xf4\x90\x80"
   3372         "\xf8\x80\x80"
   3373         "\xfc\x80\x80"
   3374 
   3375         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
   3376 
   3377         /* trail byte in lead position */
   3378         "\x80"
   3379 
   3380         "\xf8\x80\x80\x80"
   3381         "\xfc\x80\x80\x80"
   3382 
   3383         "\xF1\x90\x80\x85"                      // unassigned supplementary
   3384 
   3385         /* trail byte in lead position */
   3386         "\x80"
   3387 
   3388         "\xfc\x80\x80\x80\x80"
   3389 
   3390         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
   3391 
   3392         /* trail byte in lead position */
   3393         "\x80"
   3394 
   3395         /* complete sequences but non-shortest forms or out of range etc. */
   3396         "\xc0\x80"
   3397         "\xe0\x80\x80"
   3398         "\xed\xa0\x80"
   3399         "\xf0\x80\x80\x80"
   3400         "\xf4\x90\x80\x80"
   3401         "\xf8\x80\x80\x80\x80"
   3402         "\xfc\x80\x80\x80\x80\x80"
   3403         "\xfe"
   3404         "\xff"
   3405 
   3406         /* trail byte in lead position */
   3407         "\x80"
   3408 
   3409         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
   3410     };
   3411 
   3412     if((whichSpans&SPAN_UTF8)==0) {
   3413         return;
   3414     }
   3415     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
   3416     testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
   3417 }
   3418 
   // Multiply a list of span option sets so that each resulting entry carries
   // exactly one of the alternatives a, b and c.
   //
   // Every existing entry whichSpans[i] (i<whichSpansCount) is first reduced to
   // whichSpans[i]&mask. Then
   //   - entry i becomes (reduced|a),
   //   - if b!=0, entry whichSpansCount+i is written with (reduced|b),
   //   - if b!=0 and c!=0, entry 2*whichSpansCount+i is written with (reduced|c).
   // With b==0 the list is only modified in place; c is ignored in that case.
   //
   // The caller must provide room for the one, two or three times as many
   // entries that are written.
   //
   // Returns the new number of entries:
   // whichSpansCount, 2*whichSpansCount or 3*whichSpansCount.
   static int32_t
   addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
                  uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
       const bool hasSecond= b!=0;
       const bool hasThird= hasSecond && c!=0;

       for(int32_t k=0; k<whichSpansCount; ++k) {
           const uint32_t kept=whichSpans[k]&mask;
           whichSpans[k]=kept|a;
           if(hasSecond) {
               whichSpans[whichSpansCount+k]=kept|b;
           }
           if(hasThird) {
               whichSpans[2*whichSpansCount+k]=kept|c;
           }
       }

       if(!hasSecond) {
           return whichSpansCount;
       }
       return hasThird ? 3*whichSpansCount : 2*whichSpansCount;
   }
   3441 
   // Runs of 63 and 64 letters 'a' and 'b' for building long test strings.
   // Each 63-letter run is spelled as seven pieces of 8 letters plus one piece of 7
   // so that the length is easy to verify; the 64-letter run adds one more letter.
   #define _63_a "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaa"
   #define _64_a _63_a "a"
   #define _63_b "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbb"
   #define _64_b _63_b "b"
   3446 
   3447 void UnicodeSetTest::TestSpan() {
   3448     // "[...]" is a UnicodeSet pattern.
   3449     // "*" performs tests on all Unicode code points and on a selection of
   3450     //   malformed UTF-8/16 strings.
   3451     // "-options" limits the scope of testing for the current set.
   3452     //   By default, the test verifies that equivalent boundaries are found
   3453     //   for UTF-16 and UTF-8, going forward and backward,
   3454     //   alternating USET_SPAN_NOT_CONTAINED with
   3455     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
   3456     //   Single-character options:
   3457     //     8 -- UTF-16 and UTF-8 boundaries may differ.
   3458     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
   3459     //          or the set contains strings with unpaired surrogates
   3460     //          which do not translate to valid UTF-8.
   3461     //     c -- set.span() and set.complement().span() boundaries may differ.
   3462     //          Cause: Set strings are not complemented.
   3463     //     b -- span() and spanBack() boundaries may differ.
   3464     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
   3465     //          and spanBack(USET_SPAN_SIMPLE) are defined to
   3466     //          match with non-overlapping substrings.
   3467     //          For example, with a set containing "ab" and "ba",
   3468     //          span() of "aba" yields boundaries { 0, 2, 3 }
   3469     //          because the initial "ab" matches from 0 to 2,
   3470     //          while spanBack() yields boundaries { 0, 1, 3 }
   3471     //          because the final "ba" matches from 1 to 3.
   3472     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
   3473     //          Cause: Strings in the set overlap, and a longer match may
   3474     //          require a sequence including non-longest substrings.
   3475     //          For example, with a set containing "ab", "abc" and "cd",
   3476     //          span(contained) of "abcd" spans the entire string
   3477     //          but span(longest match) only spans the first 3 characters.
   3478     //   Each "-options" first resets all options and then applies the specified options.
   3479     //   A "-" without options resets the options.
   3480     //   The options are also reset for each new set.
   3481     // Other strings will be spanned.
   3482     static const char *const testdata[]={
   3483         "[:ID_Continue:]",
   3484         "*",
   3485         "[:White_Space:]",
   3486         "*",
   3487         "[]",
   3488         "*",
   3489         "[\\u0000-\\U0010FFFF]",
   3490         "*",
   3491         "[\\u0000\\u0080\\u0800\\U00010000]",
   3492         "*",
   3493         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
   3494         "*",
   3495         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
   3496         "-c",
   3497         "*",
   3498         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
   3499         "-c",
   3500         "*",
   3501 
   3502         // Overlapping strings cause overlapping attempts to match.
   3503         "[x{xy}{xya}{axy}{ax}]",
   3504         "-cl",
   3505 
   3506         // More repetitions of "xya" would take too long with the recursive
   3507         // reference implementation.
   3508         // containsAll()=FALSE
   3509         // test_string 0x14
   3510         "xx"
   3511         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
   3512         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
   3513         "xyaxyaxyaxya"
   3514         "xx"
   3515         "xyaxyaxyaxya"  // span() ends here.
   3516         "aaa",
   3517 
   3518         // containsAll()=TRUE
   3519         // test_string 0x15
   3520         "xx"
   3521         "xyaxyaxyaxya"
   3522         "xx"
   3523         "xyaxyaxyaxya"
   3524         "xx"
   3525         "xyaxyaxyaxy",
   3526 
   3527         "-bc",
   3528         // test_string 0x17
   3529         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
   3530         "-c",
   3531         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
   3532         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
   3533         "-",
   3534         "byaya",     // span() -> { 5 }
   3535         "byay",      // span() -> { 4 }
   3536         "bya",       // span() -> { 3 }
   3537 
   3538         // span(longest match) will not span the whole string.
   3539         "[a{ab}{bc}]",
   3540         "-cl",
   3541         // test_string 0x21
   3542         "abc",
   3543 
   3544         "[a{ab}{abc}{cd}]",
   3545         "-cl",
   3546         "acdabcdabccd",
   3547 
   3548         // spanBack(longest match) will not span the whole string.
   3549         "[c{ab}{bc}]",
   3550         "-cl",
   3551         "abc",
   3552 
   3553         "[d{cd}{bcd}{ab}]",
   3554         "-cl",
   3555         "abbcdabcdabd",
   3556 
   3557         // Test with non-ASCII set strings - test proper handling of surrogate pairs
   3558         // and UTF-8 trail bytes.
   3559         // Copies of above test sets and strings, but transliterated to have
   3560         // different code points with similar trail units.
   3561         // Previous: a      b         c            d
   3562         // Unicode:  042B   30AB      200AB        204AB
   3563         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
   3564         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
   3565         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
   3566         "-cl",
   3567         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
   3568 
   3569         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
   3570         "-cl",
   3571         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
   3572 
   3573         // Stress bookkeeping and recursion.
   3574         // The following strings are barely doable with the recursive
   3575         // reference implementation.
   3576         // The not-contained character at the end prevents an early exit from the span().
   3577         "[b{bb}]",
   3578         "-c",
   3579         // test_string 0x33
   3580         "bbbbbbbbbbbbbbbbbbbbbbbb-",
   3581         // On complement sets, span() and spanBack() get different results
   3582         // because b is not in the complement set and there is an odd number of b's
   3583         // in the test string.
   3584         "-bc",
   3585         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
   3586 
   3587         // Test with set strings with an initial or final code point span
   3588         // longer than 254.
   3589         "[a{" _64_a _64_a _64_a _64_a "b}"
   3590           "{a" _64_b _64_b _64_b _64_b "}]",
   3591         "-c",
   3592         _64_a _64_a _64_a _63_a "b",
   3593         _64_a _64_a _64_a _64_a "b",
   3594         _64_a _64_a _64_a _64_a "aaaabbbb",
   3595         "a" _64_b _64_b _64_b _63_b,
   3596         "a" _64_b _64_b _64_b _64_b,
   3597         "aaaabbbb" _64_b _64_b _64_b _64_b,
   3598 
   3599         // Test with strings containing unpaired surrogates.
   3600         // They are not representable in UTF-8, and a leading trail surrogate
   3601         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
   3602         // U+20001 == \\uD840\\uDC01
   3603         // U+20400 == \\uD841\\uDC00
   3604         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
   3605         "-8cl",
   3606         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
   3607     };
   3608     uint32_t whichSpans[96]={ SPAN_ALL };
   3609     int32_t whichSpansCount=1;
   3610 
   3611     UnicodeSet *sets[SET_COUNT]={ NULL };
   3612     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
   3613 
   3614     char testName[1024];
   3615     char *testNameLimit=testName;
   3616 
   3617     int32_t i, j;
   3618     for(i=0; i<LENGTHOF(testdata); ++i) {
   3619         const char *s=testdata[i];
   3620         if(s[0]=='[') {
   3621             // Create new test sets from this pattern.
   3622             for(j=0; j<SET_COUNT; ++j) {
   3623                 delete sets_with_str[j];
   3624                 delete sets[j];
   3625             }
   3626             UErrorCode errorCode=U_ZERO_ERROR;
   3627             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
   3628             if(U_FAILURE(errorCode)) {
   3629                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
   3630                 break;
   3631             }
   3632             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
   3633             sets[SLOW_NOT]->complement();
   3634             // Intermediate set: Test cloning of a frozen set.
   3635             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
   3636             fast->freeze();
   3637             sets[FAST]=(UnicodeSet *)fast->clone();
   3638             delete fast;
   3639             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
   3640             fastNot->freeze();
   3641             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
   3642             delete fastNot;
   3643 
   3644             for(j=0; j<SET_COUNT; ++j) {
   3645                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
   3646             }
   3647 
   3648             strcpy(testName, s);
   3649             testNameLimit=strchr(testName, 0);
   3650             *testNameLimit++=':';
   3651             *testNameLimit=0;
   3652 
   3653             whichSpans[0]=SPAN_ALL;
   3654             whichSpansCount=1;
   3655         } else if(s[0]=='-') {
   3656             whichSpans[0]=SPAN_ALL;
   3657             whichSpansCount=1;
   3658 
   3659             while(*++s!=0) {
   3660                 switch(*s) {
   3661                 case 'c':
   3662                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3663                                                    ~SPAN_POLARITY,
   3664                                                    SPAN_SET,
   3665                                                    SPAN_COMPLEMENT,
   3666                                                    0);
   3667                     break;
   3668                 case 'b':
   3669                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3670                                                    ~SPAN_DIRS,
   3671                                                    SPAN_FWD,
   3672                                                    SPAN_BACK,
   3673                                                    0);
   3674                     break;
   3675                 case 'l':
   3676                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
   3677                     // USET_SPAN_SIMPLE only FWD, and separately
   3678                     // USET_SPAN_SIMPLE only BACK
   3679                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3680                                                    ~(SPAN_DIRS|SPAN_CONDITION),
   3681                                                    SPAN_DIRS|SPAN_CONTAINED,
   3682                                                    SPAN_FWD|SPAN_SIMPLE,
   3683                                                    SPAN_BACK|SPAN_SIMPLE);
   3684                     break;
   3685                 case '8':
   3686                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3687                                                    ~SPAN_UTFS,
   3688                                                    SPAN_UTF16,
   3689                                                    SPAN_UTF8,
   3690                                                    0);
   3691                     break;
   3692                 default:
   3693                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
   3694                     break;
   3695                 }
   3696             }
   3697         } else if(0==strcmp(s, "*")) {
   3698             strcpy(testNameLimit, "bad_string");
   3699             for(j=0; j<whichSpansCount; ++j) {
   3700                 if(whichSpansCount>1) {
   3701                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
   3702                             "%%0x%3x",
   3703                             whichSpans[j]);
   3704                 }
   3705                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
   3706                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
   3707             }
   3708 
   3709             strcpy(testNameLimit, "contents");
   3710             for(j=0; j<whichSpansCount; ++j) {
   3711                 if(whichSpansCount>1) {
   3712                     sprintf(testNameLimit+8 /* strlen("contents") */,
   3713                             "%%0x%3x",
   3714                             whichSpans[j]);
   3715                 }
   3716                 testSpanContents(sets_with_str, whichSpans[j], testName);
   3717             }
   3718         } else {
   3719             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
   3720             strcpy(testNameLimit, "test_string");
   3721             for(j=0; j<whichSpansCount; ++j) {
   3722                 if(whichSpansCount>1) {
   3723                     sprintf(testNameLimit+11 /* strlen("test_string") */,
   3724                             "%%0x%3x",
   3725                             whichSpans[j]);
   3726                 }
   3727                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
   3728             }
   3729         }
   3730     }
   3731     for(j=0; j<SET_COUNT; ++j) {
   3732         delete sets_with_str[j];
   3733         delete sets[j];
   3734     }
   3735 }
   3736 
   3737 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
   3738 void UnicodeSetTest::TestStringSpan() {
   3739     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
   3740     static const char *const string=
   3741         "xx"
   3742         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
   3743         "xx"
   3744         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
   3745         "xx"
   3746         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
   3747         "aaaa";
   3748 
   3749     UErrorCode errorCode=U_ZERO_ERROR;
   3750     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
   3751     UnicodeSet set(pattern16, errorCode);
   3752     if(U_FAILURE(errorCode)) {
   3753         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3754         return;
   3755     }
   3756 
   3757     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
   3758 
   3759     if(set.containsAll(string16)) {
   3760         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
   3761     }
   3762 
   3763     // Remove trailing "aaaa".
   3764     string16.truncate(string16.length()-4);
   3765     if(!set.containsAll(string16)) {
   3766         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
   3767     }
   3768 
   3769     string16=UNICODE_STRING_SIMPLE("byayaxya");
   3770     const UChar *s16=string16.getBuffer();
   3771     int32_t length16=string16.length();
   3772     (void)length16;   // Suppress set but not used warning.
   3773     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
   3774         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
   3775         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
   3776         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
   3777         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
   3778         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
   3779     ) {
   3780         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
   3781     }
   3782 
   3783     pattern="[a{ab}{abc}{cd}]";
   3784     pattern16=UnicodeString(pattern, -1, US_INV);
   3785     set.applyPattern(pattern16, errorCode);
   3786     if(U_FAILURE(errorCode)) {
   3787         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3788         return;
   3789     }
   3790     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
   3791     s16=string16.getBuffer();
   3792     length16=string16.length();
   3793     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
   3794         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
   3795         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
   3796     ) {
   3797         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
   3798     }
   3799 
   3800     pattern="[d{cd}{bcd}{ab}]";
   3801     pattern16=UnicodeString(pattern, -1, US_INV);
   3802     set.applyPattern(pattern16, errorCode).freeze();
   3803     if(U_FAILURE(errorCode)) {
   3804         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3805         return;
   3806     }
   3807     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
   3808     s16=string16.getBuffer();
   3809     length16=string16.length();
   3810     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
   3811         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
   3812         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
   3813     ) {
   3814         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
   3815     }
   3816 }
   3817