Home | History | Annotate | Download | only in intltest
      1 /*
      2 ********************************************************************************
      3 *   Copyright (C) 1999-2009 International Business Machines Corporation and
      4 *   others. All Rights Reserved.
      5 ********************************************************************************
      6 *   Date        Name        Description
      7 *   10/20/99    alan        Creation.
      8 *   03/22/2000  Madhu       Added additional tests
      9 ********************************************************************************
     10 */
     11 
     12 #include <stdio.h>
     13 
     14 #include <string.h>
     15 #include "unicode/utypes.h"
     16 #include "usettest.h"
     17 #include "unicode/ucnv.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/usetiter.h"
     21 #include "unicode/ustring.h"
     22 #include "unicode/parsepos.h"
     23 #include "unicode/symtable.h"
     24 #include "unicode/uversion.h"
     25 #include "hash.h"
     26 
     27 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     28 
     29 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
     30     dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
     31     u_errorName(status));}}
     32 
     33 #define TEST_ASSERT(expr) {if (!(expr)) { \
     34     dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
     35 
     36 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
     37     UnicodeString pat;
     38     set.toPattern(pat);
     39     return left + UnicodeSetTest::escape(pat);
     40 }
     41 
     42 #define CASE(id,test) case id:                          \
     43                           name = #test;                 \
     44                           if (exec) {                   \
     45                               logln(#test "---");       \
     46                               logln();                  \
     47                               test();                   \
     48                           }                             \
     49                           break
     50 
     51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
     52 }
     53 
     54 UConverter *UnicodeSetTest::openUTF8Converter() {
     55     if(utf8Cnv==NULL) {
     56         UErrorCode errorCode=U_ZERO_ERROR;
     57         utf8Cnv=ucnv_open("UTF-8", &errorCode);
     58     }
     59     return utf8Cnv;
     60 }
     61 
     62 UnicodeSetTest::~UnicodeSetTest() {
     63     ucnv_close(utf8Cnv);
     64 }
     65 
     66 void
     67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     68                                const char* &name, char* /*par*/) {
     69     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
     70     switch (index) {
     71         CASE(0,TestPatterns);
     72         CASE(1,TestAddRemove);
     73         CASE(2,TestCategories);
     74         CASE(3,TestCloneEqualHash);
     75         CASE(4,TestMinimalRep);
     76         CASE(5,TestAPI);
     77         CASE(6,TestScriptSet);
     78         CASE(7,TestPropertySet);
     79         CASE(8,TestClone);
     80         CASE(9,TestExhaustive);
     81         CASE(10,TestToPattern);
     82         CASE(11,TestIndexOf);
     83         CASE(12,TestStrings);
     84         CASE(13,Testj2268);
     85         CASE(14,TestCloseOver);
     86         CASE(15,TestEscapePattern);
     87         CASE(16,TestInvalidCodePoint);
     88         CASE(17,TestSymbolTable);
     89         CASE(18,TestSurrogate);
     90         CASE(19,TestPosixClasses);
     91         CASE(20,TestIteration);
     92         CASE(21,TestFreezable);
     93         CASE(22,TestSpan);
     94         CASE(23,TestStringSpan);
     95         default: name = ""; break;
     96     }
     97 }
     98 
     99 static const char NOT[] = "%%%%";
    100 
    101 /**
    102  * UVector was improperly copying contents
    103  * This code will crash this is still true
    104  */
    105 void UnicodeSetTest::Testj2268() {
    106   UnicodeSet t;
    107   t.add(UnicodeString("abc"));
    108   UnicodeSet test(t);
    109   UnicodeString ustrPat;
    110   test.toPattern(ustrPat, TRUE);
    111 }
    112 
    113 /**
    114  * Test toPattern().
    115  */
    116 void UnicodeSetTest::TestToPattern() {
    117     UErrorCode ec = U_ZERO_ERROR;
    118 
    119     // Test that toPattern() round trips with syntax characters and
    120     // whitespace.
    121     {
    122         static const char* OTHER_TOPATTERN_TESTS[] = {
    123             "[[:latin:]&[:greek:]]",
    124             "[[:latin:]-[:greek:]]",
    125             "[:nonspacing mark:]",
    126             NULL
    127         };
    128 
    129         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
    130             ec = U_ZERO_ERROR;
    131             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
    132             if (U_FAILURE(ec)) {
    133                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
    134                 continue;
    135             }
    136             checkPat(OTHER_TOPATTERN_TESTS[j], s);
    137         }
    138 
    139         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
    140             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
    141 
    142                 // check various combinations to make sure they all work.
    143                 if (i != 0 && !toPatternAux(i, i)){
    144                     continue;
    145                 }
    146                 if (!toPatternAux(0, i)){
    147                     continue;
    148                 }
    149                 if (!toPatternAux(i, 0xFFFF)){
    150                     continue;
    151                 }
    152             }
    153         }
    154     }
    155 
    156     // Test pattern behavior of multicharacter strings.
    157     {
    158         ec = U_ZERO_ERROR;
    159         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
    160 
    161         // This loop isn't a loop.  It's here to make the compiler happy.
    162         // If you're curious, try removing it and changing the 'break'
    163         // statements (except for the last) to goto's.
    164         for (;;) {
    165             if (U_FAILURE(ec)) break;
    166             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
    167             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
    168 
    169             s->add("ac");
    170             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
    171             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
    172 
    173             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
    174             if (U_FAILURE(ec)) break;
    175             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
    176             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
    177 
    178             s->add("[]");
    179             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
    180             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
    181 
    182             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
    183             if (U_FAILURE(ec)) break;
    184             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
    185             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
    186 
    187             // j2189
    188             s->clear();
    189             s->add(UnicodeString("abc", ""));
    190             s->add(UnicodeString("abc", ""));
    191             const char* exp6[] = {"abc", NOT, "ab", NULL};
    192             expectToPattern(*s, "[{abc}]", exp6);
    193 
    194             break;
    195         }
    196 
    197         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
    198         delete s;
    199     }
    200 
    201     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
    202     UnicodeSet s;
    203     s.add((UChar)97, (UChar)98); // 'a', 'b'
    204     expectToPattern(s, "[ab]", NULL);
    205 }
    206 
    207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
    208 
    209     // use Integer.toString because Utility.hex doesn't handle ints
    210     UnicodeString pat = "";
    211     // TODO do these in hex
    212     //String source = "0x" + Integer.toString(start,16).toUpperCase();
    213     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
    214     UnicodeString source;
    215     source = source + (uint32_t)start;
    216     if (start != end)
    217         source = source + ".." + (uint32_t)end;
    218     UnicodeSet testSet;
    219     testSet.add(start, end);
    220     return checkPat(source, testSet);
    221 }
    222 
    223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
    224                                const UnicodeSet& testSet) {
    225     // What we want to make sure of is that a pattern generated
    226     // by toPattern(), with or without escaped unprintables, can
    227     // be passed back into the UnicodeSet constructor.
    228     UnicodeString pat0;
    229 
    230     testSet.toPattern(pat0, TRUE);
    231 
    232     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
    233 
    234     //String pat1 = unescapeLeniently(pat0);
    235     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
    236 
    237     UnicodeString pat2;
    238     testSet.toPattern(pat2, FALSE);
    239     if (!checkPat(source, testSet, pat2)) return FALSE;
    240 
    241     //String pat3 = unescapeLeniently(pat2);
    242     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
    243 
    244     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
    245     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
    246     return TRUE;
    247 }
    248 
    249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
    250                                const UnicodeSet& testSet,
    251                                const UnicodeString& pat) {
    252     UErrorCode ec = U_ZERO_ERROR;
    253     UnicodeSet testSet2(pat, ec);
    254     if (testSet2 != testSet) {
    255         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
    256         return FALSE;
    257     }
    258     return TRUE;
    259 }
    260 
    261 void
    262 UnicodeSetTest::TestPatterns(void) {
    263     UnicodeSet set;
    264     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
    265     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
    266     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
    267     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
    268     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
    269     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
    270 
    271     // Throw in a test of complement
    272     set.complement();
    273     UnicodeString exp;
    274     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
    275     expectPairs(set, exp);
    276 }
    277 
    278 void
    279 UnicodeSetTest::TestCategories(void) {
    280     UErrorCode status = U_ZERO_ERROR;
    281     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
    282     UnicodeSet set(pat, status);
    283     if (U_FAILURE(status)) {
    284         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
    285         return;
    286     } else {
    287         expectContainment(set, pat, "ABC", "abc");
    288     }
    289 
    290     UChar32 i;
    291     int32_t failures = 0;
    292     // Make sure generation of L doesn't pollute cached Lu set
    293     // First generate L, then Lu
    294     set.applyPattern("[:L:]", status);
    295     if (U_FAILURE(status)) { errln("FAIL"); return; }
    296     for (i=0; i<0x200; ++i) {
    297         UBool l = u_isalpha((UChar)i);
    298         if (l != set.contains(i)) {
    299             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
    300                   set.contains(i));
    301             if (++failures == 10) break;
    302         }
    303     }
    304 
    305     set.applyPattern("[:Lu:]", status);
    306     if (U_FAILURE(status)) { errln("FAIL"); return; }
    307     for (i=0; i<0x200; ++i) {
    308         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
    309         if (lu != set.contains(i)) {
    310             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
    311                   set.contains(i));
    312             if (++failures == 20) break;
    313         }
    314     }
    315 }
    316 void
    317 UnicodeSetTest::TestCloneEqualHash(void) {
    318     UErrorCode status = U_ZERO_ERROR;
    319     // set1 and set2 used to be built with the obsolete constructor taking
    320     // UCharCategory values; replaced with pattern constructors
    321     // markus 20030502
    322     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
    323     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
    324     if (U_FAILURE(status)){
    325         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
    326         return;
    327     }
    328     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
    329     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
    330     if (U_FAILURE(status)){
    331         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
    332         return;
    333     }
    334 
    335     if (*set1 != *set1a) {
    336         errln("FAIL: category constructor for Ll broken");
    337     }
    338     if (*set2 != *set2a) {
    339         errln("FAIL: category constructor for Nd broken");
    340     }
    341     delete set1a;
    342     delete set2a;
    343 
    344     logln("Testing copy construction");
    345     UnicodeSet *set1copy=new UnicodeSet(*set1);
    346     if(*set1 != *set1copy || *set1 == *set2 ||
    347         getPairs(*set1) != getPairs(*set1copy) ||
    348         set1->hashCode() != set1copy->hashCode()){
    349         errln("FAIL : Error in copy construction");
    350         return;
    351     }
    352 
    353     logln("Testing =operator");
    354     UnicodeSet set1equal=*set1;
    355     UnicodeSet set2equal=*set2;
    356     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
    357         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
    358         errln("FAIL: Error in =operator");
    359     }
    360 
    361     logln("Testing clone()");
    362     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
    363     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
    364     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
    365         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
    366         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
    367         errln("FAIL: Error in clone");
    368     }
    369 
    370     logln("Testing hashcode");
    371     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
    372         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
    373         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
    374         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
    375         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
    376         errln("FAIL: Error in hashCode()");
    377     }
    378 
    379     delete set1;
    380     delete set1copy;
    381     delete set2;
    382     delete set1clone;
    383     delete set2clone;
    384 
    385 
    386 }
    387 void
    388 UnicodeSetTest::TestAddRemove(void) {
    389     UnicodeSet set; // Construct empty set
    390     doAssert(set.isEmpty() == TRUE, "set should be empty");
    391     doAssert(set.size() == 0, "size should be 0");
    392     set.complement();
    393     doAssert(set.size() == 0x110000, "size should be 0x110000");
    394     set.clear();
    395     set.add(0x0061, 0x007a);
    396     expectPairs(set, "az");
    397     doAssert(set.isEmpty() == FALSE, "set should not be empty");
    398     doAssert(set.size() != 0, "size should not be equal to 0");
    399     doAssert(set.size() == 26, "size should be equal to 26");
    400     set.remove(0x006d, 0x0070);
    401     expectPairs(set, "alqz");
    402     doAssert(set.size() == 22, "size should be equal to 22");
    403     set.remove(0x0065, 0x0067);
    404     expectPairs(set, "adhlqz");
    405     doAssert(set.size() == 19, "size should be equal to 19");
    406     set.remove(0x0064, 0x0069);
    407     expectPairs(set, "acjlqz");
    408     doAssert(set.size() == 16, "size should be equal to 16");
    409     set.remove(0x0063, 0x0072);
    410     expectPairs(set, "absz");
    411     doAssert(set.size() == 10, "size should be equal to 10");
    412     set.add(0x0066, 0x0071);
    413     expectPairs(set, "abfqsz");
    414     doAssert(set.size() == 22, "size should be equal to 22");
    415     set.remove(0x0061, 0x0067);
    416     expectPairs(set, "hqsz");
    417     set.remove(0x0061, 0x007a);
    418     expectPairs(set, "");
    419     doAssert(set.isEmpty() == TRUE, "set should be empty");
    420     doAssert(set.size() == 0, "size should be 0");
    421     set.add(0x0061);
    422     doAssert(set.isEmpty() == FALSE, "set should not be empty");
    423     doAssert(set.size() == 1, "size should not be equal to 1");
    424     set.add(0x0062);
    425     set.add(0x0063);
    426     expectPairs(set, "ac");
    427     doAssert(set.size() == 3, "size should not be equal to 3");
    428     set.add(0x0070);
    429     set.add(0x0071);
    430     expectPairs(set, "acpq");
    431     doAssert(set.size() == 5, "size should not be equal to 5");
    432     set.clear();
    433     expectPairs(set, "");
    434     doAssert(set.isEmpty() == TRUE, "set should be empty");
    435     doAssert(set.size() == 0, "size should be 0");
    436 
    437     // Try removing an entire set from another set
    438     expectPattern(set, "[c-x]", "cx");
    439     UnicodeSet set2;
    440     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
    441     set.removeAll(set2);
    442     expectPairs(set, "deluxx");
    443 
    444     // Try adding an entire set to another set
    445     expectPattern(set, "[jackiemclean]", "aacceein");
    446     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
    447     set.addAll(set2);
    448     expectPairs(set, "aacehort");
    449     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
    450 
    451     // Try retaining an set of elements contained in another set (intersection)
    452     UnicodeSet set3;
    453     expectPattern(set3, "[a-c]", "ac");
    454     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
    455     set3.remove(0x0062);
    456     expectPairs(set3, "aacc");
    457     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
    458     set.retainAll(set3);
    459     expectPairs(set, "aacc");
    460     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
    461     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
    462     set.clear();
    463     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
    464 
    465     // Test commutativity
    466     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
    467     expectPattern(set2, "[jackiemclean]", "aacceein");
    468     set.addAll(set2);
    469     expectPairs(set, "aacehort");
    470     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
    471 
    472 
    473 
    474 
    475 }
    476 
    477 /**
    478  * Make sure minimal representation is maintained.
    479  */
    480 void UnicodeSetTest::TestMinimalRep() {
    481     UErrorCode status = U_ZERO_ERROR;
    482     // This is pretty thoroughly tested by checkCanonicalRep()
    483     // run against the exhaustive operation results.  Use the code
    484     // here for debugging specific spot problems.
    485 
    486     // 1 overlap against 2
    487     UnicodeSet set("[h-km-q]", status);
    488     if (U_FAILURE(status)) { errln("FAIL"); return; }
    489     UnicodeSet set2("[i-o]", status);
    490     if (U_FAILURE(status)) { errln("FAIL"); return; }
    491     set.addAll(set2);
    492     expectPairs(set, "hq");
    493     // right
    494     set.applyPattern("[a-m]", status);
    495     if (U_FAILURE(status)) { errln("FAIL"); return; }
    496     set2.applyPattern("[e-o]", status);
    497     if (U_FAILURE(status)) { errln("FAIL"); return; }
    498     set.addAll(set2);
    499     expectPairs(set, "ao");
    500     // left
    501     set.applyPattern("[e-o]", status);
    502     if (U_FAILURE(status)) { errln("FAIL"); return; }
    503     set2.applyPattern("[a-m]", status);
    504     if (U_FAILURE(status)) { errln("FAIL"); return; }
    505     set.addAll(set2);
    506     expectPairs(set, "ao");
    507     // 1 overlap against 3
    508     set.applyPattern("[a-eg-mo-w]", status);
    509     if (U_FAILURE(status)) { errln("FAIL"); return; }
    510     set2.applyPattern("[d-q]", status);
    511     if (U_FAILURE(status)) { errln("FAIL"); return; }
    512     set.addAll(set2);
    513     expectPairs(set, "aw");
    514 }
    515 
    516 void UnicodeSetTest::TestAPI() {
    517     UErrorCode status = U_ZERO_ERROR;
    518     // default ct
    519     UnicodeSet set;
    520     if (!set.isEmpty() || set.getRangeCount() != 0) {
    521         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
    522               set);
    523     }
    524 
    525     // clear(), isEmpty()
    526     set.add(0x0061);
    527     if (set.isEmpty()) {
    528         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
    529               set);
    530     }
    531     set.clear();
    532     if (!set.isEmpty()) {
    533         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
    534               set);
    535     }
    536 
    537     // size()
    538     set.clear();
    539     if (set.size() != 0) {
    540         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
    541               ": " + set);
    542     }
    543     set.add(0x0061);
    544     if (set.size() != 1) {
    545         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
    546               ": " + set);
    547     }
    548     set.add(0x0031, 0x0039);
    549     if (set.size() != 10) {
    550         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
    551               ": " + set);
    552     }
    553 
    554     // contains(first, last)
    555     set.clear();
    556     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
    557     if (U_FAILURE(status)) { errln("FAIL"); return; }
    558     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
    559         UChar32 a = set.getRangeStart(i);
    560         UChar32 b = set.getRangeEnd(i);
    561         if (!set.contains(a, b)) {
    562             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
    563                   " but doesn't: " + set);
    564         }
    565         if (set.contains((UChar32)(a-1), b)) {
    566             errln((UnicodeString)"FAIL, shouldn't contain " +
    567                   (unsigned short)(a-1) + '-' + (unsigned short)b +
    568                   " but does: " + set);
    569         }
    570         if (set.contains(a, (UChar32)(b+1))) {
    571             errln((UnicodeString)"FAIL, shouldn't contain " +
    572                   (unsigned short)a + '-' + (unsigned short)(b+1) +
    573                   " but does: " + set);
    574         }
    575     }
    576 
    577     // Ported InversionList test.
    578     UnicodeSet a((UChar32)3,(UChar32)10);
    579     UnicodeSet b((UChar32)7,(UChar32)15);
    580     UnicodeSet c;
    581 
    582     logln((UnicodeString)"a [3-10]: " + a);
    583     logln((UnicodeString)"b [7-15]: " + b);
    584     c = a;
    585     c.addAll(b);
    586     UnicodeSet exp((UChar32)3,(UChar32)15);
    587     if (c == exp) {
    588         logln((UnicodeString)"c.set(a).add(b): " + c);
    589     } else {
    590         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
    591     }
    592     c.complement();
    593     exp.set((UChar32)0, (UChar32)2);
    594     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
    595     if (c == exp) {
    596         logln((UnicodeString)"c.complement(): " + c);
    597     } else {
    598         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    599     }
    600     c.complement();
    601     exp.set((UChar32)3, (UChar32)15);
    602     if (c == exp) {
    603         logln((UnicodeString)"c.complement(): " + c);
    604     } else {
    605         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    606     }
    607     c = a;
    608     c.complementAll(b);
    609     exp.set((UChar32)3,(UChar32)6);
    610     exp.add((UChar32)11,(UChar32) 15);
    611     if (c == exp) {
    612         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
    613     } else {
    614         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
    615     }
    616 
    617     exp = c;
    618     bitsToSet(setToBits(c), c);
    619     if (c == exp) {
    620         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
    621     } else {
    622         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
    623     }
    624 
    625     // Additional tests for coverage JB#2118
    626     //UnicodeSet::complement(class UnicodeString const &)
    627     //UnicodeSet::complementAll(class UnicodeString const &)
    628     //UnicodeSet::containsNone(class UnicodeSet const &)
    629     //UnicodeSet::containsNone(long,long)
    630     //UnicodeSet::containsSome(class UnicodeSet const &)
    631     //UnicodeSet::containsSome(long,long)
    632     //UnicodeSet::removeAll(class UnicodeString const &)
    633     //UnicodeSet::retain(long)
    634     //UnicodeSet::retainAll(class UnicodeString const &)
    635     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
    636     //UnicodeSetIterator::getString(void)
    637     set.clear();
    638     set.complement("ab");
    639     exp.applyPattern("[{ab}]", status);
    640     if (U_FAILURE(status)) { errln("FAIL"); return; }
    641     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
    642 
    643     UnicodeSetIterator iset(set);
    644     if (!iset.next() || !iset.isString()) {
    645         errln("FAIL: UnicodeSetIterator::next/isString");
    646     } else if (iset.getString() != "ab") {
    647         errln("FAIL: UnicodeSetIterator::getString");
    648     }
    649 
    650     set.add((UChar32)0x61, (UChar32)0x7A);
    651     set.complementAll("alan");
    652     exp.applyPattern("[{ab}b-kmo-z]", status);
    653     if (U_FAILURE(status)) { errln("FAIL"); return; }
    654     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
    655 
    656     exp.applyPattern("[a-z]", status);
    657     if (U_FAILURE(status)) { errln("FAIL"); return; }
    658     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    659     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    660     exp.applyPattern("[aln]", status);
    661     if (U_FAILURE(status)) { errln("FAIL"); return; }
    662     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    663     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    664 
    665     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
    666         errln("FAIL: containsNone(UChar32, UChar32)");
    667     }
    668     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
    669         errln("FAIL: containsSome(UChar32, UChar32)");
    670     }
    671     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
    672         errln("FAIL: containsNone(UChar32, UChar32)");
    673     }
    674     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
    675         errln("FAIL: containsSome(UChar32, UChar32)");
    676     }
    677 
    678     set.removeAll("liu");
    679     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
    680     if (U_FAILURE(status)) { errln("FAIL"); return; }
    681     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
    682 
    683     set.retainAll("star");
    684     exp.applyPattern("[rst]", status);
    685     if (U_FAILURE(status)) { errln("FAIL"); return; }
    686     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
    687 
    688     set.retain((UChar32)0x73);
    689     exp.applyPattern("[s]", status);
    690     if (U_FAILURE(status)) { errln("FAIL"); return; }
    691     if (set != exp) { errln("FAIL: retain('s')"); return; }
    692 
    693     uint16_t buf[32];
    694     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
    695     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
    696     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
    697         errln("FAIL: serialize");
    698         return;
    699     }
    700 
    701     // Conversions to and from USet
    702     UnicodeSet *uniset = &set;
    703     USet *uset = uniset->toUSet();
    704     TEST_ASSERT((void *)uset == (void *)uniset);
    705     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
    706     TEST_ASSERT((void *)setx == (void *)uset);
    707     const UnicodeSet *constSet = uniset;
    708     const USet *constUSet = constSet->toUSet();
    709     TEST_ASSERT((void *)constUSet == (void *)constSet);
    710     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
    711     TEST_ASSERT((void *)constSetx == (void *)constUSet);
    712 }
    713 
    714 void UnicodeSetTest::TestIteration() {
    715     UErrorCode ec = U_ZERO_ERROR;
    716     int i = 0;
    717     int outerLoop;
    718 
    719     // 6 code points, 3 ranges, 2 strings, 8 total elements
    720     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
    721     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
    722     TEST_ASSERT_SUCCESS(ec);
    723     UnicodeSetIterator it(set);
    724 
    725     for (outerLoop=0; outerLoop<3; outerLoop++) {
    726         // Run the test multiple times, to check that iterator.reset() is working.
    727         for (i=0; i<10; i++) {
    728             UBool         nextv        = it.next();
    729             UBool         isString     = it.isString();
    730             int32_t       codePoint    = it.getCodepoint();
    731             //int32_t       codePointEnd = it.getCodepointEnd();
    732             UnicodeString s   = it.getString();
    733             switch (i) {
    734             case 0:
    735                 TEST_ASSERT(nextv == TRUE);
    736                 TEST_ASSERT(isString == FALSE);
    737                 TEST_ASSERT(codePoint==0x61);
    738                 TEST_ASSERT(s == "a");
    739                 break;
    740             case 1:
    741                 TEST_ASSERT(nextv == TRUE);
    742                 TEST_ASSERT(isString == FALSE);
    743                 TEST_ASSERT(codePoint==0x62);
    744                 TEST_ASSERT(s == "b");
    745                 break;
    746             case 2:
    747                 TEST_ASSERT(nextv == TRUE);
    748                 TEST_ASSERT(isString == FALSE);
    749                 TEST_ASSERT(codePoint==0x63);
    750                 TEST_ASSERT(s == "c");
    751                 break;
    752             case 3:
    753                 TEST_ASSERT(nextv == TRUE);
    754                 TEST_ASSERT(isString == FALSE);
    755                 TEST_ASSERT(codePoint==0x79);
    756                 TEST_ASSERT(s == "y");
    757                 break;
    758             case 4:
    759                 TEST_ASSERT(nextv == TRUE);
    760                 TEST_ASSERT(isString == FALSE);
    761                 TEST_ASSERT(codePoint==0x7a);
    762                 TEST_ASSERT(s == "z");
    763                 break;
    764             case 5:
    765                 TEST_ASSERT(nextv == TRUE);
    766                 TEST_ASSERT(isString == FALSE);
    767                 TEST_ASSERT(codePoint==0x1abcd);
    768                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
    769                 break;
    770             case 6:
    771                 TEST_ASSERT(nextv == TRUE);
    772                 TEST_ASSERT(isString == TRUE);
    773                 TEST_ASSERT(s == "str1");
    774                 break;
    775             case 7:
    776                 TEST_ASSERT(nextv == TRUE);
    777                 TEST_ASSERT(isString == TRUE);
    778                 TEST_ASSERT(s == "str2");
    779                 break;
    780             case 8:
    781                 TEST_ASSERT(nextv == FALSE);
    782                 break;
    783             case 9:
    784                 TEST_ASSERT(nextv == FALSE);
    785                 break;
    786             }
    787         }
    788         it.reset();  // prepare to run the iteration again.
    789     }
    790 }
    791 
    792 
    793 
    794 
    795 void UnicodeSetTest::TestStrings() {
    796     UErrorCode ec = U_ZERO_ERROR;
    797 
    798     UnicodeSet* testList[] = {
    799         UnicodeSet::createFromAll("abc"),
    800         new UnicodeSet("[a-c]", ec),
    801 
    802         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
    803         new UnicodeSet("[{ll}{ch}a-z]", ec),
    804 
    805         UnicodeSet::createFrom("ab}c"),
    806         new UnicodeSet("[{ab\\}c}]", ec),
    807 
    808         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
    809         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
    810 
    811         NULL
    812     };
    813 
    814     if (U_FAILURE(ec)) {
    815         errln("FAIL: couldn't construct test sets");
    816     }
    817 
    818     for (int32_t i = 0; testList[i] != NULL; i+=2) {
    819         if (U_SUCCESS(ec)) {
    820             UnicodeString pat0, pat1;
    821             testList[i]->toPattern(pat0, TRUE);
    822             testList[i+1]->toPattern(pat1, TRUE);
    823             if (*testList[i] == *testList[i+1]) {
    824                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
    825             } else {
    826                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
    827             }
    828         }
    829         delete testList[i];
    830         delete testList[i+1];
    831     }
    832 }
    833 
    834 /**
    835  * Test the [:Latin:] syntax.
    836  */
    837 void UnicodeSetTest::TestScriptSet() {
    838     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
    839 
    840     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
    841 
    842     /* Jitterbug 1423 */
    843     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
    844 
    845 }
    846 
    847 /**
    848  * Test the [:Latin:] syntax.
    849  */
    850 void UnicodeSetTest::TestPropertySet() {
    851     static const char* const DATA[] = {
    852         // Pattern, Chars IN, Chars NOT in
    853 
    854         "[:Latin:]",
    855         "aA",
    856         "\\u0391\\u03B1",
    857 
    858         "[\\p{Greek}]",
    859         "\\u0391\\u03B1",
    860         "aA",
    861 
    862         "\\P{ GENERAL Category = upper case letter }",
    863         "abc",
    864         "ABC",
    865 
    866         // Combining class: @since ICU 2.2
    867         // Check both symbolic and numeric
    868         "\\p{ccc=Nukta}",
    869         "\\u0ABC",
    870         "abc",
    871 
    872         "\\p{Canonical Combining Class = 11}",
    873         "\\u05B1",
    874         "\\u05B2",
    875 
    876         "[:c c c = iota subscript :]",
    877         "\\u0345",
    878         "xyz",
    879 
    880         // Bidi class: @since ICU 2.2
    881         "\\p{bidiclass=lefttoright}",
    882         "abc",
    883         "\\u0671\\u0672",
    884 
    885         // Binary properties: @since ICU 2.2
    886         "\\p{ideographic}",
    887         "\\u4E0A",
    888         "x",
    889 
    890         "[:math=false:]",
    891         "q)*(",
    892         // weiv: )(and * were removed from math in Unicode 4.0.1
    893         //"(*+)",
    894         "+<>^",
    895 
    896         // JB#1767 \N{}, \p{ASCII}
    897         "[:Ascii:]",
    898         "abc\\u0000\\u007F",
    899         "\\u0080\\u4E00",
    900 
    901         "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
    902         "az",
    903         "qrs",
    904 
    905         // JB#2015
    906         "[:any:]",
    907         "a\\U0010FFFF",
    908         "",
    909 
    910         "[:nv=0.5:]",
    911         "\\u00BD\\u0F2A",
    912         "\\u00BC",
    913 
    914         // JB#2653: Age
    915         "[:Age=1.1:]",
    916         "\\u03D6", // 1.1
    917         "\\u03D8\\u03D9", // 3.2
    918 
    919         "[:Age=3.1:]",
    920         "\\u1800\\u3400\\U0002f800",
    921         "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
    922 
    923         // JB#2350: Case_Sensitive
    924         "[:Case Sensitive:]",
    925         "A\\u1FFC\\U00010410",
    926         ";\\u00B4\\U00010500",
    927 
    928         // JB#2832: C99-compatibility props
    929         "[:blank:]",
    930         " \\u0009",
    931         "1-9A-Z",
    932 
    933         "[:graph:]",
    934         "19AZ",
    935         " \\u0003\\u0007\\u0009\\u000A\\u000D",
    936 
    937         "[:punct:]",
    938         "!@#%&*()[]{}-_\\/;:,.?'\"",
    939         "09azAZ",
    940 
    941         "[:xdigit:]",
    942         "09afAF",
    943         "gG!",
    944 
    945         // Regex compatibility test
    946         "[-b]", // leading '-' is literal
    947         "-b",
    948         "ac",
    949 
    950         "[^-b]", // leading '-' is literal
    951         "ac",
    952         "-b",
    953 
    954         "[b-]", // trailing '-' is literal
    955         "-b",
    956         "ac",
    957 
    958         "[^b-]", // trailing '-' is literal
    959         "ac",
    960         "-b",
    961 
    962         "[a-b-]", // trailing '-' is literal
    963         "ab-",
    964         "c=",
    965 
    966         "[[a-q]&[p-z]-]", // trailing '-' is literal
    967         "pq-",
    968         "or=",
    969 
    970         "[\\s|\\)|:|$|\\>]", // from regex tests
    971         "s|):$>",
    972         "abc",
    973 
    974         "[\\uDC00cd]", // JB#2906: isolated trail at start
    975         "cd\\uDC00",
    976         "ab\\uD800\\U00010000",
    977 
    978         "[ab\\uD800]", // JB#2906: isolated trail at start
    979         "ab\\uD800",
    980         "cd\\uDC00\\U00010000",
    981 
    982         "[ab\\uD800cd]", // JB#2906: isolated lead in middle
    983         "abcd\\uD800",
    984         "ef\\uDC00\\U00010000",
    985 
    986         "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
    987         "abcd\\uDC00",
    988         "ef\\uD800\\U00010000",
    989 
    990         "[:^lccc=0:]", // Lead canonical class
    991         "\\u0300\\u0301",
    992         "abcd\\u00c0\\u00c5",
    993 
    994         "[:^tccc=0:]", // Trail canonical class
    995         "\\u0300\\u0301\\u00c0\\u00c5",
    996         "abcd",
    997 
    998         "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
    999         "\\u0300\\u0301\\u00c0\\u00c5",
   1000         "abcd",
   1001 
   1002         "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
   1003         "",
   1004         "abcd\\u0300\\u0301\\u00c0\\u00c5",
   1005 
   1006         "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
   1007         "\\u0F73\\u0F75\\u0F81",
   1008         "abcd\\u0300\\u0301\\u00c0\\u00c5",
   1009 
   1010         "[:Assigned:]",
   1011         "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
   1012         "\\u0888\\uFDD3\\uFFFE\\U00050005"
   1013     };
   1014 
   1015     static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
   1016 
   1017     for (int32_t i=0; i<DATA_LEN; i+=3) {
   1018         expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
   1019                           CharsToUnicodeString(DATA[i+2]));
   1020     }
   1021 }
   1022 
   1023 /**
   1024   * Test that Posix style character classes [:digit:], etc.
   1025   *   have the Unicode definitions from TR 18.
   1026   */
   1027 void UnicodeSetTest::TestPosixClasses() {
   1028     {
   1029         UErrorCode status = U_ZERO_ERROR;
   1030         UnicodeSet s1("[:alpha:]", status);
   1031         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
   1032         TEST_ASSERT_SUCCESS(status);
   1033         TEST_ASSERT(s1==s2);
   1034     }
   1035     {
   1036         UErrorCode status = U_ZERO_ERROR;
   1037         UnicodeSet s1("[:lower:]", status);
   1038         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
   1039         TEST_ASSERT_SUCCESS(status);
   1040         TEST_ASSERT(s1==s2);
   1041     }
   1042     {
   1043         UErrorCode status = U_ZERO_ERROR;
   1044         UnicodeSet s1("[:upper:]", status);
   1045         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
   1046         TEST_ASSERT_SUCCESS(status);
   1047         TEST_ASSERT(s1==s2);
   1048     }
   1049     {
   1050         UErrorCode status = U_ZERO_ERROR;
   1051         UnicodeSet s1("[:punct:]", status);
   1052         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
   1053         TEST_ASSERT_SUCCESS(status);
   1054         TEST_ASSERT(s1==s2);
   1055     }
   1056     {
   1057         UErrorCode status = U_ZERO_ERROR;
   1058         UnicodeSet s1("[:digit:]", status);
   1059         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
   1060         TEST_ASSERT_SUCCESS(status);
   1061         TEST_ASSERT(s1==s2);
   1062     }
   1063     {
   1064         UErrorCode status = U_ZERO_ERROR;
   1065         UnicodeSet s1("[:xdigit:]", status);
   1066         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
   1067         TEST_ASSERT_SUCCESS(status);
   1068         TEST_ASSERT(s1==s2);
   1069     }
   1070     {
   1071         UErrorCode status = U_ZERO_ERROR;
   1072         UnicodeSet s1("[:alnum:]", status);
   1073         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
   1074         TEST_ASSERT_SUCCESS(status);
   1075         TEST_ASSERT(s1==s2);
   1076     }
   1077     {
   1078         UErrorCode status = U_ZERO_ERROR;
   1079         UnicodeSet s1("[:space:]", status);
   1080         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
   1081         TEST_ASSERT_SUCCESS(status);
   1082         TEST_ASSERT(s1==s2);
   1083     }
   1084     {
   1085         UErrorCode status = U_ZERO_ERROR;
   1086         UnicodeSet s1("[:blank:]", status);
   1087         TEST_ASSERT_SUCCESS(status);
   1088         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
   1089             status);
   1090         TEST_ASSERT_SUCCESS(status);
   1091         TEST_ASSERT(s1==s2);
   1092     }
   1093     {
   1094         UErrorCode status = U_ZERO_ERROR;
   1095         UnicodeSet s1("[:cntrl:]", status);
   1096         TEST_ASSERT_SUCCESS(status);
   1097         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
   1098         TEST_ASSERT_SUCCESS(status);
   1099         TEST_ASSERT(s1==s2);
   1100     }
   1101     {
   1102         UErrorCode status = U_ZERO_ERROR;
   1103         UnicodeSet s1("[:graph:]", status);
   1104         TEST_ASSERT_SUCCESS(status);
   1105         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
   1106         TEST_ASSERT_SUCCESS(status);
   1107         TEST_ASSERT(s1==s2);
   1108     }
   1109     {
   1110         UErrorCode status = U_ZERO_ERROR;
   1111         UnicodeSet s1("[:print:]", status);
   1112         TEST_ASSERT_SUCCESS(status);
   1113         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
   1114         TEST_ASSERT_SUCCESS(status);
   1115         TEST_ASSERT(s1==s2);
   1116     }
   1117 }
   1118 /**
   1119  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
   1120  */
   1121 void UnicodeSetTest::TestClone() {
   1122     UErrorCode ec = U_ZERO_ERROR;
   1123     UnicodeSet s("[abcxyz]", ec);
   1124     UnicodeSet t(s);
   1125     expectContainment(t, "abc", "def");
   1126 }
   1127 
   1128 /**
   1129  * Test the indexOf() and charAt() methods.
   1130  */
   1131 void UnicodeSetTest::TestIndexOf() {
   1132     UErrorCode ec = U_ZERO_ERROR;
   1133     UnicodeSet set("[a-cx-y3578]", ec);
   1134     if (U_FAILURE(ec)) {
   1135         errln("FAIL: UnicodeSet constructor");
   1136         return;
   1137     }
   1138     for (int32_t i=0; i<set.size(); ++i) {
   1139         UChar32 c = set.charAt(i);
   1140         if (set.indexOf(c) != i) {
   1141             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
   1142                 i, c, set.indexOf(c));
   1143         }
   1144     }
   1145     UChar32 c = set.charAt(set.size());
   1146     if (c != -1) {
   1147         errln("FAIL: charAt(<out of range>) = %X", c);
   1148     }
   1149     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
   1150     if (j != -1) {
   1151         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
   1152     }
   1153 }
   1154 
   1155 /**
   1156  * Test closure API.
   1157  */
   1158 void UnicodeSetTest::TestCloseOver() {
   1159     UErrorCode ec = U_ZERO_ERROR;
   1160 
   1161     char CASE[] = {(char)USET_CASE_INSENSITIVE};
   1162     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
   1163     const char* DATA[] = {
   1164         // selector, input, output
   1165         CASE,
   1166         "[aq\\u00DF{Bc}{bC}{Fi}]",
   1167         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
   1168 
   1169         CASE,
   1170         "[\\u01F1]", // 'DZ'
   1171         "[\\u01F1\\u01F2\\u01F3]",
   1172 
   1173         CASE,
   1174         "[\\u1FB4]",
   1175         "[\\u1FB4{\\u03AC\\u03B9}]",
   1176 
   1177         CASE,
   1178         "[{F\\uFB01}]",
   1179         "[\\uFB03{ffi}]",
   1180 
   1181         CASE, // make sure binary search finds limits
   1182         "[a\\uFF3A]",
   1183         "[aA\\uFF3A\\uFF5A]",
   1184 
   1185         CASE,
   1186         "[a-z]","[A-Za-z\\u017F\\u212A]",
   1187         CASE,
   1188         "[abc]","[A-Ca-c]",
   1189         CASE,
   1190         "[ABC]","[A-Ca-c]",
   1191 
   1192         CASE, "[i]", "[iI]",
   1193 
   1194         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
   1195         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
   1196 
   1197         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
   1198 
   1199         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
   1200 
   1201         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
   1202 
   1203         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
   1204 
   1205         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
   1206 
   1207         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
   1208 
   1209         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
   1210         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
   1211 
   1212         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
   1213 
   1214         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
   1215 
   1216         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
   1217 
   1218         CASE_MAPPINGS,
   1219         "[aq\\u00DF{Bc}{bC}{Fi}]",
   1220         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
   1221 
   1222         CASE_MAPPINGS,
   1223         "[\\u01F1]", // 'DZ'
   1224         "[\\u01F1\\u01F2\\u01F3]",
   1225 
   1226         CASE_MAPPINGS,
   1227         "[a-z]",
   1228         "[A-Za-z]",
   1229 
   1230         NULL
   1231     };
   1232 
   1233     UnicodeSet s;
   1234     UnicodeSet t;
   1235     UnicodeString buf;
   1236     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
   1237         int32_t selector = DATA[i][0];
   1238         UnicodeString pat(DATA[i+1], -1, US_INV);
   1239         UnicodeString exp(DATA[i+2], -1, US_INV);
   1240         s.applyPattern(pat, ec);
   1241         s.closeOver(selector);
   1242         t.applyPattern(exp, ec);
   1243         if (U_FAILURE(ec)) {
   1244             errln("FAIL: applyPattern failed");
   1245             continue;
   1246         }
   1247         if (s == t) {
   1248             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
   1249         } else {
   1250             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
   1251                   s.toPattern(buf, TRUE) + ", expected " + exp);
   1252         }
   1253     }
   1254 
   1255 #if 0
   1256     /*
   1257      * Unused test code.
   1258      * This was used to compare the old implementation (using USET_CASE)
   1259      * with the new one (using 0x100 temporarily)
   1260      * while transitioning from hardcoded case closure tables in uniset.cpp
   1261      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
   1262      * and using ucase.c functions for closure.
   1263      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
   1264      *
   1265      * Note: The old and new implementation never fully matched because
   1266      * the old implementation turned out to not map U+0130 and U+0131 correctly
   1267      * (dotted I and dotless i) and because the old implementation's data tables
   1268      * were outdated compared to Unicode 4.0.1 at the time of the change to the
   1269      * new implementation. (So sigmas and some other characters were not handled
   1270      * according to the newer Unicode version.)
   1271      */
   1272     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
   1273     UnicodeSetIterator si(sens);
   1274     UnicodeString str, buf2;
   1275     const UnicodeString *pStr;
   1276     UChar32 c;
   1277     while(si.next()) {
   1278         if(!si.isString()) {
   1279             c=si.getCodepoint();
   1280             s.clear();
   1281             s.add(c);
   1282 
   1283             str.setTo(c);
   1284             str.foldCase();
   1285             sens2.add(str);
   1286 
   1287             t=s;
   1288             s.closeOver(USET_CASE);
   1289             t.closeOver(0x100);
   1290             if(s!=t) {
   1291                 errln("FAIL: closeOver(U+%04x) differs: ", c);
   1292                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
   1293             }
   1294         }
   1295     }
   1296     // remove all code points
   1297     // should contain all full case folding mapping strings
   1298     sens2.remove(0, 0x10ffff);
   1299     si.reset(sens2);
   1300     while(si.next()) {
   1301         if(si.isString()) {
   1302             pStr=&si.getString();
   1303             s.clear();
   1304             s.add(*pStr);
   1305             t=s2=s;
   1306             s.closeOver(USET_CASE);
   1307             t.closeOver(0x100);
   1308             if(s!=t) {
   1309                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
   1310                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
   1311             }
   1312         }
   1313     }
   1314 #endif
   1315 
   1316     // Test the pattern API
   1317     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
   1318     if (U_FAILURE(ec)) {
   1319         errln("FAIL: applyPattern failed");
   1320     } else {
   1321         expectContainment(s, "abcABC", "defDEF");
   1322     }
   1323     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
   1324     if (U_FAILURE(ec)) {
   1325         errln("FAIL: constructor failed");
   1326     } else {
   1327         expectContainment(v, "defDEF", "abcABC");
   1328     }
   1329     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
   1330     if (U_FAILURE(ec)) {
   1331         errln("FAIL: construct w/case mappings failed");
   1332     } else {
   1333         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
   1334     }
   1335 }
   1336 
   1337 void UnicodeSetTest::TestEscapePattern() {
   1338     const char pattern[] =
   1339         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
   1340     const char exp[] =
   1341         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
   1342     // We test this with two passes; in the second pass we
   1343     // pre-unescape the pattern.  Since U+200E is rule whitespace,
   1344     // this fails -- which is what we expect.
   1345     for (int32_t pass=1; pass<=2; ++pass) {
   1346         UErrorCode ec = U_ZERO_ERROR;
   1347         UnicodeString pat(pattern, -1, US_INV);
   1348         if (pass==2) {
   1349             pat = pat.unescape();
   1350         }
   1351         // Pattern is only good for pass 1
   1352         UBool isPatternValid = (pass==1);
   1353 
   1354         UnicodeSet set(pat, ec);
   1355         if (U_SUCCESS(ec) != isPatternValid){
   1356             errln((UnicodeString)"FAIL: applyPattern(" +
   1357                   escape(pat) + ") => " +
   1358                   u_errorName(ec));
   1359             continue;
   1360         }
   1361         if (U_FAILURE(ec)) {
   1362             continue;
   1363         }
   1364         if (set.contains((UChar)0x0644)){
   1365             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
   1366         }
   1367 
   1368         UnicodeString newpat;
   1369         set.toPattern(newpat, TRUE);
   1370         if (newpat == UnicodeString(exp, -1, US_INV)) {
   1371             logln(escape(pat) + " => " + newpat);
   1372         } else {
   1373             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
   1374         }
   1375 
   1376         for (int32_t i=0; i<set.getRangeCount(); ++i) {
   1377             UnicodeString str("Range ");
   1378             str.append((UChar)(0x30 + i))
   1379                 .append(": ")
   1380                 .append((UChar32)set.getRangeStart(i))
   1381                 .append(" - ")
   1382                 .append((UChar32)set.getRangeEnd(i));
   1383             str = str + " (" + set.getRangeStart(i) + " - " +
   1384                 set.getRangeEnd(i) + ")";
   1385             if (set.getRangeStart(i) < 0) {
   1386                 errln((UnicodeString)"FAIL: " + escape(str));
   1387             } else {
   1388                 logln(escape(str));
   1389             }
   1390         }
   1391     }
   1392 }
   1393 
   1394 void UnicodeSetTest::expectRange(const UnicodeString& label,
   1395                                  const UnicodeSet& set,
   1396                                  UChar32 start, UChar32 end) {
   1397     UnicodeSet exp(start, end);
   1398     UnicodeString pat;
   1399     if (set == exp) {
   1400         logln(label + " => " + set.toPattern(pat, TRUE));
   1401     } else {
   1402         UnicodeString xpat;
   1403         errln((UnicodeString)"FAIL: " + label + " => " +
   1404               set.toPattern(pat, TRUE) +
   1405               ", expected " + exp.toPattern(xpat, TRUE));
   1406     }
   1407 }
   1408 
   1409 void UnicodeSetTest::TestInvalidCodePoint() {
   1410 
   1411     const UChar32 DATA[] = {
   1412         // Test range             Expected range
   1413         0, 0x10FFFF,              0, 0x10FFFF,
   1414         (UChar32)-1, 8,           0, 8,
   1415         8, 0x110000,              8, 0x10FFFF
   1416     };
   1417     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
   1418 
   1419     UnicodeString pat;
   1420     int32_t i;
   1421 
   1422     for (i=0; i<DATA_LENGTH; i+=4) {
   1423         UChar32 start  = DATA[i];
   1424         UChar32 end    = DATA[i+1];
   1425         UChar32 xstart = DATA[i+2];
   1426         UChar32 xend   = DATA[i+3];
   1427 
   1428         // Try various API using the test code points
   1429 
   1430         UnicodeSet set(start, end);
   1431         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
   1432                     set, xstart, xend);
   1433 
   1434         set.clear();
   1435         set.set(start, end);
   1436         expectRange((UnicodeString)"set(" + start + "," + end + ")",
   1437                     set, xstart, xend);
   1438 
   1439         UBool b = set.contains(start);
   1440         b = set.contains(start, end);
   1441         b = set.containsNone(start, end);
   1442         b = set.containsSome(start, end);
   1443 
   1444         /*int32_t index = set.indexOf(start);*/
   1445 
   1446         set.clear();
   1447         set.add(start);
   1448         set.add(start, end);
   1449         expectRange((UnicodeString)"add(" + start + "," + end + ")",
   1450                     set, xstart, xend);
   1451 
   1452         set.set(0, 0x10FFFF);
   1453         set.retain(start, end);
   1454         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
   1455                     set, xstart, xend);
   1456         set.retain(start);
   1457 
   1458         set.set(0, 0x10FFFF);
   1459         set.remove(start);
   1460         set.remove(start, end);
   1461         set.complement();
   1462         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
   1463                     set, xstart, xend);
   1464 
   1465         set.set(0, 0x10FFFF);
   1466         set.complement(start, end);
   1467         set.complement();
   1468         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
   1469                     set, xstart, xend);
   1470         set.complement(start);
   1471     }
   1472 
   1473     const UChar32 DATA2[] = {
   1474         0,
   1475         0x10FFFF,
   1476         (UChar32)-1,
   1477         0x110000
   1478     };
   1479     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
   1480 
   1481     for (i=0; i<DATA2_LENGTH; ++i) {
   1482         UChar32 c = DATA2[i], end = 0x10FFFF;
   1483         UBool valid = (c >= 0 && c <= 0x10FFFF);
   1484 
   1485         UnicodeSet set(0, 0x10FFFF);
   1486 
   1487         // For single-codepoint contains, invalid codepoints are NOT contained
   1488         UBool b = set.contains(c);
   1489         if (b == valid) {
   1490             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
   1491                   ") = " + b);
   1492         } else {
   1493             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
   1494                   ") = " + b);
   1495         }
   1496 
   1497         // For codepoint range contains, containsNone, and containsSome,
   1498         // invalid or empty (start > end) ranges have UNDEFINED behavior.
   1499         b = set.contains(c, end);
   1500         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
   1501               "," + end + ") = " + b);
   1502 
   1503         b = set.containsNone(c, end);
   1504         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
   1505               "," + end + ") = " + b);
   1506 
   1507         b = set.containsSome(c, end);
   1508         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
   1509               "," + end + ") = " + b);
   1510 
   1511         int32_t index = set.indexOf(c);
   1512         if ((index >= 0) == valid) {
   1513             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
   1514                   ") = " + index);
   1515         } else {
   1516             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
   1517                   ") = " + index);
   1518         }
   1519     }
   1520 }
   1521 
   1522 // Used by TestSymbolTable
   1523 class TokenSymbolTable : public SymbolTable {
   1524 public:
   1525     Hashtable contents;
   1526 
   1527     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
   1528         contents.setValueDeleter(uhash_deleteUnicodeString);
   1529     }
   1530 
   1531     ~TokenSymbolTable() {}
   1532 
   1533     /**
   1534      * (Non-SymbolTable API) Add the given variable and value to
   1535      * the table.  Variable should NOT contain leading '$'.
   1536      */
   1537     void add(const UnicodeString& var, const UnicodeString& value,
   1538              UErrorCode& ec) {
   1539         if (U_SUCCESS(ec)) {
   1540             contents.put(var, new UnicodeString(value), ec);
   1541         }
   1542     }
   1543 
   1544     /**
   1545      * SymbolTable API
   1546      */
   1547     virtual const UnicodeString* lookup(const UnicodeString& s) const {
   1548         return (const UnicodeString*) contents.get(s);
   1549     }
   1550 
   1551     /**
   1552      * SymbolTable API
   1553      */
   1554     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
   1555         return NULL;
   1556     }
   1557 
   1558     /**
   1559      * SymbolTable API
   1560      */
   1561     virtual UnicodeString parseReference(const UnicodeString& text,
   1562                                          ParsePosition& pos, int32_t limit) const {
   1563         int32_t start = pos.getIndex();
   1564         int32_t i = start;
   1565         UnicodeString result;
   1566         while (i < limit) {
   1567             UChar c = text.charAt(i);
   1568             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
   1569                 break;
   1570             }
   1571             ++i;
   1572         }
   1573         if (i == start) { // No valid name chars
   1574             return result; // Indicate failure with empty string
   1575         }
   1576         pos.setIndex(i);
   1577         text.extractBetween(start, i, result);
   1578         return result;
   1579     }
   1580 };
   1581 
   1582 void UnicodeSetTest::TestSymbolTable() {
   1583     // Multiple test cases can be set up here.  Each test case
   1584     // is terminated by null:
   1585     // var, value, var, value,..., input pat., exp. output pat., null
   1586     const char* DATA[] = {
   1587         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
   1588         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
   1589         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
   1590         NULL
   1591     };
   1592 
   1593     for (int32_t i=0; DATA[i]!=NULL; ++i) {
   1594         UErrorCode ec = U_ZERO_ERROR;
   1595         TokenSymbolTable sym(ec);
   1596         if (U_FAILURE(ec)) {
   1597             errln("FAIL: couldn't construct TokenSymbolTable");
   1598             continue;
   1599         }
   1600 
   1601         // Set up variables
   1602         while (DATA[i+2] != NULL) {
   1603             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
   1604             if (U_FAILURE(ec)) {
   1605                 errln("FAIL: couldn't add to TokenSymbolTable");
   1606                 continue;
   1607             }
   1608             i += 2;
   1609         }
   1610 
   1611         // Input pattern and expected output pattern
   1612         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
   1613         i += 2;
   1614 
   1615         ParsePosition pos(0);
   1616         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
   1617         if (U_FAILURE(ec)) {
   1618             errln("FAIL: couldn't construct UnicodeSet");
   1619             continue;
   1620         }
   1621 
   1622         // results
   1623         if (pos.getIndex() != inpat.length()) {
   1624             errln((UnicodeString)"Failed to read to end of string \""
   1625                   + inpat + "\": read to "
   1626                   + pos.getIndex() + ", length is "
   1627                   + inpat.length());
   1628         }
   1629 
   1630         UnicodeSet us2(exppat, ec);
   1631         if (U_FAILURE(ec)) {
   1632             errln("FAIL: couldn't construct expected UnicodeSet");
   1633             continue;
   1634         }
   1635 
   1636         UnicodeString a, b;
   1637         if (us != us2) {
   1638             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
   1639                   ", expected " + us2.toPattern(b, TRUE));
   1640         } else {
   1641             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
   1642         }
   1643     }
   1644 }
   1645 
   1646 void UnicodeSetTest::TestSurrogate() {
   1647     const char* DATA[] = {
   1648         // These should all behave identically
   1649         "[abc\\uD800\\uDC00]",
   1650         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
   1651         "[abc\\U00010000]",
   1652         0
   1653     };
   1654     for (int i=0; DATA[i] != 0; ++i) {
   1655         UErrorCode ec = U_ZERO_ERROR;
   1656         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
   1657         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
   1658         UnicodeSet set(str, ec);
   1659         if (U_FAILURE(ec)) {
   1660             errln("FAIL: UnicodeSet constructor");
   1661             continue;
   1662         }
   1663         expectContainment(set,
   1664                           CharsToUnicodeString("abc\\U00010000"),
   1665                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
   1666         if (set.size() != 4) {
   1667             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
   1668                   set.size() + ", expected 4");
   1669         }
   1670     }
   1671 }
   1672 
   1673 void UnicodeSetTest::TestExhaustive() {
   1674     // exhaustive tests. Simulate UnicodeSets with integers.
   1675     // That gives us very solid tests (except for large memory tests).
   1676 
   1677     int32_t limit = 128;
   1678 
   1679     UnicodeSet x, y, z, aa;
   1680 
   1681     for (int32_t i = 0; i < limit; ++i) {
   1682         bitsToSet(i, x);
   1683         logln((UnicodeString)"Testing " + i + ", " + x);
   1684         _testComplement(i, x, y);
   1685 
   1686         // AS LONG AS WE ARE HERE, check roundtrip
   1687         checkRoundTrip(bitsToSet(i, aa));
   1688 
   1689         for (int32_t j = 0; j < limit; ++j) {
   1690             _testAdd(i,j,  x,y,z);
   1691             _testXor(i,j,  x,y,z);
   1692             _testRetain(i,j,  x,y,z);
   1693             _testRemove(i,j,  x,y,z);
   1694         }
   1695     }
   1696 }
   1697 
   1698 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
   1699     bitsToSet(a, x);
   1700     z = x;
   1701     z.complement();
   1702     int32_t c = setToBits(z);
   1703     if (c != (~a)) {
   1704         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
   1705         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
   1706     }
   1707     checkCanonicalRep(z, (UnicodeString)"complement " + a);
   1708 }
   1709 
   1710 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1711     bitsToSet(a, x);
   1712     bitsToSet(b, y);
   1713     z = x;
   1714     z.addAll(y);
   1715     int32_t c = setToBits(z);
   1716     if (c != (a | b)) {
   1717         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
   1718         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
   1719     }
   1720     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
   1721 }
   1722 
   1723 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1724     bitsToSet(a, x);
   1725     bitsToSet(b, y);
   1726     z = x;
   1727     z.retainAll(y);
   1728     int32_t c = setToBits(z);
   1729     if (c != (a & b)) {
   1730         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
   1731         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
   1732     }
   1733     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
   1734 }
   1735 
   1736 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1737     bitsToSet(a, x);
   1738     bitsToSet(b, y);
   1739     z = x;
   1740     z.removeAll(y);
   1741     int32_t c = setToBits(z);
   1742     if (c != (a &~ b)) {
   1743         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
   1744         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
   1745     }
   1746     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
   1747 }
   1748 
   1749 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1750     bitsToSet(a, x);
   1751     bitsToSet(b, y);
   1752     z = x;
   1753     z.complementAll(y);
   1754     int32_t c = setToBits(z);
   1755     if (c != (a ^ b)) {
   1756         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
   1757         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
   1758     }
   1759     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
   1760 }
   1761 
   1762 /**
   1763  * Check that ranges are monotonically increasing and non-
   1764  * overlapping.
   1765  */
   1766 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
   1767     int32_t n = set.getRangeCount();
   1768     if (n < 0) {
   1769         errln((UnicodeString)"FAIL result of " + msg +
   1770               ": range count should be >= 0 but is " +
   1771               n /*+ " for " + set.toPattern())*/);
   1772         return;
   1773     }
   1774     UChar32 last = 0;
   1775     for (int32_t i=0; i<n; ++i) {
   1776         UChar32 start = set.getRangeStart(i);
   1777         UChar32 end = set.getRangeEnd(i);
   1778         if (start > end) {
   1779             errln((UnicodeString)"FAIL result of " + msg +
   1780                   ": range " + (i+1) +
   1781                   " start > end: " + (int)start + ", " + (int)end +
   1782                   " for " + set);
   1783         }
   1784         if (i > 0 && start <= last) {
   1785             errln((UnicodeString)"FAIL result of " + msg +
   1786                   ": range " + (i+1) +
   1787                   " overlaps previous range: " + (int)start + ", " + (int)end +
   1788                   " for " + set);
   1789         }
   1790         last = end;
   1791     }
   1792 }
   1793 
   1794 /**
   1795  * Convert a bitmask to a UnicodeSet.
   1796  */
   1797 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
   1798     result.clear();
   1799     for (UChar32 i = 0; i < 32; ++i) {
   1800         if ((a & (1<<i)) != 0) {
   1801             result.add(i);
   1802         }
   1803     }
   1804     return result;
   1805 }
   1806 
   1807 /**
   1808  * Convert a UnicodeSet to a bitmask.  Only the characters
   1809  * U+0000 to U+0020 are represented in the bitmask.
   1810  */
   1811 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
   1812     int32_t result = 0;
   1813     for (int32_t i = 0; i < 32; ++i) {
   1814         if (x.contains((UChar32)i)) {
   1815             result |= (1<<i);
   1816         }
   1817     }
   1818     return result;
   1819 }
   1820 
   1821 /**
   1822  * Return the representation of an inversion list based UnicodeSet
   1823  * as a pairs list.  Ranges are listed in ascending Unicode order.
   1824  * For example, the set [a-zA-M3] is represented as "33AMaz".
   1825  */
   1826 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
   1827     UnicodeString pairs;
   1828     for (int32_t i=0; i<set.getRangeCount(); ++i) {
   1829         UChar32 start = set.getRangeStart(i);
   1830         UChar32 end = set.getRangeEnd(i);
   1831         if (end > 0xFFFF) {
   1832             end = 0xFFFF;
   1833             i = set.getRangeCount(); // Should be unnecessary
   1834         }
   1835         pairs.append((UChar)start).append((UChar)end);
   1836     }
   1837     return pairs;
   1838 }
   1839 
   1840 /**
   1841  * Basic consistency check for a few items.
   1842  * That the iterator works, and that we can create a pattern and
   1843  * get the same thing back
   1844  */
   1845 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
   1846     UErrorCode ec = U_ZERO_ERROR;
   1847 
   1848     UnicodeSet t(s);
   1849     checkEqual(s, t, "copy ct");
   1850 
   1851     t = s;
   1852     checkEqual(s, t, "operator=");
   1853 
   1854     copyWithIterator(t, s, FALSE);
   1855     checkEqual(s, t, "iterator roundtrip");
   1856 
   1857     copyWithIterator(t, s, TRUE); // try range
   1858     checkEqual(s, t, "iterator roundtrip");
   1859 
   1860     UnicodeString pat; s.toPattern(pat, FALSE);
   1861     t.applyPattern(pat, ec);
   1862     if (U_FAILURE(ec)) {
   1863         errln("FAIL: applyPattern");
   1864         return;
   1865     } else {
   1866         checkEqual(s, t, "toPattern(false)");
   1867     }
   1868 
   1869     s.toPattern(pat, TRUE);
   1870     t.applyPattern(pat, ec);
   1871     if (U_FAILURE(ec)) {
   1872         errln("FAIL: applyPattern");
   1873         return;
   1874     } else {
   1875         checkEqual(s, t, "toPattern(true)");
   1876     }
   1877 }
   1878 
   1879 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
   1880     t.clear();
   1881     UnicodeSetIterator it(s);
   1882     if (withRange) {
   1883         while (it.nextRange()) {
   1884             if (it.isString()) {
   1885                 t.add(it.getString());
   1886             } else {
   1887                 t.add(it.getCodepoint(), it.getCodepointEnd());
   1888             }
   1889         }
   1890     } else {
   1891         while (it.next()) {
   1892             if (it.isString()) {
   1893                 t.add(it.getString());
   1894             } else {
   1895                 t.add(it.getCodepoint());
   1896             }
   1897         }
   1898     }
   1899 }
   1900 
   1901 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
   1902     UnicodeString source; s.toPattern(source, TRUE);
   1903     UnicodeString result; t.toPattern(result, TRUE);
   1904     if (s != t) {
   1905         errln((UnicodeString)"FAIL: " + message
   1906               + "; source = " + source
   1907               + "; result = " + result
   1908               );
   1909         return FALSE;
   1910     } else {
   1911         logln((UnicodeString)"Ok: " + message
   1912               + "; source = " + source
   1913               + "; result = " + result
   1914               );
   1915     }
   1916     return TRUE;
   1917 }
   1918 
   1919 void
   1920 UnicodeSetTest::expectContainment(const UnicodeString& pat,
   1921                                   const UnicodeString& charsIn,
   1922                                   const UnicodeString& charsOut) {
   1923     UErrorCode ec = U_ZERO_ERROR;
   1924     UnicodeSet set(pat, ec);
   1925     if (U_FAILURE(ec)) {
   1926         dataerrln((UnicodeString)"FAIL: pattern \"" +
   1927               pat + "\" => " + u_errorName(ec));
   1928         return;
   1929     }
   1930     expectContainment(set, pat, charsIn, charsOut);
   1931 }
   1932 
   1933 void
   1934 UnicodeSetTest::expectContainment(const UnicodeSet& set,
   1935                                   const UnicodeString& charsIn,
   1936                                   const UnicodeString& charsOut) {
   1937     UnicodeString pat;
   1938     set.toPattern(pat);
   1939     expectContainment(set, pat, charsIn, charsOut);
   1940 }
   1941 
   1942 void
   1943 UnicodeSetTest::expectContainment(const UnicodeSet& set,
   1944                                   const UnicodeString& setName,
   1945                                   const UnicodeString& charsIn,
   1946                                   const UnicodeString& charsOut) {
   1947     UnicodeString bad;
   1948     UChar32 c;
   1949     int32_t i;
   1950 
   1951     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
   1952         c = charsIn.char32At(i);
   1953         if (!set.contains(c)) {
   1954             bad.append(c);
   1955         }
   1956     }
   1957     if (bad.length() > 0) {
   1958         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
   1959               ", expected containment of " + prettify(charsIn));
   1960     } else {
   1961         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
   1962     }
   1963 
   1964     bad.truncate(0);
   1965     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
   1966         c = charsOut.char32At(i);
   1967         if (set.contains(c)) {
   1968             bad.append(c);
   1969         }
   1970     }
   1971     if (bad.length() > 0) {
   1972         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
   1973               ", expected non-containment of " + prettify(charsOut));
   1974     } else {
   1975         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
   1976     }
   1977 }
   1978 
   1979 void
   1980 UnicodeSetTest::expectPattern(UnicodeSet& set,
   1981                               const UnicodeString& pattern,
   1982                               const UnicodeString& expectedPairs){
   1983     UErrorCode status = U_ZERO_ERROR;
   1984     set.applyPattern(pattern, status);
   1985     if (U_FAILURE(status)) {
   1986         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
   1987               "\") failed");
   1988         return;
   1989     } else {
   1990         if (getPairs(set) != expectedPairs ) {
   1991             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
   1992                   "\") => pairs \"" +
   1993                   escape(getPairs(set)) + "\", expected \"" +
   1994                   escape(expectedPairs) + "\"");
   1995         } else {
   1996             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
   1997                   "\") => pairs \"" +
   1998                   escape(getPairs(set)) + "\"");
   1999         }
   2000     }
   2001     // the result of calling set.toPattern(), which is the string representation of
   2002     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
   2003     // will produce another set that is equal to this one.
   2004     UnicodeString temppattern;
   2005     set.toPattern(temppattern);
   2006     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
   2007     if (U_FAILURE(status)) {
   2008         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
   2009         return;
   2010     }
   2011     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
   2012         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
   2013             escape(getPairs(set)) + "\""));
   2014     } else{
   2015         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
   2016     }
   2017 
   2018     delete tempset;
   2019 
   2020 }
   2021 
   2022 void
   2023 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
   2024     if (getPairs(set) != expectedPairs) {
   2025         errln(UnicodeString("FAIL: Expected pair list \"") +
   2026               escape(expectedPairs) + "\", got \"" +
   2027               escape(getPairs(set)) + "\"");
   2028     }
   2029 }
   2030 
   2031 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
   2032                                      const UnicodeString& expPat,
   2033                                      const char** expStrings) {
   2034     UnicodeString pat;
   2035     set.toPattern(pat, TRUE);
   2036     if (pat == expPat) {
   2037         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
   2038     } else {
   2039         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
   2040         return;
   2041     }
   2042     if (expStrings == NULL) {
   2043         return;
   2044     }
   2045     UBool in = TRUE;
   2046     for (int32_t i=0; expStrings[i] != NULL; ++i) {
   2047         if (expStrings[i] == NOT) { // sic; pointer comparison
   2048             in = FALSE;
   2049             continue;
   2050         }
   2051         UnicodeString s = CharsToUnicodeString(expStrings[i]);
   2052         UBool contained = set.contains(s);
   2053         if (contained == in) {
   2054             logln((UnicodeString)"Ok: " + expPat +
   2055                   (contained ? " contains {" : " does not contain {") +
   2056                   escape(expStrings[i]) + "}");
   2057         } else {
   2058             errln((UnicodeString)"FAIL: " + expPat +
   2059                   (contained ? " contains {" : " does not contain {") +
   2060                   escape(expStrings[i]) + "}");
   2061         }
   2062     }
   2063 }
   2064 
   2065 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
   2066 
   2067 void
   2068 UnicodeSetTest::doAssert(UBool condition, const char *message)
   2069 {
   2070     if (!condition) {
   2071         errln(UnicodeString("ERROR : ") + message);
   2072     }
   2073 }
   2074 
   2075 UnicodeString
   2076 UnicodeSetTest::escape(const UnicodeString& s) {
   2077     UnicodeString buf;
   2078     for (int32_t i=0; i<s.length(); )
   2079     {
   2080         UChar32 c = s.char32At(i);
   2081         if (0x0020 <= c && c <= 0x007F) {
   2082             buf += c;
   2083         } else {
   2084             if (c <= 0xFFFF) {
   2085                 buf += (UChar)0x5c; buf += (UChar)0x75;
   2086             } else {
   2087                 buf += (UChar)0x5c; buf += (UChar)0x55;
   2088                 buf += toHexString((c & 0xF0000000) >> 28);
   2089                 buf += toHexString((c & 0x0F000000) >> 24);
   2090                 buf += toHexString((c & 0x00F00000) >> 20);
   2091                 buf += toHexString((c & 0x000F0000) >> 16);
   2092             }
   2093             buf += toHexString((c & 0xF000) >> 12);
   2094             buf += toHexString((c & 0x0F00) >> 8);
   2095             buf += toHexString((c & 0x00F0) >> 4);
   2096             buf += toHexString(c & 0x000F);
   2097         }
   2098         i += U16_LENGTH(c);
   2099     }
   2100     return buf;
   2101 }
   2102 
   2103 void UnicodeSetTest::TestFreezable() {
   2104     UErrorCode errorCode=U_ZERO_ERROR;
   2105     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
   2106     UnicodeSet idSet(idPattern, errorCode);
   2107     if(U_FAILURE(errorCode)) {
   2108         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
   2109         return;
   2110     }
   2111 
   2112     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
   2113     UnicodeSet wsSet(wsPattern, errorCode);
   2114     if(U_FAILURE(errorCode)) {
   2115         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
   2116         return;
   2117     }
   2118 
   2119     idSet.add(idPattern);
   2120     UnicodeSet frozen(idSet);
   2121     frozen.freeze();
   2122 
   2123     if(idSet.isFrozen() || !frozen.isFrozen()) {
   2124         errln("FAIL: isFrozen() is wrong");
   2125     }
   2126     if(frozen!=idSet || !(frozen==idSet)) {
   2127         errln("FAIL: a copy-constructed frozen set differs from its original");
   2128     }
   2129 
   2130     frozen=wsSet;
   2131     if(frozen!=idSet || !(frozen==idSet)) {
   2132         errln("FAIL: a frozen set was modified by operator=");
   2133     }
   2134 
   2135     UnicodeSet frozen2(frozen);
   2136     if(frozen2!=frozen || frozen2!=idSet) {
   2137         errln("FAIL: a copied frozen set differs from its frozen original");
   2138     }
   2139     if(!frozen2.isFrozen()) {
   2140         errln("FAIL: copy-constructing a frozen set results in a thawed one");
   2141     }
   2142     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
   2143     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
   2144         errln("FAIL: UnicodeSet(5, 55) failed");
   2145     }
   2146     frozen3=frozen;
   2147     if(!frozen3.isFrozen()) {
   2148         errln("FAIL: copying a frozen set results in a thawed one");
   2149     }
   2150 
   2151     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
   2152     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
   2153         errln("FAIL: clone() failed");
   2154     }
   2155     cloned->add(0xd802, 0xd805);
   2156     if(cloned->containsSome(0xd802, 0xd805)) {
   2157         errln("FAIL: unable to modify clone");
   2158     }
   2159     delete cloned;
   2160 
   2161     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
   2162     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
   2163         errln("FAIL: cloneAsThawed() failed");
   2164     }
   2165     thawed->add(0xd802, 0xd805);
   2166     if(!thawed->contains(0xd802, 0xd805)) {
   2167         errln("FAIL: unable to modify thawed clone");
   2168     }
   2169     delete thawed;
   2170 
   2171     frozen.set(5, 55);
   2172     if(frozen!=idSet || !(frozen==idSet)) {
   2173         errln("FAIL: UnicodeSet::set() modified a frozen set");
   2174     }
   2175 
   2176     frozen.clear();
   2177     if(frozen!=idSet || !(frozen==idSet)) {
   2178         errln("FAIL: UnicodeSet::clear() modified a frozen set");
   2179     }
   2180 
   2181     frozen.closeOver(USET_CASE_INSENSITIVE);
   2182     if(frozen!=idSet || !(frozen==idSet)) {
   2183         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
   2184     }
   2185 
   2186     frozen.compact();
   2187     if(frozen!=idSet || !(frozen==idSet)) {
   2188         errln("FAIL: UnicodeSet::compact() modified a frozen set");
   2189     }
   2190 
   2191     ParsePosition pos;
   2192     frozen.
   2193         applyPattern(wsPattern, errorCode).
   2194         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
   2195         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
   2196         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
   2197         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
   2198     if(frozen!=idSet || !(frozen==idSet)) {
   2199         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
   2200     }
   2201 
   2202     frozen.
   2203         add(0xd800).
   2204         add(0xd802, 0xd805).
   2205         add(wsPattern).
   2206         addAll(idPattern).
   2207         addAll(wsSet);
   2208     if(frozen!=idSet || !(frozen==idSet)) {
   2209         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
   2210     }
   2211 
   2212     frozen.
   2213         retain(0x62).
   2214         retain(0x64, 0x69).
   2215         retainAll(wsPattern).
   2216         retainAll(wsSet);
   2217     if(frozen!=idSet || !(frozen==idSet)) {
   2218         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
   2219     }
   2220 
   2221     frozen.
   2222         remove(0x62).
   2223         remove(0x64, 0x69).
   2224         remove(idPattern).
   2225         removeAll(idPattern).
   2226         removeAll(idSet);
   2227     if(frozen!=idSet || !(frozen==idSet)) {
   2228         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
   2229     }
   2230 
   2231     frozen.
   2232         complement().
   2233         complement(0x62).
   2234         complement(0x64, 0x69).
   2235         complement(idPattern).
   2236         complementAll(idPattern).
   2237         complementAll(idSet);
   2238     if(frozen!=idSet || !(frozen==idSet)) {
   2239         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
   2240     }
   2241 }
   2242 
   2243 // Test span() etc. -------------------------------------------------------- ***
   2244 
   2245 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
   2246 static int32_t
   2247 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
   2248     UErrorCode errorCode=U_ZERO_ERROR;
   2249     int32_t length8=0;
   2250     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
   2251     if(U_SUCCESS(errorCode)) {
   2252         return length8;
   2253     } else {
   2254         // The string contains an unpaired surrogate.
   2255         // Ignore this string.
   2256         return 0;
   2257     }
   2258 }
   2259 
   2260 class UnicodeSetWithStringsIterator;
   2261 
   2262 // Make the strings in a UnicodeSet easily accessible.
   2263 class UnicodeSetWithStrings {
   2264 public:
   2265     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
   2266             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
   2267         int32_t size=set.size();
   2268         if(size>0 && set.charAt(size-1)<0) {
   2269             // If a set's last element is not a code point, then it must contain strings.
   2270             // Iterate over the set, skip all code point ranges, and cache the strings.
   2271             // Convert them to UTF-8 for spanUTF8().
   2272             UnicodeSetIterator iter(set);
   2273             const UnicodeString *s;
   2274             char *s8=utf8;
   2275             int32_t length8, utf8Count=0;
   2276             while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
   2277                 if(iter.isString()) {
   2278                     // Store the pointer to the set's string element
   2279                     // which we happen to know is a stable pointer.
   2280                     strings[stringsLength]=s=&iter.getString();
   2281                     utf8Count+=
   2282                         utf8Lengths[stringsLength]=length8=
   2283                         appendUTF8(s->getBuffer(), s->length(),
   2284                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
   2285                     if(length8==0) {
   2286                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
   2287                     }
   2288                     s8+=length8;
   2289                     ++stringsLength;
   2290                 }
   2291             }
   2292         }
   2293     }
   2294 
   2295     const UnicodeSet &getSet() const {
   2296         return set;
   2297     }
   2298 
   2299     UBool hasStrings() const {
   2300         return (UBool)(stringsLength>0);
   2301     }
   2302 
   2303     UBool hasStringsWithSurrogates() const {
   2304         return hasSurrogates;
   2305     }
   2306 
   2307 private:
   2308     friend class UnicodeSetWithStringsIterator;
   2309 
   2310     const UnicodeSet &set;
   2311 
   2312     const UnicodeString *strings[20];
   2313     int32_t stringsLength;
   2314     UBool hasSurrogates;
   2315 
   2316     char utf8[1024];
   2317     int32_t utf8Lengths[20];
   2318 
   2319     int32_t nextStringIndex;
   2320     int32_t nextUTF8Start;
   2321 };
   2322
        // Cursor over the strings that a UnicodeSetWithStrings has collected.
        // It keeps a reference to that object plus two positions: the index of the
        // next string, and the byte offset of the next string's UTF-8 form inside
        // fSet.utf8. nextString() and nextUTF8() advance the same string index, so
        // use only one of them between calls to reset().
   2323 class UnicodeSetWithStringsIterator {
   2324 public:
            // Binds to set by reference (no copy is made) and starts at its first string.
   2325     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
   2326             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
   2327     }
   2328
            // Rewinds both positions so that the strings can be walked again from the start.
   2329     void reset() {
   2330         nextStringIndex=nextUTF8Start=0;
   2331     }
   2332
            // Returns the next collected string and advances the string index,
            // or returns NULL once fSet.stringsLength strings have been handed out.
   2333     const UnicodeString *nextString() {
   2334         if(nextStringIndex<fSet.stringsLength) {
   2335             return fSet.strings[nextStringIndex++];
   2336         } else {
   2337             return NULL;
   2338         }
   2339     }
   2340
   2341     // Do not mix with calls to nextString().
            // (Both advance nextStringIndex, but only this function advances nextUTF8Start.)
            // Returns a pointer into fSet.utf8 at the next string's bytes and sets length to the
            // byte count stored for it in fSet.utf8Lengths; returns NULL with length=0 at the end.
            // NOTE(review): a non-NULL result with length==0 appears to mark a string that could
            // not be converted to UTF-8; the callers in this file skip entries with length8==0.
   2342     const char *nextUTF8(int32_t &length) {
   2343         if(nextStringIndex<fSet.stringsLength) {
   2344             const char *s8=fSet.utf8+nextUTF8Start;
                    // Read this string's byte length, report it, and skip past its bytes.
   2345             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
   2346             return s8;
   2347         } else {
   2348             length=0;
   2349             return NULL;
   2350         }
   2351     }
   2352
   2353 private:
   2354     const UnicodeSetWithStrings &fSet;  // source of strings[], utf8[] and utf8Lengths[]
   2355     int32_t nextStringIndex;            // index of the next string to return
   2356     int32_t nextUTF8Start;              // byte offset in fSet.utf8 of the next UTF-8 string
   2357 };
   2358 
   2359 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
   2360 // at code point boundaries.
   2361 // That is, each edge of a match must not be in the middle of a surrogate pair.
        //
        //   s      text buffer; the candidate match begins at s[start]
        //   start  offset of the candidate match within s
        //   limit  end offset of the text within s
        //   t      string to look for at s[start]
        //
        // Returns TRUE if the t.length() units at s[start] equal t, and neither the start
        // nor the end of that match falls between a lead surrogate and the trail surrogate
        // that directly follows it.
        // No check is made here that t fits into [start, limit); the callers in this file
        // test str->length() against the remaining text before calling.
        // NOTE(review): for an empty t, s[length-1] below would be s[-1]; presumably the
        // collected set strings are never empty - confirm.
   2362 static inline UBool
   2363 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
   2364     s+=start;       // From here on s points at the candidate match ...
   2365     limit-=start;   // ... and limit is the number of units from there to the end of the text.
   2366     int32_t length=t.length();
   2367     return 0==t.compare(s, length) &&
                   // Start edge: only checked if there is text before the match.
   2368            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
                   // End edge: only checked if there is text after the match.
   2369            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
   2370 }
   2371 
   2372 // Implement span() with contains() for comparison.
        //
        // Walks the UTF-16 text s[0..length) from the front and returns the length of the
        // initial span that satisfies spanCondition, using only realSet.contains() on single
        // code points and matches16CPB() on the strings collected in set.
        //
        //   set            the set under test together with its collected strings
        //   s, length      text to span; length must be >=0 here
        //   spanCondition  USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
        //
        // Three cases:
        //   - The set has no strings: step over code points while contains() agrees with
        //     the condition.
        //   - NOT_CONTAINED with strings: stop at the first position where the code point is
        //     in the set or where one of the strings matches.
        //   - CONTAINED/SIMPLE with strings: at each position take the contained code point
        //     and/or matching strings. SIMPLE always continues after the longest candidate.
        //     CONTAINED continues after the shortest candidate and recurses for every other
        //     candidate, and returns the farthest limit reached by any of these paths.
   2373 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
   2374                                  USetSpanCondition spanCondition) {
   2375     const UnicodeSet &realSet(set.getSet());
   2376     if(!set.hasStrings()) {
   2377         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2378             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2379         }
   2380
   2381         UChar32 c;
   2382         int32_t start=0, prev;
                // prev trails start by one code point: it is the offset of the code point
                // that U16_NEXT is about to read, and the span limit when the loop stops.
   2383         while((prev=start)<length) {
   2384             U16_NEXT(s, start, length, c);
                    // The pinned condition is compared directly with the result of contains().
   2385             if(realSet.contains(c)!=spanCondition) {
   2386                 break;
   2387             }
   2388         }
   2389         return prev;
   2390     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2391         UnicodeSetWithStringsIterator iter(set);
   2392         UChar32 c;
   2393         int32_t start, next;
                // start is the position being examined; next is the end of the code point there.
   2394         for(start=next=0; start<length;) {
   2395             U16_NEXT(s, next, length, c);
   2396             if(realSet.contains(c)) {
   2397                 break;
   2398             }
                    // The code point is not in the set; the span still ends here
                    // if any set string begins at this position.
   2399             const UnicodeString *str;
   2400             iter.reset();
   2401             while((str=iter.nextString())!=NULL) {
   2402                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
   2403                     // spanNeedsStrings=TRUE;
   2404                     return start;
   2405                 }
   2406             }
   2407             start=next;
   2408         }
   2409         return start;
   2410     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2411         UnicodeSetWithStringsIterator iter(set);
   2412         UChar32 c;
                // start: current position. next: where the iteration continues from start;
                // next==start means that no candidate has been found yet at this position.
                // maxSpanLimit: farthest limit reached by any recursive (non-shortest) path.
   2413         int32_t start, next, maxSpanLimit=0;
   2414         for(start=next=0; start<length;) {
   2415             U16_NEXT(s, next, length, c);
   2416             if(!realSet.contains(c)) {
   2417                 next=start;  // Do not span this single, not-contained code point.
   2418             }
   2419             const UnicodeString *str;
   2420             iter.reset();
   2421             while((str=iter.nextString())!=NULL) {
   2422                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
   2423                     // spanNeedsStrings=TRUE;
   2424                     int32_t matchLimit=start+str->length();
   2425                     if(matchLimit==length) {
                                // This match reaches the end of the text: nothing can span farther.
   2426                         return length;
   2427                     }
   2428                     if(spanCondition==USET_SPAN_CONTAINED) {
   2429                         // Iterate for the shortest match at each position.
   2430                         // Recurse for each but the shortest match.
   2431                         if(next==start) {
   2432                             next=matchLimit;  // First match from start.
   2433                         } else {
   2434                             if(matchLimit<next) {
   2435                                 // Remember shortest match from start for iteration.
                                        // After the swap next is the shorter limit and
                                        // matchLimit is the longer one.
   2436                                 int32_t temp=next;
   2437                                 next=matchLimit;
   2438                                 matchLimit=temp;
   2439                             }
   2440                             // Recurse for non-shortest match from start.
   2441                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
   2442                                                                  USET_SPAN_CONTAINED);
   2443                             if((matchLimit+spanLength)>maxSpanLimit) {
   2444                                 maxSpanLimit=matchLimit+spanLength;
   2445                                 if(maxSpanLimit==length) {
   2446                                     return length;
   2447                                 }
   2448                             }
   2449                         }
   2450                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2451                         if(matchLimit>next) {
   2452                             // Remember longest match from start.
   2453                             next=matchLimit;
   2454                         }
   2455                     }
   2456                 }
   2457             }
   2458             if(next==start) {
   2459                 break;  // No match from start.
   2460             }
   2461             start=next;
   2462         }
                // Return the farther of the iterated path and the best recursive path.
   2463         if(start>maxSpanLimit) {
   2464             return start;
   2465         } else {
   2466             return maxSpanLimit;
   2467         }
   2468     }
   2469 }
   2470
        // Backward counterpart of containsSpanUTF16(): implements spanBack() with contains()
        // and matches16CPB() for comparison.
        //
        // Walks the UTF-16 text s[0..length) from the end towards the front and returns the
        // start offset of the trailing span that satisfies spanCondition (0 if the span
        // covers the whole text, length if it is empty).
        //
        //   set            the set under test together with its collected strings
        //   s, length      text to span; length must be >=0 here
        //   spanCondition  USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
        //
        // Inside the loops the parameter length is reused as the moving position:
        // U16_PREV() steps it back by one code point, and prev holds the position before
        // that step.
   2471 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
   2472                                      USetSpanCondition spanCondition) {
   2473     if(length==0) {
                // The do-while loops below always read one code point, so handle empty text first.
   2474         return 0;
   2475     }
   2476     const UnicodeSet &realSet(set.getSet());
   2477     if(!set.hasStrings()) {
   2478         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2479             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2480         }
   2481
   2482         UChar32 c;
   2483         int32_t prev=length;
   2484         do {
   2485             U16_PREV(s, 0, length, c);
                    // The pinned condition is compared directly with the result of contains().
   2486             if(realSet.contains(c)!=spanCondition) {
   2487                 break;
   2488             }
   2489         } while((prev=length)>0);
   2490         return prev;
   2491     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2492         UnicodeSetWithStringsIterator iter(set);
   2493         UChar32 c;
                // length0 keeps the original text length for the boundary check in matches16CPB().
   2494         int32_t prev=length, length0=length;
   2495         do {
   2496             U16_PREV(s, 0, length, c);
   2497             if(realSet.contains(c)) {
   2498                 break;
   2499             }
                    // The code point is not in the set; the span still stops at prev
                    // if any set string ends exactly at prev.
   2500             const UnicodeString *str;
   2501             iter.reset();
   2502             while((str=iter.nextString())!=NULL) {
   2503                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
   2504                     // spanNeedsStrings=TRUE;
   2505                     return prev;
   2506                 }
   2507             }
   2508         } while((prev=length)>0);
   2509         return prev;
   2510     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2511         UnicodeSetWithStringsIterator iter(set);
   2512         UChar32 c;
                // prev: current position (end of the candidates being examined).
                // length: where the iteration continues; length==prev means no candidate yet.
                // minSpanStart: smallest start reached by any recursive (non-shortest) path.
   2513         int32_t prev=length, minSpanStart=length, length0=length;
   2514         do {
   2515             U16_PREV(s, 0, length, c);
   2516             if(!realSet.contains(c)) {
   2517                 length=prev;  // Do not span this single, not-contained code point.
   2518             }
   2519             const UnicodeString *str;
   2520             iter.reset();
   2521             while((str=iter.nextString())!=NULL) {
   2522                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
   2523                     // spanNeedsStrings=TRUE;
   2524                     int32_t matchStart=prev-str->length();
   2525                     if(matchStart==0) {
                                // This match reaches the start of the text: nothing can span farther.
   2526                         return 0;
   2527                     }
   2528                     if(spanCondition==USET_SPAN_CONTAINED) {
   2529                         // Iterate for the shortest match at each position.
   2530                         // Recurse for each but the shortest match.
   2531                         if(length==prev) {
   2532                             length=matchStart;  // First match from prev.
   2533                         } else {
   2534                             if(matchStart>length) {
   2535                                 // Remember shortest match from prev for iteration.
                                        // After the swap length is the nearer start and
                                        // matchStart is the farther one.
   2536                                 int32_t temp=length;
   2537                                 length=matchStart;
   2538                                 matchStart=temp;
   2539                             }
   2540                             // Recurse for non-shortest match from prev.
   2541                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
   2542                                                                     USET_SPAN_CONTAINED);
   2543                             if(spanStart<minSpanStart) {
   2544                                 minSpanStart=spanStart;
   2545                                 if(minSpanStart==0) {
   2546                                     return 0;
   2547                                 }
   2548                             }
   2549                         }
   2550                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2551                         if(matchStart<length) {
   2552                             // Remember longest match from prev.
   2553                             length=matchStart;
   2554                         }
   2555                     }
   2556                 }
   2557             }
   2558             if(length==prev) {
   2559                 break;  // No match from prev.
   2560             }
   2561         } while((prev=length)>0);
                // Return the smaller start of the iterated path and the best recursive path.
   2562         if(prev<minSpanStart) {
   2563             return prev;
   2564         } else {
   2565             return minSpanStart;
   2566         }
   2567     }
   2568 }
   2569
        // UTF-8 counterpart of containsSpanUTF16(): implements spanUTF8() with contains()
        // on single code points and memcmp() against the UTF-8 forms of the set strings.
        //
        // Walks the UTF-8 text s[0..length) from the front and returns the byte length of
        // the initial span that satisfies spanCondition.
        //
        //   set            the set under test together with its collected strings
        //   s, length      text to span, in bytes; length must be >=0 here
        //   spanCondition  USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
        //
        // When U8_NEXT() yields a negative code point, the code point is treated as U+FFFD.
        // Set strings with a stored UTF-8 length of 0 are skipped.
   2570 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
   2571                                 USetSpanCondition spanCondition) {
   2572     const UnicodeSet &realSet(set.getSet());
   2573     if(!set.hasStrings()) {
   2574         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2575             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2576         }
   2577
   2578         UChar32 c;
   2579         int32_t start=0, prev;
                // prev is the offset of the code point that U8_NEXT is about to read,
                // and the span limit when the loop stops.
   2580         while((prev=start)<length) {
   2581             U8_NEXT(s, start, length, c);
   2582             if(c<0) {
   2583                 c=0xfffd;
   2584             }
                    // The pinned condition is compared directly with the result of contains().
   2585             if(realSet.contains(c)!=spanCondition) {
   2586                 break;
   2587             }
   2588         }
   2589         return prev;
   2590     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2591         UnicodeSetWithStringsIterator iter(set);
   2592         UChar32 c;
   2593         int32_t start, next;
                // start is the position being examined; next is the end of the code point there.
   2594         for(start=next=0; start<length;) {
   2595             U8_NEXT(s, next, length, c);
   2596             if(c<0) {
   2597                 c=0xfffd;
   2598             }
   2599             if(realSet.contains(c)) {
   2600                 break;
   2601             }
                    // The code point is not in the set; the span still ends here
                    // if the UTF-8 form of any set string begins at this position.
   2602             const char *s8;
   2603             int32_t length8;
   2604             iter.reset();
   2605             while((s8=iter.nextUTF8(length8))!=NULL) {
   2606                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
   2607                     // spanNeedsStrings=TRUE;
   2608                     return start;
   2609                 }
   2610             }
   2611             start=next;
   2612         }
   2613         return start;
   2614     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2615         UnicodeSetWithStringsIterator iter(set);
   2616         UChar32 c;
                // start: current position. next: where the iteration continues from start;
                // next==start means that no candidate has been found yet at this position.
                // maxSpanLimit: farthest limit reached by any recursive (non-shortest) path.
   2617         int32_t start, next, maxSpanLimit=0;
   2618         for(start=next=0; start<length;) {
   2619             U8_NEXT(s, next, length, c);
   2620             if(c<0) {
   2621                 c=0xfffd;
   2622             }
   2623             if(!realSet.contains(c)) {
   2624                 next=start;  // Do not span this single, not-contained code point.
   2625             }
   2626             const char *s8;
   2627             int32_t length8;
   2628             iter.reset();
   2629             while((s8=iter.nextUTF8(length8))!=NULL) {
   2630                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
   2631                     // spanNeedsStrings=TRUE;
   2632                     int32_t matchLimit=start+length8;
   2633                     if(matchLimit==length) {
                                // This match reaches the end of the text: nothing can span farther.
   2634                         return length;
   2635                     }
   2636                     if(spanCondition==USET_SPAN_CONTAINED) {
   2637                         // Iterate for the shortest match at each position.
   2638                         // Recurse for each but the shortest match.
   2639                         if(next==start) {
   2640                             next=matchLimit;  // First match from start.
   2641                         } else {
   2642                             if(matchLimit<next) {
   2643                                 // Remember shortest match from start for iteration.
                                        // After the swap next is the shorter limit and
                                        // matchLimit is the longer one.
   2644                                 int32_t temp=next;
   2645                                 next=matchLimit;
   2646                                 matchLimit=temp;
   2647                             }
   2648                             // Recurse for non-shortest match from start.
   2649                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
   2650                                                                 USET_SPAN_CONTAINED);
   2651                             if((matchLimit+spanLength)>maxSpanLimit) {
   2652                                 maxSpanLimit=matchLimit+spanLength;
   2653                                 if(maxSpanLimit==length) {
   2654                                     return length;
   2655                                 }
   2656                             }
   2657                         }
   2658                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2659                         if(matchLimit>next) {
   2660                             // Remember longest match from start.
   2661                             next=matchLimit;
   2662                         }
   2663                     }
   2664                 }
   2665             }
   2666             if(next==start) {
   2667                 break;  // No match from start.
   2668             }
   2669             start=next;
   2670         }
                // Return the farther of the iterated path and the best recursive path.
   2671         if(start>maxSpanLimit) {
   2672             return start;
   2673         } else {
   2674             return maxSpanLimit;
   2675         }
   2676     }
   2677 }
   2678
        // Backward UTF-8 counterpart of containsSpanUTF8(): implements spanBackUTF8() with
        // contains() on single code points and memcmp() against the UTF-8 forms of the
        // set strings.
        //
        // Walks the UTF-8 text s[0..length) from the end towards the front and returns the
        // byte offset where the trailing span that satisfies spanCondition starts
        // (0 if the span covers the whole text, length if it is empty).
        //
        //   set            the set under test together with its collected strings
        //   s, length      text to span, in bytes; length must be >=0 here
        //   spanCondition  USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
        //
        // Inside the loops the parameter length is reused as the moving position:
        // U8_PREV() steps it back by one code point, and prev holds the position before
        // that step. A negative code point from U8_PREV() is treated as U+FFFD.
        // Set strings with a stored UTF-8 length of 0 are skipped.
   2679 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
   2680                                     USetSpanCondition spanCondition) {
   2681     if(length==0) {
                // The do-while loops below always read one code point, so handle empty text first.
   2682         return 0;
   2683     }
   2684     const UnicodeSet &realSet(set.getSet());
   2685     if(!set.hasStrings()) {
   2686         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2687             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2688         }
   2689
   2690         UChar32 c;
   2691         int32_t prev=length;
   2692         do {
   2693             U8_PREV(s, 0, length, c);
   2694             if(c<0) {
   2695                 c=0xfffd;
   2696             }
                    // The pinned condition is compared directly with the result of contains().
   2697             if(realSet.contains(c)!=spanCondition) {
   2698                 break;
   2699             }
   2700         } while((prev=length)>0);
   2701         return prev;
   2702     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2703         UnicodeSetWithStringsIterator iter(set);
   2704         UChar32 c;
   2705         int32_t prev=length;
   2706         do {
   2707             U8_PREV(s, 0, length, c);
   2708             if(c<0) {
   2709                 c=0xfffd;
   2710             }
   2711             if(realSet.contains(c)) {
   2712                 break;
   2713             }
                    // The code point is not in the set; the span still stops at prev
                    // if the UTF-8 form of any set string ends exactly at prev.
   2714             const char *s8;
   2715             int32_t length8;
   2716             iter.reset();
   2717             while((s8=iter.nextUTF8(length8))!=NULL) {
   2718                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
   2719                     // spanNeedsStrings=TRUE;
   2720                     return prev;
   2721                 }
   2722             }
   2723         } while((prev=length)>0);
   2724         return prev;
   2725     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2726         UnicodeSetWithStringsIterator iter(set);
   2727         UChar32 c;
                // prev: current position (end of the candidates being examined).
                // length: where the iteration continues; length==prev means no candidate yet.
                // minSpanStart: smallest start reached by any recursive (non-shortest) path.
   2728         int32_t prev=length, minSpanStart=length;
   2729         do {
   2730             U8_PREV(s, 0, length, c);
   2731             if(c<0) {
   2732                 c=0xfffd;
   2733             }
   2734             if(!realSet.contains(c)) {
   2735                 length=prev;  // Do not span this single, not-contained code point.
   2736             }
   2737             const char *s8;
   2738             int32_t length8;
   2739             iter.reset();
   2740             while((s8=iter.nextUTF8(length8))!=NULL) {
   2741                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
   2742                     // spanNeedsStrings=TRUE;
   2743                     int32_t matchStart=prev-length8;
   2744                     if(matchStart==0) {
                                // This match reaches the start of the text: nothing can span farther.
   2745                         return 0;
   2746                     }
   2747                     if(spanCondition==USET_SPAN_CONTAINED) {
   2748                         // Iterate for the shortest match at each position.
   2749                         // Recurse for each but the shortest match.
   2750                         if(length==prev) {
   2751                             length=matchStart;  // First match from prev.
   2752                         } else {
   2753                             if(matchStart>length) {
   2754                                 // Remember shortest match from prev for iteration.
                                        // After the swap length is the nearer start and
                                        // matchStart is the farther one.
   2755                                 int32_t temp=length;
   2756                                 length=matchStart;
   2757                                 matchStart=temp;
   2758                             }
   2759                             // Recurse for non-shortest match from prev.
   2760                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
   2761                                                                    USET_SPAN_CONTAINED);
   2762                             if(spanStart<minSpanStart) {
   2763                                 minSpanStart=spanStart;
   2764                                 if(minSpanStart==0) {
   2765                                     return 0;
   2766                                 }
   2767                             }
   2768                         }
   2769                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2770                         if(matchStart<length) {
   2771                             // Remember longest match from prev.
   2772                             length=matchStart;
   2773                         }
   2774                     }
   2775                 }
   2776             }
   2777             if(length==prev) {
   2778                 break;  // No match from prev.
   2779             }
   2780         } while((prev=length)>0);
                // Return the smaller start of the iterated path and the best recursive path.
   2781         if(prev<minSpanStart) {
   2782             return prev;
   2783         } else {
   2784             return minSpanStart;
   2785         }
   2786     }
   2787 }
   2788 
   2789 // spans to be performed and compared
        // Bit flags that are combined into the whichSpans mask. Each group of two flags is
        // followed by a mask constant that is the bitwise OR of the group.
   2790 enum {
            // Text encodings. NOTE(review): not tested in getSpans(); presumably checked by
            // the callers of testSpan() - confirm.
   2791     SPAN_UTF16          =1,
   2792     SPAN_UTF8           =2,
   2793     SPAN_UTFS           =3,     // SPAN_UTF16|SPAN_UTF8
   2794
            // Which sets testSpan() uses: the original sets and/or their complements.
   2795     SPAN_SET            =4,
   2796     SPAN_COMPLEMENT     =8,
   2797     SPAN_POLARITY       =0xc,   // SPAN_SET|SPAN_COMPLEMENT
   2798
            // Directions: getSpans() skips forward types without SPAN_FWD
            // and backward types without SPAN_BACK.
   2799     SPAN_FWD            =0x10,
   2800     SPAN_BACK           =0x20,
   2801     SPAN_DIRS           =0x30,  // SPAN_FWD|SPAN_BACK
   2802
            // Span conditions: getSpans() skips USET_SPAN_CONTAINED types without
            // SPAN_CONTAINED and USET_SPAN_SIMPLE types without SPAN_SIMPLE.
   2803     SPAN_CONTAINED      =0x100,
   2804     SPAN_SIMPLE         =0x200,
   2805     SPAN_CONDITION      =0x300, // SPAN_CONTAINED|SPAN_SIMPLE
   2806
   2807     SPAN_ALL            =0x33f  // all of the flags above
   2808 };
   2809
        // Returns the opposite of spanCondition for alternating spans:
        // USET_SPAN_NOT_CONTAINED becomes contained (the caller's choice of
        // USET_SPAN_CONTAINED or USET_SPAN_SIMPLE), and any other value becomes
        // USET_SPAN_NOT_CONTAINED.
   2810 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
   2811     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
   2812 }
   2813
        // Returns the length of the NUL-terminated string s: in UChar units via u_strlen()
        // if isUTF16 is true, otherwise in bytes via strlen().
        // strlen() returns size_t; it is cast explicitly so that both arms of the
        // conditional have type int32_t and the result is not narrowed implicitly.
   2814 static inline int32_t slen(const void *s, UBool isUTF16) {
   2815     return isUTF16 ? u_strlen((const UChar *)s) : (int32_t)strlen((const char *)s);
   2816 }
   2817 
   2818 /*
   2819  * Count spans on a string with the method according to type and set the span limits.
   2820  * The set may be the complement of the original.
   2821  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
   2822  * according to the expected number of spans.
   2823  * Sets typeName to an empty string if there is no such type.
   2824  * Returns -1 if the span option is filtered out.
         *
         * Parameters:
         *   set             the set under test together with its collected strings
         *   isComplement    TRUE if set is the complement of the original set; the first
         *                   span condition is inverted in that case
         *   s, length       the text; length<0 means NUL-terminated
         *   isUTF16         TRUE if s points to UChar text, FALSE if it points to UTF-8 bytes
         *   whichSpans      mask of SPAN_* flags used to filter out types
         *   type            0..7: 0-1 contains()-based forward, 2-3 span(),
         *                   4-5 contains()-based backward, 6-7 spanBack();
         *                   even types use USET_SPAN_CONTAINED, odd types USET_SPAN_SIMPLE
         *   typeName        output: display name of the type, or "" if type is out of range
         *   limits          output: the end offset of each span, in text order
         *   limitsCapacity  number of elements available in limits[]
         *   expectCount     expected number of spans; only its parity is used, to choose
         *                   the first condition for backward types
         *
         * Returns the number of spans, which may exceed limitsCapacity (only as many limits
         * as fit are stored), 0 if type is out of range, or -1 if the type is filtered out.
   2825  */
   2826 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
   2827                         const void *s, int32_t length, UBool isUTF16,
   2828                         uint32_t whichSpans,
   2829                         int type, const char *&typeName,
   2830                         int32_t limits[], int32_t limitsCapacity,
   2831                         int32_t expectCount) {
   2832     const UnicodeSet &realSet(set.getSet());
   2833     int32_t start, count;
   2834     USetSpanCondition spanCondition, firstSpanCondition, contained;
   2835     UBool isForward;
   2836
            // An out-of-range type tells the caller that all types have been tried.
   2837     if(type<0 || 7<type) {
   2838         typeName="";
   2839         return 0;
   2840     }
   2841
            // Display names indexed by type.
            // NOTE(review): "(LM)" marks the USET_SPAN_SIMPLE variants, presumably
            // "longest match" - confirm.
   2842     static const char *const typeNames16[]={
   2843         "contains", "contains(LM)",
   2844         "span", "span(LM)",
   2845         "containsBack", "containsBack(LM)",
   2846         "spanBack", "spanBack(LM)"
   2847     };
   2848
   2849     static const char *const typeNames8[]={
   2850         "containsUTF8", "containsUTF8(LM)",
   2851         "spanUTF8", "spanUTF8(LM)",
   2852         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
   2853         "spanBackUTF8", "spanBackUTF8(LM)"
   2854     };
   2855
   2856     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
   2857
   2858     // filter span options
   2859     if(type<=3) {
   2860         // span forward
   2861         if((whichSpans&SPAN_FWD)==0) {
   2862             return -1;
   2863         }
   2864         isForward=TRUE;
   2865     } else {
   2866         // span backward
   2867         if((whichSpans&SPAN_BACK)==0) {
   2868             return -1;
   2869         }
   2870         isForward=FALSE;
   2871     }
   2872     if((type&1)==0) {
   2873         // use USET_SPAN_CONTAINED
   2874         if((whichSpans&SPAN_CONTAINED)==0) {
   2875             return -1;
   2876         }
   2877         contained=USET_SPAN_CONTAINED;
   2878     } else {
   2879         // use USET_SPAN_SIMPLE
   2880         if((whichSpans&SPAN_SIMPLE)==0) {
   2881             return -1;
   2882         }
   2883         contained=USET_SPAN_SIMPLE;
   2884     }
   2885
   2886     // Default first span condition for going forward with an uncomplemented set.
   2887     spanCondition=USET_SPAN_NOT_CONTAINED;
   2888     if(isComplement) {
   2889         spanCondition=invertSpanCondition(spanCondition, contained);
   2890     }
   2891
   2892     // First span condition for span(), used to terminate the spanBack() iteration.
   2893     firstSpanCondition=spanCondition;
   2894
   2895     // spanBack(): Its initial span condition is span()'s last span condition,
   2896     // which is the opposite of span()'s first span condition
   2897     // if we expect an even number of spans.
   2898     // (The loop inverts spanCondition (expectCount-1) times
   2899     // before the expectCount'th span() call.)
   2900     // If we do not compare forward and backward directions, then we do not have an
   2901     // expectCount and just start with firstSpanCondition.
   2902     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
   2903         spanCondition=invertSpanCondition(spanCondition, contained);
   2904     }
   2905
   2906     count=0;
   2907     switch(type) {
            // Types 0-1: forward, using the contains()-based reference implementations.
   2908     case 0:
   2909     case 1:
   2910         start=0;
   2911         if(length<0) {
   2912             length=slen(s, isUTF16);
   2913         }
   2914         for(;;) {
                    // Each call spans the rest of the text from start; add its length to start.
   2915             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
   2916                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
                    // Store the limit only if it fits, but always count the span.
   2917             if(count<limitsCapacity) {
   2918                 limits[count]=start;
   2919             }
   2920             ++count;
   2921             if(start>=length) {
   2922                 break;
   2923             }
   2924             spanCondition=invertSpanCondition(spanCondition, contained);
   2925         }
   2926         break;
            // Types 2-3: forward, using span()/spanUTF8() of the real set.
            // A negative length is passed through unchanged, and the end of the text is
            // then detected by the NUL at s[start].
   2927     case 2:
   2928     case 3:
   2929         start=0;
   2930         for(;;) {
   2931             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
   2932                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
   2933             if(count<limitsCapacity) {
   2934                 limits[count]=start;
   2935             }
   2936             ++count;
   2937             if(length>=0 ? start>=length :
   2938                            isUTF16 ? ((const UChar *)s)[start]==0 :
   2939                                      ((const char *)s)[start]==0
   2940             ) {
   2941                 break;
   2942             }
   2943             spanCondition=invertSpanCondition(spanCondition, contained);
   2944         }
   2945         break;
            // Types 4-5: backward, using the contains()-based reference implementations.
            // The limits are found from the end of the text, so they are stored from the
            // end of limits[] downwards and moved to the front afterwards.
   2946     case 4:
   2947     case 5:
   2948         if(length<0) {
   2949             length=slen(s, isUTF16);
   2950         }
   2951         for(;;) {
   2952             ++count;
   2953             if(count<=limitsCapacity) {
   2954                 limits[limitsCapacity-count]=length;
   2955             }
   2956             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
   2957                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
                    // Stop only when the start of the text is reached with the same condition
                    // that the forward iteration starts with, so that both directions
                    // produce the same number of spans.
   2958             if(length==0 && spanCondition==firstSpanCondition) {
   2959                 break;
   2960             }
   2961             spanCondition=invertSpanCondition(spanCondition, contained);
   2962         }
   2963         if(count<limitsCapacity) {
                    // Move the count stored limits to the front; 4 is the byte size of each
                    // int32_t element.
   2964             memmove(limits, limits+(limitsCapacity-count), count*4);
   2965         }
   2966         break;
            // Types 6-7: backward, using spanBack()/spanBackUTF8() of the real set.
   2967     case 6:
   2968     case 7:
   2969         for(;;) {
   2970             ++count;
   2971             if(count<=limitsCapacity) {
   2972                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
   2973             }
   2974             // Note: Length<0 is tested only for the first spanBack().
   2975             // If we wanted to keep length<0 for all spanBack()s, we would have to
   2976             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
   2977             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
   2978                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
   2979             if(length==0 && spanCondition==firstSpanCondition) {
   2980                 break;
   2981             }
   2982             spanCondition=invertSpanCondition(spanCondition, contained);
   2983         }
   2984         if(count<limitsCapacity) {
                    // Move the count stored limits to the front; 4 is the byte size of each
                    // int32_t element.
   2985             memmove(limits, limits+(limitsCapacity-count), count*4);
   2986         }
   2987         break;
            // Not reached for type 0..7; other values have already returned above.
   2988     default:
   2989         typeName="";
   2990         return -1;
   2991     }
   2992
   2993     return count;
   2994 }
   2995 
   2996 // sets to be tested; odd index=isComplement
        // These values index the sets[] array passed to testSpan() and the setNames[] table.
   2997 enum {
   2998     SLOW,
   2999     SLOW_NOT,
   3000     FAST,
   3001     FAST_NOT,
   3002     SET_COUNT   // number of sets; not a set index itself
   3003 };
   3004
        // Display names for failure messages, indexed by SLOW..FAST_NOT.
   3005 static const char *const setNames[SET_COUNT]={
   3006     "slow",
   3007     "slow.not",
   3008     "fast",
   3009     "fast.not"
   3010 };
   3011 
   3012 /*
   3013  * Verify that we get the same results whether we look at text with contains(),
   3014  * span() or spanBack(), using unfrozen or frozen versions of the set,
   3015  * and using the set or its complement (switching the spanConditions accordingly).
   3016  * The latter verifies that
   3017  *   set.span(spanCondition) == set.complement().span(!spanCondition).
   3018  *
   3019  * The expectLimits[] are either provided by the caller (with expectCount>=0)
   3020  * or returned to the caller (with an input expectCount<0).
   3021  */
   3022 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
   3023                               const void *s, int32_t length, UBool isUTF16,
   3024                               uint32_t whichSpans,
   3025                               int32_t expectLimits[], int32_t &expectCount,
   3026                               const char *testName, int32_t index) {
   3027     int32_t limits[500];
   3028     int32_t limitsCount;
   3029     int i, j;
   3030 
   3031     const char *typeName;
   3032     int type;
   3033 
   3034     for(i=0; i<SET_COUNT; ++i) {
   3035         if((i&1)==0) {
   3036             // Even-numbered sets are original, uncomplemented sets.
   3037             if((whichSpans&SPAN_SET)==0) {
   3038                 continue;
   3039             }
   3040         } else {
   3041             // Odd-numbered sets are complemented.
   3042             if((whichSpans&SPAN_COMPLEMENT)==0) {
   3043                 continue;
   3044             }
   3045         }
   3046         for(type=0;; ++type) {
   3047             limitsCount=getSpans(*sets[i], (UBool)(i&1),
   3048                                  s, length, isUTF16,
   3049                                  whichSpans,
   3050                                  type, typeName,
   3051                                  limits, LENGTHOF(limits), expectCount);
   3052             if(typeName[0]==0) {
   3053                 break; // All types tried.
   3054             }
   3055             if(limitsCount<0) {
   3056                 continue; // Span option filtered out.
   3057             }
   3058             if(expectCount<0) {
   3059                 expectCount=limitsCount;
   3060                 if(limitsCount>LENGTHOF(limits)) {
   3061                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
   3062                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
   3063                     return;
   3064                 }
   3065                 memcpy(expectLimits, limits, limitsCount*4);
   3066             } else if(limitsCount!=expectCount) {
   3067                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
   3068                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
   3069             } else {
   3070                 for(j=0; j<limitsCount; ++j) {
   3071                     if(limits[j]!=expectLimits[j]) {
   3072                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
   3073                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
   3074                               j, (long)limits[j], (long)expectLimits[j]);
   3075                         break;
   3076                     }
   3077                 }
   3078             }
   3079         }
   3080     }
   3081 
   3082     // Compare span() with containsAll()/containsNone(),
   3083     // but only if we have expectLimits[] from the uncomplemented set.
   3084     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
   3085         const UChar *s16=(const UChar *)s;
   3086         UnicodeString string;
   3087         int32_t prev=0, limit, length;
   3088         for(i=0; i<expectCount; ++i) {
   3089             limit=expectLimits[i];
   3090             length=limit-prev;
   3091             if(length>0) {
   3092                 string.setTo(FALSE, s16+prev, length);  // read-only alias
   3093                 if(i&1) {
   3094                     if(!sets[SLOW]->getSet().containsAll(string)) {
   3095                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
   3096                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
   3097                         return;
   3098                     }
   3099                     if(!sets[FAST]->getSet().containsAll(string)) {
   3100                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
   3101                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
   3102                         return;
   3103                     }
   3104                 } else {
   3105                     if(!sets[SLOW]->getSet().containsNone(string)) {
   3106                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
   3107                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
   3108                         return;
   3109                     }
   3110                     if(!sets[FAST]->getSet().containsNone(string)) {
   3111                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
   3112                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
   3113                         return;
   3114                     }
   3115                 }
   3116             }
   3117             prev=limit;
   3118         }
   3119     }
   3120 }
   3121 
   3122 // Specifically test either UTF-16 or UTF-8.
   3123 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
   3124                               const void *s, int32_t length, UBool isUTF16,
   3125                               uint32_t whichSpans,
   3126                               const char *testName, int32_t index) {
   3127     int32_t expectLimits[500];
   3128     int32_t expectCount=-1;
   3129     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
   3130 }
   3131 
   3132 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
   3133     UChar c, c2;
   3134 
   3135     if(length>=0) {
   3136         while(length>0) {
   3137             c=*s++;
   3138             --length;
   3139             if(0xd800<=c && c<0xe000) {
   3140                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
   3141                     return TRUE;
   3142                 }
   3143                 --length;
   3144             }
   3145         }
   3146     } else {
   3147         while((c=*s++)!=0) {
   3148             if(0xd800<=c && c<0xe000) {
   3149                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
   3150                     return TRUE;
   3151                 }
   3152             }
   3153         }
   3154     }
   3155     return FALSE;
   3156 }
   3157 
   3158 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
   3159 // unless either UTF is turned off in whichSpans.
   3160 // Testing UTF-16 and UTF-8 together requires that surrogate code points
   3161 // have the same contains(c) value as U+FFFD.
   3162 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
   3163                                       const UChar *s16, int32_t length16,
   3164                                       uint32_t whichSpans,
   3165                                       const char *testName, int32_t index) {
   3166     int32_t expectLimits[500];
   3167     int32_t expectCount;
   3168 
   3169     expectCount=-1;  // Get expectLimits[] from testSpan().
   3170 
   3171     if((whichSpans&SPAN_UTF16)!=0) {
   3172         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
   3173     }
   3174     if((whichSpans&SPAN_UTF8)==0) {
   3175         return;
   3176     }
   3177 
   3178     // Convert s16[] and expectLimits[] to UTF-8.
   3179     uint8_t s8[3000];
   3180     int32_t offsets[3000];
   3181 
   3182     const UChar *s16Limit=s16+length16;
   3183     char *t=(char *)s8;
   3184     char *tLimit=t+sizeof(s8);
   3185     int32_t *o=offsets;
   3186     UErrorCode errorCode=U_ZERO_ERROR;
   3187 
   3188     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
   3189     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
   3190     if(U_FAILURE(errorCode)) {
   3191         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
   3192               testName, (long)index, u_errorName(errorCode));
   3193         ucnv_resetFromUnicode(utf8Cnv);
   3194         return;
   3195     }
   3196     int32_t length8=(int32_t)(t-(char *)s8);
   3197 
   3198     // Convert expectLimits[].
   3199     int32_t i, j, expect;
   3200     for(i=j=0; i<expectCount; ++i) {
   3201         expect=expectLimits[i];
   3202         if(expect==length16) {
   3203             expectLimits[i]=length8;
   3204         } else {
   3205             while(offsets[j]<expect) {
   3206                 ++j;
   3207             }
   3208             expectLimits[i]=j;
   3209         }
   3210     }
   3211 
   3212     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
   3213 }
   3214 
   3215 static UChar32 nextCodePoint(UChar32 c) {
   3216     // Skip some large and boring ranges.
   3217     switch(c) {
   3218     case 0x3441:
   3219         return 0x4d7f;
   3220     case 0x5100:
   3221         return 0x9f00;
   3222     case 0xb040:
   3223         return 0xd780;
   3224     case 0xe041:
   3225         return 0xf8fe;
   3226     case 0x10100:
   3227         return 0x20000;
   3228     case 0x20041:
   3229         return 0xe0000;
   3230     case 0xe0101:
   3231         return 0x10fffd;
   3232     default:
   3233         return c+1;
   3234     }
   3235 }
   3236 
   3237 // Verify that all implementations represent the same set.
   3238 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3239     // contains(U+FFFD) is inconsistent with contains(some surrogates),
   3240     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
   3241     // Skip the UTF-8 part of the test - if the string contains surrogates -
   3242     // because it is likely to produce a different result.
   3243     UBool inconsistentSurrogates=
   3244             (!(sets[0]->getSet().contains(0xfffd) ?
   3245                sets[0]->getSet().contains(0xd800, 0xdfff) :
   3246                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
   3247              sets[0]->hasStringsWithSurrogates());
   3248 
   3249     UChar s[1000];
   3250     int32_t length=0;
   3251     uint32_t localWhichSpans;
   3252 
   3253     UChar32 c, first;
   3254     for(first=c=0;; c=nextCodePoint(c)) {
   3255         if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
   3256             localWhichSpans=whichSpans;
   3257             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
   3258                 localWhichSpans&=~SPAN_UTF8;
   3259             }
   3260             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
   3261             if(c>0x10ffff) {
   3262                 break;
   3263             }
   3264             length=0;
   3265             first=c;
   3266         }
   3267         U16_APPEND_UNSAFE(s, length, c);
   3268     }
   3269 }
   3270 
   3271 // Test with a particular, interesting string.
   3272 // Specify length and try NUL-termination.
   3273 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3274     static const UChar s[]={
   3275         0x61, 0x62, 0x20,                       // Latin, space
   3276         0x3b1, 0x3b2, 0x3b3,                    // Greek
   3277         0xd900,                                 // lead surrogate
   3278         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
   3279         0xdc05,                                 // trail surrogate
   3280         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
   3281         0xd900, 0xdc05,                         // unassigned supplementary
   3282         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
   3283         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
   3284         0                                       // NUL
   3285     };
   3286 
   3287     if((whichSpans&SPAN_UTF16)==0) {
   3288         return;
   3289     }
   3290     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
   3291     testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
   3292 }
   3293 
   3294 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3295     static const char s[]={
   3296         "abc"                                   // Latin
   3297 
   3298         /* trail byte in lead position */
   3299         "\x80"
   3300 
   3301         " "                                     // space
   3302 
   3303         /* truncated multi-byte sequences */
   3304         "\xd0"
   3305         "\xe0"
   3306         "\xe1"
   3307         "\xed"
   3308         "\xee"
   3309         "\xf0"
   3310         "\xf1"
   3311         "\xf4"
   3312         "\xf8"
   3313         "\xfc"
   3314 
   3315         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
   3316 
   3317         /* trail byte in lead position */
   3318         "\x80"
   3319 
   3320         "\xe0\x80"
   3321         "\xe0\xa0"
   3322         "\xe1\x80"
   3323         "\xed\x80"
   3324         "\xed\xa0"
   3325         "\xee\x80"
   3326         "\xf0\x80"
   3327         "\xf0\x90"
   3328         "\xf1\x80"
   3329         "\xf4\x80"
   3330         "\xf4\x90"
   3331         "\xf8\x80"
   3332         "\xfc\x80"
   3333 
   3334         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
   3335 
   3336         /* trail byte in lead position */
   3337         "\x80"
   3338 
   3339         "\xf0\x80\x80"
   3340         "\xf0\x90\x80"
   3341         "\xf1\x80\x80"
   3342         "\xf4\x80\x80"
   3343         "\xf4\x90\x80"
   3344         "\xf8\x80\x80"
   3345         "\xfc\x80\x80"
   3346 
   3347         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
   3348 
   3349         /* trail byte in lead position */
   3350         "\x80"
   3351 
   3352         "\xf8\x80\x80\x80"
   3353         "\xfc\x80\x80\x80"
   3354 
   3355         "\xF1\x90\x80\x85"                      // unassigned supplementary
   3356 
   3357         /* trail byte in lead position */
   3358         "\x80"
   3359 
   3360         "\xfc\x80\x80\x80\x80"
   3361 
   3362         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
   3363 
   3364         /* trail byte in lead position */
   3365         "\x80"
   3366 
   3367         /* complete sequences but non-shortest forms or out of range etc. */
   3368         "\xc0\x80"
   3369         "\xe0\x80\x80"
   3370         "\xed\xa0\x80"
   3371         "\xf0\x80\x80\x80"
   3372         "\xf4\x90\x80\x80"
   3373         "\xf8\x80\x80\x80\x80"
   3374         "\xfc\x80\x80\x80\x80\x80"
   3375         "\xfe"
   3376         "\xff"
   3377 
   3378         /* trail byte in lead position */
   3379         "\x80"
   3380 
   3381         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
   3382     };
   3383 
   3384     if((whichSpans&SPAN_UTF8)==0) {
   3385         return;
   3386     }
   3387     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
   3388     testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
   3389 }
   3390 
   3391 // Take a set of span options and multiply them so that
   3392 // each portion only has one of the options a, b and c.
   3393 // If b==0, then the set of options is just modified with mask and a.
   3394 // If b!=0 and c==0, then the set of options is just modified with mask, a and b.
   3395 static int32_t
   3396 addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
   3397                uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
   3398     uint32_t s;
   3399     int32_t i;
   3400 
   3401     for(i=0; i<whichSpansCount; ++i) {
   3402         s=whichSpans[i]&mask;
   3403         whichSpans[i]=s|a;
   3404         if(b!=0) {
   3405             whichSpans[whichSpansCount+i]=s|b;
   3406             if(c!=0) {
   3407                 whichSpans[2*whichSpansCount+i]=s|c;
   3408             }
   3409         }
   3410     }
   3411     return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount;
   3412 }
   3413 
   3414 #define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
   3415 #define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
   3416 #define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
   3417 #define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
   3418 
   3419 void UnicodeSetTest::TestSpan() {
   3420     // "[...]" is a UnicodeSet pattern.
   3421     // "*" performs tests on all Unicode code points and on a selection of
   3422     //   malformed UTF-8/16 strings.
   3423     // "-options" limits the scope of testing for the current set.
   3424     //   By default, the test verifies that equivalent boundaries are found
   3425     //   for UTF-16 and UTF-8, going forward and backward,
   3426     //   alternating USET_SPAN_NOT_CONTAINED with
   3427     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
   3428     //   Single-character options:
   3429     //     8 -- UTF-16 and UTF-8 boundaries may differ.
   3430     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
   3431     //          or the set contains strings with unpaired surrogates
   3432     //          which do not translate to valid UTF-8.
   3433     //     c -- set.span() and set.complement().span() boundaries may differ.
   3434     //          Cause: Set strings are not complemented.
   3435     //     b -- span() and spanBack() boundaries may differ.
   3436     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
   3437     //          and spanBack(USET_SPAN_SIMPLE) are defined to
   3438     //          match with non-overlapping substrings.
   3439     //          For example, with a set containing "ab" and "ba",
   3440     //          span() of "aba" yields boundaries { 0, 2, 3 }
   3441     //          because the initial "ab" matches from 0 to 2,
   3442     //          while spanBack() yields boundaries { 0, 1, 3 }
   3443     //          because the final "ba" matches from 1 to 3.
   3444     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
   3445     //          Cause: Strings in the set overlap, and a longer match may
   3446     //          require a sequence including non-longest substrings.
   3447     //          For example, with a set containing "ab", "abc" and "cd",
   3448     //          span(contained) of "abcd" spans the entire string
   3449     //          but span(longest match) only spans the first 3 characters.
   3450     //   Each "-options" first resets all options and then applies the specified options.
   3451     //   A "-" without options resets the options.
   3452     //   The options are also reset for each new set.
   3453     // Other strings will be spanned.
   3454     static const char *const testdata[]={
   3455         "[:ID_Continue:]",
   3456         "*",
   3457         "[:White_Space:]",
   3458         "*",
   3459         "[]",
   3460         "*",
   3461         "[\\u0000-\\U0010FFFF]",
   3462         "*",
   3463         "[\\u0000\\u0080\\u0800\\U00010000]",
   3464         "*",
   3465         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
   3466         "*",
   3467         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
   3468         "-c",
   3469         "*",
   3470         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
   3471         "-c",
   3472         "*",
   3473 
   3474         // Overlapping strings cause overlapping attempts to match.
   3475         "[x{xy}{xya}{axy}{ax}]",
   3476         "-cl",
   3477 
   3478         // More repetitions of "xya" would take too long with the recursive
   3479         // reference implementation.
   3480         // containsAll()=FALSE
   3481         // test_string 0x14
   3482         "xx"
   3483         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
   3484         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
   3485         "xyaxyaxyaxya"
   3486         "xx"
   3487         "xyaxyaxyaxya"  // span() ends here.
   3488         "aaa",
   3489 
   3490         // containsAll()=TRUE
   3491         // test_string 0x15
   3492         "xx"
   3493         "xyaxyaxyaxya"
   3494         "xx"
   3495         "xyaxyaxyaxya"
   3496         "xx"
   3497         "xyaxyaxyaxy",
   3498 
   3499         "-bc",
   3500         // test_string 0x17
   3501         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
   3502         "-c",
   3503         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
   3504         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
   3505         "-",
   3506         "byaya",     // span() -> { 5 }
   3507         "byay",      // span() -> { 4 }
   3508         "bya",       // span() -> { 3 }
   3509 
   3510         // span(longest match) will not span the whole string.
   3511         "[a{ab}{bc}]",
   3512         "-cl",
   3513         // test_string 0x21
   3514         "abc",
   3515 
   3516         "[a{ab}{abc}{cd}]",
   3517         "-cl",
   3518         "acdabcdabccd",
   3519 
   3520         // spanBack(longest match) will not span the whole string.
   3521         "[c{ab}{bc}]",
   3522         "-cl",
   3523         "abc",
   3524 
   3525         "[d{cd}{bcd}{ab}]",
   3526         "-cl",
   3527         "abbcdabcdabd",
   3528 
   3529         // Test with non-ASCII set strings - test proper handling of surrogate pairs
   3530         // and UTF-8 trail bytes.
   3531         // Copies of above test sets and strings, but transliterated to have
   3532         // different code points with similar trail units.
   3533         // Previous: a      b         c            d
   3534         // Unicode:  042B   30AB      200AB        204AB
   3535         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
   3536         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
   3537         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
   3538         "-cl",
   3539         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
   3540 
   3541         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
   3542         "-cl",
   3543         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
   3544 
   3545         // Stress bookkeeping and recursion.
   3546         // The following strings are barely doable with the recursive
   3547         // reference implementation.
   3548         // The not-contained character at the end prevents an early exit from the span().
   3549         "[b{bb}]",
   3550         "-c",
   3551         // test_string 0x33
   3552         "bbbbbbbbbbbbbbbbbbbbbbbb-",
   3553         // On complement sets, span() and spanBack() get different results
   3554         // because b is not in the complement set and there is an odd number of b's
   3555         // in the test string.
   3556         "-bc",
   3557         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
   3558 
   3559         // Test with set strings with an initial or final code point span
   3560         // longer than 254.
   3561         "[a{" _64_a _64_a _64_a _64_a "b}"
   3562           "{a" _64_b _64_b _64_b _64_b "}]",
   3563         "-c",
   3564         _64_a _64_a _64_a _63_a "b",
   3565         _64_a _64_a _64_a _64_a "b",
   3566         _64_a _64_a _64_a _64_a "aaaabbbb",
   3567         "a" _64_b _64_b _64_b _63_b,
   3568         "a" _64_b _64_b _64_b _64_b,
   3569         "aaaabbbb" _64_b _64_b _64_b _64_b,
   3570 
   3571         // Test with strings containing unpaired surrogates.
   3572         // They are not representable in UTF-8, and a leading trail surrogate
   3573         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
   3574         // U+20001 == \\uD840\\uDC01
   3575         // U+20400 == \\uD841\\uDC00
   3576         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
   3577         "-8cl",
   3578         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
   3579     };
   3580     uint32_t whichSpans[96]={ SPAN_ALL };
   3581     int32_t whichSpansCount=1;
   3582 
   3583     UnicodeSet *sets[SET_COUNT]={ NULL };
   3584     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
   3585 
   3586     char testName[1024];
   3587     char *testNameLimit=testName;
   3588 
   3589     int32_t i, j;
   3590     for(i=0; i<LENGTHOF(testdata); ++i) {
   3591         const char *s=testdata[i];
   3592         if(s[0]=='[') {
   3593             // Create new test sets from this pattern.
   3594             for(j=0; j<SET_COUNT; ++j) {
   3595                 delete sets_with_str[j];
   3596                 delete sets[j];
   3597             }
   3598             UErrorCode errorCode=U_ZERO_ERROR;
   3599             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
   3600             if(U_FAILURE(errorCode)) {
   3601                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
   3602                 break;
   3603             }
   3604             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
   3605             sets[SLOW_NOT]->complement();
   3606             // Intermediate set: Test cloning of a frozen set.
   3607             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
   3608             fast->freeze();
   3609             sets[FAST]=(UnicodeSet *)fast->clone();
   3610             delete fast;
   3611             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
   3612             fastNot->freeze();
   3613             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
   3614             delete fastNot;
   3615 
   3616             for(j=0; j<SET_COUNT; ++j) {
   3617                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
   3618             }
   3619 
   3620             strcpy(testName, s);
   3621             testNameLimit=strchr(testName, 0);
   3622             *testNameLimit++=':';
   3623             *testNameLimit=0;
   3624 
   3625             whichSpans[0]=SPAN_ALL;
   3626             whichSpansCount=1;
   3627         } else if(s[0]=='-') {
   3628             whichSpans[0]=SPAN_ALL;
   3629             whichSpansCount=1;
   3630 
   3631             while(*++s!=0) {
   3632                 switch(*s) {
   3633                 case 'c':
   3634                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3635                                                    ~SPAN_POLARITY,
   3636                                                    SPAN_SET,
   3637                                                    SPAN_COMPLEMENT,
   3638                                                    0);
   3639                     break;
   3640                 case 'b':
   3641                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3642                                                    ~SPAN_DIRS,
   3643                                                    SPAN_FWD,
   3644                                                    SPAN_BACK,
   3645                                                    0);
   3646                     break;
   3647                 case 'l':
   3648                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
   3649                     // USET_SPAN_SIMPLE only FWD, and separately
   3650                     // USET_SPAN_SIMPLE only BACK
   3651                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3652                                                    ~(SPAN_DIRS|SPAN_CONDITION),
   3653                                                    SPAN_DIRS|SPAN_CONTAINED,
   3654                                                    SPAN_FWD|SPAN_SIMPLE,
   3655                                                    SPAN_BACK|SPAN_SIMPLE);
   3656                     break;
   3657                 case '8':
   3658                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3659                                                    ~SPAN_UTFS,
   3660                                                    SPAN_UTF16,
   3661                                                    SPAN_UTF8,
   3662                                                    0);
   3663                     break;
   3664                 default:
   3665                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
   3666                     break;
   3667                 }
   3668             }
   3669         } else if(0==strcmp(s, "*")) {
   3670             strcpy(testNameLimit, "bad_string");
   3671             for(j=0; j<whichSpansCount; ++j) {
   3672                 if(whichSpansCount>1) {
   3673                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
   3674                             "%%0x%3x",
   3675                             whichSpans[j]);
   3676                 }
   3677                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
   3678                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
   3679             }
   3680 
   3681             strcpy(testNameLimit, "contents");
   3682             for(j=0; j<whichSpansCount; ++j) {
   3683                 if(whichSpansCount>1) {
   3684                     sprintf(testNameLimit+8 /* strlen("contents") */,
   3685                             "%%0x%3x",
   3686                             whichSpans[j]);
   3687                 }
   3688                 testSpanContents(sets_with_str, whichSpans[j], testName);
   3689             }
   3690         } else {
   3691             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
   3692             strcpy(testNameLimit, "test_string");
   3693             for(j=0; j<whichSpansCount; ++j) {
   3694                 if(whichSpansCount>1) {
   3695                     sprintf(testNameLimit+11 /* strlen("test_string") */,
   3696                             "%%0x%3x",
   3697                             whichSpans[j]);
   3698                 }
   3699                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
   3700             }
   3701         }
   3702     }
   3703     for(j=0; j<SET_COUNT; ++j) {
   3704         delete sets_with_str[j];
   3705         delete sets[j];
   3706     }
   3707 }
   3708 
   3709 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
   3710 void UnicodeSetTest::TestStringSpan() {
   3711     static const char *pattern="[x{xy}{xya}{axy}{ax}]";
   3712     static const char *const string=
   3713         "xx"
   3714         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
   3715         "xx"
   3716         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
   3717         "xx"
   3718         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
   3719         "aaaa";
   3720 
   3721     UErrorCode errorCode=U_ZERO_ERROR;
   3722     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
   3723     UnicodeSet set(pattern16, errorCode);
   3724     if(U_FAILURE(errorCode)) {
   3725         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3726         return;
   3727     }
   3728 
   3729     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
   3730 
   3731     if(set.containsAll(string16)) {
   3732         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
   3733     }
   3734 
   3735     // Remove trailing "aaaa".
   3736     string16.truncate(string16.length()-4);
   3737     if(!set.containsAll(string16)) {
   3738         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
   3739     }
   3740 
   3741     string16=UNICODE_STRING_SIMPLE("byayaxya");
   3742     const UChar *s16=string16.getBuffer();
   3743     int32_t length16=string16.length();
   3744     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
   3745         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
   3746         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
   3747         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
   3748         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
   3749         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
   3750     ) {
   3751         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
   3752     }
   3753 
   3754     pattern="[a{ab}{abc}{cd}]";
   3755     pattern16=UnicodeString(pattern, -1, US_INV);
   3756     set.applyPattern(pattern16, errorCode);
   3757     if(U_FAILURE(errorCode)) {
   3758         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3759         return;
   3760     }
   3761     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
   3762     s16=string16.getBuffer();
   3763     length16=string16.length();
   3764     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
   3765         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
   3766         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
   3767     ) {
   3768         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
   3769     }
   3770 
   3771     pattern="[d{cd}{bcd}{ab}]";
   3772     pattern16=UnicodeString(pattern, -1, US_INV);
   3773     set.applyPattern(pattern16, errorCode).freeze();
   3774     if(U_FAILURE(errorCode)) {
   3775         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3776         return;
   3777     }
   3778     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
   3779     s16=string16.getBuffer();
   3780     length16=string16.length();
   3781     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
   3782         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
   3783         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
   3784     ) {
   3785         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
   3786     }
   3787 }
   3788