Home | History | Annotate | Download | only in intltest
      1 /*
      2 ********************************************************************************
      3 *   Copyright (C) 1999-2015 International Business Machines Corporation and
      4 *   others. All Rights Reserved.
      5 ********************************************************************************
      6 *   Date        Name        Description
      7 *   10/20/99    alan        Creation.
      8 *   03/22/2000  Madhu       Added additional tests
      9 ********************************************************************************
     10 */
     11 
     12 #include <stdio.h>
     13 
     14 #include <string.h>
     15 #include "unicode/utypes.h"
     16 #include "usettest.h"
     17 #include "unicode/ucnv.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/usetiter.h"
     21 #include "unicode/ustring.h"
     22 #include "unicode/parsepos.h"
     23 #include "unicode/symtable.h"
     24 #include "unicode/uversion.h"
     25 #include "hash.h"
     26 
     27 #define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
     28     dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
     29     u_errorName(status));}}
     30 
     31 #define TEST_ASSERT(expr) {if (!(expr)) { \
     32     dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
     33 
     34 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
     35     UnicodeString pat;
     36     set.toPattern(pat);
     37     return left + UnicodeSetTest::escape(pat);
     38 }
     39 
     40 #define CASE(id,test) case id:                          \
     41                           name = #test;                 \
     42                           if (exec) {                   \
     43                               logln(#test "---");       \
     44                               logln();                  \
     45                               test();                   \
     46                           }                             \
     47                           break
     48 
     49 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
     50 }
     51 
     52 UConverter *UnicodeSetTest::openUTF8Converter() {
     53     if(utf8Cnv==NULL) {
     54         UErrorCode errorCode=U_ZERO_ERROR;
     55         utf8Cnv=ucnv_open("UTF-8", &errorCode);
     56     }
     57     return utf8Cnv;
     58 }
     59 
     60 UnicodeSetTest::~UnicodeSetTest() {
     61     ucnv_close(utf8Cnv);
     62 }
     63 
     64 void
     65 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     66                                const char* &name, char* /*par*/) {
     67     // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
     68     switch (index) {
     69         CASE(0,TestPatterns);
     70         CASE(1,TestAddRemove);
     71         CASE(2,TestCategories);
     72         CASE(3,TestCloneEqualHash);
     73         CASE(4,TestMinimalRep);
     74         CASE(5,TestAPI);
     75         CASE(6,TestScriptSet);
     76         CASE(7,TestPropertySet);
     77         CASE(8,TestClone);
     78         CASE(9,TestExhaustive);
     79         CASE(10,TestToPattern);
     80         CASE(11,TestIndexOf);
     81         CASE(12,TestStrings);
     82         CASE(13,Testj2268);
     83         CASE(14,TestCloseOver);
     84         CASE(15,TestEscapePattern);
     85         CASE(16,TestInvalidCodePoint);
     86         CASE(17,TestSymbolTable);
     87         CASE(18,TestSurrogate);
     88         CASE(19,TestPosixClasses);
     89         CASE(20,TestIteration);
     90         CASE(21,TestFreezable);
     91         CASE(22,TestSpan);
     92         CASE(23,TestStringSpan);
     93         CASE(24,TestUCAUnsafeBackwards);
     94         default: name = ""; break;
     95     }
     96 }
     97 
     98 static const char NOT[] = "%%%%";
     99 
    100 /**
    101  * UVector was improperly copying contents
    102  * This code will crash this is still true
    103  */
    104 void UnicodeSetTest::Testj2268() {
    105   UnicodeSet t;
    106   t.add(UnicodeString("abc"));
    107   UnicodeSet test(t);
    108   UnicodeString ustrPat;
    109   test.toPattern(ustrPat, TRUE);
    110 }
    111 
    112 /**
    113  * Test toPattern().
    114  */
    115 void UnicodeSetTest::TestToPattern() {
    116     UErrorCode ec = U_ZERO_ERROR;
    117 
    118     // Test that toPattern() round trips with syntax characters and
    119     // whitespace.
    120     {
    121         static const char* OTHER_TOPATTERN_TESTS[] = {
    122             "[[:latin:]&[:greek:]]",
    123             "[[:latin:]-[:greek:]]",
    124             "[:nonspacing mark:]",
    125             NULL
    126         };
    127 
    128         for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
    129             ec = U_ZERO_ERROR;
    130             UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
    131             if (U_FAILURE(ec)) {
    132                 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
    133                 continue;
    134             }
    135             checkPat(OTHER_TOPATTERN_TESTS[j], s);
    136         }
    137 
    138         for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
    139             if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
    140 
    141                 // check various combinations to make sure they all work.
    142                 if (i != 0 && !toPatternAux(i, i)){
    143                     continue;
    144                 }
    145                 if (!toPatternAux(0, i)){
    146                     continue;
    147                 }
    148                 if (!toPatternAux(i, 0xFFFF)){
    149                     continue;
    150                 }
    151             }
    152         }
    153     }
    154 
    155     // Test pattern behavior of multicharacter strings.
    156     {
    157         ec = U_ZERO_ERROR;
    158         UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
    159 
    160         // This loop isn't a loop.  It's here to make the compiler happy.
    161         // If you're curious, try removing it and changing the 'break'
    162         // statements (except for the last) to goto's.
    163         for (;;) {
    164             if (U_FAILURE(ec)) break;
    165             const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
    166             expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
    167 
    168             s->add("ac");
    169             const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
    170             expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
    171 
    172             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
    173             if (U_FAILURE(ec)) break;
    174             const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
    175             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
    176 
    177             s->add("[]");
    178             const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
    179             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
    180 
    181             s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
    182             if (U_FAILURE(ec)) break;
    183             const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
    184             expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
    185 
    186             // j2189
    187             s->clear();
    188             s->add(UnicodeString("abc", ""));
    189             s->add(UnicodeString("abc", ""));
    190             const char* exp6[] = {"abc", NOT, "ab", NULL};
    191             expectToPattern(*s, "[{abc}]", exp6);
    192 
    193             break;
    194         }
    195 
    196         if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
    197         delete s;
    198     }
    199 
    200     // JB#3400: For 2 character ranges prefer [ab] to [a-b]
    201     UnicodeSet s;
    202     s.add((UChar)97, (UChar)98); // 'a', 'b'
    203     expectToPattern(s, "[ab]", NULL);
    204 }
    205 
    206 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
    207 
    208     // use Integer.toString because Utility.hex doesn't handle ints
    209     UnicodeString pat = "";
    210     // TODO do these in hex
    211     //String source = "0x" + Integer.toString(start,16).toUpperCase();
    212     //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
    213     UnicodeString source;
    214     source = source + (uint32_t)start;
    215     if (start != end)
    216         source = source + ".." + (uint32_t)end;
    217     UnicodeSet testSet;
    218     testSet.add(start, end);
    219     return checkPat(source, testSet);
    220 }
    221 
    222 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
    223                                const UnicodeSet& testSet) {
    224     // What we want to make sure of is that a pattern generated
    225     // by toPattern(), with or without escaped unprintables, can
    226     // be passed back into the UnicodeSet constructor.
    227     UnicodeString pat0;
    228 
    229     testSet.toPattern(pat0, TRUE);
    230 
    231     if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
    232 
    233     //String pat1 = unescapeLeniently(pat0);
    234     //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
    235 
    236     UnicodeString pat2;
    237     testSet.toPattern(pat2, FALSE);
    238     if (!checkPat(source, testSet, pat2)) return FALSE;
    239 
    240     //String pat3 = unescapeLeniently(pat2);
    241     // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
    242 
    243     //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
    244     logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
    245     return TRUE;
    246 }
    247 
    248 UBool UnicodeSetTest::checkPat(const UnicodeString& source,
    249                                const UnicodeSet& testSet,
    250                                const UnicodeString& pat) {
    251     UErrorCode ec = U_ZERO_ERROR;
    252     UnicodeSet testSet2(pat, ec);
    253     if (testSet2 != testSet) {
    254         errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
    255         return FALSE;
    256     }
    257     return TRUE;
    258 }
    259 
    260 void
    261 UnicodeSetTest::TestPatterns(void) {
    262     UnicodeSet set;
    263     expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
    264     expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
    265     expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
    266     expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
    267     expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
    268     expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
    269 
    270     // Throw in a test of complement
    271     set.complement();
    272     UnicodeString exp;
    273     exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
    274     expectPairs(set, exp);
    275 }
    276 
    277 void
    278 UnicodeSetTest::TestCategories(void) {
    279     UErrorCode status = U_ZERO_ERROR;
    280     const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
    281     UnicodeSet set(pat, status);
    282     if (U_FAILURE(status)) {
    283         dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
    284         return;
    285     } else {
    286         expectContainment(set, pat, "ABC", "abc");
    287     }
    288 
    289     UChar32 i;
    290     int32_t failures = 0;
    291     // Make sure generation of L doesn't pollute cached Lu set
    292     // First generate L, then Lu
    293     set.applyPattern("[:L:]", status);
    294     if (U_FAILURE(status)) { errln("FAIL"); return; }
    295     for (i=0; i<0x200; ++i) {
    296         UBool l = u_isalpha((UChar)i);
    297         if (l != set.contains(i)) {
    298             errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
    299                   set.contains(i));
    300             if (++failures == 10) break;
    301         }
    302     }
    303 
    304     set.applyPattern("[:Lu:]", status);
    305     if (U_FAILURE(status)) { errln("FAIL"); return; }
    306     for (i=0; i<0x200; ++i) {
    307         UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
    308         if (lu != set.contains(i)) {
    309             errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
    310                   set.contains(i));
    311             if (++failures == 20) break;
    312         }
    313     }
    314 }
    315 void
    316 UnicodeSetTest::TestCloneEqualHash(void) {
    317     UErrorCode status = U_ZERO_ERROR;
    318     // set1 and set2 used to be built with the obsolete constructor taking
    319     // UCharCategory values; replaced with pattern constructors
    320     // markus 20030502
    321     UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
    322     UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
    323     if (U_FAILURE(status)){
    324         dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
    325         return;
    326     }
    327     UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
    328     UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
    329     if (U_FAILURE(status)){
    330         errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
    331         return;
    332     }
    333 
    334     if (*set1 != *set1a) {
    335         errln("FAIL: category constructor for Ll broken");
    336     }
    337     if (*set2 != *set2a) {
    338         errln("FAIL: category constructor for Nd broken");
    339     }
    340     delete set1a;
    341     delete set2a;
    342 
    343     logln("Testing copy construction");
    344     UnicodeSet *set1copy=new UnicodeSet(*set1);
    345     if(*set1 != *set1copy || *set1 == *set2 ||
    346         getPairs(*set1) != getPairs(*set1copy) ||
    347         set1->hashCode() != set1copy->hashCode()){
    348         errln("FAIL : Error in copy construction");
    349         return;
    350     }
    351 
    352     logln("Testing =operator");
    353     UnicodeSet set1equal=*set1;
    354     UnicodeSet set2equal=*set2;
    355     if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
    356         set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
    357         errln("FAIL: Error in =operator");
    358     }
    359 
    360     logln("Testing clone()");
    361     UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
    362     UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
    363     if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
    364         *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
    365         *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
    366         errln("FAIL: Error in clone");
    367     }
    368 
    369     logln("Testing hashcode");
    370     if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
    371         set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
    372         set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
    373         set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
    374         set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
    375         errln("FAIL: Error in hashCode()");
    376     }
    377 
    378     delete set1;
    379     delete set1copy;
    380     delete set2;
    381     delete set1clone;
    382     delete set2clone;
    383 
    384 
    385 }
    386 void
    387 UnicodeSetTest::TestAddRemove(void) {
    388     UnicodeSet set; // Construct empty set
    389     doAssert(set.isEmpty() == TRUE, "set should be empty");
    390     doAssert(set.size() == 0, "size should be 0");
    391     set.complement();
    392     doAssert(set.size() == 0x110000, "size should be 0x110000");
    393     set.clear();
    394     set.add(0x0061, 0x007a);
    395     expectPairs(set, "az");
    396     doAssert(set.isEmpty() == FALSE, "set should not be empty");
    397     doAssert(set.size() != 0, "size should not be equal to 0");
    398     doAssert(set.size() == 26, "size should be equal to 26");
    399     set.remove(0x006d, 0x0070);
    400     expectPairs(set, "alqz");
    401     doAssert(set.size() == 22, "size should be equal to 22");
    402     set.remove(0x0065, 0x0067);
    403     expectPairs(set, "adhlqz");
    404     doAssert(set.size() == 19, "size should be equal to 19");
    405     set.remove(0x0064, 0x0069);
    406     expectPairs(set, "acjlqz");
    407     doAssert(set.size() == 16, "size should be equal to 16");
    408     set.remove(0x0063, 0x0072);
    409     expectPairs(set, "absz");
    410     doAssert(set.size() == 10, "size should be equal to 10");
    411     set.add(0x0066, 0x0071);
    412     expectPairs(set, "abfqsz");
    413     doAssert(set.size() == 22, "size should be equal to 22");
    414     set.remove(0x0061, 0x0067);
    415     expectPairs(set, "hqsz");
    416     set.remove(0x0061, 0x007a);
    417     expectPairs(set, "");
    418     doAssert(set.isEmpty() == TRUE, "set should be empty");
    419     doAssert(set.size() == 0, "size should be 0");
    420     set.add(0x0061);
    421     doAssert(set.isEmpty() == FALSE, "set should not be empty");
    422     doAssert(set.size() == 1, "size should not be equal to 1");
    423     set.add(0x0062);
    424     set.add(0x0063);
    425     expectPairs(set, "ac");
    426     doAssert(set.size() == 3, "size should not be equal to 3");
    427     set.add(0x0070);
    428     set.add(0x0071);
    429     expectPairs(set, "acpq");
    430     doAssert(set.size() == 5, "size should not be equal to 5");
    431     set.clear();
    432     expectPairs(set, "");
    433     doAssert(set.isEmpty() == TRUE, "set should be empty");
    434     doAssert(set.size() == 0, "size should be 0");
    435 
    436     // Try removing an entire set from another set
    437     expectPattern(set, "[c-x]", "cx");
    438     UnicodeSet set2;
    439     expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
    440     set.removeAll(set2);
    441     expectPairs(set, "deluxx");
    442 
    443     // Try adding an entire set to another set
    444     expectPattern(set, "[jackiemclean]", "aacceein");
    445     expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
    446     set.addAll(set2);
    447     expectPairs(set, "aacehort");
    448     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
    449 
    450     // Try retaining an set of elements contained in another set (intersection)
    451     UnicodeSet set3;
    452     expectPattern(set3, "[a-c]", "ac");
    453     doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
    454     set3.remove(0x0062);
    455     expectPairs(set3, "aacc");
    456     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
    457     set.retainAll(set3);
    458     expectPairs(set, "aacc");
    459     doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
    460     doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
    461     set.clear();
    462     doAssert(set.size() != set3.size(), "set.size() != set3.size()");
    463 
    464     // Test commutativity
    465     expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
    466     expectPattern(set2, "[jackiemclean]", "aacceein");
    467     set.addAll(set2);
    468     expectPairs(set, "aacehort");
    469     doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
    470 
    471 
    472 
    473 
    474 }
    475 
    476 /**
    477  * Make sure minimal representation is maintained.
    478  */
    479 void UnicodeSetTest::TestMinimalRep() {
    480     UErrorCode status = U_ZERO_ERROR;
    481     // This is pretty thoroughly tested by checkCanonicalRep()
    482     // run against the exhaustive operation results.  Use the code
    483     // here for debugging specific spot problems.
    484 
    485     // 1 overlap against 2
    486     UnicodeSet set("[h-km-q]", status);
    487     if (U_FAILURE(status)) { errln("FAIL"); return; }
    488     UnicodeSet set2("[i-o]", status);
    489     if (U_FAILURE(status)) { errln("FAIL"); return; }
    490     set.addAll(set2);
    491     expectPairs(set, "hq");
    492     // right
    493     set.applyPattern("[a-m]", status);
    494     if (U_FAILURE(status)) { errln("FAIL"); return; }
    495     set2.applyPattern("[e-o]", status);
    496     if (U_FAILURE(status)) { errln("FAIL"); return; }
    497     set.addAll(set2);
    498     expectPairs(set, "ao");
    499     // left
    500     set.applyPattern("[e-o]", status);
    501     if (U_FAILURE(status)) { errln("FAIL"); return; }
    502     set2.applyPattern("[a-m]", status);
    503     if (U_FAILURE(status)) { errln("FAIL"); return; }
    504     set.addAll(set2);
    505     expectPairs(set, "ao");
    506     // 1 overlap against 3
    507     set.applyPattern("[a-eg-mo-w]", status);
    508     if (U_FAILURE(status)) { errln("FAIL"); return; }
    509     set2.applyPattern("[d-q]", status);
    510     if (U_FAILURE(status)) { errln("FAIL"); return; }
    511     set.addAll(set2);
    512     expectPairs(set, "aw");
    513 }
    514 
    515 void UnicodeSetTest::TestAPI() {
    516     UErrorCode status = U_ZERO_ERROR;
    517     // default ct
    518     UnicodeSet set;
    519     if (!set.isEmpty() || set.getRangeCount() != 0) {
    520         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
    521               set);
    522     }
    523 
    524     // clear(), isEmpty()
    525     set.add(0x0061);
    526     if (set.isEmpty()) {
    527         errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
    528               set);
    529     }
    530     set.clear();
    531     if (!set.isEmpty()) {
    532         errln((UnicodeString)"FAIL, set should be empty but isn't: " +
    533               set);
    534     }
    535 
    536     // size()
    537     set.clear();
    538     if (set.size() != 0) {
    539         errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
    540               ": " + set);
    541     }
    542     set.add(0x0061);
    543     if (set.size() != 1) {
    544         errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
    545               ": " + set);
    546     }
    547     set.add(0x0031, 0x0039);
    548     if (set.size() != 10) {
    549         errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
    550               ": " + set);
    551     }
    552 
    553     // contains(first, last)
    554     set.clear();
    555     set.applyPattern("[A-Y 1-8 b-d l-y]", status);
    556     if (U_FAILURE(status)) { errln("FAIL"); return; }
    557     for (int32_t i = 0; i<set.getRangeCount(); ++i) {
    558         UChar32 a = set.getRangeStart(i);
    559         UChar32 b = set.getRangeEnd(i);
    560         if (!set.contains(a, b)) {
    561             errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
    562                   " but doesn't: " + set);
    563         }
    564         if (set.contains((UChar32)(a-1), b)) {
    565             errln((UnicodeString)"FAIL, shouldn't contain " +
    566                   (unsigned short)(a-1) + '-' + (unsigned short)b +
    567                   " but does: " + set);
    568         }
    569         if (set.contains(a, (UChar32)(b+1))) {
    570             errln((UnicodeString)"FAIL, shouldn't contain " +
    571                   (unsigned short)a + '-' + (unsigned short)(b+1) +
    572                   " but does: " + set);
    573         }
    574     }
    575 
    576     // Ported InversionList test.
    577     UnicodeSet a((UChar32)3,(UChar32)10);
    578     UnicodeSet b((UChar32)7,(UChar32)15);
    579     UnicodeSet c;
    580 
    581     logln((UnicodeString)"a [3-10]: " + a);
    582     logln((UnicodeString)"b [7-15]: " + b);
    583     c = a;
    584     c.addAll(b);
    585     UnicodeSet exp((UChar32)3,(UChar32)15);
    586     if (c == exp) {
    587         logln((UnicodeString)"c.set(a).add(b): " + c);
    588     } else {
    589         errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
    590     }
    591     c.complement();
    592     exp.set((UChar32)0, (UChar32)2);
    593     exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
    594     if (c == exp) {
    595         logln((UnicodeString)"c.complement(): " + c);
    596     } else {
    597         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    598     }
    599     c.complement();
    600     exp.set((UChar32)3, (UChar32)15);
    601     if (c == exp) {
    602         logln((UnicodeString)"c.complement(): " + c);
    603     } else {
    604         errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    605     }
    606     c = a;
    607     c.complementAll(b);
    608     exp.set((UChar32)3,(UChar32)6);
    609     exp.add((UChar32)11,(UChar32) 15);
    610     if (c == exp) {
    611         logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
    612     } else {
    613         errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
    614     }
    615 
    616     exp = c;
    617     bitsToSet(setToBits(c), c);
    618     if (c == exp) {
    619         logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
    620     } else {
    621         errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
    622     }
    623 
    624     // Additional tests for coverage JB#2118
    625     //UnicodeSet::complement(class UnicodeString const &)
    626     //UnicodeSet::complementAll(class UnicodeString const &)
    627     //UnicodeSet::containsNone(class UnicodeSet const &)
    628     //UnicodeSet::containsNone(long,long)
    629     //UnicodeSet::containsSome(class UnicodeSet const &)
    630     //UnicodeSet::containsSome(long,long)
    631     //UnicodeSet::removeAll(class UnicodeString const &)
    632     //UnicodeSet::retain(long)
    633     //UnicodeSet::retainAll(class UnicodeString const &)
    634     //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
    635     //UnicodeSetIterator::getString(void)
    636     set.clear();
    637     set.complement("ab");
    638     exp.applyPattern("[{ab}]", status);
    639     if (U_FAILURE(status)) { errln("FAIL"); return; }
    640     if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
    641 
    642     UnicodeSetIterator iset(set);
    643     if (!iset.next() || !iset.isString()) {
    644         errln("FAIL: UnicodeSetIterator::next/isString");
    645     } else if (iset.getString() != "ab") {
    646         errln("FAIL: UnicodeSetIterator::getString");
    647     }
    648 
    649     set.add((UChar32)0x61, (UChar32)0x7A);
    650     set.complementAll("alan");
    651     exp.applyPattern("[{ab}b-kmo-z]", status);
    652     if (U_FAILURE(status)) { errln("FAIL"); return; }
    653     if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
    654 
    655     exp.applyPattern("[a-z]", status);
    656     if (U_FAILURE(status)) { errln("FAIL"); return; }
    657     if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    658     if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    659     exp.applyPattern("[aln]", status);
    660     if (U_FAILURE(status)) { errln("FAIL"); return; }
    661     if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    662     if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    663 
    664     if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
    665         errln("FAIL: containsNone(UChar32, UChar32)");
    666     }
    667     if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
    668         errln("FAIL: containsSome(UChar32, UChar32)");
    669     }
    670     if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
    671         errln("FAIL: containsNone(UChar32, UChar32)");
    672     }
    673     if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
    674         errln("FAIL: containsSome(UChar32, UChar32)");
    675     }
    676 
    677     set.removeAll("liu");
    678     exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
    679     if (U_FAILURE(status)) { errln("FAIL"); return; }
    680     if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
    681 
    682     set.retainAll("star");
    683     exp.applyPattern("[rst]", status);
    684     if (U_FAILURE(status)) { errln("FAIL"); return; }
    685     if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
    686 
    687     set.retain((UChar32)0x73);
    688     exp.applyPattern("[s]", status);
    689     if (U_FAILURE(status)) { errln("FAIL"); return; }
    690     if (set != exp) { errln("FAIL: retain('s')"); return; }
    691 
    692     uint16_t buf[32];
    693     int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
    694     if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
    695     if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
    696         errln("FAIL: serialize");
    697         return;
    698     }
    699 
    700     // Conversions to and from USet
    701     UnicodeSet *uniset = &set;
    702     USet *uset = uniset->toUSet();
    703     TEST_ASSERT((void *)uset == (void *)uniset);
    704     UnicodeSet *setx = UnicodeSet::fromUSet(uset);
    705     TEST_ASSERT((void *)setx == (void *)uset);
    706     const UnicodeSet *constSet = uniset;
    707     const USet *constUSet = constSet->toUSet();
    708     TEST_ASSERT((void *)constUSet == (void *)constSet);
    709     const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
    710     TEST_ASSERT((void *)constSetx == (void *)constUSet);
    711 
    712     // span(UnicodeString) and spanBack(UnicodeString) convenience methods
    713     UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
    714     UnicodeSet ac(0x61, 0x63);
    715     ac.remove(0x62).freeze();
    716     if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
    717         ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
    718         ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
    719         ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
    720         ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
    721         ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
    722         ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
    723         ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
    724         ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
    725         ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
    726     ) {
    727         errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
    728     }
    729     if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
    730         ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
    731         ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
    732         ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
    733         ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
    734         ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
    735         ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
    736         ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
    737         ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
    738         ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
    739     ) {
    740         errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
    741     }
    742 }
    743 
    744 void UnicodeSetTest::TestIteration() {
    745     UErrorCode ec = U_ZERO_ERROR;
    746     int i = 0;
    747     int outerLoop;
    748 
    749     // 6 code points, 3 ranges, 2 strings, 8 total elements
    750     //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
    751     UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
    752     TEST_ASSERT_SUCCESS(ec);
    753     UnicodeSetIterator it(set);
    754 
    755     for (outerLoop=0; outerLoop<3; outerLoop++) {
    756         // Run the test multiple times, to check that iterator.reset() is working.
    757         for (i=0; i<10; i++) {
    758             UBool         nextv        = it.next();
    759             UBool         isString     = it.isString();
    760             int32_t       codePoint    = it.getCodepoint();
    761             //int32_t       codePointEnd = it.getCodepointEnd();
    762             UnicodeString s   = it.getString();
    763             switch (i) {
    764             case 0:
    765                 TEST_ASSERT(nextv == TRUE);
    766                 TEST_ASSERT(isString == FALSE);
    767                 TEST_ASSERT(codePoint==0x61);
    768                 TEST_ASSERT(s == "a");
    769                 break;
    770             case 1:
    771                 TEST_ASSERT(nextv == TRUE);
    772                 TEST_ASSERT(isString == FALSE);
    773                 TEST_ASSERT(codePoint==0x62);
    774                 TEST_ASSERT(s == "b");
    775                 break;
    776             case 2:
    777                 TEST_ASSERT(nextv == TRUE);
    778                 TEST_ASSERT(isString == FALSE);
    779                 TEST_ASSERT(codePoint==0x63);
    780                 TEST_ASSERT(s == "c");
    781                 break;
    782             case 3:
    783                 TEST_ASSERT(nextv == TRUE);
    784                 TEST_ASSERT(isString == FALSE);
    785                 TEST_ASSERT(codePoint==0x79);
    786                 TEST_ASSERT(s == "y");
    787                 break;
    788             case 4:
    789                 TEST_ASSERT(nextv == TRUE);
    790                 TEST_ASSERT(isString == FALSE);
    791                 TEST_ASSERT(codePoint==0x7a);
    792                 TEST_ASSERT(s == "z");
    793                 break;
    794             case 5:
    795                 TEST_ASSERT(nextv == TRUE);
    796                 TEST_ASSERT(isString == FALSE);
    797                 TEST_ASSERT(codePoint==0x1abcd);
    798                 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
    799                 break;
    800             case 6:
    801                 TEST_ASSERT(nextv == TRUE);
    802                 TEST_ASSERT(isString == TRUE);
    803                 TEST_ASSERT(s == "str1");
    804                 break;
    805             case 7:
    806                 TEST_ASSERT(nextv == TRUE);
    807                 TEST_ASSERT(isString == TRUE);
    808                 TEST_ASSERT(s == "str2");
    809                 break;
    810             case 8:
    811                 TEST_ASSERT(nextv == FALSE);
    812                 break;
    813             case 9:
    814                 TEST_ASSERT(nextv == FALSE);
    815                 break;
    816             }
    817         }
    818         it.reset();  // prepare to run the iteration again.
    819     }
    820 }
    821 
    822 
    823 
    824 
    825 void UnicodeSetTest::TestStrings() {
    826     UErrorCode ec = U_ZERO_ERROR;
    827 
    828     UnicodeSet* testList[] = {
    829         UnicodeSet::createFromAll("abc"),
    830         new UnicodeSet("[a-c]", ec),
    831 
    832         &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
    833         new UnicodeSet("[{ll}{ch}a-z]", ec),
    834 
    835         UnicodeSet::createFrom("ab}c"),
    836         new UnicodeSet("[{ab\\}c}]", ec),
    837 
    838         &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
    839         new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
    840 
    841         NULL
    842     };
    843 
    844     if (U_FAILURE(ec)) {
    845         errln("FAIL: couldn't construct test sets");
    846     }
    847 
    848     for (int32_t i = 0; testList[i] != NULL; i+=2) {
    849         if (U_SUCCESS(ec)) {
    850             UnicodeString pat0, pat1;
    851             testList[i]->toPattern(pat0, TRUE);
    852             testList[i+1]->toPattern(pat1, TRUE);
    853             if (*testList[i] == *testList[i+1]) {
    854                 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
    855             } else {
    856                 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
    857             }
    858         }
    859         delete testList[i];
    860         delete testList[i+1];
    861     }
    862 }
    863 
    864 /**
    865  * Test the [:Latin:] syntax.
    866  */
    867 void UnicodeSetTest::TestScriptSet() {
    868     expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
    869 
    870     expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
    871 
    872     /* Jitterbug 1423 */
    873     expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
    874 
    875 }
    876 
/**
 * Test property-based set patterns ([:prop:], \p{...}, \P{...}, \N{...}),
 * together with regex-compatibility and isolated-surrogate cases.
 *
 * DATA is a flat list of triples: a pattern, characters expected to be in
 * the set, and characters expected not to be in it.  Each triple is handed
 * to expectContainment(); the two character lists go through
 * CharsToUnicodeString() first.
 */
void UnicodeSetTest::TestPropertySet() {
    static const char* const DATA[] = {
        // Pattern, Chars IN, Chars NOT in

        "[:Latin:]",
        "aA",
        "\\u0391\\u03B1",

        "[\\p{Greek}]",
        "\\u0391\\u03B1",
        "aA",

        "\\P{ GENERAL Category = upper case letter }",
        "abc",
        "ABC",

#if !UCONFIG_NO_NORMALIZATION
        // Combining class: @since ICU 2.2
        // Check both symbolic and numeric
        "\\p{ccc=Nukta}",
        "\\u0ABC",
        "abc",

        "\\p{Canonical Combining Class = 11}",
        "\\u05B1",
        "\\u05B2",

        "[:c c c = iota subscript :]",
        "\\u0345",
        "xyz",
#endif

        // Bidi class: @since ICU 2.2
        "\\p{bidiclass=lefttoright}",
        "abc",
        "\\u0671\\u0672",

        // Binary properties: @since ICU 2.2
        "\\p{ideographic}",
        "\\u4E0A",
        "x",

        "[:math=false:]",
        "q)*(",
        // weiv: )(and * were removed from math in Unicode 4.0.1
        //"(*+)",
        "+<>^",

        // JB#1767 \N{}, \p{ASCII}
        "[:Ascii:]",
        "abc\\u0000\\u007F",
        "\\u0080\\u4E00",

        "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
        "az",
        "qrs",

        // JB#2015
        "[:any:]",
        "a\\U0010FFFF",
        "",

        "[:nv=0.5:]",
        "\\u00BD\\u0F2A",
        "\\u00BC",

        // JB#2653: Age
        "[:Age=1.1:]",
        "\\u03D6", // 1.1
        "\\u03D8\\u03D9", // 3.2

        "[:Age=3.1:]",
        "\\u1800\\u3400\\U0002f800",
        "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",

        // JB#2350: Case_Sensitive
        "[:Case Sensitive:]",
        "A\\u1FFC\\U00010410",
        ";\\u00B4\\U00010500",

        // JB#2832: C99-compatibility props
        "[:blank:]",
        " \\u0009",
        "1-9A-Z",

        "[:graph:]",
        "19AZ",
        " \\u0003\\u0007\\u0009\\u000A\\u000D",

        "[:punct:]",
        "!@#%&*()[]{}-_\\/;:,.?'\"",
        "09azAZ",

        "[:xdigit:]",
        "09afAF",
        "gG!",

        // Regex compatibility test
        "[-b]", // leading '-' is literal
        "-b",
        "ac",

        "[^-b]", // leading '-' is literal
        "ac",
        "-b",

        "[b-]", // trailing '-' is literal
        "-b",
        "ac",

        "[^b-]", // trailing '-' is literal
        "ac",
        "-b",

        "[a-b-]", // trailing '-' is literal
        "ab-",
        "c=",

        "[[a-q]&[p-z]-]", // trailing '-' is literal
        "pq-",
        "or=",

        "[\\s|\\)|:|$|\\>]", // from regex tests
        "s|):$>",
        "abc",

        "[\\uDC00cd]", // JB#2906: isolated trail at start
        "cd\\uDC00",
        "ab\\uD800\\U00010000",

        "[ab\\uD800]", // JB#2906: isolated lead at end
        "ab\\uD800",
        "cd\\uDC00\\U00010000",

        "[ab\\uD800cd]", // JB#2906: isolated lead in middle
        "abcd\\uD800",
        "ef\\uDC00\\U00010000",

        "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
        "abcd\\uDC00",
        "ef\\uD800\\U00010000",

#if !UCONFIG_NO_NORMALIZATION
        "[:^lccc=0:]", // Lead canonical class
        "\\u0300\\u0301",
        "abcd\\u00c0\\u00c5",

        "[:^tccc=0:]", // Trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
        "",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",

        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
        "\\u0F73\\u0F75\\u0F81",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",
#endif /* !UCONFIG_NO_NORMALIZATION */

        "[:Assigned:]",
        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
        "\\u0888\\uFDD3\\uFFFE\\U00050005",

        // Script_Extensions, new in Unicode 6.0
        "[:scx=Arab:]",
        "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
        "\\u061D\\uFDEF\\uFDFE",

        // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
        // so scx-sc is missing U+FDF2.
        "[[:Script_Extensions=Arabic:]-[:Arab:]]",
        "\\u0640\\u064B\\u0650\\u0655",
        "\\uFDF2"
    };

    static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);

    // Step through the table one (pattern, in, out) triple at a time.
    for (int32_t i=0; i<DATA_LEN; i+=3) {
        expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
                          CharsToUnicodeString(DATA[i+2]));
    }
}
   1067 
   1068 /**
   1069   * Test that Posix style character classes [:digit:], etc.
   1070   *   have the Unicode definitions from TR 18.
   1071   */
   1072 void UnicodeSetTest::TestPosixClasses() {
   1073     {
   1074         UErrorCode status = U_ZERO_ERROR;
   1075         UnicodeSet s1("[:alpha:]", status);
   1076         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
   1077         TEST_ASSERT_SUCCESS(status);
   1078         TEST_ASSERT(s1==s2);
   1079     }
   1080     {
   1081         UErrorCode status = U_ZERO_ERROR;
   1082         UnicodeSet s1("[:lower:]", status);
   1083         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
   1084         TEST_ASSERT_SUCCESS(status);
   1085         TEST_ASSERT(s1==s2);
   1086     }
   1087     {
   1088         UErrorCode status = U_ZERO_ERROR;
   1089         UnicodeSet s1("[:upper:]", status);
   1090         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
   1091         TEST_ASSERT_SUCCESS(status);
   1092         TEST_ASSERT(s1==s2);
   1093     }
   1094     {
   1095         UErrorCode status = U_ZERO_ERROR;
   1096         UnicodeSet s1("[:punct:]", status);
   1097         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
   1098         TEST_ASSERT_SUCCESS(status);
   1099         TEST_ASSERT(s1==s2);
   1100     }
   1101     {
   1102         UErrorCode status = U_ZERO_ERROR;
   1103         UnicodeSet s1("[:digit:]", status);
   1104         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
   1105         TEST_ASSERT_SUCCESS(status);
   1106         TEST_ASSERT(s1==s2);
   1107     }
   1108     {
   1109         UErrorCode status = U_ZERO_ERROR;
   1110         UnicodeSet s1("[:xdigit:]", status);
   1111         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
   1112         TEST_ASSERT_SUCCESS(status);
   1113         TEST_ASSERT(s1==s2);
   1114     }
   1115     {
   1116         UErrorCode status = U_ZERO_ERROR;
   1117         UnicodeSet s1("[:alnum:]", status);
   1118         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
   1119         TEST_ASSERT_SUCCESS(status);
   1120         TEST_ASSERT(s1==s2);
   1121     }
   1122     {
   1123         UErrorCode status = U_ZERO_ERROR;
   1124         UnicodeSet s1("[:space:]", status);
   1125         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
   1126         TEST_ASSERT_SUCCESS(status);
   1127         TEST_ASSERT(s1==s2);
   1128     }
   1129     {
   1130         UErrorCode status = U_ZERO_ERROR;
   1131         UnicodeSet s1("[:blank:]", status);
   1132         TEST_ASSERT_SUCCESS(status);
   1133         UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
   1134             status);
   1135         TEST_ASSERT_SUCCESS(status);
   1136         TEST_ASSERT(s1==s2);
   1137     }
   1138     {
   1139         UErrorCode status = U_ZERO_ERROR;
   1140         UnicodeSet s1("[:cntrl:]", status);
   1141         TEST_ASSERT_SUCCESS(status);
   1142         UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
   1143         TEST_ASSERT_SUCCESS(status);
   1144         TEST_ASSERT(s1==s2);
   1145     }
   1146     {
   1147         UErrorCode status = U_ZERO_ERROR;
   1148         UnicodeSet s1("[:graph:]", status);
   1149         TEST_ASSERT_SUCCESS(status);
   1150         UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
   1151         TEST_ASSERT_SUCCESS(status);
   1152         TEST_ASSERT(s1==s2);
   1153     }
   1154     {
   1155         UErrorCode status = U_ZERO_ERROR;
   1156         UnicodeSet s1("[:print:]", status);
   1157         TEST_ASSERT_SUCCESS(status);
   1158         UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
   1159         TEST_ASSERT_SUCCESS(status);
   1160         TEST_ASSERT(s1==s2);
   1161     }
   1162 }
   1163 /**
   1164  * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
   1165  */
   1166 void UnicodeSetTest::TestClone() {
   1167     UErrorCode ec = U_ZERO_ERROR;
   1168     UnicodeSet s("[abcxyz]", ec);
   1169     UnicodeSet t(s);
   1170     expectContainment(t, "abc", "def");
   1171 }
   1172 
   1173 /**
   1174  * Test the indexOf() and charAt() methods.
   1175  */
   1176 void UnicodeSetTest::TestIndexOf() {
   1177     UErrorCode ec = U_ZERO_ERROR;
   1178     UnicodeSet set("[a-cx-y3578]", ec);
   1179     if (U_FAILURE(ec)) {
   1180         errln("FAIL: UnicodeSet constructor");
   1181         return;
   1182     }
   1183     for (int32_t i=0; i<set.size(); ++i) {
   1184         UChar32 c = set.charAt(i);
   1185         if (set.indexOf(c) != i) {
   1186             errln("FAIL: charAt(%d) = %X => indexOf() => %d",
   1187                 i, c, set.indexOf(c));
   1188         }
   1189     }
   1190     UChar32 c = set.charAt(set.size());
   1191     if (c != -1) {
   1192         errln("FAIL: charAt(<out of range>) = %X", c);
   1193     }
   1194     int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
   1195     if (j != -1) {
   1196         errln((UnicodeString)"FAIL: indexOf('q') = " + j);
   1197     }
   1198 }
   1199 
   1200 /**
   1201  * Test closure API.
   1202  */
   1203 void UnicodeSetTest::TestCloseOver() {
   1204     UErrorCode ec = U_ZERO_ERROR;
   1205 
   1206     char CASE[] = {(char)USET_CASE_INSENSITIVE};
   1207     char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
   1208     const char* DATA[] = {
   1209         // selector, input, output
   1210         CASE,
   1211         "[aq\\u00DF{Bc}{bC}{Fi}]",
   1212         "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
   1213 
   1214         CASE,
   1215         "[\\u01F1]", // 'DZ'
   1216         "[\\u01F1\\u01F2\\u01F3]",
   1217 
   1218         CASE,
   1219         "[\\u1FB4]",
   1220         "[\\u1FB4{\\u03AC\\u03B9}]",
   1221 
   1222         CASE,
   1223         "[{F\\uFB01}]",
   1224         "[\\uFB03{ffi}]",
   1225 
   1226         CASE, // make sure binary search finds limits
   1227         "[a\\uFF3A]",
   1228         "[aA\\uFF3A\\uFF5A]",
   1229 
   1230         CASE,
   1231         "[a-z]","[A-Za-z\\u017F\\u212A]",
   1232         CASE,
   1233         "[abc]","[A-Ca-c]",
   1234         CASE,
   1235         "[ABC]","[A-Ca-c]",
   1236 
   1237         CASE, "[i]", "[iI]",
   1238 
   1239         CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
   1240         CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
   1241 
   1242         CASE, "[\\u0131]",          "[\\u0131]", // dotless i
   1243 
   1244         CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
   1245 
   1246         CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
   1247 
   1248         CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
   1249 
   1250         CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
   1251 
   1252         CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
   1253 
   1254         CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
   1255         CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
   1256 
   1257         CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
   1258 
   1259         CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
   1260 
   1261         CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
   1262 
   1263 #if !UCONFIG_NO_FILE_IO
   1264         CASE_MAPPINGS,
   1265         "[aq\\u00DF{Bc}{bC}{Fi}]",
   1266         "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
   1267 #endif
   1268 
   1269         CASE_MAPPINGS,
   1270         "[\\u01F1]", // 'DZ'
   1271         "[\\u01F1\\u01F2\\u01F3]",
   1272 
   1273         CASE_MAPPINGS,
   1274         "[a-z]",
   1275         "[A-Za-z]",
   1276 
   1277         NULL
   1278     };
   1279 
   1280     UnicodeSet s;
   1281     UnicodeSet t;
   1282     UnicodeString buf;
   1283     for (int32_t i=0; DATA[i]!=NULL; i+=3) {
   1284         int32_t selector = DATA[i][0];
   1285         UnicodeString pat(DATA[i+1], -1, US_INV);
   1286         UnicodeString exp(DATA[i+2], -1, US_INV);
   1287         s.applyPattern(pat, ec);
   1288         s.closeOver(selector);
   1289         t.applyPattern(exp, ec);
   1290         if (U_FAILURE(ec)) {
   1291             errln("FAIL: applyPattern failed");
   1292             continue;
   1293         }
   1294         if (s == t) {
   1295             logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
   1296         } else {
   1297             dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
   1298                   s.toPattern(buf, TRUE) + ", expected " + exp);
   1299         }
   1300     }
   1301 
   1302 #if 0
   1303     /*
   1304      * Unused test code.
   1305      * This was used to compare the old implementation (using USET_CASE)
   1306      * with the new one (using 0x100 temporarily)
   1307      * while transitioning from hardcoded case closure tables in uniset.cpp
   1308      * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
   1309      * and using ucase.c functions for closure.
   1310      * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
   1311      *
   1312      * Note: The old and new implementation never fully matched because
   1313      * the old implementation turned out to not map U+0130 and U+0131 correctly
   1314      * (dotted I and dotless i) and because the old implementation's data tables
   1315      * were outdated compared to Unicode 4.0.1 at the time of the change to the
   1316      * new implementation. (So sigmas and some other characters were not handled
   1317      * according to the newer Unicode version.)
   1318      */
   1319     UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
   1320     UnicodeSetIterator si(sens);
   1321     UnicodeString str, buf2;
   1322     const UnicodeString *pStr;
   1323     UChar32 c;
   1324     while(si.next()) {
   1325         if(!si.isString()) {
   1326             c=si.getCodepoint();
   1327             s.clear();
   1328             s.add(c);
   1329 
   1330             str.setTo(c);
   1331             str.foldCase();
   1332             sens2.add(str);
   1333 
   1334             t=s;
   1335             s.closeOver(USET_CASE);
   1336             t.closeOver(0x100);
   1337             if(s!=t) {
   1338                 errln("FAIL: closeOver(U+%04x) differs: ", c);
   1339                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
   1340             }
   1341         }
   1342     }
   1343     // remove all code points
   1344     // should contain all full case folding mapping strings
   1345     sens2.remove(0, 0x10ffff);
   1346     si.reset(sens2);
   1347     while(si.next()) {
   1348         if(si.isString()) {
   1349             pStr=&si.getString();
   1350             s.clear();
   1351             s.add(*pStr);
   1352             t=s2=s;
   1353             s.closeOver(USET_CASE);
   1354             t.closeOver(0x100);
   1355             if(s!=t) {
   1356                 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
   1357                 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
   1358             }
   1359         }
   1360     }
   1361 #endif
   1362 
   1363     // Test the pattern API
   1364     s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
   1365     if (U_FAILURE(ec)) {
   1366         errln("FAIL: applyPattern failed");
   1367     } else {
   1368         expectContainment(s, "abcABC", "defDEF");
   1369     }
   1370     UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
   1371     if (U_FAILURE(ec)) {
   1372         errln("FAIL: constructor failed");
   1373     } else {
   1374         expectContainment(v, "defDEF", "abcABC");
   1375     }
   1376     UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
   1377     if (U_FAILURE(ec)) {
   1378         errln("FAIL: construct w/case mappings failed");
   1379     } else {
   1380         expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
   1381     }
   1382 }
   1383 
   1384 void UnicodeSetTest::TestEscapePattern() {
   1385     const char pattern[] =
   1386         "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
   1387     const char exp[] =
   1388         "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
   1389     // We test this with two passes; in the second pass we
   1390     // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
   1391     // this fails -- which is what we expect.
   1392     for (int32_t pass=1; pass<=2; ++pass) {
   1393         UErrorCode ec = U_ZERO_ERROR;
   1394         UnicodeString pat(pattern, -1, US_INV);
   1395         if (pass==2) {
   1396             pat = pat.unescape();
   1397         }
   1398         // Pattern is only good for pass 1
   1399         UBool isPatternValid = (pass==1);
   1400 
   1401         UnicodeSet set(pat, ec);
   1402         if (U_SUCCESS(ec) != isPatternValid){
   1403             errln((UnicodeString)"FAIL: applyPattern(" +
   1404                   escape(pat) + ") => " +
   1405                   u_errorName(ec));
   1406             continue;
   1407         }
   1408         if (U_FAILURE(ec)) {
   1409             continue;
   1410         }
   1411         if (set.contains((UChar)0x0644)){
   1412             errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
   1413         }
   1414 
   1415         UnicodeString newpat;
   1416         set.toPattern(newpat, TRUE);
   1417         if (newpat == UnicodeString(exp, -1, US_INV)) {
   1418             logln(escape(pat) + " => " + newpat);
   1419         } else {
   1420             errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
   1421         }
   1422 
   1423         for (int32_t i=0; i<set.getRangeCount(); ++i) {
   1424             UnicodeString str("Range ");
   1425             str.append((UChar)(0x30 + i))
   1426                 .append(": ")
   1427                 .append((UChar32)set.getRangeStart(i))
   1428                 .append(" - ")
   1429                 .append((UChar32)set.getRangeEnd(i));
   1430             str = str + " (" + set.getRangeStart(i) + " - " +
   1431                 set.getRangeEnd(i) + ")";
   1432             if (set.getRangeStart(i) < 0) {
   1433                 errln((UnicodeString)"FAIL: " + escape(str));
   1434             } else {
   1435                 logln(escape(str));
   1436             }
   1437         }
   1438     }
   1439 }
   1440 
   1441 void UnicodeSetTest::expectRange(const UnicodeString& label,
   1442                                  const UnicodeSet& set,
   1443                                  UChar32 start, UChar32 end) {
   1444     UnicodeSet exp(start, end);
   1445     UnicodeString pat;
   1446     if (set == exp) {
   1447         logln(label + " => " + set.toPattern(pat, TRUE));
   1448     } else {
   1449         UnicodeString xpat;
   1450         errln((UnicodeString)"FAIL: " + label + " => " +
   1451               set.toPattern(pat, TRUE) +
   1452               ", expected " + exp.toPattern(xpat, TRUE));
   1453     }
   1454 }
   1455 
   1456 void UnicodeSetTest::TestInvalidCodePoint() {
   1457 
   1458     const UChar32 DATA[] = {
   1459         // Test range             Expected range
   1460         0, 0x10FFFF,              0, 0x10FFFF,
   1461         (UChar32)-1, 8,           0, 8,
   1462         8, 0x110000,              8, 0x10FFFF
   1463     };
   1464     const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
   1465 
   1466     UnicodeString pat;
   1467     int32_t i;
   1468 
   1469     for (i=0; i<DATA_LENGTH; i+=4) {
   1470         UChar32 start  = DATA[i];
   1471         UChar32 end    = DATA[i+1];
   1472         UChar32 xstart = DATA[i+2];
   1473         UChar32 xend   = DATA[i+3];
   1474 
   1475         // Try various API using the test code points
   1476 
   1477         UnicodeSet set(start, end);
   1478         expectRange((UnicodeString)"ct(" + start + "," + end + ")",
   1479                     set, xstart, xend);
   1480 
   1481         set.clear();
   1482         set.set(start, end);
   1483         expectRange((UnicodeString)"set(" + start + "," + end + ")",
   1484                     set, xstart, xend);
   1485 
   1486         UBool b = set.contains(start);
   1487         b = set.contains(start, end);
   1488         b = set.containsNone(start, end);
   1489         b = set.containsSome(start, end);
   1490         (void)b;   // Suppress set but not used warning.
   1491 
   1492         /*int32_t index = set.indexOf(start);*/
   1493 
   1494         set.clear();
   1495         set.add(start);
   1496         set.add(start, end);
   1497         expectRange((UnicodeString)"add(" + start + "," + end + ")",
   1498                     set, xstart, xend);
   1499 
   1500         set.set(0, 0x10FFFF);
   1501         set.retain(start, end);
   1502         expectRange((UnicodeString)"retain(" + start + "," + end + ")",
   1503                     set, xstart, xend);
   1504         set.retain(start);
   1505 
   1506         set.set(0, 0x10FFFF);
   1507         set.remove(start);
   1508         set.remove(start, end);
   1509         set.complement();
   1510         expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
   1511                     set, xstart, xend);
   1512 
   1513         set.set(0, 0x10FFFF);
   1514         set.complement(start, end);
   1515         set.complement();
   1516         expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
   1517                     set, xstart, xend);
   1518         set.complement(start);
   1519     }
   1520 
   1521     const UChar32 DATA2[] = {
   1522         0,
   1523         0x10FFFF,
   1524         (UChar32)-1,
   1525         0x110000
   1526     };
   1527     const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
   1528 
   1529     for (i=0; i<DATA2_LENGTH; ++i) {
   1530         UChar32 c = DATA2[i], end = 0x10FFFF;
   1531         UBool valid = (c >= 0 && c <= 0x10FFFF);
   1532 
   1533         UnicodeSet set(0, 0x10FFFF);
   1534 
   1535         // For single-codepoint contains, invalid codepoints are NOT contained
   1536         UBool b = set.contains(c);
   1537         if (b == valid) {
   1538             logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
   1539                   ") = " + b);
   1540         } else {
   1541             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
   1542                   ") = " + b);
   1543         }
   1544 
   1545         // For codepoint range contains, containsNone, and containsSome,
   1546         // invalid or empty (start > end) ranges have UNDEFINED behavior.
   1547         b = set.contains(c, end);
   1548         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
   1549               "," + end + ") = " + b);
   1550 
   1551         b = set.containsNone(c, end);
   1552         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
   1553               "," + end + ") = " + b);
   1554 
   1555         b = set.containsSome(c, end);
   1556         logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
   1557               "," + end + ") = " + b);
   1558 
   1559         int32_t index = set.indexOf(c);
   1560         if ((index >= 0) == valid) {
   1561             logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
   1562                   ") = " + index);
   1563         } else {
   1564             errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
   1565                   ") = " + index);
   1566         }
   1567     }
   1568 }
   1569 
   1570 // Used by TestSymbolTable
   1571 class TokenSymbolTable : public SymbolTable {
   1572 public:
   1573     Hashtable contents;
   1574 
   1575     TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
   1576         contents.setValueDeleter(uprv_deleteUObject);
   1577     }
   1578 
   1579     ~TokenSymbolTable() {}
   1580 
   1581     /**
   1582      * (Non-SymbolTable API) Add the given variable and value to
   1583      * the table.  Variable should NOT contain leading '$'.
   1584      */
   1585     void add(const UnicodeString& var, const UnicodeString& value,
   1586              UErrorCode& ec) {
   1587         if (U_SUCCESS(ec)) {
   1588             contents.put(var, new UnicodeString(value), ec);
   1589         }
   1590     }
   1591 
   1592     /**
   1593      * SymbolTable API
   1594      */
   1595     virtual const UnicodeString* lookup(const UnicodeString& s) const {
   1596         return (const UnicodeString*) contents.get(s);
   1597     }
   1598 
   1599     /**
   1600      * SymbolTable API
   1601      */
   1602     virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
   1603         return NULL;
   1604     }
   1605 
   1606     /**
   1607      * SymbolTable API
   1608      */
   1609     virtual UnicodeString parseReference(const UnicodeString& text,
   1610                                          ParsePosition& pos, int32_t limit) const {
   1611         int32_t start = pos.getIndex();
   1612         int32_t i = start;
   1613         UnicodeString result;
   1614         while (i < limit) {
   1615             UChar c = text.charAt(i);
   1616             if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
   1617                 break;
   1618             }
   1619             ++i;
   1620         }
   1621         if (i == start) { // No valid name chars
   1622             return result; // Indicate failure with empty string
   1623         }
   1624         pos.setIndex(i);
   1625         text.extractBetween(start, i, result);
   1626         return result;
   1627     }
   1628 };
   1629 
   1630 void UnicodeSetTest::TestSymbolTable() {
   1631     // Multiple test cases can be set up here.  Each test case
   1632     // is terminated by null:
   1633     // var, value, var, value,..., input pat., exp. output pat., null
   1634     const char* DATA[] = {
   1635         "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
   1636         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
   1637         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
   1638         NULL
   1639     };
   1640 
   1641     for (int32_t i=0; DATA[i]!=NULL; ++i) {
   1642         UErrorCode ec = U_ZERO_ERROR;
   1643         TokenSymbolTable sym(ec);
   1644         if (U_FAILURE(ec)) {
   1645             errln("FAIL: couldn't construct TokenSymbolTable");
   1646             continue;
   1647         }
   1648 
   1649         // Set up variables
   1650         while (DATA[i+2] != NULL) {
   1651             sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
   1652             if (U_FAILURE(ec)) {
   1653                 errln("FAIL: couldn't add to TokenSymbolTable");
   1654                 continue;
   1655             }
   1656             i += 2;
   1657         }
   1658 
   1659         // Input pattern and expected output pattern
   1660         UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
   1661         i += 2;
   1662 
   1663         ParsePosition pos(0);
   1664         UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
   1665         if (U_FAILURE(ec)) {
   1666             errln("FAIL: couldn't construct UnicodeSet");
   1667             continue;
   1668         }
   1669 
   1670         // results
   1671         if (pos.getIndex() != inpat.length()) {
   1672             errln((UnicodeString)"Failed to read to end of string \""
   1673                   + inpat + "\": read to "
   1674                   + pos.getIndex() + ", length is "
   1675                   + inpat.length());
   1676         }
   1677 
   1678         UnicodeSet us2(exppat, ec);
   1679         if (U_FAILURE(ec)) {
   1680             errln("FAIL: couldn't construct expected UnicodeSet");
   1681             continue;
   1682         }
   1683 
   1684         UnicodeString a, b;
   1685         if (us != us2) {
   1686             errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
   1687                   ", expected " + us2.toPattern(b, TRUE));
   1688         } else {
   1689             logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
   1690         }
   1691     }
   1692 }
   1693 
   1694 void UnicodeSetTest::TestSurrogate() {
   1695     const char* DATA[] = {
   1696         // These should all behave identically
   1697         "[abc\\uD800\\uDC00]",
   1698         // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
   1699         "[abc\\U00010000]",
   1700         0
   1701     };
   1702     for (int i=0; DATA[i] != 0; ++i) {
   1703         UErrorCode ec = U_ZERO_ERROR;
   1704         logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
   1705         UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
   1706         UnicodeSet set(str, ec);
   1707         if (U_FAILURE(ec)) {
   1708             errln("FAIL: UnicodeSet constructor");
   1709             continue;
   1710         }
   1711         expectContainment(set,
   1712                           CharsToUnicodeString("abc\\U00010000"),
   1713                           CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
   1714         if (set.size() != 4) {
   1715             errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
   1716                   set.size() + ", expected 4");
   1717         }
   1718 
   1719         {
   1720           UErrorCode subErr = U_ZERO_ERROR;
   1721           checkRoundTrip(set);
   1722           checkSerializeRoundTrip(set, subErr);
   1723         }
   1724     }
   1725 }
   1726 
   1727 void UnicodeSetTest::TestExhaustive() {
   1728     // exhaustive tests. Simulate UnicodeSets with integers.
   1729     // That gives us very solid tests (except for large memory tests).
   1730 
   1731     int32_t limit = 128;
   1732 
   1733     UnicodeSet x, y, z, aa;
   1734 
   1735     for (int32_t i = 0; i < limit; ++i) {
   1736         bitsToSet(i, x);
   1737         logln((UnicodeString)"Testing " + i + ", " + x);
   1738         _testComplement(i, x, y);
   1739 
   1740         UnicodeSet &toTest = bitsToSet(i, aa);
   1741 
   1742         // AS LONG AS WE ARE HERE, check roundtrip
   1743         checkRoundTrip(toTest);
   1744         UErrorCode ec = U_ZERO_ERROR;
   1745         checkSerializeRoundTrip(toTest, ec);
   1746 
   1747         for (int32_t j = 0; j < limit; ++j) {
   1748             _testAdd(i,j,  x,y,z);
   1749             _testXor(i,j,  x,y,z);
   1750             _testRetain(i,j,  x,y,z);
   1751             _testRemove(i,j,  x,y,z);
   1752         }
   1753     }
   1754 }
   1755 
   1756 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
   1757     bitsToSet(a, x);
   1758     z = x;
   1759     z.complement();
   1760     int32_t c = setToBits(z);
   1761     if (c != (~a)) {
   1762         errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
   1763         errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
   1764     }
   1765     checkCanonicalRep(z, (UnicodeString)"complement " + a);
   1766 }
   1767 
   1768 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1769     bitsToSet(a, x);
   1770     bitsToSet(b, y);
   1771     z = x;
   1772     z.addAll(y);
   1773     int32_t c = setToBits(z);
   1774     if (c != (a | b)) {
   1775         errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
   1776         errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
   1777     }
   1778     checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
   1779 }
   1780 
   1781 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1782     bitsToSet(a, x);
   1783     bitsToSet(b, y);
   1784     z = x;
   1785     z.retainAll(y);
   1786     int32_t c = setToBits(z);
   1787     if (c != (a & b)) {
   1788         errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
   1789         errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
   1790     }
   1791     checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
   1792 }
   1793 
   1794 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1795     bitsToSet(a, x);
   1796     bitsToSet(b, y);
   1797     z = x;
   1798     z.removeAll(y);
   1799     int32_t c = setToBits(z);
   1800     if (c != (a &~ b)) {
   1801         errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
   1802         errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
   1803     }
   1804     checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
   1805 }
   1806 
   1807 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
   1808     bitsToSet(a, x);
   1809     bitsToSet(b, y);
   1810     z = x;
   1811     z.complementAll(y);
   1812     int32_t c = setToBits(z);
   1813     if (c != (a ^ b)) {
   1814         errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
   1815         errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
   1816     }
   1817     checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
   1818 }
   1819 
   1820 /**
   1821  * Check that ranges are monotonically increasing and non-
   1822  * overlapping.
   1823  */
   1824 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
   1825     int32_t n = set.getRangeCount();
   1826     if (n < 0) {
   1827         errln((UnicodeString)"FAIL result of " + msg +
   1828               ": range count should be >= 0 but is " +
   1829               n /*+ " for " + set.toPattern())*/);
   1830         return;
   1831     }
   1832     UChar32 last = 0;
   1833     for (int32_t i=0; i<n; ++i) {
   1834         UChar32 start = set.getRangeStart(i);
   1835         UChar32 end = set.getRangeEnd(i);
   1836         if (start > end) {
   1837             errln((UnicodeString)"FAIL result of " + msg +
   1838                   ": range " + (i+1) +
   1839                   " start > end: " + (int)start + ", " + (int)end +
   1840                   " for " + set);
   1841         }
   1842         if (i > 0 && start <= last) {
   1843             errln((UnicodeString)"FAIL result of " + msg +
   1844                   ": range " + (i+1) +
   1845                   " overlaps previous range: " + (int)start + ", " + (int)end +
   1846                   " for " + set);
   1847         }
   1848         last = end;
   1849     }
   1850 }
   1851 
   1852 /**
   1853  * Convert a bitmask to a UnicodeSet.
   1854  */
   1855 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
   1856     result.clear();
   1857     for (UChar32 i = 0; i < 32; ++i) {
   1858         if ((a & (1<<i)) != 0) {
   1859             result.add(i);
   1860         }
   1861     }
   1862     return result;
   1863 }
   1864 
   1865 /**
   1866  * Convert a UnicodeSet to a bitmask.  Only the characters
   1867  * U+0000 to U+0020 are represented in the bitmask.
   1868  */
   1869 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
   1870     int32_t result = 0;
   1871     for (int32_t i = 0; i < 32; ++i) {
   1872         if (x.contains((UChar32)i)) {
   1873             result |= (1<<i);
   1874         }
   1875     }
   1876     return result;
   1877 }
   1878 
   1879 /**
   1880  * Return the representation of an inversion list based UnicodeSet
   1881  * as a pairs list.  Ranges are listed in ascending Unicode order.
   1882  * For example, the set [a-zA-M3] is represented as "33AMaz".
   1883  */
   1884 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
   1885     UnicodeString pairs;
   1886     for (int32_t i=0; i<set.getRangeCount(); ++i) {
   1887         UChar32 start = set.getRangeStart(i);
   1888         UChar32 end = set.getRangeEnd(i);
   1889         if (end > 0xFFFF) {
   1890             end = 0xFFFF;
   1891             i = set.getRangeCount(); // Should be unnecessary
   1892         }
   1893         pairs.append((UChar)start).append((UChar)end);
   1894     }
   1895     return pairs;
   1896 }
   1897 
   1898 /**
   1899  * Basic consistency check for a few items.
   1900  * That the iterator works, and that we can create a pattern and
   1901  * get the same thing back
   1902  */
   1903 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
   1904     {
   1905         UnicodeSet t(s);
   1906         checkEqual(s, t, "copy ct");
   1907     }
   1908 
   1909     {
   1910         UnicodeSet t(0xabcd, 0xdef0);  // dummy contents should be overwritten
   1911         t = s;
   1912         checkEqual(s, t, "operator=");
   1913     }
   1914 
   1915     {
   1916         UnicodeSet t;
   1917         copyWithIterator(t, s, FALSE);
   1918         checkEqual(s, t, "iterator roundtrip");
   1919     }
   1920 
   1921     {
   1922         UnicodeSet t;
   1923         copyWithIterator(t, s, TRUE); // try range
   1924         checkEqual(s, t, "iterator roundtrip");
   1925     }
   1926 
   1927     {
   1928         UnicodeSet t;
   1929         UnicodeString pat;
   1930         UErrorCode ec = U_ZERO_ERROR;
   1931         s.toPattern(pat, FALSE);
   1932         t.applyPattern(pat, ec);
   1933         if (U_FAILURE(ec)) {
   1934             errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));
   1935             return;
   1936         } else {
   1937             checkEqual(s, t, "toPattern(false)");
   1938         }
   1939     }
   1940 
   1941     {
   1942         UnicodeSet t;
   1943         UnicodeString pat;
   1944         UErrorCode ec = U_ZERO_ERROR;
   1945         s.toPattern(pat, TRUE);
   1946         t.applyPattern(pat, ec);
   1947         if (U_FAILURE(ec)) {
   1948             errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));
   1949             return;
   1950         } else {
   1951             checkEqual(s, t, "toPattern(true)");
   1952         }
   1953     }
   1954 }
   1955 
   1956 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &status) {
   1957   if(U_FAILURE(status)) return;
   1958   int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
   1959   if(status == U_BUFFER_OVERFLOW_ERROR) {
   1960     status = U_ZERO_ERROR;
   1961     serializeBuffer.resize(len);
   1962     len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);
   1963     // let 2nd error stand
   1964   }
   1965   if(U_FAILURE(status)) {
   1966     errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName(status));
   1967     return;
   1968   }
   1969   UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerialized, status);
   1970   if(U_FAILURE(status)) {
   1971     errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d, original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRangeCount());
   1972     return;
   1973   }
   1974 
   1975   checkEqual(t, deserialized, "Set was unequal when deserialized");
   1976 }
   1977 
   1978 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
   1979     t.clear();
   1980     UnicodeSetIterator it(s);
   1981     if (withRange) {
   1982         while (it.nextRange()) {
   1983             if (it.isString()) {
   1984                 t.add(it.getString());
   1985             } else {
   1986                 t.add(it.getCodepoint(), it.getCodepointEnd());
   1987             }
   1988         }
   1989     } else {
   1990         while (it.next()) {
   1991             if (it.isString()) {
   1992                 t.add(it.getString());
   1993             } else {
   1994                 t.add(it.getCodepoint());
   1995             }
   1996         }
   1997     }
   1998 }
   1999 
   2000 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
   2001   assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
   2002   assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
   2003     UnicodeString source; s.toPattern(source, TRUE);
   2004     UnicodeString result; t.toPattern(result, TRUE);
   2005     if (s != t) {
   2006         errln((UnicodeString)"FAIL: " + message
   2007               + "; source = " + source
   2008               + "; result = " + result
   2009               );
   2010         return FALSE;
   2011     } else {
   2012         logln((UnicodeString)"Ok: " + message
   2013               + "; source = " + source
   2014               + "; result = " + result
   2015               );
   2016     }
   2017     return TRUE;
   2018 }
   2019 
   2020 void
   2021 UnicodeSetTest::expectContainment(const UnicodeString& pat,
   2022                                   const UnicodeString& charsIn,
   2023                                   const UnicodeString& charsOut) {
   2024     UErrorCode ec = U_ZERO_ERROR;
   2025     UnicodeSet set(pat, ec);
   2026     if (U_FAILURE(ec)) {
   2027         dataerrln((UnicodeString)"FAIL: pattern \"" +
   2028               pat + "\" => " + u_errorName(ec));
   2029         return;
   2030     }
   2031     expectContainment(set, pat, charsIn, charsOut);
   2032 }
   2033 
   2034 void
   2035 UnicodeSetTest::expectContainment(const UnicodeSet& set,
   2036                                   const UnicodeString& charsIn,
   2037                                   const UnicodeString& charsOut) {
   2038     UnicodeString pat;
   2039     set.toPattern(pat);
   2040     expectContainment(set, pat, charsIn, charsOut);
   2041 }
   2042 
   2043 void
   2044 UnicodeSetTest::expectContainment(const UnicodeSet& set,
   2045                                   const UnicodeString& setName,
   2046                                   const UnicodeString& charsIn,
   2047                                   const UnicodeString& charsOut) {
   2048     UnicodeString bad;
   2049     UChar32 c;
   2050     int32_t i;
   2051 
   2052     for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
   2053         c = charsIn.char32At(i);
   2054         if (!set.contains(c)) {
   2055             bad.append(c);
   2056         }
   2057     }
   2058     if (bad.length() > 0) {
   2059         errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
   2060               ", expected containment of " + prettify(charsIn));
   2061     } else {
   2062         logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
   2063     }
   2064 
   2065     bad.truncate(0);
   2066     for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
   2067         c = charsOut.char32At(i);
   2068         if (set.contains(c)) {
   2069             bad.append(c);
   2070         }
   2071     }
   2072     if (bad.length() > 0) {
   2073         errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
   2074               ", expected non-containment of " + prettify(charsOut));
   2075     } else {
   2076         logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
   2077     }
   2078 }
   2079 
   2080 void
   2081 UnicodeSetTest::expectPattern(UnicodeSet& set,
   2082                               const UnicodeString& pattern,
   2083                               const UnicodeString& expectedPairs){
   2084     UErrorCode status = U_ZERO_ERROR;
   2085     set.applyPattern(pattern, status);
   2086     if (U_FAILURE(status)) {
   2087         errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
   2088               "\") failed");
   2089         return;
   2090     } else {
   2091         if (getPairs(set) != expectedPairs ) {
   2092             errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
   2093                   "\") => pairs \"" +
   2094                   escape(getPairs(set)) + "\", expected \"" +
   2095                   escape(expectedPairs) + "\"");
   2096         } else {
   2097             logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
   2098                   "\") => pairs \"" +
   2099                   escape(getPairs(set)) + "\"");
   2100         }
   2101     }
   2102     // the result of calling set.toPattern(), which is the string representation of
   2103     // this set(set), is passed to a  UnicodeSet constructor, and tested that it
   2104     // will produce another set that is equal to this one.
   2105     UnicodeString temppattern;
   2106     set.toPattern(temppattern);
   2107     UnicodeSet *tempset=new UnicodeSet(temppattern, status);
   2108     if (U_FAILURE(status)) {
   2109         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
   2110         return;
   2111     }
   2112     if(*tempset != set || getPairs(*tempset) != getPairs(set)){
   2113         errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
   2114             escape(getPairs(set)) + "\""));
   2115     } else{
   2116         logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
   2117     }
   2118 
   2119     delete tempset;
   2120 
   2121 }
   2122 
   2123 void
   2124 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
   2125     if (getPairs(set) != expectedPairs) {
   2126         errln(UnicodeString("FAIL: Expected pair list \"") +
   2127               escape(expectedPairs) + "\", got \"" +
   2128               escape(getPairs(set)) + "\"");
   2129     }
   2130 }
   2131 
   2132 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
   2133                                      const UnicodeString& expPat,
   2134                                      const char** expStrings) {
   2135     UnicodeString pat;
   2136     set.toPattern(pat, TRUE);
   2137     if (pat == expPat) {
   2138         logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
   2139     } else {
   2140         errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
   2141         return;
   2142     }
   2143     if (expStrings == NULL) {
   2144         return;
   2145     }
   2146     UBool in = TRUE;
   2147     for (int32_t i=0; expStrings[i] != NULL; ++i) {
   2148         if (expStrings[i] == NOT) { // sic; pointer comparison
   2149             in = FALSE;
   2150             continue;
   2151         }
   2152         UnicodeString s = CharsToUnicodeString(expStrings[i]);
   2153         UBool contained = set.contains(s);
   2154         if (contained == in) {
   2155             logln((UnicodeString)"Ok: " + expPat +
   2156                   (contained ? " contains {" : " does not contain {") +
   2157                   escape(expStrings[i]) + "}");
   2158         } else {
   2159             errln((UnicodeString)"FAIL: " + expPat +
   2160                   (contained ? " contains {" : " does not contain {") +
   2161                   escape(expStrings[i]) + "}");
   2162         }
   2163     }
   2164 }
   2165 
   2166 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
   2167 
   2168 void
   2169 UnicodeSetTest::doAssert(UBool condition, const char *message)
   2170 {
   2171     if (!condition) {
   2172         errln(UnicodeString("ERROR : ") + message);
   2173     }
   2174 }
   2175 
   2176 UnicodeString
   2177 UnicodeSetTest::escape(const UnicodeString& s) {
   2178     UnicodeString buf;
   2179     for (int32_t i=0; i<s.length(); )
   2180     {
   2181         UChar32 c = s.char32At(i);
   2182         if (0x0020 <= c && c <= 0x007F) {
   2183             buf += c;
   2184         } else {
   2185             if (c <= 0xFFFF) {
   2186                 buf += (UChar)0x5c; buf += (UChar)0x75;
   2187             } else {
   2188                 buf += (UChar)0x5c; buf += (UChar)0x55;
   2189                 buf += toHexString((c & 0xF0000000) >> 28);
   2190                 buf += toHexString((c & 0x0F000000) >> 24);
   2191                 buf += toHexString((c & 0x00F00000) >> 20);
   2192                 buf += toHexString((c & 0x000F0000) >> 16);
   2193             }
   2194             buf += toHexString((c & 0xF000) >> 12);
   2195             buf += toHexString((c & 0x0F00) >> 8);
   2196             buf += toHexString((c & 0x00F0) >> 4);
   2197             buf += toHexString(c & 0x000F);
   2198         }
   2199         i += U16_LENGTH(c);
   2200     }
   2201     return buf;
   2202 }
   2203 
   2204 void UnicodeSetTest::TestFreezable() {
   2205     UErrorCode errorCode=U_ZERO_ERROR;
   2206     UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
   2207     UnicodeSet idSet(idPattern, errorCode);
   2208     if(U_FAILURE(errorCode)) {
   2209         dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
   2210         return;
   2211     }
   2212 
   2213     UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
   2214     UnicodeSet wsSet(wsPattern, errorCode);
   2215     if(U_FAILURE(errorCode)) {
   2216         dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
   2217         return;
   2218     }
   2219 
   2220     idSet.add(idPattern);
   2221     UnicodeSet frozen(idSet);
   2222     frozen.freeze();
   2223 
   2224     if(idSet.isFrozen() || !frozen.isFrozen()) {
   2225         errln("FAIL: isFrozen() is wrong");
   2226     }
   2227     if(frozen!=idSet || !(frozen==idSet)) {
   2228         errln("FAIL: a copy-constructed frozen set differs from its original");
   2229     }
   2230 
   2231     frozen=wsSet;
   2232     if(frozen!=idSet || !(frozen==idSet)) {
   2233         errln("FAIL: a frozen set was modified by operator=");
   2234     }
   2235 
   2236     UnicodeSet frozen2(frozen);
   2237     if(frozen2!=frozen || frozen2!=idSet) {
   2238         errln("FAIL: a copied frozen set differs from its frozen original");
   2239     }
   2240     if(!frozen2.isFrozen()) {
   2241         errln("FAIL: copy-constructing a frozen set results in a thawed one");
   2242     }
   2243     UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
   2244     if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
   2245         errln("FAIL: UnicodeSet(5, 55) failed");
   2246     }
   2247     frozen3=frozen;
   2248     if(!frozen3.isFrozen()) {
   2249         errln("FAIL: copying a frozen set results in a thawed one");
   2250     }
   2251 
   2252     UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
   2253     if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
   2254         errln("FAIL: clone() failed");
   2255     }
   2256     cloned->add(0xd802, 0xd805);
   2257     if(cloned->containsSome(0xd802, 0xd805)) {
   2258         errln("FAIL: unable to modify clone");
   2259     }
   2260     delete cloned;
   2261 
   2262     UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
   2263     if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
   2264         errln("FAIL: cloneAsThawed() failed");
   2265     }
   2266     thawed->add(0xd802, 0xd805);
   2267     if(!thawed->contains(0xd802, 0xd805)) {
   2268         errln("FAIL: unable to modify thawed clone");
   2269     }
   2270     delete thawed;
   2271 
   2272     frozen.set(5, 55);
   2273     if(frozen!=idSet || !(frozen==idSet)) {
   2274         errln("FAIL: UnicodeSet::set() modified a frozen set");
   2275     }
   2276 
   2277     frozen.clear();
   2278     if(frozen!=idSet || !(frozen==idSet)) {
   2279         errln("FAIL: UnicodeSet::clear() modified a frozen set");
   2280     }
   2281 
   2282     frozen.closeOver(USET_CASE_INSENSITIVE);
   2283     if(frozen!=idSet || !(frozen==idSet)) {
   2284         errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
   2285     }
   2286 
   2287     frozen.compact();
   2288     if(frozen!=idSet || !(frozen==idSet)) {
   2289         errln("FAIL: UnicodeSet::compact() modified a frozen set");
   2290     }
   2291 
   2292     ParsePosition pos;
   2293     frozen.
   2294         applyPattern(wsPattern, errorCode).
   2295         applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
   2296         applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
   2297         applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
   2298         applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
   2299     if(frozen!=idSet || !(frozen==idSet)) {
   2300         errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
   2301     }
   2302 
   2303     frozen.
   2304         add(0xd800).
   2305         add(0xd802, 0xd805).
   2306         add(wsPattern).
   2307         addAll(idPattern).
   2308         addAll(wsSet);
   2309     if(frozen!=idSet || !(frozen==idSet)) {
   2310         errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
   2311     }
   2312 
   2313     frozen.
   2314         retain(0x62).
   2315         retain(0x64, 0x69).
   2316         retainAll(wsPattern).
   2317         retainAll(wsSet);
   2318     if(frozen!=idSet || !(frozen==idSet)) {
   2319         errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
   2320     }
   2321 
   2322     frozen.
   2323         remove(0x62).
   2324         remove(0x64, 0x69).
   2325         remove(idPattern).
   2326         removeAll(idPattern).
   2327         removeAll(idSet);
   2328     if(frozen!=idSet || !(frozen==idSet)) {
   2329         errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
   2330     }
   2331 
   2332     frozen.
   2333         complement().
   2334         complement(0x62).
   2335         complement(0x64, 0x69).
   2336         complement(idPattern).
   2337         complementAll(idPattern).
   2338         complementAll(idSet);
   2339     if(frozen!=idSet || !(frozen==idSet)) {
   2340         errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
   2341     }
   2342 }
   2343 
   2344 // Test span() etc. -------------------------------------------------------- ***
   2345 
   2346 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
   2347 static int32_t
   2348 appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
   2349     UErrorCode errorCode=U_ZERO_ERROR;
   2350     int32_t length8=0;
   2351     u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
   2352     if(U_SUCCESS(errorCode)) {
   2353         return length8;
   2354     } else {
   2355         // The string contains an unpaired surrogate.
   2356         // Ignore this string.
   2357         return 0;
   2358     }
   2359 }
   2360 
   2361 class UnicodeSetWithStringsIterator;
   2362 
   2363 // Make the strings in a UnicodeSet easily accessible.
   2364 class UnicodeSetWithStrings {
   2365 public:
   2366     UnicodeSetWithStrings(const UnicodeSet &normalSet) :
   2367             set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
   2368         int32_t size=set.size();
   2369         if(size>0 && set.charAt(size-1)<0) {
   2370             // If a set's last element is not a code point, then it must contain strings.
   2371             // Iterate over the set, skip all code point ranges, and cache the strings.
   2372             // Convert them to UTF-8 for spanUTF8().
   2373             UnicodeSetIterator iter(set);
   2374             const UnicodeString *s;
   2375             char *s8=utf8;
   2376             int32_t length8, utf8Count=0;
   2377             while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
   2378                 if(iter.isString()) {
   2379                     // Store the pointer to the set's string element
   2380                     // which we happen to know is a stable pointer.
   2381                     strings[stringsLength]=s=&iter.getString();
   2382                     utf8Count+=
   2383                         utf8Lengths[stringsLength]=length8=
   2384                         appendUTF8(s->getBuffer(), s->length(),
   2385                                    s8, (int32_t)(sizeof(utf8)-utf8Count));
   2386                     if(length8==0) {
   2387                         hasSurrogates=TRUE;  // Contains unpaired surrogates.
   2388                     }
   2389                     s8+=length8;
   2390                     ++stringsLength;
   2391                 }
   2392             }
   2393         }
   2394     }
   2395 
   2396     const UnicodeSet &getSet() const {
   2397         return set;
   2398     }
   2399 
   2400     UBool hasStrings() const {
   2401         return (UBool)(stringsLength>0);
   2402     }
   2403 
   2404     UBool hasStringsWithSurrogates() const {
   2405         return hasSurrogates;
   2406     }
   2407 
   2408 private:
   2409     friend class UnicodeSetWithStringsIterator;
   2410 
   2411     const UnicodeSet &set;
   2412 
   2413     const UnicodeString *strings[20];
   2414     int32_t stringsLength;
   2415     UBool hasSurrogates;
   2416 
   2417     char utf8[1024];
   2418     int32_t utf8Lengths[20];
   2419 };
   2420 
   2421 class UnicodeSetWithStringsIterator {
   2422 public:
   2423     UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
   2424             fSet(set), nextStringIndex(0), nextUTF8Start(0) {
   2425     }
   2426 
   2427     void reset() {
   2428         nextStringIndex=nextUTF8Start=0;
   2429     }
   2430 
   2431     const UnicodeString *nextString() {
   2432         if(nextStringIndex<fSet.stringsLength) {
   2433             return fSet.strings[nextStringIndex++];
   2434         } else {
   2435             return NULL;
   2436         }
   2437     }
   2438 
   2439     // Do not mix with calls to nextString().
   2440     const char *nextUTF8(int32_t &length) {
   2441         if(nextStringIndex<fSet.stringsLength) {
   2442             const char *s8=fSet.utf8+nextUTF8Start;
   2443             nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
   2444             return s8;
   2445         } else {
   2446             length=0;
   2447             return NULL;
   2448         }
   2449     }
   2450 
   2451 private:
   2452     const UnicodeSetWithStrings &fSet;
   2453     int32_t nextStringIndex;
   2454     int32_t nextUTF8Start;
   2455 };
   2456 
   2457 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
   2458 // at code point boundaries.
   2459 // That is, each edge of a match must not be in the middle of a surrogate pair.
   2460 static inline UBool
   2461 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
   2462     s+=start;
   2463     limit-=start;
   2464     int32_t length=t.length();
   2465     return 0==t.compare(s, length) &&
   2466            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
   2467            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
   2468 }
   2469 
   2470 // Implement span() with contains() for comparison.
   2471 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
   2472                                  USetSpanCondition spanCondition) {
   2473     const UnicodeSet &realSet(set.getSet());
   2474     if(!set.hasStrings()) {
   2475         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2476             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2477         }
   2478 
   2479         UChar32 c;
   2480         int32_t start=0, prev;
   2481         while((prev=start)<length) {
   2482             U16_NEXT(s, start, length, c);
   2483             if(realSet.contains(c)!=spanCondition) {
   2484                 break;
   2485             }
   2486         }
   2487         return prev;
   2488     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2489         UnicodeSetWithStringsIterator iter(set);
   2490         UChar32 c;
   2491         int32_t start, next;
   2492         for(start=next=0; start<length;) {
   2493             U16_NEXT(s, next, length, c);
   2494             if(realSet.contains(c)) {
   2495                 break;
   2496             }
   2497             const UnicodeString *str;
   2498             iter.reset();
   2499             while((str=iter.nextString())!=NULL) {
   2500                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
   2501                     // spanNeedsStrings=TRUE;
   2502                     return start;
   2503                 }
   2504             }
   2505             start=next;
   2506         }
   2507         return start;
   2508     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2509         UnicodeSetWithStringsIterator iter(set);
   2510         UChar32 c;
   2511         int32_t start, next, maxSpanLimit=0;
   2512         for(start=next=0; start<length;) {
   2513             U16_NEXT(s, next, length, c);
   2514             if(!realSet.contains(c)) {
   2515                 next=start;  // Do not span this single, not-contained code point.
   2516             }
   2517             const UnicodeString *str;
   2518             iter.reset();
   2519             while((str=iter.nextString())!=NULL) {
   2520                 if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
   2521                     // spanNeedsStrings=TRUE;
   2522                     int32_t matchLimit=start+str->length();
   2523                     if(matchLimit==length) {
   2524                         return length;
   2525                     }
   2526                     if(spanCondition==USET_SPAN_CONTAINED) {
   2527                         // Iterate for the shortest match at each position.
   2528                         // Recurse for each but the shortest match.
   2529                         if(next==start) {
   2530                             next=matchLimit;  // First match from start.
   2531                         } else {
   2532                             if(matchLimit<next) {
   2533                                 // Remember shortest match from start for iteration.
   2534                                 int32_t temp=next;
   2535                                 next=matchLimit;
   2536                                 matchLimit=temp;
   2537                             }
   2538                             // Recurse for non-shortest match from start.
   2539                             int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
   2540                                                                  USET_SPAN_CONTAINED);
   2541                             if((matchLimit+spanLength)>maxSpanLimit) {
   2542                                 maxSpanLimit=matchLimit+spanLength;
   2543                                 if(maxSpanLimit==length) {
   2544                                     return length;
   2545                                 }
   2546                             }
   2547                         }
   2548                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2549                         if(matchLimit>next) {
   2550                             // Remember longest match from start.
   2551                             next=matchLimit;
   2552                         }
   2553                     }
   2554                 }
   2555             }
   2556             if(next==start) {
   2557                 break;  // No match from start.
   2558             }
   2559             start=next;
   2560         }
   2561         if(start>maxSpanLimit) {
   2562             return start;
   2563         } else {
   2564             return maxSpanLimit;
   2565         }
   2566     }
   2567 }
   2568 
   2569 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
   2570                                      USetSpanCondition spanCondition) {
   2571     if(length==0) {
   2572         return 0;
   2573     }
   2574     const UnicodeSet &realSet(set.getSet());
   2575     if(!set.hasStrings()) {
   2576         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2577             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2578         }
   2579 
   2580         UChar32 c;
   2581         int32_t prev=length;
   2582         do {
   2583             U16_PREV(s, 0, length, c);
   2584             if(realSet.contains(c)!=spanCondition) {
   2585                 break;
   2586             }
   2587         } while((prev=length)>0);
   2588         return prev;
   2589     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2590         UnicodeSetWithStringsIterator iter(set);
   2591         UChar32 c;
   2592         int32_t prev=length, length0=length;
   2593         do {
   2594             U16_PREV(s, 0, length, c);
   2595             if(realSet.contains(c)) {
   2596                 break;
   2597             }
   2598             const UnicodeString *str;
   2599             iter.reset();
   2600             while((str=iter.nextString())!=NULL) {
   2601                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
   2602                     // spanNeedsStrings=TRUE;
   2603                     return prev;
   2604                 }
   2605             }
   2606         } while((prev=length)>0);
   2607         return prev;
   2608     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2609         UnicodeSetWithStringsIterator iter(set);
   2610         UChar32 c;
   2611         int32_t prev=length, minSpanStart=length, length0=length;
   2612         do {
   2613             U16_PREV(s, 0, length, c);
   2614             if(!realSet.contains(c)) {
   2615                 length=prev;  // Do not span this single, not-contained code point.
   2616             }
   2617             const UnicodeString *str;
   2618             iter.reset();
   2619             while((str=iter.nextString())!=NULL) {
   2620                 if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
   2621                     // spanNeedsStrings=TRUE;
   2622                     int32_t matchStart=prev-str->length();
   2623                     if(matchStart==0) {
   2624                         return 0;
   2625                     }
   2626                     if(spanCondition==USET_SPAN_CONTAINED) {
   2627                         // Iterate for the shortest match at each position.
   2628                         // Recurse for each but the shortest match.
   2629                         if(length==prev) {
   2630                             length=matchStart;  // First match from prev.
   2631                         } else {
   2632                             if(matchStart>length) {
   2633                                 // Remember shortest match from prev for iteration.
   2634                                 int32_t temp=length;
   2635                                 length=matchStart;
   2636                                 matchStart=temp;
   2637                             }
   2638                             // Recurse for non-shortest match from prev.
   2639                             int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
   2640                                                                     USET_SPAN_CONTAINED);
   2641                             if(spanStart<minSpanStart) {
   2642                                 minSpanStart=spanStart;
   2643                                 if(minSpanStart==0) {
   2644                                     return 0;
   2645                                 }
   2646                             }
   2647                         }
   2648                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2649                         if(matchStart<length) {
   2650                             // Remember longest match from prev.
   2651                             length=matchStart;
   2652                         }
   2653                     }
   2654                 }
   2655             }
   2656             if(length==prev) {
   2657                 break;  // No match from prev.
   2658             }
   2659         } while((prev=length)>0);
   2660         if(prev<minSpanStart) {
   2661             return prev;
   2662         } else {
   2663             return minSpanStart;
   2664         }
   2665     }
   2666 }
   2667 
   2668 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
   2669                                 USetSpanCondition spanCondition) {
   2670     const UnicodeSet &realSet(set.getSet());
   2671     if(!set.hasStrings()) {
   2672         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2673             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2674         }
   2675 
   2676         UChar32 c;
   2677         int32_t start=0, prev;
   2678         while((prev=start)<length) {
   2679             U8_NEXT_OR_FFFD(s, start, length, c);
   2680             if(realSet.contains(c)!=spanCondition) {
   2681                 break;
   2682             }
   2683         }
   2684         return prev;
   2685     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2686         UnicodeSetWithStringsIterator iter(set);
   2687         UChar32 c;
   2688         int32_t start, next;
   2689         for(start=next=0; start<length;) {
   2690             U8_NEXT_OR_FFFD(s, next, length, c);
   2691             if(realSet.contains(c)) {
   2692                 break;
   2693             }
   2694             const char *s8;
   2695             int32_t length8;
   2696             iter.reset();
   2697             while((s8=iter.nextUTF8(length8))!=NULL) {
   2698                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
   2699                     // spanNeedsStrings=TRUE;
   2700                     return start;
   2701                 }
   2702             }
   2703             start=next;
   2704         }
   2705         return start;
   2706     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2707         UnicodeSetWithStringsIterator iter(set);
   2708         UChar32 c;
   2709         int32_t start, next, maxSpanLimit=0;
   2710         for(start=next=0; start<length;) {
   2711             U8_NEXT_OR_FFFD(s, next, length, c);
   2712             if(!realSet.contains(c)) {
   2713                 next=start;  // Do not span this single, not-contained code point.
   2714             }
   2715             const char *s8;
   2716             int32_t length8;
   2717             iter.reset();
   2718             while((s8=iter.nextUTF8(length8))!=NULL) {
   2719                 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
   2720                     // spanNeedsStrings=TRUE;
   2721                     int32_t matchLimit=start+length8;
   2722                     if(matchLimit==length) {
   2723                         return length;
   2724                     }
   2725                     if(spanCondition==USET_SPAN_CONTAINED) {
   2726                         // Iterate for the shortest match at each position.
   2727                         // Recurse for each but the shortest match.
   2728                         if(next==start) {
   2729                             next=matchLimit;  // First match from start.
   2730                         } else {
   2731                             if(matchLimit<next) {
   2732                                 // Remember shortest match from start for iteration.
   2733                                 int32_t temp=next;
   2734                                 next=matchLimit;
   2735                                 matchLimit=temp;
   2736                             }
   2737                             // Recurse for non-shortest match from start.
   2738                             int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
   2739                                                                 USET_SPAN_CONTAINED);
   2740                             if((matchLimit+spanLength)>maxSpanLimit) {
   2741                                 maxSpanLimit=matchLimit+spanLength;
   2742                                 if(maxSpanLimit==length) {
   2743                                     return length;
   2744                                 }
   2745                             }
   2746                         }
   2747                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2748                         if(matchLimit>next) {
   2749                             // Remember longest match from start.
   2750                             next=matchLimit;
   2751                         }
   2752                     }
   2753                 }
   2754             }
   2755             if(next==start) {
   2756                 break;  // No match from start.
   2757             }
   2758             start=next;
   2759         }
   2760         if(start>maxSpanLimit) {
   2761             return start;
   2762         } else {
   2763             return maxSpanLimit;
   2764         }
   2765     }
   2766 }
   2767 
   2768 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
   2769                                     USetSpanCondition spanCondition) {
   2770     if(length==0) {
   2771         return 0;
   2772     }
   2773     const UnicodeSet &realSet(set.getSet());
   2774     if(!set.hasStrings()) {
   2775         if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   2776             spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   2777         }
   2778 
   2779         UChar32 c;
   2780         int32_t prev=length;
   2781         do {
   2782             U8_PREV_OR_FFFD(s, 0, length, c);
   2783             if(realSet.contains(c)!=spanCondition) {
   2784                 break;
   2785             }
   2786         } while((prev=length)>0);
   2787         return prev;
   2788     } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   2789         UnicodeSetWithStringsIterator iter(set);
   2790         UChar32 c;
   2791         int32_t prev=length;
   2792         do {
   2793             U8_PREV_OR_FFFD(s, 0, length, c);
   2794             if(realSet.contains(c)) {
   2795                 break;
   2796             }
   2797             const char *s8;
   2798             int32_t length8;
   2799             iter.reset();
   2800             while((s8=iter.nextUTF8(length8))!=NULL) {
   2801                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
   2802                     // spanNeedsStrings=TRUE;
   2803                     return prev;
   2804                 }
   2805             }
   2806         } while((prev=length)>0);
   2807         return prev;
   2808     } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
   2809         UnicodeSetWithStringsIterator iter(set);
   2810         UChar32 c;
   2811         int32_t prev=length, minSpanStart=length;
   2812         do {
   2813             U8_PREV_OR_FFFD(s, 0, length, c);
   2814             if(!realSet.contains(c)) {
   2815                 length=prev;  // Do not span this single, not-contained code point.
   2816             }
   2817             const char *s8;
   2818             int32_t length8;
   2819             iter.reset();
   2820             while((s8=iter.nextUTF8(length8))!=NULL) {
   2821                 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
   2822                     // spanNeedsStrings=TRUE;
   2823                     int32_t matchStart=prev-length8;
   2824                     if(matchStart==0) {
   2825                         return 0;
   2826                     }
   2827                     if(spanCondition==USET_SPAN_CONTAINED) {
   2828                         // Iterate for the shortest match at each position.
   2829                         // Recurse for each but the shortest match.
   2830                         if(length==prev) {
   2831                             length=matchStart;  // First match from prev.
   2832                         } else {
   2833                             if(matchStart>length) {
   2834                                 // Remember shortest match from prev for iteration.
   2835                                 int32_t temp=length;
   2836                                 length=matchStart;
   2837                                 matchStart=temp;
   2838                             }
   2839                             // Recurse for non-shortest match from prev.
   2840                             int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
   2841                                                                    USET_SPAN_CONTAINED);
   2842                             if(spanStart<minSpanStart) {
   2843                                 minSpanStart=spanStart;
   2844                                 if(minSpanStart==0) {
   2845                                     return 0;
   2846                                 }
   2847                             }
   2848                         }
   2849                     } else /* spanCondition==USET_SPAN_SIMPLE */ {
   2850                         if(matchStart<length) {
   2851                             // Remember longest match from prev.
   2852                             length=matchStart;
   2853                         }
   2854                     }
   2855                 }
   2856             }
   2857             if(length==prev) {
   2858                 break;  // No match from prev.
   2859             }
   2860         } while((prev=length)>0);
   2861         if(prev<minSpanStart) {
   2862             return prev;
   2863         } else {
   2864             return minSpanStart;
   2865         }
   2866     }
   2867 }
   2868 
   2869 // spans to be performed and compared
   2870 enum {
   2871     SPAN_UTF16          =1,
   2872     SPAN_UTF8           =2,
   2873     SPAN_UTFS           =3,
   2874 
   2875     SPAN_SET            =4,
   2876     SPAN_COMPLEMENT     =8,
   2877     SPAN_POLARITY       =0xc,
   2878 
   2879     SPAN_FWD            =0x10,
   2880     SPAN_BACK           =0x20,
   2881     SPAN_DIRS           =0x30,
   2882 
   2883     SPAN_CONTAINED      =0x100,
   2884     SPAN_SIMPLE         =0x200,
   2885     SPAN_CONDITION      =0x300,
   2886 
   2887     SPAN_ALL            =0x33f
   2888 };
   2889 
   2890 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
   2891     return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
   2892 }
   2893 
   2894 static inline int32_t slen(const void *s, UBool isUTF16) {
   2895     return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
   2896 }
   2897 
   2898 /*
   2899  * Count spans on a string with the method according to type and set the span limits.
   2900  * The set may be the complement of the original.
   2901  * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
   2902  * according to the expected number of spans.
   2903  * Sets typeName to an empty string if there is no such type.
   2904  * Returns -1 if the span option is filtered out.
   2905  */
   2906 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
   2907                         const void *s, int32_t length, UBool isUTF16,
   2908                         uint32_t whichSpans,
   2909                         int type, const char *&typeName,
   2910                         int32_t limits[], int32_t limitsCapacity,
   2911                         int32_t expectCount) {
   2912     const UnicodeSet &realSet(set.getSet());
   2913     int32_t start, count;
   2914     USetSpanCondition spanCondition, firstSpanCondition, contained;
   2915     UBool isForward;
   2916 
   2917     if(type<0 || 7<type) {
   2918         typeName="";
   2919         return 0;
   2920     }
   2921 
   2922     static const char *const typeNames16[]={
   2923         "contains", "contains(LM)",
   2924         "span", "span(LM)",
   2925         "containsBack", "containsBack(LM)",
   2926         "spanBack", "spanBack(LM)"
   2927     };
   2928 
   2929     static const char *const typeNames8[]={
   2930         "containsUTF8", "containsUTF8(LM)",
   2931         "spanUTF8", "spanUTF8(LM)",
   2932         "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
   2933         "spanBackUTF8", "spanBackUTF8(LM)"
   2934     };
   2935 
   2936     typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
   2937 
   2938     // filter span options
   2939     if(type<=3) {
   2940         // span forward
   2941         if((whichSpans&SPAN_FWD)==0) {
   2942             return -1;
   2943         }
   2944         isForward=TRUE;
   2945     } else {
   2946         // span backward
   2947         if((whichSpans&SPAN_BACK)==0) {
   2948             return -1;
   2949         }
   2950         isForward=FALSE;
   2951     }
   2952     if((type&1)==0) {
   2953         // use USET_SPAN_CONTAINED
   2954         if((whichSpans&SPAN_CONTAINED)==0) {
   2955             return -1;
   2956         }
   2957         contained=USET_SPAN_CONTAINED;
   2958     } else {
   2959         // use USET_SPAN_SIMPLE
   2960         if((whichSpans&SPAN_SIMPLE)==0) {
   2961             return -1;
   2962         }
   2963         contained=USET_SPAN_SIMPLE;
   2964     }
   2965 
   2966     // Default first span condition for going forward with an uncomplemented set.
   2967     spanCondition=USET_SPAN_NOT_CONTAINED;
   2968     if(isComplement) {
   2969         spanCondition=invertSpanCondition(spanCondition, contained);
   2970     }
   2971 
   2972     // First span condition for span(), used to terminate the spanBack() iteration.
   2973     firstSpanCondition=spanCondition;
   2974 
   2975     // spanBack(): Its initial span condition is span()'s last span condition,
   2976     // which is the opposite of span()'s first span condition
   2977     // if we expect an even number of spans.
   2978     // (The loop inverts spanCondition (expectCount-1) times
   2979     // before the expectCount'th span() call.)
   2980     // If we do not compare forward and backward directions, then we do not have an
   2981     // expectCount and just start with firstSpanCondition.
   2982     if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
   2983         spanCondition=invertSpanCondition(spanCondition, contained);
   2984     }
   2985 
   2986     count=0;
   2987     switch(type) {
   2988     case 0:
   2989     case 1:
   2990         start=0;
   2991         if(length<0) {
   2992             length=slen(s, isUTF16);
   2993         }
   2994         for(;;) {
   2995             start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
   2996                               containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
   2997             if(count<limitsCapacity) {
   2998                 limits[count]=start;
   2999             }
   3000             ++count;
   3001             if(start>=length) {
   3002                 break;
   3003             }
   3004             spanCondition=invertSpanCondition(spanCondition, contained);
   3005         }
   3006         break;
   3007     case 2:
   3008     case 3:
   3009         start=0;
   3010         for(;;) {
   3011             start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
   3012                               realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
   3013             if(count<limitsCapacity) {
   3014                 limits[count]=start;
   3015             }
   3016             ++count;
   3017             if(length>=0 ? start>=length :
   3018                            isUTF16 ? ((const UChar *)s)[start]==0 :
   3019                                      ((const char *)s)[start]==0
   3020             ) {
   3021                 break;
   3022             }
   3023             spanCondition=invertSpanCondition(spanCondition, contained);
   3024         }
   3025         break;
   3026     case 4:
   3027     case 5:
   3028         if(length<0) {
   3029             length=slen(s, isUTF16);
   3030         }
   3031         for(;;) {
   3032             ++count;
   3033             if(count<=limitsCapacity) {
   3034                 limits[limitsCapacity-count]=length;
   3035             }
   3036             length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
   3037                               containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
   3038             if(length==0 && spanCondition==firstSpanCondition) {
   3039                 break;
   3040             }
   3041             spanCondition=invertSpanCondition(spanCondition, contained);
   3042         }
   3043         if(count<limitsCapacity) {
   3044             memmove(limits, limits+(limitsCapacity-count), count*4);
   3045         }
   3046         break;
   3047     case 6:
   3048     case 7:
   3049         for(;;) {
   3050             ++count;
   3051             if(count<=limitsCapacity) {
   3052                 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
   3053             }
   3054             // Note: Length<0 is tested only for the first spanBack().
   3055             // If we wanted to keep length<0 for all spanBack()s, we would have to
   3056             // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
   3057             length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
   3058                               realSet.spanBackUTF8((const char *)s, length, spanCondition);
   3059             if(length==0 && spanCondition==firstSpanCondition) {
   3060                 break;
   3061             }
   3062             spanCondition=invertSpanCondition(spanCondition, contained);
   3063         }
   3064         if(count<limitsCapacity) {
   3065             memmove(limits, limits+(limitsCapacity-count), count*4);
   3066         }
   3067         break;
   3068     default:
   3069         typeName="";
   3070         return -1;
   3071     }
   3072 
   3073     return count;
   3074 }
   3075 
   // Indexes of the sets under test.
   // Bit 0 of an index is set for a complemented set (odd index=isComplement).
   enum {
       SLOW=0,
       SLOW_NOT=1,
       FAST=2,
       FAST_NOT=3,
       SET_COUNT=4
   };

   // Short names for the sets above, indexed by the same constants;
   // testSpan() prints them in its failure messages.
   static const char *const setNames[SET_COUNT]={
       "slow", "slow.not", "fast", "fast.not"
   };
   3091 
   3092 /*
   3093  * Verify that we get the same results whether we look at text with contains(),
   3094  * span() or spanBack(), using unfrozen or frozen versions of the set,
   3095  * and using the set or its complement (switching the spanConditions accordingly).
   3096  * The latter verifies that
   3097  *   set.span(spanCondition) == set.complement().span(!spanCondition).
   3098  *
   3099  * The expectLimits[] are either provided by the caller (with expectCount>=0)
   3100  * or returned to the caller (with an input expectCount<0).
   3101  */
   3102 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
   3103                               const void *s, int32_t length, UBool isUTF16,
   3104                               uint32_t whichSpans,
   3105                               int32_t expectLimits[], int32_t &expectCount,
   3106                               const char *testName, int32_t index) {
   3107     int32_t limits[500];
   3108     int32_t limitsCount;
   3109     int i, j;
   3110 
   3111     const char *typeName;
   3112     int type;
   3113 
   3114     for(i=0; i<SET_COUNT; ++i) {
   3115         if((i&1)==0) {
   3116             // Even-numbered sets are original, uncomplemented sets.
   3117             if((whichSpans&SPAN_SET)==0) {
   3118                 continue;
   3119             }
   3120         } else {
   3121             // Odd-numbered sets are complemented.
   3122             if((whichSpans&SPAN_COMPLEMENT)==0) {
   3123                 continue;
   3124             }
   3125         }
   3126         for(type=0;; ++type) {
   3127             limitsCount=getSpans(*sets[i], (UBool)(i&1),
   3128                                  s, length, isUTF16,
   3129                                  whichSpans,
   3130                                  type, typeName,
   3131                                  limits, UPRV_LENGTHOF(limits), expectCount);
   3132             if(typeName[0]==0) {
   3133                 break; // All types tried.
   3134             }
   3135             if(limitsCount<0) {
   3136                 continue; // Span option filtered out.
   3137             }
   3138             if(expectCount<0) {
   3139                 expectCount=limitsCount;
   3140                 if(limitsCount>UPRV_LENGTHOF(limits)) {
   3141                     errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
   3142                           testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
   3143                     return;
   3144                 }
   3145                 memcpy(expectLimits, limits, limitsCount*4);
   3146             } else if(limitsCount!=expectCount) {
   3147                 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
   3148                       testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
   3149             } else {
   3150                 for(j=0; j<limitsCount; ++j) {
   3151                     if(limits[j]!=expectLimits[j]) {
   3152                         errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
   3153                               testName, (long)index, setNames[i], typeName, (long)limitsCount,
   3154                               j, (long)limits[j], (long)expectLimits[j]);
   3155                         break;
   3156                     }
   3157                 }
   3158             }
   3159         }
   3160     }
   3161 
   3162     // Compare span() with containsAll()/containsNone(),
   3163     // but only if we have expectLimits[] from the uncomplemented set.
   3164     if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
   3165         const UChar *s16=(const UChar *)s;
   3166         UnicodeString string;
   3167         int32_t prev=0, limit, length;
   3168         for(i=0; i<expectCount; ++i) {
   3169             limit=expectLimits[i];
   3170             length=limit-prev;
   3171             if(length>0) {
   3172                 string.setTo(FALSE, s16+prev, length);  // read-only alias
   3173                 if(i&1) {
   3174                     if(!sets[SLOW]->getSet().containsAll(string)) {
   3175                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
   3176                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
   3177                         return;
   3178                     }
   3179                     if(!sets[FAST]->getSet().containsAll(string)) {
   3180                         errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
   3181                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
   3182                         return;
   3183                     }
   3184                 } else {
   3185                     if(!sets[SLOW]->getSet().containsNone(string)) {
   3186                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
   3187                               testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
   3188                         return;
   3189                     }
   3190                     if(!sets[FAST]->getSet().containsNone(string)) {
   3191                         errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
   3192                               testName, (long)index, setNames[FAST], (long)prev, (long)limit);
   3193                         return;
   3194                     }
   3195                 }
   3196             }
   3197             prev=limit;
   3198         }
   3199     }
   3200 }
   3201 
   3202 // Specifically test either UTF-16 or UTF-8.
   3203 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
   3204                               const void *s, int32_t length, UBool isUTF16,
   3205                               uint32_t whichSpans,
   3206                               const char *testName, int32_t index) {
   3207     int32_t expectLimits[500];
   3208     int32_t expectCount=-1;
   3209     testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
   3210 }
   3211 
   3212 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
   3213     UChar c, c2;
   3214 
   3215     if(length>=0) {
   3216         while(length>0) {
   3217             c=*s++;
   3218             --length;
   3219             if(0xd800<=c && c<0xe000) {
   3220                 if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
   3221                     return TRUE;
   3222                 }
   3223                 --length;
   3224             }
   3225         }
   3226     } else {
   3227         while((c=*s++)!=0) {
   3228             if(0xd800<=c && c<0xe000) {
   3229                 if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
   3230                     return TRUE;
   3231                 }
   3232             }
   3233         }
   3234     }
   3235     return FALSE;
   3236 }
   3237 
   3238 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
   3239 // unless either UTF is turned off in whichSpans.
   3240 // Testing UTF-16 and UTF-8 together requires that surrogate code points
   3241 // have the same contains(c) value as U+FFFD.
   3242 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
   3243                                       const UChar *s16, int32_t length16,
   3244                                       uint32_t whichSpans,
   3245                                       const char *testName, int32_t index) {
   3246     int32_t expectLimits[500];
   3247     int32_t expectCount;
   3248 
   3249     expectCount=-1;  // Get expectLimits[] from testSpan().
   3250 
   3251     if((whichSpans&SPAN_UTF16)!=0) {
   3252         testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
   3253     }
   3254     if((whichSpans&SPAN_UTF8)==0) {
   3255         return;
   3256     }
   3257 
   3258     // Convert s16[] and expectLimits[] to UTF-8.
   3259     uint8_t s8[3000];
   3260     int32_t offsets[3000];
   3261 
   3262     const UChar *s16Limit=s16+length16;
   3263     char *t=(char *)s8;
   3264     char *tLimit=t+sizeof(s8);
   3265     int32_t *o=offsets;
   3266     UErrorCode errorCode=U_ZERO_ERROR;
   3267 
   3268     // Convert with substitution: Turn unpaired surrogates into U+FFFD.
   3269     ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
   3270     if(U_FAILURE(errorCode)) {
   3271         errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
   3272               testName, (long)index, u_errorName(errorCode));
   3273         ucnv_resetFromUnicode(utf8Cnv);
   3274         return;
   3275     }
   3276     int32_t length8=(int32_t)(t-(char *)s8);
   3277 
   3278     // Convert expectLimits[].
   3279     int32_t i, j, expect;
   3280     for(i=j=0; i<expectCount; ++i) {
   3281         expect=expectLimits[i];
   3282         if(expect==length16) {
   3283             expectLimits[i]=length8;
   3284         } else {
   3285             while(offsets[j]<expect) {
   3286                 ++j;
   3287             }
   3288             expectLimits[i]=j;
   3289         }
   3290     }
   3291 
   3292     testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
   3293 }
   3294 
   3295 static UChar32 nextCodePoint(UChar32 c) {
   3296     // Skip some large and boring ranges.
   3297     switch(c) {
   3298     case 0x3441:
   3299         return 0x4d7f;
   3300     case 0x5100:
   3301         return 0x9f00;
   3302     case 0xb040:
   3303         return 0xd780;
   3304     case 0xe041:
   3305         return 0xf8fe;
   3306     case 0x10100:
   3307         return 0x20000;
   3308     case 0x20041:
   3309         return 0xe0000;
   3310     case 0xe0101:
   3311         return 0x10fffd;
   3312     default:
   3313         return c+1;
   3314     }
   3315 }
   3316 
   3317 // Verify that all implementations represent the same set.
   3318 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3319     // contains(U+FFFD) is inconsistent with contains(some surrogates),
   3320     // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
   3321     // Skip the UTF-8 part of the test - if the string contains surrogates -
   3322     // because it is likely to produce a different result.
   3323     UBool inconsistentSurrogates=
   3324             (!(sets[0]->getSet().contains(0xfffd) ?
   3325                sets[0]->getSet().contains(0xd800, 0xdfff) :
   3326                sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
   3327              sets[0]->hasStringsWithSurrogates());
   3328 
   3329     UChar s[1000];
   3330     int32_t length=0;
   3331     uint32_t localWhichSpans;
   3332 
   3333     UChar32 c, first;
   3334     for(first=c=0;; c=nextCodePoint(c)) {
   3335         if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
   3336             localWhichSpans=whichSpans;
   3337             if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
   3338                 localWhichSpans&=~SPAN_UTF8;
   3339             }
   3340             testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
   3341             if(c>0x10ffff) {
   3342                 break;
   3343             }
   3344             length=0;
   3345             first=c;
   3346         }
   3347         U16_APPEND_UNSAFE(s, length, c);
   3348     }
   3349 }
   3350 
   3351 // Test with a particular, interesting string.
   3352 // Specify length and try NUL-termination.
   3353 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3354     static const UChar s[]={
   3355         0x61, 0x62, 0x20,                       // Latin, space
   3356         0x3b1, 0x3b2, 0x3b3,                    // Greek
   3357         0xd900,                                 // lead surrogate
   3358         0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
   3359         0xdc05,                                 // trail surrogate
   3360         0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
   3361         0xd900, 0xdc05,                         // unassigned supplementary
   3362         0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
   3363         0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
   3364         0                                       // NUL
   3365     };
   3366 
   3367     if((whichSpans&SPAN_UTF16)==0) {
   3368         return;
   3369     }
   3370     testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
   3371     testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
   3372 }
   3373 
   3374 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
   3375     static const char s[]={
   3376         "abc"                                   // Latin
   3377 
   3378         /* trail byte in lead position */
   3379         "\x80"
   3380 
   3381         " "                                     // space
   3382 
   3383         /* truncated multi-byte sequences */
   3384         "\xd0"
   3385         "\xe0"
   3386         "\xe1"
   3387         "\xed"
   3388         "\xee"
   3389         "\xf0"
   3390         "\xf1"
   3391         "\xf4"
   3392         "\xf8"
   3393         "\xfc"
   3394 
   3395         "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
   3396 
   3397         /* trail byte in lead position */
   3398         "\x80"
   3399 
   3400         "\xe0\x80"
   3401         "\xe0\xa0"
   3402         "\xe1\x80"
   3403         "\xed\x80"
   3404         "\xed\xa0"
   3405         "\xee\x80"
   3406         "\xf0\x80"
   3407         "\xf0\x90"
   3408         "\xf1\x80"
   3409         "\xf4\x80"
   3410         "\xf4\x90"
   3411         "\xf8\x80"
   3412         "\xfc\x80"
   3413 
   3414         "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
   3415 
   3416         /* trail byte in lead position */
   3417         "\x80"
   3418 
   3419         "\xf0\x80\x80"
   3420         "\xf0\x90\x80"
   3421         "\xf1\x80\x80"
   3422         "\xf4\x80\x80"
   3423         "\xf4\x90\x80"
   3424         "\xf8\x80\x80"
   3425         "\xfc\x80\x80"
   3426 
   3427         "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
   3428 
   3429         /* trail byte in lead position */
   3430         "\x80"
   3431 
   3432         "\xf8\x80\x80\x80"
   3433         "\xfc\x80\x80\x80"
   3434 
   3435         "\xF1\x90\x80\x85"                      // unassigned supplementary
   3436 
   3437         /* trail byte in lead position */
   3438         "\x80"
   3439 
   3440         "\xfc\x80\x80\x80\x80"
   3441 
   3442         "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
   3443 
   3444         /* trail byte in lead position */
   3445         "\x80"
   3446 
   3447         /* complete sequences but non-shortest forms or out of range etc. */
   3448         "\xc0\x80"
   3449         "\xe0\x80\x80"
   3450         "\xed\xa0\x80"
   3451         "\xf0\x80\x80\x80"
   3452         "\xf4\x90\x80\x80"
   3453         "\xf8\x80\x80\x80\x80"
   3454         "\xfc\x80\x80\x80\x80\x80"
   3455         "\xfe"
   3456         "\xff"
   3457 
   3458         /* trail byte in lead position */
   3459         "\x80"
   3460 
   3461         "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
   3462     };
   3463 
   3464     if((whichSpans&SPAN_UTF8)==0) {
   3465         return;
   3466     }
   3467     testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
   3468     testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
   3469 }
   3470 
   // Replace the options selected by ~mask in each of the first whichSpansCount
   // entries with one of the alternatives a, b and c, multiplying the entries
   // so that each copy only has one of the alternatives:
   // - b==0: The entries are just modified with mask and a (c is ignored).
   // - b!=0 and c==0: A second copy with b is stored after the first one.
   // - b!=0 and c!=0: A third copy with c is stored after the second one.
   // The caller must provide room for all copies.
   // Returns the new number of entries.
   static int32_t
   addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
                  uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
       // Number of copies of the original entries that exist afterwards.
       int32_t copies;
       if(b==0) {
           copies=1;
       } else if(c==0) {
           copies=2;
       } else {
           copies=3;
       }

       uint32_t *second=whichSpans+whichSpansCount;
       uint32_t *third=second+whichSpansCount;
       for(int32_t k=0; k<whichSpansCount; ++k) {
           const uint32_t kept=whichSpans[k]&mask;
           whichSpans[k]=kept|a;
           if(copies>=2) {
               second[k]=kept|b;
           }
           if(copies==3) {
               third[k]=kept|c;
           }
       }
       return copies*whichSpansCount;
   }
   3493 
   // Runs of 63 and 64 copies of 'a' or 'b' for building long test strings.
   // Each 63-character run is written as 16+16+16+15 characters so that its length
   // is easy to check; each 64-character run appends one more character.
   #define _63_a "aaaaaaaaaaaaaaaa" "aaaaaaaaaaaaaaaa" "aaaaaaaaaaaaaaaa" "aaaaaaaaaaaaaaa"
   #define _64_a _63_a "a"
   #define _63_b "bbbbbbbbbbbbbbbb" "bbbbbbbbbbbbbbbb" "bbbbbbbbbbbbbbbb" "bbbbbbbbbbbbbbb"
   #define _64_b _63_b "b"
   3498 
   3499 void UnicodeSetTest::TestSpan() {
   3500     // "[...]" is a UnicodeSet pattern.
   3501     // "*" performs tests on all Unicode code points and on a selection of
   3502     //   malformed UTF-8/16 strings.
   3503     // "-options" limits the scope of testing for the current set.
   3504     //   By default, the test verifies that equivalent boundaries are found
   3505     //   for UTF-16 and UTF-8, going forward and backward,
   3506     //   alternating USET_SPAN_NOT_CONTAINED with
   3507     //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
   3508     //   Single-character options:
   3509     //     8 -- UTF-16 and UTF-8 boundaries may differ.
   3510     //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
   3511     //          or the set contains strings with unpaired surrogates
   3512     //          which do not translate to valid UTF-8.
   3513     //     c -- set.span() and set.complement().span() boundaries may differ.
   3514     //          Cause: Set strings are not complemented.
   3515     //     b -- span() and spanBack() boundaries may differ.
   3516     //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
   3517     //          and spanBack(USET_SPAN_SIMPLE) are defined to
   3518     //          match with non-overlapping substrings.
   3519     //          For example, with a set containing "ab" and "ba",
   3520     //          span() of "aba" yields boundaries { 0, 2, 3 }
   3521     //          because the initial "ab" matches from 0 to 2,
   3522     //          while spanBack() yields boundaries { 0, 1, 3 }
   3523     //          because the final "ba" matches from 1 to 3.
   3524     //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
   3525     //          Cause: Strings in the set overlap, and a longer match may
   3526     //          require a sequence including non-longest substrings.
   3527     //          For example, with a set containing "ab", "abc" and "cd",
   3528     //          span(contained) of "abcd" spans the entire string
   3529     //          but span(longest match) only spans the first 3 characters.
   3530     //   Each "-options" first resets all options and then applies the specified options.
   3531     //   A "-" without options resets the options.
   3532     //   The options are also reset for each new set.
   3533     // Other strings will be spanned.
   3534     static const char *const testdata[]={
   3535         "[:ID_Continue:]",
   3536         "*",
   3537         "[:White_Space:]",
   3538         "*",
   3539         "[]",
   3540         "*",
   3541         "[\\u0000-\\U0010FFFF]",
   3542         "*",
   3543         "[\\u0000\\u0080\\u0800\\U00010000]",
   3544         "*",
   3545         "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
   3546         "*",
   3547         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
   3548         "-c",
   3549         "*",
   3550         "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
   3551         "-c",
   3552         "*",
   3553 
   3554         // Overlapping strings cause overlapping attempts to match.
   3555         "[x{xy}{xya}{axy}{ax}]",
   3556         "-cl",
   3557 
   3558         // More repetitions of "xya" would take too long with the recursive
   3559         // reference implementation.
   3560         // containsAll()=FALSE
   3561         // test_string 0x14
   3562         "xx"
   3563         "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
   3564         "xx"            // set.complement().span(contained) will stop between the two 'x'es.
   3565         "xyaxyaxyaxya"
   3566         "xx"
   3567         "xyaxyaxyaxya"  // span() ends here.
   3568         "aaa",
   3569 
   3570         // containsAll()=TRUE
   3571         // test_string 0x15
   3572         "xx"
   3573         "xyaxyaxyaxya"
   3574         "xx"
   3575         "xyaxyaxyaxya"
   3576         "xx"
   3577         "xyaxyaxyaxy",
   3578 
   3579         "-bc",
   3580         // test_string 0x17
   3581         "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
   3582         "-c",
   3583         "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
   3584         "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
   3585         "-",
   3586         "byaya",     // span() -> { 5 }
   3587         "byay",      // span() -> { 4 }
   3588         "bya",       // span() -> { 3 }
   3589 
   3590         // span(longest match) will not span the whole string.
   3591         "[a{ab}{bc}]",
   3592         "-cl",
   3593         // test_string 0x21
   3594         "abc",
   3595 
   3596         "[a{ab}{abc}{cd}]",
   3597         "-cl",
   3598         "acdabcdabccd",
   3599 
   3600         // spanBack(longest match) will not span the whole string.
   3601         "[c{ab}{bc}]",
   3602         "-cl",
   3603         "abc",
   3604 
   3605         "[d{cd}{bcd}{ab}]",
   3606         "-cl",
   3607         "abbcdabcdabd",
   3608 
   3609         // Test with non-ASCII set strings - test proper handling of surrogate pairs
   3610         // and UTF-8 trail bytes.
   3611         // Copies of above test sets and strings, but transliterated to have
   3612         // different code points with similar trail units.
   3613         // Previous: a      b         c            d
   3614         // Unicode:  042B   30AB      200AB        204AB
   3615         // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
   3616         // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
   3617         "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
   3618         "-cl",
   3619         "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
   3620 
   3621         "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
   3622         "-cl",
   3623         "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
   3624 
   3625         // Stress bookkeeping and recursion.
   3626         // The following strings are barely doable with the recursive
   3627         // reference implementation.
   3628         // The not-contained character at the end prevents an early exit from the span().
   3629         "[b{bb}]",
   3630         "-c",
   3631         // test_string 0x33
   3632         "bbbbbbbbbbbbbbbbbbbbbbbb-",
   3633         // On complement sets, span() and spanBack() get different results
   3634         // because b is not in the complement set and there is an odd number of b's
   3635         // in the test string.
   3636         "-bc",
   3637         "bbbbbbbbbbbbbbbbbbbbbbbbb-",
   3638 
   3639         // Test with set strings with an initial or final code point span
   3640         // longer than 254.
   3641         "[a{" _64_a _64_a _64_a _64_a "b}"
   3642           "{a" _64_b _64_b _64_b _64_b "}]",
   3643         "-c",
   3644         _64_a _64_a _64_a _63_a "b",
   3645         _64_a _64_a _64_a _64_a "b",
   3646         _64_a _64_a _64_a _64_a "aaaabbbb",
   3647         "a" _64_b _64_b _64_b _63_b,
   3648         "a" _64_b _64_b _64_b _64_b,
   3649         "aaaabbbb" _64_b _64_b _64_b _64_b,
   3650 
   3651         // Test with strings containing unpaired surrogates.
   3652         // They are not representable in UTF-8, and a leading trail surrogate
   3653         // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
   3654         // U+20001 == \\uD840\\uDC01
   3655         // U+20400 == \\uD841\\uDC00
   3656         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
   3657         "-8cl",
   3658         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
   3659     };
   3660     uint32_t whichSpans[96]={ SPAN_ALL };
   3661     int32_t whichSpansCount=1;
   3662 
   3663     UnicodeSet *sets[SET_COUNT]={ NULL };
   3664     const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
   3665 
   3666     char testName[1024];
   3667     char *testNameLimit=testName;
   3668 
   3669     int32_t i, j;
   3670     for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
   3671         const char *s=testdata[i];
   3672         if(s[0]=='[') {
   3673             // Create new test sets from this pattern.
   3674             for(j=0; j<SET_COUNT; ++j) {
   3675                 delete sets_with_str[j];
   3676                 delete sets[j];
   3677             }
   3678             UErrorCode errorCode=U_ZERO_ERROR;
   3679             sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
   3680             if(U_FAILURE(errorCode)) {
   3681                 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
   3682                 break;
   3683             }
   3684             sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
   3685             sets[SLOW_NOT]->complement();
   3686             // Intermediate set: Test cloning of a frozen set.
   3687             UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
   3688             fast->freeze();
   3689             sets[FAST]=(UnicodeSet *)fast->clone();
   3690             delete fast;
   3691             UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
   3692             fastNot->freeze();
   3693             sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
   3694             delete fastNot;
   3695 
   3696             for(j=0; j<SET_COUNT; ++j) {
   3697                 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
   3698             }
   3699 
   3700             strcpy(testName, s);
   3701             testNameLimit=strchr(testName, 0);
   3702             *testNameLimit++=':';
   3703             *testNameLimit=0;
   3704 
   3705             whichSpans[0]=SPAN_ALL;
   3706             whichSpansCount=1;
   3707         } else if(s[0]=='-') {
   3708             whichSpans[0]=SPAN_ALL;
   3709             whichSpansCount=1;
   3710 
   3711             while(*++s!=0) {
   3712                 switch(*s) {
   3713                 case 'c':
   3714                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3715                                                    ~SPAN_POLARITY,
   3716                                                    SPAN_SET,
   3717                                                    SPAN_COMPLEMENT,
   3718                                                    0);
   3719                     break;
   3720                 case 'b':
   3721                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3722                                                    ~SPAN_DIRS,
   3723                                                    SPAN_FWD,
   3724                                                    SPAN_BACK,
   3725                                                    0);
   3726                     break;
   3727                 case 'l':
   3728                     // test USET_SPAN_CONTAINED FWD & BACK, and separately
   3729                     // USET_SPAN_SIMPLE only FWD, and separately
   3730                     // USET_SPAN_SIMPLE only BACK
   3731                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3732                                                    ~(SPAN_DIRS|SPAN_CONDITION),
   3733                                                    SPAN_DIRS|SPAN_CONTAINED,
   3734                                                    SPAN_FWD|SPAN_SIMPLE,
   3735                                                    SPAN_BACK|SPAN_SIMPLE);
   3736                     break;
   3737                 case '8':
   3738                     whichSpansCount=addAlternative(whichSpans, whichSpansCount,
   3739                                                    ~SPAN_UTFS,
   3740                                                    SPAN_UTF16,
   3741                                                    SPAN_UTF8,
   3742                                                    0);
   3743                     break;
   3744                 default:
   3745                     errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
   3746                     break;
   3747                 }
   3748             }
   3749         } else if(0==strcmp(s, "*")) {
   3750             strcpy(testNameLimit, "bad_string");
   3751             for(j=0; j<whichSpansCount; ++j) {
   3752                 if(whichSpansCount>1) {
   3753                     sprintf(testNameLimit+10 /* strlen("bad_string") */,
   3754                             "%%0x%3x",
   3755                             whichSpans[j]);
   3756                 }
   3757                 testSpanUTF16String(sets_with_str, whichSpans[j], testName);
   3758                 testSpanUTF8String(sets_with_str, whichSpans[j], testName);
   3759             }
   3760 
   3761             strcpy(testNameLimit, "contents");
   3762             for(j=0; j<whichSpansCount; ++j) {
   3763                 if(whichSpansCount>1) {
   3764                     sprintf(testNameLimit+8 /* strlen("contents") */,
   3765                             "%%0x%3x",
   3766                             whichSpans[j]);
   3767                 }
   3768                 testSpanContents(sets_with_str, whichSpans[j], testName);
   3769             }
   3770         } else {
   3771             UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
   3772             strcpy(testNameLimit, "test_string");
   3773             for(j=0; j<whichSpansCount; ++j) {
   3774                 if(whichSpansCount>1) {
   3775                     sprintf(testNameLimit+11 /* strlen("test_string") */,
   3776                             "%%0x%3x",
   3777                             whichSpans[j]);
   3778                 }
   3779                 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
   3780             }
   3781         }
   3782     }
   3783     for(j=0; j<SET_COUNT; ++j) {
   3784         delete sets_with_str[j];
   3785         delete sets[j];
   3786     }
   3787 }
   3788 
   3789 // Test select patterns and strings, and test USET_SPAN_SIMPLE.
        //
        // Four independent checks, each on a hard-coded pattern/string pair:
        //  1. containsAll() on a long string, with and without its trailing "aaaa".
        //  2. span(USET_SPAN_NOT_CONTAINED) on prefixes of "byayaxya".
        //  3. span() with USET_SPAN_CONTAINED and USET_SPAN_SIMPLE on "acdabcdabccd".
        //  4. spanBack() with the same two conditions on a frozen set.
        // Every mismatch is reported with errln(); a pattern that fails to parse is
        // reported and ends the test early.
   3790 void UnicodeSetTest::TestStringSpan() {
            // Deliberately NOT static: this pointer is reassigned further down. A
            // function-local static would keep the last assigned pattern across calls,
            // so a second invocation would build the first set from the wrong pattern.
   3791     const char *pattern="[x{xy}{xya}{axy}{ax}]";
   3792     static const char *const string=
   3793         "xx"
   3794         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
   3795         "xx"
   3796         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
   3797         "xx"
   3798         "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
   3799         "aaaa";
   3800
   3801     UErrorCode errorCode=U_ZERO_ERROR;
   3802     UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
   3803     UnicodeSet set(pattern16, errorCode);
   3804     if(U_FAILURE(errorCode)) {
   3805         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3806         return;
   3807     }
   3808
   3809     UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
   3810
            // Check 1a: the full string ends in "aaaa"; 'a' alone is not in the set,
            // so containsAll() is expected to be FALSE.
   3811     if(set.containsAll(string16)) {
   3812         errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
   3813     }
   3814
            // Check 1b: without the trailing "aaaa" the set is expected to cover the string.
   3815     // Remove trailing "aaaa".
   3816     string16.truncate(string16.length()-4);
   3817     if(!set.containsAll(string16)) {
   3818         errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
   3819     }
   3820
            // Check 2: span while NOT contained, over prefixes of decreasing length.
            // With 6..8 code units "ax" can match at index 4, so the span stops there;
            // with only 5 code units no set element fits and the span covers the prefix.
   3821     string16=UNICODE_STRING_SIMPLE("byayaxya");
   3822     const UChar *s16=string16.getBuffer();
   3823     int32_t length16=string16.length();
   3824     (void)length16;   // Suppress set but not used warning.
   3825     if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
   3826         set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
   3827         set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
   3828         set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
   3829         set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
   3830         set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
   3831     ) {
   3832         errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
   3833     }
   3834
            // Check 3: forward span. USET_SPAN_CONTAINED is expected to cover all 12
            // code units; USET_SPAN_SIMPLE is expected to stop at 6 from the start and
            // to cover all 5 code units when started at offset 7.
   3835     pattern="[a{ab}{abc}{cd}]";
   3836     pattern16=UnicodeString(pattern, -1, US_INV);
   3837     set.applyPattern(pattern16, errorCode);
   3838     if(U_FAILURE(errorCode)) {
   3839         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3840         return;
   3841     }
   3842     string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
   3843     s16=string16.getBuffer();
   3844     length16=string16.length();
   3845     if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
   3846         set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
   3847         set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
   3848     ) {
   3849         errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
   3850     }
   3851
            // Check 4: backward span on a frozen set. Note that freeze() is called on
            // the result of applyPattern() before errorCode is inspected.
   3852     pattern="[d{cd}{bcd}{ab}]";
   3853     pattern16=UnicodeString(pattern, -1, US_INV);
   3854     set.applyPattern(pattern16, errorCode).freeze();
   3855     if(U_FAILURE(errorCode)) {
   3856         errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
   3857         return;
   3858     }
   3859     string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
   3860     s16=string16.getBuffer();
   3861     length16=string16.length();
   3862     if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
   3863         set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
   3864         set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
   3865     ) {
   3866         errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
   3867     }
   3868 }
   3869 
   3870 /**
   3871  * Including collationroot.h fails on Windows with
   3872  *   1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142): error C2008: '$' : unexpected in macro definition
   3873  * so we skip this test on Windows.
   3874  *
   3875  * The cause is that intltest builds with /Za, which disables language extensions, which means
   3876  * that Windows header files cannot be used.
   3877  */
   3878 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API
   3879 #include "collationroot.h"
   3880 #include "collationtailoring.h"
   3881 #endif
   3882 
   3883 void UnicodeSetTest::TestUCAUnsafeBackwards() {
            // Round-trip checks on the unsafe-backward set of the root collation tailoring.
            // Three build configurations:
            //  - Windows API platforms: only logs an info message and skips the test.
            //  - Other platforms with collation enabled: runs the checks below.
            //  - Other platforms with UCONFIG_NO_COLLATION: the body is empty.
   3884 #if U_PLATFORM_HAS_WIN32_API
   3885     infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");
   3886 #elif !UCONFIG_NO_COLLATION
   3887     UErrorCode errorCode = U_ZERO_ERROR;
   3888
   3889     // Get the unsafeBackwardsSet
   3890     const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode);
   3891     if(U_FAILURE(errorCode)) {
                // Reported via dataerrln(), i.e. treated as a data-loading problem; nothing else can run.
   3892       dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));
   3893       return;
   3894     }
   3895     //const UVersionInfo &version = rootEntry->tailoring->version;
            // NOTE(review): the pointer is dereferenced below without a null check — presumably
            // the root tailoring always carries this set; confirm.
   3896     const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet;
   3897
            // The serialize round trip is always run, independent of the known-issue gate below.
   3898     checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);
   3899
            // The pattern round-trip checks run only when logKnownIssue() returns false
            // (presumably: the known issue is not being suppressed — confirm against IntlTest).
   3900     if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot...unsafeBackwards set")) {
   3901         // simple test case
   3902         // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.
   3903         // TODO(ticket #11891): Port test to Java. Is this a bug there, too?
   3904         UnicodeSet surrogates;
   3905         surrogates.add(0xd83a);  // a lead surrogate
   3906         surrogates.add(0xdc00, 0xdfff);  // a range of trail surrogates
   3907         UnicodeString pat;
   3908         surrogates.toPattern(pat, FALSE);  // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdfff ]
   3909         // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendToPat()
   3910         // so that at least one type of surrogate code points are escaped,
   3911         // or (minimally) so that adjacent lead+trail surrogate code points are escaped.
                // Reset the status so the parse below is judged on its own.
   3912         errorCode = U_ZERO_ERROR;
   3913         UnicodeSet s2;
   3914         s2.applyPattern(pat, errorCode);  // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]
   3915         if(U_FAILURE(errorCode)) {
   3916             errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode));
   3917         } else {
                    // The pattern parsed; the re-parsed set must equal the original.
   3918             checkEqual(surrogates, s2, "surrogates to/from pattern");
   3919         }
   3920         // This occurs in the UCA unsafe-backwards set.
   3921         checkRoundTrip(*unsafeBackwardSet);
   3922     }
   3923 #endif
   3924 }
   3925